]> code.delx.au - youtube-cgi/blob - youtube.cgi
8b103bee7847de97bbaa5ae68350d6c98e358e76
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/env python
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import itertools
8 import json
9 from lxml import html
10 import os
11 import re
12 import resource
13 import shutil
14 import subprocess
15 import sys
16 import time
17 import urllib
18 import urllib2
19 import urlparse
20
21
# Address-space limit (128 MiB) applied with resource.setrlimit at startup.
MAX_MEMORY_BYTES = 128 * 1024 ** 2

# Sent as the User-Agent header on every outgoing request.
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"

# Supported stream MIME types mapped to the file extension used when saving.
MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

# Quality labels mapped to a rank; a higher rank is preferred.
QUALITIES = dict(
    hd1080=5,
    hd720=4,
    large=3,
    medium=2,
    small=1,
)
38
39
class VideoUnavailable(Exception):
    """Raised when no downloadable video can be obtained from a page.

    The first argument is a human-readable reason.
    """
42
def print_form(url="", msg=""):
    """Write the CGI response containing the download form to stdout.

    Emits a Content-Type header followed by an XHTML page.

    url -- text pre-filled into the URL input box. It is XML-escaped here,
           so callers pass the raw value.
    msg -- markup inserted verbatim above the form (for example an error
           paragraph). The caller must escape any untrusted text in it.

    Reads HTTP_HOST and REQUEST_URI from the environment to build the
    bookmarklet target; raises KeyError when either is missing.
    """
    # Function-scope import keeps this block self-contained.
    from xml.sax.saxutils import escape

    def attr(text):
        # Escape text for use inside a double-quoted XML attribute value.
        return escape(text, {'"': "&quot;"})

    # REQUEST_URI includes the query string of the current request; drop it
    # so the bookmarklet can append its own "?url=" parameter.
    request_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], request_path)
    # The script URL ends up inside a single-quoted JavaScript string, so
    # percent-encode the characters that could terminate or escape it.
    script_url = script_url.replace("\\", "%5C").replace("'", "%27")

    values = {"{0}": msg, "{1}": attr(url), "{2}": attr(script_url)}
    page = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""
    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    # Substitute every placeholder in a single pass so text inserted for one
    # placeholder is never rescanned for another. str.format cannot be used
    # because the page contains literal braces (CSS and JavaScript).
    sys.stdout.write(re.sub(r"\{[012]\}", lambda m: values[m.group(0)], page))
77
# Cookies are kept in one jar shared by every request made through urlopener.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
# URL of the previous request; sent as the Referer header of the next one.
referrer = ""

def urlopen(url, offset=None):
    """Open *url* with the shared opener and return the response object.

    url    -- address to fetch.
    offset -- when truthy, ask for the resource starting at this byte by
              sending a Range header.

    The previously requested URL is sent as Referer and *url* becomes the
    Referer for the next call. If the response carries a Content-Range
    header, its unit must be "bytes" and its start must equal the requested
    offset (0 when no offset was given); otherwise the response is closed
    and IOError is raised. Errors from the opener propagate unchanged.
    """
    global referrer
    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    referrer = url

    req.add_header("User-Agent", USER_AGENT)

    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    content_range = res.info().getheader("Content-Range")
    if content_range:
        # Expected form: "bytes <start>-<end>/<total>". Checked with explicit
        # raises because assert statements are removed under "python -O".
        try:
            unit, byte_range = content_range.split()[:2]
            start = int(byte_range.split("-")[0])
        except ValueError:
            res.close()
            raise IOError("Malformed Content-Range header: %r" % content_range)
        if unit != "bytes" or start != (offset or 0):
            res.close()
            raise IOError("Unexpected Content-Range header: %r" % content_range)
    return res
103
def parse_url(url):
    """Fetch *url* and return the page parsed as an lxml HTML document.

    The page is decoded as UTF-8 with parser recovery enabled. The network
    response is closed even when parsing fails.
    """
    f = urlopen(url)
    try:
        return html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    finally:
        f.close()
109
def append_to_qs(url, params):
    """Return *url* with *params* merged into its query string.

    url    -- the URL to modify.
    params -- mapping of parameter names to values (or lists of values);
              entries replace same-named parameters already in the URL.
    """
    parts = urlparse.urlsplit(url)
    query = urlparse.parse_qs(parts.query)
    query.update(params)
    # doseq=True writes each element of a list value as its own parameter.
    rebuilt = parts._replace(query=urllib.urlencode(query, True))
    return urlparse.urlunsplit(rebuilt)
117
def convert_from_old_itag(player_config):
    """Rewrite an old-style stream map in place so it has "url" entries.

    In the old layout each "itag" value embeds the stream address after a
    "url=" marker. The text following the marker in every itag value
    becomes the "url" list, and the re-encoded map is stored back into
    player_config["args"]["url_encoded_fmt_stream_map"].
    """
    args = player_config["args"]
    stream_map = urlparse.parse_qs(args["url_encoded_fmt_stream_map"])
    marker = "url="
    stream_map["url"] = [
        entry[entry.find(marker) + len(marker):]
        for entry in stream_map["itag"]
    ]
    args["url_encoded_fmt_stream_map"] = urllib.urlencode(stream_map, True)
125
def get_player_config(doc):
    """Extract the player configuration from the scripts of a parsed page.

    Every line of every <script> element is scanned. A line containing
    "yt.playerConfig =" is decoded as JSON (the text between the first "="
    and the last ";") and returned immediately. A line containing
    "'PLAYER_CONFIG': " is decoded from the text after its first ":",
    converted with convert_from_old_itag, and remembered as a fallback
    that is returned once all scripts have been scanned. Returns None
    when neither form is found.
    """
    # NOTE(review): indentation was lost in the copy this was restored
    # from; the final return is taken to be at function level -- confirm.
    fallback = None
    script_texts = (node.text for node in doc.xpath("//script"))
    for text in script_texts:
        if not text:
            continue
        for line in text.split("\n"):
            if "yt.playerConfig =" in line:
                start = line.find("=")
                end = line.rfind(";")
                if start >= 0 and end > 0:
                    return json.loads(line[start + 1:end])
            if "'PLAYER_CONFIG': " in line:
                colon = line.find(":")
                if colon >= 0:
                    fallback = json.loads(line[colon + 1:])
                    convert_from_old_itag(fallback)
    return fallback
143
def get_best_video(player_config):
    """Pick the highest-quality stream in a supported container format.

    player_config -- decoded player configuration whose
                     ["args"]["url_encoded_fmt_stream_map"] is a
                     query-string-encoded list of streams.

    Returns (url, extension) for the best stream, or (None, None) when no
    stream has a MIME type listed in MIMETYPES. Streams are ranked with
    QUALITIES; unknown or missing quality labels rank lowest. When the
    chosen stream has a "sig" value it is appended to the URL as the
    "signature" parameter.
    """
    stream_map = urlparse.parse_qs(player_config["args"]["url_encoded_fmt_stream_map"])
    streams = itertools.izip_longest(
        stream_map.get("url", []),
        stream_map.get("type", []),
        stream_map.get("quality", []),
        stream_map.get("sig", []),
    )
    best_url = None
    best_quality = None
    best_extension = None
    for video_url, mimetype, quality, signature in streams:
        # izip_longest pads the shorter lists with None; an entry without a
        # URL or a MIME type cannot be downloaded, so skip it.
        if video_url is None or mimetype is None:
            continue
        mimetype = mimetype.split(";")[0]
        if mimetype not in MIMETYPES:
            continue
        if quality is None:
            rank = -1
        else:
            rank = QUALITIES.get(quality.split(",")[0], -1)
        if best_quality is None or rank > best_quality:
            if signature:
                video_url = append_to_qs(video_url, {"signature": signature})
            best_url = video_url
            best_quality = rank
            best_extension = MIMETYPES[mimetype]

    return best_url, best_extension
169
def sanitize_filename(filename):
    """Return *filename* made safe to use as a single path component.

    Surrounding whitespace is stripped and inner whitespace runs collapse
    to one space. After that, backslashes and slashes become "-" and NUL
    characters become a space.
    """
    cleaned = re.sub(r"\s+", " ", filename.strip())
    for unsafe, substitute in (("\\", "-"), ("/", "-"), ("\0", " ")):
        cleaned = cleaned.replace(unsafe, substitute)
    return cleaned
177
def get_video_url(doc):
    """Return (video_url, filename) for the video on a parsed watch page.

    doc -- parsed HTML document of the page.

    The filename is the sanitized page title plus the extension of the
    chosen stream.

    Raises VideoUnavailable when the page shows an unavailable message,
    when no player configuration is found, or when none of the offered
    streams is in a supported format.
    """
    unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
    if unavailable:
        raise VideoUnavailable(unavailable[0].strip())

    player_config = get_player_config(doc)
    if not player_config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(player_config)
    if not video_url:
        # Neither caller can handle a (None, None) result, so report the
        # failure through the exception they already expect.
        raise VideoUnavailable("Could not find a downloadable video format")

    title = doc.xpath("/html/head/title/text()")[0]
    filename = sanitize_filename(title) + "." + extension

    return video_url, filename
196
def write_video(filename, video_data):
    """Stream an opened video response to stdout as a CGI download.

    filename   -- suggested download name; sent percent-encoded as UTF-8
                  in the Content-Disposition header.
    video_data -- open response object providing info(), read() and
                  close().

    The upstream Content-Length is forwarded only when the upstream
    response has one. video_data is closed even if copying fails.
    """
    try:
        encoded_filename = urllib.quote(filename.encode("utf-8"))
        content_length = video_data.info().getheader("Content-Length")
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
        # Skip the header rather than emit "Content-Length: None".
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)
        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
205
def cgimain():
    """Handle one CGI request.

    Without a "url" query parameter the empty form is shown. Otherwise the
    page at that URL is fetched and the best video on it is streamed back
    as a download. On failure the form is shown again with an error
    message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        # No URL submitted yet: show the form with an example address.
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception:
        # Top-level boundary of the CGI: any other failure is reported to
        # the user as a generic error page.
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
230
def pp_size(size):
    """Format a byte count for display using binary unit suffixes.

    size -- number of bytes (int or float).

    Returns the value with two decimals followed by a space and the unit:
    "" for plain bytes, then KiB, MiB or GiB. Values of 1024 GiB and above
    stay in GiB.
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    last = len(suffixes) - 1
    for i, suffix in enumerate(suffixes):
        # Stop dividing at the largest unit so the number always matches
        # the suffix that is printed with it.
        if size < 1024 or i == last:
            break
        size /= 1024.0
    return "%.2f %s" % (size, suffix)
238
def copy_with_progress(content_length, infile, outfile):
    """Copy infile to outfile in 32 KiB chunks, showing progress on stdout.

    content_length -- expected number of bytes, used only for display.
    infile         -- object with read(n); copying stops at the first
                      empty read.
    outfile        -- object with write(data).

    The status line is redrawn at most every half second and once more at
    the end, followed by a newline.
    """
    start_ts = time.time()

    def print_status(bytes_read, now):
        elapsed = now - start_ts
        # Two clock readings can be equal right after the start; report a
        # zero rate instead of dividing by zero.
        rate = bytes_read / elapsed if elapsed > 0 else 0
        # ESC[2K erases the current terminal line before it is redrawn.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(bytes_read),
            pp_size(content_length),
            pp_size(rate),
        ))
        sys.stdout.flush()

    last_ts = 0
    bytes_read = 0
    while True:
        now = time.time()
        if now - last_ts > 0.5:
            last_ts = now
            print_status(bytes_read, now)

        buf = infile.read(32768)
        if not buf:
            break
        outfile.write(buf)
        bytes_read += len(buf)

    # Final status line, then a newline to move past it.
    print_status(bytes_read, now)
    sys.stdout.write("\n")
267
def main():
    """Command-line entry point: download the video at sys.argv[1].

    The video is saved in the current directory under a name derived from
    the page title. An existing file of that name is treated as a partial
    download and resumed from its current size. When a transfer is cut
    short, the download is restarted from the bytes already written until
    the file is complete.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    print("Downloading %s" % filename.encode("utf-8"))

    # Binary append mode: video data must not go through newline translation.
    outfile = open(filename, "ab")
    try:
        # The position reported right after opening in append mode is not
        # reliable, so move to the end explicitly before reading it.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            print("Resuming download from %s" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                # 416: the requested range starts at or past the end of the
                # resource, so nothing is left to fetch.
                if e.code == 416:
                    print("File is complete!")
                    break
                raise

            content_length = int(video_data.info().getheader("Content-Length"))
            if total_size is None:
                # A ranged response reports only the remaining bytes; add
                # what is already on disk to get the full size.
                total_size = offset + content_length

            try:
                copy_with_progress(content_length, video_data, outfile)
            except IOError:
                # Transfer interrupted: end the status line and retry below.
                sys.stdout.write("\n")
            finally:
                video_data.close()

            if outfile.tell() == total_size:
                break
            old_offset = offset
            offset = outfile.tell()
            if old_offset == offset:
                # No progress on this attempt; pause before retrying.
                time.sleep(1)
            print("Restarting download from %s" % pp_size(offset))
    finally:
        outfile.close()
315
316
if __name__ == "__main__":
    # Cap the address space of this process before doing any work.
    resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
    # SCRIPT_NAME in the environment means we were started as a CGI.
    if "SCRIPT_NAME" in os.environ:
        cgimain()
    else:
        try:
            main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(1)
327