7d3935771ebbbec6f11268b9ac75e641241413f1
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/python2
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Address-space cap (128 MiB) meant for resource.setrlimit(); the call that
# would apply it is currently commented out in the __main__ section.
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# Browser identity sent with every outgoing HTTP request.
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) "
    "Gecko/20100101 Firefox/15.0.1"
)

# Stream MIME types that can be downloaded, mapped to the file extension
# used for the saved video.
MIMETYPES = dict([
    ("video/mp4", "mp4"),
    ("video/x-flv", "flv"),
    ("video/3gpp", "3gp"),
])

# Quality labels mapped to a rank; a larger rank is preferred.
QUALITIES = dict(
    (label, rank)
    for rank, label in enumerate(
        ("small", "medium", "large", "hd720", "hd1080"), 1
    )
)
37
38
class VideoUnavailable(Exception):
    """Raised when a watch page does not yield a downloadable video.

    The exception message is human-readable; the CGI front end shows it
    to the user.
    """
41
def print_form(url="", msg=""):
    """Write the download form to stdout as a complete CGI response.

    url -- text pre-filled into the URL input box.  It is treated as
        untrusted and XML-escaped before being placed in the attribute.
    msg -- XHTML fragment inserted verbatim above the form.  Callers are
        responsible for escaping any user-supplied text inside it.

    The bookmarklet target is built from the HTTP_HOST and REQUEST_URI
    environment variables (falling back to SERVER_NAME / SCRIPT_NAME).
    """
    # Function-scope import so the block does not depend on a new
    # file-level import; xml.sax.saxutils is part of the standard library.
    from xml.sax.saxutils import escape

    attr_entities = {'"': "&quot;", "'": "&#39;"}

    host = os.environ.get("HTTP_HOST") or os.environ.get("SERVER_NAME", "localhost")
    request_uri = os.environ.get("REQUEST_URI") or os.environ.get("SCRIPT_NAME", "")
    # REQUEST_URI includes the query string of the current request; the
    # bookmarklet appends its own "?url=", so only the path may be used.
    script_url = "http://%s%s" % (host, request_uri.split("?", 1)[0])
    # The URL ends up inside a single-quoted JavaScript string literal, so
    # characters that could terminate or escape that literal are
    # percent-encoded.
    script_url = script_url.replace("\\", "%5C").replace("'", "%27")

    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <title>delx.net.au - YouTube Scraper</title>
    <link rel="stylesheet" type="text/css" href="/style.css"/>
    <style type="text/css">
        input[type="text"] {
            width: 100%;
        }
        .error {
            color: red;
        }
    </style>
</head>
<body>
    <h1>delx.net.au - YouTube Scraper</h1>
    {0}
    <form action="" method="get">
    <p>This page will let you easily download YouTube videos to watch offline. It
    will automatically grab the highest quality version.</p>
    <div><input type="text" name="url" value="{1}"/></div>
    <div><input type="submit" value="Download!"/></div>
    </form>
    <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
    to easily download videos. Right-click the link and add it to bookmarks,
    then when you're looking at a YouTube page select that bookmark from your
    browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""

    substitutions = {
        "{0}": msg,
        "{1}": escape(url, attr_entities),
        "{2}": escape(script_url, attr_entities),
    }
    # A single substitution pass: placeholder-looking text inside one of the
    # inserted values is never expanded a second time.
    page = re.sub(
        r"\{[012]\}",
        lambda match: substitutions[match.group(0)],
        template,
    )

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    sys.stdout.write(page)
76
# Shared HTTP state used by urlopen().
# Cookies received on one request are kept in this in-memory jar and sent
# on later requests made through `urlopener`.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
# URL of the first page fetched; urlopen() records it on the first call and
# sends it as the Referer header on every subsequent request.
referrer = ""
80
def urlopen(url, offset=None):
    """Open `url` with the shared cookie-aware opener and return the response.

    url -- URL to fetch; a scheme-relative URL ("//host/path") is fetched
        over http.
    offset -- optional byte offset.  When truthy, a Range header asks the
        server for the resource from that byte onwards.

    The first URL ever requested is stored in the module-level `referrer`
    and sent as the Referer header on all later requests.

    Raises IOError when the server answers with a Content-Range that does
    not start at the requested offset (previously an `assert`, which is
    skipped under `python -O`).  Errors from urllib2 propagate unchanged.
    """
    global referrer

    if url.startswith("//"):
        url = "http:" + url

    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    else:
        referrer = url
    req.add_header("User-Agent", USER_AGENT)
    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    content_range = res.info().getheader("Content-Range")
    if content_range:
        try:
            unit, byte_range = content_range.split()[:2]
            start = int(byte_range.split("-")[0])
        except ValueError:
            # Fewer than two tokens, or a non-numeric range start.
            unit, start = None, None
        # No offset requested means the body must start at byte 0.
        if unit != "bytes" or start != (offset or 0):
            res.close()
            raise IOError(
                "Unexpected Content-Range %r for offset %r"
                % (content_range, offset)
            )
    return res
106
def parse_url(url):
    """Fetch `url` and return it parsed as an lxml HTML document tree.

    The page is decoded as UTF-8 and parsed in recovery mode.  The HTTP
    response is closed even when parsing raises.
    """
    f = urlopen(url)
    try:
        parser = html.HTMLParser(encoding="utf-8", recover=True)
        return html.parse(f, parser)
    finally:
        f.close()
112
def append_to_qs(url, params):
    """Return `url` with `params` merged into its query string.

    url -- URL whose existing query string is kept.
    params -- mapping of parameter name to a value or list of values; a
        name that already exists in the URL is replaced.

    Existing parameters with an empty value ("a=") are preserved; without
    keep_blank_values they would silently vanish from the rebuilt URL.
    """
    parts = list(urlparse.urlsplit(url))
    query = urlparse.parse_qs(parts[3], keep_blank_values=True)
    query.update(params)
    # doseq=True expands the value lists produced by parse_qs.
    parts[3] = urllib.urlencode(query, True)
    return urlparse.urlunsplit(parts)
120
def get_player_config(doc):
    """Return the decoded `ytplayer.config` object found in the page.

    doc -- parsed document; every element returned by the "//script" XPath
        query is searched line by line for "ytplayer.config = {".

    Returns the decoded JSON object of the first line that carries a
    complete, valid object after the marker, or None when there is none.

    The object is decoded by a JSON parser starting at its opening brace,
    so a "};" sequence inside a string value no longer truncates it.
    """
    marker = "ytplayer.config = {"
    decoder = json.JSONDecoder()

    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            pos = line.find(marker)
            if pos < 0:
                continue
            # The marker ends with the opening brace of the JSON object.
            start = pos + len(marker) - 1
            try:
                config, _end = decoder.raw_decode(line[start:])
            except ValueError:
                # Incomplete or malformed object on this line; keep looking.
                continue
            return config
    return None
133
def extract_js(script):
    """Return the body of the player script without its function wrapper.

    script -- JavaScript source expected to have the exact form
        "(function(){" + body + "})();\\n".

    Raises ValueError when the wrapper is missing.  This used to be an
    `assert`, which is skipped under `python -O` and would then return a
    wrongly sliced script.
    """
    PREFIX = "(function(){"
    SUFFIX = "})();\n"

    if not (script.startswith(PREFIX) and script.endswith(SUFFIX)):
        raise ValueError("Player script is not wrapped in (function(){ ... })();")

    return script[len(PREFIX):-len(SUFFIX)]
141
def find_func_name(script):
    """Return the name of the function the player applies to `<x>.s`.

    script -- JavaScript source of the player.

    The name is taken from the first statement of the form
    "NAME(<letters>.s);".  Raises ValueError when no such call exists
    (previously this surfaced as an AttributeError on None).
    """
    FUNC_NAME = r"([a-zA-Z0-9$]+)"
    FUNC_PARAMS = r"(\([a-zA-Z]+\.s\))"
    PATTERN = FUNC_NAME + FUNC_PARAMS + ";"

    match = re.search(PATTERN, script)
    if match is None:
        raise ValueError("Could not find the signature function in the player script")
    return match.group(1)
150
def decode_signature(js_url, signature):
    """Run the player's own JavaScript to transform an encoded signature.

    js_url -- URL of the player script.
    signature -- encoded signature string to transform.

    The script is downloaded, its signature function is located, and the
    code is executed by the external `nodejs` program; whatever the program
    prints is returned with surrounding whitespace stripped.

    Raises Exception when nodejs exits with a non-zero status.
    """
    res = urlopen(js_url)
    try:
        script = res.read()
    finally:
        res.close()
    func_name = find_func_name(script)

    params = {
        "func_name": func_name,
        "signature": json.dumps(signature),
        "code": json.dumps(extract_js(script)),
    }
    # NOTE(review): this executes JavaScript downloaded from the network.
    # The Node.js `vm` module isolates globals but is not a security
    # boundary -- confirm this risk is acceptable for the deployment.
    js_decode_script = ("""
        var vm = require('vm');

        var sandbox = {
            window: {
                location: {
                    hash: '',
                    href: ''
                },
                history: {
                    pushState: function(){}
                },
                navigator: {}
            },
            document: {},
            navigator: {},
            signature: %(signature)s,
            transformed_signature: null
        };

        var code_string = %(code)s + ';';
        var exec_string = 'transformed_signature = %(func_name)s(signature);';
        vm.runInNewContext(code_string + exec_string, sandbox);

        console.log(sandbox.transformed_signature);
    """ % params)

    # An argument list avoids an unnecessary intermediate shell.
    p = subprocess.Popen(
        ["nodejs"],
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE
    )
    # communicate() feeds stdin, drains stdout and waits for exit in one
    # step, so neither pipe can fill up and stall the child.
    output, _unused = p.communicate(js_decode_script)
    if p.returncode != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return output.strip()
202
def get_best_video(player_config):
    """Choose the best supported stream described by a player config.

    player_config -- decoded player configuration; the stream list is read
        from ["args"]["url_encoded_fmt_stream_map"] and, when a signature
        has to be decoded, the player script URL from ["assets"]["js"].

    Returns (url, extension) for the stream with the highest QUALITIES rank
    whose MIME type is in MIMETYPES, or (None, None) when no entry
    qualifies.  Entries flagged "stereo3d", or lacking a type, quality or
    url field, are ignored.  Among equally ranked entries the last one
    listed wins.
    """
    stream_map = player_config["args"]["url_encoded_fmt_stream_map"]

    best = None  # (rank, extension, parsed fields) of the best entry so far
    for raw_entry in stream_map.split(","):
        fields = urlparse.parse_qs(raw_entry)

        if "stereo3d" in fields:
            continue
        try:
            mimetype = fields["type"][0].split(";")[0]
            quality = fields["quality"][0]
            fields["url"][0]
        except KeyError:
            # An empty stream map yields one empty entry; skip anything
            # that does not describe a complete stream.
            continue
        if quality not in QUALITIES or mimetype not in MIMETYPES:
            continue

        rank = QUALITIES[quality]
        if best is None or rank >= best[0]:
            best = (rank, MIMETYPES[mimetype], fields)

    if best is None:
        return None, None

    _rank, extension, fields = best
    video_url = fields["url"][0]

    # The signature is resolved only for the winning entry: decoding an
    # "s" value downloads the player script and runs an external process.
    if "sig" in fields:
        signature = fields["sig"][0]
    elif "s" in fields:
        signature = decode_signature(
            player_config["assets"]["js"], fields["s"][0]
        )
    else:
        signature = None

    if signature:
        video_url = append_to_qs(video_url, {"signature": signature})

    return video_url, extension
244
def sanitize_filename(filename):
    """Return `filename` made safe to use as a single path component.

    NUL characters become spaces, "/" and "\\" become "-", every run of
    whitespace collapses to one space, and leading/trailing whitespace is
    removed.

    NUL is replaced before whitespace is collapsed so that the spaces it
    introduces cannot leave double, leading or trailing spaces behind.
    """
    cleaned = (
        filename
        .replace("\0", " ")
        .replace("\\", "-")
        .replace("/", "-")
    )
    return re.sub(r"\s+", " ", cleaned).strip()
252
def get_video_url(doc):
    """Work out the download URL and a local filename for a watch page.

    doc -- parsed watch page.

    Returns (video_url, filename), where the filename is the sanitized page
    title plus the stream's extension, or (None, None) when the player
    config lists no usable stream.

    Raises VideoUnavailable when the page carries an unavailable message or
    no player config can be found.
    """
    messages = doc.xpath("//div[@id='unavailable-message']/text()")
    if messages:
        raise VideoUnavailable(messages[0].strip())

    config = get_player_config(doc)
    if not config:
        raise VideoUnavailable("Could not find video URL")

    best_url, ext = get_best_video(config)
    if not best_url:
        return None, None

    # The page title doubles as the base name of the saved file.
    page_title = doc.xpath("/html/head/title/text()")[0]
    return best_url, sanitize_filename(page_title) + "." + ext
271
def write_video(filename, video_data):
    """Stream a video to stdout as a CGI attachment response.

    filename -- name offered to the browser; sent UTF-8 percent-encoded in
        the Content-Disposition header.
    video_data -- open HTTP response for the video; it is always closed
        before returning, including on error.

    Content-Type is forwarded from the upstream response (defaulting to
    application/octet-stream) because CGI requires it whenever a body is
    sent.  Content-Length is forwarded only when upstream supplied one;
    previously a missing header produced "Content-Length: None".
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))
        content_type = httpinfo.getheader("Content-Type") or "application/octet-stream"
        content_length = httpinfo.getheader("Content-Length")

        sys.stdout.write("Content-Type: %s\r\n" % content_type)
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)
        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
280
def cgimain():
    """CGI entry point: show the form, or stream the requested video.

    Without a "url" request parameter the empty form is shown.  Otherwise
    the page at that URL is fetched, the best stream is located and it is
    sent to the client as an attachment.  Any failure re-displays the form
    with an error message.
    """
    generic_error = "<p class='error'>Sorry, there was an error. Check your URL?</p>"

    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    # The URL comes straight from the client.  Only web URLs are fetched so
    # the opener's other handlers (e.g. file://) cannot be reached.
    if not url.startswith(("http://", "https://", "//")):
        print_form(url=url, msg=generic_error)
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        if not video_url:
            # get_video_url() returns (None, None) when no stream is usable.
            raise VideoUnavailable("No downloadable video format was found")
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception:
        # Keep the user-facing message generic, but leave the details on
        # stderr instead of discarding them.
        import traceback
        traceback.print_exc()
        print_form(url=url, msg=generic_error)
    return
305
def pp_size(size):
    """Format a byte count for display with a binary unit suffix.

    size -- number of bytes (int or float).

    Returns a string such as "1.50 KiB"; values below 1024 get an empty
    suffix.  GiB is the largest unit, so bigger values stay in GiB (e.g.
    "1024.00 GiB") rather than being divided once more and mislabelled.
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    # float() keeps the division exact even without the file-level
    # `from __future__ import division`.
    size = float(size)
    for suffix in suffixes[:-1]:
        if size < 1024:
            return "%.2f %s" % (size, suffix)
        size /= 1024
    return "%.2f %s" % (size, suffixes[-1])
313
def copy_with_progress(content_length, infile, outfile):
    """Copy `infile` to `outfile` while showing progress on stdout.

    content_length -- size shown as the total in the status line.
    infile -- object with read(n); copying stops at the first empty read.
    outfile -- object with write(data).

    The status line (bytes copied, total, rate since the last refresh) is
    redrawn in place at most about twice per second, once more after the
    copy ends, and is then terminated with a newline.
    """
    CHUNK_SIZE = 32768
    REFRESH_SECONDS = 0.5

    def report(copied, window_bytes, window_start, current_time):
        # Rate over the current refresh window; zero when no time elapsed.
        elapsed = current_time - window_start
        speed = window_bytes / elapsed if elapsed != 0 else 0
        # ESC[2K clears the terminal line, "\r" returns to column 0.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(copied),
            pp_size(content_length),
            pp_size(speed),
        ))
        sys.stdout.flush()

    window_start = 0
    window_bytes = 0
    copied = 0
    while True:
        current_time = time.time()
        if current_time - window_start > REFRESH_SECONDS:
            report(copied, window_bytes, window_start, current_time)
            window_start, window_bytes = current_time, 0

        chunk = infile.read(CHUNK_SIZE)
        if not chunk:
            break
        outfile.write(chunk)
        chunk_len = len(chunk)
        window_bytes += chunk_len
        copied += chunk_len

    # Final status, then end the in-place status line.
    report(copied, window_bytes, window_start, current_time)
    sys.stdout.write("\n")
347
def main():
    """Command-line entry point: download one video into the current directory.

    Usage: <script> http://youtube.com/watch?v=FOOBAR

    The video is appended to a file named after the page title, so an
    interrupted download resumes from the bytes already on disk.  The
    transfer is retried from the current position until the file reaches
    its expected size or the server reports (HTTP 416) that nothing is left.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        # get_video_url() returns (None, None) when no stream is usable.
        sys.stderr.write("No downloadable video format was found\n")
        sys.exit(1)
    print("Downloading %s" % filename.encode("utf-8"))

    # Binary append: the payload is video data, not text.
    outfile = open(filename, "ab")
    try:
        # Position reporting right after opening in append mode is
        # platform-dependent; seek to the end so tell() is the file size.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            print("Resuming download from %s" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                if e.code == 416:
                    print("File is complete!")
                    break
                raise

            content_length = int(video_data.info().getheader("Content-Length"))
            if total_size is None:
                # Content-Length of a ranged response counts only the
                # remaining bytes, so the expected final size includes the
                # part already on disk.
                total_size = offset + content_length

            try:
                copy_with_progress(content_length, video_data, outfile)
            except IOError:
                # Connection dropped mid-transfer: end the status line and
                # fall through to the retry logic below.
                print("")
            finally:
                video_data.close()

            if outfile.tell() == total_size:
                break
            old_offset = offset
            offset = outfile.tell()
            if old_offset == offset:
                # No progress on this attempt; pause before retrying.
                time.sleep(1)
            print("Restarting download from %s" % pp_size(offset))
    finally:
        outfile.close()
395
396
if __name__ == "__main__":
    # Memory cap left disabled, as in the original:
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))

    # SCRIPT_NAME in the environment means the script was started as a CGI
    # program; otherwise behave as a command-line tool.
    if "SCRIPT_NAME" in os.environ:
        cgimain()
    else:
        try:
            main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(1)
407