]> code.delx.au - youtube-cgi/blob - youtube.cgi
Fix to handle function call in expression instead of statement
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/python2
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Memory ceiling in bytes (128 MiB).
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# Browser identification sent with every outgoing HTTP request.
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"

# Container types we are willing to download, mapped to the file extension
# used for the saved video.
MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

# Quality labels mapped to a rank; a larger rank is preferred.
QUALITIES = {
    "hd1080": 5,
    "hd720": 4,
    "large": 3,
    "medium": 2,
    "small": 1,
}
37
38
class VideoUnavailable(Exception):
    """Raised when a page reports the video as unavailable or when no
    player configuration can be found in it."""
41
def print_form(url="", msg=""):
    """Write the XHTML download form to stdout as a complete CGI response.

    url -- text pre-filled into the URL input box.  It is escaped before
           being placed in the attribute, so it may contain any characters
           (YouTube URLs routinely contain "&", which would otherwise make
           the XML page ill-formed).
    msg -- XHTML markup inserted verbatim below the heading.  Callers are
           responsible for escaping any untrusted text inside it.

    Reads HTTP_HOST and REQUEST_URI from the environment to build the
    address that the bookmarklet redirects to.
    """
    # Local import: the file does not import this module at the top.
    from xml.sax.saxutils import escape

    # REQUEST_URI normally includes the query string; keep only the path so
    # the bookmarklet does not end up with two "?url=" parts.
    script_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], script_path)

    quote_entity = {'"': "&quot;"}
    safe_url = escape(url, quote_entity)

    # The script address ends up inside a javascript: URL inside an XML
    # attribute, so it needs three layers of quoting, innermost first:
    #   1. a JavaScript string literal (json.dumps),
    #   2. "%" doubled up, because browsers percent-decode javascript: URLs
    #      before running them,
    #   3. XML attribute escaping.
    js_literal = json.dumps(script_url).replace("%", "%25")
    safe_script_url = escape(js_literal, quote_entity)

    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location={2}+'?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""
    # Substitute all placeholders in a single pass so that a placeholder-like
    # sequence inside one inserted value is never substituted again.
    values = {"{0}": msg, "{1}": safe_url, "{2}": safe_script_url}
    page = re.sub(r"\{[012]\}", lambda match: values[match.group(0)], template)

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    sys.stdout.write(page)
76
# Referer value for outgoing requests; urlopen() fills it in with the first
# URL it is asked to fetch.
referrer = ""

# One cookie jar shared by every request made through urlopener.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cookiejar)
)
80
def urlopen(url, offset=None):
    """Open *url* with the shared cookie-aware opener and return the response.

    url    -- absolute or protocol-relative ("//host/path") URL; the latter
              is fetched over plain http.
    offset -- optional byte position to resume from.  When truthy, a
              "Range: bytes=<offset>-" header is sent.

    The first URL ever requested is remembered in the module-level
    ``referrer`` and sent as the Referer header on every later request.

    Raises ValueError (after closing the response) if the server answers
    with a Content-Range header that is malformed or does not start at the
    requested offset; appending such a body to a partial download would
    corrupt the file.  Errors from the opener itself propagate unchanged.
    """
    global referrer

    if url.startswith("//"):
        url = "http:" + url

    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    else:
        referrer = url
    req.add_header("User-Agent", USER_AGENT)
    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    content_range = res.info().getheader("Content-Range")
    if content_range:
        # Expected form: "bytes <start>-<end>/<total>".
        # No offset requested means we expect the body to start at byte 0.
        expected_start = offset or 0
        try:
            unit, byte_range = content_range.split()[:2]
            start = int(byte_range.split("-")[0])
        except ValueError:
            res.close()
            raise ValueError("Malformed Content-Range header: %r" % content_range)
        if unit != "bytes" or start != expected_start:
            res.close()
            raise ValueError(
                "Unexpected Content-Range %r (wanted data from byte %d)"
                % (content_range, expected_start)
            )
    return res
106
def parse_url(url):
    """Fetch *url* and return it parsed as an lxml HTML document.

    The page is decoded as UTF-8 with the parser's recovery mode enabled.
    The HTTP response is closed even if parsing fails.
    """
    f = urlopen(url)
    try:
        return html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    finally:
        f.close()
112
def append_to_qs(url, params):
    """Return *url* with the entries of *params* merged into its query string.

    url    -- the URL to extend.
    params -- mapping of parameter name to value (or list of values).  A
              name already present in the URL is replaced.

    Existing parameters with blank values (e.g. "?a=&b=1") are preserved
    rather than silently dropped.
    """
    parts = list(urlparse.urlsplit(url))
    # Index 3 of the split result is the query string.
    query = urlparse.parse_qs(parts[3], keep_blank_values=True)
    query.update(params)
    # doseq=True: parse_qs yields lists, emit one pair per list element.
    parts[3] = urllib.urlencode(query, True)
    return urlparse.urlunsplit(parts)
120
def get_player_config(doc):
    """Return the decoded ``ytplayer.config`` object found in *doc*, or None.

    doc -- parsed document offering ``xpath()``; every <script> element's
           text is scanned line by line for "ytplayer.config = {".

    The JSON value is decoded starting at its opening brace and ends
    wherever the JSON object itself ends, so a "};" occurring inside a
    string value does not truncate it.  Lines whose config cannot be decoded
    are skipped.
    """
    marker = "ytplayer.config = {"
    decoder = json.JSONDecoder()
    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            marker_pos = line.find(marker)
            if marker_pos < 0:
                continue
            # The marker's final character is the opening brace of the JSON.
            json_start = marker_pos + len(marker) - 1
            try:
                config, _end = decoder.raw_decode(line, json_start)
            except ValueError:
                continue
            return config
    return None
133
def extract_js(script):
    """Return the body of the player script without its wrapper.

    script -- full player source, expected to start with the _yt_player
              closure header and end with its invocation.

    Raises ValueError if the wrapper is not the expected one.  (A plain
    ``assert`` would vanish under ``python -O`` and let a wrongly sliced
    script through.)
    """
    PREFIX = "var _yt_player={};(function(g){var window=this;"
    SUFFIX = ";})(_yt_player);\n"
    if not script.startswith(PREFIX):
        raise ValueError("Player script does not start with the expected wrapper")
    if not script.endswith(SUFFIX):
        raise ValueError("Player script does not end with the expected wrapper")

    return script[len(PREFIX):-len(SUFFIX)]
141
def find_func_name(script):
    """Return the name of the function that is called with ``<obj>.s``.

    The first occurrence of ``NAME(x.s)`` followed by "," or ";" is used,
    where NAME consists of letters, digits and "$".

    Raises ValueError if the script contains no such call.
    """
    FUNC_NAME = r"([a-zA-Z0-9$]+)"
    FUNC_PARAMS = r"(\([a-zA-Z]+\.s\))"
    TERMINATOR = r"[,;]"
    PATTERN = FUNC_NAME + FUNC_PARAMS + TERMINATOR

    match = re.search(PATTERN, script)
    if match is None:
        raise ValueError("Could not find the signature function in the player script")
    return match.group(1)
151
def decode_signature(js_url, signature):
    """Return *signature* transformed by the player's own JavaScript.

    js_url    -- URL of the player script; it is downloaded on every call.
    signature -- the scrambled "s" value from the stream description.

    The player code is run by a ``nodejs`` child process inside a stub
    browser environment, and the function located by find_func_name() is
    applied to the signature.

    Raises Exception if nodejs exits with a non-zero status, OSError if the
    nodejs binary cannot be started, and whatever extract_js() or
    find_func_name() raise for an unrecognised script.
    """
    res = urlopen(js_url)
    try:
        script = res.read()
    finally:
        res.close()
    func_name = find_func_name(script)

    # json.dumps turns both values into valid JavaScript string literals.
    params = {
        "func_name": func_name,
        "signature": json.dumps(signature),
        "code": json.dumps(extract_js(script)),
    }
    # NOTE(review): this executes JavaScript fetched from the network.
    # Confirm that Node's vm context is an acceptable isolation boundary
    # for this deployment.
    js_decode_script = ("""
var vm = require('vm');

var sandbox = {
location: {
hash: '',
href: '',
protocol: 'http:'
},
history: {
pushState: function(){}
},
document: {},
navigator: {
userAgent: ''
},
signature: %(signature)s,
transformed_signature: null,
g: function(){} // this is _yt_player
};
sandbox.window = sandbox;

var code_string = %(code)s + ';';
var exec_string = 'transformed_signature = %(func_name)s(signature);';
vm.runInNewContext(code_string + exec_string, sandbox);

console.log(sandbox.transformed_signature);
""" % params)

    # No shell is needed to start a single fixed program.
    p = subprocess.Popen(
        ["nodejs"],
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    # communicate() feeds stdin, drains stdout and waits for exit in one go.
    stdout_data, _unused = p.communicate(js_decode_script)
    if p.returncode != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return stdout_data.strip()
205
def get_best_video(player_config):
    """Pick the highest-quality downloadable stream from *player_config*.

    player_config -- decoded player configuration; reads
                     ["args"]["url_encoded_fmt_stream_map"] (comma-separated,
                     query-string-encoded stream descriptions) and
                     ["assets"]["js"] (player script URL).

    Returns (url, extension), or (None, None) when no stream qualifies.
    Streams are skipped when they are stereo 3D, have a quality or MIME type
    not listed in QUALITIES / MIMETYPES, or lack a type, quality or url
    field (an empty stream map produces one such empty entry).  On a
    quality tie the later entry wins.

    A "sig" field is appended to the URL as-is; an "s" field is first run
    through decode_signature().
    """
    url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")
    js_url = player_config["assets"]["js"]

    best_url = None
    best_quality = None
    best_extension = None
    for url_data in url_data_list:
        url_data = urlparse.parse_qs(url_data)

        # Ignore empty or malformed entries instead of failing on them.
        if "type" not in url_data or "quality" not in url_data or "url" not in url_data:
            continue
        if "stereo3d" in url_data:
            continue

        mimetype = url_data["type"][0].split(";")[0]
        quality_name = url_data["quality"][0]
        if quality_name not in QUALITIES or mimetype not in MIMETYPES:
            continue

        quality = QUALITIES[quality_name]
        if best_quality is not None and quality < best_quality:
            continue

        video_url = url_data["url"][0]
        if "sig" in url_data:
            signature = url_data["sig"][0]
        elif "s" in url_data:
            signature = decode_signature(js_url, url_data["s"][0])
        else:
            signature = None

        if signature:
            video_url = append_to_qs(video_url, {"signature": signature})

        best_url = video_url
        best_quality = quality
        best_extension = MIMETYPES[mimetype]

    return best_url, best_extension
247
def sanitize_filename(filename):
    """Return *filename* made safe to use as a single path component.

    NUL characters become spaces, leading and trailing whitespace is
    removed, every run of whitespace collapses to one space, and both "/"
    and "\\" become "-".

    NULs are replaced before the whitespace pass so the spaces they turn
    into are trimmed and collapsed like any others.
    """
    cleaned = filename.replace("\0", " ")
    cleaned = re.sub(r"\s+", " ", cleaned.strip())
    return cleaned.replace("\\", "-").replace("/", "-")
255
def get_video_url(doc):
    """Work out the download URL and a local file name for a watch page.

    doc -- parsed watch page offering ``xpath()``.

    Returns (video_url, filename).  The file name is the sanitized page
    title plus the extension of the chosen stream.  Returns (None, None)
    when the player configuration offers no usable stream.

    Raises VideoUnavailable when the page contains an "unavailable-message"
    element with text, or when no player configuration is found.
    """
    notices = doc.xpath("//div[@id='unavailable-message']/text()")
    if notices:
        raise VideoUnavailable(notices[0].strip())

    config = get_player_config(doc)
    if not config:
        raise VideoUnavailable("Could not find video URL")

    best_url, ext = get_best_video(config)
    if not best_url:
        return None, None

    page_title = doc.xpath("/html/head/title/text()")[0]
    return best_url, sanitize_filename(page_title) + "." + ext
274
def write_video(filename, video_data):
    """Stream *video_data* to stdout as a CGI attachment response.

    filename   -- suggested download name; sent UTF-8 percent-encoded in the
                  Content-Disposition header.
    video_data -- open HTTP response offering info(), read() and close().

    Content-Length is forwarded only when the upstream response supplied
    one.  *video_data* is closed even if writing fails part-way.
    """
    try:
        encoded_filename = urllib.quote(filename.encode("utf-8"))
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
        content_length = video_data.info().getheader("Content-Length")
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)
        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
283
def cgimain():
    """Handle one CGI request.

    Without a "url" query parameter the empty form is shown.  Otherwise the
    page is fetched and the video is streamed back as an attachment.  On
    failure the form is shown again with an error message: the specific
    reason for VideoUnavailable, a generic hint for anything else.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        if not video_url:
            # get_video_url() signals "no usable stream" with (None, None).
            raise VideoUnavailable("No downloadable format was found")
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.args[0])
        )
    except Exception:
        # Top-level boundary: never let a traceback become the response.
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
308
def pp_size(size):
    """Format a byte count as text with two decimals and a binary suffix.

    Values below 1024 get an empty suffix (so the result ends in a space);
    larger values are scaled to KiB, MiB or GiB.  GiB is the largest unit:
    anything bigger is expressed as a larger number of GiB instead of being
    divided once more and mislabelled.
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    last = len(suffixes) - 1
    for i, suffix in enumerate(suffixes):
        if size < 1024 or i == last:
            break
        # Float divisor keeps the fractional part for integer inputs.
        size /= 1024.0
    return "%.2f %s" % (size, suffix)
316
def copy_with_progress(content_length, infile, outfile):
    """Copy *infile* to *outfile* in 32 KiB chunks with a status line.

    content_length -- size shown as the target in the status line.
    infile         -- object with read(size); an empty result ends the copy.
    outfile        -- object with write(data).

    The status line on stdout is redrawn when more than half a second has
    passed since the previous redraw, and once more when the copy ends,
    followed by a newline.
    """
    def report(elapsed, window_bytes, total_bytes):
        # Rate covers only the bytes copied since the previous redraw.
        if elapsed != 0:
            rate = window_bytes / elapsed
        else:
            rate = 0
        status = "%s / %s (%s/sec)" % (
            pp_size(total_bytes),
            pp_size(content_length),
            pp_size(rate),
        )
        # Escape sequence followed by carriage return, then the new status.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write(status)
        sys.stdout.flush()

    window_start = 0
    window_bytes = 0
    total_bytes = 0
    while True:
        now = time.time()
        if now - window_start > 0.5:
            report(now - window_start, window_bytes, total_bytes)
            window_start = now
            window_bytes = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        copied = len(chunk)
        window_bytes += copied
        total_bytes += copied

    # Final status, then move off the status line.
    report(now - window_start, window_bytes, total_bytes)
    print("")
350
def main():
    """Command-line entry point: download the video at ``sys.argv[1]``.

    The file is named after the page title and opened for appending, so an
    interrupted download is resumed from the current file size.  The
    transfer is retried from wherever it stopped until the file has the
    expected size or the server answers 416 (nothing left to send).

    Exits with status 1 when no URL is given or no stream can be found.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        # get_video_url() signals "no usable stream" with (None, None).
        sys.stderr.write("No downloadable format was found\n")
        sys.exit(1)
    print("Downloading %s" % filename.encode("utf-8"))

    # Binary append: video data must not go through newline translation.
    outfile = open(filename, "ab")
    try:
        # Seek explicitly; NOTE(review): presumably the initial position of
        # an append-mode file is not reliably end-of-file on every platform.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            print("Resuming download from %s" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                if e.code == 416:
                    print("File is complete!")
                    break
                raise

            content_length = int(video_data.info().getheader("Content-Length"))
            if total_size is None:
                # Content-Length of a ranged reply is only the remainder,
                # so the full size is what we already have plus that.
                total_size = offset + content_length

            try:
                copy_with_progress(content_length, video_data, outfile)
            except IOError:
                # Transfer broke mid-way: end the status line and retry below.
                print("")

            video_data.close()
            if outfile.tell() == total_size:
                break

            old_offset = offset
            offset = outfile.tell()
            if old_offset == offset:
                # No progress at all; avoid hammering the server.
                time.sleep(1)
            print("Restarting download from %s" % pp_size(offset))
    finally:
        outfile.close()
398
399
if __name__ == "__main__":
    # The address-space limit below is currently disabled.
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
    # SCRIPT_NAME in the environment means we were started as a CGI script.
    if "SCRIPT_NAME" not in os.environ:
        try:
            main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(1)
    else:
        cgimain()
410