]> code.delx.au - youtube-cgi/blob - youtube.cgi
use nodejs sandboxing
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/python2
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Memory ceiling in bytes (128 MiB). The only visible use is the commented-out
# resource.setrlimit(RLIMIT_AS, ...) call in the __main__ block.
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# User-Agent header value sent with every outgoing HTTP request.
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"

# Stream mimetypes that may be downloaded, mapped to the file extension used
# for the saved video. Streams with any other mimetype are skipped.
MIMETYPES = dict([
    ("video/mp4", "mp4"),
    ("video/x-flv", "flv"),
    ("video/3gpp", "3gp"),
])

# Known quality labels mapped to a rank; a larger rank is preferred when
# choosing between streams. Streams with an unknown label are skipped.
QUALITIES = dict(
    (label, rank)
    for rank, label in enumerate(
        ["small", "medium", "large", "hd720", "hd1080"], 1
    )
)
37
38
class VideoUnavailable(Exception):
    """Raised when a page yields no downloadable video.

    The exception's single argument is a human-readable reason that the CGI
    front end shows to the user.
    """
41
def print_form(url="", msg=""):
    """Write the download form to stdout as a complete CGI response.

    url -- text pre-filled into the URL input box. It is escaped here because
           it is normally echoed straight back from the request.
    msg -- ready-made XHTML fragment inserted above the form. It is inserted
           verbatim, so callers must escape any untrusted text inside it.

    Reads HTTP_HOST and REQUEST_URI from the environment and raises KeyError
    if either is missing.
    """
    def escape_attr(text):
        # The page is served as application/xhtml+xml, so a raw '&', '<' or
        # '"' in an attribute value makes the whole document ill-formed.
        # '&' must be replaced first so later entities are not re-escaped.
        return (
            text.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace('"', "&quot;")
        )

    # REQUEST_URI includes the query string; drop it, otherwise the
    # bookmarklet would build "...?url=OLD?url=NEW".
    script_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], script_path)
    # The URL sits inside a single-quoted JavaScript string in the bookmarklet.
    script_url = script_url.replace("'", "%27")

    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""
    substitutions = {
        "{0}": msg,
        "{1}": escape_attr(url),
        "{2}": escape_attr(script_url),
    }
    # Substitute in a single pass so a placeholder-like sequence inside one
    # inserted value is never expanded by a later replacement.
    page = re.sub(
        r"\{[012]\}",
        lambda match: substitutions[match.group(0)],
        template,
    )

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    sys.stdout.write(page)
76
# HTTP state shared by every urlopen() call in this process: one cookie jar
# and one opener that sends and stores cookies through it.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
# URL of the first request made; sent as the Referer header of later requests.
referrer = ""


def urlopen(url, offset=None):
    """Open url with the shared cookie-aware opener and return the response.

    url    -- absolute URL; a scheme-relative "//host/path" is fetched over
              http.
    offset -- when truthy, request the resource from this byte position
              onwards using a Range header.

    Raises IOError if the response carries a Content-Range header that is
    malformed or does not start at the requested offset. Errors raised by the
    opener itself (for example urllib2.HTTPError) propagate unchanged.
    """
    global referrer

    if url.startswith("//"):
        url = "http:" + url

    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    else:
        # First request of the process: remember it, send no Referer.
        referrer = url
    req.add_header("User-Agent", USER_AGENT)
    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    content_range = res.info().getheader("Content-Range")
    if content_range:
        # Expected form: "bytes <start>-<end>/<total>".
        tokens = content_range.split()
        try:
            unit = tokens[0]
            start = int(tokens[1].split("-")[0])
        except (IndexError, ValueError):
            unit = None
            start = None
        # Raise explicitly instead of asserting: asserts vanish under -O, and
        # appending data from the wrong position would corrupt the download.
        # A missing offset means the request started at byte 0.
        if unit != "bytes" or start != (offset or 0):
            res.close()
            raise IOError(
                "Unexpected Content-Range %r for offset %r"
                % (content_range, offset)
            )
    return res
106
def parse_url(url):
    """Fetch url and return it parsed as an lxml HTML document tree.

    The page is decoded as UTF-8 with the parser's error recovery enabled.
    """
    f = urlopen(url)
    try:
        # Close the response even when parsing fails, so the connection is
        # not leaked on the error path.
        return html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    finally:
        f.close()
112
def append_to_qs(url, params):
    """Return url with params merged into its query string.

    params -- dict mapping parameter name to a value or a list of values. A
              name already present in url is replaced by the new value.

    The query string is rebuilt from a dict, so parameter order is not
    preserved.
    """
    parts = list(urlparse.urlsplit(url))
    # keep_blank_values: by default parse_qs drops "name=" pairs, which would
    # silently remove parameters that were already part of the URL.
    qs = urlparse.parse_qs(parts[3], keep_blank_values=True)
    qs.update(params)
    # doseq=True expands each list of values into repeated name=value pairs.
    parts[3] = urllib.urlencode(qs, True)
    return urlparse.urlunsplit(parts)
120
def get_player_config(doc):
    """Return the decoded "ytplayer.config = {...}" object from a page.

    doc -- parsed document supporting xpath(); every <script> element's text
           is searched line by line.

    Returns the decoded JSON object, or None when no script line contains a
    decodable config assignment.
    """
    marker = "ytplayer.config = {"
    decoder = json.JSONDecoder()
    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            pos = line.find(marker)
            if pos < 0:
                continue
            # The JSON value starts at the "{" that ends the marker.
            start = pos + len(marker) - 1
            try:
                # raw_decode stops at the end of the first complete JSON
                # value. Cutting at the first "};" instead breaks when that
                # sequence occurs inside a string value.
                config, _end = decoder.raw_decode(line[start:])
            except ValueError:
                # Not valid JSON on this line; keep looking.
                continue
            return config
    return None
133
def extract_js(script):
    """Return the body of a script wrapped as "(function(){ ... })();\\n".

    Raises ValueError when script does not have exactly that wrapper,
    including the trailing newline.
    """
    PREFIX = "(function(){"
    SUFFIX = "})();\n"
    # The script comes from the network, so validate it with a real exception
    # rather than assert, which is stripped when Python runs with -O.
    if not script.startswith(PREFIX):
        raise ValueError("player script does not start with %r" % PREFIX)
    if not script.endswith(SUFFIX):
        raise ValueError("player script does not end with %r" % SUFFIX)

    return script[len(PREFIX):-len(SUFFIX)]
141
def find_func_name(script):
    """Return the name of the function called as "name(x.s);" in script.

    The first call whose single argument is the ".s" attribute of an
    alphabetic identifier is used.

    Raises ValueError when script contains no such call.
    """
    FUNC_NAME = R"([a-zA-Z0-9$]+)"
    FUNC_PARAMS = R"(\([a-zA-Z]+\.s\))"
    PATTERN = FUNC_NAME + FUNC_PARAMS + ";"

    match = re.search(PATTERN, script)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("could not find the signature function call")
    return match.group(1)
150
def decode_signature(js_url, signature):
    """Transform an encoded stream signature using the page's player script.

    js_url    -- URL of the player JavaScript; it is downloaded on every call.
    signature -- the encoded signature string.

    The downloaded script is run by the external "js" command, with the
    signature passed through the function located by find_func_name().
    Returns the command's standard output, stripped.

    Raises Exception if the command exits non-zero, and OSError if the
    command cannot be started.
    """
    response = urlopen(js_url)
    try:
        script = response.read()
    finally:
        response.close()
    func_name = find_func_name(script)

    # json.dumps turns the untrusted strings into quoted JavaScript literals.
    # func_name is inserted raw, which relies on find_func_name() matching
    # only [a-zA-Z0-9$] characters.
    params = {
        "func_name": func_name,
        "signature": json.dumps(signature),
        "code": json.dumps(extract_js(script)),
    }
    # NOTE(review): the script uses require("vm"), so "js" is presumably
    # Node.js. Node documents its vm module as not being a security
    # mechanism; confirm this isolation is adequate for downloaded code.
    js_decode_script = ("""
var vm = require("vm");

var sandbox = {
window: {
location: {}
},
document: {},
navigator: {},
signature: %(signature)s,
transformed_signature: null
};

var execstring = ";transformed_signature = %(func_name)s(signature);";
vm.runInNewContext(%(code)s + execstring, sandbox);

console.log(sandbox.transformed_signature);
""" % params)

    # Argument list instead of shell=True: nothing here needs a shell.
    p = subprocess.Popen(
        ["js"],
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE
    )
    # communicate() feeds stdin, drains stdout and waits, without the
    # deadlock risk of handling the two pipes by hand.
    output, _unused = p.communicate(js_decode_script)
    if p.returncode != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return output.strip()
194
def get_best_video(player_config):
    """Pick the highest-ranked downloadable stream from a player config.

    player_config -- decoded config object; its
                     ["args"]["url_encoded_fmt_stream_map"] entry is a
                     comma-separated list of URL-encoded stream descriptions
                     and ["assets"]["js"] is the player script URL.

    Streams flagged stereo3d, or whose quality or mimetype is not listed in
    QUALITIES / MIMETYPES, are ignored. Among equally ranked streams the last
    one wins.

    Returns (url, extension), or (None, None) if no stream qualifies.
    """
    stream_map = player_config["args"]["url_encoded_fmt_stream_map"]
    js_url = player_config["assets"]["js"]

    chosen = (None, None)
    top_rank = None
    for encoded in stream_map.split(","):
        fields = urlparse.parse_qs(encoded)
        mimetype = fields["type"][0].split(";")[0]
        quality_label = fields["quality"][0]

        usable = (
            "stereo3d" not in fields
            and quality_label in QUALITIES
            and mimetype in MIMETYPES
        )
        if not usable:
            continue

        rank = QUALITIES[quality_label]
        if top_rank is not None and rank < top_rank:
            continue

        video_url = fields["url"][0]
        if "sig" in fields:
            # Signature supplied in the clear.
            signature = fields["sig"][0]
        elif "s" in fields:
            # Encoded signature: needs the player script to transform it.
            signature = decode_signature(js_url, fields["s"][0])
        else:
            signature = None

        if signature:
            video_url = append_to_qs(video_url, {"signature": signature})

        chosen = (video_url, MIMETYPES[mimetype])
        top_rank = rank

    return chosen
236
def sanitize_filename(filename):
    """Return filename made safe to use as a single path component.

    Surrounding whitespace is stripped, each run of whitespace becomes one
    space, backslashes and slashes become "-", and NUL characters become
    spaces.
    """
    cleaned = re.sub(r"\s+", " ", filename.strip())
    for unsafe, replacement in (("\\", "-"), ("/", "-"), ("\0", " ")):
        cleaned = cleaned.replace(unsafe, replacement)
    return cleaned
244
def get_video_url(doc):
    """Work out the download URL and target filename for a parsed watch page.

    Returns (video_url, filename), where filename is the sanitized page title
    plus the stream's extension. Returns (None, None) when the player config
    contains no acceptable stream.

    Raises VideoUnavailable when the page shows an unavailable message or has
    no player config.
    """
    notices = doc.xpath("//div[@id='unavailable-message']/text()")
    if unavailable_text(notices) is not None:
        raise VideoUnavailable(unavailable_text(notices))

    player_config = get_player_config(doc)
    if not player_config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(player_config)
    if not video_url:
        return None, None

    page_title = doc.xpath("/html/head/title/text()")[0]
    return video_url, sanitize_filename(page_title) + "." + extension


def unavailable_text(notices):
    """Return the stripped first notice, or None when there are none."""
    if not notices:
        return None
    return notices[0].strip()
263
def write_video(filename, video_data):
    """Stream video_data to stdout as a CGI file-download response.

    filename   -- name offered to the browser, sent UTF-8 percent-encoded in
                  the filename* parameter of Content-Disposition.
    video_data -- open HTTP response; its body is copied to stdout and it is
                  always closed before returning, even on error.
    """
    try:
        content_length = video_data.info().getheader("Content-Length")
        encoded_filename = urllib.quote(filename.encode("utf-8"))
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
        # Upstream may send no length; omit the header in that case rather
        # than emitting the literal text "None".
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)
        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
272
def cgimain():
    """CGI entry point: stream the requested video, or show the form.

    With no "url" request parameter the empty form is shown. Otherwise the
    page is fetched and the best stream is relayed to the client. Any failure
    re-displays the form with an error message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        # No URL submitted yet: show the form with an example value.
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        if not video_url:
            # get_video_url() returns (None, None) when no stream is
            # acceptable; report that instead of failing on None below.
            raise VideoUnavailable("No downloadable format was found")
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        # args[0] is the reason passed to the exception; BaseException.message
        # is deprecated.
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception:
        # Deliberate catch-all at the request boundary: the user gets a
        # generic message rather than a server error page.
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
297
def pp_size(size):
    """Format a byte count with a binary unit suffix, e.g. "1.50 KiB".

    Values below 1024 get an empty suffix, so the result ends with a space.
    Values of 1024 GiB and above are still expressed in GiB.
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    last = len(suffixes) - 1
    for i, suffix in enumerate(suffixes):
        # Stop at the largest unit: dividing again would shrink the number
        # while the label stayed "GiB".
        if size < 1024 or i == last:
            break
        # Float divisor keeps this correct without relying on
        # "from __future__ import division".
        size /= 1024.0
    return "%.2f %s" % (size, suffix)
305
def copy_with_progress(content_length, infile, outfile):
    """Copy infile to outfile while showing a progress line on stdout.

    content_length -- expected number of bytes, used only for display.
    infile         -- object with read(n); copying stops at the first empty
                      read.
    outfile        -- object with write(data).

    The status line is redrawn in place when more than half a second has
    passed since the last redraw, and once more at the end, followed by a
    newline.
    """
    CHUNK_SIZE = 32768
    REFRESH_SECONDS = 0.5

    def show_status(now, window_start, window_bytes, total_bytes):
        # Rate covers only the bytes copied since the previous redraw.
        if now != window_start:
            rate = window_bytes / (now - window_start)
        else:
            rate = 0
        # ESC[2K erases the current terminal line; \r returns to column 0.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(total_bytes),
            pp_size(content_length),
            pp_size(rate),
        ))
        sys.stdout.flush()

    window_start = 0
    window_bytes = 0
    total_bytes = 0
    while True:
        now = time.time()
        if now - window_start > REFRESH_SECONDS:
            show_status(now, window_start, window_bytes, total_bytes)
            window_start = now
            window_bytes = 0

        chunk = infile.read(CHUNK_SIZE)
        if not chunk:
            break
        outfile.write(chunk)
        window_bytes += len(chunk)
        total_bytes += len(chunk)

    # Final totals, then move off the status line.
    show_status(now, window_start, window_bytes, total_bytes)
    sys.stdout.write("\n")
339
def main():
    """Command-line entry point: download the video at sys.argv[1].

    The video is appended to a file named after the page title in the current
    directory, so an existing partial file is resumed. Interrupted transfers
    are retried from the current file position until the file is complete.

    Exits with status 1 when no URL argument is given or when the page offers
    no downloadable format.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        # get_video_url() returns (None, None) when no stream is acceptable.
        sys.stderr.write("No downloadable format was found\n")
        sys.exit(1)
    sys.stdout.write("Downloading %s\n" % filename.encode("utf-8"))

    # Binary append: video data must not go through newline translation.
    outfile = open(filename, "ab")
    try:
        # The position right after opening in append mode is not reliably
        # the end of the file, so seek there explicitly before reading it.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            sys.stdout.write("Resuming download from %s\n" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                if e.code == 416:
                    # Requested range starts at or past the end of the
                    # resource: nothing is left to fetch.
                    sys.stdout.write("File is complete!\n")
                    break
                raise

            content_length = int(video_data.info().getheader("Content-Length"))
            if total_size is None:
                # For a ranged request Content-Length is only the remaining
                # part, so the full size includes what is already on disk.
                total_size = offset + content_length

            try:
                copy_with_progress(content_length, video_data, outfile)
            except IOError:
                # Best effort: a broken transfer is retried below from the
                # current file position.
                sys.stdout.write("\n")
            finally:
                video_data.close()

            position = outfile.tell()
            if position == total_size:
                break
            if position == offset:
                # No progress in this attempt; pause before retrying.
                time.sleep(1)
            offset = position
            sys.stdout.write("Restarting download from %s\n" % pp_size(offset))
    finally:
        outfile.close()
387
388
if __name__ == "__main__":
    # Memory limit is currently disabled:
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
    # SCRIPT_NAME in the environment selects CGI mode; otherwise the script
    # runs as a command-line downloader.
    if "SCRIPT_NAME" in os.environ:
        cgimain()
    else:
        try:
            main()
        except KeyboardInterrupt:
            sys.stdout.write("\nExiting...\n")
            sys.exit(1)
399