]> code.delx.au - youtube-cgi/blob - youtube.cgi
exec node instead of js
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/python2
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Address-space cap (128 MiB) intended for resource.setrlimit in the entry point.
MAX_MEMORY_BYTES = 128 * 1024 ** 2

# User-Agent header sent with every outgoing HTTP request.
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) "
    "Gecko/20100101 Firefox/15.0.1"
)

# Stream MIME types that are accepted, mapped to the file extension to use.
MIMETYPES = {
    "video/3gpp": "3gp",
    "video/mp4": "mp4",
    "video/x-flv": "flv",
}

# Stream quality labels mapped to a rank; a larger rank is preferred.
QUALITIES = {
    "small": 1,
    "medium": 2,
    "large": 3,
    "hd720": 4,
    "hd1080": 5,
}
37
38
class VideoUnavailable(Exception):
    """Raised when no downloadable video can be obtained from a page.

    The single constructor argument is the human-readable reason.
    """
41
def print_form(url="", msg=""):
    """Write the CGI response (headers and XHTML page) with the download form.

    url -- text pre-filled into the URL box. It is treated as untrusted and
           escaped before being placed in the attribute; the page is served
           as application/xhtml+xml, so even a bare "&" would otherwise make
           the whole document unparseable.
    msg -- XHTML fragment inserted verbatim above the form. The caller is
           responsible for escaping any dynamic text it contains.

    Reads HTTP_HOST and REQUEST_URI from the environment to build the
    address used by the bookmarklet; raises KeyError if either is missing.
    """
    # Local import: keeps this block self-contained without touching the
    # module-level import list.
    from xml.sax.saxutils import escape

    def attr_escape(text):
        # escape() handles &, < and >; the double quote is added because the
        # values end up inside double-quoted attributes.
        return escape(text, {'"': "&quot;"})

    # REQUEST_URI carries the query string of the current request. Drop it,
    # otherwise the bookmarklet would produce ".../youtube.cgi?url=X?url=...".
    script_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], script_path)
    # The address sits inside a single-quoted JavaScript string literal, so
    # neutralise the characters that could terminate or escape that literal.
    script_url = script_url.replace("\\", "%5C").replace("'", "%27")

    substitutions = {
        "{0}": msg,
        "{1}": attr_escape(url),
        "{2}": attr_escape(script_url),
    }

    page = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    # Substitute all placeholders in a single pass so that placeholder-like
    # text inside one value is never expanded by a later substitution.
    sys.stdout.write(
        re.sub(r"\{[012]\}", lambda m: substitutions[m.group(0)], page)
    )
76
# Shared HTTP state for the whole run: a single cookie jar and an opener that
# stores and replays cookies through it.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cookiejar)
)

# First URL requested through urlopen(); sent as the Referer header on every
# later request. Empty until the first request is made.
referrer = ""
80
def urlopen(url, offset=None):
    """Open `url` through the shared cookie-aware opener and return the response.

    url    -- address to fetch; a scheme-relative "//host/..." address is
              fetched over http.
    offset -- when truthy, request the resource starting at this byte by
              sending a Range header.

    The first URL ever opened is remembered in the module-level `referrer`
    and sent as the Referer header on all subsequent requests. If the
    response carries a Content-Range header, its unit and start position are
    asserted to match the request.
    """
    global referrer

    if url.startswith("//"):
        url = "http:" + url

    request = urllib2.Request(url)
    if referrer:
        request.add_header("Referer", referrer)
    else:
        # First request of the run: remember it, send no Referer.
        referrer = url
    request.add_header("User-Agent", USER_AGENT)
    if offset:
        request.add_header("Range", "bytes=%d-" % offset)

    response = urlopener.open(request)

    content_range = response.info().getheader("Content-Range")
    if content_range:
        # Expected form: "bytes <start>-<end>/<total>".
        fields = content_range.split()
        assert fields[0] == "bytes"
        first_byte = int(fields[1].split("-")[0])
        assert first_byte == offset
    return response
106
def parse_url(url):
    """Fetch `url` and return it parsed as an lxml HTML document tree.

    The page is decoded as UTF-8 with the parser in recovery mode. The HTTP
    response is closed even when parsing raises.
    """
    f = urlopen(url)
    try:
        return html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    finally:
        f.close()
112
def append_to_qs(url, params):
    """Return `url` with `params` merged into its query string.

    params -- mapping of parameter name to value (or list of values); an
              entry replaces any parameter of the same name already in `url`.
    """
    parts = urlparse.urlsplit(url)
    query = urlparse.parse_qs(parts.query)
    query.update(params)
    # doseq=True: the parsed values are lists and must be expanded.
    new_query = urllib.urlencode(query, True)
    return urlparse.urlunsplit(
        (parts.scheme, parts.netloc, parts.path, new_query, parts.fragment)
    )
120
def get_player_config(doc):
    """Return the decoded `ytplayer.config` object found in the page, or None.

    doc -- parsed document; every <script> element's text is scanned line by
           line for an assignment of the form `ytplayer.config = {...}`.

    The object is decoded with a real JSON parser starting at its opening
    brace, so a "};" sequence inside a string value no longer cuts the JSON
    short. A line whose object cannot be decoded is skipped; None is returned
    when no script holds a decodable configuration.
    """
    marker = "ytplayer.config = {"
    decoder = json.JSONDecoder()

    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            position = line.find(marker)
            if position < 0:
                continue
            # The marker ends with the object's opening brace; start there.
            start = position + len(marker) - 1
            try:
                # raw_decode stops at the end of the object and ignores
                # whatever JavaScript follows it on the same line.
                config, _ = decoder.raw_decode(line[start:])
            except ValueError:
                continue
            return config

    return None
133
def extract_js(script):
    """Return the body of a script wrapped as `(function(){ ... })();`.

    The script must start with "(function(){" and end with "})();" followed
    by a newline; both conditions are asserted.
    """
    opening = "(function(){"
    closing = "})();\n"
    assert script.startswith(opening)
    assert script.endswith(closing)

    body_end = len(script) - len(closing)
    return script[len(opening):body_end]
141
def find_func_name(script):
    """Return the name of the function called as `<name>(<ident>.s);` in `script`.

    The first occurrence of an identifier (letters, digits, "$") immediately
    followed by a call with an argument of the form `<letters>.s` and a
    semicolon is used.

    Raises ValueError when the script contains no such call, instead of
    failing with an attribute error on a missing match object.
    """
    FUNC_NAME = r"([a-zA-Z0-9$]+)"
    FUNC_PARAMS = r"(\([a-zA-Z]+\.s\))"
    PATTERN = FUNC_NAME + FUNC_PARAMS + ";"

    match = re.search(PATTERN, script)
    if match is None:
        raise ValueError("could not find the signature function call in the script")
    return match.group(1)
150
def decode_signature(js_url, signature):
    """Transform an encoded stream signature using the player's own JavaScript.

    js_url    -- address of the player script; it is downloaded on each call.
    signature -- encoded signature string taken from the stream data.

    The player script is evaluated by a `node` subprocess together with a
    call to its signature function, and whatever node prints is returned
    with surrounding whitespace stripped.

    Raises Exception when node exits with a non-zero status, and OSError
    when the `node` executable cannot be started.
    """
    response = urlopen(js_url)
    try:
        script = response.read()
    finally:
        response.close()
    func_name = find_func_name(script)

    params = {
        "func_name": func_name,
        "signature": json.dumps(signature),
        "code": json.dumps(extract_js(script)),
    }
    js_decode_script = ("""
var vm = require("vm");

var sandbox = {
window: {
location: {},
history: {
pushState: function(){}
}
},
document: {},
navigator: {},
signature: %(signature)s,
transformed_signature: null
};

var execstring = ";transformed_signature = %(func_name)s(signature);";
vm.runInNewContext(%(code)s + execstring, sandbox);

console.log(sandbox.transformed_signature);
""" % params)

    # NOTE(review): this executes remotely supplied JavaScript. Node's `vm`
    # module is not a security boundary, so the script runs with the
    # privileges of this process.
    # The program is given as an argument list; no shell is needed to start
    # a fixed executable.
    p = subprocess.Popen(
        ["node"],
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE
    )
    # communicate() feeds stdin, drains stdout and waits for exit without
    # the risk of either pipe filling up and blocking.
    output, _ = p.communicate(js_decode_script)
    if p.returncode != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return output.strip()
197
def get_best_video(player_config):
    """Pick the best stream from the player configuration.

    player_config -- decoded player configuration; the comma-separated
                     `args.url_encoded_fmt_stream_map` lists the streams and
                     `assets.js` names the player script (read only when a
                     signature has to be decoded).

    Returns (url, extension) for the highest-ranked stream whose quality is
    in QUALITIES and whose MIME type is in MIMETYPES, skipping stereo 3D
    streams; among equally ranked streams the last one listed wins. Returns
    (None, None) when no stream qualifies.

    Entries lacking a type, quality or url (including the single empty entry
    produced by an empty stream map) are skipped rather than raising
    KeyError. The signature is resolved once, for the chosen stream only,
    so the player script is not fetched and run for discarded candidates.
    """
    stream_map = player_config["args"]["url_encoded_fmt_stream_map"]

    best_data = None
    best_quality = None
    best_extension = None
    for raw_entry in stream_map.split(","):
        url_data = urlparse.parse_qs(raw_entry)

        if not all(key in url_data for key in ("type", "quality", "url")):
            continue
        if "stereo3d" in url_data:
            continue

        # The type may carry parameters, e.g. 'video/mp4; codecs="..."'.
        mimetype = url_data["type"][0].split(";")[0]
        quality_name = url_data["quality"][0]
        if quality_name not in QUALITIES:
            continue
        if mimetype not in MIMETYPES:
            continue

        quality = QUALITIES[quality_name]
        if best_quality is not None and quality < best_quality:
            continue

        best_data = url_data
        best_quality = quality
        best_extension = MIMETYPES[mimetype]

    if best_data is None:
        return None, None

    video_url = best_data["url"][0]
    if "sig" in best_data:
        # Signature supplied in the clear.
        signature = best_data["sig"][0]
    elif "s" in best_data:
        # Encoded signature: must be transformed by the player script.
        js_url = player_config["assets"]["js"]
        signature = decode_signature(js_url, best_data["s"][0])
    else:
        signature = None

    if signature:
        video_url = append_to_qs(video_url, {"signature": signature})

    return video_url, best_extension
239
def sanitize_filename(filename):
    """Return `filename` made safe to use as a single path component.

    Leading and trailing whitespace is removed, each run of whitespace
    becomes one space, backslashes and slashes become "-", and NUL
    characters become a space.
    """
    cleaned = re.sub(r"\s+", " ", filename.strip())
    for unwanted, substitute in (("\\", "-"), ("/", "-"), ("\0", " ")):
        cleaned = cleaned.replace(unwanted, substitute)
    return cleaned
247
def get_video_url(doc):
    """Return (video_url, filename) for the video on the parsed page `doc`.

    The filename is the sanitized page title plus the extension of the
    chosen stream. Returns (None, None) when the player configuration
    offers no usable stream.

    Raises VideoUnavailable when the page shows an unavailable message (the
    message text becomes the reason) or contains no player configuration.
    """
    notices = doc.xpath("//div[@id='unavailable-message']/text()")
    if notices:
        raise VideoUnavailable(notices[0].strip())

    config = get_player_config(doc)
    if not config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(config)
    if not video_url:
        return None, None

    page_title = doc.xpath("/html/head/title/text()")[0]
    return video_url, sanitize_filename(page_title) + "." + extension
266
def write_video(filename, video_data):
    """Stream `video_data` to stdout as a CGI file-download response.

    filename   -- suggested download name; sent percent-encoded as UTF-8 in
                  the Content-Disposition header.
    video_data -- open HTTP response to copy; always closed on return, even
                  when writing fails.

    The upstream Content-Length is forwarded only when the upstream response
    actually has one, so a literal "Content-Length: None" header is never
    emitted.
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
        content_length = httpinfo.getheader("Content-Length")
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)
        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
275
def cgimain():
    """CGI entry point: show the form, or stream the requested video.

    Without a `url` query parameter the empty form is shown. Otherwise the
    page at that URL is fetched and its best stream is written to stdout as
    a download; on failure the form is shown again with an error message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    generic_error = "<p class='error'>Sorry, there was an error. Check your URL?</p>"

    # The URL comes straight from the client. Only web addresses are handed
    # to the opener, which would otherwise also accept schemes such as
    # file:// and read from the server itself.
    if not url.lower().startswith(("http://", "https://", "//")):
        print_form(url=url, msg=generic_error)
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        if not video_url:
            # get_video_url signals "no usable stream" with (None, None).
            raise VideoUnavailable("Could not find a downloadable video format")
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.message)
        )
    except Exception:
        # NOTE(review): if the failure happens after write_video has begun
        # sending the body, this page is appended to the partial download.
        print_form(url=url, msg=generic_error)
300
def pp_size(size):
    """Format a byte count with a binary unit suffix, e.g. "1.50 KiB".

    size -- number of bytes (int or float).

    Values below 1024 are shown with an empty suffix. The value is scaled
    by 1024 only while a larger suffix is available, so sizes of 1024 GiB
    and more are reported as a large GiB figure instead of being divided
    once too often.
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    largest = len(suffixes) - 1

    # float() keeps the division exact regardless of the division mode in
    # effect where this function is compiled.
    size = float(size)
    index = 0
    while size >= 1024 and index < largest:
        size /= 1024
        index += 1
    return "%.2f %s" % (size, suffixes[index])
308
def copy_with_progress(content_length, infile, outfile):
    """Copy `infile` to `outfile` in 32 KiB chunks, drawing a progress line.

    content_length -- expected number of bytes; used only for display.

    The status line on stdout is redrawn whenever more than half a second
    has passed since the previous redraw, showing bytes copied, the expected
    total, and the transfer rate over the interval since that redraw. A
    final status line and a newline are written when the input is exhausted.
    """
    def show_status(elapsed, interval_bytes, copied):
        # No time elapsed means no measurable rate.
        speed = interval_bytes / elapsed if elapsed != 0 else 0
        # Erase the current terminal line and return to its first column.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(copied),
            pp_size(content_length),
            pp_size(speed),
        ))
        sys.stdout.flush()

    interval_start = 0
    interval_bytes = 0
    copied = 0
    while True:
        now = time.time()
        if now - interval_start > 0.5:
            show_status(now - interval_start, interval_bytes, copied)
            interval_start = now
            interval_bytes = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        interval_bytes += len(chunk)
        copied += len(chunk)

    # Final figures, then end the status line.
    show_status(now - interval_start, interval_bytes, copied)
    print("")
342
def main():
    """Command-line entry point: download the video at sys.argv[1].

    The video is saved in the current directory under a name derived from
    the page title. An existing file of that name is treated as a partial
    download and resumed; interrupted transfers are retried from the current
    file size until the file is complete.

    Exits with status 1 when no URL is given or no downloadable stream is
    found.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        # get_video_url signals "no usable stream" with (None, None).
        sys.stderr.write("Could not find a downloadable video format\n")
        sys.exit(1)
    print("Downloading %s" % filename.encode("utf-8"))

    # Binary append mode: the data is not text, and appending lets an
    # earlier partial download be continued.
    outfile = open(filename, "ab")
    try:
        # The position reported right after opening in append mode is not
        # reliable everywhere; move to the end explicitly.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            print("Resuming download from %s" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                if e.code == 416:
                    # Requested range starts at or past the end of the file.
                    print("File is complete!")
                    break
                raise

            info = video_data.info()
            content_length = int(info.getheader("Content-Length"))
            if offset and not info.getheader("Content-Range"):
                # The server ignored the Range header and is sending the
                # whole file; discard the partial data instead of appending
                # a second copy after it.
                outfile.seek(0)
                outfile.truncate()
                offset = 0
            if total_size is None:
                # Content-Length covers only the bytes from `offset` on.
                total_size = offset + content_length

            try:
                copy_with_progress(content_length, video_data, outfile)
            except IOError:
                # Transfer interrupted: end the status line; the size check
                # below decides whether to request the remainder.
                print("")

            video_data.close()
            if outfile.tell() != total_size:
                old_offset = offset
                offset = outfile.tell()
                if old_offset == offset:
                    # No progress at all in this attempt; pause before retrying.
                    time.sleep(1)
                print("Restarting download from %s" % pp_size(offset))
            else:
                break
    finally:
        outfile.close()
390
391
if __name__ == "__main__":
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
    # A web server running this as CGI sets SCRIPT_NAME; without it the
    # script is being used from the command line.
    running_as_cgi = "SCRIPT_NAME" in os.environ
    if not running_as_cgi:
        try:
            main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(1)
    else:
        cgimain()
402