]> code.delx.au - youtube-cgi/blob - youtube.cgi
Fixed to handle YouTube JS changes
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/python2
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Address-space cap in bytes (128 MiB). Only referenced by the
# resource.setrlimit() call in the script entry point, which is commented out.
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# Value of the User-Agent header that urlopen() attaches to each request.
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) "
    "Gecko/20100101 Firefox/15.0.1"
)

# Stream MIME type -> file extension. get_best_video() skips any stream whose
# MIME type is not listed here.
MIMETYPES = {
    "video/3gpp": "3gp",
    "video/mp4": "mp4",
    "video/x-flv": "flv",
}

# Quality label -> rank; get_best_video() prefers the larger rank and skips
# any stream whose label is not listed here.
QUALITIES = {
    "small": 1,
    "medium": 2,
    "large": 3,
    "hd720": 4,
    "hd1080": 5,
}
37
38
class VideoUnavailable(Exception):
    """Raised when a page yields no downloadable video.

    The single constructor argument is the human-readable reason; cgimain()
    shows it to the user.
    """
41
def print_form(url="", msg=""):
    """Write the download form to stdout as a complete CGI response.

    url -- text pre-filled into the URL input box. It is XML-escaped here,
           so callers pass the raw value.
    msg -- XHTML fragment inserted verbatim above the form. Callers are
           responsible for escaping any untrusted text inside it.

    Reads HTTP_HOST and REQUEST_URI from the environment to build the
    address used by the bookmarklet; raises KeyError if either is missing.
    """
    # Function-scope import so this block does not depend on the file's
    # import list. xml.sax.saxutils is in the standard library.
    from xml.sax.saxutils import escape as xml_escape

    def escape_attr(text):
        # The page is served as application/xhtml+xml, so a raw "&", "<" or
        # '"' inside an attribute value would make the whole document
        # ill-formed (and would allow markup injection).
        return xml_escape(text, {'"': "&quot;"})

    # Drop the query string: when the form is re-displayed after a failed
    # "?url=..." request, REQUEST_URI still carries that query and the
    # bookmarklet would otherwise end up with two "?url=" parts.
    script_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], script_path)
    # The address is embedded in a single-quoted JavaScript string literal.
    script_url = script_url.replace("'", "%27")

    fields = (msg, escape_attr(url), escape_attr(script_url))

    page = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""
    # Substitute all placeholders in one pass so that a "{1}" or "{2}" that
    # happens to appear inside an inserted value is not substituted again.
    page = re.sub(r"\{([012])\}", lambda m: fields[int(m.group(1))], page)

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    sys.stdout.write(page)
76
# In-memory cookie store shared by every request made through urlopener.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cookiejar)
)
# Set by urlopen() to the first URL it fetches; urlopen() then sends it as the
# Referer header on every later request.
referrer = ""
80
def urlopen(url, offset=None):
    """Open url through the shared cookie-aware opener and return the response.

    url    -- an absolute http(s) URL, a protocol-relative "//host/path" URL
              (fetched over https), or anything else, which is treated as a
              path on https://www.youtube.com.
    offset -- when truthy, ask the server to start at this byte offset by
              sending a Range header.

    The first URL ever opened is stored in the module-level ``referrer`` and
    sent as the Referer header on every later request.

    Raises IOError if the server replies with a Content-Range header that is
    malformed or does not start at the requested offset. Errors raised by the
    opener itself propagate unchanged.
    """
    global referrer

    if url.startswith("//"):
        url = "https:" + url
    if not url.startswith(("http://", "https://")):
        url = "https://www.youtube.com" + url

    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    else:
        referrer = url
    req.add_header("User-Agent", USER_AGENT)
    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    content_range = res.info().getheader("Content-Range")
    if content_range:
        # Expected form: "bytes <start>-<end>/<total>". Validate explicitly
        # rather than with assert, which is removed when running under -O.
        try:
            unit, byte_range = content_range.split()[:2]
            start = int(byte_range.split("-")[0])
        except ValueError:
            res.close()
            raise IOError("Malformed Content-Range header: %r" % content_range)
        # No requested offset means the data must start at byte 0.
        if unit != "bytes" or start != (offset or 0):
            res.close()
            raise IOError(
                "Unexpected Content-Range %r for requested offset %r"
                % (content_range, offset)
            )
    return res
108
def parse_url(url):
    """Fetch url with urlopen() and return the result of lxml's html.parse().

    The page is decoded as UTF-8 with parser error recovery enabled. The HTTP
    response is closed even if parsing raises.
    """
    f = urlopen(url)
    try:
        parser = html.HTMLParser(encoding="utf-8", recover=True)
        return html.parse(f, parser)
    finally:
        f.close()
114
def append_to_qs(url, params):
    """Return url with params merged into its query string.

    The existing query is parsed with parse_qs(), updated with params (so a
    key in params replaces an existing key of the same name) and re-encoded
    with sequence values expanded into repeated keys.
    """
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    merged = urlparse.parse_qs(query)
    merged.update(params)
    query = urllib.urlencode(merged, True)
    return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
122
def get_player_config(doc):
    """Return the ``ytplayer.config`` object embedded in doc, or None.

    doc -- parsed page exposing xpath(). The text of every <script> element
           is scanned line by line for ``ytplayer.config = {``.

    The JSON object that starts at that opening brace is decoded and
    returned. Lines where the object cannot be decoded are skipped. Returns
    None when no script contains a decodable config.
    """
    MARKER = "ytplayer.config = {"
    decoder = json.JSONDecoder()

    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            pos = line.find(MARKER)
            if pos < 0:
                continue
            # Begin at the "{" that ends the marker.
            start = pos + len(MARKER) - 1
            try:
                # raw_decode() stops at the end of the first complete JSON
                # value, so the trailing JavaScript is ignored and a "};"
                # inside a JSON string cannot truncate the object.
                config, _ = decoder.raw_decode(line[start:])
            except ValueError:
                continue
            return config
    return None
135
def extract_js(script):
    """Return the player script with its enclosing wrapper removed.

    script must start with the ``var _yt_player={};(function(g){...``
    prologue and end with the matching ``;})(_yt_player);`` epilogue plus a
    newline. Everything between the two is returned.

    Raises ValueError when either end does not match. An explicit check is
    used instead of assert, which is removed when running under -O.
    """
    PREFIX = "var _yt_player={};(function(g){var window=this;"
    SUFFIX = ";})(_yt_player);\n"

    if not script.startswith(PREFIX):
        raise ValueError("player script does not start with the expected prefix")
    if not script.endswith(SUFFIX):
        raise ValueError("player script does not end with the expected suffix")

    return script[len(PREFIX):-len(SUFFIX)]
143
def find_func_name(script):
    """Return the name of the function that script applies to ``<x>.s``.

    Finds the first call of the form ``name(x.s)`` that is immediately
    followed by ",", ";" or ")". Here ``name`` consists of letters, digits
    and "$", and ``x`` of letters. decode_signature() then invokes the
    function of that name on the signature.

    Raises ValueError when script contains no such call.
    """
    FUNC_NAME = r"([a-zA-Z0-9$]+)"
    FUNC_PARAMS = r"(\([a-zA-Z]+\.s\))"
    TERMINATOR = r"[,;\)]"
    PATTERN = FUNC_NAME + FUNC_PARAMS + TERMINATOR

    match = re.search(PATTERN, script)
    if match is None:
        # Fail with a clear message instead of AttributeError on None.
        raise ValueError("could not find the signature function in player script")
    return match.group(1)
153
def decode_signature(js_url, signature):
    """Transform an encoded stream signature using the page's player script.

    js_url    -- location of the player JavaScript (fetched with urlopen()).
    signature -- the encoded signature string.

    The player script is downloaded and its wrapper stripped by extract_js().
    It is then run by a ``nodejs`` child process inside a vm context that
    stubs a few browser globals. The function named by find_func_name() is
    applied to the signature and whatever the child prints is returned,
    stripped of surrounding whitespace.

    Raises Exception if the child exits with a non-zero status. Errors from
    urlopen(), find_func_name() and extract_js() propagate.
    """
    res = urlopen(js_url)
    try:
        script = res.read()
    finally:
        res.close()
    func_name = find_func_name(script)

    # signature and code are embedded as JSON string literals, which are also
    # valid JavaScript string literals. func_name is restricted by
    # find_func_name()'s pattern to letters, digits and "$".
    params = {
        "func_name": func_name,
        "signature": json.dumps(signature),
        "code": json.dumps(extract_js(script)),
    }
    js_decode_script = ("""
var vm = require('vm');

var sandbox = {
location: {
hash: '',
href: '',
protocol: 'http:'
},
history: {
pushState: function(){}
},
document: {},
navigator: {
userAgent: ''
},
signature: %(signature)s,
transformed_signature: null,
g: function(){} // this is _yt_player
};
sandbox.window = sandbox;

var code_string = %(code)s + ';';
var exec_string = 'transformed_signature = %(func_name)s(signature);';
vm.runInNewContext(code_string + exec_string, sandbox);

console.log(sandbox.transformed_signature);
""" % params)

    # NOTE(review): this executes JavaScript downloaded from the network.
    # Node's documentation states that the vm module is not a security
    # mechanism, so the child process should be treated as running untrusted
    # code.
    p = subprocess.Popen(
        ["nodejs"],  # argument list: no shell is needed to start one program
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE
    )
    # communicate() feeds stdin, drains stdout and waits for exit without the
    # risk of the two pipes blocking each other.
    stdout_data, _ = p.communicate(js_decode_script)
    if p.returncode != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return stdout_data.strip()
207
def get_best_video(player_config):
    """Choose the best supported stream described by player_config.

    player_config -- decoded player configuration. This function reads
                     ["args"]["url_encoded_fmt_stream_map"] (comma-separated,
                     query-string-encoded stream descriptions) and
                     ["assets"]["js"] (player script location).

    A stream is considered only if it has "type", "quality" and "url"
    fields, is not marked "stereo3d", and its quality and MIME type appear in
    QUALITIES and MIMETYPES. The stream with the highest QUALITIES rank wins;
    among equal ranks the last one listed wins.

    Returns (url, extension), or (None, None) when no stream qualifies.
    """
    stream_map = player_config["args"]["url_encoded_fmt_stream_map"]
    js_url = player_config["assets"]["js"]

    best_url = None
    best_quality = None
    best_extension = None
    for url_data in stream_map.split(","):
        url_data = urlparse.parse_qs(url_data)

        # An empty stream map yields one empty entry; skip anything that
        # lacks the fields needed below instead of failing with KeyError.
        if not ("type" in url_data and "quality" in url_data and "url" in url_data):
            continue
        if "stereo3d" in url_data:
            continue

        mimetype = url_data["type"][0].split(";")[0]
        quality_name = url_data["quality"][0]
        if quality_name not in QUALITIES or mimetype not in MIMETYPES:
            continue

        quality = QUALITIES[quality_name]
        if best_quality is not None and quality < best_quality:
            continue

        video_url = url_data["url"][0]
        if "sig" in url_data:
            # Signature supplied in usable form.
            signature = url_data["sig"][0]
        elif "s" in url_data:
            # Encoded signature: must be transformed by the player script.
            signature = decode_signature(js_url, url_data["s"][0])
        else:
            signature = None

        if signature:
            video_url = append_to_qs(video_url, {"signature": signature})

        best_url = video_url
        best_quality = quality
        best_extension = MIMETYPES[mimetype]

    return best_url, best_extension
249
def sanitize_filename(filename):
    """Return filename made safe to use as a single path component.

    Leading and trailing whitespace is removed and each inner run of
    whitespace becomes one space. Afterwards backslashes and slashes become
    "-" and NUL characters become a space.
    """
    cleaned = re.sub(r"\s+", " ", filename.strip())
    # Applied after whitespace collapsing, in this order.
    for unwanted, substitute in (("\\", "-"), ("/", "-"), ("\0", " ")):
        cleaned = cleaned.replace(unwanted, substitute)
    return cleaned
257
def get_video_url(doc):
    """Return (video_url, filename) for the video on the parsed page doc.

    Raises VideoUnavailable with the page's own text when the page contains
    an "unavailable-message" element, or with a generic message when no
    player configuration is found.

    Returns (None, None) when a configuration exists but get_best_video()
    finds no usable stream. Otherwise the filename is the sanitized page
    title plus "." and the stream's extension.
    """
    messages = doc.xpath("//div[@id='unavailable-message']/text()")
    if messages:
        raise VideoUnavailable(messages[0].strip())

    config = get_player_config(doc)
    if not config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(config)
    if video_url:
        page_title = doc.xpath("/html/head/title/text()")[0]
        return video_url, sanitize_filename(page_title) + "." + extension
    return None, None
276
def write_video(filename, video_data):
    """Stream video_data to stdout as a CGI file-download response.

    filename   -- suggested download name. It is UTF-8 encoded and
                  percent-quoted into the Content-Disposition header.
    video_data -- open HTTP response. Its info() headers supply Content-Type
                  and Content-Length, and its body is copied to stdout. It
                  is closed before returning, even on error.
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))

        # A CGI response that carries a body must declare its Content-Type;
        # forward the upstream one and fall back to a generic binary type.
        content_type = httpinfo.getheader("Content-Type") or "application/octet-stream"
        content_length = httpinfo.getheader("Content-Length")

        sys.stdout.write("Content-Type: %s\r\n" % content_type)
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
        # Leave the header out rather than emit "Content-Length: None".
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)
        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
285
def cgimain():
    """Handle one CGI request.

    Without a "url" query parameter the empty form is shown. Otherwise the
    page is fetched and the best video is streamed back as a download. On
    failure the form is shown again with an error message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        if not video_url:
            # get_video_url() returns (None, None) when no stream is usable;
            # report that instead of failing inside urlopen(None).
            raise VideoUnavailable("Could not find a downloadable video format")
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.args[0])
        )
    except Exception:
        # Top-level boundary of the CGI request: show a generic message
        # rather than letting a traceback reach the client.
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
310
def pp_size(size):
    """Format a byte count as a string with two decimals and a binary unit.

    Values below 1024 get an empty unit, so the result ends with a space.
    The largest unit is GiB: bigger values are still expressed in GiB (for
    example 1024**4 gives "1024.00 GiB").
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    last = len(suffixes) - 1
    for i, suffix in enumerate(suffixes):
        # Stop at the final unit: dividing again there would shrink the
        # number without a larger unit to label it with.
        if size < 1024 or i == last:
            break
        size /= 1024.0
    return "%.2f %s" % (size, suffix)
318
def copy_with_progress(content_length, infile, outfile):
    """Copy infile to outfile in 32 KiB chunks, drawing a progress line.

    content_length -- size shown as the total in the progress line.
    infile         -- object with read(n); copying stops at the first empty
                      read.
    outfile        -- object with write(data).

    The progress line (bytes copied / total and the rate over the current
    measuring window) is redrawn on stdout whenever more than half a second
    has passed since the last redraw, and once more at the end, followed by
    a newline.
    """
    def report(window_bytes, window_start, timestamp, copied):
        speed = 0
        if timestamp != window_start:
            speed = window_bytes / (timestamp - window_start)
        # ESC[2K erases the current terminal line, "\r" returns to column 0.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(copied),
            pp_size(content_length),
            pp_size(speed),
        ))
        sys.stdout.flush()

    window_start = 0
    window_bytes = 0
    copied = 0
    while True:
        timestamp = time.time()
        if timestamp - window_start > 0.5:
            report(window_bytes, window_start, timestamp, copied)
            # Start a new measuring window.
            window_start = timestamp
            window_bytes = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        chunk_len = len(chunk)
        window_bytes += chunk_len
        copied += chunk_len

    # Final redraw, then end the progress line.
    report(window_bytes, window_start, timestamp, copied)
    sys.stdout.write("\n")
352
def main():
    """Command-line entry point: download the video at sys.argv[1].

    The video is appended to a file named after the page title in the
    current directory, so an interrupted download resumes from the existing
    file size. Requests are repeated from the current file size until the
    file reaches the expected total size or the server answers 416.

    Exits with status 1 when no URL argument is given or no downloadable
    stream is found.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        # get_video_url() returns (None, None) when no stream is usable.
        sys.stderr.write("Could not find a downloadable video format\n")
        sys.exit(1)
    sys.stdout.write("Downloading %s\n" % filename.encode("utf-8"))

    # Binary append: the payload is video data, not text.
    outfile = open(filename, "ab")
    try:
        # The position reported right after opening in append mode is not
        # guaranteed to be the end of the file, so seek there explicitly.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            sys.stdout.write("Resuming download from %s\n" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                if e.code == 416:
                    # Requested range starts at or past the end of the
                    # resource: nothing is left to fetch.
                    sys.stdout.write("File is complete!\n")
                    break
                raise

            content_length = int(video_data.info().getheader("Content-Length"))
            if total_size is None:
                # Content-Length of a ranged response covers only the
                # remaining bytes, so add what is already on disk.
                total_size = offset + content_length

            try:
                copy_with_progress(content_length, video_data, outfile)
            except IOError:
                # Transfer broke off; end the progress line and retry below.
                sys.stdout.write("\n")
            finally:
                video_data.close()

            if outfile.tell() == total_size:
                break

            old_offset = offset
            offset = outfile.tell()
            if old_offset == offset:
                # No progress since the last attempt: wait before retrying.
                time.sleep(1)
            sys.stdout.write("Restarting download from %s\n" % pp_size(offset))
    finally:
        outfile.close()
400
401
if __name__ == "__main__":
    # The address-space limit below is currently disabled.
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))

    # SCRIPT_NAME in the environment selects CGI mode; without it the script
    # behaves as a command-line downloader.
    running_as_cgi = "SCRIPT_NAME" in os.environ
    if running_as_cgi:
        cgimain()
    else:
        try:
            main()
        except KeyboardInterrupt:
            sys.stdout.write("\nExiting...\n")
            sys.exit(1)
412