#!/usr/bin/python2

from __future__ import division

import cookielib
import cgi
import json
from lxml import html
import os
import re
import resource
import shutil
import subprocess
import sys
import time
import urllib
import urllib2
import urlparse


MAX_MEMORY_BYTES = 128 * 1024*1024
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"

MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

QUALITIES = {
    "hd1080": 5,
    "hd720": 4,
    "large": 3,
    "medium": 2,
    "small": 1,
}

JS_BROWSER_STUB = """
var window={};
var document={};
window.location={};
var navigator={};
"""



class VideoUnavailable(Exception):
    pass

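# Renders the HTML form page, optionally with an error message, when the
# script is accessed over CGI. The {0}/{1}/{2} placeholders are filled in
# with str.replace() rather than str.format() so the literal braces in the
# inline CSS and the bookmarklet survive untouched.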
def print_form(url="", msg=""):
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], os.environ["REQUEST_URI"])
    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    sys.stdout.write("""
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <title>delx.net.au - YouTube Scraper</title>
    <link rel="stylesheet" type="text/css" href="/style.css"/>
    <style type="text/css">
        input[type="text"] {
            width: 100%;
        }
        .error {
            color: red;
        }
    </style>
</head>
<body>
    <h1>delx.net.au - YouTube Scraper</h1>
    {0}
    <form action="" method="get">
        <p>This page will let you easily download YouTube videos to watch offline. It
        will automatically grab the highest quality version.</p>
        <div><input type="text" name="url" value="{1}"/></div>
        <div><input type="submit" value="Download!"/></div>
    </form>
    <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
    to easily download videos. Right-click the link and add it to bookmarks,
    then when you're looking at a YouTube page select that bookmark from your
    browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
""".replace("{0}", msg).replace("{1}", url).replace("{2}", script_url))

cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
referrer = ""

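# All requests go through one shared opener so cookies persist between the
# page scrape and the video download. The first URL fetched becomes the
# Referer for subsequent requests, and an optional byte offset is sent as a
# Range header (the Content-Range response header is checked to confirm the
# server actually honoured the resume).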
def urlopen(url, offset=None):
    if url.startswith("//"):
        url = "http:" + url

    global referrer
    req = urllib2.Request(url)
    if not referrer:
        referrer = url
    else:
        req.add_header("Referer", referrer)

    req.add_header("User-Agent", USER_AGENT)

    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    content_range = res.info().getheader("Content-Range")
    if content_range:
        tokens = content_range.split()
        assert tokens[0] == "bytes"
        start = int(tokens[1].split("-")[0])
        assert start == offset
    return res

def parse_url(url):
    f = urlopen(url)
    doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    f.close()
    return doc

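# Adds extra query-string parameters to a URL while keeping the existing
# ones. For example (illustrative URL; parameter order may vary because
# parse_qs returns a dict):
#   append_to_qs("http://example.com/v?a=1", {"b": "2"})
#   -> "http://example.com/v?a=1&b=2"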
def append_to_qs(url, params):
    r = list(urlparse.urlsplit(url))
    qs = urlparse.parse_qs(r[3])
    qs.update(params)
    r[3] = urllib.urlencode(qs, True)
    url = urlparse.urlunsplit(r)
    return url

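# Scans the watch page's inline <script> blocks for the
# "ytplayer.config = {...}" assignment and returns it parsed as JSON;
# returns None if no such block is found.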
def get_player_config(doc):
    player_config = None
    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            s = "ytplayer.config = {"
            if s in line:
                p1 = line.find(s) + len(s) - 1
                p2 = line.find("};", p1) + 1
                if p1 >= 0 and p2 > 0:
                    return json.loads(line[p1:p2])

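# The player script is wrapped in an immediately-invoked function expression;
# stripping the "(function(){" prefix and "})();" suffix exposes its inner
# functions as globals when the body is piped into the js interpreter below.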
def extract_js(script):
    PREFIX = "(function(){"
    SUFFIX = "})();\n"
    assert script.startswith(PREFIX)
    assert script.endswith(SUFFIX)

    return script[len(PREFIX):-len(SUFFIX)]

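# Finds the name of the signature-decoding function by searching the player
# script for a call of the form "xy(a.s);".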
def find_func_name(script):
    FUNC_NAME = R"([a-zA-Z0-9$]+)"
    FUNC_PARAMS = R"(\([a-zA-Z]+\.s\))"
    PATTERN = FUNC_NAME + FUNC_PARAMS + ";"

    match = re.search(PATTERN, script)
    func_name = match.groups()[0]
    return func_name

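# Decodes an obfuscated signature by piping the player script into an
# external JavaScript interpreter invoked as "js" (it must provide
# console.log). The fake window/document/navigator globals in
# JS_BROWSER_STUB stop the player script from crashing on browser-only
# references.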
def decode_signature(js_url, s):
    script = urlopen(js_url).read()
    func_name = find_func_name(script)

    p = subprocess.Popen(
        "js",
        shell=True,
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE
    )
    w = p.stdin.write
    w(JS_BROWSER_STUB)
    w(extract_js(script))
    w("console.log(%s('%s'));\n" % (func_name, s))
    p.stdin.close()

    signature = p.stdout.read().strip()
    if p.wait() != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return signature

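# Walks the url_encoded_fmt_stream_map entries and keeps the best candidate:
# 3D streams are skipped, only the qualities and containers listed in
# QUALITIES and MIMETYPES are considered, and the signature is either taken
# directly from "sig" or decoded from the obfuscated "s" parameter.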
def get_best_video(player_config):
    url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")
    js_url = player_config["assets"]["js"]

    best_url = None
    best_quality = None
    best_extension = None
    for url_data in url_data_list:
        url_data = urlparse.parse_qs(url_data)
        mimetype = url_data["type"][0].split(";")[0]
        quality = url_data["quality"][0]

        if url_data.has_key("stereo3d"):
            continue
        if quality not in QUALITIES:
            continue
        if mimetype not in MIMETYPES:
            continue

        extension = MIMETYPES[mimetype]
        quality = QUALITIES.get(quality, -1)

        if best_quality is not None and quality < best_quality:
            continue

        video_url = url_data["url"][0]
        if "sig" in url_data:
            signature = url_data["sig"][0]
        elif "s" in url_data:
            signature = decode_signature(js_url, url_data["s"][0])
        else:
            signature = None

        if signature:
            video_url = append_to_qs(video_url, {"signature": signature})

        best_url = video_url
        best_quality = quality
        best_extension = extension

    return best_url, best_extension

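# Makes a video title safe to use as a filename, e.g.
#   sanitize_filename("  Foo / Bar\\Baz  ") -> "Foo - Bar-Baz"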
def sanitize_filename(filename):
    return (
        re.sub("\s+", " ", filename.strip())
        .replace("\\", "-")
        .replace("/", "-")
        .replace("\0", " ")
    )

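# Turns a parsed watch page into (video_url, filename), raising
# VideoUnavailable if YouTube reports the video as unavailable or the
# player config cannot be located.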
def get_video_url(doc):
    unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
    if unavailable:
        raise VideoUnavailable(unavailable[0].strip())

    player_config = get_player_config(doc)
    if not player_config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(player_config)
    if not video_url:
        return None, None

    title = doc.xpath("/html/head/title/text()")[0]
    filename = sanitize_filename(title)
    filename += "." + extension

    return video_url, filename

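# CGI mode: stream the video back to the browser as an attachment, passing
# through the upstream Content-Length.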
def write_video(filename, video_data):
    httpinfo = video_data.info()
    encoded_filename = urllib.quote(filename.encode("utf-8"))
    sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
    sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length"))
    sys.stdout.write("\r\n")
    shutil.copyfileobj(video_data, sys.stdout)
    video_data.close()

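# CGI entry point: show the form when no URL is given, otherwise fetch the
# video and relay it, reporting failures back on the form.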
def cgimain():
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable, e:
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.message)
        )
    except Exception, e:
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
    return

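# Pretty-prints a byte count using 1024-based units, e.g.
#   pp_size(1536) -> "1.50 KiB"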
def pp_size(size):
    suffixes = ["", "KiB", "MiB", "GiB"]
    for i, suffix in enumerate(suffixes):
        if size < 1024:
            break
        size /= 1024
    return "%.2f %s" % (size, suffix)

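# Copies infile to outfile in 32 KiB chunks, redrawing a single-line
# progress display (bytes copied, total size, current rate) at most twice
# per second.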
def copy_with_progress(content_length, infile, outfile):
    def print_status():
        rate = 0
        if now != last_ts:
            rate = last_bytes_read / (now - last_ts)
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(bytes_read),
            pp_size(content_length),
            pp_size(rate),
        ))
        sys.stdout.flush()

    last_ts = 0
    last_bytes_read = 0
    bytes_read = 0
    while True:
        now = time.time()
        if now - last_ts > 0.5:
            print_status()
            last_ts = now
            last_bytes_read = 0

        buf = infile.read(32768)
        if not buf:
            break
        outfile.write(buf)
        last_bytes_read += len(buf)
        bytes_read += len(buf)

    # Newline at the end
    print_status()
    print

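# Command-line entry point: downloads to a local file named after the video
# title, resuming via Range requests if the file already exists and
# restarting if the connection drops mid-transfer.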
def main():
    try:
        url = sys.argv[1]
    except IndexError:
        print >>sys.stderr, "Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0]
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    print "Downloading", filename.encode("utf-8")

    outfile = open(filename, "a")
    offset = outfile.tell()
    if offset > 0:
        print "Resuming download from", pp_size(offset)
    total_size = None

    while True:
        try:
            video_data = urlopen(video_url, offset)
        except urllib2.HTTPError, e:
            if e.code == 416:
                print "File is complete!"
                break
            else:
                raise

        content_length = int(video_data.info().getheader("Content-Length"))
        if total_size is None:
            total_size = content_length

        try:
            copy_with_progress(content_length, video_data, outfile)
        except IOError, e:
            print

        video_data.close()
        if outfile.tell() != total_size:
            old_offset = offset
            offset = outfile.tell()
            if old_offset == offset:
                time.sleep(1)
            print "Restarting download from", pp_size(offset)
        else:
            break

    outfile.close()


if __name__ == "__main__":
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
    if os.environ.has_key("SCRIPT_NAME"):
        cgimain()
    else:
        try:
            main()
        except KeyboardInterrupt:
            print "\nExiting..."
            sys.exit(1)
