]> code.delx.au - youtube-cgi/blob - youtube.cgi
2ad3b70de18c9109f02225a2a10de707592f016d
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/python2
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Memory ceiling in bytes (128 MiB).
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# User-Agent header value sent with every outgoing HTTP request.
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) "
    "Gecko/20100101 Firefox/15.0.1"
)

# Accepted stream MIME types, mapped to the file extension used when saving.
MIMETYPES = dict([
    ("video/mp4", "mp4"),
    ("video/x-flv", "flv"),
    ("video/3gpp", "3gp"),
])

# Accepted quality labels, mapped to a rank; a larger rank is preferred.
QUALITIES = dict(
    (label, rank)
    for rank, label in enumerate(
        ["small", "medium", "large", "hd720", "hd1080"], 1
    )
)
37
38
class VideoUnavailable(Exception):
    """Signals that no video could be obtained from the requested page.

    The first argument is a human-readable explanation.
    """
41
def print_form(url="", msg=""):
    """Write the download form to stdout as a complete CGI response.

    url -- text pre-filled into the URL input box.  It is XML-escaped here
           because it normally echoes what the user submitted.
    msg -- XHTML fragment displayed above the form.  It is inserted
           verbatim, so the caller must already have escaped any text in it.

    HTTP_HOST and REQUEST_URI are read from the environment to build the
    address the bookmarklet points at; a KeyError is raised if either is
    missing.
    """
    from xml.sax.saxutils import escape

    # escape() handles &, < and >; double quotes are added because both
    # values end up inside double-quoted attributes.
    attr_entities = {'"': "&quot;"}

    # REQUEST_URI includes the query string of the current request.  Drop it,
    # otherwise the bookmarklet would contain two "?url=" parts.
    script_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], script_path)

    substitutions = {
        "0": msg,
        "1": escape(url, attr_entities),
        "2": escape(script_url, attr_entities),
    }

    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    # One pass over the template, so text that has just been substituted is
    # never scanned again for further placeholders.
    sys.stdout.write(
        re.sub(r"\{([012])\}", lambda m: substitutions[m.group(1)], template)
    )
76
77 cookiejar = cookielib.CookieJar()
78 urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
79 referrer = ""
80
def urlopen(url, offset=None):
    """Open url through the shared opener and return the response object.

    url    -- URL to fetch; a protocol-relative "//host/path" value is
              fetched over http.
    offset -- optional byte position to start from.  A non-zero value adds
              a "Range: bytes=<offset>-" header to the request.

    The first URL ever requested is stored in the module-level ``referrer``
    and sent as the Referer header of all later requests.

    Raises ValueError when the response does not start at the requested
    offset (including a server that ignores the Range header altogether).
    Errors raised by the opener, e.g. urllib2.HTTPError, propagate unchanged.
    """
    global referrer

    if url.startswith("//"):
        url = "http:" + url

    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    else:
        referrer = url
    req.add_header("User-Agent", USER_AGENT)
    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    # Validate with real exceptions rather than assert, which disappears
    # when Python runs with -O.
    problem = None
    content_range = res.info().getheader("Content-Range")
    if content_range:
        # Expected shape: "bytes <start>-<end>/<total>".
        tokens = content_range.split()
        try:
            unit = tokens[0]
            start = int(tokens[1].split("-")[0])
        except (IndexError, ValueError):
            problem = "malformed Content-Range: %r" % content_range
        else:
            if unit != "bytes" or start != (offset or 0):
                problem = "unexpected Content-Range: %r" % content_range
    elif offset:
        # No Content-Range means the body starts at byte 0; appending it to
        # a partial file would corrupt the download.
        problem = "server ignored the Range request"

    if problem:
        res.close()
        raise ValueError(problem)
    return res
106
def parse_url(url):
    """Fetch url and return the page parsed by lxml.html.

    The page is parsed as UTF-8 with the parser's recover mode enabled.
    The HTTP response is closed even if parsing raises.
    """
    f = urlopen(url)
    try:
        parser = html.HTMLParser(encoding="utf-8", recover=True)
        return html.parse(f, parser)
    finally:
        f.close()
112
113 def append_to_qs(url, params):
114 r = list(urlparse.urlsplit(url))
115 qs = urlparse.parse_qs(r[3])
116 qs.update(params)
117 r[3] = urllib.urlencode(qs, True)
118 url = urlparse.urlunsplit(r)
119 return url
120
def get_player_config(doc):
    """Return the object assigned to ``ytplayer.config`` in the page scripts.

    doc -- parsed page; only ``doc.xpath("//script")`` and the ``text`` of
           each returned element are used.

    Every line of every script is searched for "ytplayer.config = {" and the
    JSON object starting at that brace is decoded.  Returns the decoded
    object, or None when no line yields one.
    """
    marker = "ytplayer.config = {"
    decoder = json.JSONDecoder()

    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            pos = line.find(marker)
            if pos < 0:
                continue
            # The marker's last character is the object's opening brace.
            start = pos + len(marker) - 1
            try:
                # raw_decode stops where the JSON value ends, so a "};"
                # inside a string value cannot cut the object short.
                config, _end = decoder.raw_decode(line[start:])
            except ValueError:
                # Incomplete or invalid object on this line; keep looking.
                continue
            return config

    return None
133
def extract_js(script):
    """Return the player script with its outer wrapper removed.

    script -- full text of the player JavaScript file.

    The text must begin and end with the exact wrapper strings below; what
    lies between them is returned.  Raises ValueError otherwise.  (An
    explicit exception is used because assert is skipped under -O.)
    """
    prefix = "var _yt_player={};(function(g){var window=this;"
    suffix = ";})(_yt_player);\n"

    if not script.startswith(prefix):
        raise ValueError("player script does not start with the expected wrapper")
    if not script.endswith(suffix):
        raise ValueError("player script does not end with the expected wrapper")

    return script[len(prefix):-len(suffix)]
141
def find_func_name(script):
    """Return the name of the function that is called with ``<ident>.s``.

    script -- JavaScript source text to search.

    The first occurrence of ``NAME(x.s);`` is located, where NAME consists of
    letters, digits and "$" and x of letters; NAME is returned.
    Raises ValueError when the script contains no such call.
    """
    func_name = r"([a-zA-Z0-9$]+)"
    func_params = r"(\([a-zA-Z]+\.s\))"
    pattern = func_name + func_params + ";"

    match = re.search(pattern, script)
    if match is None:
        raise ValueError("signature function call not found in player script")
    return match.group(1)
150
def decode_signature(js_url, signature):
    """Transform an encoded signature by running the player's own JavaScript.

    js_url    -- URL of the player script; it is downloaded on every call.
    signature -- encoded signature string taken from the stream data.

    The downloaded script is unwrapped with extract_js(), the transforming
    function is located with find_func_name(), and both are evaluated by
    the ``nodejs`` executable inside a ``vm`` context.  Whatever the child
    prints is returned with surrounding whitespace removed.

    Raises Exception if nodejs exits with a non-zero status, and OSError if
    the executable cannot be started.
    """
    res = urlopen(js_url)
    try:
        script = res.read()
    finally:
        res.close()
    func_name = find_func_name(script)

    params = {
        "func_name": func_name,
        # json.dumps yields valid JavaScript string literals.
        "signature": json.dumps(signature),
        "code": json.dumps(extract_js(script)),
    }
    # Build the program text before spawning, so that a failure here cannot
    # leave a child process behind.
    js_decode_script = ("""
        var vm = require('vm');

        var sandbox = {
            location: {
                hash: '',
                href: '',
                protocol: 'http:'
            },
            history: {
                pushState: function(){}
            },
            document: {},
            navigator: {
                userAgent: ''
            },
            signature: %(signature)s,
            transformed_signature: null,
            g: function(){} // this is _yt_player
        };
        sandbox.window = sandbox;

        var code_string = %(code)s + ';';
        var exec_string = 'transformed_signature = %(func_name)s(signature);';
        vm.runInNewContext(code_string + exec_string, sandbox);

        console.log(sandbox.transformed_signature);
    """ % params)

    # Argument list without a shell: nothing here needs shell interpretation.
    p = subprocess.Popen(
        ["nodejs"],
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    # communicate() feeds stdin, drains stdout and waits, in one step.
    output, _unused = p.communicate(js_decode_script)
    if p.returncode != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return output.strip()
204
def get_best_video(player_config):
    """Choose the highest ranked usable stream from a player configuration.

    player_config -- decoded player configuration;
                     ``args.url_encoded_fmt_stream_map`` is read, and
                     ``assets.js`` only when a signature must be decoded.

    A stream is usable when it is not flagged "stereo3d", its quality label
    is in QUALITIES and its MIME type is in MIMETYPES.  Among equally ranked
    streams the one listed last wins.  Entries lacking a type, quality or
    url field are skipped.

    Returns (url, extension), or (None, None) when no stream is usable.
    """
    url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")

    best_data = None
    best_quality = None
    best_extension = None
    for raw_entry in url_data_list:
        url_data = urlparse.parse_qs(raw_entry)

        if "stereo3d" in url_data or "url" not in url_data:
            continue
        try:
            mimetype = url_data["type"][0].split(";")[0]
            quality_label = url_data["quality"][0]
        except KeyError:
            # Malformed or empty entry.
            continue
        if quality_label not in QUALITIES or mimetype not in MIMETYPES:
            continue

        quality = QUALITIES[quality_label]
        if best_quality is not None and quality < best_quality:
            continue

        best_data = url_data
        best_quality = quality
        best_extension = MIMETYPES[mimetype]

    if best_data is None:
        return None, None

    # Resolve the signature for the winner only: decoding means downloading
    # the player script and running an external process.
    video_url = best_data["url"][0]
    if "sig" in best_data:
        signature = best_data["sig"][0]
    elif "s" in best_data:
        js_url = player_config["assets"]["js"]
        signature = decode_signature(js_url, best_data["s"][0])
    else:
        signature = None

    if signature:
        video_url = append_to_qs(video_url, {"signature": signature})

    return video_url, best_extension
246
def sanitize_filename(filename):
    """Turn arbitrary text into something usable as a file name.

    Surrounding whitespace is removed and inner whitespace runs collapse to
    one space; then backslashes and slashes become "-" and NUL characters
    become a space.
    """
    cleaned = re.sub(r"\s+", " ", filename.strip())
    for unsafe, substitute in (("\\", "-"), ("/", "-"), ("\0", " ")):
        cleaned = cleaned.replace(unsafe, substitute)
    return cleaned
254
def get_video_url(doc):
    """Work out the download URL and target file name for a parsed page.

    doc -- parsed video page.

    Returns (video_url, filename), where filename is the sanitized page
    title plus the stream's extension, or (None, None) when
    get_best_video() finds no stream.

    Raises VideoUnavailable when the page has text inside its
    "unavailable-message" div, or when no player configuration is found.
    """
    notices = doc.xpath("//div[@id='unavailable-message']/text()")
    if notices:
        raise VideoUnavailable(notices[0].strip())

    config = get_player_config(doc)
    if not config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(config)
    if video_url:
        page_title = doc.xpath("/html/head/title/text()")[0]
        return video_url, sanitize_filename(page_title) + "." + extension

    return None, None
273
def write_video(filename, video_data):
    """Stream an opened video response to stdout as a CGI download.

    filename   -- name offered to the browser; sent UTF-8 percent-encoded in
                  the Content-Disposition header.
    video_data -- open response object; always closed before returning.

    Content-Type is passed on from the upstream response (falling back to
    application/octet-stream) because CGI requires one whenever a body is
    sent.  Content-Length is passed on only if upstream supplied it.
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))

        content_type = httpinfo.getheader("Content-Type") or "application/octet-stream"
        sys.stdout.write("Content-Type: %s\r\n" % content_type)
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)

        content_length = httpinfo.getheader("Content-Length")
        if content_length is not None:
            # Without this guard the literal text "None" would be sent.
            sys.stdout.write("Content-Length: %s\r\n" % content_length)

        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
282
def cgimain():
    """CGI entry point: send the requested video, or show the form.

    Without a "url" request parameter the empty form is shown.  Otherwise
    the page is fetched and the best stream is written to stdout as a
    download.  A VideoUnavailable error is reported on the form with its
    message; any other failure is reported with a generic message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        if not video_url:
            # get_video_url() returns (None, None) when no stream qualifies.
            raise VideoUnavailable("No downloadable video format found")
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception:
        # Top-level boundary: never expose a traceback to the browser.
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
307
def pp_size(size):
    """Format a byte count for display using binary units.

    size -- non-negative number of bytes.

    Returns the value with two decimals followed by a space and the unit
    ("", "KiB", "MiB" or "GiB").  Values beyond the GiB range stay in GiB
    rather than being divided again under the wrong label.
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    largest = len(suffixes) - 1
    for index, suffix in enumerate(suffixes):
        # Stop at the largest unit, so number and label always agree.
        if size < 1024 or index == largest:
            break
        # Float divisor: no dependence on "from __future__ import division".
        size /= 1024.0
    return "%.2f %s" % (size, suffix)
315
def copy_with_progress(content_length, infile, outfile):
    """Copy infile to outfile in 32 KiB reads, showing progress on stdout.

    content_length -- size shown as the total in the status line.
    infile         -- object to read from until it returns nothing.
    outfile        -- object the data is written to.

    The status line is redrawn whenever more than half a second has passed
    since the previous redraw, and once more after the copy, followed by a
    newline.
    """
    def print_status():
        # Rate covers only the bytes counted since the previous redraw.
        if now == window_start:
            rate = 0
        else:
            rate = window_bytes / (now - window_start)
        # ESC[2K clears the terminal line; \r returns to its first column.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(total_bytes),
            pp_size(content_length),
            pp_size(rate),
        ))
        sys.stdout.flush()

    window_start = 0
    window_bytes = 0
    total_bytes = 0
    while True:
        now = time.time()
        if now - window_start > 0.5:
            print_status()
            window_start, window_bytes = now, 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        received = len(chunk)
        window_bytes += received
        total_bytes += received

    # Final status, then move off the status line.
    print_status()
    sys.stdout.write("\n")
349
def main():
    """Command-line entry point: download the video at sys.argv[1].

    The file is named after the page title and written to the current
    directory.  If it already exists the download resumes from its current
    size, and an interrupted transfer is restarted from the bytes already
    written until the file is complete.

    Exits with status 1 when no URL is given or no stream can be found.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        # get_video_url() returns (None, None) when no stream qualifies.
        sys.stderr.write("No downloadable video format found\n")
        sys.exit(1)
    print("Downloading %s" % filename.encode("utf-8"))

    # Binary append: existing bytes are kept and no newline translation occurs.
    outfile = open(filename, "ab")
    try:
        # The position right after opening in append mode is platform
        # dependent, so move to the end explicitly before asking for it.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            print("Resuming download from %s" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                # 416: the requested range starts at or past the end.
                if e.code == 416:
                    print("File is complete!")
                    break
                raise

            try:
                content_length = int(video_data.info().getheader("Content-Length"))
                if total_size is None:
                    # Content-Length only covers the bytes from offset on.
                    total_size = offset + content_length

                try:
                    copy_with_progress(content_length, video_data, outfile)
                except IOError:
                    # Transfer broke off; finish the status line and retry.
                    sys.stdout.write("\n")
            finally:
                video_data.close()

            position = outfile.tell()
            if position == total_size:
                break
            if position == offset:
                # No progress at all: pause before trying again.
                time.sleep(1)
            offset = position
            print("Restarting download from %s" % pp_size(offset))
    finally:
        outfile.close()
397
398
if __name__ == "__main__":
    # Memory limit, currently switched off:
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))

    # SCRIPT_NAME in the environment selects the CGI code path.
    running_as_cgi = "SCRIPT_NAME" in os.environ
    if not running_as_cgi:
        try:
            main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(1)
    else:
        cgimain()
409