]> code.delx.au - youtube-cgi/blob - youtube.cgi
dfd5d28ad885d2a5bcd6b3626a901080fbac02a1
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/python2
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Size of the address-space cap (128 MiB) used by the resource.setrlimit()
# call in the script entry point (that call is currently commented out).
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# User-Agent header sent with every outgoing HTTP request.
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"

# Stream MIME types that are accepted, mapped to the file extension used for
# the downloaded file.
MIMETYPES = dict([
    ("video/mp4", "mp4"),
    ("video/x-flv", "flv"),
    ("video/3gpp", "3gp"),
])

# Quality labels that are accepted, mapped to a rank (higher is better).
QUALITIES = dict(
    hd1080=5,
    hd720=4,
    large=3,
    medium=2,
    small=1,
)
37
38
class VideoUnavailable(Exception):
    """Raised when no downloadable video can be obtained from a page.

    The exception message is shown to the user by the CGI front end.
    """
41
def print_form(url="", msg=""):
    """Write the download form to stdout as a complete CGI response.

    url -- text pre-filled into the URL input box.  It is treated as
           untrusted and escaped before being placed in the attribute.
    msg -- XHTML fragment inserted verbatim above the form.  Callers are
           responsible for escaping any untrusted text they embed in it.

    Reads HTTP_HOST and REQUEST_URI from the environment (KeyError if either
    is missing) to build the bookmarklet link.
    """
    # Imported here because this is the only place that needs XML escaping.
    from xml.sax.saxutils import escape

    def attr_escape(text):
        # escape() handles & < >; the quotes are added so the result is safe
        # inside both single- and double-quoted attribute values.
        return escape(text, {'"': "&quot;", "'": "&#39;"})

    # REQUEST_URI includes the query string.  Drop it, otherwise the
    # bookmarklet would append a second "?url=" to an existing one.
    script_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], script_path)
    # The script URL lands inside a single-quoted JavaScript string, so
    # escape it for JavaScript first and for the XML attribute second.
    js_script_url = script_url.replace("\\", "\\\\").replace("'", "\\'")

    values = {
        "0": msg,
        "1": attr_escape(url),
        "2": attr_escape(js_script_url),
    }
    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""
    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    # Substitute all placeholders in a single pass so that placeholder-like
    # text inside one value is never re-substituted by a later replacement.
    sys.stdout.write(
        re.sub(r"\{([012])\}", lambda match: values[match.group(1)], template)
    )
76
# Shared HTTP state used by urlopen().
#
# referrer: URL sent as the Referer header; empty until urlopen() records
# the first URL it is asked to fetch.
referrer = ""

# A single cookie jar and opener so that cookies persist across requests.
cookiejar = cookielib.CookieJar()
_cookie_handler = urllib2.HTTPCookieProcessor(cookiejar)
urlopener = urllib2.build_opener(_cookie_handler)
80
def urlopen(url, offset=None):
    """Open *url* with the shared cookie-aware opener and return the response.

    url    -- URL to fetch; a scheme-relative "//host/..." URL is fetched
              over http.
    offset -- if truthy, request the resource starting at this byte offset
              using a Range header.

    The first URL ever opened is remembered in the module-level ``referrer``
    and sent as the Referer header of every later request.

    Raises IOError if the Content-Range header of the response is malformed
    or does not start at the requested offset, or if a range was requested
    but the response carries no Content-Range header at all.  Errors raised
    by the opener itself (e.g. urllib2.HTTPError) propagate unchanged.
    """
    global referrer

    if url.startswith("//"):
        url = "http:" + url

    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    else:
        referrer = url
    req.add_header("User-Agent", USER_AGENT)
    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    # Validate with real exceptions rather than assert statements, which
    # are removed when Python runs with -O.
    content_range = res.info().getheader("Content-Range")
    if content_range:
        tokens = content_range.split()
        try:
            unit = tokens[0]
            start = int(tokens[1].split("-")[0])
        except (IndexError, ValueError):
            res.close()
            raise IOError("Malformed Content-Range header: %r" % content_range)
        if unit != "bytes" or start != (offset or 0):
            res.close()
            raise IOError(
                "Unexpected Content-Range %r for requested offset %r"
                % (content_range, offset)
            )
    elif offset:
        # Without Content-Range the body starts at byte 0; a caller that is
        # resuming would append the whole resource to its partial data.
        res.close()
        raise IOError("Server ignored the Range request for offset %d" % offset)

    return res
106
def parse_url(url):
    """Fetch *url* and return it parsed as an lxml HTML document.

    The page is decoded as UTF-8 with the parser's recover mode enabled.
    """
    f = urlopen(url)
    try:
        # try/finally (not "with") so the response is closed even when
        # parsing raises; the response object is only relied on for close().
        return html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    finally:
        f.close()
112
def append_to_qs(url, params):
    """Return *url* with *params* merged into its query string.

    url    -- URL whose query string is to be extended.
    params -- dict of parameter name to value (or list of values).  A name
              already present in the URL is replaced.

    The order of parameters in the result is not guaranteed.
    """
    parts = list(urlparse.urlsplit(url))
    # keep_blank_values=True: parameters such as "a=" that are already in
    # the URL must survive the round trip instead of being dropped.
    query = urlparse.parse_qs(parts[3], keep_blank_values=True)
    query.update(params)
    # doseq=True because parse_qs yields a list of values per name.
    parts[3] = urllib.urlencode(query, True)
    return urlparse.urlunsplit(parts)
120
def get_player_config(doc):
    """Return the ``ytplayer.config`` object embedded in a page, or None.

    doc -- parsed document; only ``doc.xpath("//script")`` is used, and each
           returned element must expose its content as ``.text``.

    Every line of every script is searched for ``ytplayer.config = {``; the
    JSON object starting at that brace is decoded and returned.  A candidate
    that does not decode as JSON is skipped.
    """
    marker = "ytplayer.config = {"
    decoder = json.JSONDecoder()

    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            found = line.find(marker)
            if found < 0:
                continue
            # Index of the opening brace, which is the marker's last char.
            start = found + len(marker) - 1
            try:
                # raw_decode stops at the end of the JSON value itself, so
                # a "};" inside a string value cannot truncate the object.
                config, _end = decoder.raw_decode(line, start)
            except ValueError:
                continue
            return config

    return None
133
def extract_js(script):
    """Return the body of the player script without its wrapper.

    script -- full text of the player JavaScript, which must start with
              PREFIX and end with SUFFIX below.

    Raises ValueError if the script is not wrapped as expected.
    """
    PREFIX = "var _yt_player={};(function(g){var window=this;"
    SUFFIX = ";})(_yt_player);\n"

    # Explicit checks instead of assert, which is removed under "python -O"
    # and would then let an unexpected script be sliced into garbage.
    if not script.startswith(PREFIX):
        raise ValueError("player script does not start with the expected prefix")
    if not script.endswith(SUFFIX):
        raise ValueError("player script does not end with the expected suffix")

    return script[len(PREFIX):-len(SUFFIX)]
141
def find_func_name(script):
    """Return the name of the function that the script calls as ``NAME(x.s);``.

    script -- JavaScript source text to search.

    The first occurrence of an identifier (letters, digits, ``$``) directly
    followed by ``(<letters>.s);`` is used.

    Raises ValueError if no such call is found.
    """
    FUNC_NAME = r"([a-zA-Z0-9$]+)"
    FUNC_PARAMS = r"(\([a-zA-Z]+\.s\))"
    PATTERN = FUNC_NAME + FUNC_PARAMS + ";"

    match = re.search(PATTERN, script)
    if match is None:
        raise ValueError("could not find the signature function call in script")
    return match.group(1)
150
def decode_signature(js_url, signature):
    """Transform *signature* using the function found in the player script.

    js_url    -- URL of the player JavaScript.
    signature -- the signature string to transform.

    The player script is downloaded, the transforming function is located
    with find_func_name(), and the function is run on the signature inside
    a ``vm`` context of a ``nodejs`` child process.  Returns what the child
    printed, stripped of surrounding whitespace.

    Raises Exception if nodejs exits with a non-zero status, and OSError if
    the nodejs executable cannot be started.
    """
    response = urlopen(js_url)
    try:
        script = response.read()
    finally:
        response.close()
    func_name = find_func_name(script)

    params = {
        "func_name": func_name,
        # JSON-encode so both values are valid JavaScript string literals.
        "signature": json.dumps(signature),
        "code": json.dumps(extract_js(script)),
    }
    js_decode_script = ("""
var vm = require('vm');

var sandbox = {
location: {
hash: '',
href: '',
protocol: 'http:'
},
history: {
pushState: function(){}
},
document: {},
navigator: {
userAgent: ''
},
signature: %(signature)s,
transformed_signature: null
};
sandbox.window = sandbox;

var code_string = %(code)s + ';';
var exec_string = 'transformed_signature = %(func_name)s(signature);';
vm.runInNewContext(code_string + exec_string, sandbox);

console.log(sandbox.transformed_signature);
""" % params)

    # Argument list instead of shell=True: no shell is needed to start a
    # single fixed program.
    p = subprocess.Popen(
        ["nodejs"],
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    # communicate() feeds stdin, drains stdout, closes both pipes and waits.
    stdout_data, _unused = p.communicate(js_decode_script)
    if p.returncode != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return stdout_data.strip()
203
def get_best_video(player_config):
    """Pick the best stream described by *player_config*.

    player_config -- decoded player configuration.  The stream list is read
                     from ``["args"]["url_encoded_fmt_stream_map"]`` and,
                     only when a signature has to be decoded, the player
                     script URL from ``["assets"]["js"]``.

    Streams flagged ``stereo3d``, streams whose quality is not in QUALITIES
    or whose MIME type is not in MIMETYPES, and entries lacking a url, type
    or quality field are ignored.  Of the rest, the highest-ranked quality
    wins; on a tie the later entry wins.

    Returns (url, extension), or (None, None) if no stream is usable.
    """
    url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")

    best_data = None
    best_quality = None
    best_extension = None
    for raw_entry in url_data_list:
        url_data = urlparse.parse_qs(raw_entry)

        if "stereo3d" in url_data:
            continue
        # An empty stream map yields one empty entry; skip it and any other
        # entry that lacks a required field instead of raising KeyError.
        if not ("url" in url_data and "type" in url_data and "quality" in url_data):
            continue

        mimetype = url_data["type"][0].split(";")[0]
        quality = QUALITIES.get(url_data["quality"][0])
        if quality is None or mimetype not in MIMETYPES:
            continue
        if best_quality is not None and quality < best_quality:
            continue

        best_data = url_data
        best_quality = quality
        best_extension = MIMETYPES[mimetype]

    if best_data is None:
        return None, None

    # Resolve the signature for the winner only: decoding downloads the
    # player script and starts a subprocess, so doing it for candidates
    # that are discarded afterwards is wasted work.
    video_url = best_data["url"][0]
    if "sig" in best_data:
        signature = best_data["sig"][0]
    elif "s" in best_data:
        js_url = player_config["assets"]["js"]
        signature = decode_signature(js_url, best_data["s"][0])
    else:
        signature = None

    if signature:
        video_url = append_to_qs(video_url, {"signature": signature})

    return video_url, best_extension
245
def sanitize_filename(filename):
    """Return *filename* made safe for use as a single path component.

    Surrounding whitespace is stripped and every run of whitespace becomes
    one space; afterwards backslashes and slashes are replaced by "-" and
    NUL characters by a space.
    """
    cleaned = re.sub(r"\s+", " ", filename.strip())
    for unsafe, substitute in (("\\", "-"), ("/", "-"), ("\0", " ")):
        cleaned = cleaned.replace(unsafe, substitute)
    return cleaned
253
def get_video_url(doc):
    """Return (video_url, filename) for the video on a parsed watch page.

    doc -- parsed document of the page.

    The filename is the sanitized page title plus the extension of the
    chosen stream; "video" is used if the page has no usable title.

    Raises VideoUnavailable if the page carries an unavailable-message
    element, if no player configuration is found, or if none of the offered
    streams is usable.
    """
    unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
    if unavailable:
        raise VideoUnavailable(unavailable[0].strip())

    player_config = get_player_config(doc)
    if not player_config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(player_config)
    if not video_url:
        # Raise instead of returning (None, None): every caller goes on to
        # use the result immediately and already copes with this exception.
        raise VideoUnavailable("Could not find a supported video format")

    titles = doc.xpath("/html/head/title/text()")
    title = titles[0] if titles else ""
    filename = sanitize_filename(title) or "video"
    filename += "." + extension

    return video_url, filename
272
def write_video(filename, video_data):
    """Stream an open HTTP response to stdout as a CGI file download.

    filename   -- name suggested to the browser; sent UTF-8 percent-encoded
                  in the Content-Disposition header.
    video_data -- open response providing info(), read() and close().

    The upstream Content-Type is forwarded ("application/octet-stream" if
    absent) and Content-Length is forwarded only when upstream sent one.
    video_data is always closed, even if copying fails.
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))

        # A CGI response that carries a body must include Content-Type.
        content_type = httpinfo.getheader("Content-Type") or "application/octet-stream"
        sys.stdout.write("Content-Type: %s\r\n" % content_type)
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)

        # Never emit "Content-Length: None" when upstream omitted the header.
        content_length = httpinfo.getheader("Content-Length")
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)

        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
281
def cgimain():
    """Handle one CGI request.

    Without a ``url`` query parameter the empty form is shown.  Otherwise
    the page at that URL is fetched and the video it refers to is streamed
    back as a download; on failure the form is shown again with an error
    message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    generic_error = "<p class='error'>Sorry, there was an error. Check your URL?</p>"

    # The URL comes straight from the visitor.  Only fetch web URLs so the
    # opener cannot be pointed at other schemes such as file: or ftp:.
    if not url.startswith(("http://", "https://", "//")):
        print_form(url=url, msg=generic_error)
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        # e.args[0] rather than the deprecated e.message attribute.
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception:
        # Keep the details out of the page but do not lose them: write the
        # traceback to stderr for whoever operates the script.
        import traceback
        traceback.print_exc(file=sys.stderr)
        print_form(url=url, msg=generic_error)
306
def pp_size(size):
    """Format a byte count for display, e.g. ``1536`` -> ``"1.50 KiB"``.

    size -- non-negative number of bytes (int or float).

    The value is scaled by powers of 1024 up to GiB; anything larger is
    still reported in GiB.  Counts below 1024 have an empty unit suffix.
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    index = 0
    # Stop dividing at the last suffix so that the number and the unit
    # always agree (1 TiB is "1024.00 GiB", not "1.00 GiB").
    while size >= 1024 and index < len(suffixes) - 1:
        size /= 1024.0
        index += 1
    return "%.2f %s" % (size, suffixes[index])
314
def copy_with_progress(content_length, infile, outfile):
    """Copy *infile* to *outfile* while showing progress on stdout.

    content_length -- expected number of bytes, used for display only.
    infile         -- object with read(size), read until it returns
                      nothing.
    outfile        -- object with write(data).

    The status line is redrawn in place whenever more than half a second
    has passed since the previous redraw, and once more at the end,
    followed by a newline.
    """
    def show_status(timestamp, window_start, window_bytes, total_bytes):
        # Transfer rate over the window that started at the last redraw.
        if timestamp != window_start:
            rate = window_bytes / (timestamp - window_start)
        else:
            rate = 0
        # ESC[2K erases the current line, "\r" returns to column zero.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(total_bytes),
            pp_size(content_length),
            pp_size(rate),
        ))
        sys.stdout.flush()

    window_start = 0
    window_bytes = 0
    total_bytes = 0
    while True:
        timestamp = time.time()
        if timestamp - window_start > 0.5:
            show_status(timestamp, window_start, window_bytes, total_bytes)
            window_start = timestamp
            window_bytes = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        chunk_size = len(chunk)
        window_bytes += chunk_size
        total_bytes += chunk_size

    # Final status, then move off the status line.
    show_status(timestamp, window_start, window_bytes, total_bytes)
    sys.stdout.write("\n")
348
def main():
    """Command-line entry point: download the video for the URL in argv[1].

    The video is written to a file named after the page title in the
    current directory.  If that file already exists the download resumes
    from its current size, and an interrupted transfer is retried from
    where it stopped until the file is complete.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        sys.stderr.write("Could not find a downloadable video\n")
        sys.exit(1)
    print("Downloading %s" % filename.encode("utf-8"))

    # Binary append mode: the payload is video data, not text.
    outfile = open(filename, "ab")
    try:
        # The position reported right after opening in append mode differs
        # between platforms, so seek to the end explicitly before asking.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            print("Resuming download from %s" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                # 416: the requested range starts at or past the end of the
                # resource, i.e. the local file already holds everything.
                if e.code == 416:
                    print("File is complete!")
                    break
                raise

            content_length = int(video_data.info().getheader("Content-Length"))
            if total_size is None:
                # Content-Length covers only the bytes from offset onwards,
                # so the size of the whole file includes what is on disk.
                total_size = offset + content_length

            try:
                copy_with_progress(content_length, video_data, outfile)
            except IOError:
                # Transfer interrupted: finish the status line and fall
                # through to the retry logic below.
                print("")
            finally:
                video_data.close()

            if outfile.tell() == total_size:
                break

            old_offset = offset
            offset = outfile.tell()
            if old_offset == offset:
                # No progress at all; pause before trying again.
                time.sleep(1)
            print("Restarting download from %s" % pp_size(offset))
    finally:
        outfile.close()
396
397
if __name__ == "__main__":
    # The memory cap below is currently disabled.
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
    # SCRIPT_NAME in the environment means the script is running as a CGI
    # program; otherwise it is being used from the command line.
    if "SCRIPT_NAME" in os.environ:
        cgimain()
    else:
        try:
            main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(1)
408