changed script prefix/suffix
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/python2
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Address-space ceiling of 128 MiB; only referenced by the (currently
# commented-out) resource.setrlimit() call at the bottom of the file.
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# Browser identity sent with every outgoing HTTP request.
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) "
    "Gecko/20100101 Firefox/15.0.1"
)

# Container MIME types we are willing to download -> file extension.
MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

# Quality labels listed from worst to best; the rank (1 = worst) is the
# label's position in this tuple.
_QUALITY_ORDER = ("small", "medium", "large", "hd720", "hd1080")
QUALITIES = dict(
    (label, rank) for rank, label in enumerate(_QUALITY_ORDER, 1)
)
37
38
class VideoUnavailable(Exception):
    """Raised when a page yields no downloadable video.

    The exception message is the human-readable reason.
    """
41
def print_form(url="", msg=""):
    """Write the complete CGI response for the download form to stdout.

    url -- text pre-filled into the URL input box.  It is escaped here
           because it may come straight from the query string.
    msg -- XHTML fragment inserted above the form.  It is inserted
           verbatim, so the caller must escape any untrusted text in it.

    Reads HTTP_HOST and REQUEST_URI from the environment to build the
    bookmarklet link; raises KeyError if either is missing.
    """
    # Function-scope import: escape() is only needed by this function.
    from xml.sax.saxutils import escape

    # escape() handles &, < and >; the double quote must be added because
    # both values are placed inside double-quoted attributes.
    attr_entities = {'"': "&quot;"}

    # REQUEST_URI still carries the current query string.  Drop it so the
    # bookmarklet can append its own "?url=" parameter.
    script_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], script_path)

    # The page is served as XML, so one unescaped "&" or "<" would make the
    # whole document ill-formed, not merely render oddly.
    safe_url = escape(url, attr_entities)
    safe_script_url = escape(script_url, attr_entities)

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    sys.stdout.write("""
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
""".replace("{0}", msg).replace("{1}", safe_url).replace("{2}", safe_script_url))
76
# Shared HTTP state for the whole script: a single cookie-aware opener, and
# the first URL ever requested, which is sent as the Referer afterwards.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
referrer = ""


def urlopen(url, offset=None):
    """Open url with the shared opener and return the response object.

    url    -- absolute URL; a protocol-relative "//host/..." URL is
              fetched over http.
    offset -- when truthy, request the body starting at this byte offset
              using a Range header.

    Raises IOError when the response does not start at the requested
    offset (including a server that ignores the Range header), since
    appending such a body to a partial file would corrupt it.  Errors from
    the opener itself (e.g. urllib2.HTTPError) propagate unchanged.
    """
    global referrer

    if url.startswith("//"):
        url = "http:" + url

    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    else:
        # The very first request goes out without a Referer and becomes
        # the Referer of every later request.
        referrer = url
    req.add_header("User-Agent", USER_AGENT)
    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    # Explicit checks rather than assert: these validate what the server
    # sent and must not vanish when Python runs with -O.
    content_range = res.info().getheader("Content-Range")
    if content_range:
        # Expected shape: "bytes <start>-<end>/<total>"
        tokens = content_range.split()
        try:
            if tokens[0] != "bytes":
                raise ValueError(content_range)
            start = int(tokens[1].split("-")[0])
        except (IndexError, ValueError):
            res.close()
            raise IOError("Unexpected Content-Range header: %r" % content_range)
        if start != (offset or 0):
            res.close()
            raise IOError(
                "Server returned data from byte %d, expected %d"
                % (start, offset or 0)
            )
    elif offset:
        # No Content-Range means the server sent the whole body from byte 0.
        res.close()
        raise IOError("Server ignored the Range request for offset %d" % offset)

    return res
106
def parse_url(url):
    """Fetch url and return it parsed as an lxml HTML document tree.

    The page is decoded as UTF-8 with a recovering parser, so broken
    markup does not abort the parse.  The HTTP response is closed even if
    parsing raises.
    """
    f = urlopen(url)
    try:
        return html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    finally:
        f.close()
112
113 def append_to_qs(url, params):
114 r = list(urlparse.urlsplit(url))
115 qs = urlparse.parse_qs(r[3])
116 qs.update(params)
117 r[3] = urllib.urlencode(qs, True)
118 url = urlparse.urlunsplit(r)
119 return url
120
def get_player_config(doc):
    """Return the decoded "ytplayer.config" object found in doc, or None.

    doc -- parsed page; every <script> element's text is scanned line by
           line for the assignment "ytplayer.config = {...}".

    The first line whose object literal decodes as JSON wins.  Lines where
    it does not decode are skipped.
    """
    marker = "ytplayer.config = {"
    decoder = json.JSONDecoder()

    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            pos = line.find(marker)
            if pos < 0:
                continue
            # Start on the opening brace, which is the marker's last char.
            start = pos + len(marker) - 1
            try:
                # raw_decode stops at the end of the JSON value, so a "};"
                # inside a string value cannot truncate the config and any
                # JavaScript following the object is ignored.
                config, _end = decoder.raw_decode(line[start:])
            except ValueError:
                continue
            return config

    return None
133
def extract_js(script):
    """Return the body of the player script without its wrapper.

    script -- full text of the player JavaScript, which must begin with
              PREFIX and end with SUFFIX below.

    Raises ValueError when the wrapper is not the expected one.
    """
    PREFIX = "var _yt_player={};(function(g){var window=this;"
    SUFFIX = ";})(_yt_player);\n"

    # Explicit checks rather than assert: this validates downloaded data
    # and must still run when Python is started with -O.
    if not script.startswith(PREFIX):
        raise ValueError("Player script does not start with the expected prefix")
    if not script.endswith(SUFFIX):
        raise ValueError("Player script does not end with the expected suffix")

    return script[len(PREFIX):-len(SUFFIX)]
141
def find_func_name(script):
    """Return the name of the function the player script applies to ".s".

    script -- text of the player JavaScript.

    Finds the first call shaped like "<name>(<ident>.s);" and returns
    <name>.  Raises ValueError when the script contains no such call.
    """
    FUNC_NAME = r"([a-zA-Z0-9$]+)"
    FUNC_PARAMS = r"(\([a-zA-Z]+\.s\))"
    PATTERN = FUNC_NAME + FUNC_PARAMS + ";"

    match = re.search(PATTERN, script)
    if match is None:
        raise ValueError("Could not find the signature function in the player script")
    return match.group(1)
150
def decode_signature(js_url, signature):
    """Transform an encoded stream signature with the player's own code.

    js_url    -- URL of the player JavaScript.
    signature -- encoded signature string taken from the stream data.

    Downloads the player script, then runs it in a nodejs "vm" context
    together with a call to the signature function, and returns whatever
    that call produced (as printed by node, whitespace-stripped).

    Raises Exception if nodejs exits with a non-zero status.  Requires a
    "nodejs" executable on PATH.
    """
    js_response = urlopen(js_url)
    try:
        script = js_response.read()
    finally:
        js_response.close()
    func_name = find_func_name(script)

    params = {
        # Safe to splice in raw: find_func_name only matches [a-zA-Z0-9$]+.
        "func_name": func_name,
        # json.dumps turns these into JavaScript string literals.
        "signature": json.dumps(signature),
        "code": json.dumps(extract_js(script)),
    }
    js_decode_script = ("""
        var vm = require('vm');

        var sandbox = {
            location: {
                hash: '',
                href: '',
                protocol: 'http:'
            },
            history: {
                pushState: function(){}
            },
            document: {},
            navigator: {},
            signature: %(signature)s,
            transformed_signature: null
        };
        sandbox.window = sandbox;

        var code_string = %(code)s + ';';
        var exec_string = 'transformed_signature = %(func_name)s(signature);';
        vm.runInNewContext(code_string + exec_string, sandbox);

        console.log(sandbox.transformed_signature);
    """ % params)

    # Argument list instead of shell=True: no shell is needed to start node.
    p = subprocess.Popen(
        ["nodejs"],
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE
    )
    # communicate() feeds stdin, drains stdout and waits for exit in one
    # step, so neither pipe can block the other.
    output, _unused = p.communicate(js_decode_script)
    if p.returncode != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return output.strip()
201
def get_best_video(player_config):
    """Pick the highest-quality supported stream from a player config.

    player_config -- decoded player config; reads
                     ["args"]["url_encoded_fmt_stream_map"] (comma
                     separated, each entry query-string encoded) and
                     ["assets"]["js"].

    Returns (url, extension), or (None, None) when no entry has both a
    quality listed in QUALITIES and a MIME type listed in MIMETYPES.
    Entries flagged "stereo3d" are ignored.  Among entries of equal
    quality the last one listed wins.
    """
    stream_map = player_config["args"]["url_encoded_fmt_stream_map"]
    js_url = player_config["assets"]["js"]

    best_rank = None
    best_extension = None
    best_fields = None
    for raw_entry in stream_map.split(","):
        fields = urlparse.parse_qs(raw_entry)
        if "stereo3d" in fields:
            continue
        try:
            mimetype = fields["type"][0].split(";")[0]
            quality = fields["quality"][0]
            fields["url"][0]
        except KeyError:
            # Empty or malformed entry (e.g. an empty stream map).
            continue
        if quality not in QUALITIES or mimetype not in MIMETYPES:
            continue

        rank = QUALITIES[quality]
        if best_rank is not None and rank < best_rank:
            continue
        best_rank = rank
        best_extension = MIMETYPES[mimetype]
        best_fields = fields

    if best_fields is None:
        return None, None

    # Resolve the signature for the winner only: decoding downloads the
    # player script and starts nodejs, which is wasted on discarded entries.
    video_url = best_fields["url"][0]
    if "sig" in best_fields:
        signature = best_fields["sig"][0]
    elif "s" in best_fields:
        signature = decode_signature(js_url, best_fields["s"][0])
    else:
        signature = None

    if signature:
        video_url = append_to_qs(video_url, {"signature": signature})

    return video_url, best_extension
243
def sanitize_filename(filename):
    """Return filename made safe to use as a single path component.

    NUL bytes become spaces, surrounding whitespace is removed, every run
    of whitespace collapses to one space, and both kinds of path
    separator ("/" and "\\") become "-".
    """
    # Replace NUL before trimming and collapsing, so a NUL next to
    # whitespace (or at either end) cannot leave doubled or trailing spaces.
    cleaned = filename.replace("\0", " ").strip()
    cleaned = re.sub(r"\s+", " ", cleaned)
    return cleaned.replace("\\", "-").replace("/", "-")
251
def get_video_url(doc):
    """Return (video_url, filename) for the video on a parsed watch page.

    doc -- parsed page as returned by parse_url().

    The filename is the sanitized page title plus the extension of the
    chosen stream.

    Raises VideoUnavailable when the page carries an "unavailable"
    message, when no player config is found, or when the config lists no
    stream in a supported format.
    """
    unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
    if unavailable:
        raise VideoUnavailable(unavailable[0].strip())

    player_config = get_player_config(doc)
    if not player_config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(player_config)
    if not video_url:
        # Raise instead of returning (None, None): the callers go on to
        # use both values and would fail with an unrelated AttributeError.
        raise VideoUnavailable("Could not find a video in a supported format")

    titles = doc.xpath("/html/head/title/text()")
    # Fall back to a fixed name rather than failing on a missing <title>.
    title = titles[0] if titles else "video"
    filename = sanitize_filename(title) + "." + extension

    return video_url, filename
270
def write_video(filename, video_data):
    """Stream a video to stdout as a CGI file-download response.

    filename   -- unicode name offered to the browser; sent UTF-8
                  percent-encoded in Content-Disposition.
    video_data -- open HTTP response to copy from.  It is always closed,
                  even when the copy fails part-way.
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)

        # Only forward the length when upstream supplied one; otherwise the
        # literal text "None" would be sent as the header value.
        content_length = httpinfo.getheader("Content-Length")
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)

        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
279
def cgimain():
    """Handle one CGI request.

    Without a "url" query parameter the empty form is shown.  Otherwise
    the video is looked up and streamed back as a download; on any
    failure the form is shown again with an error message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        if not video_url:
            # Report a missing stream by name instead of letting
            # urlopen(None) fail with an AttributeError.
            raise VideoUnavailable("Could not find video URL")
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        # args[0] is the message passed to the exception; unlike the
        # deprecated .message attribute it is always available.
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception:
        # Top-level boundary: never show a traceback to the browser.
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
        return
304
def pp_size(size):
    """Format a byte count (or bytes-per-second rate) for display.

    size -- non-negative number of bytes.

    Returns the value with two decimals and a binary suffix, e.g.
    "1.50 KiB".  Values below 1024 get an empty suffix, so the result
    ends with a space.  GiB is the largest unit: bigger values are shown
    as a GiB count of 1024 or more.
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    largest = len(suffixes) - 1
    for i, suffix in enumerate(suffixes):
        # Stop dividing at the largest unit; otherwise values of 1 TiB or
        # more are scaled down once more while still labelled GiB.
        if size < 1024 or i == largest:
            break
        size /= 1024.0
    return "%.2f %s" % (size, suffix)
312
def copy_with_progress(content_length, infile, outfile):
    """Copy infile to outfile in 32 KiB chunks, showing progress on stdout.

    content_length -- byte count displayed as the expected total.
    infile         -- object with read(n); an empty read ends the copy.
    outfile        -- object with write(data).

    The status line is redrawn in place at most about twice a second and
    once more at the end, followed by a newline.  The displayed rate
    covers only the bytes read since the previous redraw.
    """
    def show(moment, window_start, window_bytes, total_bytes):
        # Draw one status line from the counters passed in.
        speed = 0
        if moment != window_start:
            speed = window_bytes / (moment - window_start)
        # Erase the current terminal line and return to column 0.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(total_bytes),
            pp_size(content_length),
            pp_size(speed),
        ))
        sys.stdout.flush()

    window_start = 0
    window_bytes = 0
    copied = 0
    while True:
        moment = time.time()
        if moment - window_start > 0.5:
            show(moment, window_start, window_bytes, copied)
            window_start = moment
            window_bytes = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        chunk_len = len(chunk)
        window_bytes += chunk_len
        copied += chunk_len

    # Final redraw with the finished totals, then end the status line.
    show(moment, window_start, window_bytes, copied)
    print
346
347 def main():
348 try:
349 url = sys.argv[1]
350 except:
351 print >>sys.stderr, "Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0]
352 sys.exit(1)
353
354 doc = parse_url(url)
355 video_url, filename = get_video_url(doc)
356 print "Downloading", filename.encode("utf-8")
357
358 outfile = open(filename, "a")
359 offset = outfile.tell()
360 if offset > 0:
361 print "Resuming download from", pp_size(offset)
362 total_size = None
363
364 while True:
365 try:
366 video_data = urlopen(video_url, offset)
367 except urllib2.HTTPError, e:
368 if e.code == 416:
369 print "File is complete!"
370 break
371 else:
372 raise
373
374 content_length = int(video_data.info().getheader("Content-Length"))
375 if total_size is None:
376 total_size = content_length
377
378 try:
379 copy_with_progress(content_length, video_data, outfile)
380 except IOError, e:
381 print
382
383 video_data.close()
384 if outfile.tell() != total_size:
385 old_offset = offset
386 offset = outfile.tell()
387 if old_offset == offset:
388 time.sleep(1)
389 print "Restarting download from", pp_size(offset)
390 else:
391 break
392
393 outfile.close()
394
395
396 if __name__ == "__main__":
397 ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
398 if os.environ.has_key("SCRIPT_NAME"):
399 cgimain()
400 else:
401 try:
402 main()
403 except KeyboardInterrupt:
404 print "\nExiting..."
405 sys.exit(1)
406