code.delx.au - youtube-cgi/blob - youtube.cgi
fixed downloader
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/python2
2
from __future__ import division

import cgi
import cookielib
import json
import os
import re
import resource
import shutil
import subprocess
import sys
import time
import urllib
import urllib2
import urlparse
from xml.sax.saxutils import escape as xml_escape

from lxml import html
19
20
# 128 MiB. Only referenced by the (disabled) resource.setrlimit() call in the
# script's entry point.
MAX_MEMORY_BYTES = 128 * 1024 ** 2

# User-Agent header sent with every outgoing HTTP request.
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) "
    "Gecko/20100101 Firefox/15.0.1"
)

# Stream MIME type -> file extension used for the saved video.  Streams with
# any other MIME type are ignored.
MIMETYPES = dict([
    ("video/mp4", "mp4"),
    ("video/x-flv", "flv"),
    ("video/3gpp", "3gp"),
])

# Quality label -> rank; a higher rank is preferred.  Streams with any other
# label are ignored.
QUALITIES = dict(
    (label, rank)
    for rank, label in enumerate(
        ("small", "medium", "large", "hd720", "hd1080"), 1)
)
37
38
class VideoUnavailable(Exception):
    """Raised when a page does not yield a downloadable video.

    The single constructor argument is a human-readable reason.
    """
41
def print_form(url="", msg=""):
    """Write the download form as a complete CGI response to stdout.

    Args:
        url: Text pre-filled into the URL input box.  It is treated as
            untrusted and XML-escaped before being placed in the attribute.
        msg: Markup inserted verbatim above the form (e.g. an error
            paragraph).  The caller is responsible for escaping it.

    Reads HTTP_HOST and REQUEST_URI from the environment to build the
    bookmarklet target; a missing variable raises KeyError.
    """
    # REQUEST_URI includes the query string of the current request.  Drop it,
    # otherwise the bookmarklet would produce "...?url=old?url=new".
    script_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], script_path)

    # The page is served as application/xhtml+xml, so a raw "&", "<" or '"'
    # in an attribute value would make the whole document unparseable.
    quote_entity = {'"': "&quot;"}
    substitutions = {
        "{0}": msg,
        "{1}": xml_escape(url, quote_entity),
        "{2}": xml_escape(script_url, quote_entity),
    }

    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""
    # Substitute all placeholders in one pass so that a "{1}" or "{2}"
    # occurring inside an inserted value is never expanded a second time.
    page = re.sub(
        r"\{[012]\}",
        lambda match: substitutions[match.group(0)],
        template,
    )

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    sys.stdout.write(page)
76
# Shared HTTP state: every request goes through one opener so that cookies
# set by an earlier response are sent with later requests.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
# URL of the first request made; sent as the Referer of all later requests.
referrer = ""


def urlopen(url, offset=None):
    """Open *url* with the shared cookie jar and browser-like headers.

    Args:
        url: Absolute URL; a scheme-relative "//host/path" URL is fetched
            over http.
        offset: Optional byte offset.  When truthy, a "Range: bytes=N-"
            header is sent to resume a download from that position.

    Returns:
        The open response object.  The caller must close it.

    Raises:
        IOError: The response carries a Content-Range header that is not in
            bytes or does not start at the requested offset.
        urllib2.URLError / urllib2.HTTPError: As raised by the opener.
    """
    global referrer

    if url.startswith("//"):
        url = "http:" + url

    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    else:
        # The very first request has no Referer; remember its URL instead.
        referrer = url
    req.add_header("User-Agent", USER_AGENT)
    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    content_range = res.info().getheader("Content-Range")
    if content_range:
        # No Range header is sent for offset None/0, so the body is expected
        # to start at byte 0 in that case.
        expected_start = offset or 0
        try:
            unit, byte_range = content_range.split()[:2]
            start = int(byte_range.split("-")[0])
        except ValueError:
            unit, start = None, None
        if unit != "bytes" or start != expected_start:
            # Explicit check instead of assert: asserts vanish under -O and
            # appending data from the wrong position would corrupt the file.
            res.close()
            raise IOError(
                "Unexpected Content-Range %r (wanted offset %d)"
                % (content_range, expected_start)
            )
    return res
106
def parse_url(url):
    """Fetch *url* and return it parsed as an lxml HTML document.

    The page is decoded as UTF-8 with the parser's recovery mode enabled.
    The HTTP response is closed even when parsing fails.
    """
    f = urlopen(url)
    try:
        return html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    finally:
        f.close()
112
113 def append_to_qs(url, params):
114 r = list(urlparse.urlsplit(url))
115 qs = urlparse.parse_qs(r[3])
116 qs.update(params)
117 r[3] = urllib.urlencode(qs, True)
118 url = urlparse.urlunsplit(r)
119 return url
120
def get_player_config(doc):
    """Extract the "ytplayer.config = {...}" object from a parsed page.

    Args:
        doc: Parsed document; only doc.xpath("//script") and the .text of
            each returned element are used.

    Returns:
        The decoded JSON object of the first line containing a decodable
        assignment, or None if there is none.
    """
    marker = "ytplayer.config = {"
    decoder = json.JSONDecoder()

    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            pos = line.find(marker)
            if pos < 0:
                continue
            # Start at the opening brace, which is the marker's last char.
            start = pos + len(marker) - 1
            try:
                # raw_decode stops at the brace that really closes the
                # object.  Searching for the first "};" instead would cut
                # the object short whenever that sequence appears inside a
                # JSON string value.
                config, _ = decoder.raw_decode(line[start:])
            except ValueError:
                continue
            return config
    return None
133
def extract_js(script):
    """Return the body of a script wrapped as "(function(){ ... })();".

    Args:
        script: Script text that must start with "(function(){" and end
            with "})();" followed by a newline.

    Raises:
        ValueError: The script is not wrapped as expected.  An explicit
            check is used because assert statements are removed under
            "python -O", which would silently return a mangled slice.
    """
    PREFIX = "(function(){"
    SUFFIX = "})();\n"
    if not (script.startswith(PREFIX) and script.endswith(SUFFIX)):
        raise ValueError("script is not wrapped in %r ... %r" % (PREFIX, SUFFIX))

    return script[len(PREFIX):-len(SUFFIX)]
141
def find_func_name(script):
    """Return the name of the function called as "NAME(x.s);" in *script*.

    The first occurrence of an identifier made of letters, digits and "$"
    that is immediately called with a single "<letters>.s" argument and
    followed by ";" is used.

    Raises:
        ValueError: No such call exists in the script.
    """
    FUNC_NAME = r"([a-zA-Z0-9$]+)"
    FUNC_PARAMS = r"(\([a-zA-Z]+\.s\))"
    PATTERN = FUNC_NAME + FUNC_PARAMS + ";"

    match = re.search(PATTERN, script)
    if match is None:
        # Without this check the failure surfaces as an opaque
        # AttributeError on None.
        raise ValueError("could not find the signature function call")
    return match.group(1)
150
def decode_signature(js_url, signature):
    """Transform *signature* by running the page's own JavaScript in nodejs.

    The script at *js_url* is downloaded, the name of its signature function
    is located with find_func_name(), and the unwrapped script is evaluated
    in a node "vm" context where that function is applied to *signature*.

    NOTE(review): this executes code downloaded from the network.  The node
    documentation states that the vm module is not a security mechanism, so
    the stubbed sandbox object should not be relied on for isolation.

    Args:
        js_url: URL of the player script.
        signature: The encoded signature string.

    Returns:
        The text printed by the node process, stripped of surrounding
        whitespace.

    Raises:
        Exception: nodejs could not be started or exited with a non-zero
            status.
    """
    script = urlopen(js_url).read()
    func_name = find_func_name(script)

    # Values are embedded as JSON literals, which are valid JavaScript
    # literals, so no manual quoting is needed.
    params = {
        "func_name": func_name,
        "signature": json.dumps(signature),
        "code": json.dumps(extract_js(script)),
    }
    js_decode_script = ("""
var vm = require("vm");

var sandbox = {
window: {
location: {},
history: {
pushState: function(){}
},
navigator: {}
},
document: {},
navigator: {},
signature: %(signature)s,
transformed_signature: null
};

var execstring = ";transformed_signature = %(func_name)s(signature);";
vm.runInNewContext(%(code)s + execstring, sandbox);

console.log(sandbox.transformed_signature);
""" % params)

    try:
        # An argument list without a shell: nothing here needs shell parsing.
        proc = subprocess.Popen(
            ["nodejs"],
            close_fds=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
    except OSError as e:
        raise Exception("js failed to execute: %s" % e)

    # communicate() feeds stdin, drains stdout and waits for the process.
    output, _ = proc.communicate(js_decode_script)
    if proc.returncode != 0:
        raise Exception("js failed to execute: %d" % proc.returncode)

    return output.strip()
198
def get_best_video(player_config):
    """Pick the highest-quality supported stream from a player config.

    Args:
        player_config: Decoded player configuration.  Reads
            ["args"]["url_encoded_fmt_stream_map"] and, only when a
            signature has to be decoded, ["assets"]["js"].

    Returns:
        (video_url, extension) for the chosen stream, or (None, None) when
        no stream qualifies.  Streams flagged "stereo3d", streams whose
        quality or MIME type is not listed in QUALITIES / MIMETYPES, and
        entries lacking a required field are skipped.  Among streams of
        equal quality the last one listed wins.
    """
    stream_map = player_config["args"]["url_encoded_fmt_stream_map"]

    best = None
    best_quality = None
    for entry in stream_map.split(","):
        fields = urlparse.parse_qs(entry)
        if "stereo3d" in fields:
            continue
        try:
            mimetype = fields["type"][0].split(";")[0]
            quality = QUALITIES[fields["quality"][0]]
            extension = MIMETYPES[mimetype]
            video_url = fields["url"][0]
        except KeyError:
            # Unsupported quality/MIME type or an incomplete entry (an empty
            # stream map yields one entry with no fields at all).
            continue

        if best_quality is not None and quality < best_quality:
            continue
        best_quality = quality
        best = (fields, video_url, extension)

    if best is None:
        return None, None

    # Resolve the signature only for the winning stream: decoding requires
    # downloading the player script and spawning a JavaScript interpreter.
    fields, video_url, extension = best
    if "sig" in fields:
        signature = fields["sig"][0]
    elif "s" in fields:
        js_url = player_config["assets"]["js"]
        signature = decode_signature(js_url, fields["s"][0])
    else:
        signature = None

    if signature:
        video_url = append_to_qs(video_url, {"signature": signature})

    return video_url, extension
240
def sanitize_filename(filename):
    """Return *filename* made safe for use as a single path component.

    NUL characters become spaces, surrounding whitespace is removed, runs of
    whitespace collapse to one space, and both kinds of slash become "-".
    """
    # Turn NULs into spaces first so that they take part in the stripping
    # and collapsing below instead of leaving stray or doubled spaces.
    cleaned = filename.replace("\0", " ")
    cleaned = re.sub(r"\s+", " ", cleaned.strip())
    return cleaned.replace("\\", "-").replace("/", "-")
248
def get_video_url(doc):
    """Return (video_url, filename) for a parsed video page.

    The filename is the sanitized page title plus the extension of the
    chosen stream; "video" is used when the page has no title.

    Raises:
        VideoUnavailable: The page shows an "unavailable" message, has no
            player configuration, or offers no supported stream.
    """
    unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
    if unavailable:
        raise VideoUnavailable(unavailable[0].strip())

    player_config = get_player_config(doc)
    if not player_config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(player_config)
    if not video_url:
        # Returning (None, None) here only made the callers fail later with
        # an unrelated AttributeError; report the real problem instead.
        raise VideoUnavailable("Could not find a supported video format")

    titles = doc.xpath("/html/head/title/text()")
    title = titles[0] if titles else "video"
    filename = sanitize_filename(title) + "." + extension

    return video_url, filename
267
def write_video(filename, video_data):
    """Stream an open HTTP response to stdout as a CGI file download.

    Args:
        filename: Suggested download name (text); sent UTF-8 percent-encoded
            in the Content-Disposition header.
        video_data: Open response object; always closed before returning.

    The upstream Content-Type is forwarded (falling back to
    application/octet-stream) because a CGI response with a body must carry
    one.  Content-Length is forwarded only when upstream provided it, rather
    than emitting the literal text "None".
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))

        content_type = httpinfo.getheader("Content-Type") or "application/octet-stream"
        sys.stdout.write("Content-Type: %s\r\n" % content_type)
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)

        content_length = httpinfo.getheader("Content-Length")
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)

        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
276
def cgimain():
    """Handle one CGI request.

    Without a "url" query parameter the empty form is shown.  Otherwise the
    page at that URL is fetched and the best video is streamed back as a
    download; on any failure the form is shown again with an error message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        # The URL comes straight from the client.  Only web URLs are
        # fetched: the default opener would otherwise also accept schemes
        # such as file: and read local files.  An empty scheme is let
        # through because scheme-relative "//host/..." URLs are supported.
        scheme = urlparse.urlsplit(url).scheme.lower()
        if scheme not in ("http", "https", ""):
            raise ValueError("unsupported URL scheme: %r" % scheme)

        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        message = e.args[0] if e.args else ""
        if not isinstance(message, str):
            # Text scraped from the page may be a Python 2 unicode object;
            # encode it so writing the page to stdout cannot fail on
            # non-ASCII characters.
            message = message.encode("utf-8")
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(message)
        )
    except Exception:
        # Top-level boundary of the CGI: show a generic error, not a traceback.
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
301
def pp_size(size):
    """Format a byte count (or bytes-per-second rate) for display.

    Returns the value with two decimals followed by a space and a binary
    unit: "" (plain bytes), "KiB", "MiB" or "GiB".  Values of 1024 GiB and
    above stay expressed in GiB.
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    # Float arithmetic keeps the division exact regardless of whether true
    # division is enabled for the module.
    size = float(size)
    index = 0
    # Stop dividing at the largest unit.  Dividing once more (as a loop over
    # all suffixes does) reports 1 TiB as "1.00 GiB".
    while size >= 1024 and index < len(suffixes) - 1:
        size /= 1024
        index += 1
    return "%.2f %s" % (size, suffixes[index])
309
def copy_with_progress(content_length, infile, outfile):
    """Copy *infile* to *outfile* while drawing a progress line on stdout.

    Args:
        content_length: Number of bytes expected; used only for display.
        infile: Object with read(n); copying stops at the first empty read.
        outfile: Object with write(data).

    The status line is redrawn at most about twice per second and shows the
    bytes copied so far, the expected total, and the transfer rate measured
    since the previous redraw.
    """
    def render(done, window_bytes, window_start, moment):
        # Rate over the current measuring window; 0 when no time has passed.
        speed = 0
        if moment != window_start:
            speed = window_bytes / (moment - window_start)
        # ESC[2K erases the terminal line, \r returns to its start.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(done),
            pp_size(content_length),
            pp_size(speed),
        ))
        sys.stdout.flush()

    window_start = 0
    window_bytes = 0
    copied = 0
    while True:
        moment = time.time()
        if moment - window_start > 0.5:
            render(copied, window_bytes, window_start, moment)
            window_start = moment
            window_bytes = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        window_bytes += len(chunk)
        copied += len(chunk)

    # Final status, then a newline so later output starts on a fresh line.
    render(copied, window_bytes, window_start, moment)
    sys.stdout.write("\n")
343
def main():
    """Command-line entry point: download the video at sys.argv[1].

    The video is appended to a file named after the page title in the
    current directory.  An existing partial file is resumed, and a transfer
    that breaks off is restarted from the current file size until the file
    is complete.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        sys.stderr.write("Could not find a downloadable video\n")
        sys.exit(1)
    print("Downloading %s" % filename.encode("utf-8"))

    # Binary append: video data must not go through newline translation.
    outfile = open(filename, "ab")
    try:
        # Position explicitly at the end; the initial position reported for
        # a file opened in append mode is not the same on every platform.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            print("Resuming download from %s" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                # 416: the requested range starts at or past the end of the
                # file, i.e. everything is already on disk.
                if e.code == 416:
                    print("File is complete!")
                    break
                raise

            content_length = int(video_data.info().getheader("Content-Length"))
            if total_size is None:
                # Content-Length of a ranged response covers only the
                # remaining bytes, so add what is already in the file.
                total_size = offset + content_length

            try:
                copy_with_progress(content_length, video_data, outfile)
            except IOError:
                # Deliberate best effort: a broken transfer is retried
                # below.  Just finish the progress line.
                sys.stdout.write("\n")
            finally:
                video_data.close()

            if outfile.tell() == total_size:
                break

            old_offset = offset
            offset = outfile.tell()
            if old_offset == offset:
                # No progress at all; pause briefly before retrying.
                time.sleep(1)
            print("Restarting download from %s" % pp_size(offset))
    finally:
        outfile.close()
391
392
if __name__ == "__main__":
    # The address-space limit below is disabled in the source.
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))

    # SCRIPT_NAME in the environment selects the CGI handler; otherwise the
    # script runs as a command-line downloader.
    running_as_cgi = "SCRIPT_NAME" in os.environ
    if running_as_cgi:
        cgimain()
    else:
        try:
            main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(1)
403