# Source: code.delx.au - youtube-cgi/blob - youtube.cgi
# Blob: 11d25382834538cca991e9ad4fe70c36d858aaec
# Path: [youtube-cgi] / youtube.cgi
1 #!/usr/bin/python2
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# 128 MiB; only referenced by the commented-out resource.setrlimit call in
# the script entry point.
MAX_MEMORY_BYTES = 128 * 1024 ** 2

# User-Agent header value sent with every outgoing HTTP request.
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) "
    "Gecko/20100101 Firefox/15.0.1"
)

# Stream MIME type -> file extension. Streams with any other MIME type are
# skipped when choosing a download.
MIMETYPES = {
    "video/3gpp": "3gp",
    "video/x-flv": "flv",
    "video/mp4": "mp4",
}

# Quality label -> rank (higher is better). Labels are listed worst to best,
# so the rank is simply the 1-based position.
QUALITIES = dict(
    (label, rank)
    for rank, label in enumerate(
        ("small", "medium", "large", "hd720", "hd1080"), 1
    )
)
37
38
class VideoUnavailable(Exception):
    """Raised when a video page cannot be turned into a download.

    The single argument is a human-readable reason; the CGI front end shows
    it to the user.
    """
41
def print_form(url="", msg=""):
    """Write the download form to stdout as a complete CGI response.

    Emits the ``Content-Type`` header followed by an XHTML page containing
    the URL input form and a bookmarklet pointing back at this script.

    Args:
        url: Text pre-filled into the URL input box. It is treated as
            untrusted and escaped before being placed in the attribute.
        msg: Markup inserted verbatim above the form. Callers pass
            ready-made HTML here, so they are responsible for escaping any
            untrusted text inside it.

    Raises:
        KeyError: if ``HTTP_HOST`` or ``REQUEST_URI`` is missing from the
            environment.
    """
    # Function-scope import: stdlib helper that the file does not import.
    from xml.sax.saxutils import escape

    attr_entities = {'"': "&quot;"}

    # REQUEST_URI includes the query string of the current request; the
    # bookmarklet appends its own "?url=", so only the path may be used.
    script_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], script_path)
    # The URL is embedded in a single-quoted JavaScript string literal, so
    # quote and backslash characters are percent-encoded to keep it closed.
    script_url = script_url.replace("\\", "%5C").replace("'", "%27")

    # The page is served as application/xhtml+xml: one unescaped "&" or '"'
    # in an attribute makes the whole document ill-formed.
    fields = {
        "{0}": msg,
        "{1}": escape(url, attr_entities),
        "{2}": escape(script_url, attr_entities),
    }

    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""
    # Single-pass substitution: a placeholder-looking token inside one of
    # the substituted values is never expanded a second time.
    page = re.sub(r"\{[012]\}", lambda m: fields[m.group(0)], template)

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    sys.stdout.write(page)
76
# Shared HTTP state used by urlopen().
# `referrer` starts empty; urlopen() stores the first URL it opens here and
# sends it as the Referer header on every later request.
referrer = ""
# Cookies received from any response are kept and replayed on later requests.
cookiejar = cookielib.CookieJar()
_cookie_handler = urllib2.HTTPCookieProcessor(cookiejar)
urlopener = urllib2.build_opener(_cookie_handler)
80
def urlopen(url, offset=None):
    """Open *url* with the shared cookie-aware opener and return the response.

    Protocol-relative URLs (``//host/path``) are fetched over ``http``. The
    first URL ever opened is remembered in the module-level ``referrer`` and
    sent as the ``Referer`` header on all later requests.

    Args:
        url: An ``http://``, ``https://`` or protocol-relative URL.
        offset: Optional byte offset. When truthy, a ``Range: bytes=N-``
            header is sent and the response is checked to start at N.

    Returns:
        The response object returned by the opener.

    Raises:
        ValueError: if the URL is not an http(s) URL.
        IOError: if the server's ``Content-Range`` is malformed, does not
            start at the requested offset, or is missing for a ranged
            request.
        urllib2.URLError / urllib2.HTTPError: propagated from the opener.
    """
    global referrer

    if url.startswith("//"):
        url = "http:" + url

    # The URL can come straight from the CGI query string, and urllib2's
    # default opener also handles file:// and ftp://, so anything that is
    # not plain web traffic is refused.
    if not url.lower().startswith(("http://", "https://")):
        raise ValueError("Unsupported URL scheme: %r" % (url,))

    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    else:
        referrer = url
    req.add_header("User-Agent", USER_AGENT)
    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    # Explicit checks rather than assert, which is stripped under -O.
    problem = None
    content_range = res.info().getheader("Content-Range")
    if content_range:
        # Expected form: "bytes <start>-<end>/<total>"
        tokens = content_range.split()
        try:
            if tokens[0] != "bytes":
                raise ValueError(content_range)
            start = int(tokens[1].split("-")[0])
        except (IndexError, ValueError):
            problem = "Malformed Content-Range header: %r" % (content_range,)
        else:
            if start != (offset or 0):
                problem = "Server returned data from byte %d, expected %d" % (
                    start, offset or 0)
    elif offset:
        # No Content-Range means the body starts at byte 0; appending it to
        # a partial file would corrupt the download.
        problem = "Server ignored the range request for offset %d" % offset

    if problem:
        res.close()
        raise IOError(problem)
    return res
106
def parse_url(url):
    """Fetch *url* and return it parsed as an lxml HTML document.

    The page is decoded as UTF-8 with the parser in recovery mode. The HTTP
    response is closed even when parsing raises.

    Raises:
        Whatever urlopen() or lxml's parser raises.
    """
    f = urlopen(url)
    try:
        parser = html.HTMLParser(encoding="utf-8", recover=True)
        return html.parse(f, parser)
    finally:
        f.close()
112
def append_to_qs(url, params):
    """Return *url* with *params* merged into its query string.

    Args:
        url: The URL to extend.
        params: Mapping of parameter name to a value or list of values. A
            key that already exists in the URL is replaced.

    Returns:
        The rebuilt URL. Existing parameters are kept, including ones with
        an empty value (``b=``), which parse_qs would drop by default.
    """
    parts = urlparse.urlsplit(url)
    query = urlparse.parse_qs(parts.query, keep_blank_values=True)
    query.update(params)
    # doseq=True expands the value lists produced by parse_qs into repeated
    # key=value pairs.
    new_query = urllib.urlencode(query, True)
    return urlparse.urlunsplit(parts._replace(query=new_query))
120
def get_player_config(doc):
    """Extract the ``ytplayer.config`` object from a parsed watch page.

    Every ``<script>`` element is scanned line by line for the assignment
    ``ytplayer.config = {``. The JSON object starting at that brace is
    decoded with a real JSON parser, so nested objects and ``};`` sequences
    inside string values do not end it early.

    Args:
        doc: Parsed document supporting ``xpath("//script")``.

    Returns:
        The decoded config (a dict), or None when no script line contains a
        decodable config assignment.
    """
    marker = "ytplayer.config = {"
    decoder = json.JSONDecoder()

    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            pos = line.find(marker)
            if pos < 0:
                continue
            # Start on the opening brace, the last character of the marker.
            json_start = pos + len(marker) - 1
            try:
                # raw_decode stops at the end of the JSON value and ignores
                # whatever JavaScript follows it on the line.
                config, _ = decoder.raw_decode(line[json_start:])
            except ValueError:
                # Not valid JSON here; keep looking in later lines/scripts.
                continue
            return config
    return None
133
def extract_function(output, script, func_name):
    """Collect the source of JavaScript function *func_name* and its callees.

    The text of ``function <func_name>(...) {...}`` is appended to *output*,
    then every name the body calls as ``=name(`` or `` name(`` is extracted
    recursively in the same way.

    Args:
        output: List that receives the function sources; mutated in place.
        script: Full JavaScript source to search.
        func_name: Name of the function to extract.

    Names with no ``function name(`` definition in *script* (built-ins,
    keywords picked up by the call regex) are skipped silently, and a
    function whose source is already in *output* is not added again, which
    also terminates mutually recursive call chains.
    """
    start = script.find("function " + func_name + "(")
    if start < 0:
        return
    body_start = script.find("{", start)
    if body_start < 0:
        return

    # Walk the braces to find the one closing the body, so functions with
    # nested blocks are captured whole. Braces inside string literals are
    # not special-cased.
    depth = 0
    end = -1
    for brace in re.compile(r"[{}]").finditer(script, body_start):
        if brace.group() == "{":
            depth += 1
        else:
            depth -= 1
            if depth == 0:
                end = brace.start()
                break
    if end < 0:
        return

    code = script[start:end + 1]
    if code in output:
        return
    output.append(code)

    deps = set(re.findall(r"[^\.][= ]([\$0-9a-zA-Z]+)\(", code))
    # The regex also matches the function's own declaration.
    deps.discard(func_name)
    for dep in sorted(deps):
        extract_function(output, script, dep)
144
def find_func_name(script):
    """Return the name of the function the player script calls on ``X.s``.

    Looks for the first statement of the form ``name(obj.s);`` and returns
    ``name``.

    Raises:
        ValueError: if the script contains no such call.
    """
    func_name = r"([a-zA-Z0-9$]+)"
    func_params = r"(\([a-zA-Z]+\.s\))"
    pattern = func_name + func_params + ";"

    match = re.search(pattern, script)
    if match is None:
        raise ValueError("Could not find the signature function in the player script")
    return match.group(1)
153
def decode_signature(js_url, s):
    """Decode scrambled signature *s* using the player's own JavaScript.

    Downloads the player script from *js_url*, extracts the signature
    function and the functions it calls, and evaluates it on *s* with the
    external ``js`` interpreter.

    Args:
        js_url: URL of the player JavaScript.
        s: The scrambled signature taken from the stream map.

    Returns:
        The interpreter's stdout with surrounding whitespace stripped.

    Raises:
        ValueError: if the signature function cannot be located.
        OSError: if the ``js`` executable cannot be started.
        Exception: if ``js`` exits with a non-zero status.
    """
    response = urlopen(js_url)
    try:
        script = response.read()
    finally:
        response.close()
    func_name = find_func_name(script)

    codes = []
    extract_function(codes, script, func_name)

    # `s` comes from the remote page. json.dumps produces a quoted, fully
    # escaped string literal, so it cannot break out of the JavaScript string.
    call = "console.log(%s(%s));\n" % (func_name, json.dumps(s))
    program = "".join(code + "\n" for code in codes) + call

    # Argument list with no shell: nothing here needs shell interpretation.
    p = subprocess.Popen(
        ["js"],
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    # communicate() feeds stdin and drains stdout together, so neither pipe
    # can fill up and stall the other.
    stdout_data, _ = p.communicate(program)
    if p.returncode != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return stdout_data.strip()
178
def get_best_video(player_config):
    """Pick the best downloadable stream described by *player_config*.

    Each comma-separated entry of ``args.url_encoded_fmt_stream_map`` is a
    query string. An entry is skipped when it is flagged ``stereo3d``, lacks
    a ``type``, ``quality`` or ``url`` field, or has a quality or MIME type
    not listed in QUALITIES / MIMETYPES. The highest-ranked remaining entry
    wins; on a tie the later entry wins.

    The signature is resolved for the winning entry only: ``sig`` is used as
    is, ``s`` is decoded via decode_signature() with ``assets.js``, and the
    result is appended to the URL as ``signature``.

    Returns:
        ``(url, extension)``, or ``(None, None)`` when no entry qualifies.

    Raises:
        KeyError: if the stream map is absent, or ``assets.js`` is absent
            when the winning entry needs its signature decoded.
    """
    stream_map = player_config["args"]["url_encoded_fmt_stream_map"]
    required = ("type", "quality", "url")

    best_entry = None
    best_rank = None
    best_extension = None
    for raw_entry in stream_map.split(","):
        entry = urlparse.parse_qs(raw_entry)

        if "stereo3d" in entry:
            continue
        # An empty stream map yields one empty entry; skip anything that is
        # missing a field instead of failing with KeyError.
        if not all(field in entry for field in required):
            continue

        mimetype = entry["type"][0].split(";")[0]
        rank = QUALITIES.get(entry["quality"][0])
        if rank is None or mimetype not in MIMETYPES:
            continue
        if best_rank is not None and rank < best_rank:
            continue

        best_entry = entry
        best_rank = rank
        best_extension = MIMETYPES[mimetype]

    if best_entry is None:
        return None, None

    # Decoding a signature downloads the player script and spawns an
    # interpreter, so it is done once, for the winner.
    video_url = best_entry["url"][0]
    if "sig" in best_entry:
        signature = best_entry["sig"][0]
    elif "s" in best_entry:
        js_url = player_config["assets"]["js"]
        signature = decode_signature(js_url, best_entry["s"][0])
    else:
        signature = None

    if signature:
        video_url = append_to_qs(video_url, {"signature": signature})

    return video_url, best_extension
220
def sanitize_filename(filename):
    """Return *filename* made safe for use as a single path component.

    Leading and trailing whitespace is removed and inner whitespace runs
    collapse to one space; then backslashes and slashes become ``-`` and NUL
    characters become spaces.
    """
    cleaned = re.sub(r"\s+", " ", filename.strip())
    for unsafe, substitute in (("\\", "-"), ("/", "-"), ("\0", " ")):
        cleaned = cleaned.replace(unsafe, substitute)
    return cleaned
228
def get_video_url(doc):
    """Resolve a parsed watch page to ``(video_url, filename)``.

    The filename is the sanitized page title plus the extension of the
    chosen stream.

    Returns:
        ``(video_url, filename)``, or ``(None, None)`` when the player
        config holds no usable stream.

    Raises:
        VideoUnavailable: if the page carries an ``unavailable-message``
            div with text, or no player config is found.
    """
    messages = doc.xpath("//div[@id='unavailable-message']/text()")
    if messages:
        raise VideoUnavailable(messages[0].strip())

    config = get_player_config(doc)
    if not config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(config)
    if video_url:
        page_title = doc.xpath("/html/head/title/text()")[0]
        return video_url, sanitize_filename(page_title) + "." + extension
    return None, None
247
def write_video(filename, video_data):
    """Stream *video_data* to stdout as a CGI file-download response.

    Writes a ``Content-Disposition`` header with the UTF-8, percent-encoded
    filename, forwards the upstream ``Content-Length`` when it is a plain
    number, then copies the body. *video_data* is closed on the way out,
    including when writing fails.

    Args:
        filename: Download filename (text; encoded as UTF-8).
        video_data: Open response exposing ``info().getheader()``,
            ``read()`` and ``close()``.
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))
        headers = [
            "Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename,
        ]
        # Upstream may omit the length; never emit "Content-Length: None".
        content_length = httpinfo.getheader("Content-Length")
        if content_length is not None and content_length.strip().isdigit():
            headers.append("Content-Length: %s\r\n" % content_length.strip())

        sys.stdout.write("".join(headers))
        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
256
def cgimain():
    """CGI entry point: show the form, or stream the requested video.

    Without a ``url`` query parameter the empty form is shown. Otherwise the
    page is fetched and the best stream is opened; any failure up to that
    point re-displays the form with an error message. Once streaming begins
    the response headers have been sent, so later errors are left to
    propagate instead of appending an HTML page to the video data.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        if not video_url:
            # get_video_url reports "no usable stream" as (None, None).
            raise VideoUnavailable("No downloadable format was found")
        video_data = urlopen(video_url)
    except VideoUnavailable as e:
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
        return
    except Exception:
        # Top-level boundary: any other failure is shown as a generic error.
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
        return

    write_video(filename, video_data)
281
def pp_size(size):
    """Format a byte count for display, e.g. ``1536`` -> ``"1.50 KiB"``.

    The value is divided by 1024 until it is below 1024 or the largest unit
    (GiB) is reached, so anything of 1 TiB or more is still reported in GiB.
    Plain byte counts have an empty unit, which leaves a trailing space
    (``"512.00 "``).

    Args:
        size: A number of bytes (int or float).

    Returns:
        The value with two decimals, a space, and the unit suffix.
    """
    suffixes = ("", "KiB", "MiB", "GiB")
    value = float(size)
    index = 0
    # Stop dividing at the last suffix so the number always matches the unit.
    while value >= 1024 and index < len(suffixes) - 1:
        value /= 1024
        index += 1
    return "%.2f %s" % (value, suffixes[index])
289
def copy_with_progress(content_length, infile, outfile):
    """Copy *infile* to *outfile* in 32 KiB chunks with a live status line.

    The status line on stdout is redrawn in place whenever more than half a
    second has passed since the previous redraw. It shows bytes copied,
    *content_length*, and the transfer rate since the previous redraw. A
    final status and a newline are written when the input is exhausted.

    Args:
        content_length: Expected number of bytes; used for display only.
        infile: Object with ``read(size)``.
        outfile: Object with ``write(data)``.
    """
    def report(elapsed, recent_bytes, total_bytes):
        # A zero-length window has no meaningful rate; show 0.
        speed = recent_bytes / elapsed if elapsed != 0 else 0
        # ESC[2K clears the current terminal line; \r returns to column 0.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(total_bytes),
            pp_size(content_length),
            pp_size(speed),
        ))
        sys.stdout.flush()

    window_start = 0
    window_bytes = 0
    copied = 0
    while True:
        now = time.time()
        if now - window_start > 0.5:
            report(now - window_start, window_bytes, copied)
            window_start = now
            window_bytes = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        window_bytes += len(chunk)
        copied += len(chunk)

    # Final status, then move off the status line.
    report(now - window_start, window_bytes, copied)
    sys.stdout.write("\n")
323
def main():
    """Command-line entry point: download the video named by ``sys.argv[1]``.

    The video is appended to a file named after the page title in the
    current directory, so an existing partial file is resumed from its
    current size. An interrupted transfer is restarted from the bytes
    already written, until the file reaches its expected total size or the
    server answers 416 (nothing left to send).

    Exits with status 1 when no URL is given or no usable stream is found.
    """
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)
    url = sys.argv[1]

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        # get_video_url reports "no usable stream" as (None, None).
        sys.stderr.write("No downloadable format was found\n")
        sys.exit(1)
    print("Downloading %s" % filename.encode("utf-8"))

    # Binary append: the payload is video data, and existing bytes are kept.
    outfile = open(filename, "ab")
    try:
        # The position reported right after opening in append mode differs
        # between platforms; seek to the end so tell() is the existing size.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            print("Resuming download from %s" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                if e.code == 416:
                    # Requested range starts at or past the end of the file.
                    print("File is complete!")
                    break
                raise

            try:
                content_length = int(video_data.info().getheader("Content-Length"))
                if total_size is None:
                    # A ranged response only reports the remaining bytes.
                    total_size = offset + content_length
                try:
                    copy_with_progress(content_length, video_data, outfile)
                except IOError:
                    # Transfer broke mid-stream: end the status line and let
                    # the loop retry from what has been written.
                    sys.stdout.write("\n")
            finally:
                video_data.close()

            if outfile.tell() == total_size:
                break

            old_offset = offset
            offset = outfile.tell()
            if old_offset == offset:
                # No progress on this attempt; pause before retrying.
                time.sleep(1)
            print("Restarting download from %s" % pp_size(offset))
    finally:
        outfile.close()
371
372
if __name__ == "__main__":
    # Memory cap is currently disabled:
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))

    # SCRIPT_NAME in the environment means the script is running as CGI;
    # otherwise it runs as a command-line tool.
    if "SCRIPT_NAME" not in os.environ:
        try:
            main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(1)
    else:
        cgimain()
383