]> code.delx.au - youtube-cgi/blob - youtube.cgi
switch from tabs to spaces
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/python2
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# 128 MiB; only referenced by the commented-out setrlimit call in the
# __main__ section at the bottom of this file.
MAX_MEMORY_BYTES = 128 * 1024 ** 2

# Browser identification sent with every outgoing HTTP request.
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) "
    "Gecko/20100101 Firefox/15.0.1"
)

# Stream MIME type -> file extension.  Streams of any other type are skipped.
MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

# Quality label -> rank (higher is better).  Unknown labels are skipped.
QUALITIES = {
    "small": 1,
    "medium": 2,
    "large": 3,
    "hd720": 4,
    "hd1080": 5,
}
37
38
class VideoUnavailable(Exception):
    """Signals that no downloadable video could be obtained from a page.

    The single constructor argument is the human-readable reason; the CGI
    entry point shows it to the user.
    """
41
def print_form(url="", msg=""):
    """Write the download form to stdout as a complete CGI response.

    url -- text pre-filled into the URL input box.  It is treated as
           untrusted and escaped before being embedded in the page.
    msg -- ready-made XHTML fragment shown above the form (for example an
           error paragraph).  It is inserted verbatim, so the caller is
           responsible for escaping any untrusted text inside it.

    Reads HTTP_HOST and REQUEST_URI from the environment to build the
    bookmarklet link; raises KeyError if either variable is missing.
    """
    # Local import: nothing else in this function's dependencies changes.
    from xml.sax.saxutils import escape

    quote_entities = {'"': "&quot;"}

    # REQUEST_URI normally still carries the current query string; drop it
    # because the bookmarklet appends its own "?url=...".
    request_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], request_path)
    # The address ends up inside a single-quoted JavaScript string within an
    # attribute value: percent-encode what could terminate that string, then
    # escape markup characters below.
    script_url = script_url.replace("\\", "%5C").replace("'", "%27")

    values = {
        "0": msg,
        "1": escape(url, quote_entities),
        "2": escape(script_url, quote_entities),
    }

    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""
    # Substitute in a single pass so that placeholder-looking text inside a
    # substituted value (e.g. "{1}" in msg) is never expanded a second time.
    page = re.sub(
        r"\{([012])\}",
        lambda match: values[match.group(1)],
        template,
    )
    if not isinstance(page, str):
        # Python 2: a unicode msg or url turns the page into unicode; emit
        # UTF-8 bytes rather than relying on stdout's default encoding.
        page = page.encode("utf-8")

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    sys.stdout.write(page)
76
# State shared by every call to urlopen() below.
# Cookies received from any response are stored here and sent on later
# requests made through urlopener.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cookiejar),
)
# URL of the most recent request; urlopen() sends it as the Referer header.
referrer = ""
80
def urlopen(url, offset=None):
    """Open url through the shared cookie-aware opener and return the response.

    url    -- address to fetch; a scheme-relative "//host/..." URL is fetched
              over http.
    offset -- when truthy, request the resource starting at this byte via a
              Range header.

    The previously fetched URL is sent as the Referer header, and the
    module-level referrer is then updated to this URL.

    Raises IOError when the server's Content-Range does not start at the
    requested offset, or when a resume was requested but the server did not
    answer with a Content-Range at all (appending such a response to a
    partial file would corrupt it).  HTTP errors from the opener propagate
    unchanged.
    """
    global referrer

    if url.startswith("//"):
        url = "http:" + url

    req = urllib2.Request(url)
    req.add_header("User-Agent", USER_AGENT)
    if referrer:
        req.add_header("Referer", referrer)
    referrer = url

    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    content_range = res.info().getheader("Content-Range")
    if not content_range:
        if offset:
            res.close()
            raise IOError(
                "Server ignored the request to resume at byte %d" % offset
            )
        return res

    # Expected shape: "bytes <start>-<end>/<total>".
    try:
        unit, byte_range = content_range.split()[:2]
        start = int(byte_range.split("-")[0])
    except ValueError:
        res.close()
        raise IOError("Malformed Content-Range header: %r" % content_range)

    if unit != "bytes" or start != (offset or 0):
        res.close()
        raise IOError(
            "Unexpected Content-Range %r for requested offset %r"
            % (content_range, offset)
        )
    return res
105
def parse_url(url):
    """Fetch url and return it parsed as an lxml HTML document.

    The page is decoded as UTF-8 with the parser's error recovery enabled.
    The HTTP response is closed even if parsing fails.
    """
    f = urlopen(url)
    try:
        return html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    finally:
        f.close()
111
def append_to_qs(url, params):
    """Return url with the entries of the params dict set in its query string.

    Existing parameters keep their order and their values, including blank
    ones; a parameter whose name appears in params is replaced by the new
    value, which is appended at the end.  Values in params may be single
    values or sequences of values.
    """
    parts = list(urlparse.urlsplit(url))
    # parse_qsl (rather than parse_qs) preserves order, and
    # keep_blank_values stops "key=" entries from being silently dropped.
    pairs = [
        (key, value)
        for key, value in urlparse.parse_qsl(parts[3], keep_blank_values=True)
        if key not in params
    ]
    pairs.extend(params.items())
    parts[3] = urllib.urlencode(pairs, True)
    return urlparse.urlunsplit(parts)
119
def get_player_config(doc):
    """Return the decoded "ytplayer.config" object found in doc, or None.

    doc -- parsed page offering xpath(); every <script> element's text is
           searched for the assignment "ytplayer.config = {".

    The JSON object is decoded from its opening brace with a real JSON
    parser, so a "};" sequence inside a string value cannot cut it short.
    An occurrence that does not decode is skipped and the search continues.
    """
    marker = "ytplayer.config = {"
    decoder = json.JSONDecoder()

    for script in doc.xpath("//script"):
        text = script.text
        if not text:
            continue
        found = text.find(marker)
        while found >= 0:
            # Index of the "{" that closes the marker.
            json_start = found + len(marker) - 1
            try:
                config, _ = decoder.raw_decode(text, json_start)
            except ValueError:
                found = text.find(marker, json_start)
                continue
            return config
    return None
132
def extract_function(output, script, func_name, seen=None):
    """Append the source of JavaScript function func_name, and of the
    functions it calls, to the output list.

    output    -- list that receives one source string per function found.
    script    -- JavaScript text to search.
    func_name -- name of the function to extract.
    seen      -- names already handled; used internally by the recursion and
                 normally left as None by callers.

    A function body is taken to end at the first "}" after its header, so
    only functions without nested braces are extracted completely.  Names
    with no "function name(" definition in script (language keywords,
    built-ins) are skipped, and each function is emitted at most once, so
    mutually recursive functions terminate.
    """
    if seen is None:
        seen = set()
    if func_name in seen:
        return
    seen.add(func_name)

    start = script.find("function " + func_name + "(")
    if start < 0:
        return
    end = script.find("}", start)
    if end < 0:
        return

    code = script[start:end + 1]
    output.append(code)

    # Names directly followed by "(" and preceded by "=" or a space, but not
    # method calls ("x.name(").  The header of the function itself matches
    # too, hence the discard.
    deps = set(re.findall(r"[^\.][= ]([\$0-9a-zA-Z]+)\(", code))
    deps.discard(func_name)
    # Sorted so the emitted order is deterministic.
    for dep in sorted(deps):
        extract_function(output, script, dep, seen)
143
def decode_signature(js_url, s):
    """Decode the scrambled signature s using the player script at js_url.

    The player script is downloaded, the function applied to "<obj>.s" is
    located and extracted together with the functions it calls, and that
    code is executed by the external "js" interpreter with s as its
    argument.  The interpreter's output, stripped of surrounding
    whitespace, is returned.

    Raises Exception if the decoding function cannot be located or if the
    interpreter exits with a non-zero status.
    """
    response = urlopen(js_url)
    try:
        script = response.read()
    finally:
        response.close()

    match = re.search(r"\b([a-zA-Z]+)\([a-zA-Z]+\.s\);", script)
    if match is None:
        raise Exception("Could not find the signature function in %s" % js_url)
    func_name = match.group(1)

    codes = []
    extract_function(codes, script, func_name)

    # s comes from the scraped page: json.dumps turns it into a properly
    # escaped string literal so it cannot break out into code.
    program = "".join(code + "\n" for code in codes)
    program += "console.log(%s(%s));\n" % (func_name, json.dumps(s))

    # Argument list without a shell: nothing here needs shell interpretation.
    p = subprocess.Popen(
        ["js"],
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    # communicate() feeds stdin and drains stdout without risking a deadlock.
    output, _ = p.communicate(program)
    if p.returncode != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return output.strip()
168
def get_best_video(player_config):
    """Pick the highest-quality usable stream described by player_config.

    player_config -- decoded player configuration; its
                     args.url_encoded_fmt_stream_map is a comma-separated
                     list of URL-encoded stream descriptions.

    Returns (url, extension), or (None, None) when no stream qualifies.
    Streams are skipped when they are stereo3d, lack a type, quality or url
    field, or use a quality or MIME type not listed in QUALITIES/MIMETYPES.
    Among streams of equal quality the one listed last wins.

    The signature is resolved only for the chosen stream, so the player
    script is fetched and executed at most once.
    """
    url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")

    best_data = None
    best_quality = None
    best_extension = None
    for raw_entry in url_data_list:
        url_data = urlparse.parse_qs(raw_entry)

        # An empty or malformed entry has none of the fields needed below.
        if not ("type" in url_data and "quality" in url_data and "url" in url_data):
            continue
        if "stereo3d" in url_data:
            continue

        mimetype = url_data["type"][0].split(";")[0]
        quality_name = url_data["quality"][0]
        if quality_name not in QUALITIES or mimetype not in MIMETYPES:
            continue

        quality = QUALITIES[quality_name]
        if best_quality is not None and quality < best_quality:
            continue

        best_data = url_data
        best_quality = quality
        best_extension = MIMETYPES[mimetype]

    if best_data is None:
        return None, None

    video_url = best_data["url"][0]
    if "sig" in best_data:
        signature = best_data["sig"][0]
    elif "s" in best_data:
        # Scrambled signature: needs the player script to be decoded.
        signature = decode_signature(
            player_config["assets"]["js"], best_data["s"][0]
        )
    else:
        signature = None

    if signature:
        video_url = append_to_qs(video_url, {"signature": signature})

    return video_url, best_extension
210
def sanitize_filename(filename):
    """Return filename made safe for use as a single path component.

    Leading and trailing whitespace is removed and every internal run of
    whitespace becomes one space; afterwards backslashes and slashes are
    turned into "-" and NUL characters into spaces.
    """
    cleaned = re.sub(r"\s+", " ", filename.strip())
    for unsafe, substitute in (("\\", "-"), ("/", "-"), ("\0", " ")):
        cleaned = cleaned.replace(unsafe, substitute)
    return cleaned
218
def get_video_url(doc):
    """Return (video_url, filename) for the video on the parsed page doc.

    The file name is the sanitized page title plus the extension of the
    chosen stream; "video" is used when the page has no usable title.

    Raises VideoUnavailable when the page carries an unavailable-message,
    when no player configuration is found, or when none of the advertised
    streams is usable.
    """
    unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
    if unavailable:
        raise VideoUnavailable(unavailable[0].strip())

    player_config = get_player_config(doc)
    if not player_config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(player_config)
    if not video_url:
        # Raise instead of handing (None, None) to callers, which would
        # fail later with an unrelated error.
        raise VideoUnavailable("Could not find a downloadable video format")

    titles = doc.xpath("/html/head/title/text()")
    title = titles[0] if titles else ""
    filename = sanitize_filename(title) or "video"
    filename += "." + extension

    return video_url, filename
237
def write_video(filename, video_data):
    """Stream video_data to stdout as a CGI file-download response.

    filename   -- suggested download name; sent UTF-8 percent-encoded in the
                  Content-Disposition header.
    video_data -- open HTTP response to relay; always closed on return.

    The upstream Content-Type is passed through (falling back to
    application/octet-stream), and Content-Length is sent only when the
    upstream response provided one.
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))

        content_type = httpinfo.getheader("Content-Type") or "application/octet-stream"
        sys.stdout.write("Content-Type: %s\r\n" % content_type)
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)

        content_length = httpinfo.getheader("Content-Length")
        if content_length is not None:
            # Without this guard a missing header went out as "None".
            sys.stdout.write("Content-Length: %s\r\n" % content_length)

        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
246
def cgimain():
    """Handle one CGI request.

    Without a "url" parameter the empty form is shown.  Otherwise the video
    behind that URL is looked up and streamed back as a download; on failure
    the form is shown again with an error message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        if not video_url:
            raise VideoUnavailable("Could not find video URL")
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        # args[0] rather than str(e): the reason may be a unicode string,
        # which str() cannot always convert under Python 2.
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception:
        # Keep the details out of the page, but do not lose them: write the
        # traceback to stderr.
        import traceback
        traceback.print_exc()
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
271
def pp_size(size):
    """Format a byte count with a binary-unit suffix and two decimals.

    Values below 1024 get an empty suffix; larger values are scaled to KiB,
    MiB or GiB.  GiB is the largest unit, so anything beyond it is reported
    as a (possibly large) number of GiB.
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    largest = len(suffixes) - 1

    index = 0
    # Stop scaling at the last suffix; dividing once more would make the
    # number and its unit disagree.
    while size >= 1024 and index < largest:
        size = size / 1024.0
        index += 1
    return "%.2f %s" % (size, suffixes[index])
279
def copy_with_progress(content_length, infile, outfile):
    """Copy infile to outfile in 32 KiB chunks, showing progress on stdout.

    content_length -- number of bytes expected; shown as the target figure.

    The status line is redrawn in place whenever more than half a second has
    passed since the previous redraw, and once more after the last chunk,
    followed by a newline.  The rate shown covers the bytes copied since the
    previous redraw.
    """
    def report(elapsed, window_bytes, copied):
        speed = window_bytes / elapsed if elapsed else 0
        # Erase the current terminal line and return to its first column.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(copied),
            pp_size(content_length),
            pp_size(speed),
        ))
        sys.stdout.flush()

    window_start = 0
    window_bytes = 0
    copied = 0
    while True:
        now = time.time()
        if now - window_start > 0.5:
            report(now - window_start, window_bytes, copied)
            window_start = now
            window_bytes = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        chunk_size = len(chunk)
        window_bytes += chunk_size
        copied += chunk_size

    # Final figures, then move off the status line.
    report(now - window_start, window_bytes, copied)
    sys.stdout.write("\n")
313
def main():
    """Command-line entry point: download the video for the URL in argv[1].

    The video is appended to a file named after its title in the current
    directory, so an interrupted download resumes where the file ends.  A
    transfer that stops early is restarted from the current end of the file
    until the expected size is reached or the server answers 416.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        sys.stderr.write("Could not find a downloadable video\n")
        sys.exit(1)
    print("Downloading %s" % filename.encode("utf-8"))

    # Binary append: video data must not go through text-mode translation.
    outfile = open(filename, "ab")
    try:
        # Position explicitly at the end so tell() reports the existing size.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            print("Resuming download from %s" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                if e.code == 416:
                    # Requested range starts at or past the end of the video.
                    print("File is complete!")
                    break
                raise

            try:
                content_length = int(video_data.info().getheader("Content-Length"))
                if total_size is None:
                    # Content-Length only covers the part still to come.
                    total_size = offset + content_length

                try:
                    copy_with_progress(content_length, video_data, outfile)
                except IOError:
                    # Transfer broke off: finish the status line and retry.
                    sys.stdout.write("\n")
            finally:
                video_data.close()

            position = outfile.tell()
            if position == total_size:
                break
            if position == offset:
                # Nothing arrived this round; pause before retrying.
                time.sleep(1)
            offset = position
            print("Restarting download from %s" % pp_size(offset))
    finally:
        outfile.close()
361
362
if __name__ == "__main__":
    # The address-space limit below is currently disabled.
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))

    # SCRIPT_NAME in the environment means we were started as a CGI script;
    # otherwise run as a command-line tool.
    running_as_cgi = "SCRIPT_NAME" in os.environ
    if running_as_cgi:
        cgimain()
    else:
        try:
            main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(1)
373