]> code.delx.au - youtube-cgi/blob - youtube.cgi
ignore stereo3d videos
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/python2
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Address-space cap of 128 MiB, referenced by the (currently commented-out)
# resource.setrlimit call in the script's entry point.
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# User-Agent header value sent with every outgoing request.
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"

# Accepted stream MIME types, mapped to the extension given to the saved file.
MIMETYPES = {
    "video/3gpp": "3gp",
    "video/mp4": "mp4",
    "video/x-flv": "flv",
}

# Accepted quality labels mapped to a rank; a larger rank is preferred.
# Labels are listed from lowest to highest, ranks start at 1.
QUALITIES = dict(
    (label, rank)
    for rank, label in enumerate(
        ("small", "medium", "large", "hd720", "hd1080"), 1
    )
)
37
38
class VideoUnavailable(Exception):
    """Signals that no downloadable video could be obtained from a page.

    The first constructor argument is a human-readable reason.
    """
41
def print_form(url="", msg=""):
    """Write the CGI response containing the download form to stdout.

    Parameters:
        url: text pre-filled into the URL input box. It is escaped before
            being placed in the attribute, so it may contain any characters.
        msg: markup inserted verbatim above the form (callers pass an
            already-escaped HTML fragment such as an error paragraph).

    Reads HTTP_HOST and REQUEST_URI from the environment to build the
    address used by the bookmarklet; raises KeyError if either is missing.
    """
    from xml.sax.saxutils import escape

    def attr_escape(text):
        # Escape for use inside a double-quoted XML attribute value.
        return escape(text, {'"': "&quot;"})

    # REQUEST_URI includes the query string of the current request; drop it
    # so the bookmarklet does not end up with two "?url=" parts.
    request_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], request_path)
    # The address sits inside a single-quoted JavaScript string, so an
    # apostrophe must not appear literally; %27 is its URL encoding.
    script_url = script_url.replace("'", "%27")

    values = {
        "0": msg,
        "1": attr_escape(url),
        "2": attr_escape(script_url),
    }

    # str.format cannot be used: the CSS and JavaScript contain braces.
    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    # Substitute all placeholders in one pass so that placeholder-looking
    # text inside msg or url is never expanded a second time.
    sys.stdout.write(
        re.sub(r"\{([012])\}", lambda m: values[m.group(1)], template)
    )
76
77 cookiejar = cookielib.CookieJar()
78 urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
79 referrer = ""
80
def urlopen(url, offset=None):
    """Open *url* with the shared opener and return the response object.

    Parameters:
        url: address to fetch; a scheme-relative "//host/path" address is
            fetched over http.
        offset: when truthy, request the body starting at this byte offset
            by sending a Range header.

    The previous request's URL is sent as Referer and the module-level
    ``referrer`` is updated to *url*.

    Raises ValueError if the response carries a Content-Range header that
    is malformed or does not start at the requested offset. Errors raised
    by the opener (e.g. urllib2.HTTPError) propagate unchanged.
    """
    global referrer

    if url.startswith("//"):
        url = "http:" + url

    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    referrer = url

    req.add_header("User-Agent", USER_AGENT)

    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    content_range = res.info().getheader("Content-Range")
    if content_range:
        # Explicit checks instead of assert: assertions vanish under -O,
        # and a wrong start offset would corrupt a resumed download.
        try:
            tokens = content_range.split()
            unit = tokens[0]
            start = int(tokens[1].split("-")[0])
        except (IndexError, ValueError):
            res.close()
            raise ValueError("Malformed Content-Range: %r" % content_range)
        if unit != "bytes" or start != (offset or 0):
            res.close()
            raise ValueError(
                "Unexpected Content-Range %r for offset %r"
                % (content_range, offset)
            )
    return res
105
def parse_url(url):
    """Fetch *url* and return it parsed as an lxml HTML document.

    The page is decoded as UTF-8 with the parser's recovery mode enabled.
    The HTTP response is closed even if parsing fails.
    """
    f = urlopen(url)
    try:
        return html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    finally:
        f.close()
111
112 def append_to_qs(url, params):
113 r = list(urlparse.urlsplit(url))
114 qs = urlparse.parse_qs(r[3])
115 qs.update(params)
116 r[3] = urllib.urlencode(qs, True)
117 url = urlparse.urlunsplit(r)
118 return url
119
120 def convert_from_old_itag(player_config):
121 url_data = urlparse.parse_qs(player_config["args"]["url_encoded_fmt_stream_map"])
122 url_data["url"] = []
123 for itag_url in url_data["itag"]:
124 pos = itag_url.find("url=")
125 url_data["url"].append(itag_url[pos+4:])
126 player_config["args"]["url_encoded_fmt_stream_map"] = urllib.urlencode(url_data, True)
127
def get_player_config(doc):
    """Find and decode the player configuration embedded in a page.

    Parameters:
        doc: parsed page; only ``doc.xpath("//script")`` and the ``text``
            of each returned element are used.

    Every line of every inline script is checked for one of three forms:

    * ``yt.playerConfig = {...};``
    * ``ytplayer.config = {...};``
    * ``'PLAYER_CONFIG': {...}`` (old layout, converted with
      convert_from_old_itag)

    The first line matching one of the first two forms is decoded and
    returned immediately. A line in the old layout is remembered and
    scanning continues; the last one remembered is returned if no line in
    the newer forms follows. Returns None when nothing matches. Invalid
    JSON raises ValueError.
    """
    assignment_markers = ("yt.playerConfig =", "ytplayer.config =")
    legacy_marker = "'PLAYER_CONFIG': "

    player_config = None
    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            for marker in assignment_markers:
                start = line.find(marker)
                end = line.rfind(";")
                if start >= 0 and end > 0:
                    # Slice from the end of the marker itself rather than a
                    # fixed offset or the first "=" on the line; the JSON
                    # decoder skips any whitespace before the value.
                    return json.loads(line[start + len(marker):end])
            start = line.find(legacy_marker)
            if start >= 0:
                # raw_decode stops at the end of the JSON value, so text
                # after it (such as a trailing comma) is tolerated.
                text = line[start + len(legacy_marker):].lstrip()
                player_config = json.JSONDecoder().raw_decode(text)[0]
                convert_from_old_itag(player_config)
    return player_config
150
def extract_function(output, script, func_name):
    """Collect the source of a JavaScript function and of what it calls.

    Parameters:
        output: list that receives the source text of each function found;
            the requested function first, then its dependencies.
        script: the JavaScript source to search.
        func_name: name to look for as ``function NAME(``.

    Names that look like calls but have no ``function NAME(`` definition
    in *script* (for example keywords such as ``if``) are skipped. A
    function is emitted only once, which also stops mutually recursive
    functions from looping forever. Returns None.

    NOTE: braces are matched by counting, without understanding string or
    regex literals, so a brace inside such a literal can confuse it.
    """
    start = script.find("function " + func_name + "(")
    if start < 0:
        return

    # Walk to the brace that closes the function body, so bodies containing
    # nested blocks are captured whole instead of being cut at the first "}".
    body_open = script.find("{", start)
    if body_open < 0:
        return
    depth = 0
    end = -1
    for pos in range(body_open, len(script)):
        ch = script[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                end = pos
                break
    if end < 0:
        return

    code = script[start:end + 1]
    if code in output:
        return
    output.append(code)

    # Anything that looks like "name(" preceded by "=" or a space and not
    # accessed through a dot is treated as a call to another function.
    deps = set(re.findall(r"[^\.][= ]([\$0-9a-zA-Z]+)\(", code))
    deps.discard(func_name)
    # Sorted so the emitted order does not depend on set iteration order.
    for dep in sorted(deps):
        extract_function(output, script, dep)
161
def decode_signature(js_url, s):
    """Run the player's own JavaScript to turn *s* into a signature.

    Parameters:
        js_url: address of the player script to download.
        s: the encoded signature string to pass to the decoding function.

    The decoding function is located by searching the script for a call of
    the form ``name(obj.s);``; it and the functions it calls are fed to an
    external ``js`` interpreter, and whatever that prints is returned with
    surrounding whitespace removed.

    Raises Exception if the decoding function cannot be located or the
    interpreter exits with a non-zero status.
    """
    response = urlopen(js_url)
    try:
        script = response.read()
    finally:
        response.close()

    match = re.search(r"\b([a-zA-Z]+)\([a-zA-Z]+\.s\);", script)
    if match is None:
        raise Exception("could not find the signature function in %s" % js_url)
    func_name = match.group(1)

    codes = []
    extract_function(codes, script, func_name)

    program = "".join(code + "\n" for code in codes)
    # json.dumps yields a correctly escaped JavaScript string literal, so
    # quotes or backslashes in the remote-supplied value cannot break out
    # of the string and run as code.
    program += "console.log(%s(%s));\n" % (func_name, json.dumps(s))

    # Argument list instead of shell=True: no shell is needed to start js.
    p = subprocess.Popen(
        ["js"],
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE
    )
    stdout_data, _ = p.communicate(program)
    if p.returncode != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return stdout_data.strip()
186
def get_best_video(player_config):
    """Pick the highest-quality supported stream from a player config.

    Parameters:
        player_config: decoded player configuration; its
            ``args.url_encoded_fmt_stream_map`` is a comma-separated list
            of URL-encoded stream descriptions. ``assets.js`` is read only
            when the chosen stream needs its signature decoded.

    A stream is considered only if it is not marked "stereo3d", its quality
    is listed in QUALITIES and its MIME type is listed in MIMETYPES. Entries
    lacking a type, quality or url are skipped. Among streams of equal
    quality the one appearing last wins.

    Returns (url, extension), with the signature appended to the URL when
    the stream provides one, or (None, None) if no stream qualifies.
    """
    stream_map = player_config["args"]["url_encoded_fmt_stream_map"]

    best = None
    best_quality = None
    for encoded in stream_map.split(","):
        url_data = urlparse.parse_qs(encoded)

        if "stereo3d" in url_data:
            continue
        try:
            mimetype = url_data["type"][0].split(";")[0]
            quality = QUALITIES[url_data["quality"][0]]
            extension = MIMETYPES[mimetype]
            video_url = url_data["url"][0]
        except KeyError:
            # Missing field, unknown quality or unsupported MIME type.
            continue

        if best_quality is not None and quality < best_quality:
            continue

        best = (video_url, extension, url_data)
        best_quality = quality

    if best is None:
        return None, None

    # Resolve the signature only for the winner: decoding an encrypted one
    # downloads the player script and runs an external interpreter.
    video_url, extension, url_data = best
    if "sig" in url_data:
        signature = url_data["sig"][0]
    elif "s" in url_data:
        js_url = player_config["assets"]["js"]
        signature = decode_signature(js_url, url_data["s"][0])
    else:
        signature = None

    if signature:
        video_url = append_to_qs(video_url, {"signature": signature})

    return video_url, extension
228
def sanitize_filename(filename):
    """Return *filename* made safe to use as a single path component.

    Leading and trailing whitespace is removed, each run of whitespace
    becomes one space, slashes and backslashes become "-", and NUL
    characters become spaces.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    collapsed = re.sub(r"\s+", " ", filename.strip())
    return (
        collapsed
        .replace("\\", "-")
        .replace("/", "-")
        .replace("\0", " ")
    )
236
def get_video_url(doc):
    """Work out the download address and file name for a parsed video page.

    Returns (video_url, filename), where the file name is the sanitized
    page title plus the stream's extension, or (None, None) when the
    player configuration offers no usable stream.

    Raises VideoUnavailable when the page shows an "unavailable" message
    (that message becomes the error text) or no player configuration can
    be found.
    """
    notices = doc.xpath("//div[@id='unavailable-message']/text()")
    if notices:
        raise VideoUnavailable(notices[0].strip())

    config = get_player_config(doc)
    if not config:
        raise VideoUnavailable("Could not find video URL")

    stream_url, ext = get_best_video(config)
    if not stream_url:
        return None, None

    page_title = doc.xpath("/html/head/title/text()")[0]
    return stream_url, sanitize_filename(page_title) + "." + ext
255
def write_video(filename, video_data):
    """Stream an opened video response to stdout as a CGI download.

    Parameters:
        filename: name offered to the browser (sent UTF-8 percent-encoded
            in the Content-Disposition header).
        video_data: open response object providing info(), read() and
            close(); it is always closed before returning.

    The upstream Content-Type is passed through (falling back to
    application/octet-stream), and Content-Length is sent only when the
    upstream response supplied one.
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))

        # A CGI response with a body has to declare its content type.
        content_type = httpinfo.getheader("Content-Type") or "application/octet-stream"
        sys.stdout.write("Content-Type: %s\r\n" % content_type)
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)

        # Avoid emitting the literal text "None" as a length.
        content_length = httpinfo.getheader("Content-Length")
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)

        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
264
def cgimain():
    """Handle one CGI request.

    Without a "url" query parameter the empty form is shown. Otherwise the
    page at that address is fetched and its best video is streamed back as
    a download; on failure the form is shown again with an error message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        # The address comes straight from the request: refuse anything the
        # opener might treat as a local resource (e.g. file://).
        if not url.startswith(("http://", "https://", "//")):
            raise ValueError("unsupported URL scheme")
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        # args[0] is the reason passed to the exception; it replaces the
        # deprecated e.message attribute.
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception:
        # Top-level boundary of the CGI: never show a traceback to the user.
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
289
def pp_size(size):
    """Format a byte count using binary units, with two decimals.

    Values below 1024 are printed with an empty unit; larger values are
    scaled to KiB, MiB or GiB. GiB is the largest unit, so anything of
    1024 GiB or more is still expressed in GiB (e.g. "1024.00 GiB").
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    for suffix in suffixes[:-1]:
        if size < 1024:
            return "%.2f %s" % (size, suffix)
        # Float divisor so the result does not depend on
        # "from __future__ import division" being in effect.
        size = size / 1024.0
    # Reached only after dividing once per smaller unit, so the value is
    # now in GiB and must not be divided again.
    return "%.2f %s" % (size, suffixes[-1])
297
def copy_with_progress(content_length, infile, outfile):
    """Copy infile to outfile in 32 KiB chunks, showing progress on stdout.

    Parameters:
        content_length: expected number of bytes, used only for display.
        infile: object with read(n), read until it returns nothing.
        outfile: object with write(data).

    The status line (bytes copied / expected, and the rate over the current
    reporting window) is redrawn in place whenever more than half a second
    has passed since the previous redraw, once more at the end, and is then
    finished with a newline.
    """
    window_start = 0
    window_copied = 0
    total_copied = 0

    def report(at):
        # Rate is the amount copied during the current window divided by
        # the window's age; zero if no time has passed.
        speed = 0
        if at != window_start:
            speed = window_copied / (at - window_start)
        # ESC[2K erases the line, \r returns to its start.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(total_copied),
            pp_size(content_length),
            pp_size(speed),
        ))
        sys.stdout.flush()

    while True:
        now = time.time()
        if now - window_start > 0.5:
            report(now)
            window_start = now
            window_copied = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        size = len(chunk)
        window_copied += size
        total_copied += size

    # Final status, then move off the status line.
    report(now)
    sys.stdout.write("\n")
331
def main():
    """Command-line entry point: download the video named in sys.argv[1].

    The video is saved in the current directory under a name derived from
    the page title. If that file already exists the download resumes from
    its current size, and an interrupted transfer is retried from where it
    stopped until the file is complete. Exits with status 1 when no URL is
    given or the page offers no downloadable stream.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        # get_video_url returns (None, None) when no stream qualifies.
        sys.stderr.write("No downloadable video format found\n")
        sys.exit(1)
    sys.stdout.write("Downloading %s\n" % filename.encode("utf-8"))

    # Binary append: the data is not text, and appending allows resuming.
    outfile = open(filename, "ab")
    try:
        # The position reported right after opening in append mode is
        # platform dependent; seek to the end to get the real size.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            sys.stdout.write("Resuming download from %s\n" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                # 416 (range not satisfiable): nothing left past offset.
                if e.code == 416:
                    sys.stdout.write("File is complete!\n")
                    break
                raise

            try:
                content_length = int(video_data.info().getheader("Content-Length"))
                if total_size is None:
                    # Content-Length of a ranged response counts only the
                    # bytes from offset onwards.
                    total_size = offset + content_length

                try:
                    copy_with_progress(content_length, video_data, outfile)
                except IOError:
                    # Transfer interrupted: end the status line and let the
                    # size check below decide whether to retry.
                    sys.stdout.write("\n")
            finally:
                video_data.close()

            position = outfile.tell()
            if position == total_size:
                break
            previous_offset = offset
            offset = position
            if previous_offset == offset:
                # No progress at all; pause briefly before retrying.
                time.sleep(1)
            sys.stdout.write("Restarting download from %s\n" % pp_size(offset))
    finally:
        outfile.close()
379
380
if __name__ == "__main__":
    # NOTE: the memory limit below is intentionally left disabled, exactly
    # as in the original; MAX_MEMORY_BYTES exists only for this call.
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))

    # A web server sets SCRIPT_NAME when running the script as CGI;
    # otherwise behave as a command-line downloader.
    if "SCRIPT_NAME" in os.environ:
        cgimain()
    else:
        try:
            main()
        except KeyboardInterrupt:
            sys.stdout.write("\nExiting...\n")
            sys.exit(1)
391