]> code.delx.au - youtube-cgi/blob - youtube.cgi
263259cd5ab1c71aea18c449d977a330660755fb
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/env python
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Address-space limit of 128 MiB, referenced by the (disabled) setrlimit call
# in the script entry point.
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# User-Agent header value sent with outgoing HTTP requests.
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) "
    "Gecko/20100101 Firefox/15.0.1"
)

# Stream MIME types that are accepted, mapped to the file extension used
# for the saved video.
MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

# Quality labels that are accepted, mapped to a rank (higher is better).
QUALITIES = {
    "hd1080": 5,
    "hd720": 4,
    "large": 3,
    "medium": 2,
    "small": 1,
}
37
38
class VideoUnavailable(Exception):
    """Raised when a page does not yield a downloadable video.

    The single constructor argument is a human-readable reason.
    """
41
def print_form(url="", msg=""):
    """Write the download form as a complete CGI response on stdout.

    url -- text pre-filled into the URL input box. It is XML-escaped here,
           so callers pass the raw value.
    msg -- ready-made XHTML fragment shown above the form (for example an
           error paragraph). It is inserted verbatim, so the caller is
           responsible for escaping any untrusted text inside it.

    HTTP_HOST and REQUEST_URI are read from the environment to build the
    address used by the bookmarklet; KeyError is raised if either is missing.
    """
    # Function-scope import: the file's import block does not provide it.
    from xml.sax.saxutils import escape as xml_escape

    # REQUEST_URI includes the query string. Drop it, because the bookmarklet
    # appends its own "?url=..." to this address.
    script_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], script_path)
    # The address ends up inside a single-quoted JavaScript string literal,
    # so percent-encode the characters that could terminate or escape it.
    script_url = script_url.replace("\\", "%5C").replace("'", "%27")

    # Both values are placed inside double-quoted XML attributes.
    attribute_entities = {'"': "&quot;"}
    fields = {
        "{0}": msg,
        "{1}": xml_escape(url, attribute_entities),
        "{2}": xml_escape(script_url, attribute_entities),
    }

    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    # Substitute in a single pass so placeholder-looking text inside one of
    # the substituted values is never expanded a second time.
    sys.stdout.write(
        re.sub(r"\{[012]\}", lambda match: fields[match.group(0)], template)
    )
76
# Module-wide HTTP state used by urlopen(): one cookie jar and opener shared
# by all requests, plus the last requested URL (sent as the next Referer).
cookiejar = cookielib.CookieJar()
_cookie_handler = urllib2.HTTPCookieProcessor(cookiejar)
urlopener = urllib2.build_opener(_cookie_handler)
referrer = ""
80
def urlopen(url, offset=None):
    """Open *url* through the shared opener and return the response object.

    url    -- address to request.
    offset -- optional byte position to resume from; when truthy, a
              "Range: bytes=<offset>-" header is sent.

    The previously requested URL (if any) is sent as the Referer header and
    *url* is remembered as the Referer for the next call.

    Raises ValueError if the response carries a Content-Range header that is
    malformed or does not start at the requested offset. Errors raised by the
    opener itself (e.g. urllib2.HTTPError) propagate unchanged.
    """
    global referrer
    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    referrer = url

    req.add_header("User-Agent", USER_AGENT)

    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    content_range = res.info().getheader("Content-Range")
    if content_range:
        # Expected shape: "bytes <start>-<end>/<total>".
        # Explicit checks rather than assert: assertions vanish under -O, and
        # appending data from the wrong position would corrupt the download.
        try:
            unit, span = content_range.split()[:2]
            start = int(span.split("-")[0])
        except ValueError:
            res.close()
            raise ValueError(
                "Malformed Content-Range header: %r" % content_range
            )
        if unit != "bytes" or start != (offset or 0):
            res.close()
            raise ValueError(
                "Unexpected Content-Range %r for offset %r"
                % (content_range, offset)
            )
    return res
102
def parse_url(url):
    """Fetch *url* and return it parsed by lxml's HTML parser.

    The parser is configured for UTF-8 input with recovery enabled. The HTTP
    response is closed whether or not parsing succeeds.
    """
    f = urlopen(url)
    try:
        parser = html.HTMLParser(encoding="utf-8", recover=True)
        return html.parse(f, parser)
    finally:
        f.close()
108
def append_to_qs(url, params):
    """Return *url* with *params* merged into its query string.

    url    -- address whose existing query string is parsed with parse_qs.
    params -- mapping of extra parameters; a key already present in the URL
              is overwritten.
    """
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    merged = urlparse.parse_qs(query)
    merged.update(params)
    # doseq=True expands the value lists produced by parse_qs.
    query = urllib.urlencode(merged, True)
    return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
116
def convert_from_old_itag(player_config):
    """Add a "url" field to a legacy stream map, modifying *player_config*.

    The stream map stored under args/url_encoded_fmt_stream_map is parsed as
    a query string. For each "itag" value, the text following the first
    "url=" is taken as that stream's URL. The map is then re-encoded and
    written back to the same key.
    """
    args = player_config["args"]
    url_data = urlparse.parse_qs(args["url_encoded_fmt_stream_map"])
    url_data["url"] = [
        itag_url[itag_url.find("url=") + 4:]
        for itag_url in url_data["itag"]
    ]
    args["url_encoded_fmt_stream_map"] = urllib.urlencode(url_data, True)
124
def get_player_config(doc):
    """Find and decode the player configuration embedded in a page's scripts.

    doc -- parsed page; only doc.xpath("//script") and the .text attribute of
           each result are used.

    Every line of every inline script is checked for three forms:
      * "yt.playerConfig = {...};"
      * "ytplayer.config = {...};"
      * the legacy "'PLAYER_CONFIG': {...}" entry, which is additionally
        passed through convert_from_old_itag().

    Returns the decoded JSON object of the first match, or None if no script
    contains one. Errors from json.loads propagate to the caller.
    """
    assignment_markers = ("yt.playerConfig =", "ytplayer.config =")
    legacy_marker = "'PLAYER_CONFIG': "

    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            for marker in assignment_markers:
                start = line.find(marker)
                end = line.rfind(";")
                if start >= 0 and end > 0:
                    # Slice from the end of the marker itself. json.loads
                    # ignores leading whitespace, so this works with or
                    # without a space after "=".
                    return json.loads(line[start + len(marker):end])

            if legacy_marker in line:
                colon = line.find(":")
                if colon >= 0:
                    player_config = json.loads(line[colon + 1:])
                    convert_from_old_itag(player_config)
                    return player_config

    return None
147
def decode_signature(js_url, s):
    """Decode a scrambled stream signature with the player's own JavaScript.

    js_url -- address of the player script containing the decoder function.
    s      -- scrambled signature string to decode.

    The decoder's name is taken from the first call of the form "name(x.s);"
    in the script. Its source, from "function <name>" up to the first closing
    brace, is fed to an external "js" interpreter followed by a console.log()
    call that applies it to *s*.

    Returns the interpreter's standard output with surrounding whitespace
    removed. Raises Exception if the decoder cannot be located in the script
    or if the interpreter exits with a non-zero status.
    """
    res = urlopen(js_url)
    try:
        script = res.read()
    finally:
        res.close()

    match = re.search(r"\b([a-z]+)\([a-z]+\.s\);", script)
    if match is None:
        raise Exception("Could not find signature function call in %s" % js_url)
    func_name = match.group(1)

    p1 = script.find("function " + func_name)
    p2 = script.find("}", p1) if p1 >= 0 else -1
    if p1 < 0 or p2 < 0:
        raise Exception("Could not extract function %s from %s" % (func_name, js_url))
    func_block = script[p1:p2+1]

    # json.dumps produces a correctly escaped JavaScript string literal, so a
    # quote or backslash in the signature cannot alter the program.
    program = "%s\nconsole.log(%s(%s));\n" % (func_block, func_name, json.dumps(s))

    p = subprocess.Popen(
        ["js"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE
    )
    # communicate() writes stdin, reads stdout to EOF and waits for exit.
    output, _ = p.communicate(program)
    if p.returncode != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return output.strip()
169
def get_best_video(player_config):
    """Choose the highest-ranked supported stream and return its signed URL.

    player_config -- decoded player configuration; reads
                     args/url_encoded_fmt_stream_map (comma-separated,
                     query-string-encoded stream descriptions) and assets/js.

    Streams whose quality label is not in QUALITIES or whose MIME type is not
    in MIMETYPES are ignored. Among the rest the highest QUALITIES rank wins;
    on a tie the later entry wins.

    Returns (url, extension), or (None, None) when no stream qualifies. The
    returned URL has a "signature" parameter added: the stream's "sig" value
    if present, otherwise its "s" value run through decode_signature().
    """
    stream_map = player_config["args"]["url_encoded_fmt_stream_map"]
    js_url = player_config["assets"]["js"]

    best_data = None
    best_quality = None
    best_extension = None
    for entry in stream_map.split(","):
        url_data = urlparse.parse_qs(entry)
        mimetype = url_data["type"][0].split(";")[0]
        quality = url_data["quality"][0]

        if quality not in QUALITIES or mimetype not in MIMETYPES:
            continue

        rank = QUALITIES[quality]
        if best_quality is not None and rank < best_quality:
            continue

        best_data = url_data
        best_quality = rank
        best_extension = MIMETYPES[mimetype]

    if best_data is None:
        return None, None

    # Resolve the signature only for the winning stream: decoding may fetch
    # the player script and start an external interpreter, which is wasted
    # work for candidates that end up being discarded.
    if "sig" in best_data:
        signature = best_data["sig"][0]
    else:
        signature = decode_signature(js_url, best_data["s"][0])
    best_url = append_to_qs(best_data["url"][0], {"signature": signature})

    return best_url, best_extension
205
def sanitize_filename(filename):
    """Return *filename* made safe for use as a single path component.

    Leading and trailing whitespace is removed and each internal run of
    whitespace becomes one space. Backslashes and forward slashes are then
    replaced by "-", and NUL characters by a space.
    """
    cleaned = re.sub(r"\s+", " ", filename.strip())
    for unsafe, substitute in (("\\", "-"), ("/", "-"), ("\0", " ")):
        cleaned = cleaned.replace(unsafe, substitute)
    return cleaned
213
def get_video_url(doc):
    """Work out the download URL and a local filename for a parsed page.

    doc -- parsed page, queried with xpath.

    Returns (video_url, filename), where filename is the sanitized page title
    plus the stream's extension, or (None, None) if get_best_video() finds no
    usable stream.

    Raises VideoUnavailable with the page's own "unavailable-message" text if
    that element has text, or with a fixed message if no player
    configuration can be found.
    """
    notice = doc.xpath("//div[@id='unavailable-message']/text()")
    if notice:
        raise VideoUnavailable(notice[0].strip())

    config = get_player_config(doc)
    if not config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(config)
    if not video_url:
        return None, None

    page_title = doc.xpath("/html/head/title/text()")[0]
    return video_url, sanitize_filename(page_title) + "." + extension
232
def write_video(filename, video_data):
    """Stream an opened video response to stdout as a CGI file download.

    filename   -- suggested download name; UTF-8 encoded and percent-quoted
                  into the Content-Disposition header.
    video_data -- open HTTP response; its headers are read via .info() and
                  its body is copied to stdout. It is closed before
                  returning, even if writing fails.
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))

        # A CGI response with a body needs a Content-Type; reuse the upstream
        # one and fall back to a generic binary type.
        content_type = httpinfo.getheader("Content-Type") or "application/octet-stream"
        sys.stdout.write("Content-Type: %s\r\n" % content_type)
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)

        # Leave the header out when the length is unknown rather than
        # sending the literal text "None".
        content_length = httpinfo.getheader("Content-Length")
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)

        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
241
def cgimain():
    """Handle one CGI request.

    Without a "url" query parameter the empty form is shown with an example
    URL. Otherwise the page is fetched and the video is streamed back as a
    download. On failure the form is shown again with an error message:
    the specific reason for VideoUnavailable, a generic hint for anything
    else.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        if not video_url:
            # get_video_url() returns (None, None) when no supported stream
            # exists; report that instead of failing inside urlopen().
            raise VideoUnavailable("No downloadable video format was found")
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception:
        # Deliberate catch-all at the request boundary: the user gets a
        # generic message rather than a server error page.
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
266
def pp_size(size):
    """Format a byte count (or byte rate) using binary unit suffixes.

    size -- non-negative number of bytes.

    Returns a string such as "1.50 KiB". Values below 1024 have an empty
    suffix (so the result ends with a space). GiB is the largest unit, so
    anything bigger is expressed as a larger number of GiB.
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    # Divide only while a larger unit remains; otherwise the value would be
    # scaled one step past the last suffix and be labelled incorrectly.
    for suffix in suffixes[:-1]:
        if size < 1024:
            break
        size /= 1024.0
    else:
        suffix = suffixes[-1]
    return "%.2f %s" % (size, suffix)
274
def copy_with_progress(content_length, infile, outfile):
    """Copy *infile* to *outfile* in 32 KiB chunks, showing progress on stdout.

    content_length -- expected number of bytes, used only for display.
    infile         -- object with read(n); copying stops at the first empty
                      read.
    outfile        -- object with write(data).

    The status line is redrawn whenever more than half a second has passed
    since the last redraw, and once more at the end, followed by a newline.
    """
    def show_status(now, window_start, window_bytes, total_bytes):
        # Rate covers only the bytes copied since the previous redraw.
        rate = 0
        if now != window_start:
            rate = window_bytes / (now - window_start)
        # ESC[2K clears the terminal line; \r returns to its start.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(total_bytes),
            pp_size(content_length),
            pp_size(rate),
        ))
        sys.stdout.flush()

    window_start = 0
    window_bytes = 0
    total_bytes = 0
    while True:
        now = time.time()
        if now - window_start > 0.5:
            show_status(now, window_start, window_bytes, total_bytes)
            window_start = now
            window_bytes = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        chunk_size = len(chunk)
        window_bytes += chunk_size
        total_bytes += chunk_size

    # Final status, then a newline so later output starts on a fresh line.
    show_status(now, window_start, window_bytes, total_bytes)
    sys.stdout.write("\n")
308
def main():
    """Command-line entry point: download the video at sys.argv[1].

    The video is saved in the current directory under a name derived from
    the page title. An existing file of that name is treated as a partial
    download and continued from its current size. A transfer that ends
    before the expected size is reached is restarted from the bytes already
    written.

    Exits with status 1 if no URL argument is given or if the page offers no
    downloadable format.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        # get_video_url() returns (None, None) when no supported stream exists.
        sys.stderr.write("Could not find a downloadable video format\n")
        sys.exit(1)
    print("Downloading %s" % filename.encode("utf-8"))

    # Binary append mode: video data must not go through newline translation.
    outfile = open(filename, "ab")
    try:
        # Position explicitly at the end so tell() reports the existing size.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            print("Resuming download from %s" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                # 416: the requested range starts at or past the end of the
                # file, i.e. everything is already on disk.
                if e.code == 416:
                    print("File is complete!")
                    break
                raise

            try:
                content_length = int(video_data.info().getheader("Content-Length"))
                if total_size is None:
                    # Content-Length counts only the bytes from offset
                    # onwards, so the full size includes what we already have.
                    total_size = offset + content_length

                try:
                    copy_with_progress(content_length, video_data, outfile)
                except IOError:
                    # Transfer broke off; end the status line and let the
                    # size check below decide whether to retry.
                    sys.stdout.write("\n")
            finally:
                video_data.close()

            position = outfile.tell()
            if position == total_size:
                break
            if position == offset:
                # No progress at all: pause briefly before retrying.
                time.sleep(1)
            offset = position
            print("Restarting download from %s" % pp_size(offset))
    finally:
        outfile.close()
356
357
if __name__ == "__main__":
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
    # SCRIPT_NAME in the environment selects CGI mode; without it the script
    # runs as a command-line downloader.
    if "SCRIPT_NAME" not in os.environ:
        try:
            main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(1)
    else:
        cgimain()
368