code.delx.au - youtube-cgi/blob - youtube.cgi
fixed parsing of video url data
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/env python
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Limit (in bytes) handed to resource.setrlimit(RLIMIT_AS, ...) at startup.
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# User-Agent header value attached to every outgoing request by urlopen().
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) "
    "Gecko/20100101 Firefox/15.0.1"
)

# Stream MIME type -> file extension; streams with other types are skipped.
MIMETYPES = dict([
    ("video/mp4", "mp4"),
    ("video/x-flv", "flv"),
    ("video/3gpp", "3gp"),
])

# Quality label -> rank (higher is better); unknown labels are skipped.
QUALITIES = dict(
    hd1080=5,
    hd720=4,
    large=3,
    medium=2,
    small=1,
)
37
38
class VideoUnavailable(Exception):
    """Signals that no downloadable video could be located on a page.

    The exception argument is a human-readable reason; the CGI front end
    shows it to the user.
    """
41
def print_form(url="", msg=""):
    """Write the XHTML download form (CGI headers included) to stdout.

    Args:
        url: Text pre-filled into the URL input box. Treated as untrusted
            and escaped before being placed in the attribute.
        msg: Ready-made XHTML fragment shown above the form (for example an
            error paragraph). Inserted verbatim, so the caller must escape
            any untrusted text it contains.

    Reads HTTP_HOST and REQUEST_URI from the CGI environment to build the
    bookmarklet target.
    """
    # Function-scope import keeps this block self-contained; the page is
    # served as application/xhtml+xml, so XML escaping is what is needed.
    from xml.sax.saxutils import escape as xml_escape

    def attr_escape(text):
        # Escape for use inside a quoted XML attribute value.
        return xml_escape(text, {'"': "&quot;", "'": "&#39;"})

    # REQUEST_URI carries the query string; drop it so that the bookmarklet
    # can append its own "?url=" without producing "...?url=x?url=y".
    request_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], request_path)
    # The value ends up inside a single-quoted JavaScript string literal, so
    # neutralise the characters that could terminate or escape that literal.
    script_url = script_url.replace("\\", "%5C").replace("'", "%27")

    substitutions = {
        "{0}": msg,
        "{1}": attr_escape(url),
        "{2}": attr_escape(script_url),
    }

    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    # Single-pass substitution: placeholder-looking text inside msg or url is
    # never expanded a second time (chained str.replace would do that). The
    # CSS/JS braces in the template rule out str.format.
    sys.stdout.write(
        re.sub(r"\{[012]\}", lambda m: substitutions[m.group(0)], template)
    )
76
# Module-wide HTTP state used by urlopen(): one cookie jar and opener for all
# requests, plus the previously requested URL (sent as the next Referer).
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cookiejar)
)
referrer = ""
80
def urlopen(url, offset=None):
    """Open *url* through the shared opener and return the response object.

    Each request carries the module's User-Agent and, once a previous
    request has been made, that request's URL as the Referer.

    Args:
        url: Address to fetch.
        offset: Optional byte position to resume from. When truthy a
            "Range: bytes=<offset>-" header is sent.

    Returns:
        The response object returned by the opener.

    Raises:
        IOError: The response has a Content-Range header that is malformed
            or does not start at the requested offset.
        urllib2.HTTPError / urllib2.URLError: Propagated from the opener.
    """
    global referrer
    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    referrer = url

    req.add_header("User-Agent", USER_AGENT)

    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    content_range = res.info().getheader("Content-Range")
    if content_range:
        # Expected shape: "bytes <start>-<end>/<total>". Checked with explicit
        # raises rather than assert so the check survives "python -O".
        try:
            unit, span = content_range.split()[:2]
            start = int(span.split("-")[0])
        except ValueError:
            res.close()
            raise IOError("Malformed Content-Range header: %r" % content_range)
        if unit != "bytes" or start != (offset or 0):
            res.close()
            raise IOError(
                "Unexpected Content-Range %r for requested offset %r"
                % (content_range, offset)
            )
    return res
102
def parse_url(url):
    """Fetch *url* and return it parsed as an lxml HTML document.

    The page is decoded as UTF-8 with parser recovery enabled. The HTTP
    response is closed even when parsing raises.
    """
    f = urlopen(url)
    try:
        return html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    finally:
        f.close()
108
def append_to_qs(url, params):
    """Return *url* with *params* merged into its query string.

    Existing query parameters are parsed, updated with *params* (which win
    on key collisions) and re-encoded; the other URL components are kept.
    """
    parts = urlparse.urlsplit(url)
    merged = urlparse.parse_qs(parts.query)
    merged.update(params)
    # doseq=True so that the list values produced by parse_qs are expanded
    # into repeated key=value pairs.
    new_query = urllib.urlencode(merged, True)
    return urlparse.urlunsplit(parts._replace(query=new_query))
116
def convert_from_old_itag(player_config):
    """Rewrite an old-style stream map in place so it has "url" entries.

    In the old layout handled here the stream URL is embedded inside each
    "itag" value after a "url=" marker. The text following that marker is
    collected into a "url" list and the map is re-encoded back into
    player_config["args"]["url_encoded_fmt_stream_map"].

    Args:
        player_config: Decoded player configuration dict; modified in place.
    """
    stream_args = player_config["args"]
    url_data = urlparse.parse_qs(stream_args["url_encoded_fmt_stream_map"])

    urls = []
    for itag_url in url_data.get("itag", []):
        # partition() reports a missing marker explicitly; find() returned -1
        # and the old slice then appended itag_url[3:] as a bogus URL.
        _, marker, tail = itag_url.partition("url=")
        if marker:
            urls.append(tail)
    url_data["url"] = urls

    stream_args["url_encoded_fmt_stream_map"] = urllib.urlencode(url_data, True)
124
def get_player_config(doc):
    """Extract the player configuration from a parsed watch page.

    Every <script> element is scanned line by line for one of two markers:

    * "yt.playerConfig =" -- the JSON that follows is decoded and returned
      immediately.
    * "'PLAYER_CONFIG': " -- the JSON that follows is decoded, converted
      with convert_from_old_itag() and remembered; scanning continues, so a
      later "yt.playerConfig =" line still takes precedence.

    Args:
        doc: Parsed document offering xpath().

    Returns:
        The decoded configuration, or None when no marker is found.

    Raises:
        ValueError: A marker was found but is not followed by valid JSON.
    """
    decoder = json.JSONDecoder()

    def decode_after(line, marker):
        # Start right after the marker itself (not after the first "=" or ":"
        # anywhere in the line) and let raw_decode stop at the end of the
        # JSON value, so trailing text such as ";" or "," is tolerated.
        start = line.find(marker) + len(marker)
        return decoder.raw_decode(line[start:].lstrip())[0]

    player_config = None
    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            if "yt.playerConfig =" in line:
                return decode_after(line, "yt.playerConfig =")
            if "'PLAYER_CONFIG': " in line:
                player_config = decode_after(line, "'PLAYER_CONFIG': ")
                convert_from_old_itag(player_config)
    return player_config
142
def get_best_video(player_config):
    """Pick the highest-ranked supported stream from the player config.

    The comma-separated "url_encoded_fmt_stream_map" is split into entries,
    each of which is a query string. Entries whose quality is not in
    QUALITIES or whose MIME type is not in MIMETYPES are ignored, as are
    entries lacking a "url", "type" or "quality" field. When an entry has a
    "sig" field it is appended to the URL as the "signature" parameter.

    Args:
        player_config: Decoded player configuration dict.

    Returns:
        (url, extension) of the best stream, or (None, None) if no entry
        qualifies. On equal rank the earliest entry wins.
    """
    stream_map = player_config["args"]["url_encoded_fmt_stream_map"]

    best_url = None
    best_quality = None
    best_extension = None
    for encoded_entry in stream_map.split(","):
        url_data = urlparse.parse_qs(encoded_entry)
        try:
            video_url = url_data["url"][0]
            mimetype = url_data["type"][0].split(";")[0]
            quality = url_data["quality"][0]
        except KeyError:
            # One malformed entry should not hide the usable ones.
            continue

        if quality not in QUALITIES or mimetype not in MIMETYPES:
            continue

        rank = QUALITIES[quality]
        if best_quality is None or rank > best_quality:
            # The signature is optional: look it up without raising.
            signature = url_data.get("sig", [None])[0]
            if signature:
                video_url = append_to_qs(video_url, {"signature": signature})
            best_url = video_url
            best_quality = rank
            best_extension = MIMETYPES[mimetype]

    return best_url, best_extension
172
def sanitize_filename(filename):
    """Return *filename* made safe to use as a single path component.

    Surrounding whitespace is stripped, internal whitespace runs collapse to
    one space, path separators ("/" and "\\") become "-" and NUL bytes
    become spaces.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    collapsed = re.sub(r"\s+", " ", filename.strip())
    return (
        collapsed
        .replace("\\", "-")
        .replace("/", "-")
        .replace("\0", " ")
    )
180
def get_video_url(doc):
    """Work out the download URL and a local filename for a watch page.

    Args:
        doc: Parsed watch page offering xpath().

    Returns:
        (video_url, filename), or (None, None) when the page has a player
        configuration but none of its streams is usable.

    Raises:
        VideoUnavailable: The page shows an "unavailable" message, or no
            player configuration could be found.
    """
    unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
    if unavailable:
        raise VideoUnavailable(unavailable[0].strip())

    player_config = get_player_config(doc)
    if not player_config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(player_config)
    if not video_url:
        return None, None

    # A page without a <title>, or with an empty one, must not crash or
    # yield a bare ".ext" filename.
    titles = doc.xpath("/html/head/title/text()")
    title = titles[0] if titles else ""
    filename = sanitize_filename(title) or "video"
    filename += "." + extension

    return video_url, filename
199
def write_video(filename, video_data):
    """Stream an opened video response to stdout as a CGI download.

    Writes Content-Type, Content-Disposition and (when known)
    Content-Length, then copies the body. The upstream response is closed
    on success and on failure.

    Args:
        filename: Download name offered to the browser; percent-encoded as
            UTF-8 in the Content-Disposition header.
        video_data: Response object as returned by urlopen().
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))

        # CGI requires a Content-Type whenever a body is returned; reuse the
        # upstream type and fall back to a generic binary type.
        content_type = (
            httpinfo.getheader("Content-Type") or "application/octet-stream"
        )
        sys.stdout.write("Content-Type: %s\r\n" % content_type)
        sys.stdout.write(
            "Content-Disposition: attachment; filename*=UTF-8''%s\r\n"
            % encoded_filename
        )

        # Omit the header rather than emit the literal "Content-Length: None".
        content_length = httpinfo.getheader("Content-Length")
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)

        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
208
def cgimain():
    """CGI entry point: serve the form, or stream the requested video.

    Without a "url" query parameter the blank form is shown. Otherwise the
    page is fetched and the best stream is sent back as a download; any
    failure re-renders the form with an error message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        # The URL is user-supplied and fetched server-side: only allow web
        # schemes so it cannot be pointed at file:// or similar handlers.
        if urlparse.urlsplit(url).scheme not in ("http", "https"):
            raise ValueError("unsupported URL scheme")

        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        if not video_url:
            # get_video_url() returns (None, None) when no stream is usable.
            raise VideoUnavailable("No downloadable video format was found")
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        # args[0] rather than str(e): the reason may be a non-ASCII unicode
        # string scraped from the page.
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception:
        # Top-level boundary of the CGI: show a generic message rather than
        # letting the web server return a bare 500.
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
233
def pp_size(size):
    """Format a byte count with a binary-unit suffix and two decimals.

    Plain byte counts get an empty suffix (e.g. "512.00 "). Values of
    1024 GiB and above stay in GiB instead of being divided once more than
    there are suffixes.
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    index = 0
    # Stop scaling at the last suffix so the number and the unit always match.
    while size >= 1024 and index < len(suffixes) - 1:
        size /= 1024.0
        index += 1
    return "%.2f %s" % (size, suffixes[index])
241
def copy_with_progress(content_length, infile, outfile):
    """Copy *infile* to *outfile* in 32 KiB reads with a live status line.

    The status line (bytes copied / content_length, plus the transfer rate
    over the most recent interval) is rewritten in place on stdout at most
    once every half second and once more when the copy ends, followed by a
    newline.
    """
    def show(copied, interval_bytes, elapsed):
        # Rate stays 0 when no time has passed, avoiding a division by zero.
        speed = 0
        if elapsed != 0:
            speed = interval_bytes / elapsed
        # ESC[2K erases the current terminal line; "\r" returns to column 0.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(copied),
            pp_size(content_length),
            pp_size(speed),
        ))
        sys.stdout.flush()

    interval_start = 0
    interval_bytes = 0
    copied = 0
    while True:
        now = time.time()
        if now - interval_start > 0.5:
            show(copied, interval_bytes, now - interval_start)
            interval_start = now
            interval_bytes = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        chunk_len = len(chunk)
        interval_bytes += chunk_len
        copied += chunk_len

    # Final status, then finish the line.
    show(copied, interval_bytes, now - interval_start)
    sys.stdout.write("\n")
275
def main():
    """Command-line entry point: download the video at sys.argv[1].

    The file is named after the page title and written in the current
    directory. An existing partial file is resumed with HTTP Range
    requests, and an interrupted transfer is retried from the current file
    size until the expected size is reached or the server answers 416.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write(
            "Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0]
        )
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        # get_video_url() returns (None, None) when no stream is usable.
        sys.stderr.write("No downloadable video format was found\n")
        sys.exit(1)
    print("Downloading %s" % filename.encode("utf-8"))

    # Binary append: the payload is not text. Seek to the end explicitly,
    # because the position reported right after an append-mode open is
    # platform-dependent.
    outfile = open(filename, "ab")
    try:
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            print("Resuming download from %s" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                if e.code == 416:
                    # Requested range starts at/after the end of the file.
                    print("File is complete!")
                    break
                raise

            content_length = int(video_data.info().getheader("Content-Length"))
            if total_size is None:
                # Content-Length of a ranged response covers only the
                # remaining bytes, so add what is already on disk.
                # NOTE(review): a server that ignores the Range header would
                # have its full body appended to the partial file; that case
                # is not detected here.
                total_size = offset + content_length

            try:
                copy_with_progress(content_length, video_data, outfile)
            except IOError:
                # Best effort: end the status line and retry from the
                # current position below.
                sys.stdout.write("\n")
            finally:
                video_data.close()

            if outfile.tell() == total_size:
                break

            old_offset = offset
            offset = outfile.tell()
            if old_offset == offset:
                # No progress at all on this attempt: back off briefly.
                time.sleep(1)
            print("Restarting download from %s" % pp_size(offset))
    finally:
        outfile.close()
323
324
if __name__ == "__main__":
    # Cap the process address space before doing any work.
    resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
    # SCRIPT_NAME in the environment is taken to mean the script is running
    # as a CGI; otherwise it behaves as a command-line tool.
    if "SCRIPT_NAME" in os.environ:
        cgimain()
    else:
        try:
            main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(1)
335