]> code.delx.au - youtube-cgi/blob - youtube.cgi
support new JS variable name
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/env python
2
3 from __future__ import division
4
5 import cookielib
6 import cgi
7 import json
8 from lxml import html
9 import os
10 import re
11 import resource
12 import shutil
13 import subprocess
14 import sys
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Address-space limit (128 MiB) handed to resource.setrlimit at startup.
MAX_MEMORY_BYTES = 128 * 1024*1024
# User-Agent header value sent with every outgoing HTTP request.
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"

# Stream MIME types that are accepted, mapped to the file extension used
# when naming the download.
MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

# Recognised quality labels mapped to a rank; the stream with the highest
# rank is the one selected.
QUALITIES = {
    "hd1080": 5,
    "hd720": 4,
    "large": 3,
    "medium": 2,
    "small": 1,
}
37
38
class VideoUnavailable(Exception):
    """Raised when no video can be obtained from a page.

    The single constructor argument is a human-readable reason; the CGI
    front end shows it to the user.
    """
41
def print_form(url="", msg=""):
    """Write the download form to stdout as a complete CGI response.

    url -- text pre-filled into the URL input box. It is treated as
           untrusted and XML-escaped before being placed in the ``value``
           attribute.
    msg -- ready-made XHTML fragment shown above the form (for example an
           error paragraph). It is inserted verbatim, so the caller must
           escape any untrusted text inside it.

    HTTP_HOST and REQUEST_URI are read from the environment to build the
    bookmarklet target; a KeyError is raised if either is missing.
    """
    # Local import keeps this block independent of the file's import list.
    from xml.sax.saxutils import escape

    attr_entities = {'"': "&quot;"}

    # REQUEST_URI includes the query string; drop it, otherwise the
    # bookmarklet would expand to ".../youtube.cgi?url=old?url=<page>".
    script_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], script_path)
    # The URL ends up inside a single-quoted JavaScript string literal.
    script_url = script_url.replace("\\", "%5C").replace("'", "%27")

    values = {
        "0": msg,
        "1": escape(url, attr_entities),
        "2": escape(script_url, attr_entities),
    }

    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""

    # Fill every placeholder in a single pass so that text substituted for
    # one placeholder is never re-scanned for another.
    page = re.sub(r"\{([012])\}", lambda m: values[m.group(1)], template)

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    sys.stdout.write(page)
76
# Module-level HTTP state shared by every request made through urlopen():
# a cookie jar and an opener that stores/sends cookies via that jar.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
# URL of the most recent request; urlopen() sends it as the Referer header
# of the next request. Empty until the first request has been made.
referrer = ""
80
def urlopen(url, offset=None):
    """Open *url* through the shared cookie-aware opener.

    url    -- address to fetch.
    offset -- optional byte position to resume from. When truthy, a
              ``Range: bytes=<offset>-`` header is sent.

    Every request carries the configured User-Agent and, once a previous
    request has been made, that request's URL as the Referer. The module
    level ``referrer`` is then updated to *url*.

    Returns the response object produced by the opener.

    Raises ValueError if the response has a Content-Range header that is
    not expressed in bytes or does not start at the requested offset.
    Errors raised by the opener itself propagate unchanged.
    """
    global referrer

    req = urllib2.Request(url)
    if referrer:
        req.add_header("Referer", referrer)
    referrer = url

    req.add_header("User-Agent", USER_AGENT)
    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    content_range = res.info().getheader("Content-Range")
    if content_range:
        # Validated with explicit raises rather than assert: asserts are
        # removed under "python -O", and appending data that starts at the
        # wrong position would silently corrupt a resumed download.
        tokens = content_range.split()
        if len(tokens) < 2 or tokens[0] != "bytes":
            res.close()
            raise ValueError("Unsupported Content-Range: %r" % content_range)
        start = int(tokens[1].split("-")[0])
        # No offset requested is equivalent to starting at byte 0.
        expected = offset or 0
        if start != expected:
            res.close()
            raise ValueError(
                "Server resumed at byte %d, expected %d" % (start, expected)
            )
    return res
102
def parse_url(url):
    """Fetch *url* and return it parsed as an lxml HTML document tree.

    The page is decoded as UTF-8 with the parser's recovery mode enabled.
    The HTTP response is closed even when parsing raises.
    """
    f = urlopen(url)
    try:
        parser = html.HTMLParser(encoding="utf-8", recover=True)
        return html.parse(f, parser)
    finally:
        # Release the connection on the error path too.
        f.close()
108
def append_to_qs(url, params):
    """Return *url* with *params* merged into its query string.

    url    -- the URL to extend.
    params -- mapping of parameter names to values; a name already present
              in the URL is replaced by the value given here.

    The existing query is parsed, updated with *params* and re-encoded;
    every other URL component is carried over unchanged.
    """
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
    merged = urlparse.parse_qs(query)
    merged.update(params)
    # doseq=True because parse_qs yields a list of values per name.
    new_query = urllib.urlencode(merged, True)
    return urlparse.urlunsplit((scheme, netloc, path, new_query, fragment))
116
def convert_from_old_itag(player_config):
    """Rewrite an old-format stream map in place so it has ``url`` fields.

    player_config -- dict whose ["args"]["url_encoded_fmt_stream_map"]
                     entry is a query string with ``itag`` values that
                     each embed the stream address after ``url=``.

    The text following ``url=`` in each ``itag`` value is collected into a
    ``url`` field and the map is re-encoded into the same dict entry.
    Nothing is returned.
    """
    args = player_config["args"]
    fields = urlparse.parse_qs(args["url_encoded_fmt_stream_map"])
    # +4 skips over the "url=" marker itself.
    fields["url"] = [
        entry[entry.find("url=") + 4:] for entry in fields["itag"]
    ]
    args["url_encoded_fmt_stream_map"] = urllib.urlencode(fields, True)
124
def get_player_config(doc):
    """Extract the player configuration embedded in a page's scripts.

    doc -- parsed document; its ``//script`` elements are searched line by
           line for one of three known assignments:

           * ``yt.playerConfig = {...};``
           * ``ytplayer.config = {...};``
           * ``'PLAYER_CONFIG': {...}`` (old format, converted through
             convert_from_old_itag before being returned)

    Returns the decoded configuration dict, or None when no script line
    contains a recognised assignment. Invalid JSON raises ValueError from
    json.loads.
    """
    assignment_markers = ("yt.playerConfig =", "ytplayer.config =")
    old_marker = "'PLAYER_CONFIG': "

    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            for marker in assignment_markers:
                start = line.find(marker)
                if start < 0:
                    continue
                # Slice from directly after the marker instead of a fixed
                # offset: json.loads ignores leading whitespace, so this
                # works with or without a space after "=" and regardless
                # of other "=" signs earlier on the line.
                start += len(marker)
                end = line.rfind(";")
                if end > start:
                    return json.loads(line[start:end])

            start = line.find(old_marker)
            if start >= 0:
                player_config = json.loads(line[start + len(old_marker):])
                convert_from_old_itag(player_config)
                return player_config
    return None
147
def get_best_video(player_config):
    """Pick the highest-ranked supported stream from a player config.

    player_config -- dict whose ["args"]["url_encoded_fmt_stream_map"]
                     entry is a comma-separated list of query-string
                     encoded stream descriptions.

    Only streams whose quality is listed in QUALITIES and whose MIME type
    is listed in MIMETYPES are considered; on equal rank the first one
    wins. The stream's ``sig`` value is appended to its URL as the
    ``signature`` parameter.

    Returns (url, extension), or (None, None) when no usable stream is
    found.
    """
    stream_map = player_config["args"]["url_encoded_fmt_stream_map"]

    best_url = None
    best_quality = None
    best_extension = None
    for entry in stream_map.split(","):
        fields = urlparse.parse_qs(entry)
        try:
            video_url = fields["url"][0]
            mimetype = fields["type"][0].split(";")[0]
            quality_name = fields["quality"][0]
        except KeyError:
            # Empty or malformed entry: skip it rather than abandoning the
            # remaining, possibly valid, entries.
            continue

        if quality_name not in QUALITIES or mimetype not in MIMETYPES:
            continue
        # The signature is only needed for streams that passed the filters,
        # so an unsupported entry without one no longer raises.
        if "sig" not in fields:
            continue

        quality = QUALITIES[quality_name]
        if best_quality is None or quality > best_quality:
            best_url = append_to_qs(video_url, {"signature": fields["sig"][0]})
            best_quality = quality
            best_extension = MIMETYPES[mimetype]

    return best_url, best_extension
176
def sanitize_filename(filename):
    """Return *filename* made safe for use as a single path component.

    Surrounding whitespace is stripped and each run of whitespace becomes
    one space; backslashes and forward slashes become "-", and NUL
    characters become spaces.
    """
    cleaned = re.sub(r"\s+", " ", filename.strip())
    for unsafe, substitute in (("\\", "-"), ("/", "-"), ("\0", " ")):
        cleaned = cleaned.replace(unsafe, substitute)
    return cleaned
184
def get_video_url(doc):
    """Work out the download URL and local filename for a parsed page.

    doc -- parsed document of the video page.

    Returns (video_url, filename), where the filename is the sanitized
    page title plus the stream's extension, or (None, None) when the
    player configuration contains no usable stream.

    Raises VideoUnavailable when the page carries an "unavailable-message"
    element (its text becomes the reason) or when no player configuration
    can be found.
    """
    notice = doc.xpath("//div[@id='unavailable-message']/text()")
    if notice:
        raise VideoUnavailable(notice[0].strip())

    config = get_player_config(doc)
    if not config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(config)
    if not video_url:
        return None, None

    page_title = doc.xpath("/html/head/title/text()")[0]
    return video_url, sanitize_filename(page_title) + "." + extension
203
def write_video(filename, video_data):
    """Stream an opened video response to stdout as a CGI download.

    filename   -- suggested download name (text); sent UTF-8
                  percent-encoded in the Content-Disposition header.
    video_data -- open response object providing info(), read() and
                  close().

    Headers written: Content-Type (forwarded from the upstream response,
    or application/octet-stream when absent), Content-Disposition, and
    Content-Length only when the upstream response supplied one.
    *video_data* is closed when the copy finishes or fails.
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))
        # A CGI response with a body must declare a Content-Type.
        content_type = httpinfo.getheader("Content-Type") or "application/octet-stream"
        content_length = httpinfo.getheader("Content-Length")

        sys.stdout.write("Content-Type: %s\r\n" % content_type)
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
        # Leave the header out instead of sending the literal text "None".
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)
        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
212
def _is_http_url(url):
    """Return True when *url* uses the http or https scheme."""
    return urlparse.urlsplit(url).scheme.lower() in ("http", "https")


def cgimain():
    """CGI entry point: stream the requested video or show the form.

    Reads the ``url`` request parameter. Without it, the empty form is
    shown with an example URL. Otherwise the page is fetched, the best
    stream is located and streamed to the client; any failure re-displays
    the form with an error message.
    """
    generic_error = "<p class='error'>Sorry, there was an error. Check your URL?</p>"

    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    # The URL comes straight from the client and the opener would also
    # accept schemes such as file://, so only plain web URLs are fetched.
    if not _is_http_url(url):
        print_form(url=url, msg=generic_error)
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        if not video_url:
            # get_video_url signals "no usable stream" with (None, None).
            raise VideoUnavailable("No downloadable video format was found")
        # The stream address is taken from the fetched page, so it is
        # checked the same way as the client-supplied URL.
        if not _is_http_url(video_url):
            raise VideoUnavailable("The video address is not a web URL")
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception:
        # Top-level boundary: any other failure is reported generically.
        print_form(url=url, msg=generic_error)
237
def pp_size(size):
    """Format a byte count as a short human-readable string.

    size -- number of bytes (or bytes per second) as an int or float.

    Returns the value with two decimals followed by a space and a binary
    unit suffix: "" for plain bytes, then KiB, MiB, GiB. Values too large
    for GiB stay in GiB (for example "1024.00 GiB") rather than being
    scaled down once more under the same label.
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    largest = len(suffixes) - 1
    index = 0
    # Stop scaling at the largest suffix so the number and unit always match.
    while size >= 1024 and index < largest:
        size /= 1024.0
        index += 1
    return "%.2f %s" % (size, suffixes[index])
245
def copy_with_progress(content_length, infile, outfile):
    """Copy *infile* to *outfile* in 32 KiB chunks with a progress line.

    content_length -- expected number of bytes, shown as the total.
    infile         -- object with read(size); an empty result ends the copy.
    outfile        -- object with write(data).

    The status line (bytes copied / total and the rate over the current
    reporting window) is redrawn on stdout whenever more than half a
    second has passed since the previous report, and once more at the
    end, followed by a newline.
    """
    def report(now, window_start, window_bytes, total_bytes):
        # Rate is measured over the current window only; zero when the
        # window has no duration yet.
        rate = 0
        if now != window_start:
            rate = window_bytes / (now - window_start)
        # ESC[2K clears the terminal line, "\r" returns to its start.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(total_bytes),
            pp_size(content_length),
            pp_size(rate),
        ))
        sys.stdout.flush()

    window_start = 0
    window_bytes = 0
    total_bytes = 0
    while True:
        now = time.time()
        if now - window_start > 0.5:
            report(now, window_start, window_bytes, total_bytes)
            window_start = now
            window_bytes = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        chunk_len = len(chunk)
        window_bytes += chunk_len
        total_bytes += chunk_len

    # Final status, then a newline to finish the progress line.
    report(now, window_start, window_bytes, total_bytes)
    sys.stdout.write("\n")
279
def main():
    """Command-line entry point: download the video at sys.argv[1].

    The file is saved in the current directory under the sanitized page
    title. An existing partial file is resumed from its current size, and
    the transfer is restarted from the last written byte whenever it is
    interrupted. Exits with status 1 on a usage error or when the page
    offers no usable stream.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        # get_video_url signals "no usable stream" with (None, None).
        sys.stderr.write("No downloadable video format was found\n")
        sys.exit(1)
    print("Downloading %s" % filename.encode("utf-8"))

    # Binary append mode: video data must not go through newline
    # translation on any platform.
    outfile = open(filename, "ab")
    try:
        # The initial position of a file opened for appending is platform
        # dependent; seek explicitly so tell() reports the existing size.
        outfile.seek(0, os.SEEK_END)
        offset = outfile.tell()
        if offset > 0:
            print("Resuming download from %s" % pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib2.HTTPError as e:
                # 416: the requested range starts at or past the end of
                # the resource, i.e. everything is already on disk.
                if e.code == 416:
                    print("File is complete!")
                    break
                raise

            try:
                content_length = int(video_data.info().getheader("Content-Length"))
                if total_size is None:
                    # Content-Length of a ranged response covers only the
                    # remaining bytes, so add what is already on disk.
                    total_size = offset + content_length

                try:
                    copy_with_progress(content_length, video_data, outfile)
                except IOError:
                    # Best effort: end the progress line and let the size
                    # check below decide whether to restart.
                    sys.stdout.write("\n")
            finally:
                video_data.close()

            if outfile.tell() == total_size:
                break

            old_offset = offset
            offset = outfile.tell()
            if old_offset == offset:
                # No progress at all on this attempt; pause before retrying.
                time.sleep(1)
            print("Restarting download from %s" % pp_size(offset))
    finally:
        outfile.close()
327
328
if __name__ == "__main__":
    # Cap the process address space (soft and hard limit alike) before
    # doing any work.
    resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES,) * 2)

    # SCRIPT_NAME in the environment means the script was started as a CGI
    # program; otherwise it runs as a command-line tool.
    running_as_cgi = "SCRIPT_NAME" in os.environ
    if running_as_cgi:
        cgimain()
    else:
        try:
            main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(1)
339