]> code.delx.au - youtube-cgi/blob - youtube.cgi
17fcf1697b517ec053abb2ff98a6dfd74202ea93
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/env python
2
3 import cookielib
4 import cgi
5 import itertools
6 import json
7 from lxml import html
8 import os
9 import re
10 import resource
11 import shutil
12 import subprocess
13 import sys
14 import time
15 import urllib
16 import urllib2
17 import urlparse
18
19
# Limit passed to resource.setrlimit(RLIMIT_AS) when the script starts.
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# User-Agent header sent with every outgoing request.
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) "
    "Gecko/20100101 Firefox/15.0.1"
)

# Accepted stream MIME types mapped to the file extension used when saving.
MIMETYPES = {
    "video/3gpp": "3gp",
    "video/x-flv": "flv",
    "video/mp4": "mp4",
}

# Quality labels mapped to a rank; a higher rank is preferred.
QUALITIES = dict(small=1, medium=2, large=3)
34
35
class VideoUnavailable(Exception):
    """Raised when a page yields no usable video (message explains why)."""
38
def print_form(url="", msg=""):
    """Write the download form as a complete CGI response to stdout.

    url -- value pre-filled into the text input. It is treated as untrusted
           text and escaped before being placed in the HTML attribute.
    msg -- markup inserted verbatim above the form (callers pass ready-made
           HTML such as an error paragraph).

    Reads HTTP_HOST and REQUEST_URI from the environment to build the
    address used by the bookmarklet link.
    """
    def escape_attr(text):
        # Same replacements as cgi.escape(text, True).
        return (
            text.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace('"', "&quot;")
        )

    # REQUEST_URI includes the query string; drop it so that the bookmarklet
    # does not end up as ".../youtube.cgi?url=X?url=...".
    script_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], script_path)
    # The address lands inside a single-quoted JavaScript string within an
    # HTML attribute: percent-encode the characters that could terminate the
    # JS string, then escape for the attribute itself.
    script_url = escape_attr(
        script_url.replace("\\", "%5C").replace("'", "%27")
    )

    substitutions = {
        "{0}": msg,
        "{1}": escape_attr(url),
        "{2}": script_url,
    }
    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <title>delx.net.au - YouTube Scraper</title>
    <link rel="stylesheet" type="text/css" href="/style.css"/>
    <style type="text/css">
        input[type="text"] {
            width: 100%;
        }
        .error {
            color: red;
        }
    </style>
</head>
<body>
    <h1>delx.net.au - YouTube Scraper</h1>
    {0}
    <form action="" method="get">
    <p>This page will let you easily download YouTube videos to watch offline. It
    will automatically grab the highest quality version.</p>
    <div><input type="text" name="url" value="{1}"/></div>
    <div><input type="submit" value="Download!"/></div>
    </form>
    <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
    to easily download videos. Right-click the link and add it to bookmarks,
    then when you're looking at a YouTube page select that bookmark from your
    browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""
    # Substitute in a single pass so that placeholder-like text inside one
    # value (e.g. "{1}" within msg) is never expanded a second time.
    page = re.sub(
        r"\{[012]\}",
        lambda match: substitutions[match.group(0)],
        template,
    )

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    sys.stdout.write(page)
73
# Module-wide HTTP state shared by every request made through urlopen():
# the URL of the previous request (sent as the next Referer header) ...
referrer = ""
# ... and a cookie-aware opener, so cookies set by one response are
# returned on later requests.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
77
def urlopen(url):
    """Open url through the shared cookie-aware opener.

    Sends the configured User-Agent and, when an earlier request was made,
    that request's URL as the Referer. The module-level referrer is then
    updated to url. Returns whatever the opener's open() returns.
    """
    global referrer
    request = urllib2.Request(url)
    previous, referrer = referrer, url
    if previous:
        request.add_header("Referer", previous)
    request.add_header("User-Agent", USER_AGENT)
    return urlopener.open(request)
86
def parse_url(url):
    """Fetch url and return it parsed as an lxml HTML document.

    The page is parsed as UTF-8 with parser recovery enabled. The HTTP
    response is closed even when parsing raises.
    """
    f = urlopen(url)
    try:
        return html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    finally:
        # Release the connection whether or not parsing succeeded.
        f.close()
92
def append_to_qs(url, params):
    """Return url with params merged into its query string.

    url    -- the URL to extend.
    params -- dict of parameters to add; a key already present in the URL
              is replaced by the value given here.

    Existing parameters are preserved, including ones with an empty value.
    The order of parameters in the result is not guaranteed.
    """
    parts = list(urlparse.urlsplit(url))
    # Index 3 of the split result is the query string. Keep blank values:
    # by default parse_qs silently discards "key=" pairs, which would strip
    # parameters from the caller's URL.
    query = urlparse.parse_qs(parts[3], True)
    query.update(params)
    parts[3] = urllib.urlencode(query, True)
    return urlparse.urlunsplit(parts)
100
def convert_from_old_itag(player_config):
    """Rewrite player_config's stream map in place to carry "url" entries.

    Each parsed "itag" value is expected to contain "url="; the text after
    that marker becomes the corresponding "url" value. The stream map is
    then re-encoded and stored back into player_config["args"].
    """
    args = player_config["args"]
    stream_map = urlparse.parse_qs(args["url_encoded_fmt_stream_map"])
    stream_map["url"] = [
        entry[entry.find("url=") + 4:] for entry in stream_map["itag"]
    ]
    args["url_encoded_fmt_stream_map"] = urllib.urlencode(stream_map, True)
108
def get_player_config(doc):
    """Extract the player configuration dict from the page's scripts.

    Every line of every <script> text is examined:
      * a line containing "yt.playerConfig =" whose JSON lies between the
        first "=" and the last ";" is parsed and returned immediately;
      * a line containing "'PLAYER_CONFIG': " has everything after its
        first ":" parsed as JSON and converted from the old itag layout;
        scanning then continues, so the last such line wins.

    Returns the config dict, or None when neither form is found.
    """
    found = None
    script_texts = (node.text for node in doc.xpath("//script"))
    for text in script_texts:
        if not text:
            continue
        for line in text.split("\n"):
            if "yt.playerConfig =" in line:
                start = line.find("=")
                end = line.rfind(";")
                if start >= 0 and end > 0:
                    return json.loads(line[start + 1:end])
            if "'PLAYER_CONFIG': " in line:
                colon = line.find(":")
                if colon >= 0:
                    found = json.loads(line[colon + 1:])
                    convert_from_old_itag(found)
    return found
126
def get_best_video(player_config):
    """Choose the highest-ranked stream with a supported MIME type.

    Returns (url, extension). Both are None when no stream has a MIME type
    listed in MIMETYPES. Quality labels missing from QUALITIES rank as -1;
    among equally ranked streams the first one listed is kept. When a
    stream carries a "sig" value it is appended to the chosen URL as the
    "signature" query parameter.
    """
    fields = urlparse.parse_qs(player_config["args"]["url_encoded_fmt_stream_map"])
    # "sig" may be absent or shorter than the other lists; izip_longest
    # pads the missing signatures with None.
    streams = itertools.izip_longest(
        fields["url"],
        fields["type"],
        fields["quality"],
        fields.get("sig", []),
    )

    best_url = best_rank = best_extension = None
    for stream_url, mimetype, quality, signature in streams:
        # Drop any ";codecs=..." suffix before looking the type up.
        extension = MIMETYPES.get(mimetype.split(";")[0])
        if extension is None:
            continue
        rank = QUALITIES.get(quality.split(",")[0], -1)
        if best_rank is not None and rank <= best_rank:
            continue
        if signature:
            stream_url = append_to_qs(stream_url, {"signature": signature})
        best_url, best_rank, best_extension = stream_url, rank, extension

    return best_url, best_extension
152
def sanitize_filename(filename):
    """Return filename made safe for use as a single path component.

    Surrounding whitespace is stripped and inner whitespace runs collapse
    to one space; then backslashes and slashes become "-" and NUL bytes
    become spaces.
    """
    cleaned = re.sub(r"\s+", " ", filename.strip())
    for unsafe, replacement in (("\\", "-"), ("/", "-"), ("\0", " ")):
        cleaned = cleaned.replace(unsafe, replacement)
    return cleaned
160
def get_video_url(doc):
    """Work out the download URL and target filename for a parsed page.

    Returns (video_url, filename), where filename is the sanitized page
    title plus the stream's extension, or (None, None) when the player
    config lists no stream of a supported type.

    Raises VideoUnavailable when the page shows an unavailable-message
    block or when no player config can be found.
    """
    notice = doc.xpath("//div[@id='unavailable-message']/text()")
    if notice:
        raise VideoUnavailable(notice[0].strip())

    config = get_player_config(doc)
    if not config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(config)
    if video_url:
        page_title = doc.xpath("/html/head/title/text()")[0]
        return video_url, sanitize_filename(page_title) + "." + extension

    return None, None
179
def write_video(filename, video_data):
    """Stream video_data to stdout as a CGI attachment response.

    filename   -- suggested download name; UTF-8 encoded and percent-quoted
                  into the Content-Disposition header.
    video_data -- open HTTP response providing info(), read() and close().

    The upstream Content-Length is forwarded only when the upstream
    response actually has one. video_data is always closed, even if
    writing fails part-way.
    """
    try:
        httpinfo = video_data.info()
        encoded_filename = urllib.quote(filename.encode("utf-8"))
        sys.stdout.write(
            "Content-Disposition: attachment; filename*=UTF-8''%s\r\n"
            % encoded_filename
        )
        content_length = httpinfo.getheader("Content-Length")
        # A missing upstream header would otherwise be emitted as the
        # invalid line "Content-Length: None".
        if content_length is not None:
            sys.stdout.write("Content-Length: %s\r\n" % content_length)
        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
188
def cgimain():
    """CGI entry point: show the form, or stream the requested video.

    Without a "url" query parameter the form is shown with a placeholder
    URL. Otherwise the page is fetched and its best video is streamed to
    the client; any failure re-displays the form with an error message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        # No URL supplied: first visit, so show the form with an example.
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        if not video_url:
            # get_video_url returns (None, None) when no stream has a
            # supported type; report that instead of trying to open None.
            raise VideoUnavailable("No downloadable video format was found")
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception:
        # Top-level boundary of the CGI request: never leak a traceback to
        # the client, show a generic message instead.
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
213
def copy_with_progress(total_size, infile, outfile):
    """Copy infile to outfile in 32 KiB chunks, printing progress to stdout.

    total_size -- expected number of bytes, used only for display.
    infile     -- object with read(size); an empty result ends the copy.
    outfile    -- object with write(data).

    At most every half second the current terminal line is cleared and
    rewritten with the bytes copied, the total and the average rate.
    """
    def pp_size(size):
        # Scale to the largest unit that keeps the value below 1024, but
        # never past the last suffix, so huge values stay in GiB instead of
        # being divided once more than their label says.
        suffixes = ["", "KiB", "MiB", "GiB"]
        index = 0
        while size >= 1024 and index < len(suffixes) - 1:
            size /= 1024.0
            index += 1
        return "%d %s" % (size, suffixes[index])

    start_ts = time.time()
    last_ts = 0
    bytes_read = 0
    while True:
        now = time.time()
        if now - last_ts > 0.5:
            last_ts = now
            elapsed = now - start_ts
            # The first update can happen within the clock's resolution of
            # start_ts; avoid dividing by zero.
            rate = bytes_read / elapsed if elapsed > 0 else 0
            # ESC[2K clears the line, \r returns the cursor to column 0.
            sys.stdout.write("\33[2K\r")
            sys.stdout.write("%s / %s (%s/sec)" % (
                pp_size(bytes_read),
                pp_size(total_size),
                pp_size(rate),
            ))
            sys.stdout.flush()

        buf = infile.read(32768)
        if not buf:
            break
        outfile.write(buf)
        bytes_read += len(buf)
243
def main():
    """Command-line entry point: download the video at sys.argv[1].

    The video is saved in the current directory under the sanitized page
    title. Exits with status 1 when no URL is given or when the page
    offers no stream of a supported type.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write(
            "Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0]
        )
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        # get_video_url returns (None, None) when nothing is downloadable.
        sys.stderr.write("No downloadable video format was found\n")
        sys.exit(1)

    video_data = urlopen(video_url)
    try:
        # The total is only used for the progress display; fall back to 0
        # when the server does not announce a length.
        total_size = int(video_data.info().getheader("Content-Length") or 0)
        sys.stdout.write("Downloading %s\n" % filename.encode("utf-8"))
        # Binary mode: the payload is video data, not text.
        with open(filename, "wb") as outfile:
            copy_with_progress(total_size, video_data, outfile)
    finally:
        video_data.close()
259
260
if __name__ == "__main__":
    # Apply the address-space limit before doing any work.
    resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
    # SCRIPT_NAME in the environment selects CGI mode; otherwise run as a
    # command-line tool.
    if "SCRIPT_NAME" in os.environ:
        cgimain()
    else:
        try:
            main()
        except KeyboardInterrupt:
            sys.stdout.write("\nExiting...\n")
            sys.exit(1)
271