code.delx.au - youtube-cgi/blob - youtube.cgi
proper unicode support for filenames
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/env python
2
3 import cookielib
4 import cgi
5 import itertools
6 import json
7 from lxml import html
8 import os
9 import re
10 import resource
11 import shutil
12 import subprocess
13 import sys
14 import urllib
15 import urllib2
16 import urlparse
17
18
# Address-space ceiling (128 MiB) handed to resource.setrlimit at start-up.
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# User-Agent header value sent with every outgoing HTTP request.
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) "
    "Gecko/20100101 Firefox/15.0.1"
)

# Stream MIME types that may be downloaded, mapped to the file extension
# appended to the saved file name.
MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

# Rank of each known quality label; a larger number is preferred.
QUALITIES = {
    "large": 3,
    "medium": 2,
    "small": 1,
}
33
34
class VideoUnavailable(Exception):
    """Raised when a page yields no downloadable video.

    The first argument is the human-readable reason.
    """
37
def print_form(url="", msg=""):
    """Write the XHTML download form to stdout as a complete CGI response.

    Args:
        url: Text pre-filled into the URL input box. It is treated as
            untrusted and XML-escaped before being placed in the attribute.
        msg: Ready-made XHTML fragment shown above the form (for example an
            error paragraph). It is inserted verbatim, so the caller must
            escape any untrusted text it contains.

    Raises:
        KeyError: if HTTP_HOST or REQUEST_URI is missing from the
            environment; both are needed to build the bookmarklet address.
    """
    from xml.sax.saxutils import escape

    # REQUEST_URI carries the query string of the current request; drop it so
    # the "?url=" appended by the bookmarklet starts a fresh query.
    script_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], script_path)

    attr_entities = {'"': "&quot;"}
    fields = {
        "0": msg,
        # The page is served as application/xhtml+xml, so a raw "&", "<" or
        # quote in the URL would make the whole document ill-formed.
        "1": escape(url, attr_entities),
        # This value sits inside a single-quoted JavaScript string which is
        # itself inside a double-quoted attribute.
        "2": escape(script_url, attr_entities).replace("'", "%27"),
    }
    template = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>delx.net.au - YouTube Scraper</title>
<link rel="stylesheet" type="text/css" href="/style.css"/>
<style type="text/css">
input[type="text"] {
width: 100%;
}
.error {
color: red;
}
</style>
</head>
<body>
<h1>delx.net.au - YouTube Scraper</h1>
{0}
<form action="" method="get">
<p>This page will let you easily download YouTube videos to watch offline. It
will automatically grab the highest quality version.</p>
<div><input type="text" name="url" value="{1}"/></div>
<div><input type="submit" value="Download!"/></div>
</form>
<p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
to easily download videos. Right-click the link and add it to bookmarks,
then when you're looking at a YouTube page select that bookmark from your
browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""
    # Fill all placeholders in a single pass so that text inserted for one
    # placeholder is never rescanned for another.
    page = re.sub(r"\{([012])\}", lambda match: fields[match.group(1)], template)
    if not isinstance(page, str):
        # On Python 2 the page becomes a unicode object when msg or url is
        # unicode; encode it explicitly rather than relying on the implicit
        # stdout encoding.
        page = page.encode("utf-8")

    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    sys.stdout.write(page)
72
# HTTP state shared by every urlopen() call in this module: one cookie jar and
# an opener that stores and resends the cookies kept in it.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cookiejar),
)
# URL of the most recent request; urlopen() sends it as the next Referer.
referrer = ""
76
def urlopen(url):
    """Open *url* through the shared cookie-aware opener.

    Sends the module's USER_AGENT and, when a previous request was made, that
    request's URL as the Referer. The module-level ``referrer`` is then
    updated to *url* before the request is issued.

    Returns the response object produced by ``urlopener.open``.
    """
    global referrer
    headers = {"User-Agent": USER_AGENT}
    if referrer:
        headers["Referer"] = referrer
    request = urllib2.Request(url, headers=headers)
    referrer = url
    return urlopener.open(request)
85
def parse_url(url):
    """Fetch *url* and parse the response body as HTML.

    The body is decoded as UTF-8 with the parser's error recovery enabled.
    The HTTP response is always closed, including when parsing raises.

    Returns the document object produced by ``html.parse``.
    """
    response = urlopen(url)
    try:
        parser = html.HTMLParser(encoding="utf-8", recover=True)
        return html.parse(response, parser)
    finally:
        # Release the connection even if the parser fails part-way through.
        response.close()
91
def append_to_qs(url, params):
    """Return *url* with *params* merged into its query string.

    Args:
        url: The URL to extend.
        params: Mapping of parameter name to a value or list of values. A
            name already present in the query string is overwritten.

    Existing parameters with an empty value are preserved. The order of the
    parameters in the result is not guaranteed to match the input.
    """
    parts = list(urlparse.urlsplit(url))
    # Index 3 of the split result is the query component. Blank values are
    # kept so that rewriting the URL never silently drops a parameter.
    query = urlparse.parse_qs(parts[3], keep_blank_values=True)
    query.update(params)
    parts[3] = urllib.urlencode(query, True)
    return urlparse.urlunsplit(parts)
99
def convert_from_old_itag(player_config):
    """Rewrite an old-style stream map so that it carries a "url" field.

    In the old layout each "itag" value embeds its stream URL after the text
    "url=". This copies everything following that marker into a parallel
    "url" list and stores the re-encoded map back into
    ``player_config["args"]["url_encoded_fmt_stream_map"]``.

    The dictionary is modified in place; nothing is returned.
    """
    args = player_config["args"]
    stream_map = urlparse.parse_qs(args["url_encoded_fmt_stream_map"])
    marker = "url="
    stream_map["url"] = [
        entry[entry.find(marker) + len(marker):]
        for entry in stream_map["itag"]
    ]
    args["url_encoded_fmt_stream_map"] = urllib.urlencode(stream_map, True)
107
def get_player_config(doc):
    """Extract the player configuration embedded in the page's scripts.

    Every <script> element with text is scanned line by line for one of two
    layouts:

    * ``yt.playerConfig = {...};`` - the JSON between the first "=" and the
      last ";" on the line is decoded and returned.
    * ``'PLAYER_CONFIG': {...}`` - the JSON after the first ":" is decoded,
      converted with convert_from_old_itag() and returned.

    Returns the decoded configuration, or None when no line matches.
    """
    for script in doc.xpath("//script"):
        source = script.text
        if not source:
            continue
        for line in source.split("\n"):
            if "yt.playerConfig =" in line:
                # The marker itself contains "=", so find() cannot fail here.
                start = line.find("=")
                end = line.rfind(";")
                if end > 0:
                    return json.loads(line[start + 1:end])
            if "'PLAYER_CONFIG': " in line:
                # Likewise the marker guarantees that a ":" is present.
                colon = line.find(":")
                legacy_config = json.loads(line[colon + 1:])
                convert_from_old_itag(legacy_config)
                return legacy_config
    return None
125
def get_best_video(player_config):
    """Pick the highest-ranked downloadable stream from *player_config*.

    The stream map in ``player_config["args"]["url_encoded_fmt_stream_map"]``
    is decoded into parallel "url", "type", "quality" and optional "sig"
    lists. Streams whose MIME type is not in MIMETYPES are ignored; the rest
    are ranked with QUALITIES (unknown labels rank -1) and the first stream
    with the highest rank wins. A signature, when present, is appended to the
    chosen URL as the "signature" query parameter.

    Returns:
        A ``(url, extension)`` tuple, or ``(None, None)`` when no stream
        qualifies.

    Raises:
        KeyError: if the stream map lacks the "url", "type" or "quality"
            field.
    """
    stream_map = urlparse.parse_qs(player_config["args"]["url_encoded_fmt_stream_map"])
    streams = itertools.izip_longest(
        stream_map["url"],
        stream_map["type"],
        stream_map["quality"],
        stream_map.get("sig", []),
    )
    best_url = None
    best_rank = None
    best_extension = None
    for video_url, mimetype, quality, signature in streams:
        # izip_longest pads the shorter lists with None. An entry lacking its
        # URL, type or quality is incomplete and cannot be downloaded.
        if video_url is None or mimetype is None or quality is None:
            continue
        # Drop MIME parameters such as "; codecs=..." before the lookup.
        extension = MIMETYPES.get(mimetype.split(";")[0])
        if extension is None:
            continue
        rank = QUALITIES.get(quality.split(",")[0], -1)
        if best_rank is None or rank > best_rank:
            if signature:
                video_url = append_to_qs(video_url, {"signature": signature})
            best_url = video_url
            best_rank = rank
            best_extension = extension

    return best_url, best_extension
151
def sanitize_filename(filename):
    """Return *filename* made safe to use as a single path component.

    NUL characters are treated as whitespace, surrounding whitespace is
    stripped, every internal run of whitespace becomes one space, and both
    kinds of path separator ("/" and "\\") are replaced with "-".
    """
    # Turn NULs into spaces first so the stripping and collapsing below also
    # apply to them, instead of leaving doubled or leading spaces behind.
    cleaned = filename.replace("\0", " ").strip()
    cleaned = re.sub(r"\s+", " ", cleaned)
    return cleaned.replace("\\", "-").replace("/", "-")
159
def get_video_url(doc):
    """Work out the download URL and file name for a parsed video page.

    Returns:
        ``(video_url, filename)``, where the file name is the sanitized page
        title plus the stream's extension, or ``(None, None)`` when the
        player configuration contains no usable stream.

    Raises:
        VideoUnavailable: if the page shows an "unavailable" message (its
            text becomes the reason) or no player configuration is found.
    """
    messages = doc.xpath("//div[@id='unavailable-message']/text()")
    if messages:
        raise VideoUnavailable(messages[0].strip())

    config = get_player_config(doc)
    if not config:
        raise VideoUnavailable("Could not find video URL")

    stream_url, extension = get_best_video(config)
    if not stream_url:
        return None, None

    page_title = doc.xpath("/html/head/title/text()")[0]
    return stream_url, sanitize_filename(page_title) + "." + extension
178
def write_video(filename, video_data):
    """Stream *video_data* to stdout as a CGI attachment response.

    Args:
        filename: Download name offered to the browser. It is UTF-8 encoded
            and percent-quoted into the ``filename*`` parameter of the
            Content-Disposition header.
        video_data: Open HTTP response providing ``info()``, ``read()`` and
            ``close()``. It is closed before this function returns, even if
            copying fails.

    The upstream Content-Type is forwarded (falling back to
    application/octet-stream), and Content-Length is forwarded only when the
    upstream response supplied one.
    """
    try:
        httpinfo = video_data.info()
        content_type = httpinfo.getheader("Content-Type") or "application/octet-stream"
        content_length = httpinfo.getheader("Content-Length")
        encoded_filename = urllib.quote(filename.encode("utf-8"))

        # A CGI response that carries a body must declare its Content-Type.
        sys.stdout.write("Content-Type: %s\r\n" % content_type)
        sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
        if content_length is not None:
            # Without this guard a missing header would be sent as the
            # literal text "None".
            sys.stdout.write("Content-Length: %s\r\n" % content_length)
        sys.stdout.write("\r\n")
        shutil.copyfileobj(video_data, sys.stdout)
    finally:
        video_data.close()
187
def _is_http_url(url):
    """Return True when *url* uses the http or https scheme."""
    return urlparse.urlsplit(url).scheme in ("http", "https")


def cgimain():
    """Handle one CGI request: show the form or stream the requested video.

    Without a "url" query parameter the empty form is printed. Otherwise the
    page at that URL is fetched, the best stream is located and its data is
    written to stdout as an attachment. Any failure re-displays the form with
    an error message; VideoUnavailable errors include their reason.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        # Both the submitted URL and the stream URL scraped from the fetched
        # page are untrusted. Restricting them to http(s) keeps the opener
        # from being pointed at other schemes such as file://.
        if not _is_http_url(url):
            raise VideoUnavailable("Only http and https URLs are supported")
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        # get_video_url() returns (None, None) when no stream is usable.
        if not video_url or not _is_http_url(video_url):
            raise VideoUnavailable("Could not find video URL")
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable as e:
        reason = e.args[0] if e.args else ""
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(reason)
        )
    except Exception:
        # Top-level boundary: the details are deliberately not shown to the
        # remote user.
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
212
def main():
    """Command-line entry point: download the video for the URL in argv[1].

    The video is saved in the current directory under the file name derived
    by get_video_url(). Exits with status 1 after printing a message to
    stderr when no URL argument is given or no downloadable stream is found.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        sys.stderr.write("Usage: %s http://youtube.com/watch?v=FOOBAR\n" % sys.argv[0])
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    if not video_url:
        # get_video_url() returns (None, None) when no stream is usable.
        sys.stderr.write("Could not find a downloadable video\n")
        sys.exit(1)

    data = urlopen(video_url)
    try:
        # Binary mode: the payload is video data, not text.
        with open(filename, "wb") as outfile:
            shutil.copyfileobj(data, outfile)
    finally:
        data.close()
226
227
if __name__ == "__main__":
    # Cap the process address space (soft and hard limit) before doing any
    # work.
    resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
    # The presence of SCRIPT_NAME in the environment selects CGI mode;
    # otherwise the script runs as a command-line tool.
    if "SCRIPT_NAME" in os.environ:
        cgimain()
    else:
        main()
234