]> code.delx.au - youtube-cgi/blob - youtube.cgi
b94febfdfb4f549f411477bced8349183961eb5f
[youtube-cgi] / youtube.cgi
1 #!/usr/bin/env python3
2
3 import cgi
4 import html.parser
5 import http.cookiejar
6 import json
7 import os
8 import re
9 import shutil
10 import subprocess
11 import sys
12 import time
13 import urllib.error
14 import urllib.parse
15 import urllib.request
16
17
# 128 MiB expressed in bytes. Not referenced by any other code visible in
# this file.
MAX_MEMORY_BYTES = 128 * 1024 * 1024

# User-Agent header sent with every outgoing request made by urlopen().
USER_AGENT = (
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) "
    "Gecko/20100101 Firefox/15.0.1"
)

# MIME types that get_best_video() accepts, mapped to the file extension
# used for the downloaded file.
MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

# Quality labels that get_best_video() accepts, mapped to a rank where a
# higher number is preferred. Listed best-first.
QUALITIES = dict(zip(
    ("hd1080", "hd720", "large", "medium", "small"),
    (5, 4, 3, 2, 1),
))
34
35
class VideoUnavailable(Exception):
    """Raised when no player configuration can be found for a video page.

    The first argument is a human-readable reason that the CGI front end
    shows to the user.
    """
38
def print_form(url="", msg=""):
    """Write the HTML download form to stdout as a complete CGI response.

    Args:
        url: Text pre-filled into the URL input box. It originates from the
            request, so it is HTML-escaped before being embedded.
        msg: Trusted HTML fragment placed above the form (callers pass a
            ready-made ``<p class='error'>`` element). Inserted verbatim.

    Raises:
        KeyError: if ``HTTP_HOST`` or ``REQUEST_URI`` is not set in the
            environment.
    """
    # REQUEST_URI includes the query string; drop it so the bookmarklet does
    # not end up with two "?url=" parts after a submitted request.
    request_path = os.environ["REQUEST_URI"].split("?", 1)[0]
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], request_path)
    # The script URL lands inside a single-quoted JavaScript string within an
    # HTML attribute: percent-encode quotes and similar characters, then
    # HTML-escape what is left.
    script_url = html.escape(
        urllib.parse.quote(script_url, safe="/:%[]"), quote=True
    )

    substitutions = {
        "{0}": msg,
        "{1}": html.escape(url, quote=True),
        "{2}": script_url,
    }
    template = """
<!DOCTYPE html>
<html>
<head>
    <title>delx.net.au - YouTube Scraper</title>
    <link rel="stylesheet" type="text/css" href="/style.css">
    <style type="text/css">
        input[type="text"] {
            width: 100%;
        }
        .error {
            color: red;
        }
    </style>
</head>
<body>
    <h1>delx.net.au - YouTube Scraper</h1>
    {0}
    <form action="" method="get">
    <p>This page will let you easily download YouTube videos to watch offline. It
    will automatically grab the highest quality version.</p>
    <div><input type="text" name="url" value="{1}"/></div>
    <div><input type="submit" value="Download!"/></div>
    </form>
    <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
    to easily download videos. Right-click the link and add it to bookmarks,
    then when you're looking at a YouTube page select that bookmark from your
    browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
"""
    # Substitute all placeholders in one pass so that text inserted for one
    # placeholder is never re-scanned for another.
    page = re.sub(
        r"\{[012]\}", lambda m: substitutions[m.group(0)], template
    )

    sys.stdout.write("Content-Type: text/html\r\n\r\n")
    sys.stdout.write(page)
73
# Cookie store shared by every request made through `urlopener`.
cookiejar = http.cookiejar.CookieJar()
# Opener whose requests pass through an HTTPCookieProcessor bound to `cookiejar`.
urlopener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookiejar))
# Set by urlopen() to the first URL it fetches; sent as the Referer header on
# every later request.
referrer = ""
77
def urlopen(url, offset=None):
    """Open *url* through the shared cookie-aware opener.

    Scheme-relative URLs (``//host/...``) are given an ``https:`` scheme and
    URLs without an http(s) scheme are resolved against
    ``https://www.youtube.com``. The first URL ever opened is remembered in
    the module-level ``referrer`` and sent as the ``Referer`` header on all
    subsequent requests.

    Args:
        url: Absolute, scheme-relative, or site-relative URL.
        offset: Optional byte offset; when truthy a ``Range: bytes=<offset>-``
            header is sent.

    Returns:
        The open response object; the caller is responsible for closing it.

    Raises:
        ValueError: if the response carries a ``Content-Range`` header that
            is not in bytes or does not start at *offset*.
        urllib.error.URLError: propagated from the opener (includes
            ``HTTPError`` for error statuses).
    """
    global referrer

    if url.startswith("//"):
        url = "https:" + url
    if not url.startswith(("http://", "https://")):
        url = "https://www.youtube.com" + url

    req = urllib.request.Request(url)
    if not referrer:
        referrer = url
    else:
        req.add_header("Referer", referrer)
    req.add_header("User-Agent", USER_AGENT)
    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    content_range = res.getheader("Content-Range")
    if content_range:
        # Validate explicitly rather than with `assert`, which disappears
        # when Python runs with -O.
        try:
            unit, byte_range = content_range.split()[:2]
            start = int(byte_range.split("-")[0])
        except ValueError:
            unit, start = None, None
        if unit != "bytes" or start != offset:
            res.close()  # do not leak the connection on a bad response
            raise ValueError(
                "Unexpected Content-Range %r for requested offset %r"
                % (content_range, offset)
            )
    return res
105
def parse_url(url, parser):
    """Fetch *url* and feed its UTF-8 decoded body to *parser*.

    Args:
        url: URL accepted by ``urlopen``.
        parser: Object with ``feed(str)`` and ``close()`` methods, such as an
            ``html.parser.HTMLParser`` subclass.

    The response is closed even if reading, decoding or parsing raises.
    """
    f = urlopen(url)
    try:
        parser.feed(f.read().decode("utf-8"))
        parser.close()
    finally:
        f.close()
111
def append_to_qs(url, params):
    """Return *url* with *params* merged into its query string.

    Existing parameters are kept, including those with an empty value; a key
    present in *params* replaces any existing values for that key. All other
    URL components (scheme, host, path, fragment) are left untouched.

    Args:
        url: The URL to extend.
        params: Mapping of parameter name to a value or a sequence of values.

    Returns:
        The rebuilt URL string.
    """
    parts = urllib.parse.urlsplit(url)
    # keep_blank_values: without it, parameters such as "b=" that are already
    # in the URL would be silently dropped.
    query = urllib.parse.parse_qs(parts.query, keep_blank_values=True)
    query.update(params)
    new_query = urllib.parse.urlencode(query, doseq=True)
    return urllib.parse.urlunsplit(parts._replace(query=new_query))
119
def get_player_config(scripts):
    """Extract the ``ytplayer.config`` JSON object from inline scripts.

    Each script is scanned line by line for ``ytplayer.config = {``; the JSON
    object that starts at that opening brace is decoded and returned.

    Args:
        scripts: Iterable of script source strings.

    Returns:
        The decoded configuration, or ``None`` when no line contains a
        decodable ``ytplayer.config`` assignment.
    """
    marker = "ytplayer.config = {"
    decoder = json.JSONDecoder()
    for script in scripts:
        for line in script.split("\n"):
            marker_pos = line.find(marker)
            if marker_pos < 0:
                continue
            # Index of the "{" that closes the marker.
            start = marker_pos + len(marker) - 1
            try:
                # raw_decode stops at the end of the object by itself, so a
                # "};" occurring inside a JSON string cannot truncate it.
                config, _end = decoder.raw_decode(line, start)
            except ValueError:
                # Unterminated or malformed object on this line; keep looking.
                continue
            return config
    return None
130
def extract_js(script):
    """Strip the player script's outer wrapper and return the inner code.

    Args:
        script: Source text expected to start with
            ``var _yt_player={};(function(g){var window=this;`` and to end
            with ``;})(_yt_player);`` followed by a newline.

    Returns:
        The text between that prefix and suffix.

    Raises:
        ValueError: if *script* does not have the expected prefix or suffix.
    """
    PREFIX = "var _yt_player={};(function(g){var window=this;"
    SUFFIX = ";})(_yt_player);\n"
    # Explicit checks instead of `assert`, which is removed under -O.
    if not script.startswith(PREFIX):
        raise ValueError("player script does not start with the expected prefix")
    if not script.endswith(SUFFIX):
        raise ValueError("player script does not end with the expected suffix")

    return script[len(PREFIX):-len(SUFFIX)]
138
def find_func_name(script):
    """Return the name of the first function called with an ``….s`` argument.

    The script is searched for ``NAME(ARGS.s)`` followed by ``,``, ``;`` or
    ``)``, where NAME consists of letters, digits and ``$`` and ARGS consists
    of letters, commas and dots.

    Args:
        script: JavaScript source text.

    Returns:
        The matched function name.

    Raises:
        ValueError: if the pattern does not occur in *script*.
    """
    FUNC_NAME = R"([a-zA-Z0-9$]+)"
    FUNC_PARAMS = R"(\([a-zA-Z,\.]+\.s\))"
    TERMINATOR = R"[,;\)]"
    PATTERN = FUNC_NAME + FUNC_PARAMS + TERMINATOR

    match = re.search(PATTERN, script)
    if match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("could not locate the signature function in the player script")
    return match.group(1)
148
def decode_signature(js_url, signature):
    """Transform *signature* by running the player's own JavaScript in node.

    The script at *js_url* is downloaded, the signature function is located
    with ``find_func_name`` and the unwrapped player code (``extract_js``) is
    evaluated inside a node ``vm`` context that provides stub browser
    globals. The function is then called as ``func("", "MARKER", signature)``
    and the value stored under the key ``MARKER`` in its result is printed by
    node and returned here.

    Args:
        js_url: URL of the player script (anything ``urlopen`` accepts).
        signature: The encoded signature string.

    Returns:
        The text node printed, stripped of surrounding whitespace.

    Raises:
        Exception: if node exits with a non-zero status.
        OSError: if the ``node`` executable cannot be started.
    """
    f = urlopen(js_url)
    try:
        script = f.read().decode("utf-8")
    finally:
        f.close()

    func_name = find_func_name(script)

    params = {
        "func_name": func_name,
        "signature": json.dumps(signature),
        "code": json.dumps(extract_js(script)),
    }
    # NOTE(review): this executes remotely fetched JavaScript. Node documents
    # that `vm` is not a security mechanism, so the context below is a
    # convenience for providing globals, not an isolation boundary.
    js_decode_script = ("""
    const vm = require('vm');

    const sandbox = {
        location: {
            hash: '',
            href: '',
            protocol: 'http:'
        },
        history: {
            pushState: function(){}
        },
        document: {},
        navigator: {
            userAgent: ''
        },
        XMLHttpRequest: class XMLHttpRequest {},
        matchMedia: () => ({matches: () => {}, media: ''}),
        signature: %(signature)s,
        transformed_signature: null,
        g: function(){} // this is _yt_player
    };
    sandbox.window = sandbox;

    const code_string = %(code)s + ';';
    const exec_string = 'transformed_signature = %(func_name)s("", "MARKER", signature);';
    vm.runInNewContext(code_string + exec_string, sandbox);

    function findSignature(obj) {
        if (typeof obj !== 'object') {
            return;
        }
        for (const [key, value] of Object.entries(obj)) {
            if (key === 'MARKER') {
                return value;
            }
            const result = findSignature(value);
            if (result) {
                return result;
            }
        }
    }
    console.log(findSignature(sandbox.transformed_signature));
    """ % params)

    # Run node directly (no shell needed) and let communicate() handle the
    # pipes so that writing stdin and reading stdout cannot block each other.
    p = subprocess.Popen(
        ["node"],
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )
    stdout_data, _ = p.communicate(js_decode_script.encode("utf-8"))
    if p.returncode != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return stdout_data.decode("utf-8").strip()
221
def get_best_video(player_config):
    """Pick the highest-quality downloadable stream from a player config.

    Entries of ``args.url_encoded_fmt_stream_map`` are considered when they
    have ``type``, ``quality`` and ``url`` fields, are not flagged
    ``stereo3d``, and use a MIME type in ``MIMETYPES`` and a quality in
    ``QUALITIES``. The entry with the highest quality rank wins; on a tie the
    later entry wins. A signature (plain ``sig`` or, for ``s``, the result of
    ``decode_signature``) is appended to the winning URL as ``signature``.

    Args:
        player_config: Decoded ``ytplayer.config`` mapping.

    Returns:
        ``(url, extension)`` for the chosen stream, or ``(None, None)`` when
        no entry qualifies.

    Raises:
        KeyError: if the config lacks ``args.url_encoded_fmt_stream_map`` or
            ``assets.js``.
    """
    url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")
    js_url = player_config["assets"]["js"]

    best_rank = None
    best_extension = None
    best_url_data = None
    for raw_entry in url_data_list:
        url_data = urllib.parse.parse_qs(raw_entry)
        # Skip empty or malformed entries instead of failing with KeyError.
        if not all(key in url_data for key in ("type", "quality", "url")):
            continue
        if "stereo3d" in url_data:
            continue

        mimetype = url_data["type"][0].split(";")[0]
        extension = MIMETYPES.get(mimetype)
        rank = QUALITIES.get(url_data["quality"][0])
        if extension is None or rank is None:
            continue

        if best_rank is not None and rank < best_rank:
            continue

        best_rank = rank
        best_extension = extension
        best_url_data = url_data

    if best_url_data is None:
        return None, None

    # Resolve the signature only for the winner: decode_signature downloads
    # the player script and runs node, which is wasted work for candidates
    # that a later entry supersedes.
    video_url = best_url_data["url"][0]
    if "sig" in best_url_data:
        signature = best_url_data["sig"][0]
    elif "s" in best_url_data:
        signature = decode_signature(js_url, best_url_data["s"][0])
    else:
        signature = None

    if signature:
        video_url = append_to_qs(video_url, {"signature": signature})

    return video_url, best_extension
263
def sanitize_filename(filename):
    """Return *filename* made safe for use as a single path component.

    Leading and trailing whitespace is removed, every run of whitespace is
    collapsed to one space, backslashes and forward slashes become ``-`` and
    NUL characters become spaces.

    Args:
        filename: The raw name, e.g. a page title.

    Returns:
        The cleaned name.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape.
    collapsed = re.sub(r"\s+", " ", filename.strip())
    return (
        collapsed
        .replace("\\", "-")
        .replace("/", "-")
        .replace("\0", " ")
    )
271
def get_video_url(page):
    """Work out the download URL and local filename for a parsed video page.

    Args:
        page: Parsed page exposing ``scripts``, ``title`` and
            ``unavailable_message`` attributes.

    Returns:
        ``(video_url, filename)``, or ``(None, None)`` when the player config
        contains no usable stream. The filename is the sanitized page title
        plus the stream's extension; ``video`` is used as the base name when
        the page has no title.

    Raises:
        VideoUnavailable: if no player config is found in the page scripts.
    """
    player_config = get_player_config(page.scripts)
    if not player_config:
        raise VideoUnavailable(page.unavailable_message or "Could not find video URL")

    video_url, extension = get_best_video(player_config)
    if not video_url:
        return None, None

    # page.title stays None when the page had no <title>; fall back to a
    # fixed base name rather than failing inside sanitize_filename.
    base_name = sanitize_filename(page.title or "") or "video"
    return video_url, base_name + "." + extension
285
class YouTubeVideoPageParser(html.parser.HTMLParser):
    """HTML parser that collects the pieces of a video page the script needs.

    After feeding a document:
      * ``title`` holds the stripped text most recently seen after a
        ``<title>`` start tag (``None`` if there was none),
      * ``unavailable_message`` holds the stripped text most recently seen
        after a start tag with ``id="unavailable-message"``,
      * ``scripts`` lists every non-empty text chunk seen after a
        ``<script>`` start tag.

    Text is routed to whichever collector the latest matching start tag
    selected; any end tag switches collection off again.
    """

    def __init__(self):
        super().__init__()
        self.title = None
        self.unavailable_message = None
        self.scripts = []
        # Callable receiving text chunks, or None while text is ignored.
        self._sink = None

    def handle_starttag(self, tag, attrs):
        # Checked in this order so that, when several conditions match the
        # same tag, the last matching one decides where the text goes.
        if tag == "title":
            self._sink = self._store_title
        if dict(attrs).get("id", None) == "unavailable-message":
            self._sink = self._store_unavailable_message
        if tag == "script":
            self._sink = self._collect_script

    def handle_endtag(self, tag):
        self._sink = None

    def handle_data(self, data):
        if self._sink is not None:
            self._sink(data)

    def _store_title(self, text):
        self.title = text.strip()

    def _store_unavailable_message(self, text):
        self.unavailable_message = text.strip()

    def _collect_script(self, text):
        if text:
            self.scripts.append(text)
326
def write_video(filename, video_data):
    """Stream *video_data* to stdout as a CGI file-download response.

    Writes ``Content-Type`` (forwarded from the upstream response, or
    ``application/octet-stream`` when absent), ``Content-Disposition`` with
    the percent-encoded UTF-8 *filename*, and ``Content-Length`` when the
    upstream response provides one, followed by the body.

    Args:
        filename: Name suggested to the browser for the saved file.
        video_data: Open response with ``getheader``, ``read`` and ``close``.

    *video_data* is closed when the function returns or raises.
    """
    out = sys.stdout.buffer
    try:
        quoted_filename = urllib.parse.quote(filename.encode("utf-8"))
        # A CGI response that carries a body must declare a Content-Type.
        content_type = video_data.getheader("Content-Type") or "application/octet-stream"
        content_length = video_data.getheader("Content-Length")

        out.write(b"Content-Type: " + content_type.encode("utf-8") + b"\r\n")
        out.write(
            b"Content-Disposition: attachment; filename*=UTF-8''{0}\r\n"
            .replace(b"{0}", quoted_filename.encode("utf-8"))
        )
        # The upstream server may omit Content-Length (e.g. chunked transfer);
        # leave the header out then instead of failing on None.
        if content_length is not None:
            out.write(
                b"Content-Length: {0}\r\n"
                .replace(b"{0}", content_length.encode("utf-8"))
            )
        out.write(b"\r\n")
        shutil.copyfileobj(video_data, out)
        out.flush()
    finally:
        video_data.close()
340
def cgimain():
    """CGI entry point: show the form, or stream the requested video.

    Without a ``url`` request parameter the empty form is shown. Otherwise
    the video page is fetched and parsed and the best stream is opened and
    relayed to the client; any failure re-displays the form with an error
    message.
    """
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        page = YouTubeVideoPageParser()
        parse_url(url, page)
        video_url, filename = get_video_url(page)
        if not video_url:
            # get_video_url returns (None, None) when no stream is usable.
            raise VideoUnavailable("Could not find video URL")
        video_data = urlopen(video_url)
    except VideoUnavailable as e:
        # html.escape replaces cgi.escape, which was removed in Python 3.8.
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % html.escape(str(e.args[0]))
        )
        # Without this return, execution fell through to write_video() with
        # `filename` and `video_data` unbound.
        return
    except Exception:
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
        return

    write_video(filename, video_data)
367
def pp_size(size):
    """Format a byte count with a binary unit suffix and two decimals.

    The value is divided by 1024 until it is below 1024 or the largest unit
    (GiB) is reached. Plain byte counts get an empty suffix, so the result
    then ends with a space (e.g. ``"512.00 "``).

    Args:
        size: Number of bytes (int or float).

    Returns:
        The formatted string, e.g. ``"1.50 KiB"``.
    """
    suffixes = ["", "KiB", "MiB", "GiB"]
    last_index = len(suffixes) - 1
    for i, suffix in enumerate(suffixes):
        # Stop at the largest unit: dividing once more would report 1 TiB as
        # "1.00 GiB".
        if size < 1024 or i == last_index:
            break
        size /= 1024
    return "%.2f %s" % (size, suffix)
375
def copy_with_progress(content_length, infile, outfile):
    """Copy *infile* to *outfile* in 32 KiB chunks, printing progress.

    A status line of the form ``<copied> / <total> (<rate>/sec)`` is rewritten
    in place on stdout whenever more than half a second has passed since the
    previous update, once more after the last chunk, and is then terminated
    with a newline.

    Args:
        content_length: Byte count displayed as the total.
        infile: Source object with ``read(n)``.
        outfile: Destination object with ``write(bytes)``.
    """
    def show_progress(timestamp, window_started, window_bytes, copied):
        # Rate over the current window; 0 when the window has no duration.
        speed = 0
        if timestamp != window_started:
            speed = window_bytes / (timestamp - window_started)
        # ESC[2K erases the current terminal line, "\r" returns to column 0.
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(copied),
            pp_size(content_length),
            pp_size(speed),
        ))
        sys.stdout.flush()

    window_started = 0
    window_bytes = 0
    copied = 0
    while True:
        now = time.time()
        if now - window_started > 0.5:
            show_progress(now, window_started, window_bytes, copied)
            # Start a new measurement window.
            window_started = now
            window_bytes = 0

        chunk = infile.read(32768)
        if not chunk:
            break
        outfile.write(chunk)
        chunk_len = len(chunk)
        window_bytes += chunk_len
        copied += chunk_len

    # Final status, then move off the status line.
    show_progress(now, window_started, window_bytes, copied)
    print()
409
def main():
    """Command-line entry point: download the video named by ``sys.argv[1]``.

    The file is named after the page title and opened in append mode, so an
    existing partial file is resumed from its current size. The transfer is
    retried from the current position until the file reaches the expected
    size or the server answers 416 (range not satisfiable), which is treated
    as "already complete".

    Exits with status 1 when no URL argument is given or no downloadable
    stream is found.
    """
    try:
        url = sys.argv[1]
    except IndexError:
        print("Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0], file=sys.stderr)
        sys.exit(1)

    page = YouTubeVideoPageParser()
    parse_url(url, page)
    video_url, filename = get_video_url(page)
    if not video_url:
        # get_video_url returns (None, None) when no stream is usable.
        print("Could not find a downloadable video", file=sys.stderr)
        sys.exit(1)
    print("Downloading", filename)

    with open(filename, "ab") as outfile:
        offset = outfile.tell()
        if offset > 0:
            print("Resuming download from", pp_size(offset))
        total_size = None

        while True:
            try:
                video_data = urlopen(video_url, offset)
            except urllib.error.HTTPError as e:
                if e.code == 416:
                    print("File is complete!")
                    break
                raise

            try:
                content_length = int(video_data.getheader("Content-Length"))
                if total_size is None:
                    # Content-Length of a ranged response counts only the
                    # remaining bytes, so the expected final size is the
                    # resume offset plus that length.
                    total_size = offset + content_length

                try:
                    copy_with_progress(content_length, video_data, outfile)
                except IOError:
                    # Transfer interrupted: finish the status line and let
                    # the loop retry from the current position.
                    print()
            finally:
                video_data.close()

            if outfile.tell() == total_size:
                break

            old_offset = offset
            offset = outfile.tell()
            if old_offset == offset:
                # No progress was made; pause briefly before retrying.
                time.sleep(1)
            print("Restarting download from", pp_size(offset))
458
459
if __name__ == "__main__":
    # SCRIPT_NAME in the environment selects the CGI code path; without it
    # the script runs as a command-line tool.
    if "SCRIPT_NAME" not in os.environ:
        try:
            main()
        except KeyboardInterrupt:
            print("\nExiting...")
            sys.exit(1)
    else:
        cgimain()
469