code.delx.au - youtube-cgi/blob - youtube.cgi

   1 #!/usr/bin/python2
   2
   3 from __future__ import division
   4
   5 import cookielib
   6 import cgi
   7 import json
   8 from lxml import html
   9 import os
  10 import re
  11 import resource
  12 import shutil
  13 import subprocess
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18 import urlparse
  19
  20
# Address-space cap of 128 MiB. The only reference to it in this file is the
# commented-out setrlimit call in the __main__ block at the bottom.
MAX_MEMORY_BYTES = 128 * 1024*1024
# User-Agent header value sent by urlopen() with every outgoing request.
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"

# MIME types get_best_video() accepts, mapped to the file extension used for
# the downloaded file. Streams with any other type are skipped.
MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

# Quality labels get_best_video() accepts, mapped to a rank where a larger
# number is preferred. Streams with any other label are skipped.
QUALITIES = {
    "hd1080": 5,
    "hd720": 4,
    "large": 3,
    "medium": 2,
    "small": 1,
}
  37
  38
  39 class VideoUnavailable(Exception):
  40     pass
  41
  42 def print_form(url="", msg=""):
  43     script_url = "http://%s%s" % (os.environ["HTTP_HOST"], os.environ["REQUEST_URI"])
  44     sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
  45     sys.stdout.write("""
  46 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  47 <html xmlns="http://www.w3.org/1999/xhtml">
  48 <head>
  49     <title>delx.net.au - YouTube Scraper</title>
  50     <link rel="stylesheet" type="text/css" href="/style.css"/>
  51     <style type="text/css">
  52         input[type="text"] {
  53             width: 100%;
  54         }
  55         .error {
  56             color: red;
  57         }
  58     </style>
  59 </head>
  60 <body>
  61     <h1>delx.net.au - YouTube Scraper</h1>
  62     {0}
  63     <form action="" method="get">
  64     <p>This page will let you easily download YouTube videos to watch offline. It
  65     will automatically grab the highest quality version.</p>
  66     <div><input type="text" name="url" value="{1}"/></div>
  67     <div><input type="submit" value="Download!"/></div>
  68     </form>
  69     <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
  70     to easily download videos. Right-click the link and add it to bookmarks,
  71     then when you're looking at a YouTube page select that bookmark from your
  72     browser's bookmarks menu to download the video straight away.</p>
  73 </body>
  74 </html>
  75 """.replace("{0}", msg).replace("{1}", url).replace("{2}", script_url))
  76
# A single cookie jar and opener shared by every request made through
# urlopen(), so cookies received earlier are sent with later requests.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
# Set by urlopen() to the first URL it fetches, then sent as the Referer
# header on every subsequent request.
referrer = ""
  80
  81 def urlopen(url, offset=None):
  82     if url.startswith("//"):
  83         url = "http:" + url
  84
  85     global referrer
  86     req = urllib2.Request(url)
  87     if not referrer:
  88         referrer = url
  89     else:
  90         req.add_header("Referer", referrer)
  91
  92     req.add_header("User-Agent", USER_AGENT)
  93
  94     if offset:
  95         req.add_header("Range", "bytes=%d-" % offset)
  96
  97     res = urlopener.open(req)
  98
  99     content_range = res.info().getheader("Content-Range")
 100     if content_range:
 101         tokens = content_range.split()
 102         assert tokens[0] == "bytes"
 103         start = int(tokens[1].split("-")[0])
 104         assert start == offset
 105     return res
 106
 107 def parse_url(url):
 108     f = urlopen(url)
 109     doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
 110     f.close()
 111     return doc
 112
 113 def append_to_qs(url, params):
 114     r = list(urlparse.urlsplit(url))
 115     qs = urlparse.parse_qs(r[3])
 116     qs.update(params)
 117     r[3] = urllib.urlencode(qs, True)
 118     url = urlparse.urlunsplit(r)
 119     return url
 120
 121 def get_player_config(doc):
 122     player_config = None
 123     for script in doc.xpath("//script"):
 124         if not script.text:
 125             continue
 126         for line in script.text.split("\n"):
 127             s = "ytplayer.config = {"
 128             if s in line:
 129                 p1 = line.find(s) + len(s) - 1
 130                 p2 = line.find("};", p1) + 1
 131                 if p1 >= 0 and p2 > 0:
 132                     return json.loads(line[p1:p2])
 133
 134 def extract_js(script):
 135     PREFIX = "var _yt_player={};(function(g){var window=this;"
 136     SUFFIX = ";})(_yt_player);\n"
 137     assert script.startswith(PREFIX)
 138     assert script.endswith(SUFFIX)
 139
 140     return script[len(PREFIX):-len(SUFFIX)]
 141
 142 def find_func_name(script):
 143     FUNC_NAME = R"([a-zA-Z0-9$]+)"
 144     FUNC_PARAMS = R"(\([a-zA-Z]+\.s\))"
 145     PATTERN = FUNC_NAME + FUNC_PARAMS + ";"
 146
 147     match = re.search(PATTERN, script)
 148     func_name = match.groups()[0]
 149     return func_name
 150
 151 def decode_signature(js_url, signature):
 152     script = urlopen(js_url).read()
 153     func_name = find_func_name(script)
 154
 155     params = {
 156         "func_name": func_name,
 157         "signature": json.dumps(signature),
 158         "code": json.dumps(extract_js(script)),
 159     }
 160     p = subprocess.Popen(
 161         "nodejs",
 162         shell=True,
 163         close_fds=True,
 164         stdin=subprocess.PIPE,
 165         stdout=subprocess.PIPE
 166     )
 167     js_decode_script = ("""
 168         var vm = require('vm');
 169
 170         var sandbox = {
 171             location: {
 172                 hash: '',
 173                 href: '',
 174                 protocol: 'http:'
 175             },
 176             history: {
 177                 pushState: function(){}
 178             },
 179             document: {},
 180             navigator: {
 181                 userAgent: ''
 182             },
 183             signature: %(signature)s,
 184             transformed_signature: null
 185         };
 186         sandbox.window = sandbox;
 187
 188         var code_string = %(code)s + ';';
 189         var exec_string = 'transformed_signature = %(func_name)s(signature);';
 190         vm.runInNewContext(code_string + exec_string, sandbox);
 191
 192         console.log(sandbox.transformed_signature);
 193     """ % params)
 194
 195     p.stdin.write(js_decode_script)
 196     p.stdin.close()
 197
 198     transformed_signature = p.stdout.read().strip()
 199     if p.wait() != 0:
 200         raise Exception("js failed to execute: %d" % p.returncode)
 201
 202     return transformed_signature
 203
 204 def get_best_video(player_config):
 205     url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")
 206     js_url = player_config["assets"]["js"]
 207
 208     best_url = None
 209     best_quality = None
 210     best_extension = None
 211     for url_data in url_data_list:
 212         url_data = urlparse.parse_qs(url_data)
 213         mimetype = url_data["type"][0].split(";")[0]
 214         quality = url_data["quality"][0]
 215
 216         if url_data.has_key("stereo3d"):
 217             continue
 218         if quality not in QUALITIES:
 219             continue
 220         if mimetype not in MIMETYPES:
 221             continue
 222
 223         extension = MIMETYPES[mimetype]
 224         quality = QUALITIES.get(quality, -1)
 225
 226         if best_quality is not None and quality < best_quality:
 227             continue
 228
 229         video_url = url_data["url"][0]
 230         if "sig" in url_data:
 231             signature = url_data["sig"][0]
 232         elif "s" in url_data:
 233             signature = decode_signature(js_url, url_data["s"][0])
 234         else:
 235             signature = None
 236
 237         if signature:
 238             video_url = append_to_qs(video_url, {"signature": signature})
 239
 240         best_url = video_url
 241         best_quality = quality
 242         best_extension = extension
 243
 244     return best_url, best_extension
 245
 246 def sanitize_filename(filename):
 247     return (
 248         re.sub("\s+", " ", filename.strip())
 249         .replace("\\", "-")
 250         .replace("/", "-")
 251         .replace("\0", " ")
 252     )
 253
 254 def get_video_url(doc):
 255     unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
 256     if unavailable:
 257         raise VideoUnavailable(unavailable[0].strip())
 258
 259     player_config = get_player_config(doc)
 260     if not player_config:
 261         raise VideoUnavailable("Could not find video URL")
 262
 263     video_url, extension = get_best_video(player_config)
 264     if not video_url:
 265         return None, None
 266
 267     title = doc.xpath("/html/head/title/text()")[0]
 268     filename = sanitize_filename(title)
 269     filename += "." + extension
 270
 271     return video_url, filename
 272
 273 def write_video(filename, video_data):
 274     httpinfo = video_data.info()
 275     encoded_filename = urllib.quote(filename.encode("utf-8"))
 276     sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
 277     sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length"))
 278     sys.stdout.write("\r\n")
 279     shutil.copyfileobj(video_data, sys.stdout)
 280     video_data.close()
 281
 282 def cgimain():
 283     args = cgi.parse()
 284     try:
 285         url = args["url"][0]
 286     except:
 287         print_form(url="http://www.youtube.com/watch?v=FOOBAR")
 288         return
 289
 290     try:
 291         doc = parse_url(url)
 292         video_url, filename = get_video_url(doc)
 293         video_data = urlopen(video_url)
 294         write_video(filename, video_data)
 295     except VideoUnavailable, e:
 296         print_form(
 297             url=url,
 298             msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.message)
 299         )
 300     except Exception, e:
 301         print_form(
 302             url=url,
 303             msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
 304         )
 305         return
 306
 307 def pp_size(size):
 308     suffixes = ["", "KiB", "MiB", "GiB"]
 309     for i, suffix in enumerate(suffixes):
 310         if size < 1024:
 311             break
 312         size /= 1024
 313     return "%.2f %s" % (size, suffix)
 314
 315 def copy_with_progress(content_length, infile, outfile):
 316     def print_status():
 317         rate = 0
 318         if now != last_ts:
 319             rate = last_bytes_read / (now - last_ts)
 320         sys.stdout.write("\33[2K\r")
 321         sys.stdout.write("%s / %s (%s/sec)" % (
 322             pp_size(bytes_read),
 323             pp_size(content_length),
 324             pp_size(rate),
 325         ))
 326         sys.stdout.flush()
 327
 328     last_ts = 0
 329     last_bytes_read = 0
 330     bytes_read = 0
 331     while True:
 332         now = time.time()
 333         if now - last_ts > 0.5:
 334             print_status()
 335             last_ts = now
 336             last_bytes_read = 0
 337
 338         buf = infile.read(32768)
 339         if not buf:
 340             break
 341         outfile.write(buf)
 342         last_bytes_read += len(buf)
 343         bytes_read += len(buf)
 344
 345     # Newline at the end
 346     print_status()
 347     print
 348
 349 def main():
 350     try:
 351         url = sys.argv[1]
 352     except:
 353         print >>sys.stderr, "Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0]
 354         sys.exit(1)
 355
 356     doc = parse_url(url)
 357     video_url, filename = get_video_url(doc)
 358     print "Downloading", filename.encode("utf-8")
 359
 360     outfile = open(filename, "a")
 361     offset = outfile.tell()
 362     if offset > 0:
 363         print "Resuming download from", pp_size(offset)
 364     total_size = None
 365
 366     while True:
 367         try:
 368             video_data = urlopen(video_url, offset)
 369         except urllib2.HTTPError, e:
 370             if e.code == 416:
 371                 print "File is complete!"
 372                 break
 373             else:
 374                 raise
 375
 376         content_length = int(video_data.info().getheader("Content-Length"))
 377         if total_size is None:
 378             total_size = content_length
 379
 380         try:
 381             copy_with_progress(content_length, video_data, outfile)
 382         except IOError, e:
 383             print
 384
 385         video_data.close()
 386         if outfile.tell() != total_size:
 387             old_offset = offset
 388             offset = outfile.tell()
 389             if old_offset == offset:
 390                 time.sleep(1)
 391             print "Restarting download from", pp_size(offset)
 392         else:
 393             break
 394
 395     outfile.close()
 396
 397
 398 if __name__ == "__main__":
 399 ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
 400     if os.environ.has_key("SCRIPT_NAME"):
 401         cgimain()
 402     else:
 403         try:
 404             main()
 405         except KeyboardInterrupt:
 406             print "\nExiting..."
 407             sys.exit(1)
 408