code.delx.au - youtube-cgi/blob - youtube.cgi

   1 #!/usr/bin/python2
   2
   3 from __future__ import division
   4
   5 import cookielib
   6 import cgi
   7 import json
   8 from lxml import html
   9 import os
  10 import re
  11 import resource
  12 import shutil
  13 import subprocess
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18 import urlparse
  19
  20
# Address-space cap in bytes (128 MiB). Only referenced by the
# resource.setrlimit() call in the __main__ block, which is commented out.
MAX_MEMORY_BYTES = 128 * 1024*1024
# Browser-like User-Agent header sent by urlopen() with every request.
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"

# Stream MIME types that get_best_video() accepts, mapped to the file
# extension used for the downloaded file.
MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

# Ranking of the stream "quality" labels; get_best_video() prefers the entry
# with the largest number and ignores labels that are not listed here.
QUALITIES = {
    "hd1080": 5,
    "hd720": 4,
    "large": 3,
    "medium": 2,
    "small": 1,
}
  37
  38
  39 class VideoUnavailable(Exception):
  40     pass
  41
  42 def print_form(url="", msg=""):
  43     script_url = "http://%s%s" % (os.environ["HTTP_HOST"], os.environ["REQUEST_URI"])
  44     sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
  45     sys.stdout.write("""
  46 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  47 <html xmlns="http://www.w3.org/1999/xhtml">
  48 <head>
  49     <title>delx.net.au - YouTube Scraper</title>
  50     <link rel="stylesheet" type="text/css" href="/style.css"/>
  51     <style type="text/css">
  52         input[type="text"] {
  53             width: 100%;
  54         }
  55         .error {
  56             color: red;
  57         }
  58     </style>
  59 </head>
  60 <body>
  61     <h1>delx.net.au - YouTube Scraper</h1>
  62     {0}
  63     <form action="" method="get">
  64     <p>This page will let you easily download YouTube videos to watch offline. It
  65     will automatically grab the highest quality version.</p>
  66     <div><input type="text" name="url" value="{1}"/></div>
  67     <div><input type="submit" value="Download!"/></div>
  68     </form>
  69     <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
  70     to easily download videos. Right-click the link and add it to bookmarks,
  71     then when you're looking at a YouTube page select that bookmark from your
  72     browser's bookmarks menu to download the video straight away.</p>
  73 </body>
  74 </html>
  75 """.replace("{0}", msg).replace("{1}", url).replace("{2}", script_url))
  76
# Shared HTTP state: a single cookie jar and opener are reused for every
# request made through urlopen().
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
# URL of the first page fetched through urlopen(); it is sent as the Referer
# header on all later requests. Empty until the first request is made.
referrer = ""
  80
  81 def urlopen(url, offset=None):
  82     if url.startswith("//"):
  83         url = "http:" + url
  84
  85     global referrer
  86     req = urllib2.Request(url)
  87     if not referrer:
  88         referrer = url
  89     else:
  90         req.add_header("Referer", referrer)
  91
  92     req.add_header("User-Agent", USER_AGENT)
  93
  94     if offset:
  95         req.add_header("Range", "bytes=%d-" % offset)
  96
  97     res = urlopener.open(req)
  98
  99     content_range = res.info().getheader("Content-Range")
 100     if content_range:
 101         tokens = content_range.split()
 102         assert tokens[0] == "bytes"
 103         start = int(tokens[1].split("-")[0])
 104         assert start == offset
 105     return res
 106
 107 def parse_url(url):
 108     f = urlopen(url)
 109     doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
 110     f.close()
 111     return doc
 112
 113 def append_to_qs(url, params):
 114     r = list(urlparse.urlsplit(url))
 115     qs = urlparse.parse_qs(r[3])
 116     qs.update(params)
 117     r[3] = urllib.urlencode(qs, True)
 118     url = urlparse.urlunsplit(r)
 119     return url
 120
 121 def get_player_config(doc):
 122     player_config = None
 123     for script in doc.xpath("//script"):
 124         if not script.text:
 125             continue
 126         for line in script.text.split("\n"):
 127             s = "ytplayer.config = {"
 128             if s in line:
 129                 p1 = line.find(s) + len(s) - 1
 130                 p2 = line.find("};", p1) + 1
 131                 if p1 >= 0 and p2 > 0:
 132                     return json.loads(line[p1:p2])
 133
 134 def extract_js(script):
 135     PREFIX = "var _yt_player={};(function(g){var window=this;"
 136     SUFFIX = ";})(_yt_player);\n"
 137     assert script.startswith(PREFIX)
 138     assert script.endswith(SUFFIX)
 139
 140     return script[len(PREFIX):-len(SUFFIX)]
 141
 142 def find_func_name(script):
 143     FUNC_NAME = R"([a-zA-Z0-9$]+)"
 144     FUNC_PARAMS = R"(\([a-zA-Z]+\.s\))"
 145     PATTERN = FUNC_NAME + FUNC_PARAMS + ";"
 146
 147     match = re.search(PATTERN, script)
 148     func_name = match.groups()[0]
 149     return func_name
 150
 151 def decode_signature(js_url, signature):
 152     script = urlopen(js_url).read()
 153     func_name = find_func_name(script)
 154
 155     params = {
 156         "func_name": func_name,
 157         "signature": json.dumps(signature),
 158         "code": json.dumps(extract_js(script)),
 159     }
 160     p = subprocess.Popen(
 161         "nodejs",
 162         shell=True,
 163         close_fds=True,
 164         stdin=subprocess.PIPE,
 165         stdout=subprocess.PIPE
 166     )
 167     js_decode_script = ("""
 168         var vm = require('vm');
 169
 170         var sandbox = {
 171             location: {
 172                 hash: '',
 173                 href: '',
 174                 protocol: 'http:'
 175             },
 176             history: {
 177                 pushState: function(){}
 178             },
 179             document: {},
 180             navigator: {},
 181             signature: %(signature)s,
 182             transformed_signature: null
 183         };
 184         sandbox.window = sandbox;
 185
 186         var code_string = %(code)s + ';';
 187         var exec_string = 'transformed_signature = %(func_name)s(signature);';
 188         vm.runInNewContext(code_string + exec_string, sandbox);
 189
 190         console.log(sandbox.transformed_signature);
 191     """ % params)
 192
 193     p.stdin.write(js_decode_script)
 194     p.stdin.close()
 195
 196     transformed_signature = p.stdout.read().strip()
 197     if p.wait() != 0:
 198         raise Exception("js failed to execute: %d" % p.returncode)
 199
 200     return transformed_signature
 201
 202 def get_best_video(player_config):
 203     url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")
 204     js_url = player_config["assets"]["js"]
 205
 206     best_url = None
 207     best_quality = None
 208     best_extension = None
 209     for url_data in url_data_list:
 210         url_data = urlparse.parse_qs(url_data)
 211         mimetype = url_data["type"][0].split(";")[0]
 212         quality = url_data["quality"][0]
 213
 214         if url_data.has_key("stereo3d"):
 215             continue
 216         if quality not in QUALITIES:
 217             continue
 218         if mimetype not in MIMETYPES:
 219             continue
 220
 221         extension = MIMETYPES[mimetype]
 222         quality = QUALITIES.get(quality, -1)
 223
 224         if best_quality is not None and quality < best_quality:
 225             continue
 226
 227         video_url = url_data["url"][0]
 228         if "sig" in url_data:
 229             signature = url_data["sig"][0]
 230         elif "s" in url_data:
 231             signature = decode_signature(js_url, url_data["s"][0])
 232         else:
 233             signature = None
 234
 235         if signature:
 236             video_url = append_to_qs(video_url, {"signature": signature})
 237
 238         best_url = video_url
 239         best_quality = quality
 240         best_extension = extension
 241
 242     return best_url, best_extension
 243
 244 def sanitize_filename(filename):
 245     return (
 246         re.sub("\s+", " ", filename.strip())
 247         .replace("\\", "-")
 248         .replace("/", "-")
 249         .replace("\0", " ")
 250     )
 251
 252 def get_video_url(doc):
 253     unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
 254     if unavailable:
 255         raise VideoUnavailable(unavailable[0].strip())
 256
 257     player_config = get_player_config(doc)
 258     if not player_config:
 259         raise VideoUnavailable("Could not find video URL")
 260
 261     video_url, extension = get_best_video(player_config)
 262     if not video_url:
 263         return None, None
 264
 265     title = doc.xpath("/html/head/title/text()")[0]
 266     filename = sanitize_filename(title)
 267     filename += "." + extension
 268
 269     return video_url, filename
 270
 271 def write_video(filename, video_data):
 272     httpinfo = video_data.info()
 273     encoded_filename = urllib.quote(filename.encode("utf-8"))
 274     sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
 275     sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length"))
 276     sys.stdout.write("\r\n")
 277     shutil.copyfileobj(video_data, sys.stdout)
 278     video_data.close()
 279
 280 def cgimain():
 281     args = cgi.parse()
 282     try:
 283         url = args["url"][0]
 284     except:
 285         print_form(url="http://www.youtube.com/watch?v=FOOBAR")
 286         return
 287
 288     try:
 289         doc = parse_url(url)
 290         video_url, filename = get_video_url(doc)
 291         video_data = urlopen(video_url)
 292         write_video(filename, video_data)
 293     except VideoUnavailable, e:
 294         print_form(
 295             url=url,
 296             msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.message)
 297         )
 298     except Exception, e:
 299         print_form(
 300             url=url,
 301             msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
 302         )
 303         return
 304
 305 def pp_size(size):
 306     suffixes = ["", "KiB", "MiB", "GiB"]
 307     for i, suffix in enumerate(suffixes):
 308         if size < 1024:
 309             break
 310         size /= 1024
 311     return "%.2f %s" % (size, suffix)
 312
 313 def copy_with_progress(content_length, infile, outfile):
 314     def print_status():
 315         rate = 0
 316         if now != last_ts:
 317             rate = last_bytes_read / (now - last_ts)
 318         sys.stdout.write("\33[2K\r")
 319         sys.stdout.write("%s / %s (%s/sec)" % (
 320             pp_size(bytes_read),
 321             pp_size(content_length),
 322             pp_size(rate),
 323         ))
 324         sys.stdout.flush()
 325
 326     last_ts = 0
 327     last_bytes_read = 0
 328     bytes_read = 0
 329     while True:
 330         now = time.time()
 331         if now - last_ts > 0.5:
 332             print_status()
 333             last_ts = now
 334             last_bytes_read = 0
 335
 336         buf = infile.read(32768)
 337         if not buf:
 338             break
 339         outfile.write(buf)
 340         last_bytes_read += len(buf)
 341         bytes_read += len(buf)
 342
 343     # Newline at the end
 344     print_status()
 345     print
 346
 347 def main():
 348     try:
 349         url = sys.argv[1]
 350     except:
 351         print >>sys.stderr, "Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0]
 352         sys.exit(1)
 353
 354     doc = parse_url(url)
 355     video_url, filename = get_video_url(doc)
 356     print "Downloading", filename.encode("utf-8")
 357
 358     outfile = open(filename, "a")
 359     offset = outfile.tell()
 360     if offset > 0:
 361         print "Resuming download from", pp_size(offset)
 362     total_size = None
 363
 364     while True:
 365         try:
 366             video_data = urlopen(video_url, offset)
 367         except urllib2.HTTPError, e:
 368             if e.code == 416:
 369                 print "File is complete!"
 370                 break
 371             else:
 372                 raise
 373
 374         content_length = int(video_data.info().getheader("Content-Length"))
 375         if total_size is None:
 376             total_size = content_length
 377
 378         try:
 379             copy_with_progress(content_length, video_data, outfile)
 380         except IOError, e:
 381             print
 382
 383         video_data.close()
 384         if outfile.tell() != total_size:
 385             old_offset = offset
 386             offset = outfile.tell()
 387             if old_offset == offset:
 388                 time.sleep(1)
 389             print "Restarting download from", pp_size(offset)
 390         else:
 391             break
 392
 393     outfile.close()
 394
 395
 396 if __name__ == "__main__":
 397 ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
 398     if os.environ.has_key("SCRIPT_NAME"):
 399         cgimain()
 400     else:
 401         try:
 402             main()
 403         except KeyboardInterrupt:
 404             print "\nExiting..."
 405             sys.exit(1)
 406