code.delx.au - youtube-cgi/blob - youtube.cgi

   1 #!/usr/bin/python2
   2
   3 from __future__ import division
   4
   5 import cookielib
   6 import cgi
   7 import json
   8 from lxml import html
   9 import os
  10 import re
  11 import resource
  12 import shutil
  13 import subprocess
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18 import urlparse
  19
  20
# Address-space cap in bytes (128 MiB). In this file it is only referenced by
# the commented-out resource.setrlimit() call in the __main__ block.
MAX_MEMORY_BYTES = 128 * 1024*1024
# Value of the User-Agent header that urlopen() attaches to every request.
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"

# Maps a stream's mimetype to the file extension used for the saved file.
# get_best_video() skips any stream whose mimetype is not a key here.
MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

# Maps a stream's quality label to a numeric rank. get_best_video() keeps the
# stream with the highest rank and skips labels that are not a key here.
QUALITIES = {
    "hd1080": 5,
    "hd720": 4,
    "large": 3,
    "medium": 2,
    "small": 1,
}
  37
  38
  39 class VideoUnavailable(Exception):
  40     pass
  41
  42 def print_form(url="", msg=""):
  43     script_url = "http://%s%s" % (os.environ["HTTP_HOST"], os.environ["REQUEST_URI"])
  44     sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
  45     sys.stdout.write("""
  46 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  47 <html xmlns="http://www.w3.org/1999/xhtml">
  48 <head>
  49     <title>delx.net.au - YouTube Scraper</title>
  50     <link rel="stylesheet" type="text/css" href="/style.css"/>
  51     <style type="text/css">
  52         input[type="text"] {
  53             width: 100%;
  54         }
  55         .error {
  56             color: red;
  57         }
  58     </style>
  59 </head>
  60 <body>
  61     <h1>delx.net.au - YouTube Scraper</h1>
  62     {0}
  63     <form action="" method="get">
  64     <p>This page will let you easily download YouTube videos to watch offline. It
  65     will automatically grab the highest quality version.</p>
  66     <div><input type="text" name="url" value="{1}"/></div>
  67     <div><input type="submit" value="Download!"/></div>
  68     </form>
  69     <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
  70     to easily download videos. Right-click the link and add it to bookmarks,
  71     then when you're looking at a YouTube page select that bookmark from your
  72     browser's bookmarks menu to download the video straight away.</p>
  73 </body>
  74 </html>
  75 """.replace("{0}", msg).replace("{1}", url).replace("{2}", script_url))
  76
# Shared HTTP state used by urlopen(): every request goes through this opener,
# so cookies received on one response are stored in the jar for later requests.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
# Set by urlopen() to the URL of the first request it makes; sent as the
# Referer header on every subsequent request.
referrer = ""
  80
  81 def urlopen(url, offset=None):
  82     if url.startswith("//"):
  83         url = "https:" + url
  84     if not url.startswith("http://") and not url.startswith("https://"):
  85         url = "https://www.youtube.com" + url
  86
  87     global referrer
  88     req = urllib2.Request(url)
  89     if not referrer:
  90         referrer = url
  91     else:
  92         req.add_header("Referer", referrer)
  93
  94     req.add_header("User-Agent", USER_AGENT)
  95
  96     if offset:
  97         req.add_header("Range", "bytes=%d-" % offset)
  98
  99     res = urlopener.open(req)
 100
 101     content_range = res.info().getheader("Content-Range")
 102     if content_range:
 103         tokens = content_range.split()
 104         assert tokens[0] == "bytes"
 105         start = int(tokens[1].split("-")[0])
 106         assert start == offset
 107     return res
 108
 109 def parse_url(url):
 110     f = urlopen(url)
 111     doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
 112     f.close()
 113     return doc
 114
 115 def append_to_qs(url, params):
 116     r = list(urlparse.urlsplit(url))
 117     qs = urlparse.parse_qs(r[3])
 118     qs.update(params)
 119     r[3] = urllib.urlencode(qs, True)
 120     url = urlparse.urlunsplit(r)
 121     return url
 122
 123 def get_player_config(doc):
 124     player_config = None
 125     for script in doc.xpath("//script"):
 126         if not script.text:
 127             continue
 128         for line in script.text.split("\n"):
 129             s = "ytplayer.config = {"
 130             if s in line:
 131                 p1 = line.find(s) + len(s) - 1
 132                 p2 = line.find("};", p1) + 1
 133                 if p1 >= 0 and p2 > 0:
 134                     return json.loads(line[p1:p2])
 135
 136 def extract_js(script):
 137     PREFIX = "var _yt_player={};(function(g){var window=this;"
 138     SUFFIX = ";})(_yt_player);\n"
 139     assert script.startswith(PREFIX)
 140     assert script.endswith(SUFFIX)
 141
 142     return script[len(PREFIX):-len(SUFFIX)]
 143
 144 def find_func_name(script):
 145     FUNC_NAME = R"([a-zA-Z0-9$]+)"
 146     FUNC_PARAMS = R"(\([a-zA-Z]+\.s\))"
 147     TERMINATOR = R"[,;\)]"
 148     PATTERN = FUNC_NAME + FUNC_PARAMS + TERMINATOR
 149
 150     match = re.search(PATTERN, script)
 151     func_name = match.groups()[0]
 152     return func_name
 153
 154 def decode_signature(js_url, signature):
 155     script = urlopen(js_url).read()
 156     func_name = find_func_name(script)
 157
 158     params = {
 159         "func_name": func_name,
 160         "signature": json.dumps(signature),
 161         "code": json.dumps(extract_js(script)),
 162     }
 163     p = subprocess.Popen(
 164         "nodejs",
 165         shell=True,
 166         close_fds=True,
 167         stdin=subprocess.PIPE,
 168         stdout=subprocess.PIPE
 169     )
 170     js_decode_script = ("""
 171         var vm = require('vm');
 172
 173         var sandbox = {
 174             location: {
 175                 hash: '',
 176                 href: '',
 177                 protocol: 'http:'
 178             },
 179             history: {
 180                 pushState: function(){}
 181             },
 182             document: {},
 183             navigator: {
 184                 userAgent: ''
 185             },
 186             signature: %(signature)s,
 187             transformed_signature: null,
 188             g: function(){} // this is _yt_player
 189         };
 190         sandbox.window = sandbox;
 191
 192         var code_string = %(code)s + ';';
 193         var exec_string = 'transformed_signature = %(func_name)s(signature);';
 194         vm.runInNewContext(code_string + exec_string, sandbox);
 195
 196         console.log(sandbox.transformed_signature);
 197     """ % params)
 198
 199     p.stdin.write(js_decode_script)
 200     p.stdin.close()
 201
 202     transformed_signature = p.stdout.read().strip()
 203     if p.wait() != 0:
 204         raise Exception("js failed to execute: %d" % p.returncode)
 205
 206     return transformed_signature
 207
 208 def get_best_video(player_config):
 209     url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")
 210     js_url = player_config["assets"]["js"]
 211
 212     best_url = None
 213     best_quality = None
 214     best_extension = None
 215     for url_data in url_data_list:
 216         url_data = urlparse.parse_qs(url_data)
 217         mimetype = url_data["type"][0].split(";")[0]
 218         quality = url_data["quality"][0]
 219
 220         if url_data.has_key("stereo3d"):
 221             continue
 222         if quality not in QUALITIES:
 223             continue
 224         if mimetype not in MIMETYPES:
 225             continue
 226
 227         extension = MIMETYPES[mimetype]
 228         quality = QUALITIES.get(quality, -1)
 229
 230         if best_quality is not None and quality < best_quality:
 231             continue
 232
 233         video_url = url_data["url"][0]
 234         if "sig" in url_data:
 235             signature = url_data["sig"][0]
 236         elif "s" in url_data:
 237             signature = decode_signature(js_url, url_data["s"][0])
 238         else:
 239             signature = None
 240
 241         if signature:
 242             video_url = append_to_qs(video_url, {"signature": signature})
 243
 244         best_url = video_url
 245         best_quality = quality
 246         best_extension = extension
 247
 248     return best_url, best_extension
 249
 250 def sanitize_filename(filename):
 251     return (
 252         re.sub("\s+", " ", filename.strip())
 253         .replace("\\", "-")
 254         .replace("/", "-")
 255         .replace("\0", " ")
 256     )
 257
 258 def get_video_url(doc):
 259     unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
 260     if unavailable:
 261         raise VideoUnavailable(unavailable[0].strip())
 262
 263     player_config = get_player_config(doc)
 264     if not player_config:
 265         raise VideoUnavailable("Could not find video URL")
 266
 267     video_url, extension = get_best_video(player_config)
 268     if not video_url:
 269         return None, None
 270
 271     title = doc.xpath("/html/head/title/text()")[0]
 272     filename = sanitize_filename(title)
 273     filename += "." + extension
 274
 275     return video_url, filename
 276
 277 def write_video(filename, video_data):
 278     httpinfo = video_data.info()
 279     encoded_filename = urllib.quote(filename.encode("utf-8"))
 280     sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
 281     sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length"))
 282     sys.stdout.write("\r\n")
 283     shutil.copyfileobj(video_data, sys.stdout)
 284     video_data.close()
 285
 286 def cgimain():
 287     args = cgi.parse()
 288     try:
 289         url = args["url"][0]
 290     except:
 291         print_form(url="http://www.youtube.com/watch?v=FOOBAR")
 292         return
 293
 294     try:
 295         doc = parse_url(url)
 296         video_url, filename = get_video_url(doc)
 297         video_data = urlopen(video_url)
 298         write_video(filename, video_data)
 299     except VideoUnavailable, e:
 300         print_form(
 301             url=url,
 302             msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.message)
 303         )
 304     except Exception, e:
 305         print_form(
 306             url=url,
 307             msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
 308         )
 309         return
 310
 311 def pp_size(size):
 312     suffixes = ["", "KiB", "MiB", "GiB"]
 313     for i, suffix in enumerate(suffixes):
 314         if size < 1024:
 315             break
 316         size /= 1024
 317     return "%.2f %s" % (size, suffix)
 318
 319 def copy_with_progress(content_length, infile, outfile):
 320     def print_status():
 321         rate = 0
 322         if now != last_ts:
 323             rate = last_bytes_read / (now - last_ts)
 324         sys.stdout.write("\33[2K\r")
 325         sys.stdout.write("%s / %s (%s/sec)" % (
 326             pp_size(bytes_read),
 327             pp_size(content_length),
 328             pp_size(rate),
 329         ))
 330         sys.stdout.flush()
 331
 332     last_ts = 0
 333     last_bytes_read = 0
 334     bytes_read = 0
 335     while True:
 336         now = time.time()
 337         if now - last_ts > 0.5:
 338             print_status()
 339             last_ts = now
 340             last_bytes_read = 0
 341
 342         buf = infile.read(32768)
 343         if not buf:
 344             break
 345         outfile.write(buf)
 346         last_bytes_read += len(buf)
 347         bytes_read += len(buf)
 348
 349     # Newline at the end
 350     print_status()
 351     print
 352
 353 def main():
 354     try:
 355         url = sys.argv[1]
 356     except:
 357         print >>sys.stderr, "Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0]
 358         sys.exit(1)
 359
 360     doc = parse_url(url)
 361     video_url, filename = get_video_url(doc)
 362     print "Downloading", filename.encode("utf-8")
 363
 364     outfile = open(filename, "a")
 365     offset = outfile.tell()
 366     if offset > 0:
 367         print "Resuming download from", pp_size(offset)
 368     total_size = None
 369
 370     while True:
 371         try:
 372             video_data = urlopen(video_url, offset)
 373         except urllib2.HTTPError, e:
 374             if e.code == 416:
 375                 print "File is complete!"
 376                 break
 377             else:
 378                 raise
 379
 380         content_length = int(video_data.info().getheader("Content-Length"))
 381         if total_size is None:
 382             total_size = content_length
 383
 384         try:
 385             copy_with_progress(content_length, video_data, outfile)
 386         except IOError, e:
 387             print
 388
 389         video_data.close()
 390         if outfile.tell() != total_size:
 391             old_offset = offset
 392             offset = outfile.tell()
 393             if old_offset == offset:
 394                 time.sleep(1)
 395             print "Restarting download from", pp_size(offset)
 396         else:
 397             break
 398
 399     outfile.close()
 400
 401
 402 if __name__ == "__main__":
 403 ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
 404     if os.environ.has_key("SCRIPT_NAME"):
 405         cgimain()
 406     else:
 407         try:
 408             main()
 409         except KeyboardInterrupt:
 410             print "\nExiting..."
 411             sys.exit(1)
 412