#!/usr/bin/python2

from __future__ import division

import cookielib
import cgi
import json
from lxml import html
import os
import re
import resource
import shutil
import subprocess
import sys
import time
import urllib
import urllib2
import urlparse


MAX_MEMORY_BYTES = 128 * 1024*1024
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"

MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

QUALITIES = {
    "hd1080": 5,
    "hd720": 4,
    "large": 3,
    "medium": 2,
    "small": 1,
}

JS_BROWSER_STUB = """
var window={};
var document={};
window.location={};
var navigator={};
"""



class VideoUnavailable(Exception):
    pass

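# Renders the HTML form page, optionally with an error message, when the
# script is accessed over CGI. The {0}/{1}/{2} placeholders are filled in
# with str.replace() rather than str.format() so the literal braces in the
# inline CSS and the bookmarklet survive untouched.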
def print_form(url="", msg=""):
    script_url = "http://%s%s" % (os.environ["HTTP_HOST"], os.environ["REQUEST_URI"])
    sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
    sys.stdout.write("""
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
    <title>delx.net.au - YouTube Scraper</title>
    <link rel="stylesheet" type="text/css" href="/style.css"/>
    <style type="text/css">
        input[type="text"] {
            width: 100%;
        }
        .error {
            color: red;
        }
    </style>
</head>
<body>
    <h1>delx.net.au - YouTube Scraper</h1>
    {0}
    <form action="" method="get">
        <p>This page will let you easily download YouTube videos to watch offline. It
        will automatically grab the highest quality version.</p>
        <div><input type="text" name="url" value="{1}"/></div>
        <div><input type="submit" value="Download!"/></div>
    </form>
    <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
    to easily download videos. Right-click the link and add it to bookmarks,
    then when you're looking at a YouTube page select that bookmark from your
    browser's bookmarks menu to download the video straight away.</p>
</body>
</html>
""".replace("{0}", msg).replace("{1}", url).replace("{2}", script_url))

cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
referrer = ""

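# All requests go through one shared opener so cookies persist between the
# page scrape and the video download. The first URL fetched becomes the
# Referer for subsequent requests, and an optional byte offset is sent as a
# Range header (the Content-Range response header is checked to confirm the
# server actually honoured the resume).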
def urlopen(url, offset=None):
    if url.startswith("//"):
        url = "http:" + url

    global referrer
    req = urllib2.Request(url)
    if not referrer:
        referrer = url
    else:
        req.add_header("Referer", referrer)

    req.add_header("User-Agent", USER_AGENT)

    if offset:
        req.add_header("Range", "bytes=%d-" % offset)

    res = urlopener.open(req)

    content_range = res.info().getheader("Content-Range")
    if content_range:
        tokens = content_range.split()
        assert tokens[0] == "bytes"
        start = int(tokens[1].split("-")[0])
        assert start == offset
    return res

def parse_url(url):
    f = urlopen(url)
    doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    f.close()
    return doc

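# Adds extra query-string parameters to a URL while keeping the existing
# ones. For example (illustrative URL; parameter order may vary because
# parse_qs returns a dict):
#   append_to_qs("http://example.com/v?a=1", {"b": "2"})
#   -> "http://example.com/v?a=1&b=2"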
def append_to_qs(url, params):
    r = list(urlparse.urlsplit(url))
    qs = urlparse.parse_qs(r[3])
    qs.update(params)
    r[3] = urllib.urlencode(qs, True)
    url = urlparse.urlunsplit(r)
    return url

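# Scans the watch page's inline <script> blocks for the
# "ytplayer.config = {...}" assignment and returns it parsed as JSON;
# returns None if no such block is found.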
def get_player_config(doc):
    player_config = None
    for script in doc.xpath("//script"):
        if not script.text:
            continue
        for line in script.text.split("\n"):
            s = "ytplayer.config = {"
            if s in line:
                p1 = line.find(s) + len(s) - 1
                p2 = line.find("};", p1) + 1
                if p1 >= 0 and p2 > 0:
                    return json.loads(line[p1:p2])

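# The player script is wrapped in an immediately-invoked function expression;
# stripping the "(function(){" prefix and "})();" suffix exposes its inner
# functions as globals when the body is piped into the js interpreter below.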
def extract_js(script):
    PREFIX = "(function(){"
    SUFFIX = "})();\n"
    assert script.startswith(PREFIX)
    assert script.endswith(SUFFIX)

    return script[len(PREFIX):-len(SUFFIX)]

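# Finds the name of the signature-decoding function by searching the player
# script for a call of the form "xy(a.s);".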
def find_func_name(script):
    FUNC_NAME = R"([a-zA-Z0-9$]+)"
    FUNC_PARAMS = R"(\([a-zA-Z]+\.s\))"
    PATTERN = FUNC_NAME + FUNC_PARAMS + ";"

    match = re.search(PATTERN, script)
    func_name = match.groups()[0]
    return func_name

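# Decodes an obfuscated signature by piping the player script into an
# external JavaScript interpreter invoked as "js" (it must provide
# console.log). The fake window/document/navigator globals in
# JS_BROWSER_STUB stop the player script from crashing on browser-only
# references.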
def decode_signature(js_url, s):
    script = urlopen(js_url).read()
    func_name = find_func_name(script)

    p = subprocess.Popen(
        "js",
        shell=True,
        close_fds=True,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE
    )
    w = p.stdin.write
    w(JS_BROWSER_STUB)
    w(extract_js(script))
    w("console.log(%s('%s'));\n" % (func_name, s))
    p.stdin.close()

    signature = p.stdout.read().strip()
    if p.wait() != 0:
        raise Exception("js failed to execute: %d" % p.returncode)

    return signature

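# Walks the url_encoded_fmt_stream_map entries and keeps the best candidate:
# 3D streams are skipped, only the qualities and containers listed in
# QUALITIES and MIMETYPES are considered, and the signature is either taken
# directly from "sig" or decoded from the obfuscated "s" parameter.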
def get_best_video(player_config):
    url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")
    js_url = player_config["assets"]["js"]

    best_url = None
    best_quality = None
    best_extension = None
    for url_data in url_data_list:
        url_data = urlparse.parse_qs(url_data)
        mimetype = url_data["type"][0].split(";")[0]
        quality = url_data["quality"][0]

        if url_data.has_key("stereo3d"):
            continue
        if quality not in QUALITIES:
            continue
        if mimetype not in MIMETYPES:
            continue

        extension = MIMETYPES[mimetype]
        quality = QUALITIES.get(quality, -1)

        if best_quality is not None and quality < best_quality:
            continue

        video_url = url_data["url"][0]
        if "sig" in url_data:
            signature = url_data["sig"][0]
        elif "s" in url_data:
            signature = decode_signature(js_url, url_data["s"][0])
        else:
            signature = None

        if signature:
            video_url = append_to_qs(video_url, {"signature": signature})

        best_url = video_url
        best_quality = quality
        best_extension = extension

    return best_url, best_extension

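# Makes a video title safe to use as a filename, e.g.
#   sanitize_filename("  Foo / Bar\\Baz  ") -> "Foo - Bar-Baz"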
def sanitize_filename(filename):
    return (
        re.sub("\s+", " ", filename.strip())
        .replace("\\", "-")
        .replace("/", "-")
        .replace("\0", " ")
    )

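# Turns a parsed watch page into (video_url, filename), raising
# VideoUnavailable if YouTube reports the video as unavailable or the
# player config cannot be located.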
def get_video_url(doc):
    unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
    if unavailable:
        raise VideoUnavailable(unavailable[0].strip())

    player_config = get_player_config(doc)
    if not player_config:
        raise VideoUnavailable("Could not find video URL")

    video_url, extension = get_best_video(player_config)
    if not video_url:
        return None, None

    title = doc.xpath("/html/head/title/text()")[0]
    filename = sanitize_filename(title)
    filename += "." + extension

    return video_url, filename

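# CGI mode: stream the video back to the browser as an attachment, passing
# through the upstream Content-Length.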
def write_video(filename, video_data):
    httpinfo = video_data.info()
    encoded_filename = urllib.quote(filename.encode("utf-8"))
    sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
    sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length"))
    sys.stdout.write("\r\n")
    shutil.copyfileobj(video_data, sys.stdout)
    video_data.close()

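# CGI entry point: show the form when no URL is given, otherwise fetch the
# video and relay it, reporting failures back on the form.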
def cgimain():
    args = cgi.parse()
    try:
        url = args["url"][0]
    except (KeyError, IndexError):
        print_form(url="http://www.youtube.com/watch?v=FOOBAR")
        return

    try:
        doc = parse_url(url)
        video_url, filename = get_video_url(doc)
        video_data = urlopen(video_url)
        write_video(filename, video_data)
    except VideoUnavailable, e:
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.message)
        )
    except Exception, e:
        print_form(
            url=url,
            msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
        )
    return

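# Pretty-prints a byte count using 1024-based units, e.g.
#   pp_size(1536) -> "1.50 KiB"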
def pp_size(size):
    suffixes = ["", "KiB", "MiB", "GiB"]
    for i, suffix in enumerate(suffixes):
        if size < 1024:
            break
        size /= 1024
    return "%.2f %s" % (size, suffix)

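# Copies infile to outfile in 32 KiB chunks, redrawing a single-line
# progress display (bytes copied, total size, current rate) at most twice
# per second.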
def copy_with_progress(content_length, infile, outfile):
    def print_status():
        rate = 0
        if now != last_ts:
            rate = last_bytes_read / (now - last_ts)
        sys.stdout.write("\33[2K\r")
        sys.stdout.write("%s / %s (%s/sec)" % (
            pp_size(bytes_read),
            pp_size(content_length),
            pp_size(rate),
        ))
        sys.stdout.flush()

    last_ts = 0
    last_bytes_read = 0
    bytes_read = 0
    while True:
        now = time.time()
        if now - last_ts > 0.5:
            print_status()
            last_ts = now
            last_bytes_read = 0

        buf = infile.read(32768)
        if not buf:
            break
        outfile.write(buf)
        last_bytes_read += len(buf)
        bytes_read += len(buf)

    # Newline at the end
    print_status()
    print

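# Command-line entry point: downloads to a local file named after the video
# title, resuming via Range requests if the file already exists and
# restarting if the connection drops mid-transfer.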
def main():
    try:
        url = sys.argv[1]
    except IndexError:
        print >>sys.stderr, "Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0]
        sys.exit(1)

    doc = parse_url(url)
    video_url, filename = get_video_url(doc)
    print "Downloading", filename.encode("utf-8")

    outfile = open(filename, "a")
    offset = outfile.tell()
    if offset > 0:
        print "Resuming download from", pp_size(offset)
    total_size = None

    while True:
        try:
            video_data = urlopen(video_url, offset)
        except urllib2.HTTPError, e:
            if e.code == 416:
                print "File is complete!"
                break
            else:
                raise

        content_length = int(video_data.info().getheader("Content-Length"))
        if total_size is None:
            total_size = content_length

        try:
            copy_with_progress(content_length, video_data, outfile)
        except IOError, e:
            print

        video_data.close()
        if outfile.tell() != total_size:
            old_offset = offset
            offset = outfile.tell()
            if old_offset == offset:
                time.sleep(1)
            print "Restarting download from", pp_size(offset)
        else:
            break

    outfile.close()


if __name__ == "__main__":
    ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
    if os.environ.has_key("SCRIPT_NAME"):
        cgimain()
    else:
        try:
            main()
        except KeyboardInterrupt:
            print "\nExiting..."
            sys.exit(1)
