code.delx.au - youtube-cgi/blob - youtube.cgi

   1 #!/usr/bin/python2
   2
   3 from __future__ import division
   4
   5 import cookielib
   6 import cgi
   7 import json
   8 from lxml import html
   9 import os
  10 import re
  11 import resource
  12 import shutil
  13 import subprocess
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18 import urlparse
  19
  20
  21 MAX_MEMORY_BYTES = 128 * 1024*1024
  22 USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
  23
  24 MIMETYPES = {
  25     "video/mp4": "mp4",
  26     "video/x-flv": "flv",
  27     "video/3gpp": "3gp",
  28 }
  29
  30 QUALITIES = {
  31     "hd1080": 5,
  32     "hd720": 4,
  33     "large": 3,
  34     "medium": 2,
  35     "small": 1,
  36 }
  37
  38
  39 class VideoUnavailable(Exception):
  40     pass
  41
  42 def print_form(url="", msg=""):
  43     script_url = "http://%s%s" % (os.environ["HTTP_HOST"], os.environ["REQUEST_URI"])
  44     sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
  45     sys.stdout.write("""
  46 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  47 <html xmlns="http://www.w3.org/1999/xhtml">
  48 <head>
  49     <title>delx.net.au - YouTube Scraper</title>
  50     <link rel="stylesheet" type="text/css" href="/style.css"/>
  51     <style type="text/css">
  52         input[type="text"] {
  53             width: 100%;
  54         }
  55         .error {
  56             color: red;
  57         }
  58     </style>
  59 </head>
  60 <body>
  61     <h1>delx.net.au - YouTube Scraper</h1>
  62     {0}
  63     <form action="" method="get">
  64     <p>This page will let you easily download YouTube videos to watch offline. It
  65     will automatically grab the highest quality version.</p>
  66     <div><input type="text" name="url" value="{1}"/></div>
  67     <div><input type="submit" value="Download!"/></div>
  68     </form>
  69     <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
  70     to easily download videos. Right-click the link and add it to bookmarks,
  71     then when you're looking at a YouTube page select that bookmark from your
  72     browser's bookmarks menu to download the video straight away.</p>
  73 </body>
  74 </html>
  75 """.replace("{0}", msg).replace("{1}", url).replace("{2}", script_url))
  76
  77 cookiejar = cookielib.CookieJar()
  78 urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
  79 referrer = ""
  80
  81 def urlopen(url, offset=None):
  82     if url.startswith("//"):
  83         url = "http:" + url
  84
  85     global referrer
  86     req = urllib2.Request(url)
  87     if not referrer:
  88         referrer = url
  89     else:
  90         req.add_header("Referer", referrer)
  91
  92     req.add_header("User-Agent", USER_AGENT)
  93
  94     if offset:
  95         req.add_header("Range", "bytes=%d-" % offset)
  96
  97     res = urlopener.open(req)
  98
  99     content_range = res.info().getheader("Content-Range")
 100     if content_range:
 101         tokens = content_range.split()
 102         assert tokens[0] == "bytes"
 103         start = int(tokens[1].split("-")[0])
 104         assert start == offset
 105     return res
 106
 107 def parse_url(url):
 108     f = urlopen(url)
 109     doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
 110     f.close()
 111     return doc
 112
 113 def append_to_qs(url, params):
 114     r = list(urlparse.urlsplit(url))
 115     qs = urlparse.parse_qs(r[3])
 116     qs.update(params)
 117     r[3] = urllib.urlencode(qs, True)
 118     url = urlparse.urlunsplit(r)
 119     return url
 120
 121 def get_player_config(doc):
 122     player_config = None
 123     for script in doc.xpath("//script"):
 124         if not script.text:
 125             continue
 126         for line in script.text.split("\n"):
 127             s = "ytplayer.config = {"
 128             if s in line:
 129                 p1 = line.find(s) + len(s) - 1
 130                 p2 = line.find("};", p1) + 1
 131                 if p1 >= 0 and p2 > 0:
 132                     return json.loads(line[p1:p2])
 133
 134 def extract_function(output, script, func_name):
 135     p1 = script.find("function " + func_name + "(")
 136     p2 = script.find("}", p1)
 137     code = script[p1:p2+1]
 138     output.append(code)
 139     deps = re.findall(R"[^\.][= ]([\$0-9a-zA-Z]+)\(", code)
 140     deps = set(deps)
 141     deps.remove(func_name)
 142     for dep in deps:
 143         extract_function(output, script, dep)
 144
 145 def find_func_name(script):
 146     FUNC_NAME = R"([a-zA-Z0-9$]+)"
 147     FUNC_PARAMS = R"(\([a-zA-Z]+\.s\))"
 148     PATTERN = FUNC_NAME + FUNC_PARAMS + ";"
 149
 150     match = re.search(PATTERN, script)
 151     func_name = match.groups()[0]
 152     return func_name
 153
 154 def decode_signature(js_url, s):
 155     script = urlopen(js_url).read()
 156     func_name = find_func_name(script)
 157
 158     codes = []
 159     extract_function(codes, script, func_name)
 160
 161     p = subprocess.Popen(
 162         "js",
 163         shell=True,
 164         close_fds=True,
 165         stdin=subprocess.PIPE,
 166         stdout=subprocess.PIPE
 167     )
 168     for code in codes:
 169         p.stdin.write(code + "\n")
 170     p.stdin.write("console.log(%s('%s'));\n" % (func_name, s))
 171     p.stdin.close()
 172
 173     signature = p.stdout.read().strip()
 174     if p.wait() != 0:
 175         raise Exception("js failed to execute: %d" % p.returncode)
 176
 177     return signature
 178
 179 def get_best_video(player_config):
 180     url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")
 181     js_url = player_config["assets"]["js"]
 182
 183     best_url = None
 184     best_quality = None
 185     best_extension = None
 186     for url_data in url_data_list:
 187         url_data = urlparse.parse_qs(url_data)
 188         mimetype = url_data["type"][0].split(";")[0]
 189         quality = url_data["quality"][0]
 190
 191         if url_data.has_key("stereo3d"):
 192             continue
 193         if quality not in QUALITIES:
 194             continue
 195         if mimetype not in MIMETYPES:
 196             continue
 197
 198         extension = MIMETYPES[mimetype]
 199         quality = QUALITIES.get(quality, -1)
 200
 201         if best_quality is not None and quality < best_quality:
 202             continue
 203
 204         video_url = url_data["url"][0]
 205         if "sig" in url_data:
 206             signature = url_data["sig"][0]
 207         elif "s" in url_data:
 208             signature = decode_signature(js_url, url_data["s"][0])
 209         else:
 210             signature = None
 211
 212         if signature:
 213             video_url = append_to_qs(video_url, {"signature": signature})
 214
 215         best_url = video_url
 216         best_quality = quality
 217         best_extension = extension
 218
 219     return best_url, best_extension
 220
 221 def sanitize_filename(filename):
 222     return (
 223         re.sub("\s+", " ", filename.strip())
 224         .replace("\\", "-")
 225         .replace("/", "-")
 226         .replace("\0", " ")
 227     )
 228
 229 def get_video_url(doc):
 230     unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
 231     if unavailable:
 232         raise VideoUnavailable(unavailable[0].strip())
 233
 234     player_config = get_player_config(doc)
 235     if not player_config:
 236         raise VideoUnavailable("Could not find video URL")
 237
 238     video_url, extension = get_best_video(player_config)
 239     if not video_url:
 240         return None, None
 241
 242     title = doc.xpath("/html/head/title/text()")[0]
 243     filename = sanitize_filename(title)
 244     filename += "." + extension
 245
 246     return video_url, filename
 247
 248 def write_video(filename, video_data):
 249     httpinfo = video_data.info()
 250     encoded_filename = urllib.quote(filename.encode("utf-8"))
 251     sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
 252     sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length"))
 253     sys.stdout.write("\r\n")
 254     shutil.copyfileobj(video_data, sys.stdout)
 255     video_data.close()
 256
 257 def cgimain():
 258     args = cgi.parse()
 259     try:
 260         url = args["url"][0]
 261     except:
 262         print_form(url="http://www.youtube.com/watch?v=FOOBAR")
 263         return
 264
 265     try:
 266         doc = parse_url(url)
 267         video_url, filename = get_video_url(doc)
 268         video_data = urlopen(video_url)
 269         write_video(filename, video_data)
 270     except VideoUnavailable, e:
 271         print_form(
 272             url=url,
 273             msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.message)
 274         )
 275     except Exception, e:
 276         print_form(
 277             url=url,
 278             msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
 279         )
 280         return
 281
 282 def pp_size(size):
 283     suffixes = ["", "KiB", "MiB", "GiB"]
 284     for i, suffix in enumerate(suffixes):
 285         if size < 1024:
 286             break
 287         size /= 1024
 288     return "%.2f %s" % (size, suffix)
 289
 290 def copy_with_progress(content_length, infile, outfile):
 291     def print_status():
 292         rate = 0
 293         if now != last_ts:
 294             rate = last_bytes_read / (now - last_ts)
 295         sys.stdout.write("\33[2K\r")
 296         sys.stdout.write("%s / %s (%s/sec)" % (
 297             pp_size(bytes_read),
 298             pp_size(content_length),
 299             pp_size(rate),
 300         ))
 301         sys.stdout.flush()
 302
 303     last_ts = 0
 304     last_bytes_read = 0
 305     bytes_read = 0
 306     while True:
 307         now = time.time()
 308         if now - last_ts > 0.5:
 309             print_status()
 310             last_ts = now
 311             last_bytes_read = 0
 312
 313         buf = infile.read(32768)
 314         if not buf:
 315             break
 316         outfile.write(buf)
 317         last_bytes_read += len(buf)
 318         bytes_read += len(buf)
 319
 320     # Newline at the end
 321     print_status()
 322     print
 323
 324 def main():
 325     try:
 326         url = sys.argv[1]
 327     except:
 328         print >>sys.stderr, "Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0]
 329         sys.exit(1)
 330
 331     doc = parse_url(url)
 332     video_url, filename = get_video_url(doc)
 333     print "Downloading", filename.encode("utf-8")
 334
 335     outfile = open(filename, "a")
 336     offset = outfile.tell()
 337     if offset > 0:
 338         print "Resuming download from", pp_size(offset)
 339     total_size = None
 340
 341     while True:
 342         try:
 343             video_data = urlopen(video_url, offset)
 344         except urllib2.HTTPError, e:
 345             if e.code == 416:
 346                 print "File is complete!"
 347                 break
 348             else:
 349                 raise
 350
 351         content_length = int(video_data.info().getheader("Content-Length"))
 352         if total_size is None:
 353             total_size = content_length
 354
 355         try:
 356             copy_with_progress(content_length, video_data, outfile)
 357         except IOError, e:
 358             print
 359
 360         video_data.close()
 361         if outfile.tell() != total_size:
 362             old_offset = offset
 363             offset = outfile.tell()
 364             if old_offset == offset:
 365                 time.sleep(1)
 366             print "Restarting download from", pp_size(offset)
 367         else:
 368             break
 369
 370     outfile.close()
 371
 372
 373 if __name__ == "__main__":
 374 ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
 375     if os.environ.has_key("SCRIPT_NAME"):
 376         cgimain()
 377     else:
 378         try:
 379             main()
 380         except KeyboardInterrupt:
 381             print "\nExiting..."
 382             sys.exit(1)
 383