code.delx.au - youtube-cgi/blob - youtube.cgi

   1 #!/usr/bin/python2
   2
   3 from __future__ import division
   4
   5 import cookielib
   6 import cgi
   7 import json
   8 from lxml import html
   9 import os
  10 import re
  11 import resource
  12 import shutil
  13 import subprocess
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18 import urlparse
  19
  20
# Address-space limit in bytes (128 MiB). In the visible source it is only
# referenced by the disabled resource.setrlimit() call in the __main__ block.
MAX_MEMORY_BYTES = 128 * 1024*1024
# User-Agent header value that urlopen() attaches to every outgoing request.
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"

# Stream MIME type -> file extension appended to the downloaded file name.
# get_best_video() skips streams whose MIME type is not listed here.
MIMETYPES = {
    "video/mp4": "mp4",
    "video/x-flv": "flv",
    "video/3gpp": "3gp",
}

# Quality label -> rank (higher is preferred by get_best_video()).
# Streams with a label that is not listed here are skipped.
QUALITIES = {
    "hd1080": 5,
    "hd720": 4,
    "large": 3,
    "medium": 2,
    "small": 1,
}
  37
  38
  39 class VideoUnavailable(Exception):
  40     pass
  41
  42 def print_form(url="", msg=""):
  43     script_url = "http://%s%s" % (os.environ["HTTP_HOST"], os.environ["REQUEST_URI"])
  44     sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
  45     sys.stdout.write("""
  46 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  47 <html xmlns="http://www.w3.org/1999/xhtml">
  48 <head>
  49     <title>delx.net.au - YouTube Scraper</title>
  50     <link rel="stylesheet" type="text/css" href="/style.css"/>
  51     <style type="text/css">
  52         input[type="text"] {
  53             width: 100%;
  54         }
  55         .error {
  56             color: red;
  57         }
  58     </style>
  59 </head>
  60 <body>
  61     <h1>delx.net.au - YouTube Scraper</h1>
  62     {0}
  63     <form action="" method="get">
  64     <p>This page will let you easily download YouTube videos to watch offline. It
  65     will automatically grab the highest quality version.</p>
  66     <div><input type="text" name="url" value="{1}"/></div>
  67     <div><input type="submit" value="Download!"/></div>
  68     </form>
  69     <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
  70     to easily download videos. Right-click the link and add it to bookmarks,
  71     then when you're looking at a YouTube page select that bookmark from your
  72     browser's bookmarks menu to download the video straight away.</p>
  73 </body>
  74 </html>
  75 """.replace("{0}", msg).replace("{1}", url).replace("{2}", script_url))
  76
# Shared HTTP state used by urlopen() for every request this script makes.
# In-memory cookie store attached to the opener via HTTPCookieProcessor.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
# URL of the most recent request; urlopen() sends it as the Referer header
# of the next request and then overwrites it.
referrer = ""
  80
  81 def urlopen(url, offset=None):
  82     if url.startswith("//"):
  83         url = "http:" + url
  84
  85     global referrer
  86     req = urllib2.Request(url)
  87     if referrer:
  88         req.add_header("Referer", referrer)
  89     referrer = url
  90
  91     req.add_header("User-Agent", USER_AGENT)
  92
  93     if offset:
  94         req.add_header("Range", "bytes=%d-" % offset)
  95
  96     res = urlopener.open(req)
  97
  98     content_range = res.info().getheader("Content-Range")
  99     if content_range:
 100         tokens = content_range.split()
 101         assert tokens[0] == "bytes"
 102         start = int(tokens[1].split("-")[0])
 103         assert start == offset
 104     return res
 105
 106 def parse_url(url):
 107     f = urlopen(url)
 108     doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
 109     f.close()
 110     return doc
 111
 112 def append_to_qs(url, params):
 113     r = list(urlparse.urlsplit(url))
 114     qs = urlparse.parse_qs(r[3])
 115     qs.update(params)
 116     r[3] = urllib.urlencode(qs, True)
 117     url = urlparse.urlunsplit(r)
 118     return url
 119
 120 def get_player_config(doc):
 121     player_config = None
 122     for script in doc.xpath("//script"):
 123         if not script.text:
 124             continue
 125         for line in script.text.split("\n"):
 126             s = "ytplayer.config = {"
 127             if s in line:
 128                 p1 = line.find(s) + len(s) - 1
 129                 p2 = line.find("};", p1) + 1
 130                 if p1 >= 0 and p2 > 0:
 131                     return json.loads(line[p1:p2])
 132
 133 def extract_function(output, script, func_name):
 134     p1 = script.find("function " + func_name + "(")
 135     p2 = script.find("}", p1)
 136     code = script[p1:p2+1]
 137     output.append(code)
 138     deps = re.findall(R"[^\.][= ]([\$0-9a-zA-Z]+)\(", code)
 139     deps = set(deps)
 140     deps.remove(func_name)
 141     for dep in deps:
 142         extract_function(output, script, dep)
 143
 144 def decode_signature(js_url, s):
 145     script = urlopen(js_url).read()
 146     func_name = re.search(R"\b([a-zA-Z]+)\([a-zA-Z]+\.s\);", script).groups()[0]
 147
 148     codes = []
 149     extract_function(codes, script, func_name)
 150
 151     p = subprocess.Popen(
 152         "js",
 153         shell=True,
 154         close_fds=True,
 155         stdin=subprocess.PIPE,
 156         stdout=subprocess.PIPE
 157     )
 158     for code in codes:
 159         p.stdin.write(code + "\n")
 160     p.stdin.write("console.log(%s('%s'));\n" % (func_name, s))
 161     p.stdin.close()
 162
 163     signature = p.stdout.read().strip()
 164     if p.wait() != 0:
 165         raise Exception("js failed to execute: %d" % p.returncode)
 166
 167     return signature
 168
 169 def get_best_video(player_config):
 170     url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")
 171     js_url = player_config["assets"]["js"]
 172
 173     best_url = None
 174     best_quality = None
 175     best_extension = None
 176     for url_data in url_data_list:
 177         url_data = urlparse.parse_qs(url_data)
 178         mimetype = url_data["type"][0].split(";")[0]
 179         quality = url_data["quality"][0]
 180
 181         if url_data.has_key("stereo3d"):
 182             continue
 183         if quality not in QUALITIES:
 184             continue
 185         if mimetype not in MIMETYPES:
 186             continue
 187
 188         extension = MIMETYPES[mimetype]
 189         quality = QUALITIES.get(quality, -1)
 190
 191         if best_quality is not None and quality < best_quality:
 192             continue
 193
 194         video_url = url_data["url"][0]
 195         if "sig" in url_data:
 196             signature = url_data["sig"][0]
 197         elif "s" in url_data:
 198             signature = decode_signature(js_url, url_data["s"][0])
 199         else:
 200             signature = None
 201
 202         if signature:
 203             video_url = append_to_qs(video_url, {"signature": signature})
 204
 205         best_url = video_url
 206         best_quality = quality
 207         best_extension = extension
 208
 209     return best_url, best_extension
 210
 211 def sanitize_filename(filename):
 212     return (
 213         re.sub("\s+", " ", filename.strip())
 214         .replace("\\", "-")
 215         .replace("/", "-")
 216         .replace("\0", " ")
 217     )
 218
 219 def get_video_url(doc):
 220     unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
 221     if unavailable:
 222         raise VideoUnavailable(unavailable[0].strip())
 223
 224     player_config = get_player_config(doc)
 225     if not player_config:
 226         raise VideoUnavailable("Could not find video URL")
 227
 228     video_url, extension = get_best_video(player_config)
 229     if not video_url:
 230         return None, None
 231
 232     title = doc.xpath("/html/head/title/text()")[0]
 233     filename = sanitize_filename(title)
 234     filename += "." + extension
 235
 236     return video_url, filename
 237
 238 def write_video(filename, video_data):
 239     httpinfo = video_data.info()
 240     encoded_filename = urllib.quote(filename.encode("utf-8"))
 241     sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
 242     sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length"))
 243     sys.stdout.write("\r\n")
 244     shutil.copyfileobj(video_data, sys.stdout)
 245     video_data.close()
 246
 247 def cgimain():
 248     args = cgi.parse()
 249     try:
 250         url = args["url"][0]
 251     except:
 252         print_form(url="http://www.youtube.com/watch?v=FOOBAR")
 253         return
 254
 255     try:
 256         doc = parse_url(url)
 257         video_url, filename = get_video_url(doc)
 258         video_data = urlopen(video_url)
 259         write_video(filename, video_data)
 260     except VideoUnavailable, e:
 261         print_form(
 262             url=url,
 263             msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.message)
 264         )
 265     except Exception, e:
 266         print_form(
 267             url=url,
 268             msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
 269         )
 270         return
 271
 272 def pp_size(size):
 273     suffixes = ["", "KiB", "MiB", "GiB"]
 274     for i, suffix in enumerate(suffixes):
 275         if size < 1024:
 276             break
 277         size /= 1024
 278     return "%.2f %s" % (size, suffix)
 279
 280 def copy_with_progress(content_length, infile, outfile):
 281     def print_status():
 282         rate = 0
 283         if now != last_ts:
 284             rate = last_bytes_read / (now - last_ts)
 285         sys.stdout.write("\33[2K\r")
 286         sys.stdout.write("%s / %s (%s/sec)" % (
 287             pp_size(bytes_read),
 288             pp_size(content_length),
 289             pp_size(rate),
 290         ))
 291         sys.stdout.flush()
 292
 293     last_ts = 0
 294     last_bytes_read = 0
 295     bytes_read = 0
 296     while True:
 297         now = time.time()
 298         if now - last_ts > 0.5:
 299             print_status()
 300             last_ts = now
 301             last_bytes_read = 0
 302
 303         buf = infile.read(32768)
 304         if not buf:
 305             break
 306         outfile.write(buf)
 307         last_bytes_read += len(buf)
 308         bytes_read += len(buf)
 309
 310     # Newline at the end
 311     print_status()
 312     print
 313
 314 def main():
 315     try:
 316         url = sys.argv[1]
 317     except:
 318         print >>sys.stderr, "Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0]
 319         sys.exit(1)
 320
 321     doc = parse_url(url)
 322     video_url, filename = get_video_url(doc)
 323     print "Downloading", filename.encode("utf-8")
 324
 325     outfile = open(filename, "a")
 326     offset = outfile.tell()
 327     if offset > 0:
 328         print "Resuming download from", pp_size(offset)
 329     total_size = None
 330
 331     while True:
 332         try:
 333             video_data = urlopen(video_url, offset)
 334         except urllib2.HTTPError, e:
 335             if e.code == 416:
 336                 print "File is complete!"
 337                 break
 338             else:
 339                 raise
 340
 341         content_length = int(video_data.info().getheader("Content-Length"))
 342         if total_size is None:
 343             total_size = content_length
 344
 345         try:
 346             copy_with_progress(content_length, video_data, outfile)
 347         except IOError, e:
 348             print
 349
 350         video_data.close()
 351         if outfile.tell() != total_size:
 352             old_offset = offset
 353             offset = outfile.tell()
 354             if old_offset == offset:
 355                 time.sleep(1)
 356             print "Restarting download from", pp_size(offset)
 357         else:
 358             break
 359
 360     outfile.close()
 361
 362
 363 if __name__ == "__main__":
 364 ### resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
 365     if os.environ.has_key("SCRIPT_NAME"):
 366         cgimain()
 367     else:
 368         try:
 369             main()
 370         except KeyboardInterrupt:
 371             print "\nExiting..."
 372             sys.exit(1)
 373