code.delx.au - youtube-cgi/blob - youtube.cgi

   1 #!/usr/bin/env python
   2
   3 from __future__ import division
   4
   5 import cookielib
   6 import cgi
   7 import json
   8 from lxml import html
   9 import os
  10 import re
  11 import resource
  12 import shutil
  13 import subprocess
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18 import urlparse
  19
  20
  21 MAX_MEMORY_BYTES = 128 * 1024*1024
  22 USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
  23
  24 MIMETYPES = {
  25         "video/mp4": "mp4",
  26         "video/x-flv": "flv",
  27         "video/3gpp": "3gp",
  28 }
  29
  30 QUALITIES = {
  31         "hd1080": 5,
  32         "hd720": 4,
  33         "large": 3,
  34         "medium": 2,
  35         "small": 1,
  36 }
  37
  38
  39 class VideoUnavailable(Exception):
  40         pass
  41
  42 def print_form(url="", msg=""):
  43         script_url = "http://%s%s" % (os.environ["HTTP_HOST"], os.environ["REQUEST_URI"])
  44         sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
  45         sys.stdout.write("""
  46 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  47 <html xmlns="http://www.w3.org/1999/xhtml">
  48 <head>
  49         <title>delx.net.au - YouTube Scraper</title>
  50         <link rel="stylesheet" type="text/css" href="/style.css"/>
  51         <style type="text/css">
  52                 input[type="text"] {
  53                         width: 100%;
  54                 }
  55                 .error {
  56                         color: red;
  57                 }
  58         </style>
  59 </head>
  60 <body>
  61         <h1>delx.net.au - YouTube Scraper</h1>
  62         {0}
  63         <form action="" method="get">
  64         <p>This page will let you easily download YouTube videos to watch offline. It
  65         will automatically grab the highest quality version.</p>
  66         <div><input type="text" name="url" value="{1}"/></div>
  67         <div><input type="submit" value="Download!"/></div>
  68         </form>
  69         <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
  70         to easily download videos. Right-click the link and add it to bookmarks,
  71         then when you're looking at a YouTube page select that bookmark from your
  72         browser's bookmarks menu to download the video straight away.</p>
  73 </body>
  74 </html>
  75 """.replace("{0}", msg).replace("{1}", url).replace("{2}", script_url))
  76
  77 cookiejar = cookielib.CookieJar()
  78 urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
  79 referrer = ""
  80
  81 def urlopen(url, offset=None):
  82         global referrer
  83         req = urllib2.Request(url)
  84         if referrer:
  85                 req.add_header("Referer", referrer)
  86         referrer = url
  87
  88         req.add_header("User-Agent", USER_AGENT)
  89
  90         if offset:
  91                 req.add_header("Range", "bytes=%d-" % offset)
  92
  93         res = urlopener.open(req)
  94
  95         content_range = res.info().getheader("Content-Range")
  96         if content_range:
  97                 tokens = content_range.split()
  98                 assert tokens[0] == "bytes"
  99                 start = int(tokens[1].split("-")[0])
 100                 assert start == offset
 101         return res
 102
 103 def parse_url(url):
 104         f = urlopen(url)
 105         doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
 106         f.close()
 107         return doc
 108
 109 def append_to_qs(url, params):
 110         r = list(urlparse.urlsplit(url))
 111         qs = urlparse.parse_qs(r[3])
 112         qs.update(params)
 113         r[3] = urllib.urlencode(qs, True)
 114         url = urlparse.urlunsplit(r)
 115         return url
 116
 117 def convert_from_old_itag(player_config):
 118         url_data = urlparse.parse_qs(player_config["args"]["url_encoded_fmt_stream_map"])
 119         url_data["url"] = []
 120         for itag_url in url_data["itag"]:
 121                 pos = itag_url.find("url=")
 122                 url_data["url"].append(itag_url[pos+4:])
 123         player_config["args"]["url_encoded_fmt_stream_map"] = urllib.urlencode(url_data, True)
 124
 125 def get_player_config(doc):
 126         player_config = None
 127         for script in doc.xpath("//script"):
 128                 if not script.text:
 129                         continue
 130                 for line in script.text.split("\n"):
 131                         if "yt.playerConfig =" in line:
 132                                 p1 = line.find("=")
 133                                 p2 = line.rfind(";")
 134                                 if p1 >= 0 and p2 > 0:
 135                                         return json.loads(line[p1+1:p2])
 136                         if "ytplayer.config =" in line:
 137                                 p1 = line.find("ytplayer.config =")
 138                                 p2 = line.rfind(";")
 139                                 if p1 >= 0 and p2 > 0:
 140                                         return json.loads(line[p1+18:p2])
 141                         if "'PLAYER_CONFIG': " in line:
 142                                 p1 = line.find(":")
 143                                 if p1 >= 0:
 144                                         player_config = json.loads(line[p1+1:])
 145                                         convert_from_old_itag(player_config)
 146                                         return player_config
 147
 148 def decode_signature(js_url, s):
 149         script = urlopen(js_url).read()
 150         func_name = re.search(R"\b([a-z]+)\([a-z]+\.s\);", script).groups()[0]
 151         p1 = script.find("function " + func_name)
 152         p2 = script.find("}", p1)
 153         func_block = script[p1:p2+1]
 154
 155         p = subprocess.Popen(
 156                 "js",
 157                 shell=True,
 158                 close_fds=True,
 159                 stdin=subprocess.PIPE,
 160                 stdout=subprocess.PIPE
 161         )
 162         p.stdin.write(func_block + "\n")
 163         p.stdin.write("console.log(%s('%s'));\n" % (func_name, s))
 164         p.stdin.close()
 165
 166         signature = p.stdout.read().strip()
 167         if p.wait() != 0:
 168                 raise Exception("js failed to execute: %d" % p.returncode)
 169
 170         return signature
 171
 172 def get_best_video(player_config):
 173         url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")
 174         js_url = player_config["assets"]["js"]
 175
 176         best_url = None
 177         best_quality = None
 178         best_extension = None
 179         for url_data in url_data_list:
 180                 url_data = urlparse.parse_qs(url_data)
 181                 mimetype = url_data["type"][0].split(";")[0]
 182                 quality = url_data["quality"][0]
 183
 184                 if quality not in QUALITIES:
 185                         continue
 186                 if mimetype not in MIMETYPES:
 187                         continue
 188
 189                 extension = MIMETYPES[mimetype]
 190                 quality = QUALITIES.get(quality, -1)
 191
 192                 if best_quality is not None and quality < best_quality:
 193                         continue
 194
 195                 video_url = url_data["url"][0]
 196                 if "sig" in url_data:
 197                         signature = url_data["sig"][0]
 198                 else:
 199                         signature = decode_signature(js_url, url_data["s"][0])
 200                 video_url = append_to_qs(video_url, {"signature": signature})
 201
 202                 best_url = video_url
 203                 best_quality = quality
 204                 best_extension = extension
 205
 206         return best_url, best_extension
 207
 208 def sanitize_filename(filename):
 209         return (
 210                 re.sub("\s+", " ", filename.strip())
 211                 .replace("\\", "-")
 212                 .replace("/", "-")
 213                 .replace("\0", " ")
 214         )
 215
 216 def get_video_url(doc):
 217         unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
 218         if unavailable:
 219                 raise VideoUnavailable(unavailable[0].strip())
 220
 221         player_config = get_player_config(doc)
 222         if not player_config:
 223                 raise VideoUnavailable("Could not find video URL")
 224
 225         video_url, extension = get_best_video(player_config)
 226         if not video_url:
 227                 return None, None
 228
 229         title = doc.xpath("/html/head/title/text()")[0]
 230         filename = sanitize_filename(title)
 231         filename += "." + extension
 232
 233         return video_url, filename
 234
 235 def write_video(filename, video_data):
 236         httpinfo = video_data.info()
 237         encoded_filename = urllib.quote(filename.encode("utf-8"))
 238         sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
 239         sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length"))
 240         sys.stdout.write("\r\n")
 241         shutil.copyfileobj(video_data, sys.stdout)
 242         video_data.close()
 243
 244 def cgimain():
 245         args = cgi.parse()
 246         try:
 247                 url = args["url"][0]
 248         except:
 249                 print_form(url="http://www.youtube.com/watch?v=FOOBAR")
 250                 return
 251
 252         try:
 253                 doc = parse_url(url)
 254                 video_url, filename = get_video_url(doc)
 255                 video_data = urlopen(video_url)
 256                 write_video(filename, video_data)
 257         except VideoUnavailable, e:
 258                 print_form(
 259                         url=url,
 260                         msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.message)
 261                 )
 262         except Exception, e:
 263                 print_form(
 264                         url=url,
 265                         msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
 266                 )
 267                 return
 268
 269 def pp_size(size):
 270         suffixes = ["", "KiB", "MiB", "GiB"]
 271         for i, suffix in enumerate(suffixes):
 272                 if size < 1024:
 273                         break
 274                 size /= 1024
 275         return "%.2f %s" % (size, suffix)
 276
 277 def copy_with_progress(content_length, infile, outfile):
 278         def print_status():
 279                 rate = 0
 280                 if now != last_ts:
 281                         rate = last_bytes_read / (now - last_ts)
 282                 sys.stdout.write("\33[2K\r")
 283                 sys.stdout.write("%s / %s (%s/sec)" % (
 284                         pp_size(bytes_read),
 285                         pp_size(content_length),
 286                         pp_size(rate),
 287                 ))
 288                 sys.stdout.flush()
 289
 290         last_ts = 0
 291         last_bytes_read = 0
 292         bytes_read = 0
 293         while True:
 294                 now = time.time()
 295                 if now - last_ts > 0.5:
 296                         print_status()
 297                         last_ts = now
 298                         last_bytes_read = 0
 299
 300                 buf = infile.read(32768)
 301                 if not buf:
 302                         break
 303                 outfile.write(buf)
 304                 last_bytes_read += len(buf)
 305                 bytes_read += len(buf)
 306
 307         # Newline at the end
 308         print_status()
 309         print
 310
 311 def main():
 312         try:
 313                 url = sys.argv[1]
 314         except:
 315                 print >>sys.stderr, "Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0]
 316                 sys.exit(1)
 317
 318         doc = parse_url(url)
 319         video_url, filename = get_video_url(doc)
 320         print "Downloading", filename.encode("utf-8")
 321
 322         outfile = open(filename, "a")
 323         offset = outfile.tell()
 324         if offset > 0:
 325                 print "Resuming download from", pp_size(offset)
 326         total_size = None
 327
 328         while True:
 329                 try:
 330                         video_data = urlopen(video_url, offset)
 331                 except urllib2.HTTPError, e:
 332                         if e.code == 416:
 333                                 print "File is complete!"
 334                                 break
 335                         else:
 336                                 raise
 337
 338                 content_length = int(video_data.info().getheader("Content-Length"))
 339                 if total_size is None:
 340                         total_size = content_length
 341
 342                 try:
 343                         copy_with_progress(content_length, video_data, outfile)
 344                 except IOError, e:
 345                         print
 346
 347                 video_data.close()
 348                 if outfile.tell() != total_size:
 349                         old_offset = offset
 350                         offset = outfile.tell()
 351                         if old_offset == offset:
 352                                 time.sleep(1)
 353                         print "Restarting download from", pp_size(offset)
 354                 else:
 355                         break
 356
 357         outfile.close()
 358
 359
 360 if __name__ == "__main__":
 361 ###     resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
 362         if os.environ.has_key("SCRIPT_NAME"):
 363                 cgimain()
 364         else:
 365                 try:
 366                         main()
 367                 except KeyboardInterrupt:
 368                         print "\nExiting..."
 369                         sys.exit(1)
 370