code.delx.au - youtube-cgi/blob - youtube.cgi

   1 #!/usr/bin/env python
   2
   3 from __future__ import division
   4
   5 import cookielib
   6 import cgi
   7 import json
   8 from lxml import html
   9 import os
  10 import re
  11 import resource
  12 import shutil
  13 import subprocess
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18 import urlparse
  19
  20
  21 MAX_MEMORY_BYTES = 128 * 1024*1024
  22 USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
  23
  24 MIMETYPES = {
  25         "video/mp4": "mp4",
  26         "video/x-flv": "flv",
  27         "video/3gpp": "3gp",
  28 }
  29
  30 QUALITIES = {
  31         "hd1080": 5,
  32         "hd720": 4,
  33         "large": 3,
  34         "medium": 2,
  35         "small": 1,
  36 }
  37
  38
  39 class VideoUnavailable(Exception):
  40         pass
  41
  42 def print_form(url="", msg=""):
  43         script_url = "http://%s%s" % (os.environ["HTTP_HOST"], os.environ["REQUEST_URI"])
  44         sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
  45         sys.stdout.write("""
  46 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  47 <html xmlns="http://www.w3.org/1999/xhtml">
  48 <head>
  49         <title>delx.net.au - YouTube Scraper</title>
  50         <link rel="stylesheet" type="text/css" href="/style.css"/>
  51         <style type="text/css">
  52                 input[type="text"] {
  53                         width: 100%;
  54                 }
  55                 .error {
  56                         color: red;
  57                 }
  58         </style>
  59 </head>
  60 <body>
  61         <h1>delx.net.au - YouTube Scraper</h1>
  62         {0}
  63         <form action="" method="get">
  64         <p>This page will let you easily download YouTube videos to watch offline. It
  65         will automatically grab the highest quality version.</p>
  66         <div><input type="text" name="url" value="{1}"/></div>
  67         <div><input type="submit" value="Download!"/></div>
  68         </form>
  69         <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
  70         to easily download videos. Right-click the link and add it to bookmarks,
  71         then when you're looking at a YouTube page select that bookmark from your
  72         browser's bookmarks menu to download the video straight away.</p>
  73 </body>
  74 </html>
  75 """.replace("{0}", msg).replace("{1}", url).replace("{2}", script_url))
  76
# HTTP state shared by every request made through urlopen(): one opener
# backed by one cookie jar, so cookies set by an earlier response are sent
# with later requests.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
# URL of the most recent urlopen() call; sent as the Referer header of the
# next request. Empty until the first request has been made.
referrer = ""
  80
  81 def urlopen(url, offset=None):
  82         global referrer
  83         req = urllib2.Request(url)
  84         if referrer:
  85                 req.add_header("Referer", referrer)
  86         referrer = url
  87
  88         req.add_header("User-Agent", USER_AGENT)
  89
  90         if offset:
  91                 req.add_header("Range", "bytes=%d-" % offset)
  92
  93         res = urlopener.open(req)
  94
  95         content_range = res.info().getheader("Content-Range")
  96         if content_range:
  97                 tokens = content_range.split()
  98                 assert tokens[0] == "bytes"
  99                 start = int(tokens[1].split("-")[0])
 100                 assert start == offset
 101         return res
 102
 103 def parse_url(url):
 104         f = urlopen(url)
 105         doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
 106         f.close()
 107         return doc
 108
 109 def append_to_qs(url, params):
 110         r = list(urlparse.urlsplit(url))
 111         qs = urlparse.parse_qs(r[3])
 112         qs.update(params)
 113         r[3] = urllib.urlencode(qs, True)
 114         url = urlparse.urlunsplit(r)
 115         return url
 116
 117 def convert_from_old_itag(player_config):
 118         url_data = urlparse.parse_qs(player_config["args"]["url_encoded_fmt_stream_map"])
 119         url_data["url"] = []
 120         for itag_url in url_data["itag"]:
 121                 pos = itag_url.find("url=")
 122                 url_data["url"].append(itag_url[pos+4:])
 123         player_config["args"]["url_encoded_fmt_stream_map"] = urllib.urlencode(url_data, True)
 124
 125 def get_player_config(doc):
 126         player_config = None
 127         for script in doc.xpath("//script"):
 128                 if not script.text:
 129                         continue
 130                 for line in script.text.split("\n"):
 131                         if "yt.playerConfig =" in line:
 132                                 p1 = line.find("=")
 133                                 p2 = line.rfind(";")
 134                                 if p1 >= 0 and p2 > 0:
 135                                         return json.loads(line[p1+1:p2])
 136                         if "ytplayer.config =" in line:
 137                                 p1 = line.find("ytplayer.config =")
 138                                 p2 = line.rfind(";")
 139                                 if p1 >= 0 and p2 > 0:
 140                                         return json.loads(line[p1+18:p2])
 141                         if "'PLAYER_CONFIG': " in line:
 142                                 p1 = line.find(":")
 143                                 if p1 >= 0:
 144                                         player_config = json.loads(line[p1+1:])
 145                                         convert_from_old_itag(player_config)
 146                                         return player_config
 147
 148 def decode_signature(js_url, s):
 149         script = urlopen(js_url).read()
 150         func_name = re.search(R"\b([a-z]+)\([a-z]+\.s\);", script).groups()[0]
 151         p1 = script.find("function " + func_name)
 152         p2 = script.find("}", p1)
 153         func_block = script[p1:p2+1]
 154
 155         p = subprocess.Popen(
 156                 ["js"],
 157                 stdin=subprocess.PIPE,
 158                 stdout=subprocess.PIPE
 159         )
 160         p.stdin.write(func_block + "\n")
 161         p.stdin.write("console.log(%s('%s'));\n" % (func_name, s))
 162         p.stdin.close()
 163
 164         signature = p.stdout.read().strip()
 165         if p.wait() != 0:
 166                 raise Exception("js failed to execute: %d" % p.returncode)
 167
 168         return signature
 169
 170 def get_best_video(player_config):
 171         url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")
 172         js_url = player_config["assets"]["js"]
 173
 174         best_url = None
 175         best_quality = None
 176         best_extension = None
 177         for url_data in url_data_list:
 178                 url_data = urlparse.parse_qs(url_data)
 179                 mimetype = url_data["type"][0].split(";")[0]
 180                 quality = url_data["quality"][0]
 181
 182                 if quality not in QUALITIES:
 183                         continue
 184                 if mimetype not in MIMETYPES:
 185                         continue
 186
 187                 extension = MIMETYPES[mimetype]
 188                 quality = QUALITIES.get(quality, -1)
 189
 190                 if best_quality is not None and quality < best_quality:
 191                         continue
 192
 193                 video_url = url_data["url"][0]
 194                 if "sig" in url_data:
 195                         signature = url_data["sig"][0]
 196                 else:
 197                         signature = decode_signature(js_url, url_data["s"][0])
 198                 video_url = append_to_qs(video_url, {"signature": signature})
 199
 200                 best_url = video_url
 201                 best_quality = quality
 202                 best_extension = extension
 203
 204         return best_url, best_extension
 205
 206 def sanitize_filename(filename):
 207         return (
 208                 re.sub("\s+", " ", filename.strip())
 209                 .replace("\\", "-")
 210                 .replace("/", "-")
 211                 .replace("\0", " ")
 212         )
 213
 214 def get_video_url(doc):
 215         unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
 216         if unavailable:
 217                 raise VideoUnavailable(unavailable[0].strip())
 218
 219         player_config = get_player_config(doc)
 220         if not player_config:
 221                 raise VideoUnavailable("Could not find video URL")
 222
 223         video_url, extension = get_best_video(player_config)
 224         if not video_url:
 225                 return None, None
 226
 227         title = doc.xpath("/html/head/title/text()")[0]
 228         filename = sanitize_filename(title)
 229         filename += "." + extension
 230
 231         return video_url, filename
 232
 233 def write_video(filename, video_data):
 234         httpinfo = video_data.info()
 235         encoded_filename = urllib.quote(filename.encode("utf-8"))
 236         sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
 237         sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length"))
 238         sys.stdout.write("\r\n")
 239         shutil.copyfileobj(video_data, sys.stdout)
 240         video_data.close()
 241
 242 def cgimain():
 243         args = cgi.parse()
 244         try:
 245                 url = args["url"][0]
 246         except:
 247                 print_form(url="http://www.youtube.com/watch?v=FOOBAR")
 248                 return
 249
 250         try:
 251                 doc = parse_url(url)
 252                 video_url, filename = get_video_url(doc)
 253                 video_data = urlopen(video_url)
 254                 write_video(filename, video_data)
 255         except VideoUnavailable, e:
 256                 print_form(
 257                         url=url,
 258                         msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.message)
 259                 )
 260         except Exception, e:
 261                 print_form(
 262                         url=url,
 263                         msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
 264                 )
 265                 return
 266
 267 def pp_size(size):
 268         suffixes = ["", "KiB", "MiB", "GiB"]
 269         for i, suffix in enumerate(suffixes):
 270                 if size < 1024:
 271                         break
 272                 size /= 1024
 273         return "%.2f %s" % (size, suffix)
 274
 275 def copy_with_progress(content_length, infile, outfile):
 276         def print_status():
 277                 rate = 0
 278                 if now != last_ts:
 279                         rate = last_bytes_read / (now - last_ts)
 280                 sys.stdout.write("\33[2K\r")
 281                 sys.stdout.write("%s / %s (%s/sec)" % (
 282                         pp_size(bytes_read),
 283                         pp_size(content_length),
 284                         pp_size(rate),
 285                 ))
 286                 sys.stdout.flush()
 287
 288         last_ts = 0
 289         last_bytes_read = 0
 290         bytes_read = 0
 291         while True:
 292                 now = time.time()
 293                 if now - last_ts > 0.5:
 294                         print_status()
 295                         last_ts = now
 296                         last_bytes_read = 0
 297
 298                 buf = infile.read(32768)
 299                 if not buf:
 300                         break
 301                 outfile.write(buf)
 302                 last_bytes_read += len(buf)
 303                 bytes_read += len(buf)
 304
 305         # Newline at the end
 306         print_status()
 307         print
 308
 309 def main():
 310         try:
 311                 url = sys.argv[1]
 312         except:
 313                 print >>sys.stderr, "Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0]
 314                 sys.exit(1)
 315
 316         doc = parse_url(url)
 317         video_url, filename = get_video_url(doc)
 318         print "Downloading", filename.encode("utf-8")
 319
 320         outfile = open(filename, "a")
 321         offset = outfile.tell()
 322         if offset > 0:
 323                 print "Resuming download from", pp_size(offset)
 324         total_size = None
 325
 326         while True:
 327                 try:
 328                         video_data = urlopen(video_url, offset)
 329                 except urllib2.HTTPError, e:
 330                         if e.code == 416:
 331                                 print "File is complete!"
 332                                 break
 333                         else:
 334                                 raise
 335
 336                 content_length = int(video_data.info().getheader("Content-Length"))
 337                 if total_size is None:
 338                         total_size = content_length
 339
 340                 try:
 341                         copy_with_progress(content_length, video_data, outfile)
 342                 except IOError, e:
 343                         print
 344
 345                 video_data.close()
 346                 if outfile.tell() != total_size:
 347                         old_offset = offset
 348                         offset = outfile.tell()
 349                         if old_offset == offset:
 350                                 time.sleep(1)
 351                         print "Restarting download from", pp_size(offset)
 352                 else:
 353                         break
 354
 355         outfile.close()
 356
 357
 358 if __name__ == "__main__":
 359 ###     resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
 360         if os.environ.has_key("SCRIPT_NAME"):
 361                 cgimain()
 362         else:
 363                 try:
 364                         main()
 365                 except KeyboardInterrupt:
 366                         print "\nExiting..."
 367                         sys.exit(1)
 368