code.delx.au - youtube-cgi/blob - youtube.cgi

   1 #!/usr/bin/env python
   2
   3 from __future__ import division
   4
   5 import cookielib
   6 import cgi
   7 import json
   8 from lxml import html
   9 import os
  10 import re
  11 import resource
  12 import shutil
  13 import subprocess
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18 import urlparse
  19
  20
  21 MAX_MEMORY_BYTES = 128 * 1024*1024
  22 USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
  23
  24 MIMETYPES = {
  25         "video/mp4": "mp4",
  26         "video/x-flv": "flv",
  27         "video/3gpp": "3gp",
  28 }
  29
  30 QUALITIES = {
  31         "hd1080": 5,
  32         "hd720": 4,
  33         "large": 3,
  34         "medium": 2,
  35         "small": 1,
  36 }
  37
  38
  39 class VideoUnavailable(Exception):
  40         pass
  41
  42 def print_form(url="", msg=""):
  43         script_url = "http://%s%s" % (os.environ["HTTP_HOST"], os.environ["REQUEST_URI"])
  44         sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
  45         sys.stdout.write("""
  46 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  47 <html xmlns="http://www.w3.org/1999/xhtml">
  48 <head>
  49         <title>delx.net.au - YouTube Scraper</title>
  50         <link rel="stylesheet" type="text/css" href="/style.css"/>
  51         <style type="text/css">
  52                 input[type="text"] {
  53                         width: 100%;
  54                 }
  55                 .error {
  56                         color: red;
  57                 }
  58         </style>
  59 </head>
  60 <body>
  61         <h1>delx.net.au - YouTube Scraper</h1>
  62         {0}
  63         <form action="" method="get">
  64         <p>This page will let you easily download YouTube videos to watch offline. It
  65         will automatically grab the highest quality version.</p>
  66         <div><input type="text" name="url" value="{1}"/></div>
  67         <div><input type="submit" value="Download!"/></div>
  68         </form>
  69         <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
  70         to easily download videos. Right-click the link and add it to bookmarks,
  71         then when you're looking at a YouTube page select that bookmark from your
  72         browser's bookmarks menu to download the video straight away.</p>
  73 </body>
  74 </html>
  75 """.replace("{0}", msg).replace("{1}", url).replace("{2}", script_url))
  76
  77 cookiejar = cookielib.CookieJar()
  78 urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
  79 referrer = ""
  80
  81 def urlopen(url, offset=None):
  82         global referrer
  83         req = urllib2.Request(url)
  84         if referrer:
  85                 req.add_header("Referer", referrer)
  86         referrer = url
  87
  88         req.add_header("User-Agent", USER_AGENT)
  89
  90         if offset:
  91                 req.add_header("Range", "bytes=%d-" % offset)
  92
  93         res = urlopener.open(req)
  94
  95         content_range = res.info().getheader("Content-Range")
  96         if content_range:
  97                 tokens = content_range.split()
  98                 assert tokens[0] == "bytes"
  99                 start = int(tokens[1].split("-")[0])
 100                 assert start == offset
 101         return res
 102
 103 def parse_url(url):
 104         f = urlopen(url)
 105         doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
 106         f.close()
 107         return doc
 108
 109 def append_to_qs(url, params):
 110         r = list(urlparse.urlsplit(url))
 111         qs = urlparse.parse_qs(r[3])
 112         qs.update(params)
 113         r[3] = urllib.urlencode(qs, True)
 114         url = urlparse.urlunsplit(r)
 115         return url
 116
 117 def convert_from_old_itag(player_config):
 118         url_data = urlparse.parse_qs(player_config["args"]["url_encoded_fmt_stream_map"])
 119         url_data["url"] = []
 120         for itag_url in url_data["itag"]:
 121                 pos = itag_url.find("url=")
 122                 url_data["url"].append(itag_url[pos+4:])
 123         player_config["args"]["url_encoded_fmt_stream_map"] = urllib.urlencode(url_data, True)
 124
 125 def get_player_config(doc):
 126         player_config = None
 127         for script in doc.xpath("//script"):
 128                 if not script.text:
 129                         continue
 130                 for line in script.text.split("\n"):
 131                         if "yt.playerConfig =" in line:
 132                                 p1 = line.find("=")
 133                                 p2 = line.rfind(";")
 134                                 if p1 >= 0 and p2 > 0:
 135                                         return json.loads(line[p1+1:p2])
 136                         if "ytplayer.config =" in line:
 137                                 p1 = line.find("ytplayer.config =")
 138                                 p2 = line.rfind(";")
 139                                 if p1 >= 0 and p2 > 0:
 140                                         return json.loads(line[p1+18:p2])
 141                         if "'PLAYER_CONFIG': " in line:
 142                                 p1 = line.find(":")
 143                                 if p1 >= 0:
 144                                         player_config = json.loads(line[p1+1:])
 145                                         convert_from_old_itag(player_config)
 146                                         return player_config
 147
 148 def get_best_video(player_config):
 149         url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")
 150
 151         best_url = None
 152         best_quality = None
 153         best_extension = None
 154         for url_data in url_data_list:
 155                 url_data = urlparse.parse_qs(url_data)
 156                 video_url = url_data["url"][0]
 157                 mimetype = url_data["type"][0].split(";")[0]
 158                 quality = url_data["quality"][0]
 159                 signature = url_data["sig"][0]
 160
 161                 if quality not in QUALITIES:
 162                         continue
 163                 if mimetype not in MIMETYPES:
 164                         continue
 165
 166                 extension = MIMETYPES[mimetype]
 167                 quality = QUALITIES.get(quality, -1)
 168                 video_url = append_to_qs(video_url, {"signature": signature})
 169
 170                 if best_quality is None or quality > best_quality:
 171                         best_url = video_url
 172                         best_quality = quality
 173                         best_extension = extension
 174
 175         return best_url, best_extension
 176
 177 def sanitize_filename(filename):
 178         return (
 179                 re.sub("\s+", " ", filename.strip())
 180                 .replace("\\", "-")
 181                 .replace("/", "-")
 182                 .replace("\0", " ")
 183         )
 184
 185 def get_video_url(doc):
 186         unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
 187         if unavailable:
 188                 raise VideoUnavailable(unavailable[0].strip())
 189
 190         player_config = get_player_config(doc)
 191         if not player_config:
 192                 raise VideoUnavailable("Could not find video URL")
 193
 194         video_url, extension = get_best_video(player_config)
 195         if not video_url:
 196                 return None, None
 197
 198         title = doc.xpath("/html/head/title/text()")[0]
 199         filename = sanitize_filename(title)
 200         filename += "." + extension
 201
 202         return video_url, filename
 203
 204 def write_video(filename, video_data):
 205         httpinfo = video_data.info()
 206         encoded_filename = urllib.quote(filename.encode("utf-8"))
 207         sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
 208         sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length"))
 209         sys.stdout.write("\r\n")
 210         shutil.copyfileobj(video_data, sys.stdout)
 211         video_data.close()
 212
 213 def cgimain():
 214         args = cgi.parse()
 215         try:
 216                 url = args["url"][0]
 217         except:
 218                 print_form(url="http://www.youtube.com/watch?v=FOOBAR")
 219                 return
 220
 221         try:
 222                 doc = parse_url(url)
 223                 video_url, filename = get_video_url(doc)
 224                 video_data = urlopen(video_url)
 225                 write_video(filename, video_data)
 226         except VideoUnavailable, e:
 227                 print_form(
 228                         url=url,
 229                         msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.message)
 230                 )
 231         except Exception, e:
 232                 print_form(
 233                         url=url,
 234                         msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
 235                 )
 236                 return
 237
 238 def pp_size(size):
 239         suffixes = ["", "KiB", "MiB", "GiB"]
 240         for i, suffix in enumerate(suffixes):
 241                 if size < 1024:
 242                         break
 243                 size /= 1024
 244         return "%.2f %s" % (size, suffix)
 245
 246 def copy_with_progress(content_length, infile, outfile):
 247         def print_status():
 248                 rate = 0
 249                 if now != last_ts:
 250                         rate = last_bytes_read / (now - last_ts)
 251                 sys.stdout.write("\33[2K\r")
 252                 sys.stdout.write("%s / %s (%s/sec)" % (
 253                         pp_size(bytes_read),
 254                         pp_size(content_length),
 255                         pp_size(rate),
 256                 ))
 257                 sys.stdout.flush()
 258
 259         last_ts = 0
 260         last_bytes_read = 0
 261         bytes_read = 0
 262         while True:
 263                 now = time.time()
 264                 if now - last_ts > 0.5:
 265                         print_status()
 266                         last_ts = now
 267                         last_bytes_read = 0
 268
 269                 buf = infile.read(32768)
 270                 if not buf:
 271                         break
 272                 outfile.write(buf)
 273                 last_bytes_read += len(buf)
 274                 bytes_read += len(buf)
 275
 276         # Newline at the end
 277         print_status()
 278         print
 279
 280 def main():
 281         try:
 282                 url = sys.argv[1]
 283         except:
 284                 print >>sys.stderr, "Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0]
 285                 sys.exit(1)
 286
 287         doc = parse_url(url)
 288         video_url, filename = get_video_url(doc)
 289         print "Downloading", filename.encode("utf-8")
 290
 291         outfile = open(filename, "a")
 292         offset = outfile.tell()
 293         if offset > 0:
 294                 print "Resuming download from", pp_size(offset)
 295         total_size = None
 296
 297         while True:
 298                 try:
 299                         video_data = urlopen(video_url, offset)
 300                 except urllib2.HTTPError, e:
 301                         if e.code == 416:
 302                                 print "File is complete!"
 303                                 break
 304                         else:
 305                                 raise
 306
 307                 content_length = int(video_data.info().getheader("Content-Length"))
 308                 if total_size is None:
 309                         total_size = content_length
 310
 311                 try:
 312                         copy_with_progress(content_length, video_data, outfile)
 313                 except IOError, e:
 314                         print
 315
 316                 video_data.close()
 317                 if outfile.tell() != total_size:
 318                         old_offset = offset
 319                         offset = outfile.tell()
 320                         if old_offset == offset:
 321                                 time.sleep(1)
 322                         print "Restarting download from", pp_size(offset)
 323                 else:
 324                         break
 325
 326         outfile.close()
 327
 328
 329 if __name__ == "__main__":
 330         resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
 331         if os.environ.has_key("SCRIPT_NAME"):
 332                 cgimain()
 333         else:
 334                 try:
 335                         main()
 336                 except KeyboardInterrupt:
 337                         print "\nExiting..."
 338                         sys.exit(1)
 339