code.delx.au - youtube-cgi/blob - youtube.cgi

   1 #!/usr/bin/env python
   2
   3 from __future__ import division
   4
   5 import cookielib
   6 import cgi
   7 import itertools
   8 import json
   9 from lxml import html
  10 import os
  11 import re
  12 import resource
  13 import shutil
  14 import subprocess
  15 import sys
  16 import time
  17 import urllib
  18 import urllib2
  19 import urlparse
  20
  21
  22 MAX_MEMORY_BYTES = 128 * 1024*1024
  23 USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
  24
  25 MIMETYPES = {
  26         "video/mp4": "mp4",
  27         "video/x-flv": "flv",
  28         "video/3gpp": "3gp",
  29 }
  30
  31 QUALITIES = {
  32         "hd1080": 5,
  33         "hd720": 4,
  34         "large": 3,
  35         "medium": 2,
  36         "small": 1,
  37 }
  38
  39
  40 class VideoUnavailable(Exception):
  41         pass
  42
  43 def print_form(url="", msg=""):
  44         script_url = "http://%s%s" % (os.environ["HTTP_HOST"], os.environ["REQUEST_URI"])
  45         sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
  46         sys.stdout.write("""
  47 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  48 <html xmlns="http://www.w3.org/1999/xhtml">
  49 <head>
  50         <title>delx.net.au - YouTube Scraper</title>
  51         <link rel="stylesheet" type="text/css" href="/style.css"/>
  52         <style type="text/css">
  53                 input[type="text"] {
  54                         width: 100%;
  55                 }
  56                 .error {
  57                         color: red;
  58                 }
  59         </style>
  60 </head>
  61 <body>
  62         <h1>delx.net.au - YouTube Scraper</h1>
  63         {0}
  64         <form action="" method="get">
  65         <p>This page will let you easily download YouTube videos to watch offline. It
  66         will automatically grab the highest quality version.</p>
  67         <div><input type="text" name="url" value="{1}"/></div>
  68         <div><input type="submit" value="Download!"/></div>
  69         </form>
  70         <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
  71         to easily download videos. Right-click the link and add it to bookmarks,
  72         then when you're looking at a YouTube page select that bookmark from your
  73         browser's bookmarks menu to download the video straight away.</p>
  74 </body>
  75 </html>
  76 """.replace("{0}", msg).replace("{1}", url).replace("{2}", script_url))
  77
  78 cookiejar = cookielib.CookieJar()
  79 urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
  80 referrer = ""
  81
  82 def urlopen(url, offset=None):
  83         global referrer
  84         req = urllib2.Request(url)
  85         if referrer:
  86                 req.add_header("Referer", referrer)
  87         referrer = url
  88
  89         req.add_header("User-Agent", USER_AGENT)
  90
  91         if offset:
  92                 req.add_header("Range", "bytes=%d-" % offset)
  93
  94         res = urlopener.open(req)
  95
  96         content_range = res.info().getheader("Content-Range")
  97         if content_range:
  98                 tokens = content_range.split()
  99                 assert tokens[0] == "bytes"
 100                 start = int(tokens[1].split("-")[0])
 101                 assert start == offset
 102         return res
 103
 104 def parse_url(url):
 105         f = urlopen(url)
 106         doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
 107         f.close()
 108         return doc
 109
 110 def append_to_qs(url, params):
 111         r = list(urlparse.urlsplit(url))
 112         qs = urlparse.parse_qs(r[3])
 113         qs.update(params)
 114         r[3] = urllib.urlencode(qs, True)
 115         url = urlparse.urlunsplit(r)
 116         return url
 117
 118 def convert_from_old_itag(player_config):
 119         url_data = urlparse.parse_qs(player_config["args"]["url_encoded_fmt_stream_map"])
 120         url_data["url"] = []
 121         for itag_url in url_data["itag"]:
 122                 pos = itag_url.find("url=")
 123                 url_data["url"].append(itag_url[pos+4:])
 124         player_config["args"]["url_encoded_fmt_stream_map"] = urllib.urlencode(url_data, True)
 125
 126 def get_player_config(doc):
 127         player_config = None
 128         for script in doc.xpath("//script"):
 129                 if not script.text:
 130                         continue
 131                 for line in script.text.split("\n"):
 132                         if "yt.playerConfig =" in line:
 133                                 p1 = line.find("=")
 134                                 p2 = line.rfind(";")
 135                                 if p1 >= 0 and p2 > 0:
 136                                         return json.loads(line[p1+1:p2])
 137                         if "'PLAYER_CONFIG': " in line:
 138                                 p1 = line.find(":")
 139                                 if p1 >= 0:
 140                                         player_config = json.loads(line[p1+1:])
 141                                         convert_from_old_itag(player_config)
 142                                         return player_config
 143
 144 def get_best_video(player_config):
 145         url_data = urlparse.parse_qs(player_config["args"]["url_encoded_fmt_stream_map"])
 146         url_data = itertools.izip_longest(
 147                 url_data["url"],
 148                 url_data["type"],
 149                 url_data["quality"],
 150                 url_data.get("sig", []),
 151         )
 152         best_url = None
 153         best_quality = None
 154         best_extension = None
 155         for video_url, mimetype, quality, signature in url_data:
 156                 mimetype = mimetype.split(";")[0]
 157                 if mimetype not in MIMETYPES:
 158                         continue
 159                 extension = MIMETYPES[mimetype]
 160                 quality = QUALITIES.get(quality.split(",")[0], -1)
 161                 if best_quality is None or quality > best_quality:
 162                         if signature:
 163                                 video_url = append_to_qs(video_url, {"signature": signature})
 164                         best_url = video_url
 165                         best_quality = quality
 166                         best_extension = extension
 167
 168         return best_url, best_extension
 169
 170 def sanitize_filename(filename):
 171         return (
 172                 re.sub("\s+", " ", filename.strip())
 173                 .replace("\\", "-")
 174                 .replace("/", "-")
 175                 .replace("\0", " ")
 176         )
 177
 178 def get_video_url(doc):
 179         unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
 180         if unavailable:
 181                 raise VideoUnavailable(unavailable[0].strip())
 182
 183         player_config = get_player_config(doc)
 184         if not player_config:
 185                 raise VideoUnavailable("Could not find video URL")
 186
 187         video_url, extension = get_best_video(player_config)
 188         if not video_url:
 189                 return None, None
 190
 191         title = doc.xpath("/html/head/title/text()")[0]
 192         filename = sanitize_filename(title)
 193         filename += "." + extension
 194
 195         return video_url, filename
 196
 197 def write_video(filename, video_data):
 198         httpinfo = video_data.info()
 199         encoded_filename = urllib.quote(filename.encode("utf-8"))
 200         sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
 201         sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length"))
 202         sys.stdout.write("\r\n")
 203         shutil.copyfileobj(video_data, sys.stdout)
 204         video_data.close()
 205
 206 def cgimain():
 207         args = cgi.parse()
 208         try:
 209                 url = args["url"][0]
 210         except:
 211                 print_form(url="http://www.youtube.com/watch?v=FOOBAR")
 212                 return
 213
 214         try:
 215                 doc = parse_url(url)
 216                 video_url, filename = get_video_url(doc)
 217                 video_data = urlopen(video_url)
 218                 write_video(filename, video_data)
 219         except VideoUnavailable, e:
 220                 print_form(
 221                         url=url,
 222                         msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.message)
 223                 )
 224         except Exception, e:
 225                 print_form(
 226                         url=url,
 227                         msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
 228                 )
 229                 return
 230
 231 def pp_size(size):
 232         suffixes = ["", "KiB", "MiB", "GiB"]
 233         for i, suffix in enumerate(suffixes):
 234                 if size < 1024:
 235                         break
 236                 size /= 1024
 237         return "%.2f %s" % (size, suffix)
 238
 239 def copy_with_progress(content_length, infile, outfile):
 240         def print_status():
 241                 sys.stdout.write("\33[2K\r")
 242                 sys.stdout.write("%s / %s (%s/sec)" % (
 243                         pp_size(bytes_read),
 244                         pp_size(content_length),
 245                         pp_size(bytes_read / (now - start_ts)),
 246                 ))
 247                 sys.stdout.flush()
 248
 249         start_ts = time.time()
 250         last_ts = 0
 251         bytes_read = 0
 252         while True:
 253                 now = time.time()
 254                 if now - last_ts > 0.5:
 255                         last_ts = now
 256                         print_status()
 257
 258                 buf = infile.read(32768)
 259                 if not buf:
 260                         break
 261                 outfile.write(buf)
 262                 bytes_read += len(buf)
 263
 264         # Newline at the end
 265         print_status()
 266         print
 267
 268 def main():
 269         try:
 270                 url = sys.argv[1]
 271         except:
 272                 print >>sys.stderr, "Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0]
 273                 sys.exit(1)
 274
 275         doc = parse_url(url)
 276         video_url, filename = get_video_url(doc)
 277         print "Downloading", filename.encode("utf-8")
 278
 279         outfile = open(filename, "a")
 280         offset = outfile.tell()
 281         if offset > 0:
 282                 print "Resuming download from", pp_size(offset)
 283         total_size = None
 284
 285         while True:
 286                 try:
 287                         video_data = urlopen(video_url, offset)
 288                 except urllib2.HTTPError, e:
 289                         if e.code == 416:
 290                                 print "File is complete!"
 291                                 break
 292                         else:
 293                                 raise
 294
 295                 content_length = int(video_data.info().getheader("Content-Length"))
 296                 if total_size is None:
 297                         total_size = content_length
 298
 299                 try:
 300                         copy_with_progress(content_length, video_data, outfile)
 301                 except IOError, e:
 302                         print
 303
 304                 video_data.close()
 305                 if outfile.tell() != total_size:
 306                         old_offset = offset
 307                         offset = outfile.tell()
 308                         if old_offset == offset:
 309                                 time.sleep(1)
 310                         print "Restarting download from", pp_size(offset)
 311                 else:
 312                         break
 313
 314         outfile.close()
 315
 316
 317 if __name__ == "__main__":
 318         resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
 319         if os.environ.has_key("SCRIPT_NAME"):
 320                 cgimain()
 321         else:
 322                 try:
 323                         main()
 324                 except KeyboardInterrupt:
 325                         print "\nExiting..."
 326                         sys.exit(1)
 327