code.delx.au - youtube-cgi/blob - youtube.cgi

   1 #!/usr/bin/env python
   2
   3 import cookielib
   4 import cgi
   5 import itertools
   6 import json
   7 from lxml import html
   8 import os
   9 import re
  10 import resource
  11 import shutil
  12 import subprocess
  13 import sys
  14 import time
  15 import urllib
  16 import urllib2
  17 import urlparse
  18
  19
# Address-space limit (128 MiB) applied with resource.setrlimit() at startup.
MAX_MEMORY_BYTES = 128 * 1024*1024
# User-Agent header value sent with every request made through urlopen().
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"

# MIME types that get_best_video() accepts, mapped to the file extension
# appended to the saved filename. Streams of any other type are skipped.
MIMETYPES = {
        "video/mp4": "mp4",
        "video/x-flv": "flv",
        "video/3gpp": "3gp",
}

# Ranking of stream quality labels; a larger number wins. Labels missing
# from this table are ranked -1 by get_best_video().
QUALITIES = {
        "large": 3,
        "medium": 2,
        "small": 1,
}
  34
  35
  36 class VideoUnavailable(Exception):
  37         pass
  38
  39 def print_form(url="", msg=""):
  40         script_url = "http://%s%s" % (os.environ["HTTP_HOST"], os.environ["REQUEST_URI"])
  41         sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
  42         sys.stdout.write("""
  43 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  44 <html xmlns="http://www.w3.org/1999/xhtml">
  45 <head>
  46         <title>delx.net.au - YouTube Scraper</title>
  47         <link rel="stylesheet" type="text/css" href="/style.css"/>
  48         <style type="text/css">
  49                 input[type="text"] {
  50                         width: 100%;
  51                 }
  52                 .error {
  53                         color: red;
  54                 }
  55         </style>
  56 </head>
  57 <body>
  58         <h1>delx.net.au - YouTube Scraper</h1>
  59         {0}
  60         <form action="" method="get">
  61         <p>This page will let you easily download YouTube videos to watch offline. It
  62         will automatically grab the highest quality version.</p>
  63         <div><input type="text" name="url" value="{1}"/></div>
  64         <div><input type="submit" value="Download!"/></div>
  65         </form>
  66         <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
  67         to easily download videos. Right-click the link and add it to bookmarks,
  68         then when you're looking at a YouTube page select that bookmark from your
  69         browser's bookmarks menu to download the video straight away.</p>
  70 </body>
  71 </html>
  72 """.replace("{0}", msg).replace("{1}", url).replace("{2}", script_url))
  73
# In-memory cookie store shared by every request made through urlopener.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
# URL of the most recent request; urlopen() sends it as the Referer header
# of the next request and then overwrites it.
referrer = ""
  77
  78 def urlopen(url):
  79         global referrer
  80         req = urllib2.Request(url)
  81         if referrer:
  82                 req.add_header("Referer", referrer)
  83         referrer = url
  84         req.add_header("User-Agent", USER_AGENT)
  85         return urlopener.open(req)
  86
  87 def parse_url(url):
  88         f = urlopen(url)
  89         doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
  90         f.close()
  91         return doc
  92
  93 def append_to_qs(url, params):
  94         r = list(urlparse.urlsplit(url))
  95         qs = urlparse.parse_qs(r[3])
  96         qs.update(params)
  97         r[3] = urllib.urlencode(qs, True)
  98         url = urlparse.urlunsplit(r)
  99         return url
 100
 101 def convert_from_old_itag(player_config):
 102         url_data = urlparse.parse_qs(player_config["args"]["url_encoded_fmt_stream_map"])
 103         url_data["url"] = []
 104         for itag_url in url_data["itag"]:
 105                 pos = itag_url.find("url=")
 106                 url_data["url"].append(itag_url[pos+4:])
 107         player_config["args"]["url_encoded_fmt_stream_map"] = urllib.urlencode(url_data, True)
 108
 109 def get_player_config(doc):
 110         player_config = None
 111         for script in doc.xpath("//script"):
 112                 if not script.text:
 113                         continue
 114                 for line in script.text.split("\n"):
 115                         if "yt.playerConfig =" in line:
 116                                 p1 = line.find("=")
 117                                 p2 = line.rfind(";")
 118                                 if p1 >= 0 and p2 > 0:
 119                                         return json.loads(line[p1+1:p2])
 120                         if "'PLAYER_CONFIG': " in line:
 121                                 p1 = line.find(":")
 122                                 if p1 >= 0:
 123                                         player_config = json.loads(line[p1+1:])
 124                                         convert_from_old_itag(player_config)
 125                                         return player_config
 126
 127 def get_best_video(player_config):
 128         url_data = urlparse.parse_qs(player_config["args"]["url_encoded_fmt_stream_map"])
 129         url_data = itertools.izip_longest(
 130                 url_data["url"],
 131                 url_data["type"],
 132                 url_data["quality"],
 133                 url_data.get("sig", []),
 134         )
 135         best_url = None
 136         best_quality = None
 137         best_extension = None
 138         for video_url, mimetype, quality, signature in url_data:
 139                 mimetype = mimetype.split(";")[0]
 140                 if mimetype not in MIMETYPES:
 141                         continue
 142                 extension = MIMETYPES[mimetype]
 143                 quality = QUALITIES.get(quality.split(",")[0], -1)
 144                 if best_quality is None or quality > best_quality:
 145                         if signature:
 146                                 video_url = append_to_qs(video_url, {"signature": signature})
 147                         best_url = video_url
 148                         best_quality = quality
 149                         best_extension = extension
 150
 151         return best_url, best_extension
 152
 153 def sanitize_filename(filename):
 154         return (
 155                 re.sub("\s+", " ", filename.strip())
 156                 .replace("\\", "-")
 157                 .replace("/", "-")
 158                 .replace("\0", " ")
 159         )
 160
 161 def get_video_url(doc):
 162         unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
 163         if unavailable:
 164                 raise VideoUnavailable(unavailable[0].strip())
 165
 166         player_config = get_player_config(doc)
 167         if not player_config:
 168                 raise VideoUnavailable("Could not find video URL")
 169
 170         video_url, extension = get_best_video(player_config)
 171         if not video_url:
 172                 return None, None
 173
 174         title = doc.xpath("/html/head/title/text()")[0]
 175         filename = sanitize_filename(title)
 176         filename += "." + extension
 177
 178         return video_url, filename
 179
 180 def write_video(filename, video_data):
 181         httpinfo = video_data.info()
 182         encoded_filename = urllib.quote(filename.encode("utf-8"))
 183         sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
 184         sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length"))
 185         sys.stdout.write("\r\n")
 186         shutil.copyfileobj(video_data, sys.stdout)
 187         video_data.close()
 188
 189 def cgimain():
 190         args = cgi.parse()
 191         try:
 192                 url = args["url"][0]
 193         except:
 194                 print_form(url="http://www.youtube.com/watch?v=FOOBAR")
 195                 return
 196
 197         try:
 198                 doc = parse_url(url)
 199                 video_url, filename = get_video_url(doc)
 200                 video_data = urlopen(video_url)
 201                 write_video(filename, video_data)
 202         except VideoUnavailable, e:
 203                 print_form(
 204                         url=url,
 205                         msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.message)
 206                 )
 207         except Exception, e:
 208                 print_form(
 209                         url=url,
 210                         msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
 211                 )
 212                 return
 213
 214 def copy_with_progress(total_size, infile, outfile):
 215         def pp_size(size):
 216                 suffixes = ["", "KiB", "MiB", "GiB"]
 217                 for i, suffix in enumerate(suffixes):
 218                         if size < 1024:
 219                                 break
 220                         size /= 1024
 221                 return "%d %s" % (size, suffix)
 222
 223         start_ts = time.time()
 224         last_ts = 0
 225         bytes_read = 0
 226         while True:
 227                 now = time.time()
 228                 if now - last_ts > 0.5:
 229                         last_ts = now
 230                         sys.stdout.write("\33[2K\r")
 231                         sys.stdout.write("%s / %s (%s/sec)" % (
 232                                 pp_size(bytes_read),
 233                                 pp_size(total_size),
 234                                 pp_size(bytes_read / (now - start_ts)),
 235                         ))
 236                         sys.stdout.flush()
 237
 238                 buf = infile.read(32768)
 239                 if not buf:
 240                         break
 241                 outfile.write(buf)
 242                 bytes_read += len(buf)
 243
 244 def main():
 245         try:
 246                 url = sys.argv[1]
 247         except:
 248                 print >>sys.stderr, "Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0]
 249                 sys.exit(1)
 250         doc = parse_url(url)
 251         video_url, filename = get_video_url(doc)
 252         video_data = urlopen(video_url)
 253         outfile = open(filename, "w")
 254         total_size = int(video_data.info().getheader("Content-Length"))
 255         print "Downloading", filename.encode("utf-8")
 256         copy_with_progress(total_size, video_data, outfile)
 257         video_data.close()
 258         outfile.close()
 259
 260
 261 if __name__ == "__main__":
 262         resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
 263         if os.environ.has_key("SCRIPT_NAME"):
 264                 cgimain()
 265         else:
 266                 try:
 267                         main()
 268                 except KeyboardInterrupt:
 269                         print "\nExiting..."
 270                         sys.exit(1)
 271