code.delx.au - youtube-cgi/blob - youtube.cgi

   1 #!/usr/bin/env python
   2
   3 from __future__ import division
   4
   5 import cookielib
   6 import cgi
   7 import json
   8 from lxml import html
   9 import os
  10 import re
  11 import resource
  12 import shutil
  13 import subprocess
  14 import sys
  15 import time
  16 import urllib
  17 import urllib2
  18 import urlparse
  19
  20
  21 MAX_MEMORY_BYTES = 128 * 1024*1024
  22 USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"
  23
  24 MIMETYPES = {
  25         "video/mp4": "mp4",
  26         "video/x-flv": "flv",
  27         "video/3gpp": "3gp",
  28 }
  29
  30 QUALITIES = {
  31         "hd1080": 5,
  32         "hd720": 4,
  33         "large": 3,
  34         "medium": 2,
  35         "small": 1,
  36 }
  37
  38
  39 class VideoUnavailable(Exception):
  40         pass
  41
  42 def print_form(url="", msg=""):
  43         script_url = "http://%s%s" % (os.environ["HTTP_HOST"], os.environ["REQUEST_URI"])
  44         sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
  45         sys.stdout.write("""
  46 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  47 <html xmlns="http://www.w3.org/1999/xhtml">
  48 <head>
  49         <title>delx.net.au - YouTube Scraper</title>
  50         <link rel="stylesheet" type="text/css" href="/style.css"/>
  51         <style type="text/css">
  52                 input[type="text"] {
  53                         width: 100%;
  54                 }
  55                 .error {
  56                         color: red;
  57                 }
  58         </style>
  59 </head>
  60 <body>
  61         <h1>delx.net.au - YouTube Scraper</h1>
  62         {0}
  63         <form action="" method="get">
  64         <p>This page will let you easily download YouTube videos to watch offline. It
  65         will automatically grab the highest quality version.</p>
  66         <div><input type="text" name="url" value="{1}"/></div>
  67         <div><input type="submit" value="Download!"/></div>
  68         </form>
  69         <p>Tip! Use this bookmarklet: <a href="javascript:(function(){window.location='{2}?url='+escape(location);})()">YouTube Download</a>
  70         to easily download videos. Right-click the link and add it to bookmarks,
  71         then when you're looking at a YouTube page select that bookmark from your
  72         browser's bookmarks menu to download the video straight away.</p>
  73 </body>
  74 </html>
  75 """.replace("{0}", msg).replace("{1}", url).replace("{2}", script_url))
  76
  77 cookiejar = cookielib.CookieJar()
  78 urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
  79 referrer = ""
  80
  81 def urlopen(url, offset=None):
  82         global referrer
  83         req = urllib2.Request(url)
  84         if referrer:
  85                 req.add_header("Referer", referrer)
  86         referrer = url
  87
  88         req.add_header("User-Agent", USER_AGENT)
  89
  90         if offset:
  91                 req.add_header("Range", "bytes=%d-" % offset)
  92
  93         res = urlopener.open(req)
  94
  95         content_range = res.info().getheader("Content-Range")
  96         if content_range:
  97                 tokens = content_range.split()
  98                 assert tokens[0] == "bytes"
  99                 start = int(tokens[1].split("-")[0])
 100                 assert start == offset
 101         return res
 102
 103 def parse_url(url):
 104         f = urlopen(url)
 105         doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
 106         f.close()
 107         return doc
 108
 109 def append_to_qs(url, params):
 110         r = list(urlparse.urlsplit(url))
 111         qs = urlparse.parse_qs(r[3])
 112         qs.update(params)
 113         r[3] = urllib.urlencode(qs, True)
 114         url = urlparse.urlunsplit(r)
 115         return url
 116
 117 def convert_from_old_itag(player_config):
 118         url_data = urlparse.parse_qs(player_config["args"]["url_encoded_fmt_stream_map"])
 119         url_data["url"] = []
 120         for itag_url in url_data["itag"]:
 121                 pos = itag_url.find("url=")
 122                 url_data["url"].append(itag_url[pos+4:])
 123         player_config["args"]["url_encoded_fmt_stream_map"] = urllib.urlencode(url_data, True)
 124
 125 def get_player_config(doc):
 126         player_config = None
 127         for script in doc.xpath("//script"):
 128                 if not script.text:
 129                         continue
 130                 for line in script.text.split("\n"):
 131                         if "yt.playerConfig =" in line:
 132                                 p1 = line.find("=")
 133                                 p2 = line.rfind(";")
 134                                 if p1 >= 0 and p2 > 0:
 135                                         return json.loads(line[p1+1:p2])
 136                         if "'PLAYER_CONFIG': " in line:
 137                                 p1 = line.find(":")
 138                                 if p1 >= 0:
 139                                         player_config = json.loads(line[p1+1:])
 140                                         convert_from_old_itag(player_config)
 141                                         return player_config
 142
 143 def get_best_video(player_config):
 144         url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")
 145
 146         best_url = None
 147         best_quality = None
 148         best_extension = None
 149         for url_data in url_data_list:
 150                 url_data = urlparse.parse_qs(url_data)
 151                 video_url = url_data["url"][0]
 152                 mimetype = url_data["type"][0].split(";")[0]
 153                 quality = url_data["quality"][0]
 154                 signature = url_data["sig"][0]
 155
 156                 if quality not in QUALITIES:
 157                         continue
 158                 if mimetype not in MIMETYPES:
 159                         continue
 160
 161                 extension = MIMETYPES[mimetype]
 162                 quality = QUALITIES.get(quality, -1)
 163
 164                 if best_quality is None or quality > best_quality:
 165                         if signature:
 166                                 video_url = append_to_qs(video_url, {"signature": signature})
 167                         best_url = video_url
 168                         best_quality = quality
 169                         best_extension = extension
 170
 171         return best_url, best_extension
 172
 173 def sanitize_filename(filename):
 174         return (
 175                 re.sub("\s+", " ", filename.strip())
 176                 .replace("\\", "-")
 177                 .replace("/", "-")
 178                 .replace("\0", " ")
 179         )
 180
 181 def get_video_url(doc):
 182         unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
 183         if unavailable:
 184                 raise VideoUnavailable(unavailable[0].strip())
 185
 186         player_config = get_player_config(doc)
 187         if not player_config:
 188                 raise VideoUnavailable("Could not find video URL")
 189
 190         video_url, extension = get_best_video(player_config)
 191         if not video_url:
 192                 return None, None
 193
 194         title = doc.xpath("/html/head/title/text()")[0]
 195         filename = sanitize_filename(title)
 196         filename += "." + extension
 197
 198         return video_url, filename
 199
 200 def write_video(filename, video_data):
 201         httpinfo = video_data.info()
 202         encoded_filename = urllib.quote(filename.encode("utf-8"))
 203         sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
 204         sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length"))
 205         sys.stdout.write("\r\n")
 206         shutil.copyfileobj(video_data, sys.stdout)
 207         video_data.close()
 208
 209 def cgimain():
 210         args = cgi.parse()
 211         try:
 212                 url = args["url"][0]
 213         except:
 214                 print_form(url="http://www.youtube.com/watch?v=FOOBAR")
 215                 return
 216
 217         try:
 218                 doc = parse_url(url)
 219                 video_url, filename = get_video_url(doc)
 220                 video_data = urlopen(video_url)
 221                 write_video(filename, video_data)
 222         except VideoUnavailable, e:
 223                 print_form(
 224                         url=url,
 225                         msg="<p class='error'>Sorry, there was an error: %s</p>" % cgi.escape(e.message)
 226                 )
 227         except Exception, e:
 228                 print_form(
 229                         url=url,
 230                         msg="<p class='error'>Sorry, there was an error. Check your URL?</p>"
 231                 )
 232                 return
 233
 234 def pp_size(size):
 235         suffixes = ["", "KiB", "MiB", "GiB"]
 236         for i, suffix in enumerate(suffixes):
 237                 if size < 1024:
 238                         break
 239                 size /= 1024
 240         return "%.2f %s" % (size, suffix)
 241
 242 def copy_with_progress(content_length, infile, outfile):
 243         def print_status():
 244                 rate = 0
 245                 if now != last_ts:
 246                         rate = last_bytes_read / (now - last_ts)
 247                 sys.stdout.write("\33[2K\r")
 248                 sys.stdout.write("%s / %s (%s/sec)" % (
 249                         pp_size(bytes_read),
 250                         pp_size(content_length),
 251                         pp_size(rate),
 252                 ))
 253                 sys.stdout.flush()
 254
 255         last_ts = 0
 256         last_bytes_read = 0
 257         bytes_read = 0
 258         while True:
 259                 now = time.time()
 260                 if now - last_ts > 0.5:
 261                         print_status()
 262                         last_ts = now
 263                         last_bytes_read = 0
 264
 265                 buf = infile.read(32768)
 266                 if not buf:
 267                         break
 268                 outfile.write(buf)
 269                 last_bytes_read += len(buf)
 270                 bytes_read += len(buf)
 271
 272         # Newline at the end
 273         print_status()
 274         print
 275
 276 def main():
 277         try:
 278                 url = sys.argv[1]
 279         except:
 280                 print >>sys.stderr, "Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0]
 281                 sys.exit(1)
 282
 283         doc = parse_url(url)
 284         video_url, filename = get_video_url(doc)
 285         print "Downloading", filename.encode("utf-8")
 286
 287         outfile = open(filename, "a")
 288         offset = outfile.tell()
 289         if offset > 0:
 290                 print "Resuming download from", pp_size(offset)
 291         total_size = None
 292
 293         while True:
 294                 try:
 295                         video_data = urlopen(video_url, offset)
 296                 except urllib2.HTTPError, e:
 297                         if e.code == 416:
 298                                 print "File is complete!"
 299                                 break
 300                         else:
 301                                 raise
 302
 303                 content_length = int(video_data.info().getheader("Content-Length"))
 304                 if total_size is None:
 305                         total_size = content_length
 306
 307                 try:
 308                         copy_with_progress(content_length, video_data, outfile)
 309                 except IOError, e:
 310                         print
 311
 312                 video_data.close()
 313                 if outfile.tell() != total_size:
 314                         old_offset = offset
 315                         offset = outfile.tell()
 316                         if old_offset == offset:
 317                                 time.sleep(1)
 318                         print "Restarting download from", pp_size(offset)
 319                 else:
 320                         break
 321
 322         outfile.close()
 323
 324
 325 if __name__ == "__main__":
 326         resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
 327         if os.environ.has_key("SCRIPT_NAME"):
 328                 cgimain()
 329         else:
 330                 try:
 331                         main()
 332                 except KeyboardInterrupt:
 333                         print "\nExiting..."
 334                         sys.exit(1)
 335