fixes for recent changes

[youtube-cgi] / youtube.cgi
diff --git a/youtube.cgi b/youtube.cgi

index 2546a148a819c7ddc5fa2ae61f9270a75a70ebc1..56b89371a968318a7851f0209785097039192df5 100755 (executable)
--- a/youtube.cgi
+++ b/youtube.cgi
@@ -1,16 +1,18 @@
-#!/usr/bin/env python
+#!/usr/bin/python2
+
+from __future__ import division
  
  import cookielib
  import cgi
-import itertools
  import json
-from lxml.html import document_fromstring, tostring
+from lxml import html
  import os
  import re
  import resource
  import shutil
  import subprocess
  import sys
+import time
  import urllib
  import urllib2
  import urlparse
@@ -26,6 +28,8 @@ MIMETYPES = {
  }
  
  QUALITIES = {
+       "hd1080": 5,
+       "hd720": 4,
         "large": 3,
         "medium": 2,
         "small": 1,
@@ -37,8 +41,8 @@ class VideoUnavailable(Exception):
  
  def print_form(url="", msg=""):
         script_url = "http://%s%s" % (os.environ["HTTP_HOST"], os.environ["REQUEST_URI"])
-       print "Content-Type: application/xhtml+xml\r\n\r\n"
-       print """
+       sys.stdout.write("Content-Type: application/xhtml+xml\r\n\r\n")
+       sys.stdout.write("""
  <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
  <html xmlns="http://www.w3.org/1999/xhtml">
  <head>
@@ -68,24 +72,40 @@ def print_form(url="", msg=""):
         browser's bookmarks menu to download the video straight away.</p>
  </body>
  </html>
-""".replace("{0}", msg).replace("{1}", url).replace("{2}", script_url)
+""".replace("{0}", msg).replace("{1}", url).replace("{2}", script_url))
  
  cookiejar = cookielib.CookieJar()
  urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
  referrer = ""
  
-def urlopen(url):
+def urlopen(url, offset=None):
+       if url.startswith("//"):
+               url = "http:" + url
+
         global referrer
         req = urllib2.Request(url)
         if referrer:
                 req.add_header("Referer", referrer)
         referrer = url
+
         req.add_header("User-Agent", USER_AGENT)
-       return urlopener.open(req)
+
+       if offset:
+               req.add_header("Range", "bytes=%d-" % offset)
+
+       res = urlopener.open(req)
+
+       content_range = res.info().getheader("Content-Range")
+       if content_range:
+               tokens = content_range.split()
+               assert tokens[0] == "bytes"
+               start = int(tokens[1].split("-")[0])
+               assert start == offset
+       return res
  
  def parse_url(url):
         f = urlopen(url)
-       doc = document_fromstring(f.read())
+       doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
         f.close()
         return doc
  
@@ -97,58 +117,105 @@ def append_to_qs(url, params):
         url = urlparse.urlunsplit(r)
         return url
  
-def convert_from_old_itag(player_config):
-       url_data = urlparse.parse_qs(player_config["args"]["url_encoded_fmt_stream_map"])
-       url_data["url"] = []
-       for itag_url in url_data["itag"]:
-               pos = itag_url.find("url=")
-               url_data["url"].append(itag_url[pos+4:])
-       player_config["args"]["url_encoded_fmt_stream_map"] = urllib.urlencode(url_data, True)
-
  def get_player_config(doc):
         player_config = None
         for script in doc.xpath("//script"):
                 if not script.text:
                         continue
                 for line in script.text.split("\n"):
-                       if "yt.playerConfig =" in line:
-                               p1 = line.find("=")
-                               p2 = line.rfind(";")
+                       s = "ytplayer.config = {"
+                       if s in line:
+                               p1 = line.find(s) + len(s) - 1
+                               p2 = line.find("};", p1) + 1
                                 if p1 >= 0 and p2 > 0:
-                                       return json.loads(line[p1+1:p2])
-                       if "'PLAYER_CONFIG': " in line:
-                               p1 = line.find(":")
-                               if p1 >= 0:
-                                       player_config = json.loads(line[p1+1:])
-                                       convert_from_old_itag(player_config)
-                                       return player_config
+                                       return json.loads(line[p1:p2])
  
-def get_best_video(player_config):
-       url_data = urlparse.parse_qs(player_config["args"]["url_encoded_fmt_stream_map"])
-       url_data = itertools.izip_longest(
-               url_data["url"],
-               url_data["type"],
-               url_data["quality"],
-               url_data.get("sig", []),
+def extract_function(output, script, func_name):
+       p1 = script.find("function " + func_name + "(")
+       p2 = script.find("}", p1)
+       code = script[p1:p2+1]
+       output.append(code)
+       deps = re.findall(R"[^\.][= ]([\$0-9a-zA-Z]+)\(", code)
+       deps = set(deps)
+       deps.remove(func_name)
+       for dep in deps:
+               extract_function(output, script, dep)
+
+def decode_signature(js_url, s):
+       script = urlopen(js_url).read()
+       func_name = re.search(R"\b([a-zA-Z]+)\([a-zA-Z]+\.s\);", script).groups()[0]
+
+       codes = []
+       extract_function(codes, script, func_name)
+
+       p = subprocess.Popen(
+               "js",
+               shell=True,
+               close_fds=True,
+               stdin=subprocess.PIPE,
+               stdout=subprocess.PIPE
         )
+       for code in codes:
+               p.stdin.write(code + "\n")
+       p.stdin.write("console.log(%s('%s'));\n" % (func_name, s))
+       p.stdin.close()
+
+       signature = p.stdout.read().strip()
+       if p.wait() != 0:
+               raise Exception("js failed to execute: %d" % p.returncode)
+
+       return signature
+
+def get_best_video(player_config):
+       url_data_list = player_config["args"]["url_encoded_fmt_stream_map"].split(",")
+       js_url = player_config["assets"]["js"]
+
         best_url = None
         best_quality = None
         best_extension = None
-       for video_url, mimetype, quality, signature in url_data:
-               mimetype = mimetype.split(";")[0]
+       for url_data in url_data_list:
+               url_data = urlparse.parse_qs(url_data)
+               mimetype = url_data["type"][0].split(";")[0]
+               quality = url_data["quality"][0]
+
+               if url_data.has_key("stereo3d"):
+                       continue
+               if quality not in QUALITIES:
+                       continue
                 if mimetype not in MIMETYPES:
                         continue
-               extension = "." + MIMETYPES[mimetype]
-               quality = QUALITIES.get(quality.split(",")[0], -1)
-               if best_quality is None or quality > best_quality:
-                       if signature:
-                               video_url = append_to_qs(video_url, {"signature": signature})
-                       best_url = video_url
-                       best_quality = quality
-                       best_extension = extension
+
+               extension = MIMETYPES[mimetype]
+               quality = QUALITIES.get(quality, -1)
+
+               if best_quality is not None and quality < best_quality:
+                       continue
+
+               video_url = url_data["url"][0]
+               if "sig" in url_data:
+                       signature = url_data["sig"][0]
+               elif "s" in url_data:
+                       signature = decode_signature(js_url, url_data["s"][0])
+               else:
+                       signature = None
+
+               if signature:
+                       video_url = append_to_qs(video_url, {"signature": signature})
+
+               best_url = video_url
+               best_quality = quality
+               best_extension = extension
  
         return best_url, best_extension
  
+def sanitize_filename(filename):
+       return (
+               re.sub("\s+", " ", filename.strip())
+               .replace("\\", "-")
+               .replace("/", "-")
+               .replace("\0", " ")
+       )
+
  def get_video_url(doc):
         unavailable = doc.xpath("//div[@id='unavailable-message']/text()")
         if unavailable:
@@ -163,13 +230,20 @@ def get_video_url(doc):
                 return None, None
  
         title = doc.xpath("/html/head/title/text()")[0]
-       title = re.sub("\s+", " ", title.strip())
-       valid_chars = frozenset("-_.() abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
-       filename = "".join(c for c in title.encode("ascii", "ignore") if c in valid_chars)
-       filename += extension
+       filename = sanitize_filename(title)
+       filename += "." + extension
  
         return video_url, filename
  
+def write_video(filename, video_data):
+       httpinfo = video_data.info()
+       encoded_filename = urllib.quote(filename.encode("utf-8"))
+       sys.stdout.write("Content-Disposition: attachment; filename*=UTF-8''%s\r\n" % encoded_filename)
+       sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length"))
+       sys.stdout.write("\r\n")
+       shutil.copyfileobj(video_data, sys.stdout)
+       video_data.close()
+
  def cgimain():
         args = cgi.parse()
         try:
@@ -181,13 +255,8 @@ def cgimain():
         try:
                 doc = parse_url(url)
                 video_url, filename = get_video_url(doc)
-               data = urlopen(video_url)
-               httpinfo = data.info()
-               sys.stdout.write("Content-Disposition: attachment; filename=\"%s\"\r\n" % filename)
-               sys.stdout.write("Content-Length: %s\r\n" % httpinfo.getheader("Content-Length"))
-               sys.stdout.write("\r\n")
-               shutil.copyfileobj(data, sys.stdout)
-               data.close()
+               video_data = urlopen(video_url)
+               write_video(filename, video_data)
         except VideoUnavailable, e:
                 print_form(
                         url=url,
@@ -200,25 +269,105 @@ def cgimain():
                 )
                 return
  
+def pp_size(size):
+       suffixes = ["", "KiB", "MiB", "GiB"]
+       for i, suffix in enumerate(suffixes):
+               if size < 1024:
+                       break
+               size /= 1024
+       return "%.2f %s" % (size, suffix)
+
+def copy_with_progress(content_length, infile, outfile):
+       def print_status():
+               rate = 0
+               if now != last_ts:
+                       rate = last_bytes_read / (now - last_ts)
+               sys.stdout.write("\33[2K\r")
+               sys.stdout.write("%s / %s (%s/sec)" % (
+                       pp_size(bytes_read),
+                       pp_size(content_length),
+                       pp_size(rate),
+               ))
+               sys.stdout.flush()
+
+       last_ts = 0
+       last_bytes_read = 0
+       bytes_read = 0
+       while True:
+               now = time.time()
+               if now - last_ts > 0.5:
+                       print_status()
+                       last_ts = now
+                       last_bytes_read = 0
+
+               buf = infile.read(32768)
+               if not buf:
+                       break
+               outfile.write(buf)
+               last_bytes_read += len(buf)
+               bytes_read += len(buf)
+
+       # Newline at the end
+       print_status()
+       print
+
  def main():
         try:
                 url = sys.argv[1]
         except:
                 print >>sys.stderr, "Usage: %s http://youtube.com/watch?v=FOOBAR" % sys.argv[0]
                 sys.exit(1)
+
         doc = parse_url(url)
         video_url, filename = get_video_url(doc)
-       data = urlopen(video_url)
-       outfile = open(filename, "w")
-       shutil.copyfileobj(data, outfile)
-       data.close()
+       print "Downloading", filename.encode("utf-8")
+
+       outfile = open(filename, "a")
+       offset = outfile.tell()
+       if offset > 0:
+               print "Resuming download from", pp_size(offset)
+       total_size = None
+
+       while True:
+               try:
+                       video_data = urlopen(video_url, offset)
+               except urllib2.HTTPError, e:
+                       if e.code == 416:
+                               print "File is complete!"
+                               break
+                       else:
+                               raise
+
+               content_length = int(video_data.info().getheader("Content-Length"))
+               if total_size is None:
+                       total_size = content_length
+
+               try:
+                       copy_with_progress(content_length, video_data, outfile)
+               except IOError, e:
+                       print
+
+               video_data.close()
+               if outfile.tell() != total_size:
+                       old_offset = offset
+                       offset = outfile.tell()
+                       if old_offset == offset:
+                               time.sleep(1)
+                       print "Restarting download from", pp_size(offset)
+               else:
+                       break
+
         outfile.close()
  
  
  if __name__ == "__main__":
-       resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
+###    resource.setrlimit(resource.RLIMIT_AS, (MAX_MEMORY_BYTES, MAX_MEMORY_BYTES))
         if os.environ.has_key("SCRIPT_NAME"):
                 cgimain()
         else:
-               main()
+               try:
+                       main()
+               except KeyboardInterrupt:
+                       print "\nExiting..."
+                       sys.exit(1)