X-Git-Url: https://code.delx.au/webdl/blobdiff_plain/50e95f75c852567542c10925a524bf3c7f3e5ab5..4f56fa3c6862c952d351d00dd8b056cbdc8ea2fd:/common.py

diff --git a/common.py b/common.py
index 2a5e378..17c160d 100644
--- a/common.py
+++ b/common.py
@@ -1,21 +1,19 @@
-from lxml import etree, html
-import cookielib
+import python2_compat
+
+import hashlib
+import http.cookiejar
 import json
-try:
-    import hashlib
-except ImportError:
-    import md5 as hashlib
+import logging
+import lxml.etree
+import lxml.html
 import os
 import re
 import shutil
 import signal
 import subprocess
-import sys
-import tempfile
 import time
-import urllib
-import urllib2
-import urlparse
+import urllib.parse
+import urllib.request
 
 
 try:
@@ -24,9 +22,20 @@ try:
 except ImportError:
     pass
 
-CACHE_DIR = os.path.expanduser("~/.cache/webdl")
+
+logging.basicConfig(
+    format = "%(levelname)s %(message)s",
+    level = logging.INFO if os.environ.get("DEBUG", None) is None else logging.DEBUG,
+)
+
+CACHE_DIR = os.path.join(
+    os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")),
+    "webdl"
+)
+
 USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:21.0) Gecko/20100101 Firefox/21.0"
+
 
 class Node(object):
     def __init__(self, title, parent=None):
         self.title = title
@@ -57,9 +66,6 @@ def load_root_node():
     import sbs
     sbs.fill_nodes(root_node)
 
-    import plus7
-    plus7.fill_nodes(root_node)
-
     import brightcove
     brightcove.fill_nodes(root_node)
 
@@ -67,39 +73,51 @@
 
 valid_chars = frozenset("-_.()!@#%^ abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
 def sanify_filename(filename):
-    filename = filename.encode("ascii", "ignore")
     filename = "".join(c for c in filename if c in valid_chars)
+    assert len(filename) > 0
     return filename
 
-cookiejar = cookielib.CookieJar()
-urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
+def ensure_scheme(url):
+    parts = urllib.parse.urlparse(url)
+    if parts.scheme:
+        return url
+    parts = list(parts)
+    parts[0] = "http"
+    return urllib.parse.urlunparse(parts)
+
+cookiejar = http.cookiejar.CookieJar()
+urlopener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookiejar))
 def _urlopen(url, referrer=None):
-    req = urllib2.Request(url)
+    url = ensure_scheme(url)
+    req = urllib.request.Request(url)
     req.add_header("User-Agent", USER_AGENT)
     if referrer:
         req.add_header("Referer", referrer)
     return urlopener.open(req)
 
 def urlopen(url, max_age):
-### print url
+    logging.debug("urlopen(%r, %r)", url, max_age)
+
     if not os.path.isdir(CACHE_DIR):
         os.makedirs(CACHE_DIR)
 
     if max_age <= 0:
         return _urlopen(url)
 
-    filename = hashlib.md5(url).hexdigest()
+    filename = hashlib.md5(url.encode("utf-8")).hexdigest()
     filename = os.path.join(CACHE_DIR, filename)
     if os.path.exists(filename):
         file_age = int(time.time()) - os.path.getmtime(filename)
         if file_age < max_age:
-            return open(filename)
+            logging.debug("loading from cache: %s", filename)
+            return open(filename, "rb")
 
+    logging.debug("downloading: %s -> %s", url, filename)
     src = _urlopen(url)
     dst = open(filename, "wb")
     try:
         shutil.copyfileobj(src, dst)
-    except Exception, e:
+    except Exception as e:
         try:
             os.unlink(filename)
         except OSError:
@@ -108,7 +126,7 @@ def urlopen(url, max_age):
         src.close()
         dst.close()
 
-    return open(filename)
+    return open(filename, "rb")
 
 def grab_text(url, max_age):
     f = urlopen(url, max_age)
@@ -118,45 +136,47 @@
 
 def grab_html(url, max_age):
     f = urlopen(url, max_age)
-    doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
+    doc = lxml.html.parse(f, lxml.html.HTMLParser(encoding="utf-8", recover=True))
     f.close()
     return doc
 
 def grab_xml(url, max_age):
     f = urlopen(url, max_age)
-    doc = etree.parse(f, etree.XMLParser(encoding="utf-8", recover=True))
+    doc = lxml.etree.parse(f, lxml.etree.XMLParser(encoding="utf-8", recover=True))
     f.close()
     return doc
 
 def grab_json(url, max_age, skip_assignment=False, skip_function=False):
     f = urlopen(url, max_age)
+    text = f.read().decode("utf-8")
+
     if skip_assignment:
-        text = f.read()
         pos = text.find("=")
-        doc = json.loads(text[pos+1:])
+        text = text[pos+1:]
+
     elif skip_function:
-        text = f.read()
         pos = text.find("(")
         rpos = text.rfind(")")
-        doc = json.loads(text[pos+1:rpos])
-    else:
-        doc = json.load(f)
+        text = text[pos+1:rpos]
+
+    doc = json.loads(text)
     f.close()
     return doc
 
 def exec_subprocess(cmd):
+    logging.debug("Executing: %s", cmd)
     try:
         p = subprocess.Popen(cmd)
         ret = p.wait()
         if ret != 0:
-            print >>sys.stderr, cmd[0], "exited with error code:", ret
+            logging.error("%s exited with error code: %s", cmd[0], ret)
             return False
         else:
             return True
-    except OSError, e:
-        print >>sys.stderr, "Failed to run", cmd[0], e
+    except OSError as e:
+        logging.error("Failed to run: %s -- %s", cmd[0], e)
     except KeyboardInterrupt:
-        print "Cancelled", cmd
+        logging.info("Cancelled: %s", cmd)
         try:
             p.terminate()
             p.wait()
@@ -168,59 +188,81 @@
 
 def check_command_exists(cmd):
     try:
-        subprocess.check_output(cmd)
+        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
         return True
     except Exception:
         return False
 
-def generate_remux_cmd(infile, outfile):
-    if check_command_exists(["avconv", "--help"]):
-        return [
-            "avconv",
-            "-i", infile,
-            "-bsf:a", "aac_adtstoasc",
-            "-acodec", "copy",
-            "-vcodec", "copy",
-            outfile,
-        ]
-
-    if check_command_exists(["ffmpeg", "--help"]):
-        return [
-            "ffmpeg",
-            "-i", infile,
-            "-bsf:a", "aac_adtstoasc",
-            "-acodec", "copy",
-            "-vcodec", "copy",
-            outfile,
-        ]
+def find_ffmpeg():
+    for ffmpeg in ["avconv", "ffmpeg"]:
+        if check_command_exists([ffmpeg, "--help"]):
+            return ffmpeg
+
+    raise Exception("You must install ffmpeg or libav-tools")
+
+def find_ffprobe():
+    for ffprobe in ["avprobe", "ffprobe"]:
+        if check_command_exists([ffprobe, "--help"]):
+            return ffprobe
 
     raise Exception("You must install ffmpeg or libav-tools")
 
+def get_duration(filename):
+    ffprobe = find_ffprobe()
+
+    cmd = [
+        ffprobe,
+        filename,
+        "-show_format_entry", "duration",
+        "-v", "quiet",
+    ]
+    output = subprocess.check_output(cmd).decode("utf-8")
+    for line in output.split("\n"):
+        if line.startswith("duration="):
+            return float(line.split("=")[1])
+
+    raise Exception("Unable to determine video duration of " + filename)
+
+def check_video_durations(flv_filename, mp4_filename):
+    flv_duration = get_duration(flv_filename)
+    mp4_duration = get_duration(mp4_filename)
+
+    if abs(flv_duration - mp4_duration) > 1:
+        logging.error(
+            "The duration of %s is suspicious, did the remux fail? Expected %s == %s",
+            mp4_filename, flv_duration, mp4_duration
+        )
+        return False
+
+    return True
+
 def remux(infile, outfile):
-    print "Converting %s to mp4" % infile
-    cmd = generate_remux_cmd(infile, outfile)
+    logging.info("Converting %s to mp4", infile)
+
+    ffmpeg = find_ffmpeg()
+    cmd = [
+        ffmpeg,
+        "-i", infile,
+        "-bsf:a", "aac_adtstoasc",
+        "-acodec", "copy",
+        "-vcodec", "copy",
+        outfile,
+    ]
     if not exec_subprocess(cmd):
-        # failed, error has already been logged
         return False
-    try:
-        flv_size = os.stat(infile).st_size
-        mp4_size = os.stat(outfile).st_size
-        if abs(flv_size - mp4_size) < 0.1 * flv_size:
-            os.unlink(infile)
-            return True
-        else:
-            print >>sys.stderr, "The size of", outfile, "is suspicious, did avconv fail?"
-            return False
-    except Exception, e:
-        print >>sys.stderr, "Conversion failed", e
+
+    if not check_video_durations(infile, outfile):
         return False
 
+    os.unlink(infile)
+    return True
+
 def convert_to_mp4(filename):
-    with open(filename) as f:
+    with open(filename, "rb") as f:
         fourcc = f.read(4)
 
     basename, ext = os.path.splitext(filename)
-    if ext == ".mp4" and fourcc == "FLV\x01":
+    if ext == ".mp4" and fourcc == b"FLV\x01":
         os.rename(filename, basename + ".flv")
         ext = ".flv"
         filename = basename + ext
@@ -232,137 +274,57 @@
     return ext == ".mp4"
 
-def download_rtmp(filename, vbase, vpath, hash_url=None):
+
+def download_hds(filename, video_url, pvswf=None):
     filename = sanify_filename(filename)
-    print "Downloading: %s" % filename
-    if vpath.endswith(".flv"):
-        vpath = vpath[:-4]
+    logging.info("Downloading: %s", filename)
+
+    video_url = "hds://" + video_url
+    if pvswf:
+        param = "%s pvswf=%s" % (video_url, pvswf)
+    else:
+        param = video_url
+
     cmd = [
-        "rtmpdump",
+        "livestreamer",
         "-o", filename,
-        "-r", vbase,
-        "-y", vpath,
+        param,
+        "best",
     ]
-    if hash_url is not None:
-        cmd += ["--swfVfy", hash_url]
     if exec_subprocess(cmd):
         return convert_to_mp4(filename)
     else:
         return False
 
-def download_urllib(filename, url, referrer=None):
+def download_hls(filename, video_url):
     filename = sanify_filename(filename)
-    print "Downloading: %s" % filename
-    try:
-        src = _urlopen(url, referrer)
-        dst = open(filename, "wb")
-        while True:
-            buf = src.read(1024*1024)
-            if not buf:
-                break
-            dst.write(buf)
-            sys.stdout.write(".")
-            sys.stdout.flush()
-        print
-    except KeyboardInterrupt:
-        print "\nCancelled", url
-        return False
-    finally:
-        try:
-            src.close()
-        except:
-            pass
-        try:
-            dst.close()
-        except:
-            pass
-
-    return convert_to_mp4(filename)
-
-def download_hls_get_stream(url):
-    def parse_bandwidth(line):
-        params = line.split(":", 1)[1].split(",")
-        for kv in params:
-            k, v = kv.split("=", 1)
-            if k == "BANDWIDTH":
-                return int(v)
-        return 0
-
-    m3u8 = grab_text(url, 0)
-    best_bandwidth = None
-    best_url = None
-    for line in m3u8.split("\n"):
-        if line.startswith("#EXT-X-STREAM-INF:"):
-            bandwidth = parse_bandwidth(line)
-            if best_bandwidth is None or bandwidth > best_bandwidth:
-                best_bandwidth = bandwidth
-                best_url = None
-        elif not line.startswith("#"):
-            if best_url is None:
-                best_url = line.strip()
-
-    if not best_url:
-        raise Exception("Failed to find best stream for HLS: " + url)
-
-    return best_url
-
-def download_hls_segments(outf, url):
-    m3u8 = grab_text(url, 0)
-
-    fail_if_not_last_segment = None
-    for line in m3u8.split("\n"):
-        if not line.strip() or line.startswith("#"):
-            continue
-
-        if fail_if_not_last_segment:
-            raise e
+    video_url = "hlsvariant://" + video_url
+    logging.info("Downloading: %s", filename)
 
-        try:
-            download_hls_fetch_segment(outf, line)
-        except urllib2.HTTPError, e:
-            fail_if_not_last_segment = e
-            continue
-
-        sys.stdout.write(".")
-        sys.stdout.flush()
-
-    sys.stdout.write("\n")
-
-def download_hls_fetch_segment(outf, segment_url):
-    try:
-        src = _urlopen(segment_url)
-        shutil.copyfileobj(src, outf)
-    except:
-        raise
-    finally:
-        try:
-            src.close()
-        except:
-            pass
+    cmd = [
+        "livestreamer",
+        "-o", filename,
+        video_url,
+        "best",
+    ]
+    if exec_subprocess(cmd):
+        return convert_to_mp4(filename)
+    else:
+        return False
 
-def download_hls(filename, m3u8_master_url, hack_url_func=None):
+def download_http(filename, video_url):
     filename = sanify_filename(filename)
-    print "Downloading: %s" % filename
-
-    if hack_url_func is None:
-        hack_url_func = lambda url: url
+    logging.info("Downloading: %s", filename)
 
-    tmpdir = tempfile.mkdtemp(prefix="webdl-hls")
-
-    try:
-        best_stream_url = download_hls_get_stream(hack_url_func(m3u8_master_url))
-        ts_file = open(filename, "wb")
-        download_hls_segments(ts_file, hack_url_func(best_stream_url))
-    except KeyboardInterrupt:
-        print "\nCancelled", m3u8_master_url
+    cmd = [
+        "curl",
+        "--fail", "--retry", "3",
+        "-o", filename,
+        video_url,
+    ]
+    if exec_subprocess(cmd):
+        return convert_to_mp4(filename)
+    else:
        return False
-    finally:
-        shutil.rmtree(tmpdir)
-        try:
-            ts_file.close()
-        except:
-            pass
-
-    return convert_to_mp4(filename)
 
 def natural_sort(l, key=None):
     ignore_list = ["a", "the"]
@@ -374,7 +336,7 @@
         for c in re.split("([0-9]+)", k):
             c = c.strip()
             if c.isdigit():
-                newk.append(int(c))
+                newk.append(c.zfill(5))
             else:
                 for subc in c.split():
                     if subc not in ignore_list:
@@ -384,14 +346,14 @@
     return sorted(l, key=key_func)
 
 def append_to_qs(url, params):
-    r = list(urlparse.urlsplit(url))
-    qs = urlparse.parse_qs(r[3])
-    for k, v in params.iteritems():
+    r = list(urllib.parse.urlsplit(url))
+    qs = urllib.parse.parse_qs(r[3])
+    for k, v in params.items():
         if v is not None:
             qs[k] = v
-        elif qs.has_key(k):
+        elif k in qs:
             del qs[k]
-    r[3] = urllib.urlencode(qs, True)
-    url = urlparse.urlunsplit(r)
+    r[3] = urllib.parse.urlencode(sorted(qs.items()), True)
+    url = urllib.parse.urlunsplit(r)
     return url
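
Note (illustration, not part of the commit above): the natural_sort hunk swaps
newk.append(int(c)) for newk.append(c.zfill(5)) because Python 3 refuses to
order mixed int/str sort keys -- comparing a key like ["episode", 2] with
["the"] raises TypeError mid-sort. Zero-padding keeps every key a string while
preserving numeric order for numbers up to five digits. A minimal standalone
sketch of the idea (demo_key is a hypothetical name, not from common.py):

    import re

    def demo_key(k):
        # "Episode 10" -> ["episode ", "00010"]: all-string keys compare
        # cleanly, and zero-padded digits still sort in numeric order.
        return [c.zfill(5) if c.isdigit() else c
                for c in re.split("([0-9]+)", k.lower()) if c.strip()]

    print(sorted(["Episode 10", "Episode 2", "Episode 1"], key=demo_key))
    # ['Episode 1', 'Episode 2', 'Episode 10']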