X-Git-Url: https://code.delx.au/webdl/blobdiff_plain/5cf33ad1ef4fa0066c9ade22e3e6f151dd218a99..dde99023a3a72815f591afebb0ae770b6d30effb:/common.py

diff --git a/common.py b/common.py
index 6b64896..8b05719 100644
--- a/common.py
+++ b/common.py
@@ -1,131 +1,370 @@
-# vim:ts=4:sts=4:sw=4:noet
-
-from lxml import etree
+import hashlib
+import io
 import json
-try:
-	import hashlib
-except ImportError:
-	import md5 as hashlib
+import logging
+import lxml.etree
+import lxml.html
 import os
+import re
+import requests
+import requests_cache
 import shutil
 import signal
 import subprocess
-import sys
-import tempfile
 import time
-import urllib
+import urllib.parse
+
+USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:74.0) Gecko/20100101 Firefox/74.0"
+
+try:
+    import autosocks
+    autosocks.try_autosocks()
+except ImportError:
+    pass
+
+logging.basicConfig(
+    format = "%(levelname)s %(message)s",
+    level = logging.INFO if os.environ.get("DEBUG", None) is None else logging.DEBUG,
+)
 
-import autosocks
-autosocks.try_autosocks()
+CACHE_FILE = os.path.join(
+    os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")),
+    "webdl",
+    "requests_cache"
+)
+if not os.path.isdir(os.path.dirname(CACHE_FILE)):
+    os.makedirs(os.path.dirname(CACHE_FILE))
+
+requests_cache.install_cache(CACHE_FILE, backend='sqlite', expire_after=3600)
 
-CACHE_DIR = os.path.expanduser("~/.cache/webdl")
 
 class Node(object):
-	def __init__(self, title, parent=None):
-		self.title = title
-		if parent:
-			parent.children.append(self)
-		self.parent = parent
-		self.children = []
-		self.can_download = False
+    def __init__(self, title, parent=None):
+        self.title = title
+        if parent:
+            parent.children.append(self)
+        self.parent = parent
+        self.children = []
+        self.can_download = False
+
+    def get_children(self):
+        if not self.children:
+            self.fill_children()
+            self.children = natural_sort(self.children, key=lambda node: node.title)
+        return self.children
 
-	def download(self):
-		raise NotImplemented
+    def fill_children(self):
+        pass
+
+    def download(self):
+        raise NotImplementedError
 
 
 def load_root_node():
-	root_node = Node("Root")
-
-	print "Loading iView episode data...",
-	sys.stdout.flush()
-	import iview
-	iview_node = Node("ABC iView", root_node)
-	iview.fill_nodes(iview_node)
-	print "done"
-
-	print "Loading SBS episode data...",
-	sys.stdout.flush()
-	import sbs
-	sbs_node = Node("SBS", root_node)
-	sbs.fill_nodes(sbs_node)
-	print "done"
-
-	return root_node
-
-
-def urlopen(url, max_age):
-	if not os.path.isdir(CACHE_DIR):
-		os.makedirs(CACHE_DIR)
-
-	if max_age <= 0:
-		return urllib.urlopen(url)
-
-	filename = hashlib.md5(url).hexdigest()
-	filename = os.path.join(CACHE_DIR, filename)
-	if os.path.exists(filename):
-		file_age = int(time.time()) - os.path.getmtime(filename)
-		if file_age < max_age:
-			return open(filename)
-
-	src = urllib.urlopen(url)
-	dst = open(filename, "w")
-	shutil.copyfileobj(src, dst)
-	src.close()
-	dst.close()
-
-	return open(filename)
-
-def grab_xml(url, max_age):
-	f = urlopen(url, max_age)
-	doc = etree.parse(f)
-	f.close()
-	return doc
-
-def grab_json(url, max_age):
-	f = urlopen(url, max_age)
-	doc = json.load(f)
-	f.close()
-	return doc
-
-def download_rtmp(filename, vbase, vpath):
-	if vpath.endswith(".flv"):
-		vpath = vpath[:-4]
-	cmd = [
-		"rtmpdump",
-		"-o", filename,
-		"-r", vbase,
-		"-y", vpath,
-	]
-	try:
-		p = subprocess.Popen(cmd)
-		ret = p.wait()
-		if ret != 0:
-			print >>sys.stderr, "rtmpdump exited with error code:", ret
-			return False
-	except OSError, e:
-		print >>sys.stderr, "Failed to run rtmpdump!", e
rtmpdump!", e - return False - except KeyboardInterrupt: - print "Cancelled", cmd - try: - p.terminate() - p.wait() - except KeyboardInterrupt: - p.send_signal(signal.SIGKILL) - p.wait() - -def download_urllib(filename, url): - print "Downloading: %s -> %s" % (url, filename) - try: - src = urllib.urlopen(url) - dst = open(filename, "w") - shutil.copyfileobj(src, dst) - return True - except KeyboardInterrupt: - print "\nCancelled", url - finally: - src.close() - dst.close() - return False + root_node = Node("Root") + + import iview + iview.fill_nodes(root_node) + + import sbs + sbs.fill_nodes(root_node) + + import ten + ten.fill_nodes(root_node) + + return root_node + +valid_chars = frozenset("-_.()!@#%^ abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789") +def sanify_filename(filename): + filename = "".join(c for c in filename if c in valid_chars) + assert len(filename) > 0 + return filename + +def ensure_scheme(url): + parts = urllib.parse.urlparse(url) + if parts.scheme: + return url + parts = list(parts) + parts[0] = "http" + return urllib.parse.urlunparse(parts) + +http_session = requests.Session() +http_session.headers["User-Agent"] = USER_AGENT + +def grab_text(url): + logging.debug("grab_text(%r)", url) + request = http_session.prepare_request(requests.Request("GET", url)) + response = http_session.send(request) + return response.text + +def grab_html(url): + logging.debug("grab_html(%r)", url) + request = http_session.prepare_request(requests.Request("GET", url)) + response = http_session.send(request, stream=True) + doc = lxml.html.parse(io.BytesIO(response.content), lxml.html.HTMLParser(encoding="utf-8", recover=True)) + response.close() + return doc + +def grab_xml(url): + logging.debug("grab_xml(%r)", url) + request = http_session.prepare_request(requests.Request("GET", url)) + response = http_session.send(request, stream=True) + doc = lxml.etree.parse(io.BytesIO(response.content), lxml.etree.XMLParser(encoding="utf-8", recover=True)) + response.close() + return doc + +def grab_json(url): + logging.debug("grab_json(%r)", url) + request = http_session.prepare_request(requests.Request("GET", url)) + response = http_session.send(request) + return response.json() + +def exec_subprocess(cmd): + logging.debug("Executing: %s", cmd) + try: + p = subprocess.Popen(cmd) + ret = p.wait() + if ret != 0: + logging.error("%s exited with error code: %s", cmd[0], ret) + return False + else: + return True + except OSError as e: + logging.error("Failed to run: %s -- %s", cmd[0], e) + except KeyboardInterrupt: + logging.info("Cancelled: %s", cmd) + try: + p.terminate() + p.wait() + except KeyboardInterrupt: + p.send_signal(signal.SIGKILL) + p.wait() + return False + + +def check_command_exists(cmd): + try: + subprocess.check_output(cmd, stderr=subprocess.STDOUT) + return True + except Exception: + return False + +def find_ffmpeg(): + if check_command_exists(["ffmpeg", "--help"]): + return "ffmpeg" + + if check_command_exists(["avconv", "--help"]): + logging.warn("Detected libav-tools! ffmpeg is recommended") + return "avconv" + + raise Exception("You must install ffmpeg or libav-tools") + +def find_ffprobe(): + if check_command_exists(["ffprobe", "--help"]): + return "ffprobe" + + if check_command_exists(["avprobe", "--help"]): + logging.warn("Detected libav-tools! 
ffmpeg is recommended") + return "avprobe" + + raise Exception("You must install ffmpeg or libav-tools") + +def get_duration(filename): + ffprobe = find_ffprobe() + + cmd = [ + ffprobe, + filename, + "-show_format_entry", "duration", + "-v", "quiet", + ] + output = subprocess.check_output(cmd).decode("utf-8") + for line in output.split("\n"): + m = re.search(R"([0-9]+)", line) + if not m: + continue + duration = m.group(1) + if duration.isdigit(): + return int(duration) + + + logging.debug("Falling back to full decode to find duration: %s % filename") + + ffmpeg = find_ffmpeg() + cmd = [ + ffmpeg, + "-i", filename, + "-vn", + "-f", "null", "-", + ] + output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode("utf-8") + duration = None + for line in re.split(R"[\r\n]", output): + m = re.search(R"time=([0-9:]*)\.", line) + if not m: + continue + [h, m, s] = m.group(1).split(":") + # ffmpeg prints the duration as it reads the file, we want the last one + duration = int(h) * 3600 + int(m) * 60 + int(s) + + if duration: + return duration + else: + raise Exception("Unable to determine video duration of " + filename) + +def check_video_durations(flv_filename, mp4_filename): + flv_duration = get_duration(flv_filename) + mp4_duration = get_duration(mp4_filename) + + if abs(flv_duration - mp4_duration) > 1: + logging.error( + "The duration of %s is suspicious, did the remux fail? Expected %s == %s", + mp4_filename, flv_duration, mp4_duration + ) + return False + + return True + +def remux(infile, outfile): + logging.info("Converting %s to mp4", infile) + + ffmpeg = find_ffmpeg() + cmd = [ + ffmpeg, + "-i", infile, + "-bsf:a", "aac_adtstoasc", + "-acodec", "copy", + "-vcodec", "copy", + "-y", + outfile, + ] + if not exec_subprocess(cmd): + return False + + if not check_video_durations(infile, outfile): + return False + + os.unlink(infile) + return True + +def convert_to_mp4(filename): + with open(filename, "rb") as f: + fourcc = f.read(4) + basename, ext = os.path.splitext(filename) + + if ext == ".mp4" and fourcc == b"FLV\x01": + os.rename(filename, basename + ".flv") + ext = ".flv" + filename = basename + ext + + if ext in (".flv", ".ts"): + filename_mp4 = basename + ".mp4" + return remux(filename, filename_mp4) + + return ext == ".mp4" + + +def download_hds(filename, video_url, pvswf=None): + filename = sanify_filename(filename) + logging.info("Downloading: %s", filename) + + video_url = "hds://" + video_url + if pvswf: + param = "%s pvswf=%s" % (video_url, pvswf) + else: + param = video_url + + cmd = [ + "streamlink", + "--force", + "--output", filename, + param, + "best", + ] + if exec_subprocess(cmd): + return convert_to_mp4(filename) + else: + return False + +def download_hls(filename, video_url): + filename = sanify_filename(filename) + video_url = "hlsvariant://" + video_url + logging.info("Downloading: %s", filename) + + cmd = [ + "streamlink", + "--http-header", "User-Agent=" + USER_AGENT, + "--force", + "--output", filename, + video_url, + "best", + ] + if exec_subprocess(cmd): + return convert_to_mp4(filename) + else: + return False + +def download_mpd(filename, video_url): + filename = sanify_filename(filename) + video_url = "dash://" + video_url + logging.info("Downloading: %s", filename) + + cmd = [ + "streamlink", + "--force", + "--output", filename, + video_url, + "best", + ] + if exec_subprocess(cmd): + return convert_to_mp4(filename) + else: + return False + +def download_http(filename, video_url): + filename = sanify_filename(filename) + logging.info("Downloading: 
%s", filename) + + cmd = [ + "curl", + "--fail", "--retry", "3", + "-o", filename, + video_url, + ] + if exec_subprocess(cmd): + return convert_to_mp4(filename) + else: + return False + +def natural_sort(l, key=None): + ignore_list = ["a", "the"] + def key_func(k): + if key is not None: + k = key(k) + k = k.lower() + newk = [] + for c in re.split("([0-9]+)", k): + c = c.strip() + if c.isdigit(): + newk.append(c.zfill(5)) + else: + for subc in c.split(): + if subc not in ignore_list: + newk.append(subc) + return newk + + return sorted(l, key=key_func) + +def append_to_qs(url, params): + r = list(urllib.parse.urlsplit(url)) + qs = urllib.parse.parse_qs(r[3]) + for k, v in params.items(): + if v is not None: + qs[k] = v + elif k in qs: + del qs[k] + r[3] = urllib.parse.urlencode(sorted(qs.items()), True) + url = urllib.parse.urlunsplit(r) + return url