X-Git-Url: https://code.delx.au/webdl/blobdiff_plain/28b806bd28a74527660fa5c11dacf9c4f8c526e3..e28e65545c03e6452287c6e0cb6427eff49bd358:/common.py

diff --git a/common.py b/common.py
index daede56..f0f827b 100644
--- a/common.py
+++ b/common.py
@@ -1,19 +1,18 @@
-import python2_compat
-
 import hashlib
-import http.cookiejar
+import io
 import json
 import logging
 import lxml.etree
 import lxml.html
 import os
 import re
+import requests
+import requests_cache
 import shutil
 import signal
 import subprocess
 import time
 import urllib.parse
-import urllib.request
 
 
 try:
@@ -28,12 +27,15 @@ logging.basicConfig(
     level = logging.INFO if os.environ.get("DEBUG", None) is None else logging.DEBUG,
 )
 
-CACHE_DIR = os.path.join(
+CACHE_FILE = os.path.join(
     os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")),
-    "webdl"
+    "webdl",
+    "requests_cache"
 )
+if not os.path.isdir(os.path.dirname(CACHE_FILE)):
+    os.makedirs(os.path.dirname(CACHE_FILE))
 
-USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:21.0) Gecko/20100101 Firefox/21.0"
+requests_cache.install_cache(CACHE_FILE, backend='sqlite', expire_after=3600)
 
 
 class Node(object):
@@ -66,8 +68,8 @@ def load_root_node():
     import sbs
     sbs.fill_nodes(root_node)
 
-    import brightcove
-    brightcove.fill_nodes(root_node)
+    import ten
+    ten.fill_nodes(root_node)
 
     return root_node
 
@@ -77,82 +79,46 @@ def sanify_filename(filename):
     assert len(filename) > 0
     return filename
 
-cookiejar = http.cookiejar.CookieJar()
-urlopener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookiejar))
-def _urlopen(url, referrer=None):
-    req = urllib.request.Request(url)
-    req.add_header("User-Agent", USER_AGENT)
-    if referrer:
-        req.add_header("Referer", referrer)
-    return urlopener.open(req)
-
-def urlopen(url, max_age):
-    logging.debug("urlopen(%r, %r)", url, max_age)
-
-    if not os.path.isdir(CACHE_DIR):
-        os.makedirs(CACHE_DIR)
-
-    if max_age <= 0:
-        return _urlopen(url)
-
-    filename = hashlib.md5(url.encode("utf-8")).hexdigest()
-    filename = os.path.join(CACHE_DIR, filename)
-    if os.path.exists(filename):
-        file_age = int(time.time()) - os.path.getmtime(filename)
-        if file_age < max_age:
-            logging.debug("loading from cache: %s", filename)
-            return open(filename, "rb")
-
-    logging.debug("downloading: %s -> %s", url, filename)
-    src = _urlopen(url)
-    dst = open(filename, "wb")
-    try:
-        shutil.copyfileobj(src, dst)
-    except Exception as e:
-        try:
-            os.unlink(filename)
-        except OSError:
-            pass
-        raise e
-    src.close()
-    dst.close()
-
-    return open(filename, "rb")
-
-def grab_text(url, max_age):
-    f = urlopen(url, max_age)
-    text = f.read().decode("utf-8")
-    f.close()
-    return text
-
-def grab_html(url, max_age):
-    f = urlopen(url, max_age)
-    doc = lxml.html.parse(f, lxml.html.HTMLParser(encoding="utf-8", recover=True))
-    f.close()
+def ensure_scheme(url):
+    parts = urllib.parse.urlparse(url)
+    if parts.scheme:
+        return url
+    parts = list(parts)
+    parts[0] = "http"
+    return urllib.parse.urlunparse(parts)
+
+http_session = requests.Session()
+http_session.headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:21.0) Gecko/20100101 Firefox/21.0"
+
+def grab_text(url):
+    logging.debug("grab_text(%r)", url)
+    request = http_session.prepare_request(requests.Request("GET", url))
+    response = http_session.send(request)
+    return response.text
+
+def grab_html(url):
+    logging.debug("grab_html(%r)", url)
+    request = http_session.prepare_request(requests.Request("GET", url))
+    response = http_session.send(request, stream=True)
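+    # Note: response.content reads the whole body even with stream=True;
+    # io.BytesIO wraps it in the file-like object lxml's parsers expect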
+    doc = lxml.html.parse(io.BytesIO(response.content), lxml.html.HTMLParser(encoding="utf-8", recover=True))
+    response.close()
     return doc
 
-def grab_xml(url, max_age):
-    f = urlopen(url, max_age)
-    doc = lxml.etree.parse(f, lxml.etree.XMLParser(encoding="utf-8", recover=True))
-    f.close()
+def grab_xml(url):
+    logging.debug("grab_xml(%r)", url)
+    request = http_session.prepare_request(requests.Request("GET", url))
+    response = http_session.send(request, stream=True)
+    doc = lxml.etree.parse(io.BytesIO(response.content), lxml.etree.XMLParser(encoding="utf-8", recover=True))
+    response.close()
     return doc
 
-def grab_json(url, max_age, skip_assignment=False, skip_function=False):
-    f = urlopen(url, max_age)
-    text = f.read().decode("utf-8")
-
-    if skip_assignment:
-        pos = text.find("=")
-        text = text[pos+1:]
-
-    elif skip_function:
-        pos = text.find("(")
-        rpos = text.rfind(")")
-        text = text[pos+1:rpos]
-
-    doc = json.loads(text)
-    f.close()
-    return doc
+def grab_json(url):
+    logging.debug("grab_json(%r)", url)
+    request = http_session.prepare_request(requests.Request("GET", url))
+    response = http_session.send(request)
+    return response.json()
 
 def exec_subprocess(cmd):
     logging.debug("Executing: %s", cmd)
@@ -179,53 +145,112 @@ def exec_subprocess(cmd):
 
 def check_command_exists(cmd):
     try:
-        subprocess.check_output(cmd)
+        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
         return True
     except Exception:
         return False
 
-def generate_remux_cmd(infile, outfile):
-    if check_command_exists(["avconv", "--help"]):
-        return [
-            "avconv",
-            "-i", infile,
-            "-bsf:a", "aac_adtstoasc",
-            "-acodec", "copy",
-            "-vcodec", "copy",
-            outfile,
-        ]
-
-    if check_command_exists(["ffmpeg", "--help"]):
-        return [
-            "ffmpeg",
-            "-i", infile,
-            "-bsf:a", "aac_adtstoasc",
-            "-acodec", "copy",
-            "-vcodec", "copy",
-            outfile,
-        ]
+def find_ffmpeg():
+    for ffmpeg in ["avconv", "ffmpeg"]:
+        if check_command_exists([ffmpeg, "--help"]):
+            return ffmpeg
+
+    raise Exception("You must install ffmpeg or libav-tools")
+
+def find_ffprobe():
+    for ffprobe in ["avprobe", "ffprobe"]:
+        if check_command_exists([ffprobe, "--help"]):
+            return ffprobe
 
     raise Exception("You must install ffmpeg or libav-tools")
 
+def find_streamlink():
+    for streamlink in ["streamlink", "livestreamer"]:
+        if check_command_exists([streamlink, "--help"]):
+            return streamlink
+
+    raise Exception("You must install streamlink or livestreamer")
+
+def get_duration(filename):
+    ffprobe = find_ffprobe()
+
+    cmd = [
+        ffprobe,
+        filename,
+        "-show_format_entry", "duration",
+        "-v", "quiet",
+    ]
+    output = subprocess.check_output(cmd).decode("utf-8")
+    for line in output.split("\n"):
+        m = re.search(r"([0-9]+)", line)
+        if not m:
+            continue
+        duration = m.group(1)
+        if duration.isdigit():
+            return int(duration)
+
+
+    logging.debug("Falling back to full decode to find duration: %s", filename)
+
+    ffmpeg = find_ffmpeg()
+    cmd = [
+        ffmpeg,
+        "-i", filename,
+        "-vn",
+        "-f", "null", "-",
+    ]
+    output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode("utf-8")
+    duration = None
+    for line in re.split(r"[\r\n]", output):
+        m = re.search(r"time=([0-9:]*)\.", line)
+        if not m:
+            continue
+        [h, m, s] = m.group(1).split(":")
+        # ffmpeg prints the duration as it reads the file, we want the last one
+        duration = int(h) * 3600 + int(m) * 60 + int(s)
+
+    if duration:
+        return duration
+    else:
+        raise Exception("Unable to determine video duration of " + filename)
+
+def check_video_durations(flv_filename, mp4_filename):
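+    # A clean remux only copies streams, so input and output should report
+    # (almost) the same duration; allow one second for container rounding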
+    flv_duration = get_duration(flv_filename)
+    mp4_duration = get_duration(mp4_filename)
+
+    if abs(flv_duration - mp4_duration) > 1:
+        logging.error(
+            "The duration of %s is suspicious, did the remux fail? Expected %s == %s",
+            mp4_filename, flv_duration, mp4_duration
+        )
+        return False
+
+    return True
+
 def remux(infile, outfile):
     logging.info("Converting %s to mp4", infile)
-    cmd = generate_remux_cmd(infile, outfile)
+
+    ffmpeg = find_ffmpeg()
+    cmd = [
+        ffmpeg,
+        "-i", infile,
+        "-bsf:a", "aac_adtstoasc",
+        "-acodec", "copy",
+        "-vcodec", "copy",
+        "-y",
+        outfile,
+    ]
     if not exec_subprocess(cmd):
-        # failed, error has already been logged
         return False
-    try:
-        flv_size = os.stat(infile).st_size
-        mp4_size = os.stat(outfile).st_size
-        if abs(flv_size - mp4_size) < 0.1 * flv_size:
-            os.unlink(infile)
-            return True
-        else:
-            logging.error("The size of %s is suspicious, did the remux fail?", outfile)
-            return False
-    except Exception as e:
-        logging.error("Conversion failed! %s", e)
+
+    if not check_video_durations(infile, outfile):
         return False
 
+    os.unlink(infile)
+    return True
+
 def convert_to_mp4(filename):
     with open(filename, "rb") as f:
         fourcc = f.read(4)
@@ -244,17 +269,20 @@ def convert_to_mp4(filename):
 
 
 def download_hds(filename, video_url, pvswf=None):
+    streamlink = find_streamlink()
+
     filename = sanify_filename(filename)
     logging.info("Downloading: %s", filename)
 
-    video_url = video_url.replace("http://", "hds://")
+    video_url = "hds://" + video_url
     if pvswf:
         param = "%s pvswf=%s" % (video_url, pvswf)
     else:
         param = video_url
 
     cmd = [
-        "livestreamer",
+        streamlink,
+        "-f",
         "-o", filename,
         param,
         "best",
@@ -265,12 +293,15 @@ def download_hds(filename, video_url, pvswf=None):
         return False
 
 def download_hls(filename, video_url):
+    streamlink = find_streamlink()
+
     filename = sanify_filename(filename)
-    video_url = video_url.replace("http://", "hlsvariant://")
+    video_url = "hlsvariant://" + video_url
     logging.info("Downloading: %s", filename)
 
     cmd = [
-        "livestreamer",
+        streamlink,
+        "-f",
         "-o", filename,
         video_url,
         "best",
@@ -280,6 +311,40 @@ def download_hls(filename, video_url):
     else:
         return False
 
+def download_mpd(filename, video_url):
+    streamlink = find_streamlink()
+
+    filename = sanify_filename(filename)
+    video_url = "dash://" + video_url
+    logging.info("Downloading: %s", filename)
+
+    cmd = [
+        streamlink,
+        "-f",
+        "-o", filename,
+        video_url,
+        "best",
+    ]
+    if exec_subprocess(cmd):
+        return convert_to_mp4(filename)
+    else:
+        return False
+
+def download_http(filename, video_url):
+    filename = sanify_filename(filename)
+    logging.info("Downloading: %s", filename)
+
+    cmd = [
+        "curl",
+        "--fail", "--retry", "3",
+        "-o", filename,
+        video_url,
+    ]
+    if exec_subprocess(cmd):
+        return convert_to_mp4(filename)
+    else:
+        return False
+
 def natural_sort(l, key=None):
     ignore_list = ["a", "the"]
     def key_func(k):
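
Usage sketch (not part of the patch): a minimal illustration of how the
reworked helpers compose. The URL and JSON shape below are hypothetical;
grab_json() and download_hls() are the functions defined above, and
requests_cache means repeated grab_* calls within the hour are served from
the sqlite cache rather than the network.

    from common import grab_json, download_hls

    doc = grab_json("https://example.com/api/episodes")
    for episode in doc["episodes"]:
        # download_hls() sanifies the filename, shells out to streamlink (or
        # livestreamer), then runs convert_to_mp4(), which remuxes where
        # needed and uses check_video_durations() to confirm nothing was lost
        download_hls(episode["title"] + ".ts", episode["hls_url"])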