X-Git-Url: https://code.delx.au/webdl/blobdiff_plain/8b699d97edbf350ff789355c41f98eeaef84fb7f..29ecce3e80dd1b704b019a14bd458de76a13111b:/common.py diff --git a/common.py b/common.py index 4b89660..f77c86c 100644 --- a/common.py +++ b/common.py @@ -1,12 +1,14 @@ # vim:ts=4:sts=4:sw=4:noet -from lxml import etree +from lxml import etree, html +import cookielib import json try: import hashlib except ImportError: import md5 as hashlib import os +import re import shutil import signal import subprocess @@ -14,12 +16,18 @@ import sys import tempfile import time import urllib +import urllib2 +import urlparse -import autosocks -autosocks.try_autosocks() +try: + import autosocks + autosocks.try_autosocks() +except ImportError: + pass CACHE_DIR = os.path.expanduser("~/.cache/webdl") +USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:21.0) Gecko/20100101 Firefox/21.0" class Node(object): def __init__(self, title, parent=None): @@ -31,8 +39,13 @@ class Node(object): self.can_download = False def get_children(self): + if not self.children: + self.fill_children() return self.children + def fill_children(self): + pass + def download(self): raise NotImplemented @@ -46,14 +59,28 @@ def load_root_node(): import sbs sbs.fill_nodes(root_node) + import plus7 + plus7.fill_nodes(root_node) + + import brightcove + brightcove.fill_nodes(root_node) + return root_node -valid_chars = frozenset("-_.() abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789") +valid_chars = frozenset("-_.()!@#%^ abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789") def sanify_filename(filename): - filename = filename.encode("ascii", "ignore") - filename = "".join(c for c in filename if c in valid_chars) - return filename + filename = filename.encode("ascii", "ignore") + filename = "".join(c for c in filename if c in valid_chars) + return filename +cookiejar = cookielib.CookieJar() +urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar)) +def _urlopen(url, referrer=None): + req = urllib2.Request(url) + req.add_header("User-Agent", USER_AGENT) + if referrer: + req.add_header("Referer", referrer) + return urlopener.open(req) def urlopen(url, max_age): ### print url @@ -61,7 +88,7 @@ def urlopen(url, max_age): os.makedirs(CACHE_DIR) if max_age <= 0: - return urllib.urlopen(url) + return _urlopen(url) filename = hashlib.md5(url).hexdigest() filename = os.path.join(CACHE_DIR, filename) @@ -70,7 +97,7 @@ def urlopen(url, max_age): if file_age < max_age: return open(filename) - src = urllib.urlopen(url) + src = _urlopen(url) dst = open(filename, "w") try: shutil.copyfileobj(src, dst) @@ -85,15 +112,37 @@ def urlopen(url, max_age): return open(filename) +def grab_text(url, max_age): + f = urlopen(url, max_age) + text = f.read().decode("utf-8") + f.close() + return text + +def grab_html(url, max_age): + f = urlopen(url, max_age) + doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True)) + f.close() + return doc + def grab_xml(url, max_age): f = urlopen(url, max_age) - doc = etree.parse(f) + doc = etree.parse(f, etree.XMLParser(encoding="utf-8", recover=True)) f.close() return doc -def grab_json(url, max_age): +def grab_json(url, max_age, skip_assignment=False, skip_function=False): f = urlopen(url, max_age) - doc = json.load(f) + if skip_assignment: + text = f.read() + pos = text.find("=") + doc = json.loads(text[pos+1:]) + elif skip_function: + text = f.read() + pos = text.find("(") + rpos = text.rfind(")") + doc = json.loads(text[pos+1:rpos]) + else: + doc = json.load(f) f.close() return doc @@ -123,7 +172,8 @@ def convert_flv_mp4(orig_filename): basename = os.path.splitext(orig_filename)[0] flv_filename = basename + ".flv" mp4_filename = basename + ".mp4" - os.rename(orig_filename, flv_filename) + if orig_filename != flv_filename: + os.rename(orig_filename, flv_filename) print "Converting %s to mp4" % flv_filename cmd = [ "ffmpeg", @@ -145,7 +195,7 @@ def convert_flv_mp4(orig_filename): print "Conversion failed", e def convert_filename(filename): - if filename.lower().endswith(".mp4"): + if os.path.splitext(filename.lower())[1] in (".mp4", ".flv"): f = open(filename) fourcc = f.read(4) f.close() @@ -165,15 +215,17 @@ def download_rtmp(filename, vbase, vpath, hash_url=None): ] if hash_url is not None: cmd += ["--swfVfy", hash_url] - success = exec_subprocess(cmd) - convert_filename(filename) - return success + if exec_subprocess(cmd): + convert_filename(filename) + return True + else: + return False -def download_urllib(filename, url): +def download_urllib(filename, url, referrer=None): filename = sanify_filename(filename) print "Downloading: %s" % filename try: - src = urllib.urlopen(url) + src = _urlopen(url, referrer) dst = open(filename, "w") while True: buf = src.read(1024*1024) @@ -182,6 +234,7 @@ def download_urllib(filename, url): dst.write(buf) sys.stdout.write(".") sys.stdout.flush() + print convert_filename(filename) return True except KeyboardInterrupt: @@ -197,3 +250,34 @@ def download_urllib(filename, url): pass return False +def natural_sort(l, key=None): + ignore_list = ["a", "the"] + def key_func(k): + if key is not None: + k = key(k) + k = k.lower() + newk = [] + for c in re.split("([0-9]+)", k): + c = c.strip() + if c.isdigit(): + newk.append(int(c)) + else: + for subc in c.split(): + if subc not in ignore_list: + newk.append(subc) + return newk + + return sorted(l, key=key_func) + +def append_to_qs(url, params): + r = list(urlparse.urlsplit(url)) + qs = urlparse.parse_qs(r[3]) + for k, v in params.iteritems(): + if v is not None: + qs[k] = v + elif qs.has_key(k): + del qs[k] + r[3] = urllib.urlencode(qs, True) + url = urlparse.urlunsplit(r) + return url +