X-Git-Url: https://code.delx.au/webdl/blobdiff_plain/146903f8906b9a6f721a1cf5e0cee848fbec76d0..b6ad3911d2593698d5ed405520aa2cc0e6ce2ea0:/common.py

diff --git a/common.py b/common.py
index dc9a8c5..f77c86c 100644
--- a/common.py
+++ b/common.py
@@ -1,12 +1,14 @@
 # vim:ts=4:sts=4:sw=4:noet
 
 from lxml import etree, html
+import cookielib
 import json
 try:
 	import hashlib
 except ImportError:
 	import md5 as hashlib
 import os
+import re
 import shutil
 import signal
 import subprocess
@@ -14,12 +16,18 @@ import sys
 import tempfile
 import time
 import urllib
+import urllib2
+import urlparse
 
-import autosocks
-autosocks.try_autosocks()
+try:
+	import autosocks
+	autosocks.try_autosocks()
+except ImportError:
+	pass
 
 CACHE_DIR = os.path.expanduser("~/.cache/webdl")
+USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:21.0) Gecko/20100101 Firefox/21.0"
 
 
 class Node(object):
 	def __init__(self, title, parent=None):
@@ -54,14 +62,25 @@ def load_root_node():
 	import plus7
 	plus7.fill_nodes(root_node)
 
+	import brightcove
+	brightcove.fill_nodes(root_node)
+
 	return root_node
 
-valid_chars = frozenset("-_.() abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
+valid_chars = frozenset("-_.()!@#%^ abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
 def sanify_filename(filename):
-	filename = filename.encode("ascii", "ignore")
-	filename = "".join(c for c in filename if c in valid_chars)
-	return filename
+	filename = filename.encode("ascii", "ignore")
+	filename = "".join(c for c in filename if c in valid_chars)
+	return filename
 
+cookiejar = cookielib.CookieJar()
+urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
+def _urlopen(url, referrer=None):
+	req = urllib2.Request(url)
+	req.add_header("User-Agent", USER_AGENT)
+	if referrer:
+		req.add_header("Referer", referrer)
+	return urlopener.open(req)
 
 def urlopen(url, max_age):
 ###	print url
@@ -69,7 +88,7 @@ def urlopen(url, max_age):
 		os.makedirs(CACHE_DIR)
 
 	if max_age <= 0:
-		return urllib.urlopen(url)
+		return _urlopen(url)
 
 	filename = hashlib.md5(url).hexdigest()
 	filename = os.path.join(CACHE_DIR, filename)
@@ -78,7 +97,7 @@ def urlopen(url, max_age):
 		if file_age < max_age:
 			return open(filename)
 
-	src = urllib.urlopen(url)
+	src = _urlopen(url)
 	dst = open(filename, "w")
 	try:
 		shutil.copyfileobj(src, dst)
@@ -93,6 +112,12 @@ def urlopen(url, max_age):
 
 	return open(filename)
 
+def grab_text(url, max_age):
+	f = urlopen(url, max_age)
+	text = f.read().decode("utf-8")
+	f.close()
+	return text
+
 def grab_html(url, max_age):
 	f = urlopen(url, max_age)
 	doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
@@ -105,9 +130,19 @@ def grab_xml(url, max_age):
 	f.close()
 	return doc
 
-def grab_json(url, max_age):
+def grab_json(url, max_age, skip_assignment=False, skip_function=False):
 	f = urlopen(url, max_age)
-	doc = json.load(f)
+	if skip_assignment:
+		text = f.read()
+		pos = text.find("=")
+		doc = json.loads(text[pos+1:])
+	elif skip_function:
+		text = f.read()
+		pos = text.find("(")
+		rpos = text.rfind(")")
+		doc = json.loads(text[pos+1:rpos])
+	else:
+		doc = json.load(f)
 	f.close()
 	return doc
 
@@ -186,11 +221,11 @@ def download_rtmp(filename, vbase, vpath, hash_url=None):
 	else:
 		return False
 
-def download_urllib(filename, url):
+def download_urllib(filename, url, referrer=None):
 	filename = sanify_filename(filename)
 	print "Downloading: %s" % filename
 	try:
-		src = urllib.urlopen(url)
+		src = _urlopen(url, referrer)
 		dst = open(filename, "w")
 		while True:
 			buf = src.read(1024*1024)
@@ -199,6 +234,7 @@ def download_urllib(filename, url):
 			dst.write(buf)
 			sys.stdout.write(".")
 			sys.stdout.flush()
+		print
 		convert_filename(filename)
 		return True
 	except KeyboardInterrupt:
@@ -214,3 +250,34 @@ def download_urllib(filename, url):
 			pass
 		return False
 
+def natural_sort(l, key=None):
+	ignore_list = ["a", "the"]
+	def key_func(k):
+		if key is not None:
+			k = key(k)
+		k = k.lower()
+		newk = []
+		for c in re.split("([0-9]+)", k):
+			c = c.strip()
+			if c.isdigit():
+				newk.append(int(c))
+			else:
+				for subc in c.split():
+					if subc not in ignore_list:
+						newk.append(subc)
+		return newk
+
+	return sorted(l, key=key_func)
+
+def append_to_qs(url, params):
+	r = list(urlparse.urlsplit(url))
+	qs = urlparse.parse_qs(r[3])
+	for k, v in params.iteritems():
+		if v is not None:
+			qs[k] = v
+		elif qs.has_key(k):
+			del qs[k]
+	r[3] = urllib.urlencode(qs, True)
+	url = urlparse.urlunsplit(r)
+	return url
+