code.delx.au - webdl/blob - common.py
brightcove: seems to work well, no need to mark it experimental anymore
[webdl] / common.py
1 # vim:ts=4:sts=4:sw=4:noet
2
3 from lxml import etree, html
4 import cookielib
5 import json
6 try:
7 import hashlib
8 except ImportError:
9 import md5 as hashlib
10 import os
11 import re
12 import shutil
13 import signal
14 import subprocess
15 import sys
16 import tempfile
17 import time
18 import urllib
19 import urllib2
20 import urlparse
21
22
23 try:
24 import autosocks
25 autosocks.try_autosocks()
26 except ImportError:
27 pass
28
29 CACHE_DIR = os.path.expanduser("~/.cache/webdl")
30 USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:21.0) Gecko/20100101 Firefox/21.0"
31
class Node(object):
	"""A node in the menu tree of downloadable shows.

	Each backend module populates the tree with Node subclasses; leaf
	nodes set can_download and implement download().
	"""

	def __init__(self, title, parent=None):
		self.title = title
		if parent:
			parent.children.append(self)
		self.parent = parent
		self.children = []
		# Leaf nodes that can actually be downloaded set this to True.
		self.can_download = False

	def get_children(self):
		# Children are filled lazily on first access.
		if not self.children:
			self.fill_children()
		return self.children

	def fill_children(self):
		# Subclasses override this to populate self.children.
		pass

	def download(self):
		# BUG FIX: was "raise NotImplemented", which raises a TypeError
		# because NotImplemented is a sentinel value, not an exception.
		raise NotImplementedError()
51
52
def load_root_node():
	"""Build and return the root of the menu tree, letting each backend
	module attach its own nodes underneath it."""
	root_node = Node("Root")

	# Import and fill in the same order as before; an ImportError from
	# any backend propagates to the caller.
	for backend in ("iview", "sbs", "plus7", "brightcove"):
		module = __import__(backend)
		module.fill_nodes(root_node)

	return root_node
69
valid_chars = frozenset("-_.()!@#%^ abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
def sanify_filename(filename):
	"""Reduce filename to a safe ASCII string: drop non-ASCII characters,
	then keep only characters in valid_chars."""
	filename = filename.encode("ascii", "ignore")
	if not isinstance(filename, str):
		# BUG FIX (forward compat): on Python 3 encode() returns bytes,
		# and iterating bytes yields ints, so the filter below would
		# silently drop every character. Decode back to str first.
		# On Python 2 the encoded value is already str and this is a no-op.
		filename = filename.decode("ascii")
	filename = "".join(c for c in filename if c in valid_chars)
	return filename
75
# Shared cookie jar and opener so cookies persist across all requests.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
def _urlopen(url, referrer=None):
	"""Open url with the shared cookie-aware opener, sending our
	User-Agent and, when given, a Referer header."""
	request = urllib2.Request(url)
	request.add_header("User-Agent", USER_AGENT)
	if referrer:
		request.add_header("Referer", referrer)
	return urlopener.open(request)
84
def urlopen(url, max_age):
	"""Open url, caching the response body on disk.

	max_age <= 0 bypasses the cache entirely; otherwise a cached copy
	younger than max_age seconds is returned. Returns an open file-like
	object positioned at the start of the body.
	"""
	if not os.path.isdir(CACHE_DIR):
		os.makedirs(CACHE_DIR)

	if max_age <= 0:
		return _urlopen(url)

	# Cache key is the md5 of the URL itself.
	filename = os.path.join(CACHE_DIR, hashlib.md5(url).hexdigest())
	if os.path.exists(filename):
		file_age = int(time.time()) - os.path.getmtime(filename)
		if file_age < max_age:
			return open(filename)

	src = _urlopen(url)
	try:
		# BUG FIX: src/dst were leaked when copyfileobj raised, and
		# "raise e" discarded the original traceback; close both handles
		# on every path and re-raise with a bare "raise".
		dst = open(filename, "w")
		try:
			shutil.copyfileobj(src, dst)
		except Exception:
			dst.close()
			# Don't leave a truncated cache entry behind.
			try:
				os.unlink(filename)
			except OSError:
				pass
			raise
		dst.close()
	finally:
		src.close()

	return open(filename)
114
def grab_text(url, max_age):
	"""Fetch url (cached up to max_age seconds) and return the body
	decoded as UTF-8."""
	response = urlopen(url, max_age)
	text = response.read().decode("utf-8")
	response.close()
	return text
120
def grab_html(url, max_age):
	"""Fetch url (cached up to max_age seconds) and return it parsed as
	an lxml HTML document (forgiving parser, UTF-8 assumed)."""
	response = urlopen(url, max_age)
	parser = html.HTMLParser(encoding="utf-8", recover=True)
	doc = html.parse(response, parser)
	response.close()
	return doc
126
def grab_xml(url, max_age):
	"""Fetch url (cached up to max_age seconds) and return it parsed as
	an lxml XML document (forgiving parser, UTF-8 assumed)."""
	response = urlopen(url, max_age)
	parser = etree.XMLParser(encoding="utf-8", recover=True)
	doc = etree.parse(response, parser)
	response.close()
	return doc
132
def grab_json(url, max_age, skip_assignment=False, skip_function=False):
	"""Fetch url (cached up to max_age seconds) and parse it as JSON.

	skip_assignment: drop everything up to and including the first "="
	(e.g. "var foo = {...}"). skip_function: take only the text between
	the first "(" and the last ")" (a function-call wrapper).
	"""
	f = urlopen(url, max_age)
	if skip_assignment:
		text = f.read()
		doc = json.loads(text[text.find("=") + 1:])
	elif skip_function:
		text = f.read()
		doc = json.loads(text[text.find("(") + 1:text.rfind(")")])
	else:
		doc = json.load(f)
	f.close()
	return doc
148
149 def exec_subprocess(cmd):
150 try:
151 p = subprocess.Popen(cmd)
152 ret = p.wait()
153 if ret != 0:
154 print >>sys.stderr, cmd[0], "exited with error code:", ret
155 return False
156 else:
157 return True
158 except OSError, e:
159 print >>sys.stderr, "Failed to run", cmd[0], e
160 except KeyboardInterrupt:
161 print "Cancelled", cmd
162 try:
163 p.terminate()
164 p.wait()
165 except KeyboardInterrupt:
166 p.send_signal(signal.SIGKILL)
167 p.wait()
168 return False
169
170
def convert_flv_mp4(orig_filename):
	# Remux an FLV download into an MP4 container without re-encoding
	# (ffmpeg is invoked with both codecs set to "copy").
	basename = os.path.splitext(orig_filename)[0]
	flv_filename = basename + ".flv"
	mp4_filename = basename + ".mp4"
	# ffmpeg reads by extension, so make sure the input really ends in .flv.
	if orig_filename != flv_filename:
		os.rename(orig_filename, flv_filename)
	print "Converting %s to mp4" % flv_filename
	cmd = [
		"ffmpeg",
		"-i", flv_filename,
		"-acodec", "copy",
		"-vcodec", "copy",
		mp4_filename,
	]
	if not exec_subprocess(cmd):
		return
	try:
		flv_size = os.stat(flv_filename).st_size
		mp4_size = os.stat(mp4_filename).st_size
		# Remuxing should barely change the size; only delete the .flv if
		# the .mp4 is within 5% of it, otherwise assume ffmpeg went wrong
		# and keep both files for the user to inspect.
		if abs(flv_size - mp4_size) < 0.05 * flv_size:
			os.unlink(flv_filename)
		else:
			print >>sys.stderr, "The size of", mp4_filename, "is suspicious, did ffmpeg fail?"
	except Exception, e:
		# Best-effort: a failed stat/unlink just leaves the files behind.
		print "Conversion failed", e
196
def convert_filename(filename):
	"""If filename looks like a video file but actually holds FLV data
	(magic bytes FLV\\x01), remux it to mp4 via convert_flv_mp4()."""
	extension = os.path.splitext(filename.lower())[1]
	if extension not in (".mp4", ".flv"):
		return
	f = open(filename)
	magic = f.read(4)
	f.close()
	if magic == "FLV\x01":
		convert_flv_mp4(filename)
204
205 def download_rtmp(filename, vbase, vpath, hash_url=None):
206 filename = sanify_filename(filename)
207 print "Downloading: %s" % filename
208 if vpath.endswith(".flv"):
209 vpath = vpath[:-4]
210 cmd = [
211 "rtmpdump",
212 "-o", filename,
213 "-r", vbase,
214 "-y", vpath,
215 ]
216 if hash_url is not None:
217 cmd += ["--swfVfy", hash_url]
218 if exec_subprocess(cmd):
219 convert_filename(filename)
220 return True
221 else:
222 return False
223
def download_urllib(filename, url, referrer=None):
	# Download url to a sanitised local filename over plain HTTP, writing
	# a progress dot per 1 MiB chunk. Returns True on success, False when
	# cancelled with Ctrl-C; any other exception propagates to the caller
	# after the finally block closes the handles.
	filename = sanify_filename(filename)
	print "Downloading: %s" % filename
	try:
		src = _urlopen(url, referrer)
		dst = open(filename, "w")
		while True:
			buf = src.read(1024*1024)
			if not buf:
				break
			dst.write(buf)
			sys.stdout.write(".")
			sys.stdout.flush()
		print
		convert_filename(filename)
		return True
	except KeyboardInterrupt:
		print "\nCancelled", url
	finally:
		# src/dst may be unbound if the open itself failed; the bare
		# excepts deliberately swallow that NameError as well as any
		# error raised by close().
		try:
			src.close()
		except:
			pass
		try:
			dst.close()
		except:
			pass
	return False
252
253 def natural_sort(l, key=None):
254 ignore_list = ["a", "the"]
255 def key_func(k):
256 if key is not None:
257 k = key(k)
258 k = k.lower()
259 newk = []
260 for c in re.split("([0-9]+)", k):
261 c = c.strip()
262 if c.isdigit():
263 newk.append(int(c))
264 else:
265 for subc in c.split():
266 if subc not in ignore_list:
267 newk.append(subc)
268 return newk
269
270 return sorted(l, key=key_func)
271
def append_to_qs(url, params):
	"""Return url with params merged into its query string.

	A value of None removes that key from the query string entirely;
	other values are set (lists become repeated parameters via
	urlencode's doseq mode).
	"""
	parts = list(urlparse.urlsplit(url))
	qs = urlparse.parse_qs(parts[3])
	# items()/"in" instead of the deprecated iteritems()/has_key();
	# identical behaviour, forward-compatible idiom.
	for k, v in params.items():
		if v is not None:
			qs[k] = v
		elif k in qs:
			del qs[k]
	parts[3] = urllib.urlencode(qs, True)
	return urlparse.urlunsplit(parts)
283