from lxml import etree, html
import cookielib
import json
try:
    import hashlib
except ImportError:
    import md5 as hashlib
import os
import re
import shutil
import signal
import subprocess
import sys
import tempfile
import time
import urllib
import urllib2
import urlparse


try:
    import autosocks
    autosocks.try_autosocks()
except ImportError:
    pass

CACHE_DIR = os.path.expanduser("~/.cache/webdl")
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:21.0) Gecko/20100101 Firefox/21.0"

class Node(object):
    def __init__(self, title, parent=None):
        self.title = title
        if parent:
            parent.children.append(self)
        self.parent = parent
        self.children = []
        self.can_download = False

    def get_children(self):
        if not self.children:
            self.fill_children()
        return self.children

    def fill_children(self):
        pass

    def download(self):
        raise NotImplementedError()


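# Usage sketch for Node (illustrative, not part of the original file):
# subclasses override fill_children() so the menu tree is populated lazily,
# one level at a time.
#
#   root = Node("Root")
#   show = Node("Some Show", parent=root)  # appends itself to root.children
#   root.get_children()                    # non-empty, so fill_children() is skipped
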
def load_root_node():
    root_node = Node("Root")

    import iview
    iview.fill_nodes(root_node)

    import sbs
    sbs.fill_nodes(root_node)

    import plus7
    plus7.fill_nodes(root_node)

    import brightcove
    brightcove.fill_nodes(root_node)

    return root_node

valid_chars = frozenset("-_.()!@#%^ abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
def sanify_filename(filename):
    filename = filename.encode("ascii", "ignore")
    filename = "".join(c for c in filename if c in valid_chars)
    return filename

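# Example for sanify_filename (illustrative): characters outside the whitelist
# are dropped, so sanify_filename(u"News: 7:30 Report") returns "News 730 Report".
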
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
def _urlopen(url, referrer=None):
    req = urllib2.Request(url)
    req.add_header("User-Agent", USER_AGENT)
    if referrer:
        req.add_header("Referer", referrer)
    return urlopener.open(req)

def urlopen(url, max_age):
    ### print url
    if not os.path.isdir(CACHE_DIR):
        os.makedirs(CACHE_DIR)

    if max_age <= 0:
        return _urlopen(url)

    filename = hashlib.md5(url).hexdigest()
    filename = os.path.join(CACHE_DIR, filename)
    if os.path.exists(filename):
        file_age = int(time.time()) - os.path.getmtime(filename)
        if file_age < max_age:
            return open(filename)

    src = _urlopen(url)
    dst = open(filename, "w")
    try:
        shutil.copyfileobj(src, dst)
    except Exception, e:
        try:
            os.unlink(filename)
        except OSError:
            pass
        raise e
    src.close()
    dst.close()

    return open(filename)

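# Cache behaviour of urlopen (as implemented above): responses are keyed by the
# MD5 of the URL under ~/.cache/webdl, so e.g. urlopen(url, 3600) re-fetches
# only when the cached copy is more than an hour old; max_age <= 0 bypasses
# the cache entirely.
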
def grab_text(url, max_age):
    f = urlopen(url, max_age)
    text = f.read().decode("utf-8")
    f.close()
    return text

def grab_html(url, max_age):
    f = urlopen(url, max_age)
    doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    f.close()
    return doc

def grab_xml(url, max_age):
    f = urlopen(url, max_age)
    doc = etree.parse(f, etree.XMLParser(encoding="utf-8", recover=True))
    f.close()
    return doc

def grab_json(url, max_age, skip_assignment=False, skip_function=False):
    f = urlopen(url, max_age)
    if skip_assignment:
        text = f.read()
        pos = text.find("=")
        doc = json.loads(text[pos+1:])
    elif skip_function:
        text = f.read()
        pos = text.find("(")
        rpos = text.rfind(")")
        doc = json.loads(text[pos+1:rpos])
    else:
        doc = json.load(f)
    f.close()
    return doc

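# Examples for grab_json (illustrative feed shapes, not from the original file):
# the two flags unwrap common pseudo-JSON responses.
#   skip_assignment handles  var data = {"key": "value"}    (parses text after the first "=")
#   skip_function handles    callback({"key": "value"});    (parses text between "(" and the last ")")
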
def exec_subprocess(cmd):
    try:
        p = subprocess.Popen(cmd)
        ret = p.wait()
        if ret != 0:
            print >>sys.stderr, cmd[0], "exited with error code:", ret
            return False
        else:
            return True
    except OSError, e:
        print >>sys.stderr, "Failed to run", cmd[0], e
        return False
    except KeyboardInterrupt:
        print "Cancelled", cmd
        try:
            p.terminate()
            p.wait()
        except KeyboardInterrupt:
            p.send_signal(signal.SIGKILL)
            p.wait()
        return False


def avconv_remux(infile, outfile):
    print "Converting %s to mp4" % infile
    cmd = [
        "avconv",
        "-i", infile,
        "-acodec", "copy",
        "-vcodec", "copy",
        outfile,
    ]
    if not exec_subprocess(cmd):
        # failed, error has already been logged
        return False
    try:
        flv_size = os.stat(infile).st_size
        mp4_size = os.stat(outfile).st_size
        if abs(flv_size - mp4_size) < 0.1 * flv_size:
            os.unlink(infile)
            return True
        else:
            print >>sys.stderr, "The size of", outfile, "is suspicious, did avconv fail?"
            return False
    except Exception, e:
        print >>sys.stderr, "Conversion failed", e
        return False

def convert_to_mp4(filename):
    with open(filename) as f:
        fourcc = f.read(4)
    basename, ext = os.path.splitext(filename)

    # FLV files begin with the magic bytes "FLV\x01"; rename any mislabelled
    # .mp4 so the remux below picks it up.
    if ext == ".mp4" and fourcc == "FLV\x01":
        os.rename(filename, basename + ".flv")
        ext = ".flv"
        filename = basename + ext

    if ext in (".flv", ".ts"):
        filename_mp4 = basename + ".mp4"
        return avconv_remux(filename, filename_mp4)

    return ext == ".mp4"


def download_rtmp(filename, vbase, vpath, hash_url=None):
    filename = sanify_filename(filename)
    print "Downloading: %s" % filename
    if vpath.endswith(".flv"):
        vpath = vpath[:-4]
    cmd = [
        "rtmpdump",
        "-o", filename,
        "-r", vbase,
        "-y", vpath,
    ]
    if hash_url is not None:
        cmd += ["--swfVfy", hash_url]
    if exec_subprocess(cmd):
        return convert_to_mp4(filename)
    else:
        return False

def download_urllib(filename, url, referrer=None):
    filename = sanify_filename(filename)
    print "Downloading: %s" % filename
    try:
        src = _urlopen(url, referrer)
        dst = open(filename, "w")
        while True:
            buf = src.read(1024*1024)
            if not buf:
                break
            dst.write(buf)
            sys.stdout.write(".")
            sys.stdout.flush()
        print
    except KeyboardInterrupt:
        print "\nCancelled", url
        return False
    finally:
        try:
            src.close()
        except:
            pass
        try:
            dst.close()
        except:
            pass

    return convert_to_mp4(filename)

def download_hls_get_stream(url):
    def parse_bandwidth(line):
        params = line.split(":", 1)[1].split(",")
        for kv in params:
            k, v = kv.split("=", 1)
            if k == "BANDWIDTH":
                return int(v)
        return 0

    m3u8 = grab_text(url, 0)
    best_bandwidth = None
    best_url = None
    for line in m3u8.split("\n"):
        if line.startswith("#EXT-X-STREAM-INF:"):
            bandwidth = parse_bandwidth(line)
            if best_bandwidth is None or bandwidth > best_bandwidth:
                best_bandwidth = bandwidth
                best_url = None
        elif not line.startswith("#"):
            if best_url is None:
                best_url = line.strip()

    if not best_url:
        raise Exception("Failed to find best stream for HLS: " + url)

    return best_url

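# Example master playlist for download_hls_get_stream (illustrative):
#   #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1280000
#   low/index.m3u8
#   #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=2560000
#   high/index.m3u8
# returns "high/index.m3u8", the URI that follows the highest-BANDWIDTH
# #EXT-X-STREAM-INF tag.
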
def download_hls_segments(outf, url):
    m3u8 = grab_text(url, 0)

    # Tolerate an HTTP error on the final segment only; some servers list one
    # segment more than they actually serve.
    fail_if_not_last_segment = None
    for line in m3u8.split("\n"):
        if not line.strip() or line.startswith("#"):
            continue

        if fail_if_not_last_segment:
            raise fail_if_not_last_segment

        try:
            download_hls_fetch_segment(outf, line)
        except urllib2.HTTPError, e:
            fail_if_not_last_segment = e
            continue
        sys.stdout.write(".")
        sys.stdout.flush()

    sys.stdout.write("\n")

def download_hls_fetch_segment(outf, segment_url):
    src = _urlopen(segment_url)
    try:
        shutil.copyfileobj(src, outf)
    finally:
        try:
            src.close()
        except:
            pass

def download_hls(filename, m3u8_master_url, hack_url_func=None):
    if hack_url_func is None:
        hack_url_func = lambda url: url

    tmpdir = tempfile.mkdtemp(prefix="webdl-hls")

    print "Downloading: %s" % filename

    try:
        best_stream_url = download_hls_get_stream(hack_url_func(m3u8_master_url))
        ts_file = open(filename, "w")
        download_hls_segments(ts_file, hack_url_func(best_stream_url))
    except KeyboardInterrupt:
        print "\nCancelled", m3u8_master_url
        return False
    finally:
        shutil.rmtree(tmpdir)
        try:
            ts_file.close()
        except:
            pass

    return convert_to_mp4(filename)

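# End-to-end flow of download_hls (as implemented above): e.g.
# download_hls("show.ts", master_url) picks the highest-bandwidth variant,
# appends each .ts segment to show.ts, then convert_to_mp4() remuxes the
# result into show.mp4 via avconv.
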
def natural_sort(l, key=None):
    ignore_list = ["a", "the"]
    def key_func(k):
        if key is not None:
            k = key(k)
        k = k.lower()
        newk = []
        for c in re.split("([0-9]+)", k):
            c = c.strip()
            if c.isdigit():
                newk.append(int(c))
            else:
                for subc in c.split():
                    if subc not in ignore_list:
                        newk.append(subc)
        return newk

    return sorted(l, key=key_func)

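# Example for natural_sort (illustrative): numeric runs compare as integers and
# leading articles are ignored, so
#   natural_sort(["Episode 10", "The Episode 2", "Episode 1"])
# returns ["Episode 1", "The Episode 2", "Episode 10"].
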
def append_to_qs(url, params):
    r = list(urlparse.urlsplit(url))
    qs = urlparse.parse_qs(r[3])
    for k, v in params.iteritems():
        if v is not None:
            qs[k] = v
        elif k in qs:
            del qs[k]
    r[3] = urllib.urlencode(qs, True)
    url = urlparse.urlunsplit(r)
    return url

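# Example for append_to_qs (illustrative): a None value removes the parameter,
# anything else sets or replaces it:
#   append_to_qs("http://example.com/page?a=1", {"b": "2", "a": None})
#   -> "http://example.com/page?b=2"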