# webdl/common.py (from code.delx.au)
# Commit: "iview: Use HLS instead of RTMP"
1 from lxml import etree, html
2 import cookielib
3 import json
4 try:
5 import hashlib
6 except ImportError:
7 import md5 as hashlib
8 import os
9 import re
10 import shutil
11 import signal
12 import subprocess
13 import sys
14 import tempfile
15 import time
16 import urllib
17 import urllib2
18 import urlparse
19
20
# Optionally route all HTTP traffic through a SOCKS proxy when the
# autosocks helper module is present; run without it otherwise.
try:
    import autosocks
    autosocks.try_autosocks()
except ImportError:
    pass

# Directory where urlopen() stores cached HTTP responses.
CACHE_DIR = os.path.expanduser("~/.cache/webdl")
# Sent on every request (see _urlopen); presumably to look like a regular
# desktop browser to the video sites.
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:21.0) Gecko/20100101 Firefox/21.0"
29
class Node(object):
    """A node in the tree of browsable/downloadable items.

    Constructing a node with a parent appends it to the parent's
    children. Leaf nodes that can actually be fetched set
    can_download = True and override download().
    """

    def __init__(self, title, parent=None):
        self.title = title
        if parent:
            parent.children.append(self)
        self.parent = parent
        self.children = []
        self.can_download = False

    def get_children(self):
        # Lazily populate children on first access.
        if not self.children:
            self.fill_children()
        return self.children

    def fill_children(self):
        # Subclasses override this to populate self.children.
        pass

    def download(self):
        # BUG FIX: the original "raise NotImplemented" actually raised a
        # TypeError, because NotImplemented is not an exception class.
        raise NotImplementedError()
49
50
def load_root_node():
    """Build and return the root of the menu tree, one subtree per site module."""
    root_node = Node("Root")
    # Each site module knows how to attach its own children to the root.
    for module_name in ("iview", "sbs", "plus7", "brightcove"):
        module = __import__(module_name)
        module.fill_nodes(root_node)
    return root_node
67
# Whitelist of characters that are safe in a downloaded filename.
valid_chars = frozenset("-_.()!@#%^ abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")

def sanify_filename(filename):
    """Return *filename* reduced to safe ASCII characters from valid_chars.

    Non-ASCII characters are dropped entirely, then anything outside the
    whitelist (path separators, colons, ...) is removed.
    """
    # BUG FIX: decode back to text after the lossy ASCII encode; without
    # the decode the join operates on bytes and silently produces an
    # empty name under unicode-by-default semantics.
    filename = filename.encode("ascii", "ignore").decode("ascii")
    return "".join(c for c in filename if c in valid_chars)
73
# One shared, cookie-aware opener for the whole process.
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))

def _urlopen(url, referrer=None):
    """Open *url* through the shared cookie-aware opener.

    Always sends our USER_AGENT; sends a Referer header when given.
    """
    headers = {"User-Agent": USER_AGENT}
    if referrer:
        headers["Referer"] = referrer
    request = urllib2.Request(url, headers=headers)
    return urlopener.open(request)
82
def urlopen(url, max_age):
    """Open *url*, caching the response body in CACHE_DIR for *max_age* seconds.

    max_age <= 0 bypasses the cache entirely. Returns a readable file
    object: either the live response or an open handle on the cached copy.
    """
    if not os.path.isdir(CACHE_DIR):
        os.makedirs(CACHE_DIR)

    if max_age <= 0:
        return _urlopen(url)

    # Cache key is the MD5 of the URL (not security sensitive).
    filename = os.path.join(CACHE_DIR, hashlib.md5(url).hexdigest())
    if os.path.exists(filename):
        file_age = int(time.time()) - os.path.getmtime(filename)
        if file_age < max_age:
            return open(filename)

    src = _urlopen(url)
    try:
        dst = open(filename, "w")
        try:
            shutil.copyfileobj(src, dst)
        except Exception:
            dst.close()
            # Don't leave a truncated cache entry behind.
            try:
                os.unlink(filename)
            except OSError:
                pass
            # BUG FIX: bare "raise" preserves the original traceback;
            # the old "raise e" discarded it.
            raise
        dst.close()
    finally:
        # BUG FIX: the response was previously leaked on any error.
        src.close()

    return open(filename)
112
def grab_text(url, max_age):
    """Fetch *url* via the cache and return the body decoded as UTF-8."""
    f = urlopen(url, max_age)
    try:
        return f.read().decode("utf-8")
    finally:
        # Close the handle even if read/decode fails (was leaked before).
        f.close()
118
def grab_html(url, max_age):
    """Fetch *url* via the cache and parse it leniently as UTF-8 HTML."""
    f = urlopen(url, max_age)
    try:
        return html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    finally:
        # Close the handle even if parsing fails (was leaked before).
        f.close()
124
def grab_xml(url, max_age):
    """Fetch *url* via the cache and parse it leniently as UTF-8 XML."""
    f = urlopen(url, max_age)
    try:
        return etree.parse(f, etree.XMLParser(encoding="utf-8", recover=True))
    finally:
        # Close the handle even if parsing fails (was leaked before).
        f.close()
130
def grab_json(url, max_age, skip_assignment=False, skip_function=False):
    """Fetch *url* via the cache and parse it as JSON.

    skip_assignment: response looks like "var x = {...}"; parse after the
    first "=". skip_function: response is JSONP like "cb({...})"; parse
    between the outermost parentheses.
    """
    f = urlopen(url, max_age)
    try:
        if skip_assignment:
            text = f.read()
            pos = text.find("=")
            return json.loads(text[pos+1:])
        elif skip_function:
            text = f.read()
            pos = text.find("(")
            rpos = text.rfind(")")
            return json.loads(text[pos+1:rpos])
        else:
            return json.load(f)
    finally:
        # BUG FIX: the handle was never closed when json parsing raised.
        f.close()
146
147 def exec_subprocess(cmd):
148 try:
149 p = subprocess.Popen(cmd)
150 ret = p.wait()
151 if ret != 0:
152 print >>sys.stderr, cmd[0], "exited with error code:", ret
153 return False
154 else:
155 return True
156 except OSError, e:
157 print >>sys.stderr, "Failed to run", cmd[0], e
158 except KeyboardInterrupt:
159 print "Cancelled", cmd
160 try:
161 p.terminate()
162 p.wait()
163 except KeyboardInterrupt:
164 p.send_signal(signal.SIGKILL)
165 p.wait()
166 return False
167
168
def convert_flv_mp4(orig_filename):
    # Remux a downloaded FLV into an MP4 container via avconv, then delete
    # the FLV if the MP4 looks complete. orig_filename may have any
    # extension; it is first renamed to "<base>.flv".
    basename = os.path.splitext(orig_filename)[0]
    flv_filename = basename + ".flv"
    mp4_filename = basename + ".mp4"
    if orig_filename != flv_filename:
        os.rename(orig_filename, flv_filename)
    print "Converting %s to mp4" % flv_filename
    if not avconv_remux(flv_filename, mp4_filename):
        # failed, error has already been logged
        return
    try:
        flv_size = os.stat(flv_filename).st_size
        mp4_size = os.stat(mp4_filename).st_size
        # Remuxing only copies streams, so the sizes should be within ~5%;
        # a bigger gap suggests avconv produced a truncated file, in which
        # case the FLV is kept so nothing is lost.
        if abs(flv_size - mp4_size) < 0.05 * flv_size:
            os.unlink(flv_filename)
        else:
            print >>sys.stderr, "The size of", mp4_filename, "is suspicious, did avconv fail?"
    except Exception, e:
        print "Conversion failed", e
188
def avconv_remux(infile, outfile):
    """Copy the audio/video streams of *infile* into *outfile* without re-encoding.

    Returns True on success (see exec_subprocess).
    """
    return exec_subprocess([
        "avconv",
        "-i", infile,
        "-acodec", "copy",
        "-vcodec", "copy",
        outfile,
    ])
198
def convert_filename(filename):
    """Remux *filename* to MP4 if it is really FLV data.

    Trusts the magic number, not the extension: only .mp4/.flv files are
    inspected, and only a genuine FLV signature triggers conversion.
    """
    if os.path.splitext(filename.lower())[1] in (".mp4", ".flv"):
        # BUG FIX: read the magic number in binary mode; text mode can
        # mangle the bytes on platforms that translate newlines.
        f = open(filename, "rb")
        fourcc = f.read(4)
        f.close()
        if fourcc == b"FLV\x01":
            convert_flv_mp4(filename)
206
207 def download_rtmp(filename, vbase, vpath, hash_url=None):
208 filename = sanify_filename(filename)
209 print "Downloading: %s" % filename
210 if vpath.endswith(".flv"):
211 vpath = vpath[:-4]
212 cmd = [
213 "rtmpdump",
214 "-o", filename,
215 "-r", vbase,
216 "-y", vpath,
217 ]
218 if hash_url is not None:
219 cmd += ["--swfVfy", hash_url]
220 if exec_subprocess(cmd):
221 convert_filename(filename)
222 return True
223 else:
224 return False
225
226 def download_urllib(filename, url, referrer=None):
227 filename = sanify_filename(filename)
228 print "Downloading: %s" % filename
229 try:
230 src = _urlopen(url, referrer)
231 dst = open(filename, "w")
232 while True:
233 buf = src.read(1024*1024)
234 if not buf:
235 break
236 dst.write(buf)
237 sys.stdout.write(".")
238 sys.stdout.flush()
239 print
240 except KeyboardInterrupt:
241 print "\nCancelled", url
242 return False
243 finally:
244 try:
245 src.close()
246 except:
247 pass
248 try:
249 dst.close()
250 except:
251 pass
252
253 convert_filename(filename)
254 return True
255
def download_hls_get_stream(url, hack_url_func):
    """Fetch an HLS master playlist and return the highest-bandwidth variant URL.

    hack_url_func is applied to the master URL before fetching
    (site-specific URL rewriting). Raises Exception when no variant
    stream can be found.
    """
    url = hack_url_func(url)

    def parse_bandwidth(line):
        # Line looks like "#EXT-X-STREAM-INF:BANDWIDTH=1234,CODECS=...".
        params = line.split(":", 1)[1].split(",")
        for kv in params:
            # BUG FIX: quoted attributes such as CODECS="a,b" split into
            # fragments with no "=", which used to crash the unpack.
            if "=" not in kv:
                continue
            k, v = kv.split("=", 1)
            if k == "BANDWIDTH":
                return int(v)
        return 0

    m3u8 = grab_text(url, 0)
    best_bandwidth = None
    best_url = None
    for line in m3u8.split("\n"):
        # BUG FIX: a blank line used to be captured as the variant URI,
        # blocking the real one; skip empty lines entirely.
        line = line.strip()
        if not line:
            continue
        if line.startswith("#EXT-X-STREAM-INF:"):
            bandwidth = parse_bandwidth(line)
            if best_bandwidth is None or bandwidth > best_bandwidth:
                best_bandwidth = bandwidth
                # The next non-comment line is this variant's URI.
                best_url = None
        elif not line.startswith("#"):
            if best_url is None:
                best_url = line

    if not best_url:
        raise Exception("Failed to find best stream for HLS: " + url)

    # BUG FIX: variant URIs may be relative to the master playlist URL.
    return urlparse.urljoin(url, best_url)
284
def download_hls_segments(tmpdir, url, hack_url_func):
    """Download every media segment of the variant playlist at *url* into
    *tmpdir*, printing a dot per segment.

    Returns the path of a rewritten local playlist whose segment entries
    point at the downloaded files, suitable for feeding to avconv.

    NOTE(review): hack_url_func is accepted for symmetry with
    download_hls_get_stream but is not currently applied here — confirm
    whether segment URLs also need rewriting.
    """
    m3u8 = grab_text(url, 0)

    local_m3u8_filename = tmpdir + "/index.m3u8"
    local_m3u8 = open(local_m3u8_filename, "w")
    try:
        i = 1
        for line in m3u8.split("\n"):
            if not line.strip():
                continue
            if line.startswith("#"):
                # Tags are copied through unchanged.
                local_m3u8.write(line + "\n")
                continue

            outfile = "%s/segment_%d.ts" % (tmpdir, i)
            i += 1
            local_m3u8.write(outfile + "\n")
            # BUG FIX: segment URIs may be relative to the playlist URL.
            download_hls_fetch_segment(urlparse.urljoin(url, line.strip()), outfile)
            sys.stdout.write(".")
            sys.stdout.flush()
        sys.stdout.write("\n")
    finally:
        # BUG FIX: the playlist handle was leaked if a fetch failed.
        local_m3u8.close()
    return local_m3u8_filename
311
def download_hls_fetch_segment(segment, outfile):
    """Fetch one transport-stream segment URL into *outfile*."""
    src = None
    dst = None
    try:
        src = _urlopen(segment)
        # BUG FIX: write binary data in binary mode; text mode corrupts
        # TS data on platforms that translate line endings.
        dst = open(outfile, "wb")
        shutil.copyfileobj(src, dst)
    finally:
        if src is not None:
            src.close()
        if dst is not None:
            dst.close()
326
327 def download_hls(filename, m3u8_master_url, hack_url_func=None):
328 if hack_url_func is None:
329 hack_url_func = lambda url: url
330
331 tmpdir = tempfile.mkdtemp(prefix="webdl-hls")
332 filename = sanify_filename(filename)
333
334 print "Downloading: %s" % filename
335
336 try:
337 best_stream_url = download_hls_get_stream(m3u8_master_url, hack_url_func)
338 local_m3u8 = download_hls_segments(tmpdir, best_stream_url, hack_url_func)
339 avconv_remux(local_m3u8, filename)
340 return False
341 except KeyboardInterrupt:
342 print "\nCancelled", m3u8_master_url
343 return False
344 finally:
345 shutil.rmtree(tmpdir)
346
347 return True
348
def natural_sort(l, key=None):
    """Sort *l* naturally: numeric runs compare as integers, words
    case-insensitively, and leading articles ("a", "the") are ignored.

    key, when given, extracts the string to sort by from each element.
    """
    ignore_list = ["a", "the"]

    def key_func(item):
        text = key(item) if key is not None else item
        text = text.lower()
        parts = []
        for chunk in re.split("([0-9]+)", text):
            chunk = chunk.strip()
            if chunk.isdigit():
                parts.append(int(chunk))
            else:
                parts.extend(word for word in chunk.split() if word not in ignore_list)
        return parts

    return sorted(l, key=key_func)
367
def append_to_qs(url, params):
    """Return *url* with *params* merged into its query string.

    A value of None removes that key from the query string; any other
    value replaces it. List values produce repeated parameters (doseq).
    """
    r = list(urlparse.urlsplit(url))
    qs = urlparse.parse_qs(r[3])
    # items()/"in" instead of the deprecated iteritems()/has_key().
    for k, v in params.items():
        if v is not None:
            qs[k] = v
        elif k in qs:
            # None means "remove this parameter".
            del qs[k]
    r[3] = urllib.urlencode(qs, True)
    return urlparse.urlunsplit(r)
379