code.delx.au - webdl / common.py
commit: hls: allow last segment to be missing
from lxml import etree, html
import cookielib
import json
try:
    import hashlib
except ImportError:
    import md5 as hashlib
import os
import re
import shutil
import signal
import subprocess
import sys
import tempfile
import time
import urllib
import urllib2
import urlparse


try:
    import autosocks
    autosocks.try_autosocks()
except ImportError:
    pass

CACHE_DIR = os.path.expanduser("~/.cache/webdl")
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:21.0) Gecko/20100101 Firefox/21.0"

class Node(object):
    def __init__(self, title, parent=None):
        self.title = title
        if parent:
            parent.children.append(self)
        self.parent = parent
        self.children = []
        self.can_download = False

    def get_children(self):
        if not self.children:
            self.fill_children()
        return self.children

    def fill_children(self):
        pass

    def download(self):
        # NotImplementedError (not the NotImplemented singleton) is the
        # correct thing to raise from an abstract method
        raise NotImplementedError()

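# Illustrative sketch only (not part of webdl): a site module fills the tree
# with Node subclasses roughly like this. EpisodeNode and its url argument
# are hypothetical names.
#
#     class EpisodeNode(Node):
#         def __init__(self, title, parent, url):
#             Node.__init__(self, title, parent)
#             self.can_download = True
#             self.url = url
#
#         def download(self):
#             return download_urllib(self.title + ".flv", self.url)
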
def load_root_node():
    root_node = Node("Root")

    import iview
    iview.fill_nodes(root_node)

    import sbs
    sbs.fill_nodes(root_node)

    import plus7
    plus7.fill_nodes(root_node)

    import brightcove
    brightcove.fill_nodes(root_node)

    return root_node

valid_chars = frozenset("-_.()!@#%^ abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
def sanify_filename(filename):
    filename = filename.encode("ascii", "ignore")
    filename = "".join(c for c in filename if c in valid_chars)
    return filename

cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
def _urlopen(url, referrer=None):
    req = urllib2.Request(url)
    req.add_header("User-Agent", USER_AGENT)
    if referrer:
        req.add_header("Referer", referrer)
    return urlopener.open(req)

def urlopen(url, max_age):
    if not os.path.isdir(CACHE_DIR):
        os.makedirs(CACHE_DIR)

    if max_age <= 0:
        return _urlopen(url)

    # cache on disk, keyed by the md5 of the url
    filename = hashlib.md5(url).hexdigest()
    filename = os.path.join(CACHE_DIR, filename)
    if os.path.exists(filename):
        file_age = int(time.time()) - os.path.getmtime(filename)
        if file_age < max_age:
            return open(filename)

    src = _urlopen(url)
    dst = open(filename, "w")
    try:
        shutil.copyfileobj(src, dst)
    except Exception, e:
        # don't leave a half-written cache file behind
        try:
            os.unlink(filename)
        except OSError:
            pass
        raise e
    src.close()
    dst.close()

    return open(filename)

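# Example usage (hypothetical URL): a second call within max_age seconds is
# served from ~/.cache/webdl instead of hitting the network.
#
#     f = urlopen("http://example.com/feed.xml", 24*3600)
#     data = f.read()
#     f.close()
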
def grab_text(url, max_age):
    f = urlopen(url, max_age)
    text = f.read().decode("utf-8")
    f.close()
    return text

def grab_html(url, max_age):
    f = urlopen(url, max_age)
    doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    f.close()
    return doc

def grab_xml(url, max_age):
    f = urlopen(url, max_age)
    doc = etree.parse(f, etree.XMLParser(encoding="utf-8", recover=True))
    f.close()
    return doc

def grab_json(url, max_age, skip_assignment=False, skip_function=False):
    f = urlopen(url, max_age)
    if skip_assignment:
        # payload looks like "var x = {...}", skip past the "="
        text = f.read()
        pos = text.find("=")
        doc = json.loads(text[pos+1:])
    elif skip_function:
        # payload looks like "callback({...})", take what's inside the parens
        text = f.read()
        pos = text.find("(")
        rpos = text.rfind(")")
        doc = json.loads(text[pos+1:rpos])
    else:
        doc = json.load(f)
    f.close()
    return doc

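# Example (made-up payloads): the skip flags unwrap JSON embedded in javascript.
#
#     var config = {"key": "value"}    ->  grab_json(url, 0, skip_assignment=True)
#     callback({"key": "value"});      ->  grab_json(url, 0, skip_function=True)
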
def exec_subprocess(cmd):
    try:
        p = subprocess.Popen(cmd)
        ret = p.wait()
        if ret != 0:
            print >>sys.stderr, cmd[0], "exited with error code:", ret
            return False
        else:
            return True
    except OSError, e:
        print >>sys.stderr, "Failed to run", cmd[0], e
    except KeyboardInterrupt:
        print "Cancelled", cmd
        try:
            p.terminate()
            p.wait()
        except KeyboardInterrupt:
            # a second ctrl-c kills the child immediately
            p.send_signal(signal.SIGKILL)
            p.wait()
    return False


def convert_flv_mp4(orig_filename):
    basename = os.path.splitext(orig_filename)[0]
    flv_filename = basename + ".flv"
    mp4_filename = basename + ".mp4"
    if orig_filename != flv_filename:
        os.rename(orig_filename, flv_filename)
    print "Converting %s to mp4" % flv_filename
    if not avconv_remux(flv_filename, mp4_filename):
        # failed, error has already been logged
        return
    try:
        flv_size = os.stat(flv_filename).st_size
        mp4_size = os.stat(mp4_filename).st_size
        # a remux should leave the size roughly unchanged; within 5% we
        # trust the result and delete the original
        if abs(flv_size - mp4_size) < 0.05 * flv_size:
            os.unlink(flv_filename)
        else:
            print >>sys.stderr, "The size of", mp4_filename, "is suspicious, did avconv fail?"
    except Exception, e:
        print "Conversion failed", e

def avconv_remux(infile, outfile):
    # copy the audio and video streams into a new container without
    # re-encoding
    cmd = [
        "avconv",
        "-i", infile,
        "-acodec", "copy",
        "-vcodec", "copy",
        outfile,
    ]
    return exec_subprocess(cmd)

def convert_filename(filename):
    if os.path.splitext(filename.lower())[1] in (".mp4", ".flv"):
        f = open(filename)
        fourcc = f.read(4)
        f.close()
        # "FLV\x01" is the magic number at the start of an FLV file
        if fourcc == "FLV\x01":
            convert_flv_mp4(filename)

def download_rtmp(filename, vbase, vpath, hash_url=None):
    filename = sanify_filename(filename)
    print "Downloading: %s" % filename
    if vpath.endswith(".flv"):
        vpath = vpath[:-4]
    cmd = [
        "rtmpdump",
        "-o", filename,
        "-r", vbase,
        "-y", vpath,
    ]
    if hash_url is not None:
        cmd += ["--swfVfy", hash_url]
    if exec_subprocess(cmd):
        convert_filename(filename)
        return True
    else:
        return False

def download_urllib(filename, url, referrer=None):
    filename = sanify_filename(filename)
    print "Downloading: %s" % filename
    try:
        src = _urlopen(url, referrer)
        dst = open(filename, "w")
        while True:
            # copy in 1MB chunks, printing a dot per chunk as progress
            buf = src.read(1024*1024)
            if not buf:
                break
            dst.write(buf)
            sys.stdout.write(".")
            sys.stdout.flush()
        print
    except KeyboardInterrupt:
        print "\nCancelled", url
        return False
    finally:
        try:
            src.close()
        except:
            pass
        try:
            dst.close()
        except:
            pass

    convert_filename(filename)
    return True

def download_hls_get_stream(url, hack_url_func):
    url = hack_url_func(url)

    def parse_bandwidth(line):
        # naive attribute parsing; may mis-split quoted values that
        # contain commas (e.g. CODECS="...")
        params = line.split(":", 1)[1].split(",")
        for kv in params:
            k, v = kv.split("=", 1)
            if k == "BANDWIDTH":
                return int(v)
        return 0

    m3u8 = grab_text(url, 0)
    best_bandwidth = None
    best_url = None
    for line in m3u8.split("\n"):
        if line.startswith("#EXT-X-STREAM-INF:"):
            bandwidth = parse_bandwidth(line)
            if best_bandwidth is None or bandwidth > best_bandwidth:
                best_bandwidth = bandwidth
                best_url = None
        elif not line.startswith("#"):
            # the first non-comment line after the best STREAM-INF tag
            # is its URI
            if best_url is None:
                best_url = line.strip()

    if not best_url:
        raise Exception("Failed to find best stream for HLS: " + url)

    return best_url

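# Worked example (made-up playlist): given this master m3u8, the function
# returns "high.m3u8", the URI following the highest BANDWIDTH entry.
#
#     #EXTM3U
#     #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=400000
#     low.m3u8
#     #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1200000
#     high.m3u8
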
def download_hls_segments(tmpdir, url, hack_url_func):
    m3u8 = grab_text(url, 0)

    local_m3u8_filename = tmpdir + "/index.m3u8"
    local_m3u8 = open(local_m3u8_filename, "w")

    i = 1
    fail_if_not_last_segment = None
    for line in m3u8.split("\n"):
        if not line.strip():
            continue
        if line.startswith("#"):
            local_m3u8.write(line + "\n")
            continue

        # a segment failed to download earlier; that is tolerable only if
        # it was the last one, so seeing another segment now is an error
        if fail_if_not_last_segment:
            raise fail_if_not_last_segment

        outfile = "%s/segment_%d.ts" % (tmpdir, i)
        i += 1
        try:
            download_hls_fetch_segment(hack_url_func(line), outfile)
        except urllib2.HTTPError, e:
            fail_if_not_last_segment = e
            continue
        local_m3u8.write(outfile + "\n")
        sys.stdout.write(".")
        sys.stdout.flush()

    sys.stdout.write("\n")

    local_m3u8.close()
    return local_m3u8_filename

def download_hls_fetch_segment(segment, outfile):
    try:
        src = _urlopen(segment)
        dst = open(outfile, "w")
        shutil.copyfileobj(src, dst)
    except:
        print >>sys.stderr, "Failed to fetch", segment
        raise
    finally:
        try:
            src.close()
        except:
            pass
        try:
            dst.close()
        except:
            pass

def download_hls(filename, m3u8_master_url, hack_url_func=None):
    if hack_url_func is None:
        hack_url_func = lambda url: url

    tmpdir = tempfile.mkdtemp(prefix="webdl-hls")
    filename = sanify_filename(filename)

    print "Downloading: %s" % filename

    try:
        best_stream_url = download_hls_get_stream(m3u8_master_url, hack_url_func)
        local_m3u8 = download_hls_segments(tmpdir, best_stream_url, hack_url_func)
        avconv_remux(local_m3u8, filename)
        return True
    except KeyboardInterrupt:
        print "\nCancelled", m3u8_master_url
        return False
    finally:
        shutil.rmtree(tmpdir)

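# Example usage (hypothetical URL): fetch the best variant of a master
# playlist, then remux the local segment list into a single file.
#
#     download_hls("Some Show.mp4", "http://example.com/master.m3u8")
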
def natural_sort(l, key=None):
    ignore_list = ["a", "the"]
    def key_func(k):
        if key is not None:
            k = key(k)
        k = k.lower()
        newk = []
        # split into alternating text/number chunks so that numbers
        # compare numerically, e.g. "2" sorts before "10"
        for c in re.split("([0-9]+)", k):
            c = c.strip()
            if c.isdigit():
                newk.append(int(c))
            else:
                for subc in c.split():
                    if subc not in ignore_list:
                        newk.append(subc)
        return newk

    return sorted(l, key=key_func)

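# Example (made-up titles): numbers compare numerically and leading articles
# are ignored.
#
#     natural_sort(["Episode 10", "The Episode 2", "episode 1"])
#     ->  ["episode 1", "The Episode 2", "Episode 10"]
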
def append_to_qs(url, params):
    r = list(urlparse.urlsplit(url))
    qs = urlparse.parse_qs(r[3])
    for k, v in params.iteritems():
        if v is not None:
            qs[k] = v
        elif k in qs:
            del qs[k]
    r[3] = urllib.urlencode(qs, True)
    url = urlparse.urlunsplit(r)
    return url
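
# Example (made-up URL): set or replace a parameter, or remove one by
# passing None.
#
#     append_to_qs("http://example.com/v?a=1&b=2", {"b": "3", "a": None})
#     ->  "http://example.com/v?b=3"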