# webdl common.py: support for ffmpeg as well as avconv
from lxml import etree, html
import cookielib
import json
try:
    import hashlib
except ImportError:
    import md5 as hashlib
import os
import re
import shutil
import signal
import subprocess
import sys
import tempfile
import time
import urllib
import urllib2
import urlparse


try:
    import autosocks
    autosocks.try_autosocks()
except ImportError:
    pass

CACHE_DIR = os.path.expanduser("~/.cache/webdl")
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:21.0) Gecko/20100101 Firefox/21.0"

class Node(object):
    def __init__(self, title, parent=None):
        self.title = title
        if parent:
            parent.children.append(self)
        self.parent = parent
        self.children = []
        self.can_download = False

    def get_children(self):
        if not self.children:
            self.fill_children()
        return self.children

    def fill_children(self):
        pass

    def download(self):
        # NotImplemented is not an exception class; raising it is a
        # TypeError, so raise NotImplementedError instead
        raise NotImplementedError()

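# Example (illustrative): site modules subclass Node, register themselves
# under a parent node, and implement fill_children()/download(). The names
# below (EpisodeNode, VBASE, vpath) are hypothetical, not part of webdl:
#
#   class EpisodeNode(Node):
#       def __init__(self, title, parent, vpath):
#           Node.__init__(self, title, parent)
#           self.can_download = True
#           self.vpath = vpath
#
#       def download(self):
#           return download_rtmp(self.title + ".flv", VBASE, self.vpath)
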
def load_root_node():
    root_node = Node("Root")

    import iview
    iview.fill_nodes(root_node)

    import sbs
    sbs.fill_nodes(root_node)

    import plus7
    plus7.fill_nodes(root_node)

    import brightcove
    brightcove.fill_nodes(root_node)

    return root_node

valid_chars = frozenset("-_.()!@#%^ abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
def sanify_filename(filename):
    filename = filename.encode("ascii", "ignore")
    filename = "".join(c for c in filename if c in valid_chars)
    return filename

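# Example (illustrative): characters outside the whitelist are dropped, so
#   sanify_filename(u"News: 7:30 Report / Part 1") => "News 730 Report  Part 1"
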
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
def _urlopen(url, referrer=None):
    req = urllib2.Request(url)
    req.add_header("User-Agent", USER_AGENT)
    if referrer:
        req.add_header("Referer", referrer)
    return urlopener.open(req)

def urlopen(url, max_age):
    if not os.path.isdir(CACHE_DIR):
        os.makedirs(CACHE_DIR)

    if max_age <= 0:
        return _urlopen(url)

    # The cache key is the MD5 of the URL; reuse the cached copy while it
    # is younger than max_age seconds
    filename = hashlib.md5(url).hexdigest()
    filename = os.path.join(CACHE_DIR, filename)
    if os.path.exists(filename):
        file_age = int(time.time()) - os.path.getmtime(filename)
        if file_age < max_age:
            return open(filename)

    src = _urlopen(url)
    dst = open(filename, "w")
    try:
        shutil.copyfileobj(src, dst)
    except Exception, e:
        # Don't leave a truncated file in the cache
        try:
            os.unlink(filename)
        except OSError:
            pass
        raise e
    src.close()
    dst.close()

    return open(filename)

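# Example (illustrative): cache an index page for an hour, but always
# re-fetch time-sensitive playlists:
#   f = urlopen("http://example.com/feed.xml", 3600)  # cached if fresh enough
#   f = urlopen("http://example.com/live.m3u8", 0)    # always hits the network
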
def grab_text(url, max_age):
    f = urlopen(url, max_age)
    text = f.read().decode("utf-8")
    f.close()
    return text

def grab_html(url, max_age):
    f = urlopen(url, max_age)
    doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
    f.close()
    return doc

def grab_xml(url, max_age):
    f = urlopen(url, max_age)
    doc = etree.parse(f, etree.XMLParser(encoding="utf-8", recover=True))
    f.close()
    return doc

def grab_json(url, max_age, skip_assignment=False, skip_function=False):
    f = urlopen(url, max_age)
    if skip_assignment:
        # Parse everything after the "=" of a "var x = {...}" response
        text = f.read()
        pos = text.find("=")
        doc = json.loads(text[pos+1:])
    elif skip_function:
        # Parse the argument of a JSONP-style "callback({...})" response
        text = f.read()
        pos = text.find("(")
        rpos = text.rfind(")")
        doc = json.loads(text[pos+1:rpos])
    else:
        doc = json.load(f)
    f.close()
    return doc

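# Example (illustrative): the flags strip common wrappers around JSON
# responses:
#   grab_json(url, 3600)                        # {"a": 1}
#   grab_json(url, 3600, skip_assignment=True)  # var data = {"a": 1};
#   grab_json(url, 3600, skip_function=True)    # callback({"a": 1});
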
def exec_subprocess(cmd):
    try:
        p = subprocess.Popen(cmd)
        ret = p.wait()
        if ret != 0:
            print >>sys.stderr, cmd[0], "exited with error code:", ret
            return False
        else:
            return True
    except OSError, e:
        print >>sys.stderr, "Failed to run", cmd[0], e
    except KeyboardInterrupt:
        print "Cancelled", cmd
        try:
            p.terminate()
            p.wait()
        except KeyboardInterrupt:
            p.send_signal(signal.SIGKILL)
            p.wait()
    return False


def check_command_exists(cmd):
    try:
        subprocess.check_output(cmd)
        return True
    except Exception:
        return False

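# Example (illustrative): used below to probe for remux tools, e.g.
#   check_command_exists(["ffmpeg", "--help"])  # True only if ffmpeg runs OK
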
def generate_remux_cmd(infile, outfile):
    # Prefer avconv; fall back to ffmpeg, which accepts the same arguments
    if check_command_exists(["avconv", "--help"]):
        return [
            "avconv",
            "-i", infile,
            "-bsf:a", "aac_adtstoasc",
            "-acodec", "copy",
            "-vcodec", "copy",
            outfile,
        ]

    if check_command_exists(["ffmpeg", "--help"]):
        return [
            "ffmpeg",
            "-i", infile,
            "-bsf:a", "aac_adtstoasc",
            "-acodec", "copy",
            "-vcodec", "copy",
            outfile,
        ]

    raise Exception("You must install ffmpeg or libav-tools")

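# For reference, the generated command is equivalent to running:
#   avconv -i in.flv -bsf:a aac_adtstoasc -acodec copy -vcodec copy out.mp4
# Both streams are copied unchanged; only the container is rewritten, with
# the ADTS-to-ASC bitstream filter applied to the AAC audio.
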
def remux(infile, outfile):
    print "Converting %s to mp4" % infile
    cmd = generate_remux_cmd(infile, outfile)
    if not exec_subprocess(cmd):
        # failed, error has already been logged
        return False
    try:
        flv_size = os.stat(infile).st_size
        mp4_size = os.stat(outfile).st_size
        # Remuxing only rewrites the container, so the output should be
        # within 10% of the input size
        if abs(flv_size - mp4_size) < 0.1 * flv_size:
            os.unlink(infile)
            return True
        else:
            print >>sys.stderr, "The size of", outfile, "is suspicious, did the remux fail?"
            return False
    except Exception, e:
        print >>sys.stderr, "Conversion failed", e
        return False

def convert_to_mp4(filename):
    with open(filename) as f:
        fourcc = f.read(4)
    basename, ext = os.path.splitext(filename)

    if ext == ".mp4" and fourcc == "FLV\x01":
        # The server lied about the type; rename the file to match its
        # actual contents
        os.rename(filename, basename + ".flv")
        ext = ".flv"
        filename = basename + ext

    if ext in (".flv", ".ts"):
        filename_mp4 = basename + ".mp4"
        return remux(filename, filename_mp4)

    return ext == ".mp4"


def download_rtmp(filename, vbase, vpath, hash_url=None):
    filename = sanify_filename(filename)
    print "Downloading: %s" % filename
    if vpath.endswith(".flv"):
        vpath = vpath[:-4]
    cmd = [
        "rtmpdump",
        "-o", filename,
        "-r", vbase,
        "-y", vpath,
    ]
    if hash_url is not None:
        cmd += ["--swfVfy", hash_url]
    if exec_subprocess(cmd):
        return convert_to_mp4(filename)
    else:
        return False

def download_urllib(filename, url, referrer=None):
    filename = sanify_filename(filename)
    print "Downloading: %s" % filename
    try:
        src = _urlopen(url, referrer)
        dst = open(filename, "w")
        while True:
            buf = src.read(1024*1024)
            if not buf:
                break
            dst.write(buf)
            sys.stdout.write(".")
            sys.stdout.flush()
        print
    except KeyboardInterrupt:
        print "\nCancelled", url
        return False
    finally:
        try:
            src.close()
        except:
            pass
        try:
            dst.close()
        except:
            pass

    return convert_to_mp4(filename)

def download_hls_get_stream(url):
    def parse_bandwidth(line):
        params = line.split(":", 1)[1].split(",")
        for kv in params:
            k, v = kv.split("=", 1)
            if k == "BANDWIDTH":
                return int(v)
        return 0

    m3u8 = grab_text(url, 0)
    best_bandwidth = None
    best_url = None
    for line in m3u8.split("\n"):
        if line.startswith("#EXT-X-STREAM-INF:"):
            bandwidth = parse_bandwidth(line)
            if best_bandwidth is None or bandwidth > best_bandwidth:
                best_bandwidth = bandwidth
                # The stream URL follows on the next non-comment line;
                # clear best_url so that line is captured below
                best_url = None
        elif not line.startswith("#"):
            if best_url is None:
                best_url = line.strip()

    if not best_url:
        raise Exception("Failed to find best stream for HLS: " + url)

    return best_url

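# Example (illustrative): given a master playlist like the following,
# download_hls_get_stream() picks "hi.m3u8", the highest-bandwidth variant:
#   #EXTM3U
#   #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=640000
#   lo.m3u8
#   #EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1280000
#   hi.m3u8
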
def download_hls_segments(outf, url):
    m3u8 = grab_text(url, 0)

    # Servers sometimes 404 the final segment; tolerate that, but fail if
    # any earlier segment is missing
    fail_if_not_last_segment = None
    for line in m3u8.split("\n"):
        if not line.strip() or line.startswith("#"):
            continue

        if fail_if_not_last_segment:
            raise fail_if_not_last_segment

        try:
            download_hls_fetch_segment(outf, line)
        except urllib2.HTTPError, e:
            fail_if_not_last_segment = e
            continue
        sys.stdout.write(".")
        sys.stdout.flush()

    sys.stdout.write("\n")

def download_hls_fetch_segment(outf, segment_url):
    src = None
    try:
        src = _urlopen(segment_url)
        shutil.copyfileobj(src, outf)
    finally:
        if src is not None:
            src.close()

def download_hls(filename, m3u8_master_url, hack_url_func=None):
    filename = sanify_filename(filename)
    print "Downloading: %s" % filename

    if hack_url_func is None:
        hack_url_func = lambda url: url

    tmpdir = tempfile.mkdtemp(prefix="webdl-hls")
    ts_file = None
    try:
        best_stream_url = download_hls_get_stream(hack_url_func(m3u8_master_url))
        ts_file = open(filename, "w")
        download_hls_segments(ts_file, hack_url_func(best_stream_url))
    except KeyboardInterrupt:
        print "\nCancelled", m3u8_master_url
        return False
    finally:
        shutil.rmtree(tmpdir)
        if ts_file is not None:
            ts_file.close()

    return convert_to_mp4(filename)

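# Example (illustrative):
#   download_hls("Show Episode 1.ts", "http://example.com/master.m3u8")
# fetches the best variant's segments into Show Episode 1.ts, then remuxes
# it to Show Episode 1.mp4 via convert_to_mp4().
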
def natural_sort(l, key=None):
    ignore_list = ["a", "the"]
    def key_func(k):
        if key is not None:
            k = key(k)
        k = k.lower()
        newk = []
        for c in re.split("([0-9]+)", k):
            c = c.strip()
            if c.isdigit():
                # Compare runs of digits numerically rather than lexically
                newk.append(int(c))
            else:
                for subc in c.split():
                    if subc not in ignore_list:
                        newk.append(subc)
        return newk

    return sorted(l, key=key_func)

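# Example (illustrative): digit runs sort numerically and leading articles
# ("a", "the") are ignored:
#   natural_sort(["Episode 10", "Episode 2", "The Episode 1"])
#   => ["The Episode 1", "Episode 2", "Episode 10"]
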
def append_to_qs(url, params):
    r = list(urlparse.urlsplit(url))
    qs = urlparse.parse_qs(r[3])
    for k, v in params.iteritems():
        if v is not None:
            qs[k] = v
        elif k in qs:
            del qs[k]
    r[3] = urllib.urlencode(qs, True)
    url = urlparse.urlunsplit(r)
    return url
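# Example (illustrative; parameter order in the result may vary):
#   append_to_qs("http://example.com/?a=1", {"b": "2"})
#   => "http://example.com/?a=1&b=2"
#   append_to_qs("http://example.com/?a=1", {"a": None})  # None deletes the key
#   => "http://example.com/"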