[webdl] / common.py
import python2_compat

import hashlib
import http.cookiejar
import json
import logging
import lxml.etree
import lxml.html
import os
import re
import shutil
import signal
import subprocess
import time
import urllib.parse
import urllib.request

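# Optional SOCKS proxy support: if the autosocks module is available,
# try to route all HTTP traffic through the configured proxy.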
try:
    import autosocks
    autosocks.try_autosocks()
except ImportError:
    pass


logging.basicConfig(
    format = "%(levelname)s %(message)s",
    level = logging.INFO if os.environ.get("DEBUG", None) is None else logging.DEBUG,
)

CACHE_DIR = os.path.join(
    os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")),
    "webdl"
)

USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:21.0) Gecko/20100101 Firefox/21.0"

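# A node in the show/episode menu tree. Leaf nodes set can_download and
# override download(); children are filled lazily via fill_children().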
class Node(object):
    def __init__(self, title, parent=None):
        self.title = title
        if parent:
            parent.children.append(self)
        self.parent = parent
        self.children = []
        self.can_download = False

    def get_children(self):
        if not self.children:
            self.fill_children()
        return self.children

    def fill_children(self):
        pass

    def download(self):
        raise NotImplementedError()

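# Build the menu tree by asking each site module to attach its shows
# beneath the root node.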
def load_root_node():
    root_node = Node("Root")

    import iview
    iview.fill_nodes(root_node)

    import sbs
    sbs.fill_nodes(root_node)

    import brightcove
    brightcove.fill_nodes(root_node)

    return root_node

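# Make a string safe to use as a filename by dropping every character
# that is not on the whitelist.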
valid_chars = frozenset("-_.()!@#%^ abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
def sanify_filename(filename):
    filename = "".join(c for c in filename if c in valid_chars)
    assert len(filename) > 0
    return filename

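# Give scheme-less URLs an explicit scheme, e.g. the protocol-relative
# "//example.com/stream" becomes "http://example.com/stream".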
def ensure_scheme(url):
    parts = urllib.parse.urlparse(url)
    if parts.scheme:
        return url
    parts = list(parts)
    parts[0] = "http"
    return urllib.parse.urlunparse(parts)

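# All requests share a single opener so that cookies persist between
# calls; a Referer header can be supplied per request.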
cookiejar = http.cookiejar.CookieJar()
urlopener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookiejar))
def _urlopen(url, referrer=None):
    url = ensure_scheme(url)
    req = urllib.request.Request(url)
    req.add_header("User-Agent", USER_AGENT)
    if referrer:
        req.add_header("Referer", referrer)
    return urlopener.open(req)

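# Fetch a URL through an on-disk cache keyed by the MD5 of the URL.
# A cached copy is reused until it is more than max_age seconds old;
# max_age <= 0 bypasses the cache entirely.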
def urlopen(url, max_age):
    logging.debug("urlopen(%r, %r)", url, max_age)

    if not os.path.isdir(CACHE_DIR):
        os.makedirs(CACHE_DIR)

    if max_age <= 0:
        return _urlopen(url)

    filename = hashlib.md5(url.encode("utf-8")).hexdigest()
    filename = os.path.join(CACHE_DIR, filename)
    if os.path.exists(filename):
        file_age = int(time.time()) - os.path.getmtime(filename)
        if file_age < max_age:
            logging.debug("loading from cache: %s", filename)
            return open(filename, "rb")

    logging.debug("downloading: %s -> %s", url, filename)
    src = _urlopen(url)
    dst = open(filename, "wb")
    try:
        shutil.copyfileobj(src, dst)
    except Exception:
        try:
            os.unlink(filename)
        except OSError:
            pass
        raise
    src.close()
    dst.close()

    return open(filename, "rb")

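# Convenience wrappers around urlopen(): fetch through the cache and
# parse the response as UTF-8 text, HTML or XML.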
def grab_text(url, max_age):
    f = urlopen(url, max_age)
    text = f.read().decode("utf-8")
    f.close()
    return text

def grab_html(url, max_age):
    f = urlopen(url, max_age)
    doc = lxml.html.parse(f, lxml.html.HTMLParser(encoding="utf-8", recover=True))
    f.close()
    return doc

def grab_xml(url, max_age):
    f = urlopen(url, max_age)
    doc = lxml.etree.parse(f, lxml.etree.XMLParser(encoding="utf-8", recover=True))
    f.close()
    return doc

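# Fetch JSON, optionally unwrapping common non-JSON framing:
# skip_assignment drops everything up to the first "=" (for responses
# like "var data = {...}"), and skip_function strips a JSONP wrapper by
# slicing between the first "(" and the last ")".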
def grab_json(url, max_age, skip_assignment=False, skip_function=False):
    f = urlopen(url, max_age)
    text = f.read().decode("utf-8")

    if skip_assignment:
        pos = text.find("=")
        text = text[pos+1:]

    elif skip_function:
        pos = text.find("(")
        rpos = text.rfind(")")
        text = text[pos+1:rpos]

    doc = json.loads(text)
    f.close()
    return doc

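# Run an external command, returning True only on exit status 0.
# A first Ctrl+C terminates the child gracefully; a second one sends
# SIGKILL.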
def exec_subprocess(cmd):
    logging.debug("Executing: %s", cmd)
    try:
        p = subprocess.Popen(cmd)
        ret = p.wait()
        if ret != 0:
            logging.error("%s exited with error code: %s", cmd[0], ret)
            return False
        else:
            return True
    except OSError as e:
        logging.error("Failed to run: %s -- %s", cmd[0], e)
    except KeyboardInterrupt:
        logging.info("Cancelled: %s", cmd)
        try:
            p.terminate()
            p.wait()
        except KeyboardInterrupt:
            p.send_signal(signal.SIGKILL)
            p.wait()
    return False


def check_command_exists(cmd):
    try:
        subprocess.check_output(cmd)
        return True
    except Exception:
        return False

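# Build the remux command line, preferring avconv and falling back to
# ffmpeg. Both copy the audio/video streams unchanged and convert AAC
# ADTS headers to the MP4 form (-bsf:a aac_adtstoasc).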
def generate_remux_cmd(infile, outfile):
    if check_command_exists(["avconv", "--help"]):
        return [
            "avconv",
            "-i", infile,
            "-bsf:a", "aac_adtstoasc",
            "-acodec", "copy",
            "-vcodec", "copy",
            outfile,
        ]

    if check_command_exists(["ffmpeg", "--help"]):
        return [
            "ffmpeg",
            "-i", infile,
            "-bsf:a", "aac_adtstoasc",
            "-acodec", "copy",
            "-vcodec", "copy",
            outfile,
        ]

    raise Exception("You must install ffmpeg or libav-tools")

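# Remux infile into an MP4 container. As a sanity check the output must
# be within 10% of the input's size before the original is deleted.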
def remux(infile, outfile):
    logging.info("Converting %s to mp4", infile)
    cmd = generate_remux_cmd(infile, outfile)
    if not exec_subprocess(cmd):
        # failed, error has already been logged
        return False
    try:
        flv_size = os.stat(infile).st_size
        mp4_size = os.stat(outfile).st_size
        if abs(flv_size - mp4_size) < 0.1 * flv_size:
            os.unlink(infile)
            return True
        else:
            logging.error("The size of %s is suspicious, did the remux fail?", outfile)
            return False
    except Exception as e:
        logging.error("Conversion failed! %s", e)
        return False

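# Some servers hand out FLV data under a .mp4 name. Detect the FLV
# magic bytes and rename before remuxing; .flv and .ts files are
# remuxed into real MP4 containers.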
def convert_to_mp4(filename):
    with open(filename, "rb") as f:
        fourcc = f.read(4)
    basename, ext = os.path.splitext(filename)

    if ext == ".mp4" and fourcc == b"FLV\x01":
        os.rename(filename, basename + ".flv")
        ext = ".flv"
        filename = basename + ext

    if ext in (".flv", ".ts"):
        filename_mp4 = basename + ".mp4"
        return remux(filename, filename_mp4)

    return ext == ".mp4"


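# The download_* helpers shell out to the livestreamer tool, selecting
# the "best" quality stream. The hds:// and hlsvariant:// prefixes pick
# livestreamer's Adobe HDS and HLS master-playlist handlers.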
def download_hds(filename, video_url, pvswf=None):
    filename = sanify_filename(filename)
    logging.info("Downloading: %s", filename)

    video_url = "hds://" + video_url
    if pvswf:
        param = "%s pvswf=%s" % (video_url, pvswf)
    else:
        param = video_url

    cmd = [
        "livestreamer",
        "-o", filename,
        param,
        "best",
    ]
    if exec_subprocess(cmd):
        return convert_to_mp4(filename)
    else:
        return False

def download_hls(filename, video_url):
    filename = sanify_filename(filename)
    video_url = "hlsvariant://" + video_url
    logging.info("Downloading: %s", filename)

    cmd = [
        "livestreamer",
        "-o", filename,
        video_url,
        "best",
    ]
    if exec_subprocess(cmd):
        return convert_to_mp4(filename)
    else:
        return False

def download_http(filename, video_url):
    filename = sanify_filename(filename)
    logging.info("Downloading: %s", filename)

    cmd = [
        "curl",
        "--fail", "--retry", "3",
        "-o", filename,
        video_url,
    ]
    if exec_subprocess(cmd):
        return convert_to_mp4(filename)
    else:
        return False

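# Natural sort: runs of digits compare by value (zero-padded to five
# digits) and the words "a"/"the" are ignored, e.g.
#   natural_sort(["Episode 10", "Episode 2", "The Episode 1"])
#   -> ["The Episode 1", "Episode 2", "Episode 10"]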
def natural_sort(l, key=None):
    ignore_list = ["a", "the"]
    def key_func(k):
        if key is not None:
            k = key(k)
        k = k.lower()
        newk = []
        for c in re.split("([0-9]+)", k):
            c = c.strip()
            if c.isdigit():
                newk.append(c.zfill(5))
            else:
                for subc in c.split():
                    if subc not in ignore_list:
                        newk.append(subc)
        return newk

    return sorted(l, key=key_func)

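# Merge params into the URL's query string. A value of None removes the
# key; the resulting parameters are emitted in sorted order, e.g.
#   append_to_qs("http://example.com/page?a=1", {"b": "2", "a": None})
#   -> "http://example.com/page?b=2"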
def append_to_qs(url, params):
    r = list(urllib.parse.urlsplit(url))
    qs = urllib.parse.parse_qs(r[3])
    for k, v in params.items():
        if v is not None:
            qs[k] = v
        elif k in qs:
            del qs[k]
    r[3] = urllib.parse.urlencode(sorted(qs.items()), True)
    url = urllib.parse.urlunsplit(r)
    return url