[webdl] / common.py (commit 9a873f698b709cf56a1fbbb70779f179ebeac391)
import hashlib
import io
import json
import logging
import lxml.etree
import lxml.html
import os
import re
import requests
import requests_cache
import shutil
import signal
import subprocess
import time
import urllib.parse


try:
    import autosocks
    autosocks.try_autosocks()
except ImportError:
    pass


logging.basicConfig(
    format = "%(levelname)s %(message)s",
    level = logging.INFO if os.environ.get("DEBUG", None) is None else logging.DEBUG,
)

CACHE_FILE = os.path.join(
    os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")),
    "webdl",
    "requests_cache"
)
if not os.path.isdir(os.path.dirname(CACHE_FILE)):
    os.makedirs(os.path.dirname(CACHE_FILE))

requests_cache.install_cache(CACHE_FILE, backend='sqlite', expire_after=3600)


class Node(object):
    def __init__(self, title, parent=None):
        self.title = title
        if parent:
            parent.children.append(self)
        self.parent = parent
        self.children = []
        self.can_download = False

    def get_children(self):
        if not self.children:
            self.fill_children()
            self.children = natural_sort(self.children, key=lambda node: node.title)
        return self.children

    def fill_children(self):
        pass

    def download(self):
        raise NotImplementedError()


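# Illustrative sketch (not part of the original module): provider modules such
# as iview, sbs and ten are expected to subclass Node, filling in children
# lazily via fill_children() and implementing download() on leaf nodes. The
# class and attributes below are hypothetical.
#
#   class ExampleEpisodeNode(Node):
#       def __init__(self, title, parent, video_url):
#           Node.__init__(self, title, parent)
#           self.video_url = video_url
#           self.can_download = True
#
#       def download(self):
#           return download_hls(self.title + ".ts", self.video_url)
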
def load_root_node():
    root_node = Node("Root")

    import iview
    iview.fill_nodes(root_node)

    import sbs
    sbs.fill_nodes(root_node)

    import ten
    ten.fill_nodes(root_node)

    return root_node

valid_chars = frozenset("-_.()!@#%^ abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
def sanify_filename(filename):
    filename = "".join(c for c in filename if c in valid_chars)
    assert len(filename) > 0
    return filename

def ensure_scheme(url):
    parts = urllib.parse.urlparse(url)
    if parts.scheme:
        return url
    parts = list(parts)
    parts[0] = "http"
    return urllib.parse.urlunparse(parts)

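# For example, ensure_scheme("//www.example.com/show") returns
# "http://www.example.com/show"; a URL that already carries a scheme is
# returned unchanged.
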
http_session = requests.Session()
http_session.headers["User-Agent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:21.0) Gecko/20100101 Firefox/21.0"

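# Note: requests_cache.install_cache() above patches requests globally, so
# requests made through this session should hit the shared on-disk cache too.
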
def grab_text(url):
    logging.debug("grab_text(%r)", url)
    request = http_session.prepare_request(requests.Request("GET", url))
    response = http_session.send(request)
    return response.text

def grab_html(url):
    logging.debug("grab_html(%r)", url)
    request = http_session.prepare_request(requests.Request("GET", url))
    response = http_session.send(request, stream=True)
    doc = lxml.html.parse(io.BytesIO(response.content), lxml.html.HTMLParser(encoding="utf-8", recover=True))
    response.close()
    return doc

def grab_xml(url):
    logging.debug("grab_xml(%r)", url)
    request = http_session.prepare_request(requests.Request("GET", url))
    response = http_session.send(request, stream=True)
    doc = lxml.etree.parse(io.BytesIO(response.content), lxml.etree.XMLParser(encoding="utf-8", recover=True))
    response.close()
    return doc

def grab_json(url):
    logging.debug("grab_json(%r)", url)
    request = http_session.prepare_request(requests.Request("GET", url))
    response = http_session.send(request)
    return response.json()

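# Typical usage (illustrative URL): doc = grab_html("https://www.example.com/tv")
# returns an lxml tree, so callers can run doc.xpath(...) over the page.
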
def exec_subprocess(cmd):
    logging.debug("Executing: %s", cmd)
    try:
        p = subprocess.Popen(cmd)
        ret = p.wait()
        if ret != 0:
            logging.error("%s exited with error code: %s", cmd[0], ret)
            return False
        else:
            return True
    except OSError as e:
        logging.error("Failed to run: %s -- %s", cmd[0], e)
    except KeyboardInterrupt:
        logging.info("Cancelled: %s", cmd)
        try:
            p.terminate()
            p.wait()
        except KeyboardInterrupt:
            # a second Ctrl+C while waiting forces an immediate kill
            p.send_signal(signal.SIGKILL)
            p.wait()
    return False


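# exec_subprocess() returns True only on a zero exit status; for example,
# exec_subprocess(["ffmpeg", "-version"]) is True when ffmpeg is installed.
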
def check_command_exists(cmd):
    try:
        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
        return True
    except Exception:
        return False

def find_ffmpeg():
    if check_command_exists(["ffmpeg", "--help"]):
        return "ffmpeg"

    if check_command_exists(["avconv", "--help"]):
        logging.warning("Detected libav-tools! ffmpeg is recommended")
        return "avconv"

    raise Exception("You must install ffmpeg or libav-tools")

def find_ffprobe():
    if check_command_exists(["ffprobe", "--help"]):
        return "ffprobe"

    if check_command_exists(["avprobe", "--help"]):
        logging.warning("Detected libav-tools! ffmpeg is recommended")
        return "avprobe"

    raise Exception("You must install ffmpeg or libav-tools")

def find_streamlink():
    if check_command_exists(["streamlink", "--help"]):
        return "streamlink"

    if check_command_exists(["livestreamer", "--help"]):
        logging.warning("Detected livestreamer! streamlink is recommended")
        return "livestreamer"

    raise Exception("You must install streamlink or livestreamer")

def get_duration(filename):
    ffprobe = find_ffprobe()

    cmd = [
        ffprobe,
        filename,
        "-show_format_entry", "duration",
        "-v", "quiet",
    ]
    output = subprocess.check_output(cmd).decode("utf-8")
    for line in output.split("\n"):
        m = re.search(r"([0-9]+)", line)
        if not m:
            continue
        duration = m.group(1)
        if duration.isdigit():
            return int(duration)

    logging.debug("Falling back to full decode to find duration: %s", filename)

    ffmpeg = find_ffmpeg()
    cmd = [
        ffmpeg,
        "-i", filename,
        "-vn",
        "-f", "null", "-",
    ]
    output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode("utf-8")
    duration = None
    for line in re.split(r"[\r\n]", output):
        m = re.search(r"time=([0-9:]*)\.", line)
        if not m:
            continue
        hours, minutes, seconds = m.group(1).split(":")
        # ffmpeg prints the duration as it reads the file, we want the last one
        duration = int(hours) * 3600 + int(minutes) * 60 + int(seconds)

    if duration:
        return duration
    else:
        raise Exception("Unable to determine video duration of " + filename)

def check_video_durations(flv_filename, mp4_filename):
    flv_duration = get_duration(flv_filename)
    mp4_duration = get_duration(mp4_filename)

    if abs(flv_duration - mp4_duration) > 1:
        logging.error(
            "The duration of %s is suspicious, did the remux fail? Expected %s == %s",
            mp4_filename, flv_duration, mp4_duration
        )
        return False

    return True

def remux(infile, outfile):
    logging.info("Converting %s to mp4", infile)

    ffmpeg = find_ffmpeg()
    cmd = [
        ffmpeg,
        "-i", infile,
        "-bsf:a", "aac_adtstoasc",
        "-acodec", "copy",
        "-vcodec", "copy",
        "-y",
        outfile,
    ]
    if not exec_subprocess(cmd):
        return False

    if not check_video_durations(infile, outfile):
        return False

    os.unlink(infile)
    return True

def convert_to_mp4(filename):
    with open(filename, "rb") as f:
        fourcc = f.read(4)
    basename, ext = os.path.splitext(filename)

    # some streams are saved with an .mp4 name but are actually FLV data;
    # detect this by the magic bytes and rename before remuxing
    if ext == ".mp4" and fourcc == b"FLV\x01":
        os.rename(filename, basename + ".flv")
        ext = ".flv"
        filename = basename + ext

    if ext in (".flv", ".ts"):
        filename_mp4 = basename + ".mp4"
        return remux(filename, filename_mp4)

    return ext == ".mp4"


def download_hds(filename, video_url, pvswf=None):
    streamlink = find_streamlink()

    filename = sanify_filename(filename)
    logging.info("Downloading: %s", filename)

    video_url = "hds://" + video_url
    if pvswf:
        param = "%s pvswf=%s" % (video_url, pvswf)
    else:
        param = video_url

    cmd = [
        streamlink,
        "-f",
        "-o", filename,
        param,
        "best",
    ]
    if exec_subprocess(cmd):
        return convert_to_mp4(filename)
    else:
        return False

def download_hls(filename, video_url):
    streamlink = find_streamlink()

    filename = sanify_filename(filename)
    video_url = "hlsvariant://" + video_url
    logging.info("Downloading: %s", filename)

    cmd = [
        streamlink,
        "-f",
        "-o", filename,
        video_url,
        "best",
    ]
    if exec_subprocess(cmd):
        return convert_to_mp4(filename)
    else:
        return False

def download_mpd(filename, video_url):
    streamlink = find_streamlink()

    filename = sanify_filename(filename)
    video_url = "dash://" + video_url
    logging.info("Downloading: %s", filename)

    cmd = [
        streamlink,
        "-f",
        "-o", filename,
        video_url,
        "best",
    ]
    if exec_subprocess(cmd):
        return convert_to_mp4(filename)
    else:
        return False

def download_http(filename, video_url):
    filename = sanify_filename(filename)
    logging.info("Downloading: %s", filename)

    cmd = [
        "curl",
        "--fail", "--retry", "3",
        "-o", filename,
        video_url,
    ]
    if exec_subprocess(cmd):
        return convert_to_mp4(filename)
    else:
        return False

def natural_sort(l, key=None):
    ignore_list = ["a", "the"]
    def key_func(k):
        if key is not None:
            k = key(k)
        k = k.lower()
        newk = []
        for c in re.split("([0-9]+)", k):
            c = c.strip()
            if c.isdigit():
                newk.append(c.zfill(5))
            else:
                for subc in c.split():
                    if subc not in ignore_list:
                        newk.append(subc)
        return newk

    return sorted(l, key=key_func)

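# For example, natural_sort(["The Office 10", "the office 2", "A Country Practice"])
# builds keys ["office", "00010"], ["office", "00002"] and ["country", "practice"],
# returning ["A Country Practice", "the office 2", "The Office 10"].
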
def append_to_qs(url, params):
    r = list(urllib.parse.urlsplit(url))
    qs = urllib.parse.parse_qs(r[3])
    for k, v in params.items():
        if v is not None:
            qs[k] = v
        elif k in qs:
            del qs[k]
    r[3] = urllib.parse.urlencode(sorted(qs.items()), doseq=True)
    url = urllib.parse.urlunsplit(r)
    return url

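# For example, append_to_qs("http://example.com/api?a=1", {"b": "2", "a": None})
# returns "http://example.com/api?b=2" -- b is added and a is removed.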