# webdl — common.py (from the webdl repository at code.delx.au)
# Commit note: "Fixed silly bug in removing ffmpeg detection"
1 import hashlib
2 import io
3 import json
4 import logging
5 import lxml.etree
6 import lxml.html
7 import os
8 import re
9 import requests
10 import requests_cache
11 import shutil
12 import signal
13 import subprocess
14 import sys
15 import time
16 import urllib.parse
17
# Present ourselves as a current desktop Firefox.
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0"

# Route traffic through a SOCKS proxy when the optional autosocks helper
# is installed; carry on directly when it is not.
try:
    import autosocks
    autosocks.try_autosocks()
except ImportError:
    pass


# Log to stdout; DEBUG level when the DEBUG environment variable is set.
logging.basicConfig(
    format = "%(levelname)s %(message)s",
    level = logging.DEBUG if os.environ.get("DEBUG", None) is not None else logging.INFO,
    stream = sys.stdout,
)

# HTTP responses are cached on disk under the XDG cache directory.
CACHE_FILE = os.path.join(
    os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")),
    "webdl",
    "requests_cache"
)
_cache_dir = os.path.dirname(CACHE_FILE)
if not os.path.isdir(_cache_dir):
    os.makedirs(_cache_dir)

# Transparently cache all requests-based HTTP traffic for one hour.
requests_cache.install_cache(CACHE_FILE, backend='sqlite', expire_after=3600)
42
43
class Node(object):
    """A node in the tree of browseable/downloadable shows.

    Interior nodes populate self.children lazily via fill_children();
    leaf nodes set can_download=True and implement download().
    """

    def __init__(self, title, parent=None):
        self.title = title
        if parent:
            parent.children.append(self)
        self.parent = parent
        self.children = []
        self.can_download = False

    def get_children(self):
        # Fill lazily on first access, then keep a naturally-sorted order.
        if not self.children:
            self.fill_children()
            self.children = natural_sort(self.children, key=lambda node: node.title)
        return self.children

    def fill_children(self):
        # Subclasses override this to append child Nodes under self.
        pass

    def download(self):
        # BUG FIX: `raise NotImplemented` raised a TypeError at runtime
        # (NotImplemented is not an exception class); the correct
        # exception for an abstract method is NotImplementedError.
        raise NotImplementedError
64
65
def load_root_node():
    """Build and return the root of the browse tree.

    Each downloader module attaches its own subtree under the root node.
    Imports happen here rather than at module scope so that loading this
    module does not pull in every downloader immediately.
    """
    root_node = Node("Root")

    import iview
    import sbs
    import ten

    for downloader in (iview, sbs, ten):
        downloader.fill_nodes(root_node)

    return root_node
79
# Whitelist of characters allowed in generated filenames.
valid_chars = frozenset("-_.()!@#%^ abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")

def sanify_filename(filename):
    """Strip disallowed characters from *filename*; something must remain."""
    filename = "".join(filter(valid_chars.__contains__, filename))
    assert len(filename) > 0
    return filename
85
def ensure_scheme(url):
    """Return *url* unchanged if it has a scheme, otherwise default to http."""
    parts = urllib.parse.urlparse(url)
    if parts.scheme:
        return url
    fixed = list(parts)
    fixed[0] = "http"
    return urllib.parse.urlunparse(fixed)
93
# One shared session so the User-Agent header (plus cookies and the
# installed response cache) applies to every request from this module.
http_session = requests.Session()
http_session.headers.update({"User-Agent": USER_AGENT})
96
def grab_text(url):
    """Fetch *url* through the shared session and return the body as text."""
    logging.debug("grab_text(%r)", url)
    prepared = http_session.prepare_request(requests.Request("GET", url))
    return http_session.send(prepared).text
102
def grab_html(url):
    """Fetch *url* and parse it leniently as HTML, returning an lxml document."""
    logging.debug("grab_html(%r)", url)
    prepared = http_session.prepare_request(requests.Request("GET", url))
    response = http_session.send(prepared)
    parser = lxml.html.HTMLParser(encoding="utf-8", recover=True)
    doc = lxml.html.parse(io.BytesIO(response.content), parser)
    response.close()
    return doc
110
def grab_xml(url):
    """Fetch *url* and parse it leniently as XML, returning an lxml tree."""
    logging.debug("grab_xml(%r)", url)
    prepared = http_session.prepare_request(requests.Request("GET", url))
    response = http_session.send(prepared)
    parser = lxml.etree.XMLParser(encoding="utf-8", recover=True)
    doc = lxml.etree.parse(io.BytesIO(response.content), parser)
    response.close()
    return doc
118
def grab_json(url):
    """Fetch *url* and decode the response body as JSON."""
    logging.debug("grab_json(%r)", url)
    prepared = http_session.prepare_request(requests.Request("GET", url))
    return http_session.send(prepared).json()
124
def exec_subprocess(cmd):
    """Run *cmd* (an argv list) to completion.

    Returns True on exit status 0, and False on a non-zero exit, a
    failure to launch, or user cancellation (Ctrl-C, which also
    terminates the child process).
    """
    logging.debug("Executing: %s", cmd)
    p = None
    try:
        p = subprocess.Popen(cmd)
        ret = p.wait()
        if ret != 0:
            logging.error("%s exited with error code: %s", cmd[0], ret)
            return False
        return True
    except OSError as e:
        logging.error("Failed to run: %s -- %s", cmd[0], e)
        # BUG FIX: previously fell through and implicitly returned None.
        return False
    except KeyboardInterrupt:
        logging.info("Cancelled: %s", cmd)
        # BUG FIX: guard against Popen itself having been interrupted,
        # in which case there is no child process to clean up.
        if p is not None:
            try:
                p.terminate()
                p.wait()
            except KeyboardInterrupt:
                # Second Ctrl-C while waiting: stop being polite.
                p.send_signal(signal.SIGKILL)
                p.wait()
        return False
146
147
def check_command_exists(cmd):
    """Return True when *cmd* (an argv list) can be executed successfully."""
    try:
        subprocess.check_output(cmd, stderr=subprocess.STDOUT)
    except Exception:
        return False
    return True
154
def get_duration(filename):
    """Return the duration of a media file in whole seconds.

    Tries a cheap ffprobe metadata query first; if that yields no usable
    number (e.g. "duration=N/A"), falls back to a full ffmpeg decode and
    takes the last progress timestamp printed. Raises if neither method
    produces a duration.
    """
    cmd = [
        "ffprobe",
        filename,
        "-show_entries", "format=duration",
        "-v", "quiet",
    ]
    output = subprocess.check_output(cmd).decode("utf-8")
    for line in output.split("\n"):
        match = re.search(r"([0-9]+)", line)
        if not match:
            continue
        duration = match.group(1)
        if duration.isdigit():
            return int(duration)

    # BUG FIX: the filename was inside the format string literal
    # ("...: %s % filename"), so it was never interpolated into the message.
    logging.debug("Falling back to full decode to find duration: %s", filename)

    cmd = [
        "ffmpeg",
        "-i", filename,
        "-vn",
        "-f", "null", "-",
    ]
    output = subprocess.check_output(cmd, stderr=subprocess.STDOUT).decode("utf-8")
    duration = None
    for line in re.split(r"[\r\n]", output):
        match = re.search(r"time=([0-9:]*)\.", line)
        if not match:
            continue
        # Renamed h/m/s: the old `m` clobbered the regex match object.
        hours, minutes, seconds = match.group(1).split(":")
        # ffmpeg prints the duration as it reads the file, we want the last one
        duration = int(hours) * 3600 + int(minutes) * 60 + int(seconds)

    if duration:
        return duration
    else:
        raise Exception("Unable to determine video duration of " + filename)
194
def check_video_durations(flv_filename, mp4_filename):
    """Compare the durations of the original and remuxed files.

    Returns True when they are within one second of each other;
    otherwise logs an error (a large mismatch suggests the remux
    failed) and returns False.
    """
    original = get_duration(flv_filename)
    remuxed = get_duration(mp4_filename)

    if abs(original - remuxed) <= 1:
        return True

    logging.error(
        "The duration of %s is suspicious, did the remux fail? Expected %s == %s",
        mp4_filename, original, remuxed
    )
    return False
207
def remux(infile, outfile):
    """Losslessly repackage *infile* into an mp4 container at *outfile*.

    Copies both streams without re-encoding. On success the source file
    is deleted; returns False when ffmpeg fails or the output duration
    does not match the input.
    """
    logging.info("Converting %s to mp4", infile)

    cmd = [
        "ffmpeg",
        "-i", infile,
        "-bsf:a", "aac_adtstoasc",
        "-acodec", "copy",
        "-vcodec", "copy",
        "-y",
        outfile,
    ]
    ok = exec_subprocess(cmd) and check_video_durations(infile, outfile)
    if not ok:
        return False

    os.unlink(infile)
    return True
228
def convert_to_mp4(filename):
    """Ensure *filename* holds a real mp4, remuxing FLV/TS containers.

    Some services deliver FLV data under an .mp4 name; that is detected
    via the leading four "magic" bytes and the file renamed before the
    remux. Returns True when an mp4 exists at the end.
    """
    with open(filename, "rb") as f:
        magic = f.read(4)
    basename, ext = os.path.splitext(filename)

    # An ".mp4" that starts with the FLV signature is really an FLV file.
    if ext == ".mp4" and magic == b"FLV\x01":
        ext = ".flv"
        os.rename(filename, basename + ext)
        filename = basename + ext

    if ext not in (".flv", ".ts"):
        return ext == ".mp4"

    return remux(filename, basename + ".mp4")
244
245
def download_hds(filename, video_url, pvswf=None):
    """Download an HDS stream via streamlink, then convert it to mp4.

    *pvswf*, when given, is the player-verification SWF URL some
    services require.
    """
    filename = sanify_filename(filename)
    logging.info("Downloading: %s", filename)

    stream = "hds://" + video_url
    param = "%s pvswf=%s" % (stream, pvswf) if pvswf else stream

    cmd = [
        "streamlink",
        "--force",
        "--output", filename,
        param,
        "best",
    ]
    return convert_to_mp4(filename) if exec_subprocess(cmd) else False
267
def download_hls(filename, video_url):
    """Download an HLS stream (best variant) via streamlink, then convert to mp4."""
    filename = sanify_filename(filename)
    logging.info("Downloading: %s", filename)

    cmd = [
        "streamlink",
        "--http-header", "User-Agent=" + USER_AGENT,
        "--force",
        "--output", filename,
        "hlsvariant://" + video_url,
        "best",
    ]
    return convert_to_mp4(filename) if exec_subprocess(cmd) else False
285
def download_mpd(filename, video_url):
    """Download a DASH (MPD) stream via streamlink, then convert to mp4."""
    filename = sanify_filename(filename)
    logging.info("Downloading: %s", filename)

    cmd = [
        "streamlink",
        "--force",
        "--output", filename,
        "dash://" + video_url,
        "best",
    ]
    return convert_to_mp4(filename) if exec_subprocess(cmd) else False
302
def download_http(filename, video_url):
    """Download a plain HTTP resource with curl, then convert it to mp4."""
    filename = sanify_filename(filename)
    logging.info("Downloading: %s", filename)

    # --fail makes curl exit non-zero on HTTP errors; retry transient failures.
    cmd = [
        "curl",
        "--fail", "--retry", "3",
        "-o", filename,
        video_url,
    ]
    return convert_to_mp4(filename) if exec_subprocess(cmd) else False
317
def natural_sort(l, key=None):
    """Sort *l* case-insensitively with digit runs compared numerically.

    Leading articles ("a", "the") are ignored and digit runs are
    zero-padded to five places, so "Episode 2" orders before
    "Episode 10". *key* optionally extracts the string to sort by.
    """
    ignore_list = ["a", "the"]

    def sort_key(item):
        text = key(item) if key is not None else item
        text = text.lower()
        parts = []
        for chunk in re.split("([0-9]+)", text):
            chunk = chunk.strip()
            if chunk.isdigit():
                parts.append(chunk.zfill(5))
            else:
                parts.extend(word for word in chunk.split() if word not in ignore_list)
        return parts

    return sorted(l, key=sort_key)
336
def append_to_qs(url, params):
    """Return *url* with *params* merged into its query string.

    A value of None removes the key entirely; surviving keys are
    emitted in sorted order.
    """
    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
    qs = urllib.parse.parse_qs(query)
    for key, value in params.items():
        if value is None:
            qs.pop(key, None)
        else:
            qs[key] = value
    query = urllib.parse.urlencode(sorted(qs.items()), True)
    return urllib.parse.urlunsplit((scheme, netloc, path, query, fragment))
348