webdl / common.py

# vim:ts=4:sts=4:sw=4:noet

from lxml import etree, html
import cookielib
import json
try:
	import hashlib
except ImportError:
	import md5 as hashlib
import os
import re
import shutil
import signal
import subprocess
import sys
import tempfile
import time
import urllib
import urllib2
import urlparse


import autosocks
autosocks.try_autosocks()

CACHE_DIR = os.path.expanduser("~/.cache/webdl")
USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1"

class Node(object):
	def __init__(self, title, parent=None):
		self.title = title
		if parent:
			parent.children.append(self)
		self.parent = parent
		self.children = []
		self.can_download = False

	def get_children(self):
		if not self.children:
			self.fill_children()
		return self.children

	def fill_children(self):
		pass

	def download(self):
		raise NotImplementedError()

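# Illustrative sketch only (not part of webdl): a site module is expected to
# attach Node subclasses under the root, overriding fill_children() for folder
# nodes and download() for leaf episodes with can_download set. The names and
# URL handling below are made up.
#
# class EpisodeNode(Node):
#	def __init__(self, title, parent, video_url):
#		Node.__init__(self, title, parent)
#		self.can_download = True
#		self.video_url = video_url
#
#	def download(self):
#		return download_urllib(self.title + ".flv", self.video_url)
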
def load_root_node():
	root_node = Node("Root")

	import iview
	iview.fill_nodes(root_node)

	import sbs
	sbs.fill_nodes(root_node)

	import plus7
	plus7.fill_nodes(root_node)

	return root_node

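# A front-end is assumed to traverse the tree via get_children() and call
# download() on nodes with can_download set; a minimal sketch:
#
# def walk(node, depth=0):
#	print "  " * depth + node.title
#	if not node.can_download:
#		for child in node.get_children():
#			walk(child, depth + 1)
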
valid_chars = frozenset("-_.()!@#%^ abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789")
def sanify_filename(filename):
	filename = filename.encode("ascii", "ignore")
	filename = "".join(c for c in filename if c in valid_chars)
	return filename

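# e.g. sanify_filename(u"News: 7:30 Report (2012)") -> "News 730 Report (2012)"
# (characters outside valid_chars, such as ":" and "/", are simply dropped)
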
cookiejar = cookielib.CookieJar()
urlopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar))
def _urlopen(url, referrer=None):
	req = urllib2.Request(url)
	req.add_header("User-Agent", USER_AGENT)
	if referrer:
		req.add_header("Referer", referrer)
	return urlopener.open(req)

def urlopen(url, max_age):
	### print url
	if not os.path.isdir(CACHE_DIR):
		os.makedirs(CACHE_DIR)

	if max_age <= 0:
		return _urlopen(url)

	filename = hashlib.md5(url).hexdigest()
	filename = os.path.join(CACHE_DIR, filename)
	if os.path.exists(filename):
		file_age = int(time.time()) - os.path.getmtime(filename)
		if file_age < max_age:
			return open(filename)

	src = _urlopen(url)
	dst = open(filename, "w")
	try:
		shutil.copyfileobj(src, dst)
	except Exception, e:
		try:
			os.unlink(filename)
		except OSError:
			pass
		raise e
	src.close()
	dst.close()

	return open(filename)

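# Cache behaviour, for reference: urlopen(url, 3600) re-uses the copy under
# ~/.cache/webdl if it is less than an hour old, otherwise it re-fetches and
# rewrites the cache file; urlopen(url, 0) always hits the network.
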
def grab_html(url, max_age):
	f = urlopen(url, max_age)
	doc = html.parse(f, html.HTMLParser(encoding="utf-8", recover=True))
	f.close()
	return doc

def grab_xml(url, max_age):
	f = urlopen(url, max_age)
	doc = etree.parse(f, etree.XMLParser(encoding="utf-8", recover=True))
	f.close()
	return doc

def grab_json(url, max_age, skip_assignment=False):
	f = urlopen(url, max_age)
	if skip_assignment:
		# some feeds wrap the JSON in an assignment such as "var x = {...}";
		# strip everything up to and including the first "="
		text = f.read()
		pos = text.find("=")
		doc = json.loads(text[pos+1:])
	else:
		doc = json.load(f)
	f.close()
	return doc

def exec_subprocess(cmd):
	try:
		p = subprocess.Popen(cmd)
		ret = p.wait()
		if ret != 0:
			print >>sys.stderr, cmd[0], "exited with error code:", ret
			return False
		else:
			return True
	except OSError, e:
		print >>sys.stderr, "Failed to run", cmd[0], e
	except KeyboardInterrupt:
		print "Cancelled", cmd
		try:
			p.terminate()
			p.wait()
		except KeyboardInterrupt:
			p.send_signal(signal.SIGKILL)
			p.wait()
	return False

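# e.g. exec_subprocess(["ffmpeg", "-version"]) returns True on a zero exit
# status and False otherwise; Ctrl-C terminates the child before returning False.
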
def convert_flv_mp4(orig_filename):
	basename = os.path.splitext(orig_filename)[0]
	flv_filename = basename + ".flv"
	mp4_filename = basename + ".mp4"
	if orig_filename != flv_filename:
		os.rename(orig_filename, flv_filename)
	print "Converting %s to mp4" % flv_filename
	cmd = [
		"ffmpeg",
		"-i", flv_filename,
		"-acodec", "copy",
		"-vcodec", "copy",
		mp4_filename,
	]
	if not exec_subprocess(cmd):
		return
	try:
		flv_size = os.stat(flv_filename).st_size
		mp4_size = os.stat(mp4_filename).st_size
		if abs(flv_size - mp4_size) < 0.05 * flv_size:
			os.unlink(flv_filename)
		else:
			print >>sys.stderr, "The size of", mp4_filename, "is suspicious, did ffmpeg fail?"
	except Exception, e:
		print "Conversion failed", e

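# The remux copies the audio/video streams unchanged, so the .mp4 should end up
# within about 5% of the .flv size; the original is only deleted when that holds.
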
def convert_filename(filename):
	if os.path.splitext(filename.lower())[1] in (".mp4", ".flv"):
		f = open(filename)
		fourcc = f.read(4)
		f.close()
		if fourcc == "FLV\x01":
			convert_flv_mp4(filename)

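# "FLV\x01" is the magic number at the start of an FLV container; files saved
# with an .mp4/.flv name that are really FLV get remuxed to mp4 here.
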
def download_rtmp(filename, vbase, vpath, hash_url=None):
	filename = sanify_filename(filename)
	print "Downloading: %s" % filename
	if vpath.endswith(".flv"):
		vpath = vpath[:-4]
	cmd = [
		"rtmpdump",
		"-o", filename,
		"-r", vbase,
		"-y", vpath,
	]
	if hash_url is not None:
		cmd += ["--swfVfy", hash_url]
	if exec_subprocess(cmd):
		convert_filename(filename)
		return True
	else:
		return False

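# Illustrative call (the URLs are made up): download_rtmp("Show.flv",
# "rtmp://example.net/ondemand", "mp4:show/episode1", hash_url=player_swf_url)
# runs rtmpdump with -r/-y split between the server and the stream path, and
# --swfVfy pointing at the player SWF when the site requires verification.
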
def download_urllib(filename, url, referrer=None):
	filename = sanify_filename(filename)
	print "Downloading: %s" % filename
	try:
		src = _urlopen(url, referrer)
		dst = open(filename, "w")
		while True:
			buf = src.read(1024*1024)
			if not buf:
				break
			dst.write(buf)
			sys.stdout.write(".")
			sys.stdout.flush()
		print
		convert_filename(filename)
		return True
	except KeyboardInterrupt:
		print "\nCancelled", url
	finally:
		try:
			src.close()
		except:
			pass
		try:
			dst.close()
		except:
			pass
	return False

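# Progress is printed as one dot per 1 MiB chunk; the bare "print" after the
# loop emits the trailing newline so the next message starts on its own line.
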
def natural_sort(l, key=None):
	ignore_list = ["a", "the"]
	def key_func(k):
		if key is not None:
			k = key(k)
		k = k.lower()
		newk = []
		for c in re.split("([0-9]+)", k):
			c = c.strip()
			if c.isdigit():
				newk.append(int(c))
			else:
				for subc in c.split():
					if subc not in ignore_list:
						newk.append(subc)
		return newk

	return sorted(l, key=key_func)

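# e.g. natural_sort(["Episode 10", "The Episode 2", "episode 1"])
#   -> ["episode 1", "The Episode 2", "Episode 10"]
# digit runs compare numerically, case is ignored, and "a"/"the" are skipped.
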
def append_to_qs(url, params):
	r = list(urlparse.urlsplit(url))
	qs = urlparse.parse_qs(r[3])
	for k, v in params.iteritems():
		if v is not None:
			qs[k] = v
		elif qs.has_key(k):
			del qs[k]
	r[3] = urllib.urlencode(qs, True)
	url = urlparse.urlunsplit(r)
	return url
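
# e.g. append_to_qs("http://example.com/api?a=1", {"b": "2", "a": None})
#   -> "http://example.com/api?b=2" (a None value deletes that parameter)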