-cookiejar = http.cookiejar.CookieJar()
-urlopener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookiejar))
-def _urlopen(url, referrer=None):
- url = ensure_scheme(url)
- req = urllib.request.Request(url)
- req.add_header("User-Agent", USER_AGENT)
- if referrer:
- req.add_header("Referer", referrer)
- return urlopener.open(req)
-
-def urlopen(url, max_age):
- logging.debug("urlopen(%r, %r)", url, max_age)
-
- if not os.path.isdir(CACHE_DIR):
- os.makedirs(CACHE_DIR)
-
- if max_age <= 0:
- return _urlopen(url)
-
- filename = hashlib.md5(url.encode("utf-8")).hexdigest()
- filename = os.path.join(CACHE_DIR, filename)
- if os.path.exists(filename):
- file_age = int(time.time()) - os.path.getmtime(filename)
- if file_age < max_age:
- logging.debug("loading from cache: %s", filename)
- return open(filename, "rb")
-
- logging.debug("downloading: %s -> %s", url, filename)
- src = _urlopen(url)
- dst = open(filename, "wb")
- try:
- shutil.copyfileobj(src, dst)
- except Exception as e:
- try:
- os.unlink(filename)
- except OSError:
- pass
- raise e
- src.close()
- dst.close()
-
- return open(filename, "rb")
-
-def grab_text(url, max_age):
- f = urlopen(url, max_age)
- text = f.read().decode("utf-8")
- f.close()
- return text
-
-def grab_html(url, max_age):
- f = urlopen(url, max_age)
- doc = lxml.html.parse(f, lxml.html.HTMLParser(encoding="utf-8", recover=True))
- f.close()
+# Module-wide requests session: the fetch helpers below send all of their
+# traffic through it, so every request carries our User-Agent header.
+http_session = requests.Session()
+http_session.headers.update({"User-Agent": USER_AGENT})
+
+def grab_text(url, timeout=30):
+    """Fetch *url* with the shared HTTP session and return the body as text.
+
+    Args:
+        url: Address to request with GET.
+        timeout: Seconds to wait for the server before giving up.
+
+    Returns:
+        The response body as decoded by requests (``response.text``).
+
+    Raises:
+        requests.HTTPError: If the server answers with a 4xx/5xx status.
+        requests.RequestException: On connection failures or timeouts.
+    """
+    logging.debug("grab_text(%r)", url)
+    # Session.get() merges environment settings (proxies, CA bundle) that
+    # the prepare_request()/send() pair skips.
+    response = http_session.get(url, timeout=timeout)
+    # Fail loudly instead of handing an error page to the caller as content.
+    response.raise_for_status()
+    return response.text
+
+def grab_html(url):
+ logging.debug("grab_html(%r)", url)
+ request = http_session.prepare_request(requests.Request("GET", url))
+ response = http_session.send(request, stream=True)
+ doc = lxml.html.parse(io.BytesIO(response.content), lxml.html.HTMLParser(encoding="utf-8", recover=True))
+ response.close()