From 3ab45e19d5866c230dcc67e9971a37039b92657a Mon Sep 17 00:00:00 2001
From: JackDandy
Date: Tue, 28 Apr 2015 18:32:10 +0100
Subject: [PATCH] Update cachecontrol library 0.9.3 to 0.11.2.

---
 CHANGES.md                              |   1 +
 lib/cachecontrol/__init__.py            |   4 +
 lib/cachecontrol/adapter.py             |  59 +++++++++--
 lib/cachecontrol/cache.py               |   9 +-
 lib/cachecontrol/caches/file_cache.py   |  32 +++++-
 lib/cachecontrol/caches/redis_cache.py  |   3 +
 lib/cachecontrol/compat.py              |  19 ++--
 lib/cachecontrol/controller.py          | 136 ++++++++++++++++---------
 lib/cachecontrol/filewrapper.py         |  63 ++++++++++++
 lib/cachecontrol/heuristics.py          | 134 ++++++++++++++++++++++++
 lib/cachecontrol/patch_requests.py      |  56 ----------
 lib/cachecontrol/serialize.py           | 127 +++++++++++++++++++----
 lib/cachecontrol/session.py             |  34 -------
 lib/cachecontrol/wrapper.py             |  11 +-
 14 files changed, 496 insertions(+), 192 deletions(-)
 create mode 100644 lib/cachecontrol/filewrapper.py
 create mode 100644 lib/cachecontrol/heuristics.py
 delete mode 100644 lib/cachecontrol/patch_requests.py
 delete mode 100644 lib/cachecontrol/session.py

diff --git a/CHANGES.md b/CHANGES.md
index 716957b6..ce9f1c5b 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -9,6 +9,7 @@
 * Update change to suppress HTTPS verification InsecureRequestWarning to updated package as listed in hacks.txt
 * Remove listed hacks.txt record for check that SSLv3 is available because issue was addressed by vendor
 * Update chardet packages 2.2.1 to 2.3.0 (ff40135)
+* Update cachecontrol library 0.9.3 to 0.11.2
 * Add ToTV provider
 * Fix Backlog scheduler initialization and change backlog frequency from minutes to days
 * Change to consolidate and tidy some provider code
diff --git a/lib/cachecontrol/__init__.py b/lib/cachecontrol/__init__.py
index c18e70c0..fae051a0 100644
--- a/lib/cachecontrol/__init__.py
+++ b/lib/cachecontrol/__init__.py
@@ -2,6 +2,10 @@
 Make it easy to import from cachecontrol without long namespaces.
 """
+__author__ = 'Eric Larson'
+__email__ = 'eric@ionrock.org'
+__version__ = '0.11.2'
+
 from .wrapper import CacheControl
 from .adapter import CacheControlAdapter
 from .controller import CacheController
diff --git a/lib/cachecontrol/adapter.py b/lib/cachecontrol/adapter.py
index 03313f5e..54f1b512 100644
--- a/lib/cachecontrol/adapter.py
+++ b/lib/cachecontrol/adapter.py
@@ -1,16 +1,24 @@
-from lib.requests.adapters import HTTPAdapter
+import functools
+
+from requests.adapters import HTTPAdapter
 
 from .controller import CacheController
 from .cache import DictCache
+from .filewrapper import CallbackFileWrapper
 
 
 class CacheControlAdapter(HTTPAdapter):
     invalidating_methods = set(['PUT', 'DELETE'])
 
-    def __init__(self, cache=None, cache_etags=True, controller_class=None,
-                 serializer=None, *args, **kw):
+    def __init__(self, cache=None,
+                 cache_etags=True,
+                 controller_class=None,
+                 serializer=None,
+                 heuristic=None,
+                 *args, **kw):
         super(CacheControlAdapter, self).__init__(*args, **kw)
         self.cache = cache or DictCache()
+        self.heuristic = heuristic
 
         controller_factory = controller_class or CacheController
         self.controller = controller_factory(
@@ -27,10 +35,13 @@ class CacheControlAdapter(HTTPAdapter):
         if request.method == 'GET':
             cached_response = self.controller.cached_request(request)
             if cached_response:
-                return self.build_response(request, cached_response, from_cache=True)
+                return self.build_response(request, cached_response,
+                                           from_cache=True)
 
             # check for etags and add headers if appropriate
-            request.headers.update(self.controller.conditional_headers(request))
+            request.headers.update(
+                self.controller.conditional_headers(request)
+            )
 
         resp = super(CacheControlAdapter, self).send(request, **kw)
 
@@ -44,6 +55,8 @@ class CacheControlAdapter(HTTPAdapter):
         cached response
         """
         if not from_cache and request.method == 'GET':
+
+            # apply any expiration heuristics
             if response.status == 304:
                 # We must have sent an ETag request. This could mean
                 # that we've been expired already or that we simply
@@ -56,14 +69,34 @@ class CacheControlAdapter(HTTPAdapter):
                 if cached_response is not response:
                     from_cache = True
 
+                # We are done with the server response, read a
+                # possible response body (compliant servers will
+                # not return one, but we cannot be 100% sure) and
+                # release the connection back to the pool.
+                response.read(decode_content=False)
+                response.release_conn()
+
                 response = cached_response
+
+            # We always cache the 301 responses
+            elif response.status == 301:
+                self.controller.cache_response(request, response)
             else:
-                # try to cache the response
-                try:
-                    self.controller.cache_response(request, response)
-                except Exception as e:
-                    # Failed to cache the results
-                    pass
+                # Check for any heuristics that might update headers
+                # before trying to cache.
+                if self.heuristic:
+                    response = self.heuristic.apply(response)
+
+                # Wrap the response file with a wrapper that will cache the
+                # response when the stream has been consumed.
+                response._fp = CallbackFileWrapper(
+                    response._fp,
+                    functools.partial(
+                        self.controller.cache_response,
+                        request,
+                        response,
+                    )
+                )
 
         resp = super(CacheControlAdapter, self).build_response(
             request, response
@@ -78,3 +111,7 @@ class CacheControlAdapter(HTTPAdapter):
         resp.from_cache = from_cache
 
         return resp
+
+    def close(self):
+        self.cache.close()
+        super(CacheControlAdapter, self).close()
diff --git a/lib/cachecontrol/cache.py b/lib/cachecontrol/cache.py
index b8a0098c..7389a73f 100644
--- a/lib/cachecontrol/cache.py
+++ b/lib/cachecontrol/cache.py
@@ -1,9 +1,10 @@
 """
-The cache object API for implementing caches. The default is just a
-dictionary, which in turns means it is not threadsafe for writing.
+The cache object API for implementing caches. The default is a thread
+safe in-memory dictionary.
 """
 from threading import Lock
 
+
 class BaseCache(object):
 
     def get(self, key):
@@ -15,6 +16,10 @@ class BaseCache(object):
     def delete(self, key):
         raise NotImplemented()
 
+    def close(self):
+        pass
+
+
 class DictCache(BaseCache):
 
     def __init__(self, init_dict=None):
diff --git a/lib/cachecontrol/caches/file_cache.py b/lib/cachecontrol/caches/file_cache.py
index 711687ca..fd12c8a9 100644
--- a/lib/cachecontrol/caches/file_cache.py
+++ b/lib/cachecontrol/caches/file_cache.py
@@ -3,6 +3,9 @@ import os
 
 from lockfile import FileLock
 
+from ..cache import BaseCache
+from ..controller import CacheController
+
 
 def _secure_open_write(filename, fmode):
     # We only want to write to this file, so open it in write only mode
@@ -44,22 +47,24 @@ def _secure_open_write(filename, fmode):
         raise
 
 
-class FileCache(object):
+class FileCache(BaseCache):
     def __init__(self, directory, forever=False, filemode=0o0600,
                  dirmode=0o0700):
         self.directory = directory
         self.forever = forever
         self.filemode = filemode
-
-        if not os.path.isdir(self.directory):
-            os.makedirs(self.directory, dirmode)
+        self.dirmode = dirmode
 
     @staticmethod
     def encode(x):
         return hashlib.sha224(x.encode()).hexdigest()
 
     def _fn(self, name):
-        return os.path.join(self.directory, self.encode(name))
+        # NOTE: This method should not change as some may depend on it.
+        #       See: https://github.com/ionrock/cachecontrol/issues/63
+        hashed = self.encode(name)
+        parts = list(hashed[:5]) + [hashed]
+        return os.path.join(self.directory, *parts)
 
     def get(self, key):
         name = self._fn(key)
@@ -71,7 +76,15 @@ def set(self, key, value):
         name = self._fn(key)
+
+        # Make sure the directory exists
+        try:
+            os.makedirs(os.path.dirname(name), self.dirmode)
+        except (IOError, OSError):
+            pass
+
         with FileLock(name) as lock:
+            # Write our actual file
             with _secure_open_write(lock.path, self.filemode) as fh:
                 fh.write(value)
 
@@ -79,3 +92,12 @@ def delete(self, key):
         name = self._fn(key)
         if not self.forever:
             os.remove(name)
+
+
+def url_to_file_path(url, filecache):
+    """Return the file cache path based on the URL.
+
+    This does not ensure the file exists!
+    """
+    key = CacheController.cache_url(url)
+
+    return filecache._fn(key)
diff --git a/lib/cachecontrol/caches/redis_cache.py b/lib/cachecontrol/caches/redis_cache.py
index 72b8ca31..9f5d55fd 100644
--- a/lib/cachecontrol/caches/redis_cache.py
+++ b/lib/cachecontrol/caches/redis_cache.py
@@ -36,3 +36,6 @@ class RedisCache(object):
         caution!"""
         for key in self.conn.keys():
             self.conn.delete(key)
+
+    def close(self):
+        self.conn.disconnect()
diff --git a/lib/cachecontrol/compat.py b/lib/cachecontrol/compat.py
index aa117d02..489eb868 100644
--- a/lib/cachecontrol/compat.py
+++ b/lib/cachecontrol/compat.py
@@ -4,23 +4,20 @@ except ImportError:
     from urlparse import urljoin
 
 
-try:
-    import email.utils
-    parsedate_tz = email.utils.parsedate_tz
-except ImportError:
-    import email.Utils
-    parsedate_tz = email.Utils.parsedate_tz
-
-
 try:
     import cPickle as pickle
 except ImportError:
     import pickle
 
 
-# Handle the case where the requests has been patched to not have urllib3
-# bundled as part of it's source.
+# Handle the case where the requests module has been patched to not have
+# urllib3 bundled as part of its source.
 try:
-    from lib.requests.packages.urllib3.response import HTTPResponse
+    from requests.packages.urllib3.response import HTTPResponse
 except ImportError:
     from urllib3.response import HTTPResponse
+
+try:
+    from requests.packages.urllib3.util import is_fp_closed
+except ImportError:
+    from urllib3.util import is_fp_closed
diff --git a/lib/cachecontrol/controller.py b/lib/cachecontrol/controller.py
index 3d208be0..f0380747 100644
--- a/lib/cachecontrol/controller.py
+++ b/lib/cachecontrol/controller.py
@@ -4,14 +4,14 @@ The httplib2 algorithms ported for use with requests.
 import re
 import calendar
 import time
-import datetime
+from email.utils import parsedate_tz
 
-from lib.requests.structures import CaseInsensitiveDict
+from requests.structures import CaseInsensitiveDict
 
 from .cache import DictCache
-from .compat import parsedate_tz
 from .serialize import Serializer
 
+
 URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
 
 
@@ -21,7 +21,7 @@ def parse_uri(uri):
     (scheme, authority, path, query, fragment) = parse_uri(uri)
     """
     groups = URI.match(uri).groups()
-    return groups[1], groups[3], groups[4], groups[6], groups[8]
+    return (groups[1], groups[3], groups[4], groups[6], groups[8])
 
 
 class CacheController(object):
@@ -32,26 +32,29 @@
         self.cache_etags = cache_etags
         self.serializer = serializer or Serializer()
 
-    def _urlnorm(self, uri):
+    @classmethod
+    def _urlnorm(cls, uri):
         """Normalize the URL to create a safe key for the cache"""
         (scheme, authority, path, query, fragment) = parse_uri(uri)
         if not scheme or not authority:
             raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
-        authority = authority.lower()
+
+        scheme = scheme.lower()
+        authority = authority.lower()
+
         if not path:
             path = "/"
 
         # Could do syntax based normalization of the URI before
         # computing the digest. See Section 6.2.2 of Std 66.
         request_uri = query and "?".join([path, query]) or path
-        scheme = scheme.lower()
         defrag_uri = scheme + "://" + authority + request_uri
 
         return defrag_uri
 
-    def cache_url(self, uri):
-        return self._urlnorm(uri)
+    @classmethod
+    def cache_url(cls, uri):
+        return cls._urlnorm(uri)
 
     def parse_cache_control(self, headers):
         """
@@ -68,13 +71,20 @@ class CacheController(object):
             parts = headers[cc_header].split(',')
             parts_with_args = [
                 tuple([x.strip().lower() for x in part.split("=", 1)])
-                for part in parts if -1 != part.find("=")]
-            parts_wo_args = [(name.strip().lower(), 1)
-                             for name in parts if -1 == name.find("=")]
+                for part in parts if -1 != part.find("=")
+            ]
+            parts_wo_args = [
+                (name.strip().lower(), 1)
+                for name in parts if -1 == name.find("=")
+            ]
             retval = dict(parts_with_args + parts_wo_args)
             return retval
 
     def cached_request(self, request):
+        """
+        Return a cached response if it exists in the cache, otherwise
+        return False.
+        """
         cache_url = self.cache_url(request.url)
         cc = self.parse_cache_control(request.headers)
 
@@ -95,7 +105,24 @@ class CacheController(object):
         if not resp:
             return False
 
+        # If we have a cached 301, return it immediately. We don't
+        # need to test our response for other headers b/c it is
+        # intrinsically "cacheable" as it is Permanent.
+        # See:
+        #   https://tools.ietf.org/html/rfc7231#section-6.4.2
+        #
+        # Client can try to refresh the value by repeating the request
+        # with cache busting headers as usual (ie no-cache).
+        if resp.status == 301:
+            return resp
+
         headers = CaseInsensitiveDict(resp.headers)
+        if not headers or 'date' not in headers:
+            # Without a date or etag, the cached response can never be
+            # used and should be deleted.
+            if 'etag' not in headers:
+                self.cache.delete(cache_url)
+            return False
 
         now = time.time()
         date = calendar.timegm(
@@ -104,15 +131,19 @@ class CacheController(object):
         current_age = max(0, now - date)
 
         # TODO: There is an assumption that the result will be a
-        # urllib3 response object. This may not be best since we
-        # could probably avoid instantiating or constructing the
-        # response until we know we need it.
+        #       urllib3 response object. This may not be best since we
+        #       could probably avoid instantiating or constructing the
+        #       response until we know we need it.
         resp_cc = self.parse_cache_control(headers)
 
         # determine freshness
         freshness_lifetime = 0
+
+        # Check the max-age pragma in the cache control header
         if 'max-age' in resp_cc and resp_cc['max-age'].isdigit():
             freshness_lifetime = int(resp_cc['max-age'])
+
+        # If there isn't a max-age, check for an expires header
         elif 'expires' in headers:
             expires = parsedate_tz(headers['expires'])
             if expires is not None:
@@ -163,32 +194,24 @@ class CacheController(object):
 
         return new_headers
 
-    def cache_response(self, request, response):
+    def cache_response(self, request, response, body=None):
         """
         Algorithm for caching requests.
 
         This assumes a requests Response object.
         """
         # From httplib2: Don't cache 206's since we aren't going to
-        # handle byte range requests
-        if response.status not in [200, 203]:
+        #      handle byte range requests
+        if response.status not in [200, 203, 300, 301]:
             return
 
-        # Cache Session Params
-        cache_auto = getattr(request, 'cache_auto', False)
-        cache_urls = getattr(request, 'cache_urls', [])
-        cache_max_age = getattr(request, 'cache_max_age', None)
-
         response_headers = CaseInsensitiveDict(response.headers)
 
-        # Check if we are wanting to cache responses from specific urls only
-        cache_url = self.cache_url(request.url)
-        if len(cache_urls) > 0 and not any(s in cache_url for s in cache_urls):
-            return
-
         cc_req = self.parse_cache_control(request.headers)
         cc = self.parse_cache_control(response_headers)
 
+        cache_url = self.cache_url(request.url)
+
         # Delete it from the cache if we happen to have it stored there
         no_store = cc.get('no-store') or cc_req.get('no-store')
         if no_store and self.cache.get(cache_url):
@@ -196,21 +219,18 @@ class CacheController(object):
 
         # If we've been given an etag, then keep the response
         if self.cache_etags and 'etag' in response_headers:
-            self.cache.set(cache_url, self.serializer.dumps(request, response))
+            self.cache.set(
+                cache_url,
+                self.serializer.dumps(request, response, body=body),
+            )
 
-        # If we want to cache sites not setup with cache headers then add the proper headers and keep the response
-        elif cache_auto and not cc and response_headers:
-            headers = {'Cache-Control': 'public,max-age=%d' % int(cache_max_age or 900)}
-            response.headers.update(headers)
-
-            if 'expires' not in response_headers:
-                if getattr(response_headers, 'expires', None) is None:
-                    expires = datetime.datetime.utcnow() + datetime.timedelta(days=1)
-                    expires = expires.strftime("%a, %d %b %Y %H:%M:%S GMT")
-                    headers = {'Expires': expires}
-                    response.headers.update(headers)
-
-            self.cache.set(cache_url, self.serializer.dumps(request, response))
+        # Add to the cache any 301s. We do this before looking at the
+        # Date headers.
+        elif response.status == 301:
+            self.cache.set(
+                cache_url,
+                self.serializer.dumps(request, response)
+            )
 
         # Add to the cache if the response headers demand it. If there
         # is no date header then we can't do anything about expiring
@@ -219,10 +239,10 @@ class CacheController(object):
         # cache when there is a max-age > 0
         if cc and cc.get('max-age'):
             if int(cc['max-age']) > 0:
-                if isinstance(cache_max_age, int):
-                    cc['max-age'] = int(cache_max_age)
-                    response.headers['cache-control'] = ''.join(['%s=%s' % (key, value) for (key, value) in cc.items()])
-                self.cache.set(cache_url, self.serializer.dumps(request, response))
+                self.cache.set(
+                    cache_url,
+                    self.serializer.dumps(request, response, body=body),
+                )
 
         # If the request can expire, it means we should cache it
         # in the meantime.
@@ -230,7 +250,7 @@ class CacheController(object):
         if response_headers['expires']:
             self.cache.set(
                 cache_url,
-                self.serializer.dumps(request, response),
+                self.serializer.dumps(request, response, body=body),
             )
 
     def update_cached_response(self, request, response):
@@ -242,14 +262,30 @@ class CacheController(object):
         """
         cache_url = self.cache_url(request.url)
 
-        cached_response = self.serializer.loads(request, self.cache.get(cache_url))
+        cached_response = self.serializer.loads(
+            request,
+            self.cache.get(cache_url)
+        )
 
         if not cached_response:
             # we didn't have a cached response
             return response
 
-        # did so lets update our headers
-        cached_response.headers.update(response.headers)
+        # Let's update our headers with the headers from the new response:
+        # http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-26#section-4.1
+        #
+        # The server isn't supposed to send headers that would make
+        # the cached body invalid. But... just in case, we'll be sure
+        # to strip out ones we know might be problematic due to
+        # typical assumptions.
+        excluded_headers = [
+            "content-length",
+        ]
+
+        cached_response.headers.update(
+            dict((k, v) for k, v in response.headers.items()
+                 if k.lower() not in excluded_headers)
+        )
 
         # we want a 200 b/c we have content via the cache
         cached_response.status = 200
diff --git a/lib/cachecontrol/filewrapper.py b/lib/cachecontrol/filewrapper.py
new file mode 100644
index 00000000..4b91bce0
--- /dev/null
+++ b/lib/cachecontrol/filewrapper.py
@@ -0,0 +1,63 @@
+from io import BytesIO
+
+
+class CallbackFileWrapper(object):
+    """
+    Small wrapper around a fp object which will tee everything read into a
+    buffer, and when that file is closed it will execute a callback with the
+    contents of that buffer.
+
+    All attributes are proxied to the underlying file object.
+
+    This class uses members with a double underscore (__) leading prefix so as
+    not to accidentally shadow an attribute.
+    """
+
+    def __init__(self, fp, callback):
+        self.__buf = BytesIO()
+        self.__fp = fp
+        self.__callback = callback
+
+    def __getattr__(self, name):
+        # The vagaries of garbage collection mean that self.__fp is
+        # not always set. Using __getattribute__ and the mangled
+        # name[0] lets us look up the attribute value and raise an
+        # AttributeError when it doesn't exist. This stops things from
+        # infinitely recursing calls to getattr in the case where
+        # self.__fp hasn't been set.
+        #
+        # [0] https://docs.python.org/2/reference/expressions.html#atom-identifiers
+        fp = self.__getattribute__('_CallbackFileWrapper__fp')
+        return getattr(fp, name)
+
+    def __is_fp_closed(self):
+        try:
+            return self.__fp.fp is None
+        except AttributeError:
+            pass
+
+        try:
+            return self.__fp.closed
+        except AttributeError:
+            pass
+
+        # We just don't cache it then.
+        # TODO: Add some logging here...
+        return False
+
+    def read(self, amt=None):
+        data = self.__fp.read(amt)
+        self.__buf.write(data)
+
+        if self.__is_fp_closed():
+            if self.__callback:
+                self.__callback(self.__buf.getvalue())
+
+            # We assign this to None here, because otherwise we can get into
+            # really tricky problems where the CPython interpreter deadlocks
+            # because the callback is holding a reference to something which
+            # has a __del__ method. Setting this to None breaks the cycle
+            # and allows the garbage collector to do its thing normally.
+            self.__callback = None
+
+        return data
diff --git a/lib/cachecontrol/heuristics.py b/lib/cachecontrol/heuristics.py
new file mode 100644
index 00000000..01b63141
--- /dev/null
+++ b/lib/cachecontrol/heuristics.py
@@ -0,0 +1,134 @@
+import calendar
+import time
+
+from email.utils import formatdate, parsedate, parsedate_tz
+
+from datetime import datetime, timedelta
+
+TIME_FMT = "%a, %d %b %Y %H:%M:%S GMT"
+
+
+def expire_after(delta, date=None):
+    date = date or datetime.now()
+    return date + delta
+
+
+def datetime_to_header(dt):
+    return formatdate(calendar.timegm(dt.timetuple()))
+
+
+class BaseHeuristic(object):
+
+    def warning(self, response):
+        """
+        Return a valid 1xx warning header value describing the cache
+        adjustments.
+
+        The response is provided to allow warnings like 113
+        http://tools.ietf.org/html/rfc7234#section-5.5.4 where we need
+        to explicitly say the response is over 24 hours old.
+        """
+        return '110 - "Response is Stale"'
+
+    def update_headers(self, response):
+        """Update the response headers with any new headers.
+
+        NOTE: This SHOULD always include some Warning header to
+              signify that the response was cached by the client, not
+              by way of the provided headers.
+        """
+        return {}
+
+    def apply(self, response):
+        warning_header_value = self.warning(response)
+        response.headers.update(self.update_headers(response))
+        if warning_header_value is not None:
+            response.headers.update({'Warning': warning_header_value})
+        return response
+
+
+class OneDayCache(BaseHeuristic):
+    """
+    Cache the response by providing an Expires header 1 day in the
+    future.
+    """
+    def update_headers(self, response):
+        headers = {}
+
+        if 'expires' not in response.headers:
+            date = parsedate(response.headers['date'])
+            expires = expire_after(timedelta(days=1),
+                                   date=datetime(*date[:6]))
+            headers['expires'] = datetime_to_header(expires)
+            headers['cache-control'] = 'public'
+        return headers
+
+
+class ExpiresAfter(BaseHeuristic):
+    """
+    Cache **all** requests for a defined time period.
+    """
+
+    def __init__(self, **kw):
+        self.delta = timedelta(**kw)
+
+    def update_headers(self, response):
+        expires = expire_after(self.delta)
+        return {
+            'expires': datetime_to_header(expires),
+            'cache-control': 'public',
+        }
+
+    def warning(self, response):
+        tmpl = '110 - Automatically cached for %s. Response might be stale'
+        return tmpl % self.delta
+
+
+class LastModified(BaseHeuristic):
+    """
+    If there is no Expires header already, fall back on Last-Modified
+    using the heuristic from
+    http://tools.ietf.org/html/rfc7234#section-4.2.2
+    to calculate a reasonable value.
+
+    Firefox also does something like this per
+    https://developer.mozilla.org/en-US/docs/Web/HTTP/Caching_FAQ
+    http://lxr.mozilla.org/mozilla-release/source/netwerk/protocol/http/nsHttpResponseHead.cpp#397
+    Unlike Mozilla, we limit this to 24 hours.
+ """ + cacheable_by_default_statuses = set([ + 200, 203, 204, 206, 300, 301, 404, 405, 410, 414, 501 + ]) + + def update_headers(self, resp): + headers = resp.headers + + if 'expires' in headers: + return {} + + if 'cache-control' in headers and headers['cache-control'] != 'public': + return {} + + if resp.status not in self.cacheable_by_default_statuses: + return {} + + if 'date' not in headers or 'last-modified' not in headers: + return {} + + date = calendar.timegm(parsedate_tz(headers['date'])) + last_modified = parsedate(headers['last-modified']) + if date is None or last_modified is None: + return {} + + now = time.time() + current_age = max(0, now - date) + delta = date - calendar.timegm(last_modified) + freshness_lifetime = max(0, min(delta / 10, 24 * 3600)) + if freshness_lifetime <= current_age: + return {} + + expires = date + freshness_lifetime + return {'expires': time.strftime(TIME_FMT, time.gmtime(expires))} + + def warning(self, resp): + return None diff --git a/lib/cachecontrol/patch_requests.py b/lib/cachecontrol/patch_requests.py deleted file mode 100644 index 3399223a..00000000 --- a/lib/cachecontrol/patch_requests.py +++ /dev/null @@ -1,56 +0,0 @@ -import requests - -from requests import models -from requests.packages.urllib3.response import HTTPResponse - -__attrs__ = [ - '_content', - 'status_code', - 'headers', - 'url', - 'history', - 'encoding', - 'reason', - 'cookies', - 'elapsed', -] - - -def response_getstate(self): - # consume everything - if not self._content_consumed: - self.content - - state = dict( - (attr, getattr(self, attr, None)) - for attr in __attrs__ - ) - - # deal with our raw content b/c we need it for our cookie jar - state['raw_original_response'] = self.raw._original_response - return state - - -def response_setstate(self, state): - for name, value in state.items(): - if name != 'raw_original_response': - setattr(self, name, value) - - setattr(self, 'raw', HTTPResponse()) - self.raw._original_response = state['raw_original_response'] - - -def make_responses_pickleable(): - try: - version_parts = [int(part) for part in requests.__version__.split('.')] - - # must be >= 2.2.x - if not version_parts[0] >= 2 or not version_parts[1] >= 2: - models.Response.__getstate__ = response_getstate - models.Response.__setstate__ = response_setstate - except: - raise - pass - - -make_responses_pickleable() \ No newline at end of file diff --git a/lib/cachecontrol/serialize.py b/lib/cachecontrol/serialize.py index 5316fa1c..6b17d80e 100644 --- a/lib/cachecontrol/serialize.py +++ b/lib/cachecontrol/serialize.py @@ -1,27 +1,59 @@ +import base64 import io +import json +import zlib from requests.structures import CaseInsensitiveDict from .compat import HTTPResponse, pickle +def _b64_encode_bytes(b): + return base64.b64encode(b).decode("ascii") + + +def _b64_encode_str(s): + return _b64_encode_bytes(s.encode("utf8")) + + +def _b64_decode_bytes(b): + return base64.b64decode(b.encode("ascii")) + + +def _b64_decode_str(s): + return _b64_decode_bytes(s).decode("utf8") + + class Serializer(object): + def dumps(self, request, response, body=None): response_headers = CaseInsensitiveDict(response.headers) if body is None: - # TODO: Figure out a way to handle this which doesn't break - # streaming body = response.read(decode_content=False) + + # NOTE: 99% sure this is dead code. I'm only leaving it + # here b/c I don't have a test yet to prove + # it. Basically, before using + # `cachecontrol.filewrapper.CallbackFileWrapper`, + # this made an effort to reset the file handle. 
The + # `CallbackFileWrapper` short circuits this code by + # setting the body as the content is consumed, the + # result being a `body` argument is *always* passed + # into cache_response, and in turn, + # `Serializer.dump`. response._fp = io.BytesIO(body) data = { "response": { - "body": body, - "headers": response.headers, + "body": _b64_encode_bytes(body), + "headers": dict( + (_b64_encode_str(k), _b64_encode_str(v)) + for k, v in response.headers.items() + ), "status": response.status, "version": response.version, - "reason": response.reason, + "reason": _b64_encode_str(response.reason), "strict": response.strict, "decode_content": response.decode_content, }, @@ -35,7 +67,20 @@ class Serializer(object): header = header.strip() data["vary"][header] = request.headers.get(header, None) - return b"cc=1," + pickle.dumps(data, pickle.HIGHEST_PROTOCOL) + # Encode our Vary headers to ensure they can be serialized as JSON + data["vary"] = dict( + (_b64_encode_str(k), _b64_encode_str(v) if v is not None else v) + for k, v in data["vary"].items() + ) + + return b",".join([ + b"cc=2", + zlib.compress( + json.dumps( + data, separators=(",", ":"), sort_keys=True, + ).encode("utf8"), + ), + ]) def loads(self, request, data): # Short circuit if we've been given an empty set of data @@ -66,6 +111,40 @@ class Serializer(object): # just treat it as a miss and return None return + def prepare_response(self, request, cached): + """Verify our vary headers match and construct a real urllib3 + HTTPResponse object. + """ + # Special case the '*' Vary value as it means we cannot actually + # determine if the cached response is suitable for this request. + if "*" in cached.get("vary", {}): + return + + # Ensure that the Vary headers for the cached response match our + # request + for header, value in cached.get("vary", {}).items(): + if request.headers.get(header, None) != value: + return + + body_raw = cached["response"].pop("body") + + try: + body = io.BytesIO(body_raw) + except TypeError: + # This can happen if cachecontrol serialized to v1 format (pickle) + # using Python 2. A Python 2 str(byte string) will be unpickled as + # a Python 3 str (unicode string), which will cause the above to + # fail with: + # + # TypeError: 'str' does not support the buffer interface + body = io.BytesIO(body_raw.encode('utf8')) + + return HTTPResponse( + body=body, + preload_content=False, + **cached["response"] + ) + def _loads_v0(self, request, data): # The original legacy cache data. This doesn't contain enough # information to construct everything we need, so we'll treat this as @@ -78,20 +157,28 @@ class Serializer(object): except ValueError: return - # Special case the '*' Vary value as it means we cannot actually - # determine if the cached response is suitable for this request. 
- if "*" in cached.get("vary", {}): + return self.prepare_response(request, cached) + + def _loads_v2(self, request, data): + try: + cached = json.loads(zlib.decompress(data).decode("utf8")) + except ValueError: return - # Ensure that the Vary headers for the cached response match our - # request - for header, value in cached.get("vary", {}).items(): - if request.headers.get(header, None) != value: - return - - body = io.BytesIO(cached["response"].pop("body")) - return HTTPResponse( - body=body, - preload_content=False, - **cached["response"] + # We need to decode the items that we've base64 encoded + cached["response"]["body"] = _b64_decode_bytes( + cached["response"]["body"] ) + cached["response"]["headers"] = dict( + (_b64_decode_str(k), _b64_decode_str(v)) + for k, v in cached["response"]["headers"].items() + ) + cached["response"]["reason"] = _b64_decode_str( + cached["response"]["reason"], + ) + cached["vary"] = dict( + (_b64_decode_str(k), _b64_decode_str(v) if v is not None else v) + for k, v in cached["vary"].items() + ) + + return self.prepare_response(request, cached) diff --git a/lib/cachecontrol/session.py b/lib/cachecontrol/session.py deleted file mode 100644 index 1758cd6b..00000000 --- a/lib/cachecontrol/session.py +++ /dev/null @@ -1,34 +0,0 @@ -from requests.sessions import Session - -class CacheControlSession(Session): - def __init__(self): - super(CacheControlSession, self).__init__() - - def get(self, *args, **kw): - # auto-cache response - self.cache_auto = False - if kw.get('cache_auto'): - self.cache_auto = kw.pop('cache_auto') - - # urls allowed to cache - self.cache_urls = [] - if kw.get('cache_urls'): - self.cache_urls = [str(args[0])] + kw.pop('cache_urls') - - # timeout for cached responses - self.cache_max_age = None - if kw.get('cache_max_age'): - self.cache_max_age = int(kw.pop('cache_max_age')) - - return super(CacheControlSession, self).get(*args, **kw) - - def prepare_request(self, *args, **kw): - # get response - req = super(CacheControlSession, self).prepare_request(*args, **kw) - - # attach params to request - req.cache_auto = self.cache_auto - req.cache_urls = self.cache_urls - req.cache_max_age = self.cache_max_age - - return req \ No newline at end of file diff --git a/lib/cachecontrol/wrapper.py b/lib/cachecontrol/wrapper.py index 0dc608a0..ea421aa7 100644 --- a/lib/cachecontrol/wrapper.py +++ b/lib/cachecontrol/wrapper.py @@ -1,14 +1,19 @@ from .adapter import CacheControlAdapter from .cache import DictCache -from .session import CacheControlSession -def CacheControl(sess=None, cache=None, cache_etags=True, serializer=None): - sess = sess or CacheControlSession() + +def CacheControl(sess, + cache=None, + cache_etags=True, + serializer=None, + heuristic=None): + cache = cache or DictCache() adapter = CacheControlAdapter( cache, cache_etags=cache_etags, serializer=serializer, + heuristic=heuristic, ) sess.mount('http://', adapter) sess.mount('https://', adapter)