Mirror of https://github.com/SickGear/SickGear.git (synced 2025-01-05 17:43:37 +00:00)

Update cachecontrol library 0.9.3 to 0.11.2.

parent f9568212da, commit 3ab45e19d5
14 changed files with 496 additions and 192 deletions

@@ -9,6 +9,7 @@

* Update change to suppress HTTPS verification InsecureRequestWarning to updated package as listed in hacks.txt
* Remove listed hacks.txt record for check that SSLv3 is available because issue was addressed by vendor
* Update chardet packages 2.2.1 to 2.3.0 (ff40135)
* Update cachecontrol library 0.9.3 to 0.11.2
* Add ToTV provider
* Fix Backlog scheduler initialization and change backlog frequency from minutes to days
* Change to consolidate and tidy some provider code

@ -2,6 +2,10 @@
|
|||
|
||||
Make it easy to import from cachecontrol without long namespaces.
|
||||
"""
|
||||
__author__ = 'Eric Larson'
|
||||
__email__ = 'eric@ionrock.org'
|
||||
__version__ = '0.11.2'
|
||||
|
||||
from .wrapper import CacheControl
|
||||
from .adapter import CacheControlAdapter
|
||||
from .controller import CacheController
|
||||
|
|
|
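
For reference, the new __init__.py flattens the namespace so the main entry points can be imported directly. A minimal usage sketch, assuming the upstream package name cachecontrol (the vendored copy in this repo would be imported as lib.cachecontrol) and an installed requests:

import requests

from cachecontrol import CacheControl

sess = CacheControl(requests.Session())
# repeat GETs to the same URL may now be answered from the in-memory cache
resp = sess.get('http://example.com/')
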
@ -1,16 +1,24 @@
|
|||
from lib.requests.adapters import HTTPAdapter
|
||||
import functools
|
||||
|
||||
from requests.adapters import HTTPAdapter
|
||||
|
||||
from .controller import CacheController
|
||||
from .cache import DictCache
|
||||
from .filewrapper import CallbackFileWrapper
|
||||
|
||||
|
||||
class CacheControlAdapter(HTTPAdapter):
|
||||
invalidating_methods = set(['PUT', 'DELETE'])
|
||||
|
||||
def __init__(self, cache=None, cache_etags=True, controller_class=None,
|
||||
serializer=None, *args, **kw):
|
||||
def __init__(self, cache=None,
|
||||
cache_etags=True,
|
||||
controller_class=None,
|
||||
serializer=None,
|
||||
heuristic=None,
|
||||
*args, **kw):
|
||||
super(CacheControlAdapter, self).__init__(*args, **kw)
|
||||
self.cache = cache or DictCache()
|
||||
self.heuristic = heuristic
|
||||
|
||||
controller_factory = controller_class or CacheController
|
||||
self.controller = controller_factory(
|
||||
|
@ -27,10 +35,13 @@ class CacheControlAdapter(HTTPAdapter):
|
|||
if request.method == 'GET':
|
||||
cached_response = self.controller.cached_request(request)
|
||||
if cached_response:
|
||||
return self.build_response(request, cached_response, from_cache=True)
|
||||
return self.build_response(request, cached_response,
|
||||
from_cache=True)
|
||||
|
||||
# check for etags and add headers if appropriate
|
||||
request.headers.update(self.controller.conditional_headers(request))
|
||||
request.headers.update(
|
||||
self.controller.conditional_headers(request)
|
||||
)
|
||||
|
||||
resp = super(CacheControlAdapter, self).send(request, **kw)
|
||||
|
||||
|
@ -44,6 +55,8 @@ class CacheControlAdapter(HTTPAdapter):
|
|||
cached response
|
||||
"""
|
||||
if not from_cache and request.method == 'GET':
|
||||
|
||||
# apply any expiration heuristics
|
||||
if response.status == 304:
|
||||
# We must have sent an ETag request. This could mean
|
||||
# that we've been expired already or that we simply
|
||||
|
@ -56,14 +69,34 @@ class CacheControlAdapter(HTTPAdapter):
|
|||
if cached_response is not response:
|
||||
from_cache = True
|
||||
|
||||
# We are done with the server response, read a
|
||||
# possible response body (compliant servers will
|
||||
# not return one, but we cannot be 100% sure) and
|
||||
# release the connection back to the pool.
|
||||
response.read(decode_content=False)
|
||||
response.release_conn()
|
||||
|
||||
response = cached_response
|
||||
|
||||
# We always cache the 301 responses
|
||||
elif response.status == 301:
|
||||
self.controller.cache_response(request, response)
|
||||
else:
|
||||
# try to cache the response
|
||||
try:
|
||||
self.controller.cache_response(request, response)
|
||||
except Exception as e:
|
||||
# Failed to cache the results
|
||||
pass
|
||||
# Check for any heuristics that might update headers
|
||||
# before trying to cache.
|
||||
if self.heuristic:
|
||||
response = self.heuristic.apply(response)
|
||||
|
||||
# Wrap the response file with a wrapper that will cache the
|
||||
# response when the stream has been consumed.
|
||||
response._fp = CallbackFileWrapper(
|
||||
response._fp,
|
||||
functools.partial(
|
||||
self.controller.cache_response,
|
||||
request,
|
||||
response,
|
||||
)
|
||||
)
|
||||
|
||||
resp = super(CacheControlAdapter, self).build_response(
|
||||
request, response
|
||||
|
@ -78,3 +111,7 @@ class CacheControlAdapter(HTTPAdapter):
|
|||
resp.from_cache = from_cache
|
||||
|
||||
return resp
|
||||
|
||||
def close(self):
|
||||
self.cache.close()
|
||||
super(CacheControlAdapter, self).close()
|
||||
|
|
|
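
The adapter now takes an optional heuristic and defers caching to a CallbackFileWrapper as the body is consumed. A rough sketch of mounting it by hand, again assuming the upstream cachecontrol package name (OneDayCache is one of the heuristics added below in heuristics.py):

import requests

from cachecontrol.adapter import CacheControlAdapter
from cachecontrol.cache import DictCache
from cachecontrol.heuristics import OneDayCache

sess = requests.Session()
adapter = CacheControlAdapter(cache=DictCache(), cache_etags=True,
                              heuristic=OneDayCache())
sess.mount('http://', adapter)
sess.mount('https://', adapter)
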
@ -1,9 +1,10 @@
|
|||
"""
|
||||
The cache object API for implementing caches. The default is just a
|
||||
dictionary, which in turns means it is not threadsafe for writing.
|
||||
The cache object API for implementing caches. The default is a thread
|
||||
safe in-memory dictionary.
|
||||
"""
|
||||
from threading import Lock
|
||||
|
||||
|
||||
class BaseCache(object):
|
||||
|
||||
def get(self, key):
|
||||
|
@ -15,6 +16,10 @@ class BaseCache(object):
|
|||
def delete(self, key):
|
||||
raise NotImplemented()
|
||||
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
|
||||
class DictCache(BaseCache):
|
||||
|
||||
def __init__(self, init_dict=None):
|
||||
|
|
|
@ -3,6 +3,9 @@ import os
|
|||
|
||||
from lockfile import FileLock
|
||||
|
||||
from ..cache import BaseCache
|
||||
from ..controller import CacheController
|
||||
|
||||
|
||||
def _secure_open_write(filename, fmode):
|
||||
# We only want to write to this file, so open it in write only mode
|
||||
|
@ -44,22 +47,24 @@ def _secure_open_write(filename, fmode):
|
|||
raise
|
||||
|
||||
|
||||
class FileCache(object):
|
||||
class FileCache(BaseCache):
|
||||
def __init__(self, directory, forever=False, filemode=0o0600,
|
||||
dirmode=0o0700):
|
||||
self.directory = directory
|
||||
self.forever = forever
|
||||
self.filemode = filemode
|
||||
|
||||
if not os.path.isdir(self.directory):
|
||||
os.makedirs(self.directory, dirmode)
|
||||
self.dirmode = dirmode
|
||||
|
||||
@staticmethod
|
||||
def encode(x):
|
||||
return hashlib.sha224(x.encode()).hexdigest()
|
||||
|
||||
def _fn(self, name):
|
||||
return os.path.join(self.directory, self.encode(name))
|
||||
# NOTE: This method should not change as some may depend on it.
|
||||
# See: https://github.com/ionrock/cachecontrol/issues/63
|
||||
hashed = self.encode(name)
|
||||
parts = list(hashed[:5]) + [hashed]
|
||||
return os.path.join(self.directory, *parts)
|
||||
|
||||
def get(self, key):
|
||||
name = self._fn(key)
|
||||
|
@ -71,7 +76,15 @@ class FileCache(object):
|
|||
|
||||
def set(self, key, value):
|
||||
name = self._fn(key)
|
||||
|
||||
# Make sure the directory exists
|
||||
try:
|
||||
os.makedirs(os.path.dirname(name), self.dirmode)
|
||||
except (IOError, OSError):
|
||||
pass
|
||||
|
||||
with FileLock(name) as lock:
|
||||
# Write our actual file
|
||||
with _secure_open_write(lock.path, self.filemode) as fh:
|
||||
fh.write(value)
|
||||
|
||||
|
@ -79,3 +92,12 @@ class FileCache(object):
|
|||
name = self._fn(key)
|
||||
if not self.forever:
|
||||
os.remove(name)
|
||||
|
||||
|
||||
def url_to_file_path(url, filecache):
|
||||
"""Return the file cache path based on the URL.
|
||||
|
||||
This does not ensure the file exists!
|
||||
"""
|
||||
key = CacheController.cache_url(url)
|
||||
return filecache._fn(key)
|
||||
|
|
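
The reworked FileCache._fn() shards entries into nested single-character directories taken from the SHA-224 digest, so no one directory accumulates every cached file (see the cachecontrol issue linked in the comment above). A standalone sketch of the resulting layout using only the standard library:

import hashlib
import os

def sharded_path(directory, name):
    # mirrors FileCache.encode() plus the new FileCache._fn()
    hashed = hashlib.sha224(name.encode()).hexdigest()
    parts = list(hashed[:5]) + [hashed]
    return os.path.join(directory, *parts)

# e.g. '.webcache/<h1>/<h2>/<h3>/<h4>/<h5>/<full sha224 hex digest>'
print(sharded_path('.webcache', 'http://example.com/'))
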
|
@ -36,3 +36,6 @@ class RedisCache(object):
|
|||
caution!"""
|
||||
for key in self.conn.keys():
|
||||
self.conn.delete(key)
|
||||
|
||||
def close(self):
|
||||
self.conn.disconnect()
|
||||
|
|
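
For completeness, a sketch of pairing the Redis backend (which now gains close()) with the wrapper. This assumes the redis package is installed and that RedisCache is constructed with an already-configured redis connection object:

import redis
import requests

from cachecontrol import CacheControl
from cachecontrol.caches.redis_cache import RedisCache

conn = redis.Redis(host='localhost', port=6379, db=0)
sess = CacheControl(requests.Session(), cache=RedisCache(conn))
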
|
@ -4,23 +4,20 @@ except ImportError:
|
|||
from urlparse import urljoin
|
||||
|
||||
|
||||
try:
|
||||
import email.utils
|
||||
parsedate_tz = email.utils.parsedate_tz
|
||||
except ImportError:
|
||||
import email.Utils
|
||||
parsedate_tz = email.Utils.parsedate_tz
|
||||
|
||||
|
||||
try:
|
||||
import cPickle as pickle
|
||||
except ImportError:
|
||||
import pickle
|
||||
|
||||
|
||||
# Handle the case where the requests has been patched to not have urllib3
|
||||
# bundled as part of it's source.
|
||||
# Handle the case where the requests module has been patched to not have
|
||||
# urllib3 bundled as part of its source.
|
||||
try:
|
||||
from lib.requests.packages.urllib3.response import HTTPResponse
|
||||
from requests.packages.urllib3.response import HTTPResponse
|
||||
except ImportError:
|
||||
from urllib3.response import HTTPResponse
|
||||
|
||||
try:
|
||||
from requests.packages.urllib3.util import is_fp_closed
|
||||
except ImportError:
|
||||
from urllib3.util import is_fp_closed
|
||||
|
|
|
@ -4,14 +4,14 @@ The httplib2 algorithms ported for use with requests.
|
|||
import re
|
||||
import calendar
|
||||
import time
|
||||
import datetime
|
||||
from email.utils import parsedate_tz
|
||||
|
||||
from lib.requests.structures import CaseInsensitiveDict
|
||||
from requests.structures import CaseInsensitiveDict
|
||||
|
||||
from .cache import DictCache
|
||||
from .compat import parsedate_tz
|
||||
from .serialize import Serializer
|
||||
|
||||
|
||||
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
|
||||
|
||||
|
||||
|
@ -21,7 +21,7 @@ def parse_uri(uri):
|
|||
(scheme, authority, path, query, fragment) = parse_uri(uri)
|
||||
"""
|
||||
groups = URI.match(uri).groups()
|
||||
return groups[1], groups[3], groups[4], groups[6], groups[8]
|
||||
return (groups[1], groups[3], groups[4], groups[6], groups[8])
|
||||
|
||||
|
||||
class CacheController(object):
|
||||
|
@ -32,26 +32,29 @@ class CacheController(object):
|
|||
self.cache_etags = cache_etags
|
||||
self.serializer = serializer or Serializer()
|
||||
|
||||
def _urlnorm(self, uri):
|
||||
@classmethod
|
||||
def _urlnorm(cls, uri):
|
||||
"""Normalize the URL to create a safe key for the cache"""
|
||||
(scheme, authority, path, query, fragment) = parse_uri(uri)
|
||||
if not scheme or not authority:
|
||||
raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
|
||||
authority = authority.lower()
|
||||
|
||||
scheme = scheme.lower()
|
||||
authority = authority.lower()
|
||||
|
||||
if not path:
|
||||
path = "/"
|
||||
|
||||
# Could do syntax based normalization of the URI before
|
||||
# computing the digest. See Section 6.2.2 of Std 66.
|
||||
request_uri = query and "?".join([path, query]) or path
|
||||
scheme = scheme.lower()
|
||||
defrag_uri = scheme + "://" + authority + request_uri
|
||||
|
||||
return defrag_uri
|
||||
|
||||
def cache_url(self, uri):
|
||||
return self._urlnorm(uri)
|
||||
@classmethod
|
||||
def cache_url(cls, uri):
|
||||
return cls._urlnorm(uri)
|
||||
|
||||
def parse_cache_control(self, headers):
|
||||
"""
|
||||
|
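
Because _urlnorm() and cache_url() are now classmethods, a cache key can be computed without instantiating a controller. A quick illustration, assuming the upstream package name:

from cachecontrol.controller import CacheController

# scheme and host are lowercased, an empty path becomes "/", fragments are dropped
print(CacheController.cache_url('HTTP://Example.COM/Path?q=1#frag'))
# -> 'http://example.com/Path?q=1'
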
@ -68,13 +71,20 @@ class CacheController(object):
|
|||
parts = headers[cc_header].split(',')
|
||||
parts_with_args = [
|
||||
tuple([x.strip().lower() for x in part.split("=", 1)])
|
||||
for part in parts if -1 != part.find("=")]
|
||||
parts_wo_args = [(name.strip().lower(), 1)
|
||||
for name in parts if -1 == name.find("=")]
|
||||
for part in parts if -1 != part.find("=")
|
||||
]
|
||||
parts_wo_args = [
|
||||
(name.strip().lower(), 1)
|
||||
for name in parts if -1 == name.find("=")
|
||||
]
|
||||
retval = dict(parts_with_args + parts_wo_args)
|
||||
return retval
|
||||
|
||||
def cached_request(self, request):
|
||||
"""
|
||||
Return a cached response if it exists in the cache, otherwise
|
||||
return False.
|
||||
"""
|
||||
cache_url = self.cache_url(request.url)
|
||||
cc = self.parse_cache_control(request.headers)
|
||||
|
||||
|
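
The reflowed comprehensions above turn a Cache-Control header into a dict of directives: entries containing "=" keep their value, bare directives map to 1. A standalone re-derivation of that logic:

parts = 'max-age=3600, no-transform, public'.split(',')
parts_with_args = [
    tuple(x.strip().lower() for x in part.split('=', 1))
    for part in parts if part.find('=') != -1
]
parts_wo_args = [(name.strip().lower(), 1)
                 for name in parts if name.find('=') == -1]

print(dict(parts_with_args + parts_wo_args))
# {'max-age': '3600', 'no-transform': 1, 'public': 1}
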
@ -95,7 +105,24 @@ class CacheController(object):
|
|||
if not resp:
|
||||
return False
|
||||
|
||||
# If we have a cached 301, return it immediately. We don't
|
||||
# need to test our response for other headers b/c it is
|
||||
# intrinsically "cacheable" as it is Permanent.
|
||||
# See:
|
||||
# https://tools.ietf.org/html/rfc7231#section-6.4.2
|
||||
#
|
||||
# Client can try to refresh the value by repeating the request
|
||||
# with cache busting headers as usual (ie no-cache).
|
||||
if resp.status == 301:
|
||||
return resp
|
||||
|
||||
headers = CaseInsensitiveDict(resp.headers)
|
||||
if not headers or 'date' not in headers:
|
||||
# Without a date or etag, the cached response can never be used
|
||||
# and should be deleted.
|
||||
if 'etag' not in headers:
|
||||
self.cache.delete(cache_url)
|
||||
return False
|
||||
|
||||
now = time.time()
|
||||
date = calendar.timegm(
|
||||
|
@ -104,15 +131,19 @@ class CacheController(object):
|
|||
current_age = max(0, now - date)
|
||||
|
||||
# TODO: There is an assumption that the result will be a
|
||||
# urllib3 response object. This may not be best since we
|
||||
# could probably avoid instantiating or constructing the
|
||||
# response until we know we need it.
|
||||
# urllib3 response object. This may not be best since we
|
||||
# could probably avoid instantiating or constructing the
|
||||
# response until we know we need it.
|
||||
resp_cc = self.parse_cache_control(headers)
|
||||
|
||||
# determine freshness
|
||||
freshness_lifetime = 0
|
||||
|
||||
# Check the max-age pragma in the cache control header
|
||||
if 'max-age' in resp_cc and resp_cc['max-age'].isdigit():
|
||||
freshness_lifetime = int(resp_cc['max-age'])
|
||||
|
||||
# If there isn't a max-age, check for an expires header
|
||||
elif 'expires' in headers:
|
||||
expires = parsedate_tz(headers['expires'])
|
||||
if expires is not None:
|
||||
|
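
The freshness check above measures the response's current age (now minus its Date header) against max-age or, failing that, an Expires header. A compact standalone sketch of that arithmetic:

import calendar
import time
from email.utils import parsedate_tz

headers = {'date': 'Mon, 18 May 2015 00:00:00 GMT',
           'cache-control': 'max-age=3600'}

date = calendar.timegm(parsedate_tz(headers['date']))
current_age = max(0, time.time() - date)
freshness_lifetime = 3600    # parsed from the max-age directive above
# an old Date header, so this prints False
print(freshness_lifetime > current_age)
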
@ -163,32 +194,24 @@ class CacheController(object):
|
|||
|
||||
return new_headers
|
||||
|
||||
def cache_response(self, request, response):
|
||||
def cache_response(self, request, response, body=None):
|
||||
"""
|
||||
Algorithm for caching requests.
|
||||
|
||||
This assumes a requests Response object.
|
||||
"""
|
||||
# From httplib2: Don't cache 206's since we aren't going to
|
||||
# handle byte range requests
|
||||
if response.status not in [200, 203]:
|
||||
# handle byte range requests
|
||||
if response.status not in [200, 203, 300, 301]:
|
||||
return
|
||||
|
||||
# Cache Session Params
|
||||
cache_auto = getattr(request, 'cache_auto', False)
|
||||
cache_urls = getattr(request, 'cache_urls', [])
|
||||
cache_max_age = getattr(request, 'cache_max_age', None)
|
||||
|
||||
response_headers = CaseInsensitiveDict(response.headers)
|
||||
|
||||
# Check if we are wanting to cache responses from specific urls only
|
||||
cache_url = self.cache_url(request.url)
|
||||
if len(cache_urls) > 0 and not any(s in cache_url for s in cache_urls):
|
||||
return
|
||||
|
||||
cc_req = self.parse_cache_control(request.headers)
|
||||
cc = self.parse_cache_control(response_headers)
|
||||
|
||||
cache_url = self.cache_url(request.url)
|
||||
|
||||
# Delete it from the cache if we happen to have it stored there
|
||||
no_store = cc.get('no-store') or cc_req.get('no-store')
|
||||
if no_store and self.cache.get(cache_url):
|
||||
|
@ -196,21 +219,18 @@ class CacheController(object):
|
|||
|
||||
# If we've been given an etag, then keep the response
|
||||
if self.cache_etags and 'etag' in response_headers:
|
||||
self.cache.set(cache_url, self.serializer.dumps(request, response))
|
||||
self.cache.set(
|
||||
cache_url,
|
||||
self.serializer.dumps(request, response, body=body),
|
||||
)
|
||||
|
||||
# If we want to cache sites not setup with cache headers then add the proper headers and keep the response
|
||||
elif cache_auto and not cc and response_headers:
|
||||
headers = {'Cache-Control': 'public,max-age=%d' % int(cache_max_age or 900)}
|
||||
response.headers.update(headers)
|
||||
|
||||
if 'expires' not in response_headers:
|
||||
if getattr(response_headers, 'expires', None) is None:
|
||||
expires = datetime.datetime.utcnow() + datetime.timedelta(days=1)
|
||||
expires = expires.strftime("%a, %d %b %Y %H:%M:%S GMT")
|
||||
headers = {'Expires': expires}
|
||||
response.headers.update(headers)
|
||||
|
||||
self.cache.set(cache_url, self.serializer.dumps(request, response))
|
||||
# Add to the cache any 301s. We do this before looking at
|
||||
# the Date headers.
|
||||
elif response.status == 301:
|
||||
self.cache.set(
|
||||
cache_url,
|
||||
self.serializer.dumps(request, response)
|
||||
)
|
||||
|
||||
# Add to the cache if the response headers demand it. If there
|
||||
# is no date header then we can't do anything about expiring
|
||||
|
@ -219,10 +239,10 @@ class CacheController(object):
|
|||
# cache when there is a max-age > 0
|
||||
if cc and cc.get('max-age'):
|
||||
if int(cc['max-age']) > 0:
|
||||
if isinstance(cache_max_age, int):
|
||||
cc['max-age'] = int(cache_max_age)
|
||||
response.headers['cache-control'] = ''.join(['%s=%s' % (key, value) for (key, value) in cc.items()])
|
||||
self.cache.set(cache_url, self.serializer.dumps(request, response))
|
||||
self.cache.set(
|
||||
cache_url,
|
||||
self.serializer.dumps(request, response, body=body),
|
||||
)
|
||||
|
||||
# If the request can expire, it means we should cache it
|
||||
# in the meantime.
|
||||
|
@ -230,7 +250,7 @@ class CacheController(object):
|
|||
if response_headers['expires']:
|
||||
self.cache.set(
|
||||
cache_url,
|
||||
self.serializer.dumps(request, response),
|
||||
self.serializer.dumps(request, response, body=body),
|
||||
)
|
||||
|
||||
def update_cached_response(self, request, response):
|
||||
|
@ -242,14 +262,30 @@ class CacheController(object):
|
|||
"""
|
||||
cache_url = self.cache_url(request.url)
|
||||
|
||||
cached_response = self.serializer.loads(request, self.cache.get(cache_url))
|
||||
cached_response = self.serializer.loads(
|
||||
request,
|
||||
self.cache.get(cache_url)
|
||||
)
|
||||
|
||||
if not cached_response:
|
||||
# we didn't have a cached response
|
||||
return response
|
||||
|
||||
# did so lets update our headers
|
||||
cached_response.headers.update(response.headers)
|
||||
# Lets update our headers with the headers from the new request:
|
||||
# http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-26#section-4.1
|
||||
#
|
||||
# The server isn't supposed to send headers that would make
|
||||
# the cached body invalid. But... just in case, we'll be sure
|
||||
# to strip out ones we know might be problematic due to
|
||||
# typical assumptions.
|
||||
excluded_headers = [
|
||||
"content-length",
|
||||
]
|
||||
|
||||
cached_response.headers.update(
|
||||
dict((k, v) for k, v in response.headers.items()
|
||||
if k.lower() not in excluded_headers)
|
||||
)
|
||||
|
||||
# we want a 200 b/c we have content via the cache
|
||||
cached_response.status = 200
|
||||
|
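
On a 304 revalidation the cached entry's headers are refreshed from the conditional response, minus headers (currently just Content-Length) that could contradict the stored body. A quick standalone illustration of that filter:

cached_headers = {'Content-Type': 'text/html', 'Content-Length': '1024'}
fresh_headers = {'Content-Length': '0', 'Date': 'Mon, 18 May 2015 00:00:00 GMT'}

excluded = ['content-length']
cached_headers.update(
    dict((k, v) for k, v in fresh_headers.items() if k.lower() not in excluded)
)
print(cached_headers)
# {'Content-Type': 'text/html', 'Content-Length': '1024',
#  'Date': 'Mon, 18 May 2015 00:00:00 GMT'}
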
|
lib/cachecontrol/filewrapper.py (new file, 63 lines)
|
@ -0,0 +1,63 @@
|
|||
from io import BytesIO
|
||||
|
||||
|
||||
class CallbackFileWrapper(object):
|
||||
"""
|
||||
Small wrapper around a fp object which will tee everything read into a
|
||||
buffer, and when that file is closed it will execute a callback with the
|
||||
contents of that buffer.
|
||||
|
||||
All attributes are proxied to the underlying file object.
|
||||
|
||||
This class uses members with a double underscore (__) leading prefix so as
|
||||
not to accidentally shadow an attribute.
|
||||
"""
|
||||
|
||||
def __init__(self, fp, callback):
|
||||
self.__buf = BytesIO()
|
||||
self.__fp = fp
|
||||
self.__callback = callback
|
||||
|
||||
def __getattr__(self, name):
|
||||
# The vagaries of garbage collection mean that self.__fp is
|
||||
# not always set. By using __getattribute__ and the private
|
||||
# name[0] allows looking up the attribute value and raising an
|
||||
# AttributeError when it doesn't exist. This stops things from
|
||||
# infinitely recursing calls to getattr in the case where
|
||||
# self.__fp hasn't been set.
|
||||
#
|
||||
# [0] https://docs.python.org/2/reference/expressions.html#atom-identifiers
|
||||
fp = self.__getattribute__('_CallbackFileWrapper__fp')
|
||||
return getattr(fp, name)
|
||||
|
||||
def __is_fp_closed(self):
|
||||
try:
|
||||
return self.__fp.fp is None
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
try:
|
||||
return self.__fp.closed
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
# We just don't cache it then.
|
||||
# TODO: Add some logging here...
|
||||
return False
|
||||
|
||||
def read(self, amt=None):
|
||||
data = self.__fp.read(amt)
|
||||
self.__buf.write(data)
|
||||
|
||||
if self.__is_fp_closed():
|
||||
if self.__callback:
|
||||
self.__callback(self.__buf.getvalue())
|
||||
|
||||
# We assign this to None here, because otherwise we can get into
|
||||
# really tricky problems where the CPython interpreter deadlocks
|
||||
# because the callback is holding a reference to something which
|
||||
# has a __del__ method. Setting this to None breaks the cycle
|
||||
# and allows the garbage collector to do its thing normally.
|
||||
self.__callback = None
|
||||
|
||||
return data
|
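
CallbackFileWrapper tees every read() into a buffer and hands the complete body to a callback once the underlying stream is done, which is how responses are cached only after they are actually consumed. A tiny standalone illustration of the pattern (it signals completion on an empty read rather than on a closed fp, so it is not the class above):

from io import BytesIO

class TeeReader(object):
    def __init__(self, fp, callback):
        self._fp, self._buf, self._callback = fp, BytesIO(), callback

    def read(self, amt=None):
        data = self._fp.read(amt)
        self._buf.write(data)
        if not data and self._callback:
            # stream exhausted: hand the buffered body to the callback once
            self._callback(self._buf.getvalue())
            self._callback = None
        return data

r = TeeReader(BytesIO(b'response body'), lambda body: print('cache:', body))
r.read()   # b'response body'
r.read()   # b'' and the callback fires with the complete body
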
lib/cachecontrol/heuristics.py (new file, 134 lines)
|
@ -0,0 +1,134 @@
|
|||
import calendar
|
||||
import time
|
||||
|
||||
from email.utils import formatdate, parsedate, parsedate_tz
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
TIME_FMT = "%a, %d %b %Y %H:%M:%S GMT"
|
||||
|
||||
|
||||
def expire_after(delta, date=None):
|
||||
date = date or datetime.now()
|
||||
return date + delta
|
||||
|
||||
|
||||
def datetime_to_header(dt):
|
||||
return formatdate(calendar.timegm(dt.timetuple()))
|
||||
|
||||
|
||||
class BaseHeuristic(object):
|
||||
|
||||
def warning(self, response):
|
||||
"""
|
||||
Return a valid 1xx warning header value describing the cache
|
||||
adjustments.
|
||||
|
||||
The response is provided to allow warnings like 113
|
||||
http://tools.ietf.org/html/rfc7234#section-5.5.4 where we need
|
||||
to explicitly say response is over 24 hours old.
|
||||
"""
|
||||
return '110 - "Response is Stale"'
|
||||
|
||||
def update_headers(self, response):
|
||||
"""Update the response headers with any new headers.
|
||||
|
||||
NOTE: This SHOULD always include some Warning header to
|
||||
signify that the response was cached by the client, not
|
||||
by way of the provided headers.
|
||||
"""
|
||||
return {}
|
||||
|
||||
def apply(self, response):
|
||||
warning_header_value = self.warning(response)
|
||||
response.headers.update(self.update_headers(response))
|
||||
if warning_header_value is not None:
|
||||
response.headers.update({'Warning': warning_header_value})
|
||||
return response
|
||||
|
||||
|
||||
class OneDayCache(BaseHeuristic):
|
||||
"""
|
||||
Cache the response by providing an expires 1 day in the
|
||||
future.
|
||||
"""
|
||||
def update_headers(self, response):
|
||||
headers = {}
|
||||
|
||||
if 'expires' not in response.headers:
|
||||
date = parsedate(response.headers['date'])
|
||||
expires = expire_after(timedelta(days=1),
|
||||
date=datetime(*date[:6]))
|
||||
headers['expires'] = datetime_to_header(expires)
|
||||
headers['cache-control'] = 'public'
|
||||
return headers
|
||||
|
||||
|
||||
class ExpiresAfter(BaseHeuristic):
|
||||
"""
|
||||
Cache **all** requests for a defined time period.
|
||||
"""
|
||||
|
||||
def __init__(self, **kw):
|
||||
self.delta = timedelta(**kw)
|
||||
|
||||
def update_headers(self, response):
|
||||
expires = expire_after(self.delta)
|
||||
return {
|
||||
'expires': datetime_to_header(expires),
|
||||
'cache-control': 'public',
|
||||
}
|
||||
|
||||
def warning(self, response):
|
||||
tmpl = '110 - Automatically cached for %s. Response might be stale'
|
||||
return tmpl % self.delta
|
||||
|
||||
|
||||
class LastModified(BaseHeuristic):
|
||||
"""
|
||||
If there is no Expires header already, fall back on Last-Modified
|
||||
using the heuristic from
|
||||
http://tools.ietf.org/html/rfc7234#section-4.2.2
|
||||
to calculate a reasonable value.
|
||||
|
||||
Firefox also does something like this per
|
||||
https://developer.mozilla.org/en-US/docs/Web/HTTP/Caching_FAQ
|
||||
http://lxr.mozilla.org/mozilla-release/source/netwerk/protocol/http/nsHttpResponseHead.cpp#397
|
||||
Unlike Mozilla, we limit this to 24 hours.
|
||||
"""
|
||||
cacheable_by_default_statuses = set([
|
||||
200, 203, 204, 206, 300, 301, 404, 405, 410, 414, 501
|
||||
])
|
||||
|
||||
def update_headers(self, resp):
|
||||
headers = resp.headers
|
||||
|
||||
if 'expires' in headers:
|
||||
return {}
|
||||
|
||||
if 'cache-control' in headers and headers['cache-control'] != 'public':
|
||||
return {}
|
||||
|
||||
if resp.status not in self.cacheable_by_default_statuses:
|
||||
return {}
|
||||
|
||||
if 'date' not in headers or 'last-modified' not in headers:
|
||||
return {}
|
||||
|
||||
date = calendar.timegm(parsedate_tz(headers['date']))
|
||||
last_modified = parsedate(headers['last-modified'])
|
||||
if date is None or last_modified is None:
|
||||
return {}
|
||||
|
||||
now = time.time()
|
||||
current_age = max(0, now - date)
|
||||
delta = date - calendar.timegm(last_modified)
|
||||
freshness_lifetime = max(0, min(delta / 10, 24 * 3600))
|
||||
if freshness_lifetime <= current_age:
|
||||
return {}
|
||||
|
||||
expires = date + freshness_lifetime
|
||||
return {'expires': time.strftime(TIME_FMT, time.gmtime(expires))}
|
||||
|
||||
def warning(self, resp):
|
||||
return None
|
|
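
heuristics.py is added here as a new file and provides the objects the adapter's heuristic argument expects. A minimal sketch of opting a session into one of them, under the same cachecontrol package-name assumption as the earlier examples:

import requests

from cachecontrol import CacheControl
from cachecontrol.heuristics import ExpiresAfter

# every otherwise-cacheable response is treated as fresh for one day
sess = CacheControl(requests.Session(), heuristic=ExpiresAfter(days=1))
resp = sess.get('http://example.com/')
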
@ -1,56 +0,0 @@
|
|||
import requests
|
||||
|
||||
from requests import models
|
||||
from requests.packages.urllib3.response import HTTPResponse
|
||||
|
||||
__attrs__ = [
|
||||
'_content',
|
||||
'status_code',
|
||||
'headers',
|
||||
'url',
|
||||
'history',
|
||||
'encoding',
|
||||
'reason',
|
||||
'cookies',
|
||||
'elapsed',
|
||||
]
|
||||
|
||||
|
||||
def response_getstate(self):
|
||||
# consume everything
|
||||
if not self._content_consumed:
|
||||
self.content
|
||||
|
||||
state = dict(
|
||||
(attr, getattr(self, attr, None))
|
||||
for attr in __attrs__
|
||||
)
|
||||
|
||||
# deal with our raw content b/c we need it for our cookie jar
|
||||
state['raw_original_response'] = self.raw._original_response
|
||||
return state
|
||||
|
||||
|
||||
def response_setstate(self, state):
|
||||
for name, value in state.items():
|
||||
if name != 'raw_original_response':
|
||||
setattr(self, name, value)
|
||||
|
||||
setattr(self, 'raw', HTTPResponse())
|
||||
self.raw._original_response = state['raw_original_response']
|
||||
|
||||
|
||||
def make_responses_pickleable():
|
||||
try:
|
||||
version_parts = [int(part) for part in requests.__version__.split('.')]
|
||||
|
||||
# must be >= 2.2.x
|
||||
if not version_parts[0] >= 2 or not version_parts[1] >= 2:
|
||||
models.Response.__getstate__ = response_getstate
|
||||
models.Response.__setstate__ = response_setstate
|
||||
except:
|
||||
raise
|
||||
pass
|
||||
|
||||
|
||||
make_responses_pickleable()
|
|
@ -1,27 +1,59 @@
|
|||
import base64
|
||||
import io
|
||||
import json
|
||||
import zlib
|
||||
|
||||
from requests.structures import CaseInsensitiveDict
|
||||
|
||||
from .compat import HTTPResponse, pickle
|
||||
|
||||
|
||||
def _b64_encode_bytes(b):
|
||||
return base64.b64encode(b).decode("ascii")
|
||||
|
||||
|
||||
def _b64_encode_str(s):
|
||||
return _b64_encode_bytes(s.encode("utf8"))
|
||||
|
||||
|
||||
def _b64_decode_bytes(b):
|
||||
return base64.b64decode(b.encode("ascii"))
|
||||
|
||||
|
||||
def _b64_decode_str(s):
|
||||
return _b64_decode_bytes(s).decode("utf8")
|
||||
|
||||
|
||||
class Serializer(object):
|
||||
|
||||
def dumps(self, request, response, body=None):
|
||||
response_headers = CaseInsensitiveDict(response.headers)
|
||||
|
||||
if body is None:
|
||||
# TODO: Figure out a way to handle this which doesn't break
|
||||
# streaming
|
||||
body = response.read(decode_content=False)
|
||||
|
||||
# NOTE: 99% sure this is dead code. I'm only leaving it
|
||||
# here b/c I don't have a test yet to prove
|
||||
# it. Basically, before using
|
||||
# `cachecontrol.filewrapper.CallbackFileWrapper`,
|
||||
# this made an effort to reset the file handle. The
|
||||
# `CallbackFileWrapper` short circuits this code by
|
||||
# setting the body as the content is consumed, the
|
||||
# result being a `body` argument is *always* passed
|
||||
# into cache_response, and in turn,
|
||||
# `Serializer.dump`.
|
||||
response._fp = io.BytesIO(body)
|
||||
|
||||
data = {
|
||||
"response": {
|
||||
"body": body,
|
||||
"headers": response.headers,
|
||||
"body": _b64_encode_bytes(body),
|
||||
"headers": dict(
|
||||
(_b64_encode_str(k), _b64_encode_str(v))
|
||||
for k, v in response.headers.items()
|
||||
),
|
||||
"status": response.status,
|
||||
"version": response.version,
|
||||
"reason": response.reason,
|
||||
"reason": _b64_encode_str(response.reason),
|
||||
"strict": response.strict,
|
||||
"decode_content": response.decode_content,
|
||||
},
|
||||
|
@ -35,7 +67,20 @@ class Serializer(object):
|
|||
header = header.strip()
|
||||
data["vary"][header] = request.headers.get(header, None)
|
||||
|
||||
return b"cc=1," + pickle.dumps(data, pickle.HIGHEST_PROTOCOL)
|
||||
# Encode our Vary headers to ensure they can be serialized as JSON
|
||||
data["vary"] = dict(
|
||||
(_b64_encode_str(k), _b64_encode_str(v) if v is not None else v)
|
||||
for k, v in data["vary"].items()
|
||||
)
|
||||
|
||||
return b",".join([
|
||||
b"cc=2",
|
||||
zlib.compress(
|
||||
json.dumps(
|
||||
data, separators=(",", ":"), sort_keys=True,
|
||||
).encode("utf8"),
|
||||
),
|
||||
])
|
||||
|
||||
def loads(self, request, data):
|
||||
# Short circuit if we've been given an empty set of data
|
||||
|
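
dumps() now emits a versioned blob: the literal prefix cc=2, followed by zlib-compressed JSON in which the body, headers and reason are base64-encoded. A rough sketch of what such a blob looks like and how it can be unpacked (the field values are illustrative only):

import base64
import json
import zlib

payload = {
    'response': {
        'body': base64.b64encode(b'hello').decode('ascii'),
        'status': 200,
    },
    'vary': {},
}
blob = b'cc=2,' + zlib.compress(
    json.dumps(payload, separators=(',', ':'), sort_keys=True).encode('utf8')
)

ver, _, rest = blob.partition(b',')
assert ver == b'cc=2'
print(json.loads(zlib.decompress(rest).decode('utf8')))
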
@ -66,6 +111,40 @@ class Serializer(object):
|
|||
# just treat it as a miss and return None
|
||||
return
|
||||
|
||||
def prepare_response(self, request, cached):
|
||||
"""Verify our vary headers match and construct a real urllib3
|
||||
HTTPResponse object.
|
||||
"""
|
||||
# Special case the '*' Vary value as it means we cannot actually
|
||||
# determine if the cached response is suitable for this request.
|
||||
if "*" in cached.get("vary", {}):
|
||||
return
|
||||
|
||||
# Ensure that the Vary headers for the cached response match our
|
||||
# request
|
||||
for header, value in cached.get("vary", {}).items():
|
||||
if request.headers.get(header, None) != value:
|
||||
return
|
||||
|
||||
body_raw = cached["response"].pop("body")
|
||||
|
||||
try:
|
||||
body = io.BytesIO(body_raw)
|
||||
except TypeError:
|
||||
# This can happen if cachecontrol serialized to v1 format (pickle)
|
||||
# using Python 2. A Python 2 str(byte string) will be unpickled as
|
||||
# a Python 3 str (unicode string), which will cause the above to
|
||||
# fail with:
|
||||
#
|
||||
# TypeError: 'str' does not support the buffer interface
|
||||
body = io.BytesIO(body_raw.encode('utf8'))
|
||||
|
||||
return HTTPResponse(
|
||||
body=body,
|
||||
preload_content=False,
|
||||
**cached["response"]
|
||||
)
|
||||
|
||||
def _loads_v0(self, request, data):
|
||||
# The original legacy cache data. This doesn't contain enough
|
||||
# information to construct everything we need, so we'll treat this as
|
||||
|
@ -78,20 +157,28 @@ class Serializer(object):
|
|||
except ValueError:
|
||||
return
|
||||
|
||||
# Special case the '*' Vary value as it means we cannot actually
|
||||
# determine if the cached response is suitable for this request.
|
||||
if "*" in cached.get("vary", {}):
|
||||
return self.prepare_response(request, cached)
|
||||
|
||||
def _loads_v2(self, request, data):
|
||||
try:
|
||||
cached = json.loads(zlib.decompress(data).decode("utf8"))
|
||||
except ValueError:
|
||||
return
|
||||
|
||||
# Ensure that the Vary headers for the cached response match our
|
||||
# request
|
||||
for header, value in cached.get("vary", {}).items():
|
||||
if request.headers.get(header, None) != value:
|
||||
return
|
||||
|
||||
body = io.BytesIO(cached["response"].pop("body"))
|
||||
return HTTPResponse(
|
||||
body=body,
|
||||
preload_content=False,
|
||||
**cached["response"]
|
||||
# We need to decode the items that we've base64 encoded
|
||||
cached["response"]["body"] = _b64_decode_bytes(
|
||||
cached["response"]["body"]
|
||||
)
|
||||
cached["response"]["headers"] = dict(
|
||||
(_b64_decode_str(k), _b64_decode_str(v))
|
||||
for k, v in cached["response"]["headers"].items()
|
||||
)
|
||||
cached["response"]["reason"] = _b64_decode_str(
|
||||
cached["response"]["reason"],
|
||||
)
|
||||
cached["vary"] = dict(
|
||||
(_b64_decode_str(k), _b64_decode_str(v) if v is not None else v)
|
||||
for k, v in cached["vary"].items()
|
||||
)
|
||||
|
||||
return self.prepare_response(request, cached)
|
||||
|
|
|
@ -1,34 +0,0 @@
|
|||
from requests.sessions import Session
|
||||
|
||||
class CacheControlSession(Session):
|
||||
def __init__(self):
|
||||
super(CacheControlSession, self).__init__()
|
||||
|
||||
def get(self, *args, **kw):
|
||||
# auto-cache response
|
||||
self.cache_auto = False
|
||||
if kw.get('cache_auto'):
|
||||
self.cache_auto = kw.pop('cache_auto')
|
||||
|
||||
# urls allowed to cache
|
||||
self.cache_urls = []
|
||||
if kw.get('cache_urls'):
|
||||
self.cache_urls = [str(args[0])] + kw.pop('cache_urls')
|
||||
|
||||
# timeout for cached responses
|
||||
self.cache_max_age = None
|
||||
if kw.get('cache_max_age'):
|
||||
self.cache_max_age = int(kw.pop('cache_max_age'))
|
||||
|
||||
return super(CacheControlSession, self).get(*args, **kw)
|
||||
|
||||
def prepare_request(self, *args, **kw):
|
||||
# get response
|
||||
req = super(CacheControlSession, self).prepare_request(*args, **kw)
|
||||
|
||||
# attach params to request
|
||||
req.cache_auto = self.cache_auto
|
||||
req.cache_urls = self.cache_urls
|
||||
req.cache_max_age = self.cache_max_age
|
||||
|
||||
return req
|
|
@ -1,14 +1,19 @@
|
|||
from .adapter import CacheControlAdapter
|
||||
from .cache import DictCache
|
||||
from .session import CacheControlSession
|
||||
|
||||
def CacheControl(sess=None, cache=None, cache_etags=True, serializer=None):
|
||||
sess = sess or CacheControlSession()
|
||||
|
||||
def CacheControl(sess,
|
||||
cache=None,
|
||||
cache_etags=True,
|
||||
serializer=None,
|
||||
heuristic=None):
|
||||
|
||||
cache = cache or DictCache()
|
||||
adapter = CacheControlAdapter(
|
||||
cache,
|
||||
cache_etags=cache_etags,
|
||||
serializer=serializer,
|
||||
heuristic=heuristic,
|
||||
)
|
||||
sess.mount('http://', adapter)
|
||||
sess.mount('https://', adapter)
|
||||
|
|
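
The wrapper no longer builds a CacheControlSession internally; a session must now be passed in, and cache, serializer and heuristic are forwarded to the adapter. A short sketch under the same package-name assumption (the '.webcache' directory name is only an illustration):

import requests

from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache

sess = CacheControl(requests.Session(), cache=FileCache('.webcache'))
resp = sess.get('http://example.com/')
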