diff --git a/CHANGES.md b/CHANGES.md index 05b00fbd..1d4c9544 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -8,6 +8,7 @@ * Remove redundant MultipartPostHandler * Update Beautiful Soup 4.4.0 (r390) to 4.4.0 (r397) * Update backports/ssl_match_hostname 3.4.0.2 to 3.5.0.1 (r18) +* Update cachecontrol library 0.11.2 to 0.11.5 ### 0.11.0 (2016-01-10 22:30:00 UTC) diff --git a/lib/cachecontrol/__init__.py b/lib/cachecontrol/__init__.py index fae051a0..d6af9b93 100644 --- a/lib/cachecontrol/__init__.py +++ b/lib/cachecontrol/__init__.py @@ -4,7 +4,7 @@ Make it easy to import from cachecontrol without long namespaces. """ __author__ = 'Eric Larson' __email__ = 'eric@ionrock.org' -__version__ = '0.11.2' +__version__ = '0.11.5' from .wrapper import CacheControl from .adapter import CacheControlAdapter diff --git a/lib/cachecontrol/_cmd.py b/lib/cachecontrol/_cmd.py new file mode 100644 index 00000000..fcb785dc --- /dev/null +++ b/lib/cachecontrol/_cmd.py @@ -0,0 +1,60 @@ +import logging + +import requests + +from cachecontrol.adapter import CacheControlAdapter +from cachecontrol.cache import DictCache +from cachecontrol.controller import logger + +from argparse import ArgumentParser + + +def setup_logging(): + logger.setLevel(logging.DEBUG) + handler = logging.StreamHandler() + logger.addHandler(handler) + + +def get_session(): + adapter = CacheControlAdapter( + DictCache(), + cache_etags=True, + serializer=None, + heuristic=None, + ) + sess = requests.Session() + sess.mount('http://', adapter) + sess.mount('https://', adapter) + + sess.cache_controller = adapter.controller + return sess + + +def get_args(): + parser = ArgumentParser() + parser.add_argument('url', help='The URL to try and cache') + return parser.parse_args() + + +def main(args=None): + args = get_args() + sess = get_session() + + # Make a request to get a response + resp = sess.get(args.url) + + # Turn on logging + setup_logging() + + # try setting the cache + sess.cache_controller.cache_response(resp.request, resp.raw) + + # Now try to get it + if sess.cache_controller.cached_request(resp.request): + print('Cached!') + else: + print('Not cached :(') + + +if __name__ == '__main__': + main() diff --git a/lib/cachecontrol/caches/file_cache.py b/lib/cachecontrol/caches/file_cache.py index 79b59091..b2152f8f 100644 --- a/lib/cachecontrol/caches/file_cache.py +++ b/lib/cachecontrol/caches/file_cache.py @@ -1,7 +1,8 @@ import hashlib import os -from lockfile import FileLock +from lockfile import LockFile +from lockfile.mkdirlockfile import MkdirLockFile from ..cache import BaseCache from ..controller import CacheController @@ -49,11 +50,23 @@ def _secure_open_write(filename, fmode): class FileCache(BaseCache): def __init__(self, directory, forever=False, filemode=0o0600, - dirmode=0o0700): + dirmode=0o0700, use_dir_lock=None, lock_class=None): + + if use_dir_lock is not None and lock_class is not None: + raise ValueError("Cannot use use_dir_lock and lock_class together") + + if use_dir_lock: + lock_class = MkdirLockFile + + if lock_class is None: + lock_class = LockFile + self.directory = directory self.forever = forever self.filemode = filemode self.dirmode = dirmode + self.lock_class = lock_class + @staticmethod def encode(x): @@ -83,7 +96,7 @@ class FileCache(BaseCache): except (IOError, OSError): pass - with FileLock(name) as lock: + with self.lock_class(name) as lock: # Write our actual file with _secure_open_write(lock.path, self.filemode) as fh: fh.write(value) diff --git a/lib/cachecontrol/compat.py b/lib/cachecontrol/compat.py index 489eb868..ce556579 100644 --- a/lib/cachecontrol/compat.py +++ b/lib/cachecontrol/compat.py @@ -21,3 +21,9 @@ try: from requests.packages.urllib3.util import is_fp_closed except ImportError: from urllib3.util import is_fp_closed + +# Replicate some six behaviour +try: + text_type = (unicode,) +except NameError: + text_type = (str,) diff --git a/lib/cachecontrol/controller.py b/lib/cachecontrol/controller.py index f0380747..af7367eb 100644 --- a/lib/cachecontrol/controller.py +++ b/lib/cachecontrol/controller.py @@ -1,6 +1,7 @@ """ The httplib2 algorithms ported for use with requests. """ +import logging import re import calendar import time @@ -12,6 +13,8 @@ from .cache import DictCache from .serialize import Serializer +logger = logging.getLogger(__name__) + URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?") @@ -86,23 +89,28 @@ class CacheController(object): return False. """ cache_url = self.cache_url(request.url) + logger.debug('Looking up "%s" in the cache', cache_url) cc = self.parse_cache_control(request.headers) - # non-caching states - no_cache = True if 'no-cache' in cc else False - if 'max-age' in cc and cc['max-age'] == 0: - no_cache = True - - # Bail out if no-cache was set - if no_cache: + # Bail out if the request insists on fresh data + if 'no-cache' in cc: + logger.debug('Request header has "no-cache", cache bypassed') return False - # It is in the cache, so lets see if it is going to be - # fresh enough - resp = self.serializer.loads(request, self.cache.get(cache_url)) + if 'max-age' in cc and cc['max-age'] == 0: + logger.debug('Request header has "max_age" as 0, cache bypassed') + return False - # Check to see if we have a cached object + # Request allows serving from the cache, let's see if we find something + cache_data = self.cache.get(cache_url) + if cache_data is None: + logger.debug('No cache entry available') + return False + + # Check whether it can be deserialized + resp = self.serializer.loads(request, cache_data) if not resp: + logger.warning('Cache entry deserialization failed, entry ignored') return False # If we have a cached 301, return it immediately. We don't @@ -114,14 +122,19 @@ class CacheController(object): # Client can try to refresh the value by repeating the request # with cache busting headers as usual (ie no-cache). if resp.status == 301: + msg = ('Returning cached "301 Moved Permanently" response ' + '(ignoring date and etag information)') + logger.debug(msg) return resp headers = CaseInsensitiveDict(resp.headers) if not headers or 'date' not in headers: - # With date or etag, the cached response can never be used - # and should be deleted. if 'etag' not in headers: + # Without date or etag, the cached response can never be used + # and should be deleted. + logger.debug('Purging cached response: no date or etag') self.cache.delete(cache_url) + logger.debug('Ignoring cached response: no date') return False now = time.time() @@ -129,6 +142,7 @@ class CacheController(object): parsedate_tz(headers['date']) ) current_age = max(0, now - date) + logger.debug('Current age based on date: %i', current_age) # TODO: There is an assumption that the result will be a # urllib3 response object. This may not be best since we @@ -142,6 +156,8 @@ class CacheController(object): # Check the max-age pragma in the cache control header if 'max-age' in resp_cc and resp_cc['max-age'].isdigit(): freshness_lifetime = int(resp_cc['max-age']) + logger.debug('Freshness lifetime from max-age: %i', + freshness_lifetime) # If there isn't a max-age, check for an expires header elif 'expires' in headers: @@ -149,11 +165,16 @@ class CacheController(object): if expires is not None: expire_time = calendar.timegm(expires) - date freshness_lifetime = max(0, expire_time) + logger.debug("Freshness lifetime from expires: %i", + freshness_lifetime) - # determine if we are setting freshness limit in the req + # Determine if we are setting freshness limit in the + # request. Note, this overrides what was in the response. if 'max-age' in cc: try: freshness_lifetime = int(cc['max-age']) + logger.debug('Freshness lifetime from request max-age: %i', + freshness_lifetime) except ValueError: freshness_lifetime = 0 @@ -164,15 +185,20 @@ class CacheController(object): min_fresh = 0 # adjust our current age by our min fresh current_age += min_fresh + logger.debug('Adjusted current age from min-fresh: %i', + current_age) - # see how fresh we actually are - fresh = (freshness_lifetime > current_age) - - if fresh: + # Return entry if it is fresh enough + if freshness_lifetime > current_age: + logger.debug('The response is "fresh", returning cached response') + logger.debug('%i > %i', freshness_lifetime, current_age) return resp # we're not fresh. If we don't have an Etag, clear it out if 'etag' not in headers: + logger.debug( + 'The cached response is "stale" with no etag, purging' + ) self.cache.delete(cache_url) # return the original handler @@ -202,7 +228,13 @@ class CacheController(object): """ # From httplib2: Don't cache 206's since we aren't going to # handle byte range requests - if response.status not in [200, 203, 300, 301]: + cacheable_status_codes = [200, 203, 300, 301] + if response.status not in cacheable_status_codes: + logger.debug( + 'Status code %s not in %s', + response.status, + cacheable_status_codes + ) return response_headers = CaseInsensitiveDict(response.headers) @@ -211,14 +243,23 @@ class CacheController(object): cc = self.parse_cache_control(response_headers) cache_url = self.cache_url(request.url) + logger.debug('Updating cache with response from "%s"', cache_url) # Delete it from the cache if we happen to have it stored there - no_store = cc.get('no-store') or cc_req.get('no-store') + no_store = False + if cc.get('no-store'): + no_store = True + logger.debug('Response header has "no-store"') + if cc_req.get('no-store'): + no_store = True + logger.debug('Request header has "no-store"') if no_store and self.cache.get(cache_url): + logger.debug('Purging existing cache entry to honor "no-store"') self.cache.delete(cache_url) # If we've been given an etag, then keep the response if self.cache_etags and 'etag' in response_headers: + logger.debug('Caching due to etag') self.cache.set( cache_url, self.serializer.dumps(request, response, body=body), @@ -227,6 +268,7 @@ class CacheController(object): # Add to the cache any 301s. We do this before looking that # the Date headers. elif response.status == 301: + logger.debug('Caching permanant redirect') self.cache.set( cache_url, self.serializer.dumps(request, response) @@ -239,6 +281,7 @@ class CacheController(object): # cache when there is a max-age > 0 if cc and cc.get('max-age'): if int(cc['max-age']) > 0: + logger.debug('Caching b/c date exists and max-age > 0') self.cache.set( cache_url, self.serializer.dumps(request, response, body=body), @@ -248,6 +291,7 @@ class CacheController(object): # in the meantime. elif 'expires' in response_headers: if response_headers['expires']: + logger.debug('Caching b/c of expires header') self.cache.set( cache_url, self.serializer.dumps(request, response, body=body), diff --git a/lib/cachecontrol/serialize.py b/lib/cachecontrol/serialize.py index 6b17d80e..13af04bd 100644 --- a/lib/cachecontrol/serialize.py +++ b/lib/cachecontrol/serialize.py @@ -5,7 +5,7 @@ import zlib from requests.structures import CaseInsensitiveDict -from .compat import HTTPResponse, pickle +from .compat import HTTPResponse, pickle, text_type def _b64_encode_bytes(b): @@ -16,6 +16,12 @@ def _b64_encode_str(s): return _b64_encode_bytes(s.encode("utf8")) +def _b64_encode(s): + if isinstance(s, text_type): + return _b64_encode_str(s) + return _b64_encode_bytes(s) + + def _b64_decode_bytes(b): return base64.b64decode(b.encode("ascii")) @@ -48,7 +54,7 @@ class Serializer(object): "response": { "body": _b64_encode_bytes(body), "headers": dict( - (_b64_encode_str(k), _b64_encode_str(v)) + (_b64_encode(k), _b64_encode(v)) for k, v in response.headers.items() ), "status": response.status, @@ -69,7 +75,7 @@ class Serializer(object): # Encode our Vary headers to ensure they can be serialized as JSON data["vary"] = dict( - (_b64_encode_str(k), _b64_encode_str(v) if v is not None else v) + (_b64_encode(k), _b64_encode(v) if v is not None else v) for k, v in data["vary"].items() )