diff --git a/CHANGES.md b/CHANGES.md index 87dd1350..ea0f5e2a 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,7 @@ * Update Beautiful Soup 4.11.1 (r642) to 4.12.2 * Update certifi 2023.05.07 to 2023.07.22 +* Update CacheControl 0.12.11 (c05ef9e) to 0.13.1 (783a338) * Update feedparser 6.0.10 (859ac57) to 6.0.10 (9865dec) * Update package resource API 67.5.1 (f51eccd) to 68.1.2 (1ef36f2) * Update soupsieve 2.3.2.post1 (792d566) to 2.4.1 (2e66beb) diff --git a/lib/cachecontrol/__init__.py b/lib/cachecontrol/__init__.py index f631ae6d..abbf8c06 100644 --- a/lib/cachecontrol/__init__.py +++ b/lib/cachecontrol/__init__.py @@ -8,11 +8,21 @@ Make it easy to import from cachecontrol without long namespaces. """ __author__ = "Eric Larson" __email__ = "eric@ionrock.org" -__version__ = "0.12.11" +__version__ = "0.13.1" -from .wrapper import CacheControl -from .adapter import CacheControlAdapter -from .controller import CacheController +from cachecontrol.adapter import CacheControlAdapter +from cachecontrol.controller import CacheController +from cachecontrol.wrapper import CacheControl + +__all__ = [ + "__author__", + "__email__", + "__version__", + "CacheControlAdapter", + "CacheController", + "CacheControl", +] import logging + logging.getLogger(__name__).addHandler(logging.NullHandler()) diff --git a/lib/cachecontrol/_cmd.py b/lib/cachecontrol/_cmd.py index ccee0079..684a4a8b 100644 --- a/lib/cachecontrol/_cmd.py +++ b/lib/cachecontrol/_cmd.py @@ -1,8 +1,11 @@ # SPDX-FileCopyrightText: 2015 Eric Larson # # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import logging +from argparse import ArgumentParser +from typing import TYPE_CHECKING import requests @@ -10,16 +13,19 @@ from cachecontrol.adapter import CacheControlAdapter from cachecontrol.cache import DictCache from cachecontrol.controller import logger -from argparse import ArgumentParser +if TYPE_CHECKING: + from argparse import Namespace + + from cachecontrol.controller import CacheController -def setup_logging(): +def setup_logging() -> None: logger.setLevel(logging.DEBUG) handler = logging.StreamHandler() logger.addHandler(handler) -def get_session(): +def get_session() -> requests.Session: adapter = CacheControlAdapter( DictCache(), cache_etags=True, serializer=None, heuristic=None ) @@ -27,17 +33,17 @@ def get_session(): sess.mount("http://", adapter) sess.mount("https://", adapter) - sess.cache_controller = adapter.controller + sess.cache_controller = adapter.controller # type: ignore[attr-defined] return sess -def get_args(): +def get_args() -> Namespace: parser = ArgumentParser() parser.add_argument("url", help="The URL to try and cache") return parser.parse_args() -def main(args=None): +def main() -> None: args = get_args() sess = get_session() @@ -48,10 +54,13 @@ def main(args=None): setup_logging() # try setting the cache - sess.cache_controller.cache_response(resp.request, resp.raw) + cache_controller: CacheController = ( + sess.cache_controller # type: ignore[attr-defined] + ) + cache_controller.cache_response(resp.request, resp.raw) # Now try to get it - if sess.cache_controller.cached_request(resp.request): + if cache_controller.cached_request(resp.request): print("Cached!") else: print("Not cached :(") diff --git a/lib/cachecontrol/adapter.py b/lib/cachecontrol/adapter.py index 22b49638..bf4a23dd 100644 --- a/lib/cachecontrol/adapter.py +++ b/lib/cachecontrol/adapter.py @@ -1,16 +1,26 @@ # SPDX-FileCopyrightText: 2015 Eric Larson # # SPDX-License-Identifier: Apache-2.0 +from __future__ import 
annotations -import types import functools +import types import zlib +from typing import TYPE_CHECKING, Any, Collection, Mapping from requests.adapters import HTTPAdapter -from .controller import CacheController, PERMANENT_REDIRECT_STATUSES -from .cache import DictCache -from .filewrapper import CallbackFileWrapper +from cachecontrol.cache import DictCache +from cachecontrol.controller import PERMANENT_REDIRECT_STATUSES, CacheController +from cachecontrol.filewrapper import CallbackFileWrapper + +if TYPE_CHECKING: + from requests import PreparedRequest, Response + from urllib3 import HTTPResponse + + from cachecontrol.cache import BaseCache + from cachecontrol.heuristics import BaseHeuristic + from cachecontrol.serialize import Serializer class CacheControlAdapter(HTTPAdapter): @@ -18,16 +28,16 @@ class CacheControlAdapter(HTTPAdapter): def __init__( self, - cache=None, - cache_etags=True, - controller_class=None, - serializer=None, - heuristic=None, - cacheable_methods=None, - *args, - **kw - ): - super(CacheControlAdapter, self).__init__(*args, **kw) + cache: BaseCache | None = None, + cache_etags: bool = True, + controller_class: type[CacheController] | None = None, + serializer: Serializer | None = None, + heuristic: BaseHeuristic | None = None, + cacheable_methods: Collection[str] | None = None, + *args: Any, + **kw: Any, + ) -> None: + super().__init__(*args, **kw) self.cache = DictCache() if cache is None else cache self.heuristic = heuristic self.cacheable_methods = cacheable_methods or ("GET",) @@ -37,7 +47,16 @@ class CacheControlAdapter(HTTPAdapter): self.cache, cache_etags=cache_etags, serializer=serializer ) - def send(self, request, cacheable_methods=None, **kw): + def send( + self, + request: PreparedRequest, + stream: bool = False, + timeout: None | float | tuple[float, float] | tuple[float, None] = None, + verify: bool | str = True, + cert: (None | bytes | str | tuple[bytes | str, bytes | str]) = None, + proxies: Mapping[str, str] | None = None, + cacheable_methods: Collection[str] | None = None, + ) -> Response: """ Send a request. Use the request information to see if it exists in the cache and cache the response if we need to and can. @@ -54,13 +73,17 @@ class CacheControlAdapter(HTTPAdapter): # check for etags and add headers if appropriate request.headers.update(self.controller.conditional_headers(request)) - resp = super(CacheControlAdapter, self).send(request, **kw) + resp = super().send(request, stream, timeout, verify, cert, proxies) return resp def build_response( - self, request, response, from_cache=False, cacheable_methods=None - ): + self, + request: PreparedRequest, + response: HTTPResponse, + from_cache: bool = False, + cacheable_methods: Collection[str] | None = None, + ) -> Response: """ Build a response by making a request or using the cache. @@ -102,36 +125,37 @@ class CacheControlAdapter(HTTPAdapter): else: # Wrap the response file with a wrapper that will cache the # response when the stream has been consumed. 
- response._fp = CallbackFileWrapper( - response._fp, + response._fp = CallbackFileWrapper( # type: ignore[attr-defined] + response._fp, # type: ignore[attr-defined] functools.partial( self.controller.cache_response, request, response ), ) if response.chunked: - super_update_chunk_length = response._update_chunk_length + super_update_chunk_length = response._update_chunk_length # type: ignore[attr-defined] - def _update_chunk_length(self): + def _update_chunk_length(self: HTTPResponse) -> None: super_update_chunk_length() if self.chunk_left == 0: - self._fp._close() + self._fp._close() # type: ignore[attr-defined] - response._update_chunk_length = types.MethodType( + response._update_chunk_length = types.MethodType( # type: ignore[attr-defined] _update_chunk_length, response ) - resp = super(CacheControlAdapter, self).build_response(request, response) + resp: Response = super().build_response(request, response) # type: ignore[no-untyped-call] # See if we should invalidate the cache. if request.method in self.invalidating_methods and resp.ok: + assert request.url is not None cache_url = self.controller.cache_url(request.url) self.cache.delete(cache_url) # Give the request a from_cache attr to let people use it - resp.from_cache = from_cache + resp.from_cache = from_cache # type: ignore[attr-defined] return resp - def close(self): + def close(self) -> None: self.cache.close() - super(CacheControlAdapter, self).close() + super().close() # type: ignore[no-untyped-call] diff --git a/lib/cachecontrol/cache.py b/lib/cachecontrol/cache.py index 2a965f59..3293b005 100644 --- a/lib/cachecontrol/cache.py +++ b/lib/cachecontrol/cache.py @@ -6,38 +6,46 @@ The cache object API for implementing caches. The default is a thread safe in-memory dictionary. """ +from __future__ import annotations + from threading import Lock +from typing import IO, TYPE_CHECKING, MutableMapping + +if TYPE_CHECKING: + from datetime import datetime -class BaseCache(object): - - def get(self, key): +class BaseCache: + def get(self, key: str) -> bytes | None: raise NotImplementedError() - def set(self, key, value, expires=None): + def set( + self, key: str, value: bytes, expires: int | datetime | None = None + ) -> None: raise NotImplementedError() - def delete(self, key): + def delete(self, key: str) -> None: raise NotImplementedError() - def close(self): + def close(self) -> None: pass class DictCache(BaseCache): - - def __init__(self, init_dict=None): + def __init__(self, init_dict: MutableMapping[str, bytes] | None = None) -> None: self.lock = Lock() self.data = init_dict or {} - def get(self, key): + def get(self, key: str) -> bytes | None: return self.data.get(key, None) - def set(self, key, value, expires=None): + def set( + self, key: str, value: bytes, expires: int | datetime | None = None + ) -> None: with self.lock: self.data.update({key: value}) - def delete(self, key): + def delete(self, key: str) -> None: with self.lock: if key in self.data: self.data.pop(key) @@ -55,10 +63,11 @@ class SeparateBodyBaseCache(BaseCache): Similarly, the body should be loaded separately via ``get_body()``. """ - def set_body(self, key, body): + + def set_body(self, key: str, body: bytes) -> None: raise NotImplementedError() - def get_body(self, key): + def get_body(self, key: str) -> IO[bytes] | None: """ Return the body as file-like object. 
""" diff --git a/lib/cachecontrol/caches/__init__.py b/lib/cachecontrol/caches/__init__.py index 37827291..44a1af87 100644 --- a/lib/cachecontrol/caches/__init__.py +++ b/lib/cachecontrol/caches/__init__.py @@ -2,8 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from .file_cache import FileCache, SeparateBodyFileCache -from .redis_cache import RedisCache - +from cachecontrol.caches.file_cache import FileCache, SeparateBodyFileCache +from cachecontrol.caches.redis_cache import RedisCache __all__ = ["FileCache", "SeparateBodyFileCache", "RedisCache"] diff --git a/lib/cachecontrol/caches/file_cache.py b/lib/cachecontrol/caches/file_cache.py index f5cc2c8f..86213500 100644 --- a/lib/cachecontrol/caches/file_cache.py +++ b/lib/cachecontrol/caches/file_cache.py @@ -1,22 +1,25 @@ # SPDX-FileCopyrightText: 2015 Eric Larson # # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import gc import hashlib import os from textwrap import dedent +from typing import IO, TYPE_CHECKING, Union +from pathlib import Path -from ..cache import BaseCache, SeparateBodyBaseCache -from ..controller import CacheController +from cachecontrol.cache import BaseCache, SeparateBodyBaseCache +from cachecontrol.controller import CacheController -try: - FileNotFoundError -except NameError: - # py2.X - FileNotFoundError = (IOError, OSError) +if TYPE_CHECKING: + from datetime import datetime + + from filelock import BaseFileLock -def _secure_open_write(filename, fmode): +def _secure_open_write(filename: str, fmode: int) -> IO[bytes]: # We only want to write to this file, so open it in write only mode flags = os.O_WRONLY @@ -40,7 +43,7 @@ def _secure_open_write(filename, fmode): try: os.remove(filename) gc.collect(2) - except (IOError, OSError): + except OSError: # The file must not exist already, so we can just skip ahead to opening pass @@ -63,23 +66,23 @@ class _FileCacheMixin: def __init__( self, - directory, - forever=False, - filemode=0o0600, - dirmode=0o0700, - lock_class=None, - ): - + directory: Union[str, Path], + forever: bool = False, + filemode: int = 0o0600, + dirmode: int = 0o0700, + lock_class: type[BaseFileLock] | None = None, + ) -> None: try: if lock_class is None: from filelock import FileLock + lock_class = FileLock except ImportError: notice = dedent( """ NOTE: In order to use the FileCache you must have filelock installed. You can install it via pip: - pip install filelock + pip install cachecontrol[filecache] """ ) raise ImportError(notice) @@ -91,17 +94,17 @@ class _FileCacheMixin: self.lock_class = lock_class @staticmethod - def encode(x): + def encode(x: str) -> str: return hashlib.sha224(x.encode()).hexdigest() - def _fn(self, name): + def _fn(self, name: str) -> str: # NOTE: This method should not change as some may depend on it. # See: https://github.com/ionrock/cachecontrol/issues/63 hashed = self.encode(name) parts = list(hashed[:5]) + [hashed] return os.path.join(self.directory, *parts) - def get(self, key): + def get(self, key: str) -> bytes | None: name = self._fn(key) try: with open(name, "rb") as fh: @@ -110,18 +113,20 @@ class _FileCacheMixin: except FileNotFoundError: return None - def set(self, key, value, expires=None): + def set( + self, key: str, value: bytes, expires: int | datetime | None = None + ) -> None: name = self._fn(key) self._write(name, value) - def _write(self, path, data: bytes): + def _write(self, path: str, data: bytes) -> None: """ Safely write the data to the given path. 
""" # Make sure the directory exists try: os.makedirs(os.path.dirname(path), self.dirmode) - except (IOError, OSError): + except OSError: pass with self.lock_class(path + ".lock"): @@ -129,7 +134,7 @@ class _FileCacheMixin: with _secure_open_write(path, self.filemode) as fh: fh.write(data) - def _delete(self, key, suffix): + def _delete(self, key: str, suffix: str) -> None: name = self._fn(key) + suffix if not self.forever: try: @@ -144,7 +149,7 @@ class FileCache(_FileCacheMixin, BaseCache): downloads. """ - def delete(self, key): + def delete(self, key: str) -> None: self._delete(key, "") @@ -154,23 +159,23 @@ class SeparateBodyFileCache(_FileCacheMixin, SeparateBodyBaseCache): peak memory usage. """ - def get_body(self, key): + def get_body(self, key: str) -> IO[bytes] | None: name = self._fn(key) + ".body" try: return open(name, "rb") except FileNotFoundError: return None - def set_body(self, key, body): + def set_body(self, key: str, body: bytes) -> None: name = self._fn(key) + ".body" self._write(name, body) - def delete(self, key): + def delete(self, key: str) -> None: self._delete(key, "") self._delete(key, ".body") -def url_to_file_path(url, filecache): +def url_to_file_path(url: str, filecache: FileCache) -> str: """Return the file cache path based on the URL. This does not ensure the file exists! diff --git a/lib/cachecontrol/caches/redis_cache.py b/lib/cachecontrol/caches/redis_cache.py index 7bcb38a2..f859e719 100644 --- a/lib/cachecontrol/caches/redis_cache.py +++ b/lib/cachecontrol/caches/redis_cache.py @@ -1,39 +1,48 @@ # SPDX-FileCopyrightText: 2015 Eric Larson # # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations -from __future__ import division -from datetime import datetime +from datetime import datetime, timezone +from typing import TYPE_CHECKING + from cachecontrol.cache import BaseCache +if TYPE_CHECKING: + from redis import Redis + class RedisCache(BaseCache): - - def __init__(self, conn): + def __init__(self, conn: Redis[bytes]) -> None: self.conn = conn - def get(self, key): + def get(self, key: str) -> bytes | None: return self.conn.get(key) - def set(self, key, value, expires=None): + def set( + self, key: str, value: bytes, expires: int | datetime | None = None + ) -> None: if not expires: self.conn.set(key, value) elif isinstance(expires, datetime): - expires = expires - datetime.utcnow() - self.conn.setex(key, int(expires.total_seconds()), value) + now_utc = datetime.now(timezone.utc) + if expires.tzinfo is None: + now_utc = now_utc.replace(tzinfo=None) + delta = expires - now_utc + self.conn.setex(key, int(delta.total_seconds()), value) else: self.conn.setex(key, expires, value) - def delete(self, key): + def delete(self, key: str) -> None: self.conn.delete(key) - def clear(self): + def clear(self) -> None: """Helper for clearing all the keys in a database. 
Use with caution!""" for key in self.conn.keys(): self.conn.delete(key) - def close(self): + def close(self) -> None: """Redis uses connection pooling, no need to close the connection.""" pass diff --git a/lib/cachecontrol/compat.py b/lib/cachecontrol/compat.py deleted file mode 100644 index 72c456cf..00000000 --- a/lib/cachecontrol/compat.py +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-FileCopyrightText: 2015 Eric Larson -# -# SPDX-License-Identifier: Apache-2.0 - -try: - from urllib.parse import urljoin -except ImportError: - from urlparse import urljoin - - -try: - import cPickle as pickle -except ImportError: - import pickle - -# Handle the case where the requests module has been patched to not have -# urllib3 bundled as part of its source. -try: - from requests.packages.urllib3.response import HTTPResponse -except ImportError: - from urllib3.response import HTTPResponse - -try: - from requests.packages.urllib3.util import is_fp_closed -except ImportError: - from urllib3.util import is_fp_closed - -# Replicate some six behaviour -try: - text_type = unicode -except NameError: - text_type = str diff --git a/lib/cachecontrol/controller.py b/lib/cachecontrol/controller.py index 48db4401..4f54a121 100644 --- a/lib/cachecontrol/controller.py +++ b/lib/cachecontrol/controller.py @@ -5,17 +5,27 @@ """ The httplib2 algorithms ported for use with requests. """ +from __future__ import annotations + +import calendar import logging import re -import calendar import time from email.utils import parsedate_tz +from typing import TYPE_CHECKING, Collection, Mapping from requests.structures import CaseInsensitiveDict -from .cache import DictCache, SeparateBodyBaseCache -from .serialize import Serializer +from cachecontrol.cache import DictCache, SeparateBodyBaseCache +from cachecontrol.serialize import Serializer +if TYPE_CHECKING: + from typing import Literal + + from requests import PreparedRequest + from urllib3 import HTTPResponse + + from cachecontrol.cache import BaseCache logger = logging.getLogger(__name__) @@ -24,20 +34,26 @@ URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?") PERMANENT_REDIRECT_STATUSES = (301, 308) -def parse_uri(uri): +def parse_uri(uri: str) -> tuple[str, str, str, str, str]: """Parses a URI using the regex given in Appendix B of RFC 3986. 
(scheme, authority, path, query, fragment) = parse_uri(uri) """ - groups = URI.match(uri).groups() + match = URI.match(uri) + assert match is not None + groups = match.groups() return (groups[1], groups[3], groups[4], groups[6], groups[8]) -class CacheController(object): +class CacheController: """An interface to see if request should cached or not.""" def __init__( - self, cache=None, cache_etags=True, serializer=None, status_codes=None + self, + cache: BaseCache | None = None, + cache_etags: bool = True, + serializer: Serializer | None = None, + status_codes: Collection[int] | None = None, ): self.cache = DictCache() if cache is None else cache self.cache_etags = cache_etags @@ -45,7 +61,7 @@ class CacheController(object): self.cacheable_status_codes = status_codes or (200, 203, 300, 301, 308) @classmethod - def _urlnorm(cls, uri): + def _urlnorm(cls, uri: str) -> str: """Normalize the URL to create a safe key for the cache""" (scheme, authority, path, query, fragment) = parse_uri(uri) if not scheme or not authority: @@ -65,10 +81,10 @@ class CacheController(object): return defrag_uri @classmethod - def cache_url(cls, uri): + def cache_url(cls, uri: str) -> str: return cls._urlnorm(uri) - def parse_cache_control(self, headers): + def parse_cache_control(self, headers: Mapping[str, str]) -> dict[str, int | None]: known_directives = { # https://tools.ietf.org/html/rfc7234#section-5.2 "max-age": (int, True), @@ -87,7 +103,7 @@ class CacheController(object): cc_headers = headers.get("cache-control", headers.get("Cache-Control", "")) - retval = {} + retval: dict[str, int | None] = {} for cc_directive in cc_headers.split(","): if not cc_directive.strip(): @@ -122,11 +138,12 @@ class CacheController(object): return retval - def _load_from_cache(self, request): + def _load_from_cache(self, request: PreparedRequest) -> HTTPResponse | None: """ Load a cached response, or return None if it's not available. """ cache_url = request.url + assert cache_url is not None cache_data = self.cache.get(cache_url) if cache_data is None: logger.debug("No cache entry available") @@ -142,11 +159,12 @@ class CacheController(object): logger.warning("Cache entry deserialization failed, entry ignored") return result - def cached_request(self, request): + def cached_request(self, request: PreparedRequest) -> HTTPResponse | Literal[False]: """ Return a cached response if it exists in the cache, otherwise return False. 
""" + assert request.url is not None cache_url = self.cache_url(request.url) logger.debug('Looking up "%s" in the cache', cache_url) cc = self.parse_cache_control(request.headers) @@ -182,7 +200,7 @@ class CacheController(object): logger.debug(msg) return resp - headers = CaseInsensitiveDict(resp.headers) + headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(resp.headers) if not headers or "date" not in headers: if "etag" not in headers: # Without date or etag, the cached response can never be used @@ -193,7 +211,9 @@ class CacheController(object): return False now = time.time() - date = calendar.timegm(parsedate_tz(headers["date"])) + time_tuple = parsedate_tz(headers["date"]) + assert time_tuple is not None + date = calendar.timegm(time_tuple[:6]) current_age = max(0, now - date) logger.debug("Current age based on date: %i", current_age) @@ -207,28 +227,30 @@ class CacheController(object): freshness_lifetime = 0 # Check the max-age pragma in the cache control header - if "max-age" in resp_cc: - freshness_lifetime = resp_cc["max-age"] + max_age = resp_cc.get("max-age") + if max_age is not None: + freshness_lifetime = max_age logger.debug("Freshness lifetime from max-age: %i", freshness_lifetime) # If there isn't a max-age, check for an expires header elif "expires" in headers: expires = parsedate_tz(headers["expires"]) if expires is not None: - expire_time = calendar.timegm(expires) - date + expire_time = calendar.timegm(expires[:6]) - date freshness_lifetime = max(0, expire_time) logger.debug("Freshness lifetime from expires: %i", freshness_lifetime) # Determine if we are setting freshness limit in the # request. Note, this overrides what was in the response. - if "max-age" in cc: - freshness_lifetime = cc["max-age"] + max_age = cc.get("max-age") + if max_age is not None: + freshness_lifetime = max_age logger.debug( "Freshness lifetime from request max-age: %i", freshness_lifetime ) - if "min-fresh" in cc: - min_fresh = cc["min-fresh"] + min_fresh = cc.get("min-fresh") + if min_fresh is not None: # adjust our current age by our min fresh current_age += min_fresh logger.debug("Adjusted current age from min-fresh: %i", current_age) @@ -247,12 +269,12 @@ class CacheController(object): # return the original handler return False - def conditional_headers(self, request): + def conditional_headers(self, request: PreparedRequest) -> dict[str, str]: resp = self._load_from_cache(request) new_headers = {} if resp: - headers = CaseInsensitiveDict(resp.headers) + headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(resp.headers) if "etag" in headers: new_headers["If-None-Match"] = headers["ETag"] @@ -262,7 +284,14 @@ class CacheController(object): return new_headers - def _cache_set(self, cache_url, request, response, body=None, expires_time=None): + def _cache_set( + self, + cache_url: str, + request: PreparedRequest, + response: HTTPResponse, + body: bytes | None = None, + expires_time: int | None = None, + ) -> None: """ Store the data in the cache. """ @@ -285,7 +314,13 @@ class CacheController(object): expires=expires_time, ) - def cache_response(self, request, response, body=None, status_codes=None): + def cache_response( + self, + request: PreparedRequest, + response: HTTPResponse, + body: bytes | None = None, + status_codes: Collection[int] | None = None, + ) -> None: """ Algorithm for caching requests. 
@@ -300,10 +335,14 @@ class CacheController(object): ) return - response_headers = CaseInsensitiveDict(response.headers) + response_headers: CaseInsensitiveDict[str] = CaseInsensitiveDict( + response.headers + ) if "date" in response_headers: - date = calendar.timegm(parsedate_tz(response_headers["date"])) + time_tuple = parsedate_tz(response_headers["date"]) + assert time_tuple is not None + date = calendar.timegm(time_tuple[:6]) else: date = 0 @@ -322,6 +361,7 @@ class CacheController(object): cc_req = self.parse_cache_control(request.headers) cc = self.parse_cache_control(response_headers) + assert request.url is not None cache_url = self.cache_url(request.url) logger.debug('Updating cache with response from "%s"', cache_url) @@ -354,11 +394,11 @@ class CacheController(object): if response_headers.get("expires"): expires = parsedate_tz(response_headers["expires"]) if expires is not None: - expires_time = calendar.timegm(expires) - date + expires_time = calendar.timegm(expires[:6]) - date expires_time = max(expires_time, 14 * 86400) - logger.debug("etag object cached for {0} seconds".format(expires_time)) + logger.debug(f"etag object cached for {expires_time} seconds") logger.debug("Caching due to etag") self._cache_set(cache_url, request, response, body, expires_time) @@ -372,11 +412,14 @@ class CacheController(object): # is no date header then we can't do anything about expiring # the cache. elif "date" in response_headers: - date = calendar.timegm(parsedate_tz(response_headers["date"])) + time_tuple = parsedate_tz(response_headers["date"]) + assert time_tuple is not None + date = calendar.timegm(time_tuple[:6]) # cache when there is a max-age > 0 - if "max-age" in cc and cc["max-age"] > 0: + max_age = cc.get("max-age") + if max_age is not None and max_age > 0: logger.debug("Caching b/c date exists and max-age > 0") - expires_time = cc["max-age"] + expires_time = max_age self._cache_set( cache_url, request, @@ -391,12 +434,12 @@ class CacheController(object): if response_headers["expires"]: expires = parsedate_tz(response_headers["expires"]) if expires is not None: - expires_time = calendar.timegm(expires) - date + expires_time = calendar.timegm(expires[:6]) - date else: expires_time = None logger.debug( - "Caching b/c of expires header. expires in {0} seconds".format( + "Caching b/c of expires header. expires in {} seconds".format( expires_time ) ) @@ -408,13 +451,16 @@ class CacheController(object): expires_time, ) - def update_cached_response(self, request, response): + def update_cached_response( + self, request: PreparedRequest, response: HTTPResponse + ) -> HTTPResponse: """On a 304 we will get a new set of headers that we want to update our cached value with, assuming we have one. This should only ever be called when we've sent an ETag and gotten a 304 as the response. 
""" + assert request.url is not None cache_url = self.cache_url(request.url) cached_response = self._load_from_cache(request) @@ -432,11 +478,11 @@ class CacheController(object): excluded_headers = ["content-length"] cached_response.headers.update( - dict( - (k, v) - for k, v in response.headers.items() + { + k: v + for k, v in response.headers.items() # type: ignore[no-untyped-call] if k.lower() not in excluded_headers - ) + } ) # we want a 200 b/c we have content via the cache diff --git a/lib/cachecontrol/filewrapper.py b/lib/cachecontrol/filewrapper.py index f5ed5f6f..25143902 100644 --- a/lib/cachecontrol/filewrapper.py +++ b/lib/cachecontrol/filewrapper.py @@ -1,12 +1,17 @@ # SPDX-FileCopyrightText: 2015 Eric Larson # # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations -from tempfile import NamedTemporaryFile import mmap +from tempfile import NamedTemporaryFile +from typing import TYPE_CHECKING, Any, Callable + +if TYPE_CHECKING: + from http.client import HTTPResponse -class CallbackFileWrapper(object): +class CallbackFileWrapper: """ Small wrapper around a fp object which will tee everything read into a buffer, and when that file is closed it will execute a callback with the @@ -25,12 +30,14 @@ class CallbackFileWrapper(object): performance impact. """ - def __init__(self, fp, callback): + def __init__( + self, fp: HTTPResponse, callback: Callable[[bytes], None] | None + ) -> None: self.__buf = NamedTemporaryFile("rb+", delete=True) self.__fp = fp self.__callback = callback - def __getattr__(self, name): + def __getattr__(self, name: str) -> Any: # The vaguaries of garbage collection means that self.__fp is # not always set. By using __getattribute__ and the private # name[0] allows looking up the attribute value and raising an @@ -42,7 +49,7 @@ class CallbackFileWrapper(object): fp = self.__getattribute__("_CallbackFileWrapper__fp") return getattr(fp, name) - def __is_fp_closed(self): + def __is_fp_closed(self) -> bool: try: return self.__fp.fp is None @@ -50,7 +57,8 @@ class CallbackFileWrapper(object): pass try: - return self.__fp.closed + closed: bool = self.__fp.closed + return closed except AttributeError: pass @@ -59,7 +67,7 @@ class CallbackFileWrapper(object): # TODO: Add some logging here... return False - def _close(self): + def _close(self) -> None: if self.__callback: if self.__buf.tell() == 0: # Empty file: @@ -86,8 +94,8 @@ class CallbackFileWrapper(object): # Important when caching big files. self.__buf.close() - def read(self, amt=None): - data = self.__fp.read(amt) + def read(self, amt: int | None = None) -> bytes: + data: bytes = self.__fp.read(amt) if data: # We may be dealing with b'', a sign that things are over: # it's passed e.g. after we've already closed self.__buf. @@ -97,8 +105,8 @@ class CallbackFileWrapper(object): return data - def _safe_read(self, amt): - data = self.__fp._safe_read(amt) + def _safe_read(self, amt: int) -> bytes: + data: bytes = self.__fp._safe_read(amt) # type: ignore[attr-defined] if amt == 2 and data == b"\r\n": # urllib executes this read to toss the CRLF at the end # of the chunk. 
diff --git a/lib/cachecontrol/heuristics.py b/lib/cachecontrol/heuristics.py index ebe4a96f..323262be 100644 --- a/lib/cachecontrol/heuristics.py +++ b/lib/cachecontrol/heuristics.py @@ -1,29 +1,31 @@ # SPDX-FileCopyrightText: 2015 Eric Larson # # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import calendar import time - +from datetime import datetime, timedelta, timezone from email.utils import formatdate, parsedate, parsedate_tz +from typing import TYPE_CHECKING, Any, Mapping -from datetime import datetime, timedelta +if TYPE_CHECKING: + from urllib3 import HTTPResponse TIME_FMT = "%a, %d %b %Y %H:%M:%S GMT" -def expire_after(delta, date=None): - date = date or datetime.utcnow() +def expire_after(delta: timedelta, date: datetime | None = None) -> datetime: + date = date or datetime.now(timezone.utc) return date + delta -def datetime_to_header(dt): +def datetime_to_header(dt: datetime) -> str: return formatdate(calendar.timegm(dt.timetuple())) -class BaseHeuristic(object): - - def warning(self, response): +class BaseHeuristic: + def warning(self, response: HTTPResponse) -> str | None: """ Return a valid 1xx warning header value describing the cache adjustments. @@ -34,7 +36,7 @@ class BaseHeuristic(object): """ return '110 - "Response is Stale"' - def update_headers(self, response): + def update_headers(self, response: HTTPResponse) -> dict[str, str]: """Update the response headers with any new headers. NOTE: This SHOULD always include some Warning header to @@ -43,7 +45,7 @@ class BaseHeuristic(object): """ return {} - def apply(self, response): + def apply(self, response: HTTPResponse) -> HTTPResponse: updated_headers = self.update_headers(response) if updated_headers: @@ -61,12 +63,12 @@ class OneDayCache(BaseHeuristic): future. """ - def update_headers(self, response): + def update_headers(self, response: HTTPResponse) -> dict[str, str]: headers = {} if "expires" not in response.headers: date = parsedate(response.headers["date"]) - expires = expire_after(timedelta(days=1), date=datetime(*date[:6])) + expires = expire_after(timedelta(days=1), date=datetime(*date[:6], tzinfo=timezone.utc)) # type: ignore[misc] headers["expires"] = datetime_to_header(expires) headers["cache-control"] = "public" return headers @@ -77,14 +79,14 @@ class ExpiresAfter(BaseHeuristic): Cache **all** requests for a defined time period. """ - def __init__(self, **kw): + def __init__(self, **kw: Any) -> None: self.delta = timedelta(**kw) - def update_headers(self, response): + def update_headers(self, response: HTTPResponse) -> dict[str, str]: expires = expire_after(self.delta) return {"expires": datetime_to_header(expires), "cache-control": "public"} - def warning(self, response): + def warning(self, response: HTTPResponse) -> str | None: tmpl = "110 - Automatically cached for %s. Response might be stale" return tmpl % self.delta @@ -101,12 +103,23 @@ class LastModified(BaseHeuristic): http://lxr.mozilla.org/mozilla-release/source/netwerk/protocol/http/nsHttpResponseHead.cpp#397 Unlike mozilla we limit this to 24-hr. 
""" + cacheable_by_default_statuses = { - 200, 203, 204, 206, 300, 301, 404, 405, 410, 414, 501 + 200, + 203, + 204, + 206, + 300, + 301, + 404, + 405, + 410, + 414, + 501, } - def update_headers(self, resp): - headers = resp.headers + def update_headers(self, resp: HTTPResponse) -> dict[str, str]: + headers: Mapping[str, str] = resp.headers if "expires" in headers: return {} @@ -120,9 +133,11 @@ class LastModified(BaseHeuristic): if "date" not in headers or "last-modified" not in headers: return {} - date = calendar.timegm(parsedate_tz(headers["date"])) + time_tuple = parsedate_tz(headers["date"]) + assert time_tuple is not None + date = calendar.timegm(time_tuple[:6]) last_modified = parsedate(headers["last-modified"]) - if date is None or last_modified is None: + if last_modified is None: return {} now = time.time() @@ -135,5 +150,5 @@ class LastModified(BaseHeuristic): expires = date + freshness_lifetime return {"expires": time.strftime(TIME_FMT, time.gmtime(expires))} - def warning(self, resp): + def warning(self, resp: HTTPResponse) -> str | None: return None diff --git a/lib/cachecontrol/py.typed b/lib/cachecontrol/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/lib/cachecontrol/serialize.py b/lib/cachecontrol/serialize.py index 70135d39..99045a0a 100644 --- a/lib/cachecontrol/serialize.py +++ b/lib/cachecontrol/serialize.py @@ -1,105 +1,91 @@ # SPDX-FileCopyrightText: 2015 Eric Larson # # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations -import base64 import io -import json -import zlib +from typing import IO, TYPE_CHECKING, Any, Mapping, cast import msgpack from requests.structures import CaseInsensitiveDict +from urllib3 import HTTPResponse -from .compat import HTTPResponse, pickle, text_type +if TYPE_CHECKING: + from requests import PreparedRequest -def _b64_decode_bytes(b): - return base64.b64decode(b.encode("ascii")) +class Serializer: + serde_version = "4" - -def _b64_decode_str(s): - return _b64_decode_bytes(s).decode("utf8") - - -_default_body_read = object() - - -class Serializer(object): - def dumps(self, request, response, body=None): - response_headers = CaseInsensitiveDict(response.headers) + def dumps( + self, + request: PreparedRequest, + response: HTTPResponse, + body: bytes | None = None, + ) -> bytes: + response_headers: CaseInsensitiveDict[str] = CaseInsensitiveDict( + response.headers + ) if body is None: # When a body isn't passed in, we'll read the response. We # also update the response with a new file handler to be # sure it acts as though it was never read. body = response.read(decode_content=False) - response._fp = io.BytesIO(body) + response._fp = io.BytesIO(body) # type: ignore[attr-defined] + response.length_remaining = len(body) - # NOTE: This is all a bit weird, but it's really important that on - # Python 2.x these objects are unicode and not str, even when - # they contain only ascii. The problem here is that msgpack - # understands the difference between unicode and bytes and we - # have it set to differentiate between them, however Python 2 - # doesn't know the difference. Forcing these to unicode will be - # enough to have msgpack know the difference. 
data = { - u"response": { - u"body": body, # Empty bytestring if body is stored separately - u"headers": dict( - (text_type(k), text_type(v)) for k, v in response.headers.items() - ), - u"status": response.status, - u"version": response.version, - u"reason": text_type(response.reason), - u"strict": response.strict, - u"decode_content": response.decode_content, + "response": { + "body": body, # Empty bytestring if body is stored separately + "headers": {str(k): str(v) for k, v in response.headers.items()}, # type: ignore[no-untyped-call] + "status": response.status, + "version": response.version, + "reason": str(response.reason), + "decode_content": response.decode_content, } } # Construct our vary headers - data[u"vary"] = {} - if u"vary" in response_headers: - varied_headers = response_headers[u"vary"].split(",") + data["vary"] = {} + if "vary" in response_headers: + varied_headers = response_headers["vary"].split(",") for header in varied_headers: - header = text_type(header).strip() + header = str(header).strip() header_value = request.headers.get(header, None) if header_value is not None: - header_value = text_type(header_value) - data[u"vary"][header] = header_value + header_value = str(header_value) + data["vary"][header] = header_value - return b",".join([b"cc=4", msgpack.dumps(data, use_bin_type=True)]) + return b",".join([f"cc={self.serde_version}".encode(), self.serialize(data)]) - def loads(self, request, data, body_file=None): + def serialize(self, data: dict[str, Any]) -> bytes: + return cast(bytes, msgpack.dumps(data, use_bin_type=True)) + + def loads( + self, + request: PreparedRequest, + data: bytes, + body_file: IO[bytes] | None = None, + ) -> HTTPResponse | None: # Short circuit if we've been given an empty set of data if not data: - return + return None - # Determine what version of the serializer the data was serialized - # with - try: - ver, data = data.split(b",", 1) - except ValueError: - ver = b"cc=0" + # Previous versions of this library supported other serialization + # formats, but these have all been removed. + if not data.startswith(f"cc={self.serde_version},".encode()): + return None - # Make sure that our "ver" is actually a version and isn't a false - # positive from a , being in the data stream. - if ver[:3] != b"cc=": - data = ver + data - ver = b"cc=0" + data = data[5:] + return self._loads_v4(request, data, body_file) - # Get the version number out of the cc=N - ver = ver.split(b"=", 1)[-1].decode("ascii") - - # Dispatch to the actual load method for the given version - try: - return getattr(self, "_loads_v{}".format(ver))(request, data, body_file) - - except AttributeError: - # This is a version we don't have a loads function for, so we'll - # just treat it as a miss and return None - return - - def prepare_response(self, request, cached, body_file=None): + def prepare_response( + self, + request: PreparedRequest, + cached: Mapping[str, Any], + body_file: IO[bytes] | None = None, + ) -> HTTPResponse | None: """Verify our vary headers match and construct a real urllib3 HTTPResponse object. """ @@ -108,23 +94,26 @@ class Serializer(object): # This case is also handled in the controller code when creating # a cache entry, but is left here for backwards compatibility. 
if "*" in cached.get("vary", {}): - return + return None # Ensure that the Vary headers for the cached response match our # request for header, value in cached.get("vary", {}).items(): if request.headers.get(header, None) != value: - return + return None body_raw = cached["response"].pop("body") - headers = CaseInsensitiveDict(data=cached["response"]["headers"]) + headers: CaseInsensitiveDict[str] = CaseInsensitiveDict( + data=cached["response"]["headers"] + ) if headers.get("transfer-encoding", "") == "chunked": headers.pop("transfer-encoding") cached["response"]["headers"] = headers try: + body: IO[bytes] if body_file is None: body = io.BytesIO(body_raw) else: @@ -138,53 +127,20 @@ class Serializer(object): # TypeError: 'str' does not support the buffer interface body = io.BytesIO(body_raw.encode("utf8")) + # Discard any `strict` parameter serialized by older version of cachecontrol. + cached["response"].pop("strict", None) + return HTTPResponse(body=body, preload_content=False, **cached["response"]) - def _loads_v0(self, request, data, body_file=None): - # The original legacy cache data. This doesn't contain enough - # information to construct everything we need, so we'll treat this as - # a miss. - return - - def _loads_v1(self, request, data, body_file=None): - try: - cached = pickle.loads(data) - except ValueError: - return - - return self.prepare_response(request, cached, body_file) - - def _loads_v2(self, request, data, body_file=None): - assert body_file is None - try: - cached = json.loads(zlib.decompress(data).decode("utf8")) - except (ValueError, zlib.error): - return - - # We need to decode the items that we've base64 encoded - cached["response"]["body"] = _b64_decode_bytes(cached["response"]["body"]) - cached["response"]["headers"] = dict( - (_b64_decode_str(k), _b64_decode_str(v)) - for k, v in cached["response"]["headers"].items() - ) - cached["response"]["reason"] = _b64_decode_str(cached["response"]["reason"]) - cached["vary"] = dict( - (_b64_decode_str(k), _b64_decode_str(v) if v is not None else v) - for k, v in cached["vary"].items() - ) - - return self.prepare_response(request, cached, body_file) - - def _loads_v3(self, request, data, body_file): - # Due to Python 2 encoding issues, it's impossible to know for sure - # exactly how to load v3 entries, thus we'll treat these as a miss so - # that they get rewritten out as v4 entries. 
- return - - def _loads_v4(self, request, data, body_file=None): + def _loads_v4( + self, + request: PreparedRequest, + data: bytes, + body_file: IO[bytes] | None = None, + ) -> HTTPResponse | None: try: cached = msgpack.loads(data, raw=False) except ValueError: - return + return None return self.prepare_response(request, cached, body_file) diff --git a/lib/cachecontrol/wrapper.py b/lib/cachecontrol/wrapper.py index b6ee7f20..37ee07c7 100644 --- a/lib/cachecontrol/wrapper.py +++ b/lib/cachecontrol/wrapper.py @@ -1,22 +1,32 @@ # SPDX-FileCopyrightText: 2015 Eric Larson # # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations -from .adapter import CacheControlAdapter -from .cache import DictCache +from typing import TYPE_CHECKING, Collection + +from cachecontrol.adapter import CacheControlAdapter +from cachecontrol.cache import DictCache + +if TYPE_CHECKING: + import requests + + from cachecontrol.cache import BaseCache + from cachecontrol.controller import CacheController + from cachecontrol.heuristics import BaseHeuristic + from cachecontrol.serialize import Serializer def CacheControl( - sess, - cache=None, - cache_etags=True, - serializer=None, - heuristic=None, - controller_class=None, - adapter_class=None, - cacheable_methods=None, -): - + sess: requests.Session, + cache: BaseCache | None = None, + cache_etags: bool = True, + serializer: Serializer | None = None, + heuristic: BaseHeuristic | None = None, + controller_class: type[CacheController] | None = None, + adapter_class: type[CacheControlAdapter] | None = None, + cacheable_methods: Collection[str] | None = None, +) -> requests.Session: cache = DictCache() if cache is None else cache adapter_class = adapter_class or CacheControlAdapter adapter = adapter_class(
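
For context, a minimal usage sketch of the upgraded package as it is wired up by the wrapper above; the cache directory name and URL are arbitrary choices for this example:

```python
import requests

from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache

# Wrap a plain requests session so cacheable GET responses are stored on disk.
# ".web_cache" is an arbitrary directory name chosen for this sketch.
sess = CacheControl(requests.Session(), cache=FileCache(".web_cache"))

resp = sess.get("https://example.com/")  # fetched over the network
resp = sess.get("https://example.com/")  # may now be answered from the cache
# The adapter attaches a from_cache attribute to every response it builds.
print(getattr(resp, "from_cache", False))
```

The new `lib/cachecontrol/py.typed` marker added in this diff also means type checkers will pick up the annotations introduced throughout these modules.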