From ce193ffcdb7d1cb8fb858fec586d548f83a3d49c Mon Sep 17 00:00:00 2001 From: echel0n Date: Sun, 16 Mar 2014 04:06:21 -0700 Subject: [PATCH] Replaced our cache handler 'CacheControl' with 'httpcache' as we found the previous was not stable enough and was causing more issues than good. Added cache handler to sickbeard itself so now everything should be running faster for searches and rss feeds. --- lib/cachecontrol/__init__.py | 13 -- lib/cachecontrol/adapter.py | 70 ------- lib/cachecontrol/cache.py | 36 ---- lib/cachecontrol/caches/__init__.py | 18 -- lib/cachecontrol/caches/file_cache.py | 43 ----- lib/cachecontrol/caches/redis_cache.py | 46 ----- lib/cachecontrol/compat.py | 12 -- lib/cachecontrol/controller.py | 256 ------------------------- lib/cachecontrol/patch_requests.py | 56 ------ lib/cachecontrol/wrapper.py | 10 - lib/httpcache/__init__.py | 14 ++ lib/httpcache/adapter.py | 55 ++++++ lib/httpcache/cache.py | 207 ++++++++++++++++++++ lib/httpcache/compat.py | 10 + lib/httpcache/structures.py | 59 ++++++ lib/httpcache/utils.py | 97 ++++++++++ lib/requests/adapters.py | 5 +- lib/tvdb_api/tvdb_api.py | 16 +- lib/tvrage_api/tvrage_api.py | 11 +- sickbeard/helpers.py | 15 +- sickbeard/indexers/test/test.py | 15 +- 21 files changed, 474 insertions(+), 590 deletions(-) delete mode 100644 lib/cachecontrol/__init__.py delete mode 100644 lib/cachecontrol/adapter.py delete mode 100644 lib/cachecontrol/cache.py delete mode 100644 lib/cachecontrol/caches/__init__.py delete mode 100644 lib/cachecontrol/caches/file_cache.py delete mode 100644 lib/cachecontrol/caches/redis_cache.py delete mode 100644 lib/cachecontrol/compat.py delete mode 100644 lib/cachecontrol/controller.py delete mode 100644 lib/cachecontrol/patch_requests.py delete mode 100644 lib/cachecontrol/wrapper.py create mode 100644 lib/httpcache/__init__.py create mode 100644 lib/httpcache/adapter.py create mode 100644 lib/httpcache/cache.py create mode 100644 lib/httpcache/compat.py create mode 100644 
lib/httpcache/structures.py create mode 100644 lib/httpcache/utils.py diff --git a/lib/cachecontrol/__init__.py b/lib/cachecontrol/__init__.py deleted file mode 100644 index 693e11f1..00000000 --- a/lib/cachecontrol/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -"""CacheControl import Interface. - -Make it easy to import from cachecontrol without long namespaces. -""" - -# patch our requests.models.Response to make them pickleable in older -# versions of requests. - -import cachecontrol.patch_requests - -from cachecontrol.wrapper import CacheControl -from cachecontrol.adapter import CacheControlAdapter -from cachecontrol.controller import CacheController diff --git a/lib/cachecontrol/adapter.py b/lib/cachecontrol/adapter.py deleted file mode 100644 index e990746c..00000000 --- a/lib/cachecontrol/adapter.py +++ /dev/null @@ -1,70 +0,0 @@ -from requests.adapters import HTTPAdapter - -from cachecontrol.controller import CacheController -from cachecontrol.cache import DictCache - - -class CacheControlAdapter(HTTPAdapter): - invalidating_methods = set(['PUT', 'DELETE']) - - def __init__(self, cache=None, cache_etags=True, cache_force=False, *args, **kw): - super(CacheControlAdapter, self).__init__(*args, **kw) - self.cache = cache or DictCache() - self.controller = CacheController(self.cache, cache_etags=cache_etags, cache_force=cache_force) - - def send(self, request, **kw): - """Send a request. Use the request information to see if it - exists in the cache. - """ - if request.method == 'GET': - cached_response = self.controller.cached_request( - request.url, request.headers - ) - if cached_response: - # Cached responses should not have a raw field since - # they *cannot* be created from some stream. 
- cached_response.raw = None - return cached_response - - # check for etags and add headers if appropriate - headers = self.controller.add_headers(request.url) - request.headers.update(headers) - - resp = super(CacheControlAdapter, self).send(request, **kw) - return resp - - def build_response(self, request, response): - """Build a response by making a request or using the cache. - - This will end up calling send and returning a potentially - cached response - """ - resp = super(CacheControlAdapter, self).build_response( - request, response - ) - - # See if we should invalidate the cache. - if request.method in self.invalidating_methods and resp.ok: - cache_url = self.controller.cache_url(request.url) - self.cache.delete(cache_url) - - # Try to store the response if it is a GET - elif request.method == 'GET': - if response.status == 304: - # We must have sent an ETag request. This could mean - # that we've been expired already or that we simply - # have an etag. In either case, we want to try and - # update the cache if that is the case. - resp = self.controller.update_cached_response( - request, response - ) - else: - # try to cache the response - self.controller.cache_response(request, resp) - - # Give the request a from_cache attr to let people use it - # rather than testing for hasattr. - if not hasattr(resp, 'from_cache'): - resp.from_cache = False - - return resp diff --git a/lib/cachecontrol/cache.py b/lib/cachecontrol/cache.py deleted file mode 100644 index feb7d3ed..00000000 --- a/lib/cachecontrol/cache.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -The cache object API for implementing caches. The default is just a -dictionary, which in turns means it is not threadsafe for writing. 
-""" -from threading import Lock - - -class BaseCache(object): - - def get(self, key): - raise NotImplemented() - - def set(self, key, value): - raise NotImplemented() - - def delete(self, key): - raise NotImplemented() - - -class DictCache(BaseCache): - - def __init__(self, init_dict=None): - self.lock = Lock() - self.data = init_dict or {} - - def get(self, key): - return self.data.get(key, None) - - def set(self, key, value): - with self.lock: - self.data.update({key: value}) - - def delete(self, key): - with self.lock: - if key in self.data: - self.data.pop(key) diff --git a/lib/cachecontrol/caches/__init__.py b/lib/cachecontrol/caches/__init__.py deleted file mode 100644 index 5e851b03..00000000 --- a/lib/cachecontrol/caches/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -from textwrap import dedent - -try: - from cachecontrol.caches.file_cache import FileCache -except ImportError: - notice = dedent(''' - NOTE: In order to use the FileCache you must have - lockfile installed. You can install it via pip: - pip install lockfile - ''') - print(notice) - - -try: - import redis - from cachecontrol.caches.redis_cache import RedisCache -except ImportError: - pass diff --git a/lib/cachecontrol/caches/file_cache.py b/lib/cachecontrol/caches/file_cache.py deleted file mode 100644 index 3a7d1a4c..00000000 --- a/lib/cachecontrol/caches/file_cache.py +++ /dev/null @@ -1,43 +0,0 @@ -import os -import codecs - -from hashlib import md5 - -try: - from pickle import load, dump -except ImportError: - from cPickle import load, dump - -from lib.lockfile import FileLock - - -class FileCache(object): - - def __init__(self, directory, forever=False): - self.directory = directory - self.forever = forever - - if not os.path.isdir(self.directory): - os.mkdir(self.directory) - - def encode(self, x): - return md5(x.encode()).hexdigest() - - def _fn(self, name): - return os.path.join(self.directory, self.encode(name)) - - def get(self, key): - name = self._fn(key) - if os.path.exists(name): - 
return load(codecs.open(name, 'rb')) - - def set(self, key, value): - name = self._fn(key) - lock = FileLock(name) - with lock: - with codecs.open(lock.path, 'w+b') as fh: - dump(value, fh) - - def delete(self, key): - if not self.forever: - os.remove(self._fn(key)) diff --git a/lib/cachecontrol/caches/redis_cache.py b/lib/cachecontrol/caches/redis_cache.py deleted file mode 100644 index d3814ebc..00000000 --- a/lib/cachecontrol/caches/redis_cache.py +++ /dev/null @@ -1,46 +0,0 @@ -from __future__ import division - -from datetime import datetime - -try: - from cPickle import loads, dumps -except ImportError: # Python 3.x - from pickle import loads, dumps - - -def total_seconds(td): - """Python 2.6 compatability""" - if hasattr(td, 'total_seconds'): - return td.total_seconds() - - ms = td.microseconds - secs = (td.seconds + td.days * 24 * 3600) - return (ms + secs * 10**6) / 10**6 - - -class RedisCache(object): - - def __init__(self, conn): - self.conn = conn - - def get(self, key): - val = self.conn.get(key) - if val: - return loads(val) - return None - - def set(self, key, value, expires=None): - if not expires: - self.conn.set(key, dumps(value)) - else: - expires = expires - datetime.now() - self.conn.setex(key, total_seconds(expires), value) - - def delete(self, key): - self.conn.delete(key) - - def clear(self): - """Helper for clearing all the keys in a database. 
Use with - caution!""" - for key in self.conn.keys(): - self.conn.delete(key) diff --git a/lib/cachecontrol/compat.py b/lib/cachecontrol/compat.py deleted file mode 100644 index 1b6e596e..00000000 --- a/lib/cachecontrol/compat.py +++ /dev/null @@ -1,12 +0,0 @@ -try: - from urllib.parse import urljoin -except ImportError: - from urlparse import urljoin - - -try: - import email.utils - parsedate_tz = email.utils.parsedate_tz -except ImportError: - import email.Utils - parsedate_tz = email.Utils.parsedate_tz diff --git a/lib/cachecontrol/controller.py b/lib/cachecontrol/controller.py deleted file mode 100644 index 07ec8b84..00000000 --- a/lib/cachecontrol/controller.py +++ /dev/null @@ -1,256 +0,0 @@ -""" -The httplib2 algorithms ported for use with requests. -""" -import re -import calendar -import time -import os - -from cachecontrol.cache import DictCache -from cachecontrol.compat import parsedate_tz - - -URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?") - - -def parse_uri(uri): - """Parses a URI using the regex given in Appendix B of RFC 3986. - - (scheme, authority, path, query, fragment) = parse_uri(uri) - """ - groups = URI.match(uri).groups() - return (groups[1], groups[3], groups[4], groups[6], groups[8]) - - -class CacheController(object): - """An interface to see if request should cached or not. - """ - def __init__(self, cache=None, cache_etags=True, cache_force=False): - self.cache = cache or DictCache() - self.cache_etags = cache_etags - self.cache_force = cache_force - - def _urlnorm(self, uri): - """Normalize the URL to create a safe key for the cache""" - (scheme, authority, path, query, fragment) = parse_uri(uri) - if not scheme or not authority: - raise Exception("Only absolute URIs are allowed. uri = %s" % uri) - authority = authority.lower() - scheme = scheme.lower() - if not path: - path = "/" - - # Could do syntax based normalization of the URI before - # computing the digest. See Section 6.2.2 of Std 66. 
- request_uri = query and "?".join([path, query]) or path - scheme = scheme.lower() - defrag_uri = scheme + "://" + authority + request_uri - - return defrag_uri - - def cache_url(self, uri): - return self._urlnorm(uri) - - def parse_cache_control(self, headers): - """ - Parse the cache control headers returning a dictionary with values - for the different directives. - """ - retval = {} - - cc_header = 'cache-control' - if 'Cache-Control' in headers: - cc_header = 'Cache-Control' - - if cc_header in headers: - parts = headers[cc_header].split(',') - parts_with_args = [ - tuple([x.strip().lower() for x in part.split("=", 1)]) - for part in parts if -1 != part.find("=")] - parts_wo_args = [(name.strip().lower(), 1) - for name in parts if -1 == name.find("=")] - retval = dict(parts_with_args + parts_wo_args) - return retval - - def cached_request(self, url, headers): - cache_url = self.cache_url(url) - cc = self.parse_cache_control(headers) - - # non-caching states - no_cache = True if 'no-cache' in cc else False - if 'max-age' in cc and cc['max-age'] == 0: - no_cache = True - - # see if it is in the cache anyways - in_cache = self.cache.get(cache_url) - if no_cache or not in_cache: - return False - - # It is in the cache, so lets see if it is going to be - # fresh enough - resp = self.cache.get(cache_url) - - # Check our Vary header to make sure our request headers match - # up. We don't delete it from the though, we just don't return - # our cached value. - # - # NOTE: Because httplib2 stores raw content, it denotes - # headers that were sent in the original response by - # adding -varied-$name. We don't have to do that b/c we - # are storing the object which has a reference to the - # original request. If that changes, then I'd propose - # using the varied headers in the cache key to avoid the - # situation all together. 
- if 'vary' in resp.headers: - varied_headers = resp.headers['vary'].replace(' ', '').split(',') - original_headers = resp.request.headers - for header in varied_headers: - # If our headers don't match for the headers listed in - # the vary header, then don't use the cached response - if headers.get(header, None) != original_headers.get(header): - return False - - now = time.time() - date = calendar.timegm( - parsedate_tz(resp.headers['date']) - ) - current_age = max(0, now - date) - - # TODO: There is an assumption that the result will be a - # requests response object. This may not be best since we - # could probably avoid instantiating or constructing the - # response until we know we need it. - resp_cc = self.parse_cache_control(resp.headers) - - # determine freshness - freshness_lifetime = 0 - if 'max-age' in resp_cc and resp_cc['max-age'].isdigit(): - freshness_lifetime = int(resp_cc['max-age']) - elif 'expires' in resp.headers: - expires = parsedate_tz(resp.headers['expires']) - if expires is not None: - expire_time = calendar.timegm(expires) - date - freshness_lifetime = max(0, expire_time) - - # determine if we are setting freshness limit in the req - if 'max-age' in cc: - try: - freshness_lifetime = int(cc['max-age']) - except ValueError: - freshness_lifetime = 0 - - if 'min-fresh' in cc: - try: - min_fresh = int(cc['min-fresh']) - except ValueError: - min_fresh = 0 - # adjust our current age by our min fresh - current_age += min_fresh - - # see how fresh we actually are - fresh = (freshness_lifetime > current_age) - - if fresh: - if resp.ok: - # make sure we set the from_cache to true - resp.from_cache = True - return resp - return False - - # we're not fresh. 
If we don't have an Etag, clear it out - if 'etag' not in resp.headers: - self.cache.delete(cache_url) - - if 'etag' in resp.headers: - headers['If-None-Match'] = resp.headers['ETag'] - - if 'last-modified' in resp.headers: - headers['If-Modified-Since'] = resp.headers['Last-Modified'] - - # return the original handler - return False - - def add_headers(self, url, resp=None): - resp = self.cache.get(url) - if resp and 'etag' in resp.headers: - return {'If-None-Match': resp.headers['etag']} - return {} - - def cache_response(self, request, resp): - """ - Algorithm for caching requests. - - This assumes a requests Response object. - """ - # From httplib2: Don't cache 206's since we aren't going to - # handle byte range requests - if resp.status_code not in [200, 203]: - return - - cc_req = self.parse_cache_control(request.headers) - cc = self.parse_cache_control(resp.headers) - - cache_url = self.cache_url(request.url) - - # Delete it from the cache if we happen to have it stored there - no_store = cc.get('no-store') or cc_req.get('no-store') - if no_store and self.cache.get(cache_url): - self.cache.delete(cache_url) - - # If we've been given an etag, then keep the response - if self.cache_etags and 'etag' in resp.headers: - self.cache.set(cache_url, resp) - - # Add to the cache if the response headers demand it. If there - # is no date header then we can't do anything about expiring - # the cache. - elif 'date' in resp.headers: - # cache when there is a max-age > 0 - if cc and cc.get('max-age'): - if int(cc['max-age']) > 0: - self.cache.set(cache_url, resp) - - # If the request can expire, it means we should cache it - # in the meantime. 
- elif 'expires' in resp.headers: - if resp.headers['expires']: - self.cache.set(cache_url, resp) - - # If the request is for our local cache, it means we should cache it - elif self.cache_force: - resp.headers.update({'cache-control': 'max-age=900, private'}) - self.cache.set(cache_url, resp) - - def update_cached_response(self, request, response): - """On a 304 we will get a new set of headers that we want to - update our cached value with, assuming we have one. - - This should only ever be called when we've sent an ETag and - gotten a 304 as the response. - """ - cache_url = self.cache_url(request.url) - - resp = self.cache.get(cache_url) - - if not resp: - # we didn't have a cached response - return response - - # did so lets update our headers - resp.headers.update(resp.headers) - - # we want a 200 b/c we have content via the cache - request.status_code = 200 - - # update the request as it has the if-none-match header + any - # other headers that the server might have updated (ie Date, - # Cache-Control, Expires, etc.) - resp.request = request - - # update our cache - self.cache.set(cache_url, resp) - - # Let everyone know this was from the cache. 
- resp.from_cache = True - - return resp diff --git a/lib/cachecontrol/patch_requests.py b/lib/cachecontrol/patch_requests.py deleted file mode 100644 index cad60e17..00000000 --- a/lib/cachecontrol/patch_requests.py +++ /dev/null @@ -1,56 +0,0 @@ -import requests - -from requests import models -from requests.packages.urllib3.response import HTTPResponse - -__attrs__ = [ - '_content', - 'status_code', - 'headers', - 'url', - 'history', - 'encoding', - 'reason', - 'cookies', - 'elapsed', -] - - -def response_getstate(self): - # consume everything - if not self._content_consumed: - self.content - - state = dict( - (attr, getattr(self, attr, None)) - for attr in __attrs__ - ) - - # deal with our raw content b/c we need it for our cookie jar - state['raw_original_response'] = self.raw._original_response - return state - - -def response_setstate(self, state): - for name, value in state.items(): - if name != 'raw_original_response': - setattr(self, name, value) - - setattr(self, 'raw', HTTPResponse()) - self.raw._original_response = state['raw_original_response'] - - -def make_responses_pickleable(): - try: - version_parts = [int(part) for part in requests.__version__.split('.')] - - # must be >= 2.2.x - if not version_parts[0] >= 2 or not version_parts[1] >= 2: - models.Response.__getstate__ = response_getstate - models.Response.__setstate__ = response_setstate - except: - raise - pass - - -make_responses_pickleable() diff --git a/lib/cachecontrol/wrapper.py b/lib/cachecontrol/wrapper.py deleted file mode 100644 index 38b91536..00000000 --- a/lib/cachecontrol/wrapper.py +++ /dev/null @@ -1,10 +0,0 @@ -from cachecontrol.adapter import CacheControlAdapter -from cachecontrol.cache import DictCache - - -def CacheControl(sess, cache=None, cache_etags=True, cache_force=False): - cache = cache or DictCache() - adapter = CacheControlAdapter(cache, cache_etags=cache_etags, cache_force=cache_force) - sess.mount('http://', adapter) - - return sess diff --git 
a/lib/httpcache/__init__.py b/lib/httpcache/__init__.py new file mode 100644 index 00000000..0b8a963c --- /dev/null +++ b/lib/httpcache/__init__.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- +""" +__init__.py +~~~~~~~~~~~ + +Defines the public API to the httpcache module. +""" + +__version__ = '0.1.3' + +from .cache import HTTPCache +from .adapter import CachingHTTPAdapter + +__all__ = [HTTPCache, CachingHTTPAdapter] diff --git a/lib/httpcache/adapter.py b/lib/httpcache/adapter.py new file mode 100644 index 00000000..b1d511bf --- /dev/null +++ b/lib/httpcache/adapter.py @@ -0,0 +1,55 @@ +""" +adapter.py +~~~~~~~~~~ + +Contains an implementation of an HTTP adapter for Requests that is aware of the +cache contained in this module. +""" +from requests.adapters import HTTPAdapter +from .cache import HTTPCache + + +class CachingHTTPAdapter(HTTPAdapter): + """ + A HTTP-caching-aware Transport Adapter for Python Requests. The central + portion of the API. + + :param capacity: The maximum capacity of the backing cache. + """ + def __init__(self, capacity=50, **kwargs): + super(CachingHTTPAdapter, self).__init__(**kwargs) + + #: The HTTP Cache backing the adapter. + self.cache = HTTPCache(capacity=capacity) + + def send(self, request, **kwargs): + """ + Sends a PreparedRequest object, respecting RFC 2616's rules about HTTP + caching. Returns a Response object that may have been cached. + + :param request: The Requests :class:`PreparedRequest ` object to send. + """ + cached_resp = self.cache.retrieve(request) + + if cached_resp is not None: + return cached_resp + else: + return super(CachingHTTPAdapter, self).send(request, **kwargs) + + def build_response(self, request, response): + """ + Builds a Response object from a urllib3 response. May involve returning + a cached Response. + + :param request: The Requests :class:`PreparedRequest ` object sent. + :param response: The urllib3 response. 
+ """ + resp = super(CachingHTTPAdapter, self).build_response(request, + response) + + if resp.status_code == 304: + resp = self.cache.handle_304(resp) + else: + self.cache.store(resp) + + return resp diff --git a/lib/httpcache/cache.py b/lib/httpcache/cache.py new file mode 100644 index 00000000..892e8f90 --- /dev/null +++ b/lib/httpcache/cache.py @@ -0,0 +1,207 @@ +# -*- coding: utf-8 -*- +""" +cache.py +~~~~~~~~ + +Contains the primary cache structure used in http-cache. +""" +from .structures import RecentOrderedDict +from .utils import (parse_date_header, build_date_header, + expires_from_cache_control, url_contains_query) +from datetime import datetime + + +# RFC 2616 specifies that we can cache 200 OK, 203 Non Authoritative, +# 206 Partial Content, 300 Multiple Choices, 301 Moved Permanently and +# 410 Gone responses. We don't cache 206s at the moment because we +# don't handle Range and Content-Range headers. +CACHEABLE_RCS = (200, 203, 300, 301, 410) + +# Cacheable verbs. +CACHEABLE_VERBS = ('GET', 'HEAD', 'OPTIONS') + +# Some verbs MUST invalidate the resource in the cache, according to RFC 2616. +# If we send one of these, or any verb we don't recognise, invalidate the +# cache entry for that URL. As it happens, these are also the cacheable +# verbs. That works out well for us. +NON_INVALIDATING_VERBS = CACHEABLE_VERBS + + +class HTTPCache(object): + """ + The HTTP Cache object. Manages caching of responses according to RFC 2616, + adding necessary headers to HTTP request objects, and returning cached + responses based on server responses. + + This object is not expected to be used by most users. It is exposed as part + of the public API for users who feel the need for more control. This API + may change in a minor version increase. Be warned. + + :param capacity: (Optional) The maximum capacity of the HTTP cache. + """ + def __init__(self, capacity=50): + #: The maximum capacity of the HTTP cache. 
When this many cache entries + #: end up in the cache, the oldest entries are removed. + self.capacity = capacity + + #: The cache backing store. Cache entries are stored here as key-value + #: pairs. The key is the URL used to retrieve the cached response. The + #: value is a python dict, which stores three objects: the response + #: (keyed off of 'response'), the retrieval or creation date (keyed off + #: of 'creation') and the cache expiry date (keyed off of 'expiry'). + #: This last value may be None. + self._cache = RecentOrderedDict() + + def store(self, response): + """ + Takes an HTTP response object and stores it in the cache according to + RFC 2616. Returns a boolean value indicating whether the response was + cached or not. + + :param response: Requests :class:`Response ` object to cache. + """ + # Define an internal utility function. + def date_header_or_default(header_name, default, response): + try: + date_header = response.headers[header_name] + except KeyError: + value = default + else: + value = parse_date_header(date_header) + return value + + if response.status_code not in CACHEABLE_RCS: + return False + + if response.request.method not in CACHEABLE_VERBS: + return False + + url = response.url + now = datetime.utcnow() + + # Get the value of the 'Date' header, if it exists. If it doesn't, just + # use now. + creation = date_header_or_default('Date', now, response) + + # Get the value of the 'Cache-Control' header, if it exists. + cc = response.headers.get('Cache-Control', None) + if cc is not None: + expiry = expires_from_cache_control(cc, now) + + # If the above returns None, we are explicitly instructed not to + # cache this. + if expiry is None: + return False + + # Get the value of the 'Expires' header, if it exists, and if we don't + # have anything from the 'Cache-Control' header. 
+ if cc is None: + expiry = date_header_or_default('Expires', None, response) + + # If the expiry date is earlier or the same as the Date header, don't + # cache the response at all. + if expiry is not None and expiry <= creation: + return False + + # If there's a query portion of the url and it's a GET, don't cache + # this unless explicitly instructed to. + if expiry is None and response.request.method == 'GET': + if url_contains_query(url): + return False + + self._cache[url] = {'response': response, + 'creation': creation, + 'expiry': expiry} + + self.__reduce_cache_count() + + return True + + def handle_304(self, response): + """ + Given a 304 response, retrieves the cached entry. This unconditionally + returns the cached entry, so it can be used when the 'intelligent' + behaviour of retrieve() is not desired. + + Returns None if there is no entry in the cache. + + :param response: The 304 response to find the cached entry for. Should be a Requests :class:`Response `. + """ + try: + cached_response = self._cache[response.url]['response'] + except KeyError: + cached_response = None + + return cached_response + + def retrieve(self, request): + """ + Retrieves a cached response if possible. + + If there is a response that can be unconditionally returned (e.g. one + that had a Cache-Control header set), that response is returned. If + there is one that can be conditionally returned (if a 304 is returned), + applies an If-Modified-Since header to the request and returns None. + + :param request: The Requests :class:`PreparedRequest ` object. + """ + return_response = None + url = request.url + + try: + cached_response = self._cache[url] + except KeyError: + return None + + if request.method not in NON_INVALIDATING_VERBS: + del self._cache[url] + return None + + if cached_response['expiry'] is None: + # We have no explicit expiry time, so we weren't instructed to + # cache. Add an 'If-Modified-Since' header. 
+ creation = cached_response['creation'] + header = build_date_header(creation) + request.headers['If-Modified-Since'] = header + else: + # We have an explicit expiry time. If we're earlier than the expiry + # time, return the response. + now = datetime.utcnow() + + if now <= cached_response['expiry']: + return_response = cached_response['response'] + else: + del self._cache[url] + + return return_response + + def __reduce_cache_count(self): + """ + Drops the number of entries in the cache to the capacity of the cache. + + Walks the backing RecentOrderedDict in order from oldest to youngest. + Deletes cache entries that are either invalid or being speculatively + cached until the number of cache entries drops to the capacity. If this + leaves the cache above capacity, begins deleting the least-used cache + entries that are still valid until the cache has space. + """ + if len(self._cache) <= self.capacity: + return + + to_delete = len(self._cache) - self.capacity + keys = list(self._cache.keys()) + + for key in keys: + if self._cache[key]['expiry'] is None: + del self._cache[key] + to_delete -= 1 + + if to_delete == 0: + return + + keys = list(self._cache.keys()) + + for i in range(to_delete): + del self._cache[keys[i]] + + return diff --git a/lib/httpcache/compat.py b/lib/httpcache/compat.py new file mode 100644 index 00000000..384d38e6 --- /dev/null +++ b/lib/httpcache/compat.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +""" +compat.py +~~~~~~~~~ + +Defines cross-platform functions and classes needed to achieve proper +functionality. +""" + +pass diff --git a/lib/httpcache/structures.py b/lib/httpcache/structures.py new file mode 100644 index 00000000..6ac24a92 --- /dev/null +++ b/lib/httpcache/structures.py @@ -0,0 +1,59 @@ +""" +structures.py +~~~~~~~~~~~~~ + +Defines structures used by the httpcache module. 
+""" + +class RecentOrderedDict(dict): + """ + A custom variant of the dictionary that ensures that the object most + recently inserted _or_ retrieved from the dictionary is enumerated first. + """ + def __init__(self): + self._data = {} + self._order = [] + + def __setitem__(self, key, value): + if key in self._data: + self._order.remove(key) + + self._order.append(key) + self._data[key] = value + + def __getitem__(self, key): + value = self._data[key] + self._order.remove(key) + self._order.append(key) + return value + + def __delitem__(self, key): + del self._data[key] + self._order.remove(key) + + def __iter__(self): + return self._order + + def __len__(self): + return len(self._order) + + def __contains__(self, value): + return self._data.__contains__(value) + + def items(self): + return [(key, self._data[key]) for key in self._order] + + def keys(self): + return self._order + + def values(self): + return [self._data[key] for key in self._order] + + def clear(self): + self._data = {} + self._order = [] + + def copy(self): + c = RecentOrderedDict() + c._data = self._data.copy() + c._order = self._order[:] diff --git a/lib/httpcache/utils.py b/lib/httpcache/utils.py new file mode 100644 index 00000000..0efe9f99 --- /dev/null +++ b/lib/httpcache/utils.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +""" +utils.py +~~~~~~~~ + +Utility functions for use with httpcache. +""" +from datetime import datetime, timedelta + +try: # Python 2 + from urlparse import urlparse +except ImportError: # Python 3 + from urllib.parse import urlparse + +RFC_1123_DT_STR = "%a, %d %b %Y %H:%M:%S GMT" +RFC_850_DT_STR = "%A, %d-%b-%y %H:%M:%S GMT" + + +def parse_date_header(header): + """ + Given a date header in the form specified by RFC 2616, return a Python + datetime object. + + RFC 2616 specifies three possible formats for date/time headers, and + makes it clear that all dates/times should be in UTC/GMT. That is assumed + by this library, which simply does everything in UTC. 
This currently does + not parse the C asctime() string, because that's effort. + + This function does _not_ follow Postel's Law. If a format does not strictly + match the defined strings, this function returns None. This is considered + 'safe' behaviour. + """ + try: + dt = datetime.strptime(header, RFC_1123_DT_STR) + except ValueError: + try: + dt = datetime.strptime(header, RFC_850_DT_STR) + except ValueError: + dt = None + except TypeError: + dt = None + + return dt + + +def build_date_header(dt): + """ + Given a Python datetime object, build a Date header value according to + RFC 2616. + + RFC 2616 specifies that the RFC 1123 form is to be preferred, so that is + what we use. + """ + return dt.strftime(RFC_1123_DT_STR) + + +def expires_from_cache_control(header, current_time): + """ + Given a Cache-Control header, builds a Python datetime object corresponding + to the expiry time (in UTC). This function should respect all relevant + Cache-Control directives. + + Takes current_time as an argument to ensure that 'max-age=0' generates the + correct behaviour without being special-cased. + + Returns None to indicate that a request must not be cached. + """ + # Cache control header values are made of multiple comma separated fields. + # Splitting them like this is probably a bad idea, but I'm going to roll with + # it for now. We'll come back to it. + fields = header.split(', ') + duration = None + + for field in fields: + # Right now we don't handle no-cache applied to specific fields. To be + # as 'nice' as possible, treat any no-cache as applying to the whole + # request. Bail early, because there's no reason to stick around. 
+ if field.startswith('no-cache') or field == 'no-store': + return None + + if field.startswith('max-age'): + _, duration = field.split('=') + duration = int(duration) + + if duration: + interval = timedelta(seconds=int(duration)) + return current_time + interval + +def url_contains_query(url): + """ + A very stupid function for determining if a URL contains a query string + or not. + """ + if urlparse(url).query: + return True + else: + return False diff --git a/lib/requests/adapters.py b/lib/requests/adapters.py index 28bea07c..0f297ab2 100644 --- a/lib/requests/adapters.py +++ b/lib/requests/adapters.py @@ -9,6 +9,7 @@ and maintain connections. """ import socket +import copy from .models import Response from .packages.urllib3.poolmanager import PoolManager, proxy_from_url @@ -62,7 +63,7 @@ class HTTPAdapter(BaseAdapter): Usage:: - >>> import requests + >>> import lib.requests >>> s = requests.Session() >>> a = requests.adapters.HTTPAdapter(max_retries=3) >>> s.mount('http://', a) @@ -390,4 +391,4 @@ class HTTPAdapter(BaseAdapter): if not stream: r.content - return r + return r \ No newline at end of file diff --git a/lib/tvdb_api/tvdb_api.py b/lib/tvdb_api/tvdb_api.py index 9942ee51..62147a8f 100644 --- a/lib/tvdb_api/tvdb_api.py +++ b/lib/tvdb_api/tvdb_api.py @@ -39,19 +39,14 @@ except ImportError: gzip = None from lib import requests -from lib.cachecontrol.wrapper import CacheControl -from lib.cachecontrol.caches.file_cache import FileCache - from tvdb_ui import BaseUI, ConsoleUI from tvdb_exceptions import (tvdb_error, tvdb_userabort, tvdb_shownotfound, tvdb_seasonnotfound, tvdb_episodenotfound, tvdb_attributenotfound) - def log(): return logging.getLogger("tvdb_api") - class ShowContainer(dict): """Simple dict that holds a series of Show instances """ @@ -518,12 +513,15 @@ class Tvdb: # cacheControl if self.config['cache_enabled']: - sess = CacheControl(requests.Session(), cache_force=True, cache=FileCache(self.config['cache_location'])) + from lib.httpcache 
import CachingHTTPAdapter + sess = requests.Session() + sess.mount('http://', CachingHTTPAdapter()) else: sess = requests.Session() # get response from TVDB resp = sess.get(url, params=params) + sess.close() except requests.HTTPError, e: raise tvdb_error("HTTP error " + str(e.errno) + " while loading URL " + str(url)) @@ -536,7 +534,7 @@ class Tvdb: except Exception, e: raise tvdb_error("Unknown exception occured: " + str(e.message) + " while loading URL " + str(url)) - if resp.ok: + if resp.ok and resp.content: if 'application/zip' in resp.headers.get("Content-Type", ''): try: # TODO: The zip contains actors.xml and banners.xml, which are currently ignored [GH-20] @@ -559,11 +557,11 @@ class Tvdb: try: # TVDB doesn't sanitize \r (CR) from user input in some fields, # remove it to avoid errors. Change from SickBeard, from will14m - return ElementTree.fromstring(src.rstrip("\r")) + return ElementTree.fromstring(src.rstrip("\r")) if src else None except SyntaxError: src = self._loadUrl(url, params=params, language=language) try: - return ElementTree.fromstring(src.rstrip("\r")) + return ElementTree.fromstring(src.rstrip("\r")) if src else None except SyntaxError, exceptionmsg: errormsg = "There was an error with the XML retrieved from thetvdb.com:\n%s" % ( exceptionmsg diff --git a/lib/tvrage_api/tvrage_api.py b/lib/tvrage_api/tvrage_api.py index 422ef251..23c4c22e 100644 --- a/lib/tvrage_api/tvrage_api.py +++ b/lib/tvrage_api/tvrage_api.py @@ -30,10 +30,7 @@ except ImportError: import xml.etree.ElementTree as ElementTree from lib.dateutil.parser import parse - from lib import requests -from lib.cachecontrol.wrapper import CacheControl -from lib.cachecontrol.caches.file_cache import FileCache from tvrage_ui import BaseUI from tvrage_exceptions import (tvrage_error, tvrage_userabort, tvrage_shownotfound, @@ -42,7 +39,6 @@ from tvrage_exceptions import (tvrage_error, tvrage_userabort, tvrage_shownotfou def log(): return logging.getLogger("tvrage_api") - class 
ShowContainer(dict): """Simple dict that holds a series of Show instances """ @@ -351,12 +347,15 @@ class TVRage: # cacheControl if self.config['cache_enabled']: - sess = CacheControl(requests.Session(), cache_force=True, cache=FileCache(self.config['cache_location'])) + from lib.httpcache import CachingHTTPAdapter + sess = requests.Session() + sess.mount('http://', CachingHTTPAdapter()) else: sess = requests.Session() # get response from TVRage resp = sess.get(url, params=params) + sess.close() except requests.HTTPError, e: raise tvrage_error("HTTP error " + str(e.errno) + " while loading URL " + str(url)) @@ -366,7 +365,7 @@ class TVRage: except requests.Timeout, e: raise tvrage_error("Connection timed out " + str(e.message) + " while loading URL " + str(url)) - return resp.content if resp.ok else None + return resp.content if resp.ok and resp.content else None def _getetsrc(self, url, params=None): """Loads a URL using caching, returns an ElementTree of the source diff --git a/sickbeard/helpers.py b/sickbeard/helpers.py index 8e43edee..83c88602 100644 --- a/sickbeard/helpers.py +++ b/sickbeard/helpers.py @@ -35,6 +35,7 @@ import base64 from lib import requests from httplib import BadStatusLine from itertools import izip, cycle +from lib.httpcache import CachingHTTPAdapter try: import json @@ -169,6 +170,9 @@ def getURL(url, post_data=None, headers=None, params=None, timeout=None): Returns a byte-string retrieved from the url provider. """ + # Cache Handler + sess = requests.Session() + sess.mount('http://', CachingHTTPAdapter()) req_headers = ['User-Agent', USER_AGENT, 'Accept-Encoding', 'gzip,deflate'] if headers: @@ -182,8 +186,8 @@ Returns a byte-string retrieved from the url provider. 
url = urlparse.urlunparse(parsed) it = iter(req_headers) - sess = requests.session() resp = sess.get(url, params=params, data=post_data, headers=dict(zip(it, it))) + sess.close() except requests.HTTPError, e: logger.log(u"HTTP error " + str(e.errno) + " while loading URL " + url, logger.WARNING) return None @@ -196,7 +200,7 @@ Returns a byte-string retrieved from the url provider. logger.log(u"Connection timed out " + str(e.message) + " while loading URL " + url, logger.WARNING) return None - return resp.content if resp.ok else None + return resp.content if resp.ok and resp.content else None def _remove_file_failed(file): try: @@ -206,8 +210,11 @@ def _remove_file_failed(file): def download_file(url, filename): try: - sess = requests.session() + # cache handler + sess = requests.Session() + sess.mount('http://', CachingHTTPAdapter()) req = sess.get(url, stream=True) + #CHUNK = 16 * 1024 with open(filename, 'wb') as fp: for chunk in req.iter_content(chunk_size=(16 *1024)): @@ -215,7 +222,7 @@ def download_file(url, filename): fp.write(chunk) fp.flush() fp.close() - req.close() + sess.close() except requests.HTTPError, e: _remove_file_failed(filename) diff --git a/sickbeard/indexers/test/test.py b/sickbeard/indexers/test/test.py index 75bc58c2..3973f678 100644 --- a/sickbeard/indexers/test/test.py +++ b/sickbeard/indexers/test/test.py @@ -11,19 +11,16 @@ from sickbeard.indexers.indexer_api import indexerApi from sickbeard.indexers.indexer_exceptions import indexer_exception class APICheck(unittest.TestCase): - indexer_id = 'Continum' - indexer = 'TVRage' + indexer_id = 81189 + indexer = 'Tvdb' + lang = "en" + # Set our common indexer_api options here - INDEXER_API_PARMS = {'apikey': 'Uhewg1Rr0o62fvZvUIZt', - 'language': 'en', - 'useZip': True} - - - INDEXER_API_PARMS['indexer'] = indexer + INDEXER_API_PARMS = {'indexer': indexer} lindexer_api_parms = INDEXER_API_PARMS.copy() try: -# showurl = indexerApi(**lindexer_api_parms).config['base_url'] + str(indexer_id) + 
'/all/en.zip' + lang_id = indexerApi().config['langabbv_to_id'][lang] t = indexerApi(cache=True, **lindexer_api_parms) myEp = t[indexer_id]