#!/usr/bin/env python2
#encoding:utf-8
#author:dbr/Ben
#project:tvdb_api
#repository:http://github.com/dbr/tvdb_api
#license:unlicense (http://unlicense.org/)

"""
urllib2 caching handler
Modified from http://code.activestate.com/recipes/491261/
"""

__author__ = "dbr/Ben"
__version__ = "1.9"

import os
import time
import errno
from hashlib import md5
from threading import RLock

from six import StringIO
from six.moves import urllib, http_client as httplib

cache_lock = RLock()


def locked_function(origfunc):
    """Decorator to execute function under lock"""
    def wrapped(*args, **kwargs):
        cache_lock.acquire()
        try:
            return origfunc(*args, **kwargs)
        finally:
            cache_lock.release()
    return wrapped


def calculate_cache_path(cache_location, url):
    """Returns the header and body cache file paths for a URL:
    [cache_location]/[hash_of_url].headers and .body
    """
    thumb = md5(url).hexdigest()
    header = os.path.join(cache_location, thumb + ".headers")
    body = os.path.join(cache_location, thumb + ".body")
    return header, body


def check_cache_time(path, max_age):
    """Checks if a file has been created/modified within the last max_age
    seconds. False means the file is too old (or doesn't exist), True means
    it is up-to-date and valid
    """
    if not os.path.isfile(path):
        return False
    cache_modified_time = os.stat(path).st_mtime
    time_now = time.time()
    if cache_modified_time < time_now - max_age:
        # Cache is old
        return False
    else:
        return True


@locked_function
def exists_in_cache(cache_location, url, max_age):
    """Returns True if header AND body cache files exist (and are up-to-date)"""
    hpath, bpath = calculate_cache_path(cache_location, url)
    if os.path.exists(hpath) and os.path.exists(bpath):
        return (
            check_cache_time(hpath, max_age)
            and check_cache_time(bpath, max_age)
        )
    else:
        # File does not exist
        return False


@locked_function
def store_in_cache(cache_location, url, response):
    """Tries to store response in cache.
    Returns True on success, False if the cache files could not be written.
    """
    hpath, bpath = calculate_cache_path(cache_location, url)
    try:
        outf = open(hpath, "wb")
        headers = str(response.info())
        outf.write(headers)
        outf.close()

        outf = open(bpath, "wb")
        outf.write(response.read())
        outf.close()
    except IOError:
        return False
    else:
        return True


@locked_function
def delete_from_cache(cache_location, url):
    """Deletes a cached response.
    Returns True on success, False if the cache files could not be removed.
    """
    hpath, bpath = calculate_cache_path(cache_location, url)
    try:
        if os.path.exists(hpath):
            os.remove(hpath)
        if os.path.exists(bpath):
            os.remove(bpath)
    except IOError:
        return False
    else:
        return True


class CacheHandler(urllib.request.BaseHandler):
    """Stores responses in a persistent on-disk cache.

    If a subsequent GET request is made for the same URL, the stored
    response is returned, saving time, resources and bandwidth
    """
    @locked_function
    def __init__(self, cache_location, max_age=21600):
        """cache_location is the cache directory (created if it does not
        exist); max_age is the maximum age of a cached response in seconds.
        """
        self.max_age = max_age
        self.cache_location = cache_location
        if not os.path.exists(self.cache_location):
            try:
                os.mkdir(self.cache_location)
            except OSError as e:
                if e.errno == errno.EEXIST and os.path.isdir(self.cache_location):
                    # File exists, and it's a directory,
                    # another process beat us to creating this dir, that's OK.
                    pass
                else:
                    # Our target dir is already a file, or different error,
                    # relay the error!
                    raise

    def default_open(self, request):
        """Handles GET requests; if the response is cached, it is returned
        """
        if request.get_method() != "GET":
            return None  # let the next handler try to handle the request

        if exists_in_cache(
            self.cache_location, request.get_full_url(), self.max_age
        ):
            return CachedResponse(
                self.cache_location,
                request.get_full_url(),
                set_cache_header=True
            )
        else:
            return None

    def http_response(self, request, response):
        """Gets a HTTP response; if it was a GET request and the status code
        starts with 2 (200 OK etc) it caches it and returns a CachedResponse
        """
        if (request.get_method() == "GET"
                and str(response.code).startswith("2")):
            if 'x-local-cache' not in response.info():
                # Response is not yet cached, try to store it
                set_cache_header = store_in_cache(
                    self.cache_location,
                    request.get_full_url(),
                    response
                )
                if not set_cache_header:
                    # Cache files could not be written; return the original
                    # response rather than fail reading the missing files back
                    return response
            else:
                # Response already came from the cache
                set_cache_header = True

            return CachedResponse(
                self.cache_location,
                request.get_full_url(),
                set_cache_header=set_cache_header
            )
        else:
            return response


class CachedResponse(StringIO):
    """An urllib2.response-like object for cached responses.

    To determine if a response is cached or coming directly from the
    network, check the x-local-cache header rather than the object type.
    """

    @locked_function
    def __init__(self, cache_location, url, set_cache_header=True):
        self.cache_location = cache_location
        hpath, bpath = calculate_cache_path(cache_location, url)

        StringIO.__init__(self, open(bpath, "rb").read())

        self.url = url
        self.code = 200
        self.msg = "OK"
        headerbuf = open(hpath, "rb").read()
        if set_cache_header:
            headerbuf += "x-local-cache: %s\r\n" % (bpath)
        self.headers = httplib.HTTPMessage(StringIO(headerbuf))

    def info(self):
        """Returns headers
        """
        return self.headers

    def geturl(self):
        """Returns original URL
        """
        return self.url

    @locked_function
    def recache(self):
        """Re-fetches the URL and refreshes the cached copy"""
        new_response = urllib.request.urlopen(self.url)
        set_cache_header = store_in_cache(
            self.cache_location,
            new_response.geturl(),
            new_response
        )
        CachedResponse.__init__(
            self, self.cache_location, self.url, set_cache_header
        )

    @locked_function
    def delete_cache(self):
        """Removes the cached header and body files for this URL"""
        delete_from_cache(
            self.cache_location,
            self.url
        )


if __name__ == "__main__":
    def main():
        """Quick test/example of CacheHandler"""
        opener = urllib.request.build_opener(CacheHandler("/tmp/"))
        response = opener.open('http://google.com')
        print(response.headers)
        print('Response:', response.read())

        response.recache()
        print(response.headers)
        print('After recache:', response.read())

        # Test usage in threads
        from threading import Thread

        class CacheThreadTest(Thread):
            lastdata = None

            def run(self):
                req = opener.open("http://google.com")
                newdata = req.read()
                if self.lastdata is None:
                    self.lastdata = newdata
                assert self.lastdata == newdata, "Data was not consistent, uhoh"
                req.recache()

        threads = [CacheThreadTest() for _ in range(50)]
        print('Starting threads')
        for t in threads:
            t.start()
        print('..done')

        print('Joining threads')
        for t in threads:
            t.join()
        print('..done')

    main()