# controller.py
  1. """
  2. The httplib2 algorithms ported for use with requests.
  3. """
  4. import re
  5. import calendar
  6. import time
  7. from email.utils import parsedate_tz
  8. from pip._vendor.requests.structures import CaseInsensitiveDict
  9. from .cache import DictCache
  10. from .serialize import Serializer
  11. URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
  12. def parse_uri(uri):
  13. """Parses a URI using the regex given in Appendix B of RFC 3986.
  14. (scheme, authority, path, query, fragment) = parse_uri(uri)
  15. """
  16. groups = URI.match(uri).groups()
  17. return (groups[1], groups[3], groups[4], groups[6], groups[8])
  18. class CacheController(object):
  19. """An interface to see if request should cached or not.
  20. """
  21. def __init__(self, cache=None, cache_etags=True, serializer=None):
  22. self.cache = cache or DictCache()
  23. self.cache_etags = cache_etags
  24. self.serializer = serializer or Serializer()
  25. @classmethod
  26. def _urlnorm(cls, uri):
  27. """Normalize the URL to create a safe key for the cache"""
  28. (scheme, authority, path, query, fragment) = parse_uri(uri)
  29. if not scheme or not authority:
  30. raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
  31. scheme = scheme.lower()
  32. authority = authority.lower()
  33. if not path:
  34. path = "/"
  35. # Could do syntax based normalization of the URI before
  36. # computing the digest. See Section 6.2.2 of Std 66.
  37. request_uri = query and "?".join([path, query]) or path
  38. defrag_uri = scheme + "://" + authority + request_uri
  39. return defrag_uri
  40. @classmethod
  41. def cache_url(cls, uri):
  42. return cls._urlnorm(uri)
  43. def parse_cache_control(self, headers):
  44. """
  45. Parse the cache control headers returning a dictionary with values
  46. for the different directives.
  47. """
  48. retval = {}
  49. cc_header = 'cache-control'
  50. if 'Cache-Control' in headers:
  51. cc_header = 'Cache-Control'
  52. if cc_header in headers:
  53. parts = headers[cc_header].split(',')
  54. parts_with_args = [
  55. tuple([x.strip().lower() for x in part.split("=", 1)])
  56. for part in parts if -1 != part.find("=")
  57. ]
  58. parts_wo_args = [
  59. (name.strip().lower(), 1)
  60. for name in parts if -1 == name.find("=")
  61. ]
  62. retval = dict(parts_with_args + parts_wo_args)
  63. return retval
  64. def cached_request(self, request):
  65. """
  66. Return a cached response if it exists in the cache, otherwise
  67. return False.
  68. """
  69. cache_url = self.cache_url(request.url)
  70. cc = self.parse_cache_control(request.headers)
  71. # non-caching states
  72. no_cache = True if 'no-cache' in cc else False
  73. if 'max-age' in cc and cc['max-age'] == 0:
  74. no_cache = True
  75. # Bail out if no-cache was set
  76. if no_cache:
  77. return False
  78. # It is in the cache, so lets see if it is going to be
  79. # fresh enough
  80. resp = self.serializer.loads(request, self.cache.get(cache_url))
  81. # Check to see if we have a cached object
  82. if not resp:
  83. return False
  84. # If we have a cached 301, return it immediately. We don't
  85. # need to test our response for other headers b/c it is
  86. # intrinsically "cacheable" as it is Permanent.
  87. # See:
  88. # https://tools.ietf.org/html/rfc7231#section-6.4.2
  89. #
  90. # Client can try to refresh the value by repeating the request
  91. # with cache busting headers as usual (ie no-cache).
  92. if resp.status == 301:
  93. return resp
  94. headers = CaseInsensitiveDict(resp.headers)
  95. if not headers or 'date' not in headers:
  96. # With date or etag, the cached response can never be used
  97. # and should be deleted.
  98. if 'etag' not in headers:
  99. self.cache.delete(cache_url)
  100. return False
  101. now = time.time()
  102. date = calendar.timegm(
  103. parsedate_tz(headers['date'])
  104. )
  105. current_age = max(0, now - date)
  106. # TODO: There is an assumption that the result will be a
  107. # urllib3 response object. This may not be best since we
  108. # could probably avoid instantiating or constructing the
  109. # response until we know we need it.
  110. resp_cc = self.parse_cache_control(headers)
  111. # determine freshness
  112. freshness_lifetime = 0
  113. # Check the max-age pragma in the cache control header
  114. if 'max-age' in resp_cc and resp_cc['max-age'].isdigit():
  115. freshness_lifetime = int(resp_cc['max-age'])
  116. # If there isn't a max-age, check for an expires header
  117. elif 'expires' in headers:
  118. expires = parsedate_tz(headers['expires'])
  119. if expires is not None:
  120. expire_time = calendar.timegm(expires) - date
  121. freshness_lifetime = max(0, expire_time)
  122. # determine if we are setting freshness limit in the req
  123. if 'max-age' in cc:
  124. try:
  125. freshness_lifetime = int(cc['max-age'])
  126. except ValueError:
  127. freshness_lifetime = 0
  128. if 'min-fresh' in cc:
  129. try:
  130. min_fresh = int(cc['min-fresh'])
  131. except ValueError:
  132. min_fresh = 0
  133. # adjust our current age by our min fresh
  134. current_age += min_fresh
  135. # see how fresh we actually are
  136. fresh = (freshness_lifetime > current_age)
  137. if fresh:
  138. return resp
  139. # we're not fresh. If we don't have an Etag, clear it out
  140. if 'etag' not in headers:
  141. self.cache.delete(cache_url)
  142. # return the original handler
  143. return False
  144. def conditional_headers(self, request):
  145. cache_url = self.cache_url(request.url)
  146. resp = self.serializer.loads(request, self.cache.get(cache_url))
  147. new_headers = {}
  148. if resp:
  149. headers = CaseInsensitiveDict(resp.headers)
  150. if 'etag' in headers:
  151. new_headers['If-None-Match'] = headers['ETag']
  152. if 'last-modified' in headers:
  153. new_headers['If-Modified-Since'] = headers['Last-Modified']
  154. return new_headers
  155. def cache_response(self, request, response, body=None):
  156. """
  157. Algorithm for caching requests.
  158. This assumes a requests Response object.
  159. """
  160. # From httplib2: Don't cache 206's since we aren't going to
  161. # handle byte range requests
  162. if response.status not in [200, 203, 300, 301]:
  163. return
  164. response_headers = CaseInsensitiveDict(response.headers)
  165. cc_req = self.parse_cache_control(request.headers)
  166. cc = self.parse_cache_control(response_headers)
  167. cache_url = self.cache_url(request.url)
  168. # Delete it from the cache if we happen to have it stored there
  169. no_store = cc.get('no-store') or cc_req.get('no-store')
  170. if no_store and self.cache.get(cache_url):
  171. self.cache.delete(cache_url)
  172. # If we've been given an etag, then keep the response
  173. if self.cache_etags and 'etag' in response_headers:
  174. self.cache.set(
  175. cache_url,
  176. self.serializer.dumps(request, response, body=body),
  177. )
  178. # Add to the cache any 301s. We do this before looking that
  179. # the Date headers.
  180. elif response.status == 301:
  181. self.cache.set(
  182. cache_url,
  183. self.serializer.dumps(request, response)
  184. )
  185. # Add to the cache if the response headers demand it. If there
  186. # is no date header then we can't do anything about expiring
  187. # the cache.
  188. elif 'date' in response_headers:
  189. # cache when there is a max-age > 0
  190. if cc and cc.get('max-age'):
  191. if int(cc['max-age']) > 0:
  192. self.cache.set(
  193. cache_url,
  194. self.serializer.dumps(request, response, body=body),
  195. )
  196. # If the request can expire, it means we should cache it
  197. # in the meantime.
  198. elif 'expires' in response_headers:
  199. if response_headers['expires']:
  200. self.cache.set(
  201. cache_url,
  202. self.serializer.dumps(request, response, body=body),
  203. )
  204. def update_cached_response(self, request, response):
  205. """On a 304 we will get a new set of headers that we want to
  206. update our cached value with, assuming we have one.
  207. This should only ever be called when we've sent an ETag and
  208. gotten a 304 as the response.
  209. """
  210. cache_url = self.cache_url(request.url)
  211. cached_response = self.serializer.loads(
  212. request,
  213. self.cache.get(cache_url)
  214. )
  215. if not cached_response:
  216. # we didn't have a cached response
  217. return response
  218. # Lets update our headers with the headers from the new request:
  219. # http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-26#section-4.1
  220. #
  221. # The server isn't supposed to send headers that would make
  222. # the cached body invalid. But... just in case, we'll be sure
  223. # to strip out ones we know that might be problmatic due to
  224. # typical assumptions.
  225. excluded_headers = [
  226. "content-length",
  227. ]
  228. cached_response.headers.update(
  229. dict((k, v) for k, v in response.headers.items()
  230. if k.lower() not in excluded_headers)
  231. )
  232. # we want a 200 b/c we have content via the cache
  233. cached_response.status = 200
  234. # update our cache
  235. self.cache.set(
  236. cache_url,
  237. self.serializer.dumps(request, cached_response),
  238. )
  239. return cached_response