|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +import gzip |
| 4 | +import io |
| 5 | +import pickle |
| 6 | +import re |
| 7 | +import struct |
| 8 | +from logging import getLogger |
| 9 | +from time import time |
| 10 | +from typing import TYPE_CHECKING |
| 11 | + |
| 12 | +from scrapy.http.headers import Headers |
| 13 | +from scrapy.responsetypes import responsetypes |
| 14 | + |
| 15 | +from apify import Configuration |
| 16 | +from apify.apify_storage_client import ApifyStorageClient |
| 17 | +from apify.scrapy._async_thread import AsyncThread |
| 18 | +from apify.storages import KeyValueStore |
| 19 | + |
| 20 | +if TYPE_CHECKING: |
| 21 | + from scrapy import Request, Spider |
| 22 | + from scrapy.http.response import Response |
| 23 | + from scrapy.settings import BaseSettings |
| 24 | + from scrapy.utils.request import RequestFingerprinterProtocol |
| 25 | + |
| 26 | +logger = getLogger(__name__) |
| 27 | + |
| 28 | + |
class ApifyCacheStorage:
    """A Scrapy cache storage that uses the Apify `KeyValueStore` to store responses.

    It can be set as a storage for Scrapy's built-in `HttpCacheMiddleware`, which caches
    responses to requests. See HTTPCache middleware settings (prefixed with `HTTPCACHE_`)
    in the Scrapy documentation for more information. Requires the asyncio Twisted reactor
    to be installed.
    """

    def __init__(self, settings: BaseSettings) -> None:
        # Upper bound on how many cached items are inspected during cleanup on close.
        self._expiration_max_items: int = 100
        # Items older than this many seconds are expired; values <= 0 disable expiration.
        self._expiration_secs: int = settings.getint('HTTPCACHE_EXPIRATION_SECS')
        self._spider: Spider | None = None
        self._kvs: KeyValueStore | None = None
        self._fingerprinter: RequestFingerprinterProtocol | None = None
        # Background thread running an asyncio event loop; the async Apify storage
        # client is driven from there because Scrapy's cache storage API is synchronous.
        self._async_thread: AsyncThread | None = None

    def open_spider(self, spider: Spider) -> None:
        """Open the cache storage for a spider.

        Starts the background event-loop thread and opens the spider-specific
        key-value store that backs the cache.
        """
        logger.debug('Using Apify key value cache storage', extra={'spider': spider})
        self._spider = spider
        self._fingerprinter = spider.crawler.request_fingerprinter
        kvs_name = get_kvs_name(spider.name)

        async def open_kvs() -> KeyValueStore:
            config = Configuration.get_global_configuration()
            if config.is_at_home:
                # On the Apify platform, construct the platform storage client explicitly.
                storage_client = ApifyStorageClient.from_config(config)
                return await KeyValueStore.open(name=kvs_name, storage_client=storage_client)
            return await KeyValueStore.open(name=kvs_name)

        logger.debug("Starting background thread for cache storage's event loop")
        self._async_thread = AsyncThread()
        logger.debug(f"Opening cache storage's {kvs_name!r} key value store")
        self._kvs = self._async_thread.run_coro(open_kvs())

    def close_spider(self, _: Spider, current_time: int | None = None) -> None:
        """Close the cache storage for a spider.

        If expiration is enabled, first deletes up to `_expiration_max_items`
        expired or malformed cache items, then shuts down the background thread.

        Args:
            _: The spider (unused, required by the Scrapy storage interface).
            current_time: Override of "now" as a Unix timestamp, for testing.

        Raises:
            ValueError: If the storage was never opened.
        """
        if self._async_thread is None:
            raise ValueError('Async thread not initialized')

        if self._expiration_secs > 0:
            # Only log (and run) the cleanup when expiration is actually enabled;
            # previously this was logged unconditionally, which was misleading.
            logger.info(f'Cleaning up cache items (max {self._expiration_max_items})')
            if current_time is None:
                current_time = int(time())

            async def expire_kvs() -> None:
                if self._kvs is None:
                    raise ValueError('Key value store not initialized')
                # Inspect at most `_expiration_max_items` keys so shutdown stays bounded.
                # The counter is checked before processing, fixing an off-by-one that
                # previously inspected one item more than the configured maximum.
                inspected = 0
                async for item in self._kvs.iterate_keys():
                    if inspected >= self._expiration_max_items:
                        break
                    inspected += 1
                    value = await self._kvs.get_value(item.key)
                    try:
                        gzip_time = read_gzip_time(value)
                    except Exception as e:
                        # Unreadable items can never be served; drop them
                        # (setting the value to None deletes the record).
                        logger.warning(f'Malformed cache item {item.key}: {e}')
                        await self._kvs.set_value(item.key, None)
                    else:
                        if self._expiration_secs < current_time - gzip_time:
                            logger.debug(f'Expired cache item {item.key}')
                            await self._kvs.set_value(item.key, None)
                        else:
                            logger.debug(f'Valid cache item {item.key}')

            self._async_thread.run_coro(expire_kvs())

        logger.debug('Closing cache storage')
        try:
            self._async_thread.close()
        except KeyboardInterrupt:
            logger.warning('Shutdown interrupted by KeyboardInterrupt!')
        except Exception:
            logger.exception('Exception occurred while shutting down cache storage')
        finally:
            logger.debug('Cache storage closed')

    def retrieve_response(self, _: Spider, request: Request, current_time: int | None = None) -> Response | None:
        """Retrieve a response from the cache storage.

        Args:
            _: The spider (unused, required by the Scrapy storage interface).
            request: The request whose cached response should be looked up.
            current_time: Override of "now" as a Unix timestamp, for testing.

        Returns:
            The cached response, or None on a cache miss, an expired item,
            or a malformed cache entry.

        Raises:
            ValueError: If the storage was never opened.
        """
        if self._async_thread is None:
            raise ValueError('Async thread not initialized')
        if self._kvs is None:
            raise ValueError('Key value store not initialized')
        if self._fingerprinter is None:
            raise ValueError('Request fingerprinter not initialized')

        key = self._fingerprinter.fingerprint(request).hex()
        value = self._async_thread.run_coro(self._kvs.get_value(key))

        if value is None:
            logger.debug('Cache miss', extra={'request': request})
            return None

        if current_time is None:
            current_time = int(time())

        try:
            if 0 < self._expiration_secs < current_time - read_gzip_time(value):
                logger.debug('Cache expired', extra={'request': request})
                return None
            data = from_gzip(value)
        except Exception as e:
            # A corrupted cache entry must not crash the crawl; treat it as a miss.
            # The cleanup in `close_spider` removes such items eventually.
            logger.warning(f'Malformed cache item {key}: {e}')
            return None

        url = data['url']
        status = data['status']
        headers = Headers(data['headers'])
        body = data['body']
        respcls = responsetypes.from_args(headers=headers, url=url, body=body)

        logger.debug('Cache hit', extra={'request': request})
        return respcls(url=url, headers=headers, status=status, body=body)

    def store_response(self, _: Spider, request: Request, response: Response) -> None:
        """Store a response in the cache storage.

        The response is pickled, gzip-compressed (the gzip mtime doubles as the
        cache timestamp), and stored under the request's fingerprint.

        Raises:
            ValueError: If the storage was never opened.
        """
        if self._async_thread is None:
            raise ValueError('Async thread not initialized')
        if self._kvs is None:
            raise ValueError('Key value store not initialized')
        if self._fingerprinter is None:
            raise ValueError('Request fingerprinter not initialized')

        key = self._fingerprinter.fingerprint(request).hex()
        data = {
            'status': response.status,
            'url': response.url,
            'headers': dict(response.headers),
            'body': response.body,
        }
        value = to_gzip(data)
        self._async_thread.run_coro(self._kvs.set_value(key, value))
| 158 | + |
| 159 | + |
| 160 | +def to_gzip(data: dict, mtime: int | None = None) -> bytes: |
| 161 | + """Dump a dictionary to a gzip-compressed byte stream.""" |
| 162 | + with io.BytesIO() as byte_stream: |
| 163 | + with gzip.GzipFile(fileobj=byte_stream, mode='wb', mtime=mtime) as gzip_file: |
| 164 | + pickle.dump(data, gzip_file, protocol=4) |
| 165 | + return byte_stream.getvalue() |
| 166 | + |
| 167 | + |
def from_gzip(gzip_bytes: bytes) -> dict:
    """Deserialize a dictionary from gzip-compressed pickle bytes.

    NOTE: unpickling is only safe here because the cache is written by this
    same process; never feed untrusted data to this function.
    """
    with gzip.GzipFile(fileobj=io.BytesIO(gzip_bytes), mode='rb') as archive:
        payload: dict = pickle.loads(archive.read())
    return payload
| 173 | + |
| 174 | + |
def read_gzip_time(gzip_bytes: bytes) -> int:
    """Read the modification time from a gzip-compressed byte stream without decompressing the data.

    Raises:
        struct.error: If fewer than 10 header bytes are available.
    """
    # RFC 1952: the 10-byte member header is ID1 ID2 | CM | FLG | MTIME (4 bytes, LE) | XFL | OS.
    _magic, _method, _flags, mtime, _xfl, _os = struct.unpack('<HBBI2B', gzip_bytes[:10])
    return mtime
| 181 | + |
| 182 | + |
def get_kvs_name(spider_name: str, max_length: int = 60) -> str:
    """Get the key value store name for a spider.

    The key value store name is derived from the spider name by replacing all special characters
    with hyphens and trimming leading and trailing hyphens. The resulting name is prefixed with
    'httpcache-' and truncated to the maximum length.

    The documentation
    [about storages](https://docs.apify.com/platform/storage/usage#named-and-unnamed-storages)
    mentions that names can be up to 63 characters long, so the default max length is set to 60.

    Such naming isn't unique per spider, but should be sufficiently unique for most use cases.
    The name of the key value store should indicate to which spider it belongs, e.g. in
    the listing in the Apify's console.

    Args:
        spider_name: Value of the Spider instance's name attribute.
        max_length: Maximum length of the key value store name.

    Returns: Key value store name.

    Raises:
        ValueError: If the spider name contains only special characters.
    """
    # A run of any non-alphanumeric characters collapses into one hyphen,
    # then leading/trailing hyphens are trimmed away.
    slug = re.sub(r'[^A-Za-z0-9]+', '-', spider_name).strip('-')
    if not slug:
        raise ValueError(f'Unsupported spider name: {spider_name!r} (slug: {slug!r})')
    return f'httpcache-{slug}'[:max_length]
0 commit comments