Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 35 additions & 23 deletions src/apify/scrapy/_async_thread.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,51 +52,63 @@ def run_coro(
The result returned by the coroutine.

Raises:
RuntimeError: If the event loop is not running.
RuntimeError: If the event loop has been closed.
TimeoutError: If the coroutine does not complete within the timeout.
Exception: Any exception raised during coroutine execution.
"""
if timeout == 'default':
timeout = self._default_timeout

if not self._eventloop.is_running():
raise RuntimeError(f'The coroutine {coro} cannot be executed because the event loop is not running.')
if self._eventloop.is_closed():
raise RuntimeError(f'The coroutine {coro} cannot be executed because the event loop is closed.')

# Submit the coroutine to the event loop running in the other thread.
future = asyncio.run_coroutine_threadsafe(coro, self._eventloop)
try:
# Wait for the coroutine's result until the specified timeout.
return future.result(timeout=timeout.total_seconds())
except futures.TimeoutError as exc:
logger.exception('Coroutine execution timed out.', exc_info=exc)
raise
except Exception as exc:
logger.exception('Coroutine execution raised an exception.', exc_info=exc)
except futures.TimeoutError:
# `future.result` gave up, but the coroutine keeps running on the loop; cancel it so it does
# not outlive the timeout. The propagated error is logged once by the caller (or Scrapy), so
# this method does not log it itself.
future.cancel()
raise

def close(self, timeout: timedelta = timedelta(seconds=60)) -> None:
def close(self, timeout: timedelta | None = None) -> None:
"""Close the event loop and its thread gracefully.

This method cancels all pending tasks, stops the event loop, and waits for the thread to exit.
If the thread does not exit within the given timeout, a forced shutdown is attempted.

Args:
timeout: The maximum number of seconds to wait for the event loop thread to exit.
timeout: The maximum time to wait for the event loop thread to exit. Pass `None` to use the
`default_timeout` passed to the constructor.
"""
if self._eventloop.is_running():
# Cancel all pending tasks in the event loop.
self.run_coro(self._shutdown_tasks())
if timeout is None:
timeout = self._default_timeout

# A repeated close (e.g. a retried shutdown) would call into the already-closed loop and raise
# `RuntimeError: Event loop is closed`. The loop closes itself once it stops, so a second close
# is a no-op.
if self._eventloop.is_closed():
return

# Schedule the event loop to stop.
self._eventloop.call_soon_threadsafe(self._eventloop.stop)
try:
if self._eventloop.is_running():
# Cancel all pending tasks in the event loop, honouring the caller's timeout.
self.run_coro(self._shutdown_tasks(), timeout=timeout)
finally:
# Stop the loop and join its thread even if cancelling the pending tasks above raised or timed
# out. Skipping this would leave the loop running and leak its thread.
self._eventloop.call_soon_threadsafe(self._eventloop.stop)

# Wait for the event loop thread to finish execution.
self._thread.join(timeout=timeout.total_seconds())
# Wait for the event loop thread to finish execution.
self._thread.join(timeout=timeout.total_seconds())

# If the thread is still running after the timeout, force a shutdown.
if self._thread.is_alive():
logger.warning('Event loop thread did not exit cleanly! Forcing shutdown...')
self._force_exit_event_loop()
# If the thread is still running after the timeout, force a shutdown.
if self._thread.is_alive():
logger.warning('Event loop thread did not exit cleanly! Forcing shutdown...')
self._force_exit_event_loop()

def _start_event_loop(self) -> None:
"""Set up and run the asyncio event loop in the dedicated thread."""
Expand Down Expand Up @@ -125,5 +137,5 @@ def _force_exit_event_loop(self) -> None:
logger.info('Forced shutdown of the event loop and its thread...')
self._eventloop.call_soon_threadsafe(self._eventloop.stop)
self._thread.join(timeout=5)
except Exception as exc:
logger.exception('Exception occurred during forced event loop shutdown.', exc_info=exc)
except Exception:
logger.exception('Exception occurred during forced event loop shutdown.')
110 changes: 70 additions & 40 deletions src/apify/scrapy/extensions/_httpcache.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import io
import re
import struct
from datetime import timedelta
from logging import getLogger
from time import time
from typing import TYPE_CHECKING
Expand Down Expand Up @@ -38,6 +39,8 @@ def __init__(self, settings: BaseSettings) -> None:
# Upper bound on how many keys the per-spider-close cleanup sweeps (best-effort; `close_spider`).
self._expiration_max_items: int = settings.getint('APIFY_HTTPCACHE_EXPIRATION_MAX_ITEMS', 100)
self._expiration_secs: int = settings.getint('HTTPCACHE_EXPIRATION_SECS')
self._async_thread_timeout = timedelta(seconds=settings.getint('APIFY_ASYNC_THREAD_TIMEOUT_SECS', 60))
"""Caps how long each coroutine run on the background event loop may take; defaults to 60 seconds."""
self._spider: Spider | None = None
self._kvs: KeyValueStore | None = None
self._fingerprinter: RequestFingerprinterProtocol | None = None
Expand All @@ -62,55 +65,74 @@ async def open_kvs() -> KeyValueStore:
return await KeyValueStore.open(name=kvs_name)

logger.debug("Starting background thread for cache storage's event loop")
self._async_thread = AsyncThread()
self._async_thread = AsyncThread(default_timeout=self._async_thread_timeout)
logger.debug(f"Opening cache storage's {kvs_name!r} key value store")
self._kvs = self._async_thread.run_coro(open_kvs())
try:
self._kvs = self._async_thread.run_coro(open_kvs())
except Exception:
logger.exception('Failed to open the cache key-value store.')
# Opening the key-value store failed, so close the freshly started async thread instead of
# leaking its event-loop thread (`close_spider` may never run if `open_spider` fails). Guard
# the close so a secondary failure here cannot mask the original error.
try:
self._async_thread.close()
except Exception:
logger.exception('Failed to close the async thread after a failed cache storage open.')
raise

def close_spider(self, _: Spider, current_time: int | None = None) -> None:
"""Close the cache storage for a spider."""
if self._async_thread is None:
raise ValueError('Async thread not initialized')

logger.info(f'Cleaning up cache items (max {self._expiration_max_items})')
if self._expiration_secs > 0:
if current_time is None:
current_time = int(time())

async def expire_kvs() -> None:
if self._kvs is None:
raise ValueError('Key value store not initialized')
# Best-effort cleanup: at most `_expiration_max_items` keys per close, in no guaranteed order,
# so stale entries may linger. This only reclaims storage; `retrieve_response` already treats
# an expired entry as a cache miss.
processed = 0
async for item in self._kvs.iterate_keys():
if processed >= self._expiration_max_items:
break
processed += 1
value = await self._kvs.get_value(item.key)
try:
gzip_time = read_gzip_time(value)
except Exception as e:
logger.warning(f'Malformed cache item {item.key}: {e}')
await self._kvs.delete_value(item.key)
else:
if self._expiration_secs < current_time - gzip_time:
logger.debug(f'Expired cache item {item.key}')
# `close` always runs in the `finally`, so neither a cleanup failure below nor an early return can leak
# the event-loop thread.
try:
if self._expiration_secs > 0:
if current_time is None:
current_time = int(time())

async def expire_kvs() -> None:
if self._kvs is None:
raise ValueError('Key value store not initialized')
# Best-effort cleanup: at most `_expiration_max_items` keys per close, in no guaranteed order,
# so stale entries may linger. This only reclaims storage; `retrieve_response` already treats
# an expired entry as a cache miss.
processed = 0
async for item in self._kvs.iterate_keys():
if processed >= self._expiration_max_items:
break
processed += 1
value = await self._kvs.get_value(item.key)
try:
gzip_time = read_gzip_time(value)
except Exception as exc:
logger.warning(f'Malformed cache item {item.key}: {exc}')
await self._kvs.delete_value(item.key)
else:
logger.debug(f'Valid cache item {item.key}')

self._async_thread.run_coro(expire_kvs())

logger.debug('Closing cache storage')
try:
self._async_thread.close()
except KeyboardInterrupt:
logger.warning('Shutdown interrupted by KeyboardInterrupt!')
except Exception:
logger.exception('Exception occurred while shutting down cache storage')
if self._expiration_secs < current_time - gzip_time:
logger.debug(f'Expired cache item {item.key}')
await self._kvs.delete_value(item.key)
else:
logger.debug(f'Valid cache item {item.key}')

# Best-effort: log and swallow a cleanup failure rather than raise. The sweep only reclaims
# storage, so failing it must not turn a normal spider close into an error.
try:
self._async_thread.run_coro(expire_kvs())
except Exception:
logger.exception('Failed to clean up expired cache items.')
finally:
logger.debug('Cache storage closed')
logger.debug('Closing cache storage')
try:
self._async_thread.close()
except KeyboardInterrupt:
logger.warning('Shutdown interrupted by KeyboardInterrupt!')
except Exception:
logger.exception('Exception occurred while shutting down cache storage')
finally:
logger.debug('Cache storage closed')

def retrieve_response(self, _: Spider, request: Request, current_time: int | None = None) -> Response | None:
"""Retrieve a response from the cache storage."""
Expand All @@ -122,7 +144,11 @@ def retrieve_response(self, _: Spider, request: Request, current_time: int | Non
raise ValueError('Request fingerprinter not initialized')

key = self._fingerprinter.fingerprint(request).hex()
value = self._async_thread.run_coro(self._kvs.get_value(key))
try:
value = self._async_thread.run_coro(self._kvs.get_value(key))
except Exception:
logger.exception('Failed to retrieve a response from the cache.')
raise

if value is None:
logger.debug('Cache miss', extra={'request': request})
Expand Down Expand Up @@ -169,7 +195,11 @@ def store_response(self, _: Spider, request: Request, response: Response) -> Non
'body': response.body,
}
value = to_gzip(data)
self._async_thread.run_coro(self._kvs.set_value(key, value))
try:
self._async_thread.run_coro(self._kvs.set_value(key, value))
except Exception:
logger.exception('Failed to store a response in the cache.')
raise


def to_gzip(data: dict, mtime: int | None = None) -> bytes:
Expand Down
4 changes: 2 additions & 2 deletions src/apify/scrapy/requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,8 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ
apify_request = ApifyRequest.from_url(**request_kwargs)
scrapy_request_dict = scrapy_request.to_dict(spider=spider)

except Exception as exc:
logger.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}')
except Exception:
logger.exception(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; skipping it.')
return None

# Serialize the Scrapy request as JSON under 'scrapy_request'. Kept outside the broad except above so
Expand Down
38 changes: 27 additions & 11 deletions src/apify/scrapy/scheduler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from __future__ import annotations

import traceback
from datetime import timedelta
from logging import getLogger
from typing import TYPE_CHECKING

Expand All @@ -15,6 +15,7 @@
from apify.storages import RequestQueue

if TYPE_CHECKING:
from scrapy.crawler import Crawler
from scrapy.http.request import Request
from twisted.internet.defer import Deferred

Expand All @@ -27,7 +28,7 @@ class ApifyScheduler(BaseScheduler):
This scheduler requires the asyncio Twisted reactor to be installed.
"""

def __init__(self) -> None:
def __init__(self, async_thread_timeout: timedelta = timedelta(seconds=60)) -> None:
if not is_asyncio_reactor_installed():
raise ValueError(
f'{ApifyScheduler.__qualname__} requires the asyncio Twisted reactor. '
Expand All @@ -38,7 +39,17 @@ def __init__(self) -> None:
self.spider: Spider | None = None

# A thread with the asyncio event loop to run coroutines on.
self._async_thread = AsyncThread()
self._async_thread = AsyncThread(default_timeout=async_thread_timeout)

@classmethod
def from_crawler(cls, crawler: Crawler) -> ApifyScheduler:
"""Create the scheduler, reading the async-thread timeout from the Scrapy settings.

The `APIFY_ASYNC_THREAD_TIMEOUT_SECS` setting (in seconds) caps how long each coroutine run on the
background event loop may take before timing out; it defaults to 60 seconds.
"""
timeout_secs = crawler.settings.getint('APIFY_ASYNC_THREAD_TIMEOUT_SECS', 60)
return cls(async_thread_timeout=timedelta(seconds=timeout_secs))

def open(self, spider: Spider) -> Deferred[None] | None:
"""Open the scheduler.
Expand All @@ -61,8 +72,13 @@ async def open_rq() -> RequestQueue:
try:
self._rq = self._async_thread.run_coro(open_rq())
except Exception:
self._async_thread.close()
traceback.print_exc()
logger.exception('Failed to open the request queue.')
# Close the freshly started async thread so a failed open does not leak its event-loop thread.
# Guard the close so a secondary failure here cannot mask the original error.
try:
self._async_thread.close()
except Exception:
logger.exception('Failed to close the async thread after a failed scheduler open.')
raise

return None
Expand Down Expand Up @@ -100,7 +116,7 @@ def has_pending_requests(self) -> bool:
try:
is_finished = self._async_thread.run_coro(self._rq.is_finished())
except Exception:
traceback.print_exc()
logger.exception('Failed to check whether the request queue is finished.')
raise

return not is_finished
Expand Down Expand Up @@ -133,7 +149,7 @@ def enqueue_request(self, request: Request) -> bool:
try:
result = self._async_thread.run_coro(self._rq.add_request(apify_request))
except Exception:
traceback.print_exc()
logger.exception('Failed to enqueue the request to the request queue.')
raise

logger.debug(f'rq.add_request result: {result}')
Expand All @@ -152,7 +168,7 @@ def next_request(self) -> Request | None:
try:
apify_request = self._async_thread.run_coro(self._rq.fetch_next_request())
except Exception:
traceback.print_exc()
logger.exception('Failed to fetch the next request from the request queue.')
raise

logger.debug(f'Fetched apify_request: {apify_request}')
Expand All @@ -166,8 +182,8 @@ def next_request(self) -> Request | None:
# the whole run, so on failure it is logged and skipped (None) rather than propagating.
try:
scrapy_request = to_scrapy_request(apify_request, spider=self.spider)
except Exception:
logger.exception(f'Failed to convert Apify request {apify_request} to a Scrapy request; skipping it.')
except Exception as exc:
logger.warning(f'Failed to convert Apify request {apify_request} to a Scrapy request; skipping it: {exc}')
scrapy_request = None

# Mark the request as handled. This runs even when reconstruction failed above: an unrecoverable entry
Expand All @@ -176,7 +192,7 @@ def next_request(self) -> Request | None:
try:
self._async_thread.run_coro(self._rq.mark_request_as_handled(apify_request))
except Exception:
traceback.print_exc()
logger.exception('Failed to mark the request as handled in the request queue.')
raise

if scrapy_request is None:
Expand Down
Loading