
Commit 6cf025e

chore: Add test server and some top level crawler tests (#517)
### Description

- add a simple `TestServer` for the integration tests
- add three new tests focused on top-level `crawler` usage of `RequestQueue`-related arguments
1 parent a3bb71e commit 6cf025e

File tree: 7 files changed (+539 −100 lines)

pyproject.toml

Lines changed: 2 additions & 0 deletions

@@ -63,6 +63,7 @@ scrapy = ["scrapy>=2.11.0"]
 [dependency-groups]
 dev = [
     "build~=1.3.0",
+    "crawlee[parsel]",
     "dycw-pytest-only>=2.1.1",
     "griffe~=1.9.0",
     "mypy~=1.17.0",
@@ -76,6 +77,7 @@ dev = [
     "respx~=0.22.0",
     "ruff~=0.12.0",
     "setuptools", # setuptools are used by pytest but not explicitly required
+    "uvicorn[standard]",
 ]

 [tool.hatch.build.targets.wheel]

tests/integration/actor_source_base/Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -12,4 +12,4 @@ RUN echo "Python version:" \
     && echo "All installed Python packages:" \
     && pip freeze
 
-CMD ["python3", "-m", "src"]
+CMD ["sh", "-c", "python server.py & python -m src"]
Lines changed: 2 additions & 0 deletions

@@ -1,2 +1,4 @@
 # The test fixture will put the Apify SDK wheel path on the next line
 APIFY_SDK_WHEEL_PLACEHOLDER
+uvicorn[standard]
+crawlee[parsel]
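
These requirements mirror the new dev dependencies above: uvicorn[standard] runs the test server inside the actor image, and crawlee[parsel] provides the ParselCrawler used by the new tests.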
Lines changed: 101 additions & 0 deletions

"""
Test server is infinite server http://localhost:8080/{any_number} and each page has links to the next 10 pages.
For example:
http://localhost:8080/ contains links:
http://localhost:8080/0, http://localhost:8080/1, ..., http://localhost:8080/9

http://localhost:8080/1 contains links:
http://localhost:8080/10, http://localhost:8080/11, ..., http://localhost:8080/19

... and so on.
"""

import asyncio
import logging
from collections.abc import Awaitable, Callable, Coroutine
from socket import socket
from typing import Any

from uvicorn import Config
from uvicorn.server import Server
from yarl import URL

Receive = Callable[[], Awaitable[dict[str, Any]]]
Send = Callable[[dict[str, Any]], Coroutine[None, None, None]]


async def send_html_response(send: Send, html_content: bytes, status: int = 200) -> None:
    """Send an HTML response to the client."""
    await send(
        {
            'type': 'http.response.start',
            'status': status,
            'headers': [[b'content-type', b'text/html; charset=utf-8']],
        }
    )
    await send({'type': 'http.response.body', 'body': html_content})


async def app(scope: dict[str, Any], _: Receive, send: Send) -> None:
    """Main ASGI application handler that routes requests to specific handlers.

    Args:
        scope: The ASGI connection scope.
        _: The ASGI receive function.
        send: The ASGI send function.
    """
    assert scope['type'] == 'http'
    path = scope['path']

    links = '\n'.join(f'<a href="{path}{i}">{path}{i}</a>' for i in range(10))
    await send_html_response(
        send,
        f"""\
<html><head>
<title>Title for {path} </title>
</head>
<body>
{links}
</body></html>""".encode(),
    )


class TestServer(Server):
    """A test HTTP server implementation based on Uvicorn Server."""

    @property
    def url(self) -> URL:
        """Get the base URL of the server.

        Returns:
            A URL instance with the server's base URL.
        """
        protocol = 'https' if self.config.is_ssl else 'http'
        return URL(f'{protocol}://{self.config.host}:{self.config.port}/')

    async def serve(self, sockets: list[socket] | None = None) -> None:
        """Run the server."""
        if sockets:
            raise RuntimeError('Simple TestServer does not support custom sockets')
        self.restart_requested = asyncio.Event()

        loop = asyncio.get_event_loop()
        tasks = {
            loop.create_task(super().serve()),
        }
        await asyncio.wait(tasks)


if __name__ == '__main__':
    asyncio.run(
        TestServer(
            config=Config(
                app=app,
                lifespan='off',
                loop='asyncio',
                port=8080,
                log_config=None,
                log_level=logging.CRITICAL,
            )
        ).serve()
    )
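
A minimal sketch (not part of this commit) of what the generated pages look like, assuming the server above is already running locally on port 8080:

from urllib.request import urlopen

# Fetch the root page from the locally running test server.
html = urlopen('http://localhost:8080/').read().decode('utf-8')

# The root page links to /0 ... /9; page /1 would in turn link to /10 ... /19, and so on.
assert '<a href="/0">/0</a>' in html
assert '<a href="/9">/9</a>' in html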

tests/integration/test_actor_api_helpers.py

Lines changed: 2 additions & 2 deletions

@@ -400,12 +400,12 @@ async def main_server() -> None:
     async with Actor:
 
         class WebhookHandler(BaseHTTPRequestHandler):
-            def do_GET(self) -> None:  # noqa: N802
+            def do_GET(self) -> None:
                 self.send_response(200)
                 self.end_headers()
                 self.wfile.write(bytes('Hello, world!', encoding='utf-8'))
 
-            def do_POST(self) -> None:  # noqa: N802
+            def do_POST(self) -> None:
                 nonlocal webhook_body
                 content_length = self.headers.get('content-length')
                 length = int(content_length) if content_length else 0
Lines changed: 111 additions & 0 deletions

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from .conftest import MakeActorFunction, RunActorFunction


async def test_actor_on_platform_max_crawl_depth(
    make_actor: MakeActorFunction,
    run_actor: RunActorFunction,
) -> None:
    """Test that the actor respects max_crawl_depth."""

    async def main() -> None:
        """The crawler entry point."""
        import re

        from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

        from apify import Actor

        async with Actor:
            crawler = ParselCrawler(max_crawl_depth=2)
            finished = []
            enqueue_pattern = re.compile(r'http://localhost:8080/2+$')

            @crawler.router.default_handler
            async def default_handler(context: ParselCrawlingContext) -> None:
                """Default request handler."""
                context.log.info(f'Processing {context.request.url} ...')
                await context.enqueue_links(include=[enqueue_pattern])
                finished.append(context.request.url)

            await crawler.run(['http://localhost:8080/'])
            assert finished == ['http://localhost:8080/', 'http://localhost:8080/2', 'http://localhost:8080/22']

    actor = await make_actor(label='crawler-max-depth', main_func=main)
    run_result = await run_actor(actor)

    assert run_result.status == 'SUCCEEDED'


async def test_actor_on_platform_max_requests_per_crawl(
    make_actor: MakeActorFunction,
    run_actor: RunActorFunction,
) -> None:
    """Test that the actor respects max_requests_per_crawl."""

    async def main() -> None:
        """The crawler entry point."""
        from crawlee import ConcurrencySettings
        from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

        from apify import Actor

        async with Actor:
            crawler = ParselCrawler(
                max_requests_per_crawl=3, concurrency_settings=ConcurrencySettings(max_concurrency=1)
            )
            finished = []

            @crawler.router.default_handler
            async def default_handler(context: ParselCrawlingContext) -> None:
                """Default request handler."""
                context.log.info(f'Processing {context.request.url} ...')
                await context.enqueue_links()
                finished.append(context.request.url)

            await crawler.run(['http://localhost:8080/'])
            assert len(finished) == 3

    actor = await make_actor(label='crawler-max-requests', main_func=main)
    run_result = await run_actor(actor)

    assert run_result.status == 'SUCCEEDED'


async def test_actor_on_platform_max_request_retries(
    make_actor: MakeActorFunction,
    run_actor: RunActorFunction,
) -> None:
    """Test that the actor respects max_request_retries."""

    async def main() -> None:
        """The crawler entry point."""
        from crawlee.crawlers import BasicCrawlingContext, ParselCrawler, ParselCrawlingContext

        from apify import Actor

        async with Actor:
            max_retries = 3
            crawler = ParselCrawler(max_request_retries=max_retries)
            failed_counter = 0

            @crawler.error_handler
            async def error_handler(_: BasicCrawlingContext, __: Exception) -> None:
                nonlocal failed_counter
                failed_counter += 1

            @crawler.router.default_handler
            async def default_handler(_: ParselCrawlingContext) -> None:
                raise RuntimeError('Some error')

            await crawler.run(['http://localhost:8080/'])
            assert failed_counter == max_retries, f'{failed_counter=}'

    actor = await make_actor(label='crawler-max-retries', main_func=main)
    run_result = await run_actor(actor)

    assert run_result.status == 'SUCCEEDED'
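
All three tests crawl the background test server, so their outcomes are deterministic: with max_crawl_depth=2 and an enqueue pattern of r'http://localhost:8080/2+$' the crawler visits exactly /, /2 and /22; with max_requests_per_crawl=3 and max_concurrency=1 exactly three requests finish; and with max_request_retries=3 the per-request error handler is invoked three times for a start URL whose handler always raises, as the assertions check.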
