Commit 3388d13
fix(scrapers): Use RequestQueue directly to avoid excessive RQ writes on runs with large startUrls list (#393)
- Affects all generic scrapers; closes #392.
- The change means we skip the RQ+RL tandem behavior from BasicCrawler.
- An alternative solution would be to initialize the tandem lazily, but this would be tricky due to 1) the amount of non-trivial code that depends on requestQueue being present and 2) the input containing the requestQueueName option, which we'd also need to take into account.

### Caveats

- With a large `startUrls` list, there will be a lot of RQ writes billed at the start of the Actor run; this should not be a huge deal.
- However, the same batch of writes would otherwise repeat on every resume or migration, so the queue initialization is guarded by a `REQUEST_QUEUE_INITIALIZED` flag record in the key-value store, ensuring the `startUrls` are only inserted once.
1 parent 1b29cb3 commit 3388d13
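All four crawler setups apply the same change, repeated in the diffs below. Here is a condensed, standalone sketch of the pattern (the class plumbing and input validation are elided; the `initStartUrls` helper name, the use of the default unnamed stores, and the `startUrls` shape are illustrative, assuming the crawlee exports used in the diffs):

```ts
import {
    KeyValueStore,
    type Request,
    RequestList,
    RequestQueueV2,
} from 'crawlee';

const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';

// Seed startUrls straight into the request queue, but only once: a flag
// record in the key-value store marks completion, so a resumed or
// migrated run skips the (billed) queue writes entirely.
async function initStartUrls(
    startUrls: { url: string }[],
    maxResultsPerCrawl: number,
): Promise<RequestQueueV2> {
    const keyValueStore = await KeyValueStore.open();
    const requestQueue = await RequestQueueV2.open();

    if (!(await keyValueStore.recordExists(REQUEST_QUEUE_INIT_FLAG_KEY))) {
        // A nameless RequestList only expands the input into Request
        // objects; it is never handed to the crawler.
        const requests: Request[] = [];
        for await (const request of await RequestList.open(null, startUrls)) {
            // Don't enqueue far more than the crawl could ever consume.
            if (
                maxResultsPerCrawl > 0 &&
                requests.length >= 1.5 * maxResultsPerCrawl
            ) {
                break;
            }
            requests.push(request);
        }

        const { waitForAllRequestsToBeAdded } =
            await requestQueue.addRequestsBatched(requests);

        // Flip the flag only after every batch has been persisted.
        void waitForAllRequestsToBeAdded.then(async () => {
            await keyValueStore.setValue(REQUEST_QUEUE_INIT_FLAG_KEY, '1');
        });
    }

    return requestQueue;
}
```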

9 files changed (+202, −55 lines)


package-lock.json

Lines changed: 2 additions & 2 deletions
Generated file; diff not rendered by default.

package.json

Lines changed: 1 addition & 1 deletion
```diff
@@ -77,7 +77,7 @@
     "@types/tough-cookie": "^4.0.5",
     "@types/ws": "^8.5.12",
     "commitlint": "^19.3.0",
-    "crawlee": "^3.13.0",
+    "crawlee": "^3.13.5",
     "eslint": "^9.23.0",
     "eslint-config-prettier": "^10.1.1",
     "fs-extra": "^11.2.0",
```

packages/actor-scraper/camoufox-scraper/src/internals/crawler_setup.ts

Lines changed: 33 additions & 10 deletions
```diff
@@ -43,6 +43,7 @@ import type { Input } from './consts.js';
 import { ProxyRotation } from './consts.js';
 
 const SESSION_STORE_NAME = 'APIFY-PLAYWRIGHT-SCRAPER-SESSION-STORE';
+const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';
 
 const { META_KEY, DEVTOOLS_TIMEOUT_SECS, SESSION_MAX_USAGE_COUNTS } =
     scraperToolsConstants;
@@ -76,7 +77,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
     requestQueueName?: string;
 
     crawler!: PlaywrightCrawler;
-    requestList!: RequestList;
     dataset!: Dataset;
     pagesOutputted!: number;
     private initPromise: Promise<void>;
@@ -167,7 +167,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
 
         // Initialize async operations.
         this.crawler = null!;
-        this.requestList = null!;
         this.requestQueue = null!;
         this.dataset = null!;
         this.keyValueStore = null!;
@@ -182,21 +181,46 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             return req;
         });
 
-        this.requestList = await RequestList.open(
-            'PLAYWRIGHT_SCRAPER',
-            startUrls,
-        );
+        // KeyValueStore
+        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
 
         // RequestQueue
         this.requestQueue = await RequestQueueV2.open(this.requestQueueName);
 
+        if (
+            !(await this.keyValueStore.recordExists(
+                REQUEST_QUEUE_INIT_FLAG_KEY,
+            ))
+        ) {
+            const requests: Request[] = [];
+            for await (const request of await RequestList.open(
+                null,
+                startUrls,
+            )) {
+                if (
+                    this.input.maxResultsPerCrawl > 0 &&
+                    requests.length >= 1.5 * this.input.maxResultsPerCrawl
+                ) {
+                    break;
+                }
+                requests.push(request);
+            }
+
+            const { waitForAllRequestsToBeAdded } =
+                await this.requestQueue.addRequestsBatched(requests);
+
+            void waitForAllRequestsToBeAdded.then(async () => {
+                await this.keyValueStore.setValue(
+                    REQUEST_QUEUE_INIT_FLAG_KEY,
+                    '1',
+                );
+            });
+        }
+
         // Dataset
         this.dataset = await Dataset.open(this.datasetName);
         const info = await this.dataset.getInfo();
         this.pagesOutputted = info?.itemCount ?? 0;
-
-        // KeyValueStore
-        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
     }
 
     /**
@@ -207,7 +231,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
 
         const options: PlaywrightCrawlerOptions = {
             requestHandler: this._requestHandler.bind(this),
-            requestList: this.requestList,
             requestQueue: this.requestQueue,
             requestHandlerTimeoutSecs: this.devtools
                 ? DEVTOOLS_TIMEOUT_SECS
```
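A note on the ordering in the large hunk above: `addRequestsBatched()` persists an initial batch before returning and writes the remainder in the background, and the init flag is set only once that background work finishes. A minimal sketch of those semantics, assuming crawlee's documented return shape (`addedRequests`, `waitForAllRequestsToBeAdded`):

```ts
import { KeyValueStore, RequestQueueV2 } from 'crawlee';

const queue = await RequestQueueV2.open();
const store = await KeyValueStore.open();

// `addedRequests` covers what was persisted before the call returned;
// `waitForAllRequestsToBeAdded` resolves after the background batches.
const { addedRequests, waitForAllRequestsToBeAdded } =
    await queue.addRequestsBatched([{ url: 'https://example.com' }]);
console.log(`persisted up front: ${addedRequests.length}`);

// Writing the flag only in this continuation means an interrupted run
// finds no flag on resume and simply re-inserts; duplicates are folded
// away by the queue's uniqueKey deduplication, so no URLs are lost.
void waitForAllRequestsToBeAdded.then(async () => {
    await store.setValue('REQUEST_QUEUE_INITIALIZED', '1');
});
```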

packages/actor-scraper/cheerio-scraper/package.json

Lines changed: 1 addition & 1 deletion
```diff
@@ -6,7 +6,7 @@
   "type": "module",
   "dependencies": {
     "@apify/scraper-tools": "^1.1.4",
-    "@crawlee/cheerio": "^3.13.2",
+    "@crawlee/cheerio": "^3.13.4",
     "apify": "^3.2.6"
   },
   "devDependencies": {
```

packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts

Lines changed: 33 additions & 7 deletions
```diff
@@ -44,6 +44,7 @@ const SCHEMA = JSON.parse(
 
 const MAX_EVENT_LOOP_OVERLOADED_RATIO = 0.9;
 const SESSION_STORE_NAME = 'APIFY-CHEERIO-SCRAPER-SESSION-STORE';
+const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';
 
 /**
  * Holds all the information necessary for constructing a crawler
@@ -70,7 +71,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
     requestQueueName?: string;
 
     crawler!: CheerioCrawler;
-    requestList!: RequestList;
     dataset!: Dataset;
     pagesOutputted!: number;
     proxyConfiguration?: ProxyConfiguration;
@@ -151,7 +151,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
 
         // Initialize async operations.
         this.crawler = null!;
-        this.requestList = null!;
         this.requestQueue = null!;
         this.dataset = null!;
         this.keyValueStore = null!;
@@ -167,19 +166,47 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             return req;
         });
 
-        this.requestList = await RequestList.open('CHEERIO_SCRAPER', startUrls);
+        // KeyValueStore
+        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
 
         // RequestQueue
         this.requestQueue = await RequestQueueV2.open(this.requestQueueName);
 
+        if (
+            !(await this.keyValueStore.recordExists(
+                REQUEST_QUEUE_INIT_FLAG_KEY,
+            ))
+        ) {
+            const requests: Request[] = [];
+            for await (const request of await RequestList.open(
+                null,
+                startUrls,
+            )) {
+                if (
+                    this.input.maxResultsPerCrawl > 0 &&
+                    requests.length >= 1.5 * this.input.maxResultsPerCrawl
+                ) {
+                    break;
+                }
+                requests.push(request);
+            }
+
+            const { waitForAllRequestsToBeAdded } =
+                await this.requestQueue.addRequestsBatched(requests);
+
+            void waitForAllRequestsToBeAdded.then(async () => {
+                await this.keyValueStore.setValue(
+                    REQUEST_QUEUE_INIT_FLAG_KEY,
+                    '1',
+                );
+            });
+        }
+
         // Dataset
         this.dataset = await Dataset.open(this.datasetName);
         const info = await this.dataset.getInfo();
         this.pagesOutputted = info?.itemCount ?? 0;
 
-        // KeyValueStore
-        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
-
         // Proxy configuration
         this.proxyConfiguration = (await Actor.createProxyConfiguration(
             this.input.proxyConfiguration,
@@ -197,7 +224,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             requestHandler: this._requestHandler.bind(this),
             preNavigationHooks: [],
             postNavigationHooks: [],
-            requestList: this.requestList,
             requestQueue: this.requestQueue,
             navigationTimeoutSecs: this.input.pageLoadTimeoutSecs,
             requestHandlerTimeoutSecs: this.input.pageFunctionTimeoutSecs,
```
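The `1.5 * maxResultsPerCrawl` guard in these hunks bounds the up-front billing. As a worked example: with `maxResultsPerCrawl` set to 1,000 and a 100,000-entry `startUrls` list, at most 1,500 requests are written to the queue instead of 100,000; the extra 50% presumably leaves headroom for requests that fail or are filtered before producing a result. When `maxResultsPerCrawl` is 0 (no limit), the whole list is enqueued.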

packages/actor-scraper/jsdom-scraper/src/internals/crawler_setup.ts

Lines changed: 33 additions & 7 deletions
```diff
@@ -43,6 +43,7 @@ const SCHEMA = JSON.parse(
 
 const MAX_EVENT_LOOP_OVERLOADED_RATIO = 0.9;
 const SESSION_STORE_NAME = 'APIFY-JSDOM-SCRAPER-SESSION-STORE';
+const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';
 
 /**
  * Holds all the information necessary for constructing a crawler
@@ -69,7 +70,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
     requestQueueName?: string;
 
     crawler!: JSDOMCrawler;
-    requestList!: RequestList;
     dataset!: Dataset;
     pagesOutputted!: number;
     proxyConfiguration?: ProxyConfiguration;
@@ -150,7 +150,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
 
         // Initialize async operations.
         this.crawler = null!;
-        this.requestList = null!;
         this.requestQueue = null!;
         this.dataset = null!;
         this.keyValueStore = null!;
@@ -166,19 +165,47 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             return req;
         });
 
-        this.requestList = await RequestList.open('JSDOM_SCRAPER', startUrls);
+        // KeyValueStore
+        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
 
         // RequestQueue
        this.requestQueue = await RequestQueueV2.open(this.requestQueueName);
 
+        if (
+            !(await this.keyValueStore.recordExists(
+                REQUEST_QUEUE_INIT_FLAG_KEY,
+            ))
+        ) {
+            const requests: Request[] = [];
+            for await (const request of await RequestList.open(
+                null,
+                startUrls,
+            )) {
+                if (
+                    this.input.maxResultsPerCrawl > 0 &&
+                    requests.length >= 1.5 * this.input.maxResultsPerCrawl
+                ) {
+                    break;
+                }
+                requests.push(request);
+            }
+
+            const { waitForAllRequestsToBeAdded } =
+                await this.requestQueue.addRequestsBatched(requests);
+
+            void waitForAllRequestsToBeAdded.then(async () => {
+                await this.keyValueStore.setValue(
+                    REQUEST_QUEUE_INIT_FLAG_KEY,
+                    '1',
+                );
+            });
+        }
+
         // Dataset
         this.dataset = await Dataset.open(this.datasetName);
         const info = await this.dataset.getInfo();
         this.pagesOutputted = info?.itemCount ?? 0;
 
-        // KeyValueStore
-        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
-
         // Proxy configuration
         this.proxyConfiguration = (await Actor.createProxyConfiguration(
             this.input.proxyConfiguration,
@@ -198,7 +225,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             runScripts: this.input.runScripts ?? true,
             hideInternalConsole: !(this.input.showInternalConsole ?? false),
             postNavigationHooks: [],
-            requestList: this.requestList,
             requestQueue: this.requestQueue,
             navigationTimeoutSecs: this.input.pageLoadTimeoutSecs,
             requestHandlerTimeoutSecs: this.input.pageFunctionTimeoutSecs,
```
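Also common to all four hunks: `RequestList.open(null, startUrls)` is now used purely to expand the input into Request objects (including `requestsFromUrl` sources fetched from remote URL lists); passing `null` instead of a list name appears to skip the state persistence a named RequestList would do, which fits here since the list is discarded as soon as the queue is seeded. A small illustration, with a hypothetical input shape:

```ts
import { RequestList } from 'crawlee';

// Hypothetical startUrls input: a direct URL plus a remote URL list.
const startUrls = [
    { url: 'https://example.com' },
    { requestsFromUrl: 'https://example.com/urls.txt' },
];

// A nameless (null) RequestList acts as a one-shot expander: iterate it
// to obtain Request objects, then throw it away.
for await (const request of await RequestList.open(null, startUrls)) {
    console.log(request.url);
}
```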

packages/actor-scraper/playwright-scraper/src/internals/crawler_setup.ts

Lines changed: 33 additions & 10 deletions
```diff
@@ -42,6 +42,7 @@ import type { Input } from './consts.js';
 import { ProxyRotation } from './consts.js';
 
 const SESSION_STORE_NAME = 'APIFY-PLAYWRIGHT-SCRAPER-SESSION-STORE';
+const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';
 
 const {
     META_KEY,
@@ -80,7 +81,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
     requestQueueName?: string;
 
     crawler!: PlaywrightCrawler;
-    requestList!: RequestList;
     dataset!: Dataset;
     pagesOutputted!: number;
     private initPromise: Promise<void>;
@@ -198,7 +198,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
 
         // Initialize async operations.
         this.crawler = null!;
-        this.requestList = null!;
         this.requestQueue = null!;
         this.dataset = null!;
         this.keyValueStore = null!;
@@ -213,21 +212,46 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             return req;
         });
 
-        this.requestList = await RequestList.open(
-            'PLAYWRIGHT_SCRAPER',
-            startUrls,
-        );
+        // KeyValueStore
+        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
 
         // RequestQueue
         this.requestQueue = await RequestQueueV2.open(this.requestQueueName);
 
+        if (
+            !(await this.keyValueStore.recordExists(
+                REQUEST_QUEUE_INIT_FLAG_KEY,
+            ))
+        ) {
+            const requests: Request[] = [];
+            for await (const request of await RequestList.open(
+                null,
+                startUrls,
+            )) {
+                if (
+                    this.input.maxResultsPerCrawl > 0 &&
+                    requests.length >= 1.5 * this.input.maxResultsPerCrawl
+                ) {
+                    break;
+                }
+                requests.push(request);
+            }
+
+            const { waitForAllRequestsToBeAdded } =
+                await this.requestQueue.addRequestsBatched(requests);
+
+            void waitForAllRequestsToBeAdded.then(async () => {
+                await this.keyValueStore.setValue(
+                    REQUEST_QUEUE_INIT_FLAG_KEY,
+                    '1',
+                );
+            });
+        }
+
         // Dataset
         this.dataset = await Dataset.open(this.datasetName);
         const info = await this.dataset.getInfo();
         this.pagesOutputted = info?.itemCount ?? 0;
-
-        // KeyValueStore
-        this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
     }
 
     /**
@@ -241,7 +265,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
 
         const options: PlaywrightCrawlerOptions = {
             requestHandler: this._requestHandler.bind(this),
-            requestList: this.requestList,
             requestQueue: this.requestQueue,
             requestHandlerTimeoutSecs: this.devtools
                 ? DEVTOOLS_TIMEOUT_SECS
```
