Skip to content
4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@
"@types/tough-cookie": "^4.0.5",
"@types/ws": "^8.5.12",
"commitlint": "^19.3.0",
"crawlee": "^3.13.0",
"crawlee": "^3.13.5",
"eslint": "^9.23.0",
"eslint-config-prettier": "^10.1.1",
"fs-extra": "^11.2.0",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import type { Input } from './consts.js';
import { ProxyRotation } from './consts.js';

const SESSION_STORE_NAME = 'APIFY-PLAYWRIGHT-SCRAPER-SESSION-STORE';
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';

const { META_KEY, DEVTOOLS_TIMEOUT_SECS, SESSION_MAX_USAGE_COUNTS } =
scraperToolsConstants;
Expand Down Expand Up @@ -76,7 +77,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
requestQueueName?: string;

crawler!: PlaywrightCrawler;
requestList!: RequestList;
dataset!: Dataset;
pagesOutputted!: number;
private initPromise: Promise<void>;
Expand Down Expand Up @@ -167,7 +167,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {

// Initialize async operations.
this.crawler = null!;
this.requestList = null!;
this.requestQueue = null!;
this.dataset = null!;
this.keyValueStore = null!;
Expand All @@ -182,21 +181,46 @@ export class CrawlerSetup implements CrawlerSetupOptions {
return req;
});

this.requestList = await RequestList.open(
'PLAYWRIGHT_SCRAPER',
startUrls,
);
// KeyValueStore
this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);

// RequestQueue
this.requestQueue = await RequestQueueV2.open(this.requestQueueName);

if (
!(await this.keyValueStore.recordExists(
REQUEST_QUEUE_INIT_FLAG_KEY,
))
) {
const requests: Request[] = [];
for await (const request of await RequestList.open(
null,
startUrls,
)) {
if (
this.input.maxResultsPerCrawl > 0 &&
requests.length >= 1.5 * this.input.maxResultsPerCrawl
) {
break;
}
requests.push(request);
}

const { waitForAllRequestsToBeAdded } =
await this.requestQueue.addRequestsBatched(requests);

void waitForAllRequestsToBeAdded.then(async () => {
await this.keyValueStore.setValue(
REQUEST_QUEUE_INIT_FLAG_KEY,
'1',
);
});
}

// Dataset
this.dataset = await Dataset.open(this.datasetName);
const info = await this.dataset.getInfo();
this.pagesOutputted = info?.itemCount ?? 0;

// KeyValueStore
this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
}

/**
Expand All @@ -207,7 +231,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {

const options: PlaywrightCrawlerOptions = {
requestHandler: this._requestHandler.bind(this),
requestList: this.requestList,
requestQueue: this.requestQueue,
requestHandlerTimeoutSecs: this.devtools
? DEVTOOLS_TIMEOUT_SECS
Expand Down
2 changes: 1 addition & 1 deletion packages/actor-scraper/cheerio-scraper/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"type": "module",
"dependencies": {
"@apify/scraper-tools": "^1.1.4",
"@crawlee/cheerio": "^3.13.2",
"@crawlee/cheerio": "^3.13.4",
"apify": "^3.2.6"
},
"devDependencies": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ const SCHEMA = JSON.parse(

const MAX_EVENT_LOOP_OVERLOADED_RATIO = 0.9;
const SESSION_STORE_NAME = 'APIFY-CHEERIO-SCRAPER-SESSION-STORE';
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';

/**
* Holds all the information necessary for constructing a crawler
Expand All @@ -70,7 +71,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
requestQueueName?: string;

crawler!: CheerioCrawler;
requestList!: RequestList;
dataset!: Dataset;
pagesOutputted!: number;
proxyConfiguration?: ProxyConfiguration;
Expand Down Expand Up @@ -151,7 +151,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {

// Initialize async operations.
this.crawler = null!;
this.requestList = null!;
this.requestQueue = null!;
this.dataset = null!;
this.keyValueStore = null!;
Expand All @@ -167,19 +166,47 @@ export class CrawlerSetup implements CrawlerSetupOptions {
return req;
});

this.requestList = await RequestList.open('CHEERIO_SCRAPER', startUrls);
// KeyValueStore
this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);

// RequestQueue
this.requestQueue = await RequestQueueV2.open(this.requestQueueName);

if (
!(await this.keyValueStore.recordExists(
REQUEST_QUEUE_INIT_FLAG_KEY,
))
) {
const requests: Request[] = [];
for await (const request of await RequestList.open(
null,
startUrls,
)) {
if (
this.input.maxResultsPerCrawl > 0 &&
requests.length >= 1.5 * this.input.maxResultsPerCrawl
) {
break;
}
requests.push(request);
}

const { waitForAllRequestsToBeAdded } =
await this.requestQueue.addRequestsBatched(requests);

void waitForAllRequestsToBeAdded.then(async () => {
await this.keyValueStore.setValue(
REQUEST_QUEUE_INIT_FLAG_KEY,
'1',
);
});
}

// Dataset
this.dataset = await Dataset.open(this.datasetName);
const info = await this.dataset.getInfo();
this.pagesOutputted = info?.itemCount ?? 0;

// KeyValueStore
this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);

// Proxy configuration
this.proxyConfiguration = (await Actor.createProxyConfiguration(
this.input.proxyConfiguration,
Expand All @@ -197,7 +224,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
requestHandler: this._requestHandler.bind(this),
preNavigationHooks: [],
postNavigationHooks: [],
requestList: this.requestList,
requestQueue: this.requestQueue,
navigationTimeoutSecs: this.input.pageLoadTimeoutSecs,
requestHandlerTimeoutSecs: this.input.pageFunctionTimeoutSecs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ const SCHEMA = JSON.parse(

const MAX_EVENT_LOOP_OVERLOADED_RATIO = 0.9;
const SESSION_STORE_NAME = 'APIFY-JSDOM-SCRAPER-SESSION-STORE';
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';

/**
* Holds all the information necessary for constructing a crawler
Expand All @@ -69,7 +70,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
requestQueueName?: string;

crawler!: JSDOMCrawler;
requestList!: RequestList;
dataset!: Dataset;
pagesOutputted!: number;
proxyConfiguration?: ProxyConfiguration;
Expand Down Expand Up @@ -150,7 +150,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {

// Initialize async operations.
this.crawler = null!;
this.requestList = null!;
this.requestQueue = null!;
this.dataset = null!;
this.keyValueStore = null!;
Expand All @@ -166,19 +165,47 @@ export class CrawlerSetup implements CrawlerSetupOptions {
return req;
});

this.requestList = await RequestList.open('JSDOM_SCRAPER', startUrls);
// KeyValueStore
this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);

// RequestQueue
this.requestQueue = await RequestQueueV2.open(this.requestQueueName);

if (
!(await this.keyValueStore.recordExists(
REQUEST_QUEUE_INIT_FLAG_KEY,
))
) {
const requests: Request[] = [];
for await (const request of await RequestList.open(
null,
startUrls,
)) {
if (
this.input.maxResultsPerCrawl > 0 &&
requests.length >= 1.5 * this.input.maxResultsPerCrawl
) {
break;
}
requests.push(request);
}

const { waitForAllRequestsToBeAdded } =
await this.requestQueue.addRequestsBatched(requests);

void waitForAllRequestsToBeAdded.then(async () => {
await this.keyValueStore.setValue(
REQUEST_QUEUE_INIT_FLAG_KEY,
'1',
);
});
}

// Dataset
this.dataset = await Dataset.open(this.datasetName);
const info = await this.dataset.getInfo();
this.pagesOutputted = info?.itemCount ?? 0;

// KeyValueStore
this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);

// Proxy configuration
this.proxyConfiguration = (await Actor.createProxyConfiguration(
this.input.proxyConfiguration,
Expand All @@ -198,7 +225,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
runScripts: this.input.runScripts ?? true,
hideInternalConsole: !(this.input.showInternalConsole ?? false),
postNavigationHooks: [],
requestList: this.requestList,
requestQueue: this.requestQueue,
navigationTimeoutSecs: this.input.pageLoadTimeoutSecs,
requestHandlerTimeoutSecs: this.input.pageFunctionTimeoutSecs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ import type { Input } from './consts.js';
import { ProxyRotation } from './consts.js';

const SESSION_STORE_NAME = 'APIFY-PLAYWRIGHT-SCRAPER-SESSION-STORE';
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';

const {
META_KEY,
Expand Down Expand Up @@ -80,7 +81,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {
requestQueueName?: string;

crawler!: PlaywrightCrawler;
requestList!: RequestList;
dataset!: Dataset;
pagesOutputted!: number;
private initPromise: Promise<void>;
Expand Down Expand Up @@ -198,7 +198,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {

// Initialize async operations.
this.crawler = null!;
this.requestList = null!;
this.requestQueue = null!;
this.dataset = null!;
this.keyValueStore = null!;
Expand All @@ -213,21 +212,46 @@ export class CrawlerSetup implements CrawlerSetupOptions {
return req;
});

this.requestList = await RequestList.open(
'PLAYWRIGHT_SCRAPER',
startUrls,
);
// KeyValueStore
this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);

// RequestQueue
this.requestQueue = await RequestQueueV2.open(this.requestQueueName);

if (
!(await this.keyValueStore.recordExists(
REQUEST_QUEUE_INIT_FLAG_KEY,
))
) {
const requests: Request[] = [];
for await (const request of await RequestList.open(
null,
startUrls,
)) {
if (
this.input.maxResultsPerCrawl > 0 &&
requests.length >= 1.5 * this.input.maxResultsPerCrawl
) {
break;
}
requests.push(request);
}

const { waitForAllRequestsToBeAdded } =
await this.requestQueue.addRequestsBatched(requests);

void waitForAllRequestsToBeAdded.then(async () => {
await this.keyValueStore.setValue(
REQUEST_QUEUE_INIT_FLAG_KEY,
'1',
);
});
}

// Dataset
this.dataset = await Dataset.open(this.datasetName);
const info = await this.dataset.getInfo();
this.pagesOutputted = info?.itemCount ?? 0;

// KeyValueStore
this.keyValueStore = await KeyValueStore.open(this.keyValueStoreName);
}

/**
Expand All @@ -241,7 +265,6 @@ export class CrawlerSetup implements CrawlerSetupOptions {

const options: PlaywrightCrawlerOptions = {
requestHandler: this._requestHandler.bind(this),
requestList: this.requestList,
requestQueue: this.requestQueue,
requestHandlerTimeoutSecs: this.devtools
? DEVTOOLS_TIMEOUT_SECS
Expand Down
Loading
Loading