
Commit aff8475

chore: add respectRobotsTxtFile to generic scrapers (#378)

1 parent 96a34f3 commit aff8475
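The commit exposes Crawlee's crawler-level respectRobotsTxtFile option as an actor input across the generic scrapers, and bumps the Crawlee dependencies to the 3.13 line where that option is available. A minimal, illustrative sketch of the underlying Crawlee option follows; the crawler class, handler, and URL are examples, not code from this commit:

// Illustrative only: shows the Crawlee option that the new actor input forwards to.
import { CheerioCrawler } from '@crawlee/cheerio';

const crawler = new CheerioCrawler({
    // When true, URLs disallowed by the target site's robots.txt are skipped.
    respectRobotsTxtFile: true,
    async requestHandler({ request, $ }) {
        console.log(`Scraped ${request.url}: ${$('title').text()}`);
    },
});

await crawler.run(['https://example.com']);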

File tree: 25 files changed (+67, -13 lines)

packages/actor-scraper/camoufox-scraper/Dockerfile

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-FROM apify/actor-node-playwright-firefox:20 AS builder
+FROM apify/actor-node-playwright-firefox:22 AS builder

 COPY --chown=myuser package*.json ./

@@ -8,7 +8,7 @@ COPY --chown=myuser . ./

 RUN npm run build

-FROM apify/actor-node-playwright-firefox:20
+FROM apify/actor-node-playwright-firefox:22

 COPY --from=builder --chown=myuser /home/myuser/dist ./dist

packages/actor-scraper/camoufox-scraper/INPUT_SCHEMA.json

Lines changed: 7 additions & 0 deletions

@@ -58,6 +58,13 @@
         "description": "URL fragments (the parts of URL after a <code>#</code>) are not considered when the scraper determines whether a URL has already been visited. This means that when adding URLs such as <code>https://example.com/#foo</code> and <code>https://example.com/#bar</code>, only the first will be visited. Turn this option on to tell the scraper to visit both.",
         "default": false
     },
+    "respectRobotsTxtFile": {
+        "title": "Respect the robots.txt file",
+        "type": "boolean",
+        "description": "If enabled, the crawler will consult the robots.txt file for the target website before crawling each page. At the moment, the crawler does not use any specific user agent identifier. The crawl-delay directive is also not supported yet.",
+        "default": false,
+        "prefill": true
+    },
     "pageFunction": {
         "title": "Page function",
         "type": "string",

packages/actor-scraper/camoufox-scraper/package.json

Lines changed: 3 additions & 3 deletions

@@ -6,9 +6,9 @@
     "type": "module",
     "dependencies": {
         "@apify/scraper-tools": "^1.1.4",
-        "@crawlee/core": "^3.11.5",
-        "@crawlee/playwright": "^3.11.5",
-        "@crawlee/utils": "^3.11.5",
+        "@crawlee/core": "^3.13.2",
+        "@crawlee/playwright": "^3.13.2",
+        "@crawlee/utils": "^3.13.2",
         "apify": "^3.2.6",
         "camoufox-js": "^0.3.0",
         "idcac-playwright": "^0.1.3",

packages/actor-scraper/camoufox-scraper/src/internals/consts.ts

Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ export interface Input {
     excludes: GlobInput[];
     linkSelector?: string;
     keepUrlFragments: boolean;
+    respectRobotsTxtFile: boolean;
     pageFunction: string;
     preNavigationHooks?: string;
     postNavigationHooks?: string;

packages/actor-scraper/camoufox-scraper/src/internals/crawler_setup.ts

Lines changed: 1 addition & 0 deletions

@@ -215,6 +215,7 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             preNavigationHooks: [],
             postNavigationHooks: [],
             failedRequestHandler: this._failedRequestHandler.bind(this),
+            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
             maxConcurrency: this.input.maxConcurrency,
             maxRequestRetries: this.input.maxRequestRetries,
             maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
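The CrawlerSetup change above is the only wiring each scraper needs: the flag read from the actor input is forwarded directly into the crawler options. A condensed, illustrative sketch of that pattern (not the actual CrawlerSetup code, which configures many more options) might look like this:

// Simplified sketch of the input-to-crawler wiring used by each crawler_setup.ts change.
import { Actor } from 'apify';
import { PlaywrightCrawler } from '@crawlee/playwright';

await Actor.init();

// respectRobotsTxtFile is the new boolean field declared in consts.ts / INPUT_SCHEMA.json.
const input = (await Actor.getInput<{ respectRobotsTxtFile: boolean }>())!;

const crawler = new PlaywrightCrawler({
    respectRobotsTxtFile: input.respectRobotsTxtFile,
    async requestHandler({ request }) {
        console.log(`Visited ${request.url}`);
    },
});

await crawler.run(['https://example.com']);
await Actor.exit();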

packages/actor-scraper/cheerio-scraper/INPUT_SCHEMA.json

Lines changed: 7 additions & 0 deletions

@@ -19,6 +19,13 @@
         "default": false,
         "groupCaption": "Options"
     },
+    "respectRobotsTxtFile": {
+        "title": "Respect the robots.txt file",
+        "type": "boolean",
+        "description": "If enabled, the crawler will consult the robots.txt file for the target website before crawling each page. At the moment, the crawler does not use any specific user agent identifier. The crawl-delay directive is also not supported yet.",
+        "default": false,
+        "prefill": true
+    },
     "globs": {
         "title": "Glob Patterns",
         "type": "array",

packages/actor-scraper/cheerio-scraper/package.json

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@
     "type": "module",
     "dependencies": {
         "@apify/scraper-tools": "^1.1.4",
-        "@crawlee/cheerio": "^3.11.5",
+        "@crawlee/cheerio": "^3.13.2",
         "apify": "^3.2.6"
     },
     "devDependencies": {

packages/actor-scraper/cheerio-scraper/src/internals/consts.ts

Lines changed: 1 addition & 0 deletions

@@ -25,6 +25,7 @@ export interface Input {
     excludes: GlobInput[];
     pseudoUrls: PseudoUrlInput[];
     keepUrlFragments: boolean;
+    respectRobotsTxtFile: boolean;
     linkSelector?: string;
     pageFunction: string;
     preNavigationHooks?: string;

packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts

Lines changed: 1 addition & 0 deletions

@@ -203,6 +203,7 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             requestHandlerTimeoutSecs: this.input.pageFunctionTimeoutSecs,
             ignoreSslErrors: this.input.ignoreSslErrors,
             failedRequestHandler: this._failedRequestHandler.bind(this),
+            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
             maxRequestRetries: this.input.maxRequestRetries,
             maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
             additionalMimeTypes: this.input.additionalMimeTypes,

packages/actor-scraper/jsdom-scraper/INPUT_SCHEMA.json

Lines changed: 7 additions & 0 deletions

@@ -19,6 +19,13 @@
         "default": false,
         "groupCaption": "Options"
     },
+    "respectRobotsTxtFile": {
+        "title": "Respect the robots.txt file",
+        "type": "boolean",
+        "description": "If enabled, the crawler will consult the robots.txt file for the target website before crawling each page. At the moment, the crawler does not use any specific user agent identifier. The crawl-delay directive is also not supported yet.",
+        "default": false,
+        "prefill": true
+    },
     "globs": {
         "title": "Glob Patterns",
         "type": "array",

packages/actor-scraper/jsdom-scraper/package.json

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@
     "type": "module",
     "dependencies": {
         "@apify/scraper-tools": "^1.1.4",
-        "@crawlee/jsdom": "^3.11.5",
+        "@crawlee/jsdom": "^3.13.2",
         "apify": "^3.2.6"
     },
     "devDependencies": {

packages/actor-scraper/jsdom-scraper/src/internals/consts.ts

Lines changed: 1 addition & 0 deletions

@@ -25,6 +25,7 @@ export interface Input {
     pseudoUrls: PseudoUrlInput[];
     excludes: GlobInput[];
     keepUrlFragments: boolean;
+    respectRobotsTxtFile: boolean;
     runScripts: boolean;
     showInternalConsole: boolean;
     linkSelector?: string;

packages/actor-scraper/jsdom-scraper/src/internals/crawler_setup.ts

Lines changed: 1 addition & 0 deletions

@@ -204,6 +204,7 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             requestHandlerTimeoutSecs: this.input.pageFunctionTimeoutSecs,
             ignoreSslErrors: this.input.ignoreSslErrors,
             failedRequestHandler: this._failedRequestHandler.bind(this),
+            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
             maxRequestRetries: this.input.maxRequestRetries,
             maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
             additionalMimeTypes: this.input.additionalMimeTypes,

packages/actor-scraper/playwright-scraper/INPUT_SCHEMA.json

Lines changed: 7 additions & 0 deletions

@@ -57,6 +57,13 @@
         "description": "URL fragments (the parts of URL after a <code>#</code>) are not considered when the scraper determines whether a URL has already been visited. This means that when adding URLs such as <code>https://example.com/#foo</code> and <code>https://example.com/#bar</code>, only the first will be visited. Turn this option on to tell the scraper to visit both.",
         "default": false
     },
+    "respectRobotsTxtFile": {
+        "title": "Respect the robots.txt file",
+        "type": "boolean",
+        "description": "If enabled, the crawler will consult the robots.txt file for the target website before crawling each page. At the moment, the crawler does not use any specific user agent identifier. The crawl-delay directive is also not supported yet.",
+        "default": false,
+        "prefill": true
+    },
     "pageFunction": {
         "title": "Page function",
         "type": "string",

packages/actor-scraper/playwright-scraper/package.json

Lines changed: 3 additions & 3 deletions

@@ -6,9 +6,9 @@
     "type": "module",
     "dependencies": {
         "@apify/scraper-tools": "^1.1.4",
-        "@crawlee/core": "^3.11.5",
-        "@crawlee/playwright": "^3.11.5",
-        "@crawlee/utils": "^3.11.5",
+        "@crawlee/core": "^3.13.2",
+        "@crawlee/playwright": "^3.13.2",
+        "@crawlee/utils": "^3.13.2",
         "apify": "^3.2.6",
         "idcac-playwright": "^0.1.3",
         "playwright": "*"

packages/actor-scraper/playwright-scraper/src/internals/consts.ts

Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ export interface Input {
     excludes: GlobInput[];
     linkSelector?: string;
     keepUrlFragments: boolean;
+    respectRobotsTxtFile: boolean;
     pageFunction: string;
     preNavigationHooks?: string;
     postNavigationHooks?: string;

packages/actor-scraper/playwright-scraper/src/internals/crawler_setup.ts

Lines changed: 1 addition & 0 deletions

@@ -249,6 +249,7 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             preNavigationHooks: [],
             postNavigationHooks: [],
             failedRequestHandler: this._failedRequestHandler.bind(this),
+            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
             maxConcurrency: this.input.maxConcurrency,
             maxRequestRetries: this.input.maxRequestRetries,
             maxRequestsPerCrawl: this.input.maxPagesPerCrawl,

packages/actor-scraper/puppeteer-scraper/INPUT_SCHEMA.json

Lines changed: 7 additions & 0 deletions

@@ -63,6 +63,13 @@
         "description": "URL fragments (the parts of URL after a <code>#</code>) are not considered when the scraper determines whether a URL has already been visited. This means that when adding URLs such as <code>https://example.com/#foo</code> and <code>https://example.com/#bar</code>, only the first will be visited. Turn this option on to tell the scraper to visit both.",
         "default": false
     },
+    "respectRobotsTxtFile": {
+        "title": "Respect the robots.txt file",
+        "type": "boolean",
+        "description": "If enabled, the crawler will consult the robots.txt file for the target website before crawling each page. At the moment, the crawler does not use any specific user agent identifier. The crawl-delay directive is also not supported yet.",
+        "default": false,
+        "prefill": true
+    },
     "pageFunction": {
         "title": "Page function",
         "type": "string",

packages/actor-scraper/puppeteer-scraper/package.json

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@
     "type": "module",
     "dependencies": {
         "@apify/scraper-tools": "^1.1.4",
-        "@crawlee/puppeteer": "^3.11.5",
+        "@crawlee/puppeteer": "^3.13.2",
         "apify": "^3.2.6",
         "idcac-playwright": "^0.1.3",
         "puppeteer": "*"

packages/actor-scraper/puppeteer-scraper/src/internals/consts.ts

Lines changed: 1 addition & 0 deletions

@@ -28,6 +28,7 @@ export interface Input {
     linkSelector?: string;
     clickableElementsSelector?: string;
     keepUrlFragments: boolean;
+    respectRobotsTxtFile: boolean;
     pageFunction: string;
     preNavigationHooks?: string;
     postNavigationHooks?: string;

packages/actor-scraper/puppeteer-scraper/src/internals/crawler_setup.ts

Lines changed: 1 addition & 0 deletions

@@ -246,6 +246,7 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             preNavigationHooks: [],
             postNavigationHooks: [],
             failedRequestHandler: this._failedRequestHandler.bind(this),
+            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
             maxConcurrency: this.input.maxConcurrency,
             maxRequestRetries: this.input.maxRequestRetries,
             maxRequestsPerCrawl: this.input.maxPagesPerCrawl,

packages/actor-scraper/web-scraper/INPUT_SCHEMA.json

Lines changed: 7 additions & 0 deletions

@@ -28,6 +28,13 @@
         "default": false,
         "groupCaption": "Options"
     },
+    "respectRobotsTxtFile": {
+        "title": "Respect the robots.txt file",
+        "type": "boolean",
+        "description": "If enabled, the crawler will consult the robots.txt file for the target website before crawling each page. At the moment, the crawler does not use any specific user agent identifier. The crawl-delay directive is also not supported yet.",
+        "default": false,
+        "prefill": true
+    },
     "linkSelector": {
         "title": "Link selector",
         "type": "string",

packages/actor-scraper/web-scraper/package.json

Lines changed: 2 additions & 2 deletions

@@ -7,10 +7,10 @@
     "type": "module",
     "dependencies": {
         "@apify/scraper-tools": "^1.1.4",
-        "@crawlee/puppeteer": "^3.11.5",
+        "@crawlee/puppeteer": "^3.13.2",
         "apify": "^3.2.6",
         "content-type": "^1.0.5",
-        "crawlee": "^3.11.5",
+        "crawlee": "^3.13.2",
         "devtools-server": "^0.0.2",
         "idcac-playwright": "^0.1.3",
         "puppeteer": "*"

packages/actor-scraper/web-scraper/src/internals/consts.ts

Lines changed: 1 addition & 0 deletions

@@ -52,6 +52,7 @@ export interface Input {
     pageFunction: string;
     runMode: RunMode;
     keepUrlFragments: boolean;
+    respectRobotsTxtFile: boolean;
     linkSelector?: string;
     globs: GlobInput[];
     regexps: RegExpInput[];

packages/actor-scraper/web-scraper/src/internals/crawler_setup.ts

Lines changed: 1 addition & 0 deletions

@@ -270,6 +270,7 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             preNavigationHooks: [],
             postNavigationHooks: [],
             failedRequestHandler: this._failedRequestHandler.bind(this),
+            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
             maxConcurrency: this.isDevRun
                 ? MAX_CONCURRENCY_IN_DEVELOPMENT
                 : this.input.maxConcurrency,
