
Commit aff8475

chore: add respectRobotsTxtFile to generic scrapers (#378)

1 parent 96a34f3 commit aff8475
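The commit exposes Crawlee's crawler-level respectRobotsTxtFile option as an actor input across the generic scrapers, and bumps the Crawlee dependencies to the 3.13 line where that option is available. A minimal, illustrative sketch of the underlying Crawlee option follows; the crawler class, handler, and URL are examples, not code from this commit:

// Illustrative only: shows the Crawlee option that the new actor input forwards to.
import { CheerioCrawler } from '@crawlee/cheerio';

const crawler = new CheerioCrawler({
    // When true, URLs disallowed by the target site's robots.txt are skipped.
    respectRobotsTxtFile: true,
    async requestHandler({ request, $ }) {
        console.log(`Scraped ${request.url}: ${$('title').text()}`);
    },
});

await crawler.run(['https://example.com']);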

File tree: 25 files changed (+67, -13 lines)

packages/actor-scraper/camoufox-scraper/Dockerfile

Lines changed: 2 additions & 2 deletions

@@ -1,4 +1,4 @@
-FROM apify/actor-node-playwright-firefox:20 AS builder
+FROM apify/actor-node-playwright-firefox:22 AS builder

 COPY --chown=myuser package*.json ./

@@ -8,7 +8,7 @@ COPY --chown=myuser . ./

 RUN npm run build

-FROM apify/actor-node-playwright-firefox:20
+FROM apify/actor-node-playwright-firefox:22

 COPY --from=builder --chown=myuser /home/myuser/dist ./dist

packages/actor-scraper/camoufox-scraper/INPUT_SCHEMA.json

Lines changed: 7 additions & 0 deletions

@@ -58,6 +58,13 @@
         "description": "URL fragments (the parts of URL after a <code>#</code>) are not considered when the scraper determines whether a URL has already been visited. This means that when adding URLs such as <code>https://example.com/#foo</code> and <code>https://example.com/#bar</code>, only the first will be visited. Turn this option on to tell the scraper to visit both.",
         "default": false
     },
+    "respectRobotsTxtFile": {
+        "title": "Respect the robots.txt file",
+        "type": "boolean",
+        "description": "If enabled, the crawler will consult the robots.txt file for the target website before crawling each page. At the moment, the crawler does not use any specific user agent identifier. The crawl-delay directive is also not supported yet.",
+        "default": false,
+        "prefill": true
+    },
     "pageFunction": {
         "title": "Page function",
         "type": "string",

packages/actor-scraper/camoufox-scraper/package.json

Lines changed: 3 additions & 3 deletions

@@ -6,9 +6,9 @@
     "type": "module",
     "dependencies": {
         "@apify/scraper-tools": "^1.1.4",
-        "@crawlee/core": "^3.11.5",
-        "@crawlee/playwright": "^3.11.5",
-        "@crawlee/utils": "^3.11.5",
+        "@crawlee/core": "^3.13.2",
+        "@crawlee/playwright": "^3.13.2",
+        "@crawlee/utils": "^3.13.2",
         "apify": "^3.2.6",
         "camoufox-js": "^0.3.0",
         "idcac-playwright": "^0.1.3",

packages/actor-scraper/camoufox-scraper/src/internals/consts.ts

Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ export interface Input {
     excludes: GlobInput[];
     linkSelector?: string;
     keepUrlFragments: boolean;
+    respectRobotsTxtFile: boolean;
     pageFunction: string;
     preNavigationHooks?: string;
     postNavigationHooks?: string;

packages/actor-scraper/camoufox-scraper/src/internals/crawler_setup.ts

Lines changed: 1 addition & 0 deletions

@@ -215,6 +215,7 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             preNavigationHooks: [],
             postNavigationHooks: [],
             failedRequestHandler: this._failedRequestHandler.bind(this),
+            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
             maxConcurrency: this.input.maxConcurrency,
             maxRequestRetries: this.input.maxRequestRetries,
             maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
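The CrawlerSetup change above is the only wiring each scraper needs: the flag read from the actor input is forwarded directly into the crawler options. A condensed, illustrative sketch of that pattern (not the actual CrawlerSetup code, which configures many more options) might look like this:

// Simplified sketch of the input-to-crawler wiring used by each crawler_setup.ts change.
import { Actor } from 'apify';
import { PlaywrightCrawler } from '@crawlee/playwright';

await Actor.init();

// respectRobotsTxtFile is the new boolean field declared in consts.ts / INPUT_SCHEMA.json.
const input = (await Actor.getInput<{ respectRobotsTxtFile: boolean }>())!;

const crawler = new PlaywrightCrawler({
    respectRobotsTxtFile: input.respectRobotsTxtFile,
    async requestHandler({ request }) {
        console.log(`Visited ${request.url}`);
    },
});

await crawler.run(['https://example.com']);
await Actor.exit();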

packages/actor-scraper/cheerio-scraper/INPUT_SCHEMA.json

Lines changed: 7 additions & 0 deletions

@@ -19,6 +19,13 @@
         "default": false,
         "groupCaption": "Options"
     },
+    "respectRobotsTxtFile": {
+        "title": "Respect the robots.txt file",
+        "type": "boolean",
+        "description": "If enabled, the crawler will consult the robots.txt file for the target website before crawling each page. At the moment, the crawler does not use any specific user agent identifier. The crawl-delay directive is also not supported yet.",
+        "default": false,
+        "prefill": true
+    },
     "globs": {
         "title": "Glob Patterns",
         "type": "array",

packages/actor-scraper/cheerio-scraper/package.json

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@
     "type": "module",
     "dependencies": {
         "@apify/scraper-tools": "^1.1.4",
-        "@crawlee/cheerio": "^3.11.5",
+        "@crawlee/cheerio": "^3.13.2",
         "apify": "^3.2.6"
     },
     "devDependencies": {

packages/actor-scraper/cheerio-scraper/src/internals/consts.ts

Lines changed: 1 addition & 0 deletions

@@ -25,6 +25,7 @@ export interface Input {
     excludes: GlobInput[];
     pseudoUrls: PseudoUrlInput[];
     keepUrlFragments: boolean;
+    respectRobotsTxtFile: boolean;
     linkSelector?: string;
     pageFunction: string;
     preNavigationHooks?: string;

packages/actor-scraper/cheerio-scraper/src/internals/crawler_setup.ts

Lines changed: 1 addition & 0 deletions

@@ -203,6 +203,7 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             requestHandlerTimeoutSecs: this.input.pageFunctionTimeoutSecs,
             ignoreSslErrors: this.input.ignoreSslErrors,
             failedRequestHandler: this._failedRequestHandler.bind(this),
+            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
             maxRequestRetries: this.input.maxRequestRetries,
             maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
             additionalMimeTypes: this.input.additionalMimeTypes,

packages/actor-scraper/jsdom-scraper/INPUT_SCHEMA.json

Lines changed: 7 additions & 0 deletions

@@ -19,6 +19,13 @@
         "default": false,
         "groupCaption": "Options"
     },
+    "respectRobotsTxtFile": {
+        "title": "Respect the robots.txt file",
+        "type": "boolean",
+        "description": "If enabled, the crawler will consult the robots.txt file for the target website before crawling each page. At the moment, the crawler does not use any specific user agent identifier. The crawl-delay directive is also not supported yet.",
+        "default": false,
+        "prefill": true
+    },
     "globs": {
         "title": "Glob Patterns",
         "type": "array",

packages/actor-scraper/jsdom-scraper/package.json

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@
     "type": "module",
     "dependencies": {
         "@apify/scraper-tools": "^1.1.4",
-        "@crawlee/jsdom": "^3.11.5",
+        "@crawlee/jsdom": "^3.13.2",
         "apify": "^3.2.6"
     },
     "devDependencies": {

packages/actor-scraper/jsdom-scraper/src/internals/consts.ts

Lines changed: 1 addition & 0 deletions

@@ -25,6 +25,7 @@ export interface Input {
     pseudoUrls: PseudoUrlInput[];
     excludes: GlobInput[];
     keepUrlFragments: boolean;
+    respectRobotsTxtFile: boolean;
     runScripts: boolean;
     showInternalConsole: boolean;
     linkSelector?: string;

packages/actor-scraper/jsdom-scraper/src/internals/crawler_setup.ts

Lines changed: 1 addition & 0 deletions

@@ -204,6 +204,7 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             requestHandlerTimeoutSecs: this.input.pageFunctionTimeoutSecs,
             ignoreSslErrors: this.input.ignoreSslErrors,
             failedRequestHandler: this._failedRequestHandler.bind(this),
+            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
             maxRequestRetries: this.input.maxRequestRetries,
             maxRequestsPerCrawl: this.input.maxPagesPerCrawl,
             additionalMimeTypes: this.input.additionalMimeTypes,

packages/actor-scraper/playwright-scraper/INPUT_SCHEMA.json

Lines changed: 7 additions & 0 deletions

@@ -57,6 +57,13 @@
         "description": "URL fragments (the parts of URL after a <code>#</code>) are not considered when the scraper determines whether a URL has already been visited. This means that when adding URLs such as <code>https://example.com/#foo</code> and <code>https://example.com/#bar</code>, only the first will be visited. Turn this option on to tell the scraper to visit both.",
         "default": false
     },
+    "respectRobotsTxtFile": {
+        "title": "Respect the robots.txt file",
+        "type": "boolean",
+        "description": "If enabled, the crawler will consult the robots.txt file for the target website before crawling each page. At the moment, the crawler does not use any specific user agent identifier. The crawl-delay directive is also not supported yet.",
+        "default": false,
+        "prefill": true
+    },
     "pageFunction": {
         "title": "Page function",
         "type": "string",

packages/actor-scraper/playwright-scraper/package.json

Lines changed: 3 additions & 3 deletions

@@ -6,9 +6,9 @@
     "type": "module",
     "dependencies": {
         "@apify/scraper-tools": "^1.1.4",
-        "@crawlee/core": "^3.11.5",
-        "@crawlee/playwright": "^3.11.5",
-        "@crawlee/utils": "^3.11.5",
+        "@crawlee/core": "^3.13.2",
+        "@crawlee/playwright": "^3.13.2",
+        "@crawlee/utils": "^3.13.2",
         "apify": "^3.2.6",
         "idcac-playwright": "^0.1.3",
         "playwright": "*"

packages/actor-scraper/playwright-scraper/src/internals/consts.ts

Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ export interface Input {
     excludes: GlobInput[];
     linkSelector?: string;
     keepUrlFragments: boolean;
+    respectRobotsTxtFile: boolean;
     pageFunction: string;
     preNavigationHooks?: string;
     postNavigationHooks?: string;

packages/actor-scraper/playwright-scraper/src/internals/crawler_setup.ts

Lines changed: 1 addition & 0 deletions

@@ -249,6 +249,7 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             preNavigationHooks: [],
             postNavigationHooks: [],
             failedRequestHandler: this._failedRequestHandler.bind(this),
+            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
             maxConcurrency: this.input.maxConcurrency,
             maxRequestRetries: this.input.maxRequestRetries,
             maxRequestsPerCrawl: this.input.maxPagesPerCrawl,

packages/actor-scraper/puppeteer-scraper/INPUT_SCHEMA.json

Lines changed: 7 additions & 0 deletions

@@ -63,6 +63,13 @@
         "description": "URL fragments (the parts of URL after a <code>#</code>) are not considered when the scraper determines whether a URL has already been visited. This means that when adding URLs such as <code>https://example.com/#foo</code> and <code>https://example.com/#bar</code>, only the first will be visited. Turn this option on to tell the scraper to visit both.",
         "default": false
     },
+    "respectRobotsTxtFile": {
+        "title": "Respect the robots.txt file",
+        "type": "boolean",
+        "description": "If enabled, the crawler will consult the robots.txt file for the target website before crawling each page. At the moment, the crawler does not use any specific user agent identifier. The crawl-delay directive is also not supported yet.",
+        "default": false,
+        "prefill": true
+    },
     "pageFunction": {
         "title": "Page function",
         "type": "string",

packages/actor-scraper/puppeteer-scraper/package.json

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,7 @@
     "type": "module",
     "dependencies": {
         "@apify/scraper-tools": "^1.1.4",
-        "@crawlee/puppeteer": "^3.11.5",
+        "@crawlee/puppeteer": "^3.13.2",
         "apify": "^3.2.6",
         "idcac-playwright": "^0.1.3",
         "puppeteer": "*"

packages/actor-scraper/puppeteer-scraper/src/internals/consts.ts

Lines changed: 1 addition & 0 deletions

@@ -28,6 +28,7 @@ export interface Input {
     linkSelector?: string;
     clickableElementsSelector?: string;
     keepUrlFragments: boolean;
+    respectRobotsTxtFile: boolean;
     pageFunction: string;
     preNavigationHooks?: string;
     postNavigationHooks?: string;

packages/actor-scraper/puppeteer-scraper/src/internals/crawler_setup.ts

Lines changed: 1 addition & 0 deletions

@@ -246,6 +246,7 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             preNavigationHooks: [],
             postNavigationHooks: [],
             failedRequestHandler: this._failedRequestHandler.bind(this),
+            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
             maxConcurrency: this.input.maxConcurrency,
             maxRequestRetries: this.input.maxRequestRetries,
             maxRequestsPerCrawl: this.input.maxPagesPerCrawl,

packages/actor-scraper/web-scraper/INPUT_SCHEMA.json

Lines changed: 7 additions & 0 deletions

@@ -28,6 +28,13 @@
         "default": false,
         "groupCaption": "Options"
     },
+    "respectRobotsTxtFile": {
+        "title": "Respect the robots.txt file",
+        "type": "boolean",
+        "description": "If enabled, the crawler will consult the robots.txt file for the target website before crawling each page. At the moment, the crawler does not use any specific user agent identifier. The crawl-delay directive is also not supported yet.",
+        "default": false,
+        "prefill": true
+    },
     "linkSelector": {
         "title": "Link selector",
         "type": "string",

packages/actor-scraper/web-scraper/package.json

Lines changed: 2 additions & 2 deletions

@@ -7,10 +7,10 @@
     "type": "module",
     "dependencies": {
         "@apify/scraper-tools": "^1.1.4",
-        "@crawlee/puppeteer": "^3.11.5",
+        "@crawlee/puppeteer": "^3.13.2",
         "apify": "^3.2.6",
         "content-type": "^1.0.5",
-        "crawlee": "^3.11.5",
+        "crawlee": "^3.13.2",
         "devtools-server": "^0.0.2",
         "idcac-playwright": "^0.1.3",
         "puppeteer": "*"

packages/actor-scraper/web-scraper/src/internals/consts.ts

Lines changed: 1 addition & 0 deletions

@@ -52,6 +52,7 @@ export interface Input {
     pageFunction: string;
     runMode: RunMode;
     keepUrlFragments: boolean;
+    respectRobotsTxtFile: boolean;
     linkSelector?: string;
     globs: GlobInput[];
     regexps: RegExpInput[];

packages/actor-scraper/web-scraper/src/internals/crawler_setup.ts

Lines changed: 1 addition & 0 deletions

@@ -270,6 +270,7 @@ export class CrawlerSetup implements CrawlerSetupOptions {
             preNavigationHooks: [],
             postNavigationHooks: [],
             failedRequestHandler: this._failedRequestHandler.bind(this),
+            respectRobotsTxtFile: this.input.respectRobotsTxtFile,
             maxConcurrency: this.isDevRun
                 ? MAX_CONCURRENCY_IN_DEVELOPMENT
                 : this.input.maxConcurrency,
