5 changes: 5 additions & 0 deletions README.md
@@ -25,6 +25,11 @@ Developed at <a href="https://www.informatik.hu-berlin.de/en/forschung-en/gebiet

---

***Disclaimer**: Although we try to indicate whether or not a publisher has explicitly objected to the training of AI models on its data, this information must be verified independently before a publisher's content is used for such purposes.
More details can be found [here](docs/5_advanced_topics.md#filtering-publishers-for-ai-training).*


---
Fundus is:

* **A static news crawler.**
8 changes: 8 additions & 0 deletions docs/5_advanced_topics.md
@@ -4,6 +4,7 @@
* [How to search for publishers](#how-to-search-for-publishers)
* [Using `search()`](#using-search)
* [Working with deprecated publishers](#working-with-deprecated-publishers)
* [Filtering publishers for AI training](#filtering-publishers-for-ai-training)

# Advanced Topics

@@ -33,4 +34,11 @@ When we notice that a publisher is uncrawlable for whatever reason, we will mark it as deprecated.
This mostly has internal uses, since the default value of the `Crawler`'s `ignore_deprecated` flag is `False`.
You can alter this behaviour by setting the `ignore_deprecated` flag when initializing the `Crawler`.

## Filtering publishers for AI training

Some publishers explicitly disallow the use of their content for AI training purposes.
We _try_ to respect these wishes by introducing the `skip_publishers_disallowing_training` parameter in the `crawl()` function.
Users intending to use Fundus to gather training data for AI models should set this parameter to `True` to avoid collecting articles from publishers that do not want their content used in this way.
However, since publishers are not required to state this in their robots.txt file, users should additionally check the terms of use of the publishers they want to crawl and set the `disallows_training` attribute of the corresponding `Publisher` accordingly.
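
For illustration, a minimal sketch of how this could look; the publisher collection and article limit are arbitrary choices:

```python
from fundus import Crawler, PublisherCollection

crawler = Crawler(PublisherCollection.us)

# Skip publishers that indicate, via robots.txt or the curated
# `disallows_training` flag, that their content must not be used for AI training.
for article in crawler.crawl(max_articles=10, skip_publishers_disallowing_training=True):
    print(article)
```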

In the [next section](6_logging.md) we introduce you to Fundus logging mechanics.
77 changes: 68 additions & 9 deletions src/fundus/publishers/base_objects.py
@@ -1,6 +1,6 @@
from collections import defaultdict
from textwrap import indent
from typing import Dict, Iterator, List, Optional, Set, Type, Union
from typing import Dict, Iterable, Iterator, List, Optional, Set, Type, Union
from urllib.robotparser import RobotFileParser
from warnings import warn

@@ -11,7 +11,7 @@
from fundus.logging import create_logger
from fundus.parser.base_parser import ParserProxy
from fundus.scraping.filter import URLFilter
from fundus.scraping.session import session_handler
from fundus.scraping.session import RequestInterruptedError, session_handler
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap, URLSource
from fundus.utils.iteration import iterate_all_subclasses

@@ -27,43 +27,93 @@ class CustomRobotFileParser(RobotFileParser):
This is in order to avoid 403 errors when fetching the robots.txt file.
"""

_disallow_training_keywords: Set[str] = {
"machine",
"learning",
"training",
"train",
"model",
"models",
"artificial",
"intelligence",
"large",
"language",
"llm",
"llms",
}

def __init__(self, url: str, headers: Optional[Dict[str, str]] = None):
self.headers = headers
self.disallows_training: bool = False
self.url = url
super().__init__(url)

# noinspection PyAttributeOutsideInit
def read(self, headers: Optional[Dict[str, str]] = None) -> None:
def read(self) -> None:
"""Reads the robots.txt URL and feeds it to the parser."""
try:
# noinspection PyUnresolvedReferences
session = session_handler.get_session()
response = session.get_with_interrupt(self.url, headers=headers) # type: ignore[attr-defined]
response = session.get_with_interrupt(self.url, headers=self.headers)
except HTTPError as err:
if err.response.status_code in (401, 403):
logger.warning(
f"Robots {self.url!r} disallowed access with status code {err.response.status_code}."
" Defaulting to disallow all."
)
self.disallow_all = True
elif 400 <= err.response.status_code < 500:
self.allow_all = True
except RequestInterruptedError as err:
logger.warning(f"Request for robots {self.url!r} interrupted: {err!r}. Defaulting to disallow all.") #
self.disallow_all = True
else:
self.parse(response.text.splitlines())

def parse(self, lines: Iterable[str]) -> None:
for line in lines:
if line.strip().startswith("#") and set(line.split(" ")) & self._disallow_training_keywords:
self.disallows_training = True
break
super().parse(lines)
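
For reference, a rough sketch of how this keyword heuristic behaves on a made-up robots.txt; the URL and comment text are purely illustrative:

```python
from fundus.publishers.base_objects import CustomRobotFileParser

parser = CustomRobotFileParser("https://www.example.com/robots.txt")
parser.parse(
    [
        "# Training of machine learning models on this site's content is prohibited.",
        "User-agent: *",
        "Disallow: /archive/",
    ]
)
assert parser.disallows_training  # the comment line matches several keywords
```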


class Robots:
def __init__(self, url: str):
def __init__(self, url: str, headers: Optional[Dict[str, str]] = None):
self.url = url
self.robots_file_parser = CustomRobotFileParser(url)
self.robots_file_parser = CustomRobotFileParser(url, headers=headers)
self.ready: bool = False

def read(self, headers: Optional[Dict[str, str]] = None) -> None:
def _read(self) -> None:
try:
self.robots_file_parser.read(headers=headers)
self.robots_file_parser.read()
except (ConnectionError, ReadTimeout):
logger.warning(f"Could not load robots {self.url!r}. Ignoring robots and continuing.")
self.robots_file_parser.allow_all = True
self.ready = True

def ensure_ready(self) -> None:
"""Ensure that the robots.txt file is read and parsed."""
if not self.ready:
self._read()

def can_fetch(self, useragent: str, url: str) -> bool:
self.ensure_ready()
return self.robots_file_parser.can_fetch(useragent, url)

def crawl_delay(self, useragent: str) -> Optional[float]:
self.ensure_ready()
delay = self.robots_file_parser.crawl_delay(useragent)
return delay if delay is None else float(delay)

def disallows_training(self) -> bool:
self.ensure_ready()
return self.robots_file_parser.disallows_training

def disallow_all(self) -> bool:
self.ensure_ready()
return self.robots_file_parser.disallow_all


class Publisher:
__name__: str
Expand All @@ -85,6 +135,7 @@ def __init__(
url_filter: Optional[URLFilter] = None,
request_header: Optional[Dict[str, str]] = None,
deprecated: bool = False,
disallows_training: bool = False,
):
"""Initialization of a new Publisher object

Expand All @@ -97,6 +148,10 @@ def __init__(
appended to crawled URLs
url_filter (Optional[URLFilter]): Regex filter to apply determining URLs to be skipped
request_header (Optional[Dict[str, str]]): Request header to be used for the GET-request
deprecated (bool): If True, the publisher is deprecated and skipped by default
disallows_training (bool): If True, the publisher disallows training on its articles in its robots.txt file.
Note that this is only an indicator and users should verify the terms of use of the publisher before
using the articles for training purposes.

"""
if not (name and domain and parser and sources):
Expand All @@ -108,7 +163,11 @@ def __init__(
self.url_filter = url_filter
self.request_header = request_header
self.deprecated = deprecated
self.robots = Robots(self.domain + "robots.txt" if self.domain.endswith("/") else self.domain + "/robots.txt")
self.robots = Robots(
url=self.domain + "robots.txt" if self.domain.endswith("/") else self.domain + "/robots.txt",
headers=self.request_header,
)
self.disallows_training = disallows_training
# we define the dict here manually instead of using default dict so that we can control
# the order in which sources are processed.

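From a user's perspective, both indicators added here can be inspected before crawling. A small, hypothetical check (the publisher name is only an example, and accessing the robots indication triggers a lazy fetch of the robots.txt):

```python
from fundus import PublisherCollection

publisher = PublisherCollection.de.DieWelt  # any publisher from the collection works

# Curated flag, to be set according to the publisher's terms of use ...
print(publisher.disallows_training)
# ... and the indication parsed from the publisher's robots.txt.
print(publisher.robots.disallows_training())
```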
44 changes: 43 additions & 1 deletion src/fundus/scraping/crawler.py
@@ -222,6 +222,7 @@ def _build_article_iterator(
extraction_filter: Optional[ExtractionFilter],
url_filter: Optional[URLFilter],
language_filter: Optional[List[str]],
skip_publishers_disallowing_training: bool = False,
) -> Iterator[Article]:
raise NotImplementedError

@@ -236,6 +237,7 @@ def crawl(
language_filter: Optional[List[str]] = None,
only_unique: bool = True,
save_to_file: Union[None, str, Path] = None,
skip_publishers_disallowing_training: bool = False,
) -> Iterator[Article]:
"""Yields articles from initialized scrapers

@@ -267,6 +269,9 @@
Always returns the first encountered article. Defaults to True.
save_to_file (Union[None, str, Path]): If set, the crawled articles will be collected saved to the
specified file as a JSON list.
skip_publishers_disallowing_training (bool): If set to True, publishers that disallow training
are skipped. Note that this is only an indicator; users intending to use Fundus to gather training data
should always check the publisher's terms of use beforehand.

Returns:
Iterator[Article]: An iterator yielding objects of type Article.
@@ -364,7 +369,12 @@ def callback() -> None:
try:
with Timeout(seconds=timeout, silent=True, callback=callback, disable=timeout <= 0) as timer:
for article in self._build_article_iterator(
tuple(fitting_publishers), error_handling, build_extraction_filter(), url_filter, language_filter
tuple(fitting_publishers),
error_handling,
build_extraction_filter(),
url_filter,
language_filter,
skip_publishers_disallowing_training,
):
if max_articles_per_publisher and article_count[article.publisher] == max_articles_per_publisher:
if isinstance(self, Crawler) and not __EVENTS__.is_event_set("stop", article.publisher):
@@ -465,6 +475,7 @@ def _fetch_articles(
extraction_filter: Optional[ExtractionFilter] = None,
url_filter: Optional[URLFilter] = None,
language_filter: Optional[List[str]] = None,
skip_publisher_disallowing_training: bool = False,
) -> Iterator[Article]:
def build_delay() -> Optional[Delay]:
if isinstance(self.delay, float):
@@ -481,6 +492,24 @@ def constant_delay() -> float:
else:
raise TypeError("param <delay> of <Crawler.__init__>")

# register default events
__EVENTS__.register_event("stop")
Collaborator:

I have to admit, I'm not entirely comfortable with moving the event register here. It feels somewhat disconnected from the rest of the code. While I understand the necessity behind this choice, it does strike me as a potential design flaw that we should aim to address.

More broadly, enforcing strict access to events, requiring them to be registered before they can be checked, might not be the best approach. The original intention was to introduce a level of control, ensuring that accessing an unregistered event would result in an error. However, I'm not sure whether this tradeoff is worth it. What do you think?

Collaborator Author:

Can you perhaps give me an example where accessing unregistered events might cause problems? Especially with regard to the stop event, I feel like the process of registering is devalued by the fact that we do it by default for all threads when crawling. I could imagine that it becomes more important for future events that could be added. So I would tend towards finding a solution that perhaps automatically registers the stop event, because we will likely need it in most situations.


if skip_publisher_disallowing_training and (
publisher.disallows_training or publisher.robots.disallows_training()
):
logger.warning(
f"Skipping publisher {publisher.name!r} because it disallows training. "
f"Set <skip_publishers_disallowing_training> to False to include it."
)
return
if publisher.robots.disallow_all():
logger.warning(
f"Skipping publisher {publisher.name!r} because it disallows all crawling in robots.txt. "
f"Set <ignore_robots> to True to include it."
)
return

scraper = WebScraper(
publisher,
self.restrict_sources_to,
@@ -523,13 +552,15 @@ def _build_article_iterator(
extraction_filter: Optional[ExtractionFilter],
url_filter: Optional[URLFilter],
language_filter: Optional[List[str]],
skip_publisher_disallowing_training: bool = False,
) -> Iterator[Article]:
article_task = partial(
self._fetch_articles,
error_handling=error_handling,
extraction_filter=extraction_filter,
url_filter=url_filter,
language_filter=language_filter,
skip_publisher_disallowing_training=skip_publisher_disallowing_training,
)

if self.threading:
@@ -597,10 +628,19 @@ def _fetch_articles(
extraction_filter: Optional[ExtractionFilter] = None,
url_filter: Optional[URLFilter] = None,
language_filter: Optional[List[str]] = None,
skip_publishers_disallowing_training: bool = False,
bar: Optional[tqdm] = None,
) -> Iterator[Article]:
retries: int = 0
while True:
if skip_publishers_disallowing_training:
publishers = tuple(
[
publisher
for publisher in publishers
if not (publisher.disallows_training or publisher.robots.disallows_training())
]
)
Collaborator:

Performing this check for every WARC path introduces unnecessary overhead. It would be far more efficient to verify the publishers once during initialization and then propagate the result. However, this should be implemented in a way that doesn't significantly slow down the CCNewsCrawler's initialization (i.e., not running sequentially) and doesn't depend on the availability of each publisher's robots.txt.

Collaborator Author:

Did you already have an idea of how we would implement that? If we want it to be non-blocking, I have one idea for now: during initialization of the CCNewsCrawler, we start fetching the publishers' preferences in the background. We can then start crawling as usual, especially since we have to stream the entire WARC file anyway, and already begin parsing articles for the supported publishers, either caching them until we have the results of the permission check or yielding/discarding them once we do. I don't really see a way to do that without crawling the robots.txt. I guess caching the preferences might also be an option, but I don't know if that makes too much sense, since we'll have to crawl it at least once anyway, and if people want to comply with the preferences, I think we should ensure they are up to date.

source = CCNewsSource(*publishers, warc_path=warc_path)
scraper = CCNewsScraper(source)
try:
@@ -731,6 +771,7 @@ def _build_article_iterator(
extraction_filter: Optional[ExtractionFilter],
url_filter: Optional[URLFilter],
language_filter: Optional[List[str]],
skip_publishers_disallowing_training: bool = False,
**kwargs,
) -> Iterator[Article]:
warc_paths = tuple(self._get_warc_paths())
@@ -743,6 +784,7 @@
extraction_filter=extraction_filter,
url_filter=url_filter,
language_filter=language_filter,
skip_publishers_disallowing_training=skip_publishers_disallowing_training,
bar=bar,
)

15 changes: 8 additions & 7 deletions src/fundus/scraping/html.py
@@ -17,7 +17,11 @@
from fundus.publishers.base_objects import Publisher, Robots
from fundus.scraping.delay import Delay
from fundus.scraping.filter import URLFilter
from fundus.scraping.session import _default_header, session_handler
from fundus.scraping.session import (
RequestInterruptedError,
_default_header,
session_handler,
)
from fundus.scraping.url import URLSource
from fundus.utils.events import __EVENTS__

@@ -116,15 +120,10 @@ def __init__(

self.delay = delay

# register default events
__EVENTS__.register_event("stop")

# parse robots:
self.robots: Optional[Robots] = None
if not ignore_robots:
self.robots = self.publisher.robots
if not self.robots.ready:
self.publisher.robots.read(headers=self.request_header)

if not ignore_crawl_delay:
if robots_delay := self.robots.crawl_delay(self.request_header.get("user-agent") or "*"):
@@ -191,7 +190,9 @@ def filter_url(u: str) -> bool:
if isinstance(error, HTTPError) and error.response.status_code >= 500:
logger.warning(f"Skipped {self.publisher.name!r} due to server errors: {error!r}")
continue

except RequestInterruptedError as error:
logger.debug(f"Interrupt request for {url!r} executed. Stopping further requests.")
break
except Exception as error:
logger.error(f"Warning! Skipped requested URL {url!r} because of an unexpected error {error!r}")
continue
21 changes: 14 additions & 7 deletions src/fundus/scraping/session.py
@@ -15,6 +15,12 @@
_default_header = {"user-agent": "Fundus"}


class RequestInterruptedError(Exception):
"""Raised when a request is interrupted by a stop event."""

pass
Collaborator:

I use this part of the code to formulate my general concerns.

  1. Thanks a lot for the stability improvements. Getting threading under control seems to be a real hassle.
  2. I would appreciate implementing a best practice to keep pull requests functionally isolated whenever possible. Combining new crawler functionality with stability improvements (like threading changes) makes the review process significantly harder, and now the features depend on each other.
  3. In general, I prefer your solution, introducing a custom error for the interrupt and catching it when possible to control code flow and refrain from intentionally crashing threads with uncaught errors. But I see a problem here, as it would require us to handle interruptions at every potential iteration point.

For example, take this part of the BaseScraper:

for html in source.fetch(url_filter=url_filter):
    parser = self.parser_mapping[html.source_info.publisher]

To prevent it from looping over every source (which it would do now), we would have to check for the stop event here as well, introducing the flag to the scope of the CCNewsCrawler, which I would like to prevent.
With this in mind, I would prefer crashing the thread intentionally by raising uncaught RequestInterruptedErrors, but what's your opinion on this?

Collaborator Author:

You are definitely correct, sorry for the extra confusion with the combined PR. I would also very much prefer to keep them separate at the moment.

While I see the issue you raised, I'm not sure I prefer the alternative of purposely crashing the thread. Maybe there is a way to handle it a bit more gracefully? What is your opinion on a solution similar to the queue setting, where we pass the exception back through the queue? We could do something similar for the iterator. Then we would avoid reusing the flag out of scope and could cleanly exit the loop with a break instead of crashing out.



class InterruptableSession(requests.Session):
def __init__(self, timeout: Optional[int] = None):
super().__init__()
@@ -25,14 +31,17 @@ def get_with_interrupt(self, *args, **kwargs) -> requests.Response:

This function hands over the request to another thread and checks every second
for an interrupt event. If there was an interrupt event, this function raises
a requests.exceptions.Timeout error.
a RequestInterruptedError exception.

Args:
*args: requests.Session.get(*) arguments.
**kwargs: requests.Session.get(**) keyword arguments.

Returns:
The response.

Raises:
RequestInterruptedError: If the request is interrupted by a stop event.
"""

def _req():
@@ -53,15 +62,13 @@ def _req():
while True:
try:
response = response_queue.get(timeout=1)
except Empty:
if __EVENTS__.is_event_set("stop"):
logger.debug(f"Interrupt request for {url!r}")
response_queue.task_done()
exit(1)
else:
if isinstance(response, Exception):
raise response
return response
Collaborator:

What's the reason to add that to the try-except block?

Collaborator Author:

It was probably included there by accident.

except Empty:
if __EVENTS__.is_event_set("stop"):
logger.debug(f"Interrupt request for {url!r}")
raise RequestInterruptedError(f"Request to {url} was interrupted by stop event")


@dataclass