22 commits
899f403  add `skip_publishers_disallowing_training` (addie9800, Jun 12, 2025)
a30892f  minor fixes (addie9800, Jun 12, 2025)
3d83628  remove code from try (addie9800, Jul 31, 2025)
4a67efa  Merge branch 'master' into add-training-disclaimer (addie9800, Jul 31, 2025)
583b021  black (addie9800, Jul 31, 2025)
8a78912  use default request header (MaxDall, Aug 4, 2025)
2896c61  introduce `CrashThread` exception (addie9800, Aug 25, 2025)
3f78145  introduce `ThreadEventDict` (addie9800, Aug 26, 2025)
d09d9da  move publisher robots verification to background (addie9800, Aug 26, 2025)
3ad9a63  fix typing (addie9800, Aug 26, 2025)
ca44d33  fix typing (addie9800, Aug 26, 2025)
26d7b01  Merge branch 'master' into add-training-disclaimer (addie9800, Sep 24, 2025)
f52c6c4  rename `default_header` to match master (addie9800, Sep 24, 2025)
7516db5  improve exception handling by inheriting from `BaseException` add fun… (MaxDall, Oct 15, 2025)
df9aead  fix a race condition where events where aliased after the main-thread… (MaxDall, Oct 15, 2025)
17a286e  fix a bug where no events where registered when aliasing; use bidirec… (MaxDall, Oct 15, 2025)
d3c79f7  add autogenerated doc strings (MaxDall, Oct 15, 2025)
f6a9c8b  add missing `bidict` dependency (MaxDall, Oct 15, 2025)
d3d7a64  simplify ThreadPoolExecutor Usage (addie9800, Oct 15, 2025)
847e050  fix lambda expression and indentation (MaxDall, Oct 21, 2025)
b0cc0fa  add `disallow_training` as property (MaxDall, Oct 21, 2025)
0a5642d  Merge pull request #796 from flairNLP/add-training-disclaimer-proposal (MaxDall, Oct 21, 2025)
README.md: 5 changes (5 additions, 0 deletions)
@@ -25,6 +25,11 @@ Developed at <a href="https://www.informatik.hu-berlin.de/en/forschung-en/gebiet

---

***Disclaimer**: Although we try to indicate whether a publisher has explicitly objected to the training of AI models on its data, this information must be verified independently before the publisher's content is used.
More details can be found [here](docs/5_advanced_topics.md#filtering-publishers-for-ai-training).*


---
Fundus is:

* **A static news crawler.**
docs/5_advanced_topics.md: 8 changes (8 additions, 0 deletions)
@@ -4,6 +4,7 @@
* [How to search for publishers](#how-to-search-for-publishers)
* [Using `search()`](#using-search)
* [Working with deprecated publishers](#working-with-deprecated-publishers)
* [Filtering publishers for AI training](#filtering-publishers-for-ai-training)

# Advanced Topics

@@ -33,4 +34,11 @@ When we notice that a publisher is uncrawlable for whatever reason, we will mark
This mostly has internal uses, since the default value of the `Crawler`'s `ignore_deprecated` flag is `False`.
You can alter this behaviour when initializing the `Crawler` by setting the `ignore_deprecated` flag.
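As a rough sketch (not part of this diff; it only assumes that the `Crawler` constructor accepts the flag as described above):

```python
from fundus import Crawler, PublisherCollection

# Override the default ignore_deprecated=False to change how publishers
# marked as deprecated are treated during the crawl.
crawler = Crawler(PublisherCollection.de, ignore_deprecated=True)
```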

## Filtering publishers for AI training

Some publishers explicitly disallow the use of their content for AI training purposes.
We _try_ to respect these wishes by introducing the `skip_publishers_disallowing_training` parameter in the `crawl()` function.
Users intending to use Fundus to gather training data for AI models should set this parameter to `True` to avoid collecting articles from publishers that do not want their content used in this way.
However, since publishers are not required to state this in their robots.txt file, users should additionally check the terms of use of the publishers they want to crawl and set the `disallows_training` attribute of the `Publisher` class accordingly.
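A minimal usage sketch (the crawler setup follows the basic tutorial; only `skip_publishers_disallowing_training` is introduced by this change):

```python
from fundus import Crawler, PublisherCollection

crawler = Crawler(PublisherCollection.us)

# Skip publishers whose robots.txt (or manually set flag) indicates that
# their content must not be used for AI training.
for article in crawler.crawl(max_articles=20, skip_publishers_disallowing_training=True):
    print(article.title)
```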

In the [next section](6_logging.md) we introduce you to Fundus logging mechanics.
pyproject.toml: 1 change (1 addition, 0 deletions)
@@ -39,6 +39,7 @@ dependencies = [
"dill>=0.3, <1",
"dict2xml>=1.7.6, <2",
"xmltodict>=0.13.0, <1",
"bidict>=0.23, <1"
]

[project.urls]
src/fundus/publishers/base_objects.py: 82 changes (72 additions, 10 deletions)
@@ -1,6 +1,6 @@
from collections import defaultdict
from textwrap import indent
from typing import Dict, Iterator, List, Optional, Set, Type, Union
from typing import Dict, Iterable, Iterator, List, Optional, Set, Type, Union
from urllib.robotparser import RobotFileParser
from warnings import warn

@@ -11,7 +11,7 @@
from fundus.logging import create_logger
from fundus.parser.base_parser import ParserProxy
from fundus.scraping.filter import URLFilter
from fundus.scraping.session import session_handler
from fundus.scraping.session import _default_header, session_handler
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap, URLSource
from fundus.utils.iteration import iterate_all_subclasses

@@ -27,43 +27,90 @@ class CustomRobotFileParser(RobotFileParser):
This is in order to avoid 403 errors when fetching the robots.txt file.
"""

_disallow_training_keywords: Set[str] = {
"machine",
"learning",
"training",
"train",
"model",
"models",
"artificial",
"intelligence",
"large",
"language",
"llm",
"llms",
}

def __init__(self, url: str, headers: Optional[Dict[str, str]] = None):
self.headers = headers
self.disallows_training: bool = False
self.url = url
super().__init__(url)

# noinspection PyAttributeOutsideInit
def read(self, headers: Optional[Dict[str, str]] = None) -> None:
def read(self) -> None:
"""Reads the robots.txt URL and feeds it to the parser."""
try:
# noinspection PyUnresolvedReferences
session = session_handler.get_session()
response = session.get_with_interrupt(self.url, headers=headers) # type: ignore[attr-defined]
response = session.get_with_interrupt(self.url, headers=self.headers)
except HTTPError as err:
if err.response.status_code in (401, 403):
logger.warning(
f"Robots {self.url!r} disallowed access with status code {err.response.status_code}."
" Defaulting to disallow all."
)
self.disallow_all = True
elif 400 <= err.response.status_code < 500:
self.allow_all = True
else:
self.parse(response.text.splitlines())

def parse(self, lines: Iterable[str]) -> None:
for line in lines:
if line.strip().startswith("#") and set(line.split(" ")) & self._disallow_training_keywords:
self.disallows_training = True
break
super().parse(lines)


class Robots:
def __init__(self, url: str):
def __init__(self, url: str, headers: Optional[Dict[str, str]] = None):
self.url = url
self.robots_file_parser = CustomRobotFileParser(url)
self.robots_file_parser = CustomRobotFileParser(url, headers=headers)
self.ready: bool = False

def read(self, headers: Optional[Dict[str, str]] = None) -> None:
def _read(self) -> None:
try:
self.robots_file_parser.read(headers=headers)
self.robots_file_parser.read()
except (ConnectionError, ReadTimeout):
logger.warning(f"Could not load robots {self.url!r}. Ignoring robots and continuing.")
self.robots_file_parser.allow_all = True
self.ready = True

def ensure_ready(self) -> None:
"""Ensure that the robots.txt file is read and parsed."""
if not self.ready:
self._read()

def can_fetch(self, useragent: str, url: str) -> bool:
self.ensure_ready()
return self.robots_file_parser.can_fetch(useragent, url)

def crawl_delay(self, useragent: str) -> Optional[float]:
self.ensure_ready()
delay = self.robots_file_parser.crawl_delay(useragent)
return delay if delay is None else float(delay)

def disallows_training(self) -> bool:
self.ensure_ready()
return self.robots_file_parser.disallows_training

def disallow_all(self) -> bool:
self.ensure_ready()
return self.robots_file_parser.disallow_all


class Publisher:
__name__: str
@@ -83,8 +130,9 @@ def __init__(
sources: List[URLSource],
query_parameter: Optional[Dict[str, str]] = None,
url_filter: Optional[URLFilter] = None,
request_header: Optional[Dict[str, str]] = None,
request_header: Optional[Dict[str, str]] = _default_header,
deprecated: bool = False,
disallows_training: bool = False,
suppress_robots: bool = False,
):
"""Initialization of a new Publisher object
@@ -98,6 +146,10 @@
appended to crawled URLs
url_filter (Optional[URLFilter]): Regex filter to apply determining URLs to be skipped
request_header (Optional[Dict[str, str]]): Request header to be used for the GET-request
deprecated (bool): If True, the publisher is deprecated and skipped by default
            disallows_training (bool): If True, the publisher disallows training on its articles in its robots.txt file.
Note that this is only an indicator and users should verify the terms of use of the publisher before
using the articles for training purposes.

"""
if not (name and domain and parser and sources):
@@ -109,14 +161,20 @@
self.url_filter = url_filter
self.request_header = request_header
self.deprecated = deprecated
self.robots = Robots(self.domain + "robots.txt" if self.domain.endswith("/") else self.domain + "/robots.txt")
self.robots = Robots(
url=self.domain + "robots.txt" if self.domain.endswith("/") else self.domain + "/robots.txt",
headers=self.request_header,
)
self._disallows_training = disallows_training

# Temporary fix to compensate for a bug in the RobotsFileParser treating rule lines
        # like /? as / disallowing the entire site. We could think about replacing the urllib
# implementation with https://github.com/seomoz/reppy
if suppress_robots:
self.robots.robots_file_parser.allow_all = True

        # We define the dict here manually instead of using a defaultdict so that we can control
        # the order in which sources are processed.
source_mapping: Dict[Type[URLSource], List[URLSource]] = defaultdict(list)

for url_source in sources:
@@ -129,6 +187,10 @@

self._source_mapping = dict(sorted(source_mapping.items(), key=lambda item: self.__SOURCE_ORDER__[item[0]]))

@property
def disallows_training(self) -> bool:
return self._disallows_training or self.robots.disallows_training()

@property
def source_mapping(self) -> Dict[Type[URLSource], List[URLSource]]:
return self._source_mapping
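To illustrate the comment-keyword heuristic introduced in `CustomRobotFileParser.parse` above, here is a small standalone sketch; the keyword set is copied from the diff, everything else (names, sample robots.txt) is illustrative:

```python
# Standalone illustration of the comment-keyword heuristic; not part of the diff.
DISALLOW_TRAINING_KEYWORDS = {
    "machine", "learning", "training", "train", "model", "models",
    "artificial", "intelligence", "large", "language", "llm", "llms",
}

robots_txt = """\
# Use of this site's content for training large language models is prohibited.
User-agent: *
Disallow: /private/
"""

def mentions_training_ban(lines):
    # A comment line counts as a hit if it shares at least one
    # whitespace-separated token with the keyword set (matching is case-sensitive).
    return any(
        line.strip().startswith("#") and set(line.split(" ")) & DISALLOW_TRAINING_KEYWORDS
        for line in lines
    )

print(mentions_training_ban(robots_txt.splitlines()))  # True: "training", "large", "language", ...
```

As in the diff, only `#` comment lines are inspected, so a publisher that states such a policy solely in its terms of use is not detected; that is what the manually set `disallows_training` flag on `Publisher` is for.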