5 changes: 5 additions & 0 deletions README.md
@@ -25,6 +25,11 @@ Developed at <a href="https://www.informatik.hu-berlin.de/en/forschung-en/gebiet

---

***Disclaimer**: Although we try to indicate whether or not a publisher has explicitly objected to the training of AI models on its data, this information must be verified independently before a publisher's content is used for such purposes.
More details can be found [here](docs/5_advanced_topics.md#filtering-publishers-for-ai-training).*


---
Fundus is:

* **A static news crawler.**
8 changes: 8 additions & 0 deletions docs/5_advanced_topics.md
@@ -4,6 +4,7 @@
* [How to search for publishers](#how-to-search-for-publishers)
* [Using `search()`](#using-search)
* [Working with deprecated publishers](#working-with-deprecated-publishers)
* [Filtering publishers for AI training](#filtering-publishers-for-ai-training)

# Advanced Topics

@@ -33,4 +34,11 @@ When we notice that a publisher is uncrawlable for whatever reason, we will mark it as deprecated.
This mostly has internal uses, since the default value of the `Crawler`'s `ignore_deprecated` flag is `False`.
You can alter this behaviour by setting the `ignore_deprecated` flag when initializing the `Crawler`.

## Filtering publishers for AI training

Some publishers explicitly disallow the use of their content for AI training purposes.
We _try_ to respect these wishes by introducing the `skip_publishers_disallowing_training` parameter in the `crawl()` function.
Users intending to use Fundus to gather training data for AI models should set this parameter to `True` to avoid collecting articles from publishers that do not want their content used in this way.
However, since publishers are not required to state this in their robots.txt file, users should additionally check the terms of use of the publishers they want to crawl and set the `disallows_training` attribute of the corresponding `Publisher` accordingly.
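
For illustration, a minimal sketch of how this could look; the publisher collection and article limit are arbitrary choices:

```python
from fundus import Crawler, PublisherCollection

crawler = Crawler(PublisherCollection.us)

# Skip publishers that indicate, via robots.txt or the curated
# `disallows_training` flag, that their content must not be used for AI training.
for article in crawler.crawl(max_articles=10, skip_publishers_disallowing_training=True):
    print(article)
```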

In the [next section](6_logging.md) we introduce you to Fundus logging mechanics.
77 changes: 68 additions & 9 deletions src/fundus/publishers/base_objects.py
@@ -1,6 +1,6 @@
from collections import defaultdict
from textwrap import indent
from typing import Dict, Iterator, List, Optional, Set, Type, Union
from typing import Dict, Iterable, Iterator, List, Optional, Set, Type, Union
from urllib.robotparser import RobotFileParser
from warnings import warn

@@ -11,7 +11,7 @@
from fundus.logging import create_logger
from fundus.parser.base_parser import ParserProxy
from fundus.scraping.filter import URLFilter
from fundus.scraping.session import session_handler
from fundus.scraping.session import RequestInterruptedError, session_handler
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap, URLSource
from fundus.utils.iteration import iterate_all_subclasses

@@ -27,43 +27,93 @@ class CustomRobotFileParser(RobotFileParser):
This is in order to avoid 403 errors when fetching the robots.txt file.
"""

_disallow_training_keywords: Set[str] = {
"machine",
"learning",
"training",
"train",
"model",
"models",
"artificial",
"intelligence",
"large",
"language",
"llm",
"llms",
}

def __init__(self, url: str, headers: Optional[Dict[str, str]] = None):
self.headers = headers
self.disallows_training: bool = False
self.url = url
super().__init__(url)

# noinspection PyAttributeOutsideInit
def read(self, headers: Optional[Dict[str, str]] = None) -> None:
def read(self) -> None:
"""Reads the robots.txt URL and feeds it to the parser."""
try:
# noinspection PyUnresolvedReferences
session = session_handler.get_session()
response = session.get_with_interrupt(self.url, headers=headers) # type: ignore[attr-defined]
response = session.get_with_interrupt(self.url, headers=self.headers)
except HTTPError as err:
if err.response.status_code in (401, 403):
logger.warning(
f"Robots {self.url!r} disallowed access with status code {err.response.status_code}."
" Defaulting to disallow all."
)
self.disallow_all = True
elif 400 <= err.response.status_code < 500:
self.allow_all = True
except RequestInterruptedError as err:
logger.warning(f"Request for robots {self.url!r} interrupted: {err!r}. Defaulting to disallow all.") #
self.disallow_all = True
else:
self.parse(response.text.splitlines())

def parse(self, lines: Iterable[str]) -> None:
for line in lines:
if line.strip().startswith("#") and set(line.split(" ")) & self._disallow_training_keywords:
self.disallows_training = True
break
super().parse(lines)
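
For reference, a rough sketch of how this keyword heuristic behaves on a made-up robots.txt; the URL and comment text are purely illustrative:

```python
from fundus.publishers.base_objects import CustomRobotFileParser

parser = CustomRobotFileParser("https://www.example.com/robots.txt")
parser.parse(
    [
        "# Training of machine learning models on this site's content is prohibited.",
        "User-agent: *",
        "Disallow: /archive/",
    ]
)
assert parser.disallows_training  # the comment line matches several keywords
```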


class Robots:
def __init__(self, url: str):
def __init__(self, url: str, headers: Optional[Dict[str, str]] = None):
self.url = url
self.robots_file_parser = CustomRobotFileParser(url)
self.robots_file_parser = CustomRobotFileParser(url, headers=headers)
self.ready: bool = False

def read(self, headers: Optional[Dict[str, str]] = None) -> None:
def _read(self) -> None:
try:
self.robots_file_parser.read(headers=headers)
self.robots_file_parser.read()
except (ConnectionError, ReadTimeout):
logger.warning(f"Could not load robots {self.url!r}. Ignoring robots and continuing.")
self.robots_file_parser.allow_all = True
self.ready = True

def ensure_ready(self) -> None:
"""Ensure that the robots.txt file is read and parsed."""
if not self.ready:
self._read()

def can_fetch(self, useragent: str, url: str) -> bool:
self.ensure_ready()
return self.robots_file_parser.can_fetch(useragent, url)

def crawl_delay(self, useragent: str) -> Optional[float]:
self.ensure_ready()
delay = self.robots_file_parser.crawl_delay(useragent)
return delay if delay is None else float(delay)

def disallows_training(self) -> bool:
self.ensure_ready()
return self.robots_file_parser.disallows_training

def disallow_all(self) -> bool:
self.ensure_ready()
return self.robots_file_parser.disallow_all


class Publisher:
__name__: str
Expand All @@ -85,6 +135,7 @@ def __init__(
url_filter: Optional[URLFilter] = None,
request_header: Optional[Dict[str, str]] = None,
deprecated: bool = False,
disallows_training: bool = False,
):
"""Initialization of a new Publisher object

Expand All @@ -97,6 +148,10 @@ def __init__(
appended to crawled URLs
url_filter (Optional[URLFilter]): Regex filter to apply determining URLs to be skipped
request_header (Optional[Dict[str, str]]): Request header to be used for the GET-request
deprecated (bool): If True, the publisher is deprecated and skipped by default
disallows_training (bool): If True, the publisher disallows training on its articles in its robots.txt file.
Note that this is only an indicator and users should verify the terms of use of the publisher before
using the articles for training purposes.

"""
if not (name and domain and parser and sources):
Expand All @@ -108,7 +163,11 @@ def __init__(
self.url_filter = url_filter
self.request_header = request_header
self.deprecated = deprecated
self.robots = Robots(self.domain + "robots.txt" if self.domain.endswith("/") else self.domain + "/robots.txt")
self.robots = Robots(
url=self.domain + "robots.txt" if self.domain.endswith("/") else self.domain + "/robots.txt",
headers=self.request_header,
)
self.disallows_training = disallows_training
# we define the dict here manually instead of using default dict so that we can control
# the order in which sources are processed.

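From a user's perspective, both indicators added here can be inspected before crawling. A small, hypothetical check (the publisher name is only an example, and accessing the robots indication triggers a lazy fetch of the robots.txt):

```python
from fundus import PublisherCollection

publisher = PublisherCollection.de.DieWelt  # any publisher from the collection works

# Curated flag, to be set according to the publisher's terms of use ...
print(publisher.disallows_training)
# ... and the indication parsed from the publisher's robots.txt.
print(publisher.robots.disallows_training())
```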
44 changes: 43 additions & 1 deletion src/fundus/scraping/crawler.py
@@ -222,6 +222,7 @@ def _build_article_iterator(
extraction_filter: Optional[ExtractionFilter],
url_filter: Optional[URLFilter],
language_filter: Optional[List[str]],
skip_publishers_disallowing_training: bool = False,
) -> Iterator[Article]:
raise NotImplementedError

@@ -236,6 +237,7 @@ def crawl(
language_filter: Optional[List[str]] = None,
only_unique: bool = True,
save_to_file: Union[None, str, Path] = None,
skip_publishers_disallowing_training: bool = False,
) -> Iterator[Article]:
"""Yields articles from initialized scrapers

@@ -267,6 +269,9 @@
Always returns the first encountered article. Defaults to True.
save_to_file (Union[None, str, Path]): If set, the crawled articles will be collected saved to the
specified file as a JSON list.
skip_publishers_disallowing_training (bool): If set to True, publishers that disallow training
are skipped. Note that this is only an indicator; users intending to use Fundus to gather training data
should always check the publisher's terms of use beforehand.

Returns:
Iterator[Article]: An iterator yielding objects of type Article.
@@ -364,7 +369,12 @@ def callback() -> None:
try:
with Timeout(seconds=timeout, silent=True, callback=callback, disable=timeout <= 0) as timer:
for article in self._build_article_iterator(
tuple(fitting_publishers), error_handling, build_extraction_filter(), url_filter, language_filter
tuple(fitting_publishers),
error_handling,
build_extraction_filter(),
url_filter,
language_filter,
skip_publishers_disallowing_training,
):
if max_articles_per_publisher and article_count[article.publisher] == max_articles_per_publisher:
if isinstance(self, Crawler) and not __EVENTS__.is_event_set("stop", article.publisher):
@@ -465,6 +475,7 @@ def _fetch_articles(
extraction_filter: Optional[ExtractionFilter] = None,
url_filter: Optional[URLFilter] = None,
language_filter: Optional[List[str]] = None,
skip_publisher_disallowing_training: bool = False,
) -> Iterator[Article]:
def build_delay() -> Optional[Delay]:
if isinstance(self.delay, float):
@@ -481,6 +492,24 @@ def constant_delay() -> float:
else:
raise TypeError("param <delay> of <Crawler.__init__>")

# register default events
__EVENTS__.register_event("stop")
Collaborator:

I have to admit, I'm not entirely comfortable with moving the event register here. It feels somewhat disconnected from the rest of the code. While I understand the necessity behind this choice, it does strike me as a potential design flaw that we should aim to address.

More broadly, enforcing strict access to events, requiring them to be registered before they can be checked, might not be the best approach. The original intention was to introduce a level of control, ensuring that accessing an unregistered event would result in an error. However, I'm not sure whether this tradeoff is worth it. What do you think?

Collaborator Author:

Can you perhaps give me an example where accessing unregistered events might cause problems? Especially with regard to the stop event, I feel like the process of registering is devalued by the fact that we do it by default for all threads when crawling. I could imagine that it becomes more important for future events that could be added. So I would tend towards finding a solution that perhaps automatically registers the stop event, because we will likely need it in most situations.


if skip_publisher_disallowing_training and (
publisher.disallows_training or publisher.robots.disallows_training()
):
logger.warning(
f"Skipping publisher {publisher.name!r} because it disallows training. "
f"Set <skip_publishers_disallowing_training> to False to include it."
)
return
if publisher.robots.disallow_all():
logger.warning(
f"Skipping publisher {publisher.name!r} because it disallows all crawling in robots.txt. "
f"Set <ignore_robots> to True to include it."
)
return

scraper = WebScraper(
publisher,
self.restrict_sources_to,
@@ -523,13 +552,15 @@ def _build_article_iterator(
extraction_filter: Optional[ExtractionFilter],
url_filter: Optional[URLFilter],
language_filter: Optional[List[str]],
skip_publisher_disallowing_training: bool = False,
) -> Iterator[Article]:
article_task = partial(
self._fetch_articles,
error_handling=error_handling,
extraction_filter=extraction_filter,
url_filter=url_filter,
language_filter=language_filter,
skip_publisher_disallowing_training=skip_publisher_disallowing_training,
)

if self.threading:
@@ -597,10 +628,19 @@ def _fetch_articles(
extraction_filter: Optional[ExtractionFilter] = None,
url_filter: Optional[URLFilter] = None,
language_filter: Optional[List[str]] = None,
skip_publishers_disallowing_training: bool = False,
bar: Optional[tqdm] = None,
) -> Iterator[Article]:
retries: int = 0
while True:
if skip_publishers_disallowing_training:
publishers = tuple(
[
publisher
for publisher in publishers
if not (publisher.disallows_training or publisher.robots.disallows_training())
]
)
Collaborator:

Performing this check for every WARC path introduces unnecessary overhead. It would be far more efficient to verify the publishers once during initialization and then propagate the result. However, this should be implemented in a way that doesn't significantly slow down the CCNewsCrawler's initialization (i.e., not running sequentially) and doesn't depend on the availability of each publisher's robots.txt.

Collaborator Author:

Did you already have an idea of how we would implement that? If we want it to be non-blocking, I have one idea for now: during initialization of the CCNewsCrawler, we start fetching the publishers' preferences in the background. We can then start crawling as usual, especially since we have to stream the entire WARC file anyway, and already begin parsing articles for the supported publishers, either caching them until we have the results of the permission check or yielding/discarding them once we do. I don't really see a way to do that without crawling the robots.txt. I guess caching the preferences might also be an option, but I don't know if that makes too much sense, since we'll have to crawl it at least once anyway, and if people want to comply with the preferences, I think we should ensure they are up to date.

source = CCNewsSource(*publishers, warc_path=warc_path)
scraper = CCNewsScraper(source)
try:
@@ -731,6 +771,7 @@ def _build_article_iterator(
extraction_filter: Optional[ExtractionFilter],
url_filter: Optional[URLFilter],
language_filter: Optional[List[str]],
skip_publishers_disallowing_training: bool = False,
**kwargs,
) -> Iterator[Article]:
warc_paths = tuple(self._get_warc_paths())
@@ -743,6 +784,7 @@
extraction_filter=extraction_filter,
url_filter=url_filter,
language_filter=language_filter,
skip_publishers_disallowing_training=skip_publishers_disallowing_training,
bar=bar,
)

15 changes: 8 additions & 7 deletions src/fundus/scraping/html.py
@@ -17,7 +17,11 @@
from fundus.publishers.base_objects import Publisher, Robots
from fundus.scraping.delay import Delay
from fundus.scraping.filter import URLFilter
from fundus.scraping.session import _default_header, session_handler
from fundus.scraping.session import (
RequestInterruptedError,
_default_header,
session_handler,
)
from fundus.scraping.url import URLSource
from fundus.utils.events import __EVENTS__

@@ -116,15 +120,10 @@ def __init__(

self.delay = delay

# register default events
__EVENTS__.register_event("stop")

# parse robots:
self.robots: Optional[Robots] = None
if not ignore_robots:
self.robots = self.publisher.robots
if not self.robots.ready:
self.publisher.robots.read(headers=self.request_header)

if not ignore_crawl_delay:
if robots_delay := self.robots.crawl_delay(self.request_header.get("user-agent") or "*"):
@@ -191,7 +190,9 @@ def filter_url(u: str) -> bool:
if isinstance(error, HTTPError) and error.response.status_code >= 500:
logger.warning(f"Skipped {self.publisher.name!r} due to server errors: {error!r}")
continue

except RequestInterruptedError as error:
logger.debug(f"Interrupt request for {url!r} executed. Stopping further requests.")
break
except Exception as error:
logger.error(f"Warning! Skipped requested URL {url!r} because of an unexpected error {error!r}")
continue
21 changes: 14 additions & 7 deletions src/fundus/scraping/session.py
@@ -15,6 +15,12 @@
_default_header = {"user-agent": "Fundus"}


class RequestInterruptedError(Exception):
"""Raised when a request is interrupted by a stop event."""

pass
Collaborator:

I use this part of the code to formulate my general concerns.

  1. Thanks a lot for the stability improvements. Getting threading under control seems to be a real hassle.
  2. I would appreciate implementing a best practice to keep pull requests functionally isolated whenever possible. Combining new crawler functionality with stability improvements (like threading changes) makes the review process significantly harder, and now the features depend on each other.
  3. In general, I prefer your solution, introducing a custom error for the interrupt and catching it when possible to control code flow and refrain from intentionally crashing threads with uncaught errors. But I see a problem here, as it would require us to handle interruptions at every potential iteration point.

For example, take this part of the BaseScraper:

for html in source.fetch(url_filter=url_filter):
    parser = self.parser_mapping[html.source_info.publisher]

To prevent it from looping over every source (which it would do now), we would have to check for the stop event here as well, introducing the flag to the scope of the CCNewsCrawler, which I would like to prevent.
With this in mind, I would prefer crashing the thread intentionally by raising uncaught RequestInterruptedErrors, but what's your opinion on this?

Collaborator Author:

You are definitely correct, sorry for the extra confusion with the combined PR. I would also very much prefer to keep them separate at the moment.

While I see the issue you raised, I'm not sure I prefer the alternative of purposely crashing the thread. Maybe there is a way to handle it a bit more gracefully? What is your opinion on a solution similar to the queue setting, where we pass the exception back through the queue? We could do something similar for the iterator. Then we would avoid reusing the flag out of scope and could cleanly exit the loop with a break instead of crashing out.



class InterruptableSession(requests.Session):
def __init__(self, timeout: Optional[int] = None):
super().__init__()
@@ -25,14 +31,17 @@ def get_with_interrupt(self, *args, **kwargs) -> requests.Response:

This function hands over the request to another thread and checks every second
for an interrupt event. If there was an interrupt event, this function raises
a requests.exceptions.Timeout error.
a RequestInterruptedError exception.

Args:
*args: requests.Session.get(*) arguments.
**kwargs: requests.Session.get(**) keyword arguments.

Returns:
The response.

Raises:
RequestInterruptedError: If the request is interrupted by a stop event.
"""

def _req():
@@ -53,15 +62,13 @@ def _req():
while True:
try:
response = response_queue.get(timeout=1)
except Empty:
if __EVENTS__.is_event_set("stop"):
logger.debug(f"Interrupt request for {url!r}")
response_queue.task_done()
exit(1)
else:
if isinstance(response, Exception):
raise response
return response
Collaborator:

What's the reason to add that to the try-except block?

Collaborator Author:

It was probably included there by accident.

except Empty:
if __EVENTS__.is_event_set("stop"):
logger.debug(f"Interrupt request for {url!r}")
raise RequestInterruptedError(f"Request to {url} was interrupted by stop event")


@dataclass