Add skip_publishers_disallowing_training #772
base: master
@@ -222,6 +222,7 @@ def _build_article_iterator(
extraction_filter: Optional[ExtractionFilter],
url_filter: Optional[URLFilter],
language_filter: Optional[List[str]],
skip_publishers_disallowing_training: bool = False,
) -> Iterator[Article]:
raise NotImplementedError

@@ -236,6 +237,7 @@ def crawl(
language_filter: Optional[List[str]] = None,
only_unique: bool = True,
save_to_file: Union[None, str, Path] = None,
skip_publishers_disallowing_training: bool = False,
) -> Iterator[Article]:
"""Yields articles from initialized scrapers

@@ -267,6 +269,9 @@ def crawl(
Always returns the first encountered article. Defaults to True.
save_to_file (Union[None, str, Path]): If set, the crawled articles will be collected saved to the
specified file as a JSON list.
skip_publishers_disallowing_training (bool): If set to True, publishers that disallow training
are skipped. Note that this is an indicator only and users with the intention of using Fundus to gather
training data should always check the publisher's terms of use beforehand.

Returns:
Iterator[Article]: An iterator yielding objects of type Article.

@@ -364,7 +369,12 @@ def callback() -> None:
try:
with Timeout(seconds=timeout, silent=True, callback=callback, disable=timeout <= 0) as timer:
for article in self._build_article_iterator(
tuple(fitting_publishers), error_handling, build_extraction_filter(), url_filter, language_filter
tuple(fitting_publishers),
error_handling,
build_extraction_filter(),
url_filter,
language_filter,
skip_publishers_disallowing_training,
):
if max_articles_per_publisher and article_count[article.publisher] == max_articles_per_publisher:
if isinstance(self, Crawler) and not __EVENTS__.is_event_set("stop", article.publisher):

@@ -465,6 +475,7 @@ def _fetch_articles(
extraction_filter: Optional[ExtractionFilter] = None,
url_filter: Optional[URLFilter] = None,
language_filter: Optional[List[str]] = None,
skip_publisher_disallowing_training: bool = False,
) -> Iterator[Article]:
def build_delay() -> Optional[Delay]:
if isinstance(self.delay, float):

@@ -481,6 +492,24 @@ def constant_delay() -> float:
else:
raise TypeError("param <delay> of <Crawler.__init__>")

# register default events
__EVENTS__.register_event("stop")

if skip_publisher_disallowing_training and (
publisher.disallows_training or publisher.robots.disallows_training()
):
logger.warning(
f"Skipping publisher {publisher.name!r} because it disallows training. "
f"Set <skip_publishers_disallowing_training> to False to include it."
)
return
if publisher.robots.disallow_all():
logger.warning(
f"Skipping publisher {publisher.name!r} because it disallows all crawling in robots.txt. "
f"Set <ignore_robots> to True to include it."
)
return

scraper = WebScraper(
publisher,
self.restrict_sources_to,

@@ -523,13 +552,15 @@ def _build_article_iterator(
extraction_filter: Optional[ExtractionFilter],
url_filter: Optional[URLFilter],
language_filter: Optional[List[str]],
skip_publisher_disallowing_training: bool = False,
) -> Iterator[Article]:
article_task = partial(
self._fetch_articles,
error_handling=error_handling,
extraction_filter=extraction_filter,
url_filter=url_filter,
language_filter=language_filter,
skip_publisher_disallowing_training=skip_publisher_disallowing_training,
)

if self.threading:

@@ -597,10 +628,19 @@ def _fetch_articles(
extraction_filter: Optional[ExtractionFilter] = None,
url_filter: Optional[URLFilter] = None,
language_filter: Optional[List[str]] = None,
skip_publishers_disallowing_training: bool = False,
bar: Optional[tqdm] = None,
) -> Iterator[Article]:
retries: int = 0
while True:
if skip_publishers_disallowing_training:
publishers = tuple(
[
publisher
for publisher in publishers
if not (publisher.disallows_training or publisher.robots.disallows_training())
]
)

source = CCNewsSource(*publishers, warc_path=warc_path)
scraper = CCNewsScraper(source)
try:

@@ -731,6 +771,7 @@ def _build_article_iterator(
extraction_filter: Optional[ExtractionFilter],
url_filter: Optional[URLFilter],
language_filter: Optional[List[str]],
skip_publishers_disallowing_training: bool = False,
**kwargs,
) -> Iterator[Article]:
warc_paths = tuple(self._get_warc_paths())

@@ -743,6 +784,7 @@ def _build_article_iterator(
extraction_filter=extraction_filter,
url_filter=url_filter,
language_filter=language_filter,
skip_publishers_disallowing_training=skip_publishers_disallowing_training,
bar=bar,
)

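For context, a minimal usage sketch of the new flag, assuming the PublisherCollection-based setup from the Fundus README; the collection and the printed attribute are illustrative and not part of this PR:

from fundus import Crawler, PublisherCollection

# Illustrative setup; any publisher collection would work the same way.
crawler = Crawler(PublisherCollection.us)

for article in crawler.crawl(
    max_articles=10,
    skip_publishers_disallowing_training=True,  # new flag from this PR: skip publishers flagged as disallowing training
):
    print(article.title)

As the docstring notes, the flag is only an indicator; anyone gathering training data should still check each publisher's terms of use.
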
@@ -15,6 +15,12 @@
_default_header = {"user-agent": "Fundus"}


class RequestInterruptedError(Exception):
"""Raised when a request is interrupted by a stop event."""

pass


class InterruptableSession(requests.Session):
def __init__(self, timeout: Optional[int] = None):
super().__init__()

@@ -25,14 +31,17 @@ def get_with_interrupt(self, *args, **kwargs) -> requests.Response:

This function hands over the request to another thread and checks every second
for an interrupt event. If there was an interrupt event, this function raises
a requests.exceptions.Timeout error.
a RequestInterruptedError exception.

Args:
*args: requests.Session.get(*) arguments.
**kwargs: requests.Session.get(**) keyword arguments.

Returns:
The response.

Raises:
RequestInterruptedError: If the request is interrupted by a stop event.
"""

def _req():

@@ -53,15 +62,13 @@ def _req():
while True:
try:
response = response_queue.get(timeout=1)
except Empty:
if __EVENTS__.is_event_set("stop"):
logger.debug(f"Interrupt request for {url!r}")
response_queue.task_done()
exit(1)
else:
if isinstance(response, Exception):
raise response
return response

except Empty:
if __EVENTS__.is_event_set("stop"):
logger.debug(f"Interrupt request for {url!r}")
raise RequestInterruptedError(f"Request to {url} was interrupted by stop event")


@dataclass

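For context, a hedged sketch of how calling code might react to the new exception instead of relying on the old exit(1) behavior; the import path and URL are assumptions for illustration only:

from fundus.scraping.session import InterruptableSession, RequestInterruptedError  # import path assumed

session = InterruptableSession(timeout=30)
try:
    response = session.get_with_interrupt("https://example.com/feed")
except RequestInterruptedError:
    # A stop event fired while the request thread was still waiting; back out
    # gracefully rather than terminating the worker as the previous code did.
    response = None
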
I have to admit, I'm not entirely comfortable with moving the event register here. It feels somewhat disconnected from the rest of the code. While I understand the necessity behind this choice, it does strike me as a potential design flaw that we should aim to address.
More broadly, enforcing strict access to events, requiring them to be registered before they can be checked, might not be the best approach. The original intention was to introduce a level of control, ensuring that accessing an unregistered event would result in an error. However, I'm not sure whether this tradeoff is worth it. What do you think?
Can you give me an example where accessing unregistered events might cause problems? Especially with regard to the stop event, I feel the registration step is devalued by the fact that we do it by default for all threads when crawling. I can imagine it becoming more important for future events that might be added. So I would also lean towards a solution that automatically registers the stop event, since we will likely need it in most situations. To make the idea concrete, see the rough sketch below.
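
A purely hypothetical sketch of a registry that auto-registers a default stop event; the real __EVENTS__ registry also tracks events per publisher (e.g. is_event_set("stop", article.publisher)) and may look quite different, so all names here are illustrative:

import threading
from typing import Dict


class EventRegistry:
    """Toy registry: the default "stop" event always exists, so callers never
    hit an 'unregistered event' error for it, while other events still must be
    registered explicitly."""

    _DEFAULT_EVENTS = ("stop",)

    def __init__(self) -> None:
        self._events: Dict[str, threading.Event] = {name: threading.Event() for name in self._DEFAULT_EVENTS}

    def register_event(self, name: str) -> None:
        # Registering an already-known event is a no-op.
        self._events.setdefault(name, threading.Event())

    def is_event_set(self, name: str) -> bool:
        # Unregistered, non-default events still fail loudly with a KeyError.
        return self._events[name].is_set()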