From ab04424e87879cf79d365e39586035d6073a5dc9 Mon Sep 17 00:00:00 2001 From: Miu Razvan Date: Wed, 30 Jul 2025 16:32:12 +0300 Subject: [PATCH 1/2] Add remove_by_selector feature to filter unnecessary elements when scraping --- backend/onyx/connectors/web/connector.py | 10 +++++++++- backend/onyx/file_processing/html_utils.py | 18 ++++++++++++++++++ web/src/lib/connectors/connectors.tsx | 9 +++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/backend/onyx/connectors/web/connector.py b/backend/onyx/connectors/web/connector.py index ec850ebef5a..32203387951 100644 --- a/backend/onyx/connectors/web/connector.py +++ b/backend/onyx/connectors/web/connector.py @@ -36,7 +36,7 @@ from onyx.connectors.models import Document from onyx.connectors.models import TextSection from onyx.file_processing.extract_file_text import read_pdf_file -from onyx.file_processing.html_utils import web_html_cleanup +from onyx.file_processing.html_utils import web_html_cleanup, remove_by_selector from onyx.utils.logger import setup_logger from onyx.utils.sitemap import list_pages_for_site from shared_configs.configs import MULTI_TENANT @@ -438,13 +438,19 @@ def __init__( mintlify_cleanup: bool = True, # Mostly ok to apply to other websites as well batch_size: int = INDEX_BATCH_SIZE, scroll_before_scraping: bool = False, + remove_by_selector: list[str] | None = None, **kwargs: Any, ) -> None: self.mintlify_cleanup = mintlify_cleanup self.batch_size = batch_size self.recursive = False self.scroll_before_scraping = scroll_before_scraping + self.remove_by_selector = remove_by_selector or [] self.web_connector_type = web_connector_type + + if not isinstance(self.remove_by_selector, list): + self.remove_by_selector = [] + if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value: self.recursive = True self.to_visit_list = [_ensure_valid_url(base_url)] @@ -571,6 +577,8 @@ def _do_scrape( content = page.content() soup = BeautifulSoup(content, "html.parser") + + 
remove_by_selector(soup, self.remove_by_selector) if self.recursive: internal_links = get_internal_links( diff --git a/backend/onyx/file_processing/html_utils.py b/backend/onyx/file_processing/html_utils.py index bb2134ba18f..62c534a41b9 100644 --- a/backend/onyx/file_processing/html_utils.py +++ b/backend/onyx/file_processing/html_utils.py @@ -219,3 +219,21 @@ def web_html_cleanup( cleaned_text = page_text.replace("\u200b", "") return ParsedHTML(title=title, cleaned_text=cleaned_text) + +def remove_by_selector(soup: bs4.BeautifulSoup, selector: list[str]) -> None: tag = soup.select_one("meta[name='remove_by_selector']") if tag and tag.has_attr("content"): page_selector = [tag["content"].strip()] else: page_selector = [] for sel in (selector + page_selector): sel = sel.strip() if not sel: continue for s in sel.split(","): s = s.strip() if not s: continue for tag in soup.select(s): tag.decompose() \ No newline at end of file diff --git a/web/src/lib/connectors/connectors.tsx b/web/src/lib/connectors/connectors.tsx index 04ca5074a9e..374fae61fa7 100644 --- a/web/src/lib/connectors/connectors.tsx +++ b/web/src/lib/connectors/connectors.tsx @@ -177,6 +177,15 @@ export const connectorConfigs: Record< name: "scroll_before_scraping", optional: true, }, + { + type: "list", + query: "Remove by selector:", + label: "Remove by selector", + description: + "List of CSS selectors used to exclude HTML elements from scraping", + name: "remove_by_selector", + optional: true, + }, ], overrideDefaultFreq: 60 * 60 * 24, }, From 6df9b0c4b42f40ecac9a70e067d7c6f732239ae3 Mon Sep 17 00:00:00 2001 From: Miu Razvan Date: Mon, 22 Sep 2025 18:01:40 +0300 Subject: [PATCH 2/2] clean up --- backend/onyx/connectors/web/connector.py | 3 --- backend/onyx/file_processing/html_utils.py | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/backend/onyx/connectors/web/connector.py b/backend/onyx/connectors/web/connector.py index 32203387951..81093349f6f 100644 
--- a/backend/onyx/connectors/web/connector.py +++ b/backend/onyx/connectors/web/connector.py @@ -448,9 +448,6 @@ def __init__( self.remove_by_selector = remove_by_selector or [] self.web_connector_type = web_connector_type - if not isinstance(self.remove_by_selector, list): - self.remove_by_selector = [] - if web_connector_type == WEB_CONNECTOR_VALID_SETTINGS.RECURSIVE.value: self.recursive = True self.to_visit_list = [_ensure_valid_url(base_url)] diff --git a/backend/onyx/file_processing/html_utils.py b/backend/onyx/file_processing/html_utils.py index 62c534a41b9..39b98008e6f 100644 --- a/backend/onyx/file_processing/html_utils.py +++ b/backend/onyx/file_processing/html_utils.py @@ -235,5 +235,5 @@ def remove_by_selector(soup: bs4.BeautifulSoup, selector: list[str]): s = s.strip() if not s: continue - for tag in soup.select(s): - tag.decompose() \ No newline at end of file + for element in soup.select(s): + element.decompose()