Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 21 additions & 22 deletions backend/onyx/connectors/confluence/onyx_confluence.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ class ConfluenceRateLimitError(Exception):


_DEFAULT_PAGINATION_LIMIT = 1000
_MINIMUM_PAGINATION_LIMIT = 50


class OnyxConfluence:
Expand Down Expand Up @@ -311,8 +310,8 @@ def _initialize_connection_helper(
return confluence

# https://developer.atlassian.com/cloud/confluence/rate-limiting/
# this uses the native rate limiting option provided by the
# confluence client and otherwise applies a simpler set of error handling
# This uses the native rate limiting option provided by the
# confluence client and otherwise applies a simpler set of error handling.
def _make_rate_limited_confluence_method(
self, name: str, credential_provider: CredentialsProviderInterface | None
) -> Callable[..., Any]:
Expand Down Expand Up @@ -378,25 +377,6 @@ def wrapped_call(*args: list[Any], **kwargs: Any) -> Any:

return wrapped_call

# def _wrap_methods(self) -> None:
# """
# For each attribute that is callable (i.e., a method) and doesn't start with an underscore,
# wrap it with handle_confluence_rate_limit.
# """
# for attr_name in dir(self):
# if callable(getattr(self, attr_name)) and not attr_name.startswith("_"):
# setattr(
# self,
# attr_name,
# handle_confluence_rate_limit(getattr(self, attr_name)),
# )

# def _ensure_token_valid(self) -> None:
# if self._token_is_expired():
# self._refresh_token()
# # Re-init the Confluence client with the originally stored args
# self._confluence = Confluence(self._url, *self._args, **self._kwargs)

def __getattr__(self, name: str) -> Any:
"""Dynamically intercept attribute/method access."""
attr = getattr(self._confluence, name, None)
Expand Down Expand Up @@ -498,6 +478,10 @@ def _paginate_url(
raw_response = self.get(
path=url_suffix,
advanced_mode=True,
params={
"body-format": "atlas_doc_format",
"expand": "body.atlas_doc_format",
},
)
except Exception as e:
logger.exception(f"Error in confluence call to {url_suffix}")
Expand Down Expand Up @@ -926,6 +910,9 @@ def extract_text_from_confluence_html(
object_html = body.get("storage", body.get("view", {})).get("value")

soup = bs4.BeautifulSoup(object_html, "html.parser")

_remove_macro_stylings(soup=soup)

for user in soup.findAll("ri:user"):
user_id = (
user.attrs["ri:account-id"]
Expand Down Expand Up @@ -1007,3 +994,15 @@ def extract_text_from_confluence_html(
logger.warning(f"Error processing ac:link-body: {e}")

return format_document_soup(soup)


def _remove_macro_stylings(soup: bs4.BeautifulSoup) -> None:
for macro_root in soup.findAll("ac:structured-macro"):
if not isinstance(macro_root, bs4.Tag):
continue

macro_styling = macro_root.find(name="ac:parameter", attrs={"ac:name": "page"})
if not macro_styling or not isinstance(macro_styling, bs4.Tag):
continue

macro_styling.extract()