Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backend/danswer/configs/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ class DocumentSource(str, Enum):
R2 = "r2"
GOOGLE_CLOUD_STORAGE = "google_cloud_storage"
OCI_STORAGE = "oci_storage"
XENFORO = "xenforo"
NOT_APPLICABLE = "not_applicable"


Expand Down
2 changes: 2 additions & 0 deletions backend/danswer/connectors/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from danswer.connectors.teams.connector import TeamsConnector
from danswer.connectors.web.connector import WebConnector
from danswer.connectors.wikipedia.connector import WikipediaConnector
from danswer.connectors.xenforo.connector import XenforoConnector
from danswer.connectors.zendesk.connector import ZendeskConnector
from danswer.connectors.zulip.connector import ZulipConnector
from danswer.db.credentials import backend_update_credential_json
Expand Down Expand Up @@ -97,6 +98,7 @@ def identify_connector_class(
DocumentSource.R2: BlobStorageConnector,
DocumentSource.GOOGLE_CLOUD_STORAGE: BlobStorageConnector,
DocumentSource.OCI_STORAGE: BlobStorageConnector,
DocumentSource.XENFORO: XenforoConnector,
}
connector_by_source = connector_map.get(source, {})

Expand Down
Empty file.
233 changes: 233 additions & 0 deletions backend/danswer/connectors/xenforo/connector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
"""
This is the XenforoConnector class. It is used to connect to a Xenforo forum and load or update documents from the forum.
To use this class, you need to provide the URL of the Xenforo forum board you want to connect to when creating an instance
of the class. The URL should be a string that starts with 'http://' or 'https://', followed by the domain name of the
forum, followed by the board name. For example:
base_url = 'https://www.example.com/forum/boards/some-topic/'
The `load_from_state` method is used to load documents from the forum. It takes no arguments and yields the scraped
posts as batches of documents.
"""
import re
from datetime import datetime
from datetime import timedelta
from datetime import timezone
from typing import Any
from urllib.parse import urlparse

import pytz
import requests
from bs4 import BeautifulSoup
from bs4 import Tag

from danswer.configs.constants import DocumentSource
from danswer.connectors.cross_connector_utils.miscellaneous_utils import datetime_to_utc
from danswer.connectors.interfaces import GenerateDocumentsOutput
from danswer.connectors.interfaces import LoadConnector
from danswer.connectors.models import BasicExpertInfo
from danswer.connectors.models import Document
from danswer.connectors.models import Section
from danswer.utils.logger import setup_logger

logger = setup_logger()


def get_title(soup: BeautifulSoup) -> str:
    """Return the thread title of a XenForo page with special characters masked.

    The title is read from the ``<h1 class="p-title-value">`` element. Each of
    the characters ``; : ! * / \\ ? " < > |`` in it is replaced by an
    underscore. An empty string is returned when the heading is not found.
    """
    heading = soup.find("h1", "p-title-value")
    if not heading:
        return ""
    # A single translation pass masks every listed character at once.
    mask_table = str.maketrans(dict.fromkeys(';:!*/\\?"<>|', "_"))
    return heading.text.translate(mask_table)


def get_pages(soup: BeautifulSoup, url: str) -> list[str]:
    """Build the URL of every page of a paginated XenForo listing or thread.

    The page count is the largest purely numeric label among the
    ``li.pageNav-page`` elements of ``soup``; without any numeric label a
    single page is assumed. Page URLs are formed by appending ``page-<n>``
    to ``url`` for ``n`` from 1 up to the page count.
    """
    numeric_labels = [
        item.text
        for item in soup.select("li.pageNav-page")
        if re.match(r"^\d+$", item.text)
    ]
    last_page = max((int(label) for label in numeric_labels), default=1)
    return [f"{url}page-{number}" for number in range(1, last_page + 1)]


def parse_post_date(post_element: BeautifulSoup) -> datetime:
    """Extract the timestamp of a post from its ``<time>`` element.

    Args:
        post_element: Parsed markup of a single post.

    Returns:
        The value of the ``datetime`` attribute of the first ``<time>`` tag,
        parsed with the format ``%Y-%m-%dT%H:%M:%S%z`` and passed through
        ``datetime_to_utc``. When there is no ``<time>`` tag, or it has no
        ``datetime`` attribute, the timezone-aware Unix epoch (UTC) is
        returned instead.

    Raises:
        ValueError: If the attribute value does not match the expected format.
    """
    time_tag = post_element.find("time")
    if not isinstance(time_tag, Tag) or "datetime" not in time_tag.attrs:
        # Aware epoch; replaces the deprecated datetime.utcfromtimestamp(0).
        return datetime.fromtimestamp(0, tz=timezone.utc)

    date_value = time_tag["datetime"]

    # Ensure date_value is a string (if it's a list, take the first element)
    if isinstance(date_value, list):
        date_value = date_value[0]

    post_date = datetime.strptime(date_value, "%Y-%m-%dT%H:%M:%S%z")
    return datetime_to_utc(post_date)


def scrape_page_posts(
    soup: BeautifulSoup, url: str, initial_run: bool, start_time: datetime
) -> list:
    """Turn every post on one thread page into a ``Document``.

    Args:
        soup: Parsed HTML of a single thread page.
        url: URL stored as the link of every produced section; it is also part
            of the document id, so it should identify the thread.
        initial_run: When true, every post is included regardless of its date.
        start_time: On non-initial runs only posts strictly newer than this
            timestamp are included.

    Returns:
        One ``Document`` per ``div.message-inner`` element that passes the
        date filter and contains a ``div.bbWrapper`` body. Posts without a
        body element are skipped.
    """
    title = get_title(soup)

    documents = []
    for post in soup.find_all("div", class_="message-inner"):
        post_date = parse_post_date(post)
        if not initial_run and post_date <= start_time:
            continue

        body = post.find("div", class_="bbWrapper")
        if not body:
            continue
        post_text = body.get_text(strip=True) + "\n"

        # The author is either a link or, when no link exists, a plain span.
        author_tag = post.find("a", class_="username")
        if author_tag is None:
            author_tag = post.find("span", class_="username")
        author = author_tag.get_text(strip=True) if author_tag else "Deleted author"

        # The id must differ per post: an id built from the thread title alone
        # is shared by every post of the thread (and by threads with the same
        # title), so the documents would collide with each other.
        document_id = "__".join(
            (
                DocumentSource.XENFORO.value,
                url,
                post_date.strftime("%Y%m%dT%H%M%SZ"),
                author,
            )
        )
        documents.append(
            Document(
                id=document_id,
                sections=[Section(link=url, text=post_text)],
                title=title,
                # Tag documents with this connector's own source, not WEB.
                source=DocumentSource.XENFORO,
                semantic_identifier=title,
                primary_owners=[BasicExpertInfo(display_name=author)],
                metadata={
                    "type": "post",
                    "author": author,
                    "time": post_date.strftime("%Y-%m-%d %H:%M:%S"),
                },
                doc_updated_at=post_date,
            )
        )
    return documents


class XenforoConnector(LoadConnector):
    """Load connector that scrapes the posts of a XenForo board or thread.

    ``base_url`` may point at a board/forum listing (every thread linked from
    its listing pages is scraped) or at a single thread. Pages are fetched
    over plain HTTP with a browser-like User-Agent; no credentials are used.
    """

    # Class variable to track if the connector has been run before.
    # NOTE(review): the flag is shared by every instance in the process. Once
    # any instance has completed a run, instances created afterwards (even for
    # another forum) start with initial_run=False and only collect posts newer
    # than one day before their construction - confirm this is intended.
    has_been_run_before = False

    def __init__(self, base_url: str) -> None:
        """Store the target URL and the request settings.

        Args:
            base_url: URL of the XenForo board or thread to index.
        """
        self.base_url = base_url
        self.initial_run = not XenforoConnector.has_been_run_before
        # Lower bound for post dates on non-initial runs: 24 hours before now.
        self.start = datetime.now(timezone.utc) - timedelta(days=1)
        self.cookies: dict[str, str] = {}
        # mimic user browser to avoid being blocked by the website (see: https://www.useragents.me/)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/121.0.0.0 Safari/537.36"
        }

    def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
        """Accept (and ignore) credentials; a warning is logged if any are given."""
        if credentials:
            logger.warning("Unexpected credentials provided for Xenforo Connector")
        return None

    def load_from_state(self) -> GenerateDocumentsOutput:
        """Scrape the configured board or thread and yield its posts.

        Yields:
            One non-empty list of documents per thread. Threads that produce
            no documents yield nothing.
        """
        # Standardize URL to always end in /.
        if not self.base_url.endswith("/"):
            self.base_url += "/"

        # Remove all extra parameters from the end such as page, post.
        for marker in ("threads/", "boards/", "forums/"):
            if marker not in self.base_url:
                continue
            try:
                slug_end = self.base_url.index(
                    "/", self.base_url.index(marker) + len(marker)
                )
            except ValueError:
                # Nothing follows the marker, so there is nothing to trim.
                continue
            self.base_url = self.base_url[: slug_end + 1]

        all_threads: list[str] = []

        # If the URL contains "boards/" or "forums/", find all threads.
        if "boards/" in self.base_url or "forums/" in self.base_url:
            pages = get_pages(self.requestsite(self.base_url), self.base_url)

            # Get all pages on thread_list_page
            for pre_count, thread_list_page in enumerate(pages, start=1):
                logger.info(
                    f"\x1b[KGetting pages from thread_list_page.. Current: {pre_count}/{len(pages)}\r"
                )
                all_threads += self.get_threads(thread_list_page)
            # A thread listed on several listing pages is scraped only once.
            all_threads = list(dict.fromkeys(all_threads))
        # If the URL contains "threads/", add the thread to the list.
        elif "threads/" in self.base_url:
            all_threads.append(self.base_url)

        if not all_threads:
            logger.warning(f"No threads found for: {self.base_url}")

        # Process all threads
        for thread_count, thread_url in enumerate(all_threads, start=1):
            # requestsite always returns a soup (an empty one on failure), in
            # which case get_pages falls back to a single page.
            pages = get_pages(self.requestsite(thread_url), thread_url)

            # Batch per thread so memory use is bounded by the largest thread
            # instead of growing with the whole board.
            doc_batch: list[Document] = []
            for page_count, page in enumerate(pages, start=1):
                logger.info(f"Visiting {page}")
                logger.info(
                    f"\x1b[KProgress: Page {page_count}/{len(pages)} - Thread {thread_count}/{len(all_threads)}\r"
                )
                doc_batch.extend(
                    scrape_page_posts(
                        self.requestsite(page),
                        thread_url,
                        self.initial_run,
                        self.start,
                    )
                )
            if doc_batch:
                yield doc_batch

        # Mark the initial run finished after all threads and pages have been processed
        XenforoConnector.has_been_run_before = True

    def get_threads(self, url: str) -> list[str]:
        """Collect the thread URLs linked from one thread-listing page.

        Args:
            url: URL of the listing page to fetch.

        Returns:
            The distinct thread URLs in page order, each cut after its last
            ``/`` so that trailing segments are dropped.
        """
        soup = self.requestsite(url)
        site_root = "{uri.scheme}://{uri.netloc}".format(uri=urlparse(url))
        threads: list[str] = []
        seen: set[str] = set()
        for title_tag in soup.find_all(class_="structItem-title"):
            for anchor in title_tag.find_all(href=True):
                link = anchor["href"]
                if "threads/" not in link:
                    continue
                stripped = link[: link.rfind("/") + 1]
                # Absolute links already carry scheme and host.
                if stripped.startswith(("http://", "https://")):
                    thread_url = stripped
                else:
                    thread_url = site_root + stripped
                if thread_url not in seen:
                    seen.add(thread_url)
                    threads.append(thread_url)
        return threads

    def requestsite(self, url: str) -> BeautifulSoup:
        """Fetch ``url`` and return its parsed HTML.

        A non-200 response is logged but its body is still parsed. Any
        exception raised by the request is logged and an empty soup is
        returned, so callers never receive ``None``.
        """
        try:
            response = requests.get(
                url, cookies=self.cookies, headers=self.headers, timeout=10
            )
            if response.status_code != 200:
                logger.error(
                    f"<{url}> Request Error: {response.status_code} - {response.reason}"
                )
            return BeautifulSoup(response.text, "html.parser")
        except (requests.exceptions.Timeout, TimeoutError):
            # requests raises its own Timeout type rather than the builtin
            # TimeoutError, so both are caught here.
            logger.error("Timed out Error.")
        except Exception as e:
            logger.error(f"Error on {url}")
            logger.exception(e)
        return BeautifulSoup("", "html.parser")


if __name__ == "__main__":
    # Manual smoke test: scrape one public thread and print its first batch.
    # Alternative thread that can be used instead:
    #   https://cassiopaea.org/forum/threads/how-to-change-your-emotional-state.41381/
    demo_thread_url = "https://xenforo.com/community/threads/whats-new-with-enhanced-search-resource-manager-and-media-gallery-in-xenforo-2-3.220935/"
    batches = XenforoConnector(base_url=demo_thread_url).load_from_state()
    print(next(batches))
1 change: 1 addition & 0 deletions web/public/Xenforo.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import {
createConnectorValidationSchema,
defaultPruneFreqDays,
defaultRefreshFreqMinutes,
isLoadState,
} from "@/lib/connectors/connectors";
import { Modal } from "@/components/Modal";
import GDriveMain from "./pages/gdrive/GoogleDrivePage";
Expand Down Expand Up @@ -317,7 +318,7 @@ export default function AddConnector({
const { message, isSuccess, response } = await submitConnector<any>(
{
connector_specific_config: transformedConnectorSpecificConfig,
input_type: connector == "web" ? "load_state" : "poll", // single case
input_type: isLoadState(connector) ? "load_state" : "poll", // single case
name: name,
source: connector,
is_public: access_type == "public",
Expand Down
17 changes: 16 additions & 1 deletion web/src/components/icons/icons.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ import clickupIcon from "../../../public/Clickup.svg";
import cohereIcon from "../../../public/Cohere.svg";
import voyageIcon from "../../../public/Voyage.png";
import googleIcon from "../../../public/Google.webp";

import xenforoIcon from "../../../public/Xenforo.svg";
import { FaRobot } from "react-icons/fa";

export interface IconProps {
Expand Down Expand Up @@ -2811,6 +2811,21 @@ export const WindowsIcon = ({
</svg>
);
};

// Renders the bundled Xenforo logo inside a square wrapper of `size` pixels.
export const XenforoIcon = ({
  size = 16,
  className = defaultTailwindCSS,
}: IconProps) => {
  const edge = `${size}px`;
  const wrapperClass = `w-[${edge}] h-[${edge}] ${className}`;

  return (
    <div style={{ width: edge, height: edge }} className={wrapperClass}>
      <Image src={xenforoIcon} alt="Logo" width="96" height="96" />
    </div>
  );
};

export const AsanaIcon = ({
size = 16,
className = defaultTailwindCSS,
Expand Down
27 changes: 27 additions & 0 deletions web/src/lib/connectors/connectors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@ import * as Yup from "yup";
import { IsPublicGroupSelectorFormType } from "@/components/IsPublicGroupSelector";
import { ConfigurableSources, ValidInputTypes, ValidSources } from "../types";

/**
 * Tells whether the named connector is one of the "load state" connectors.
 * Only "web" and "xenforo" are.
 */
export function isLoadState(connector_name: string): boolean {
  const loadStateConnectors: string[] = ["web", "xenforo"];
  return loadStateConnectors.includes(connector_name);
}

export type InputType =
| "list"
| "text"
Expand Down Expand Up @@ -763,6 +772,20 @@ For example, specifying .*-support.* as a "channel" will cause the connector to
},
],
},
xenforo: {
description: "Configure Xenforo connector",
values: [
{
type: "text",
query: "Enter forum or thread URL:",
label: "URL",
name: "base_url",
optional: false,
description:
"The XenForo v2.2 forum URL to index. Can be board or thread.",
},
],
},
asana: {
description: "Configure Asana connector",
values: [
Expand Down Expand Up @@ -1052,6 +1075,10 @@ export interface GoogleSitesConfig {
base_url: string;
}

// Connector-specific configuration for the Xenforo source.
export interface XenforoConfig {
  // URL of the forum board or thread to index.
  base_url: string;
}

export interface ZendeskConfig {}

export interface DropboxConfig {}
Expand Down
1 change: 1 addition & 0 deletions web/src/lib/connectors/credentials.ts
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,7 @@ export const credentialTemplates: Record<ValidSources, any> = {
access_key_id: "",
secret_access_key: "",
} as OCICredentialJson,
xenforo: null,
google_sites: null,
file: null,
wikipedia: null,
Expand Down
6 changes: 6 additions & 0 deletions web/src/lib/sources.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import {
OCIStorageIcon,
GoogleStorageIcon,
ColorSlackIcon,
XenforoIcon,
} from "@/components/icons/icons";
import { ValidSources } from "./types";
import {
Expand Down Expand Up @@ -279,6 +280,11 @@ const SOURCE_METADATA_MAP: SourceMap = {
category: SourceCategory.Storage,
docs: "https://docs.danswer.dev/connectors/google_storage",
},
xenforo: {
icon: XenforoIcon,
displayName: "Xenforo",
category: SourceCategory.Messaging,
},
ingestion_api: {
icon: GlobeIcon,
displayName: "Ingestion",
Expand Down
1 change: 1 addition & 0 deletions web/src/lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ const validSources = [
"s3",
"r2",
"google_cloud_storage",
"xenforo",
"oci_storage",
"not_applicable",
"ingestion_api",
Expand Down
Loading