Skip to content

Commit 40832be

Browse files
authored
Merge pull request #158 from oracle-samples/157-add-unstructured
Fixes #139
2 parents 38809c6 + 22aca0b commit 40832be

File tree

1 file changed

+10
-10
lines changed

1 file changed

+10
-10
lines changed

src/server/utils/embedding.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Copyright (c) 2024, 2025, Oracle and/or its affiliates.
33
Licensed under the Universal Permissive License v1.0 as shown at http://oss.oracle.com/licenses/upl.
44
"""
5-
# spell-checker:ignore langchain, docstore, docos, vectorstores, oraclevs
5+
# spell-checker:ignore langchain, docstore, docos, vectorstores, oraclevs, genai, hnsw
66

77
import json
88
import copy
@@ -22,7 +22,7 @@
2222
from langchain_core.language_models.chat_models import BaseChatModel
2323
from langchain.docstore.document import Document as LangchainDocument
2424
from langchain.text_splitter import RecursiveCharacterTextSplitter
25-
from langchain_text_splitters import HTMLSectionSplitter, CharacterTextSplitter
25+
from langchain_text_splitters import HTMLHeaderTextSplitter, CharacterTextSplitter
2626

2727
import server.utils.databases as databases
2828

@@ -130,20 +130,19 @@ def split_document(
130130
("h4", "Header 4"),
131131
("h5", "Header 5"),
132132
]
133-
html_splitter = HTMLSectionSplitter(headers_to_split_on=headers_to_split_on)
133+
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
134134
##################################
135135
# Splitters - End
136136
##################################
137137
match extension.lower():
138138
case "pdf":
139139
doc_split = text_splitter.split_documents(document)
140140
case "html":
141-
try:
142-
html_split = html_splitter.split_documents(document)
143-
except Exception as ex:
144-
logger.exception(ex)
145-
html_split = document
146-
doc_split = text_splitter.split_documents(html_split)
141+
tmp_meta = document[0].metadata
142+
doc_split = html_splitter.split_text(document[0].page_content)
143+
# Copy the original document's metadata (e.g. source) onto each split
144+
for doc in doc_split:
145+
doc.metadata.update(tmp_meta)
147146
case "pdf" | "md" | "txt" | "csv":
148147
doc_split = text_splitter.split_documents(document)
149148
case _:
@@ -180,7 +179,8 @@ def load_and_split_documents(
180179
case "pdf":
181180
loader = document_loaders.PyPDFLoader(file)
182181
case "html":
183-
loader = document_loaders.UnstructuredHTMLLoader(file)
182+
# Use TextLoader to keep the raw HTML so it can be split on headers
183+
loader = document_loaders.TextLoader(file)
184184
case "md":
185185
loader = document_loaders.TextLoader(file)
186186
case "csv":

0 commit comments

Comments
 (0)