|
2 | 2 | Copyright (c) 2024, 2025, Oracle and/or its affiliates.
|
3 | 3 | Licensed under the Universal Permissive License v1.0 as shown at http://oss.oracle.com/licenses/upl.
|
4 | 4 | """
|
5 |
| -# spell-checker:ignore langchain, docstore, docos, vectorstores, oraclevs |
| 5 | +# spell-checker:ignore langchain, docstore, docos, vectorstores, oraclevs, genai, hnsw |
6 | 6 |
|
7 | 7 | import json
|
8 | 8 | import copy
|
|
22 | 22 | from langchain_core.language_models.chat_models import BaseChatModel
|
23 | 23 | from langchain.docstore.document import Document as LangchainDocument
|
24 | 24 | from langchain.text_splitter import RecursiveCharacterTextSplitter
|
25 |
| -from langchain_text_splitters import HTMLSectionSplitter, CharacterTextSplitter |
| 25 | +from langchain_text_splitters import HTMLHeaderTextSplitter, CharacterTextSplitter |
26 | 26 |
|
27 | 27 | import server.utils.databases as databases
|
28 | 28 |
|
@@ -130,20 +130,19 @@ def split_document(
|
130 | 130 | ("h4", "Header 4"),
|
131 | 131 | ("h5", "Header 5"),
|
132 | 132 | ]
|
133 |
| - html_splitter = HTMLSectionSplitter(headers_to_split_on=headers_to_split_on) |
| 133 | + html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on) |
134 | 134 | ##################################
|
135 | 135 | # Splitters - End
|
136 | 136 | ##################################
|
137 | 137 | match extension.lower():
|
138 | 138 | case "pdf":
|
139 | 139 | doc_split = text_splitter.split_documents(document)
|
140 | 140 | case "html":
|
141 |
| - try: |
142 |
| - html_split = html_splitter.split_documents(document) |
143 |
| - except Exception as ex: |
144 |
| - logger.exception(ex) |
145 |
| - html_split = document |
146 |
| - doc_split = text_splitter.split_documents(html_split) |
| 141 | + tmp_meta = document[0].metadata |
| 142 | + doc_split = html_splitter.split_text(document[0].page_content) |
| 143 | + # Copy the original document's metadata (e.g. source) onto each split chunk
| 144 | + for doc in doc_split: |
| 145 | + doc.metadata.update(tmp_meta) |
147 | 146 | case "pdf" | "md" | "txt" | "csv":
|
148 | 147 | doc_split = text_splitter.split_documents(document)
|
149 | 148 | case _:
|
@@ -180,7 +179,8 @@ def load_and_split_documents(
|
180 | 179 | case "pdf":
|
181 | 180 | loader = document_loaders.PyPDFLoader(file)
|
182 | 181 | case "html":
|
183 |
| - loader = document_loaders.UnstructuredHTMLLoader(file) |
| 182 | + # Use TextLoader to keep the raw HTML markup so the header splitter can split on heading tags
| 183 | + loader = document_loaders.TextLoader(file) |
184 | 184 | case "md":
|
185 | 185 | loader = document_loaders.TextLoader(file)
|
186 | 186 | case "csv":
|
|
0 commit comments