From e7fd53c5c205d58dcf57a0d9f1b11e7f7bc32680 Mon Sep 17 00:00:00 2001 From: Ben Constable Date: Mon, 11 Nov 2024 16:06:28 +0000 Subject: [PATCH] Perform section extraction only in pre-embedding cleaner --- adi_function_app/README.md | 25 ++-------------------- adi_function_app/adi_2_ai_search.py | 13 +----------- adi_function_app/pre_embedding_cleaner.py | 26 +++++++++++------------ deploy_ai_search/ai_search.py | 4 +--- deploy_ai_search/rag_documents.py | 2 +- 5 files changed, 18 insertions(+), 52 deletions(-) diff --git a/adi_function_app/README.md b/adi_function_app/README.md index a127fcc..b34c5a7 100644 --- a/adi_function_app/README.md +++ b/adi_function_app/README.md @@ -24,15 +24,13 @@ Once the Markdown is obtained, several steps are carried out: 1. **Extraction of images / charts**. The figures identified are extracted from the original document and passed to a multi-modal model (gpt4o in this case) for analysis. We obtain a description and summary of the chart / image to infer the meaning of the figure. This allows us to index and perform RAG analysis the information that is visually obtainable from a chart, without it being explicitly mentioned in the text surrounding. The information is added back into the original chart. -2. **Extraction of sections and headers**. The sections and headers are extracted from the document and returned additionally to the indexer under a separate field. This allows us to store them as a separate field in the index and therefore surface the most relevant chunks. - -3. **Cleaning of Markdown**. The final markdown content is cleaned of any characters or unsupported Markdown elements that we do not want in the chunk e.g. non-relevant images. +2. **Cleaning of Markdown**. The final markdown content is cleaned of any characters or unsupported Markdown elements that we do not want in the chunk e.g. non-relevant images. 
Page wise analysis in ADI is used to avoid splitting tables / figures across multiple chunks, when the chunking is performed. The properties returned from the ADI Custom Skill are then used to perform the following skills: -- Pre-vectorisation cleaning +- Pre-vectorisation cleaning. This stage is important as we extract the section information in this step from the headers in the document. Additionally, we remove any Markdown tags or characters that would cause an embedding error. - Keyphrase extraction - Vectorisation @@ -43,7 +41,6 @@ Using the [Phi-3 Technical Report: A Highly Capable Language Model Locally on Yo ```json { "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Table 1: Comparison results on RepoQA benchmark.
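| Model | Ctx Size | Python | C++ | Rust | Java | TypeScript | Average |
| --- | --- | --- | --- | --- | --- | --- | --- |
| gpt-4O-2024-05-13 | 128k | 95 | 80 | 85 | 96 | 97 | 90.6 |
| gemini-1.5-flash-latest | 1000k | 93 | 79 | 87 | 94 | 97 | 90 |
| Phi-3.5-MoE | 128k | 89 | 74 | 81 | 88 | 95 | 85 |
| Phi-3.5-Mini | 128k | 86 | 67 | 73 | 77 | 82 | 77 |
| Llama-3.1-8B-Instruct | 128k | 80 | 65 | 73 | 76 | 63 | 71 |
| Mixtral-8x7B-Instruct-v0.1 | 32k | 66 | 65 | 64 | 71 | 74 | 68 |
| Mixtral-8x22B-Instruct-v0.1 | 64k | 60 | 67 | 74 | 83 | 55 | 67.8 |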
\n\n\nsuch as Arabic, Chinese, Russian, Ukrainian, and Vietnamese, with average MMLU-multilingual scores\nof 55.4 and 47.3, respectively. Due to its larger model capacity, phi-3.5-MoE achieves a significantly\nhigher average score of 69.9, outperforming phi-3.5-mini.\n\nMMLU(5-shot) MultiLingual\n\nPhi-3-mini\n\nPhi-3.5-mini\n\nPhi-3.5-MoE\n\n\n\n\n\n We evaluate the phi-3.5-mini and phi-3.5-MoE models on two long-context understanding tasks:\nRULER [HSK+24] and RepoQA [LTD+24]. As shown in Tables 1 and 2, both phi-3.5-MoE and phi-\n3.5-mini outperform other open-source models with larger sizes, such as Llama-3.1-8B, Mixtral-8x7B,\nand Mixtral-8x22B, on the RepoQA task, and achieve comparable performance to Llama-3.1-8B on\nthe RULER task. However, we observe a significant performance drop when testing the 128K context\nwindow on the RULER task. We suspect this is due to the lack of high-quality long-context data in\nmid-training, an issue we plan to address in the next version of the model release.\n\n In the table 3, we present a detailed evaluation of the phi-3.5-mini and phi-3.5-MoE models\ncompared with recent SoTA pretrained language models, such as GPT-4o-mini, Gemini-1.5 Flash, and\nopen-source models like Llama-3.1-8B and the Mistral models. 
The results show that phi-3.5-mini\nachieves performance comparable to much larger models like Mistral-Nemo-12B and Llama-3.1-8B, while\nphi-3.5-MoE significantly outperforms other open-source models, offers performance comparable to\nGemini-1.5 Flash, and achieves above 90% of the average performance of GPT-4o-mini across various\nlanguage benchmarks.\n\n\n\n\n", - "sections": [], "page_number": 7 } ``` @@ -133,16 +130,10 @@ If `chunk_by_page` header is `True` (recommended): "extracted_content": [ { "page_number": 1, - "sections": [ - "" - ], "content": "" }, { "page_number": 2, - "sections": [ - "" - ], "content": "" } ] @@ -154,16 +145,10 @@ If `chunk_by_page` header is `True` (recommended): "extracted_content": [ { "page_number": 1, - "sections": [ - "" - ], "content": "" }, { "page_number": 2, - "sections": [ - "" - ], "content": "" } ] @@ -182,9 +167,6 @@ If `chunk_by_page` header is `False`: "recordId": "0", "data": { "extracted_content": { - "sections": [ - "" - ], "content": "" } } @@ -193,9 +175,6 @@ If `chunk_by_page` header is `False`: "recordId": "1", "data": { "extracted_content": { - "sections": [ - "" - ], "content": "" } } diff --git a/adi_function_app/adi_2_ai_search.py b/adi_function_app/adi_2_ai_search.py index 2b7999a..3b7c509 100644 --- a/adi_function_app/adi_2_ai_search.py +++ b/adi_function_app/adi_2_ai_search.py @@ -40,7 +40,7 @@ async def build_and_clean_markdown_for_response( """ output_dict = {} - comment_patterns = r"|||" + comment_patterns = r"||||" cleaned_text = re.sub(comment_patterns, "", markdown_text, flags=re.DOTALL) # Remove irrelevant figures @@ -52,18 +52,7 @@ async def build_and_clean_markdown_for_response( logging.info(f"Cleaned Text: {cleaned_text}") - markdown_without_figure_content = re.sub( - r"", "", cleaned_text, flags=re.DOTALL - ) - - combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n" - doc_metadata = re.findall( - combined_pattern, markdown_without_figure_content, re.DOTALL - ) - doc_metadata = [match for group in 
doc_metadata for match in group if match] - output_dict["content"] = cleaned_text - output_dict["sections"] = doc_metadata output_dict["figures"] = figures diff --git a/adi_function_app/pre_embedding_cleaner.py b/adi_function_app/pre_embedding_cleaner.py index ad49231..5c787e6 100644 --- a/adi_function_app/pre_embedding_cleaner.py +++ b/adi_function_app/pre_embedding_cleaner.py @@ -5,7 +5,7 @@ import re -def get_section(cleaned_text: str) -> list: +def get_sections(cleaned_text: str) -> list: """ Returns the section details from the content @@ -52,7 +52,7 @@ def remove_markdown_tags(text: str, tag_patterns: dict) -> str: return text -def clean_text(src_text: str) -> str: +def clean_text_with_section_extraction(src_text: str) -> tuple[str, str]: """This function performs following cleanup activities on the text, remove all unicode characters remove line spacing,remove stop words, normalize characters @@ -77,6 +77,8 @@ def clean_text(src_text: str) -> str: } cleaned_text = remove_markdown_tags(src_text, tag_patterns) + sections = get_sections(cleaned_text) + # Updated regex to keep Unicode letters, punctuation, whitespace, currency symbols, and percentage signs, # while also removing non-printable characters cleaned_text = re.sub(r"[^\p{L}\p{P}\s\p{Sc}%\x20-\x7E]", "", cleaned_text) @@ -88,7 +90,7 @@ def clean_text(src_text: str) -> str: except Exception as e: logging.error(f"An error occurred in clean_text: {e}") return "" - return cleaned_text + return cleaned_text, sections async def process_pre_embedding_cleaner(record: dict) -> dict: @@ -114,19 +116,17 @@ async def process_pre_embedding_cleaner(record: dict) -> dict: # scenarios when page by chunking is enabled if isinstance(record["data"]["chunk"], dict): - cleaned_record["data"]["cleanedChunk"] = clean_text( - record["data"]["chunk"]["content"] - ) + ( + cleaned_record["data"]["cleanedChunk"], + cleaned_record["data"]["sections"], + ) = clean_text_with_section_extraction(record["data"]["chunk"]["content"]) 
cleaned_record["data"]["chunk"] = record["data"]["chunk"]["content"] - cleaned_record["data"]["cleanedSections"] = clean_sections( - record["data"]["chunk"]["sections"] - ) else: - cleaned_record["data"]["cleanedChunk"] = clean_text(record["data"]["chunk"]) + ( + cleaned_record["data"]["cleanedChunk"], + cleaned_record["data"]["sections"], + ) = clean_text_with_section_extraction(record["data"]["chunk"]) cleaned_record["data"]["chunk"] = record["data"]["chunk"] - cleaned_record["data"]["cleanedSections"] = get_section( - record["data"]["chunk"] - ) except Exception as e: logging.error("string cleanup Error: %s", e) diff --git a/deploy_ai_search/ai_search.py b/deploy_ai_search/ai_search.py index 4046f62..2e8bf12 100644 --- a/deploy_ai_search/ai_search.py +++ b/deploy_ai_search/ai_search.py @@ -220,9 +220,7 @@ def get_pre_embedding_cleaner_skill(self, context, source) -> WebApiSkill: pre_embedding_cleaner_skill_outputs = [ OutputFieldMappingEntry(name="cleanedChunk", target_name="cleanedChunk"), OutputFieldMappingEntry(name="chunk", target_name="chunk"), - OutputFieldMappingEntry( - name="cleanedSections", target_name="cleanedSections" - ), + OutputFieldMappingEntry(name="sections", target_name="sections"), ] pre_embedding_cleaner_skill = WebApiSkill( diff --git a/deploy_ai_search/rag_documents.py b/deploy_ai_search/rag_documents.py index 3c674f3..c514a0d 100644 --- a/deploy_ai_search/rag_documents.py +++ b/deploy_ai_search/rag_documents.py @@ -215,7 +215,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjection: name="Keywords", source="/document/pages/*/keywords" ), InputFieldMappingEntry( - name="Sections", source="/document/pages/*/cleanedSections" + name="Sections", source="/document/pages/*/sections" ), InputFieldMappingEntry( name="Figures",