From 819eacb8a452d4ff77ba93276d6a645498a21a47 Mon Sep 17 00:00:00 2001 From: "alex.graber" Date: Wed, 7 May 2025 14:37:12 -0400 Subject: [PATCH 1/5] fix: minor docstring updates to SingleHopSpecificQuerySynthesizer --- .../ragas/testset/synthesizers/single_hop/specific.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ragas/src/ragas/testset/synthesizers/single_hop/specific.py b/ragas/src/ragas/testset/synthesizers/single_hop/specific.py index ac0f1b367..dfa6e5aff 100644 --- a/ragas/src/ragas/testset/synthesizers/single_hop/specific.py +++ b/ragas/src/ragas/testset/synthesizers/single_hop/specific.py @@ -39,12 +39,14 @@ class SingleHopScenario(BaseScenario): @dataclass class SingleHopSpecificQuerySynthesizer(SingleHopQuerySynthesizer): - name: str = "single_hop_specifc_query_synthesizer" - theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt() + """Synthesize single-hop queries based on an entity of interest.""" + + name: str = "single_hop_specific_query_synthesizer" property_name: str = "entities" + theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt() def get_node_clusters(self, knowledge_graph: KnowledgeGraph) -> t.List[Node]: - + """Identify clusters of nodes based on the entity of interest.""" node_type_dict = defaultdict(int) for node in knowledge_graph.nodes: if ( @@ -81,7 +83,8 @@ async def _generate_scenarios( callbacks: Callbacks, ) -> t.List[SingleHopScenario]: """ - Generates a list of scenarios on type SingleHopSpecificQuerySynthesizer + Generate a list of scenarios of type SingleHopScenario. + Steps to generate scenarios: 1. Find nodes with CHUNK type and entities property 2. Calculate the number of samples that should be created per node to get n samples in total From 6c8a78413fcc1c0853c2a896bb83ba24772e2120 Mon Sep 17 00:00:00 2001 From: "alex.graber" Date: Wed, 7 May 2025 14:42:18 -0400 Subject: [PATCH 2/5] feat: allow user-defined node- and relationship- properties in MultiHopAbstractQuerySynthesizer - Users should be able to set the relationship property to use for identifying clusters and the node property used for identifying abstract concepts. - This will not change default behavior, but allows users to override. --- .../synthesizers/multi_hop/abstract.py | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/ragas/src/ragas/testset/synthesizers/multi_hop/abstract.py b/ragas/src/ragas/testset/synthesizers/multi_hop/abstract.py index 09de39a78..1a581fb85 100644 --- a/ragas/src/ragas/testset/synthesizers/multi_hop/abstract.py +++ b/ragas/src/ragas/testset/synthesizers/multi_hop/abstract.py @@ -31,23 +31,18 @@ @dataclass class MultiHopAbstractQuerySynthesizer(MultiHopQuerySynthesizer): - """ - Synthesizes abstract multi-hop queries from given knowledge graph. - - Attributes - ---------- - """ + """Synthesize abstract multi-hop queries from given knowledge graph.""" name: str = "multi_hop_abstract_query_synthesizer" + relation_property: str = "summary_similarity" + abstract_property_name: str = "themes" concept_combination_prompt: PydanticPrompt = ConceptCombinationPrompt() theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt() def get_node_clusters(self, knowledge_graph: KnowledgeGraph) -> t.List[t.Set[Node]]: - + """Identify clusters of nodes based on the specified relationship condition.""" node_clusters = knowledge_graph.find_indirect_clusters( - relationship_condition=lambda rel: ( - True if rel.get_property("summary_similarity") else False - ), + relationship_condition=lambda rel: bool(rel.get_property(self.relation_property)), depth_limit=3, ) logger.info("found %d clusters", len(node_clusters)) @@ -61,7 +56,8 @@ async def _generate_scenarios( callbacks: Callbacks, ) -> t.List[MultiHopScenario]: """ - Generates a list of scenarios on type MultiHopAbstractQuerySynthesizer + Generate a list of scenarios of type MultiHopScenario. + Steps to generate scenarios: 1. Find indirect clusters of nodes based on relationship condition 2. Calculate the number of samples that should be created per cluster to get n samples in total @@ -93,7 +89,7 @@ async def _generate_scenarios( nodes.append(node) base_scenarios = [] - node_themes = [node.properties.get("themes", []) for node in nodes] + node_themes = [node.properties.get(self.abstract_property_name, []) for node in nodes] prompt_input = ConceptsList( lists_of_concepts=node_themes, max_combinations=num_sample_per_cluster ) @@ -117,7 +113,7 @@ async def _generate_scenarios( concept_combination.combinations, personas=persona_list, persona_item_mapping=persona_concepts.mapping, - property_name="themes", + property_name=self.abstract_property_name, ) base_scenarios = self.sample_diverse_combinations( base_scenarios, num_sample_per_cluster From ed0c75a119409ba31b55dfd79eda7861e507ea4b Mon Sep 17 00:00:00 2001 From: "alex.graber" Date: Wed, 7 May 2025 14:45:29 -0400 Subject: [PATCH 3/5] feat: allow user-defined node- and relationship- properties in MultiHopSpecificQuerySynthesizer - Users should be able to set the relationship type to use for identifying clusters and the relationship property used for identifying overlapping concepts within the triple. - This will not change default behavior, but allows users to override. --- .../synthesizers/multi_hop/specific.py | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/ragas/src/ragas/testset/synthesizers/multi_hop/specific.py b/ragas/src/ragas/testset/synthesizers/multi_hop/specific.py index 53ce84094..bf9e557e5 100644 --- a/ragas/src/ragas/testset/synthesizers/multi_hop/specific.py +++ b/ragas/src/ragas/testset/synthesizers/multi_hop/specific.py @@ -27,28 +27,19 @@ @dataclass class MultiHopSpecificQuerySynthesizer(MultiHopQuerySynthesizer): - """ - Synthesizes overlap based queries by choosing specific chunks and generating a - keyphrase from them and then generating queries based on that. - - Attributes - ---------- - generate_query_prompt : PydanticPrompt - The prompt used for generating the query. - """ + """Synthesize multi-hop queries based on a chunk cluster defined by entity overlap.""" name: str = "multi_hop_specific_query_synthesizer" - relation_type: str = "entities_overlap" property_name: str = "entities" + relation_type: str = "entities_overlap" + relation_overlap_property: str = "overlapped_items" theme_persona_matching_prompt: PydanticPrompt = ThemesPersonasMatchingPrompt() generate_query_reference_prompt: PydanticPrompt = QueryAnswerGenerationPrompt() def get_node_clusters(self, knowledge_graph: KnowledgeGraph) -> t.List[t.Tuple]: - + """Identify clusters of nodes based on the specified relationship condition.""" node_clusters = knowledge_graph.find_two_nodes_single_rel( - relationship_condition=lambda rel: ( - True if rel.type == self.relation_type else False - ) + relationship_condition=lambda rel: rel.type == self.relation_type ) logger.info("found %d clusters", len(node_clusters)) return node_clusters @@ -61,7 +52,8 @@ async def _generate_scenarios( callbacks: Callbacks, ) -> t.List[MultiHopScenario]: """ - Generates a list of scenarios on type MultiHopSpecificQuerySynthesizer + Generate a list of scenarios of type MultiHopScenario. + Steps to generate scenarios: 1. Filter the knowledge graph to find cluster of nodes or defined relation type. Here entities_overlap 2. Calculate the number of samples that should be created per cluster to get n samples in total @@ -87,7 +79,7 @@ async def _generate_scenarios( if len(scenarios) < n: node_a, node_b = triplet[0], triplet[-1] overlapped_items = [] - overlapped_items = triplet[1].properties["overlapped_items"] + overlapped_items = triplet[1].properties[self.relation_overlap_property] if overlapped_items: themes = list(dict(overlapped_items).keys()) prompt_input = ThemesPersonasInput( From 4ef90d720687bf074cf531b885617f8fd6340806 Mon Sep 17 00:00:00 2001 From: ahgraber Date: Wed, 7 May 2025 19:27:27 -0400 Subject: [PATCH 4/5] fix: 'single_hop_specifc_query_synthesizer' typo --- .../applications/singlehop_testset_gen.md | 20 +++++++++---------- .../testgenerator/_persona_generator.md | 10 +++++----- .../testgenerator/persona_generator.ipynb | 20 +++++++++---------- docs/howtos/integrations/_llamaindex.md | 6 +++--- docs/howtos/integrations/llamaindex.ipynb | 12 +++++------ 5 files changed, 34 insertions(+), 34 deletions(-) diff --git a/docs/howtos/applications/singlehop_testset_gen.md b/docs/howtos/applications/singlehop_testset_gen.md index 2358a950f..2b77a83b7 100644 --- a/docs/howtos/applications/singlehop_testset_gen.md +++ b/docs/howtos/applications/singlehop_testset_gen.md @@ -211,70 +211,70 @@ Output Wut do I do if my baggage is Delayed, Lost, or... [Baggage Policies\n\nThis section provides a d... If your baggage is delayed, lost, or damaged, ... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer 1 Wht asistance is provided by the airline durin... [Flight Delays\n\nFlight delays can be caused ... Depending on the length of the delay, Ragas Ai... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer 2 What is Step 1: Check Fare Rules in the contex... [Flight Cancellations\n\nFlight cancellations ... Step 1: Check Fare Rules involves logging into... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer 3 How can I access my booking online with Ragas ... [Managing Reservations\n\nManaging your reserv... To access your booking online with Ragas Airli... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer 4 What assistance does Ragas Airlines provide fo... [Special Assistance\n\nRagas Airlines provides... Ragas Airlines provides special assistance ser... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer 5 What steps should I take if my baggage is dela... [Baggage Policies This section provides a deta... If your baggage is delayed, lost, or damaged w... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer 6 How can I resubmit the claim for my baggage is... [Potential Issues and Resolutions for Baggage ... To resubmit the claim for your baggage issue, ... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer 7 Wut are the main causes of flight delays and h... [Flight Delays Flight delays can be caused by ... Flight delays can be caused by weather conditi... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer 8 How can I request reimbursement for additional... [2. Additional Expenses Incurred Due to Delay ... To request reimbursement for additional expens... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer 9 What are passenger-initiated cancelations? [Flight Cancellations Flight cancellations can... Passenger-initiated cancellations occur when a... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer diff --git a/docs/howtos/customizations/testgenerator/_persona_generator.md b/docs/howtos/customizations/testgenerator/_persona_generator.md index d0d32824c..984fd6d8d 100644 --- a/docs/howtos/customizations/testgenerator/_persona_generator.md +++ b/docs/howtos/customizations/testgenerator/_persona_generator.md @@ -98,35 +98,35 @@ testset.to_pandas().head() What the Director do in GitLab and how they wo... [09db4f3e-1c10-4863-9024-f869af48d3e0\n\ntitle... The Director at GitLab, such as the Director o... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer 1 Wht is the rol of the VP in GitLab? [56c84f1b-3558-4c80-b8a9-348e69a4801b\n\nJob F... The VP, or Vice President, at GitLab is respon... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer 2 What GitLab do for career progression? [ead619a5-930f-4e2b-b797-41927a04d2e3\n\nGoals... The Job frameworks at GitLab help team members... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer 3 Wht is the S-grop and how do they work with ot... [42babb12-b033-493f-b684-914e2b1b1d0f\n\nPeopl... Members of the S-group are expected to demonst... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer 4 How does Google execute its company vision? [c3ed463d-1cdc-4ba4-a6ca-2c4ab12da883\n\nof mo... To effectively execute the company vision, man... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer diff --git a/docs/howtos/customizations/testgenerator/persona_generator.ipynb b/docs/howtos/customizations/testgenerator/persona_generator.ipynb index c29d8a0fc..ba90e1260 100644 --- a/docs/howtos/customizations/testgenerator/persona_generator.ipynb +++ b/docs/howtos/customizations/testgenerator/persona_generator.ipynb @@ -122,35 +122,35 @@ " What the Director do in GitLab and how they wo...\n", " [09db4f3e-1c10-4863-9024-f869af48d3e0\\n\\ntitle...\n", " The Director at GitLab, such as the Director o...\n", - " single_hop_specifc_query_synthesizer\n", + " single_hop_specific_query_synthesizer\n", " \n", " \n", " 1\n", " Wht is the rol of the VP in GitLab?\n", " [56c84f1b-3558-4c80-b8a9-348e69a4801b\\n\\nJob F...\n", " The VP, or Vice President, at GitLab is respon...\n", - " single_hop_specifc_query_synthesizer\n", + " single_hop_specific_query_synthesizer\n", " \n", " \n", " 2\n", " What GitLab do for career progression?\n", " [ead619a5-930f-4e2b-b797-41927a04d2e3\\n\\nGoals...\n", " The Job frameworks at GitLab help team members...\n", - " single_hop_specifc_query_synthesizer\n", + " single_hop_specific_query_synthesizer\n", " \n", " \n", " 3\n", " Wht is the S-grop and how do they work with ot...\n", " [42babb12-b033-493f-b684-914e2b1b1d0f\\n\\nPeopl...\n", " Members of the S-group are expected to demonst...\n", - " single_hop_specifc_query_synthesizer\n", + " single_hop_specific_query_synthesizer\n", " \n", " \n", " 4\n", " How does Google execute its company vision?\n", " [c3ed463d-1cdc-4ba4-a6ca-2c4ab12da883\\n\\nof mo...\n", " To effectively execute the company vision, man...\n", - " single_hop_specifc_query_synthesizer\n", + " single_hop_specific_query_synthesizer\n", " \n", " \n", "\n", @@ -158,11 +158,11 @@ ], "text/plain": [ " user_input ... synthesizer_name\n", - "0 What the Director do in GitLab and how they wo... ... single_hop_specifc_query_synthesizer\n", - "1 Wht is the rol of the VP in GitLab? ... single_hop_specifc_query_synthesizer\n", - "2 What GitLab do for career progression? ... single_hop_specifc_query_synthesizer\n", - "3 Wht is the S-grop and how do they work with ot... ... single_hop_specifc_query_synthesizer\n", - "4 How does Google execute its company vision? ... single_hop_specifc_query_synthesizer\n", + "0 What the Director do in GitLab and how they wo... ... single_hop_specific_query_synthesizer\n", + "1 Wht is the rol of the VP in GitLab? ... single_hop_specific_query_synthesizer\n", + "2 What GitLab do for career progression? ... single_hop_specific_query_synthesizer\n", + "3 Wht is the S-grop and how do they work with ot... ... single_hop_specific_query_synthesizer\n", + "4 How does Google execute its company vision? ... single_hop_specific_query_synthesizer\n", "\n", "[5 rows x 4 columns]" ] diff --git a/docs/howtos/integrations/_llamaindex.md b/docs/howtos/integrations/_llamaindex.md index 865880cb8..f1c24adfd 100644 --- a/docs/howtos/integrations/_llamaindex.md +++ b/docs/howtos/integrations/_llamaindex.md @@ -88,21 +88,21 @@ df.head() Cud yu pleese explane the role of New York Cit... [New York, often called New York City or NYC, ... New York City serves as the geographical and d... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer 1 So like, what was New York City called before ... [History == === Early history === In the pre-C... Before it was called New York, the area was kn... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer 2 what happen in new york with slavery and how i... [and rechristened it "New Orange" after Willia... In the early 18th century, New York became a c... - single_hop_specifc_query_synthesizer + single_hop_specific_query_synthesizer 3 diff --git a/docs/howtos/integrations/llamaindex.ipynb b/docs/howtos/integrations/llamaindex.ipynb index 780c3dfb3..8cf44a942 100644 --- a/docs/howtos/integrations/llamaindex.ipynb +++ b/docs/howtos/integrations/llamaindex.ipynb @@ -135,21 +135,21 @@ " Cud yu pleese explane the role of New York Cit...\n", " [New York, often called New York City or NYC, ...\n", " New York City serves as the geographical and d...\n", - " single_hop_specifc_query_synthesizer\n", + " single_hop_specific_query_synthesizer\n", " \n", " \n", " 1\n", " So like, what was New York City called before ...\n", " [History == === Early history === In the pre-C...\n", " Before it was called New York, the area was kn...\n", - " single_hop_specifc_query_synthesizer\n", + " single_hop_specific_query_synthesizer\n", " \n", " \n", " 2\n", " what happen in new york with slavery and how i...\n", " [and rechristened it \"New Orange\" after Willia...\n", " In the early 18th century, New York became a c...\n", - " single_hop_specifc_query_synthesizer\n", + " single_hop_specific_query_synthesizer\n", " \n", " \n", " 3\n", @@ -192,9 +192,9 @@ "4 The Staten Island Ferry plays a significant ro... \n", "\n", " synthesizer_name \n", - "0 single_hop_specifc_query_synthesizer \n", - "1 single_hop_specifc_query_synthesizer \n", - "2 single_hop_specifc_query_synthesizer \n", + "0 single_hop_specific_query_synthesizer \n", + "1 single_hop_specific_query_synthesizer \n", + "2 single_hop_specific_query_synthesizer \n", "3 multi_hop_specific_query_synthesizer \n", "4 multi_hop_specific_query_synthesizer " ] From 33c7b5ce1483f2620db69616c6fe6acfd1bed825 Mon Sep 17 00:00:00 2001 From: "alex.graber" Date: Tue, 1 Jul 2025 15:20:12 -0400 Subject: [PATCH 5/5] fix: handle non-dict types for overlapped_items in MultiHopSpecificQuerySynthesizer - Updated logic to check if overlapped_items is a dictionary before extracting keys. - Ensured compatibility with non-dict types for overlapped_items. --- .../synthesizers/multi_hop/abstract.py | 8 ++++++-- .../synthesizers/multi_hop/specific.py | 19 ++++++++++++++++--- ragas/tests/unit/test_analytics.py | 2 +- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/ragas/src/ragas/testset/synthesizers/multi_hop/abstract.py b/ragas/src/ragas/testset/synthesizers/multi_hop/abstract.py index 1a581fb85..a6f660dc1 100644 --- a/ragas/src/ragas/testset/synthesizers/multi_hop/abstract.py +++ b/ragas/src/ragas/testset/synthesizers/multi_hop/abstract.py @@ -42,7 +42,9 @@ class MultiHopAbstractQuerySynthesizer(MultiHopQuerySynthesizer): def get_node_clusters(self, knowledge_graph: KnowledgeGraph) -> t.List[t.Set[Node]]: """Identify clusters of nodes based on the specified relationship condition.""" node_clusters = knowledge_graph.find_indirect_clusters( - relationship_condition=lambda rel: bool(rel.get_property(self.relation_property)), + relationship_condition=lambda rel: bool( + rel.get_property(self.relation_property) + ), depth_limit=3, ) logger.info("found %d clusters", len(node_clusters)) @@ -89,7 +91,9 @@ async def _generate_scenarios( nodes.append(node) base_scenarios = [] - node_themes = [node.properties.get(self.abstract_property_name, []) for node in nodes] + node_themes = [ + node.properties.get(self.abstract_property_name, []) for node in nodes + ] prompt_input = ConceptsList( lists_of_concepts=node_themes, max_combinations=num_sample_per_cluster ) diff --git a/ragas/src/ragas/testset/synthesizers/multi_hop/specific.py b/ragas/src/ragas/testset/synthesizers/multi_hop/specific.py index bf9e557e5..fdef74876 100644 --- a/ragas/src/ragas/testset/synthesizers/multi_hop/specific.py +++ b/ragas/src/ragas/testset/synthesizers/multi_hop/specific.py @@ -2,6 +2,7 @@ import logging import typing as t +from collections.abc import Iterable from dataclasses import dataclass import numpy as np @@ -81,7 +82,16 @@ async def _generate_scenarios( overlapped_items = [] overlapped_items = triplet[1].properties[self.relation_overlap_property] if overlapped_items: - themes = list(dict(overlapped_items).keys()) + if not all( + isinstance(item, (str, Iterable)) for item in overlapped_items + ): + logger.debug("Overlapped items are not strings or iterables.") + continue + themes = ( + list(overlapped_items.keys()) + if isinstance(overlapped_items, dict) + else overlapped_items + ) prompt_input = ThemesPersonasInput( themes=themes, personas=persona_list ) @@ -90,10 +100,13 @@ async def _generate_scenarios( data=prompt_input, llm=self.llm, callbacks=callbacks ) ) - overlapped_items = [list(item) for item in overlapped_items] + combinations = [ + [item] if isinstance(item, str) else list(item) + for item in themes + ] base_scenarios = self.prepare_combinations( [node_a, node_b], - overlapped_items, + combinations, personas=persona_list, persona_item_mapping=persona_concepts.mapping, property_name=self.property_name, diff --git a/ragas/tests/unit/test_analytics.py b/ragas/tests/unit/test_analytics.py index 4570f722a..ea4eedc23 100644 --- a/ragas/tests/unit/test_analytics.py +++ b/ragas/tests/unit/test_analytics.py @@ -135,7 +135,7 @@ def test_testset_generation_tracking(monkeypatch): ) assert testset_event_payload.model_dump()["evolution_names"] == [ - "single_hop_specifc_query_synthesizer", + "single_hop_specific_query_synthesizer", "multi_hop_abstract_query_synthesizer", "multi_hop_specific_query_synthesizer", ]