diff --git a/backend/editor/controllers/node_controller.py b/backend/editor/controllers/node_controller.py
index 0d020953..59ee27c9 100644
--- a/backend/editor/controllers/node_controller.py
+++ b/backend/editor/controllers/node_controller.py
@@ -23,14 +23,12 @@ async def delete_project_nodes(project_id: str):
     await get_current_transaction().run(query)
 
 
-async def create_entry_node(
-    project_id: str, entry_node: EntryNodeCreate, stopwords: dict[str, list[str]]
-) -> str:
+async def create_entry_node(project_id: str, entry_node: EntryNodeCreate) -> str:
     """
     Creates a new entry node in the database
     """
     name, language_code = entry_node.name, entry_node.main_language_code
-    normalized_name = parser_utils.normalize_text(name, language_code, stopwords=stopwords)
+    normalized_name = parser_utils.normalize_text(name, language_code)
 
     # Create params dict
     entry_node_data = {
diff --git a/backend/editor/entries.py b/backend/editor/entries.py
index e132d7c4..e6f3881d 100644
--- a/backend/editor/entries.py
+++ b/backend/editor/entries.py
@@ -79,12 +79,9 @@ async def create_entry_node(self, name, main_language_code) -> str:
         """
         Helper function used to create an entry node with given name and main language
         """
-        stopwords = await self.get_stopwords_dict()
-
         return await create_entry_node(
             self.project_name,
             EntryNodeCreate(name=name, main_language_code=main_language_code),
-            stopwords,
         )
 
     async def get_local_taxonomy_file(self, tmpdir: str, uploadfile: UploadFile):
@@ -571,8 +568,7 @@ async def update_node(self, label: NodeType, new_node: EntryNode):
         curr_node = EntryNode(**result[0]["n"])
 
         # Recompute normalized tags ids corresponding to entry tags
-        stopwords = await self.get_stopwords_dict()
-        new_node.recompute_tags_ids(stopwords)
+        new_node.recompute_tags_ids()
 
         # Build query
         query = [f"""MATCH (n:{self.project_name}:{label.value}) WHERE n.id = $id """]
diff --git a/backend/editor/models/node_models.py b/backend/editor/models/node_models.py
index ea199069..2c8d3715 100644
--- a/backend/editor/models/node_models.py
+++ b/backend/editor/models/node_models.py
@@ -55,7 +55,6 @@ def flat_dict(self) -> dict[str, Any]:
             del flat_data["comments"]
         return flat_data
 
-    def recompute_tags_ids(self, stopwords: dict[str, list[str]]):
-        """Recompute the tags_ids dictionary based on the tags dictionary and the
-        provided stopwords."""
+    def recompute_tags_ids(self):
+        """Recompute the tags_ids dictionary based on the tags dictionary."""
         self.tags_ids = {}
@@ -64,9 +63,7 @@
             keys_language_code = key.split("_", 1)[1]
             normalised_value = []
             for value in values:
-                normalised_value.append(
-                    parser_utils.normalize_text(value, keys_language_code, stopwords=stopwords)
-                )
+                normalised_value.append(parser_utils.normalize_text(value, keys_language_code))
             self.tags_ids["tags_ids" + key[4:]] = normalised_value
 
     def recompute_id(self):
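Review note: a minimal sketch of what the backend change means in practice. `normalize_text` comes from the parser package the backend already imports as `parser_utils`; the stopword list below is illustrative, not the project's real fr list:

```python
from openfoodfacts_taxonomy_parser import utils as parser_utils

# Entry names are now normalized without stopword removal, so the id
# keeps every word of the name.
name = "yaourts au fruit de la passion"
print(parser_utils.normalize_text(name, "fr"))
# -> "yaourts-au-fruit-de-la-passion"

# Passing a stopwords dict reproduces the old behaviour, where the same
# name collapsed to a shorter id.
stopwords = {"fr": ["au", "de", "la"]}  # illustrative, not the real list
print(parser_utils.normalize_text(name, "fr", stopwords=stopwords))
# -> "yaourts-fruit-passion"
```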
diff --git a/backend/tests/data/test.txt b/backend/tests/data/test.txt
index f654e6f0..64ea19b9 100644
--- a/backend/tests/data/test.txt
+++ b/backend/tests/data/test.txt
@@ -42,7 +42,7 @@ description:fr: un yaourt avec du citron
 color:en: yellow
 flavour:en: lemon
 
-< fr:yaourts fruit de la passion
+< fr:yaourts au fruit de la passion
 fr: yaourts au fruit de la passion allégés
 
 # meat
diff --git a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
--- a/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
+++ b/parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ ... @@ def _normalize_entry_id(self, raw_id: str) -> str:
         """
         raw_id = raw_id.strip()
         lc, main_tag = raw_id.split(":", 1)
-        normalized_main_tag = normalize_text(main_tag, lc, stopwords=self.stopwords)
+        normalized_main_tag = normalize_text(main_tag, lc)
         normalized_id = f"{lc}:{normalized_main_tag}"
         return normalized_id
@@ -138,11 +138,11 @@ def undo_normalize_text(self, text: str) -> str:
         text = re.sub(r"\\‚", "\\,", text)
         return text
 
-    def _get_lc_value(self, line: str, remove_stopwords=True) -> tuple[str, list[str]]:
+    def _get_lc_value(self, line: str, remove_stopwords=False) -> tuple[str, list[str]]:
         """Get the language code "lc" and a list of values and normalized values"""
         lc, line = line.split(":", 1)
         values = [self.undo_normalize_text(word.strip()) for word in line.split(",")]
-        stopwords = self.stopwords if remove_stopwords else []
+        stopwords = self.stopwords if remove_stopwords else None
         tags = [normalize_text(word, lc, stopwords=stopwords) for word in values]
         return lc, values, tags
 
@@ -264,7 +264,7 @@ def _process_stopwords(self, data, line, line_number, index_stopwords):
         # remove "stopwords:" part
         line = line[10:]
         try:
-            lc, tags, tags_ids = self._get_lc_value(line, remove_stopwords=False)
+            lc, tags, tags_ids = self._get_lc_value(line)
         except ValueError:
             self.parser_logger.error(
                 f"Missing language code at line {line_number + 1} ? "
@@ -317,7 +317,7 @@ def _process_entry(self, data, line, comments):
             tagsids_list = []
             for word in line.split(","):
                 tags_list.append(self.undo_normalize_text(word.strip()))
-                word_normalized = normalize_text(word, lang, stopwords=self.stopwords)
+                word_normalized = normalize_text(word, lang)
                 if word_normalized not in tagsids_list:
                     # in case 2 normalized synonyms are the same
                     tagsids_list.append(word_normalized)
@@ -553,8 +553,7 @@ def _merge_duplicate_entry_nodes(self, entry_nodes: list[NodeData]) -> list[NodeData]:
                     # because two tags can have the same normalized value
                     language_code = key.split("_")[1]
                     first_node.tags[f"tags_ids_{language_code}"] = [
-                        normalize_text(tag, language_code, stopwords=self.stopwords)
-                        for tag in first_node.tags[key]
+                        normalize_text(tag, language_code) for tag in first_node.tags[key]
                     ]
                 for key, value in node.properties.items():
                     # overwrite the value if the property already exists in the first node
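Review note: a simplified stand-in for `_get_lc_value` (it skips the `undo_normalize_text` unescaping step) to show the flipped default: callers now keep stopwords unless they explicitly opt in, and `None` lets `normalize_text` fall back to an empty dict:

```python
from openfoodfacts_taxonomy_parser.utils import normalize_text

def get_lc_value(line: str, all_stopwords: dict[str, list[str]], remove_stopwords: bool = False):
    """Simplified sketch of TaxonomyParser._get_lc_value after this change."""
    lc, rest = line.split(":", 1)
    values = [word.strip() for word in rest.split(",")]
    # stopwords are only stripped when a caller explicitly asks for it
    stopwords = all_stopwords if remove_stopwords else None
    tags = [normalize_text(word, lc, stopwords=stopwords) for word in values]
    return lc, values, tags

all_stopwords = {"fr": ["au", "de", "la"]}  # illustrative list
print(get_lc_value("fr: fruit de la passion, maracuja", all_stopwords))
# -> ('fr', ['fruit de la passion', 'maracuja'], ['fruit-de-la-passion', 'maracuja'])
```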
+ """ if stopwords is None: stopwords = {} @@ -44,6 +51,7 @@ def normalize_text( line = line.strip(char) # Remove stopwords + # Be careful, this must not be used to compute entry id if lang in stopwords: stopwords = stopwords[lang] line_surrounded_by_char = char + line + char diff --git a/parser/tests/data/test.txt b/parser/tests/data/test.txt index c501a4b5..242d699b 100644 --- a/parser/tests/data/test.txt +++ b/parser/tests/data/test.txt @@ -18,7 +18,7 @@ fr: yaourts à la banane en: Passion fruit yogurts fr: yaourts au fruit de la passion -< fr:yaourts fruit de la passion +< fr:yaourts au fruit de la passion fr: yaourts au fruit de la passion allégés # meat diff --git a/parser/tests/integration/test_parse_unparse_integration.py b/parser/tests/integration/test_parse_unparse_integration.py index e2420845..25a559a7 100644 --- a/parser/tests/integration/test_parse_unparse_integration.py +++ b/parser/tests/integration/test_parse_unparse_integration.py @@ -56,7 +56,7 @@ def test_round_trip(neo4j): if line.startswith("stopwords:fr: aux"): line = "stopwords:fr: aux, au, de, le, du, la, a, et, test normalisation" # second tweak: renaming parent - elif line.startswith("< fr:yaourts fruit de la passion"): + elif line.startswith("< fr:yaourts au fruit de la passion"): line = "< en:Passion fruit yogurts" # third tweak: commenting non existing parents elif line.startswith("< en:milk"): @@ -101,7 +101,7 @@ def test_two_branch_round_trip(neo4j): if line.startswith("stopwords:fr: aux"): line = "stopwords:fr: aux, au, de, le, du, la, a, et, test normalisation" # second tweak: renaming parent - elif line.startswith("< fr:yaourts fruit de la passion"): + elif line.startswith("< fr:yaourts au fruit de la passion"): line = "< en:Passion fruit yogurts" # third tweak: commenting non existing parents elif line.startswith("< en:milk"): @@ -142,7 +142,7 @@ def test_round_trip_with_external_taxonomies(neo4j): if line.startswith("stopwords:fr: aux"): line = "stopwords:fr: aux, au, de, le, du, la, a, et, test normalisation" # second tweak: renaming parent - elif line.startswith("< fr:yaourts fruit de la passion"): + elif line.startswith("< fr:yaourts au fruit de la passion"): line = "< en:Passion fruit yogurts" expected_lines.append(line) @@ -227,7 +227,7 @@ def test_patcher_with_modifications(neo4j): result = session.run( f""" MATCH (n:p_test_branch) - WHERE n.id = "fr:yaourts-fruit-passion-alleges" + WHERE n.id = "fr:yaourts-au-fruit-de-la-passion-alleges" SET n.modified = {modified} WITH n MATCH (m:p_test_branch) @@ -236,7 +236,7 @@ def test_patcher_with_modifications(neo4j): RETURN n.id, m.id """ ) - assert result.values() == [["fr:yaourts-fruit-passion-alleges", "en:yogurts"]] + assert result.values() == [["fr:yaourts-au-fruit-de-la-passion-alleges", "en:yogurts"]] # detach the node and set the node as REMOVED result = session.run( f""" @@ -346,7 +346,7 @@ def test_patcher_with_modifications(neo4j): for num, (line, next_line) in enumerate(zip(original_lines, original_lines[1:] + [None])): more_lines = [] # changed parent - if line.startswith("< fr:yaourts fruit de la passion"): + if line.startswith("< fr:yaourts au fruit de la passion"): line = "< en:yogurts" # no more parent elif line.startswith("< en:yogurts") and next_line.startswith("en: banana yogurts"): diff --git a/parser/tests/integration/test_parser_integration.py b/parser/tests/integration/test_parser_integration.py index 92f01bb4..0949174a 100644 --- a/parser/tests/integration/test_parser_integration.py +++ 
diff --git a/parser/tests/data/test.txt b/parser/tests/data/test.txt
index c501a4b5..242d699b 100644
--- a/parser/tests/data/test.txt
+++ b/parser/tests/data/test.txt
@@ -18,7 +18,7 @@ fr: yaourts à la banane
 en: Passion fruit yogurts
 fr: yaourts au fruit de la passion
 
-< fr:yaourts fruit de la passion
+< fr:yaourts au fruit de la passion
 fr: yaourts au fruit de la passion allégés
 
 # meat
diff --git a/parser/tests/integration/test_parse_unparse_integration.py b/parser/tests/integration/test_parse_unparse_integration.py
index e2420845..25a559a7 100644
--- a/parser/tests/integration/test_parse_unparse_integration.py
+++ b/parser/tests/integration/test_parse_unparse_integration.py
@@ -56,7 +56,7 @@ def test_round_trip(neo4j):
         if line.startswith("stopwords:fr: aux"):
             line = "stopwords:fr: aux, au, de, le, du, la, a, et, test normalisation"
         # second tweak: renaming parent
-        elif line.startswith("< fr:yaourts fruit de la passion"):
+        elif line.startswith("< fr:yaourts au fruit de la passion"):
             line = "< en:Passion fruit yogurts"
         # third tweak: commenting non existing parents
         elif line.startswith("< en:milk"):
@@ -101,7 +101,7 @@ def test_two_branch_round_trip(neo4j):
         if line.startswith("stopwords:fr: aux"):
             line = "stopwords:fr: aux, au, de, le, du, la, a, et, test normalisation"
         # second tweak: renaming parent
-        elif line.startswith("< fr:yaourts fruit de la passion"):
+        elif line.startswith("< fr:yaourts au fruit de la passion"):
             line = "< en:Passion fruit yogurts"
         # third tweak: commenting non existing parents
         elif line.startswith("< en:milk"):
@@ -142,7 +142,7 @@ def test_round_trip_with_external_taxonomies(neo4j):
         if line.startswith("stopwords:fr: aux"):
             line = "stopwords:fr: aux, au, de, le, du, la, a, et, test normalisation"
         # second tweak: renaming parent
-        elif line.startswith("< fr:yaourts fruit de la passion"):
+        elif line.startswith("< fr:yaourts au fruit de la passion"):
             line = "< en:Passion fruit yogurts"
 
         expected_lines.append(line)
@@ -227,7 +227,7 @@ def test_patcher_with_modifications(neo4j):
         result = session.run(
             f"""
             MATCH (n:p_test_branch)
-            WHERE n.id = "fr:yaourts-fruit-passion-alleges"
+            WHERE n.id = "fr:yaourts-au-fruit-de-la-passion-alleges"
             SET n.modified = {modified}
             WITH n
             MATCH (m:p_test_branch)
@@ -236,7 +236,7 @@ def test_patcher_with_modifications(neo4j):
             RETURN n.id, m.id
             """
         )
-        assert result.values() == [["fr:yaourts-fruit-passion-alleges", "en:yogurts"]]
+        assert result.values() == [["fr:yaourts-au-fruit-de-la-passion-alleges", "en:yogurts"]]
         # detach the node and set the node as REMOVED
         result = session.run(
             f"""
@@ -346,7 +346,7 @@ def test_patcher_with_modifications(neo4j):
     for num, (line, next_line) in enumerate(zip(original_lines, original_lines[1:] + [None])):
         more_lines = []
         # changed parent
-        if line.startswith("< fr:yaourts fruit de la passion"):
+        if line.startswith("< fr:yaourts au fruit de la passion"):
             line = "< en:yogurts"
         # no more parent
         elif line.startswith("< en:yogurts") and next_line.startswith("en: banana yogurts"):
diff --git a/parser/tests/integration/test_parser_integration.py b/parser/tests/integration/test_parser_integration.py
index 92f01bb4..0949174a 100644
--- a/parser/tests/integration/test_parser_integration.py
+++ b/parser/tests/integration/test_parser_integration.py
@@ -56,7 +56,7 @@ def test_calling(neo4j):
         {
             "id": "synonyms:1",
             "tags_fr": ["fruit de la passion", "maracuja", "passion"],
-            "tags_ids_fr": ["fruit-passion", "maracuja", "passion"],
+            "tags_ids_fr": ["fruit-de-la-passion", "maracuja", "passion"],
             "preceding_lines": [""],
             "src_position": 7,
         },
@@ -115,7 +115,7 @@
             "tags_en": ["banana yogurts"],
             "tags_ids_en": ["banana-yogurts"],
             "tags_fr": ["yaourts à la banane"],
-            "tags_ids_fr": ["yaourts-banane"],
+            "tags_ids_fr": ["yaourts-a-la-banane"],
             "preceding_lines": [],
         },
         {
@@ -146,7 +146,7 @@
     expected_pairs = [
         ["en:banana-yogurts", "en:yogurts"],
         ["en:passion-fruit-yogurts", "en:yogurts"],
-        ["fr:yaourts-fruit-passion-alleges", "en:passion-fruit-yogurts"],
+        ["fr:yaourts-au-fruit-de-la-passion-alleges", "en:passion-fruit-yogurts"],
         ["en:fake-meat", "en:meat"],
         ["en:fake-duck-meat", "en:fake-meat"],
         ["en:fake-duck-meat", "en:fake-stuff"],
@@ -173,8 +173,8 @@
         ["synonyms:1", "en:yogurts"],
         ["en:yogurts", "en:banana-yogurts"],
         ["en:banana-yogurts", "en:passion-fruit-yogurts"],
-        ["en:passion-fruit-yogurts", "fr:yaourts-fruit-passion-alleges"],
-        ["fr:yaourts-fruit-passion-alleges", "en:meat"],
+        ["en:passion-fruit-yogurts", "fr:yaourts-au-fruit-de-la-passion-alleges"],
+        ["fr:yaourts-au-fruit-de-la-passion-alleges", "en:meat"],
         ["en:meat", "en:fake-meat"],
         ["en:fake-meat", "en:fake-stuff"],
         ["en:fake-stuff", "en:fake-duck-meat"],
@@ -220,7 +220,7 @@ def test_with_external_taxonomies(neo4j):
         {
             "id": "synonyms:1",
             "tags_fr": ["fruit de la passion", "maracuja", "passion"],
-            "tags_ids_fr": ["fruit-passion", "maracuja", "passion"],
+            "tags_ids_fr": ["fruit-de-la-passion", "maracuja", "passion"],
             "preceding_lines": [""],
             "src_position": 7,
         },
@@ -279,7 +279,7 @@
             "tags_en": ["banana yogurts"],
             "tags_ids_en": ["banana-yogurts"],
             "tags_fr": ["yaourts à la banane"],
-            "tags_ids_fr": ["yaourts-banane"],
+            "tags_ids_fr": ["yaourts-a-la-banane"],
             "preceding_lines": [],
         },
         {
@@ -311,7 +311,7 @@
         ["en:yogurts", "en:milk"],
         ["en:banana-yogurts", "en:yogurts"],
         ["en:passion-fruit-yogurts", "en:yogurts"],
-        ["fr:yaourts-fruit-passion-alleges", "en:passion-fruit-yogurts"],
+        ["fr:yaourts-au-fruit-de-la-passion-alleges", "en:passion-fruit-yogurts"],
         ["en:fake-meat", "en:meat"],
         ["en:fake-duck-meat", "en:fake-meat"],
         ["en:fake-duck-meat", "en:fake-stuff"],
@@ -338,8 +338,8 @@
         ["synonyms:1", "en:yogurts"],
         ["en:yogurts", "en:banana-yogurts"],
         ["en:banana-yogurts", "en:passion-fruit-yogurts"],
-        ["en:passion-fruit-yogurts", "fr:yaourts-fruit-passion-alleges"],
-        ["fr:yaourts-fruit-passion-alleges", "en:meat"],
+        ["en:passion-fruit-yogurts", "fr:yaourts-au-fruit-de-la-passion-alleges"],
+        ["fr:yaourts-au-fruit-de-la-passion-alleges", "en:meat"],
         ["en:meat", "en:fake-meat"],
         ["en:fake-meat", "en:fake-stuff"],
         ["en:fake-stuff", "en:fake-duck-meat"],
@@ -406,7 +406,7 @@ def test_properties_confused_lang(neo4j, tmp_path):
         pathlib.Path(__file__).parent.parent / "data" / "test_property_confused_lang.txt"
     )
     test_parser(fpath, None, "branch", "test")
-    query = "MATCH (n:p_test_branch) WHERE n.id = 'en:1-for-planet' RETURN n"
+    query = "MATCH (n:p_test_branch) WHERE n.id = 'en:1-for-the-planet' RETURN n"
     result = session.run(query)
    node = result.value()[0]
    # "web:en" was not confused with a language prefix "web:"
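Review note: the fixture updates above all follow the same rule, so they can be sanity-checked directly against the parser package; the `en:1-for-the-planet` id likewise keeps the English stopword "the". A quick check of two of the new expected values:

```python
from openfoodfacts_taxonomy_parser.utils import normalize_text

# Ids in the updated expectations keep their stopwords; accents and
# spaces are still folded as before.
assert normalize_text("yaourts à la banane", "fr") == "yaourts-a-la-banane"
assert (
    normalize_text("yaourts au fruit de la passion allégés", "fr")
    == "yaourts-au-fruit-de-la-passion-alleges"
)
```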