Merged
6 changes: 2 additions & 4 deletions backend/editor/controllers/node_controller.py
@@ -23,14 +23,12 @@ async def delete_project_nodes(project_id: str):
     await get_current_transaction().run(query)


-async def create_entry_node(
-    project_id: str, entry_node: EntryNodeCreate, stopwords: dict[str, list[str]]
-) -> str:
+async def create_entry_node(project_id: str, entry_node: EntryNodeCreate) -> str:
     """
     Creates a new entry node in the database
     """
     name, language_code = entry_node.name, entry_node.main_language_code
-    normalized_name = parser_utils.normalize_text(name, language_code, stopwords=stopwords)
+    normalized_name = parser_utils.normalize_text(name, language_code)

     # Create params dict
     entry_node_data = {
6 changes: 1 addition & 5 deletions backend/editor/entries.py
@@ -79,12 +79,9 @@ async def create_entry_node(self, name, main_language_code) -> str:
         """
         Helper function used to create an entry node with given name and main language
         """
-        stopwords = await self.get_stopwords_dict()
-
         return await create_entry_node(
             self.project_name,
             EntryNodeCreate(name=name, main_language_code=main_language_code),
-            stopwords,
         )

     async def get_local_taxonomy_file(self, tmpdir: str, uploadfile: UploadFile):
@@ -571,8 +568,7 @@ async def update_node(self, label: NodeType, new_node: EntryNode):
         curr_node = EntryNode(**result[0]["n"])

         # Recompute normalized tags ids corresponding to entry tags
-        stopwords = await self.get_stopwords_dict()
-        new_node.recompute_tags_ids(stopwords)
+        new_node.recompute_tags_ids()

         # Build query
         query = [f"""MATCH (n:{self.project_name}:{label.value}) WHERE n.id = $id """]
6 changes: 2 additions & 4 deletions backend/editor/models/node_models.py
@@ -55,7 +55,7 @@ def flat_dict(self) -> dict[str, Any]:
         del flat_data["comments"]
         return flat_data

-    def recompute_tags_ids(self, stopwords: dict[str, list[str]]):
+    def recompute_tags_ids(self):
         """Recompute the tags_ids dictionary based on the tags dictionary
         and the provided stopwords."""
         self.tags_ids = {}
@@ -64,9 +64,7 @@ def recompute_tags_ids(self, stopwords: dict[str, list[str]]):
             keys_language_code = key.split("_", 1)[1]
             normalised_value = []
             for value in values:
-                normalised_value.append(
-                    parser_utils.normalize_text(value, keys_language_code, stopwords=stopwords)
-                )
+                normalised_value.append(parser_utils.normalize_text(value, keys_language_code))
             self.tags_ids["tags_ids" + key[4:]] = normalised_value

     def recompute_id(self):
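The method above now simply maps each "tags_xx" key to a "tags_ids_xx" key whose values are normalized with no stopword handling at all. A minimal standalone sketch of that transformation, assuming the parser package is importable as parser_utils (the French tags and the expected output are taken from the parser test fixtures updated in this PR):

from openfoodfacts_taxonomy_parser import utils as parser_utils

# Hypothetical entry tags, keyed by "tags_" + language code
tags = {"tags_fr": ["yaourts à la banane", "yaourts banane"]}

tags_ids = {}
for key, values in tags.items():
    keys_language_code = key.split("_", 1)[1]  # "tags_fr" -> "fr"
    # "tags_fr" -> "tags_ids_fr"; accents are stripped, stopwords ("à", "la") are kept
    tags_ids["tags_ids" + key[4:]] = [
        parser_utils.normalize_text(value, keys_language_code) for value in values
    ]

assert tags_ids == {"tags_ids_fr": ["yaourts-a-la-banane", "yaourts-banane"]}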
2 changes: 1 addition & 1 deletion backend/tests/data/test.txt
@@ -42,7 +42,7 @@ description:fr: un yaourt avec du citron
 color:en: yellow
 flavour:en: lemon

-<fr: yaourts fruit de la passion
+<fr: yaourts au fruit de la passion
 <fr: yaourts allégés
 fr: yaourts au fruit de la passion allégés
 nl: magere yoghurts met passievrucht
113 changes: 62 additions & 51 deletions backend/tests/expected_results/test_upload_taxonomy.json

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions backend/tests/test_export.py
@@ -62,7 +62,7 @@ async def test_remove_parent(taxonomy_test):
         # remove "yaourts allégés" for "yaourts au fruit de la passion allégés"
         children = await taxonomy_test.get_children("fr:yaourts-alleges")
         children_ids = [record["child.id"] for record in children]
-        children_ids.remove("fr:yaourts-fruit-passion-alleges")
+        children_ids.remove("fr:yaourts-au-fruit-de-la-passion-alleges")
         await taxonomy_test.update_node_children("fr:yaourts-alleges", children_ids)
     background_tasks = FakeBackgroundTask()
     file_path = taxonomy_test.dump_taxonomy(background_tasks)
@@ -84,7 +84,7 @@ async def test_add_parent(taxonomy_test):
         # add "en: fake-stuff" to "yaourts au fruit de la passion allégés"
         children = await taxonomy_test.get_children("en:fake-stuff")
         children_ids = [record["child.id"] for record in children]
-        children_ids.append("fr:yaourts-fruit-passion-alleges")
+        children_ids.append("fr:yaourts-au-fruit-de-la-passion-alleges")
         await taxonomy_test.update_node_children("en:fake-stuff", children_ids)
     background_tasks = FakeBackgroundTask()
     file_path = taxonomy_test.dump_taxonomy(background_tasks)
@@ -106,7 +106,7 @@ async def test_add_synonym(taxonomy_test):
     async with graph_db.TransactionCtx():
         # add synonym to yaourts au fruit de la passion
         (node_data,) = await taxonomy_test.get_nodes(
-            NodeType.ENTRY, "fr:yaourts-fruit-passion-alleges"
+            NodeType.ENTRY, "fr:yaourts-au-fruit-de-la-passion-alleges"
         )
         node = EntryNode(**dict(node_data["n"]))
         node.tags["tags_fr"].append("yaourts allégé aux grenadilles")
13 changes: 6 additions & 7 deletions parser/openfoodfacts_taxonomy_parser/parser/taxonomy_parser.py
@@ -111,7 +111,7 @@ def _normalize_entry_id(self, raw_id: str) -> str:
         """
         raw_id = raw_id.strip()
         lc, main_tag = raw_id.split(":", 1)
-        normalized_main_tag = normalize_text(main_tag, lc, stopwords=self.stopwords)
+        normalized_main_tag = normalize_text(main_tag, lc)
         normalized_id = f"{lc}:{normalized_main_tag}"
         return normalized_id

@@ -138,11 +138,11 @@ def undo_normalize_text(self, text: str) -> str:
         text = re.sub(r"\\‚", "\\,", text)
         return text

-    def _get_lc_value(self, line: str, remove_stopwords=True) -> tuple[str, list[str]]:
+    def _get_lc_value(self, line: str, remove_stopwords=False) -> tuple[str, list[str]]:
         """Get the language code "lc" and a list of values and normalized values"""
         lc, line = line.split(":", 1)
         values = [self.undo_normalize_text(word.strip()) for word in line.split(",")]
-        stopwords = self.stopwords if remove_stopwords else []
+        stopwords = self.stopwords if remove_stopwords else None
         tags = [normalize_text(word, lc, stopwords=stopwords) for word in values]
         return lc, values, tags

@@ -264,7 +264,7 @@ def _process_stopwords(self, data, line, line_number, index_stopwords):
         # remove "stopwords:" part
         line = line[10:]
         try:
-            lc, tags, tags_ids = self._get_lc_value(line, remove_stopwords=False)
+            lc, tags, tags_ids = self._get_lc_value(line)
         except ValueError:
             self.parser_logger.error(
                 f"Missing language code at line {line_number + 1} ? "
@@ -317,7 +317,7 @@ def _process_entry(self, data, line, comments):
             tagsids_list = []
             for word in line.split(","):
                 tags_list.append(self.undo_normalize_text(word.strip()))
-                word_normalized = normalize_text(word, lang, stopwords=self.stopwords)
+                word_normalized = normalize_text(word, lang)
                 if word_normalized not in tagsids_list:
                     # in case 2 normalized synonyms are the same
                     tagsids_list.append(word_normalized)
@@ -553,8 +553,7 @@ def _merge_duplicate_entry_nodes(self, entry_nodes: list[NodeData]) -> list[NodeData]:
                 # because two tags can have the same normalized value
                 language_code = key.split("_")[1]
                 first_node.tags[f"tags_ids_{language_code}"] = [
-                    normalize_text(tag, language_code, stopwords=self.stopwords)
-                    for tag in first_node.tags[key]
+                    normalize_text(tag, language_code) for tag in first_node.tags[key]
                 ]
             for key, value in node.properties.items():
                 # overwrite the value if the property already exists in the first node
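With the new default, _get_lc_value keeps stopwords in the normalized tags; only a caller that explicitly passes remove_stopwords=True would get the stripped variant. A rough sketch of the new default path, for a hypothetical stopwords line whose "stopwords:" prefix has already been removed (undo_normalize_text is omitted for brevity):

from openfoodfacts_taxonomy_parser.utils import normalize_text

line = "fr: aux, au, de, le, du, la"  # hypothetical input, prefix already stripped

lc, rest = line.split(":", 1)
values = [word.strip() for word in rest.split(",")]
# remove_stopwords defaults to False, so normalize_text gets stopwords=None
# and every word survives normalization
tags = [normalize_text(word, lc) for word in values]

assert lc == "fr"
assert tags == ["aux", "au", "de", "le", "du", "la"]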
10 changes: 9 additions & 1 deletion parser/openfoodfacts_taxonomy_parser/utils.py
@@ -10,7 +10,14 @@ def normalize_text(
     char: str = "-",
     stopwords: dict[str, list[str]] | None = None,
 ) -> str:
-    """Normalize a string depending on the language code"""
+    """Normalize a string depending on the language code
+
+    :param stopwords: associate a language code to a list of stopwords,
+        stopwords will be removed from the normalized text.
+
+        It should only be used while we are trying to extend synonym matching.
+        Most of the time (computing entry id), you should not account for stopwords.
+    """
     if stopwords is None:
         stopwords = {}

@@ -44,6 +51,7 @@ def normalize_text(
     line = line.strip(char)

     # Remove stopwords
+    # Be careful, this must not be used to compute entry id
     if lang in stopwords:
         stopwords = stopwords[lang]
         line_surrounded_by_char = char + line + char
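The docstring added above captures the crux of this PR: stopword removal is opt-in, and entry ids must be computed without it. A small sketch of the two modes; the exact outputs are inferred from the test fixtures updated in this PR, not guaranteed by the function's contract:

from openfoodfacts_taxonomy_parser.utils import normalize_text

stopwords = {"fr": ["aux", "au", "de", "le", "du", "la", "a", "et"]}

# Computing an entry id: no stopwords argument, every word is kept
normalize_text("yaourts au fruit de la passion", "fr")
# -> "yaourts-au-fruit-de-la-passion"

# Extending synonym matching: stopwords are removed from the normalized text
normalize_text("yaourts au fruit de la passion", "fr", stopwords=stopwords)
# -> "yaourts-fruit-passion"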
2 changes: 1 addition & 1 deletion parser/tests/data/test.txt
@@ -18,7 +18,7 @@ fr: yaourts à la banane
 en: Passion fruit yogurts
 fr: yaourts au fruit de la passion

-< fr:yaourts fruit de la passion
+< fr:yaourts au fruit de la passion
 fr: yaourts au fruit de la passion allégés

 # meat
12 changes: 6 additions & 6 deletions parser/tests/integration/test_parse_unparse_integration.py
@@ -56,7 +56,7 @@ def test_round_trip(neo4j):
         if line.startswith("stopwords:fr: aux"):
             line = "stopwords:fr: aux, au, de, le, du, la, a, et, test normalisation"
         # second tweak: renaming parent
-        elif line.startswith("< fr:yaourts fruit de la passion"):
+        elif line.startswith("< fr:yaourts au fruit de la passion"):
             line = "< en:Passion fruit yogurts"
         # third tweak: commenting non existing parents
         elif line.startswith("< en:milk"):
@@ -101,7 +101,7 @@ def test_two_branch_round_trip(neo4j):
         if line.startswith("stopwords:fr: aux"):
             line = "stopwords:fr: aux, au, de, le, du, la, a, et, test normalisation"
         # second tweak: renaming parent
-        elif line.startswith("< fr:yaourts fruit de la passion"):
+        elif line.startswith("< fr:yaourts au fruit de la passion"):
             line = "< en:Passion fruit yogurts"
         # third tweak: commenting non existing parents
         elif line.startswith("< en:milk"):
@@ -142,7 +142,7 @@ def test_round_trip_with_external_taxonomies(neo4j):
         if line.startswith("stopwords:fr: aux"):
             line = "stopwords:fr: aux, au, de, le, du, la, a, et, test normalisation"
         # second tweak: renaming parent
-        elif line.startswith("< fr:yaourts fruit de la passion"):
+        elif line.startswith("< fr:yaourts au fruit de la passion"):
             line = "< en:Passion fruit yogurts"
         expected_lines.append(line)

@@ -227,7 +227,7 @@ def test_patcher_with_modifications(neo4j):
         result = session.run(
             f"""
             MATCH (n:p_test_branch)
-            WHERE n.id = "fr:yaourts-fruit-passion-alleges"
+            WHERE n.id = "fr:yaourts-au-fruit-de-la-passion-alleges"
             SET n.modified = {modified}
             WITH n
             MATCH (m:p_test_branch)
@@ -236,7 +236,7 @@ def test_patcher_with_modifications(neo4j):
             RETURN n.id, m.id
             """
         )
-        assert result.values() == [["fr:yaourts-fruit-passion-alleges", "en:yogurts"]]
+        assert result.values() == [["fr:yaourts-au-fruit-de-la-passion-alleges", "en:yogurts"]]
         # detach the node and set the node as REMOVED
         result = session.run(
             f"""
@@ -346,7 +346,7 @@ def test_patcher_with_modifications(neo4j):
     for num, (line, next_line) in enumerate(zip(original_lines, original_lines[1:] + [None])):
         more_lines = []
         # changed parent
-        if line.startswith("< fr:yaourts fruit de la passion"):
+        if line.startswith("< fr:yaourts au fruit de la passion"):
             line = "< en:yogurts"
         # no more parent
         elif line.startswith("< en:yogurts") and next_line.startswith("en: banana yogurts"):
22 changes: 11 additions & 11 deletions parser/tests/integration/test_parser_integration.py
@@ -56,7 +56,7 @@ def test_calling(neo4j):
         {
             "id": "synonyms:1",
             "tags_fr": ["fruit de la passion", "maracuja", "passion"],
-            "tags_ids_fr": ["fruit-passion", "maracuja", "passion"],
+            "tags_ids_fr": ["fruit-de-la-passion", "maracuja", "passion"],
             "preceding_lines": [""],
             "src_position": 7,
         },
@@ -115,7 +115,7 @@ def test_calling(neo4j):
             "tags_en": ["banana yogurts"],
             "tags_ids_en": ["banana-yogurts"],
             "tags_fr": ["yaourts à la banane"],
-            "tags_ids_fr": ["yaourts-banane"],
+            "tags_ids_fr": ["yaourts-a-la-banane"],
             "preceding_lines": [],
         },
         {
@@ -146,7 +146,7 @@ def test_calling(neo4j):
     expected_pairs = [
         ["en:banana-yogurts", "en:yogurts"],
         ["en:passion-fruit-yogurts", "en:yogurts"],
-        ["fr:yaourts-fruit-passion-alleges", "en:passion-fruit-yogurts"],
+        ["fr:yaourts-au-fruit-de-la-passion-alleges", "en:passion-fruit-yogurts"],
         ["en:fake-meat", "en:meat"],
         ["en:fake-duck-meat", "en:fake-meat"],
         ["en:fake-duck-meat", "en:fake-stuff"],
@@ -173,8 +173,8 @@ def test_calling(neo4j):
         ["synonyms:1", "en:yogurts"],
         ["en:yogurts", "en:banana-yogurts"],
         ["en:banana-yogurts", "en:passion-fruit-yogurts"],
-        ["en:passion-fruit-yogurts", "fr:yaourts-fruit-passion-alleges"],
-        ["fr:yaourts-fruit-passion-alleges", "en:meat"],
+        ["en:passion-fruit-yogurts", "fr:yaourts-au-fruit-de-la-passion-alleges"],
+        ["fr:yaourts-au-fruit-de-la-passion-alleges", "en:meat"],
         ["en:meat", "en:fake-meat"],
         ["en:fake-meat", "en:fake-stuff"],
         ["en:fake-stuff", "en:fake-duck-meat"],
@@ -220,7 +220,7 @@ def test_with_external_taxonomies(neo4j):
         {
             "id": "synonyms:1",
             "tags_fr": ["fruit de la passion", "maracuja", "passion"],
-            "tags_ids_fr": ["fruit-passion", "maracuja", "passion"],
+            "tags_ids_fr": ["fruit-de-la-passion", "maracuja", "passion"],
             "preceding_lines": [""],
             "src_position": 7,
         },
@@ -279,7 +279,7 @@ def test_with_external_taxonomies(neo4j):
             "tags_en": ["banana yogurts"],
             "tags_ids_en": ["banana-yogurts"],
             "tags_fr": ["yaourts à la banane"],
-            "tags_ids_fr": ["yaourts-banane"],
+            "tags_ids_fr": ["yaourts-a-la-banane"],
             "preceding_lines": [],
         },
         {
@@ -311,7 +311,7 @@ def test_with_external_taxonomies(neo4j):
         ["en:yogurts", "en:milk"],
         ["en:banana-yogurts", "en:yogurts"],
         ["en:passion-fruit-yogurts", "en:yogurts"],
-        ["fr:yaourts-fruit-passion-alleges", "en:passion-fruit-yogurts"],
+        ["fr:yaourts-au-fruit-de-la-passion-alleges", "en:passion-fruit-yogurts"],
         ["en:fake-meat", "en:meat"],
         ["en:fake-duck-meat", "en:fake-meat"],
         ["en:fake-duck-meat", "en:fake-stuff"],
@@ -338,8 +338,8 @@ def test_with_external_taxonomies(neo4j):
         ["synonyms:1", "en:yogurts"],
         ["en:yogurts", "en:banana-yogurts"],
         ["en:banana-yogurts", "en:passion-fruit-yogurts"],
-        ["en:passion-fruit-yogurts", "fr:yaourts-fruit-passion-alleges"],
-        ["fr:yaourts-fruit-passion-alleges", "en:meat"],
+        ["en:passion-fruit-yogurts", "fr:yaourts-au-fruit-de-la-passion-alleges"],
+        ["fr:yaourts-au-fruit-de-la-passion-alleges", "en:meat"],
         ["en:meat", "en:fake-meat"],
         ["en:fake-meat", "en:fake-stuff"],
         ["en:fake-stuff", "en:fake-duck-meat"],
@@ -406,7 +406,7 @@ def test_properties_confused_lang(neo4j, tmp_path):
         pathlib.Path(__file__).parent.parent / "data" / "test_property_confused_lang.txt"
     )
     test_parser(fpath, None, "branch", "test")
-    query = "MATCH (n:p_test_branch) WHERE n.id = 'en:1-for-planet' RETURN n"
+    query = "MATCH (n:p_test_branch) WHERE n.id = 'en:1-for-the-planet' RETURN n"
     result = session.run(query)
     node = result.value()[0]
     # "web:en" was not confused with a language prefix "web:"