Commit 5beca58

As an option, remove accents from GRC datasets; only do the augmentation if the treebank name ends with -Diacritics.
We use this to train a set of Perseus models on text both with and without diacritics, hopefully giving better performance for people who don't have diacritics in their text. #1311 (comment)
1 parent 7c34714 commit 5beca58

2 files changed (+30, -0 lines)

2 files changed

+30
-0
lines changed

stanza/utils/datasets/common.py

Lines changed: 13 additions & 0 deletions
@@ -7,6 +7,7 @@
 import re
 import subprocess
 import sys
+import unicodedata
 
 from stanza.models.common.short_name_to_treebank import canonical_treebank_name
 import stanza.utils.datasets.prepare_tokenizer_data as prepare_tokenizer_data
@@ -53,6 +54,16 @@ def convert_conllu_to_txt(tokenizer_dir, short_name, shards=("train", "dev", "test")):
     # use an external script to produce the txt files
     subprocess.check_output(f"perl {CONLLU_TO_TXT_PERL} {output_conllu} > {output_txt}", shell=True)
 
+def strip_accents(word):
+    """
+    Remove diacritics from words such as in the UD GRC datasets
+    """
+    converted = ''.join(c for c in unicodedata.normalize('NFD', word)
+                        if unicodedata.category(c) not in ('Mn',))
+    if len(converted) == 0:
+        return word
+    return converted
+
 def mwt_name(base_dir, short_name, dataset):
     return os.path.join(base_dir, f"{short_name}-ud-{dataset}-mwt.json")
 
@@ -178,6 +189,8 @@ def find_treebank_dataset_file(treebank, udbase_dir, dataset, extension, fail=False):
     """
     if treebank.startswith("UD_Korean") and treebank.endswith("_seg"):
         treebank = treebank[:-4]
+    if treebank.startswith("UD_Ancient_Greek-") and (treebank.endswith("-Diacritics") or treebank.endswith("-diacritics")):
+        treebank = treebank[:-11]
     filename = os.path.join(udbase_dir, treebank, f"*-ud-{dataset}.{extension}")
     files = glob.glob(filename)
     if len(files) == 0:
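
For illustration (not part of the commit): strip_accents relies on NFD normalization, which splits each accented letter into a base letter followed by combining marks (Unicode category Mn); filtering out the Mn codepoints leaves the bare letters. A minimal check, assuming the stanza source tree is importable:

    from stanza.utils.datasets.common import strip_accents

    # Polytonic Greek: breathings and accents decompose into Mn codepoints
    print(strip_accents("ἄνθρωπος"))  # ανθρωπος
    print(strip_accents("μῆνιν"))     # μηνιν

    # A word made entirely of combining marks would strip to the empty
    # string, so the length check falls back to the original input
    print(strip_accents("\u0301"))    # the combining acute, unchanged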

stanza/utils/datasets/prepare_tokenizer_treebank.py

Lines changed: 17 additions & 0 deletions
@@ -701,7 +701,22 @@ def augment_punct(sents):
 
     return new_sents
 
+def remove_accents_from_words(sents):
+    new_sents = []
+    for sent in sents:
+        new_sent = []
+        for line in sent:
+            if line.startswith("#"):
+                new_sent.append(line)
+            else:
+                pieces = line.split("\t")
+                pieces[1] = common.strip_accents(pieces[1])
+                new_sent.append("\t".join(pieces))
+        new_sents.append(new_sent)
+    return new_sents
 
+def augment_accents(sents):
+    return sents + remove_accents_from_words(sents)
 
 def write_augmented_dataset(input_conllu, output_conllu, augment_function):
     # set the seed for each data file so that the results are the same
@@ -1307,6 +1322,8 @@ def prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, dataset, augment=True):
         write_augmented_dataset(input_conllu, output_conllu, augment_arabic_padt)
     elif short_name.startswith("ko_") and short_name.endswith("_seg"):
         remove_spaces(input_conllu, output_conllu)
+    elif short_name.startswith("grc_") and short_name.endswith("-diacritics"):
+        write_augmented_dataset(input_conllu, output_conllu, augment_accents)
     elif dataset == 'train' and augment:
         write_augmented_dataset(input_conllu, output_conllu, augment_punct)
     else:
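
Again for illustration only: a sketch of what the augmentation produces for a made-up two-token sentence, in the list-of-raw-CoNLL-U-lines format these helpers operate on. Note that only the surface form (column index 1) is de-accented; the lemma column is left as-is.

    from stanza.utils.datasets.prepare_tokenizer_treebank import augment_accents

    sent = ["# text = μῆνιν ἄειδε",
            "1\tμῆνιν\tμῆνις\tNOUN\t_\t_\t2\tobj\t_\t_",
            "2\tἄειδε\tἀείδω\tVERB\t_\t_\t0\troot\t_\t_"]

    augmented = augment_accents([sent])
    print(len(augmented))                  # 2: the original plus a stripped copy
    print(augmented[1][1].split("\t")[1])  # μηνιν

The doubled data is what lets the -Diacritics models see both spellings of every training sentence.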
