Commit 5beca58

As an option, remove accents from GRC datasets; only do the augmentation if the treebank name ends with -Diacritics.
We use this to train a set of Perseus models on text both with and without diacritics, hopefully giving better performance for people who don't have diacritics in their text. #1311 (comment)
1 parent 7c34714 commit 5beca58

2 files changed (+30, -0 lines)

2 files changed

+30
-0
lines changed

stanza/utils/datasets/common.py

Lines changed: 13 additions & 0 deletions
@@ -7,6 +7,7 @@
 import re
 import subprocess
 import sys
+import unicodedata
 
 from stanza.models.common.short_name_to_treebank import canonical_treebank_name
 import stanza.utils.datasets.prepare_tokenizer_data as prepare_tokenizer_data
@@ -53,6 +54,16 @@ def convert_conllu_to_txt(tokenizer_dir, short_name, shards=("train", "dev", "test")):
     # use an external script to produce the txt files
     subprocess.check_output(f"perl {CONLLU_TO_TXT_PERL} {output_conllu} > {output_txt}", shell=True)
 
+def strip_accents(word):
+    """
+    Remove diacritics from words such as in the UD GRC datasets
+    """
+    converted = ''.join(c for c in unicodedata.normalize('NFD', word)
+                        if unicodedata.category(c) not in ('Mn',))
+    if len(converted) == 0:
+        return word
+    return converted
+
 def mwt_name(base_dir, short_name, dataset):
     return os.path.join(base_dir, f"{short_name}-ud-{dataset}-mwt.json")
 
@@ -178,6 +189,8 @@ def find_treebank_dataset_file(treebank, udbase_dir, dataset, extension, fail=False):
     """
     if treebank.startswith("UD_Korean") and treebank.endswith("_seg"):
         treebank = treebank[:-4]
+    if treebank.startswith("UD_Ancient_Greek-") and (treebank.endswith("-Diacritics") or treebank.endswith("-diacritics")):
+        treebank = treebank[:-11]
     filename = os.path.join(udbase_dir, treebank, f"*-ud-{dataset}.{extension}")
     files = glob.glob(filename)
     if len(files) == 0:
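
For illustration (not part of the commit): strip_accents relies on NFD normalization, which splits each accented letter into a base letter followed by combining marks (Unicode category Mn); filtering out the Mn codepoints leaves the bare letters. A minimal check, assuming the stanza source tree is importable:

    from stanza.utils.datasets.common import strip_accents

    # Polytonic Greek: breathings and accents decompose into Mn codepoints
    print(strip_accents("ἄνθρωπος"))  # ανθρωπος
    print(strip_accents("μῆνιν"))     # μηνιν

    # A word made entirely of combining marks would strip to the empty
    # string, so the length check falls back to the original input
    print(strip_accents("\u0301"))    # the combining acute, unchanged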

stanza/utils/datasets/prepare_tokenizer_treebank.py

Lines changed: 17 additions & 0 deletions
@@ -701,7 +701,22 @@ def augment_punct(sents):
 
     return new_sents
 
+def remove_accents_from_words(sents):
+    new_sents = []
+    for sent in sents:
+        new_sent = []
+        for line in sent:
+            if line.startswith("#"):
+                new_sent.append(line)
+            else:
+                pieces = line.split("\t")
+                pieces[1] = common.strip_accents(pieces[1])
+                new_sent.append("\t".join(pieces))
+        new_sents.append(new_sent)
+    return new_sents
 
+def augment_accents(sents):
+    return sents + remove_accents_from_words(sents)
 
 def write_augmented_dataset(input_conllu, output_conllu, augment_function):
     # set the seed for each data file so that the results are the same
@@ -1307,6 +1322,8 @@ def prepare_ud_dataset(treebank, udbase_dir, tokenizer_dir, short_name, short_language, dataset, augment=True):
         write_augmented_dataset(input_conllu, output_conllu, augment_arabic_padt)
     elif short_name.startswith("ko_") and short_name.endswith("_seg"):
         remove_spaces(input_conllu, output_conllu)
+    elif short_name.startswith("grc_") and short_name.endswith("-diacritics"):
+        write_augmented_dataset(input_conllu, output_conllu, augment_accents)
     elif dataset == 'train' and augment:
         write_augmented_dataset(input_conllu, output_conllu, augment_punct)
     else:
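
Again for illustration only: a sketch of what the augmentation produces for a made-up two-token sentence, in the list-of-raw-CoNLL-U-lines format these helpers operate on. Note that only the surface form (column index 1) is de-accented; the lemma column is left as-is.

    from stanza.utils.datasets.prepare_tokenizer_treebank import augment_accents

    sent = ["# text = μῆνιν ἄειδε",
            "1\tμῆνιν\tμῆνις\tNOUN\t_\t_\t2\tobj\t_\t_",
            "2\tἄειδε\tἀείδω\tVERB\t_\t_\t0\troot\t_\t_"]

    augmented = augment_accents([sent])
    print(len(augmented))                  # 2: the original plus a stripped copy
    print(augmented[1][1].split("\t")[1])  # μηνιν

The doubled data is what lets the -Diacritics models see both spellings of every training sentence.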
