diff --git a/ocrolib/lstm.py b/ocrolib/lstm.py index bf10ab56..42793d49 100644 --- a/ocrolib/lstm.py +++ b/ocrolib/lstm.py @@ -852,12 +852,15 @@ def ctc_align_targets(outputs,targets,threshold=100.0,verbose=0,debug=0,lo=1e-5) def normalize_nfkc(s): return unicodedata.normalize('NFKC',s) +def normalize_nfc(s): + return unicodedata.normalize('NFC',s) + def add_training_info(network): return network class SeqRecognizer: """Perform sequence recognition using BIDILSTM and alignment.""" - def __init__(self,ninput,nstates,noutput=-1,codec=None,normalize=normalize_nfkc): + def __init__(self,ninput,nstates,noutput=-1,codec=None,normalize=normalize_nfc): self.Ni = ninput if codec: noutput = codec.size() assert noutput>0 diff --git a/ocropus-rtrain b/ocropus-rtrain index 9d365ae3..bec093b2 100755 --- a/ocropus-rtrain +++ b/ocropus-rtrain @@ -134,7 +134,7 @@ if args.codec!=[]: print(args.codec) for fname in ocrolib.glob_all(args.codec): transcript = ocrolib.read_text(fname) - l = list(lstm.normalize_nfkc(transcript)) + l = list(lstm.normalize_nfc(transcript)) charset = charset.union(l) charset = sorted(list(charset)) charset = [c for c in charset if c>" " and c!="~"] @@ -172,7 +172,7 @@ def load_lstm(fname): if args.clstm: network = lstm.SeqRecognizer(args.height,args.hiddensize, codec=codec, - normalize=lstm.normalize_nfkc) + normalize=lstm.normalize_nfc) import clstm mylstm = clstm.make_BIDILSTM() mylstm.init(network.No,args.hiddensize,network.Ni) @@ -193,7 +193,7 @@ else: last_save = None network = lstm.SeqRecognizer(args.height,args.hiddensize, codec=codec, - normalize=lstm.normalize_nfkc) + normalize=lstm.normalize_nfc) if args.clstm: import clstm mylstm = clstm.make_BIDILSTM()