Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion ocrolib/lstm.py
Original file line number Diff line number Diff line change
Expand Up @@ -852,12 +852,15 @@ def ctc_align_targets(outputs,targets,threshold=100.0,verbose=0,debug=0,lo=1e-5)
def normalize_nfkc(s):
    """Return `s` converted to Unicode normalization form NFKC.

    NFKC applies compatibility decomposition followed by canonical
    composition, so compatibility characters such as ligatures are
    replaced by their plain equivalents (e.g. U+FB01 becomes "fi").

    :param s: text to normalize.
    :returns: the NFKC-normalized text.
    """
    return unicodedata.normalize('NFKC', s)

def normalize_nfc(s):
    """Return `s` converted to Unicode normalization form NFC.

    NFC applies canonical decomposition followed by canonical
    composition. Unlike NFKC it leaves compatibility characters
    (ligatures, superscripts, ...) untouched, so they survive as
    distinct characters.

    :param s: text to normalize.
    :returns: the NFC-normalized text.
    """
    return unicodedata.normalize('NFC', s)

def add_training_info(network):
    """Return `network` unchanged.

    This is a pass-through: the argument is handed back as-is, with no
    attributes added or modified.

    :param network: the object to return.
    :returns: the very same `network` object.
    """
    return network

class SeqRecognizer:
"""Perform sequence recognition using BIDILSTM and alignment."""
def __init__(self,ninput,nstates,noutput=-1,codec=None,normalize=normalize_nfkc):
def __init__(self,ninput,nstates,noutput=-1,codec=None,normalize=normalize_nfc):
self.Ni = ninput
if codec: noutput = codec.size()
assert noutput>0
Expand Down
6 changes: 3 additions & 3 deletions ocropus-rtrain
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ if args.codec!=[]:
print(args.codec)
for fname in ocrolib.glob_all(args.codec):
transcript = ocrolib.read_text(fname)
l = list(lstm.normalize_nfkc(transcript))
l = list(lstm.normalize_nfc(transcript))
charset = charset.union(l)
charset = sorted(list(charset))
charset = [c for c in charset if c>" " and c!="~"]
Expand Down Expand Up @@ -172,7 +172,7 @@ def load_lstm(fname):
if args.clstm:
network = lstm.SeqRecognizer(args.height,args.hiddensize,
codec=codec,
normalize=lstm.normalize_nfkc)
normalize=lstm.normalize_nfc)
import clstm
mylstm = clstm.make_BIDILSTM()
mylstm.init(network.No,args.hiddensize,network.Ni)
Expand All @@ -193,7 +193,7 @@ else:
last_save = None
network = lstm.SeqRecognizer(args.height,args.hiddensize,
codec=codec,
normalize=lstm.normalize_nfkc)
normalize=lstm.normalize_nfc)
if args.clstm:
import clstm
mylstm = clstm.make_BIDILSTM()
Expand Down