diff --git a/egs/mini_librispeech/s5/cmd.sh b/egs/mini_librispeech/s5/cmd.sh index 71dd849a93b..3189d83975a 100644 --- a/egs/mini_librispeech/s5/cmd.sh +++ b/egs/mini_librispeech/s5/cmd.sh @@ -10,6 +10,8 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +# in future I'd like to start using just one $cmd variable. +export cmd="queue.pl --mem 2G" export train_cmd="queue.pl --mem 2G" export decode_cmd="queue.pl --mem 4G" export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/mini_librispeech/s5/conf/mfcc_hires2.conf b/egs/mini_librispeech/s5/conf/mfcc_hires2.conf new file mode 100644 index 00000000000..2e8dc221d40 --- /dev/null +++ b/egs/mini_librispeech/s5/conf/mfcc_hires2.conf @@ -0,0 +1,14 @@ +# config for high-resolution MFCC features, intended for 'chaina' neural network +# training. These '..2.conf' setups are intended to have the --modified=true +# configuration value. + +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +# Will soon add: --modified=true +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/mini_librispeech/s5/local/chaina/data_prep_common.sh b/egs/mini_librispeech/s5/local/chaina/data_prep_common.sh new file mode 100755 index 00000000000..a736fc8c008 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chaina/data_prep_common.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +set -euo pipefail + +# This script is called from local/chaina/run_tdnn.sh and +# similar scripts. It contains the common feature preparation and +# lattice-alignment preparation parts of the chaina training. +# See those scripts for examples of usage. + +stage=0 +train_set=train_clean_5 +test_sets="dev_clean_2" +gmm=tri3b + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +# Our default data augmentation method is 3-way speed augmentation followed by +# volume perturbation. We are looking into better ways of doing this, +# e.g. involving noise and reverberation. + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment. 
_sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/fs0{1,2}/$USER/kaldi-data/mfcc/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done +fi + + +exit 0 diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..8aa00c0d975 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh @@ -0,0 +1,499 @@ +#!/bin/bash + + +# grep WER exp/chaina/tdnn1a_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 21.44 [ 4317 / 20138, 341 ins, 947 del, 3029 sub ] exp/chaina/tdnn1a_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# a09:s5: grep WER exp/chaina/tdnn1a_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 19.72 [ 3971 / 20138, 317 ins, 771 del, 2883 sub ] exp/chaina/tdnn1a_sp/decode_dev_clean_2_tgsmall/wer_17_0.0 + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +srand=0 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 + + +# training chunk-options +chunk_width=140 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.1 +bottom_subsampling_factor=1 # I'll set this to 3 later, 1 is for compatibility with a broken ru. +frame_subsampling_factor=3 +langs="default" # list of language names + +# The amount of extra left/right context we put in the egs. 
Note: this could +# easily be zero, since we're not using a recurrent topology, but we put in a +# little extra context so that we have more room to play with the configuration +# without re-dumping egs. +egs_extra_left_context=5 +egs_extra_right_context=5 + +# The number of chunks (of length: see $chunk_width above) that we group +# together for each "speaker" (actually: pseudo-speaker, since we may have +# to group multiple speaker together in some cases). +chunks_per_group=4 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
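# Aside -- a rough, untested sketch (it assumes $tree_dir as set earlier in this
# script): after stage 12 the two-level tree can be sanity-checked before going
# further, e.g.
#   tree-info $tree_dir/tree | grep num-pdfs        # leaf count used below
#   [ -f $tree_dir/tree.map ] && echo "tree.map present (two-level tree)"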
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+ # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). 
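# Aside -- a rough sketch (assumes $dir as set earlier in this script): one way
# to confirm that the 256-dim output of the bottom nnet really matches the
# input dim declared just below is to inspect the raw nnet, e.g.
#   nnet3-info $dir/init/bottom.raw | grep -E 'output-node|linear_bottleneck'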
+ cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
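# Aside -- hedged note on the learning_rate_factor line a few lines below: it
# pipes a python-2 style "print" statement into python, which fails with a
# SyntaxError if the default python is python 3.  An untested sketch of a form
# that should work under either version:
#   learning_rate_factor=$(python -c "print(0.5/$xent_regularize)")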
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+ # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). 
+ cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
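# Aside -- rough sketch (assumes $tree_dir and $dir as set earlier in this
# script): after the adaptation model is initialized below, the leaf counts of
# the tree and of the model should agree; a quick check is something like
#   tree-info $tree_dir/tree | grep num-pdfs
#   nnet3-am-info $dir/init/default.mdl | grep num-pdfs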
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+ # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). 
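# Aside -- hedged note on the config below: unlike the time-stride=1 layers in
# the earlier setups, these tdnnf layers use time-stride=3.  If I'm reading the
# tdnnf-layer splicing right, each layer then adds about 3 frames of context on
# each side rather than 1, i.e. roughly
#   8 layers * 3 frames/side = +-24 frames for the top model (vs. +-8 before)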
+ cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
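# Aside -- rough sketch: once a few of these tuning variants have been decoded,
# their best WERs can be compared in one go with utils/best_wer.sh, e.g.
#   for d in exp/chaina/tdnn1?_sp; do
#     grep WER $d/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh
#   done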
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+ # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 2900 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). 
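# If those two dimensions ever get out of sync, nnet3-init will not catch it here,
# since the bottom and top nnets are compiled separately; an optional quick check,
# assuming nnet3-info's usual listing of components with their dimensions, is
# something like:
#   nnet3-info $dir/init/bottom.raw | grep linear_bottleneck
# and confirming the output dim there matches the input dim=256 of the 'default'
# config below.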
+ cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
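# One portability note on the learning_rate_factor line below: the
# 'echo "print 0.5/$xent_regularize" | python' idiom relies on python2-style
# print (and on 'python' being python2).  A python3-safe alternative, not what
# the script as written uses, would be:
#   learning_rate_factor=$(python3 -c "print(0.5 / $xent_regularize)")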
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1f). About 0.5% better. +# 1g is as 1c2 but using MeanOnlyTransform. Better!! 
+ + +# grep WER exp/chaina/tdnn1h_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 16.22 [ 3266 / 20138, 297 ins, 463 del, 2506 sub ] exp/chaina/tdnn1h_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# a09:s5: grep WER exp/chaina/tdnn1h_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 14.29 [ 2877 / 20138, 275 ins, 398 del, 2204 sub ] exp/chaina/tdnn1h_sp/decode_dev_clean_2_tgsmall/wer_12_0.0 + +# a09:s5: grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 17.08 [ 3439 / 20138, 361 ins, 467 del, 2611 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 14.68 [ 2956 / 20138, 243 ins, 519 del, 2194 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_12_0.5 +# +# vs. the baseline: +# grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 16.30 [ 3282 / 20138, 323 ins, 458 del, 2501 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# a09:s5: grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 15.88 [ 3197 / 20138, 296 ins, 462 del, 2439 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_12_0.0 + +# 1c2 is as 1c but changing num-epochs from + +# 1c is a sanity check that the baseline setup is working well; +# we're simply making the transform a NoOpTransform, so the two decoding +# passes should give almost the same results. + + +# grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 18.27 [ 3679 / 20138, 334 ins, 565 del, 2780 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_13_0.0 +#a09:s5: grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 18.09 [ 3643 / 20138, 324 ins, 552 del, 2767 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_15_0.0 + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +srand=0 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1h # affix for the TDNN directory name +tree_affix=b +train_stage=-10 +get_egs_stage=-10 +common_egs_dir=exp/chaina/tdnn1f_sp/egs + +# training chunk-options +chunk_width=140 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.1 +bottom_subsampling_factor=3 +frame_subsampling_factor=3 +langs="default" # list of language names + +# The amount of extra left/right context we put in the egs. Note: this could +# easily be zero, since we're not using a recurrent topology, but we put in a +# little extra context so that we have more room to play with the configuration +# without re-dumping egs. +egs_extra_left_context=5 +egs_extra_right_context=5 + +# The number of chunks (of length: see $chunk_width above) that we group +# together for each "speaker" (actually: pseudo-speaker, since we may have +# to group multiple speaker together in some cases). +chunks_per_group=4 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 2900 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). 
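# The config below defines paired output heads: 'output' / 'output-xent'
# (speaker adapted) and 'output-si' / 'output-si-xent' (speaker independent).
# The SI heads are what the first, speaker-independent decoding pass uses;
# steps/chaina/decode_si.sh (added later in this patch) selects them like this:
#   nnet3-am-copy --edits='remove-output-nodes name=output; rename-node old-name=output-si new-name=output' \
#     $model_dir/${lang}.mdl - | ...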
+ cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1f) +# 1g is as 1c2 but using MeanOnlyTransform. Better!! + +# a09:s5: grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 17.08 [ 3439 / 20138, 361 ins, 467 del, 2611 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 14.68 [ 2956 / 20138, 243 ins, 519 del, 2194 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_12_0.5 +# +# vs. the baseline: +# grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 16.30 [ 3282 / 20138, 323 ins, 458 del, 2501 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# a09:s5: grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 15.88 [ 3197 / 20138, 296 ins, 462 del, 2439 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_12_0.0 + +# 1c2 is as 1c but changing num-epochs from + +# 1c is a sanity check that the baseline setup is working well; +# we're simply making the transform a NoOpTransform, so the two decoding +# passes should give almost the same results. 
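# (The 1c results just below bear this out: 18.27% WER for the .si pass
# vs. 18.09% for the adapted pass, i.e. only a marginal difference.)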
+ + +# grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 18.27 [ 3679 / 20138, 334 ins, 565 del, 2780 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_13_0.0 +#a09:s5: grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 18.09 [ 3643 / 20138, 324 ins, 552 del, 2767 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_15_0.0 + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +srand=0 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1i # affix for the TDNN directory name +tree_affix=b +train_stage=-10 +get_egs_stage=-10 +common_egs_dir=exp/chaina/tdnn1f_sp/egs + +# training chunk-options +chunk_width=140 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.1 +bottom_subsampling_factor=3 +frame_subsampling_factor=3 +langs="default" # list of language names + +# The amount of extra left/right context we put in the egs. Note: this could +# easily be zero, since we're not using a recurrent topology, but we put in a +# little extra context so that we have more room to play with the configuration +# without re-dumping egs. +egs_extra_left_context=5 +egs_extra_right_context=5 + +# The number of chunks (of length: see $chunk_width above) that we group +# together for each "speaker" (actually: pseudo-speaker, since we may have +# to group multiple speaker together in some cases). +chunks_per_group=4 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 2900 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
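# A note on the common_egs_dir=exp/chaina/tdnn1f_sp/egs setting near the top of
# this script: it points --egs.dir at the egs already dumped by the earlier 1f
# run, so the relatively expensive egs-dumping step is shared across these
# tuning variants.  That only makes sense while the chunk width, extra context
# and frame-subsampling settings stay compatible; if you have not run the
# earlier experiment, leaving common_egs_dir empty should make the training
# stage dump its own egs.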
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1f) +# 1g is as 1c2 but using MeanOnlyTransform. Better!! + +# a09:s5: grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 17.08 [ 3439 / 20138, 361 ins, 467 del, 2611 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 14.68 [ 2956 / 20138, 243 ins, 519 del, 2194 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_12_0.5 +# +# vs. 
the baseline: +# grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 16.30 [ 3282 / 20138, 323 ins, 458 del, 2501 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# a09:s5: grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 15.88 [ 3197 / 20138, 296 ins, 462 del, 2439 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_12_0.0 + +# 1c2 is as 1c but changing num-epochs from + +# 1c is a sanity check that the baseline setup is working well; +# we're simply making the transform a NoOpTransform, so the two decoding +# passes should give almost the same results. + + +# grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 18.27 [ 3679 / 20138, 334 ins, 565 del, 2780 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_13_0.0 +#a09:s5: grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 18.09 [ 3643 / 20138, 324 ins, 552 del, 2767 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_15_0.0 + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +srand=0 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1j # affix for the TDNN directory name +tree_affix=b +train_stage=-10 +get_egs_stage=-10 +common_egs_dir=exp/chaina/tdnn1f_sp/egs + +# training chunk-options +chunk_width=140 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.1 +bottom_subsampling_factor=3 +frame_subsampling_factor=3 +langs="default" # list of language names + +# The amount of extra left/right context we put in the egs. Note: this could +# easily be zero, since we're not using a recurrent topology, but we put in a +# little extra context so that we have more room to play with the configuration +# without re-dumping egs. +egs_extra_left_context=5 +egs_extra_right_context=5 + +# The number of chunks (of length: see $chunk_width above) that we group +# together for each "speaker" (actually: pseudo-speaker, since we may have +# to group multiple speaker together in some cases). +chunks_per_group=4 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 2900 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < " + echo "e.g.: steps/chaina/compute_embeddings.sh --nj 8 \\" + echo " data/test_eval92_hires exp/chaina/tdnn1_sp/final exp/nnet3/tdnn1_sp/data/final/test_eval92_hires" + echo "Output will be in /output.scp" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --iter # Iteration of model to decode; default is final." + exit 1; +fi + +data=$1 +model_dir=$2 +dir=$3 + +mkdir -p $dir/log + +# convert $dir to absolute pathname +fdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}` + +for f in $model_dir/bottom.raw $model_dir/info.txt $data/feats.scp; do + if [ ! -f $f ]; then + echo "$0: no such file $f" + exit 1 + fi +done + + +sdata=$data/split$nj; +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + + +bottom_subsampling_factor=$(awk '/^bottom_subsampling_factor/ {print $2}' <$model_dir/info.txt) +if ! [ $bottom_subsampling_factor -gt 0 ]; then + echo "$0: error getting bottom_subsampling_factor from $model_dir/info.txt" + exit 1 +fi + + + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $dir/log/compute.JOB.log \ + nnet3-compute --use-gpu=no \ + --frame-subsampling-factor=$bottom_subsampling_factor \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + $model_dir/bottom.raw scp:$sdata/JOB/feats.scp \ + "ark:|copy-feats --compress=$compress ark:- ark,scp:$dir/output.JOB.ark,$dir/output.JOB.scp" +fi + +for n in $(seq $nj); do + cat $dir/output.$n.scp +done > $dir/output.scp + +exit 0; diff --git a/egs/wsj/s5/steps/chaina/decode.sh b/egs/wsj/s5/steps/chaina/decode.sh new file mode 100755 index 00000000000..df7b627f8c8 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/decode.sh @@ -0,0 +1,155 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does the speaker-dependent pass of decoding with a 'chaina' model, +# including getting the speaker-dependent transforms and dumping lattices. + + +# Begin configuration section. +stage=1 + +acwt=1.0 # Just a default value, used for adaptation and beam-pruning.. 
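# ('chain'-style models are trained with an acoustic scale of 1.0, so 1.0,
# rather than the ~0.1 typical of conventional systems, is the natural default
# here; the 10x scaling needed for scoring is applied afterwards via
# post_decode_acwt below.)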
+post_decode_acwt=10.0 # This is typically used in 'chain' systems to scale + # acoustics by 10 so the regular scoring script works OK + # (since it evaluates the LM scale at integer values, + # typically close to 10). We make this the default in + # order to make scoring easier, but you should remember + # when using the lattices, that this has been done. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +silence_weight=0.01 # We weight down the posteriors of silence (needs to be tuned). +lattice_beam=6.0 # Beam we use in lattice generation. +iter=final +num_threads=1 # if >1, will use nnet3-latgen-faster-parallel + +scoring_opts= +skip_diagnostics=false +skip_scoring=false +# we may later add extra-{left,right}-context options, but these might be +# problematic. +extra_left_context=0 +extra_right_context=0 +minimize=false +lang=default +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; +set -e -u + +if [ $# -ne 6 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/chaina/decode.sh --nj 8 \\" + echo " data/test exp/chaina/tdnn1a_sp/graph_bg exp/chaina/tdnn1a_sp/final" + echo " exp/chaina/tdnn1a_sp/data/test exp/chaina/tdnn1a_sp/decode_test_bg.si exp/chaina/tdnn1a_sp/decode_test_bg" + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --lattice-beam # Lattice pruning beam; default 6.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --use-gpu # default: false. If true, we recommend" + echo " # to use large --num-threads as the graph" + echo " # search becomes the limiting factor." + exit 1; +fi + + +data=$1 +graphdir=$2 +model_dir=$3 +embedding_dir=$4 +si_dir=$5 +dir=$6 + + +mkdir -p $dir/log + +for f in $graphdir/HCLG.fst $data/utt2spk $model_dir/$lang.mdl $model_dir/$lang.ada \ + $model_dir/info.txt $embedding_dir/output.scp $si_dir/lat.1.gz $si_dir/num_jobs; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +nj=$(cat $si_dir/num_jobs) +echo $nj > $dir/num_jobs +sdata=$data/split$nj; +silphonelist=$(cat $graphdir/phones/silence.csl) +frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$model_dir/info.txt) +bottom_subsampling_factor=$(awk '/^bottom_subsampling_factor/ {print $2}' <$model_dir/info.txt) +top_subsampling_factor=$[frame_subsampling_factor/bottom_subsampling_factor] + + +## Now get the first-pass fMLLR transforms. +if [ $stage -le 1 ]; then + echo "$0: getting speaker-dependent transforms" + # The --acoustic-scale=0.1 is to reverse the --post-decode-acwt (default: 10) + # that we used when dumping the SI lattices (this was for scoring + # convenience). 
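# In outline, the pipeline below goes: SI lattices -> per-frame posteriors
# (lattice-to-post, with --acoustic-scale=0.1 undoing the 10x post-decode
# scaling) -> silence frames down-weighted (weight-silence-post) ->
# transition-level posteriors mapped to pdf level (post-to-pdf-post) ->
# nnet3-adapt get-transforms, which estimates one transform per speaker (via
# spk2utt) over the dumped bottom-nnet embeddings in $embedding_dir/output.scp
# and writes it to trans.JOB.ark for use in the adapted decoding pass below.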
+ $cmd JOB=1:$nj $dir/log/get_transform.JOB.log \ + gunzip -c $si_dir/lat.JOB.gz \| \ + lattice-to-post --acoustic-scale=0.1 ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $model_dir/${lang}.mdl ark:- ark:- \| \ + post-to-pdf-post $model_dir/${lang}.mdl ark:- ark:- \| \ + nnet3-adapt --verbose=2 --frame-subsampling-factor=$top_subsampling_factor \ + get-transforms $model_dir/${lang}.ada ark:$sdata/JOB/spk2utt \ + "scp:filter_scp.pl $sdata/JOB/utt2spk $embedding_dir/output.scp|" \ + ark,s,cs:- ark:$dir/trans.JOB.ark +fi + +if [ $num_threads -gt 1 ]; then + thread_string="-parallel --num-threads=$num_threads" + queue_opt="--num-threads $num_threads" +else + thread_string= + queue_opt= +fi + +if [ $stage -le 2 ]; then + $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster$thread_string \ + --frame-subsampling-factor=$top_subsampling_factor \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt \ + $model_dir/${lang}.mdl \ + $graphdir/HCLG.fst \ + "ark:filter_scp.pl $sdata/JOB/utt2spk $embedding_dir/output.scp | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB.ark scp:- ark:-|" \ + "ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + + +if [ $stage -le 3 ]; then + if ! $skip_diagnostics ; then + [ ! -z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" --model $model_dir/${lang}.mdl $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 4 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/egs/wsj/s5/steps/chaina/decode_si.sh b/egs/wsj/s5/steps/chaina/decode_si.sh new file mode 100755 index 00000000000..f21d82f6278 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/decode_si.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does the speaker-independent pass of decoding with a 'chaina' model, + + +# Begin configuration section. +stage=1 +nj=4 # number of decoding jobs. +acwt=1.0 # Just a default value, used for adaptation and beam-pruning.. +post_decode_acwt=10.0 # This is typically used in 'chain' systems to scale + # acoustics by 10 so the regular scoring script works OK + # (since it evaluates the LM scale at integer values, + # typically close to 10). We make this the default in + # order to make scoring easier, but you should remember + # when using the lattices, that this has been done. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +lattice_beam=6.0 # Beam we use in lattice generation. +iter=final +num_threads=1 # if >1, will use nnet3-latgen-faster-parallel + +scoring_opts= +skip_diagnostics=false +skip_scoring=false +# we may later add extra-{left,right}-context options, but these might be +# problematic. 
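# Note on how this script relates to steps/chaina/decode.sh (added above in
# this patch): the lat.*.gz produced here are what decode.sh reads via its
# $si_dir (5th) argument in order to estimate the speaker transforms.  A
# typical sequence, with hypothetical directory names modelled on the usage
# examples above, would be roughly:
#   steps/chaina/compute_embeddings.sh data/test exp/chaina/tdnn1a_sp/final exp/chaina/tdnn1a_sp/data/test
#   steps/chaina/decode_si.sh data/test exp/chaina/tdnn1a_sp/graph_bg exp/chaina/tdnn1a_sp/final \
#     exp/chaina/tdnn1a_sp/data/test exp/chaina/tdnn1a_sp/decode_test_bg.si
#   steps/chaina/decode.sh data/test exp/chaina/tdnn1a_sp/graph_bg exp/chaina/tdnn1a_sp/final \
#     exp/chaina/tdnn1a_sp/data/test exp/chaina/tdnn1a_sp/decode_test_bg.si exp/chaina/tdnn1a_sp/decode_test_bg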
+extra_left_context=0 +extra_right_context=0 +minimize=false +lang=default +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; +set -e -u + +if [ $# -ne 5 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/chaina/decode.sh --nj 8 \\" + echo " data/test exp/chaina/tdnn1a_sp/graph_bg exp/chaina/tdnn1a_sp/final" + echo " exp/chaina/tdnn1a_sp/data/test exp/chaina/tdnn1a_sp/decode_test_bg" + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --lattice-beam # Lattice pruning beam; default 6.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --use-gpu # default: false. If true, we recommend" + echo " # to use large --num-threads as the graph" + echo " # search becomes the limiting factor." + exit 1; +fi + + +data=$1 +graphdir=$2 +model_dir=$3 +embedding_dir=$4 +dir=$5 + + +mkdir -p $dir/log + +for f in $graphdir/HCLG.fst $data/utt2spk $model_dir/$lang.mdl $model_dir/$lang.ada \ + $model_dir/info.txt $embedding_dir/output.scp; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +if [ $num_threads -gt 1 ]; then + thread_string="-parallel --num-threads=$num_threads" + queue_opt="--num-threads $num_threads" +else + thread_string= + queue_opt= +fi + +sdata=$data/split$nj; +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + +frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$model_dir/info.txt) +bottom_subsampling_factor=$(awk '/^bottom_subsampling_factor/ {print $2}' <$model_dir/info.txt) +top_subsampling_factor=$[frame_subsampling_factor/bottom_subsampling_factor] + + +# We need to use the output named 'output-si' from the model, since this the speaker independent +# decoding pass. +model="nnet3-am-copy --edits='remove-output-nodes name=output; rename-node old-name=output-si new-name=output' $model_dir/${lang}.mdl -|" + +if [ $stage -le 1 ]; then + $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster$thread_string \ + --frame-subsampling-factor=$top_subsampling_factor \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt \ + "$model" \ + $graphdir/HCLG.fst \ + "scp:filter_scp.pl $sdata/JOB/utt2spk $embedding_dir/output.scp|" \ + "ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + + +if [ $stage -le 2 ]; then + if ! $skip_diagnostics ; then + [ ! -z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" --model $model_dir/${lang}.mdl $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 3 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." 
&& exit 1; + echo "score best paths" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/egs/wsj/s5/steps/chaina/get_model_context.sh b/egs/wsj/s5/steps/chaina/get_model_context.sh new file mode 100755 index 00000000000..7abf1f6e3b5 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/get_model_context.sh @@ -0,0 +1,134 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script computes the total left and right context needed for example (eg) +# creation from a set of 'chaina' models. +# See the usage message for more information about input and output formats. + +# Begin configuration section. +frame_subsampling_factor=1 # The total frame subsampling factor of the bottom + # + top model, i.e. the relative difference in + # frame rate between the input of the bottom model + # and the output of the top model. Would normally + # be 3. +bottom_subsampling_factor=1 # The frame subsampling factor of the bottom + # (feature-extracting) model only. Must be a + # divisor of frame_subsampling_factor. Would + # normally be 1 or 3. + +langs=default # the list of languages. This script checks that + # in the dir (first arg to the script), each + # language exists as $lang.mdl, and it warns if + # any model files appear (which might indicate a + # script bug). +# End configuration section + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + cat 1>&2 < +This script works out some acoustic-context-related information, +and writes it, long with the options provided to the script, +to the provided. An example of what +output-info-file> might contain after this script is called, is: +langs default +frame_subsampling_factor 3 +bottom_subsampling_factor 3 +model_left_context 22 +model_right_context 22 + + e.g.: $0 --frame-subsampling-factor 3 --bottom-subsampling-factor 3 + --langs 'default' exp/chaina/tdnn1a_sp/0 exp/chaina/tdnn1a_sp/0/info.txt + + Options: + --frame-subsampling-factor # (default: 1) Total frame subsampling factor of + # both models combined, i.e. ratio of + # frame rate of input features vs. + # alignments and decoding (e.g. 3). + --bottom-subsampling-factor # (default: 1) Controls the frequency at which + # the output of the bottom model is + # evaluated, and the interpretation of frame + # offsets in the top config file. Must be a + # divisor of --frame-subsampling-factor + --langs # The list of languages (must be in quotes, + # to be parsed as a single arg). May be + # 'default' or e.g. 'english french' +EOF + exit 1; +fi + + +dir=$1 +info_file=$2 + +# die on error or undefined variable. +set -e -u + +if [ ! -d $dir ]; then + echo 1>&2 "$0: expected directory $dir to exist" + exit 1 +fi + +if [ -z $langs ]; then + echo 1>&2 "$0: list of languages (--langs option) is empty" + exit 1 +fi + +if ! [ $frame_subsampling_factor -ge 1 ] || \ + ! [ $bottom_subsampling_factor -ge 1 ] || \ + ! [ $[frame_subsampling_factor%bottom_subsampling_factor] -eq 0 ]; then + echo 1>&2 "$0: there was a problem with the options --frame-subsampling-factor=$frame_subsampling_factor --bottom-subsampling-factor=$bottom_subsampling_factor" + exit 1 +fi + +mkdir -p $dir/temp + +if [ ! 
-s $dir/bottom.raw ]; then + echo 1>&2 "$0: expected file $dir/bottom.raw to exist and be nonempty" + exit 1 +fi + +nnet3-info $dir/bottom.raw > $dir/temp/bottom.info +bottom_left_context=$(grep '^left-context:' $dir/temp/bottom.info | awk '{print $2}') +bottom_right_context=$(grep '^right-context:' $dir/temp/bottom.info | awk '{print $2}') + +max_top_left_context=0 +max_top_right_context=0 + + +for lang in $langs; do + if [ ! -s $dir/$lang.mdl ]; then + echo 1>&2 "$0: expected file $dir/$lang.mdl to exist and be nonempty (check --langs option)" + exit 1 + fi + nnet3-am-info $dir/$lang.mdl > $dir/temp/$lang.info + this_left_context=$(grep '^left-context:' $dir/temp/$lang.info | awk '{print $2}') + this_right_context=$(grep '^right-context:' $dir/temp/$lang.info | awk '{print $2}') + if [ $this_left_context -gt $max_top_left_context ]; then + max_top_left_context=$this_left_context + fi + if [ $this_right_context -gt $max_top_right_context ]; then + max_top_right_context=$this_right_context + fi +done + +left_context=$[bottom_left_context+(max_top_left_context*bottom_subsampling_factor)] +right_context=$[bottom_right_context+(max_top_right_context*bottom_subsampling_factor)] + + +cat >$info_file <=0, right-context for last chunk of an utterance. +right_context_final=-1 # if >=0, right-context for last chunk of an utterance. + +compress=true # set this to false to disable compression (e.g. if you want to + # see whether results are affected). Note: if the features on + # disk were originally compressed, nnet3-chain-get-egs will dump + # compressed features regardless (since there is no further loss + # in that case). + +lang=default # the language name. will usually be 'default' in single-language + # setups. Requires because it's part of the name of some of + # the input files. + +right_tolerance= # chain right tolerance == max label delay. Only relevant if + # constrained=true. At frame rate of alignments. Code + # default is 5. +left_tolerance= # chain left tolerance (versus alignments from lattices). + # Only relevant if constrained=true. At frame rate of + # alignments. Code default is 5. + +stage=0 +max_jobs_run=40 # This should be set to the maximum number of + # nnet3-chain-get-egs jobs you are comfortable to run in + # parallel; you can increase it if your disk speed is + # greater and you have more machines. + + +srand=0 # rand seed for nnet3-chain-get-egs, nnet3-chain-copy-egs and nnet3-chain-shuffle-egs + +lattice_lm_scale= # If supplied, the graph/lm weight of the lattices will be + # used (with this scale) in generating supervisions + # This is 0 by default for conventional supervised training, + # but may be close to 1 for the unsupervised part of the data + # in semi-supervised training. The optimum is usually + # 0.5 for unsupervised data. +lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, + # before being used to get supervisions. + +acwt=0.1 # For pruning. Should be, for instance, 1.0 for chain lattices. +deriv_weights_scp= + +# end configuration section + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; + + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train exp/chaina/tdnn1a_sp exp/tri3_lats exp/chaina/tdnn1a_sp/raw_egs" + echo "" + echo "From , 0/.mdl (for the transition-model), .tree (the tree), " + echo " den_fsts/.den.fst, and den_fsts/.normalization.fst (the normalization " + echo " FST, derived from the denominator FST echo are read (where is specified" + echo " by the --lang option (its default values is 'default')" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options (alternative to this" + echo " # command line)" + echo " --max-jobs-run # The maximum number of jobs you want to run in" + echo " # parallel (increase this only if you have good disk and" + echo " # network speed). default=6" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --frame-subsampling-factor # factor by which num-frames at nnet output is reduced " + echo " --lang # Name of the language, determines names of some inputs." + echo " --frames-per-chunk # number of supervised frames per chunk on disk" + echo " # ... may be a comma separated list, but we advise a single" + echo " # number in most cases, due to interaction with the need " + echo " # to group egs from the same speaker into groups." + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --left-context-initial # Left-context for first chunk of an utterance" + echo " --right-context-final # Right-context for last chunk of an utterance" + echo " --lattice-lm-scale # If supplied, the graph/lm weight of the lattices will be " + echo " # used (with this scale) in generating supervisions" + echo " --lattice-prune-beam # If supplied, the lattices will be pruned to this beam, " + echo " # before being used to get supervisions." + echo " --acwt # Acoustic scale -- should be acoustic scale at which the " + echo " # supervision lattices are to be interpreted. Affects pruning" + echo " --deriv-weights-scp # If supplied, adds per-frame weights to the supervision." + echo " # (e.g., might be relevant for unsupervised training)." + echo " --stage # Used to run this script from somewhere in" + echo " # the middle." + exit 1; +fi + +data=$1 +chaindir=$2 +latdir=$3 +dir=$4 + +tree=$chaindir/${lang}.tree +trans_mdl=$chaindir/init/${lang}.mdl # contains the transition model and a nnet, but + # we won't be making use of the nnet part. +normalization_fst=$chaindir/den_fsts/${lang}.normalization.fst +den_fst=$chaindir/den_fsts/${lang}.den.fst + +for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ + $tree $trans_mdl $normalization_fst $den_fst; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +nj=$(cat $latdir/num_jobs) || exit 1 +if [ -f $latdir/per_utt ]; then + sdata=$data/split${nj}utt + utils/split_data.sh --per-utt $data $nj +else + sdata=$data/split$nj + utils/split_data.sh $data $nj +fi + +mkdir -p $dir/log $dir/misc + +cp $tree $dir/misc/ +copy-transition-model $trans_mdl $dir/misc/${lang}.trans_mdl +cp $normalization_fst $den_fst $dir/misc/ +cp $data/utt2spk $dir/misc/ +if [ -f $data/utt2uniq ]; then + cp $data/utt2uniq $dir/misc/ +elif [ -f $dir/misc/utt2uniq ]; then + rm $dir/misc/utt2uniq +fi + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. 
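+  # (Illustrative note, not a command this script runs: the $dir/storage
+  #  directory is typically created beforehand with a site-specific command
+  #  along the lines of
+  #    utils/create_split_dir.pl /export/b0{1,2}/$USER/kaldi-data/egs/... $dir/storage
+  #  so that the cegs.*.ark files written below end up spread across filesystems.)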
+ echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $nj); do echo $dir/cegs.$x.ark; done) +fi + + +lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" +if [ ! -z $lattice_prune_beam ]; then + if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then + lats_rspecifier="$lats_rspecifier lattice-1best --acoustic-scale=$acwt ark:- ark:- |" + else + lats_rspecifier="$lats_rspecifier lattice-prune --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:- |" + fi +fi + +egs_opts="--long-key=true --left-context=$left_context --right-context=$right_context --num-frames=$frames_per_chunk --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" +[ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" +[ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" + +[ ! -z "$deriv_weights_scp" ] && egs_opts="$egs_opts --deriv-weights-rspecifier=scp:$deriv_weights_scp" + + +chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor" +[ ! -z $right_tolerance ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --right-tolerance=$right_tolerance" + +[ ! -z $left_tolerance ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance" + +if ! $constrained; then + # e2e supervision + chain_supervision_all_opts="$chain_supervision_all_opts --convert-to-pdfs=false" + egs_opts="$egs_opts --transition-model=$chaindir/0.trans_mdl" +fi + +if [ ! -z "$lattice_lm_scale" ]; then + chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" + + normalization_fst_scale=$(perl -e " + if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) { + print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; exit(1); + } + print (1.0 - $lattice_lm_scale);") || exit 1 + egs_opts="$egs_opts --normalization-fst-scale=$normalization_fst_scale" +fi + +if [ $stage -le 0 ]; then + $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl \ + "$lats_rspecifier" ark:- \| \ + chain-get-supervision $chain_supervision_all_opts \ + $dir/misc/${lang}.tree $dir/misc/${lang}.trans_mdl ark:- ark:- \| \ + nnet3-chain-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ + "$normalization_fst" scp:$sdata/JOB/feats.scp ark,s,cs:- \ + ark,scp:$dir/cegs.JOB.ark,$dir/cegs.JOB.scp || exit 1; +fi + + +if [ $stage -le 1 ]; then + frames_and_chunks=$(for n in $(seq $nj); do cat $dir/log/get_egs.$n.log; done | \ + perl -e '$nc=0; $nf=0; while() { + if (m/Split .+ into (\d+) chunks/) { $this_nc = $1; } + if (m/Average chunk length was (\d+) frames/) { $nf += $1 * $this_nc; $nc += $this_nc; } + } print "$nf $nc"; ') + num_frames=$(echo $frames_and_chunks | awk '{print $1}') + num_chunks=$(echo $frames_and_chunks | awk '{print $2}') + frames_per_chunk_avg=$[num_frames/num_chunks] + feat_dim=$(feat-to-dim scp:$sdata/1/feats.scp -) + num_leaves=$(tree-info $tree | awk '/^num-pdfs/ {print $2}') + if [ $left_context_initial -lt 0 ]; then + left_context_initial=$left_context + fi + if [ $right_context_final -lt 0 ]; then + right_context_final=$right_context + fi + + cat >$dir/info.txt < $dir/all.scp +fi + +echo "$0: Finished preparing raw egs" diff --git a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py new file mode 100755 index 
00000000000..a4e8a44c1cd --- /dev/null +++ b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py @@ -0,0 +1,364 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Johns Hopkins University (author: Daniel Povey) +# Copyright 2018 Hossein Hadian + +# License: Apache 2.0. + +import os +import argparse +import sys +import re +import logging +import traceback +import random + +sys.path.insert(0, 'steps') + +logger = logging.getLogger('libs') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting choose_egs_to_merge.py') + + + + +def get_args(): + parser = argparse.ArgumentParser(description="Chooses groups of examples to merge into groups " + "of size given by the --chunks-per-group option, based on speaker " + "information (preferentially, chunks from the same utterance " + "and, if possible, the same speaker, get combined into " + "groups). This script also computes a held-out subset of...", + epilog="E.g. " + sys.argv[0] + "*** TODO *** ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument('--random-seed', type=int, + default = 123, help='Random seed.') + parser.add_argument("--chunks-per-group", type=int, default=4, + help="Number of chunks per speaker in the final egs (actually " + "means the number of chunks per group of chunks, and they are " + "only preferentially taken from the same speaker.") + parser.add_argument("--num-repeats", type=int, default=1, + help="The number of times the data is to be repeated. Must divide " + "--chunks-per-group. Suggest to try only 1 or 2. The idea " + "is to divide chunks into groups in different ways, to give " + "more variety to the egs (since the adaptation information " + "will differ.") + parser.add_argument("--heldout-data-selection-proportion", type=float, + default=0.2, + help="This parameter governs the selection of the heldout " + "subset and the statistically matched training subset. " + "It does not affect the size of that subset, but only " + "affects what pool the examples are drawb from. " + "Smaller values of this mean that the heldout groups " + "will be preferentially drawn from groups that " + "'contaminate' the least number of other groups, " + "and so require the least data to be removed from the " + "training set. Setting this to 1.0 would mean that " + "the heldout subset is drawn completely at random " + "(which might be more wasteful of training data, but " + "gives a selection that's statistically more " + "representative).") + parser.add_argument("--num-heldout-groups", type=int, default=200, + help="Number of utterance groups " + "that will go in the heldout subset (and in the " + "statistically matched training subset)") + parser.add_argument("--utt2uniq", type=str, default='', + help="File used in setups with data " + "augmentation, that maps from utterance-ids to the " + "pre-augmentation utterance-id. The reason it's needed " + "is to ensure that the heldout set is properly held " + "out (i.e., that different versions of those utterances " + "weren't trained on. If not specified, we assume the " + "identity map.") + parser.add_argument("--scp-in", type=str, required=True, + help="The scp file in, likely containing chain egs. 
The " + "keys are expected to be of the form: " + "'-----v1', " + "where the left_context, num_frames and right_context are required to be the " + "same in order for keys to be in a group (note: it's best if the " + "--extra-left-context-initial and --extra-right-context-final options " + "are not used, and if the --frames-per-chunk is a single number, in " + "order to prevent this constraint from splitting up the utterances from " + "a single speaker") + parser.add_argument("--training-data-out", type=str, required=True, + help="The output file containing the chunks that are to be grouped; each " + "line will contain --chunks-per-group (e.g. 4) rxfilenames, obtained " + "from the second field of the input --scp-in file.") + parser.add_argument("--heldout-subset-out", type=str, required=True, + help="This is the name of the file to which the heldout data subset " + "will be written; the format is the same as --training-data-out.") + parser.add_argument("--training-subset-out", type=str, required=True, + help="This is the name of the file to which the statistically matched " + "(to --heldout-subset-out) set of training data will be written") + + print(sys.argv, file=sys.stderr) + args = parser.parse_args() + + return args + + +""" +Notes on plan for how to implement this (we can keep this as documentation, but +we'll maybe move some of it around when things get implemented). + +This is a rather simple plan and we might later implement something more +sophisticated that does a better job of keeping chunks from the same utterance +or the same speaker together. + +Basically we rely on the fact that the input utterances come in in sorted order +(so utterances from adjacent speakers will naturally be together. + +We read the entries in the input scp file as a list, keeping them in the order +they were in the input (which will naturally keep together chunks from the +same utterance and utterances from the same speaker, since the raw egs were +not randomized). We split that list into distinct sub-lists, each with a unique value +of --. In the normal case +there will be just one such sub-list. + +In the case where --chunks-per-group=4 and --num-repeats=1, the groups of +chunks would then just be (and we do this for each of the sub-lists): +the first 4 chunks; the second 4 chunks; and so on. In the case where +--chunks-per-group=4 and --num-repeats=2, we'd obtain the groups as above, then +we'd discard the first 2 chunks of each sub-list and repeat the process, giving +us twice the original number of groups. If you want you can just +assert that --num-repeats is either 1 or 2 for now; higher values don't +really make sense with the current approach for choosing groups. + +Once we have the groups as above, we need to figure out the subset of +size --num-heldout-groups which will be chosen to appear in the output +file --heldout-subset-out. We'll also be choosing another subset of +the same size to appear in the file --training-subset-out; and we'll +be excluding some groups from the output --training-data-out (any +utterances that appeared in --heldout-subset-out, or which were linked +with such utterances via the --utt2uniq map, will be excluded). + +The way we choose the groups to appear in --heldout-subset-out is as follows. +Firstly: in cases where the utt2uniq file is undefined, treat it as the identity +map. We are given list of groups. 
We compute, for each group, the set of +utterances represented in it, and from that, the set of "uniq" values (a "uniq" +value is a string, representing a pre-augmentation utterance-id). For each +"uniq" value, we will compute the set of group-ids in which it was represented. +For a given group, we take the union of all those sets for its "uniq" value, and +remove its own group-id; this gives us the set of other groups that share a +pre-augmentation utterance in common with this group. This set might be empty +only in the case where there was no augmentation and --num-repeats=1, and some +particular utterance had been split into exactly 4 chunks which all ended up in +the same group. + +From the information above we can sort the groups by the number of groups we'd +have to hold out if we were to put that group in the heldout set. Then if, say, +--heldout-data-selection-proportion=0.2, we take the bottom 20% of groups by +this measure, meaning the groups which will cause less training data to have to +be held out. This is the set from which we'll select the heldout data and the +matched subset of training data. Call this the "candidate set". We first +choose --num-heldout-groups groups from the candidate set. This is the heldout +subset. From the heldout subset we compute the set of "uniq" values represented, +and we remove from the training set any groups which share those "uniq" values. + +Next we need to choose the matched subset of training examples. The way we do +this is that we choose --num-heldout-groups from the "candidate set", after +excluding groups that were in the heldout subset or which were removed from the +training set because they contained "uniq" values in common with those in the +heldout set. If this fails because there were too few groups in the candidate +set, just double --heldout-data-selection-proportion and retry. Make sure to do +something sensible in the case where the dataset is too tiny to choose the +requested heldout set size (i.e. print an informative error message before +dying). + +""" + +class Chunk: + """ This is a data structure for a chunk. A chunk is a single entry + of the --scp-in file. + 'eg' second field of --scp-in file + """ + def __init__(self, scp_line): + result = re.match("^(.*)-(\d+)-(\d+)-(\d+)-(\d+)-v1\s+(.*)$", scp_line) + self.utt_id, first_frame, left_context, num_frames, right_context, self.eg = result.groups() + self.chunk_id = self.utt_id + '-' + first_frame + self.context_structure = '-'.join((left_context, num_frames, right_context)) + def __repr__(self): + return '{}-{} {}'.format(self.chunk_id, self.context_structure, self.eg) + + +def read_all_chunks(scp_file): + """ Loads all the lines of the --scp-in file as chunk objects. + """ + chunks = [] + with open(scp_file, 'r', encoding='latin-1') as f: + for line in f: + try: + chunks.append(Chunk(line.strip())) + except: + logger.error('Bad line: ' + line.strip()) + raise + return chunks + +def load_utt2uniq(filename): + """ Loads the --utt2uniq file as a dict. + """ + utt2uniq = {} + with open(filename, 'r', encoding='latin-1') as f: + for line in f: + uttid, base_uttid = line.strip().split() + utt2uniq[uttid] = base_uttid + return utt2uniq + +def write_egs(filename, group_indexes, all_groups): + """ Writes the output egs, i.e. the second field of + the --scp-in file for specific chunks specified by `group_indexes`. 
+ """ + with open(filename, 'w', encoding='latin-1') as f: + for group_index in group_indexes: + for chunk in all_groups[group_index]: + f.write('{}\n'.format(chunk.eg)) + + + +def choose_egs(args): + """ The main part of the program. + """ + random.seed(args.random_seed) + logger.info('Set random seed to {}.'.format(args.random_seed)) + all_chunks = read_all_chunks(args.scp_in) + logger.info('Loaded {} chunks.'.format(len(all_chunks))) + + chunk_to_sublist = {} + for chunk in all_chunks: + if chunk.context_structure not in chunk_to_sublist: + chunk_to_sublist[chunk.context_structure] = [chunk] + else: + chunk_to_sublist[chunk.context_structure].append(chunk) + + logger.info('Created {} sub-lists with uniqe context ' + 'structure.'.format(len(chunk_to_sublist))) + + + assert(args.num_repeats == 1 or args.num_repeats == 2) + groups = [] # All groups from all sub-lists + for context_structure in sorted(chunk_to_sublist.keys()): + sublist = chunk_to_sublist[context_structure] + logger.info('Processing chunks with context ' + 'structure: {}'.format(context_structure)) + num_groups = (len(sublist) + + args.chunks_per_group - 1) // args.chunks_per_group + for i in range(num_groups): + group = sublist[i * args.chunks_per_group : (i + 1) * args.chunks_per_group] + groups.append(group) + if args.num_repeats == 2: + shift = args.chunks_per_group // 2 + group = sublist[i * args.chunks_per_group + shift : + (i + 1) * args.chunks_per_group + shift] + if group: + groups.append(group) + + logger.info('Created a total of {} groups.'.format(len(groups))) + + utt2uniq = {} + if args.utt2uniq: + utt2uniq = load_utt2uniq(args.utt2uniq) + logger.info('Loaded utt2uniq file with {} entries.'.format(len(utt2uniq))) + else: + logger.info('--utt2uniq not specified; using identity map.') + + + uniq_to_groups = {} # uniq to set of groups that include it + for i, group in enumerate(groups): + for chunk in group: + uniq = utt2uniq.get(chunk.utt_id, chunk.utt_id) + if uniq not in uniq_to_groups: + uniq_to_groups[uniq] = set([i]) + else: + uniq_to_groups[uniq].add(i) + + logger.info('Computed uniq-to-groups for {} uniqs. Average number of ' + 'groups representing a uniq is ' + '{}'.format(len(uniq_to_groups), + sum([len(g) for g in uniq_to_groups.values()]) / + len(uniq_to_groups))) + + # This is indexed by group-index (same len as groups). other_groups[i] is + # the set of other groups which share some utterance with group i. + other_groups = [set() for g in groups] + for i, group in enumerate(groups): + for chunk in group: + uniq = utt2uniq.get(chunk.utt_id, chunk.utt_id) + other_groups_this_uniq = uniq_to_groups[uniq] + other_groups[i].update(other_groups_this_uniq) + + for i, other in enumerate(other_groups): # Remove self + other.remove(i) + + # 'group_shared_size' is a list of pairs (i, n) where i is group-index and + # n is the number of groups that we'd + # have to hold out if we were to put that group in the heldout set. 
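+    # (Illustrative example, not computed by the code: with three groups where
+    # group 0 shares a pre-augmentation utterance only with group 1, we get
+    # other_groups == [{1}, {0}, set()] and group_shared_size ==
+    # [(0, 1), (1, 1), (2, 0)]; after the sort below, group 2 comes first,
+    # i.e. it is the cheapest group to hold out, because holding it out forces
+    # nothing else to be removed from the training set.)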
+ group_shared_size = [(i, len(other)) for i, other in enumerate(other_groups)] + # Sort it on n: + group_shared_size.sort(key=lambda tup: tup[1]) + + total_num_groups = len(groups) + training_set = set(range(total_num_groups)) # All groups + candidate_set_size = int(args.heldout_data_selection_proportion + * total_num_groups) + logger.info('Initial candidate set size: {}'.format(candidate_set_size)) + if args.num_heldout_groups > candidate_set_size: + logger.error('args.heldout_data_selection_proportion is too small or ' + 'there are too few groups.') + sys.exit(1) + + candidate_set = set([tup[0] for tup in group_shared_size[:candidate_set_size]]) + heldout_list = random.sample(candidate_set, args.num_heldout_groups) + + + # Remove all the heldout groups (and any other groups sharing some utterance + # with them) from both the candidate set and the training set + for group_index in heldout_list: + for shared_group_index in other_groups[group_index]: + candidate_set.discard(shared_group_index) + training_set.discard(shared_group_index) + candidate_set.discard(group_index) + training_set.discard(group_index) + + logger.info('Candidate set size after removing heldout ' + 'groups: {}'.format(len(candidate_set))) + if args.num_heldout_groups > len(candidate_set): + logger.warn('Not enough groups left in the candidate set. Doubling it.') + candidate_set = set([tup[0] for tup in + group_shared_size[:candidate_set_size * 2]]) + for group_index in heldout_list: + for shared_group_index in other_groups[group_index]: + candidate_set.discard(shared_group_index) + candidate_set.discard(group_index) + logger.info('Candidate set size after doubling and removing heldout ' + 'groups: {}'.format(len(candidate_set))) + if args.num_heldout_groups > len(candidate_set): + logger.error('args.heldout_data_selection_proportion is too small ' + 'or there are too few groups. Not enough groups left.') + sys.exit(1) + + train_subset_list = random.sample(candidate_set, args.num_heldout_groups) + + + # Write the outputs: + write_egs(args.training_data_out, training_set, groups) + write_egs(args.heldout_subset_out, heldout_list, groups) + write_egs(args.training_subset_out, train_subset_list, groups) + + +def main(): + try: + args = get_args() + choose_egs(args) + except Exception as e: + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py b/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py new file mode 100755 index 00000000000..c1e9a04179b --- /dev/null +++ b/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 + +# Copyright 2019 Johns Hopkins University (author: Daniel Povey) +# Copyright Hossein Hadian + + +# Apache 2.0. + +""" This script outputs information about a neural net training schedule, + to be used by ../train.sh, in the form of lines that can be selected + and sourced by the shell. +""" + +import argparse +import sys + +sys.path.insert(0, 'steps') +import libs.nnet3.train.common as common_train_lib +import libs.common as common_lib + +def get_args(): + parser = argparse.ArgumentParser( + description="""Output training schedule information to be consumed by ../train.sh""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument("--frame-subsampling-factor", type=int, default=3, + help="""Frame subsampling factor for the combined model + (bottom+top), will normally be 3. 
Required here in order + to deal with frame-shifted versions of the input.""") + parser.add_argument("--initial-effective-lrate", + type=float, + dest='initial_effective_lrate', default=0.001, + help="""Effective learning rate used on the first iteration, + determines schedule via geometric interpolation with + --final-effective-lrate. Actual learning rate is + this times the num-jobs on that iteration.""") + parser.add_argument("--final-effective-lrate", type=float, + dest='final_effective_lrate', default=0.0001, + help="""Learning rate used on the final iteration, see + --initial-effective-lrate for more documentation.""") + parser.add_argument("--num-jobs-initial", type=int, default=1, + help="""Number of parallel neural net jobs to use at + the start of training""") + parser.add_argument("--num-jobs-final", type=int, default=1, + help="""Number of parallel neural net jobs to use at + the end of training. Would normally + be >= --num-jobs-initial""") + parser.add_argument("--num-epochs", type=float, default=4.0, + help="""The number of epochs to train for. + Note: the 'real' number of times we see each + utterance is this number times --frame-subsampling-factor + (to cover frame-shifted copies of the data), times + the value of --num-repeats given to process_egs.sh, + times any factor arising from data augmentation.""") + parser.add_argument("--dropout-schedule", type=str, + help="""Use this to specify the dropout schedule (how the dropout probability varies + with time, 0 == no dropout). You specify a piecewise + linear function on the domain [0,1], where 0 is the + start and 1 is the end of training; the + function-argument (x) rises linearly with the amount of + data you have seen, not iteration number (this improves + invariance to num-jobs-{initial-final}). E.g. '0,0.2,0' + means 0 at the start; 0.2 after seeing half the data; + and 0 at the end. You may specify the x-value of + selected points, e.g. '0,0.2@0.25,0' means that the 0.2 + dropout-proportion is reached a quarter of the way + through the data. The start/end x-values are at + x=0/x=1, and other unspecified x-values are interpolated + between known x-values. You may specify different rules + for different component-name patterns using + 'pattern1=func1 pattern2=func2', e.g. 'relu*=0,0.1,0 + lstm*=0,0.2,0'. More general should precede less + general patterns, as they are applied sequentially.""") + + parser.add_argument("--num-scp-files", type=int, default=0, required=True, + help="""The number of .scp files in the egs dir.""") + parser.add_argument("--schedule-out", type=str, required=True, + help="""Output file containing the training schedule. The output + is lines, one per training iteration. + Each line (one per iteration) is a list of ;-separated commands setting shell + variables. Currently the following variables are set: + iter, num_jobs, inv_num_jobs, scp_indexes, frame_shifts, dropout_opt, lrate. 
+ """) + + print(sys.argv, file=sys.stderr) + args = parser.parse_args() + + return args + +def get_schedules(args): + num_scp_files_expanded = args.num_scp_files * args.frame_subsampling_factor + num_scp_files_to_process = int(args.num_epochs * num_scp_files_expanded) + num_scp_files_processed = 0 + num_iters = ((num_scp_files_to_process * 2) + // (args.num_jobs_initial + args.num_jobs_final)) + + with open(args.schedule_out, 'w', encoding='latin-1') as ostream: + for iter in range(num_iters): + current_num_jobs = int(0.5 + args.num_jobs_initial + + (args.num_jobs_final - args.num_jobs_initial) + * float(iter) / num_iters) + # as a special case, for iteration zero we use just one job + # regardless of the --num-jobs-initial and --num-jobs-final. This + # is because the model averaging does not work reliably for a + # freshly initialized model. + if iter == 0: + current_num_jobs = 1 + + lrate = common_train_lib.get_learning_rate(iter, current_num_jobs, + num_iters, + num_scp_files_processed, + num_scp_files_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + + if args.dropout_schedule == "": + args.dropout_schedule = None + dropout_edit_option = common_train_lib.get_dropout_edit_option( + args.dropout_schedule, + float(num_scp_files_processed) / max(1, (num_scp_files_to_process - args.num_jobs_final)), + iter) + + frame_shifts = [] + egs = [] + for job in range(1, current_num_jobs + 1): + # k is a zero-based index that we will derive the other indexes from. + k = num_scp_files_processed + job - 1 + # work out the 1-based scp index. + scp_index = (k % args.num_scp_files) + 1 + # previous : frame_shift = (k/num_scp_files) % frame_subsampling_factor + frame_shift = ((scp_index + k // args.num_scp_files) + % args.frame_subsampling_factor) + + # Instead of frame shifts like [0, 1, 2], we make them more like + # [0, 1, -1]. This is clearer in intent, and keeps the + # supervision starting at frame zero, which IIRC is a + # requirement somewhere in the 'chaina' code. + if frame_shift > (args.frame_subsampling_factor // 2): + frame_shift = frame_shift - args.frame_subsampling_factor + + frame_shifts.append(str(frame_shift)) + egs.append(str(scp_index)) + + + print("""iter={iter}; num_jobs={nj}; inv_num_jobs={nj_inv}; scp_indexes=(pad {indexes}); frame_shifts=(pad {shifts}); dropout_opt="{opt}"; lrate={lrate}""".format( + iter=iter, nj=current_num_jobs, nj_inv=(1.0 / current_num_jobs), + indexes = ' '.join(egs), shifts=' '.join(frame_shifts), + opt=dropout_edit_option, lrate=lrate), file=ostream) + num_scp_files_processed = num_scp_files_processed + current_num_jobs + + +def main(): + args = get_args() + get_schedules(args) + +if __name__ == "__main__": + main() diff --git a/egs/wsj/s5/steps/chaina/process_egs.sh b/egs/wsj/s5/steps/chaina/process_egs.sh new file mode 100755 index 00000000000..e8d8cfeab4e --- /dev/null +++ b/egs/wsj/s5/steps/chaina/process_egs.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script takes nnet examples dumped by steps/chaina/get_raw_egs.sh and +# combines the chunks into groups by speaker (to the extent possible; it may +# need to combine speakers in some cases), locally randomizes the result, and +# dumps the resulting egs to disk. Chunks of these will later be globally +# randomized (at the scp level) by steps/chaina/randomize_egs.sh + + +# Begin configuration section. 
+cmd=run.pl +chunks_per_group=4 +num_repeats=2 # number of times we repeat the same chunks with different + # grouping. Recommend 1 or 2; must divide chunks_per_group +compress=true # set this to false to disable compression (e.g. if you want to see whether + # results are affected). + + +num_heldout_groups=200 # The number of groups (i.e. groups of chunks) that + # will go in the held-out set and the train subset + # (heldout_subset.scp and train_subset.scp). The real + # point of train_subset.scp, and the reason we can't + # just use a subset of train.scp, is that it contains + # egs that are statistically comparable to + # heldout_subset.scp, so their prob can be + # meaningfully compared with those from + # heldout_subset.scp. Note: the number (e.g. 200) is + # *after* merging chunks into groups of size + # $chunks_per_group. + + +shuffle_buffer_size=5000 # Size of buffer (containing grouped egs) to use + # for random shuffle. + +stage=0 +nj=5 # the number of parallel jobs to run. +srand=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 --chunks-per-group 4 exp/chaina/tdnn1a_sp/raw_egs exp/chaina/tdnn1a_sp/processed_egs" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options (alternative to this" + echo " # command line)" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --chunks-per-group # Number of chunks (preferentially, from a single speaker" + echo " # to combine into each example. This grouping of" + echo " # egs is part of the 'chaina' framework; the adaptation" + echo " # parameters will be estimated from these groups." + echo " --num-repeats # Number of times we group the same chunks into different" + echo " # groups. For now only the values 1 and 2 are" + echo " # recommended, due to the very simple way we choose" + echo " # the groups (it's consecutive)." + echo " --nj # Number of jobs to run in parallel. Usually quite a" + echo " # small number, as we'll be limited by disk access" + echo " # speed." + echo " --compress # True if you want the egs to be compressed" + echo " # (e.g. you may set to false for debugging purposes, to" + echo " # check that the compression is not hurting)." + echo " --num-heldout-egs # Number of egs to put in train_subset.scp and heldout_subset.scp." + echo " # These will be used for diagnostics. Note: this number is" + echo " # the number of grouped egs, after merging --chunks-per-group" + echo " # chunks into a single eg." + echo " # ... may be a comma separated list, but we advise a single" + echo " # number in most cases, due to interaction with the need " + echo " # to group egs from the same speaker into groups." + echo " --stage # Used to run this script from somewhere in" + echo " # the middle." + exit 1; +fi + +raw_egs_dir=$1 +dir=$2 + +# die on error or undefined variable. +set -e -u + +if ! 
steps/chaina/validate_raw_egs.sh $raw_egs_dir; then + echo "$0: failed to validate input directory $raw_egs_dir" + exit 1 +fi + + +mkdir -p $dir/temp $dir/log + + +if [ $stage -le 0 ]; then + echo "$0: choosing egs to merge" + + utt2uniq_opt= + [ -f $raw_egs_dir/misc/utt2uniq ] && utt2uniq_opt="--utt2uniq=$raw_egs_dir/misc/utt2uniq" + + $cmd $dir/log/choose_egs_to_merge.log steps/chaina/internal/choose_egs_to_merge.py \ + --chunks-per-group=$chunks_per_group \ + --num-repeats=$num_repeats \ + --num-heldout-groups=$num_heldout_groups \ + $utt2uniq_opt \ + --scp-in=$raw_egs_dir/all.scp \ + --training-data-out=$dir/temp/train.list \ + --heldout-subset-out=$dir/temp/heldout_subset.list \ + --training-subset-out=$dir/temp/train_subset.list +fi + +if [ $stage -le 1 ]; then + + for name in heldout_subset train_subset; do + echo "$0: merging and shuffling $name egs" + + # Linearize these lists and add keys to make it an scp format. + awk '{for (n=1;n<=NF;n++) { count++; print count, $n; }}' <$dir/temp/${name}.list >$dir/temp/${name}.scp + + $cmd $dir/log/merge_${name}_egs.log \ + nnet3-chain-merge-egs --minibatch-size=$chunks_per_group --compress=$compress \ + scp:$dir/temp/${name}.scp ark:- \| \ + nnet3-chain-shuffle-egs --srand=$srand ark:- ark,scp:$dir/${name}.ark,$dir/${name}.scp + done + + # Split up the training list into multiple smaller lists, as it could be long. + utils/split_scp.pl $dir/temp/train.list $(for j in $(seq $nj); do echo $dir/temp/train.$j.list; done) + # Linearize these lists and add keys to make them in scp format; + # nnet3-chain-merge-egs will merge the right groups, it's deterministic + # and we specified --minibatch-size=$chunks_per_group. + for j in $(seq $nj); do + awk '{for (n=1;n<=NF;n++) { count++; print count, $n; }}' <$dir/temp/train.$j.list >$dir/temp/train.$j.scp + done + + if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. + echo "$0: creating data links" + utils/create_data_link.pl $(for j in $(seq $nj); do echo $dir/train.$j.ark; done) || true + fi + + $cmd JOB=1:$nj $dir/log/merge_train_egs.JOB.log \ + nnet3-chain-merge-egs --compress=$compress --minibatch-size=$chunks_per_group \ + scp:$dir/temp/train.JOB.scp ark:- \| \ + nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size \ + --srand=\$[JOB+$srand] ark:- ark,scp:$dir/train.JOB.ark,$dir/train.JOB.scp + # the awk command is to ensure unique ids for each group. + cat $(for j in $(seq $nj); do echo $dir/train.$j.scp; done) | awk '{printf("%09d %s\n", NR, $2);}' > $dir/train.scp +fi + + +cat $raw_egs_dir/info.txt | awk -v num_repeats=$num_repeats \ + -v chunks_per_group=$chunks_per_group ' + /^dir_type / { print "dir_type processed_chaina_egs"; next; } + /^num_input_frames / { print "num_input_frames "$2 * num_repeats; next; } # approximate; ignores held-out egs. 
+ /^num_chunks / { print "num_chunks " $2 * num_repeats; next; } + {print;} + END{print "chunks_per_group " chunks_per_group; print "num_repeats " num_repeats;}' >$dir/info.txt + +# # Note: the info.txt will actually look like the following, in general, +# # taking into account the fields present in the info.txt in the source dir: +# dir_type processed_chaina_egs +# num_input_frames $num_frames +# num_chunks $num_chunks +# lang $lang +# feat_dim $feat_dim +# num_leaves $num_leaves +# frames_per_chunk $frames_per_chunk +# frames_per_chunk_avg $frames_per_chunk_avg +# left_context $left_context +# left_context_initial $left_context_initial +# right_context $right_context +# right_context_final $right_context_final +# chunks_per_group $chunks_per_group + + +if ! cat $dir/info.txt | awk '{if (NF == 1) exit(1);}'; then + echo "$0: we failed to obtain at least one of the fields in $dir/info.txt" + exit 1 +fi + +cp -r $raw_egs_dir/misc/ $dir/ + + +echo "$0: Finished processing egs" diff --git a/egs/wsj/s5/steps/chaina/randomize_egs.sh b/egs/wsj/s5/steps/chaina/randomize_egs.sh new file mode 100755 index 00000000000..943d383c571 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/randomize_egs.sh @@ -0,0 +1,194 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script takes nnet examples dumped by steps/chaina/process_egs.sh, +# globally randomizes the egs, and divides into multiple .scp files. This is +# the form of egs which is consumed by the training script. All this is done +# only by manipulating the contents of .scp files. To keep locality of disk +# access, we only randomize blocks of egs (e.g. blocks containing 128 groups of +# sequences). This doesn't defeat randomization, because both process_egs.sh +# and the training script use nnet3-shuffle-egs to do more local randomization. + +# Later on, we'll have a multilingual/multi-input-dir version fo this script +# that combines egs from various data sources and possibly multiple languages. +# This version assumes there is just one language. + +# Begin configuration section. +cmd=run.pl + +groups_per_block=128 # The 'groups' are the egs in the scp file from + # process_egs.sh, containing '--chunks-per-group' sequences + # each. + +frames_per_job=3000000 # The number of frames of data we want to process per + # training job (will determine how long each job takes, + # and the frequency of model averaging. This was + # previously called --frames-per-iter, but + # --frames-per-job is clearer as each job does this + # many. + +num_groups_combine=1000 # the number of groups from the training set that we + # randomly choose as input to nnet3-chain-combine; + # these will go to combine.scp. train_subset.scp and + # heldout_subset.scp are, for now, just copied over + # from the input. + +# Later we may provide a mechanism to change the language name; for now we +# just copy it from the input. + + +srand=0 +stage=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 --frames-per-job 2000000 exp/chaina/tdnn1a_sp/processed_egs exp/chaina/tdnn1a_sp/egs" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options (alternative to this" + echo " # command line)" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --groups-per-block # The number of groups (i.e. 
previously merged egs" + echo " # containing --chunks-per-group chunks) to to consider " + echo " # as one block, where whole blocks are randomized;" + echo " # smaller means more complete randomization but less" + echo " # local disk access." + echo " --frames-per-job # The number of input frames (not counting context)" + echo " # that we aim to have in each scp file after" + echo " # randomization and splitting." + echo " --num-groups-combine # The number of randomly chosen groups to" + echo " # put in the subset in 'combine.scp' which will" + echo " # be used in nnet3-chaina-combine to decide which" + echo " # models to average over." + echo " --stage # Used to run this script from somewhere in" + echo " # the middle." + echo " --srand # Random seed, affects randomization." + exit 1; +fi + +processed_egs_dir=$1 +dir=$2 + +# die on error or undefined variable. +set -e -u + +if ! steps/chaina/validate_processed_egs.sh $processed_egs_dir; then + echo "$0: could not validate input directory $processed_egs_dir" + exit 1 +fi + +# Work out how many groups per job and how many frames per job we'll have + +info_in=$processed_egs_dir/info.txt + +frames_per_group_avg=$(awk '/^frames_per_chunk_avg/ { fpc=$2; } /^chunks_per_group/ { print int(fpc * $2); }' $info_in) +if ! [ $frames_per_group_avg -gt 0 ]; then + echo "$0: error getting frames per group."; +fi + +num_groups=$(wc -l <$processed_egs_dir/train.scp) + +num_scp_files=$[(frames_per_group_avg*num_groups + frames_per_job/2) / frames_per_job] +[ $num_scp_files -eq 0 ] && num_scp_files=1 + +frames_per_scp_file=$[(frames_per_group_avg * num_groups) / num_scp_files] +groups_per_scp_file=$[ num_groups / num_scp_files] + + +mkdir -p $dir/temp + +if [ -d $dir/misc ]; then + rm -r $dir/misc +fi + +mkdir -p $dir/misc +cp $processed_egs_dir/misc/* $dir/misc + + +# We want to globally randomize the order of these blocks of (e.g.) 128 lines of +# the input train.scp, and then split up into $num_scp_files groups. we could +# do this in a specially-written python script, but instead we do it with a +# combination of existing Kaldi and UNIX utilities. + +awk -v gpb=$groups_per_block \ + '{block=sprintf("%05d", NR / gpb); group_id=$1; print group_id, block;}' \ + <$processed_egs_dir/train.scp >$dir/temp/key2block + +# get list of blocks +awk '{print $2}' <$dir/temp/key2block | uniq > $dir/temp/blocks +# get randomized-order list of blocks +utils/shuffle_list.pl --srand "$srand" <$dir/temp/blocks > $dir/temp/blocks_rand +# Map block-ids to randomized-order block-ids +paste $dir/temp/blocks $dir/temp/blocks_rand > $dir/temp/block2rand + + +# The following command first maps block-ids to randomized-order block-ids, then +# sorts the keys by these randomized-order block-ids while otherwise maintaining +# stable sorting (-s) which keeps the keys in the blocks in the same order. +utils/apply_map.pl -f 2 $dir/temp/block2rand <$dir/temp/key2block | \ + sort -k2 -s > $dir/temp/key2block_rand + + +# The following command just changes the order of train.scp to +# match the order in key2block_rand (which has the order of blocks +# of lines randomly moved around). +awk '{print $1, $1}' $dir/temp/key2block_rand | \ + utils/apply_map.pl -f 2 $processed_egs_dir/train.scp \ + >$dir/temp/train.scp_rand + + +# The following command splits up $dir/temp/train.scp_rand (the randomized-order +# version of train.scp), while keeping distinct blocks in separate scp files, +# thanks to the --utt2spk option. 
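+# (Worked example with illustrative values: with groups_per_block=2 and six
+#  group-ids g1..g6 in train.scp, key2block assigns the blocks {g1}, {g2,g3},
+#  {g4,g5}, {g6}; block 00000 comes out one group short because awk truncates
+#  NR/gpb.  If the shuffled block order happens to put the {g2,g3} block first,
+#  the stable sort produces the group order g2 g3 ... with the original order
+#  preserved inside each block, i.e. whole blocks move around but their
+#  contents are not reshuffled.)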
+utils/split_scp.pl --utt2spk=$dir/temp/key2block_rand \ + $dir/temp/train.scp_rand \ + $(for i in $(seq $num_scp_files); do echo $dir/train.$i.scp; done) + + +cp $processed_egs_dir/heldout_subset.scp $processed_egs_dir/train_subset.scp $dir/ + + +# note: there is only one language in $processed_egs_dir (any +# merging would be done at the randomization stage but that is not supported yet). + +lang=$(awk '/^lang / { print $2; }' <$processed_egs_dir/info.txt) + +# We'll store info files per language, containing the part of the information +# that is language-specific, plus a single global info.txt containing stuff that +# is not language specific. +# This will get more complicated once we actually support multiple languages, +# and when we allow multiple input processed egs dirs for the same language. + +grep -v -E '^dir_type|^lang|^feat_dim' <$processed_egs_dir/info.txt | \ + cat <(echo "dir_type randomized_chaina_egs") - > $dir/info_$lang.txt + + +cat <$dir/info.txt +dir_type randomized_chaina_egs +num_scp_files $num_scp_files +langs $lang +frames_per_scp_file $frames_per_scp_file +groups_per_scp_file $groups_per_scp_file +EOF +# frames_per_job, after rounding, becomes frames_per_scp_file. + +# note: frames_per_chunk_avg will be present in the info.txt file as well as +# the per-language files. +grep -E '^feat_dim|^frames_per_chunk_avg' <$processed_egs_dir/info.txt >>$dir/info.txt + + + +if ! cat $dir/info.txt | awk '{if (NF == 1) exit(1);}'; then + echo "$0: we failed to obtain at least one of the fields in $dir/info.txt" + exit 1 +fi + + +echo "$0: Finished randomizing egs" diff --git a/egs/wsj/s5/steps/chaina/train.sh b/egs/wsj/s5/steps/chaina/train.sh new file mode 100755 index 00000000000..0bfefd43b21 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/train.sh @@ -0,0 +1,329 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + + +# Begin configuration section +stage=0 +leaky_hmm_coefficient=0.1 +xent_regularize=0.1 +apply_deriv_weights=false # you might want to set this to true in unsupervised training + # scenarios. +memory_compression_level=2 # Enables us to use larger minibatch size than we + # otherwise could, but may not be optimal for speed + # (--> set to 0 if you have plenty of memory. +dropout_schedule= +srand=0 +max_param_change=1.0 # we use a smaller than normal default (it's normally + # 2.0), because there are two models (bottom and top). +use_gpu=yes # can be "yes", "no", "optional", "wait" + +common_opts= # Options passed through to nnet3-chaina-train and nnet3-chaina-combine + +top_unadapted_weight=0.5 +bottom_unadapted_weight=0.5 + +num_epochs=4.0 # Note: each epoch may actually contain multiple repetitions of + # the data, for various reasons: + # using the --num-repeats option in process_egs.sh + # data augmentation + # different data shifts (this includes 3 different shifts + # of the data if frame_subsampling_factor=3 (see $dir/init/info.txt) + +num_jobs_initial=1 +num_jobs_final=1 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +groups_per_minibatch=32 # This is how you set the minibatch size. Note: if + # chunks_per_group=4, this would mean 128 chunks per + # minibatch. + +max_iters_combine=80 +max_models_combine=20 +diagnostic_period=5 # Get diagnostics every this-many iterations + +shuffle_buffer_size=1000 # This "buffer_size" variable controls randomization of the groups + # on each iter. 
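+# Illustrative arithmetic (num_scp_files and frame_subsampling_factor are
+# assumed values here; they are read at run time from the egs dir and from
+# $dir/init/info.txt): with the defaults above (num_epochs=4.0, one job
+# throughout), num_scp_files=10 and frame_subsampling_factor=3,
+# get_train_schedule.py will produce int(4.0 * 10 * 3) * 2 / (1 + 1) = 120
+# iterations, each of which processes one train.*.scp file with one of the
+# three frame shifts.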
+ + + + +# End configuration section + + + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + echo "Usage: $0 [options] " + echo " e.g.: $0 exp/chaina/tdnn1a_sp/egs exp/chaina/tdnn1a_sp" + echo "" + echo " TODO: more documentation" + exit 1 +fi + +egs_dir=$1 +dir=$2 + +set -e -u # die on failed command or undefined variable + +steps/chaina/validate_randomized_egs.sh $egs_dir + +for f in $dir/init/info.txt $dir/init/bottom.raw; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + +frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$dir/init/info.txt) +bottom_subsampling_factor=$(awk '/^bottom_subsampling_factor/ {print $2}' <$dir/init/info.txt) + +if ! [ $[frame_subsampling_factor%bottom_subsampling_factor] == 0 ]; then + echo "$0: bad subsampling factors in $dir/init/info.txt" + exit 1 +fi + +num_scp_files=$(awk '/^num_scp_files/ {print $2}' <$egs_dir/info.txt) + +steps/chaina/internal/get_train_schedule.py \ + --frame-subsampling-factor=$frame_subsampling_factor \ + --num-jobs-initial=$num_jobs_initial \ + --num-jobs-final=$num_jobs_final \ + --num-epochs=$num_epochs \ + --dropout-schedule="$dropout_schedule" \ + --num-scp-files=$num_scp_files \ + --frame-subsampling-factor=$frame_subsampling_factor \ + --initial-effective-lrate=$initial_effective_lrate \ + --final-effective-lrate=$final_effective_lrate \ + --schedule-out=$dir/schedule.txt + + + +if [ "$use_gpu" != "no" ]; then gpu_cmd_opt="--gpu 1"; else gpu_cmd_opt=""; fi + +num_iters=$(wc -l <$dir/schedule.txt) + +echo "$0: will train for $num_epochs epochs = $num_iters iterations" + +# source the 1st line of schedule.txt in the shell; this sets +# lrate and dropout_opt, among other variables. +. <(head -n 1 $dir/schedule.txt) +langs=$(awk '/^langs/ { $1=""; print; }' <$dir/init/info.txt) + +mkdir -p $dir/log + +# Copy models with initial learning rate and dropout options from $dir/init to $dir/0 +mkdir -p $dir/0 +run.pl $dir/log/init_bottom_model.log \ + nnet3-copy --learning-rate=$lrate $dropout_opt $dir/init/bottom.raw $dir/0/bottom.raw +for lang in $langs; do + run.pl $dir/log/init_model_$lang.log \ + nnet3-am-copy --learning-rate=$lrate $dropout_opt $dir/init/$lang.mdl $dir/0/$lang.mdl +done + + +x=0 +if [ $stage -gt $x ]; then x=$stage; fi + +while [ $x -lt $num_iters ]; do + # Source some variables fromm schedule.txt. The effect will be something + # like the following: + # iter=0; num_jobs=2; inv_num_jobs=0.5; scp_indexes=(pad 1 2); frame_shifts=(pad 1 2); dropout_opt="--edits='set-dropout-proportion name=* proportion=0.0'" lrate=0.002 + . <(grep "^iter=$x;" $dir/schedule.txt) + + echo "$0: training, iteration $x, num-jobs is $num_jobs" + + next_x=$[$x+1] + model_in_dir=$dir/$x + if [ ! -f $model_in_dir/bottom.raw ]; then + echo "$0: expected $model_in_dir/bottom.raw to exist" + exit 1 + fi + den_fst_dir=$egs_dir/misc + transform_dir=$dir/init + model_out_dir=$dir/${next_x} + + + # for the first 4 iterations, plus every $diagnostic_period iterations, launch + # some diagnostic processes. 
We don't do this on iteration 0, because + # the batchnorm stats wouldn't be ready + if [ $x -gt 0 ] && [ $[x%diagnostic_period] -eq 0 -o $x -lt 5 ]; then + + [ -f $dir/$x/.error_diagnostic ] && rm $dir/$x/.error_diagnostic + for name in train heldout; do + $cmd $gpu_cmd_opt $dir/log/diagnostic_${name}.$x.log \ + nnet3-chaina-train --use-gpu=$use_gpu \ + --bottom.train=false --bottom.dropout-test-mode=true \ + --top.train=false --top.dropout-test-mode=true \ + --leaky-hmm-coefficient=$leaky_hmm_coefficient \ + --bottom-subsampling-factor=$bottom_subsampling_factor \ + --xent-regularize=$xent_regularize \ + --print-interval=10 \ + $model_in_dir $den_fst_dir $transform_dir \ + "ark:nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch scp:$egs_dir/${name}_subset.scp ark:-|" \ + || touch $dir/$x/.error_diagnostic & + done + fi + + if [ -d $dir/$next_x ]; then + echo "$0: removing previous contents of $dir/$next_x" + rm -r $dir/$next_x + fi + mkdir -p $dir/$next_x + + for j in $(seq $num_jobs); do + scp_index=${scp_indexes[$j]} + frame_shift=${frame_shifts[$j]} + + $cmd $gpu_cmd_opt $dir/log/train.$x.$j.log \ + nnet3-chaina-train --job-id=$j --use-gpu=$use_gpu --apply-deriv-weights=$apply_deriv_weights \ + --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + --bottom-subsampling-factor=$bottom_subsampling_factor \ + --top.unadapted-weight=$top_unadapted_weight --bottom.unadapted-weight=$bottom_unadapted_weight \ + --print-interval=10 --max-param-change=$max_param_change \ + --l2-regularize-factor=$inv_num_jobs --optimization.memory-compression-level=$memory_compression_level \ + $model_in_dir $den_fst_dir $transform_dir \ + "ark:nnet3-chain-copy-egs --frame-shift=$frame_shift scp:$egs_dir/train.$scp_index.scp ark:- | nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:- | nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch ark:- ark:-|" \ + $model_out_dir || touch $dir/$next_x/.error & + done + wait + if [ -f $dir/$next_x/.error ]; then + echo "$0: error detected training on iteration $x" + exit 1 + fi + # First average the bottom models + models=$(for j in $(seq $num_jobs); do echo $dir/$next_x/bottom.$j.raw; done) + run.pl $dir/log/average.$x.log \ + nnet3-average $models - \| \ + nnet3-copy --learning-rate=$lrate $dropout_opt - $dir/$next_x/bottom.raw + rm $models + for lang in $langs; do + models=$dir/$next_x/$lang.*.raw + run.pl $dir/log/average_${lang}.$x.log \ + nnet3-average $models - \| \ + nnet3-am-copy --set-raw-nnet=- --learning-rate=$lrate $dropout_opt $dir/$iter/$lang.mdl $dir/$next_x/$lang.mdl + rm $models + done + wait + [ -f $dir/$x/.error_diagnostic ] && echo "$0: error getting diagnostics on iter $x" && exit 1; + + $cmd $dir/log/progress_bottom.$x.log \ + nnet3-show-progress $dir/$x/bottom.raw $dir/$next_x/bottom.raw '&&' \ + nnet3-info $dir/$next_x/bottom.raw || touch $dir/$next_x/.error & + for lang in $langs; do + $cmd $dir/log/progress_${lang}.$x.log \ + nnet3-show-progress $dir/$x/$lang.mdl $dir/$next_x/$lang.mdl '&&' \ + nnet3-am-info $dir/$next_x/$lang.mdl || touch $dir/$next_x/.error & + done + [ -f $dir/$next_x/.error ] && echo "$0: error getting progress logs" && exit 1; + + # TODO: cleanup + x=$[x+1] +done + + +if [ $stage -le $num_iters ]; then + echo "$0: doing model combination" + if [ -d $dir/final ]; then + echo "$0: removing previous contents of $dir/final" + rm -r $dir/final + fi + mkdir -p $dir/final + den_fst_dir=$egs_dir/misc + + [ $max_models_combine -gt $[num_iters/2] ] && 
max_models_combine=$[num_iters/2]; + input_model_dirs=$(for x in $(seq $[num_iters+1-max_models_combine] $num_iters); do echo $dir/$x; done) + output_model_dir=$dir/final + transform_dir=$dir/init + + $cmd $gpu_cmd_opt $dir/log/combine.log \ + nnet3-chaina-combine --use-gpu=$use_gpu \ + --leaky-hmm-coefficient=$leaky_hmm_coefficient \ + --bottom-subsampling-factor=$bottom_subsampling_factor \ + --print-interval=10 \ + $input_model_dirs $den_fst_dir $transform_dir \ + "ark:nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch scp:$egs_dir/train_subset.scp ark:-|" \ + $dir/final +fi + + +if [ $stage -le $[num_iters+1] ]; then + # Now accumulate the class-dependent mean (and variance) stats of the + # adaptation model, which will be needed for decoding. We remove the map that + # had reduced the num-classes from several thousand to (e.g.) 200, because we + # are now estimating the means on a larger set of data and we're not concerned + # about noisy estimates. + mkdir -p $dir/transforms_unmapped + # Note: the plan was to add the option --remove-pdf-map=true to the 'copy' + # command below (to use the full number of pdf-ids as classes in test time), + # but it seemed to degrade the objective function, based on diagnostics. + # We'll look into this later. + for lang in $langs; do + run.pl $dir/log/copy_transform_${lang}.log \ + nnet3-adapt copy $dir/init/${lang}.ada $dir/transforms_unmapped/${lang}.ada + done + den_fst_dir=$egs_dir/misc + transform_dir=$dir/init + + num_jobs=$num_scp_files + [ $num_jobs -gt 4 ] && num_jobs=4 # there are so few params to estimate that + # more than 4 jobs would be a waste. + + $cmd $gpu_cmd_opt JOB=1:$num_jobs $dir/log/acc_target_model.JOB.log \ + nnet3-chaina-train --job-id=JOB --use-gpu=$use_gpu \ + --bottom-subsampling-factor=$bottom_subsampling_factor \ + --print-interval=10 \ + --bottom.train=false --bottom.dropout-test-mode=true --bottom.batchnorm-test-mode=true \ + --top.train=false --top.dropout-test-mode=true --top.batchnorm-test-mode=true \ + --adaptation-model-accumulate=true \ + $dir/final $den_fst_dir $dir/transforms_unmapped \ + "ark:nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size scp:$egs_dir/train.JOB.scp ark:- | nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch ark:- ark:-|" \ + $dir/final + + for lang in $langs; do + stats=$dir/final/${lang}.*.ada + run.pl $dir/log/estimate_target_model_${lang}.log \ + nnet3-adapt estimate $stats $dir/final/${lang}.ada + rm $stats + done +fi + +if [ $stage -le $[num_iters+2] ]; then + # Accumulate some final diagnostics. The difference with the last iteration's + # diagnostics is that we use test-mode for the adaptation model (i.e. a target + # model computed from all the data, not just one minibatch). 
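+  # Concretely, the diagnostic command below passes --adaptation-test-mode=true and
+  # uses $dir/final (which now contains the estimated per-language .ada files) as the
+  # transform directory, instead of $dir/init as in the per-iteration diagnostics.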
+ [ -f $dir/final/.error_diagnostic ] && rm $dir/final/.error_diagnostic + for name in train heldout; do + den_fst_dir=$egs_dir/misc + $cmd $gpu_cmd_opt $dir/log/diagnostic_${name}.final.log \ + nnet3-chaina-train --use-gpu=$use_gpu \ + --bottom-subsampling-factor=$bottom_subsampling_factor \ + --bottom.train=false --bottom.dropout-test-mode=true \ + --top.train=false --top.dropout-test-mode=true \ + --adaptation-test-mode=true \ + --leaky-hmm-coefficient=$leaky_hmm_coefficient \ + --xent-regularize=$xent_regularize \ + --print-interval=10 \ + $dir/final $den_fst_dir $dir/final \ + "ark:nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch scp:$egs_dir/${name}_subset.scp ark:-|" \ + || touch $dir/final/.error_diagnostic & + done + wait + if [ -f $dir/final/.error_diagnostic ]; then + echo "$0: error getting final diagnostic information" + exit 1 + fi + cp $dir/init/info.txt $dir/final/ +fi + + +transform_dir=$dir/init + +echo "$0: done" +exit 0 diff --git a/egs/wsj/s5/steps/chaina/validate_processed_egs.sh b/egs/wsj/s5/steps/chaina/validate_processed_egs.sh new file mode 100755 index 00000000000..d928642dff9 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/validate_processed_egs.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script validates a directory containing 'processed' egs for 'chaina' +# training, i.e. the output of process_egs.sh. It also helps to document the +# expectations on such a directory. + + +if [ -f path.sh ]; then . ./path.sh; fi + + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/chaina/tdnn1a_sp/processed_egs" + echo "" + echo "Validates that the processed-egs dir has the expected format" +fi + +dir=$1 + +# Note: the .ark files are not actually consumed directly downstream (only via +# the top-level .scp files), but we check them anyway for now. +for f in $dir/train.scp $dir/info.txt \ + $dir/heldout_subset.{ark,scp} $dir/train_subset.{ark,scp} \ + $dir/train.1.scp $dir/train.1.ark; do + if ! [ -f $f -a -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi +done + + +if [ $(awk '/^dir_type/ { print $2; }' <$dir/info.txt) != "processed_chaina_egs" ]; then + grep dir_type $dir/info.txt + echo "$0: dir_type should be processed_chaina_egs in $dir/info.txt" + exit 1 +fi + +lang=$(awk '/^lang / {print $2; }' <$dir/info.txt) + +for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do + if ! [ -f $f -a -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi +done + +echo "$0: sucessfully validated processed egs in $dir" diff --git a/egs/wsj/s5/steps/chaina/validate_randomized_egs.sh b/egs/wsj/s5/steps/chaina/validate_randomized_egs.sh new file mode 100755 index 00000000000..1eebc144347 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/validate_randomized_egs.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script validates a directory containing 'randomized' egs for 'chaina' +# training, i.e. the output of randomize_egs.sh (this is the final form of the +# egs which is consumed by the training script). It also helps to document the +# expectations on such a directory. + + +if [ -f path.sh ]; then . 
./path.sh; fi + + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/chaina/tdnn1a_sp/egs" + echo "" + echo "Validates that the final (randomized) egs dir has the expected format" +fi + +dir=$1 + +# Note: the .ark files are not actually consumed directly downstream (only via +# the top-level .scp files), but we check them anyway for now. +for f in $dir/train.1.scp $dir/info.txt \ + $dir/heldout_subset.scp $dir/train_subset.scp; do + if ! [ -f $f -a -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi +done + + +if [ $(awk '/^dir_type/ { print $2; }' <$dir/info.txt) != "randomized_chaina_egs" ]; then + grep dir_type $dir/info.txt + echo "$0: dir_type should be randomized_chaina_egs in $dir/info.txt" + exit 1 +fi + +langs=$(awk '/^langs / {$1 = ""; print; }' <$dir/info.txt) +num_scp_files=$(awk '/^num_scp_files / { print $2; }' <$dir/info.txt) + +if [ -z "$langs" ]; then + echo "$0: expecting the list of languages to be nonempty in $dir/info.txt" + exit 1 +fi + +for lang in $langs; do + for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst} $dir/info_${lang}.txt; do + if ! [ -f $f -a -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi + done +done + +for i in $(seq $num_scp_files); do + if ! [ -s $dir/train.$i.scp ]; then + echo "$0: expected file $dir/train.$i.scp to exist and be nonempty." + exit 1 + fi +done + + +echo "$0: sucessfully validated randomized egs in $dir" diff --git a/egs/wsj/s5/steps/chaina/validate_raw_egs.sh b/egs/wsj/s5/steps/chaina/validate_raw_egs.sh new file mode 100755 index 00000000000..5e15bc0c897 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/validate_raw_egs.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script validates a directory containing 'raw' egs for 'chaina' training. +# It also helps to document the expectations on such a directory. + + + +if [ -f path.sh ]; then . ./path.sh; fi + + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/chaina/tdnn1a_sp/raw_egs" + echo "" + echo "Validates that the raw-egs dir has the expected format" +fi + +dir=$1 + +for f in $dir/all.scp $dir/cegs.1.ark $dir/info.txt \ + $dir/misc/utt2spk; do + if ! [ -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi +done + + +if [ $(awk '/^dir_type/ { print $2; }' <$dir/info.txt) != "raw_chaina_egs" ]; then + grep dir_type $dir/info.txt + echo "$0: dir_type should be raw_chaina_egs in $dir/info.txt" + exit 1 +fi + +lang=$(awk '/^lang / {print $2; }' <$dir/info.txt) + +for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do + if ! [ -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi +done + +echo "$0: sucessfully validated raw egs in $dir" diff --git a/egs/wsj/s5/steps/diagnostic/analyze_lats.sh b/egs/wsj/s5/steps/diagnostic/analyze_lats.sh index df1a6d64801..6b6091e8684 100755 --- a/egs/wsj/s5/steps/diagnostic/analyze_lats.sh +++ b/egs/wsj/s5/steps/diagnostic/analyze_lats.sh @@ -9,6 +9,7 @@ # begin configuration section. iter=final cmd=run.pl +model= acwt=0.1 #end configuration section. @@ -22,6 +23,10 @@ if [ $# -ne 2 ]; then echo " Options:" echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." echo " --acwt # Acoustic scale for getting best-path (default: 0.1)" + echo " --iter # default: final; affects model location if --model" + echo " # not specified." 
+ echo " --model # Name of .mdl file (if not specified, defaults" + echo " # to /../.mdl if not specified." echo "e.g.:" echo "$0 data/lang exp/tri4b/decode_dev" echo "This script writes some diagnostics to /log/alignments.log" @@ -31,7 +36,9 @@ fi lang=$1 dir=$2 -model=$dir/../${iter}.mdl +if [ -z $model ]; then + model=$dir/../${iter}.mdl +fi for f in $lang/words.txt $model $dir/lat.1.gz $dir/num_jobs; do [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; diff --git a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py index 0ad93e5977d..d890f8007e6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py @@ -186,9 +186,22 @@ def _get_component_dropout(dropout_schedule, data_fraction): def _get_dropout_proportions(dropout_schedule, data_fraction): """Returns dropout proportions based on the dropout_schedule for the - fraction of data seen at this stage of training. + fraction of data seen at this stage of training. Returns a list of + pairs (pattern, dropout_proportion); for instance, it might return + the list ['*', 0.625] meaning a dropout proportion of 0.625 is to + be applied to all dropout components. + Returns None if dropout_schedule is None. + dropout_schedule might be (in the sample case using the default pattern of + '*'): '0.1,0.5@0.5,0.1', meaning a piecewise linear function that starts at + 0.1 when data_fraction=0.0, rises to 0.5 when data_fraction=0.5, and falls + again to 0.1 when data_fraction=1.0. It can also contain space-separated + items of the form 'pattern=schedule', for instance: + '*=0.0,0.5,0.0 lstm.*=0.0,0.3@0.75,0.0' + The more specific patterns should go later, otherwise they will be overridden + by the less specific patterns' commands. + Calls _get_component_dropout() for the different component name patterns in dropout_schedule. @@ -198,6 +211,7 @@ def _get_dropout_proportions(dropout_schedule, data_fraction): See _self_test() for examples. data_fraction: The fraction of data seen until this stage of training. + """ if dropout_schedule is None: return None @@ -210,14 +224,21 @@ def _get_dropout_proportions(dropout_schedule, data_fraction): return dropout_proportions -def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): - """Return an nnet3-copy --edits line to modify raw_model_string to - set dropout proportions according to dropout_proportions. + +def get_dropout_edit_option(dropout_schedule, data_fraction, iter_): + """Return an option to be passed to nnet3-copy (or nnet3-am-copy) + that will set the appropriate dropout proportion. If no dropout + is being used (dropout_schedule is None), returns the empty + string, otherwise returns something like + "--edits='set-dropout-proportion name=* proportion=0.625'" Arguments: dropout_schedule: Value for the --trainer.dropout-schedule option. See help for --trainer.dropout-schedule. See _self_test() for examples. + data_fraction: real number in [0,1] that says how far along + in training we are. + iter_: iteration number (needed for debug printing only) See ReadEditConfig() in nnet3/nnet-utils.h to see how set-dropout-proportion directive works. 
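[Editor's aside, not part of the patch: the short sketch below illustrates the piecewise-linear interpolation that a single-pattern schedule string such as '0.1,0.5@0.5,0.1' describes. It is not the parser this module actually uses; the function name dropout_at is made up, and it assumes every intermediate value carries an explicit '@data_fraction'.]

def dropout_at(schedule, data_fraction):
    # First value applies at data_fraction=0.0, last at 1.0; intermediate
    # entries look like 'value@fraction'.
    entries = schedule.split(',')
    points = [(0.0, float(entries[0]))]
    for entry in entries[1:-1]:
        value, frac = entry.split('@')
        points.append((float(frac), float(value)))
    points.append((1.0, float(entries[-1])))
    # Find the segment containing data_fraction and interpolate linearly.
    for (x0, y0), (x1, y1) in zip(points, points[1:]):
        if x0 <= data_fraction <= x1:
            return y0 + (y1 - y0) * (data_fraction - x0) / (x1 - x0)
    raise ValueError("data_fraction must be in [0, 1]")

# For example, dropout_at('0.1,0.5@0.5,0.1', 0.75) returns 0.3.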
@@ -241,9 +262,39 @@ def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): if _debug_dropout: logger.info("On iteration %d, %s", iter_, ', '.join(dropout_info)) - return ("""nnet3-copy --edits='{edits}' - - |""".format( - edits=";".join(edit_config_lines))) + return "--edits='{0}'".format(";".join(edit_config_lines)) + + +def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): + """Return an nnet3-copy --edits line to modify raw_model_string to + set dropout proportions according to dropout_proportions. + E.g. if _dropout_proportions(dropout_schedule, data_fraction) + returns [('*', 0.625)], this will return the string: + "nnet3-copy --edits='set-dropout-proportion name=* proportion=0.625'" + + This is a wrapper of the function get_dropout_edit_option which + gets the --edits option; this function just adds the nnet3-copy + and its arguments. + + Arguments: + dropout_schedule: Value for the --trainer.dropout-schedule option. + See help for --trainer.dropout-schedule. + See _self_test() for examples. + data_fraction: real number in [0,1] that says how far along + in training we are. + iter_: iteration number (needed for debug printing only) + + See ReadEditConfig() in nnet3/nnet-utils.h to see how + set-dropout-proportion directive works. + """ + + edit_option = get_dropout_edit_option(dropout_schedule, data_fraction, iter_) + + if edit_option == "": + return "" + else: + return ("nnet3-copy {0} - - |".format(edit_option)) def _self_test(): """Run self-test. diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 5ac2ed59003..b540423e3cd 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -27,6 +27,7 @@ 'relu-batchnorm-layer' : xlayers.XconfigBasicLayer, 'relu-batchnorm-so-layer' : xlayers.XconfigBasicLayer, 'batchnorm-so-relu-layer' : xlayers.XconfigBasicLayer, + 'batchnorm-layer' : xlayers.XconfigBasicLayer, 'sigmoid-layer' : xlayers.XconfigBasicLayer, 'tanh-layer' : xlayers.XconfigBasicLayer, 'fixed-affine-layer' : xlayers.XconfigFixedAffineLayer, diff --git a/egs/wsj/s5/steps/nnet3/chain/align_lats.sh b/egs/wsj/s5/steps/nnet3/chain/align_lats.sh new file mode 100755 index 00000000000..ed10735245d --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/align_lats.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# Copyright 2012 Brno University of Technology (Author: Karel Vesely) +# 2013 Johns Hopkins University (Author: Daniel Povey) +# 2015 Vijayaditya Peddinti +# 2016 Vimal Manohar +# 2017 Pegah Ghahremani +# Apache 2.0 + +# Computes training alignments using nnet3 DNN, with output to lattices. + +# Begin configuration section. +nj=4 +cmd=run.pl +stage=-1 +# Begin configuration. +scale_opts="--transition-scale=1.0 --self-loop-scale=1.0" +acoustic_scale=1.0 +post_decode_acwt=10.0 +beam=20 +iter=final +frames_per_chunk=50 +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +graphs_scp= +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 
+ exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +mkdir -p $dir/log +echo $nj > $dir/num_jobs +sdata=$data/split${nj} +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || \ + split_data.sh $data $nj || exit 1; + +extra_files= +if [ ! -z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi + +for f in $srcdir/tree $srcdir/${iter}.mdl $data/feats.scp $lang/L.fst $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +cp $srcdir/{tree,${iter}.mdl} $dir || exit 1; + +utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; +## Set up features. Note: these are different from the normal features +## because we have one rspecifier that has the features for the entire +## training set, not separate ones for each batch. +echo "$0: feature type is raw" + +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null + +feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" + +ivector_opts= +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir" + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor) + frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" + cp $srcdir/frame_subsampling_factor $dir + if [ "$frame_subsampling_factor" -gt 1 ] && \ + [ "$scale_opts" == "--transition-scale=1.0 --self-loop-scale=0.1" ]; then + echo "$0: frame-subsampling-factor is not 1 (so likely a chain system)," + echo "... but the scale opts are the defaults. You probably want" + echo "--scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'" + sleep 1 + fi +fi + +if [ ! -z "$graphs_scp" ]; then + if [ ! -f $graphs_scp ]; then + echo "Could not find graphs $graphs_scp" && exit 1 + fi + tra="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $graphs_scp |" + prog=compile-train-graphs-fsts +else + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + prog=compile-train-graphs +fi + +if [ $stage -le 0 ]; then + ## because nnet3-latgen-faster doesn't support adding the transition-probs to the + ## graph itself, we need to bake them into the compiled graphs. This means we can't reuse previously compiled graphs, + ## because the other scripts write them without transition probs. + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + $prog --read-disambig-syms=$lang/phones/disambig.int \ + $scale_opts \ + $dir/tree $srcdir/${iter}.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1 +fi + +if [ $stage -le 1 ]; then + # Warning: nnet3-latgen-faster doesn't support a retry-beam so you may get more + # alignment errors (however, it does have a default min-active=200 so this + # will tend to reduce alignment errors). + # --allow_partial=false makes sure we reach the end of the decoding graph. 
+ # --word-determinize=false makes sure we retain the alternative pronunciations of + # words (including alternatives regarding optional silences). + # --lattice-beam=$beam keeps all the alternatives that were within the beam, + # it means we do no pruning of the lattice (lattices from a training transcription + # will be small anyway). + $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \ + nnet3-latgen-faster --acoustic-scale=$acoustic_scale $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --beam=$beam --lattice-beam=$beam \ + --allow-partial=false --word-determinize=false \ + $srcdir/${iter}.mdl "ark:gunzip -c $dir/fsts.JOB.gz |" \ + "$feats" "ark:|lattice-copy --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" || exit 1; +fi + +echo "$0: done generating lattices from training transcripts." diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh index 757963f13a7..6fcbc472412 100755 --- a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh @@ -27,6 +27,11 @@ leftmost_questions_truncate=-1 # note: this option is deprecated and has no eff tree_stats_opts= cluster_phones_opts= repeat_frames=false +num_clusters= # e.g. 200; can be used if you want a 2-level tree, and + # in that case the file tree.map will be output, which + # maps from the leaves to (effectively) clusters of + # leaves. We'll also output the file num_clusters which is + # the number of these clusters (normally == the option). # End configuration section. echo "$0 $@" # Print the command line for logging @@ -58,6 +63,13 @@ if [ $# != 5 ]; then echo " --frame-subsampling-factor # Factor (e.g. 3) controlling frame subsampling" echo " # at the neural net output, so the frame rate at" echo " # the output is less than at the input." + echo " --alignment-subsampling-factor # Factor controlling subsampling of the input alignment." + echo " # By default it equal to the frame-subsampling-factor," + echo " # but (e.g.) if you use a low-frame-rate system to" + echo " # generate alignments, you might want to set this to 1." + echo " --num-clusters # Default: none. E.g. 200; can be used if you want" + echo " # a 2-level tree. Used in 'chaina' setup. The file" + echo " # tree.map will be output in this case." exit 1; fi @@ -168,11 +180,28 @@ if [ $stage -le -3 ] && $train_tree; then compile-questions $context_opts $lang/topo \ $dir/questions.int $dir/questions.qst || exit 1; - echo "$0: Building the tree" - $cmd $dir/log/build_tree.log \ - build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ - --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ - $dir/questions.qst $lang/topo $dir/tree || exit 1; + if [ -z "$num_clusters" ]; then + # normal case: single tree. + echo "$0: Building the tree" + $cmd $dir/log/build_tree.log \ + build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; + else + if ! 
[ $num_clusters -lt $numleaves ]; then + echo "$0: --num-clusters=$num_clusters must be less than num-leaves=$numleaves" + exit 1; + fi + $cmd $dir/log/build_tree.log \ + build-tree-two-level $context_opts --verbose=1 \ + --max-leaves-first=$num_clusters --max-leaves-second=$numleaves \ + $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree \ + "|copy-int-vector --binary=false - $dir/tree.map" || exit 1; + num_clusters_effective=$(cat $dir/tree.map awk '{nc=0; for(n=2;n=nc) nc=1+$n; }END{print nc}') + echo $num_clusters_effective >$dir/num_clusters + echo "$0: you requested --num-clusters=$num_clusters, you got 2nd-level tree num-leaves=$num_clusters_effective" + fi fi if [ $stage -le -2 ]; then diff --git a/egs/wsj/s5/steps/nnet3/compute_output.sh b/egs/wsj/s5/steps/nnet3/compute_output.sh index e55f705043b..1f61e97876e 100755 --- a/egs/wsj/s5/steps/nnet3/compute_output.sh +++ b/egs/wsj/s5/steps/nnet3/compute_output.sh @@ -35,6 +35,7 @@ if [ $# -ne 3 ]; then echo "e.g.: steps/nnet3/compute_output.sh --nj 8 \\" echo "--online-ivector-dir exp/nnet3/ivectors_test_eval92 \\" echo " data/test_eval92_hires exp/nnet3/tdnn exp/nnet3/tdnn/output" + echo "Output will be in /output.scp" echo "main options (for others, see top of script file)" echo " --config # config containing options" echo " --nj # number of parallel jobs" diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 14dda2bd457..adf686fa10e 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -19,7 +19,7 @@ min_active=200 ivector_scale=1.0 lattice_beam=8.0 # Beam we use in lattice generation. iter=final -num_threads=1 # if >1, will use gmm-latgen-faster-parallel +num_threads=1 # if >1, will use nnet3-latgen-faster-parallel use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. # In that case it is recommended to set num-threads to a large # number, e.g. 20 if you have that many free CPU slots on a GPU diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_config.py b/egs/wsj/s5/steps/nnet3/xconfig_to_config.py new file mode 100755 index 00000000000..e234ea732d4 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_config.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 + +# Copyright 2016-2018 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2017 Google Inc. (vpeddinti@google.com) +# Apache 2.0. + +# we're using python 3.x style print but want it to work in python 2.x, + +import argparse +import os +import sys +from collections import defaultdict + +sys.path.insert(0, 'steps/') +# the following is in case we weren't running this from the normal directory. +sys.path.insert(0, os.path.realpath(os.path.dirname(sys.argv[0])) + '/') + +import libs.nnet3.xconfig.parser as xparser +import libs.common as common_lib + + +def get_args(): + # we add compulsory arguments as named arguments for readability + parser = argparse.ArgumentParser( + description="Reads an xconfig file and creates config files " + "for neural net creation and training", + epilog='Search egs/*/*/local/{nnet3,chain}/*sh for examples') + parser.add_argument('--xconfig-file', required=True, + help='Filename of input xconfig file') + parser.add_argument('--existing-model', + help='Filename of previously trained neural net ' + '(e.g. final.mdl) which is useful in case of ' + 'using nodes from list of component-nodes in ' + 'already trained model ' + 'to generate new config file for new model.' 
+ 'The context info is also generated using ' + 'a model generated by adding final.config ' + 'to the existing model.' + 'e.g. In Transfer learning: generate new model using ' + 'component nodes in existing model.') + parser.add_argument('--config-file-out', required=True, + help='Filename to write nnet config file.'); + parser.add_argument('--nnet-edits', type=str, default=None, + action=common_lib.NullstrToNoneAction, + help="""This option is useful in case the network you + are creating does not have an output node called + 'output' (e.g. for multilingual setups). You can set + this to an edit-string like: 'rename-node old-name=xxx + new-name=output' if node xxx plays the role of the + output node in this network. This is only used for + computing the left/right context.""") + + print(' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + + return args + + + +def write_config_file(config_file_out, all_layers): + # config_basename_to_lines is map from the basename of the + # config, as a string (i.e. 'ref', 'all', 'init') to a list of + # strings representing lines to put in the config file. + config_basename_to_lines = defaultdict(list) + + for layer in all_layers: + try: + pairs = layer.get_full_config() + for config_basename, line in pairs: + config_basename_to_lines[config_basename].append(line) + except Exception as e: + print("{0}: error producing config lines from xconfig " + "line '{1}': error was: {2}".format(sys.argv[0], + str(layer), repr(e)), + file=sys.stderr) + # we use raise rather than raise(e) as using a blank raise + # preserves the backtrace + raise + + with open(config_file_out, 'w') as f: + print('# This file was created by the command:\n' + '# {0} '.format(sys.argv), file=f) + lines = config_basename_to_lines['final'] + for line in lines: + print(line, file=f) + + +def main(): + args = get_args() + existing_layers = [] + if args.existing_model is not None: + existing_layers = xparser.get_model_component_info(args.existing_model) + all_layers = xparser.read_xconfig_file(args.xconfig_file, existing_layers) + write_config_file(args.config_file_out, all_layers) + + +if __name__ == '__main__': + main() + + +# test: +# (echo 'input dim=40 name=input'; echo 'output name=output input=Append(-1,0,1)') >xconfig; steps/nnet3/xconfig_to_config.py --xconfig-file=xconfig --config-file-out=foo diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index f025eb5b343..4d96ef5db43 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -39,8 +39,13 @@ def get_args(): 'to the existing model.' 'e.g. In Transfer learning: generate new model using ' 'component nodes in existing model.') - parser.add_argument('--config-dir', required=True, - help='Directory to write config files and variables') + parser.add_argument('--config-dir', required=False, + help='Directory to write config files and variables; either ' + 'this or --config-out must be specified.') + parser.add_argument('--config-out', required=False, + help='Filename to write nnet config file. This is the ' + 'simplified interface that does not support lda-layer. 
' + 'Either this or --config-dir must be supplied.') parser.add_argument('--nnet-edits', type=str, default=None, action=common_lib.NullstrToNoneAction, help="""This option is useful in case the network you @@ -141,7 +146,7 @@ def write_expanded_xconfig_files(config_dir, all_layers): def get_config_headers(): """ This function returns a map from config-file basename - e.g. 'init', 'ref', 'layer1' to a documentation string that goes + e.g. 'init', 'ref', 'final' to a documentation string that goes at the top of the file. """ # resulting dict will default to the empty string for any config files not @@ -230,6 +235,41 @@ def write_config_files(config_dir, all_layers): raise +# This is an alternative to 'write_config_files' where a single output +# file is desired (would correspond to the output 'final.config' in the +# normal setup). In this case, things like LDA and presoftmax are not +# supported. +def write_single_config_file(config_file_out, all_layers): + # config_basename_to_lines is map from the basename of the + # config, as a string (i.e. 'ref', 'all', 'init') to a list of + # strings representing lines to put in the config file. + config_basename_to_lines = defaultdict(list) + + config_basename_to_header = get_config_headers() + + for layer in all_layers: + try: + pairs = layer.get_full_config() + for config_basename, line in pairs: + config_basename_to_lines[config_basename].append(line) + except Exception as e: + print("{0}: error producing config lines from xconfig " + "line '{1}': error was: {2}".format(sys.argv[0], + str(layer), repr(e)), + file=sys.stderr) + # we use raise rather than raise(e) as using a blank raise + # preserves the backtrace + raise + + + with open(config_file_out, 'w') as f: + header = config_basename_to_header['final'] + print(header, file=f) + lines = config_basename_to_lines['final'] + for line in lines: + print(line, file=f) + + def add_nnet_context_info(config_dir, nnet_edits=None, existing_model=None): """Create the 'vars' file that specifies model_left_context, etc.""" diff --git a/src/Makefile b/src/Makefile index 1b37ebce745..737a26338ca 100644 --- a/src/Makefile +++ b/src/Makefile @@ -6,16 +6,16 @@ SHELL := /bin/bash SUBDIRS = base matrix util feat tree gmm transform \ - fstext hmm lm decoder lat kws cudamatrix nnet \ + fstext hmm lm decoder lat kws cudamatrix adapt nnet \ bin fstbin gmmbin fgmmbin featbin \ - nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 rnnlm chain nnet3bin nnet2bin kwsbin \ - ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin + nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 nnet3a rnnlm chain nnet3bin nnet2bin kwsbin \ + ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin nnet3abin MEMTESTDIRS = base matrix util feat tree gmm transform \ - fstext hmm lm decoder lat nnet kws chain \ + fstext hmm lm decoder lat nnet kws chain nnet3a \ bin fstbin gmmbin fgmmbin featbin \ nnetbin latbin sgmm2 nnet2 nnet3 rnnlm nnet2bin nnet3bin sgmm2bin kwsbin \ - ivector ivectorbin online2 online2bin lmbin + ivector ivectorbin online2 online2bin lmbin nnet3abin CUDAMEMTESTDIR = cudamatrix @@ -150,7 +150,7 @@ $(EXT_SUBDIRS) : mklibdir ext_depend ### Dependency list ### # this is necessary for correct parallel compilation #1)The tools depend on all the libraries -bin fstbin gmmbin fgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin: \ +bin fstbin gmmbin fgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin nnet3abin: \ base 
matrix util feat tree gmm transform sgmm2 fstext hmm \ lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2 rnnlm @@ -169,9 +169,11 @@ lm: base util matrix fstext decoder: base util matrix gmm hmm tree transform lat lat: base util hmm tree matrix cudamatrix: base util matrix +adapt: base util matrix hmm cudamatrix nnet: base util hmm tree matrix cudamatrix nnet2: base util matrix lat gmm hmm tree transform cudamatrix nnet3: base util matrix lat gmm hmm tree transform cudamatrix chain fstext +nnet3a: base util matrix lat gmm hmm tree transform cudamatrix adapt nnet3 chain fstext rnnlm: base util matrix cudamatrix nnet3 lm hmm chain: lat hmm tree fstext matrix cudamatrix util base ivector: base util matrix transform tree gmm diff --git a/src/adapt/Makefile b/src/adapt/Makefile new file mode 100644 index 00000000000..25c016b4e6d --- /dev/null +++ b/src/adapt/Makefile @@ -0,0 +1,19 @@ +all: + +include ../kaldi.mk + +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + +TESTFILES = differentiable-fmllr-test differentiable-transform-test + +OBJFILES = differentiable-fmllr.o differentiable-transform-itf.o \ + generic-transform.o differentiable-transform.o + +LIBNAME = kaldi-adapt + +ADDLIBS = ../cudamatrix/kaldi-cudamatrix.a ../hmm/kaldi-hmm.a \ + ../matrix/kaldi-matrix.a ../util/kaldi-util.a \ + ../base/kaldi-base.a + +include ../makefiles/default_rules.mk diff --git a/src/adapt/differentiable-fmllr-test.cc b/src/adapt/differentiable-fmllr-test.cc new file mode 100644 index 00000000000..86f3b924418 --- /dev/null +++ b/src/adapt/differentiable-fmllr-test.cc @@ -0,0 +1,639 @@ +// adapt/differentiable-fmllr-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "adapt/differentiable-fmllr.h" +#include "matrix/sp-matrix.h" + +namespace kaldi { +namespace differentiable_transform { + + + +// Test derivatives produced by the Estimator object for K. +void TestCoreFmllrEstimatorKDeriv( + BaseFloat gamma, + const Matrix &G, + const Matrix &K, + const Matrix &A, + CoreFmllrEstimator *estimator) { + + int32 num_directions = 4; + Vector expected_changes(num_directions), + actual_changes(num_directions); + + int32 dim = G.NumRows(); + BaseFloat epsilon = 1.0e-03 * gamma; + Matrix A_deriv(dim, dim); + // A_deriv defines the objective function: a random linear function in A. + A_deriv.SetRandn(); + A_deriv.Add(0.1); // Introduce some asymmetry. 
+ + Matrix G_deriv(dim, dim), + K_deriv(dim, dim); + estimator->Backward(A_deriv, &G_deriv, &K_deriv); + + for (int32 i = 0; i < num_directions; i++) { + Matrix K_new(dim, dim); + K_new.SetRandn(); + K_new.Scale(epsilon); + expected_changes(i) = TraceMatMat(K_new, K_deriv, kTrans); + K_new.AddMat(1.0, K); + FmllrEstimatorOptions opts; + Matrix A_new(dim, dim); + CoreFmllrEstimator estimator2(opts, gamma, G, K_new, &A_new); + estimator2.Forward(); + A_new.AddMat(-1.0, A); + // compute the change in our random linear objective function defined by + // A_deriv, that would be produced by taking some small random change in K + // and computing the A that results from that. + actual_changes(i) = TraceMatMat(A_new, A_deriv, kTrans); + } + + KALDI_LOG << "Expected changes: " << expected_changes + << ", actual changes: " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + +// Test derivatives produced by the Estimator object for G. +void TestCoreFmllrEstimatorGDeriv( + BaseFloat gamma, + const Matrix &G, + const Matrix &K, + const Matrix &A, + CoreFmllrEstimator *estimator) { + + int32 num_directions = 4; + Vector expected_changes(num_directions), + actual_changes(num_directions); + + int32 dim = G.NumRows(); + BaseFloat epsilon = 1.0e-03 * gamma; + Matrix A_deriv(dim, dim); + // A_deriv defines the objective function: a random linear function in A. + A_deriv.SetRandn(); + A_deriv.Add(0.1); // Introduce some asymmetry. + + Matrix G_deriv(dim, dim), + K_deriv(dim, dim); + estimator->Backward(A_deriv, &G_deriv, &K_deriv); + + KALDI_ASSERT(G_deriv.IsSymmetric()); + + for (int32 i = 0; i < num_directions; i++) { + Matrix G_new(dim, dim); + { + SpMatrix s(dim); + s.SetRandn(); + G_new.CopyFromSp(s); + } + G_new.Scale(epsilon); + expected_changes(i) = TraceMatMat(G_new, G_deriv, kTrans); + G_new.AddMat(1.0, G); + FmllrEstimatorOptions opts; + Matrix A_new(dim, dim); + CoreFmllrEstimator estimator2(opts, gamma, G_new, K, &A_new); + estimator2.Forward(); + A_new.AddMat(-1.0, A); + // compute the change in our random linear objective function defined by + // A_deriv, that would be produced by taking some small random change in K + // and computing the A that results from that. + actual_changes(i) = TraceMatMat(A_new, A_deriv, kTrans); + } + + KALDI_LOG << "Expected changes: " << expected_changes + << ", actual changes: " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. 
" + << actual_changes; + } +} + + + +void UnitTestCoreFmllrEstimatorSimple() { + int32 dim = RandInt(10, 20); + BaseFloat gamma = RandInt(5, 10); + Matrix G(dim, dim), + K(dim, dim), A(dim, dim, kUndefined); + G.AddToDiag(1.234 * gamma); + K.AddToDiag(0.234 * gamma); + FmllrEstimatorOptions opts; + CoreFmllrEstimator estimator(opts, gamma, G, K, &A); + BaseFloat objf_impr = estimator.Forward(); + KALDI_LOG << "A is " << A; + KALDI_ASSERT(A.IsUnit(0.01)); + KALDI_ASSERT(fabs(objf_impr) < 0.01); + for (int32 i = 0; i < 5; i++) { + TestCoreFmllrEstimatorKDeriv(gamma, G, K, A, &estimator); + TestCoreFmllrEstimatorGDeriv(gamma, G, K, A, &estimator); + } +} + +static void InitRandNonsingular(MatrixBase *M) { + do { + M->SetRandn(); + } while (M->Cond() > 50.0); +} + + +void UnitTestCoreFmllrEstimatorGeneral() { + int32 dim = RandInt(10, 20); + BaseFloat gamma = RandInt(5, 10); + Matrix G(dim, dim), + K(dim, dim), A(dim, dim, kUndefined); + + { + // make sure G is symmetric and +ve definite. + Matrix A(dim, dim + 10); + A.SetRandn(); + G.AddMatMat(gamma, A, kNoTrans, A, kTrans, 0.0); + } + + InitRandNonsingular(&K); + K.Scale(gamma); + FmllrEstimatorOptions opts; + CoreFmllrEstimator estimator(opts, gamma, G, K, &A); + BaseFloat objf_impr = estimator.Forward(); + KALDI_LOG << "A is " << A << ", objf impr is " << objf_impr; + for (int32 i = 0; i < 5; i++) { + TestCoreFmllrEstimatorKDeriv(gamma, G, K, A, &estimator); + TestCoreFmllrEstimatorGDeriv(gamma, G, K, A, &estimator); + } +} + +void TestGaussianEstimatorDerivs(const MatrixBase &feats, + const Posterior &post, + const FmllrEstimatorOptions &opts, + GaussianEstimator *g) { + int32 n = 4; // number of delta-params we use. + Vector expected_changes(n), + actual_changes(n); + + // if !test_mean_deriv, then we test the var deriv. + bool test_mean_deriv = (RandInt(0, 1) == 0); + + int32 num_classes = g->NumClasses(), dim = g->Dim(); + + Matrix mean_derivs(num_classes, dim); + Vector var_derivs(num_classes); + if (test_mean_deriv) { + KALDI_LOG << "Testing mean derivs."; + mean_derivs.SetRandn(); + } else { + KALDI_LOG << "Testing var derivs."; + var_derivs.SetRandn(); + var_derivs.Add(0.2); // Nonzero mean makes the test easier to pass + } + g->AddToOutputDerivs(mean_derivs, var_derivs); + Matrix feats_deriv(feats.NumRows(), feats.NumCols()); + g->AccStatsBackward(feats, post, &feats_deriv); + + BaseFloat epsilon = 1.0e-04; + + for (int32 i = 0; i < n; i++) { + Matrix new_feats(feats.NumRows(), + feats.NumCols()); + new_feats.SetRandn(); + new_feats.Scale(epsilon); + + expected_changes(i) = TraceMatMat(feats_deriv, new_feats, kTrans); + + new_feats.AddMat(1.0, feats); + + GaussianEstimator g2(num_classes, dim); + g2.AccStats(new_feats, post); + g2.Estimate(opts); + + actual_changes(i) = + TraceMatMat(mean_derivs, g2.GetMeans(), kTrans) - + TraceMatMat(mean_derivs, g->GetMeans(), kTrans) + + VecVec(var_derivs, g2.GetVars()) - + VecVec(var_derivs, g->GetVars()); + } + KALDI_LOG << "Actual changes are " << actual_changes + << " vs. predicted " << expected_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. 
" + << actual_changes; + } +} + +void TestFmllrEstimatorMeanDerivs(const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + const MatrixBase &mu(g.GetMeans()); + const VectorBase &s(g.GetVars()); + + int32 T = feats.NumRows(), dim = feats.NumCols(), + num_classes = mu.NumRows(); + + FmllrEstimatorOptions opts; + + FmllrEstimator f(opts, mu, s); + + Matrix adapted_feats(T, dim, kUndefined); + BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); + KALDI_LOG << "Forward objf-impr per frame (with same features) is " + << objf_impr; + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. + + f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + const MatrixBase &mu_deriv = f.GetMeanDeriv(); + + // measure the accuracy of the deriv in 4 random directions. + int32 n = 4; + BaseFloat epsilon = 1.0e-04; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Matrix new_mu(num_classes, dim, kUndefined), + new_adapted_feats(T, dim, kUndefined); + new_mu.SetRandn(); + // adding a systematic component helps the test to succeed in low precision. + for (int32 c = 0; c < num_classes; c++) { + new_mu.Row(c).Add(0.1 * RandInt(-1, 1)); + } + new_mu.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_mu, mu_deriv, kTrans); + new_mu.AddMat(1.0, mu); + FmllrEstimator f2(opts, new_mu, s); + f2.ForwardCombined(feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + +void TestFmllrEstimatorVarDerivs(const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + const MatrixBase &mu(g.GetMeans()); + const VectorBase &s(g.GetVars()); + + int32 T = feats.NumRows(), dim = feats.NumCols(), + num_classes = mu.NumRows(); + + FmllrEstimatorOptions opts; + + FmllrEstimator f(opts, mu, s); + + Matrix adapted_feats(T, dim, kUndefined); + BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); + KALDI_LOG << "Forward objf-impr per frame (with same features) is " + << objf_impr; + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + // Adding a systematic component to the derivative makes the test easier + // to pass, as the derivs are less random. 
+ adapted_feats_deriv.AddMat(0.1, feats); + + f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + const VectorBase &s_deriv = f.GetVarDeriv(); + + // measure the accuracy of the deriv in 10 random directions + int32 n = 10; + BaseFloat epsilon = 0.001; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Vector new_s(num_classes, kUndefined); + Matrix new_adapted_feats(T, dim, kUndefined); + new_s.SetRandn(); + new_s.Scale(epsilon); + expected_changes(i) = VecVec(new_s, s_deriv); + new_s.AddVec(1.0, s); + FmllrEstimator f2(opts, mu, new_s); + f2.ForwardCombined(feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + + +void TestFmllrEstimatorSequence(const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + // Do two fMLLR's in a row and see if the change in objf decreases. + + int32 T = feats.NumRows(), dim = feats.NumCols(); + const MatrixBase &mu(g.GetMeans()); + const VectorBase &s(g.GetVars()); + FmllrEstimatorOptions opts; + + FmllrEstimator f(opts, mu, s); + + Matrix adapted_feats(T, dim, kUndefined); + BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); + KALDI_LOG << "Forward objf-impr per frame (first time) is " + << objf_impr; + + + Matrix adapted_feats2(T, dim, kUndefined); + FmllrEstimator f2(opts, mu, s); + BaseFloat objf_impr2 = f.ForwardCombined(adapted_feats, post, &adapted_feats2); + KALDI_LOG << "Forward objf-impr per frame (second time) is " + << objf_impr2; +} + +void TestFmllrEstimatorFeatDerivs(const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + int32 T = feats.NumRows(), dim = feats.NumCols(); + const MatrixBase &mu(g.GetMeans()); + const VectorBase &s(g.GetVars()); + + FmllrEstimatorOptions opts; + + FmllrEstimator f(opts, mu, s); + + Matrix adapted_feats(T, dim, kUndefined); + BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); + KALDI_LOG << "Forward objf-impr per frame (with same features) is " + << objf_impr; + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. + + f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + // measure the accuracy of the deriv in 4 random directions. + int32 n = 4; + BaseFloat epsilon = 1.0e-03; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Matrix new_feats(T, dim, kUndefined), + new_adapted_feats(T, dim, kUndefined); + new_feats.SetRandn(); + new_feats.Add(RandGauss()); // will help to test whether the indirect + // part of the derivative is accurate. 
+ new_feats.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_feats, feats_deriv, kTrans); + new_feats.AddMat(1.0, feats); + FmllrEstimator f2(opts, mu, s); + f2.ForwardCombined(new_feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + + +void TestMeanOnlyTransformEstimatorMeanDerivs( + const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + const MatrixBase &mu(g.GetMeans()); + + int32 T = feats.NumRows(), dim = feats.NumCols(), + num_classes = mu.NumRows(); + + MeanOnlyTransformEstimator m(mu); + + Matrix adapted_feats(T, dim, kUndefined); + m.ForwardCombined(feats, post, &adapted_feats); + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. + + m.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + const MatrixBase &mu_deriv = m.GetMeanDeriv(); + + // measure the accuracy of the deriv in 4 random directions. + int32 n = 4; + BaseFloat epsilon = 1.0e-03; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Matrix new_mu(num_classes, dim, kUndefined), + new_adapted_feats(T, dim, kUndefined); + new_mu.SetRandn(); + // adding a systematic component helps the test to succeed in low precision. + for (int32 c = 0; c < num_classes; c++) { + new_mu.Row(c).Add(0.1 * RandInt(-1, 1)); + } + new_mu.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_mu, mu_deriv, kTrans); + new_mu.AddMat(1.0, mu); + MeanOnlyTransformEstimator m2(new_mu); + m2.ForwardCombined(feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + + +void TestMeanOnlyTransformEstimatorFeatDerivs( + const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + int32 T = feats.NumRows(), dim = feats.NumCols(); + const MatrixBase &mu(g.GetMeans()); + + + MeanOnlyTransformEstimator m(mu); + + Matrix adapted_feats(T, dim, kUndefined); + m.ForwardCombined(feats, post, &adapted_feats); + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. 
+ + m.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + // measure the accuracy of the deriv in 4 random directions. + int32 n = 4; + BaseFloat epsilon = 1.0e-03; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Matrix new_feats(T, dim, kUndefined), + new_adapted_feats(T, dim, kUndefined); + new_feats.SetRandn(); + new_feats.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_feats, feats_deriv, kTrans); + new_feats.AddMat(1.0, feats); + MeanOnlyTransformEstimator m2(mu); + m2.ForwardCombined(new_feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + + +void UnitTestGaussianAndEstimators() { + // It's important that the number of classes be greater than the dimension, or + // we would get a low-rank K. + int32 num_classes = RandInt(30, 40), + dim = RandInt(10, 20), + num_frames = RandInt(20 * num_classes, 40 * num_classes); + + GaussianEstimator g(num_classes, dim); + + Matrix feats(num_frames, dim); + feats.SetRandn(); + feats.Add(0.2); // Nonzero offset tests certain aspects of the code better. + Posterior post(num_frames); + for (int32 t = 0; t < num_frames; t++) { + int32 n = RandInt(0, 2); + for (int32 j = 0; j < n; j++) { + int32 i = RandInt(0, num_classes - 1); + BaseFloat p = 0.25 * RandInt(1, 5); + post[t].push_back(std::pair(i, p)); + } + } + g.AccStats(feats, post); + FmllrEstimatorOptions opts; + // avoid setting variance_sharing_weight to 1.0; it's hard for the tests to + // succeed then, and there are valid reasons for that + opts.variance_sharing_weight = 0.25 * RandInt(0, 2); + g.Estimate(opts); + KALDI_LOG << "Means are: " + << g.GetMeans() << ", vars are: " + << g.GetVars(); + + TestGaussianEstimatorDerivs(feats, post, opts, &g); + + if (RandInt(0, 1) == 0) { + opts.smoothing_count = 500.0; + } + + { // test FmllrEstimator + TestFmllrEstimatorSequence(feats, post, g); + TestFmllrEstimatorMeanDerivs(feats, post, g); + TestFmllrEstimatorFeatDerivs(feats, post, g); + TestFmllrEstimatorVarDerivs(feats, post, g); + } + + { // test MeanOnlyTransformEstimator. 
+ TestMeanOnlyTransformEstimatorMeanDerivs(feats, post, g); + TestMeanOnlyTransformEstimatorFeatDerivs(feats, post, g); + } + + + + +} + + + +} // namespace kaldi +} // namespace differentiable_transform + + + +int main() { + using namespace kaldi::differentiable_transform; + + for (int32 i = 0; i < 50; i++) { + UnitTestCoreFmllrEstimatorSimple(); + UnitTestCoreFmllrEstimatorGeneral(); + UnitTestGaussianAndEstimators(); + } + std::cout << "Test OK.\n"; +} diff --git a/src/adapt/differentiable-fmllr.cc b/src/adapt/differentiable-fmllr.cc new file mode 100644 index 00000000000..faabc7b1496 --- /dev/null +++ b/src/adapt/differentiable-fmllr.cc @@ -0,0 +1,888 @@ +// adapt/differentiable-fmllr.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "adapt/differentiable-fmllr.h" +#include "matrix/matrix-functions.h" + +namespace kaldi { +namespace differentiable_transform { + + +void FmllrEstimatorOptions::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, singular_value_relative_floor); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, variance_floor); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, variance_sharing_weight); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, smoothing_count); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, smoothing_between_class_factor); + WriteToken(os, binary, ""); +} + +void FmllrEstimatorOptions::Read(std::istream &is, bool binary) { + ExpectToken(is, binary, ""); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &singular_value_relative_floor); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &variance_floor); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &variance_sharing_weight); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &smoothing_count); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &smoothing_between_class_factor); + ExpectToken(is, binary, ""); +} + +void FmllrEstimatorOptions::ReadFromConfig(ConfigLine *config_line) { + config_line->GetValue("singular-value-relative-floor", + &singular_value_relative_floor); + config_line->GetValue("variance-floor", &variance_floor); + config_line->GetValue("variance-sharing-weight", &variance_sharing_weight); + config_line->GetValue("smoothing-count", &smoothing_count); + config_line->GetValue("smoothing-between-class-factor", + &smoothing_between_class_factor); +} + + +CoreFmllrEstimator::CoreFmllrEstimator( + const FmllrEstimatorOptions &opts, + BaseFloat gamma, + const MatrixBase &G, + const MatrixBase &K, + MatrixBase *A): + opts_(opts), gamma_(gamma), + G_(G), K_(K), A_(A) { + KALDI_ASSERT(opts.singular_value_relative_floor > 0.0 && + gamma > 0.0 && G.NumRows() == K.NumRows() && + K.NumRows() == K.NumCols() && + 
SameDim(K, *A)); +} + + +BaseFloat CoreFmllrEstimator::Forward() { + ComputeH(); + ComputeL(); + ComputeB(); + ComputeA(); + return ComputeObjfChange(); +} + +void CoreFmllrEstimator::ComputeH() { + int32 dim = G_.NumRows(); + bool symmetric = true; + G_rescaler_.Init(&G_, symmetric); + BaseFloat *G_singular_values = G_rescaler_.InputSingularValues(); + + { + SubVector v(G_singular_values, dim); + BaseFloat floor = v.Max() * opts_.singular_value_relative_floor; + KALDI_ASSERT(floor > 0.0); + MatrixIndexT num_floored = 0; + v.ApplyFloor(floor, &num_floored); + if (num_floored > 0.0) + KALDI_WARN << num_floored << " out of " << dim + << " singular values floored in G matrix."; + } + BaseFloat *H_singular_values = G_rescaler_.OutputSingularValues(), + *H_singular_value_derivs = G_rescaler_.OutputSingularValueDerivs(); + // We don't have to worry about elements of G_singular_values being zero, + // since we floored them above. + for (int32 i = 0; i < dim; i++) { + H_singular_values[i] = 1.0 / std::sqrt(G_singular_values[i]); + // The following expression is equivalent to + // -0.5 * pow(G_singular_values[i], -1.5), + // which is the derivative of lambda^{-0.5} w.r.t lambda. + // (lambda, here, is G_singular_values[i]). + H_singular_value_derivs[i] = -0.5 * (H_singular_values[i] / + G_singular_values[i]); + } + H_.Resize(dim, dim, kUndefined); + G_rescaler_.GetOutput(&H_); +} + +void CoreFmllrEstimator::ComputeL() { + int32 dim = G_.NumRows(); + L_.Resize(dim, dim); + L_.AddMatMat(1.0, K_, kNoTrans, H_, kNoTrans, 0.0); +} + +// Compute B = F(L), where F is the +// function that takes the singular values of L, puts them through the function +// f(lamba) = (lambda + sqrt(lambda^2 + 4 gamma)) / 2. +void CoreFmllrEstimator::ComputeB() { + int32 dim = L_.NumRows(); + bool symmetric = false; + L_rescaler_.Init(&L_, symmetric); + BaseFloat *lambda = L_rescaler_.InputSingularValues(); + { // This block deals with flooring lambda to avoid zero values. + SubVector v(lambda, dim); + BaseFloat floor = v.Max() * opts_.singular_value_relative_floor; + KALDI_ASSERT(floor > 0.0); + MatrixIndexT num_floored = 0; + v.ApplyFloor(floor, &num_floored); + static int num_warned = 100; + if (num_floored > 0.0 && num_warned > 0) + KALDI_WARN << num_floored << " out of " << dim + << " singular values floored in L matrix." + << (--num_warned == 0 ? " Will not warn again." : ""); + } + // f is where we put f(lambda). + // f_prime is where we put f'(lambda) (the function-derivative of f w.r.t + // lambda). + BaseFloat *f = L_rescaler_.OutputSingularValues(), + *f_prime = L_rescaler_.OutputSingularValueDerivs(); + + BaseFloat gamma = gamma_; + for (int32 i = 0; i < dim; i++) { + BaseFloat lambda_i = lambda[i]; + f[i] = (lambda_i + std::sqrt(lambda_i * lambda_i + 4.0 * gamma)) / 2.0; + f_prime[i] = (1.0 + lambda_i / + std::sqrt(lambda_i * lambda_i + 4.0 * gamma)) / 2.0; + } + B_.Resize(dim, dim, kUndefined); + L_rescaler_.GetOutput(&B_); +} + +void CoreFmllrEstimator::ComputeA() { + A_->SetZero(); // Make sure there are no NaN's. + A_->AddMatMat(1.0, B_, kNoTrans, H_, kNoTrans, 0.0); +} + +BaseFloat CoreFmllrEstimator::ComputeObjfChange() { + // we are computing the objective-function improvement from estimating + // A (we'll later compute the improvement from estimating the offset b). + // This is the equation which, from the writeup, is: + // \gamma log |A| + tr(A^T K) - tr(K) + // + 1/2 tr(G) - 1/2 tr(B B^T). + // and we note that log |A| = log |B| + log |G^{-0.5}| = log |B| -0.5 log |G|. 
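+  // (Since G is symmetric positive definite, its singular values equal its
+  // eigenvalues, so log|G| and tr(G) below are obtained by summing the logs /
+  // values of its singular values; likewise |det B| and tr(B B^T) are obtained
+  // from the singular values of B.)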
+  // Here, |.| actually means the absolute value of the determinant.
+
+  int32 dim = L_.NumRows();
+  double logdet_g = 0.0, logdet_b = 0.0, tr_b_bt = 0.0, tr_g = 0.0;
+  BaseFloat *G_singular_values = G_rescaler_.InputSingularValues(),
+      *B_singular_values = L_rescaler_.OutputSingularValues();
+  for (int32 i = 0; i < dim; i++) {
+    // we have already ensured that G_singular_values[i] > 0.
+    logdet_g += Log(G_singular_values[i]);
+    tr_g += G_singular_values[i];
+    logdet_b += Log(B_singular_values[i]);
+    tr_b_bt += B_singular_values[i] * B_singular_values[i];
+  }
+
+  double logdet_A = logdet_b - 0.5 * logdet_g,
+      tr_at_k = TraceMatMat(*A_, K_, kTrans),
+      tr_k = K_.Trace();
+
+  return BaseFloat(
+      gamma_ * logdet_A + tr_at_k - tr_k + 0.5 * tr_g - 0.5 * tr_b_bt);
+}
+
+void CoreFmllrEstimator::BackpropA(const MatrixBase<BaseFloat> &A_deriv,
+                                   MatrixBase<BaseFloat> *B_deriv,
+                                   MatrixBase<BaseFloat> *H_deriv) {
+  B_deriv->AddMatMat(1.0, A_deriv, kNoTrans, H_, kTrans, 0.0);
+  H_deriv->AddMatMat(1.0, B_, kTrans, A_deriv, kNoTrans, 0.0);
+}
+
+void CoreFmllrEstimator::BackpropL(const MatrixBase<BaseFloat> &L_deriv,
+                                   MatrixBase<BaseFloat> *K_deriv,
+                                   MatrixBase<BaseFloat> *H_deriv) {
+  K_deriv->AddMatMat(1.0, L_deriv, kNoTrans, H_, kTrans, 0.0);
+  H_deriv->AddMatMat(1.0, K_, kTrans, L_deriv, kNoTrans, 1.0);
+}
+
+
+void CoreFmllrEstimator::Backward(const MatrixBase<BaseFloat> &A_deriv,
+                                  Matrix<BaseFloat> *G_deriv,
+                                  Matrix<BaseFloat> *K_deriv) {
+  KALDI_ASSERT(SameDim(A_deriv, *A_) && SameDim(A_deriv, *G_deriv)
+               && SameDim(*G_deriv, *K_deriv));
+  int32 dim = A_->NumRows();
+  Matrix<BaseFloat> B_deriv(dim, dim), H_deriv(dim, dim),
+      L_deriv(dim, dim);
+  BackpropA(A_deriv, &B_deriv, &H_deriv);
+  // Backprop through the operation B = F(L).
+  L_rescaler_.ComputeInputDeriv(B_deriv, &L_deriv);
+  BackpropL(L_deriv, K_deriv, &H_deriv);
+  // Backprop through the operation H = G^{-0.5}.
+  G_rescaler_.ComputeInputDeriv(H_deriv, G_deriv);
+
+  { // Make sure G_deriv is symmetric. Use H_deriv as a temporary.
+    H_deriv.CopyFromMat(*G_deriv);
+    G_deriv->AddMat(1.0, H_deriv, kTrans);
+    G_deriv->Scale(0.5);
+  }
+}
+
+
+GaussianEstimator::GaussianEstimator(int32 num_classes, int32 feature_dim):
+    gamma_(num_classes),
+    m_(num_classes, feature_dim),
+    v_(num_classes),
+    variance_floor_(-1), variance_sharing_weight_(-1) {
+  // the floor and weight are actually set later on, in Estimate().
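+  // (They are initialized to -1 in the initializer list above simply as a
+  // "not set yet" marker; real values are copied from the options when
+  // Estimate() is called.)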
+ KALDI_ASSERT(num_classes > 0 && feature_dim > 0); +} + +void GaussianEstimator::AccStats(const MatrixBase &feats, + const SubPosterior &post) { + KALDI_ASSERT(static_cast(post.size()) == feats.NumRows()); + int32 T = feats.NumRows(), + num_classes = m_.NumRows(); + for (int32 t = 0; t < T; t++) { + SubVector feat(feats, t); + const std::vector > &this_post = post[t]; + auto iter2 = this_post.begin(), + end2 = this_post.end(); + for (; iter2 != end2; ++iter2) { + int32 i = iter2->first; + KALDI_ASSERT(i >= 0 && i < num_classes && + "Posteriors and adaptation model mismatch"); + BaseFloat p = iter2->second; + gamma_(i) += p; + SubVector this_m(m_, i); + this_m.AddVec(p, feat); + v_(i) += p * VecVec(feat, feat); + } + } +} + +void GaussianEstimator::Estimate(const FmllrEstimatorOptions &opts) { + variance_floor_ = opts.variance_floor; + variance_sharing_weight_ = opts.variance_sharing_weight; + KALDI_ASSERT(variance_floor_ > 0.0 && + variance_sharing_weight_ >= 0.0 && + variance_sharing_weight_ <= 1.0); + KALDI_ASSERT(mu_.NumRows() == 0 && + "You cannot call Estimate() twice."); + int32 num_classes = m_.NumRows(), dim = m_.NumCols(); + + mu_ = m_; + s_.Resize(num_classes, kUndefined); + t_.Resize(num_classes, kUndefined); + for (int32 i = 0; i < num_classes; i++) { + BaseFloat gamma_i = gamma_(i); + if (gamma_i < 1.0e-10) { + // the i'th row of mu will already be zero. + s_(i) = variance_floor_; + } else { + SubVector mu_i(mu_, i); + // We already copied m_ to mu_. + mu_i.Scale(1.0 / gamma_i); + s_(i) = std::max(variance_floor_, + v_(i) / (gamma_i * dim) - VecVec(mu_i, mu_i) / dim); + } + } + + // apply variance_sharing_weight_. + BaseFloat gamma = gamma_.Sum(), + s = VecVec(gamma_, s_) / gamma, + f = variance_sharing_weight_; + KALDI_ASSERT(gamma != 0.0 && + "You cannot call Estimate() with no stats."); + for (int32 i = 0; i < num_classes; i++) { + t_(i) = (BaseFloat(1.0) - f) * s_(i) + f * s; + } + { BaseFloat sum = mu_.Sum(); KALDI_ASSERT(sum - sum == 0); } // TEMP + + // Clear the stats, which won't be needed any longer. + m_.Resize(0, 0); + v_.Resize(0); +} + +void GaussianEstimator::AddToOutputDerivs( + const MatrixBase &mean_derivs, + const VectorBase &var_derivs) { + KALDI_ASSERT(SameDim(mean_derivs, mu_) && + var_derivs.Dim() == t_.Dim()); + int32 num_classes = mean_derivs.NumRows(), + dim = mean_derivs.NumCols(); + BaseFloat f = variance_sharing_weight_, + variance_floor = variance_floor_, + gamma = gamma_.Sum(); + KALDI_ASSERT(gamma > 0.0); + if (m_bar_.NumRows() == 0) { + // This is the first time this function was called. + m_bar_.Resize(num_classes, dim); + v_bar_.Resize(num_classes); + } + + const VectorBase &t_bar(var_derivs); + const MatrixBase &mu_bar(mean_derivs); + BaseFloat s_bar = f * t_bar.Sum(); + for (int32 i = 0; i < num_classes; i++) { + SubVector m_bar_i(m_bar_, i); + BaseFloat gamma_i = gamma_(i); + if (gamma_i > 1.0e-10) { + if (s_(i) != variance_floor) { + BaseFloat s_bar_i = (BaseFloat(1.0) - f) * t_bar(i) + s_bar * gamma_i / gamma; + v_bar_(i) += s_bar_i / (gamma_i * dim); + m_bar_i.AddVec(-2.0 * s_bar_i / (gamma_i * dim), mu_.Row(i)); + } + m_bar_i.AddVec(1.0 / gamma_i, mu_bar.Row(i)); + } + } +} + +int32 GaussianEstimator::Dim() const { + // One of these two will be nonempty. 
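+  // (m_ holds the raw stats and is cleared inside Estimate(), which is also
+  // where mu_ is first sized, so exactly one of the two is nonempty at any
+  // given time; whichever it is gives the feature dimension.)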
+ return std::max(m_.NumCols(), mu_.NumCols()); +} + +void GaussianEstimator::AccStatsBackward( + const MatrixBase &feats, + const SubPosterior &post, + const MatrixBase *feats_deriv) { + // The equation we're implementing is: + // \bar{x}_t = \sum_i \gamma_{t,i} (\bar{m}_i + 2\bar{v}_i x_t) + // See the comment in the header: + // "Notes on implementation of GaussianEstimator". + int32 T = feats.NumRows(); + KALDI_ASSERT(static_cast(post.size() == T) && + SameDim(feats, *feats_deriv)); + for (int32 t = 0; t < T; t++) { + SubVector feat(feats, t), + feat_deriv(*feats_deriv, t); + const std::vector > &this_post = post[t]; + auto iter2 = this_post.begin(), + end2 = this_post.end(); + for (; iter2 != end2; ++iter2) { + int32 i = iter2->first; + BaseFloat p = iter2->second; + SubVector m_bar_i(m_bar_, i); + feat_deriv.AddVec(p, m_bar_i); + feat_deriv.AddVec(p * 2.0 * v_bar_(i), feat); + } + } +} + +void GaussianEstimator::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + gamma_.Write(os, binary); + m_.Write(os, binary); + v_.Write(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, variance_floor_); + WriteBasicType(os, binary, variance_sharing_weight_); + WriteToken(os, binary, ""); + mu_.Write(os, binary); + WriteToken(os, binary, ""); + t_.Write(os, binary); + WriteToken(os, binary, ""); +} + +void GaussianEstimator::Add(const GaussianEstimator &other) { + gamma_.AddVec(1.0, other.gamma_); + m_.AddMat(1.0, other.m_); + v_.AddVec(1.0, other.v_); +} + + +void GaussianEstimator::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + gamma_.Read(is, binary); + m_.Read(is, binary); + v_.Read(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &variance_floor_); + ReadBasicType(is, binary, &variance_sharing_weight_); + ExpectToken(is, binary, ""); + mu_.Read(is, binary); + ExpectToken(is, binary, ""); + t_.Read(is, binary); + ExpectToken(is, binary, ""); +} + + +FmllrEstimator::FmllrEstimator(const FmllrEstimatorOptions &opts, + const MatrixBase &mu, + const VectorBase &s): + opts_(opts), mu_(mu), s_(s), estimator_(NULL) { + int32 num_classes = mu_.NumRows(), dim = mu_.NumCols(); + opts_.Check(); + + gamma_.Resize(num_classes); + raw_G_.Resize(dim, dim); + z_.Resize(num_classes, dim); +} + +void FmllrEstimator::AccStats(const MatrixBase &feats, + const SubPosterior &post) { + KALDI_ASSERT(static_cast(post.size() == feats.NumRows())); + int32 num_classes = mu_.NumRows(), + dim = mu_.NumCols(), + T = feats.NumRows(); + + // Use temporaries for the stats and later add them to the stats in the class; + // this will reduce roundoff errors if this function is called more than once. 
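+  //
+  // In the notation of the writeup, the quantities accumulated here are:
+  //   gamma_i        = \sum_t \gamma_{t,i}             (class counts, -> gamma_)
+  //   z_i            = \sum_t \gamma_{t,i} x_t         (linear stats, -> z_)
+  //   \hat{\gamma}_t = \sum_i \gamma_{t,i} / s_i       (per-frame weights)
+  //   raw_G          = \sum_t \hat{\gamma}_t x_t x_t^T (quadratic stats, -> raw_G_)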
+ Vector gamma_hat_t(T, kUndefined), + gamma(num_classes); + + for (int32 t = 0; t < T; t++) { + auto iter = post[t].begin(), end = post[t].end(); + SubVector x_t(feats, t); + BaseFloat this_gamma_hat_t = 0.0; + for (; iter != end; ++iter) { + int32 i = iter->first; + KALDI_ASSERT(i >= 0 && i < num_classes && + "Posteriors and adaptation model mismatch"); + BaseFloat gamma_ti = iter->second, + gamma_hat_ti = gamma_ti / s_(i); + SubVector z_i(z_, i); + z_i.AddVec(gamma_ti, x_t); + gamma(i) += gamma_ti; + this_gamma_hat_t += gamma_hat_ti; + } + gamma_hat_t(t) = this_gamma_hat_t; + } + gamma_.AddVec(1.0, gamma); + + SpMatrix G(dim); + int32 rows_per_chunk = 100; + for (int32 offset = 0; offset < T; offset += rows_per_chunk) { + int32 n_frames = std::min(rows_per_chunk, feats.NumRows() - offset); + SubMatrix feats_part(feats, offset, n_frames, 0, dim); + SubVector gamma_hat_t_part(gamma_hat_t, offset, n_frames); + // the 0.0 value for beta means we don't double-count stats. + G.AddMat2Vec(1.0, feats_part, kTrans, gamma_hat_t_part, 0.0); + raw_G_.AddSp(1.0, G); + } +} + + +BaseFloat FmllrEstimator::Estimate() { + int32 dim = mu_.NumCols(); + BaseFloat gamma_tot = gamma_.Sum(); + KALDI_ASSERT(gamma_tot > 0.0 && + "You cannot call Estimate() with zero stats."); + + Vector s_inv(s_); + s_inv.InvertElements(); + + // compute \hat{\gamma} = \sum_i \gamma_i / s_i + gamma_hat_tot_ = VecVec(gamma_, s_inv); + + // compute n = (1/\hat{\gamma}) \sum_i (1/s_i) z_i + n_.Resize(dim); + n_.AddMatVec(1.0 / gamma_hat_tot_, z_, kTrans, s_inv, 0.0); + + { // Set m = 1/\hat{\gamma} \sum_i (\gamma_i / s_i) \mu_i. + Vector s_inv_gamma(s_inv); + s_inv_gamma.MulElements(gamma_); + m_.Resize(dim); + m_.AddMatVec(1.0 / gamma_hat_tot_, mu_, kTrans, s_inv_gamma, 0.0); + } + + + { // Set K := \sum_i (1/s_i) \mu_i z_i^T - \hat{\gamma} m n^T + Matrix mu_s(mu_); + mu_s.MulRowsVec(s_inv); + K_.Resize(dim, dim); + K_.AddMatMat(1.0, mu_s, kTrans, z_, kNoTrans, 0.0); + K_.AddVecVec(-gamma_hat_tot_, m_, n_); + } + + // In AccStats(), we did raw_G := \sum_t \hat{\gamma}_t x_t x_t^T. + // Now we do: G = raw_G - \hat{\gamma} n n^T + G_ = raw_G_; + G_.AddVecVec(-gamma_hat_tot_, n_, n_); + KALDI_ASSERT(G_.IsSymmetric(0.0001)); + + A_.Resize(dim, dim, kUndefined); + + BaseFloat gamma_tot_smoothed = gamma_tot; + { + /* + Add smoothing counts to gamma_tot, K_ and G_. This prevents the matrix + from diverging too far from the identity, and ensures more reasonable + transform values when counts are small or dimensions large. We can ignore + this smoothing for computing derivatives, because it happens that it + doesn't affect anything; the quantities gamma_, K_ and G_ are never + consumed in the backprop phase, and the expressions for the derivatives + w.r.t. these quantities don't change from adding an extra term. + */ + gamma_tot_smoothed = gamma_tot + opts_.smoothing_count; + BaseFloat s = opts_.smoothing_between_class_factor; + K_.AddToDiag(opts_.smoothing_count * s); + G_.AddToDiag(opts_.smoothing_count * (1.0 + s)); + } + // Compute A_. + estimator_ = new CoreFmllrEstimator(opts_, gamma_tot_smoothed, G_, K_, &A_); + // A_impr will be the objective-function improvement from estimating A + // (vs. the unit matrix), divided by gamma_tot. Note: the likelihood of the + // 'fake data' we used for the smoothing could only have been made worse by + // estimating this transform, so dividing the total objf-impr by gamma_tot + // (rather than gamma_tot_smoothed, if different) will still be an + // underestimate of the actual improvement. 
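+  // The value finally returned by Estimate() is A_impr + b_impr, where
+  // b_impr = 0.5 * b^T b * \hat{\gamma} / \gamma is the corresponding
+  // per-frame improvement from the offset b, computed further below.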
+ BaseFloat A_impr = (1.0 / gamma_tot) * estimator_->Forward(); + + // Compute b = m - A n. + b_ = m_; + b_.AddMatVec(-1.0, A_, kNoTrans, n_, 1.0); + + // b_impr is the amount of objective-function improvement from estimating b + // (vs. the default value), divided by the total-count gamma_tot. See section + // 'diagnostics' in the document. + // Note: we aren't doing any smoothing for the offset term. + BaseFloat b_impr = (0.5 * VecVec(b_, b_) * gamma_hat_tot_) / gamma_tot; + return A_impr + b_impr; +} + +bool FmllrEstimator::IsEstimated() const { + return A_.NumRows() != 0; +} + +void FmllrEstimator::AdaptFeatures(const MatrixBase &feats, + MatrixBase *adapted_feats) const { + KALDI_ASSERT(A_.NumRows() != 0 && "You cannot call AdaptFeatures before " + "calling Estimate()."); + KALDI_ASSERT(SameDim(feats, *adapted_feats)); + adapted_feats->CopyRowsFromVec(b_); + adapted_feats->AddMatMat(1.0, feats, kNoTrans, A_, kTrans, 1.0); +} + + +void FmllrEstimator::AdaptFeaturesBackward( + const MatrixBase &feats, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv) { + KALDI_ASSERT(SameDim(feats, adapted_feats_deriv) && + SameDim(feats, *feats_deriv) && + G_bar_.NumRows() == 0); + int32 rows_per_chunk = 100; + if (feats.NumRows() > rows_per_chunk) { + // Break it up into 100-frame chunks and recurse. This will reduce roundoff + // error due to the way we work with temporaries. + for (int32 offset = 0; offset < feats.NumRows(); offset += rows_per_chunk) { + int32 n = std::min(rows_per_chunk, feats.NumRows() - offset); + SubMatrix feats_deriv_part = feats_deriv->RowRange(offset, n); + AdaptFeaturesBackward(feats.RowRange(offset, n), + adapted_feats_deriv.RowRange(offset, n), + &feats_deriv_part); + } + return; + } + + // in the writeup: \bar{x}_t <-- A^T \bar{y}_t. + // In this implementation, x_t corresponds to a + // row vector in feats and feats_deriv, so everything is + // transposed to: + // \bar{x}_t^T <--- \bar{y}_t^T A. + feats_deriv->AddMatMat(1.0, adapted_feats_deriv, kNoTrans, + A_, kNoTrans, 1.0); + + // We use temporaries below to possibly reduce roundoff error. + // It's not clear whether this would make a difference-- it depends + // how the BLAS we're using was implemented. 
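+  // (The Swap()/Add pattern below means that on the first call the temporary
+  // simply becomes b_bar_ / A_bar_, while on subsequent calls we add to the
+  // existing accumulators; this is what allows AdaptFeaturesBackward() to be
+  // called more than once.)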
+ int32 dim = mu_.NumCols(); + // \bar{b} = \sum_t \bar{y}_t + Vector b_bar(dim); + b_bar.AddRowSumMat(1.0, adapted_feats_deriv); + if (b_bar_.Dim() == 0) + b_bar_.Swap(&b_bar); + else + b_bar_.AddVec(1.0, b_bar); + // \bar{A} <-- \sum_t \bar{y}_t x_t^T + Matrix A_bar(dim, dim); + A_bar.AddMatMat(1.0, adapted_feats_deriv, kTrans, feats, kNoTrans, 0.0); + if (A_bar_.NumRows() == 0) + A_bar_.Swap(&A_bar); + else + A_bar_.AddMat(1.0, A_bar); +} + +void FmllrEstimator::EstimateBackward() { + KALDI_ASSERT(G_bar_.NumRows() == 0 && + "You cannot call EstimateBackward() twice."); + KALDI_ASSERT(A_bar_.NumRows() != 0 && + "You must call AdaptFeaturesBackward() before calling " + "EstimateBackward()."); + + Vector s_inv(s_); + s_inv.InvertElements(); + Vector s_inv_gamma(s_inv); + s_inv_gamma.MulElements(gamma_); + + // do \bar{A} -= \bar{b} n^T + A_bar_.AddVecVec(-1.0, b_bar_, n_); + + int32 num_classes = mu_.NumRows(), dim = mu_.NumCols(); + G_bar_.Resize(dim, dim); + K_bar_.Resize(dim, dim); + estimator_->Backward(A_bar_, &G_bar_, &K_bar_); + delete estimator_; + estimator_ = NULL; + KALDI_ASSERT(G_bar_.IsSymmetric()); + + // \bar{n} = - (A^T \bar{b} + 2\bar{G} n + \bar{K}^T m) + n_bar_.Resize(dim); + n_bar_.AddMatVec(-1.0, A_, kTrans, b_bar_, 0.0); + n_bar_.AddMatVec(-2.0 * gamma_hat_tot_, G_bar_, kNoTrans, n_, 1.0); + n_bar_.AddMatVec(-1.0 * gamma_hat_tot_, K_bar_, kTrans, m_, 1.0); + + + // \bar{m} = \bar{b} - \hat{\gamma} \bar{K} n + m_bar_ = b_bar_; + m_bar_.AddMatVec(-gamma_hat_tot_, K_bar_, kNoTrans, n_, 1.0); + + // \bar{z}_i = (1/s_i) \bar{K}^T \mu_i + 1/(s_i \hat{\gamma}) \bar{n} + z_bar_.Resize(num_classes, dim); + // set \bar{z}_i := \bar{K}^T \mu_i. It's transposed below. + z_bar_.AddMatMat(1.0, mu_, kNoTrans, K_bar_, kNoTrans, 0.0); + // \bar{z}_i += 1/\hat{\gamma} \bar{n} + z_bar_.AddVecToRows(1.0 / gamma_hat_tot_, n_bar_); + // \bar{z}_i /= s_i + z_bar_.MulRowsVec(s_inv); + + // \bar{\hat{\gamma}} = - n^T \bar{G} n - m^t \bar{K} n + // - \frac{1}{\hat{\gamma}} (n^T \bar{n} + m^T \bar{m}) + gamma_hat_tot_bar_ = -1.0 * VecMatVec(n_, G_bar_, n_) + - VecMatVec(m_, K_bar_, n_) + - (1.0 / gamma_hat_tot_) * (VecVec(n_, n_bar_) + VecVec(m_, m_bar_)); + + // Set \bar{mu}_i = (1/s_i) \bar{K} z_i + (\gamma_i / (s_i \hat{\gamma})) \bar{m} + mu_bar_.Resize(num_classes, dim); + mu_bar_.AddMatMat(1.0, z_, kNoTrans, K_bar_, kTrans, 0.0); + mu_bar_.MulRowsVec(s_inv); + mu_bar_.AddVecVec(1.0 / gamma_hat_tot_, s_inv_gamma, m_bar_); + + // Add all terms in \bar{s}_i except the one involving \bar{\hat{\gamma}}_t. + // The full equation (also present in the header) is: + // \bar{s}_i = -(1 / s_i^2) * ( + // \mu_i^T \bar{K} z_i + (1 / \hat{\gamma}) \z_i^T \bar{n} + // + (\gamma_i / \hat{\gamma}) \mu_i^T \bar{m} + \gamma_i \hat{\gamma} + // + \sum_t \gamma_{t,i} \bar{\hat{\gamma}}_t ) + // Noticing that some expressions in it are common with \bar{\mu}_i, this can + // be simplified to: + // \bar{s}_i = (-1/s_i) \mu_i^T \bar{\mu}_i + // - (1/s_i^2) * ((1 / \hat{\gamma}) \z_i^T \bar{n} + \gamma_i \hat{\gamma} + // + \sum_t \gamma_{t,i} \bar{\hat{\gamma}}_t ) + s_bar_.Resize(num_classes); + // do s_bar_ -= (1 / \hat{\gamma}) \z_i^T \bar{n}. We'll later multiply by 1/s_i^2. 
+ s_bar_.AddMatVec(-1.0 / gamma_hat_tot_, z_, kNoTrans, n_bar_, 0.0); + // do s_bar_(i) -= \gamma_i \bar{\hat{\gamma}} + s_bar_.AddVec(-1.0 * gamma_hat_tot_bar_, gamma_); + // do s_bar_(i) *= 1/s_i + s_bar_.MulElements(s_inv); + // do s_bar_(i) -= \mu_i^T \bar{\mu}_i + s_bar_.AddDiagMatMat(-1.0, mu_, kNoTrans, mu_bar_, kTrans, 1.0); + // do s_bar_(i) *= 1/s_i + s_bar_.MulElements(s_inv); + // OK, s_bar_ is now set up with all but the last term. It remains only to do: + // \bar{s}_i += (-1/s_i^2) \sum_t \gamma_{t,i} \bar{\hat{\gamma}}_t ) +} + +void FmllrEstimator::AccStatsBackward( + const MatrixBase &feats, + const SubPosterior &post, + MatrixBase *feats_deriv) { + KALDI_ASSERT(static_cast(post.size() == feats.NumRows())); + int32 T = feats.NumRows(), num_classes = mu_.NumRows(); + + // Use temporaries for s_bar_, to reduce roundoff error. + Vector s_bar(num_classes); + for (int32 t = 0; t < T; t++) { + auto iter = post[t].begin(), end = post[t].end(); + SubVector x_t(feats, t), + x_bar_t(*feats_deriv, t); + BaseFloat gamma_hat_t = 0.0; + for (; iter != end; ++iter) { + int32 i = iter->first; + BaseFloat gamma_ti = iter->second, + gamma_hat_ti = gamma_ti / s_(i); + gamma_hat_t += gamma_hat_ti; + SubVector z_bar_i(z_bar_, i); + // \bar{x}_t += \gamma_{t,i} \bar{z}_i + x_bar_t.AddVec(gamma_ti, z_bar_i); + } + double gamma_hat_bar_t = VecMatVec(x_t, G_bar_, x_t); + + // \bar{x}_t += 2 \hat{\gamma}_t \bar{G} x_t + x_bar_t.AddMatVec(2.0 * gamma_hat_t, G_bar_, kNoTrans, x_t, 1.0); + + for (iter = post[t].begin(); iter != end; ++iter) { + int32 i = iter->first; + BaseFloat gamma_ti = iter->second; + SubVector mu_i(mu_, i); + // \bar{s}_i -= \frac{1}{s_i^2} \gamma_{t,i} \bar{\hat{\gamma}}_t + s_bar(i) -= 1.0 / (s_(i) * s_(i)) * gamma_ti * gamma_hat_bar_t; + } + if (t == T - 1 || (t > 0 && t % 200 == 0)) { + s_bar_.AddVec(1.0, s_bar); + if (t < T - 1) + s_bar.SetZero(); + } + } +} + +BaseFloat FmllrEstimator::ForwardCombined( + const MatrixBase &feats, + const SubPosterior &post, + MatrixBase *adapted_feats) { + AccStats(feats, post); + BaseFloat ans = Estimate(); + AdaptFeatures(feats, adapted_feats); + return ans; +} + +void FmllrEstimator::BackwardCombined( + const MatrixBase &feats, + const SubPosterior &post, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv) { + AdaptFeaturesBackward(feats, adapted_feats_deriv, feats_deriv); + EstimateBackward(); + AccStatsBackward(feats, post, feats_deriv); +} + +FmllrEstimator::~FmllrEstimator() { + delete estimator_; // in case Estimate() was never called. +} + + +MeanOnlyTransformEstimator::MeanOnlyTransformEstimator( + const MatrixBase &mu): mu_(mu) { + int32 num_classes = mu_.NumRows(), + dim = mu_.NumCols(); + gamma_.Resize(num_classes); + input_sum_.Resize(dim); +} + +void MeanOnlyTransformEstimator::AccStats(const MatrixBase &feats, + const SubPosterior &post) { + int32 T = feats.NumRows(), + num_classes = mu_.NumRows(); + KALDI_ASSERT(static_cast(post.size()) == T); + + for (int32 t = 0; t < T; t++) { + BaseFloat gamma_t = 0.0; // Total weight for this frame. 
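+    // Only two statistics are needed for the mean-only transform: the
+    // per-class counts gamma_(i) and the weighted feature sum
+    // input_sum_ = \sum_t \gamma_t x_t, where \gamma_t is the total
+    // posterior weight on frame t (accumulated into gamma_t below).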
+ auto iter = post[t].begin(), end = post[t].end(); + for (; iter != end; ++iter) { + int32 i = iter->first; + KALDI_ASSERT(i >= 0 && i < num_classes && + "Posteriors and adaptation model mismatch"); + BaseFloat gamma_ti = iter->second; + gamma_t += gamma_ti; + gamma_(i) += gamma_ti; + } + SubVector feat(feats, t); + KALDI_ASSERT(gamma_t >= 0); + input_sum_.AddVec(gamma_t, feat); + } +} + + +void MeanOnlyTransformEstimator::Estimate() { + double tot_gamma = gamma_.Sum(); + int32 dim = mu_.NumCols(); + if (tot_gamma <= 0.0) + KALDI_ERR << "You cannot call Estimate() if total count is zero."; + Vector gamma_float(gamma_); + Vector expected_mean(dim); + expected_mean.AddMatVec(1.0 / tot_gamma, mu_, kTrans, gamma_float, 0.0); + // basically: offset_ = expected_mean - observed_mean, + // where observed_mean = input_sum_ / tot_gamma. + offset_ = expected_mean; + offset_.AddVec(-1.0 / tot_gamma, input_sum_); + output_deriv_sum_.Resize(dim); +} + +bool MeanOnlyTransformEstimator::IsEstimated() const { + return offset_.Dim() != 0; +} + +void MeanOnlyTransformEstimator::AdaptFeatures( + const MatrixBase &feats, + MatrixBase *adapted_feats) const { + adapted_feats->CopyRowsFromVec(offset_); + adapted_feats->AddMat(1.0, feats); +} + +void MeanOnlyTransformEstimator::AdaptFeaturesBackward( + const MatrixBase &feats, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv) { + int32 dim = mu_.NumCols(); + Vector output_deriv_sum(dim); + output_deriv_sum.AddRowSumMat(1.0, adapted_feats_deriv); + output_deriv_sum_.AddVec(1.0, output_deriv_sum); + feats_deriv->AddMat(1.0, adapted_feats_deriv); +} + +void MeanOnlyTransformEstimator::EstimateBackward() { + int32 num_classes = mu_.NumRows(), dim = mu_.NumCols(); + mu_bar_.Resize(num_classes, dim); + Vector gamma(gamma_), + output_deriv_sum(output_deriv_sum_); + BaseFloat gamma_tot = gamma_.Sum(); + KALDI_ASSERT(gamma_tot > 0.0); + mu_bar_.AddVecVec(1.0 / gamma_tot, gamma, output_deriv_sum); + + x_deriv_ = output_deriv_sum; + x_deriv_.Scale(-1.0 / gamma_tot); +} + + +void MeanOnlyTransformEstimator::AccStatsBackward( + const MatrixBase &feats, + const SubPosterior &post, + MatrixBase *feats_deriv) { + + int32 T = feats.NumRows(); + // tot_weight will be the total weight of the posteriors in 'post' + // for each frame. + Vector tot_weight(T, kUndefined); + for (int32 t = 0; t < T; t++) { + BaseFloat gamma_t = 0.0; // Total weight for this frame. 
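+    // Each frame's derivative gets a contribution of gamma_t * x_deriv_,
+    // where gamma_t is the total posterior weight on frame t and x_deriv_ was
+    // set in EstimateBackward() to -(1/\gamma) \sum_t \bar{y}_t; the per-frame
+    // weights are gathered into tot_weight and applied via AddVecVec() below.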
+ auto iter = post[t].begin(), end = post[t].end(); + for (; iter != end; ++iter) + gamma_t += iter->second; + tot_weight(t) = gamma_t; + } + feats_deriv->AddVecVec(1.0, tot_weight, x_deriv_); +} + +void MeanOnlyTransformEstimator::ForwardCombined( + const MatrixBase &feats, + const SubPosterior &post, + MatrixBase *adapted_feats) { + AccStats(feats, post); + Estimate(); + AdaptFeatures(feats, adapted_feats); +} + +void MeanOnlyTransformEstimator::BackwardCombined( + const MatrixBase &feats, + const SubPosterior &post, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv) { + AdaptFeaturesBackward(feats, adapted_feats_deriv, feats_deriv); + EstimateBackward(); + AccStatsBackward(feats, post, feats_deriv); +} + + +} // namespace differentiable_transform +} // namespace kaldi diff --git a/src/adapt/differentiable-fmllr.h b/src/adapt/differentiable-fmllr.h new file mode 100644 index 00000000000..c15175752a1 --- /dev/null +++ b/src/adapt/differentiable-fmllr.h @@ -0,0 +1,974 @@ +// adapt/differentiable-fmllr.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_TRANSFORM_DIFFERENTIABLE_FMLLR_H_ +#define KALDI_TRANSFORM_DIFFERENTIABLE_FMLLR_H_ + +#include + +#include "base/kaldi-common.h" +#include "util/kaldi-table.h" +#include "util/kaldi-holder.h" +#include "hmm/posterior.h" +#include "matrix/matrix-functions.h" +#include "matrix/matrix-common.h" +#include "matrix/sp-matrix.h" + +namespace kaldi { +namespace differentiable_transform { + + +// This header contains some utilities for implementing differentiable fMLLR. +// Since it is fairly complicated, we aren't putting all the implementation +// details in class FmllrTransform (in differentiable-transform.h), but +// segregating most of the technical stuff to this file. This also +// allows us to separate out the testing of individual components. +// The reference for things in this header is +// http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf. +// The notation we are using corresponds to the notation used in +// the "Summary" section of that document. + + + + +/** + With reference to the notation in + http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf, + this class implements the operation that takes G and K as input (and the + count gamma), and produces A. This has been separated into its own object + for purposes of testability. + */ +struct FmllrEstimatorOptions { + + // singular_value_relative_floor is floor that we apply on the + // singular values of the inputs G and K, to ensure that no NaN's are + // generated in the forward pass and to prevent the derivatives + // in the backprop from becoming undefined. It affects both + // the forward and backward computations. A warning will be printed + // if this floor actually had an effect. 
+ // Must be greater than zero (to avoid the possibility of generating + // NaN's). + BaseFloat singular_value_relative_floor; + + + // Floor for (spherical) variances; will be passed to class GaussianEstimator + // when estimating means and variances. + BaseFloat variance_floor; + + // A value in the range [0, 1] which dictates to what extent the variances are + // shared. 0 means not shared at all, 1 means completely shared. Shared + // means the variance is a weighted average of variances, weighted by count of + // that class. This is consumed by class GaussianEstimator. + BaseFloat variance_sharing_weight; + + // A count value of 'fake' counts that we add to the stats G, K and lambda + // during estimation, namely: + // lambda += smoothing_count + // K += smoothing_count * smoothing_between_class_factor * I + // G += smoothing_count * I. + // Interpretable as a number of frames. This prevents things going crazy + // when the amount of data is small. + BaseFloat smoothing_count; + + // A factor that says how large the assumed between-class covariance matrix + // is, relative to the within-class covariance matrix. Should be >= 0. In + // the limit as it approaches zero, the smoothing will only penalize scaling + // of the space, but not rotations. This is likely not a good thing, so a + // value greater than zero will probably be desired. + BaseFloat smoothing_between_class_factor; + + FmllrEstimatorOptions(): + singular_value_relative_floor(0.001), + variance_floor(0.0001), + variance_sharing_weight(0.1), + smoothing_count(0.0), + smoothing_between_class_factor(0.25) { } + + void Check() { + KALDI_ASSERT(singular_value_relative_floor > 0.0 && + singular_value_relative_floor < 0.1 && + (variance_floor > 0.0 || variance_sharing_weight > 0.0) && + variance_floor >= 0.0 && + variance_sharing_weight >= 0.0 && + variance_sharing_weight <= 1.0); + } + + void Write(std::ostream &os, bool binary) const; + void Read(std::istream &is, bool binary); + + // This will set any options in this class that it can find in 'config_line'. + void ReadFromConfig(ConfigLine *config_line); + +}; + + +/** + Class CoreFmllrEstimator takes care of the core parts of the fMLLR estimation: + with reference to the notation in + http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf, + it accepts the statistics G and K and the count gamma, and it + computes the fMLLR transform matrix A, and allows you to backprop through + that computation. The reason why we have broken it out as its own class, + is for testability and to limit the complexity of any one class. + + The end-user may want to use class FmllrEstimator instead. + + */ +class CoreFmllrEstimator { + public: + /** + Constructor. Does not do any real work. This class will store + references/pointers to G, K and A, so you need to make sure that + those quantities exist for the lifetime of this object. + + @param [in] opts Options class; see its definition for details. Will be copied + in the constructor. + @param [in] gamma The total data-count (often this will be the number of frames). + @param [in] G A symmetric matrix containing the quadratic + stats for estimating A. This the sum of outer products + of the input features, after mean subtraction, and + weighted by the inverse-variance factor s_i. Must be + positive definite for this computation to be well + defined. + @param [in] K A matrix containing the linear stats for estimating A. 
+ This is a sum of outer products of the means with the + input features, with mean subtraction and inverse-variance + weighting. Must not have more than one zero singular value + for this computation to be well defined. + @param [in] A We mark this as an input parameter but it is the location + where the output of this computation will be placed when + you call Forward(). May be undefined (e.g., NaN) on + entry. You must not change the value of A between + calling Forward() and calling Backward(). + */ + CoreFmllrEstimator(const FmllrEstimatorOptions &opts, + BaseFloat gamma, + const MatrixBase &G, + const MatrixBase &K, + MatrixBase *A); + + /** + Does the forward pass of estimation. Writes to the location + 'A' that was passed to the constructor. + + Returns the objective-function improvement per frame, as compared + with what the objective-function would be with unit A. This is not + normalized by the number of frames. + */ + BaseFloat Forward(); + + /** + Does the backward pass. Note: it is permissible to call + Backward() any number of times, it does not have to be called + exactly once. + + @param [in] A_deriv The derivative of the objective + function (say, f) w.r.t. the output A (which was passed as a + pointer to the constructor). + @param [out] G_deriv A pointer to a location where the + derivative df/dG will be written. Will be added to, so + should contain zero (or some other defined value) + at input. + @param [out] K_deriv A pointer to a location where the + derivative df/dK will be written (so the i,j'th + element is the derivative w.r.t. the i,j'th element + of the input matrix K. + */ + void Backward(const MatrixBase &A_deriv, + Matrix *G_deriv, + Matrix *K_deriv); + + private: + // Computes H = G^{-0.5} + void ComputeH(); + // Compute L = K H + void ComputeL(); + // Compute B = F(L), where F is the + // function that takes the singular values of L, puts them through the function + // f(lamba) = (lambda + sqrt(lambda^2 + 4 gamma)) / 2. + void ComputeB(); + // Computes A = B H. + void ComputeA(); + + + // Backprops through the operation "A = B H". B_deriv and H_deriv + // must be free of NaN and inf on entry. + void BackpropA(const MatrixBase &A_deriv, + MatrixBase *B_deriv, + MatrixBase *H_deriv); + + // Backprops through the function "L = K H".. + // K_deriv must be free of NaN and inf on entry, but otherwise + // its value is ignored. H_deriv is added to by this function. + void BackpropL(const MatrixBase &L_deriv, + MatrixBase *K_deriv, + MatrixBase *H_deriv); + + // returns the objective-function change (vs. A being the unit matrix) from + // this estimation. + BaseFloat ComputeObjfChange(); + + FmllrEstimatorOptions opts_; + BaseFloat gamma_; + const MatrixBase &G_; + const MatrixBase &K_; + MatrixBase *A_; + + // H = G^{-0.5} is symmetric. + Matrix H_; + // L = K H. + Matrix L_; + // B = F(L) is the result of applying SvdRescaler with + // the function f(lambda) = ((lambda + sqrt(lambda^2 + 4 gamma)) / 2) + Matrix B_; + + // Object that helps us to compute, and to backprop through the + // computation of, H = G^{-0.5}. + SvdRescaler G_rescaler_; + + // Object that helps us to compute, and to backprop through the computation + // of: B = F(L), where F is the function that takes the singular values of L, + // puts them through the function f(lamba) = (lambda + sqrt(lambda^2 + 4 + // gamma)) / 2. 
+ SvdRescaler L_rescaler_; + +}; + + + +/** + Class GaussianEstimator allows you to estimate means and (spherical) variances + from features and posteriors, and to later backprop through that process if + needed. + + It is intended for use during training of the neural net, for use on + individual minibatches: it uses BaseFloat for the accumulators, which might + lead to excessive roundoff if you had a large amount of data. We'll later on + create a separate mechanism for accumulating stats over all the data, given + the full tree. + + The normal usage pattern would be: + - Construct the object. + - Call AccStats() for each sequence. + - Call Estimate() + - Call GetMeans() and GetVars() to obtain the means and vars, and do + something with them, e.g. compute some kind of objective, from which + you would obtain derivatives w.r.t. those means and vars. + - Call SetOutputDerivs() to tell this class what those derivatives w.r.t. + the means and vars are. + - Call AccStatsBackward() for each sequence to propagate the derivatives + back to the features that were used to estimate the means and vars. + */ +class GaussianEstimator { + public: + GaussianEstimator(int32 num_classes, int32 feature_dim); + + GaussianEstimator(const GaussianEstimator &other) = default; + + int32 NumClasses() const { return gamma_.Dim(); } + + int32 Dim() const; + + // Accumulate statistics (you can call this multiple times of needed). + // It does: for each t, and for each pair (i, f) in post[t], accumulate stats + // from feats.Row(t) with class i and weight f. + // May not be called after Estimate() is called. + // + // @param [in] feats The input features, of dimension + // num-frames by feature-dimension + // @param [in] post The posteriors, which can be thought of as a + // vector > >. + // Its size() must equal feats.NumRows(). + void AccStats(const MatrixBase &feats, + const SubPosterior &post); + + // You call this once after calling AccStats() one or more times. + // It estimates the model means and variances. + // See the members 'variance_floor' and 'variance_sharing_weight' + // of the options class. + void Estimate(const FmllrEstimatorOptions &opts); + + // Returns true if Estimate() has previously been called, i.e. if + // the means and variances have been computed. + bool IsEstimated() const; + + // Returns the means, in a matrix of dimension num_classes by dim. Must not + // be called if ! IsEstimated(). + const MatrixBase &GetMeans() const { return mu_; } + + // Returns the 's' quantities, which are the scalar factors on the (spherical) + // variances. Must not be called if ! IsEstimated(). The + // variance for class i will actually be s_i I, where s_i is an element of + // this vector. + const VectorBase &GetVars() const { return t_; } + + // You call this to add something the derivatives df/dmeans and df/dvars-- the + // derivatives of the objective function f w.r.t. those quantities. You might + // call this once or several times. Doing this allows you to backprop through + // the estimation of the means and variances, back to the features. This must + // only be called after previously calling Estimate(). This function writes + // to v_bar_ and m_bar_. + void AddToOutputDerivs(const MatrixBase &mean_derivs, + const VectorBase &var_derivs); + + + // This function, which must only be called after AddToOutputDerivs() has been + // called at least once, propagates the derivative back to the features. For + // purposes of this backpropagation, the posteriors are treated as constants. 
+ // @param [in] feats The features, which must be the same + // as you provided to one of the calls to + // AccStats(). dimension is num-frames by + // feature-dimension. + // @param [in] post The posteriors, as provided to AccStats(). + // Its size() must equal feats.NumRows(). + // @param [in,out] feats_deriv The derivative of the objective + // function w.r.t. the input features. + // This function will *add to* feats_deriv, + // so it must have a well-defined value on + // entry. + void AccStatsBackward(const MatrixBase &feats, + const SubPosterior &post, + const MatrixBase *feats_deriv); + + + void Write(std::ostream &os, bool binary) const; + void Read(std::istream &is, bool binary); + + // Adds any statistics in gamma_, m_ and v_ from 'other' to *this. + // Used when summing adaptation-model statistics over multiple + // jobs. Requires that '*this' and 'other' have identical + // structure. + void Add(const GaussianEstimator &other); + + private: + /* + Notes on implementation of GaussianEstimator. + Using Latex notation. + + We are estimating means \mu_i and variance-factors s_i (these + are scales on unit variances). Later we'll apply a kind of + interpolation with the global average variance, controlled + by variance_sharing_weight_, and we'll call the variances that + we finally output t_i. + + We formulate the sufficient statistics as: + the counts \gamma_i, the mean stats m_i and the (scalar) + variance stats v_i: + + \gamma_i = \sum_t \gamma_{t,i} + m_i = \sum_t \gamma_{t,i} x_t + v_i = \sum_t \gamma_{t,i} x_t^T x_t + The estimation procedure is: + \mu_i = \frac{m_i}{\gamma_i}, or 0 if \gamma_i is 0. + s_i = variance_floor if \gamma_i = 0, else: + max(variance_floor, (v_i/\gamma_i - \mu_i^T \mu_i) / dim) + where dim is the feature dimension; and another form more convenient for backprop: + = variance_floor if \gamma_i = 0, else: + max(variance_floor, v_i/(dim * \gamma_i) - m_i^T m_i / (dim * \gamma_i^2)) + + We write \bar{foo} for a derivative of the objective function w.r.t. foo. + We are provided by the user with with \bar{\mu}_i and \bar{s}_i, when they + call SetOutputDerivs(); and we aim to compute \bar{m}_i and \bar{v}_i, which + are the derivs w.r.t. the raw statistics. This is done as follows: + \bar{m}_i = 0 if \gamma_i is 0, otherwise: + \frac{\bar{\mu}_i}{\gamma_i} - (\frac{2\bar{s}_i m_i}{dim \gamma_i^2} + if s_i > variance_floor, else 0) + = or 0 if \gamma_i is 0, otherwise: + \frac{\bar{\mu}_i}{\gamma_i} - (\frac{2\bar{s}_i \mu_i}{dim \gamma_i} + if s_i > variance_floor, else 0) + \bar{v}_i = 0 if \gamma_i is 0 or s_i equals variance_floor, otherwise: + \frac{\bar{s}_i}{dim * \gamma_i} + \bar{x}_t = \sum_i \gamma_{t,i} (\bar{m}_i + 2\bar{v}_i x_t) + + + If 'variance_sharing_weight' != 0.0, then we need to modify the above. + Let the variance-floored version of the variance be t_i. + Write variance_sharing_weight as f (with 0 <= f <= 1), and let + \gamma = \sum_i \gamma_i. + Define the weighted-average variance: + s = \sum_i \frac{\gamma_i}{\gamma} s_i + and the partly-shared output variance is: + t_i = (1-f) s_i + f s. + For the backprop: If the user supplies derivatives \bar{t}_i, then: + \bar{s} = f \sum_i \bar{t}_i + \bar{s}_i = (1-f) \bar{t}_i + \frac{\gamma_i}{\gamma} \bar{s}. + */ + + + // gamma_, of dimension num_classes, contains the raw count statistics \gamma_i. + // It's added to when you call AccStats(). + Vector gamma_; + // m_ is the raw mean statistics (feature times soft-count); it's of dimension + // num_classes by feat_dim. 
+ Matrix m_; + // v_ is the raw variance statistics (inner-product-of-feature times soft-count); + // it's of dimension num_classes. + Vector v_; + + // variance_floor_ and variance_sharing_weight_ are copies of the + // corresponding variables in class FmllrEstimatorOptions; they are set when + // Estimate() is called. They are temporaries, not permanent members. + BaseFloat variance_floor_; + BaseFloat variance_sharing_weight_; + + // mu_ is the estimated means, which is set up when you call Estimate(). + Matrix mu_; + // s_ is the variances, after flooring by variance_floor_ but before + // applying variance_sharing_weight_. + Vector s_; + // t_ is the smoothed or maybe totally averaged-over-all-classes variances, + // derived from t as specified by variance_sharing_weight_. + Vector t_; + + // v_bar_, of dimension num_classes, contains \bar{v}_i. It's only set up + // after you call SetOutputDerivs(). + Vector v_bar_; + // m_bar_, of dimension num_classes by feature_dim, contains \bar{m}_i. + // It's only set up after you call SetOutputDerivs(). + Matrix m_bar_; + + +}; + + + +/** + Class FmllrEstimator encapsulates the whole of the fMLLR computation- for + a single speaker. See + http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf + for a description of what is being implemented here. + + This class is suitable for use in training, where you want to backprop + through the computation; and also in test time (but not for the online + scenario; we may later rewrite a version that's optimized for that, or modify + this class to handle that). + + This class would normally be used as follows: + - Construct an instance of the class (probably for a particular speaker on + a particular minibatch). + + Then, either: + + - Call AccStats() one or more times. + - Call Estimate(). + - Call AdaptFeatures() one or more times to get the output features. + - Do something with those output features that (if you are training) + gives you some kind of objective-function derivative w.r.t. those + features. Then if you are training, do what's below: + - Call AdaptFeaturesBackward() one or more times to get part of the + derivative w.r.t. the input features. Note: the calls to AdaptFeatures() + and AdaptFeaturesBackward() may be interleaved, since the call to + AdaptFeatures() does not modify the object. + - Call EstimateBackward() + - Call AccStatsBackward() one or more times to get the part of the + derivative w.r.t. the input features that comes from the effect + on the transform itself. + - Make use of the calls GetMeanDeriv() and GetVarDeriv() to + account for the effect of the features on the class means and + variances (these will be passed to class GaussianEstimator, + and eventually to the features). + + Or: if there is only one training sequence, you can use the + simplified interface: after calling the constructor, + + - call ForwardCombined() + - call BackwardCombined() + - Make use of the calls GetMeanDeriv() and GetVarDeriv() to + account for the effect of the features on the class means and + variances, with the help of class GaussianEstimator. +*/ +class FmllrEstimator { + public: + /** + Constructor. + @param [in] opts Options class. This class makes a copy. + @param [in] mu Class means, probably as output by class + GaussianEstimator. This class maintains a + reference to this object, so you should ensure + that it exists for the lifetime of this object. + @param [in] s Scaling factors for spherical class + variances, probably as output by class + GaussianEstimator. 
As with mu, we store + a reference to it, so don't destroy or + change it as long as this class instance exists. + */ + FmllrEstimator(const FmllrEstimatorOptions &opts, + const MatrixBase &mu, + const VectorBase &s); + + + /** + Accumulate statistics to estimate the fMLLR transform. + @param [in] feats The feature matrix. A row of it would be called + x_t in the writeup in + http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf. + @param [in] post The posteriors. post.size() must equal feats.NumRows(). + Each element of post is a list of pairs (i, p) where + i is the class label and p is the soft-count. + */ + void AccStats(const MatrixBase &feats, + const SubPosterior &post); + + + /** + Estimate the fMLLR transform parameters A and b. Returns the + objective-function improvement compared with A = I, b = 0, divided by the + total count as returned by TotalCount(). + + You are allowed to call this multiple times (e.g. call AccStats(), call + Estimate(), call AccStats(), call Estimate() again). + */ + BaseFloat Estimate(); + + // Return true if Estimate() has previously been called. + bool IsEstimated() const; + + /// Returns the total count of the posteriors accumulated so far. + BaseFloat TotalCount() { return gamma_.Sum(); } + + /// Return the linear parameter matrix. Adapted features are + /// y_t = A x_t + b. You won't necessarily need to + /// call this, you can use ComputeAdaptedFeatures() intead. + const MatrixBase &GetLinearParams() const { return A_; } + + /// Return the bias term b. + const VectorBase &GetBiasParams() const { return b_; } + + /// Computes the adapted features y_t = A x_t + b. + /// feats (x) and adapted_feats (y) must have the same dimension. Must + /// only be called after Estimate() has been called. + /// 'adapted_feats' may contain NaN's on entry. + void AdaptFeatures(const MatrixBase &feats, + MatrixBase *adapted_feats) const; + + /** + This is the backward pass corresponding to the function AdaptFeatures(). + It propagates back only part of the derivative-- not including the part + that's due to how the transform changes when the features change. It + also accumulates within this class instance the derivative w.r.t. + A and b. You are expected to later call EstimateBackward() and + AccStatsBackward() to propagate the part of the derivative that comes from + the effect on the transform, back to the input features. + + See also AccStatsBackward(). + @param [in] feats The features (x) that were the original input to + AdaptFeatures(). + @param [in] adapted_feats_deriv The derivative \bar{y} w.r.t. the output (y) + that was the result of calling AdaptFeatures(). Must + have the same size as feat. + @param [in,out] feats_deriv The derivative w.r.t. 'feats'; this function + *adds* to it. + */ + void AdaptFeaturesBackward(const MatrixBase &feats, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv); + + /** + This is the backward pass corresponding to Estimate(). You call this after + calling AdaptFeaturesBackward() one or more times (which will accumulate + the derivative w.r.t. A and B). It backpropagates through the core + estimation procedure of fMLLR, in preparation for you calling + AccStatsBackward(). + */ + void EstimateBackward(); + + + // Returns the derivative w.r.t. the class means 'mu' that were supplied to the + // constructor. Must not be called until EstimateBackward() and + // AccStatsBackward() have been called. + const MatrixBase &GetMeanDeriv() const { return mu_bar_; } + // Returns the derivative w.r.t. 
the variance factors 's' that were supplied + // to the constructor. Must not be called until EstimateBackward() and + // AccStatsBackward() have been called. + const VectorBase &GetVarDeriv() const { return s_bar_; } + + /** + This is the backward pass corresponding to AccStats(). You call this after + calling EstimateBackward(). It computes the part of the derivative w.r.t. + 'feats' that comes from the effect on the transform parameters. You will + normally have previously called AdaptFeaturesBackward() on these same + features. + @param [in] feats The features as given to AccStats() + @param [in] post The posteriors as given to AccStats() + @param [in,out] feats_deriv This function *adds* to feats_deriv. + It adds the terms in \bar{x}_t that arise from + the derivative w.r.t. the transform parameters. The + "direct" term \bar{x}_t = A^T \bar{y}_t will have + previously been added by AdaptFeaturesBackward(). + */ + void AccStatsBackward(const MatrixBase &feats, + const SubPosterior &post, + MatrixBase *feats_deriv); + + /** + Combines AccStats(), Estimate() and AdaptFeatures() in one call; + for use when there is only one sequence. Returns the objective-function + improvement (per soft-count). + @param [in] feats The features we're estimating the fMLLR parameters from + @param [in] post The posteriors corresponding to 'feats + @param [out] adapted_feats A matrix the same size as 'feats', to which + the adapted features will be written. May contain + NaNs at entry. + */ + BaseFloat ForwardCombined(const MatrixBase &feats, + const SubPosterior &post, + MatrixBase *adapted_feats); + /** + Combines AdaptFeaturesBackward(), EstimateBackward(), and + AccStatsBackward(); for use when there is only one sequence. + Note: 'feats_deriv' is *added* to so must be defined at entry. + */ + void BackwardCombined(const MatrixBase &feats, + const SubPosterior &post, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv); + + ~FmllrEstimator(); + private: + + + ///////////// Fixed quantities passed in in the constructor /////////// + + // The options. + FmllrEstimatorOptions opts_; + // The means. A reference to an object owned elsewhere. + const MatrixBase &mu_; + // The variance factors (the variances are s_(i) times I). A reference to an + // object owned elsewhere. + const VectorBase &s_; + + ///////////// Quantities that are accumulated in AccStats() /////////// + + // Counts per class; dimension is num_classes. Added to when AccStats() is + // called. gamma_(i) corresponds to \gamma_i in the write up; it's + // \gamma_i = \sum_t gamma_{t,i} + Vector gamma_; + + // This contains one term in G_, namely: + // (\sum_t \hat{\gamma}_t x_t x_t^T ) + Matrix raw_G_; + + // This is of dimension num_classes by dim (same as mu_). It contains + // the weighted sums of the input data, for each class: + // z_i = \sum_t \gamma_{t,i} x_i. + Matrix z_; + + + /////////// Quantities that are computed when Estimate() is called //////// + + + // This contains + // G = (\sum_t \hat{\gamma}_t x_t x_t^T ) - \hat{\gamma} n n^T. + // It is computed as raw_G_ - \hat{\gamma} n n^T. + // We use two separate variables to make it easier to call Estimate() + // more than once without things getting confused. + Matrix G_; + + // gamma_hat_tot_ is the total of gamma_(i) / s_(i), i.e. + // \hat{\gamma} = \sum_i gamma_i / s_i. 
+ BaseFloat gamma_hat_tot_; + + // After Estimate() is called, this will be the quantity: + // n = \frac{1}{\hat{\gamma}} \sum_i (1/s_i) z_i + Vector n_; + + // The weighted-average of the means: + // m = \frac{1}{\hat{\gamma}} \sum_i (\gamma_i/s_i) \mu_i + Vector m_; + + // This contains + // K = (\sum_i (1/s_i) \mu_i z_i^T) - \hat{\gamma} m n^T + Matrix K_; + + // The parameter matrix + Matrix A_; + // The offset term + Vector b_; + // The object we use to estimate A and b, and to backprop through that + // process. + CoreFmllrEstimator *estimator_; + + ////////// Quantities that are accumulated in AdaptFeaturesBackward() //////// + + // The derivative w.r.t. A. This is set when AdaptFeaturesBackward() is called, + // to: + // \bar{A} = \sum_t \bar{y}_t x_t^T + // and then when EstimateBackward() is called, we add the term from the estimation + // of b, which is: + // \bar{A} -= \bar{b} n^T + Matrix A_bar_; + + // The derivative w.r.t. b. This is set when AdaptFeaturesBackward() is called, + // to: \bar{b} = \sum_t \bar{y}_t. + Vector b_bar_; + + ////////// Quantities that are computed in EstimateBackward() //////// + + // The derivative w.r.t. G; computed by 'estimator_' + Matrix G_bar_; + // The derivative w.r.t. K; computed by 'estimator_'. + Matrix K_bar_; + + // The derivative w.r.t. n: + // \bar{n} = -A^T \bar{b} - 2\hat{\gamma} \bar{G} n - \hat{\gamma} \bar{K}^T m + Vector n_bar_; + + // The derivative w.r.t. m: + // \bar{m} = \bar{b} - \hat{\gamma} \bar{K} n + Vector m_bar_; + + // The derivative w.r.t the z_i quantities. The i'th row is: + // \bar{z}_i = (1/s_i) \bar{K}^T \mu_i + 1/(s_i \hat{\gamma}) \bar{n} + Matrix z_bar_; + + // gamma_hat_tot_bar_ is \bar{\hat{\gamma}} in the writeup; + // it's: + // \bar{\hat{\gamma}} = - n^T \bar{G} n - m^t \bar{K} n + // - \frac{1}{\hat{\gamma}} (n^T \bar{n} + m^T \bar{m}) + BaseFloat gamma_hat_tot_bar_; + + // The i'th row contains the derivative w.r.t mu_i. + // This is: + // \bar{\mu}_i = (1/s_i) \bar{K} z_i + (\gamma_i / (s_i \hat{\gamma})) \bar{m} + Matrix mu_bar_; + + //////////// Quantities that are written to in AccStatsBackward() /////////// + + // s_bar_(i) contains the derivative w.r.t the variance factor s_i, + // which we write in the writeup as \bar{s}_i. + // It is: + // \bar{s}_i = -(1 / s_i^2) * ( + // \mu_i^T \bar{K} z_i + (1 / \hat{\gamma}) \z_i^T \bar{n} + // + (\gamma_i / \hat{\gamma}) \mu_i^T \bar{m} + \gamma_i \bar{\hat{\gamma}} + // + \sum_t \gamma_{t,i} \bar{\hat{\gamma}}_t ) + // where + // \bar{\hat{\gamma}}_t = x_t^T \bar{G} x_t . + // Note: we add all but the first terms during Estimate(), and only the one + // with \sum_t in it in AccStatsBackward. + Vector s_bar_; + + // There is another quantity that's updated by AccStatsBackward(), which is + // \bar{x}_t, the derivative w.r.t. x_t. AccStatsBackward() does not include + // the term \bar{x}_t = A^T \bar{y}_t. But it does include the rest of the + // terms, doing: + // \bar{x}_t += 2 \hat{\gamma}_t \bar{G} x_t + // + \sum_i \gamma_{t,i} \bar{z}_i + // There is no member variable for this; it's a temporary. + +}; + + +/* MeanOnlyTransformEstimator is like a highly simplified version of + FmllrEstimator, where the transform is just y_t = x_t + b. + There are class means but the variances are assumed to be all + unit. (This is equivalent to assuming that they are all identical + with an arbitrary value; the value doesn't actually affect the + learned offset so we assume they are unit). 
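   (Concretely: if m is the posterior-weighted average of the class means and n is
   the posterior-weighted average of the input features x_t, then the maximum-likelihood
   offset under unit variances is b = m - n, and the adapted features are
   y_t = x_t + b.  This is a sketch inferred from the description here, not a
   statement taken from the writeup.)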
+ + The equations involved are like an extremly simplified version + of what we do in class FmllrEstimator, with m as a weighted + average of the means and n as a weighted average of the input + features. The weights come from the posterior information you + supply. + + This object has a similar interface to class FmllrEstimator. + + This class would normally be used as follows: + - Construct an instance of the class (probably for a particular speaker on + a particular minibatch). + + Then, either: + + - Call AccStats() one or more times. + - Call Estimate(). + - Call AdaptFeatures() one or more times to get the output features. + - Do something with those output features that (if you are training) + gives you some kind of objective-function derivative w.r.t. those + features. Then if you are training, do what's below: + - Call AdaptFeaturesBackward() one or more times to get part of the + derivative w.r.t. the input features. Note: the calls to AdaptFeatures() + and AdaptFeaturesBackward() may be interleaved, since the call to + AdaptFeatures() does not modify the object. + - Call EstimateBackward() + - Call AccStatsBackward() one or more times to get the part of the + derivative w.r.t. the input features that comes from the effect + on the transform itself. + - Make use of the calls GetMeanDeriv() and GetVarDeriv() to + account for the effect of the features on the class means and + variances (these will be passed to class GaussianEstimator, + and eventually to the features). + + Or: if there is only one training sequence, you can use the + simplified interface: after calling the constructor, + + - call ForwardCombined() + - call BackwardCombined() + - Make use of the call GetMeanDeriv() to account for the effect of the + features on the class means and variances, with the help of class + GaussianEstimator. + */ +class MeanOnlyTransformEstimator { + public: + /** + Constructor. + @param [in] mu Class means, probably as output by class + GaussianEstimator. This class maintains a + reference to this object, so you should ensure + that it exists for the lifetime of this object. + You can ignore the variances from class + GaussianEstimator; they are not used. + */ + MeanOnlyTransformEstimator(const MatrixBase &mu); + + /** + Accumulate statistics to estimate the fMLLR transform. + @param [in] feats The feature matrix. A row of it would be called + x_t in the writeup in + http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf. + @param [in] post The posteriors. post.size() must equal feats.NumRows(). + Each element of post is a list of pairs (i, p) where + i is the class label and p is the soft-count. + */ + void AccStats(const MatrixBase &feats, + const SubPosterior &post); + + /** + Estimate the parameter (the offset). Requires the total count to be + nonzero. You are allowed to call this multiple times (e.g. call + AccStats(), call Estimate(), call AccStats(), call Estimate() again). + */ + void Estimate(); + + // Returns true if Estimate() has previously been called. + bool IsEstimated() const; + + BaseFloat TotalCount() { return gamma_.Sum(); } + + /// Return the bias term b. + const VectorBase &GetOffset() const { return offset_; } + + /// Computes the adapted features y_t = x_t + b. + /// feats (x) and adapted_feats (y) must have the same dimension. Must + /// only be called after Estimate() has been called. + /// 'adapted_feats' may contain NaN's on entry. 
+ void AdaptFeatures(const MatrixBase &feats, + MatrixBase *adapted_feats) const; + + + /** + This is the backward pass corresponding to the function AdaptFeatures(). + It propagates back only part of the derivative-- not including the part + that's due to how the offset changes when the features change. It + also accumulates within this class instance the derivative w.r.t. the + offset. + See also AccStatsBackward(). + + @param [in] feats The features (x) that were the original input to + AdaptFeatures(). + @param [in] adapted_feats_deriv The derivative \bar{y} w.r.t. the output (y) + that was the result of calling AdaptFeatures(). Must + have the same size as feat. + @param [in,out] feats_deriv The derivative w.r.t. 'feats'; this function + *adds* to it. + */ + void AdaptFeaturesBackward(const MatrixBase &feats, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv); + + /** + Backward pass corresponding to Estimate(). Should be called after + you've called AdaptFeatures() on all utterances. Computes the + derivatives w.r.t. the mean. */ + void EstimateBackward(); + + /** + Returns the derivative w.r.t. the class means 'mu' that were supplied to + the constructor. Must not be called until EstimateBackward() has been + called. */ + const MatrixBase &GetMeanDeriv() const { return mu_bar_; } + + /** + This is the backward pass corresponding to AccStats(). You call this after + calling EstimateBackward(). It computes the part of the derivative w.r.t. + 'feats' that comes from the effect on the transform parameters. You will + normally have previously called AdaptFeaturesBackward() on these same + features. + @param [in] feats The features as given to AccStats() + @param [in,out] feats_deriv This function *adds* to feats_deriv. + It adds the terms in \bar{x}_t that arise from + the derivative w.r.t. the offset b. + */ + void AccStatsBackward(const MatrixBase &feats, + const SubPosterior &post, + MatrixBase *feats_deriv); + + + /** + Combines AccStats(), Estimate() and AdaptFeatures() in one call; + for use when there is only one sequence. + @param [in] feats The features we're estimating the fMLLR parameters from + @param [in] post The posteriors corresponding to 'feats + @param [out] adapted_feats A matrix the same size as 'feats', to which + the adapted features will be written. May contain + NaNs at entry. + */ + void ForwardCombined(const MatrixBase &feats, + const SubPosterior &post, + MatrixBase *adapted_feats); + /** + Combines AdaptFeaturesBackward(), EstimateBackward(), and + AccStatsBackward(); for use when there is only one sequence. + Note: 'feats_deriv' is *added* to so must be defined at entry. + */ + void BackwardCombined(const MatrixBase &feats, + const SubPosterior &post, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv); + + private: + // The means, one row per class. A reference to an object owned elsewhere. + const MatrixBase &mu_; + + // The counts per class + Vector gamma_; + // The total of the input features, weighted by total posterior. + Vector input_sum_; + + // The offset. + Vector offset_; + + // The total of the derivative w.r.t. the output. + Vector output_deriv_sum_; + + // The derivative w.r.t. each row of the input features-- i.e. the part of the + // derivative that comes from the effect via the offset. This equals + // (-1 / total-count) * output_deriv_sum_. + Vector x_deriv_; + + // The derivative w.r.t. mu: + // (1/gamma_tot) gamma_ . output_deriv_sum_^T. 
+ Matrix mu_bar_; +}; + + +} // namespace differentiable_transform +} // namespace kaldi + +#endif // KALDI_TRANSFORM_DIFFERENTIABLE_FMLLR_H_ diff --git a/src/adapt/differentiable-transform-itf.cc b/src/adapt/differentiable-transform-itf.cc new file mode 100644 index 00000000000..e9c490c943d --- /dev/null +++ b/src/adapt/differentiable-transform-itf.cc @@ -0,0 +1,198 @@ +// adapt/differentiable-transform-itf.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "adapt/differentiable-transform-itf.h" +#include "adapt/generic-transform.h" +#include "adapt/differentiable-transform.h" + +namespace kaldi { +namespace differentiable_transform { + + +// static +DifferentiableTransform* DifferentiableTransform::ReadNew( + std::istream &is, bool binary) { + + std::string token; + ReadToken(is, binary, &token); // e.g. "" + token.erase(0, 1); // erase "<". + token.erase(token.length()-1); // erase ">". + DifferentiableTransform *ans = NewTransformOfType(token); + if (!ans) + KALDI_ERR << "Unknown DifferentialbeTransform type " << token + << " (maybe you should recompile?)"; + ans->Read(is, binary); + return ans; +} + +// static +DifferentiableTransform* DifferentiableTransform::NewTransformOfType( + const std::string &type) { + if (type.size() > 2 && type[type.size() - 1] == '>') { + std::string new_type(type); + if (new_type[0] == '<') + new_type.erase(0, 1); // erase "<" + new_type.erase(new_type.size() - 1); // erase ">". + return NewTransformOfType(new_type); + } + + if (type == "NoOpTransform") { + return new NoOpTransform(); + } else if (type == "FmllrTransform") { + return new FmllrTransform(); + } else if (type == "MeanOnlyTransform") { + return new MeanOnlyTransform(); + } else if (type == "SequenceTransform") { + return new SequenceTransform(); + } else if (type == "AppendTransform") { + return new AppendTransform(); + } else { + // Calling code will throw an error. + return NULL; + } +} + + +void DifferentiableTransform::TestingForwardBatch( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const { + int32 dim = input.NumCols(), + num_frames = input.NumRows(), + chunks_per_spk = num_chunks / num_spk, + frames_per_chunk = num_frames / num_chunks; + + // Just copy to CPU for now. 
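  // Note on the layout (see the documentation of TrainingForward() in
  // differentiable-transform-itf.h): the 't' index has the larger stride, so row
  // (t * num_chunks + chunk) of 'input' holds frame t of chunk 'chunk'.  A single chunk
  // therefore occupies rows {chunk, chunk + num_chunks, chunk + 2*num_chunks, ...},
  // which is why the SubMatrix objects below start at RowData(chunk) and use a row
  // stride of Stride() * num_chunks, and why SubPosterior is given 'chunk' as its
  // offset and 'num_chunks' as its stride.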
+ Matrix input_cpu(input); + Matrix output_cpu(num_frames, dim, kUndefined); + + for (int32 s = 0; s < num_spk; s++) { + SpeakerStatsItf *stats = this->GetEmptySpeakerStats(); + for (int32 chunk = s * chunks_per_spk; + chunk < (s + 1) * chunks_per_spk; chunk++) { + SubMatrix this_input(input_cpu.RowData(chunk), + frames_per_chunk, dim, + input_cpu.Stride() * num_chunks); + SubPosterior this_posteriors(posteriors, + chunk, // offset + frames_per_chunk, // num_frames + num_chunks); // stride + this->TestingAccumulate(this_input, this_posteriors, stats); + } + stats->Estimate(); + for (int32 chunk = s * chunks_per_spk; + chunk < (s + 1) * chunks_per_spk; chunk++) { + SubMatrix this_input(input_cpu.RowData(chunk), + frames_per_chunk, dim, + input_cpu.Stride() * num_chunks), + this_output(output_cpu.RowData(chunk), + frames_per_chunk, dim, + output_cpu.Stride() * num_chunks); + /* + // The following testing code was temporarily present to test + // GetTransformAsMatrix().. + if (GetVerboseLevel() >= 3 && RandInt(0, 1) == 0) { + Matrix transform(dim, dim + 1, kUndefined); + this->GetTransformAsMatrix(*stats, &transform); + SubMatrix linear_part(transform, 0, dim, 0, dim); + Vector offset(dim); + offset.CopyColFromMat(transform, dim); + this_output.CopyRowsFromVec(offset); + this_output.AddMatMat(1.0, this_input, kNoTrans, + linear_part, kTrans, 1.0); + } else */ + this->TestingForward(this_input, *stats, &this_output); + } + delete stats; + } + output->CopyFromMat(output_cpu); +} + +// static +DifferentiableTransform* DifferentiableTransform::ReadFromConfig( + std::istream &is, int32 num_classes) { + std::vector lines; + ReadConfigLines(is, &lines); + std::vector config_lines; + ParseConfigLines(lines, &config_lines); + if (config_lines.empty()) + KALDI_ERR << "Config file is empty."; + std::string transform_type = config_lines[0].FirstToken(); + DifferentiableTransform *transform = NewTransformOfType(transform_type); + if (transform == NULL) + KALDI_ERR << "Parsing config file, could not find transform of type " + << transform_type; + int32 pos = transform->InitFromConfig(0, &config_lines); + if (pos != static_cast(config_lines.size())) + KALDI_ERR << "Found junk at end of config file, starting with line " + << pos << ": " << config_lines[pos].WholeLine(); + KALDI_ASSERT(num_classes > 0); + transform->SetNumClasses(num_classes); + return transform; +} + +int32 DifferentiableTransformMapped::NumPdfs() const { + if (pdf_map.empty()) + return transform->NumClasses(); + else + return static_cast(pdf_map.size()); +} + +void DifferentiableTransformMapped::Read(std::istream &is, bool binary) { + if (transform) + delete transform; + transform = DifferentiableTransform::ReadNew(is, binary); + ReadIntegerVector(is, binary, &pdf_map); + Check(); +} + +void DifferentiableTransformMapped::Write(std::ostream &os, bool binary) const { + Check(); + transform->Write(os, binary); + WriteIntegerVector(os, binary, pdf_map); +} + + +void DifferentiableTransformMapped::Check() const { + KALDI_ASSERT(transform != NULL && + (pdf_map.empty() || + 1 + *std::max_element(pdf_map.begin(), pdf_map.end()) == + transform->NumClasses())); +} + +std::string DifferentiableTransformMapped::Info() const { + KALDI_ASSERT(transform != NULL); + std::ostringstream os; + os << "dim=" << transform->Dim() << std::endl + << "num-classes=" << transform->NumClasses() << std::endl + << "num-pdfs=" << NumPdfs() << std::endl; + return os.str(); +} + +DifferentiableTransformMapped::DifferentiableTransformMapped( + const 
DifferentiableTransformMapped &other): pdf_map(other.pdf_map) { + if (other.transform == NULL) transform = NULL; + else transform = other.transform->Copy(); +} + + +} // namespace differentiable_transform +} // namespace kaldi diff --git a/src/adapt/differentiable-transform-itf.h b/src/adapt/differentiable-transform-itf.h new file mode 100644 index 00000000000..e2842cf6af0 --- /dev/null +++ b/src/adapt/differentiable-transform-itf.h @@ -0,0 +1,444 @@ +// adapt/differentiable-transform-itf.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_ITF_H_ +#define KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_ITF_H_ + +#include +#include "base/kaldi-common.h" +#include "matrix/kaldi-matrix.h" +#include "cudamatrix/cu-matrix.h" +#include "util/text-utils.h" +#include "hmm/posterior.h" + + +namespace kaldi { +namespace differentiable_transform { + +class MinibatchInfoItf { + public: + virtual ~MinibatchInfoItf() { } +}; + + +class SpeakerStatsItf { + public: + // Does any estimation that is required-- you call this after accumulating + // stats and before calling TestingForward(). You'll normally want to + // override this, unless your object requires no estimation. + virtual void Estimate() { } + + virtual ~SpeakerStatsItf() { } +}; + + + +/** + This class is for speaker-dependent feature-space transformations -- + principally various varieties of fMLLR, including mean-only, diagonal and + block-diagonal versions -- which are intended for placement in the bottleneck + of a neural net. So code-wise, we'd have: bottom neural net, then transform, + then top neural net. The transform is designed to be differentiable, i.e. it + can be used during training to propagate derivatives from the top neural net + down to the bottom neural net. The reason this is non-trivial (i.e. why it's + not just a matrix multiplication) is that the value of the transform itself + depends on the features, and also on the speaker-independent statistics for + each class (i.e. the mean and variance), which also depend on the features + sicne we estimate them from the same minibatch. + You can view this as an extension of things like BatchNorm, except the + interface is more complicated because there is a dependence on the per-frame + class labels. + + The class labels we'll use here will probably be derived from some kind of + minimal tree, with hundreds instead of thousands of states. Part of the + reason for using a smaller number of states is that, to make the thing + properly differentiable during training, we need to use a small enough number + of states that we can obtain a reasonable estimate for the mean and (spherical) + variance of a Gaussian for each one in training time. 
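   To make the workflow described below concrete, here is a rough sketch of one
   training step and of test-time adaptation for one speaker.  The variable names,
   the surrounding nnet3 code, and the <BaseFloat> template arguments are
   illustrative assumptions, not part of this interface:

     // Training time, once per minibatch (e.g. 32 speakers x 4 chunks each):
     int32 num_spk = 32, num_chunks = 128;
     CuMatrix<BaseFloat> adapted(feats.NumRows(), feats.NumCols(), kUndefined),
         adapted_deriv(feats.NumRows(), feats.NumCols()),   // filled by the top network's backprop
         feats_deriv(feats.NumRows(), feats.NumCols());     // zero; TrainingBackward() adds to it.
     MinibatchInfoItf *info =
         transform->TrainingForward(feats, num_chunks, num_spk, post, &adapted);
     // ... run the 'top' network on 'adapted', backprop to obtain 'adapted_deriv' ...
     transform->TrainingBackward(feats, adapted_deriv, num_chunks, num_spk,
                                 post, info, &feats_deriv);  // takes ownership of 'info'.

     // Test time, once per speaker (features and posteriors on CPU):
     SpeakerStatsItf *stats = transform->GetEmptySpeakerStats();
     transform->TestingAccumulate(spk_feats,
                                  SubPosterior(spk_post, 0, spk_feats.NumRows(), 1),
                                  stats);
     stats->Estimate();
     Matrix<BaseFloat> adapted_spk(spk_feats.NumRows(), spk_feats.NumCols(), kUndefined);
     transform->TestingForward(spk_feats, *stats, &adapted_spk);
     delete stats;
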
As you can see in
+   http://isl.anthropomatik.kit.edu/pdf/Nguyen2017.pdf, it's generally better
+   for this kind of thing to use "simple target models" for adaptation rather than
+   very complex models.
+
+   Note: for training utterances we'll generally get the class labels used for
+   adaptation in a supervised manner, either by aligning a previous system like
+   a GMM system, or-- more likely-- from the (soft) posteriors of the
+   numerator graphs.  In test time, we'll usually be getting these class labels
+   from some kind of unsupervised process.
+
+   Because we tend to train neural nets on fairly small fixed-size chunks
+   (e.g. 1.5 seconds), and transforms like fMLLR don't tend to work very well
+   until you have about 5 seconds of data, we will usually be arranging those
+   chunks into groups where all members of the group come from the same
+   speaker.  So, for instance, instead of 128 totally separate chunks, we might
+   have 4 chunks per speaker and 32 speakers.
+
+   The basic pattern of usage of class DifferentiableTransform is this:
+
+    - Initialize the object prior to training, e.g. with InitFromConfig().
+
+    - Use this object to jointly train the 'bottom' (feature-extracting) and
+      'top' (ASR) network.  This involves functions TrainingForward() and
+      TrainingBackward() of this object; the posteriors used for that might be
+      dumped with the 'egs' (e.g. come from a GMM system), or might be derived
+      from the alignment of the numerator lattices in chain training.  Any
+      class means that must be estimated would be estimated on each minibatch
+      (we'll try to keep the minibatches as large as possible, and may use
+      tricks like using bigger minibatch sizes for the bottom
+      (feature-extracting) network and smaller ones for the top one, to save
+      memory).  At this stage, this object will most likely only contain
+      configuration information and not any kind of data-dependent statistics.
+
+    - Use some reasonable-sized subset of training data to accumulate more
+      reliable statistics for the target model using Accumulate() followed
+      by Estimate().  If NumFinalIterations() is more than one you may need
+      to do this in a short loop.
+
+    - In test time, for each speaker you'll:
+      - call GetEmptySpeakerStats() to get an object to store adaptation statistics
+        for your speaker.
+      - Obtain some class-level posteriors somehow (could come from an initial
+        decoding pass on all the data, or from the final decoding pass on the
+        part of the data you've seen up till now).  Use these to call
+        TestingAccumulate() to accumulate speaker stats.
+      - Call TestingForward() with the speaker-stats object to get
+        adapted features.
+
+
+ */
+class DifferentiableTransform {
+ public:
+
+  /// Return the dimension of the features this operates on.
+  virtual int32 Dim() const = 0;
+
+  /// Return the number of classes in the model used for adaptation.  These
+  /// will probably correspond to the leaves of a small tree, so they would
+  /// be pdf-ids.  This model only keeps track of the number of classes,
+  /// it does not contain any information about what they mean.  The
+  /// integers in the objects of type Posterior provided to this class
+  /// are expected to contain numbers from 0 to NumClasses() - 1.
+  int32 NumClasses() const { return num_classes_; }
+
+
+  /// This can be used to change the number of classes.  It would normally be
+  /// used, if at all, after the model is trained and prior to calling
+  /// Accumulate(), in case you want to use a more detailed model (e.g.
the + /// normal-size tree instead of the small one that we use during training). + /// Child classes may want to override this, in case they need to do + /// something more than just set this variable. + virtual void SetNumClasses(int32 num_classes) { num_classes_ = num_classes; } + + /** + This is the function you call in training time, for the forward + pass; it adapts the features. By "training time" here, we + assume you are training the 'bottom' neural net, that produces + the features in 'input'; if you were not training it, it would + be the same as test time as far as this function is concerned. + + @param [in] input The original, un-adapted features; these + will typically be output by a neural net, the 'bottom' net in our + terminology. This will correspond to a whole minibatch, + consisting of multiple speakers and multiple sequences (chunks) + per speaker. Caution: in the input and + output features, and the posteriors, the 't' has the larger + stride than the minibatch-index 'n', so the order is: + first frame of all sequences; then the second frame of + all sequences; and so on. This is the default order in + nnet3; see operator < of nnet3::Index. + @param [in] num_chunks The number of individual sequences + (e.g., chunks of speech) represented in 'input'. + input.NumRows() will equal num_sequences times the number + of time frames. + @param [in] num_spk The number of speakers. Must be greater than one, and + must divide num_chunks. The number of chunks per speaker + must be the same for all speakers (it will equal num_chunks / num_spk), + and the chunks for a speaker must be consecutively numbered. + @param [in] posteriors (note: this is a vector of vector of + pair). This provides, in 'soft-count' + form, the class supervision information that is used for the + adaptation. posteriors.size() will be equal to input.NumRows(), + and the ordering of its elements is the same as the ordering + of the rows of input (i.e. the 't' has the larger stride). + There is no assumption that the posteriors sum to one; + this allows you to do things like silence weighting. But + the posteriors are expected to be nonnegative. + @param [out] output The adapted output. This matrix should have the + same dimensions as 'input'. It does not have to be free of + NaNs when you call this function. + @return This function returns either NULL or an object of type + DifferentiableTransform*, which is expected to later be given + to the function TrainingBackward(). It will store + any information that needs to be remembered for the backward + phase. + */ + virtual MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const = 0; + + /** + This does the backpropagation, during the training pass. + + @param [in] input The original input (pre-transform) features that + were given to TrainingForward(). + @param [in] output_deriv The derivative of the objective function + (that we are backpropagating) w.r.t. the output. + @param [in] num_chunks,num_spk,posteriors + See TrainingForward() for information + about these arguments; they should be the same + values. + @param [in] minibatch_info The pointer returned by the corresponding + call to TrainingForward() (may be NULL). This function + takes ownership of the pointer. If for some reason the + backward pass was not done, the caller will likely + want to delete it themselves. + @param [in,out] input_deriv The derivative at the input, i.e. 
+ dF/d(input), where F is the function we are + evaluating. Must have the same dimension as + 'input'. The derivative is *added* to here. + This is useful because generally we will also + be training (perhaps with less weight) on + the unadapted features, in order to prevent them + from deviating too far from the adapted ones + and to allow the same model to be used for the + first pass. + */ + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const = 0; + + + /** + Returns the number of times you have to (call Accumulate() on a subset + of data, then call Estimate()) + */ + virtual int32 NumFinalIterations() = 0; + + /** + This will typically be called sequentially, minibatch by minibatch, + for a subset of training data, after training the neural nets, + followed by a call to Estimate(). Accumulate() stores statistics + that are used by Estimate(). This process is analogous to + computing the final stats in BatchNorm, in preparation for testing. + In practice it will be doing things like computing per-class means + and variances. + + @param [in] final_iter An iteration number in the range + [0, NumFinalIterations()]. In many cases there will + be only one iteration so this will just be zero. + + The input parameters are the same as the same-named parameters to + TrainingForward(); please refer to the documentation there. + */ + virtual void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) = 0; + + // Adds any stats accumulated via Accumulate() that are present in 'other' to + // 'this'. Used when summing adaptation-model statistics across multiple + // jobs. + virtual void Add(const DifferentiableTransform &other) = 0; + + // To be called after repeated calls to Accumulate(), does any estimation that + // is required in training time (normally per-speaker means and possibly + // variances. + // @param [in] final_iter An iteration number in the range + // [0, NumFinalIterations()]. In many cases there will + // be only one iteration so this will just be zero. + virtual void Estimate(int32 final_iter) = 0; + + // Returns an object representing sufficient statistics for estimating a + // speaker-dependent transform. This object will initially have zero + // counts in its statistics. It will represent the stats for a single + // speaker. + virtual SpeakerStatsItf *GetEmptySpeakerStats() const = 0; + + + // Accumulate statistics for a segment of test data, storing them in the + // object 'speaker_stats'. There is no assumption that the soft-counts in + // 'posteriors' are positive; this allows you to change your mind about the + // traceback, in test-time, by subtracting the stats that you no longer want + // to use. + virtual void TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const = 0; + + + // Applies the transformation implied by the statistics in 'speaker_stats' to + // 'input', storing in the result in 'output'. You must have done any estimation + // procedure that is required first, by calling Estimate() on the speaker-stats + // object. 'output' may contain NaN's at entry. 
+ virtual void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const = 0; + + + // This function outputs the speaker-specific transformation in a matrix form + // with an offset, i.e., a matrix of dimension Dim() by Dim() + 1 where + // the last column represents the offset term (the same way Kaldi represents + // LDA and fMLLR transforms as matrices. + // The 'speaker_stats' object must have had Estimate() called on it. + // 'transform' must be of dimension Dim() by Dim() + 1; it may contain + // NaN's at entry. + virtual void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const = 0; + + + // TestingForwardBatch() combines GetEmptySpeakerStats(), TestingAccumulate() and + // TestingForward(). It has a default implementation. It is a convenience + // function that may be useful during training under some circumstances, e.g. + // when you want to train only the top network. + virtual void TestingForwardBatch( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const; + + // Copies transform (deep copy). + virtual DifferentiableTransform* Copy() const = 0; + + // Return the type of this transform. E.g. "NoOpTransform". + virtual std::string Type() const = 0; + + /* + Initialize this object from the config line at position 'cur_pos' of the + vector 'config_lines'. This function may end up reading more lines than + one, if this is a transform type that contains other transforms. + + @param [in] cur_pos The starting position in config_lines; required + to be in the range [0, config_lines->size() - 1]. + The Type() of this object must match the first token + (function FirstToken()) of that ConfigLine. + @param [in,out] config_lines Config lines to be read. It's non-const + because the process of reading them has effects on + the lines themselves (the ConfigLine object keeps + track of which configuration values have been read). + @return Returns the next position to be read. Will be in the range + [cur_pos + 1, config_lines->size()]; if it's equal to + config_lines->size(), it means we're done. + */ + virtual int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) = 0; + + // Returns a new transform of the given type e.g. "NoOpTransform" + // or NULL if no such component type exists. If angle brackets are + // present, e.g. "", this function will detect and + // remove them. + static DifferentiableTransform *NewTransformOfType(const std::string &type); + + // Reads a differentiable transform from a config file (this function parses + // the file and reads a single DifferentiableTransform object from it). Note: + // since DifferentiableTransform objects can contain others, the file may + // contain many lines. Throws exception if it did not succeed-- including + // if the config file had junk at the end that was not parsed. + static DifferentiableTransform *ReadFromConfig(std::istream &is, + int32 num_classes); + + + + // Write transform to stream + virtual void Write(std::ostream &os, bool binary) const = 0; + + // Reads transform from stream (normally you would previously have created + // the transform object of the correct type using ReadNew(). + virtual void Read(std::istream &is, bool binary) = 0; + + // Read transform from stream (works out its type). Dies on error. 
+ // This will be used when reading in objects that have been written with + // the Write() function, since you won't know the type of the object + // beforehand. + static DifferentiableTransform* ReadNew(std::istream &is, bool binary); + + DifferentiableTransform(): num_classes_(-1) { } + + virtual ~DifferentiableTransform() { } + protected: + DifferentiableTransform(const DifferentiableTransform &other): + num_classes_(other.num_classes_) { } + + int32 num_classes_; +}; + + +/** + struct DifferentiableTransformMapped is just a holder of an object of type + DifferentiableTransform and a vector representing a map from + pdf-ids to classes. + + This map (if present) will be obtained from the binary build-tree-two-level, + and will map from tree leaves to a smaller number of classes (e.g. 200), so + that we can reasonably estimate the class means from a single minibatch + during training. The contents of 'pdf_map' should be in the range [0, + transform->NumClases() - 1]. + + */ +struct DifferentiableTransformMapped { + DifferentiableTransform *transform; + std::vector pdf_map; + + // This function returns pdf_map.size() if pdf_map is nonempty; otherwise + // it returns transform->NumClasses(). + int32 NumPdfs() const; + + void Read(std::istream &is, bool binary); + + void Write(std::ostream &os, bool binary) const; + + // Returns a string something like: + // dim=256 + // num-classes=200 + // num-pdfs=6391 + // ... in future we will likely add more information, but for now you can get it by + // copying to text form and looking at it directly. + // the "num-classes" is transform->NumClasses(), and "num-pdfs" is + // pdf_map.size() if pdf_map is nonempty; else, transform->NumClasses(). + std::string Info() const; + + // Check that the dimensions are consistent, i.e. pdf_map.empty() or + // transform->NumClasses() == max-element-in-pdf_map + 1. + void Check() const; + + DifferentiableTransformMapped(): transform(NULL) {} + + ~DifferentiableTransformMapped() { delete transform; } + + // Copy constructor + DifferentiableTransformMapped(const DifferentiableTransformMapped &other); +}; + + +} // namespace differentiable_transform +} // namespace kaldi + +#endif // KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ diff --git a/src/adapt/differentiable-transform-test.cc b/src/adapt/differentiable-transform-test.cc new file mode 100644 index 00000000000..8ad9ee7dcfa --- /dev/null +++ b/src/adapt/differentiable-transform-test.cc @@ -0,0 +1,281 @@ +// adapt/differentiable-transform-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "adapt/differentiable-transform.h" +#include "matrix/sp-matrix.h" + +namespace kaldi { +namespace differentiable_transform { + +// This function writes a random configuration file of dimension +// 'dim' (or a random dimension if dim == -1) to 'os'. 
+void WriteRandomConfigOfDim(std::ostream &os, int32 dim) { + // nonrandom_dim is a randomly chosen dimension if dim == -1, + // else it's dim. + int32 actual_dim = (dim == -1 ? RandInt(10, 20) : dim); + int32 i, num_transforms = RandInt(1, 3); + + while (true) { + // we loop here in case we hit a case we don't want to handle. + // We give more cases to the non-recursive transforms to ensure + // the expected size of the config file is finite. + switch(RandInt(0, 7)) { + case 0: + os << "NoOpTransform dim=" << actual_dim << "\n"; + return; + case 1: case 2: case 3: + os << "FmllrTransform dim=" << actual_dim << " smoothing-count=" + << 100.0 * RandInt(0, 2) << "\n"; + return; + case 4: case 5: + os << "MeanOnlyTransform dim=" << actual_dim << "\n"; + return; + case 6: + if (dim != -1) // complicated to ensure a given dim for AppendTransform. + continue; + os << "AppendTransform num-transforms=" << num_transforms << "\n"; + for (i = 0; i < num_transforms; i++) + WriteRandomConfigOfDim(os, -1); + return; + case 7: + os << "SequenceTransform num-transforms=" << num_transforms << "\n"; + for (i = 0; i < num_transforms; i++) + WriteRandomConfigOfDim(os, actual_dim); + return; + } + } + +} + +// This function writes a random configuration file to 'os'. +void WriteRandomConfigFile(std::ostream &os) { + WriteRandomConfigOfDim(os, -1); +} + + + +void UnitTestReadFromConfig() { + using namespace kaldi; + using namespace kaldi::differentiable_transform; + + for (int32 i = 0; i < 100; i++) { + std::ostringstream os; + WriteRandomConfigFile(os); + std::istringstream is(os.str()); + int32 num_classes = RandInt(20, 30); + DifferentiableTransform *transform = + DifferentiableTransform::ReadFromConfig(is, num_classes); + KALDI_ASSERT(transform != NULL); + delete transform; + } +} + +// Creates a random mean per class and adds it to the features, weighted +// according to the posteriors. It makes the tests more realistic, if +// there are systematic differences between the classes. +void AddRandomMeanOffsets(BaseFloat scale, + int32 num_classes, + const Posterior &post, + CuMatrix *feats) { + int32 T = feats->NumRows(), dim = feats->NumCols(); + CuMatrix class_means(num_classes, dim); + class_means.SetRandn(); + class_means.Scale(scale); + for (int32 t = 0; t < T; t++) { + auto iter = post[t].begin(), end = post[t].end(); + BaseFloat tot_post = 0.0; + for (; iter != end; ++iter) + tot_post += iter->second; + for (iter = post[t].begin(); iter != end; ++iter) { + int32 i = iter->first; + BaseFloat p = iter->second / tot_post; + feats->Row(t).AddVec(p, class_means.Row(i)); + } + } +} + +void GetRandomPosterior(int32 num_frames, int32 num_classes, + Posterior *post) { + post->resize(num_frames); + for (int32 t = 0; t < num_frames; t++) { + for (int32 i = 0; i < 3; i++) { + if (RandInt(0, 1) == 0) { + (*post)[t].push_back(std::pair( + RandInt(0, num_classes - 1), 0.1 + RandUniform())); + } + } + } + +} + +void TestTraining(DifferentiableTransform *transform) { + // test that the training process runs. + int32 dim = transform->Dim(), + num_classes = transform->NumClasses(), + num_frames = RandInt(200, 300), + num_spk = RandInt(2, 10), + chunks_per_spk = RandInt(1, 4), + num_rows = num_frames * num_spk * chunks_per_spk; + CuMatrix input_feats(num_rows, dim), + output_feats(num_rows, dim, kUndefined), + output_deriv(num_rows, dim, kUndefined), + input_deriv(num_rows, dim); + + // This is to verify that TrainingBackward() adds to, rather than + // setting to, the input deriv. 
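  // (The check works as follows: we pre-load input_deriv with a random matrix, run the
  //  backward pass, and then subtract the same random matrix again afterwards; if
  //  TrainingBackward() had overwritten input_deriv rather than added to it, the
  //  derivative comparison below would fail.)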
+ CuMatrix random_input_deriv(num_rows, dim); + random_input_deriv.SetRandn(); + input_deriv.AddMat(1.0, random_input_deriv); + + input_feats.SetRandn(); + output_deriv.SetRandn(); + Posterior post; + GetRandomPosterior(num_rows, num_classes, &post); + AddRandomMeanOffsets(10.0, num_classes, post, &input_feats); + + int32 num_chunks = num_spk * chunks_per_spk; + MinibatchInfoItf *info = + transform->TrainingForward(input_feats, num_chunks, num_spk, post, + &output_feats); + CuMatrix diff(input_feats); + diff.AddMat(-1.0, output_feats); + KALDI_LOG << "Difference in features (relative) is " + << (diff.FrobeniusNorm() / input_feats.FrobeniusNorm()); + + + transform->TrainingBackward(input_feats, output_deriv, num_chunks, + num_spk, post, info, &input_deriv); + // testing that TrainingBackward adds to the input deriv. + input_deriv.AddMat(-1.0, random_input_deriv); + + int32 n = 5; + Vector expected_changes(n), observed_changes(n); + BaseFloat epsilon = 1.0e-03; + for (int32 i = 0; i < n; i++) { + CuMatrix new_input_feats(num_rows, dim), + new_output_feats(num_rows, dim, kUndefined); + new_input_feats.SetRandn(); + new_input_feats.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_input_feats, input_deriv, kTrans); + new_input_feats.AddMat(1.0, input_feats); + MinibatchInfoItf *info2 = + transform->TrainingForward(new_input_feats, num_chunks, num_spk, + post, &new_output_feats); + delete info2; + new_output_feats.AddMat(-1.0, output_feats); + observed_changes(i) = TraceMatMat(new_output_feats, output_deriv, kTrans); + } + KALDI_LOG << "Expected changes: " << expected_changes + << ", observed changes: " << observed_changes; + KALDI_ASSERT(expected_changes.ApproxEqual(observed_changes, 0.15)); + + { + // Test that if we do Accumulate() and Estimate() on the same data we + // trained on, and then TestingForwardBatch(), we get the same answer + // as during training. Note: this may not be true for all examples + // including SequenceTransform, due to how we treat the last of the + // transforms specially. + + int32 num_final_iters = transform->NumFinalIterations(); + for (int32 i = 0; i < num_final_iters; i++) { + transform->Accumulate(i, input_feats, num_chunks, num_spk, post); + // transform->Add(*transform); // Just check Add() does not crash. + // it does crash but because of AddVec() failing on this == other.. its ok. + transform->Estimate(i); + } + CuMatrix output_feats2(output_feats.NumRows(), + output_feats.NumCols(), kUndefined); + transform->TestingForwardBatch(input_feats, num_chunks, num_spk, post, + &output_feats2); + output_feats2.AddMat(-1.0, output_feats); + BaseFloat rel_diff = (output_feats2.FrobeniusNorm() / + output_feats.FrobeniusNorm()); + KALDI_LOG << "Difference in features train vs. test (relative) is " + << rel_diff; + if (rel_diff > 0.001) { + KALDI_WARN << "Make sure this config would not be equivalent train " + "vs. test (see config printed above)."; + } + } +} + + +void UnitTestTraining() { + for (int32 i = 0; i < 100; i++) { + std::ostringstream os; + WriteRandomConfigFile(os); + std::istringstream is(os.str()); + int32 num_classes = RandInt(20, 30); + DifferentiableTransform *transform = + DifferentiableTransform::ReadFromConfig(is, num_classes); + KALDI_LOG << "Config is: " << os.str(); + KALDI_ASSERT(transform != NULL); + if (os.str().find("smoothing-count=0") == std::string::npos) { + // Don't do this test if smoothing-count is zero: it can + // fail but it doesn't indicate a real problem. 
+ TestTraining(transform); + } + delete transform; + } +} + + +void UnitTestIo() { + for (int32 i = 0; i < 100; i++) { + std::ostringstream os; + WriteRandomConfigFile(os); + std::istringstream is(os.str()); + int32 num_classes = RandInt(20, 30); + DifferentiableTransform *transform = + DifferentiableTransform::ReadFromConfig(is, num_classes); + KALDI_ASSERT(transform != NULL); + + std::ostringstream os2; + bool binary = (RandInt(0,1) == 0); + transform->Write(os2, binary); + + std::istringstream is2(os2.str()); + + DifferentiableTransform *transform2 = + DifferentiableTransform::ReadNew(is2, binary); + std::ostringstream os3; + transform2->Write(os3, binary); + KALDI_ASSERT(os2.str() == os3.str()); + delete transform; + delete transform2; + } +} + + + +} // namespace kaldi +} // namespace differentiable_transform + + + +int main() { + using namespace kaldi::differentiable_transform; + kaldi::SetVerboseLevel(3); + for (int32 i = 0; i < 3; i++) { + UnitTestReadFromConfig(); + UnitTestIo(); + UnitTestTraining(); + } + std::cout << "Test OK.\n"; +} diff --git a/src/adapt/differentiable-transform.cc b/src/adapt/differentiable-transform.cc new file mode 100644 index 00000000000..bcaf356e695 --- /dev/null +++ b/src/adapt/differentiable-transform.cc @@ -0,0 +1,624 @@ +// adapt/differentiable-transform.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "adapt/differentiable-transform.h" + + +// This header contains the 'base-cases' of DifferentiableTransform: namely, +// FmllrTransform and MeanOnlyTransform. See also generic-transform.h where +// sequence, append and no-op types are defined. 
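For orientation, here is the kind of config text that ReadFromConfig() (defined in
differentiable-transform-itf.cc above) parses, and that the random-config code in
differentiable-transform-test.cc generates; the particular transforms and values are
made up for illustration:

  SequenceTransform num-transforms=2
  MeanOnlyTransform dim=40
  FmllrTransform dim=40 smoothing-count=100.0

It could be read with something like:

  std::istringstream is(config_string);
  DifferentiableTransform *transform =
      DifferentiableTransform::ReadFromConfig(is, num_classes);
  // ... train / use the transform ...
  delete transform;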
+namespace kaldi { +namespace differentiable_transform { + +FmllrMinibatchInfo::FmllrMinibatchInfo( + int32 num_classes, int32 dim, int32 num_speakers): + target_model(num_classes, dim), + estimators(num_speakers, NULL) { } + +FmllrMinibatchInfo::~FmllrMinibatchInfo() { + for (size_t i = 0; i < estimators.size(); i++) + delete estimators[i]; +} + + +void FmllrSpeakerStats::Estimate() { + BaseFloat objf_impr = estimator.Estimate(); + KALDI_VLOG(1) << "Objective function improvement per frame is " << objf_impr; +} + + +int32 FmllrTransform::InitFromConfig( + int32 cur_pos, + std::vector *config_lines) { + KALDI_ASSERT(cur_pos < int32(config_lines->size())); + ConfigLine *line = &((*config_lines)[cur_pos]); + KALDI_ASSERT(line->FirstToken() == Type()); + + if (!line->GetValue("dim", &dim_) || dim_ <= 0) + KALDI_ERR << "Dimension 'dim' must be specified for FmllrTransform, config " + "line is: " << line->WholeLine(); + fmllr_opts_.ReadFromConfig(line); + if (line->HasUnusedValues()) + KALDI_ERR << "Some configuration values were not used: '" + << line->UnusedValues() << "', in line: " + << line->WholeLine(); + return cur_pos + 1; +} + + +FmllrTransform::FmllrTransform(const FmllrTransform &other): + DifferentiableTransform(other), + dim_(other.dim_), fmllr_opts_(other.fmllr_opts_), + target_model_(other.target_model_ == NULL ? NULL : + new GaussianEstimator(*other.target_model_)) { } + +DifferentiableTransform *FmllrTransform::Copy() const { + return new FmllrTransform(*this); +} + +void FmllrTransform::Add(const DifferentiableTransform &other_in) { + const FmllrTransform *other = dynamic_cast(&other_in); + if (target_model_ && other->target_model_) + target_model_->Add(*(other->target_model_)); +} + +void FmllrTransform::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_classes_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + fmllr_opts_.Write(os, binary); + if (target_model_ != NULL) { + WriteToken(os, binary, ""); + target_model_->Write(os, binary); + } else { + WriteToken(os, binary, ""); + } + WriteToken(os, binary, ""); +} + +void FmllrTransform::Read(std::istream &is, bool binary) { + delete target_model_; + target_model_ = NULL; + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &num_classes_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &dim_); + fmllr_opts_.Read(is, binary); + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + target_model_ = new GaussianEstimator(num_classes_, dim_); + target_model_->Read(is, binary); + } // else "". + ExpectToken(is, binary, ""); +} + + +MinibatchInfoItf* FmllrTransform::TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const { + int32 num_classes = num_classes_, + dim = dim_, num_frames = input.NumRows(); + KALDI_ASSERT(SameDim(input, *output) && input.NumCols() == dim && + int32(posteriors.size()) == input.NumRows()); + KALDI_ASSERT(num_chunks % num_spk == 0 && num_spk > 1 && + num_frames % num_chunks == 0); + int32 chunks_per_spk = num_chunks / num_spk, + frames_per_chunk = num_frames / num_chunks; + + FmllrMinibatchInfo *ans = new FmllrMinibatchInfo(num_classes, + dim, num_spk); + + // The input is in CuMatrix, i.e. it's on the GPU if we're using a GPU. 
For + // now we just transfer everything to CPU, which of course is not optimal; we + // may later implement some of the deeper parts of this on GPU if the methods + // turn out to be effective. + Matrix input_cpu(input), + output_cpu(num_frames, dim, kUndefined); + + // First estimate the target model (Gaussian means and spherical variances). + ans->target_model.AccStats(input_cpu, posteriors); + ans->target_model.Estimate(fmllr_opts_); + + for (int32 s = 0; s < num_spk; s++) + ans->estimators[s] = new FmllrEstimator(fmllr_opts_, + ans->target_model.GetMeans(), + ans->target_model.GetVars()); + + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix this_input(input_cpu.RowData(chunk), + frames_per_chunk, // num-rows + dim, // num-cols + input_cpu.Stride() * num_chunks); // stride + SubPosterior this_posteriors(posteriors, + chunk, // offset + frames_per_chunk, // num_frames + num_chunks); // stride + ans->estimators[speaker]->AccStats(this_input, this_posteriors); + } + BaseFloat objf_impr = 0.0; + for (int32 s = 0; s < num_spk; s++) { + BaseFloat this_impr = ans->estimators[s]->Estimate(); + objf_impr += this_impr / num_spk; + } + // objf_impr is now the average objective-function improvement per frame. + // We will later find a better way to display this. + KALDI_LOG << "Objective function improvement per frame is " + << objf_impr; + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix + this_input(input_cpu.RowData(chunk), frames_per_chunk, dim, + input_cpu.Stride() * num_chunks), + this_output(output_cpu.RowData(chunk), + frames_per_chunk, dim, output_cpu.Stride() * num_chunks); + ans->estimators[speaker]->AdaptFeatures(this_input, &this_output); + } + output->CopyFromMat(output_cpu); + return ans; +} + +void FmllrTransform::TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const { + FmllrMinibatchInfo *info = dynamic_cast(minibatch_info); + KALDI_ASSERT(info != NULL && "Wrong type of minibatch info supplied."); + + int32 dim = dim_, num_frames = input.NumRows(); + KALDI_ASSERT(SameDim(input, output_deriv) && input.NumCols() == dim && + SameDim(input, *input_deriv) && + int32(posteriors.size()) == input.NumRows()); + KALDI_ASSERT(num_chunks % num_spk == 0 && num_spk > 1 && + num_frames % num_chunks == 0); + int32 chunks_per_spk = num_chunks / num_spk, + frames_per_chunk = num_frames / num_chunks; + + // For now we just transfer everything to the CPU. 
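  // The backward pass below mirrors the forward pass in reverse:
  //  (1) per chunk, AdaptFeaturesBackward() accumulates \bar{A} and \bar{b} and adds the
  //      'direct' term A^T \bar{y}_t to the input derivative;
  //  (2) per speaker, EstimateBackward() backprops through the estimation of A and b;
  //  (3) per chunk, AccStatsBackward() adds the part of the input derivative that comes
  //      from the effect of the features on the transform;
  //  (4) the resulting derivatives w.r.t. the class means and variances are then given to
  //      the target model, whose AccStatsBackward() adds its own contribution.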
+ Matrix input_cpu(input), + output_deriv_cpu(output_deriv), + input_deriv_cpu(num_frames, dim); + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix this_input( + input_cpu.RowData(chunk), frames_per_chunk, + dim, input_cpu.Stride() * num_chunks), + this_output_deriv(output_deriv_cpu.RowData(chunk), + frames_per_chunk, dim, + output_deriv_cpu.Stride() * num_chunks), + this_input_deriv(input_deriv_cpu.RowData(chunk), + frames_per_chunk, dim, + input_deriv_cpu.Stride() * num_chunks); + info->estimators[speaker]->AdaptFeaturesBackward( + this_input, this_output_deriv, &this_input_deriv); + } + + for (int32 s = 0; s < num_spk; s++) + info->estimators[s]->EstimateBackward(); + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix this_input( + input_cpu.RowData(chunk), frames_per_chunk, + dim, input_cpu.Stride() * num_chunks), + this_output_deriv(output_deriv_cpu.RowData(chunk), + frames_per_chunk, dim, + output_deriv_cpu.Stride() * num_chunks), + this_input_deriv(input_deriv_cpu.RowData(chunk), + frames_per_chunk, dim, + input_deriv_cpu.Stride() * num_chunks); + SubPosterior this_posteriors(posteriors, chunk, + frames_per_chunk, num_chunks); + info->estimators[speaker]->AccStatsBackward( + this_input, this_posteriors, &this_input_deriv); + } + + for (int32 s = 0; s < num_spk; s++) + info->target_model.AddToOutputDerivs(info->estimators[s]->GetMeanDeriv(), + info->estimators[s]->GetVarDeriv()); + + info->target_model.AccStatsBackward(input_cpu, posteriors, &input_deriv_cpu); + // These TrainingBackward() functions are all supposed to add to the + // 'input_deriv'. + CuMatrix input_deriv_temp(input_deriv->NumRows(), + input_deriv->NumCols(), + kUndefined); + input_deriv_temp.CopyFromMat(input_deriv_cpu); + input_deriv->AddMat(1.0, input_deriv_temp); + + delete info; +} + + +void FmllrTransform::Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) { + KALDI_ASSERT(final_iter == 0); + if (target_model_ == NULL) + target_model_ = new GaussianEstimator(num_classes_, dim_); + Matrix input_cpu(input); + target_model_->AccStats(input_cpu, posteriors); +} + + +void FmllrTransform::Estimate(int32 final_iter) { + KALDI_ASSERT(final_iter == 0 && target_model_ != NULL); + target_model_->Estimate(fmllr_opts_); +} + + +SpeakerStatsItf *FmllrTransform::GetEmptySpeakerStats() const { + KALDI_ASSERT(target_model_ != NULL && + target_model_->GetMeans().NumRows() != 0 && + "You're trying to do adaptation with speaker transforms on " + "which you haven't done the final phase of training."); + return new FmllrSpeakerStats(fmllr_opts_, target_model_->GetMeans(), + target_model_->GetVars()); +} + +void FmllrTransform::TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const { + FmllrSpeakerStats *stats = dynamic_cast( + speaker_stats); + KALDI_ASSERT(stats != NULL && "Wrong type of speaker stats supplied."); + stats->estimator.AccStats(input, posteriors); +} + +void FmllrTransform::TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const { + const FmllrSpeakerStats *stats = dynamic_cast( + &speaker_stats); + KALDI_ASSERT(stats != NULL && "Wrong type of speaker stats supplied."); + KALDI_ASSERT(stats->estimator.IsEstimated() && + "You can't call TestingForward() without calling Estimate() on " + "the speaker stats."); 
+ stats->estimator.AdaptFeatures(input, output); +} + +void FmllrTransform::GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const { + const FmllrSpeakerStats *stats = dynamic_cast( + &speaker_stats); + int32 dim = Dim(); + KALDI_ASSERT(transform->NumRows() == dim && transform->NumCols() == dim + 1); + transform->ColRange(0, dim).CopyFromMat(stats->estimator.GetLinearParams()); + transform->CopyColFromVec(stats->estimator.GetBiasParams(), dim); +} + +FmllrTransform::~FmllrTransform() { + delete target_model_; +} + + +MeanOnlyTransformMinibatchInfo::MeanOnlyTransformMinibatchInfo( + int32 num_classes, int32 dim, int32 num_speakers): + target_model(num_classes, dim), + estimators(num_speakers, NULL) { } + +MeanOnlyTransformMinibatchInfo::~MeanOnlyTransformMinibatchInfo() { + for (size_t i = 0; i < estimators.size(); i++) + delete estimators[i]; +} + + +int32 MeanOnlyTransform::InitFromConfig( + int32 cur_pos, + std::vector *config_lines) { + KALDI_ASSERT(cur_pos < int32(config_lines->size())); + ConfigLine *line = &((*config_lines)[cur_pos]); + KALDI_ASSERT(line->FirstToken() == Type()); + + if (!line->GetValue("dim", &dim_) || dim_ <= 0) + KALDI_ERR << "Dimension 'dim' must be specified for MeanOnlyTransform, config " + "line is: " << line->WholeLine(); + if (line->HasUnusedValues()) + KALDI_ERR << "Some configuration values were not used: '" + << line->UnusedValues() << "', in line: " + << line->WholeLine(); + return cur_pos + 1; +} + +MeanOnlyTransform::MeanOnlyTransform(const MeanOnlyTransform &other): + DifferentiableTransform(other), + dim_(other.dim_), target_model_(other.target_model_ == NULL ? NULL : + new GaussianEstimator(*other.target_model_)) { } + + +void MeanOnlyTransform::Add(const DifferentiableTransform &other_in) { + const MeanOnlyTransform *other = + dynamic_cast(&other_in); + if (target_model_ && other->target_model_) + target_model_->Add(*(other->target_model_)); +} + +void MeanOnlyTransform::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_classes_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + if (target_model_ != NULL) { + WriteToken(os, binary, ""); + target_model_->Write(os, binary); + } else { + WriteToken(os, binary, ""); + } + WriteToken(os, binary, ""); +} + +void MeanOnlyTransform::Read(std::istream &is, bool binary) { + delete target_model_; + target_model_ = NULL; + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &num_classes_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &dim_); + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + target_model_ = new GaussianEstimator(num_classes_, dim_); + target_model_->Read(is, binary); + } // else "". 
+ ExpectToken(is, binary, ""); +} + + +MinibatchInfoItf* MeanOnlyTransform::TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const { + int32 num_classes = num_classes_, + dim = dim_, num_frames = input.NumRows(); + KALDI_ASSERT(SameDim(input, *output) && input.NumCols() == dim && + int32(posteriors.size()) == input.NumRows()); + KALDI_ASSERT(num_chunks % num_spk == 0 && num_spk > 1 && + num_frames % num_chunks == 0); + int32 chunks_per_spk = num_chunks / num_spk, + frames_per_chunk = num_frames / num_chunks; + + MeanOnlyTransformMinibatchInfo *ans = new MeanOnlyTransformMinibatchInfo(num_classes, + dim, num_spk); + + // The input is in CuMatrix, i.e. it's on the GPU if we're using a GPU. For + // now we just transfer everything to CPU, which of course is not optimal; we + // may later implement some of the deeper parts of this on GPU if the methods + // turn out to be effective. + Matrix input_cpu(input), + output_cpu(num_frames, dim, kUndefined); + + // First estimate the target model (Gaussian means and spherical variances). + // We use the default options: they only affect the variances, which we won't + // be using. + ans->target_model.AccStats(input_cpu, posteriors); + FmllrEstimatorOptions default_opts; + ans->target_model.Estimate(default_opts); + + for (int32 s = 0; s < num_spk; s++) + ans->estimators[s] = new MeanOnlyTransformEstimator( + ans->target_model.GetMeans()); + + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix this_input(input_cpu.RowData(chunk), + frames_per_chunk, // num-rows + dim, // num-cols + input_cpu.Stride() * num_chunks); // stride + SubPosterior this_posteriors(posteriors, + chunk, // offset + frames_per_chunk, // num_frames + num_chunks); // stride + ans->estimators[speaker]->AccStats(this_input, this_posteriors); + } + for (int32 s = 0; s < num_spk; s++) + ans->estimators[s]->Estimate(); + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix + this_input(input_cpu.RowData(chunk), frames_per_chunk, dim, + input_cpu.Stride() * num_chunks), + this_output(output_cpu.RowData(chunk), + frames_per_chunk, dim, output_cpu.Stride() * num_chunks); + ans->estimators[speaker]->AdaptFeatures(this_input, &this_output); + } + output->CopyFromMat(output_cpu); + return ans; +} + + +DifferentiableTransform *MeanOnlyTransform::Copy() const { + return new MeanOnlyTransform(*this); +} + +void MeanOnlyTransform::TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const { + MeanOnlyTransformMinibatchInfo *info = + dynamic_cast(minibatch_info); + KALDI_ASSERT(info != NULL && "Wrong type of minibatch info supplied."); + + int32 dim = dim_, num_frames = input.NumRows(); + KALDI_ASSERT(SameDim(input, output_deriv) && input.NumCols() == dim && + SameDim(input, *input_deriv) && + int32(posteriors.size()) == input.NumRows()); + KALDI_ASSERT(num_chunks % num_spk == 0 && num_spk > 1 && + num_frames % num_chunks == 0); + int32 chunks_per_spk = num_chunks / num_spk, + frames_per_chunk = num_frames / num_chunks; + + // For now we just transfer everything to the CPU. 
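[Editor's note] The strided per-chunk views constructed just below rely on the minibatch row layout being "frame 0 of every chunk, then frame 1 of every chunk, ...", so chunk c owns rows c, c + num_chunks, c + 2*num_chunks, ..., and belongs to speaker c / chunks_per_spk. The following self-contained toy program (arbitrary sizes, no Kaldi types) illustrates exactly the indexing that RowData(chunk) plus a row stride of Stride() * num_chunks implements.

// Toy illustration of the chunk/speaker layout assumed by the strided
// sub-matrices below; all numbers are arbitrary.
#include <cstdio>
#include <vector>

int main() {
  const int num_spk = 2, num_chunks = 4, frames_per_chunk = 3;
  const int chunks_per_spk = num_chunks / num_spk;
  const int num_frames = num_chunks * frames_per_chunk;

  // Row r of the minibatch holds frame (r / num_chunks) of chunk (r % num_chunks).
  std::vector<int> chunk_of_row(num_frames), frame_of_row(num_frames);
  for (int r = 0; r < num_frames; r++) {
    chunk_of_row[r] = r % num_chunks;
    frame_of_row[r] = r / num_chunks;
  }

  // A "view" of one chunk starts at row `chunk` and advances num_chunks rows
  // at a time.
  for (int chunk = 0; chunk < num_chunks; chunk++) {
    int speaker = chunk / chunks_per_spk;
    std::printf("chunk %d (speaker %d) owns rows:", chunk, speaker);
    for (int t = 0; t < frames_per_chunk; t++) {
      int row = chunk + t * num_chunks;
      std::printf(" %d", row);
      // Sanity check: the row really belongs to this chunk and frame index.
      if (chunk_of_row[row] != chunk || frame_of_row[row] != t) return 1;
    }
    std::printf("\n");
  }
  return 0;
}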
+  Matrix<BaseFloat> input_cpu(input),
+      output_deriv_cpu(output_deriv),
+      input_deriv_cpu(num_frames, dim);
+
+  for (int32 chunk = 0; chunk < num_chunks; chunk++) {
+    int32 speaker = chunk / chunks_per_spk;
+    SubMatrix<BaseFloat> this_input(
+        input_cpu.RowData(chunk), frames_per_chunk,
+        dim, input_cpu.Stride() * num_chunks),
+        this_output_deriv(output_deriv_cpu.RowData(chunk),
+                          frames_per_chunk, dim,
+                          output_deriv_cpu.Stride() * num_chunks),
+        this_input_deriv(input_deriv_cpu.RowData(chunk),
+                         frames_per_chunk, dim,
+                         input_deriv_cpu.Stride() * num_chunks);
+    info->estimators[speaker]->AdaptFeaturesBackward(
+        this_input, this_output_deriv, &this_input_deriv);
+  }
+
+  for (int32 s = 0; s < num_spk; s++)
+    info->estimators[s]->EstimateBackward();
+
+  for (int32 chunk = 0; chunk < num_chunks; chunk++) {
+    int32 speaker = chunk / chunks_per_spk;
+    SubMatrix<BaseFloat> this_input(
+        input_cpu.RowData(chunk), frames_per_chunk,
+        dim, input_cpu.Stride() * num_chunks),
+        this_output_deriv(output_deriv_cpu.RowData(chunk),
+                          frames_per_chunk, dim,
+                          output_deriv_cpu.Stride() * num_chunks),
+        this_input_deriv(input_deriv_cpu.RowData(chunk),
+                         frames_per_chunk, dim,
+                         input_deriv_cpu.Stride() * num_chunks);
+    SubPosterior this_posteriors(posteriors, chunk,
+                                 frames_per_chunk, num_chunks);
+    info->estimators[speaker]->AccStatsBackward(
+        this_input, this_posteriors, &this_input_deriv);
+  }
+
+  for (int32 s = 0; s < num_spk; s++) {
+    Vector<BaseFloat> var_derivs(num_classes_);  // zero.
+    info->target_model.AddToOutputDerivs(info->estimators[s]->GetMeanDeriv(),
+                                         var_derivs);
+  }
+
+  info->target_model.AccStatsBackward(input_cpu, posteriors, &input_deriv_cpu);
+  // These TrainingBackward() functions are all supposed to add to the
+  // 'input_deriv'.
+  CuMatrix<BaseFloat> input_deriv_temp(input_deriv->NumRows(),
+                                       input_deriv->NumCols(),
+                                       kUndefined);
+  input_deriv_temp.CopyFromMat(input_deriv_cpu);
+  input_deriv->AddMat(1.0, input_deriv_temp);
+  delete info;
+}
+
+
+void MeanOnlyTransform::Accumulate(
+    int32 final_iter,
+    const CuMatrixBase<BaseFloat> &input,
+    int32 num_chunks,
+    int32 num_spk,
+    const Posterior &posteriors) {
+  KALDI_ASSERT(final_iter == 0);
+  if (target_model_ == NULL)
+    target_model_ = new GaussianEstimator(num_classes_, dim_);
+  Matrix<BaseFloat> input_cpu(input);
+  target_model_->AccStats(input_cpu, posteriors);
+}
+
+void MeanOnlyTransform::Estimate(int32 final_iter) {
+  KALDI_ASSERT(final_iter == 0 && target_model_ != NULL);
+  // The options only affect the estimates of the variance, which we don't use
+  // here, so we use the default options.
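[Editor's note] The actual MeanOnlyTransformEstimator lives in differentiable-fmllr.h and is not shown in this patch; purely as an illustration of what a per-speaker mean offset does, the toy program below computes an offset under the simplest assumption I could make: choose the offset that makes the posterior-weighted mean of the adapted features match the posterior-weighted combination of the speaker-independent class means. Every name, number, and the closed form itself is an assumption for illustration, not a statement about the real estimator. (MeanOnlyTransform::Estimate() continues below with the default FmllrEstimatorOptions mentioned in the comment above.)

// Toy sketch (1-D features, 2 classes); NOT the real MeanOnlyTransformEstimator.
// Assumption for illustration: pick offset b so the posterior-weighted mean of
// (x + b) equals the posterior-weighted combination of the class means mu[c].
#include <cstdio>
#include <vector>

int main() {
  const int num_classes = 2;
  double mu[num_classes] = {0.0, 4.0};          // speaker-independent class means
  std::vector<double> feats = {1.0, 2.0, 5.0};  // one speaker's (1-D) features
  // post[t][c] = soft count of class c on frame t.
  std::vector<std::vector<double> > post = {
    {0.9, 0.1}, {0.8, 0.2}, {0.1, 0.9} };

  double tot_gamma = 0.0, weighted_mu = 0.0, weighted_x = 0.0;
  for (size_t t = 0; t < feats.size(); t++) {
    for (int c = 0; c < num_classes; c++) {
      tot_gamma   += post[t][c];
      weighted_mu += post[t][c] * mu[c];
      weighted_x  += post[t][c] * feats[t];
    }
  }
  double offset = (weighted_mu - weighted_x) / tot_gamma;
  std::printf("offset = %.3f\n", offset);
  for (size_t t = 0; t < feats.size(); t++)
    std::printf("frame %zu: %.2f -> %.2f\n", t, feats[t], feats[t] + offset);
  return 0;
}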
+ FmllrEstimatorOptions default_opts; + target_model_->Estimate(default_opts); +} + + + +SpeakerStatsItf *MeanOnlyTransform::GetEmptySpeakerStats() const { + KALDI_ASSERT(target_model_ != NULL && + target_model_->GetMeans().NumRows() != 0 && + "You're trying to do adaptation with speaker transforms on " + "which you haven't done the final phase of training."); + return new MeanOnlyTransformSpeakerStats(target_model_->GetMeans()); +} + +void MeanOnlyTransform::TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const { + MeanOnlyTransformSpeakerStats *stats = dynamic_cast( + speaker_stats); + KALDI_ASSERT(stats != NULL && "Wrong type of speaker stats supplied."); + stats->estimator.AccStats(input, posteriors); +} + +void MeanOnlyTransform::TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const { + const MeanOnlyTransformSpeakerStats *stats = dynamic_cast( + &speaker_stats); + KALDI_ASSERT(stats != NULL && "Wrong type of speaker stats supplied."); + KALDI_ASSERT(stats->estimator.IsEstimated() && + "You can't call TestingForward() without calling Estimate() on " + "the speaker stats."); + stats->estimator.AdaptFeatures(input, output); +} + +void MeanOnlyTransform::GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const { + const MeanOnlyTransformSpeakerStats *stats = + dynamic_cast(&speaker_stats); + int32 dim = Dim(); + KALDI_ASSERT(transform->NumRows() == dim && transform->NumCols() == dim + 1); + transform->SetUnit(); + transform->CopyColFromVec(stats->estimator.GetOffset(), dim); +} + +MeanOnlyTransform::~MeanOnlyTransform() { + delete target_model_; +} + + + +} // namespace differentiable_transform +} // namespace kaldi diff --git a/src/adapt/differentiable-transform.h b/src/adapt/differentiable-transform.h new file mode 100644 index 00000000000..c3abb1bbb96 --- /dev/null +++ b/src/adapt/differentiable-transform.h @@ -0,0 +1,289 @@ +// adapt/differentiable-transform.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ +#define KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ + +#include + +#include "base/kaldi-common.h" +#include "matrix/kaldi-matrix.h" +#include "cudamatrix/cu-matrix.h" +#include "adapt/differentiable-transform-itf.h" +#include "adapt/differentiable-fmllr.h" + + +// This header contains the 'base-cases' of DifferentiableTransform: namely, +// FmllrTransform and MeanOnlyTransform. See also generic-transform.h where +// sequence, append and no-op types are defined. 
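[Editor's note] Both GetTransformAsMatrix() implementations above export the per-speaker transform as a dim x (dim+1) affine matrix [A | b]: FmllrTransform fills A with the linear parameters and the last column with the bias, while MeanOnlyTransform sets A to the identity and the last column to the offset. Applying such a matrix means y = A x + b, i.e. multiplying by the matrix after appending a 1 to x. A small self-contained sketch with made-up numbers:

// Applying a dim x (dim+1) affine transform [A | b] to a feature vector:
// y = A * x + b.  Plain C++, toy numbers; not part of this patch.
#include <cstdio>
#include <vector>

int main() {
  const int dim = 2;
  // Row-major dim x (dim+1) matrix; the last column is the bias b.
  double transform[dim][dim + 1] = {
    {1.0, 0.5, -2.0},   // row 0: A(0,0) A(0,1) b(0)
    {0.0, 2.0,  1.0} }; // row 1: A(1,0) A(1,1) b(1)
  std::vector<double> x = {3.0, 4.0}, y(dim, 0.0);

  for (int r = 0; r < dim; r++) {
    for (int c = 0; c < dim; c++)
      y[r] += transform[r][c] * x[c];
    y[r] += transform[r][dim];   // bias column
  }
  std::printf("y = (%.1f, %.1f)\n", y[0], y[1]);  // prints (3.0, 9.0)
  return 0;
}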
+namespace kaldi { +namespace differentiable_transform { + + +/** + This is a version of the transform class that implements fMLLR (with + spherical variances, to make the update equations non-iterative); see + differentiable-fmllr.h where the core parts of this are implemented, + this provides the interface compatible with DifferentiableTransform. + + Please see the comments in class DifferentiableTransform (in + differentiable-transform-itf.h) for the meaning and usage of the various + interface functions and their parameters. +*/ +class FmllrTransform: public DifferentiableTransform { + public: + int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) override; + + + int32 Dim() const override { return dim_; } + + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override; + + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const override; + + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override; + + void Estimate(int32 final_iter) override; + + int32 NumFinalIterations() override { return 1; } + + SpeakerStatsItf *GetEmptySpeakerStats() const override; + + void TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const override; + + void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const override; + + void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const override; + + FmllrTransform(const FmllrTransform &other); + + FmllrTransform(): target_model_(NULL) { } + + std::string Type() const override { return "FmllrTransform"; } + + DifferentiableTransform* Copy() const override; + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + + void Add(const DifferentiableTransform &other) override; + + ~FmllrTransform(); + private: + int32 dim_; + + FmllrEstimatorOptions fmllr_opts_; + + // Note: this target model is only for consumption in test time; it is + // produced right at the end of training when Accumulate() and Estimate() are + // called. We allocate it the first time Accumulate() is called. In training + // time the corresponding stats are esimated minibatch by minibatch, not via + // this member (which is why we don't expect to have that many classes in + // training time). At the end of training we'll accumulate stats here in + // Accumulate(), and Estimate() will estimate it. + GaussianEstimator *target_model_; +}; + +class FmllrMinibatchInfo: public MinibatchInfoItf { + public: + + FmllrMinibatchInfo(int32 num_classes, int32 dim, int32 num_speakers); + + GaussianEstimator target_model; + + // One estimator of Fmllr per speaker. Make them pointers so we don't have to + // implement self-constructor for class FmllrEstimator. + std::vector estimators; + + ~FmllrMinibatchInfo(); +}; + +class FmllrSpeakerStats: public SpeakerStatsItf { + public: + // Caution: this object maintains references to mu and s, so it's not a good + // idea to let the target-model (which lives in the FmllrTransform object) be + // deleted during the lifetime of this object. 
+ FmllrSpeakerStats(const FmllrEstimatorOptions &opts, + const MatrixBase &mu, + const VectorBase &s): + estimator(opts, mu, s) { } + + void Estimate() override; + + FmllrEstimator estimator; + + ~FmllrSpeakerStats() { } +}; + +/** + This version of the transform class does a mean normalization: adding an + offset to its input so that the difference (per speaker) of the transformed + class means from the speaker-independent class means is minimized. + This is like a mean-only fMLLR with fixed (say, unit) covariance model. + */ +class MeanOnlyTransform: public DifferentiableTransform { + public: + /* + Example config line: + + MeanOnlyTransform dim=100 + */ + int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) override; + + + int32 Dim() const override { return dim_; } + + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override; + + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const override; + + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override; + + void Estimate(int32 final_iter) override; + + int32 NumFinalIterations() override { return 1; } + + SpeakerStatsItf *GetEmptySpeakerStats() const override; + + void TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const override; + + void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const override; + + void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const override; + + MeanOnlyTransform(const MeanOnlyTransform &other); + + MeanOnlyTransform(): target_model_(NULL) { } + + std::string Type() const override { return "MeanOnlyTransform"; } + + DifferentiableTransform* Copy() const override; + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + + void Add(const DifferentiableTransform &other) override; + + ~MeanOnlyTransform(); + private: + int32 dim_; + + // Note: this target model is only for consumption in test time; it is + // produced right at the end of training when Accumulate() and Estimate() are + // called. We allocate it the first time Accumulate() is called. In training + // time the corresponding stats are esimated minibatch by minibatch, not via + // this member (which is why we don't expect to have that many classes in + // training time). At the end of training we'll accumulate stats here in + // Accumulate(), and Estimate() will estimate it. + GaussianEstimator *target_model_; +}; + +class MeanOnlyTransformMinibatchInfo: public MinibatchInfoItf { + public: + + MeanOnlyTransformMinibatchInfo(int32 num_classes, int32 dim, + int32 num_speakers); + + GaussianEstimator target_model; + + // One estimator of offset per speaker. Make them pointers so we don't have to + // implement self-constructor for class FmllrEstimator. 
+ std::vector estimators; + + ~MeanOnlyTransformMinibatchInfo(); +}; + +class MeanOnlyTransformSpeakerStats: public SpeakerStatsItf { + public: + // Caution: this object maintains a reference to mu, so it's not a good idea + // to let the target-model (which lives in the FmllrTransform object) be + // deleted during the lifetime of this object. + MeanOnlyTransformSpeakerStats(const MatrixBase &mu): + estimator(mu) { } + + void Estimate() override { estimator.Estimate(); } + + MeanOnlyTransformEstimator estimator; + + ~MeanOnlyTransformSpeakerStats() { } +}; + + + + + +} // namespace differentiable_transform +} // namespace kaldi + +#endif // KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ diff --git a/src/adapt/generic-transform.cc b/src/adapt/generic-transform.cc new file mode 100644 index 00000000000..c2c73aefe85 --- /dev/null +++ b/src/adapt/generic-transform.cc @@ -0,0 +1,616 @@ +// adapt/generic-transform.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "adapt/differentiable-transform-itf.h" +#include "adapt/generic-transform.h" + +namespace kaldi { +namespace differentiable_transform { + + +int32 NoOpTransform::InitFromConfig( + int32 cur_pos, + std::vector *config_lines) { + KALDI_ASSERT(cur_pos < int32(config_lines->size())); + ConfigLine *line = &((*config_lines)[cur_pos]); + KALDI_ASSERT(line->FirstToken() == Type()); + if (!line->GetValue("dim", &dim_) || dim_ <= 0) + KALDI_ERR << "Dimension 'dim' must be specified for NoOpTransform, config " + "line is: " << line->WholeLine(); + if (line->HasUnusedValues()) + KALDI_ERR << "Some configuration values were not used: '" + << line->UnusedValues() << "', in line: " + << line->WholeLine(); + return cur_pos + 1; +} + + +void NoOpTransform::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_classes_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); +} + +void NoOpTransform::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &num_classes_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); +} + + +int32 SequenceTransform::InitFromConfig( + int32 cur_pos, + std::vector *config_lines) { + KALDI_ASSERT(cur_pos < int32(config_lines->size()) && + transforms_.empty()); + ConfigLine *line = &((*config_lines)[cur_pos]); + KALDI_ASSERT(line->FirstToken() == Type()); + int32 num_transforms = -1; + if (!line->GetValue("num-transforms", &num_transforms) || + num_transforms <= 0) + KALDI_ERR << "Config value num-transforms must be specified for " + "SequenceTransform, line is: " << line->WholeLine(); + if (line->HasUnusedValues()) + KALDI_ERR << "Some configuration values were not used: '" + << 
line->UnusedValues() << "', in line: " + << line->WholeLine(); + cur_pos++; + + int32 dim = 0; + for (int32 i = 0; i < num_transforms; i++) { + if (cur_pos >= int32(config_lines->size())) + KALDI_ERR << "Config file lacks enough lines for SequenceTransform."; + ConfigLine *other_line = &((*config_lines)[cur_pos]); + std::string transform_type = other_line->FirstToken(); + DifferentiableTransform *transform = NewTransformOfType(transform_type); + if (transform == NULL) + KALDI_ERR << "Could not find transform of type " << transform_type; + cur_pos = transform->InitFromConfig(cur_pos, config_lines); + if (i == 0) { + dim = transform->Dim(); + } else if (dim != transform->Dim()) { + KALDI_ERR << "Transforms used in SequenceTransform have inconsistent dim: " + << dim << " vs " << transform->Dim(); + } + transforms_.push_back(transform); + } + return cur_pos; +} + + +SequenceTransform::SequenceTransform(const SequenceTransform &other): + DifferentiableTransform(other), + transforms_(other.transforms_.size(), NULL) { + for (size_t i = 0; i < other.transforms_.size(); i++) + transforms_[i] = other.transforms_[i]->Copy(); +} + + +void SequenceTransform::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_classes_); + WriteToken(os, binary, ""); + int32 num_transforms = transforms_.size(); + WriteBasicType(os, binary, num_transforms); + for (int32 i = 0; i < num_transforms; i++) + transforms_[i]->Write(os, binary); + WriteToken(os, binary, ""); +} + +void SequenceTransform::Read(std::istream &is, bool binary) { + while (!transforms_.empty()) { + delete transforms_.back(); + transforms_.pop_back(); + } + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &num_classes_); + ExpectToken(is, binary, ""); + int32 num_transforms; + ReadBasicType(is, binary, &num_transforms); + for (int32 i = 0; i < num_transforms; i++) { + std::string tok; + ReadToken(is, binary, &tok); + DifferentiableTransform *transform; + if (!(transform = NewTransformOfType(tok))) + KALDI_ERR << "Expected the name of a transform, got " + << tok << " (maybe you should recompile?)"; + transform->Read(is, binary); + transforms_.push_back(transform); + } + ExpectToken(is, binary, ""); +} + +void SequenceTransform::Add(const DifferentiableTransform &other_in) { + const SequenceTransform *other = dynamic_cast( + &other_in); + KALDI_ASSERT(transforms_.size() == other->transforms_.size()); + for (size_t i = 0; i < transforms_.size(); i++) + transforms_[i]->Add(*(other->transforms_[i])); +} + +int32 SequenceTransform::Dim() const { + size_t num_transforms = transforms_.size(); + KALDI_ASSERT(num_transforms > 0); + return transforms_[0]->Dim(); +} + +void SequenceTransform::SetNumClasses(int32 num_classes) { + KALDI_ASSERT(num_classes > 0); + num_classes_ = num_classes; + for (size_t i = 0; i < transforms_.size(); i++) { + transforms_[i]->SetNumClasses(num_classes); + } +} + +SequenceTransform::~SequenceTransform() { + for (size_t i = 0; i < transforms_.size(); i++) + delete transforms_[i]; +} + +MinibatchInfoItf* SequenceTransform::TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const { + KALDI_ASSERT(SameDim(input, *output) && + !transforms_.empty()); + SequenceMinibatchInfo *ans = new SequenceMinibatchInfo(); + + const CuMatrixBase *last_output = &input; + CuMatrixBase *this_output; + + ans->outputs.resize(transforms_.size() - 1); + + for (size_t i 
= 0; i < transforms_.size(); i++) { + if (i + 1 == transforms_.size()) { + this_output = output; + } else { + // not the final transform. + ans->outputs[i].Resize(output->NumRows(), output->NumCols(), kUndefined); + this_output = &(ans->outputs[i]); + } + ans->info_vec.push_back(transforms_[i]->TrainingForward( + *last_output, num_chunks, num_spk, posteriors, this_output)); + last_output = this_output; + } + return ans; +} + +void SequenceTransform::TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const { + KALDI_ASSERT(SameDim(input, output_deriv) && SameDim(input, *input_deriv)); + + SequenceMinibatchInfo *info = dynamic_cast(minibatch_info); + KALDI_ASSERT(info != NULL && "Mismatched MinibatchInfo type?"); + + CuMatrix temp_deriv(input.NumRows(), + input.NumCols()); + int32 num_transforms = transforms_.size(); + KALDI_ASSERT(num_transforms > 0); + + const CuMatrixBase *cur_output_deriv = &output_deriv; + + for (int32 i = num_transforms - 1; i >= 0; i--) { + const CuMatrixBase *cur_input = (i == 0 ? &input : + &(info->outputs[i-1])); + CuMatrixBase *cur_input_deriv; + if (i == 0) { + cur_input_deriv = input_deriv; + } else if (i == num_transforms - 1) { + cur_input_deriv = &temp_deriv; + } else { + // this matrix is no longer needed, store the intermediate deriv here. + cur_input_deriv = &(info->outputs[i]); + cur_input_deriv->SetZero(); + } + transforms_[i]->TrainingBackward(*cur_input, *cur_output_deriv, + num_chunks, num_spk, posteriors, + info->info_vec[i], cur_input_deriv); + info->info_vec[i] = NULL; // Prevent it from being deleted twice. + cur_output_deriv = cur_input_deriv; + } + delete info; // This function took ownership. +} + +int32 SequenceTransform::NumFinalIterations() { + int32 ans = 0; + for (size_t i = 0; i < transforms_.size(); i++) + ans += transforms_[i]->NumFinalIterations(); + return ans; +} + +void SequenceTransform::Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) { + CuMatrix temp; + const CuMatrixBase *cur_input = &input; + + int32 prev_final_iters = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 nf = transforms_[i]->NumFinalIterations(); + if (final_iter < prev_final_iters + nf) { + transforms_[i]->Accumulate(final_iter - prev_final_iters, + *cur_input, num_chunks, num_spk, + posteriors); + return; + } else { + KALDI_ASSERT(i + 1 < transforms_.size()); + // We have to propagate the features through this transform. 
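[Editor's note] The reverse-order loop in SequenceTransform::TrainingBackward() above is a straightforward application of the chain rule through the composed transforms, which is also why the intermediate outputs are cached in SequenceMinibatchInfo::outputs during the forward pass. A scalar toy version of the same pattern, with made-up functions:

// Scalar toy version of the pattern in SequenceTransform::TrainingBackward():
// run the transforms forward in order, then push the derivative back through
// them in reverse order.  Toy functions, not Kaldi code.
#include <cstdio>

int main() {
  // Two "transforms": f1(x) = 2x + 1, f2(h) = h * h.
  double x = 3.0;
  double h = 2.0 * x + 1.0;      // forward through transform 1 -> 7
  double y = h * h;              // forward through transform 2 -> 49

  double dy = 1.0;               // derivative of the objective w.r.t. y
  double dh = dy * 2.0 * h;      // backward through transform 2 -> 14
  double dx = dh * 2.0;          // backward through transform 1 -> 28

  std::printf("y = %.1f, dy/dx = %.1f\n", y, dx);  // 49.0 and 28.0
  return 0;
}

(SequenceTransform::Accumulate() continues below, propagating the features through the earlier transforms with TestingForwardBatch() as the comment above says.)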
+ CuMatrix this_output(input.NumRows(), input.NumCols(), + kUndefined); + transforms_[i]->TestingForwardBatch(*cur_input, num_chunks, num_spk, + posteriors, &this_output); + temp.Swap(&this_output); + cur_input = &temp; + } + prev_final_iters += nf; + } + KALDI_ERR << "final_iter out of range."; +} + +void SequenceTransform::Estimate(int32 final_iter) { + CuMatrix temp; + + int32 prev_final_iters = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 nf = transforms_[i]->NumFinalIterations(); + if (final_iter < prev_final_iters + nf) { + transforms_[i]->Estimate(final_iter - prev_final_iters); + return; + } + prev_final_iters += nf; + } + KALDI_ERR << "final_iter out of range."; +} + +void SequenceTransform::TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const { + transforms_.back()->TestingAccumulate(input, posteriors, + speaker_stats); +} + +void SequenceTransform::TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const { + transforms_.back()->TestingForward(input, speaker_stats, output); +} + +void SequenceTransform::GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const { + transforms_.back()->GetTransformAsMatrix(speaker_stats, transform); +} + + +SequenceMinibatchInfo::~SequenceMinibatchInfo() { + for (size_t i = 0; i < info_vec.size(); i++) + delete info_vec[i]; +} + + + +int32 AppendTransform::InitFromConfig( + int32 cur_pos, + std::vector *config_lines) { + KALDI_ASSERT(cur_pos < int32(config_lines->size()) && + transforms_.empty()); + ConfigLine *line = &((*config_lines)[cur_pos]); + KALDI_ASSERT(line->FirstToken() == Type()); + int32 num_transforms = -1; + if (!line->GetValue("num-transforms", &num_transforms) || + num_transforms <= 0) + KALDI_ERR << "Config value num-transforms must be specified for " + "AppendTransform, line is: " << line->WholeLine(); + if (line->HasUnusedValues()) + KALDI_ERR << "Some configuration values were not used: '" + << line->UnusedValues() << "', in line: " + << line->WholeLine(); + cur_pos++; + + for (int32 i = 0; i < num_transforms; i++) { + if (cur_pos >= int32(config_lines->size())) + KALDI_ERR << "Config file lacks enough lines for AppendTransform."; + ConfigLine *other_line = &((*config_lines)[cur_pos]); + std::string transform_type = other_line->FirstToken(); + DifferentiableTransform *transform = NewTransformOfType(transform_type); + if (transform == NULL) + KALDI_ERR << "Could not find transform of type " << transform_type; + cur_pos = transform->InitFromConfig(cur_pos, config_lines); + transforms_.push_back(transform); + } + return cur_pos; +} + + + +AppendTransform::AppendTransform(const AppendTransform &other): + DifferentiableTransform(other), + transforms_(other.transforms_.size(), NULL) { + for (size_t i = 0; i < other.transforms_.size(); i++) + transforms_[i] = other.transforms_[i]->Copy(); +} + + + +void AppendTransform::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_classes_); + WriteToken(os, binary, ""); + int32 num_transforms = transforms_.size(); + WriteBasicType(os, binary, num_transforms); + for (int32 i = 0; i < num_transforms; i++) + transforms_[i]->Write(os, binary); + WriteToken(os, binary, ""); +} + +void AppendTransform::Read(std::istream &is, bool binary) { + while (!transforms_.empty()) { + delete transforms_.back(); + transforms_.pop_back(); + } + 
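[Editor's note] The bookkeeping in SequenceTransform::Accumulate()/Estimate() above maps a single global final-iteration index onto (which sub-transform, which local iteration) by walking the per-transform NumFinalIterations() counts, which is why SequenceTransform's total is their sum (whereas AppendTransform, later in this file, takes the max because its blocks are estimated in parallel). A toy version of that mapping, with hypothetical counts:

// Toy version of the final_iter bookkeeping in SequenceTransform; counts are
// made up.  Not part of this patch.
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical NumFinalIterations() values of three sub-transforms.
  std::vector<int> num_final_iters = {1, 2, 1};  // 4 final iterations in total

  int total = 0;
  for (size_t i = 0; i < num_final_iters.size(); i++)
    total += num_final_iters[i];

  for (int final_iter = 0; final_iter < total; final_iter++) {
    int prev = 0;
    for (size_t i = 0; i < num_final_iters.size(); i++) {
      if (final_iter < prev + num_final_iters[i]) {
        std::printf("global iter %d -> transform %zu, local iter %d\n",
                    final_iter, i, final_iter - prev);
        break;
      }
      prev += num_final_iters[i];
    }
  }
  return 0;
}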
ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &num_classes_); + ExpectToken(is, binary, ""); + int32 num_transforms; + ReadBasicType(is, binary, &num_transforms); + for (int32 i = 0; i < num_transforms; i++) { + std::string tok; + ReadToken(is, binary, &tok); + DifferentiableTransform *transform; + if (!(transform = NewTransformOfType(tok))) + KALDI_ERR << "Expected the name of a transform, got " + << tok << " (maybe you should recompile?)"; + transform->Read(is, binary); + transforms_.push_back(transform); + } + ExpectToken(is, binary, ""); +} + +void AppendTransform::Add(const DifferentiableTransform &other_in) { + const AppendTransform *other = dynamic_cast( + &other_in); + KALDI_ASSERT(transforms_.size() == other->transforms_.size()); + for (size_t i = 0; i < transforms_.size(); i++) + transforms_[i]->Add(*(other->transforms_[i])); +} + +int32 AppendTransform::Dim() const { + size_t num_transforms = transforms_.size(); + KALDI_ASSERT(num_transforms > 0); + int32 ans = 0; + for (size_t i = 0; i < num_transforms; i++) + ans += transforms_[i]->Dim(); + return ans; +} + +void AppendTransform::SetNumClasses(int32 num_classes) { + num_classes_ = num_classes; + for (size_t i = 0; i < transforms_.size(); i++) { + transforms_[i]->SetNumClasses(num_classes); + } +} + +AppendTransform::~AppendTransform() { + for (size_t i = 0; i < transforms_.size(); i++) + delete transforms_[i]; +} + + +MinibatchInfoItf* AppendTransform::TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const { + KALDI_ASSERT(input.NumCols() == Dim() && + SameDim(input, *output)); + AppendMinibatchInfo *ans = new AppendMinibatchInfo(); + int32 dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_dim = transforms_[i]->Dim(); + CuSubMatrix input_part = input.ColRange(dim_offset, this_dim), + output_part = output->ColRange(dim_offset, this_dim); + ans->info_vec.push_back(transforms_[i]->TrainingForward( + input_part, num_chunks, num_spk, posteriors, &output_part)); + dim_offset += this_dim; + } + KALDI_ASSERT(dim_offset == input.NumCols()); + return ans; +} + +void AppendTransform::TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const { + AppendMinibatchInfo *info = dynamic_cast(minibatch_info); + KALDI_ASSERT(info != NULL && "Mismatched MinibatchInfo type?"); + + int32 dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_dim = transforms_[i]->Dim(); + CuSubMatrix input_part = input.ColRange(dim_offset, this_dim), + output_deriv_part = output_deriv.ColRange(dim_offset, this_dim), + input_deriv_part = input_deriv->ColRange(dim_offset, this_dim); + transforms_[i]->TrainingBackward( + input_part, output_deriv_part, num_chunks, num_spk, + posteriors, info->info_vec[i], &input_deriv_part); + info->info_vec[i] = NULL; // Prevent it from being deleted twice. + dim_offset += this_dim; + } + KALDI_ASSERT(dim_offset == input.NumCols()); + delete info; // This function took ownership. 
+} + +int32 AppendTransform::NumFinalIterations() { + int32 ans = 0; + for (size_t i = 0; i < transforms_.size(); i++) + ans = std::max(ans, transforms_[i]->NumFinalIterations()); + return ans; +} + + +void AppendTransform::Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) { + int32 num_final_iters = 0, + dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_nf = transforms_[i]->NumFinalIterations(), + this_dim = transforms_[i]->Dim(); + if (final_iter < this_nf) + transforms_[i]->Accumulate(final_iter, + input.ColRange(dim_offset, this_dim), + num_chunks, num_spk, posteriors); + if (this_nf > num_final_iters) + num_final_iters = this_nf; + dim_offset += this_dim; + } + KALDI_ASSERT(final_iter >= 0 && final_iter < num_final_iters); +} + +void AppendTransform::Estimate(int32 final_iter) { + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_nf = transforms_[i]->NumFinalIterations(); + if (final_iter < this_nf) { + transforms_[i]->Estimate(final_iter); + } + } +} + +AppendMinibatchInfo::~AppendMinibatchInfo() { + for (size_t i = 0; i < info_vec.size(); i++) + delete info_vec[i]; +} + +SpeakerStatsItf* AppendTransform::GetEmptySpeakerStats() const { + AppendSpeakerStats *ans = new AppendSpeakerStats(); + for (size_t i = 0; i < transforms_.size(); i++) + ans->stats.push_back(transforms_[i]->GetEmptySpeakerStats()); + return ans; +} + +void AppendTransform::TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const { + AppendSpeakerStats *stats = dynamic_cast(speaker_stats); + KALDI_ASSERT(stats != NULL && stats->stats.size() == transforms_.size() && + "Wrong type of stats supplied to AppendTransform."); + int32 dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_dim = transforms_[i]->Dim(); + SubMatrix input_part = input.ColRange(dim_offset, this_dim); + transforms_[i]->TestingAccumulate(input_part, posteriors, + stats->stats[i]); + dim_offset += this_dim; + } + KALDI_ASSERT(dim_offset == input.NumCols()); +} + + +void AppendTransform::TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const { + const AppendSpeakerStats *stats = + dynamic_cast(&speaker_stats); + KALDI_ASSERT(stats != NULL && stats->stats.size() == transforms_.size() && + "Wrong type of stats supplied to AppendTransform."); + int32 dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_dim = transforms_[i]->Dim(); + SubMatrix input_part = input.ColRange(dim_offset, this_dim), + output_part = output->ColRange(dim_offset, this_dim); + transforms_[i]->TestingForward(input_part, *(stats->stats[i]), + &output_part); + dim_offset += this_dim; + } + KALDI_ASSERT(dim_offset == input.NumCols()); +} + +void AppendTransform::GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const { + int32 dim = Dim(); + KALDI_ASSERT(transform->NumRows() == dim && transform->NumCols() == dim + 1); + // first make sure the off-diagonal elements are zero. 
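[Editor's note] In matrix terms, the assembly that the code just below performs (zeroing the matrix, copying each sub-transform's dim_i x (dim_i+1) block onto the diagonal, then moving each block's bias column into the overall last column) produces a block-diagonal [A | b]. A toy sketch with two blocks and made-up numbers:

// Toy sketch of assembling one dim x (dim+1) affine transform from two
// per-block transforms [A1|b1] (d1 x (d1+1)) and [A2|b2] (d2 x (d2+1)),
// mirroring AppendTransform::GetTransformAsMatrix().  Not part of this patch.
#include <cstdio>
#include <vector>

int main() {
  const int d1 = 1, d2 = 2, dim = d1 + d2;
  double block1[d1][d1 + 1] = { {2.0, 0.5} };             // [A1 | b1]
  double block2[d2][d2 + 1] = { {1.0, 0.0, -1.0},
                                {0.0, 3.0,  2.0} };       // [A2 | b2]

  std::vector<std::vector<double> > out(dim, std::vector<double>(dim + 1, 0.0));

  // Block 1 occupies rows/cols [0, d1); its bias goes to the overall last column.
  for (int r = 0; r < d1; r++) {
    for (int c = 0; c < d1; c++) out[r][c] = block1[r][c];
    out[r][dim] = block1[r][d1];
  }
  // Block 2 occupies rows/cols [d1, dim); its bias also goes to the last column.
  for (int r = 0; r < d2; r++) {
    for (int c = 0; c < d2; c++) out[d1 + r][d1 + c] = block2[r][c];
    out[d1 + r][dim] = block2[r][d2];
  }

  for (int r = 0; r < dim; r++) {
    for (int c = 0; c <= dim; c++) std::printf("%5.1f ", out[r][c]);
    std::printf("\n");
  }
  return 0;
}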
+ transform->SetZero(); + const AppendSpeakerStats *stats = + dynamic_cast(&speaker_stats); + KALDI_ASSERT(stats != NULL && stats->stats.size() == transforms_.size() && + "Wrong type of stats supplied to AppendTransform."); + int32 dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_dim = transforms_[i]->Dim(); + SubMatrix transform_part(*transform, dim_offset, this_dim, + dim_offset, this_dim + 1); + transforms_[i]->GetTransformAsMatrix(*(stats->stats[i]), &transform_part); + if (i + 1 < transforms_.size()) { + int32 current_offset_column = dim_offset + this_dim, + required_offset_column = dim; + for (int32 r = dim_offset; r < dim_offset + this_dim; r++) { + (*transform)(r, required_offset_column) = (*transform)(r, current_offset_column); + (*transform)(r, current_offset_column) = BaseFloat(0.0); + } + } + dim_offset += this_dim; + } + KALDI_ASSERT(dim_offset == Dim()); +} + +void AppendSpeakerStats::Estimate() { + for (size_t i = 0; i < stats.size(); i++) + stats[i]->Estimate(); +} + +AppendSpeakerStats::~AppendSpeakerStats() { + for (size_t i = 0; i < stats.size(); i++) + delete stats[i]; +} + + +} // namespace differentiable_transform +} // namespace kaldi diff --git a/src/adapt/generic-transform.h b/src/adapt/generic-transform.h new file mode 100644 index 00000000000..9b7933b69af --- /dev/null +++ b/src/adapt/generic-transform.h @@ -0,0 +1,333 @@ +// adapt/generic-transform.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_TRANSFORM_GENERIC_TRANSFORM_H_ +#define KALDI_TRANSFORM_GENERIC_TRANSFORM_H_ + +#include +#include "base/kaldi-common.h" +#include "matrix/kaldi-matrix.h" +#include "cudamatrix/cu-matrix.h" +#include "adapt/differentiable-transform-itf.h" + +// This header contains 'generic' forms of differentiable transform, which allow +// you to append more basic transforms together or concatenate them dimension-wise. +// Also it includes a no-op transform. + +namespace kaldi { +namespace differentiable_transform { + + +/** + This is a version of the transform class that does nothing. It's potentially + useful for situations where you want to apply speaker normalization to some + dimensions of the feature vector but not to others. 
+ */ +class NoOpTransform: public DifferentiableTransform { + public: + + int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) override; + + int32 Dim() const override { return dim_; } + + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override { + output->CopyFromMat(input); + return NULL; + } + void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const override { + KALDI_ASSERT(minibatch_info == NULL); + input_deriv->AddMat(1.0, output_deriv); + } + + int32 NumFinalIterations() override { return 0; } + + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override { } + + + SpeakerStatsItf *GetEmptySpeakerStats() const override { + return new SpeakerStatsItf(); + } + + void TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const override { } + + void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const override { + output->CopyFromMat(input); + } + + void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const override { transform->SetUnit(); } + + void Estimate(int32 final_iter) override { } + + NoOpTransform(): dim_(-1) { } + + NoOpTransform(const NoOpTransform &other): + DifferentiableTransform(other), + dim_(other.dim_) { } + + DifferentiableTransform* Copy() const override { + return new NoOpTransform(*this); + } + + std::string Type() const override { return "NoOpTransform"; } + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + + void Add(const DifferentiableTransform &other) override { } + private: + int32 dim_; +}; + + +/** + This is a version of the transform class that does a sequence of other + transforms, specified by other instances of the DifferentiableTransform + interface. For instance: fMLLR followed by another fMLLR, or mean normalization + followed by fMLLR. The reason this might make sense is that you'd get a better + estimate of the speaker-adapted class means if you do some kind of speaker + normalization before estimating those class means. + + Caution: the framework currently implicitly assumes that the + final one of the supplied transforms subsumes the previous ones + (as in fMLLR subsumes mean subtraction, or fMLLR subsumes a previous + fMLLR of the same dimension). This means that in test time the + first of the two transforms may be ignored and only the second one + performed. This is in order to keep a single-pass adaptation framework + in test time. The sequence of transforms still makes a difference + because it affects how we compute the adaptation model (i.e., it's + more like a speaker-adapted model than a speaker independent model, + to use traditional ASR terminology). 
+ */ +class SequenceTransform: public DifferentiableTransform { + public: + int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) override; + + int32 Dim() const override; + void SetNumClasses(int32 num_classes) override; + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override; + void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const override; + + int32 NumFinalIterations() override; + + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override; + + void Estimate(int32 final_iter) override; + + SpeakerStatsItf *GetEmptySpeakerStats() const override { + // See comment at the top of this class for an explanation. + return transforms_.back()->GetEmptySpeakerStats(); + } + + void TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const override; + + void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const override; + + void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const override; + + SequenceTransform(const SequenceTransform &other); + + SequenceTransform() { } + + DifferentiableTransform* Copy() const override { + return new SequenceTransform(*this); + } + + std::string Type() const override { return "SequenceTransform"; } + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + + void Add(const DifferentiableTransform &other) override; + + ~SequenceTransform() override; + private: + std::vector transforms_; +}; + +// This is the type actually returned by TrainingForward() for SequenceTransform. +// It contains a list of other MinibatchInfo, together with the outputs for all +// but the last call. +class SequenceMinibatchInfo: public MinibatchInfoItf { + public: + std::vector info_vec; + // outputs.size() will be info.size() - 1. + std::vector > outputs; + + ~SequenceMinibatchInfo() override; +}; + + +class AppendSpeakerStats: public SpeakerStatsItf { + public: + AppendSpeakerStats() { } + + std::vector stats; + + void Estimate() override; + + ~AppendSpeakerStats(); +}; + +/** + This is a version of the transform class that consists of a number of other + transforms, appended dimension-wise, so its feature dimension is the sum of + the dimensions of the constituent transforms-- e.g. this could be used to + implement block-diagonal fMLLR, or a structure where some dimensions are + adapted and some are not. 
+ */ +class AppendTransform: public DifferentiableTransform { + public: + int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) override; + + int32 Dim() const override; + void SetNumClasses(int32 num_classes) override; + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override; + void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const override; + + int32 NumFinalIterations() override; + + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override; + + SpeakerStatsItf *GetEmptySpeakerStats() const override; + + void TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const override; + + void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const override; + + void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const override; + + void Estimate(int32 final_iter) override; + + AppendTransform(const AppendTransform &other); + + AppendTransform() { } + + DifferentiableTransform* Copy() const override { + return new AppendTransform(*this); + } + + std::string Type() const override { return "AppendTransform"; } + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + + ~AppendTransform(); + + void Add(const DifferentiableTransform &other) override; + private: + std::vector transforms_; +}; + + +// This is the type created by TrainingForward() for AppendTransform. +// It just contains a list of other MinibatchInfo. +class AppendMinibatchInfo: public MinibatchInfoItf { + public: + std::vector info_vec; + + ~AppendMinibatchInfo() override; +}; + + +} // namespace differentiable_transform +} // namespace kaldi + +#endif // KALDI_TRANSFORM_GENERIC_TRANSFORM_H_ diff --git a/src/base/io-funcs.h b/src/base/io-funcs.h index 6c2b690f54c..0144e71f987 100644 --- a/src/base/io-funcs.h +++ b/src/base/io-funcs.h @@ -108,7 +108,7 @@ namespace kaldi { it doesn't throw. It's useful if a class can have various forms based on typedefs and virtual classes, and wants to know which version to read. - ReadToken allow the caller to obtain the next token. PeekToken works just + ReadToken allows the caller to obtain the next token. PeekToken works just like ReadToken, but seeks back to the beginning of the token. A subsequent call to ReadToken will read the same token again. This is useful when different object types are written to the same file; using PeekToken one can diff --git a/src/chain/chain-generic-numerator.cc b/src/chain/chain-generic-numerator.cc index d3a114242c2..7453568913a 100644 --- a/src/chain/chain-generic-numerator.cc +++ b/src/chain/chain-generic-numerator.cc @@ -209,9 +209,33 @@ BaseFloat GenericNumeratorComputation::AlphaRemainingFrames(int seq, return log_prob_product + log_scale_product; } +/* This function converts the pdf occupation probabilties (computed + using Forward-Backward on the numerator graph) to posteriors. 
+ "derivs" is frames_per_sequence by pdf_index_size (i.e., indices.size()) +*/ +static void ConvertDerivsToPosterior(const MatrixBase &derivs, + const std::vector &indices, + int32 pdf_stride, + int32 frames_per_sequence, + int32 num_sequences, + Posterior *post) { + post->resize(frames_per_sequence * num_sequences); + for (size_t t = 0; t < derivs.NumRows(); ++t) + for (int32 n = 0; n < derivs.NumCols(); ++n) { + BaseFloat posterior = Exp(derivs(t, n)); + if (posterior != 0.0) { + int32 seq = indices[n] / pdf_stride; + int32 pdfid = indices[n] % pdf_stride; + (*post)[t * num_sequences + seq].push_back( + std::make_pair(pdfid, posterior)); + } + } +} + bool GenericNumeratorComputation::ForwardBackward( BaseFloat *total_loglike, - CuMatrixBase *nnet_output_deriv) { + CuMatrixBase *nnet_output_deriv, + Posterior *numerator_post) { KALDI_ASSERT(total_loglike != NULL); KALDI_ASSERT(nnet_output_deriv != NULL); KALDI_ASSERT(nnet_output_deriv->NumCols() == nnet_output_.NumCols()); @@ -243,6 +267,10 @@ bool GenericNumeratorComputation::ForwardBackward( if (GetVerboseLevel() >= 1) ok = ok && CheckValues(seq, probs, alpha, beta, derivs); } + if (numerator_post) + ConvertDerivsToPosterior(derivs, index_to_pdf_, nnet_output_.Stride(), + supervision_.frames_per_sequence, + num_sequences, numerator_post); // Transfer and add the derivatives to the values in the matrix AddSpecificPdfsIndirect(&derivs, index_to_pdf_, nnet_output_deriv); *total_loglike = partial_loglike; diff --git a/src/chain/chain-generic-numerator.h b/src/chain/chain-generic-numerator.h index fc5e00b2c63..2becfd56051 100644 --- a/src/chain/chain-generic-numerator.h +++ b/src/chain/chain-generic-numerator.h @@ -33,6 +33,7 @@ #include "lat/kaldi-lattice.h" #include "matrix/kaldi-matrix.h" #include "hmm/transition-model.h" +#include "hmm/posterior.h" #include "chain/chain-supervision.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-array.h" @@ -121,7 +122,8 @@ class GenericNumeratorComputation { // nnet output w.r.t. the (log-prob times supervision_.weight times // deriv_weight) to 'nnet_output_deriv'. bool ForwardBackward(BaseFloat *total_loglike, - CuMatrixBase *nnet_output_deriv); + CuMatrixBase *nnet_output_deriv, + Posterior *numerator_post = NULL); BaseFloat ComputeObjf(); private: diff --git a/src/chain/chain-numerator.cc b/src/chain/chain-numerator.cc index 139d28bdd77..caba37023a7 100644 --- a/src/chain/chain-numerator.cc +++ b/src/chain/chain-numerator.cc @@ -146,9 +146,29 @@ BaseFloat NumeratorComputation::Forward() { return tot_log_prob_ * supervision_.weight; } +/* This function converts the pdf occupation probabilties (computed + using Forward-Backward on the numerator graph) to posteriors. 
+*/ +static void ConvertDerivsToPosterior( + const Vector &derivs, + const std::vector &nnet_output_indexes, + int32 nnet_output_rows, + Posterior *post) { + post->resize(nnet_output_rows); + for (size_t i = 0; i < nnet_output_indexes.size(); ++i) { + if (derivs(i) != 0.0) { + int32 row = nnet_output_indexes[i].first; + int32 pdfid = nnet_output_indexes[i].second; + (*post)[row].push_back( + std::make_pair(pdfid, derivs(i))); + } + } +} + void NumeratorComputation::Backward( - CuMatrixBase *nnet_output_deriv) { + CuMatrixBase *nnet_output_deriv, + Posterior *numerator_post) { const fst::StdVectorFst &fst = supervision_.fst; int32 num_states = fst.NumStates(); log_beta_.Resize(num_states, kUndefined); @@ -201,6 +221,13 @@ void NumeratorComputation::Backward( KALDI_WARN << "Disagreement in forward/backward log-probs: " << tot_log_prob_backward << " vs. " << tot_log_prob_; + if (numerator_post) { + std::vector nnet_output_indexes_cpu; + nnet_output_indexes_.CopyToVec(&nnet_output_indexes_cpu); + ConvertDerivsToPosterior(nnet_logprob_derivs_, nnet_output_indexes_cpu, + nnet_output_.NumRows(), numerator_post); + } + // copy this data to GPU. CuVector nnet_logprob_deriv_cuda; nnet_logprob_deriv_cuda.Swap(&nnet_logprob_derivs_); diff --git a/src/chain/chain-numerator.h b/src/chain/chain-numerator.h index 15cb31e0571..63cb186fde8 100644 --- a/src/chain/chain-numerator.h +++ b/src/chain/chain-numerator.h @@ -32,6 +32,7 @@ #include "lat/kaldi-lattice.h" #include "matrix/kaldi-matrix.h" #include "hmm/transition-model.h" +#include "hmm/posterior.h" #include "chain/chain-supervision.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-array.h" @@ -78,7 +79,8 @@ class NumeratorComputation { // Does the backward computation and (efficiently) adds the derivative of the // nnet output w.r.t. the (log-prob times supervision_.weight times // deriv_weight) to 'nnet_output_deriv'. - void Backward(CuMatrixBase *nnet_output_deriv); + void Backward(CuMatrixBase *nnet_output_deriv, + Posterior *numerator_post = NULL); private: @@ -143,4 +145,3 @@ class NumeratorComputation { } // namespace kaldi #endif // KALDI_CHAIN_CHAIN_NUMERATOR_H_ - diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index f8a2c1d11cc..be727d333d2 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -720,17 +720,16 @@ Supervision::Supervision(const Supervision &other): void MergeSupervisionE2e(const std::vector &input, Supervision *output_supervision) { KALDI_ASSERT(!input.empty()); - KALDI_ASSERT(input[0]->e2e_fsts.size() == 1); *output_supervision = *(input[0]); output_supervision->e2e_fsts.reserve(input.size()); int32 frames_per_sequence = output_supervision->frames_per_sequence, num_seqs = input.size(); for (int32 i = 1; i < num_seqs; i++) { - output_supervision->num_sequences++; - KALDI_ASSERT(input[i]->e2e_fsts.size() == 1); + output_supervision->num_sequences += input[i]->num_sequences; KALDI_ASSERT(input[i]->frames_per_sequence == frames_per_sequence); - output_supervision->e2e_fsts.push_back(input[i]->e2e_fsts[0]); + for (int32 j = 0; j < input[i]->num_sequences; ++j) + output_supervision->e2e_fsts.push_back(input[i]->e2e_fsts[j]); } output_supervision->alignment_pdfs.clear(); // The program nnet3-chain-acc-lda-stats works on un-merged egs, @@ -766,7 +765,7 @@ void MergeSupervision(const std::vector &input, // append src.fst to output_supervision->fst. 
// the complexity here is O(V1 + E1) fst::Concat(src.fst, &output_supervision->fst); - output_supervision->num_sequences++; + output_supervision->num_sequences += src.num_sequences; } else { KALDI_ERR << "Mismatch weight or frames_per_sequence between inputs"; } diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 6b4a7b593c2..c4637c9cb86 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -28,7 +28,6 @@ namespace kaldi { namespace chain { - void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const Supervision &supervision, @@ -37,7 +36,8 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrix *xent_output_deriv) { + CuMatrix *xent_output_deriv, + Posterior *numerator_post = NULL) { BaseFloat num_logprob_weighted, den_logprob_weighted; bool denominator_ok = true; bool numerator_ok = true; @@ -77,12 +77,14 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, // the numerator object, as well as the returned logprob. if (xent_output_deriv) { numerator_ok = numerator.ForwardBackward(&num_logprob_weighted, - xent_output_deriv); + xent_output_deriv, + numerator_post); if (numerator_ok && nnet_output_deriv) nnet_output_deriv->AddMat(1.0, *xent_output_deriv); } else if (nnet_output_deriv) { numerator_ok = numerator.ForwardBackward(&num_logprob_weighted, - nnet_output_deriv); + nnet_output_deriv, + numerator_post); } else { num_logprob_weighted = numerator.ComputeObjf(); } @@ -146,11 +148,26 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrix *xent_output_deriv) { + CuMatrix *xent_output_deriv, + Posterior *numerator_post) { + if (!nnet_output_deriv && !xent_output_deriv && numerator_post) { + // To compute the posteriors, we will need to compute the numerator + // derivatives first (and to compute them, at least one of the *_deriv + // arguments should be non-NULL). 
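[Editor's note] The numerator_post being plumbed through here uses Kaldi's Posterior type: one vector per output frame, each holding (pdf-id, weight) pairs, with frames ordered as frame 0 of every sequence, then frame 1, and so on, which is why ConvertDerivsToPosterior indexes entry t * num_sequences + seq. A self-contained toy illustration of that layout (fake pdf-ids, plain std types):

// Toy illustration of the layout of the Posterior written to numerator_post.
// Plain C++, made-up contents; not part of this patch.
#include <cstdio>
#include <utility>
#include <vector>

typedef std::vector<std::vector<std::pair<int, float> > > ToyPosterior;

int main() {
  const int num_sequences = 2, frames_per_sequence = 3;
  ToyPosterior post(num_sequences * frames_per_sequence);

  // Give frame t of sequence seq a single fake pdf-id with weight 1.0.
  for (int t = 0; t < frames_per_sequence; t++)
    for (int seq = 0; seq < num_sequences; seq++)
      post[t * num_sequences + seq].push_back(
          std::make_pair(100 * seq + t /* fake pdf-id */, 1.0f));

  // Read back the posteriors of sequence 1 only.
  const int seq = 1;
  for (int t = 0; t < frames_per_sequence; t++) {
    const std::vector<std::pair<int, float> > &frame =
        post[t * num_sequences + seq];
    std::printf("seq %d, frame %d: pdf %d, weight %.1f\n",
                seq, t, frame[0].first, frame[0].second);
  }
  return 0;
}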
+ CuMatrix xent_deriv; + // Recurse + ComputeChainObjfAndDeriv(opts, den_graph, supervision, + nnet_output, objf, l2_term, + weight, nnet_output_deriv, + &xent_deriv, numerator_post); + return; + } + if (!supervision.e2e_fsts.empty()) { ComputeChainObjfAndDerivE2e(opts, den_graph, supervision, nnet_output, objf, l2_term, - weight, nnet_output_deriv, xent_output_deriv); + weight, nnet_output_deriv, + xent_output_deriv, numerator_post); return; } @@ -190,11 +207,11 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, num_logprob_weighted = numerator.Forward(); if (xent_output_deriv) { - numerator.Backward(xent_output_deriv); + numerator.Backward(xent_output_deriv, numerator_post); if (nnet_output_deriv) nnet_output_deriv->AddMat(1.0, *xent_output_deriv); } else if (nnet_output_deriv) { - numerator.Backward(nnet_output_deriv); + numerator.Backward(nnet_output_deriv, numerator_post); } } diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 6ea70b5ca41..d66c3c18900 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -28,10 +28,12 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "fstext/fstext-lib.h" +#include "hmm/posterior.h" #include "tree/context-dep.h" #include "lat/kaldi-lattice.h" #include "matrix/kaldi-matrix.h" #include "hmm/transition-model.h" +#include "hmm/posterior.h" #include "chain/chain-den-graph.h" #include "chain/chain-supervision.h" @@ -99,7 +101,7 @@ struct ChainTrainingOptions { example; you'll want to divide it by 'tot_weight' before displaying it. @param [out] l2_term The l2 regularization term in the objective function, if - the --l2-regularize option is used. To be added to 'o + the --l2-regularize option is used (else will be set to 0.0). @param [out] weight The weight to normalize the objective function by; equals supervision.weight * supervision.num_sequences * supervision.frames_per_sequence. @@ -115,6 +117,12 @@ struct ChainTrainingOptions { peak memory use). xent_output_deriv will be used in the cross-entropy regularization code; it is also used in computing the cross-entropy objective value. + @param [out] numerator_post If non-NULL, then the posterior from the numerator + forward-backward will be written here (note: it won't be + scaled by the supervision weight). The order is the + same as the input (i.e., frame 0 for all sequences, + then frame 1, etc). This is intended for + use in the adaptation framework used in "chaina" training. */ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, @@ -124,7 +132,8 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrix *xent_output_deriv = NULL); + CuMatrix *xent_output_deriv = NULL, + Posterior *numerator_post = NULL); diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index a3222d2285f..2accefc57fa 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -98,7 +98,7 @@ int main(int argc, char *argv[]) { "Usage: nnet3-chain-combine [options] ... 
\n" "\n" "e.g.:\n" - " nnet3-combine den.fst 35.raw 36.raw 37.raw 38.raw ark:valid.cegs final.raw\n"; + " nnet3-chain-combine den.fst 35.raw 36.raw 37.raw 38.raw ark:valid.cegs final.raw\n"; bool binary_write = true; int32 max_objective_evaluations = 30; @@ -113,7 +113,7 @@ int main(int argc, char *argv[]) { "maximum number of objective evaluations in order to figure " "out the best number of models to combine. It helps to speedup " "if the number of models provided to this binary is quite " - "large (e.g. several hundred)."); + "large (e.g. several hundred)."); po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); po.Register("batchnorm-test-mode", &batchnorm_test_mode, diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 0117fe2200f..60fb70bd1c7 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -91,7 +91,7 @@ void FilterExample(int32 min_input_t, if (io.name == "input") { min_t = min_input_t; max_t = max_input_t; - + const std::vector &indexes_in = io.indexes; std::vector indexes_out; indexes_out.reserve(indexes_in.size()); @@ -124,22 +124,88 @@ void FilterExample(int32 min_input_t, } } +/** + This function extends the left/right input context by adding + necessary indexes (and feature rows) for the NnetIo named "input". + First/last frame will be duplicated to add left/right context respectively. + */ +void ExtendContext(NnetChainExample *eg, + int32 n_stride, + int32 min_input_t, + int32 max_input_t, + int32 extend_left_context, + int32 extend_right_context) { + // process the inputs + for (size_t i = 0; i < eg->inputs.size(); i++) { + NnetIo &io = eg->inputs[i]; + if (io.name == "input") { + // Assume t_stride = 1 (since it's input) + std::vector &indexes = io.indexes; + KALDI_ASSERT(indexes.size() < 2 || indexes[0].t + 1 == indexes[1].t); + // The input indexes are not re-ordered. The order is: all frames of first + // sequence, then all frames of 2nd seq, ... + indexes.resize(indexes.size() + n_stride * (extend_left_context + + extend_right_context)); + KALDI_ASSERT(indexes.size() == n_stride * + (max_input_t - min_input_t + 1)); + + for (int32 n = 0, i = 0; n < n_stride; ++n) { + for (int32 t = min_input_t; t <= max_input_t; ++t, ++i) { + indexes[i].t = t; + indexes[i].n = n; + } + } + + Matrix features_out(indexes.size(), io.features.NumCols()); + Matrix features_in; + io.features.GetMatrix(&features_in); -/** Returns true if the "eg" contains just a single example, meaning - that all the "n" values in the indexes are zero, and the example - has NnetIo members named both "input" and "output" + int32 original_min_t = min_input_t + extend_left_context, + original_max_t = max_input_t - extend_right_context; + // For each "n", duplicate the first frame to extend left context, + // then copy the features, then duplicate the last frame to extend right + // context. 
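+      // Worked example (hypothetical numbers): with extend_left_context = 2,
+      // extend_right_context = 1 and input rows [r0 r1 r2 r3] for one
+      // sequence, the output rows for that sequence become
+      // [r0 r0 r0 r1 r2 r3 r3]: the first row is repeated twice on the left
+      // and the last row once on the right.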
+ int32 i_in = 0, i_out = 0; + for (int32 n = 0; n < n_stride; ++n) { + // Duplicate frame i_in, "extend_left_context" times + for (int32 j = 0; j < extend_left_context; ++j, ++i_out) + features_out.Row(i_out).CopyFromVec(features_in.Row(i_in)); + + for (int32 t = original_min_t; t <= original_max_t; ++t, ++i_out, ++i_in) + features_out.Row(i_out).CopyFromVec(features_in.Row(i_in)); + + // Duplicate frame i_in - 1, "extend_right_context" times + for (int32 j = 0; j < extend_right_context; ++j, ++i_out) + features_out.Row(i_out).CopyFromVec(features_in.Row(i_in - 1)); + + } + KALDI_ASSERT(i_in == features_in.NumRows()); + KALDI_ASSERT(i_out == features_out.NumRows()); + + GeneralMatrix features_out_gmat; + features_out_gmat.SwapFullMatrix(&features_out); + io.features = features_out_gmat; + } + } +} + +/** Counts the number of single examples in "eg", which is equal to + the maximum "n" value in the indexes plus 1. + If the example does not have both "input" and "output" NnetIo members, + this function will exit the program with an error. Also computes the minimum and maximum "t" values in the "input" and "output" NnetIo members. */ -bool ContainsSingleExample(const NnetChainExample &eg, - int32 *min_input_t, - int32 *max_input_t, - int32 *min_output_t, - int32 *max_output_t) { +static int32 CountSingleExamples(const NnetChainExample &eg, + int32 *min_input_t, + int32 *max_input_t, + int32 *min_output_t, + int32 *max_output_t) { bool done_input = false, done_output = false; int32 num_indexes_input = eg.inputs.size(); int32 num_indexes_output = eg.outputs.size(); + int32 max_n = 0; for (int32 i = 0; i < num_indexes_input; i++) { const NnetIo &input = eg.inputs[i]; std::vector::const_iterator iter = input.indexes.begin(), @@ -152,23 +218,12 @@ bool ContainsSingleExample(const NnetChainExample &eg, int32 this_t = iter->t; min_t = std::min(min_t, this_t); max_t = std::max(max_t, this_t); - if (iter->n != 0) { - KALDI_WARN << "Example does not contain just a single example; " - << "too late to do frame selection or reduce context."; - return false; - } + if (iter->n > max_n) + max_n = iter->n; } done_input = true; *min_input_t = min_t; *max_input_t = max_t; - } else { - for (; iter != end; ++iter) { - if (iter->n != 0) { - KALDI_WARN << "Example does not contain just a single example; " - << "too late to do frame selection or reduce context."; - return false; - } - } } } @@ -184,34 +239,22 @@ bool ContainsSingleExample(const NnetChainExample &eg, int32 this_t = iter->t; min_t = std::min(min_t, this_t); max_t = std::max(max_t, this_t); - if (iter->n != 0) { - KALDI_WARN << "Example does not contain just a single example; " - << "too late to do frame selection or reduce context."; - return false; - } + // max_n must be the same for all io's (either input or output). + KALDI_ASSERT(iter->n <= max_n + && "Mismatched 'n' values. 
Partially merged?"); } done_output = true; *min_output_t = min_t; *max_output_t = max_t; - } else { - for (; iter != end; ++iter) { - if (iter->n != 0) { - KALDI_WARN << "Example does not contain just a single example; " - << "too late to do frame selection or reduce context."; - return false; - } - } } } - if (!done_input) { - KALDI_WARN << "Example does not have any input named 'input'"; - return false; - } - if (!done_output) { - KALDI_WARN << "Example does not have any output named 'output'"; - return false; - } - return true; + if (!done_input) + KALDI_ERR << "Example does not have any input named 'input'"; + + if (!done_output) + KALDI_ERR << "Example does not have any output named 'output'"; + + return max_n + 1; } // calculate the frame_subsampling_factor @@ -221,47 +264,49 @@ void CalculateFrameSubsamplingFactor(const NnetChainExample &eg, - eg.outputs[0].indexes[0].t; } +/* This function adds or removes context for the examples inside + "eg" (which can contain just a single example or it can be a + merged-eg which would contain more than one example). Addition or + removal of context is determined by comparing "left_context" with + the observed left context of "eg" (the same goes for right context): + if it's more, it'll extend input context by duplicating the first (or last, + for right context) frame. Otherwise, it'll remove the extra context from + both inputs and outputs in "eg". Note that when extending context, only the + "input" io will be modified (the "output" io will remain the same). + */ void ModifyChainExampleContext(int32 left_context, int32 right_context, const int32 frame_subsampling_factor, - NnetChainExample *eg) { - static bool warned_left = false, warned_right = false; + NnetChainExample *eg, + int32 *left_context_extension, + int32 *right_context_extension) { int32 min_input_t, max_input_t, - min_output_t, max_output_t; - if (!ContainsSingleExample(*eg, &min_input_t, &max_input_t, - &min_output_t, &max_output_t)) - KALDI_ERR << "Too late to perform frame selection/context reduction on " - << "these examples (already merged?)"; - if (left_context != -1) { + min_output_t, max_output_t; + *left_context_extension = 0; + *right_context_extension = 0; + // Example stride really means "n" stride (of the NnetIo's) + int32 example_stride = CountSingleExamples(*eg, &min_input_t, &max_input_t, + &min_output_t, &max_output_t); + if (left_context >= 0) { int32 observed_left_context = min_output_t - min_input_t; - if (!warned_left && observed_left_context < left_context) { - warned_left = true; - KALDI_WARN << "You requested --left-context=" << left_context - << ", but example only has left-context of " - << observed_left_context - << " (will warn only once; this may be harmless if " - "using any --*left-context-initial options)"; - } - min_input_t = std::max(min_input_t, min_output_t - left_context); + if (left_context > observed_left_context) // Extend + *left_context_extension = left_context - observed_left_context; + // Adjust min input t + min_input_t = min_output_t - left_context; } - if (right_context != -1) { + if (right_context >= 0) { int32 observed_right_context = max_input_t - max_output_t; - - if (right_context != -1) { - if (!warned_right && observed_right_context < right_context) { - warned_right = true; - KALDI_WARN << "You requested --right-context=" << right_context - << ", but example only has right-context of " - << observed_right_context - << " (will warn only once; this may be harmless if " - "using any --*right-context-final options."; - } - 
max_input_t = std::min(max_input_t, max_output_t + right_context); - } + if (right_context > observed_right_context) // Extend + *right_context_extension = right_context - observed_right_context; + max_input_t = max_output_t + right_context; } + FilterExample(min_input_t, max_input_t, min_output_t, max_output_t, eg); + if (*left_context_extension > 0 || *right_context_extension > 0) + ExtendContext(eg, example_stride, min_input_t, max_input_t, + *left_context_extension, *right_context_extension); } // ModifyChainExampleContext } // namespace nnet3 @@ -348,6 +393,8 @@ int main(int argc, char *argv[]) { exclude_names.push_back(std::string("ivector")); int64 num_read = 0, num_written = 0, num_err = 0; + int64 num_left_context_extensions = 0, num_right_context_extensions = 0, + total_left_context_extension = 0, total_right_context_extension = 0; for (; !example_reader.Done(); example_reader.Next(), num_read++) { const std::string &key = example_reader.Key(); NnetChainExample &eg = example_reader.Value(); @@ -367,7 +414,7 @@ int main(int argc, char *argv[]) { weight = egs_weight_reader.Value(key); ScaleSupervisionWeight(weight, &eg); } - + if (!eg_output_name_rspecifier.empty()) { if (!output_name_reader.HasKey(key)) { KALDI_WARN << "No new output-name for example key " << key; @@ -377,13 +424,25 @@ int main(int argc, char *argv[]) { std::string new_output_name = output_name_reader.Value(key); RenameOutputs(new_output_name, &eg); } - + if (frame_shift != 0) ShiftChainExampleTimes(frame_shift, exclude_names, &eg); - if (left_context != -1 || right_context != -1) + if (left_context != -1 || right_context != -1) { + int32 right_context_extension, left_context_extension; ModifyChainExampleContext(left_context, right_context, - frame_subsampling_factor, &eg); - + frame_subsampling_factor, &eg, + &left_context_extension, + &right_context_extension); + if (left_context_extension > 0) { + num_left_context_extensions++; + total_left_context_extension += left_context_extension; + } + if (right_context_extension > 0) { + num_right_context_extensions++; + total_right_context_extension += right_context_extension; + } + } + for (int32 c = 0; c < count; c++) { int32 index = (random ? Rand() : num_written) % num_outputs; example_writers[index]->Write(key, eg); @@ -394,6 +453,16 @@ int main(int argc, char *argv[]) { delete example_writers[i]; KALDI_LOG << "Read " << num_read << " neural-network training examples, wrote " << num_written; + if (num_left_context_extensions > 0) + KALDI_LOG << "Left context was extended for " + << num_left_context_extensions << " examples, by an average of " + << (1.0 * total_left_context_extension / + num_left_context_extensions) << " frames"; + if (num_right_context_extensions > 0) + KALDI_LOG << "Right context was extended for " + << num_right_context_extensions << " examples, by an average of " + << (1.0 * total_right_context_extension + / num_right_context_extensions) << " frames."; return (num_written == 0 ? 
1 : 0); } catch(const std::exception &e) { std::cerr << e.what() << '\n'; diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index 1032b7e2125..23291eac167 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -95,7 +95,7 @@ static bool ProcessFile(const TransitionModel *trans_mdl, const VectorBase *deriv_weights, int32 supervision_length_tolerance, const std::string &utt_id, - bool compress, + bool compress, bool long_key, UtteranceSplitter *utt_splitter, NnetChainExampleWriter *example_writer) { KALDI_ASSERT(supervision.num_sequences == 1); @@ -228,9 +228,14 @@ static bool ProcessFile(const TransitionModel *trans_mdl, nnet_chain_eg.Compress(); std::ostringstream os; - os << utt_id << "-" << chunk.first_frame; + if (long_key) + os << utt_id + << "-" << chunk.first_frame << "-" << chunk.left_context + << "-" << chunk.num_frames << "-" << chunk.right_context << "-v1"; + else // key is - + os << utt_id << "-" << chunk.first_frame; - std::string key = os.str(); // key is - + std::string key = os.str(); example_writer->Write(key, nnet_chain_eg); } @@ -265,7 +270,7 @@ int main(int argc, char *argv[]) { "Note: the --frame-subsampling-factor option must be the same as given to\n" "chain-get-supervision.\n"; - bool compress = true; + bool compress = true, long_key = false; int32 length_tolerance = 100, online_ivector_period = 1, supervision_length_tolerance = 1; @@ -281,9 +286,9 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs with input features " "in compressed format (recommended). Update: this is now " - "only relevant if the features being read are un-compressed; " - "if already compressed, we keep we same compressed format when " - "dumping-egs."); + "only relevant if the features being read are uncompressed; " + "if already compressed, we keep the same compressed format when " + "dumping egs."); po.Register("ivectors", &online_ivector_rspecifier, "Alias for " "--online-ivectors option, for back compatibility"); po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of " @@ -311,6 +316,8 @@ int main(int argc, char *argv[]) { "Filename of transition model to read; should only be supplied " "if you want 'unconstrained' egs, and if you supplied " "--convert-to-pdfs=false to chain-get-supervision."); + po.Register("long-key", &long_key, "If true, a long format will be used " + "for the key, which encodes context info, etc."); eg_config.Register(&po); @@ -426,7 +433,7 @@ int main(int argc, char *argv[]) { if (!ProcessFile(trans_mdl_ptr, normalization_fst, feats, online_ivector_feats, online_ivector_period, supervision, deriv_weights, supervision_length_tolerance, - key, compress, + key, compress, long_key, &utt_splitter, &example_writer)) num_err++; } diff --git a/src/chainbin/nnet3-chain-merge-egs.cc b/src/chainbin/nnet3-chain-merge-egs.cc index a3686d2fc30..926cda76cf3 100644 --- a/src/chainbin/nnet3-chain-merge-egs.cc +++ b/src/chainbin/nnet3-chain-merge-egs.cc @@ -64,7 +64,16 @@ int main(int argc, char *argv[]) { ChainExampleMerger merger(merging_config, &example_writer); for (; !example_reader.Done(); example_reader.Next()) { const NnetChainExample &cur_eg = example_reader.Value(); - merger.AcceptExample(new NnetChainExample(cur_eg)); + NnetChainExample *cur_eg_copy = new NnetChainExample(cur_eg); + if (merging_config.use_query_string) { + std::string key = example_reader.Key(); + int pos = key.find('?'); + if (pos != 
std::string::npos) { + std::string query = key.substr(pos + 1, key.size() - pos - 1); + cur_eg_copy->bucket = query; + } + } + merger.AcceptExample(cur_eg_copy); } // the merger itself prints the necessary diagnostics. merger.Finish(); diff --git a/src/featbin/select-feats.cc b/src/featbin/select-feats.cc index c10f0c64ed5..284902f782e 100644 --- a/src/featbin/select-feats.cc +++ b/src/featbin/select-feats.cc @@ -37,7 +37,9 @@ int main(int argc, char *argv[]) { "command cut -f ...\n" "Usage: select-feats \n" " e.g. select-feats 0,24-22,3-12 scp:feats.scp ark,scp:feat-red.ark,feat-red.scp\n" - "See also copy-feats, extract-feature-segments, subset-feats, subsample-feats\n"; + "See also copy-feats, extract-feature-segments, subset-feats, subsample-feats\n" + "Note: this command should no longer be needed in most cases, as it can be done\n" + "more efficiently at the script level; see the script utils/data/limit_feature_dim.sh"; ParseOptions po(usage); diff --git a/src/gmmbin/gmm-est-fmllr.cc b/src/gmmbin/gmm-est-fmllr.cc index 9f8dfd89143..e0702c4fcf8 100644 --- a/src/gmmbin/gmm-est-fmllr.cc +++ b/src/gmmbin/gmm-est-fmllr.cc @@ -195,4 +195,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/hmm/posterior.h b/src/hmm/posterior.h index e153c249740..3b8016ac712 100644 --- a/src/hmm/posterior.h +++ b/src/hmm/posterior.h @@ -51,6 +51,42 @@ typedef std::vector > > Posterior; typedef std::vector > > > GaussPost; +/// This class allows you to select a sub-vector of Posteriors, possibly with a +/// stride, without copying them elsewhere. SubPosterior is to Posterior as +/// SubVector is to Vector. (Note: Posterior is actually a typedef to +/// std::vector > >. +/// We can add a non-const interface later if needed. +class SubPosterior { + public: + SubPosterior(const Posterior &post): + num_frames_(post.size()), stride_(1), data_( + num_frames_ == 0 ? NULL : &(post[0])) { } + SubPosterior(const Posterior &post, size_t offset, + size_t num_frames, size_t stride = 1): + num_frames_(num_frames), stride_(stride), + data_(num_frames_ == 0 ? NULL : &(post[offset])) { + KALDI_ASSERT(stride > 0 && post.size() > offset + (num_frames-1) * stride); + } + SubPosterior(const SubPosterior &post, size_t offset, + size_t num_frames, size_t stride = 1): + num_frames_(num_frames), stride_(stride * post.stride_), + data_(num_frames_ == 0 ? NULL : post.data_ + (offset * post.stride_)) { + KALDI_ASSERT(offset + num_frames * (stride - 1) < post.num_frames_); + } + size_t size() const { return num_frames_; } + const std::vector > &operator[] (size_t i) const { + KALDI_PARANOID_ASSERT(i < num_frames_); + return data_[i * stride_]; + } + SubPosterior(const SubPosterior &other) = default; + private: + size_t num_frames_; + size_t stride_; + const std::vector > *data_; +}; + + + // PosteriorHolder is a holder for Posterior, which is // std::vector > > // This is used for storing posteriors of transition id's for an diff --git a/src/hmm/transition-model.h b/src/hmm/transition-model.h index e453c24f9cb..c41ec2e7b32 100644 --- a/src/hmm/transition-model.h +++ b/src/hmm/transition-model.h @@ -251,6 +251,7 @@ class TransitionModel { /// compare the transition probabilities. bool Compatible(const TransitionModel &other) const; + TransitionModel(const TransitionModel &other) = default; private: void MleUpdateShared(const Vector &stats, const MleTransitionUpdateConfig &cfg, @@ -321,7 +322,8 @@ class TransitionModel { /// of pdfs). 
int32 num_pdfs_; - KALDI_DISALLOW_COPY_AND_ASSIGN(TransitionModel); + // Disallow assignment by making it private; this won't be defined. + TransitionModel &operator=(const TransitionModel &other); }; inline int32 TransitionModel::TransitionIdToPdf(int32 trans_id) const { diff --git a/src/matrix/Makefile b/src/matrix/Makefile index e39be1ffec9..2fcf62fcb69 100644 --- a/src/matrix/Makefile +++ b/src/matrix/Makefile @@ -10,7 +10,7 @@ include ../kaldi.mk # you can uncomment matrix-lib-speed-test if you want to do the speed tests. -TESTFILES = matrix-lib-test sparse-matrix-test #matrix-lib-speed-test +TESTFILES = matrix-lib-test sparse-matrix-test matrix-functions-test #matrix-lib-speed-test OBJFILES = kaldi-matrix.o kaldi-vector.o packed-matrix.o sp-matrix.o tp-matrix.o \ matrix-functions.o qr.o srfft.o compressed-matrix.o \ diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index 11a5e08b15d..d7ee8eb388f 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -531,6 +531,10 @@ class MatrixBase { * positive semi-definite (check_thresh controls how stringent the check is; * set it to 2 to ensure it won't ever complain, but it will zero out negative * dimensions in your matrix. + * + * Caution: if you want the eigenvalues, it may make more sense to convert to + * SpMatrix and use Eig() function there, which uses eigenvalue decomposition + * directly rather than SVD. */ void SymPosSemiDefEig(VectorBase *s, MatrixBase *P, Real check_thresh = 0.001); diff --git a/src/matrix/matrix-functions-test.cc b/src/matrix/matrix-functions-test.cc new file mode 100644 index 00000000000..203892a54e3 --- /dev/null +++ b/src/matrix/matrix-functions-test.cc @@ -0,0 +1,184 @@ +// matrix/matrix-functions-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) +// 2018 Institute of Acoustics, CAS (Gaofeng Cheng) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "matrix/matrix-functions.h" +#include "matrix/kaldi-vector.h" +#include "matrix/kaldi-matrix.h" +#include "matrix/sp-matrix.h" + +namespace kaldi { + +void SvdRescalerTestIdentity() { + // this tests the case where f() is the identity function. + int32 dim = 10; + Matrix mat(dim, dim); + if (RandInt(0, 1) == 0) + mat.SetRandn(); + // else zero. 
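+  // With f(lambda) = lambda and f'(lambda) = 1 (set below), F(A) should
+  // reproduce A exactly and the backpropagated derivative should pass through
+  // unchanged; that is what the two AssertEqual checks below verify.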
+ bool symmetric = false; + + SvdRescaler sc; + sc.Init(&mat, symmetric); + + BaseFloat *lambda = sc.InputSingularValues(), + *f_lambda= sc.OutputSingularValues(), + *fprime_lambda = sc.OutputSingularValueDerivs(); + for (int32 i = 0; i < dim; i++) { + f_lambda[i] = lambda[i]; + fprime_lambda[i] = 1.0; + } + Matrix output(dim, dim, kUndefined); + sc.GetOutput(&output); + AssertEqual(mat, output, 0.001); + Matrix output_deriv(dim, dim, kUndefined), + input_deriv(dim, dim); + output_deriv.SetRandn(); + sc.ComputeInputDeriv(output_deriv, &input_deriv); + KALDI_LOG << output_deriv << input_deriv; + AssertEqual(output_deriv, input_deriv); +} + +void SvdRescalerTestPowerDiag() { + // this tests the case where f() is a power function with random exponent, + // and the matrix is diagonal. + int32 dim = 10; + BaseFloat power = 0.25 * RandInt(0, 4); + bool symmetric = (RandInt(0, 1) == 0); + Matrix mat(dim, dim); + for (int32 i = 0; i < dim; i++) { + mat(i, i) = 0.25 * RandInt(0, 10); + // if power < 1.0, we can't allow zero diagonal + // elements, or the derivatives would be undefined. + if (power < 1.0 && mat(i, i) == 0.0) + mat(i, i) = 0.333; + } + + SvdRescaler sc; + sc.Init(&mat, symmetric); + + BaseFloat *lambda = sc.InputSingularValues(), + *f_lambda= sc.OutputSingularValues(), + *fprime_lambda = sc.OutputSingularValueDerivs(); + for (int32 i = 0; i < dim; i++) { + f_lambda[i] = pow(lambda[i], power); + fprime_lambda[i] = power * pow(lambda[i], power - 1.0); + } + Matrix output(dim, dim, kUndefined); + sc.GetOutput(&output); + KALDI_ASSERT(mat.IsDiagonal(0.001)); + Matrix output_deriv(dim, dim, kUndefined), + input_deriv(dim, dim); + output_deriv.SetRandn(); + sc.ComputeInputDeriv(output_deriv, &input_deriv); + + for (int32 i = 0; i < dim; i++) { + BaseFloat oderiv = output_deriv(i, i), + ideriv = input_deriv(i, i), + x = mat(i, i), + df = power * pow(x, power - 1.0); + AssertEqual(ideriv, oderiv * df); + } +} + + +void SvdRescalerTestExp() { + // this tests the case where f() is the exponential function, and the matrix + // is an arbitrary matrix. + int32 dim = 10; + //bool symmetric = (RandInt(0, 1) == 0); + bool symmetric = false; + BaseFloat exp_scale = 0.2 * RandInt(0, 5); + + Matrix mat(dim, dim); + + if (symmetric) { + SpMatrix s(dim); + s.SetRandn(); + mat.CopyFromSp(s); + } else { + mat.SetRandn(); + } + + KALDI_LOG << "Matrix sum is " << mat.Sum(); + + SvdRescaler sc; + sc.Init(&mat, symmetric); + BaseFloat *lambda = sc.InputSingularValues(), + *f_lambda= sc.OutputSingularValues(), + *fprime_lambda = sc.OutputSingularValueDerivs(); + for (int32 i = 0; i < dim; i++) { + f_lambda[i] = exp(exp_scale * lambda[i]); + fprime_lambda[i] = exp_scale * exp(exp_scale * lambda[i]); + } + Matrix output(dim, dim, kUndefined); + sc.GetOutput(&output); + Matrix output_deriv(dim, dim, kUndefined), + input_deriv(dim, dim); + output_deriv.SetRandn(); + sc.ComputeInputDeriv(output_deriv, &input_deriv); + + + // use random directions to test the accuracy of the derivatives. 
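+  // For each random direction delta (scaled by a small epsilon), the change
+  // in the objective tr(output_deriv^T F(A)) predicted from input_deriv is
+  // sum_ij delta(i,j) * input_deriv(i,j); the actual change is measured by
+  // re-running the rescaler on mat + delta. To first order the two should
+  // agree, which the final AssertEqual checks (tolerance 0.01).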
+ int32 n = 4; + Vector expected_change(n), actual_change(n); + BaseFloat epsilon = 0.001; + for (int32 k = 0; k < n; k++) { + Matrix delta(dim, dim); + if (symmetric) { + SpMatrix s(dim); + s.SetRandn(); + delta.CopyFromSp(s); + } else { + delta.SetRandn(); + } + delta.Scale(epsilon); + expected_change(k) = TraceMatMat(delta, input_deriv, kTrans); + delta.AddMat(1.0, mat); + SvdRescaler sc2(&delta, symmetric); + BaseFloat *lambda = sc2.InputSingularValues(), + *f_lambda= sc2.OutputSingularValues(), + *fprime_lambda = sc2.OutputSingularValueDerivs(); + for (int32 i = 0; i < dim; i++) { + f_lambda[i] = exp(exp_scale * lambda[i]); + fprime_lambda[i] = exp_scale * exp(exp_scale * lambda[i]); + } + Matrix output_perturbed(dim, dim); + sc2.GetOutput(&output_perturbed); + actual_change(k) = TraceMatMat(output_deriv, output_perturbed, kTrans) - + TraceMatMat(output_deriv, output, kTrans); + } + KALDI_LOG << "Matrix sum is " << mat.Sum(); + KALDI_LOG << "Predicted " << expected_change + << " vs. actual " << actual_change; + AssertEqual(expected_change, actual_change, 0.01); +} + + + +} // namespace kaldi + +int main() { + for (int32 i = 0; i < 10; i++) { + kaldi::SvdRescalerTestIdentity(); + kaldi::SvdRescalerTestPowerDiag(); + kaldi::SvdRescalerTestExp(); + } + std::cout << "Test OK.\n"; +} diff --git a/src/matrix/matrix-functions.cc b/src/matrix/matrix-functions.cc index 496c09f5344..7a222026010 100644 --- a/src/matrix/matrix-functions.cc +++ b/src/matrix/matrix-functions.cc @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation; Go Vivace Inc.; Jan Silovsky // Yanmin Qian; Saarland University; Johns Hopkins University (Author: Daniel Povey) +// Gaofeng Cheng (Institute of Acoustics, Chinese Academy of Sciences) // See ../../COPYING for clarification regarding multiple authors // @@ -769,5 +770,115 @@ void AddOuterProductPlusMinus(double alpha, MatrixBase *plus, MatrixBase *minus); +void SvdRescaler::Init(const MatrixBase *A, bool symmetric) { + KALDI_ASSERT(A->NumRows() == A->NumCols()); + A_ = A; + symmetric_ = symmetric; + int32 dim = A->NumRows(); + lambdas_.Resize(3, dim, kUndefined); + U_.Resize(dim, dim, kUndefined); + SubVector lambda(lambdas_, 0); + if (symmetric) { + // the following constructor will check that A is actually symmetric. + SpMatrix A_sym(*A_, kTakeMeanAndCheck); + A_sym.Eig(&lambda, &U_); + } else { + Vt_.Resize(dim, dim, kUndefined); + A_->Svd(&lambda, &U_, &Vt_); + } +} + +BaseFloat *SvdRescaler::InputSingularValues() { + return lambdas_.RowData(0); +} + +BaseFloat *SvdRescaler::OutputSingularValues() { + return lambdas_.RowData(1); +} + +BaseFloat *SvdRescaler::OutputSingularValueDerivs() { + return lambdas_.RowData(2); +} + +void SvdRescaler::GetOutput(MatrixBase *output) { + int32 dim = A_->NumRows(); + SubVector f_lambda(lambdas_, 1); // f(lambda) in the writeup. + if (symmetric_) { + SpMatrix S(dim); + S.AddMat2Vec(1.0, U_, kNoTrans, f_lambda, 0.0); + output->CopyFromSp(S); + } else { + Matrix U_tmp(U_); + U_tmp.MulColsVec(f_lambda); + output->SetZero(); + output->AddMatMat(1.0, U_tmp, kNoTrans, Vt_, kNoTrans, 0.0); + } +} + +void SvdRescaler::ComputeInputDeriv(const MatrixBase &output_deriv, + MatrixBase *input_deriv) const { + int32 dim = A_->NumRows(); + KALDI_ASSERT(output_deriv.NumRows() == dim && output_deriv.NumCols() == dim && + input_deriv->NumRows() == dim && input_deriv->NumCols() == dim); + // input_deriv is \bar{A} in the writeup. + input_deriv->SetZero(); + + // \bar{D} in the writeup; see class declaration. 
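+  // Concretely, bar_d = U^T * output_deriv * V (or U^T * output_deriv * U in
+  // the symmetric case), i.e. the derivative w.r.t. the output expressed in
+  // the bases of the decomposition; this is what the AddMatMatMat calls just
+  // below compute.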
+ Matrix bar_d(dim, dim); + if (!symmetric_) + bar_d.AddMatMatMat(1.0, U_, kTrans, output_deriv, kNoTrans, Vt_, kTrans, 0.0); + else + bar_d.AddMatMatMat(1.0, U_, kTrans, output_deriv, kNoTrans, U_, kNoTrans, 0.0); + + Matrix bar_lambda(dim, dim); + + const BaseFloat *lambda = lambdas_.RowData(0), // elements \lambda_i + *f_lambda = lambdas_.RowData(1), // elements f(\lambda_i) + *f_lambda_deriv = lambdas_.RowData(2); // elements f'(lambda_i) + + // we use doubles in the computations below, to avoid underflow if any floating + // point values were extremely close to zero (e.g., denormal) + for(int32 i = 0; i < dim; i++) { + double lambda_i = lambda[i], lambda2_i = lambda_i * lambda_i, + d_i = f_lambda[i]; + for(int32 j = 0; j < dim; j++) { + double lambda_j = lambda[j], lambda2_j = lambda_j * lambda_j, + d_j = f_lambda[j], bar_d_ij = bar_d(i, j), + bar_d_ji = bar_d(j, i), bar_lambda_ij; + + if (i == j) { + bar_lambda_ij = bar_d_ij * f_lambda_deriv[i]; + } else if (std::abs(lambda_i - lambda_j) > 1.0e-03 * std::abs(lambda_i)) { + // if lambda_i and lambda_j are not (relatively) too close in value (which + // implies that at least one them is nonzero).. + bar_lambda_ij = bar_d_ij * ((lambda_i * d_i - lambda_j * d_j) / + (lambda2_i - lambda2_j)) + + bar_d_ji * ((lambda_j * d_i - lambda_i * d_j) / + (lambda2_i - lambda2_j)); + } else if (lambda_i != 0) { + // If we reached here, it implies they are both nonzero, but extremely + // close in value. + // lambda is the average of the two lambdas. + // Assume f'(lambda) is the average of the two derivatives. + double lambda = 0.5 * (lambda_i + lambda_j), + f_prime_lambda = 0.5 * (f_lambda_deriv[i] + f_lambda_deriv[j]), + d = 0.5 * (d_i + d_j); + bar_lambda_ij = bar_d_ij * ((lambda * f_prime_lambda + d) / (2.0 * lambda)) + + bar_d_ji * ((lambda * f_prime_lambda - d) / (2.0 * lambda)); + } else { + // both zero. + KALDI_ASSERT(lambda_i == 0 && lambda_j == 0); + bar_lambda_ij = bar_d_ij * f_lambda_deriv[i]; + } + bar_lambda(i, j) = bar_lambda_ij; + } + } + if (!symmetric_) + input_deriv->AddMatMatMat(1.0, U_, kNoTrans, bar_lambda, kNoTrans, + Vt_, kNoTrans, 0.0); + else + input_deriv->AddMatMatMat(1.0, U_, kNoTrans, bar_lambda, kNoTrans, + U_, kTrans, 0.0); +} } // end namespace kaldi diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index ca50ddda7c8..2b3ec8133e9 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -164,6 +164,126 @@ inline void AssertSameDim(const MatrixBase &mat1, const MatrixBase } +/* + This class allows you to compute the class of function described in + http://www.danielpovey.com/files/2018_svd_derivative.pdf + and to backprop through that computation. + Short summary: it allows you to apply some kind of scalar function + to the singular values of a square matrix, reconstruct it, and then + backprop through that operation. + + This class is quite general-purpose in the sense that you can + provide any scalar function; but in order to avoid things like + passing function-pointers around, we had give it a rather clunky + interface. The way you are supposed to use it is as follows + (to give an example): + + Matrix A(...); // set it somehow. + SvdRescaler rescaler(A); + const VectorBase &lambda_in = A.InputSingularValues(); + VectorBase &lambda_out = *(A.OutputSingularValues()); + VectorBase &lambda_out_deriv = *(A.OutputSingularValueDerivs()); + for (int32 i = 0; i < lambda_in.size(); i++) { + // compute the scalar function and its derivative for the singular + // values. 
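+       // (some_func / some_func_deriv stand for whatever scalar function you
+       // want to apply; for instance, flooring small singular values with
+       // some_func(x) = max(x, epsilon) and
+       // some_func_deriv(x) = (x > epsilon ? 1 : 0) would be one possible
+       // choice. This is only an illustration, not part of the interface.)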
+       lambda_out(i) = some_func(lambda_in(i));
+       lambda_out_deriv(i) = some_func_deriv(lambda_in(i));
+     }
+     Matrix B(A.NumRows(), A.NumCols(), kUndefined);
+     rescaler.GetOutput(&B);
+     // Do something with B.
+     Matrix B_deriv(...);  // Get the derivative w.r.t. B
+                           // somehow.
+     Matrix A_deriv(A.NumRows(), A.NumCols());  // Get the derivative w.r.t. A.
+
+
+ */
+class SvdRescaler {
+ public:
+  /*
+    Constructor.
+    'A' is the input matrix. See class-level documentation above for
+    more information.
+
+    If 'symmetric' is set to true, then the user is asserting that A is
+    symmetric, and that that symmetric structure needs to be preserved in the
+    output. In this case, we use code for the symmetric eigenvalue problem to
+    do the decomposition instead of the SVD. I.e. decompose A = P diag(s) P^T
+    instead of A = U diag(s) V^T, using SpMatrix::Eig(). You can view this as a
+    special case of SVD.
+  */
+  SvdRescaler(const MatrixBase *A, bool symmetric) {
+    Init(A, symmetric);
+  }
+
+  // Constructor that takes no args. In this case you are supposed to
+  // call Init().
+  SvdRescaler() { }
+
+  // An alternative to the constructor that takes args. Should only be called
+  // directly after initializing the object with no args. Warning: this object
+  // keeps a reference to this matrix, so don't modify it during the lifetime
+  // of this object.
+  // A is required to be square.
+  void Init(const MatrixBase *A, bool symmetric);
+
+  // Return a pointer to the singular values of A, which will have been
+  // computed in the constructor.
+  // The reason why this is not const is that there may be
+  // situations where you discover that the input matrix has some very small
+  // singular values, and you want to (say) floor them somehow and reconstruct,
+  // and have the derivatives be valid assuming you had given that 'repaired'
+  // matrix A as input. Modifying the elements of this vector gives you
+  // a way to do that, although currently this class doesn't provide a way
+  // for you to access that 'fixed-up' A directly.
+  // We hope you know what you are doing if you modify these singular values.
+  BaseFloat *InputSingularValues();
+
+  // Returns a pointer to a place that you can write the
+  // modified singular values f(lambda).
+  BaseFloat *OutputSingularValues();
+
+  // Returns a pointer to a place that you can write the
+  // values of f'(lambda) (the function-derivative of f).
+  BaseFloat *OutputSingularValueDerivs();
+
+  // Outputs F(A) to 'output', which must have the correct size.
+  // It's OK if 'output' contains NaNs on entry.
+  // Before calling this, you must have set the values in
+  // 'OutputSingularValues()'.
+  void GetOutput(MatrixBase *output);
+
+  // Computes the derivative of some function g w.r.t. the input A,
+  // given that dg/d(output) is provided in 'output_deriv'.
+  // This derivative is *added* to 'input_deriv', so you need
+  // to zero 'input_deriv' or otherwise set it, beforehand.
+  // It is acceptable to call ComputeInputDeriv (with possibly different
+  // values of 'output_deriv' and 'input_deriv') as many times as you want,
+  // on the same object.
+  void ComputeInputDeriv(const MatrixBase &output_deriv,
+                         MatrixBase *input_deriv) const;
+
+ protected:
+  // the input matrix A. Owned by the user but will not be changed by them
+  // during the lifetime of this object.
+  const MatrixBase *A_;
+  bool symmetric_;
+  // U_ is present regardless of whether symmetric_ is true. It is the
+  // left part of the decomposition A = U diag(s) V^T.
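+  // (In the symmetric case, Init() puts the eigenvectors P of the
+  // decomposition A = P diag(s) P^T here, computed with SpMatrix::Eig()
+  // rather than a full SVD.)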
+  Matrix U_;
+  // Vt_ is only present if symmetric_ is false. Otherwise, we
+  // assume that Vt_ equals U_.
+  Matrix Vt_;
+
+  // a matrix containing three rows, and num-cols equal to the num-rows of the
+  // square matrix A_.
+  // row 0 is 'lambda_in' (the input singular values; or the input eigenvalues,
+  // in the symmetric case).
+  // row 1 is 'lambda_out' (the output singular values, i.e. f(lambda)),
+  // row 2 is 'lambda_out_deriv' (the function-derivative f'(lambda)).
+  Matrix lambdas_;
+};
+
 /// @} end of "addtogroup matrix_funcs_misc"
 } // end namespace kaldi
diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile
index 5e67211c3a7..66177559218 100644
--- a/src/nnet3/Makefile
+++ b/src/nnet3/Makefile
@@ -41,6 +41,6 @@ ADDLIBS = ../chain/kaldi-chain.a ../cudamatrix/kaldi-cudamatrix.a \
           ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
           ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-          ../base/kaldi-base.a
+          ../base/kaldi-base.a

 include ../makefiles/default_rules.mk
diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc
index c627bb1032a..517c63e394e 100644
--- a/src/nnet3/nnet-chain-example.cc
+++ b/src/nnet3/nnet-chain-example.cc
@@ -33,13 +33,18 @@ void NnetChainSupervision::Write(std::ostream &os, bool binary) const {
   supervision.Write(os, binary);
   WriteToken(os, binary, "");
   deriv_weights.Write(os, binary);
+  if (chunks_per_group != 1) {
+    WriteToken(os, binary, "");
+    WriteBasicType(os, binary, chunks_per_group);
+  }
   WriteToken(os, binary, "");
 }

 bool NnetChainSupervision::operator == (const NnetChainSupervision &other) const {
   return name == other.name && indexes == other.indexes &&
       supervision == other.supervision &&
-      deriv_weights.ApproxEqual(other.deriv_weights);
+      deriv_weights.ApproxEqual(other.deriv_weights) &&
+      chunks_per_group == other.chunks_per_group;
 }

 void NnetChainSupervision::Read(std::istream &is, bool binary) {
@@ -47,17 +52,17 @@ void NnetChainSupervision::Read(std::istream &is, bool binary) {
   ReadToken(is, binary, &name);
   ReadIndexVector(is, binary, &indexes);
   supervision.Read(is, binary);
-  std::string token;
-  ReadToken(is, binary, &token);
-  // in the future this back-compatibility code can be reworked.
-  if (token != "") {
-    KALDI_ASSERT(token == "" || token == "");
-    if (token == "")
-      ReadVectorAsChar(is, binary, &deriv_weights);
-    else
-      deriv_weights.Read(is, binary);
-    ExpectToken(is, binary, "");
+  // If the following fails, you may be using much older egs that are no longer
+  // supported to be read by the current code -> re-dump the egs.
+ ExpectToken(is, binary, ""); + deriv_weights.Read(is, binary); + if (PeekToken(is, binary) == 'C') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &chunks_per_group); + } else { + chunks_per_group = 1; } + ExpectToken(is, binary, ""); CheckDim(); } @@ -75,6 +80,8 @@ void NnetChainSupervision::CheckDim() const { frame_skip = indexes[supervision.num_sequences].t - first_frame, num_sequences = supervision.num_sequences, frames_per_sequence = supervision.frames_per_sequence; + KALDI_ASSERT(chunks_per_group > 0 && + num_sequences % chunks_per_group == 0); int32 k = 0; for (int32 i = 0; i < frames_per_sequence; i++) { for (int32 j = 0; j < num_sequences; j++,k++) { @@ -93,13 +100,15 @@ NnetChainSupervision::NnetChainSupervision(const NnetChainSupervision &other): name(other.name), indexes(other.indexes), supervision(other.supervision), - deriv_weights(other.deriv_weights) { CheckDim(); } + deriv_weights(other.deriv_weights), + chunks_per_group(other.chunks_per_group) { CheckDim(); } void NnetChainSupervision::Swap(NnetChainSupervision *other) { name.swap(other->name); indexes.swap(other->indexes); supervision.Swap(&(other->supervision)); deriv_weights.Swap(&(other->deriv_weights)); + std::swap(chunks_per_group, other->chunks_per_group); if (RandInt(0, 5) == 0) CheckDim(); } @@ -112,7 +121,8 @@ NnetChainSupervision::NnetChainSupervision( int32 frame_skip): name(name), supervision(supervision), - deriv_weights(deriv_weights) { + deriv_weights(deriv_weights), + chunks_per_group(1) { // note: this will set the 'x' index to zero. indexes.resize(supervision.num_sequences * supervision.frames_per_sequence); @@ -177,6 +187,7 @@ void NnetChainExample::Read(std::istream &is, bool binary) { void NnetChainExample::Swap(NnetChainExample *other) { inputs.swap(other->inputs); outputs.swap(other->outputs); + std::swap(bucket, other->bucket); } void NnetChainExample::Compress() { @@ -211,6 +222,14 @@ static void MergeSupervision( &output_supervision); output->supervision.Swap(&output_supervision); + int32 example_stride = 0; + for (auto &index: inputs[0]->indexes) + if (index.n > example_stride) + example_stride = index.n; + example_stride++; + + KALDI_ASSERT(example_stride == inputs[0]->supervision.num_sequences); + output->indexes.clear(); output->indexes.reserve(num_indexes); for (int32 n = 0; n < num_inputs; n++) { @@ -223,8 +242,8 @@ static void MergeSupervision( // change the 'n' index to correspond to the index into 'input'. // Each example gets a different 'n' value, starting from 0. for (; iter != end; ++iter) { - KALDI_ASSERT(iter->n == 0 && "Merging already-merged chain egs"); - iter->n = n; + KALDI_ASSERT(iter->n < example_stride); + iter->n += n * example_stride; } } KALDI_ASSERT(output->indexes.size() == num_indexes); @@ -249,6 +268,7 @@ static void MergeSupervision( } } } + output->chunks_per_group = example_stride; output->CheckDim(); } @@ -350,6 +370,30 @@ void GetChainComputationRequest(const Nnet &nnet, KALDI_ERR << "No outputs in computation request."; } + +// Returns the frame subsampling factor, which is the difference between the +// first 't' value we encounter in 'indexes', and the next 't' value that is +// different from the first 't'. It will typically be 3. +// This function will crash if it could not figure it out (e.g. because +// 'indexes' was empty or had only one element). 
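+// For example, if the 't' values in 'indexes' are 0, 0, 0, 0, 3, 3, 3, 3, ...
+// (frame 0 of every sequence, then frame 3 of every sequence, as in a merged
+// eg), the factor returned is 3.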
+static int32 GetFrameSubsamplingFactor(const std::vector &indexes) { + + auto iter = indexes.begin(), end = indexes.end(); + int32 cur_t_value; + if (iter != end) { + cur_t_value = iter->t; + ++iter; + } + for (; iter != end; ++iter) { + if (iter->t != cur_t_value) { + KALDI_ASSERT(iter->t > cur_t_value); + return iter->t - cur_t_value; + } + } + KALDI_ERR << "Error getting frame subsampling factor"; + return 0; // Shouldn't be reached, this is to avoid compiler warnings. +} + void ShiftChainExampleTimes(int32 frame_shift, const std::vector &exclude_names, NnetChainExample *eg) { @@ -377,10 +421,7 @@ void ShiftChainExampleTimes(int32 frame_shift, sup_end = eg->outputs.end(); for (; sup_iter != sup_end; ++sup_iter) { std::vector &indexes = sup_iter->indexes; - KALDI_ASSERT(indexes.size() >= 2 && indexes[0].n == indexes[1].n && - indexes[0].x == indexes[1].x); - int32 frame_subsampling_factor = indexes[1].t - indexes[0].t; - KALDI_ASSERT(frame_subsampling_factor > 0); + int32 frame_subsampling_factor = GetFrameSubsamplingFactor(indexes); // We need to shift by a multiple of frame_subsampling_factor. // Round to the closest multiple. @@ -401,12 +442,13 @@ size_t NnetChainExampleStructureHasher::operator () ( const NnetChainExample &eg) const noexcept { // these numbers were chosen at random from a list of primes. NnetIoStructureHasher io_hasher; + StringHasher string_hasher; size_t size = eg.inputs.size(), ans = size * 35099; + ans += string_hasher(eg.bucket); for (size_t i = 0; i < size; i++) ans = ans * 19157 + io_hasher(eg.inputs[i]); for (size_t i = 0; i < eg.outputs.size(); i++) { const NnetChainSupervision &sup = eg.outputs[i]; - StringHasher string_hasher; IndexVectorHasher indexes_hasher; ans = ans * 17957 + string_hasher(sup.name) + indexes_hasher(sup.indexes); @@ -417,6 +459,8 @@ size_t NnetChainExampleStructureHasher::operator () ( bool NnetChainExampleStructureCompare::operator () ( const NnetChainExample &a, const NnetChainExample &b) const { + if (a.bucket != b.bucket) + return false; NnetIoStructureCompare io_compare; if (a.inputs.size() != b.inputs.size() || a.outputs.size() != b.outputs.size()) @@ -499,6 +543,8 @@ void ChainExampleMerger::WriteMinibatch( MergeChainExamples(config_.compress, egs, &merged_eg); std::ostringstream key; key << "merged-" << (num_egs_written_++) << "-" << minibatch_size; + if (!(*egs)[0].bucket.empty()) + key << "?" << (*egs)[0].bucket; writer_->Write(key.str(), merged_eg); } diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h index 187bb4ef3a3..eb6846fa4d2 100644 --- a/src/nnet3/nnet-chain-example.h +++ b/src/nnet3/nnet-chain-example.h @@ -60,7 +60,10 @@ struct NnetChainSupervision { std::vector indexes; - /// The supervision object, containing the FST. + /// The supervision object, containing the FST; its members are + /// weight, num_sequences, frames_per_sequence, label_dim, fst, + /// e2e_fsts (for e2e examples only); alignment_pdfs (which is required + /// only for nnet3-chain-acc-lda-stats). chain::Supervision supervision; /// This is a vector of per-frame weights, required to be between 0 and 1, @@ -76,6 +79,14 @@ struct NnetChainSupervision { /// to disk compactly as unsigned char. Vector deriv_weights; + /// This will be 1 in normal cases, but in the 'chaina' code (chain training + /// with adaptation) it will be set to the number of chunks/sequences per + /// group in this minibatch (the chunks from a particular group are expected + /// to come from the same speaker). 
For example if it's 4, then we are + /// asserting that sequences n=0 through 3 all come from the same speaker, n=4 + /// through 7 all come from the same speaker, and so on. + int32 chunks_per_group; + // Use default assignment operator NnetChainSupervision() { } @@ -118,6 +129,12 @@ struct NnetChainExample { /// be just one member with name == "output". std::vector outputs; + /// This relates to the '--use-query-string' option for merging. Examples + /// with different values of 'bucket' won't be merged together. Note that + /// this member variable is not written or read (in the Write/Read functions) + /// as it's not a permanent part of an eg. It's only used in the merging code. + std::string bucket; + void Write(std::ostream &os, bool binary) const; void Read(std::istream &is, bool binary); @@ -270,7 +287,7 @@ class ChainExampleMerger { std::vector, NnetChainExampleStructureHasher, NnetChainExampleStructureCompare> MapType; -MapType eg_to_egs_; + MapType eg_to_egs_; }; diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index a798cb597f5..d9562887817 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -33,6 +33,7 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, compiler_(*nnet, opts_.nnet_config.optimize_config, opts_.nnet_config.compiler_config), num_minibatches_processed_(0), + max_change_stats_(*nnet), srand_seed_(RandInt(0, 100000)) { if (opts.nnet_config.zero_component_stats) ZeroComponentStats(nnet); @@ -41,9 +42,6 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, opts.nnet_config.backstitch_training_interval > 0); delta_nnet_ = nnet_->Copy(); ScaleNnet(0.0, delta_nnet_); - const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); - num_max_change_per_component_applied_.resize(num_updatable, 0); - num_max_change_global_applied_ = 0; if (opts.nnet_config.read_cache != "") { bool binary; @@ -111,17 +109,19 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, this->ProcessOutputs(false, eg, &computer); computer.Run(); - // If relevant, add in the part of the gradient that comes from L2 - // regularization. + // If relevant, add in the part of the gradient that comes from + // parameter-level L2 regularization. ApplyL2Regularization(*nnet_, GetNumNvalues(eg.inputs, false) * nnet_config.l2_regularize_factor, delta_nnet_); // Updates the parameters of nnet - bool success = UpdateNnetWithMaxChange(*delta_nnet_, - nnet_config.max_param_change, 1.0, 1.0 - nnet_config.momentum, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, + nnet_config.max_param_change, + 1.0, 1.0 - nnet_config.momentum, nnet_, + &max_change_stats_); // Scale down the batchnorm stats (keeps them fresh... this affects what // happens when we use the model with batchnorm test-mode set). 
@@ -176,9 +176,10 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, } // Updates the parameters of nnet - UpdateNnetWithMaxChange(*delta_nnet_, - nnet_config.max_param_change, max_change_scale, scale_adding, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + UpdateNnetWithMaxChange( + *delta_nnet_, nnet_config.max_param_change, + max_change_scale, scale_adding, nnet_, + &max_change_stats_); if (is_backstitch_step1) { // The following will only do something if we have a LinearComponent or @@ -276,41 +277,10 @@ bool NnetChainTrainer::PrintTotalStats() const { const ObjectiveFunctionInfo &info = iter->second; ans = info.PrintTotalStats(name) || ans; } - PrintMaxChangeStats(); + max_change_stats_.Print(*nnet_); return ans; } -void NnetChainTrainer::PrintMaxChangeStats() const { - KALDI_ASSERT(delta_nnet_ != NULL); - const NnetTrainerOptions &nnet_config = opts_.nnet_config; - int32 i = 0; - for (int32 c = 0; c < delta_nnet_->NumComponents(); c++) { - Component *comp = delta_nnet_->GetComponent(c); - if (comp->Properties() & kUpdatableComponent) { - UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - << "UpdatableComponent; change this code."; - if (num_max_change_per_component_applied_[i] > 0) - KALDI_LOG << "For " << delta_nnet_->GetComponentName(c) - << ", per-component max-change was enforced " - << (100.0 * num_max_change_per_component_applied_[i]) / - (num_minibatches_processed_ * - (nnet_config.backstitch_training_scale == 0.0 ? 1.0 : - 1.0 + 1.0 / nnet_config.backstitch_training_interval)) - << " \% of the time."; - i++; - } - } - if (num_max_change_global_applied_ > 0) - KALDI_LOG << "The global max-change was enforced " - << (100.0 * num_max_change_global_applied_) / - (num_minibatches_processed_ * - (nnet_config.backstitch_training_scale == 0.0 ? 1.0 : - 1.0 + 1.0 / nnet_config.backstitch_training_interval)) - << " \% of the time."; -} - NnetChainTrainer::~NnetChainTrainer() { if (opts_.nnet_config.write_cache != "") { Output ko(opts_.nnet_config.write_cache, opts_.nnet_config.binary_write_cache); diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h index 5bf6a3f6fce..bc5143491ac 100644 --- a/src/nnet3/nnet-chain-training.h +++ b/src/nnet3/nnet-chain-training.h @@ -64,10 +64,6 @@ class NnetChainTrainer { // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; - // Prints out the max-change stats (if nonzero): the percentage of time that - // per-component max-change and global max-change were enforced. - void PrintMaxChangeStats() const; - ~NnetChainTrainer(); private: // The internal function for doing one step of conventional SGD training. @@ -88,11 +84,8 @@ class NnetChainTrainer { chain::DenominatorGraph den_graph_; Nnet *nnet_; - Nnet *delta_nnet_; // Only used if momentum != 0.0 or max-param-change != - // 0.0. nnet representing accumulated parameter-change - // (we'd call this gradient_nnet_, but due to - // natural-gradient update, it's better to consider it as - // a delta-parameter nnet. + Nnet *delta_nnet_; // stores the change to the parameters on each training + // iteration. CachingOptimizingCompiler compiler_; // This code supports multiple output layers, even though in the @@ -101,8 +94,7 @@ class NnetChainTrainer { int32 num_minibatches_processed_; // stats for max-change. 
- std::vector num_max_change_per_component_applied_; - int32 num_max_change_global_applied_; + MaxChangeStats max_change_stats_; unordered_map objf_info_; diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 1ff7daa01d1..53859e9b03c 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -81,9 +81,9 @@ ComponentPrecomputedIndexes* ComponentPrecomputedIndexes::NewComponentPrecompute // static Component* Component::ReadNew(std::istream &is, bool binary) { std::string token; - ReadToken(is, binary, &token); // e.g. "". - token.erase(0, 1); // erase "<". - token.erase(token.length()-1); // erase ">". + ReadToken(is, binary, &token); // e.g. "". + token.erase(0, 1); // erase "<". + token.erase(token.length() - 1); // erase ">". Component *ans = NewComponentOfType(token); if (!ans) KALDI_ERR << "Unknown component type " << token; diff --git a/src/nnet3/nnet-diagnostics.h b/src/nnet3/nnet-diagnostics.h index 61e2ed18e1d..8e88794d022 100644 --- a/src/nnet3/nnet-diagnostics.h +++ b/src/nnet3/nnet-diagnostics.h @@ -61,7 +61,7 @@ struct NnetComputeProbOptions { // constructor of NnetComputeProb that takes a pointer to the nnet, and the // stats will be stored there. bool store_component_stats; - + bool compute_per_dim_accuracy; NnetOptimizeOptions optimize_config; @@ -186,18 +186,18 @@ class NnetComputeProb { @param [out] tot_weight The sum of the values in the supervision matrix @param [out] tot_accuracy The total accuracy, equal to the sum over all row indexes r such that the maximum column index of row r of - supervision and nnet_output is the same, of the sum of + supervision and nnet_output is the same, of the sum of the r'th row of supervision (i.e. the row's weight). @param [out] tot_weight_vec If non-NULL, we write to this location the counts per-class in the supervision matrix. - This is expected to have the same dimension as the - corresponding output in the network. - @param [out] tot_accuracy_vec If non-NULL, we write to this location - the accuracy per-class. For index j, - the value is equal to the sum - over all row indexes r such that the maximum column index + This is expected to have the same dimension as the + corresponding output in the network. + @param [out] tot_accuracy_vec If non-NULL, we write to this location + the accuracy per-class. For index j, + the value is equal to the sum + over all row indexes r such that the maximum column index of row r of supervision is j and nnet_output is also j, - of the sum of the r'th row of supervision + of the sum of the r'th row of supervision (i.e. the row's weight) */ diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index cc5fe3cc050..f837ce27c66 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -81,7 +81,13 @@ static void GetIoSizes(const std::vector &src, } - +static int32 FindMaxNValue(const NnetIo &io) { + int32 max_n = 0; + for (auto &index: io.indexes) + if (index.n > max_n) + max_n = index.n; + return max_n; +} // Do the final merging of NnetIo, once we have obtained the names, dims and // sizes for each feature/supervision type. 
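+// When the inputs are themselves already-merged egs, the 'n' values are
+// renumbered as n_new = n_old + (index of the source eg) * example_stride,
+// where example_stride is one plus the largest 'n' found in the first input
+// (see FindMaxNValue above), so the chunks of each source eg stay adjacent
+// in 'n'.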
@@ -98,6 +104,9 @@ static void MergeIo(const std::vector &src, // The features in the different NnetIo in the Indexes across all examples std::vector > output_lists(num_feats); + // This is 1 for single examples and larger than 1 for already-merged egs, and + // it must be the same for all io's across all examples: + int32 example_stride = FindMaxNValue(src[0].io[0]) + 1; // Initialize the merged_eg merged_eg->io.clear(); merged_eg->io.resize(num_feats); @@ -137,11 +146,8 @@ static void MergeIo(const std::vector &src, std::vector::iterator output_iter = output_io.indexes.begin(); // Set the n index to be different for each of the original examples. for (int32 i = this_offset; i < this_offset + this_size; i++) { - // we could easily support merging already-merged egs, but I don't see a - // need for it right now. - KALDI_ASSERT(output_iter[i].n == 0 && - "Merging already-merged egs? Not currentlysupported."); - output_iter[i].n = n; + KALDI_ASSERT(output_iter[i].n < example_stride); + output_iter[i].n += n * example_stride; } this_offset += this_size; // note: this_offset is a reference. } @@ -357,7 +363,8 @@ UtteranceSplitter::~UtteranceSplitter() { KALDI_LOG << "Split " << total_num_utterances_ << " utts, with " << "total length " << total_input_frames_ << " frames (" << (total_input_frames_ / 360000.0) << " hours assuming " - << "100 frames per second)"; + << "100 frames per second) into " << total_num_chunks_ + << " chunks."; float average_chunk_length = total_frames_in_chunks_ * 1.0 / total_num_chunks_, overlap_percent = total_frames_overlap_ * 100.0 / total_input_frames_, output_percent = total_frames_in_chunks_ * 100.0 / total_input_frames_, @@ -556,7 +563,7 @@ bool UtteranceSplitter::LengthsMatch(const std::string &utt, int32 length_tolerance) const { int32 sf = config_.frame_subsampling_factor, expected_supervision_length = (utterance_length + sf - 1) / sf; - if (std::abs(supervision_length - expected_supervision_length) + if (std::abs(supervision_length - expected_supervision_length) <= length_tolerance) { return true; } else { diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 52b2ebbf904..0553eeb3d82 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -88,7 +88,6 @@ struct ExampleGenerationConfig { int32 frame_subsampling_factor; std::string num_frames_str; - // The following parameters are derived parameters, computed by // ComputeDerived(). @@ -325,12 +324,14 @@ class ExampleMergingConfig { std::string measure_output_frames; // for back-compatibility, not used. std::string minibatch_size; std::string discard_partial_minibatches; // for back-compatibility, not used. + bool use_query_string; ExampleMergingConfig(const char *default_minibatch_size = "256"): compress(false), measure_output_frames("deprecated"), minibatch_size(default_minibatch_size), - discard_partial_minibatches("deprecated") { } + discard_partial_minibatches("deprecated"), + use_query_string(false) { } void Register(OptionsItf *po) { po->Register("compress", &compress, "If true, compress the output examples " @@ -354,6 +355,14 @@ class ExampleMergingConfig { "--minibatch-size=128=64:128,256/256=32:64,128. Egs are given " "minibatch-sizes based on the specified eg-size closest to " "their actual size."); + po->Register("use-query-string", &use_query_string, "If true, the part of " + "the key name after the final '?' 
in the string (if one " + "is present) will be required to match when determining " + "which egs may be merged (so only egs with the same text " + "after the '?' will be merged), and the key used in the " + "output will end with the same query string, including " + "the '?'. An example query string is: " + "'?lang=english&tw=0.5&bw=1.0'"); } diff --git a/src/nnet3/nnet-parse-test.cc b/src/nnet3/nnet-parse-test.cc index babdbbdcb0e..5ae4917dba6 100644 --- a/src/nnet3/nnet-parse-test.cc +++ b/src/nnet3/nnet-parse-test.cc @@ -23,193 +23,6 @@ namespace kaldi { namespace nnet3 { -void UnitTestConfigLineParse() { - std::string str; - { - ConfigLine cfl; - str = "a-b xx=yyy foo=bar baz=123 ba=1:2"; - bool status = cfl.ParseLine(str); - KALDI_ASSERT(status && cfl.FirstToken() == "a-b"); - - KALDI_ASSERT(cfl.HasUnusedValues()); - std::string str_value; - KALDI_ASSERT(cfl.GetValue("xx", &str_value)); - KALDI_ASSERT(str_value == "yyy"); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(cfl.GetValue("foo", &str_value)); - KALDI_ASSERT(str_value == "bar"); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(!cfl.GetValue("xy", &str_value)); - KALDI_ASSERT(cfl.GetValue("baz", &str_value)); - KALDI_ASSERT(str_value == "123"); - - std::vector int_values; - KALDI_ASSERT(!cfl.GetValue("xx", &int_values)); - KALDI_ASSERT(cfl.GetValue("baz", &int_values)); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(int_values.size() == 1 && int_values[0] == 123); - KALDI_ASSERT(cfl.GetValue("ba", &int_values)); - KALDI_ASSERT(int_values.size() == 2 && int_values[0] == 1 && int_values[1] == 2); - KALDI_ASSERT(!cfl.HasUnusedValues()); - } - - { - ConfigLine cfl; - str = "a-b baz=x y z pp = qq ab =cd ac= bd"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "a-b baz=x y z pp = qq ab=cd ac=bd"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "foo-bar"; - KALDI_ASSERT(cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "foo-bar a=b c d f=g"; - std::string value; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "foo-bar" && - cfl.GetValue("a", &value) && value == "b c d" && - cfl.GetValue("f", &value) && value == "g" && - !cfl.HasUnusedValues()); - } - { - ConfigLine cfl; - str = "zzz a=b baz"; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "zzz" && - cfl.UnusedValues() == "a=b baz"); - } - { - ConfigLine cfl; - str = "xxx a=b baz "; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.UnusedValues() == "a=b baz"); - } - { - ConfigLine cfl; - str = "xxx a=b =c"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "xxx baz='x y z' pp=qq ab=cd ac=bd"; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "xxx"); - std::string str_value; - KALDI_ASSERT(cfl.GetValue("baz", &str_value)); - KALDI_ASSERT(str_value == "x y z"); - KALDI_ASSERT(cfl.GetValue("pp", &str_value)); - KALDI_ASSERT(str_value == "qq"); - KALDI_ASSERT(cfl.UnusedValues() == "ab=cd ac=bd"); - KALDI_ASSERT(cfl.GetValue("ab", &str_value)); - KALDI_ASSERT(str_value == "cd"); - KALDI_ASSERT(cfl.UnusedValues() == "ac=bd"); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(cfl.GetValue("ac", &str_value)); - KALDI_ASSERT(str_value == "bd"); - KALDI_ASSERT(!cfl.HasUnusedValues()); - } - - { - ConfigLine cfl; - str = "x baz= pp = qq flag=t "; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = " x baz= pp=qq flag=t "; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "x"); - - std::string str_value; - KALDI_ASSERT(cfl.GetValue("baz", &str_value)); - 
KALDI_ASSERT(str_value == ""); - KALDI_ASSERT(cfl.GetValue("pp", &str_value)); - KALDI_ASSERT(str_value == "qq"); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(cfl.GetValue("flag", &str_value)); - KALDI_ASSERT(str_value == "t"); - KALDI_ASSERT(!cfl.HasUnusedValues()); - - bool bool_value = false; - KALDI_ASSERT(cfl.GetValue("flag", &bool_value)); - KALDI_ASSERT(bool_value); - } - - { - ConfigLine cfl; - str = "xx _baz=a -pp=qq"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "xx 0baz=a pp=qq"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "xx -baz=a pp=qq"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "xx _baz'=a pp=qq"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = " baz=g"; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == ""); - bool flag; - KALDI_ASSERT(!cfl.GetValue("baz", &flag)); - } - { - ConfigLine cfl; - str = "xx _baz1=a pp=qq"; - KALDI_ASSERT(cfl.ParseLine(str)); - - std::string str_value; - KALDI_ASSERT(cfl.GetValue("_baz1", &str_value)); - } -} - -void UnitTestReadConfig() { - std::string str = "a-b alpha=aa beta=\"b b\"# String test\n" - "a-b beta2='b c' beta3=bd # \n" - "a-b gamma=1:2:3:4 # Int Vector test\n" - " a-b de1ta=f # Bool + Integer in key Comment test delta=t \n" - "a-b _epsilon=-1 # Int Vector test _epsilon=1 \n" - "a-b zet-_a=0.15 theta=1.1# Float, -, _ test\n" - "a-b quoted='a b c' # quoted string\n" - "a-b quoted2=\"d e 'a b=c' f\" # string quoted with double quotes"; - - std::istringstream is(str); - std::vector lines; - ReadConfigLines(is, &lines); - KALDI_ASSERT(lines.size() == 8); - - ConfigLine cfl; - for (size_t i = 0; i < lines.size(); i++) { - KALDI_ASSERT(cfl.ParseLine(lines[i]) && cfl.FirstToken() == "a-b"); - if (i == 1) { - KALDI_ASSERT(cfl.GetValue("beta2", &str) && str == "b c"); - } - if (i == 4) { - KALDI_ASSERT(cfl.GetValue("_epsilon", &str) && str == "-1"); - } - if (i == 5) { - BaseFloat float_val = 0; - KALDI_ASSERT(cfl.GetValue("zet-_a", &float_val) && ApproxEqual(float_val, 0.15)); - } - if (i == 6) { - KALDI_ASSERT(cfl.GetValue("quoted", &str) && str == "a b c"); - } - if (i == 7) { - KALDI_ASSERT(cfl.GetValue("quoted2", &str) && str == "d e 'a b=c' f"); - } - } -} void UnitTestDescriptorTokenize() { std::vector lines; @@ -281,8 +94,6 @@ int main() { using namespace kaldi; using namespace kaldi::nnet3; - UnitTestConfigLineParse(); - UnitTestReadConfig(); UnitTestDescriptorTokenize(); UnitTestSummarizeVector(); UnitTestNameMatchesPattern(); diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc index a51bba21484..17dec23e7c1 100644 --- a/src/nnet3/nnet-parse.cc +++ b/src/nnet3/nnet-parse.cc @@ -27,353 +27,6 @@ namespace kaldi { namespace nnet3 { - -bool ConfigLine::ParseLine(const std::string &line) { - data_.clear(); - whole_line_ = line; - if (line.size() == 0) return false; // Empty line - size_t pos = 0, size = line.size(); - while (isspace(line[pos]) && pos < size) pos++; - if (pos == size) - return false; // whitespace-only line - size_t first_token_start_pos = pos; - // first get first_token_. - while (!isspace(line[pos]) && pos < size) { - if (line[pos] == '=') { - // If the first block of non-whitespace looks like "foo-bar=...", - // then we ignore it: there is no initial token, and FirstToken() - // is empty. 
- pos = first_token_start_pos; - break; - } - pos++; - } - first_token_ = std::string(line, first_token_start_pos, pos - first_token_start_pos); - // first_token_ is expected to be either empty or something like - // "component-node", which actually is a slightly more restrictive set of - // strings than IsValidName() checks for this is a convenient way to check it. - if (!first_token_.empty() && !IsValidName(first_token_)) - return false; - - while (pos < size) { - if (isspace(line[pos])) { - pos++; - continue; - } - - // OK, at this point we know that we are pointing at nonspace. - size_t next_equals_sign = line.find_first_of("=", pos); - if (next_equals_sign == pos || next_equals_sign == std::string::npos) { - // we're looking for something like 'key=value'. If there is no equals sign, - // or it's not preceded by something, it's a parsing failure. - return false; - } - std::string key(line, pos, next_equals_sign - pos); - if (!IsValidName(key)) return false; - - // handle any quotes. we support key='blah blah' or key="foo bar". - // no escaping is supported. - if (line[next_equals_sign+1] == '\'' || line[next_equals_sign+1] == '"') { - char my_quote = line[next_equals_sign+1]; - size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2); - if (next_quote == std::string::npos) { // no matching quote was found. - KALDI_WARN << "No matching quote for " << my_quote << " in config line '" - << line << "'"; - return false; - } else { - std::string value(line, next_equals_sign + 2, - next_quote - next_equals_sign - 2); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = next_quote + 1; - continue; - } - } else { - // we want to be able to parse something like "... input=Offset(a, -1) foo=bar": - // in general, config values with spaces in them, even without quoting. - - size_t next_next_equals_sign = line.find_first_of("=", next_equals_sign + 1), - terminating_space = size; - - if (next_next_equals_sign != std::string::npos) { // found a later equals sign. 
- size_t preceding_space = line.find_last_of(" \t", next_next_equals_sign); - if (preceding_space != std::string::npos && - preceding_space > next_equals_sign) - terminating_space = preceding_space; - } - while (isspace(line[terminating_space - 1]) && terminating_space > 0) - terminating_space--; - - std::string value(line, next_equals_sign + 1, - terminating_space - (next_equals_sign + 1)); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = terminating_space; - } - } - return true; -} - -bool ConfigLine::GetValue(const std::string &key, std::string *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - *value = (it->second).first; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToReal((it->second).first, value)) - return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, int32 *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToInteger((it->second).first, value)) - return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, std::vector *value) { - KALDI_ASSERT(value != NULL); - value->clear(); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!SplitStringToIntegers((it->second).first, ":,", true, value)) { - // KALDI_WARN << "Bad option " << (it->second).first; - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, bool *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if ((it->second).first.size() == 0) return false; - switch (((it->second).first)[0]) { - case 'F': - case 'f': - *value = false; - break; - case 'T': - case 't': - *value = true; - break; - default: - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::HasUnusedValues() const { - std::map >::const_iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) return true; - } - return false; -} - -std::string ConfigLine::UnusedValues() const { - std::string unused_str; - std::map >::const_iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) { - if (unused_str == "") - unused_str = it->first + "=" + (it->second).first; - else - unused_str += " " + it->first + "=" + (it->second).first; - } - } - return unused_str; -} - -// This is like ExpectToken but for two tokens, and it -// will either accept token1 and then token2, or just token2. -// This is useful in Read functions where the first token -// may already have been consumed. 
-void ExpectOneOrTwoTokens(std::istream &is, bool binary, - const std::string &token1, - const std::string &token2) { - KALDI_ASSERT(token1 != token2); - std::string temp; - ReadToken(is, binary, &temp); - if (temp == token1) { - ExpectToken(is, binary, token2); - } else { - if (temp != token2) { - KALDI_ERR << "Expecting token " << token1 << " or " << token2 - << " but got " << temp; - } - } -} - -// static -bool ParseFromString(const std::string &name, std::string *string, - int32 *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - if (!ConvertStringToInteger(split_string[i].substr(len), param)) - KALDI_ERR << "Bad option " << split_string[i]; - *string = ""; - // Set "string" to all the pieces but the one we used. - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - -bool ParseFromString(const std::string &name, std::string *string, - bool *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - std::string b = split_string[i].substr(len); - if (b.empty()) - KALDI_ERR << "Bad option " << split_string[i]; - if (b[0] == 'f' || b[0] == 'F') *param = false; - else if (b[0] == 't' || b[0] == 'T') *param = true; - else - KALDI_ERR << "Bad option " << split_string[i]; - *string = ""; - // Set "string" to all the pieces but the one we used. - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - -bool ParseFromString(const std::string &name, std::string *string, - BaseFloat *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - if (!ConvertStringToReal(split_string[i].substr(len), param)) - KALDI_ERR << "Bad option " << split_string[i]; - *string = ""; - // Set "string" to all the pieces but the one we used. - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - -bool ParseFromString(const std::string &name, std::string *string, - std::string *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - *param = split_string[i].substr(len); - - // Set "string" to all the pieces but the one we used. 
- *string = ""; - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - -bool ParseFromString(const std::string &name, std::string *string, - std::vector *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - if (!SplitStringToIntegers(split_string[i].substr(len), ":,", - false, param)) - KALDI_ERR << "Bad option " << split_string[i]; - *string = ""; - // Set "string" to all the pieces but the one we used. - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - bool DescriptorTokenize(const std::string &input, std::vector *tokens) { KALDI_ASSERT(tokens != NULL); @@ -422,32 +75,6 @@ bool DescriptorTokenize(const std::string &input, return true; } -bool IsValidName(const std::string &name) { - if (name.size() == 0) return false; - for (size_t i = 0; i < name.size(); i++) { - if (i == 0 && !isalpha(name[i]) && name[i] != '_') - return false; - if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.') - return false; - } - return true; -} - -void ReadConfigLines(std::istream &is, - std::vector *lines) { - KALDI_ASSERT(lines != NULL); - std::string line; - while (std::getline(is, line)) { - if (line.size() == 0) continue; - size_t start = line.find_first_not_of(" \t"); - size_t end = line.find_first_of('#'); - if (start == std::string::npos || start == end) continue; - end = line.find_last_not_of(" \t", end - 1); - KALDI_ASSERT(end >= start); - lines->push_back(line.substr(start, end - start + 1)); - } -} - std::string ErrorContext(std::istream &is) { if (!is.good()) return "end of line"; char buf[21]; diff --git a/src/nnet3/nnet-parse.h b/src/nnet3/nnet-parse.h index a073a54f7e0..0fc19d51f6c 100644 --- a/src/nnet3/nnet-parse.h +++ b/src/nnet3/nnet-parse.h @@ -26,103 +26,6 @@ namespace kaldi { namespace nnet3 { -/** - This class is responsible for parsing input like - hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' baz="a b c d='a b' e" - and giving you access to the fields, in this case - - FirstToken() == "hi-there", and key->value pairs: - - xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123", - bing->"a b c", baz->"a b c d='a b' e" - - The first token is optional, if the line started with a key-value pair then - FirstValue() will be empty. - - Note: it can parse value fields with space inside them only if they are free of the '=' - character. If values are going to contain the '=' character, you need to quote them - with either single or double quotes. - - Key values may contain -_a-zA-Z0-9, but must begin with a-zA-Z_. - */ -class ConfigLine { - public: - // Tries to parse the line as a config-file line. Returns false - // if it could not for some reason, e.g. parsing failure. In most cases - // prints no warnings; the user should do this. Does not expect comments. - bool ParseLine(const std::string &line); - - // the GetValue functions are overloaded for various types. They return true - // if the key exists with value that can be converted to that type, and false - // otherwise. 
They also mark the key-value pair as having been read. It is - // not an error to read values twice. - bool GetValue(const std::string &key, std::string *value); - bool GetValue(const std::string &key, BaseFloat *value); - bool GetValue(const std::string &key, int32 *value); - // Values may be separated by ":" or by ",". - bool GetValue(const std::string &key, std::vector *value); - bool GetValue(const std::string &key, bool *value); - - bool HasUnusedValues() const; - /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one - /// of the GetValue() functions. - std::string UnusedValues() const; - - const std::string &FirstToken() const { return first_token_; } - - const std::string WholeLine() { return whole_line_; } - // use default assignment operator and copy constructor. - private: - std::string whole_line_; - // the first token of the line, e.g. if line is - // foo-bar baz=bing - // then first_token_ would be "foo-bar". - std::string first_token_; - - // data_ maps from key to (value, is-this-value-consumed?). - std::map > data_; - -}; - -// Note: the ParseFromString functions are to be removed after we switch over to -// using the ConfigLine mechanism. - - -/// \file nnet-parse.h -/// This header contains a few parsing-related functions that are used -/// while reading parsing neural network files and config files. - -/// Function used in Init routines. Suppose name=="foo", if "string" has a -/// field like foo=12, this function will set "param" to 12 and remove that -/// element from "string". It returns true if the parameter was read. -bool ParseFromString(const std::string &name, std::string *string, - int32 *param); - -/// This version of ParseFromString is for parameters of type BaseFloat. -bool ParseFromString(const std::string &name, std::string *string, - BaseFloat *param); - -/// This version of ParseFromString is for parameters of type bool, which can -/// appear as any string beginning with f, F, t or T. -bool ParseFromString(const std::string &name, std::string *string, - bool *param); - -/// This version of ParseFromString is for parsing strings. (these -/// should not contain space). -bool ParseFromString(const std::string &name, std::string *string, - std::string *param); - -/// This version of ParseFromString handles colon-separated or comma-separated -/// lists of integers. -bool ParseFromString(const std::string &name, std::string *string, - std::vector *param); - -/// This function is like ExpectToken but for two tokens, and it will either -/// accept token1 and then token2, or just token2. This is useful in Read -/// functions where the first token may already have been consumed. -void ExpectOneOrTwoTokens(std::istream &is, bool binary, - const std::string &token1, - const std::string &token2); /** This function tokenizes input when parsing Descriptor configuration @@ -142,32 +45,6 @@ void ExpectOneOrTwoTokens(std::istream &is, bool binary, bool DescriptorTokenize(const std::string &input, std::vector *tokens); -/// Returns true if 'name' would be a valid name for a component or node in a -/// Nnet. This is a nonempty string beginning with A-Za-z_, and containing only -/// '-', '_', '.', A-Z, a-z, or 0-9. -bool IsValidName(const std::string &name); - - -/** - This function reads in a config file and *appends* its contents to a vector of - lines; it is responsible for removing comments (anything after '#') and - stripping out any lines that contain only whitespace after comment removal. 
- */ -void ReadConfigLines(std::istream &is, - std::vector *lines); - - -/** - This function converts config-lines from a simple sequence of strings - as output by ReadConfigLines(), into a sequence of first-tokens and - name-value pairs. The general format is: - "command-type bar=baz xx=yyy" - etc., although there are subtleties as to what exactly is allowed, see - documentation for class ConfigLine for details. - This function will die if there was a parsing failure. - */ -void ParseConfigLines(const std::vector &lines, - std::vector *config_lines); /* Returns true if name 'name' matches pattern 'pattern'. The pattern diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 0acaa5c2008..b4563c7a2c3 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -30,6 +30,7 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config, nnet_(nnet), compiler_(*nnet, config_.optimize_config, config_.compiler_config), num_minibatches_processed_(0), + max_change_stats_(*nnet), srand_seed_(RandInt(0, 100000)) { if (config.zero_component_stats) ZeroComponentStats(nnet); @@ -38,9 +39,6 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config, config.backstitch_training_interval > 0); delta_nnet_ = nnet_->Copy(); ScaleNnet(0.0, delta_nnet_); - const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); - num_max_change_per_component_applied_.resize(num_updatable, 0); - num_max_change_global_applied_ = 0; if (config_.read_cache != "") { bool binary; @@ -111,9 +109,9 @@ void NnetTrainer::TrainInternal(const NnetExample &eg, delta_nnet_); // Update the parameters of nnet - bool success = UpdateNnetWithMaxChange(*delta_nnet_, config_.max_param_change, - 1.0, 1.0 - config_.momentum, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, config_.max_param_change, + 1.0, 1.0 - config_.momentum, nnet_, &max_change_stats_); // Scale down the batchnorm stats (keeps them fresh... this affects what // happens when we use the model with batchnorm test-mode set). @@ -167,9 +165,10 @@ void NnetTrainer::TrainInternalBackstitch(const NnetExample &eg, } // Updates the parameters of nnet - UpdateNnetWithMaxChange(*delta_nnet_, config_.max_param_change, + UpdateNnetWithMaxChange( + *delta_nnet_, config_.max_param_change, max_change_scale, scale_adding, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + &max_change_stats_); if (is_backstitch_step1) { // The following will only do something if we have a LinearComponent or @@ -236,40 +235,10 @@ bool NnetTrainer::PrintTotalStats() const { bool ok = info.PrintTotalStats(name); ans = ans || ok; } - PrintMaxChangeStats(); + max_change_stats_.Print(*nnet_); return ans; } -void NnetTrainer::PrintMaxChangeStats() const { - KALDI_ASSERT(delta_nnet_ != NULL); - int32 i = 0; - for (int32 c = 0; c < delta_nnet_->NumComponents(); c++) { - Component *comp = delta_nnet_->GetComponent(c); - if (comp->Properties() & kUpdatableComponent) { - UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - << "UpdatableComponent; change this code."; - if (num_max_change_per_component_applied_[i] > 0) - KALDI_LOG << "For " << delta_nnet_->GetComponentName(c) - << ", per-component max-change was enforced " - << (100.0 * num_max_change_per_component_applied_[i]) / - (num_minibatches_processed_ * - (config_.backstitch_training_scale == 0.0 ? 
1.0 : - 1.0 + 1.0 / config_.backstitch_training_interval)) - << " \% of the time."; - i++; - } - } - if (num_max_change_global_applied_ > 0) - KALDI_LOG << "The global max-change was enforced " - << (100.0 * num_max_change_global_applied_) / - (num_minibatches_processed_ * - (config_.backstitch_training_scale == 0.0 ? 1.0 : - 1.0 + 1.0 / config_.backstitch_training_interval)) - << " \% of the time."; -} - void ObjectiveFunctionInfo::UpdateStats( const std::string &output_name, int32 minibatches_per_phase, diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h index fffc621930a..f09649d1506 100644 --- a/src/nnet3/nnet-training.h +++ b/src/nnet3/nnet-training.h @@ -26,6 +26,7 @@ #include "nnet3/nnet-compute.h" #include "nnet3/nnet-optimize.h" #include "nnet3/nnet-example-utils.h" +#include "nnet3/nnet-utils.h" namespace kaldi { namespace nnet3 { @@ -81,7 +82,7 @@ struct NnetTrainerOptions { opts->Register("l2-regularize-factor", &l2_regularize_factor, "Factor that " "affects the strength of l2 regularization on model " "parameters. The primary way to specify this type of " - "l2 regularization is via the 'l2-regularize'" + "l2 regularization is via the 'l2-regularize' " "configuration value at the config-file level. " " --l2-regularize-factor will be multiplied by the component-level " "l2-regularize values and can be used to correct for effects " @@ -187,10 +188,6 @@ class NnetTrainer { // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; - // Prints out the max-change stats (if nonzero): the percentage of time that - // per-component max-change and global max-change were enforced. - void PrintMaxChangeStats() const; - ~NnetTrainer(); private: // The internal function for doing one step of conventional SGD training. @@ -220,8 +217,7 @@ class NnetTrainer { int32 num_minibatches_processed_; // stats for max-change. 
- std::vector num_max_change_per_component_applied_; - int32 num_max_change_global_applied_; + MaxChangeStats max_change_stats_; unordered_map objf_info_; diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index e020f8fc6a7..61da1d7f6a9 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -2173,5 +2173,47 @@ void ApplyL2Regularization(const Nnet &nnet, } +bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, + BaseFloat max_param_change, + BaseFloat max_change_scale, + BaseFloat scale, Nnet *nnet, + MaxChangeStats *stats) { + bool ans = UpdateNnetWithMaxChange( + delta_nnet, max_param_change, max_change_scale, + scale, nnet, + &(stats->num_max_change_per_component_applied), + &(stats->num_max_change_global_applied)); + stats->num_minibatches_processed++; + return ans; +} + + +void MaxChangeStats::Print(const Nnet &nnet) const { + int32 i = 0; + for (int32 c = 0; c < nnet.NumComponents(); c++) { + const Component *comp = nnet.GetComponent(c); + if (comp->Properties() & kUpdatableComponent) { + const UpdatableComponent *uc = dynamic_cast( + comp); + if (uc == NULL) + KALDI_ERR << "Updatable component does not inherit from class " + << "UpdatableComponent; change this code."; + if (num_max_change_per_component_applied[i] > 0) + KALDI_LOG << "For " << nnet.GetComponentName(c) + << ", per-component max-change was enforced " + << ((100.0 * num_max_change_per_component_applied[i]) / + num_minibatches_processed) + << " \% of the time."; + i++; + } + } + if (num_max_change_global_applied > 0) + KALDI_LOG << "The global max-change was enforced " + << ((100.0 * num_max_change_global_applied) / + num_minibatches_processed) + << "\% of the time."; +} + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 787bd228a38..a5d17eb0437 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -331,13 +331,13 @@ void ReadEditConfig(std::istream &config_file, Nnet *nnet); \code Nnet temp_nnet(delta_nnet); - ScaleNnet(1.0 / max_change_scale, &temp_nnet); - [ Scale down parameters for each component of temp_nnet as needed so - their Euclidean norms do not exceed their per-component max-changes ] + ScaleNnet(scale, &temp_nnet); + [ Scale down parameters for each component of temp_nnet as needed so + their Euclidean norms do not exceed (their per-component max-changes + each multiplied by max_change_scale) ] [ Scale down temp_nnet as needed so its Euclidean norm does not exceed - the global max-change ] - ScaleNnet(max_change_scale, &temp_nnet); // undo the previous scaling. - AddNnet(temp_nnet, scale, nnet); + the global max-change times max_change_scale ] + AddNnet(temp_nnet, 1.0, nnet); \endcode @param [in] delta_nnet The copy of '*nnet' neural network that contains @@ -361,7 +361,8 @@ void ReadEditConfig(std::istream &config_file, Nnet *nnet); max-change, and 'max_change_scale * max_param_change' as the global max-change). @param [in] scale This value, which will normally be 1.0, is a scaling - factor used when adding to 'nnet', applied after any max-changes. + factor used when adding to 'nnet', which is (conceptually) + applied before any max-changes. It is provided for backstitch-related purposes. @param [in,out] nnet The nnet which we add to. 
@param [out] num_max_change_per_component_applied We add to the elements of @@ -377,6 +378,17 @@ bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, num_max_change_per_component_applied, int32 *num_max_change_global_applied); +struct MaxChangeStats; + +// This overloaded version of UpdateNnetWithMaxChange() is a convenience +// wrapper for when you have a MaxChangeStats object to keep track +// of how many times the max-change was applied. See documentation above. +bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, + BaseFloat max_param_change, + BaseFloat max_change_scale, + BaseFloat scale, Nnet *nnet, + MaxChangeStats *stats); + /** This function is used as part of the regular training workflow, prior to @@ -513,6 +525,24 @@ int32 GetNumNvalues(const std::vector &io_vec, bool exhaustive); +struct MaxChangeStats { + int32 num_max_change_global_applied; + int32 num_minibatches_processed; + std::vector num_max_change_per_component_applied; + + MaxChangeStats(const Nnet &nnet): + num_max_change_global_applied(0), + num_minibatches_processed(0), + num_max_change_per_component_applied(NumUpdatableComponents(nnet), 0) { } + + // Prints the max-change stats. Usually will be called at the end + // of the program. The nnet is only needed for structural information, + // to work out the component names. + void Print(const Nnet &nnet) const; +}; + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3a/Makefile b/src/nnet3a/Makefile new file mode 100644 index 00000000000..5410c54f525 --- /dev/null +++ b/src/nnet3a/Makefile @@ -0,0 +1,23 @@ +all: + +# This directory contains code related to the adaptation +# framework in ../adapt, for nnet3 and (principally) chain +# training. + +include ../kaldi.mk + +TESTFILES = nnet-chaina-utils-test nnet-chaina-training-test + +OBJFILES = nnet-chaina-training.o nnet-chaina-utils.o + +LIBNAME = kaldi-nnet3a + +ADDLIBS = ../fstext/kaldi-fstext.a ../chain/kaldi-chain.a \ + ../nnet3/kaldi-nnet3.a ../adapt/kaldi-adapt.a \ + ../cudamatrix/kaldi-cudamatrix.a ../hmm/kaldi-hmm.a \ + ../transform/kaldi-transform.a ../tree/kaldi-tree.a \ + ../hmm/kaldi-hmm.a ../gmm/kaldi-gmm.a ../lat/kaldi-lat.a \ + ../matrix/kaldi-matrix.a ../util/kaldi-util.a \ + ../base/kaldi-base.a + +include ../makefiles/default_rules.mk diff --git a/src/nnet3a/nnet-chaina-training-test.cc b/src/nnet3a/nnet-chaina-training-test.cc new file mode 100644 index 00000000000..c570ba29340 --- /dev/null +++ b/src/nnet3a/nnet-chaina-training-test.cc @@ -0,0 +1,44 @@ +// nnet3/nnet-chaina-training-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "nnet3a/nnet-chaina-training.h" + +namespace kaldi { +namespace nnet3 { + + +void UnitTestCompile() { + // just testing the compilation works, i.e. 
that all member functions are + // defined + NnetChainaTrainingOptions config; + NnetChainaModels models(true, false, false, "a", "b", "c"); + NnetChainaTrainer trainer(config, &models); +} + + +} // namespace nnet3 +} // namespace kaldi + +int main() { + using namespace kaldi; + using namespace kaldi::nnet3; + SetVerboseLevel(2); + // KALDI_LOG << "Tests succeeded."; + return 0; +} diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc new file mode 100644 index 00000000000..340b4dece7d --- /dev/null +++ b/src/nnet3a/nnet-chaina-training.cc @@ -0,0 +1,1225 @@ +// nnet3/nnet-chaina-training.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "nnet3/nnet-utils.h" +#include "nnet3a/nnet-chaina-training.h" +#include "nnet3a/nnet-chaina-utils.h" + +namespace kaldi { +namespace nnet3 { + +NnetChainaModels::NnetChainaModels( + const NnetChainaTrainingOptions &opts, + const std::string &model_dir, + const std::string &den_fst_dir, + const std::string &transform_dir): + opts_(opts), + model_dir_(model_dir), + den_fst_dir_(den_fst_dir), + transform_dir_(transform_dir) { + std::string bottom_nnet_name; // model_dir/bottom.raw + GetPathname(model_dir, "bottom", "raw", &bottom_nnet_name); + ReadKaldiObject(bottom_nnet_name, &bottom_nnet_); + ComputeSimpleNnetContext(bottom_nnet_, + &bottom_nnet_left_context_, + &bottom_nnet_right_context_); + bool is_top_nnet = false; + InitializeNnet(is_top_nnet, &bottom_nnet_); +} + +void NnetChainaModels::InitializeNnet( + bool is_top_nnet, Nnet *nnet) const { + const NnetChainaTrainingPerModelOptions &bottom_or_top_opts = + (is_top_nnet ? opts_.top : opts_.bottom); + + // we could change that condition later if it turns out to be a problem. + if (bottom_or_top_opts.batchnorm_test_mode) + SetBatchnormTestMode(true, nnet); + if (bottom_or_top_opts.dropout_test_mode) + SetDropoutTestMode(true, nnet); + if (!bottom_or_top_opts.train && bottom_or_top_opts.batchnorm_test_mode) { + // The following is for efficiency in evaluating the nnet; + // it may combine certain component types. + CollapseModel(CollapseModelConfig(), nnet); + } +} + +NnetChainaModels::LanguageInfo::LanguageInfo( + const NnetChainaModels::LanguageInfo &other): + trans_model(other.trans_model), + am_nnet(other.am_nnet), + den_fst(other.den_fst), + transform(other.transform) { } + + +// This code is related to UpdateNnetMovingAverage() in nnet3-chain-combine.cc. +void NnetChainaModels::InterpolateWith( + BaseFloat new_model_weight, + const std::string &model_dir) { + KALDI_ASSERT(new_model_weight > 0.0 && new_model_weight < 1.0); + + std::string bottom_filename; + GetPathname(model_dir, "bottom", "raw", &bottom_filename); + Nnet bottom_nnet; // we don't need the transition model, and the reading code + // is capable of ignoring it. 
+ ReadKaldiObject(bottom_filename, &bottom_nnet); + bool is_top_nnet = false; + InitializeNnet(is_top_nnet, &bottom_nnet); + ScaleNnet(1.0 - new_model_weight, &bottom_nnet_); + AddNnet(bottom_nnet, new_model_weight, &bottom_nnet_); + for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { + const std::string &lang = iter->first; + LanguageInfo *info = iter->second; + std::string model_filename; + GetPathname(model_dir, lang, "mdl", &model_filename); + Nnet top_nnet; // we don't need the transition model, and the reading code + // is capable of ignoring it. + ReadKaldiObject(model_filename, &top_nnet); + is_top_nnet = true; + InitializeNnet(is_top_nnet, &top_nnet); + Nnet &stored_nnet = info->am_nnet.GetNnet(); + ScaleNnet(1.0 - new_model_weight, &stored_nnet); + AddNnet(top_nnet, new_model_weight, &stored_nnet); + } +} + + +NnetChainaModels::NnetChainaModels(const NnetChainaModels &other): + opts_(other.opts_), + model_dir_(other.model_dir_), + den_fst_dir_(other.den_fst_dir_), + transform_dir_(other.transform_dir_), + bottom_nnet_(other.bottom_nnet_), + bottom_nnet_left_context_(other.bottom_nnet_left_context_), + bottom_nnet_right_context_(other.bottom_nnet_right_context_) { + for (auto iter = other.lang_info_.begin(); + iter != other.lang_info_.end(); ++iter) { + const std::string &lang = iter->first; + LanguageInfo *info = iter->second; + lang_info_[lang] = new LanguageInfo(*info); + } +} + + + +void NnetChainaModels::GetPathname(const std::string &dir, + const std::string &name, + const std::string &suffix, + std::string *pathname) { + std::ostringstream str; + str << dir << '/' << name << '.' << suffix; + *pathname = str.str(); +} + +void NnetChainaModels::GetPathname(const std::string &dir, + const std::string &name, + int32 job_id, + const std::string &suffix, + std::string *pathname) { + std::ostringstream str; + str << dir << '/' << name << '.' << job_id << '.' 
<< suffix; + *pathname = str.str(); +} + +NnetChainaModels::LanguageInfo *NnetChainaModels::GetInfoForLang( + const std::string &lang) { + auto iter = lang_info_.find(lang); + if (iter != lang_info_.end()) { + return iter->second; + } else { + LanguageInfo *info = new LanguageInfo(); + + std::string model_filename, den_fst_filename, transform_filename; + GetPathname(model_dir_, lang, "mdl", &model_filename); + GetPathname(den_fst_dir_, lang, "den.fst", &den_fst_filename); + GetPathname(transform_dir_, lang, "ada", &transform_filename); + + { + bool binary; + Input ki(model_filename, &binary); + info->trans_model.Read(ki.Stream(), binary); + info->am_nnet.Read(ki.Stream(), binary); + Nnet &nnet = info->am_nnet.GetNnet(); + bool is_top_nnet = true; + InitializeNnet(is_top_nnet, &nnet); + } + ReadFstKaldi(den_fst_filename, &(info->den_fst)); + ReadKaldiObject(transform_filename, &(info->transform)); + lang_info_[lang] = info; + return info; + } +} + +Nnet* NnetChainaModels::GetBottomNnet() { + return &bottom_nnet_; +} + + +AmNnetSimple* NnetChainaModels::GetNnetForLang( + const std::string &language_name) { + LanguageInfo *info = GetInfoForLang(language_name); + return &(info->am_nnet); +} + +TransitionModel* NnetChainaModels::GetTransitionModelForLang( + const std::string &language_name) { + LanguageInfo *info = GetInfoForLang(language_name); + return &(info->trans_model); +} + +fst::StdVectorFst* NnetChainaModels::GetDenFstForLang( + const std::string &language_name) { + LanguageInfo *info = GetInfoForLang(language_name); + return &(info->den_fst); +} + +Nnet* NnetChainaModels::GetRawNnetForLang( + const std::string &language_name) { + LanguageInfo *info = GetInfoForLang(language_name); + return &(info->am_nnet.GetNnet()); +} + +differentiable_transform::DifferentiableTransformMapped* +NnetChainaModels::GetTransformForLang( + const std::string &language_name) { + LanguageInfo *info = GetInfoForLang(language_name); + return &(info->transform); +} + + + +void NnetChainaModels::Write(const std::string &model_out_dir, + bool binary, int32 job_id) { + std::ostringstream ss; + if (opts_.bottom.train) { + ss << "bottom nnet and "; + std::string bottom_model_name; + GetPathname(model_out_dir, "bottom", job_id, "raw", &bottom_model_name); + WriteKaldiObject(bottom_nnet_, bottom_model_name, binary); + } + if (opts_.top.train) { + ss << "nnets for languages "; + for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { + const std::string &lang_name = iter->first; + ss << lang_name << " "; + LanguageInfo *info = iter->second; + { + // we write it as a 'raw' model without the TransitionModel or + // the AmNnetSimple wrapper, since we can reconstruct those parts + // from the previous iter's model. 
+ std::string top_model_name; + GetPathname(model_out_dir, lang_name, job_id, "raw", &top_model_name); + WriteKaldiObject(info->am_nnet.GetNnet(), top_model_name, binary); + } + } + } + if (opts_.adaptation_model_accumulate) { + ss << "adaptation-model stats for languages "; + for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { + const std::string &lang_name = iter->first; + ss << lang_name << " "; + LanguageInfo *info = iter->second; + { + std::string transform_name; + GetPathname(model_out_dir, lang_name, job_id, "ada", &transform_name); + WriteKaldiObject(info->transform, transform_name, binary); + } + } + } + KALDI_LOG << "Wrote " << ss.str() << "to " << model_out_dir; +} + + +void NnetChainaModels::WriteCombinedModels(const std::string &model_out_dir, + bool binary) { + + std::string bottom_model_name; + GetPathname(model_out_dir, "bottom", "raw", &bottom_model_name); + WriteKaldiObject(bottom_nnet_, bottom_model_name, binary); + + std::ostringstream ss; + for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { + const std::string &lang_name = iter->first; + ss << lang_name << " "; + LanguageInfo *info = iter->second; + std::string top_model_name; + GetPathname(model_out_dir, lang_name, "mdl", &top_model_name); + + Output ko(top_model_name, binary); + info->trans_model.Write(ko.Stream(), binary); + info->am_nnet.Write(ko.Stream(), binary); + } + KALDI_LOG << "Wrote bottom.raw and .mdl files for languages:" + << ss.str() << "to: " << model_out_dir; +} + +NnetChainaModels::~NnetChainaModels() { + for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) + delete iter->second; +} + +NnetChainaTopTrainer::NnetChainaTopTrainer( + const std::string &lang_name, + const NnetChainaTrainingOptions &config, + const fst::StdVectorFst &den_fst, + const differentiable_transform::DifferentiableTransformMapped &transform, + Nnet *nnet): + lang_name_(lang_name), + opts_(config), + den_graph_(den_fst, nnet->OutputDim("output")), + transform_(transform), + compiler_(*nnet, opts_.nnet_config.optimize_config, + opts_.nnet_config.compiler_config), + nnet_(nnet), + delta_nnet_(nnet->Copy()), + num_minibatches_processed_(0), + max_change_stats_(*nnet) { + + config.Check(); + + if (opts_.nnet_config.zero_component_stats && + !opts_.top.batchnorm_test_mode) + ZeroComponentStats(nnet); + + ScaleNnet(0.0, delta_nnet_); + if (opts_.nnet_config.read_cache != "") { + // It would be complicated to implement, as there are various top nnets + // and they would all try to read and write the same cache files. 
+ // To implement this, the best way would be to + KALDI_WARN << "The read-cache options are not currently supported here."; + } + KALDI_ASSERT(opts_.nnet_config.momentum >= 0.0); +} + + +NnetChainaTopTrainer::ComputationStructure::ComputationStructure( + bool adapted, + bool train_model, + bool need_input_deriv, + int32 num_sequences, + int32 frames_per_sequence_in, + int32 frames_per_sequence_out, + int32 first_input_t, + int32 top_subsampling_factor): + adapted(adapted), train_model(train_model), + need_input_deriv(need_input_deriv), num_sequences(num_sequences), + frames_per_sequence_in(frames_per_sequence_in), + frames_per_sequence_out(frames_per_sequence_out), + first_input_t(first_input_t), + top_subsampling_factor(top_subsampling_factor) { } + + +NnetChainaBottomTrainer::ComputationStructure::ComputationStructure( + bool train_model, + int32 num_sequences, + int32 frames_per_sequence_in, + int32 frames_per_sequence_out, + int32 first_input_t, + int32 first_output_t): + train_model(train_model), + num_sequences(num_sequences), + frames_per_sequence_in(frames_per_sequence_in), + frames_per_sequence_out(frames_per_sequence_out), + first_input_t(first_input_t), + first_output_t(first_output_t) { } + + +void NnetChainaTopTrainer::ConsolidateMemory() { + ::kaldi::nnet3::ConsolidateMemory(nnet_); + ::kaldi::nnet3::ConsolidateMemory(delta_nnet_); +} + + +std::shared_ptr NnetChainaTopTrainer::GetComputation( + const ComputationStructure &s) { + { + auto iter = computation_map_.find(s); + if (iter != computation_map_.end()) + return iter->second; + } + int32 num_sequences = s.num_sequences, + frames_per_sequence_in = s.frames_per_sequence_in, + frames_per_sequence_out = s.frames_per_sequence_out, + first_input_t = s.first_input_t, + first_output_t = 0, + top_subsampling_factor = s.top_subsampling_factor; + + if (nnet_->InputDim("input") < 0 || + nnet_->OutputDim("output") < 0 || + nnet_->OutputDim("output-si") < 0 || + nnet_->OutputDim("output-xent") < 0 || + nnet_->OutputDim("output-si-xent") < 0) { + KALDI_ERR << "Top neural net for chaina training must have an input called " + "'input' and outputs called 'output', 'output-xent', 'output-si', and " + "'output-si-xent'."; + } + + ComputationRequest request; + request.need_model_derivative = s.train_model; + // It's probably harmless to store stats unless we have batchorm components in + // test mode. + request.store_component_stats = !opts_.top.batchnorm_test_mode; + request.inputs.resize(1); + request.inputs[0].name = "input"; + request.inputs[0].indexes.resize(frames_per_sequence_in * num_sequences); + request.inputs[0].has_deriv = s.need_input_deriv; + // The inputs are in the order: the first frame of all sequences; the second + // frame of all sequences; and so on. + auto iter = request.inputs[0].indexes.begin(); + for (int32 t = first_input_t; + t < first_input_t + frames_per_sequence_in; ++t) { + for (int32 n = 0; n < num_sequences; ++n,++iter) { + iter->n = n; + iter->t = t; + // the x values will already be 0, thanks to the default constructor of + // Index(). + } + } + // The outputs are also in the order: the first frame of all sequences; + // the second frame of all sequences; and so on. + request.outputs.resize(2); + request.outputs[0].name = (s.adapted ? 
"output" : "output-si"); + request.outputs[0].has_deriv = opts_.top.train; + request.outputs[0].indexes.resize(frames_per_sequence_out * num_sequences); + int32 t_stride_out = top_subsampling_factor; + iter = request.outputs[0].indexes.begin(); + for (int32 t = first_output_t; + t < first_output_t + frames_per_sequence_out * t_stride_out; + t += t_stride_out) { + for (int32 n = 0; n < num_sequences; ++n,++iter) { + iter->n = n; + iter->t = t; + } + } + request.outputs[1].has_deriv = opts_.top.train; + request.outputs[1].name = (s.adapted ? "output-xent" : "output-si-xent"); + request.outputs[1].indexes = request.outputs[0].indexes; + std::shared_ptr computation = compiler_.Compile( + request); + computation_map_[s] = computation; + return computation; +} + +bool NnetChainaTopTrainer::TrainUnadapted( + const CuMatrixBase &input, + const NnetComputation &computation, + const chain::Supervision &supervision, + bool need_model_deriv, + const CuVectorBase &deriv_weights, + Posterior *posterior, + CuMatrix *input_deriv) { + + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + + // note: because we give the 1st arg (nnet_) as a pointer to the + // constructor of 'computer', it will use that copy of the nnet to + // store stats. + NnetComputer computer(nnet_config.compute_config, computation, + nnet_, delta_nnet_); + + // Give the inputs to the computer object. + CuMatrix input_copy(input); + computer.AcceptInput("input", &input_copy); + // Do the forward propagation. + computer.Run(); + + const CuMatrixBase + &output = computer.GetOutput("output-si"), + &output_xent = computer.GetOutput("output-si-xent"); + // It's not optimal that we compute these derivatives even when we're not + // training, but the 'compute-prob' phase doesn't dominate. + CuMatrix output_deriv(output.NumRows(), + output.NumCols(), + kUndefined), + output_xent_deriv; + + // Note: we normally turn the chain l2 regularization (which is l2 on the + // output of the nnet) off now, since parameter-level l2 regularization seems + // to work better. So expect 'tot_l2_term' to be zero. + BaseFloat tot_objf, tot_l2_term, tot_weight; + + ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, + supervision, output, + &tot_objf, &tot_l2_term, &tot_weight, + &output_deriv, &output_xent_deriv, + posterior); + + if (!(tot_objf - tot_objf == 0.0)) { + // A NaN or inf was encountered in the objective computation. + // The input_deriv won't be used, so no need to set it. + // Un-freeze the natural gradient and return. + return false; + } + + { + // this block computes and keeps track of the cross-entropy objective. + // at this point, xent_deriv is posteriors derived from the numerator + // computation. note, xent_objf has a factor of '.supervision.weight', + // which is also included in 'tot_weight'. + BaseFloat xent_objf = TraceMatMat(output_xent, output_xent_deriv, kTrans); + output_si_xent_objf_.UpdateStats(lang_name_ + ":output-si-xent", + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, xent_objf); + } + + + if (opts_.apply_deriv_weights && deriv_weights.Dim() != 0) { + output_deriv.MulRowsVec(deriv_weights); + output_xent_deriv.MulRowsVec(deriv_weights); + } + + output_si_objf_.UpdateStats(lang_name_ + ":output-si", + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, tot_objf, tot_l2_term); + + if (input_deriv == NULL && !need_model_deriv) + return true; + + // Freeze the natural gradient. 
We don't want to update the NG scatter + // matrices on this data because we'll next be running the same nnet on the + // speaker-adapted version of the same data, and it would violate the + // independence assumptions needed for NG to work if we updated them. + //if (need_model_deriv) + // FreezeNaturalGradient(true, delta_nnet_); + + computer.AcceptInput("output-si", &output_deriv); + + output_xent_deriv.Scale(opts_.chain_config.xent_regularize); + computer.AcceptInput("output-si-xent", &output_xent_deriv); + + // Do the backprop. + computer.Run(); + + if (input_deriv != NULL) + computer.GetOutputDestructive("input", input_deriv); + + //if (need_model_deriv) // Un-freeze the natural gradient. + // FreezeNaturalGradient(false, delta_nnet_); + + // We'll wait until after the adapted pass to call UpdateNnetWithMaxChange(). + // Training the model on these features in between the two passes would leave + // a strong memory of this minibatch in the model's parameters which could + // cause weird effects. + return true; +} + +bool NnetChainaTopTrainer::TrainAdapted( + const NnetComputation &computation, + const chain::Supervision &supervision, + BaseFloat model_training_scale, + const CuVectorBase &deriv_weights, + CuMatrix *input, + CuMatrix *input_deriv) { + + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + + // note: because we give the 1st arg (nnet_) as a pointer to the + // constructor of 'computer', it will use that copy of the nnet to + // store stats. + NnetComputer computer(nnet_config.compute_config, computation, + nnet_, delta_nnet_); + + // give the input to the computer object. + computer.AcceptInput("input", input); + // Do the forward computation + computer.Run(); + + const CuMatrixBase + &output = computer.GetOutput("output"), + &output_xent = computer.GetOutput("output-xent"); + CuMatrix output_deriv(output.NumRows(), + output.NumCols(), + kUndefined), + output_xent_deriv; + + // Note: we don't normally use the l2 term any more; parameter-level + // regularization seems to work better than regularization of the + // nnet output. + BaseFloat tot_objf, tot_l2_term, tot_weight; + + ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, + supervision, output, + &tot_objf, &tot_l2_term, &tot_weight, + &output_deriv, &output_xent_deriv); + + if (!(tot_objf - tot_objf == 0.0)) { + // A NaN or inf was encountered in the objective computation. the input_deriv + // won't be used by the calling code, so no need to set it. + return false; + } + + { + // this block computes and keeps track of the cross-entropy objective. + // at this point, xent_deriv is posteriors derived from the numerator + // computation. note, xent_objf has a factor of '.supervision.weight' + BaseFloat xent_objf = TraceMatMat(output_xent, output_xent_deriv, kTrans); + output_xent_objf_.UpdateStats(lang_name_ + ":output-xent", + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, xent_objf); + } + output_objf_.UpdateStats(lang_name_ + ":output", + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, tot_objf, tot_l2_term); + + if (input_deriv == NULL && model_training_scale == 0.0) + return true; + + if (opts_.apply_deriv_weights && deriv_weights.Dim() != 0) { + output_deriv.MulRowsVec(deriv_weights); + output_xent_deriv.MulRowsVec(deriv_weights); + } + + computer.AcceptInput("output", &output_deriv); + output_xent_deriv.Scale(opts_.chain_config.xent_regularize); + computer.AcceptInput("output-xent", &output_xent_deriv); + + // Do the backprop. 
+ computer.Run(); + + if (input_deriv != NULL) + computer.GetOutputDestructive("input", input_deriv); + + if (model_training_scale != 0.0) { + // If we're actually training the top model... + + // If relevant, add in the part of the gradient that comes from L2 + // regularization. The factor of (1.0 + opts_.unadapted_top_weight) + // is to make it proportional to the magnitude of the derivative. + ApplyL2Regularization( + *nnet_, + supervision.num_sequences * opts_.nnet_config.l2_regularize_factor * + (1.0 + opts_.top.unadapted_weight), + delta_nnet_); + + // Update the parameters of nnet. + // Note: normally, momentum is 0.0. + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, + nnet_config.max_param_change, + model_training_scale, + model_training_scale * (1.0 - nnet_config.momentum), + nnet_, &max_change_stats_); + + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when, later on, we use the model with batchnorm test-mode set). + ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + + // The following will only do something if we have a LinearComponent + // or AffineComponent with orthonormal-constraint set to a nonzero value. + ConstrainOrthonormal(nnet_); + + if (success) + ScaleNnet(nnet_config.momentum, delta_nnet_); + else + ScaleNnet(0.0, delta_nnet_); + return success; + } else { + return true; + } +} + + +bool NnetChainaTopTrainer::Train(const CuMatrixBase&lt;BaseFloat&gt; &input, + int32 num_sequences, + int32 num_groups, + int32 first_input_t, + int32 top_subsampling_factor, + const VectorBase&lt;BaseFloat&gt; &deriv_weights_in, + const chain::Supervision &supervision, + BaseFloat model_training_scale, + CuMatrix&lt;BaseFloat&gt; *input_deriv) { + // note: if opts_.top.train is false, model_training_scale will have been + // already set to zero. + KALDI_ASSERT(input.NumRows() != 0 && input.NumRows() % num_sequences == 0); + int32 frames_per_sequence_in = input.NumRows() / num_sequences, + frames_per_sequence_out = supervision.frames_per_sequence; + + bool adapted = false; + ComputationStructure structure( + adapted, (model_training_scale != 0.0), (input_deriv != NULL), + num_sequences, frames_per_sequence_in, frames_per_sequence_out, + first_input_t, top_subsampling_factor); + + // Will be the numerator posterior from the unadapted pass, which will be + // padded with l/r context and used to estimate the adapted features. + Posterior post; + + CuVector&lt;BaseFloat&gt; deriv_weights; + if (opts_.apply_deriv_weights) + deriv_weights = deriv_weights_in; + + + bool need_unadapted_model_deriv = + (model_training_scale * opts_.top.unadapted_weight) != 0.0; + + std::shared_ptr&lt;const NnetComputation&gt; computation_unadapted = + GetComputation(structure); + bool success = TrainUnadapted( + input, *computation_unadapted, supervision, + need_unadapted_model_deriv, + deriv_weights, &post, input_deriv); + + if (!success) { + num_minibatches_processed_++; + return false; + } + + // Scale down the model derivatives from the unadapted pass. + if (need_unadapted_model_deriv && opts_.top.unadapted_weight != 1.0) + ScaleNnet(opts_.top.unadapted_weight, delta_nnet_); + + if (input_deriv && opts_.bottom.unadapted_weight != 1.0) { + // Apply the scale from --unadapted-bottom-weight. We'll supply the other + // factor that comes from the language-specific bottom_weight ("bw") + // to UpdateNnetWithMaxChange() later on when we train the bottom nnet.
+ input_deriv->Scale(opts_.bottom.unadapted_weight); + } + + Posterior post_padded(input.NumRows()); + ConvertPosterior(post, num_sequences, first_input_t, + top_subsampling_factor, + transform_.pdf_map, + transform_.transform->NumClasses(), + &post_padded); + + if (opts_.adaptation_model_accumulate) { + // We will later add a way to handle iteration indexes >0, which is needed + // when the adaptation model contains cascaded transforms, but 0 is the + // normal case. + int32 accumulate_iter = 0; + transform_.transform->Accumulate(accumulate_iter, input, + num_sequences, num_groups, + post_padded); + return true; // We don't be evaluating the adapted version of the top model + } + + + structure.adapted = true; + std::shared_ptr computation_adapted = + GetComputation(structure); + + CuMatrix adapted_input(input.NumRows(), input.NumCols(), + kUndefined), + adapted_input_deriv; + + using namespace differentiable_transform; + MinibatchInfoItf *minibatch_info = NULL; + if (!opts_.adaptation_test_mode) { + minibatch_info = transform_.transform->TrainingForward( + input, num_sequences, num_groups, post_padded, &adapted_input); + } else { + transform_.transform->TestingForwardBatch( + input, num_sequences, num_groups, post_padded, &adapted_input); + } + + success = TrainAdapted( + *computation_adapted, supervision, + model_training_scale, deriv_weights, + &adapted_input, + (input_deriv != NULL ? &adapted_input_deriv : NULL)); + + num_minibatches_processed_++; + if (!success) + return false; + + if (input_deriv == NULL) + delete minibatch_info; + else { + transform_.transform->TrainingBackward(input, adapted_input_deriv, + num_sequences, num_groups, post_padded, + minibatch_info, input_deriv); + } + return true; +} + + +/** + This helper function for ConvertPosterior() converts from pdf-ids to + cluster-ids using the map provided in pdf_map, if it is nonempty. + If pdf_map is empty, it just copies the pairs over unchanged. + */ +static inline void ConvertPosteriorElement( + const std::vector &pdf_map, + int32 num_classes, + const std::vector > &post_elem_in, + std::vector > *post_elem_out) { + if (pdf_map.empty()) { + *post_elem_out = post_elem_in; + if (!post_elem_in.empty()) { + // We just check the first int32-- this is a spot-check that the + // pdf-ids are in the correct range. 
+ KALDI_ASSERT(post_elem_in[0].first < num_classes); + } + } else { + int32 num_classes_in = pdf_map.size(); + size_t num_pairs = post_elem_in.size(); + post_elem_out->resize(num_pairs); + for (size_t i =0; i < num_pairs; i++) { + int32 pdf_id = post_elem_in[i].first; + BaseFloat weight = post_elem_in[i].second; + KALDI_ASSERT(pdf_id < num_classes_in); + int32 cluster_id = pdf_map[pdf_id]; + KALDI_ASSERT(cluster_id < num_classes); + (*post_elem_out)[i].first = cluster_id; + (*post_elem_out)[i].second = weight; + } + } +} + +void ConvertPosterior( + const Posterior &post_at_output, + int32 num_sequences, + int32 first_input_t, + int32 top_subsampling_factor, + const std::vector &pdf_map, + int32 num_classes, + Posterior *post_at_input) { + int32 output_post_size = post_at_output.size(), + input_post_size = post_at_input->size(), + s = top_subsampling_factor; + KALDI_ASSERT(input_post_size % num_sequences == 0 && + output_post_size % num_sequences == 0 && + input_post_size >= (output_post_size - 1) * top_subsampling_factor && + top_subsampling_factor > 0); + int32 num_frames_out = output_post_size / num_sequences, + num_frames_in = input_post_size / num_sequences, + last_input_t = first_input_t + (num_frames_in - 1), + first_output_t = 0, + last_output_t = first_output_t + s * (num_frames_out - 1); + + int32 half_s = s / 2; // note: this will round down, which is intended. + + for (int32 t_in = first_input_t; t_in <= last_input_t; t_in++) { + // find the corresponding output frame by rounding t to the closest + // t that's a multiple of top_subsampling_factor (rounding down in + // case of ties). We do this by adding half_s and rounding down. + int32 t_out = s * DivideRoundingDown(t_in + half_s, s); + if (t_out >= first_output_t && t_out <= last_output_t) { + for (int32 n = 0; n < num_sequences; n++) { + int32 input_index = num_sequences * (t_in - first_input_t) + n, + output_index = num_sequences * ((t_out - first_output_t) / s) + n; + ConvertPosteriorElement(pdf_map, num_classes, + post_at_output[output_index], + &((*post_at_input)[input_index])); + } + } + // else just leave the input posterior for this frame empty. This will + // happen for most of the frames that were added for left and right context. + } +} + +BaseFloat NnetChainaTopTrainer::GetTotalObjf(bool adapted, BaseFloat *weight) const { + const ObjectiveFunctionInfo &objf = + (adapted ? 
output_objf_ : output_si_objf_); + *weight = objf.tot_weight; + return objf.tot_objf; +} + +bool NnetChainaTopTrainer::PrintTotalStats() const { + bool ans = false; + if (output_si_objf_.PrintTotalStats(lang_name_ + ":output-si")) + ans = true; + if (output_objf_.PrintTotalStats(lang_name_ + ":output")) + ans = true; + if (output_si_xent_objf_.PrintTotalStats(lang_name_ + ":output-si-xent")) + ans = true; + if (output_xent_objf_.PrintTotalStats(lang_name_ + ":output-xent")) + ans = true; + KALDI_LOG << "Max-change stats for language " + << lang_name_ << ":"; + max_change_stats_.Print(*nnet_); + return ans; +} + + +NnetChainaTopTrainer::~NnetChainaTopTrainer() { + delete delta_nnet_; +} + +void NnetChainaBottomTrainer::ConsolidateMemory() { + ::kaldi::nnet3::ConsolidateMemory(nnet_); + ::kaldi::nnet3::ConsolidateMemory(delta_nnet_); +} + +NnetComputer* NnetChainaBottomTrainer::Forward( + int32 num_sequences, + int32 first_input_t, + int32 first_output_t, + int32 frames_per_sequence_out, + bool train_model, + CuMatrix *input, + CuMatrix *output) { + KALDI_ASSERT(input->NumRows() != 0 && input->NumRows() % num_sequences == 0); + int32 frames_per_sequence_in = input->NumRows() / num_sequences; + ComputationStructure s(train_model, + num_sequences, + frames_per_sequence_in, + frames_per_sequence_out, + first_input_t, first_output_t); + // Note: this will be cached in the unordered_map owned by this class, so we + // don't have to worry about it being deleted before we're done with the + // NnetComputer object. + std::shared_ptr computation = GetComputation(s); + + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + NnetComputer *computer = new NnetComputer(nnet_config.compute_config, + *computation, nnet_, delta_nnet_); + computer->AcceptInput("input", input); + computer->Run(); + if (!train_model) { + computer->GetOutputDestructive("output", output); + delete computer; + return NULL; + } else { + *output = computer->GetOutput("output"); + return computer; + } +} + + +void NnetChainaBottomTrainer::Backward(BaseFloat model_training_scale, + int32 num_sequences, + NnetComputer *computer, + CuMatrix *output_deriv) { + // if model_training_scale was 0.0, this function should not have been called. + KALDI_ASSERT(model_training_scale > 0.0); + computer->AcceptInput("output", output_deriv); + computer->Run(); + + delete computer; + + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + + + // If relevant, add in the part of the gradient that comes from L2 + // regularization. The factor of (1.0 + opts_.unadapted_bottom_weight) + // is to make it proportional to the magnitude of the derivative. + ApplyL2Regularization( + *nnet_, + num_sequences * opts_.nnet_config.l2_regularize_factor * + (1.0 + opts_.bottom.unadapted_weight), + delta_nnet_); + + + // we may later provide a way to set a different max-change for the bottom + // nnet than on the top nnet. + // Note: normally, momentum is 0.0. + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, + nnet_config.max_param_change, + model_training_scale, + model_training_scale * (1.0 - nnet_config.momentum), + nnet_, + &max_change_stats_); + + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when, later on, we use the model with batchnorm test-mode set). + ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + + // The following will only do something if we have a LinearComponent + // or AffineComponent with orthonormal-constraint set to a nonzero value. 
+ ConstrainOrthonormal(nnet_); + + if (success) + ScaleNnet(nnet_config.momentum, delta_nnet_); + else + ScaleNnet(0.0, delta_nnet_); + + static bool warned_momentum = false; + if (model_training_scale != 1.0 && nnet_config.momentum != 0.0 && + !warned_momentum) { + KALDI_WARN << "Momentum does not interact correctly with top_weight or " + "bottom_weight values. Will not warn again."; + warned_momentum = true; + } + num_minibatches_processed_++; +} + + +NnetChainaBottomTrainer::NnetChainaBottomTrainer( + const NnetChainaTrainingOptions &opts, + Nnet *nnet): + opts_(opts), + nnet_(nnet), + delta_nnet_(nnet->Copy()), + compiler_(*nnet, opts_.nnet_config.optimize_config, + opts_.nnet_config.compiler_config), + max_change_stats_(*nnet) { + if (opts_.nnet_config.zero_component_stats && + !opts_.bottom.batchnorm_test_mode) + ZeroComponentStats(nnet); + ScaleNnet(0.0, delta_nnet_); + if (opts_.nnet_config.read_cache != "") { + // It would be complicated to implement, as there are various top nnets + // and they would all try to read and write the same cache files. + // To implement this, the best way would be to + KALDI_WARN << "The read-cache options are not currently supported."; + } + KALDI_ASSERT(opts_.nnet_config.momentum >= 0.0 && + opts_.nnet_config.max_param_change >= 0.0 && + opts_.bottom_subsampling_factor >= 1); +} + +std::shared_ptr NnetChainaBottomTrainer::GetComputation( + const ComputationStructure &s) { + { // Check in the cache, in case we already handled this computation. + auto iter = computation_map_.find(s); + if (iter != computation_map_.end()) + return iter->second; + } + + if (!opts_.bottom.train) { + KALDI_ASSERT(!s.train_model); + } + + int32 num_sequences = s.num_sequences, + frames_per_sequence_in = s.frames_per_sequence_in, + frames_per_sequence_out = s.frames_per_sequence_out, + first_input_t = s.first_input_t, + first_output_t = s.first_output_t; + + if (nnet_->InputDim("input") < 0 || + nnet_->OutputDim("output") < 0) { + KALDI_ERR << "Bottom neural net for chaina training must have an input " + "called 'input' and an output called 'output'."; + } + + ComputationRequest request; + request.need_model_derivative = s.train_model; + // It's probably safe to store component-level stats, unless the + // batchnorm is in test mode. + request.store_component_stats = !opts_.bottom.batchnorm_test_mode; + request.inputs.resize(1); + request.inputs[0].name = "input"; + request.inputs[0].indexes.resize(frames_per_sequence_in * num_sequences); + // The inputs are in the order: all frames of sequence 0; then all frames of + // sequence 1; and so on. This is how the example-merging code does it, since + // it's more convenient when dealing with compressed matrices. + auto iter = request.inputs[0].indexes.begin(); + for (int32 n = 0; n < num_sequences; n++) { + for (int32 t = first_input_t; + t < first_input_t + frames_per_sequence_in; ++t,++iter) { + iter->n = n; + iter->t = t; + } + } + // ... but the outputs are in the order: the first frame of all sequences; + // the second frame of all sequences; and so on. 
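+  // E.g. with num_sequences = 2 and bottom_subsampling_factor = 3, the output
+  // indexes will be ordered (n=0,t=first_output_t), (n=1,t=first_output_t),
+  // (n=0,t=first_output_t+3), (n=1,t=first_output_t+3), and so on.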
+ request.outputs.resize(1); + request.outputs[0].name = "output"; + request.outputs[0].has_deriv = s.train_model; + request.outputs[0].indexes.resize(frames_per_sequence_out * num_sequences); + int32 t_stride_out = opts_.bottom_subsampling_factor; + iter = request.outputs[0].indexes.begin(); + for (int32 t = first_output_t; + t < first_output_t + frames_per_sequence_out * t_stride_out; + t += t_stride_out) { + for (int32 n = 0; n < num_sequences; ++n,++iter) { + iter->n = n; + iter->t = t; + } + } + std::shared_ptr computation = compiler_.Compile( + request); + computation_map_[s] = computation; + return computation; +} + +void NnetChainaBottomTrainer::PrintTotalStats() const { + KALDI_LOG << "Max-change stats for bottom nnet:"; + max_change_stats_.Print(*nnet_); +} +NnetChainaBottomTrainer::~NnetChainaBottomTrainer() { + delete delta_nnet_; +} + + +void NnetChainaTrainer::GetContextInfo( + const std::string &lang, + int32 *bottom_left_context, + int32 *bottom_right_context, + int32 *top_left_context, + int32 *top_right_context) { + +} + +BaseFloat NnetChainaTrainer::GetTotalObjf( + bool adapted, BaseFloat *weight) const { + *weight = 0.0; + BaseFloat tot_objf = 0.0; + for (auto iter = top_trainers_.begin(); iter != top_trainers_.end(); + ++iter) { + BaseFloat this_weight; + tot_objf += iter->second->GetTotalObjf(adapted, &this_weight); + *weight += this_weight; + } + return tot_objf; +} + +bool NnetChainaTrainer::PrintTotalStats() const { + bottom_trainer_.PrintTotalStats(); + bool ans = false; + for (auto iter = top_trainers_.begin(); iter != top_trainers_.end(); + ++iter) + if (iter->second->PrintTotalStats()) + ans = true; + return ans; +} + +NnetChainaTrainer::NnetChainaTrainer( + const NnetChainaTrainingOptions &config, + NnetChainaModels *models): + opts_(config), + models_(models), + bottom_trainer_(opts_, models->GetBottomNnet()) { + ComputeSimpleNnetContext(*models->GetBottomNnet(), + &bottom_left_context_, + &bottom_right_context_); +} + + +NnetChainaTopTrainer* NnetChainaTrainer::GetTopTrainerForLang( + const std::string &lang) { + auto iter = top_trainers_.find(lang); + if (iter != top_trainers_.end()) + return iter->second; + NnetChainaTopTrainer *ans = + new NnetChainaTopTrainer( + lang, opts_, + *(models_->GetDenFstForLang(lang)), + *(models_->GetTransformForLang(lang)), + models_->GetRawNnetForLang(lang)); + top_trainers_[lang] = ans; + return ans; +} + +// 'key' might be something like "afsdadsfds12345?lang=english&tw=1.0&bw=0.5" +// expressing how much we want this eg to be used to train the top, and bottom, +// models respectively. +void NnetChainaTrainer::Train(const std::string &key, + const NnetChainExample &eg) { + size_t num_top_trainers = top_trainers_.size(); + std::string lang_name = "default"; + // 'top_weight' is a weight on the derivatives and max-change + // when training the top model, 'bottom_weight' is the same + // for the bottom model. 
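+  // E.g. for the example key above ("...?lang=english&tw=1.0&bw=0.5"),
+  // lang_name becomes "english", top_weight 1.0 and bottom_weight 0.5; keys
+  // without such a query string leave the defaults below unchanged.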
+ BaseFloat top_weight = 1.0, + bottom_weight = 1.0; + ParseFromQueryString(key, "lang", &lang_name); + ParseFromQueryString(key, "tw", &top_weight); + ParseFromQueryString(key, "bw", &bottom_weight); + KALDI_ASSERT(top_weight >= 0.0 && bottom_weight >= 0.0); + + if (!opts_.bottom.train) + bottom_weight = 0.0; + if (!opts_.top.train) + top_weight = 0.0; + + int32 num_sequences, chunks_per_group, first_input_t, + num_input_frames, num_output_frames, + frame_subsampling_factor, + eg_left_context, eg_right_context; + FindChainaExampleStructure(eg, &num_sequences, &chunks_per_group, + &first_input_t, + &num_input_frames, &num_output_frames, + &frame_subsampling_factor, + &eg_left_context, &eg_right_context); + KALDI_ASSERT(num_sequences % chunks_per_group == 0); + int32 num_groups = num_sequences / chunks_per_group; + + AmNnetSimple *top_am_nnet = models_->GetNnetForLang(lang_name); + int32 top_left_context = top_am_nnet->LeftContext(), + top_right_context = top_am_nnet->RightContext(); + + int32 first_embedding_t, + num_embedding_frames; + ComputeEmbeddingTimes(first_input_t, num_input_frames, num_output_frames, + frame_subsampling_factor, + opts_.bottom_subsampling_factor, + bottom_left_context_, bottom_right_context_, + top_left_context, top_right_context, + opts_.keep_embedding_context, + &first_embedding_t, &num_embedding_frames); + + const GeneralMatrix &eg_input = eg.inputs[0].features; + CuMatrix cu_input(eg_input.NumRows(), eg_input.NumCols(), + kUndefined), + cu_embedding; + eg_input.CopyToMat(&cu_input); + bool train_bottom_nnet = bottom_weight != 0.0; + KALDI_ASSERT(cu_input.NumRows() == num_input_frames * num_sequences); + + NnetComputer *computer = bottom_trainer_.Forward( + num_sequences, first_input_t, + first_embedding_t, num_embedding_frames, + train_bottom_nnet, + &cu_input, &cu_embedding); + + int32 b = opts_.bottom_subsampling_factor, + first_embedding_t_subsampled = first_embedding_t / b, + top_subsampling_factor = frame_subsampling_factor / b; + + NnetChainaTopTrainer *top_trainer = GetTopTrainerForLang(lang_name); + + CuMatrix cu_embedding_deriv; + if (train_bottom_nnet) + cu_embedding_deriv.Resize(cu_embedding.NumRows(), cu_embedding.NumCols()); + + + bool success = top_trainer->Train(cu_embedding, num_sequences, + num_groups, + first_embedding_t_subsampled, + top_subsampling_factor, + eg.outputs[0].deriv_weights, + eg.outputs[0].supervision, + top_weight, + (train_bottom_nnet ? + &cu_embedding_deriv : NULL)); + + if (success && train_bottom_nnet) { + bottom_trainer_.Backward(bottom_weight, num_sequences, computer, + &cu_embedding_deriv); + } else { + delete computer; // if it's NULL, this will do nothing. + } + + if (top_trainers_.size() != num_top_trainers) { + // Move any permanently held bits of GPU memory to low addresses, to reduce + // fragmentation. 
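+    // (This condition only holds on minibatches where GetTopTrainerForLang()
+    // created a new top trainer above, i.e. the first time each language is
+    // seen.)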
+ bottom_trainer_.ConsolidateMemory(); + top_trainer->ConsolidateMemory(); + } + +} + + +NnetChainaTrainer::~NnetChainaTrainer() { + for (auto iter = top_trainers_.begin(); iter != top_trainers_.end(); + ++iter) + delete iter->second; +} + + + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3a/nnet-chaina-training.h b/src/nnet3a/nnet-chaina-training.h new file mode 100644 index 00000000000..559fb9dfba4 --- /dev/null +++ b/src/nnet3a/nnet-chaina-training.h @@ -0,0 +1,970 @@ +// nnet3a/nnet-chaina-training.h + +// Copyright 2015-2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET3_NNET_CHAINA_TRAINING_H_ +#define KALDI_NNET3_NNET_CHAINA_TRAINING_H_ + +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-computation.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-optimize.h" +#include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-training.h" +#include "nnet3/am-nnet-simple.h" +#include "chain/chain-training.h" +#include "chain/chain-den-graph.h" +#include "adapt/differentiable-transform-itf.h" + +namespace kaldi { +namespace nnet3 { + + +// This contains the subset of options that you can set for the bottom and the +// top model separately. They are set, for instance, as --bottom.train=false, +// or --top.dropout-test-mode=true. +struct NnetChainaTrainingPerModelOptions { + BaseFloat unadapted_weight; + bool train; + bool dropout_test_mode; + bool batchnorm_test_mode; + + NnetChainaTrainingPerModelOptions(): + unadapted_weight(0.5), + train(true), + dropout_test_mode(false), batchnorm_test_mode(false) { } + + + void Register(OptionsItf *opts) { + opts->Register("unadapted-weight", &unadapted_weight, + "Scale that is applied to the derivatives arising from the " + "unadapted pass of model evaluation, when training " + "Affects how much we prioritize the unadapted " + "features for neural nnet training."); + opts->Register("train", &train, + "Set this to false to disable training for this model."); + opts->Register("dropout-test-mode", &dropout_test_mode, + "Setting this option sets test mode on any dropout components. " + "Will persist in the model written out, if it's being trained."); + opts->Register("batchnorm-test-mode", &batchnorm_test_mode, + "Setting this option sets test mode on any batch-norm " + "(or batch-norm-like) components. 
"); + } + void Check() const { + KALDI_ASSERT(!(train && batchnorm_test_mode)); + KALDI_ASSERT(unadapted_weight >= 0.0); + } +}; + + +struct NnetChainaTrainingOptions { + NnetTrainerOptions nnet_config; + chain::ChainTrainingOptions chain_config; + NnetChainaTrainingPerModelOptions top; + NnetChainaTrainingPerModelOptions bottom; + bool apply_deriv_weights; + int32 bottom_subsampling_factor; + bool keep_embedding_context; + bool adaptation_model_accumulate; + bool adaptation_test_mode; + + NnetChainaTrainingOptions(): + apply_deriv_weights(true), + bottom_subsampling_factor(1), + keep_embedding_context(true), + adaptation_model_accumulate(false), + adaptation_test_mode(false) { } + + void Register(OptionsItf *opts) { + nnet_config.Register(opts); + chain_config.Register(opts); + ParseOptions top_opts("top", opts); + top.Register(&top_opts); // Register with prefix "top". + ParseOptions bottom_opts("bottom", opts); + bottom.Register(&bottom_opts); // Register with prefix "bottom". + + opts->Register("apply-deriv-weights", &apply_deriv_weights, + "If true, apply the per-frame derivative weights stored with " + "the example"); + opts->Register("bottom-subsampling-factor", &bottom_subsampling_factor, + "Determines the frequency at which we subsample the " + "embeddings from the bottom nnet. Implicitly, the " + "subsampling factor in the top nnet is the overall " + "--frame-subsampling-factor (determined when we dumped " + "the egs) divided by this value."); + opts->Register("keep-embedding-context", &keep_embedding_context, + "If true, we compute as much left/right context of the " + "embedding vectors (the output of the bottom nnet) as is " + "possible given the provided input features in the eg. " + "You'll generally only want this to be true " + "if the top network is recurrent or otherwise has " + "optional dependencies (for example: if it uses " + "StatisticsExtractionComponent, IfDefined(), Failover(), " + "etc.)."); + opts->Register("adaptation-model-accumulate", &adaptation_model_accumulate, + "Set this to true if you want to accumulate stats for " + "the adaptation model (i.e., its class-dependent means). " + "This will normally be done just once after training the " + "model, and will cause the adaptation objects to be " + "written out to . If this option is given, " + "the speaker adapted pass of the top model, and training " + "of the top or bottom model, will not be done; and we " + "expect --bottom-model-test-mode=true and " + "--top-model-test-mode=true to be set."); + opts->Register("adaptation-test-mode", &adaptation_test_mode, + "If true, use test mode for the adaptation model, which " + "means we'll use previously computed target models " + "rather than ones estimated from the minibatch. Training of " + "the bottom model is currently not supported in this case " + "(and, in any case, is likely undesirable)."); + } + void Check() const { + KALDI_ASSERT(bottom_subsampling_factor > 0); + top.Check(); + bottom.Check(); + } +}; + + +/** + This class, intended to mostly be accessed by NnetChainaTrainer, handles the + logic of reading the models and their corresponding denominator FSTs from + disk, and of writing out the corresponding (raw) trained models when + this iteration of training has finished. + + The reason this is not entirely trivial is that we want to make it easy to + support the multilingual case. 
In this case there is one 'bottom' model (the + embedding extractor) but there may be multiple 'top' models, each with their + associated transition model and denominator FST, containing their own + langauge name. We use a directory to organize these. + */ +class NnetChainaModels { + public: + /** + Constructor to which you pass the model directory and the den-fst + directory. The directory structure is: + /bottom.raw + should exist, and then for each language name (e.g. "english"), the following + files should exist: + /english.mdl /english.den.fst /english.ada + There is no requirement that all these directories be distinct. + + In practice, the language name will be either "default", in the + typical (monolingual) setup, or it might be arbitrary strings + representing languages such as "english", "french", and so on. + In general the language can be any string containing ASCII letters, numbers + or underscores. + + The models and denominator FSTs will only be read when they are actually + required, so languages that are not used by a particular job (e.g. because + they were not represented in the egs) will not actually be read. + + @param [in] opts Training options; needed to know which models + we should write out, and whether to set test mode + on models when reading them in. + @param [in] model_dir Directory where we'll find bottom.raw, and + .mdl for each language present in the egs + (the will be worked out from the key name from + "...?lang=xxx" in the key when reading the egs, + see ParseFromQueryString() in nnet-chain-utils.h. + @param [in] den_fst_dir Directory where we'll find the denominator + FST .den.fst for each language present in + the egs. + @param [in] transform_dir Directory where we'll find the + transforms (of type DifferentiableTransformItf), + as files .ada for each language present + in the egs. + */ + NnetChainaModels(const NnetChainaTrainingOptions &opts, + const std::string &model_dir, + const std::string &den_fst_dir, + const std::string &transform_dir); + + // Copy constructor + NnetChainaModels(const NnetChainaModels &other); + + + /* + This interpolates the (top and bottom) models stored here with the one in + 'model_dir', giving a weight 0 < new_model_weight < 1 to the new models. + All models currently loaded will be looked for (this depends what + languages were present in the egs), so you need to actually use this + object for training or objective evaluation before calling this function + on it. + */ + void InterpolateWith( + BaseFloat new_model_weight, + const std::string &model_dir); + + + Nnet* GetBottomNnet(); + + /** + Returns the AmNnetSimple object corresponding to a given language + name (e.g. "default", "english", "french"). Note: the model + file /.mdl will contain a TransitionModel and an + AmNnetSimple object + */ + AmNnetSimple *GetNnetForLang(const std::string &language_name); + + TransitionModel *GetTransitionModelForLang( + const std::string &language_name); + + + fst::StdVectorFst *GetDenFstForLang(const std::string &language_name); + + // This convenience function returns the Nnet object in the + // AmNnetSimple object returned by 'GetNnetForLang'. 
+ Nnet *GetRawNnetForLang(const std::string &language_name); + + differentiable_transform::DifferentiableTransformMapped *GetTransformForLang( + const std::string &language_name); + + // Writes out the following files: + // /bottom..raw (if opts_.bottom.train) + // and, for each language that we accessed, + // /..raw (if opts_.top.train) + // /..ada (if opts_.adaptation_model_accumulate) + // + // Thus, this writes out any models that we trained. There is no + // corresponding Read() function. + void Write(const std::string &model_out_dir, + bool binary, + int32 job_id); + + // This is a version of Write() is specialized for use by the + // model-combination code; it differs from the Write() above in + // that it writes out all models we have (ignoring whether or not + // they were trained), and it writes out the 'top' models as + // .mdl files (including the transition models). + void WriteCombinedModels(const std::string &model_out_dir, + bool binary); + + + ~NnetChainaModels(); + private: + // This function sets "pathname" to the string: + // /. + void GetPathname(const std::string &dir, + const std::string &name, + const std::string &suffix, + std::string *pathname); + + // If job_id is >= 0, then this version of GetPathname() sets "pathname" to + // the string: + // /.. + // otherwise (job_id < 0) it sets it to + // /. + void GetPathname(const std::string &dir, + const std::string &name, + int32 job_id, + const std::string &suffix, + std::string *pathname); + + // struct LanguageInfo contains the data that is stored per language. + struct LanguageInfo { + // am_nnet comes from /.mdl, which also + // stores a TransitionModel. + TransitionModel trans_model; + AmNnetSimple am_nnet; + // den_fst comes from /.den.fst + fst::StdVectorFst den_fst; + // transform comes from /.ada + differentiable_transform::DifferentiableTransformMapped transform; + LanguageInfo() { } + // Copy constructor + LanguageInfo(const LanguageInfo &other); + }; + + // Depending on opts_, this function may zero the component stats, set test + // mode for batchnorm and/or dropout components, and do model-collapsing. + void InitializeNnet(bool is_top_nnet, Nnet *nnet) const; + + // get the LanguageInfo* for this language, creating it (and reading its + // contents from disk) if it does not already exist. + LanguageInfo *GetInfoForLang(const std::string &lang); + + + const NnetChainaTrainingOptions &opts_; + // Directory where models are located. + std::string model_dir_; + // Directory where denominator FSTs are located. + std::string den_fst_dir_; + // Directory where transforms (type: DifferentiableTransformMapped) are located. + std::string transform_dir_; + + // This corresponds to /bottom.raw. + Nnet bottom_nnet_; + // The left and right context of bottom_nnet_. + int32 bottom_nnet_left_context_; + int32 bottom_nnet_right_context_; + + std::unordered_map lang_info_; +}; + + +/** + This object, which has a similar function to NnetChainTrainer, trains the + 'top' model for a single language and (optionally) outputs the derivatives + required to obtain the 'bottom' model. + */ +class NnetChainaTopTrainer { + public: + /** + Constructor. + @param [in] lang_name The name of the language this corresponds to + (needed for diagnostics). E.g. "default", + "english". + @param [in] config Options class + @param [in] den_fst The denominator FST for this language + @param [in] transform The transform object which will be used to produce adapted + features after the first pass of training. 
+ @param [in,out] nnet The neural net we are training. Expected to have + outputs called "output-si" (speaker-independent + output), "output", "output-si-xent", "output-xent", + and an input called "input". This class does not + take ownership of the pointer, but it will modify + its parameters (and stored statistics) during + training. + */ + NnetChainaTopTrainer( + const std::string &lang_name, + const NnetChainaTrainingOptions &config, + const fst::StdVectorFst &den_fst, + const differentiable_transform::DifferentiableTransformMapped &transform, + Nnet *nnet); + + /** Train on one minibatch. + @param [in] input The input (unadapted) features, most likely the embeddings + that are the output of the 'bottom' nnet. Assumed to form a + regular grid with the 't' value having higher stride, so the + first 'num_sequences' rows would correspond to the + lowest-numbered frames for all sequences, and so on. + @param [in] num_sequences The number of sequences/chunks represented + in 'input' (a.k.a. the minibatch size). Actually this must + be equal to supervision.num_sequences, but it's easier for + reasons of clarity and documentation to repeat it here. + @param [in] num_groups The total number of groups of chunks (you + can think of these as the same as speakers). Must be >1, and must divide + num_sequences. The number of sequences per speaker + must be the same for all speakers (it will equal num_sequences / num_groups), + and the sequences for a speaker must be consecutively numbered. + @param [in] first_input_t The 't' value corresponding to the first + input frame (will normally be a negative number, + corresponding to the left context we are giving to the + 'top' model, since we renumber to ensure that the sequences + have 't' values starting from 0). The 't' values at the + input will be consecutive, and the number of frames per + sequence will equal input.NumRows() / num_sequences. Note: + if the embeddings are computed at a lower frame rate than + the original features, we renumber things to make the + embeddings consecutive. + @param [in] top_subsampling_factor The subsampling factor of the top network + (which will equal the frame subsampling factor implicit in the original + egs that we read, divided by bottom_subsampling_factor). E.g. this + might frequently be 1 or 3. The frames at the output of the 'top' + nnet are evaluated for 't' values that are multiples of + 'top_subsampling_factor', starting from t=0. + @param [in] deriv_weights Per-frame weights that will be applied to the derivatives + w.r.t. the objective function. Dimension is expected to be either + input.NumRows(), or zero (in which case it is treated the same as a + vector containing all ones). + @param [in] supervision The chain supervision object representing the objective + function at the output. Its num_sequences must equal the + num_sequences passed into this function as a separate argument. + @param [in] model_training_scale A scale we'll apply to the parameter changes + and max-change values when taking any step. This will be + referred to elsewhere as top_weight, or "tw" when present in + keys of egs in scp files; we'll have a separately specifiable + weight for the bottom nnet. If this is zero, we won't be training + the top model on this eg at all. + @param [out] input_deriv If non-NULL, the derivative of the objective function + w.r.t. the input features will be written to here (this function + will set it using Swap(), so you don't need to correctly size it). 
+ @return Returns true if it successfully trained on this minbiatch, + false on error (e.g. if a NaN was generated, which should + not really happen). + */ + bool Train(const CuMatrixBase &input, + int32 num_sequences, + int32 num_groups, + int32 first_input_t, + int32 top_subsampling_factor, + const VectorBase &deriv_weights, + const chain::Supervision &supervision, + BaseFloat model_training_scale, + CuMatrix *input_deriv = NULL); + + // Prints out the final stats, and return true if there was a nonzero count. + bool PrintTotalStats() const; + + + // Returns the total objective-function value for the adapted computation (if + // adapted == true), or the unadapted/speaker-independent computation + // otherwise, with the corresponding weight (which can be interpreted as a + // frame count) written to 'weight'. The returned value would normally be + // divided by 'weight' before being displayed. + BaseFloat GetTotalObjf(bool adapted, BaseFloat *weight) const; + + + // Calls kaldi::nnet3::ConsolidateMemory() on nnet_ and delta_nnet_; we do + // this after the first minibatch of training, to reduce fragmentation. + void ConsolidateMemory(); + + ~NnetChainaTopTrainer(); + private: + + // We use this as an index with which to look up computations, kind of like a + // lookaside buffer; it avoids creating a much larger structure with large + // vectors of Indexes in it. + struct ComputationStructure { + bool adapted; + bool train_model; + bool need_input_deriv; + int32 num_sequences; + int32 frames_per_sequence_in; + int32 frames_per_sequence_out; + int32 first_input_t; + int32 top_subsampling_factor; + inline bool operator == (const ComputationStructure &other) const { + return adapted == other.adapted && + train_model == other.train_model && + need_input_deriv == other.need_input_deriv && + num_sequences == other.num_sequences && + frames_per_sequence_in == other.frames_per_sequence_in && + frames_per_sequence_out == other.frames_per_sequence_out && + first_input_t == other.first_input_t && + top_subsampling_factor == other.top_subsampling_factor; + }; + ComputationStructure (const ComputationStructure &other) = default; + ComputationStructure &operator = ( + const ComputationStructure &other) = default; + /** + Constructor. + @param [in] adapted True if we want the outputs from "output" and + "output-xent", and false if we want the outputs from + "output-si" and "output-si-xent". + @param [in] train_model True if we will be training the acoustic + model with this example. + @param [in] need_input_deriv True if we need the derivative w.r.t. + the features that are the input to this computation. + @param [in] num_sequences The number of sequences in this minibatch + (a.k.a. the minibatch size). + @param [in] frames_per_sequence_in The number of frames for each sequence + of input features. They are assumed to be consecutively + numbered. + @param [in] frames_per_sequence_out The 'frames_per_sequence' in + the ChainSupervision object, i.e. the length of the + output sequences of the computation. + @param [in] first_input_t The first 't' value in the input + sequence; will normally be negative (corresponding to + the negative of the number of frames of left context). + @param [in] top_subsampling_factor Frame subsampling factor at the + output; e.g., 3 would mean we are evaluating the output + at frames t=0, t=3, and so on. 
+ */ + ComputationStructure(bool adapted, + bool train_model, + bool need_input_deriv, + int32 num_sequences, + int32 frames_per_sequence_in, + int32 frames_per_sequence_out, + int32 first_input_t, + int32 top_subsampling_factor); + }; + struct ComputationHasher { + inline size_t operator() (const ComputationStructure &s) const { + return (s.adapted ? 33 : 0) + + (s.train_model ? 333 : 0) + + size_t(s.num_sequences) + + 10 * size_t(s.frames_per_sequence_in) + + 100 * size_t(s.frames_per_sequence_out) + + 1000 * size_t(s.first_input_t) + + 10000 * size_t(s.top_subsampling_factor); + } + }; + + // This is a faster lookup mechanism for the computation than + // is provided by the compiler's inherent caching. + std::unordered_map, + ComputationHasher> computation_map_; + + // This wraps the call to the compiler. See constructor + // of struct ComputationStructure for more documentation. + std::shared_ptr GetComputation( + const ComputationStructure &s); + + + /** + This does the training on the unadapted branch ("si" / speaker-independent) + of the neural net. + @param [in] input The input features, as supplied to Train(). Order + of rows is: the first frame of all sequences; the + second frame of all sequences; and so on. + @param [in] computation The computation corresponding to the unadapted + branch of the nnet. + @param [in] supervision The chain supervision object. The nnet output + dimensions are worked out from this, as well as + using this object to compute the objective function. + @param [in] need_model_deriv True if we are training on this minibatch, + on the unadapted data-- i.e. if we need to compute + the model derivative. + @param [in] deriv_weights Weights to be applied to the derivatives for the + corresponding frames of the output (order is: + first frame for all sequences; second frame for + all sequences, etc.). May be stored with the + egs. If this is the empty vector or + --apply-deriv-weights=false, they won't be + appplied. + @param [out] posterior The posteriors from the numerator forward-backward + on the adaptation model will be written to here. + The number of frames will be the number of frames in + the output sequence (supervision.frames_per_sequence), + and the order is: all sequences' frame 0; then all + sequences' frame 1; and so on. + @param [out] input_deriv Derivative w.r.t. the input features; this will + be set via Swap(), if it is not NULL. Any weight to + (be applied e.g. opts_.unadapted_bottom_weight), + should be applied by the caller. + @return Returns true if the training went through successfully + (it should very rarely return false, e.g. if a NaN was generated). + */ + bool TrainUnadapted(const CuMatrixBase &input, + const NnetComputation &computation, + const chain::Supervision &supervision, + bool need_model_deriv, + const CuVectorBase &deriv_weights, + Posterior *posterior, + CuMatrix *input_deriv); + + /** + Does the adapted pass of training. + @param [in] computation The adapted version of the + computation (this one uses the outputs + "output" and "output-xent" instead of + "output-si" and "output-si-xent". + @param [in] supervision The chain supervision + object, containing information derived + from the numerator lattices. + @param [in] model_training_scale A scale we'll apply to the parameter changes + and max-change values when taking any step. This will be + referred to elsewhere as top_weight, or "tw" when present in + keys of egs in scp files; we'll have a separately specifiable + weight for the bottom nnet. 
If this is zero, we won't be training + the top model on this eg at all. + @param [in] deriv_weights Weights to be applied to the derivatives for the + corresponding frames of the output (order is: + first frame for all sequences; second frame for + all sequences, etc.). May be stored with the + egs. If this is the empty vector or + --apply-deriv-weights=false, they won't be + appplied. + @param [in] input The adapted input features. Provided as a non-const + pointer because it is consumed destructively (via Swap()). + @param [in,out] input_deriv If non-NULL, the + feature derivative w.r.t. the [speaker-adapted] input + features will be written to this location. It's + done via Swap(), so it doesn't have to be correctly + sized on entry. + @return + */ + bool TrainAdapted(const NnetComputation &computation, + const chain::Supervision &supervision, + BaseFloat model_training_scale, + const CuVectorBase &deriv_weights, + CuMatrix *input, + CuMatrix *input_deriv); + + // This function increments num_minibatches_processed_, but before + // doing so, if it notices that it is zero it makes certain calls + // to ConsolidateMemory() + void IncrementNumMinibatches(); + + std::string lang_name_; + + const NnetChainaTrainingOptions &opts_; + chain::DenominatorGraph den_graph_; + const differentiable_transform::DifferentiableTransformMapped &transform_; + CachingOptimizingCompiler compiler_; + + + Nnet *nnet_; + Nnet *delta_nnet_; // stores the change to the parameters on each training + // iteration. + + // These objects keep track of the objective-function values for the 4 + // outputs. We have the regular output (sequence objective) and the 'xent' + // output for cross-entropy regularization, and there are speaker independent + // (si) versions of those outputs also. + ObjectiveFunctionInfo output_si_objf_; + ObjectiveFunctionInfo output_si_xent_objf_; + ObjectiveFunctionInfo output_objf_; + ObjectiveFunctionInfo output_xent_objf_; + + // Number of minibatches processed. Note: we actually train the nnet twice + // per minibatch, because there are the speaker-independent and + // speaker-dependent passes. + int32 num_minibatches_processed_; + + // stats for max-change. This combines both speaker-independent and + // speaker-adapted phases of training, since we compute the gradient summed + // over both passes (with the unadapted derivatives weighted by + // opts_.unadapted_top_weight) before updating the model. + MaxChangeStats max_change_stats_; +}; + + + +/** + This object, which has a similar function to NnetChainTrainer, takes care of + evaluating and possibly training the 'bottom' model. +*/ +class NnetChainaBottomTrainer { + public: + /** + Constructor. + @param [in] opts Options class. This class maintains a reference to it, + so don't delete it. + @param [in,out] nnet The neural net we are training. Expected (for now) + to have an input called 'input' (corresponding to + the original input features and an output called + 'output' (corresponding to the embeddings). + */ + NnetChainaBottomTrainer(const NnetChainaTrainingOptions &opts, + Nnet *nnet); + + /** Train on one minibatch. + @param [in] num_sequences The number of sequences/chunks represented + in 'input' (a.k.a. the minibatch size). + @param [in] first_input_t The 't' value corresponding to the first input + frame (will normally be a negative number). The 't' values at + the input will be consecutive, and the number of frames per sequence + will equal input.NumRows() / num_sequences. 
Note: if the embeddings + are computed at a lower frame rate than the original features, we + renumber things to make the embeddings consecutive. + (Note: bottom_subsampling_factor was passed in in the constructor). + @param [in] first_output_t The 't' value corresponding to the first output + frame (will normally be a negative number, corresponding to the left + context we are giving to the 'top' model, since we assume that the + sequences have 't' values starting from 0). The 't' values at + the output will be separated by the 'bottom_subsampling_factor' + which was given to the constructor. (We'll renumber them + by dividing them by 'bottom_subsampling_factor' before giving + them to the 'top' network. + @param [in] frames_per_sequence_out The number of output frames per sequence. + This is determined by the context of the top and bottom nnets + and the "keep_embedding_context" config value. + @param [in] train_model True if we'll be training the bottom model + for this eg. If this is false, a backward pass will not be. + needed, and this function will return NULL + @param [in] input The input features, most likely raw MFCC or filterbank + features. A pointer, since it is consumed destructively + (via 'swap'). + @param [out] output The output will be written to here. Does not have + to be correctly sized (we'll copy using Swap()). + @return Returns the NnetComputer object that we did the computation with, + if train_model == true (otherwise, returns NULL). + The user should either pass this into Backward(), or delete it. + */ + NnetComputer* Forward(int32 num_sequences, + int32 first_input_t, + int32 first_output_t, + int32 frames_per_sequence_out, + bool train_model, + CuMatrix *input, + CuMatrix *output); + + + /** + Does the backward pass, which will do model training. This should only be + called if the bottom nnet needs to be trained. + @param [in] model_training_scale A scale we'll apply to the parameter changes, + l2 term and max-change values when taking the step.. This will be + referred to elsewhere as bottom_weight, or "bw" when present in + keys of egs in scp files; we'll have a separately specifiable + weight for the top nnet. If this is zero, we won't be training + the top model on this eg at all (and we'll expect 'false' to + have been passed in for the 'train_model' arg on the corresponding + call to Forward()). + @param [in] num_sequences The number of sequences (chunks) we had in this + minibatch-- needed for the application of l2. + @param [in] computer The computer object returned from the + forward pass. This function takes ownership of it and + will delete it when done with it. + @param [in] output_deriv The derivative w.r.t. the output of + the forward pass. It is consumed destructively + by this function. + + */ + void Backward(BaseFloat model_training_scale, + int32 num_sequences, + NnetComputer *computer, + CuMatrix *output_deriv); + + // Prints the max-change stats for the bottom nnet. + void PrintTotalStats() const; + + // Calls kaldi::nnet3::ConsolidateMemory() on nnet_ and delta_nnet_; we do + // this after the first minibatch of training, to reduce fragmentation. + void ConsolidateMemory(); + + ~NnetChainaBottomTrainer(); + private: + + // We use this as an index with which to look up computations, kind of like a + // lookaside buffer; it avoids creating a much larger structure with large + // vectors of Indexes in it. 
+ struct ComputationStructure { + bool train_model; + int32 num_sequences; + int32 frames_per_sequence_in; + int32 frames_per_sequence_out; + int32 first_input_t; + int32 first_output_t; + inline bool operator == (const ComputationStructure &other) const { + return train_model == other.train_model && + num_sequences == other.num_sequences && + frames_per_sequence_in == other.frames_per_sequence_in && + frames_per_sequence_out == other.frames_per_sequence_out && + first_input_t == other.first_input_t && + first_output_t == other.first_output_t; + }; + ComputationStructure (const ComputationStructure &other) = default; + ComputationStructure &operator = ( + const ComputationStructure &other) = default; + /** + Constructor. + @param [in] train_model True if we are going to train the bottom model. + @param [in] need_input_deriv True if we need the derivative w.r.t. + the features that are the input to this computation. + @param [in] num_sequences The number of sequences in this minibatch + (a.k.a. the minibatch size). + @param [in] frames_per_sequence_in The number of frames for each sequence + of input features. They are assumed to be consecutively + numbered. + @param [in] frames_per_sequence_out The 'frames_per_sequence' in + the ChainSupervision object, i.e. the length of the + output sequences of the computation. + @param [in] first_input_t The first 't' value in the input + sequence; will normally be negative (corresponding to + the negative of the number of frames of left context). + */ + ComputationStructure(bool train_model, + int32 num_sequences, + int32 frames_per_sequence_in, + int32 frames_per_sequence_out, + int32 first_input_t, + int32 first_output_t); + }; + struct ComputationHasher { + inline size_t operator() (const ComputationStructure &s) const { + return size_t(s.num_sequences) + + 10 * size_t(s.frames_per_sequence_in) + + 100 * size_t(s.frames_per_sequence_out) + + 1000 * size_t(s.first_input_t) + + 10000 * size_t(s.first_output_t); + } + }; + + // This is a faster lookup mechanism for the computation than + // is provided by the compiler's inherent caching. + std::unordered_map, + ComputationHasher> computation_map_; + + // This wraps the call to the compiler. See constructor + // of struct ComputationStructure for more documentation. + std::shared_ptr GetComputation( + const ComputationStructure &s); + + + + /** + Converts the format of the posterior from how it is at the output of the + network to how it is at the input (i.e. in the embedding space). + Basically, this will consist of padding with empty posteriors for the + "context frames", and possibly upsampling the posteriors (by just repeating + each one for, say, 3 frames, if top_subsampling_factor == 3). + + The number of frames per sequence at the output will equal + post_at_output.size() / num_sequences, and the number of frames per + sequence at the input will equal post_at_inptu->size() / num_sequences + (note: this means 'post_at_input is expected to be appropriately sized + when this function is called). + */ + void ConvertPosterior(const Posterior &post_at_output, + int32 num_sequences, + int32 first_input_t, + int32 top_subsampling_factor, + Posterior *post_at_input); + + const NnetChainaTrainingOptions opts_; + + Nnet *nnet_; + Nnet *delta_nnet_; // stores the change to the parameters on each training + // iteration. + + CachingOptimizingCompiler compiler_; + + // Number of minibatches processed. 
+ int32 num_minibatches_processed_; + + // stats for max-change + MaxChangeStats max_change_stats_; +}; + + + +/** + This class is for single-threaded training of neural nets using the 'chain' + model and our adaptation framework +*/ +class NnetChainaTrainer { + public: + /** + Constructor + @param [in] config Options class + @param [in] models Object that provides access to the models and + denominator FSTs, indexed as appropriate by language-id. + */ + NnetChainaTrainer(const NnetChainaTrainingOptions &config, + NnetChainaModels *models); + + /* Train on one minibatch. + @param [in] key The key the example had in the archive. This is + used to work out the language name. + @param [in] eg The example we are training on. It is expected + to have an input named 'input' (the features) and an + output named 'output' (containing the chain supervision + object). We'll make use of the chunks_per_group member + of the NnetChainSupervision object, which is not used + outside the 'chaina' framework. + */ + void Train(const std::string &key, + const NnetChainExample &eg); + + // Prints out the final stats, and return true if there was a nonzero count. + bool PrintTotalStats() const; + + // Returns the total objective-function value, summed over all languages + // present, for the adapted computation (if adapted == true), or the + // unadapted/speaker-independent computation otherwise, with the corresponding + // weight (which can be interpreted as a frame count) written to 'weight'. + // The returned value would normally be divided by 'weight' before being + // displayed. + BaseFloat GetTotalObjf(bool adapted, BaseFloat *weight) const; + + // Prints out the max-change stats (if nonzero): the percentage of time that + // per-component max-change and global max-change were enforced. + void PrintMaxChangeStats() const; + + ~NnetChainaTrainer(); + + private: + + void GetContextInfo(const std::string &lang, + int32 *bottom_left_context, + int32 *bottom_right_context, + int32 *top_left_context, + int32 *top_right_context); + + + NnetChainaTopTrainer *GetTopTrainerForLang(const std::string &lang); + + + const NnetChainaTrainingOptions &opts_; + // pointer to object owned outside this class. + NnetChainaModels *models_; + + // left and right context of bottom model. + int32 bottom_left_context_; + int32 bottom_right_context_; + + NnetChainaBottomTrainer bottom_trainer_; + // map from language name (e.g. "default", "english", "french") to + // the object that trains the corresponding 'top' nnet. + std::unordered_map top_trainers_; +}; + + +/** + This utility function, used in training and test-time adaptation code, + converts the format of the posterior from how it is at the output of the + top network to how it is at the input (i.e. in the embedding space). + Basically, this will consist of padding with empty posteriors for the + "context frames", and possibly upsampling the posteriors (by just repeating + each one for, say, 3 frames, if top_subsampling_factor == 3). The + rule we'll use is: copy the posterior from the output frame that + is closest in numbering, rounding down in case of ties (i.e., for even + subsampling factor). + + @param [in] post_at_output The posterior that needs to be padded, + consisting of 'num_sequences' sequences, each with 't' + values starting at zero, at multiples of + 'top_subsampling_factor', and with number of 't' values + determined by: num_frames_out = post_at_output.size() / + num_sequences. 
The 't' has the larger stride than the + minibatch index 'n', so it's: frame t=0 of all sequences, + then frame t=1*top_subsampling_factor of all sequences, + and so on. + @param [in] num_sequences The number of sequences/chunks + @param [in] first_input_t The first 't' value at the input, for which + we need a posterior for (note: negative 't' values will + get zero posterior). Implicitly, first_output_t = 0. + The number of input frames is worked out as + post_at_input->size() / num_sequences; the 't' values + at the input are assumed to be consecutive. + @param [in] top_subsampling_factor The number of frames with which + 't' values at the output are separated. + @param [in] pdf_map This is either the empty vector (meaning: + the DifferentiableTransform object deals with pdf-ids + directly), or it is a map from pdf-ids to cluster-ids. + This would actually be obtained from build-tree-two-level + after building a two-level tree, and it would be stored + in the .ada object. The actual class labels that + the DifferentiableTransform object deals with, will + be the values stored in 'pfd_map' (i.e. these cluster-ids). + @param [in] num_classes Provided for checking purposes only: the + number of classes that the DifferentiableTransform object + expects. If pdf_map is empty we expect this to be the + same as the number of pdf-ids (and the ints in + post_at_output to be in the range [0, num_classes - 1]). + If pdf_map is nonempty, we expect this to be the same + as the maximum element in pdf_map, plus one. + @param [out] post_at_input The posterior after padding and possibly + subsampling. Should have the correct size but its + elements are expected to be empty at entry. Like + post_at_output, the 't' has the larger stride than + the minibatch-index 'n'. +*/ +void ConvertPosterior(const Posterior &post_at_output, + int32 num_sequences, + int32 first_input_t, + int32 top_subsampling_factor, + const std::vector &pdf_map, + int32 num_classes, + Posterior *post_at_input); + + + +} // namespace nnet3 +} // namespace kaldi + +#endif // KALDI_NNET3_NNET_CHAINA_TRAINING_H_ diff --git a/src/nnet3a/nnet-chaina-utils-test.cc b/src/nnet3a/nnet-chaina-utils-test.cc new file mode 100644 index 00000000000..6dd9a942ad7 --- /dev/null +++ b/src/nnet3a/nnet-chaina-utils-test.cc @@ -0,0 +1,57 @@ +// nnet3/nnet-chaina-utils-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
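+// This program contains unit tests for the query-string parsing helpers
+// declared in nnet3a/nnet-chaina-utils.h (see ParseFromQueryString()).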
+ +#include "nnet3a/nnet-chaina-utils.h" + +namespace kaldi { +namespace nnet3 { + +void UnitTestParseFromQueryString(){ + std::string value; + KALDI_ASSERT(ParseFromQueryString("abc", "d", &value) == false); + KALDI_ASSERT(ParseFromQueryString("abc?e=f", "d", &value) == false); + KALDI_ASSERT(ParseFromQueryString("abc?d=f", "d", &value) == true && + value == "f"); + KALDI_ASSERT(ParseFromQueryString("abc?dd=f", "d", &value) == false); + KALDI_ASSERT(ParseFromQueryString("abc?dd=f&d=gab", "d", &value) == true && + value == "gab"); + KALDI_ASSERT(ParseFromQueryString("abc?d=f&dd=gab", "d", &value) == true && + value == "f"); + KALDI_ASSERT(ParseFromQueryString("abc?d=f&ex=fda&dd=gab", "ex", &value) == true && + value == "fda"); + + + BaseFloat f; + KALDI_ASSERT(ParseFromQueryString("abc?d=f&ex=1.0&dd=gab", "ex", &f) == true && + f == 1.0); + KALDI_ASSERT(ParseFromQueryString("abc?d=f&ex=1.0&dd=gab", "e", &f) == false); +} + +} // namespace nnet3 +} // namespace kaldi + +int main() { + using namespace kaldi; + using namespace kaldi::nnet3; + SetVerboseLevel(2); + UnitTestParseFromQueryString(); + KALDI_LOG << "Tests succeeded."; + + return 0; +} diff --git a/src/nnet3a/nnet-chaina-utils.cc b/src/nnet3a/nnet-chaina-utils.cc new file mode 100644 index 00000000000..a83097395de --- /dev/null +++ b/src/nnet3a/nnet-chaina-utils.cc @@ -0,0 +1,186 @@ +// nnet3/nnet-chaina-utils.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "nnet3/nnet-utils.h" +#include "nnet3a/nnet-chaina-utils.h" + +namespace kaldi { +namespace nnet3 { + +void FindChainaExampleStructure(const NnetChainExample &eg, + int32 *num_sequences, + int32 *chunks_per_spk, + int32 *first_input_t, + int32 *num_input_frames, + int32 *num_output_frames, + int32 *frame_subsampling_factor, + int32 *eg_left_context, + int32 *eg_right_context) { + if (eg.inputs.size() != 1 || + eg.inputs[0].name != "input") + KALDI_ERR << "Expected eg to have exactly one input, named 'input'"; + + if (eg.outputs.size() != 1 || + eg.outputs[0].name != "output") + KALDI_ERR << "Expected eg to have exactly one output, named 'output'"; + + + const NnetChainSupervision &supervision = eg.outputs[0]; + *num_sequences = supervision.supervision.num_sequences; + *chunks_per_spk = supervision.chunks_per_group; + + KALDI_ASSERT(supervision.indexes.size() % *num_sequences == 0 && + !supervision.indexes.empty()); + KALDI_ASSERT(supervision.indexes[0] == Index() && + "Expected first index to have t=0,n=0,x=0"); + // We expect t to have the larger stride. 
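+  // i.e. the ordering is (t=0,n=0), (t=0,n=1), ..., (t=0,n=num_sequences-1),
+  // then the same block for the next output 't'; hence the second index should
+  // have n == 1 (and the same 't' as the first).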
+ KALDI_ASSERT(supervision.indexes[1].n == 1 && + "Supervision is in an unexpected order"); + Index last_output_index = supervision.indexes.back(); + KALDI_ASSERT(last_output_index.n == *num_sequences - 1); + *num_output_frames = int32(supervision.indexes.size()) / *num_sequences; + int32 last_output_t = last_output_index.t; + KALDI_ASSERT(last_output_t % (*num_output_frames - 1) == 0); + *frame_subsampling_factor = last_output_t / (*num_output_frames - 1); + + + const NnetIo &input_io = eg.inputs[0]; + *first_input_t = input_io.indexes[0].t; + if (input_io.indexes[1].t != *first_input_t + 1) { + KALDI_ERR << "Input indexes are in the wrong order or not consecutive: " + << input_io.indexes[1].t << " != " << (*first_input_t) << " + 1"; + } + Index last_input_index = input_io.indexes.back(); + KALDI_ASSERT(last_input_index.n == *num_sequences - 1); + int32 last_input_t = last_input_index.t; + *num_input_frames = last_input_t + 1 - *first_input_t; + + *eg_left_context = -(*first_input_t); + *eg_right_context = last_input_t - last_output_t; +} + + +bool ParseFromQueryString(const std::string &string, + const std::string &key_name, + std::string *value) { + size_t question_mark_location = string.find_last_of("?"); + if (question_mark_location == std::string::npos) + return false; + std::string key_name_plus_equals = key_name + "="; + // the following do/while and the initialization of key_name_location is a + // little convoluted. We want to find "key_name_plus_equals" but if we find + // it and it's not preceded by '?' or '&' then it's part of a longer key and we + // need to ignore it and see if there's a next one. + size_t key_name_location = question_mark_location; + do { + key_name_location = string.find(key_name_plus_equals, + key_name_location + 1); + } while (key_name_location != std::string::npos && + key_name_location != question_mark_location + 1 && + string[key_name_location - 1] != '&'); + + if (key_name_location == std::string::npos) + return false; + size_t value_location = key_name_location + key_name_plus_equals.length(); + size_t next_ampersand = string.find_first_of("&", value_location); + size_t value_len; + if (next_ampersand == std::string::npos) + value_len = std::string::npos; // will mean "rest of string" + else + value_len = next_ampersand - value_location; + *value = string.substr(value_location, value_len); + return true; +} + + +bool ParseFromQueryString(const std::string &string, + const std::string &key_name, + BaseFloat *value) { + std::string s; + if (!ParseFromQueryString(string, key_name, &s)) + return false; + bool ans = ConvertStringToReal(s, value); + if (!ans) + KALDI_ERR << "For key " << key_name << ", expected float but found '" + << s << "', in string: " << string; + return true; +} + + +bool ComputeEmbeddingTimes(int32 first_input_t, + int32 num_input_frames, + int32 num_output_frames, + int32 frame_subsampling_factor, + int32 bottom_subsampling_factor, + int32 bottom_left_context, + int32 bottom_right_context, + int32 top_left_context, + int32 top_right_context, + bool keep_embedding_context, + int32 *first_embedding_t, + int32 *num_embedding_frames) { + KALDI_ASSERT(num_input_frames > 0 && num_output_frames > 0 && + first_input_t <= 0 && frame_subsampling_factor > 0); + KALDI_ASSERT(bottom_subsampling_factor > 0 && + frame_subsampling_factor % bottom_subsampling_factor == 0); + KALDI_ASSERT(bottom_left_context >= 0 && bottom_right_context >= 0 && + top_left_context >= 0 && top_right_context >= 0); + + // below '_subsampled' means after dividing the 
't' values by + // 'bottom_subsampling_factor'. + // Note: implicitly, the first frame required at the output is t=0. + int32 first_required_embedding_t_subsampled = -top_left_context, + last_required_embedding_t_subsampled = + num_output_frames - 1 + top_right_context; + + int32 first_computable_embedding_t = first_input_t + bottom_left_context, + last_computable_embedding_t = + first_input_t + num_input_frames - 1 - bottom_right_context; + + int32 b = bottom_subsampling_factor; + + // By adding b - 1 and doing division that rounds down (towards negative + // infinity, we effectively round up when computing + // first_computable_embedding_t / b, which is appropriate because + // we need the first multiple of b that's actually computable. + int32 first_computable_embedding_t_subsampled = + DivideRoundingDown(first_computable_embedding_t + b - 1, b), + last_computable_embedding_t_subsampled = + DivideRoundingDown(last_computable_embedding_t, b); + if (first_computable_embedding_t_subsampled > first_required_embedding_t_subsampled || + last_computable_embedding_t_subsampled < last_required_embedding_t_subsampled) { + KALDI_WARN << "The training examples have insufficient context vs. the models."; + return false; + } + if (keep_embedding_context) { + *first_embedding_t = first_computable_embedding_t_subsampled * b; + *num_embedding_frames = 1 + last_computable_embedding_t_subsampled - + first_computable_embedding_t_subsampled; + } else { + *first_embedding_t = first_required_embedding_t_subsampled * b; + *num_embedding_frames = 1 + last_required_embedding_t_subsampled - + first_required_embedding_t_subsampled; + } + return true; +} + + + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3a/nnet-chaina-utils.h b/src/nnet3a/nnet-chaina-utils.h new file mode 100644 index 00000000000..4f028a4af0b --- /dev/null +++ b/src/nnet3a/nnet-chaina-utils.h @@ -0,0 +1,182 @@ +// nnet3a/nnet-chaina-utils.h + +// Copyright 2015-2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET3_NNET_CHAINA_UTILS_H_ +#define KALDI_NNET3_NNET_CHAINA_UTILS_H_ + +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-computation.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-optimize.h" +#include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-training.h" +#include "chain/chain-training.h" +#include "chain/chain-den-graph.h" + +namespace kaldi { +namespace nnet3 { + +/** + This function works out certain structural information from an example for + 'chaina' (adapted chain) training. It assumes (and spot-checks) that the eg + has a single input, called 'input', with a regular structure where the 'n' + has the highest stride so it's: all frames for sequence 0; all frames for + sequence 1; and so on. 
It will raise an exception if the example does not, + in some respect, have the expected structure. + + @param [in] The example we are getting the structural information from + @param [out] num_sequences The number of sequences/chunks (actually just + the num_sequences in the eg.supervision object). + @param [out] chunks_per_spk The number of chunks per speaker + (just eg.chunks_per_spk) + @param [out] first_input_t The lowest numbered 't' value in the inputs. + Usually will be negative. This function requires the + input 't' values to be consecutive, and will crash + if they are not. + @param [out] num_input_frames The number of input frames. The last input + 't' value will be first_input_t + num_input_frames - 1. + @param [out] num_output_frames The number of output frames (which are + assumed to start from t=0 and to be spaced by + 'frame_subsampling_factor. + @param [out] frame_subsampling_factor The spacing on the output frames, + equal to the amount of subsampling that happens + between the input and the output (this will + later be factorized as: + frame_subsampling_factor = + bottom_subsampling_factor * top_subsampling_factor. + @param [out] eg_left_context Just as a convenience, this function outputs + the left-context in the example, which equals + first_output_t - first_input_t = -first_input_t. + @param [out] eg_right_context Again just as a convenience, this function + outputs the right-context of the example, which + equals last_input_t - last_output_t = + (first_input_t + num_input_frames - 1) - + (first_output_t + num_output_frames - 1) * frame_subsampling_factor + (note: first_output_t is zero). +*/ +void FindChainaExampleStructure(const NnetChainExample &eg, + int32 *num_sequences, + int32 *chunks_per_spk, + int32 *first_input_t, + int32 *num_input_frames, + int32 *num_output_frames, + int32 *frame_subsampling_factor, + int32 *eg_left_context, + int32 *eg_right_context); + +/** + This function computes some info about which frames we need to compute the + embeddings for (i.e. which frames we need to request at the output of the + bottom nnet). It will print a warning and return false if the egs had + insufficient context to compute what is requested. + + @param [in] first_input_t The first 't' value for the input that + is provided to the bottom nnet. + @param [in] num_input_frames The number of input frames provided to + the bottom nnet; these are assumed to be consecutive. + @param [in] num_output_frames The number of output frames that we + need to compute the output for (this will be + the sequence_length in the chain supervision object). + @param [in] frame_subsampling_factor The factor by which we + subsample to get the final output (includes subsampling + in both the bottom and top nnet). + @param [in] bottom_subsampling_factor The amount of subsampling + for getting the embeddings (i.e. the embeddings + are obtained at t = multiples of this value.) + Must be >0 and divide frame_subsampling_factor. + This must be provided and can't be worked out from + the nnets, because the top nnet uses a different frame + numbering-- i.e. we divide the 't' values by + 'bottom_subsampling_factor' so that the inputs to the + top nnet are consecutive. This will make it easier + to apply the top nnet separately from binaries. 
+ @param [in] bottom_left_context The num-frames of left-context that the + bottom nnet requires + @param [in] bottom_right_context The num-frames of right-context that the + bottom nnet requires + @param [in] top_left_context The num-frames of left-context that the + top nnet requires. Note: this is *after* dividing the + 't' values by bottom_subsampling_factor, so the number + top_left_context * bottom_subsampling_factor can be used + to compute the total left-context that we need to put in + the egs. + @param [in] top_right_context The num-frames of right-context that the + top nnet requires. See docs for top_left_context for more + info RE frame subsampling + @param [in] keep_embedding_context True if we want to compute as + many frames of the embedding as we can given the amount + of available left context in the input. This will be + usually be set to true if the top nnet is recurrent or + can otherwise consume extra context. + @param [out] first_embedding_t First 't' value of the embedding. CAUTION: + this is in the original frame numbering (the one we use + for the bottom nnet), and will be a multiple of + 'bottom_subsampling_factor'. You need to divide by + 'bottom_subsampling_factor' to get the 't' value used + at the input of the top nnet. + @param [out] num_embedding_frames The number of embedding frames that + we are computing. + @return Returns true if it could successfully compute the output, + and false if it could not because of insufficient input + context. + */ +bool ComputeEmbeddingTimes(int32 first_input_t, + int32 num_input_frames, + int32 num_output_frames, + int32 frame_subsampling_factor, + int32 bottom_subsampling_factor, + int32 bottom_left_context, + int32 bottom_right_context, + int32 top_left_context, + int32 top_right_context, + bool keep_embedding_context, + int32 *first_embedding_t, + int32 *num_embedding_frames); + + +/** + This function parses a string value from a 'url-like' string (which is probably actually + a key value from an scp file). The general format this function handles is: + iiiiiiiiiiiiiiiiiii?aaa=xxxx&bbb=yyyy + where the only 'special characters' are '?' and '&'. This is modeled after a query + string in HTML. This function searches for a key name with the value 'key_name', + (e.g. 'aaa' or 'bbb' in the example), and if it exists, sets `value` to that value + (e.g. 'xxxx' or 'yyyy' in the example. If the string `string` has no '?' in it, + or the key name `key_name` is not present, this function returns false; otherwise, + it returns true and sets `value` to that value. + +*/ +bool ParseFromQueryString(const std::string &string, + const std::string &key_name, + std::string *value); + + +// This overloaded version of ParseFromQueryString()is for where a float value +// is required. If the key is present but cannot be turned into a float, it +// will raise an error. 
+bool ParseFromQueryString(const std::string &string, + const std::string &key, + BaseFloat *f); + + + +} // namespace nnet3 +} // namespace kaldi + +#endif // KALDI_NNET3_NNET_CHAINA_UTILS_H_ diff --git a/src/nnet3a/notes.update b/src/nnet3a/notes.update new file mode 100644 index 00000000000..c01aa208d50 --- /dev/null +++ b/src/nnet3a/notes.update @@ -0,0 +1,392 @@ +=== + + Meta-info for dumping egs: + only really need tree,trans_mdl,normalization.fst,den.fst + + +=== + + +Things needed per language in order to dump raw egs: + + Configuration values: + - left and right acoustic context + - frames_per_eg, frames_overlap_per_eg, frame_subsampling_factor, + alignment_subsampling_factor, constrained, compress, left_tolerance, + right_tolerance, lattice_lm_scale, lattice_prune_beam, acwt + + (Also: left_context_initial,right_context_initial... although + the use of these will make it harder to deal with setups with + little data per speaker). + + - tree, tree.map, 0.trans_mdl, normalization.fst (and probably den.fst + so we can save it in the egs dir). + + - Format of raw egs dir (we'll likely delete this right after creation): + + info.txt: + + dir_type raw_chaina_egs + num_chunks 120000 + num_leaves 6543 + frames_per_chunk 140,110,100 + num_input_frames_tot 432143218 + left_context 10 + left_context_initial 10 + right_context 10 + right_context_initial 10 + + ... various configuration values here... + ... need utt2spk and utt2uniq file ... + + # note: tree.map is optional, since the egs don't depend on it, but it will generally + # be generated with the tree. + misc/{tree,tree.map,0.trans_mdl,normalization.fst,den.fst} + + egs.scp will contain encodings like: + -----v1 + + - Format of merged-egs dir + + dir_type merged_chaina_egs + chunks_per_spk 4 + .. otherwise like raw one. misc/ directory contains similar things. + + - Format of final-egs dir (might be merged). + info.txt: + dir_type final_chain_egs + langs english french + num_input_frames_tot 432143218 + num_scp_files 24 + frames_per_scp_file 143241 + chunks_per_spk 4 + num_chunks xxxx + + den_fsts/ -> lang.fst + norm_fsts/ -> lang.fst + trees/lang.tree, ?lang.tree.map + trans_models/lang.trans_mdl + + + - Format of chain-training-input dir: + + - Two purposes: as input to the model training, and (if single language) as input for getting the egs? + + Need: + - the input models (bottom and top-per-language), the input .ada objects + - The trees per language? + - Options and the like + - List of languages + - Left and right context required for egs + - extra left/right context??? + + +================= + + + + - contains egs.N.{scp,ark}, which might be links to files in the storage dir. + + egs.scp + +The + + + + + +-- Extend nnet3-chain-copy-egs, to supply at least a minimum context in input features by + duplicating frames as needed. E.g. + --extend-left-context=12 --extend-right-context=10 + +============= +Plans for binaries. + + nnet3-adapt --init|--copy|--adapt + +================ + + steps/chaina/init_den_fst.sh + make den.fst, normalization.fst + + # Maybe just use nnet3-init in the scripts, to initialize the nnets, and + # copy them where they are needed. + + + # What's needed in a chain dir? + 0/bottom.raw,lang.mdl,lang.ada + + + steps/chaina/init_chain_dir.sh + make den.fst, normalization.fst, + bottom.config, top.config, + bottom.raw, top.raw + + init.config, init.raw, 0.trans_mdl, + final.config (but not 0.raw yet, might need egs first). + + +============ + +nnet3-get-egs? + ... 
Make sure the length info and left/right context of each eg is included in the id? + - when we merge, + + steps/chaina/get_raw_egs.sh + + -- need to decide utts-per-spk-max in validation data? do it in process_egs. + + + ... takes options like --utts-per-spk-max --num-utts-subset --frames-per-job + (prev. frames-per-iter), --chunks-per-group (e.g. 4) + + steps/chaina/process_egs.sh [options] + + [shouldn't need any info not already in raw_egs dir, I hope. We'll later have a + multilingual version of this script]. + + steps/chaina/process_egs.sh [options] + + +======== + Monolingual case (training): + + README.txt + bottom.raw default.ada default.mdl default.den + info -> mfcc.config?? Or other config? + info.txt? + frame_subsampling_factor1 + frame_subsampling_factor2 + frame_subsampling_factor +.. we'll need to pass in chain opts such as: + +[for chain objective] + --leaky-hmm-coefficient +[for the neural nets]: + --max-param-change-{bottom,top} + --print-interval + --l2-regularize-factor (use same one). + --train-bottom-nnet {true,false} + +==== + nnet3-copy-egs: maybe introduce an option to extend context? + +=== + +prepare_egs.sh... + - merging into speaker groups. done by python script. Originally we'll dump with: + + utterance-id-{num_frames_out}-{frame_subsampling_factor}-{left_context}-{right_context} + + - so the number of input frames would be + ((num_frames_out - 1) * frame_subsampling_factor) + 1 + left_context + right_context + + + utterance-id-{num_frames_out}-{frame_subsampling_factor}-{left_context}-{right_context} + +=== + +nnet3-chain-merge-egs --keep-distinct + + + ?aaa=xxxx&bbb=yyyy + + + + +``` +This copies nnet3+chain training examples from input to output, merging them +into composite examples. The --minibatch-size option controls how many egs +are merged into a single output eg. + +Usage: nnet3-chain-merge-egs [options] +e.g. +nnet3-chain-merge-egs --minibatch-size=128 ark:1.cegs ark:- | nnet3-chain-train-simple ... +See also nnet3-chain-copy-egs +``` + + + + BUT, we don't want to do this on minibatches + + +==== + - Merging egs: will already have merged into speaker groups in prepare_egs. + - Output names? output --> output-xent. + - Input names? Just input. (May add ivector later but I hope not to have to). + - Could modify nnet3-merge-egs to parse the keys and get weights and output + names (to keep the output names distinct and to incorporate the weights). + + -- Initially, in nnet3*get-egs, we'll dump with: + + utterance-id-{num_frames}-{left_context}-{right_context} + + We'll use that info, together with the speaker-id and utt2uniq information, to + merge chunks together into groups (preferably by utterance; if not, by speaker) + in process_egs.sh (the choice of which egs to merge will be done in python). + +====== + +====== + later, when + + The merging script will decide the key for the merged egs. + + process_egs.sh will dump these as archives *and* scp files, but they will now + be in groups of chunks_per_spk (e.g. 4). The language name will be added as the + last-but-two field in the key; we'll set it to 'default' by default, but it may + be changed in merge_egs.sh. The last two fields will be (1) a weight to be incorporated + just before the final merge (by nnet3-chain-merge-egs with the --interpret-keys + option), and (2) a weight to propagate back to the bottom network (if you want a + particular language to have less of an effect on the bottom network). 
+ + So the keys at the input to the final merge will be of the form: + {language-name}-{egs-weight}-{bottom-nnet-weight} + + And the keys at the output of the final merge would be of the form: + {language-name}-{bottom-nnet-weight}-0-0 + The 'egs-weight' (which becomes weight in the chain supervision objects, + which is a scale on the objective function) will already have been set + in the ChainSupervision object. + The 0 and 0 becom + + + + + info/chunks_per_spk + + We may also have a combine_egs.sh script which can combine egs from multiple + sources (assuming they have the same chunks_per_spk), and can assign them + to different language names if needed. + +==== + + Merging already-merged chain egs + + This is something that I am going to need for the new adaptation framework I am + working on. Currently in nnet-example-utils.cc and nnet-chain-example.cc, the + example-merging code does not support merging already-merged egs (search for already-merged). + This is something that I'm going to need to be supported at least in NnetChainExample, and + this would also need to be supported, I think, in the NnetExample merging code, since + I think the chain example merging code supports that code. If it would be helpful in + implementation, you may assume that all the egs to be merged have the same number + of 'n' values (e.g. it might be 4; it's the number of chunks per speaker that we use + for adaptation). + + After the examples have been merged I'd like a variable as follows to be set in + the NnetChainSupervision object: +``` + // This will be 1 in normal cases, but in the 'chaina' code (chain training + // with adaptation) it will be set to the number of chunks per speaker in + // this minibatch. For example if it's 4, then we are asserting that + // sequences n=0 through 3 all come from the same speaker, n=4 through 7 + // all come from the same speaker, and so on. + int32 chunks_per_spk; +``` +Please make sure this is 1 by default (e.g. in the constructor), that the +on-disk format stays the same when it's 1 (e.g. only write it if it's not 1) to +minimize code-version compatibility headaches; and only set it to +a value other than 1 when merging chain supervision objects that were +already merged (you can check that the sizes of the things being merged match). +We may later introduce such a variable in the NnetSupervision object, but +it's not needed just yet. + +This PR can go to my svd_draft branch in my personal repo, as it's part of +that project. +==== + +Interpreting keys when merging nnet and chain examples + +This is a change that will need to be made to nnet3-chain-merge-egs binary to support +the new adaptation framework. @hhadian, again, please get to this when you can but +it is not urgent at all. If someone else feels like they want to do it that's OK +with me too as long as you don't just sit on it without making progress, but please +have @hhadian check the code. +In ExampleMergingConfig, please add a new boolean config value, default false, registered +as follows: + + po->Register("interpret-keys", &interpret_keys, "If true, require the keys " + "on the example to end in something of the form -xxxxxx-yyy " + "where xxxxxx is a string with only letters, numbers and _, " + "which will be interpreted as a language-name (e.g. \"default\"," + "\"english\", \"french\"), and yyy is a floating point weight " + "e.g. 1.0, to be applied to the example. 
If the weight is not " + "1.0, then any NnetIo objects with names matching \"output\" and any chain " + "supervision objects will have their weights multiplied by this " + "weight. In addition, the merging will keep distinct language-names " + "distinct, and will ensure that the output keys end in -xxxxxx " + "where xxxxxx is the language-name. This is intended to support " + "the \"chaina\" adaptation framework.") + +and please make any implementation changes required to support it. When +weighting chain supervision objects, just multiply the 'weight' field in the +ChainSupervision object. I think when weighting NnetIo objects you can just +scale the GeneralMatrix, although I'm not sure if there is a generic way to do +that. (This probably only really makes sense with sparse supervision intended +to represent posterior probabilities in xent setups). Do this before merging; I +believe the chain merging code already checks for weight equality but you'll +have to also make sure it checks for network-name equality and encodes the +network name in the output key. I believe the output keys are currently not +really inspected so back compatibility won't be important. Also please make +sure there is a convenience function that makes it easy to extract the "xxxxxx" +network-name suffix from a chain example key; this will be needed in the +training code. + + + + +==== + + + info needed + ?den.fst? + + frame_subsampling_factor1 + frame_subsampling_factor2 + frame_subsampling_factor = their product. + + + separately: different den.fst's? one den.fst? +==== + Multilingual case (training): + + bottom.raw english.ada english.mdl <-- output vs. output_libri, output_wsj. No, will be too complicated (?) + ... just support one name. + spanish.ada spanish.mdl + +0.ada top.mdl + + +when randomizing + +we'll merge in a controlled way, e.g. nnet3-merge-egs --fixed +=== + + --bottom-subsampling-factor is the subsampling in the bottom + model (the feature extractor). frame-subsampling-factor + divided by this is the amount of subsampling in the top + model. In the training code we'll work this out from + the 't' values in the chain supervision object, and + the top network will actually run at the reduced frame + rate. + --keep-embedding-context is true if the top network is + recurrent and therefore we need to keep as much extra + context as possible in the features. + +How to work out the computations: + + We get the number of n values and the first and last 't' values in the input; + check they are contiguous. + + We get the number of 't' values in the output (the chain supervision) and + their spacing; this is interpreted as the frame-subsampling-factor, which is + not passed directly to nnet3-chaina-train. + + We are given the --bottom-subsampling-factor and (boolean) + top_network_is_recurrent. + + We work out the left-context and right-context of the bottom and + top networks. We first use this on the top network to work out, at the + top-network frame rate, the 't' values needed at the input + (e.g. frames -10 through 159 assuming the chunk size is 150 and + the network takes +-10 frames of context). 
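+
+  Worked example of the context computation (illustrative numbers only, not
+  taken from any real setup): suppose frame_subsampling_factor =
+  bottom_subsampling_factor = 3 (so the top network runs at the embedding frame
+  rate), the chunk has 150 output frames, and the top network needs +-10 frames
+  of context at that rate (the "-10 through 159" example above); also assume,
+  purely for illustration, that the bottom network needs +-12 frames of context
+  at the input frame rate.  Then the egs must supply input 't' values -42
+  through 489, and ComputeEmbeddingTimes() (declared in
+  nnet3a/nnet-chaina-utils.h) gives embeddings at t = -30, -27, ..., 477:
+
+```
+// Sketch only; the numbers are made up to match the worked example above.
+#include "base/kaldi-common.h"
+#include "nnet3a/nnet-chaina-utils.h"
+
+int main() {
+  using namespace kaldi;
+  using namespace kaldi::nnet3;
+  int32 first_embedding_t, num_embedding_frames;
+  bool ok = ComputeEmbeddingTimes(
+      /*first_input_t*/ -42, /*num_input_frames*/ 532,
+      /*num_output_frames*/ 150, /*frame_subsampling_factor*/ 3,
+      /*bottom_subsampling_factor*/ 3,
+      /*bottom_left_context*/ 12, /*bottom_right_context*/ 12,
+      /*top_left_context*/ 10, /*top_right_context*/ 10,
+      /*keep_embedding_context*/ false,
+      &first_embedding_t, &num_embedding_frames);
+  // The required embedding range at the subsampled rate is [-10, 159], i.e.
+  // t = -30 ... 477 at the input frame rate, in steps of 3 (170 frames).
+  KALDI_ASSERT(ok && first_embedding_t == -30 && num_embedding_frames == 170);
+  return 0;
+}
+```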
diff --git a/src/nnet3abin/Makefile b/src/nnet3abin/Makefile new file mode 100644 index 00000000000..224c45a5bcd --- /dev/null +++ b/src/nnet3abin/Makefile @@ -0,0 +1,26 @@ + +all: +EXTRA_CXXFLAGS = -Wno-sign-compare +include ../kaldi.mk + +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + +BINFILES = nnet3-adapt nnet3-chaina-train nnet3-chaina-combine + +OBJFILES = + +# Add this dependency to force cuda-compiled.o to be rebuilt when we reconfigure. +cuda-compiled.o: ../kaldi.mk + +TESTFILES = + +ADDLIBS = ../nnet3a/kaldi-nnet3a.a ../adapt/kaldi-adapt.a ../nnet3/kaldi-nnet3.a \ + ../chain/kaldi-chain.a \ + ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \ + ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ + ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a + +include ../makefiles/default_rules.mk diff --git a/src/nnet3abin/nnet3-adapt.cc b/src/nnet3abin/nnet3-adapt.cc new file mode 100644 index 00000000000..8bd6570bf6f --- /dev/null +++ b/src/nnet3abin/nnet3-adapt.cc @@ -0,0 +1,265 @@ +// nnet3abin/nnet3-adapt.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3/nnet-nnet.h" +#include "hmm/transition-model.h" +#include "adapt/differentiable-transform-itf.h" +#include "nnet3a/nnet-chaina-training.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + using namespace kaldi::differentiable_transform; + typedef kaldi::int32 int32; + + const char *usage = + "This binary supports various modes that manipulate transform objects for\n" + "the nnet3a/chaina adaptation framework. See patterns below\n" + "\n" + "Usage: nnet3-adapt [options] init [] \n" + "(e.g.: nnet3-adapt --num-classes=201 init init.aconfig 0.ada)\n" + " or: nnet3-adapt init init.aconfig tree.map 0.ada\n" + " or: nnet3-adapt [options] copy \n" + "(e.g.: nnet3-adapt copy --binary=false 0.ada 0.txt)\n" + " or: nnet3-adapt info \n" + "(e.g.: nnet3-adapt info 0.ada\n" + " or: nnet3-adapt estimate ... \n" + " .. which sums stats and calls Estimate(), to get the final class-dependent means... \n" + "(e.g. nnet3-adapt estimate foo/final/default.{1,2,3,4,5,6}.ada foo/final/default.ada\n" + " or: nnet3-adapt [options] get-transforms \n" + " ... 
which estimates and dumps speaker-specific transforms as matrices, which\n" + " could be applied to the features with transform-feats; if you want\n" + " utterance-specific transforms, make spk2utt a one-to-one map.\n" + " is a wspecifier where matrices will be written.\n" + "(e.g.: nnet3-adapt final.ada spk2utt ark:- ark:feats.scp ark:1.trans)\n" + "\n" + "See also: nnet3-chaina-train\n"; + + bool binary_write = true; + bool remove_pdf_map = false; + int32 num_classes = -1; + int32 iter = 0; + int32 frame_subsampling_factor = 1; + + ParseOptions po(usage); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("num-classes", &num_classes, + "For 'init' command: number of classes the transform will " + "use (required if is not supplied)."); + po.Register("remove-pdf-map", &remove_pdf_map, + "For the 'copy' command: if true, the pdf_map will be " + "removed so that the transform will be based on " + "pdf-ids."); + po.Register("iter", &iter, "Only for the 'estimate' command: iteration " + "of estimation, will always be 0 in most setups."); + po.Register("frame-subsampling-factor", &frame_subsampling_factor, + "Factor by which the posteriors we read are subsampled relative " + "to the features (only for the get-transforms command). " + "Will correspond to the top-subsampling-factor," + "which, in chaina scripts, refers to frame_subsampling_factor " + "divided by bottom_subsampling_factor"); + + po.Read(argc, argv); + + + if (po.GetOptArg(1) == "init" && po.NumArgs() == 3) { + // This block does the "init" command where the tree.map was not provided. + if (num_classes <= 0) + KALDI_ERR << "The --num-classes option is required with the " + "'init' command."; + std::string config_rxfilename = po.GetArg(2), + transform_wxfilename = po.GetArg(3); + bool binary_in; // should be false. + Input ki(config_rxfilename, &binary_in); + DifferentiableTransformMapped transform; + + transform.transform = DifferentiableTransform::ReadFromConfig( + ki.Stream(), num_classes); + + WriteKaldiObject(transform, transform_wxfilename, binary_write); + return 0; + } else if (po.GetOptArg(1) == "init" && po.NumArgs() == 4) { + // This block does the "init" command where the tree.map was provided. + std::string config_rxfilename = po.GetArg(2), + tree_map_rxfilename = po.GetArg(3), + transform_wxfilename = po.GetArg(4); + + DifferentiableTransformMapped transform; + { // This block reads transform.pdf_map and sets up num_classes. + bool binary_in; + Input ki(tree_map_rxfilename, &binary_in); + ReadIntegerVector(ki.Stream(), binary_in, &(transform.pdf_map)); + if (transform.pdf_map.empty()) + KALDI_ERR << "Expected to be nonempty vector."; + int32 expected_num_classes = + 1 + *std::max_element(transform.pdf_map.begin(), + transform.pdf_map.end()); + if (num_classes > 0 && num_classes != expected_num_classes) + KALDI_ERR << "The --num-classes given via the option " << num_classes + << " differs from the expected value given the tree-map: " + << expected_num_classes; + num_classes = expected_num_classes; + } + + bool binary_in; // should be false. 
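+      // (the .aconfig file is a text config, so no binary header is expected;
+      // ReadFromConfig() below parses the text stream.)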
+ Input ki(config_rxfilename, &binary_in); + transform.transform = DifferentiableTransform::ReadFromConfig( + ki.Stream(), num_classes); + WriteKaldiObject(transform, transform_wxfilename, binary_write); + return 0; + } else if (po.GetOptArg(1) == "info" && po.NumArgs() == 2) { + std::string transform_rxfilename = po.GetArg(2); + DifferentiableTransformMapped transform; + ReadKaldiObject(transform_rxfilename, &transform); + std::cout << transform.Info(); + return 0; + } else if (po.GetOptArg(1) == "copy" && po.NumArgs() == 3) { + std::string transform_rxfilename = po.GetArg(2), + transform_wxfilename = po.GetArg(3); + DifferentiableTransformMapped transform; + ReadKaldiObject(transform_rxfilename, &transform); + if (remove_pdf_map) { + if (transform.pdf_map.empty()) { + KALDI_WARN << "--remove-pdf-map option: transform does not have a pdf-map."; + } else { + transform.transform->SetNumClasses(transform.pdf_map.size()); + transform.pdf_map.clear(); + } + } + WriteKaldiObject(transform, transform_wxfilename, binary_write); + return 0; + } else if (po.GetOptArg(1) == "estimate" && po.NumArgs() >= 3) { + DifferentiableTransformMapped transform; + std::string transform_rxfilename = po.GetArg(2); + ReadKaldiObject(transform_rxfilename, &transform); + for (int32 i = 3; i < po.NumArgs(); i++) { + std::string other_transform_rxfilename = po.GetArg(i); + DifferentiableTransformMapped other_transform; + ReadKaldiObject(other_transform_rxfilename, &other_transform); + // sum the stats. + transform.transform->Add(*(other_transform.transform)); + } + transform.transform->Estimate(iter); + std::string transform_wxfilename = po.GetArg(po.NumArgs()); + WriteKaldiObject(transform, transform_wxfilename, binary_write); + return 0; + } else if (po.GetOptArg(1) == "get-transforms" && po.NumArgs() == 6) { + std::string transform_rxfilename = po.GetArg(2), + spk2utt_rspecifier = po.GetArg(3), + feats_rspecifier = po.GetArg(4), + post_rspecifier = po.GetArg(5), + transforms_wspecifier = po.GetArg(6); + + DifferentiableTransformMapped transform; + ReadKaldiObject(transform_rxfilename, &transform); + SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); + RandomAccessPosteriorReader post_reader(post_rspecifier); + RandomAccessBaseFloatMatrixReader feature_reader(feats_rspecifier); + BaseFloatMatrixWriter transform_writer(transforms_wspecifier); + int32 num_done = 0, num_no_post = 0, num_other_error = 0; + + for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { + std::unique_ptr stats( + transform.transform->GetEmptySpeakerStats()); + std::string spk = spk2utt_reader.Key(); + bool got_stats = false; + const std::vector &uttlist = spk2utt_reader.Value(); + for (size_t i = 0; i < uttlist.size(); i++) { + std::string utt = uttlist[i]; + if (!feature_reader.HasKey(utt)) { + KALDI_WARN << "Did not find features for utterance " << utt; + num_other_error++; + continue; + } + if (!post_reader.HasKey(utt)) { + KALDI_WARN << "Did not find posteriors for utterance " << utt; + num_no_post++; + continue; + } + const Matrix &feats = feature_reader.Value(utt); + const Posterior &post_in = post_reader.Value(utt); + Posterior post_upsampled(feats.NumRows()); + const Posterior *post_to_use = NULL; + if (frame_subsampling_factor != 1 || !transform.pdf_map.empty()) { + ConvertPosterior( + post_in, 1, 0, frame_subsampling_factor, transform.pdf_map, + transform.transform->NumClasses(), &post_upsampled); + post_to_use = &post_upsampled; + } else { + KALDI_ASSERT(post_in.size() == size_t(feats.NumRows()) && + "Mismatch in 
posterior vs. feats dimension"); + post_to_use = &post_in; + } + transform.transform->TestingAccumulate(feats, *post_to_use, stats.get()); + got_stats = true; + num_done++; + } + if (!got_stats) { + KALDI_WARN << "Got no stats for speaker " << spk; + } else { + stats->Estimate(); + int32 dim = transform.transform->Dim(); + Matrix transform_mat(dim, dim + 1, kUndefined); + transform.transform->GetTransformAsMatrix(*stats, &transform_mat); + transform_writer.Write(spk, transform_mat); + } + } + KALDI_LOG << "Done " << num_done << " files, " << num_no_post + << " with no posts, " << num_other_error << " with other errors."; + return (num_done != 0 && num_done > (num_no_post + num_other_error)) ? 0 : 1; + } else { + po.PrintUsage(); + exit(1); + } + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + +/* +Test script: + +cat < >& keys_and_egs, + NnetChainaModels *models) { + KALDI_ASSERT(!opts.top.train && !opts.bottom.train); + NnetChainaTrainer trainer(opts, models); + size_t num_egs = keys_and_egs.size(); + for (size_t i = 0; i < num_egs; i++) { + trainer.Train(keys_and_egs[i].first, keys_and_egs[i].second); + } + BaseFloat weight, adapted_objf, unadapted_objf; + adapted_objf = trainer.GetTotalObjf(true, &weight); + adapted_objf /= weight; + unadapted_objf = trainer.GetTotalObjf(false, &weight); + unadapted_objf /= weight; + BaseFloat ans = unadapted_objf_weight * unadapted_objf + + (1.0 - unadapted_objf_weight) * adapted_objf; + KALDI_LOG << "When averaging " << num_models_averaged + << " models, objf values (unadapted/si,adapted) " + << unadapted_objf << ", " << adapted_objf + << ", interpolated = " << ans << "; over " + << weight << " frames."; + return ans; +} + +void ReadExamples( + const std::string &egs_rspecifier, + std::vector > *keys_and_egs) { + keys_and_egs->reserve(10000); // reserve a lot of space to minimize the chance of + // reallocation. + SequentialNnetChainExampleReader example_reader(egs_rspecifier); + for (; !example_reader.Done(); example_reader.Next()) { + size_t i = keys_and_egs->size(); + keys_and_egs->resize(i + 1); + keys_and_egs->back().first = example_reader.Key(); + keys_and_egs->back().second.Swap(&(example_reader.Value())); + } + KALDI_LOG << "Read " << keys_and_egs->size() << " examples."; + KALDI_ASSERT(!keys_and_egs->empty()); +} + + +} // namespace nnet3 +} // namespace kaldi + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + using namespace kaldi::chain; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "This program does the final model-combination stage of 'chaina'\n" + "acoustic training: it averages over the last n models, where the\n" + "'n' is chosen (by this program) based on maximizing the objective\n" + "function on the data given to it. It maximizes the average of the\n" + "speaker-independent and speaker-dependent versions of the 'chain'\n" + "objective values.\n" + "This program is intended to be used with a GPU.\n" + "\n" + "Usage: nnet3-chaina-combine [options] ... \\\n" + " \n" + "\n" + " should contain bottom.raw, and .mdl for each language \n" + " (these will be averaged over a range of indexes including N, e.g. 
just modelN, or\n" + " modelN with model(N-1), and so on).\n" + " should contain .den.fst for each language \n" + " should contain .ada for each language \n" + " is a place to where bottom.mdl and .mdl for each language\n" + " that was seen in the egs, will be written (for , see the --job-id option).\n"; + + + int32 srand_seed = 0; + bool binary_write = true; + std::string use_gpu = "yes"; + NnetChainaTrainingOptions chaina_opts; + chaina_opts.top.train = false; + chaina_opts.bottom.train = false; + chaina_opts.top.dropout_test_mode = true; + chaina_opts.bottom.dropout_test_mode = true; + // But leave the batchnorm test-modes at false. + + // Setting batchnorm_stats_scale to 1.0 means it won't scale down the + // batchnorm stats as it goes (the default is 0.8), so they will all be + // remembered. Note: each time we initialize and use the trainer object, in + // GetObjectiveFunction, it will call ZeroComponentStats() for both the + // bottom and top models (assuming the options are the defaults), so only + // the stats from the most recent run will be present. + chaina_opts.nnet_config.batchnorm_stats_scale = 1.0; + + BaseFloat unadapted_objf_weight = 0.5; + + ParseOptions po(usage); + po.Register("srand", &srand_seed, "Seed for random number generator "); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + po.Register("unadapted-weight", &unadapted_objf_weight, + "The weight we give to the unadapted version of the objective function " + "when evaluating the goodness of models (the adapted objective gets " + "1 minus this value as its weight)"); + + + chaina_opts.Register(&po); + RegisterCuAllocatorOptions(&po); + + po.Read(argc, argv); + + srand(srand_seed); + + if (po.NumArgs() < 5) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + int32 n = po.NumArgs() - 4; // n is the number of models we have + // available to average. + + std::string last_model_in_dir = po.GetArg(n), + den_fst_dir = po.GetArg(n + 1), + transform_dir = po.GetArg(n + 2), + egs_rspecifier = po.GetArg(n + 3), + model_out_dir = po.GetOptArg(n + 4); + + NnetChainaModels models(chaina_opts, + last_model_in_dir, den_fst_dir, + transform_dir); + + + std::vector > keys_and_egs; + ReadExamples(egs_rspecifier, &keys_and_egs); + + // first evaluates the objective using the last model. 
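+    // Each subsequent pass folds one earlier model in with weight 1/num_models,
+    // so after k passes 'models' holds the simple average of the last k models
+    // (this assumes InterpolateWith(1/k, ...) forms
+    //  (1 - 1/k) * current + (1/k) * newly-read models).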
+ int32 best_num_to_combine = -1; + BaseFloat best_objf = -std::numeric_limits::infinity(), + single_model_objf; + + std::unique_ptr best_models; + + for (int32 num_models = 1; num_models <= n; num_models++) { + if (num_models > 1) + models.InterpolateWith(1.0 / num_models, po.GetArg(n + 1 - num_models)); + BaseFloat objf = GetObjectiveFunction(chaina_opts, unadapted_objf_weight, + num_models, keys_and_egs, &models); + if (objf > best_objf || num_models == 1) { + best_objf = objf; + best_models = std::unique_ptr( + new NnetChainaModels(models)); + best_num_to_combine = num_models; + if (num_models == 1) + single_model_objf = objf; + } + if (num_models > best_num_to_combine + 4 && num_models < n) + KALDI_LOG << "Stopping the search early as it looks like we found " + "the best combination"; + } + + KALDI_LOG << "Best objective function was " << best_objf << " with " + << best_num_to_combine << " models."; + KALDI_LOG << "About to recompute objective function with batchnorm in " + "test-mode:\n"; + chaina_opts.top.batchnorm_test_mode = true; + chaina_opts.bottom.batchnorm_test_mode = true; + + BaseFloat test_mode_objf = + GetObjectiveFunction(chaina_opts, unadapted_objf_weight, + best_num_to_combine, + keys_and_egs, + best_models.get()); + KALDI_LOG << "Objf with test-mode batchnorm was " << test_mode_objf + << " (vs. " << best_objf << " without test mode)"; + + KALDI_LOG << "Combination changed the objective from " + << single_model_objf << " with only the final model, to " + << best_objf << " with " << best_num_to_combine + << " models."; + + best_models->WriteCombinedModels(model_out_dir, binary_write); + +#if HAVE_CUDA==1 + CuDevice::Instantiate().PrintProfile(); +#endif + return 0; + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/nnet3abin/nnet3-chaina-train.cc b/src/nnet3abin/nnet3-chaina-train.cc new file mode 100644 index 00000000000..f6f98b6ffd3 --- /dev/null +++ b/src/nnet3abin/nnet3-chaina-train.cc @@ -0,0 +1,116 @@ +// nnet3bin/nnet3-chaina-train.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3a/nnet-chaina-training.h" +#include "cudamatrix/cu-allocator.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + using namespace kaldi::chain; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Train nnet3+chaina (i.e. chain + adaptation framework) neural network.\n" + "Minibatches are to be created by nnet3-chain-merge-egs in\n" + "the input pipeline. 
This training program is single-threaded (best to\n" + "use it with a GPU).\n" + "\n" + "Usage: nnet3-chaina-train [options] \n" + " []\n" + "\n" + " should contain bottom.raw, and .mdl for each language \n" + " should contain .den.fst for each language \n" + " should contain .ada for each language \n" + " is a place to where bottom..raw and ..raw for each language\n" + " that was seen in the egs, will be written (for , see the --job-id option).\n" + " If it is not specified, the trained models will not be written (e.g. when you are using\n" + " --bottom-model-test-mode=true --top-model-test-mode=true and only want diagnostics).\n"; + + + int32 srand_seed = 0; + bool binary_write = true; + std::string use_gpu = "yes"; + NnetChainaTrainingOptions chaina_opts; + int32 job_id = 0; + + ParseOptions po(usage); + po.Register("srand", &srand_seed, "Seed for random number generator "); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + po.Register("job-id", &job_id, + "Job identifier, helps to determine pathnames of models written " + "to ."); + + chaina_opts.Register(&po); + RegisterCuAllocatorOptions(&po); + + po.Read(argc, argv); + + srand(srand_seed); + + if (po.NumArgs() < 4 || po.NumArgs() > 5) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + bool ok; + + std::string model_in_dir = po.GetArg(1), + den_fst_dir = po.GetArg(2), + transform_dir = po.GetArg(3), + egs_rspecifier = po.GetArg(4), + model_out_dir = po.GetOptArg(5); + + NnetChainaModels models(chaina_opts, + model_in_dir, den_fst_dir, transform_dir); + + { + NnetChainaTrainer trainer(chaina_opts, &models); + + SequentialNnetChainExampleReader example_reader(egs_rspecifier); + + for (; !example_reader.Done(); example_reader.Next()) + trainer.Train(example_reader.Key(), + example_reader.Value()); + + ok = trainer.PrintTotalStats(); + } + if (po.NumArgs() == 5) + models.Write(model_out_dir, binary_write, job_id); + +#if HAVE_CUDA==1 + CuDevice::Instantiate().PrintProfile(); +#endif + return (ok ? 
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/transform/Makefile b/src/transform/Makefile index a265db6ac37..194f362f11a 100644 --- a/src/transform/Makefile +++ b/src/transform/Makefile @@ -15,6 +15,6 @@ OBJFILES = regression-tree.o regtree-mllr-diag-gmm.o lda-estimate.o \ LIBNAME = kaldi-transform ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/util/text-utils-test.cc b/src/util/text-utils-test.cc index 5bfe4cb24d0..3b58f4f1dd1 100644 --- a/src/util/text-utils-test.cc +++ b/src/util/text-utils-test.cc @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation // 2017 Johns Hopkins University (author: Daniel Povey) +// 2015 Vimal Manohar (Johns Hopkins University) // See ../../COPYING for clarification regarding multiple authors // @@ -324,6 +325,193 @@ void TestStringsApproxEqual() { KALDI_ASSERT(!StringsApproxEqual("x 1.0 y", "x 1.0001 y", 4)); } +void UnitTestConfigLineParse() { + std::string str; + { + ConfigLine cfl; + str = "a-b xx=yyy foo=bar baz=123 ba=1:2"; + bool status = cfl.ParseLine(str); + KALDI_ASSERT(status && cfl.FirstToken() == "a-b"); + + KALDI_ASSERT(cfl.HasUnusedValues()); + std::string str_value; + KALDI_ASSERT(cfl.GetValue("xx", &str_value)); + KALDI_ASSERT(str_value == "yyy"); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(cfl.GetValue("foo", &str_value)); + KALDI_ASSERT(str_value == "bar"); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(!cfl.GetValue("xy", &str_value)); + KALDI_ASSERT(cfl.GetValue("baz", &str_value)); + KALDI_ASSERT(str_value == "123"); + + std::vector int_values; + KALDI_ASSERT(!cfl.GetValue("xx", &int_values)); + KALDI_ASSERT(cfl.GetValue("baz", &int_values)); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(int_values.size() == 1 && int_values[0] == 123); + KALDI_ASSERT(cfl.GetValue("ba", &int_values)); + KALDI_ASSERT(int_values.size() == 2 && int_values[0] == 1 && int_values[1] == 2); + KALDI_ASSERT(!cfl.HasUnusedValues()); + } + + { + ConfigLine cfl; + str = "a-b baz=x y z pp = qq ab =cd ac= bd"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "a-b baz=x y z pp = qq ab=cd ac=bd"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "foo-bar"; + KALDI_ASSERT(cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "foo-bar a=b c d f=g"; + std::string value; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "foo-bar" && + cfl.GetValue("a", &value) && value == "b c d" && + cfl.GetValue("f", &value) && value == "g" && + !cfl.HasUnusedValues()); + } + { + ConfigLine cfl; + str = "zzz a=b baz"; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "zzz" && + cfl.UnusedValues() == "a=b baz"); + } + { + ConfigLine cfl; + str = "xxx a=b baz "; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.UnusedValues() == "a=b baz"); + } + { + ConfigLine cfl; + str = "xxx a=b =c"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "xxx baz='x y z' pp=qq ab=cd ac=bd"; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "xxx"); + std::string str_value; + KALDI_ASSERT(cfl.GetValue("baz", &str_value)); + KALDI_ASSERT(str_value == "x y z"); + KALDI_ASSERT(cfl.GetValue("pp", &str_value)); + KALDI_ASSERT(str_value == "qq"); + KALDI_ASSERT(cfl.UnusedValues() == "ab=cd ac=bd"); + KALDI_ASSERT(cfl.GetValue("ab", &str_value)); + 
KALDI_ASSERT(str_value == "cd"); + KALDI_ASSERT(cfl.UnusedValues() == "ac=bd"); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(cfl.GetValue("ac", &str_value)); + KALDI_ASSERT(str_value == "bd"); + KALDI_ASSERT(!cfl.HasUnusedValues()); + } + + { + ConfigLine cfl; + str = "x baz= pp = qq flag=t "; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = " x baz= pp=qq flag=t "; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "x"); + + std::string str_value; + KALDI_ASSERT(cfl.GetValue("baz", &str_value)); + KALDI_ASSERT(str_value == ""); + KALDI_ASSERT(cfl.GetValue("pp", &str_value)); + KALDI_ASSERT(str_value == "qq"); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(cfl.GetValue("flag", &str_value)); + KALDI_ASSERT(str_value == "t"); + KALDI_ASSERT(!cfl.HasUnusedValues()); + + bool bool_value = false; + KALDI_ASSERT(cfl.GetValue("flag", &bool_value)); + KALDI_ASSERT(bool_value); + } + + { + ConfigLine cfl; + str = "xx _baz=a -pp=qq"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "xx 0baz=a pp=qq"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "xx -baz=a pp=qq"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "xx _baz'=a pp=qq"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = " baz=g"; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == ""); + bool flag; + KALDI_ASSERT(!cfl.GetValue("baz", &flag)); + } + { + ConfigLine cfl; + str = "xx _baz1=a pp=qq"; + KALDI_ASSERT(cfl.ParseLine(str)); + + std::string str_value; + KALDI_ASSERT(cfl.GetValue("_baz1", &str_value)); + } +} + +void UnitTestReadConfig() { + std::string str = "a-b alpha=aa beta=\"b b\"# String test\n" + "a-b beta2='b c' beta3=bd # \n" + "a-b gamma=1:2:3:4 # Int Vector test\n" + " a-b de1ta=f # Bool + Integer in key Comment test delta=t \n" + "a-b _epsilon=-1 # Int Vector test _epsilon=1 \n" + "a-b zet-_a=0.15 theta=1.1# Float, -, _ test\n" + "a-b quoted='a b c' # quoted string\n" + "a-b quoted2=\"d e 'a b=c' f\" # string quoted with double quotes"; + + std::istringstream is(str); + std::vector lines; + ReadConfigLines(is, &lines); + KALDI_ASSERT(lines.size() == 8); + + ConfigLine cfl; + for (size_t i = 0; i < lines.size(); i++) { + KALDI_ASSERT(cfl.ParseLine(lines[i]) && cfl.FirstToken() == "a-b"); + if (i == 1) { + KALDI_ASSERT(cfl.GetValue("beta2", &str) && str == "b c"); + } + if (i == 4) { + KALDI_ASSERT(cfl.GetValue("_epsilon", &str) && str == "-1"); + } + if (i == 5) { + BaseFloat float_val = 0; + KALDI_ASSERT(cfl.GetValue("zet-_a", &float_val) && ApproxEqual(float_val, 0.15)); + } + if (i == 6) { + KALDI_ASSERT(cfl.GetValue("quoted", &str) && str == "a b c"); + } + if (i == 7) { + KALDI_ASSERT(cfl.GetValue("quoted2", &str) && str == "d e 'a b=c' f"); + } + } +} } // end namespace kaldi @@ -344,5 +532,7 @@ int main() { TestNan(); TestInf(); TestInf(); + UnitTestConfigLineParse(); + UnitTestReadConfig(); std::cout << "Test OK\n"; } diff --git a/src/util/text-utils.cc b/src/util/text-utils.cc index 200e3ad9327..bbf38ecc5cc 100644 --- a/src/util/text-utils.cc +++ b/src/util/text-utils.cc @@ -340,4 +340,252 @@ bool StringsApproxEqual(const std::string &a, } +bool ConfigLine::ParseLine(const std::string &line) { + data_.clear(); + whole_line_ = line; + if (line.size() == 0) return false; // Empty line + size_t pos = 0, size = line.size(); + while (isspace(line[pos]) && pos < size) pos++; + if (pos == size) + return false; // whitespace-only line + size_t first_token_start_pos = pos; + // first 
+  while (!isspace(line[pos]) && pos < size) {
+    if (line[pos] == '=') {
+      // If the first block of non-whitespace looks like "foo-bar=...",
+      // then we ignore it: there is no initial token, and FirstToken()
+      // is empty.
+      pos = first_token_start_pos;
+      break;
+    }
+    pos++;
+  }
+  first_token_ = std::string(line, first_token_start_pos, pos - first_token_start_pos);
+  // first_token_ is expected to be either empty or something like
+  // "component-node", which actually is a slightly more restrictive set of
+  // strings than IsValidName() checks for, but this is a convenient way to check it.
+  if (!first_token_.empty() && !IsValidName(first_token_))
+    return false;
+
+  while (pos < size) {
+    if (isspace(line[pos])) {
+      pos++;
+      continue;
+    }
+
+    // OK, at this point we know that we are pointing at nonspace.
+    size_t next_equals_sign = line.find_first_of("=", pos);
+    if (next_equals_sign == pos || next_equals_sign == std::string::npos) {
+      // we're looking for something like 'key=value'. If there is no equals sign,
+      // or it's not preceded by something, it's a parsing failure.
+      return false;
+    }
+    std::string key(line, pos, next_equals_sign - pos);
+    if (!IsValidName(key)) return false;
+
+    // handle any quotes. we support key='blah blah' or key="foo bar".
+    // no escaping is supported.
+    if (line[next_equals_sign+1] == '\'' || line[next_equals_sign+1] == '"') {
+      char my_quote = line[next_equals_sign+1];
+      size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2);
+      if (next_quote == std::string::npos) {  // no matching quote was found.
+        KALDI_WARN << "No matching quote for " << my_quote << " in config line '"
+                   << line << "'";
+        return false;
+      } else {
+        std::string value(line, next_equals_sign + 2,
+                          next_quote - next_equals_sign - 2);
+        data_.insert(std::make_pair(key, std::make_pair(value, false)));
+        pos = next_quote + 1;
+        continue;
+      }
+    } else {
+      // we want to be able to parse something like "... input=Offset(a, -1) foo=bar":
+      // in general, config values with spaces in them, even without quoting.
+
+      size_t next_next_equals_sign = line.find_first_of("=", next_equals_sign + 1),
+          terminating_space = size;
+
+      if (next_next_equals_sign != std::string::npos) {  // found a later equals sign.
+        size_t preceding_space = line.find_last_of(" \t", next_next_equals_sign);
+        if (preceding_space != std::string::npos &&
+            preceding_space > next_equals_sign)
+          terminating_space = preceding_space;
+      }
+      while (isspace(line[terminating_space - 1]) && terminating_space > 0)
+        terminating_space--;
+
+      std::string value(line, next_equals_sign + 1,
+                        terminating_space - (next_equals_sign + 1));
+      data_.insert(std::make_pair(key, std::make_pair(value, false)));
+      pos = terminating_space;
+    }
+  }
+  return true;
+}
+
+bool ConfigLine::GetValue(const std::string &key, std::string *value) {
+  KALDI_ASSERT(value != NULL);
+  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
+  for (; it != data_.end(); ++it) {
+    if (it->first == key) {
+      *value = (it->second).first;
+      (it->second).second = true;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) {
+  KALDI_ASSERT(value != NULL);
+  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
+  for (; it != data_.end(); ++it) {
+    if (it->first == key) {
+      if (!ConvertStringToReal((it->second).first, value))
+        return false;
+      (it->second).second = true;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ConfigLine::GetValue(const std::string &key, int32 *value) {
+  KALDI_ASSERT(value != NULL);
+  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
+  for (; it != data_.end(); ++it) {
+    if (it->first == key) {
+      if (!ConvertStringToInteger((it->second).first, value))
+        return false;
+      (it->second).second = true;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ConfigLine::GetValue(const std::string &key, std::vector<int32> *value) {
+  KALDI_ASSERT(value != NULL);
+  value->clear();
+  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
+  for (; it != data_.end(); ++it) {
+    if (it->first == key) {
+      if (!SplitStringToIntegers((it->second).first, ":,", true, value)) {
+        // KALDI_WARN << "Bad option " << (it->second).first;
+        return false;
+      }
+      (it->second).second = true;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ConfigLine::GetValue(const std::string &key, bool *value) {
+  KALDI_ASSERT(value != NULL);
+  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
+  for (; it != data_.end(); ++it) {
+    if (it->first == key) {
+      if ((it->second).first.size() == 0) return false;
+      switch (((it->second).first)[0]) {
+        case 'F':
+        case 'f':
+          *value = false;
+          break;
+        case 'T':
+        case 't':
+          *value = true;
+          break;
+        default:
+          return false;
+      }
+      (it->second).second = true;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ConfigLine::HasUnusedValues() const {
+  std::map<std::string, std::pair<std::string, bool> >::const_iterator it = data_.begin();
+  for (; it != data_.end(); ++it) {
+    if (!(it->second).second) return true;
+  }
+  return false;
+}
+
+std::string ConfigLine::UnusedValues() const {
+  std::string unused_str;
+  std::map<std::string, std::pair<std::string, bool> >::const_iterator it = data_.begin();
+  for (; it != data_.end(); ++it) {
+    if (!(it->second).second) {
+      if (unused_str == "")
+        unused_str = it->first + "=" + (it->second).first;
+      else
+        unused_str += " " + it->first + "=" + (it->second).first;
+    }
+  }
+  return unused_str;
+}
+
+// This is like ExpectToken but for two tokens, and it
+// will either accept token1 and then token2, or just token2.
+// This is useful in Read functions where the first token
+// may already have been consumed.
+void ExpectOneOrTwoTokens(std::istream &is, bool binary,
+                          const std::string &token1,
+                          const std::string &token2) {
+  KALDI_ASSERT(token1 != token2);
+  std::string temp;
+  ReadToken(is, binary, &temp);
+  if (temp == token1) {
+    ExpectToken(is, binary, token2);
+  } else {
+    if (temp != token2) {
+      KALDI_ERR << "Expecting token " << token1 << " or " << token2
+                << " but got " << temp;
+    }
+  }
+}
+
+
+bool IsValidName(const std::string &name) {
+  if (name.size() == 0) return false;
+  for (size_t i = 0; i < name.size(); i++) {
+    if (i == 0 && !isalpha(name[i]) && name[i] != '_')
+      return false;
+    if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.')
+      return false;
+  }
+  return true;
+}
+
+void ReadConfigLines(std::istream &is,
+                     std::vector<std::string> *lines) {
+  KALDI_ASSERT(lines != NULL);
+  std::string line;
+  while (std::getline(is, line)) {
+    if (line.size() == 0) continue;
+    size_t start = line.find_first_not_of(" \t");
+    size_t end = line.find_first_of('#');
+    if (start == std::string::npos || start == end) continue;
+    end = line.find_last_not_of(" \t", end - 1);
+    KALDI_ASSERT(end >= start);
+    lines->push_back(line.substr(start, end - start + 1));
+  }
+}
+
+void ParseConfigLines(const std::vector<std::string> &lines,
+                      std::vector<ConfigLine> *config_lines) {
+  config_lines->resize(lines.size());
+  for (size_t i = 0; i < lines.size(); i++) {
+    bool ret = (*config_lines)[i].ParseLine(lines[i]);
+    if (!ret) {
+      KALDI_ERR << "Error parsing config line: " << lines[i];
+    }
+  }
+}
+
+
 }  // end namespace kaldi
diff --git a/src/util/text-utils.h b/src/util/text-utils.h
index 7bc20957672..02f4bf483fc 100644
--- a/src/util/text-utils.h
+++ b/src/util/text-utils.h
@@ -183,6 +183,98 @@ bool StringsApproxEqual(const std::string &a,
                         const std::string &b,
                         int32 decimal_places_check = 2);
+
+/**
+   This class is responsible for parsing input like
+    hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' baz="a b c d='a b' e"
+   and giving you access to the fields, in this case
+
+   FirstToken() == "hi-there", and key->value pairs:
+
+   xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123",
+   bing->"a b c", baz->"a b c d='a b' e"
+
+   The first token is optional; if the line started with a key-value pair then
+   FirstToken() will be empty.
+
+   Note: it can parse value fields with spaces inside them only if they are free of the '='
+   character.  If values are going to contain the '=' character, you need to quote them
+   with either single or double quotes.
+
+   Keys may contain -_a-zA-Z0-9, but must begin with a-zA-Z_.
+ */
+class ConfigLine {
+ public:
+  // Tries to parse the line as a config-file line.  Returns false
+  // if it could not for some reason, e.g. parsing failure.  In most cases
+  // prints no warnings; the user should do this.  Does not expect comments.
+  bool ParseLine(const std::string &line);
+
+  // the GetValue functions are overloaded for various types.  They return true
+  // if the key exists with value that can be converted to that type, and false
+  // otherwise.  They also mark the key-value pair as having been read.  It is
+  // not an error to read values twice.
+  bool GetValue(const std::string &key, std::string *value);
+  bool GetValue(const std::string &key, BaseFloat *value);
+  bool GetValue(const std::string &key, int32 *value);
+  // Values may be separated by ":" or by ",".
+  bool GetValue(const std::string &key, std::vector<int32> *value);
+  bool GetValue(const std::string &key, bool *value);
+
+  bool HasUnusedValues() const;
+  /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one
+  /// of the GetValue() functions.
+  std::string UnusedValues() const;
+
+  const std::string &FirstToken() const { return first_token_; }
+
+  const std::string WholeLine() { return whole_line_; }
+  // use default assignment operator and copy constructor.
+ private:
+  std::string whole_line_;
+  // the first token of the line, e.g. if the line is
+  //   foo-bar baz=bing
+  // then first_token_ would be "foo-bar".
+  std::string first_token_;
+
+  // data_ maps from key to (value, is-this-value-consumed?).
+  std::map<std::string, std::pair<std::string, bool> > data_;
+
+};
+
+/// This function is like ExpectToken but for two tokens, and it will either
+/// accept token1 and then token2, or just token2.  This is useful in Read
+/// functions where the first token may already have been consumed.
+void ExpectOneOrTwoTokens(std::istream &is, bool binary,
+                          const std::string &token1,
+                          const std::string &token2);
+
+
+/**
+   This function reads in a config file and *appends* its contents to a vector of
+   lines; it is responsible for removing comments (anything after '#') and
+   stripping out any lines that contain only whitespace after comment removal.
+ */
+void ReadConfigLines(std::istream &is,
+                     std::vector<std::string> *lines);
+
+
+/**
+   This function converts config-lines from a simple sequence of strings
+   as output by ReadConfigLines(), into a sequence of first-tokens and
+   name-value pairs.  The general format is:
+     "command-type bar=baz xx=yyy"
+   etc., although there are subtleties as to what exactly is allowed; see the
+   documentation for class ConfigLine for details.
+   This function will die if there was a parsing failure.
+ */
+void ParseConfigLines(const std::vector<std::string> &lines,
+                      std::vector<ConfigLine> *config_lines);
+
+
+/// Returns true if 'name' would be a valid name for a component or node in a
+/// nnet3 Nnet.  This is a nonempty string beginning with A-Za-z_, and containing only
+/// '-', '_', '.', A-Z, a-z, or 0-9.
+bool IsValidName(const std::string &name);
 } // namespace kaldi
diff --git a/tools/config/common_path.sh b/tools/config/common_path.sh
index 9a7ae2d9b29..a8d454b3c06 100644
--- a/tools/config/common_path.sh
+++ b/tools/config/common_path.sh
@@ -22,4 +22,5 @@ ${KALDI_ROOT}/src/rnnlmbin:\
 ${KALDI_ROOT}/src/sgmm2bin:\
 ${KALDI_ROOT}/src/sgmmbin:\
 ${KALDI_ROOT}/src/tfrnnlmbin:\
+${KALDI_ROOT}/src/nnet3abin:\
 $PATH
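
Usage sketch (not part of the patch above): the following minimal program illustrates how the ConfigLine / ReadConfigLines / ParseConfigLines API added to src/util/text-utils.{h,cc} could be used by calling code. The first token "affine-layer" and the keys dim/scale/splice are made-up names for illustration only, not options of any real component; the snippet only assumes it is compiled and linked against kaldi-util in the usual way.

// config_line_example.cc -- illustrative sketch, see note above.
#include <sstream>
#include <string>
#include <vector>
#include "base/kaldi-common.h"
#include "util/text-utils.h"

int main() {
  using namespace kaldi;

  // A config "line" is a first token followed by key=value pairs; quoted
  // values may contain spaces and '=' characters.
  ConfigLine cfl;
  if (!cfl.ParseLine("affine-layer dim=512 scale=0.5 splice='-1,0,1'"))
    KALDI_ERR << "Could not parse config line";

  int32 dim;
  BaseFloat scale;
  std::vector<int32> splice;  // GetValue() splits on ':' or ','.
  KALDI_ASSERT(cfl.FirstToken() == "affine-layer");
  KALDI_ASSERT(cfl.GetValue("dim", &dim) && dim == 512);
  KALDI_ASSERT(cfl.GetValue("scale", &scale));
  KALDI_ASSERT(cfl.GetValue("splice", &splice) && splice.size() == 3);

  // Anything not consumed by GetValue() shows up here; this is how a caller
  // can flag misspelled or unsupported options.
  if (cfl.HasUnusedValues())
    KALDI_ERR << "Unexpected options: " << cfl.UnusedValues();

  // Reading a whole config: ReadConfigLines() strips comments and blank
  // lines; ParseConfigLines() turns each remaining line into a ConfigLine.
  std::istringstream config("layer1 dim=256  # a comment\nlayer2 dim=128\n");
  std::vector<std::string> lines;
  std::vector<ConfigLine> config_lines;
  ReadConfigLines(config, &lines);
  ParseConfigLines(lines, &config_lines);
  KALDI_ASSERT(config_lines.size() == 2 &&
               config_lines[1].FirstToken() == "layer2");
  return 0;
}

The consumed/unused bookkeeping is the main design point: each GetValue() call marks its key as read, so after a component has pulled out the options it understands, HasUnusedValues()/UnusedValues() give it a cheap way to report leftover (likely misspelled) keys instead of silently ignoring them.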