diff --git a/egs/mini_librispeech/s5/cmd.sh b/egs/mini_librispeech/s5/cmd.sh index 71dd849a93b..3189d83975a 100644 --- a/egs/mini_librispeech/s5/cmd.sh +++ b/egs/mini_librispeech/s5/cmd.sh @@ -10,6 +10,8 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +# in future I'd like to start using just one $cmd variable. +export cmd="queue.pl --mem 2G" export train_cmd="queue.pl --mem 2G" export decode_cmd="queue.pl --mem 4G" export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/mini_librispeech/s5/conf/mfcc_hires2.conf b/egs/mini_librispeech/s5/conf/mfcc_hires2.conf new file mode 100644 index 00000000000..2e8dc221d40 --- /dev/null +++ b/egs/mini_librispeech/s5/conf/mfcc_hires2.conf @@ -0,0 +1,14 @@ +# config for high-resolution MFCC features, intended for 'chaina' neural network +# training. These '..2.conf' setups are intended to have the --modified=true +# configuration value. + +# Note: we keep all cepstra, so it has the same info as filterbank features, +# but MFCC is more easily compressible (because less correlated) which is why +# we prefer this method. +--use-energy=false # use average of log energy, not energy. +# Will soon add: --modified=true +--num-mel-bins=40 # similar to Google's setup. +--num-ceps=40 # there is no dimensionality reduction. +--low-freq=20 # low cutoff frequency for mel bins... this is high-bandwidth data, so + # there might be some information at the low end. +--high-freq=-400 # high cutoff frequently, relative to Nyquist of 8000 (=7600) diff --git a/egs/mini_librispeech/s5/local/chaina/data_prep_common.sh b/egs/mini_librispeech/s5/local/chaina/data_prep_common.sh new file mode 100755 index 00000000000..a736fc8c008 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chaina/data_prep_common.sh @@ -0,0 +1,76 @@ +#!/bin/bash + +set -euo pipefail + +# This script is called from local/chaina/run_tdnn.sh and +# similar scripts. It contains the common feature preparation and +# lattice-alignment preparation parts of the chaina training. +# See those scripts for examples of usage. + +stage=0 +train_set=train_clean_5 +test_sets="dev_clean_2" +gmm=tri3b + +. ./cmd.sh +. ./path.sh +. utils/parse_options.sh + +gmm_dir=exp/${gmm} +ali_dir=exp/${gmm}_ali_${train_set}_sp + +for f in data/${train_set}/feats.scp ${gmm_dir}/final.mdl; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + +# Our default data augmentation method is 3-way speed augmentation followed by +# volume perturbation. We are looking into better ways of doing this, +# e.g. involving noise and reverberation. + +if [ $stage -le 1 ]; then + # Although the nnet will be trained by high resolution data, we still have to + # perturb the normal data to get the alignment. 
_sp stands for speed-perturbed + echo "$0: preparing directory for low-resolution speed-perturbed data (for alignment)" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + echo "$0: making MFCC features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp +fi + +if [ $stage -le 2 ]; then + echo "$0: aligning with the perturbed low-resolution data" + steps/align_fmllr.sh --nj 20 --cmd "$train_cmd" \ + data/${train_set}_sp data/lang $gmm_dir $ali_dir || exit 1 +fi + +if [ $stage -le 3 ]; then + # Create high-resolution MFCC features (with 40 cepstra instead of 13). + # this shows how you can split across multiple file-systems. + echo "$0: creating high-resolution MFCC features" + mfccdir=data/${train_set}_sp_hires/data + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $mfccdir/storage ]; then + utils/create_split_dir.pl /export/fs0{1,2}/$USER/kaldi-data/mfcc/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$mfccdir/storage $mfccdir/storage + fi + + for datadir in ${train_set}_sp ${test_sets}; do + utils/copy_data_dir.sh data/$datadir data/${datadir}_hires + done + + # do volume-perturbation on the training data prior to extracting hires + # features; this helps make trained nnets more invariant to test data volume. + utils/data/perturb_data_dir_volume.sh data/${train_set}_sp_hires || exit 1; + + for datadir in ${train_set}_sp ${test_sets}; do + steps/make_mfcc.sh --nj 10 --mfcc-config conf/mfcc_hires.conf \ + --cmd "$train_cmd" data/${datadir}_hires || exit 1; + steps/compute_cmvn_stats.sh data/${datadir}_hires || exit 1; + utils/fix_data_dir.sh data/${datadir}_hires || exit 1; + done +fi + + +exit 0 diff --git a/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh new file mode 100755 index 00000000000..8aa00c0d975 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chaina/tuning/run_tdnn_1a.sh @@ -0,0 +1,499 @@ +#!/bin/bash + + +# grep WER exp/chaina/tdnn1a_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 21.44 [ 4317 / 20138, 341 ins, 947 del, 3029 sub ] exp/chaina/tdnn1a_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# a09:s5: grep WER exp/chaina/tdnn1a_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 19.72 [ 3971 / 20138, 317 ins, 771 del, 2883 sub ] exp/chaina/tdnn1a_sp/decode_dev_clean_2_tgsmall/wer_17_0.0 + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +srand=0 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 + + +# training chunk-options +chunk_width=140 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.1 +bottom_subsampling_factor=1 # I'll set this to 3 later, 1 is for compatibility with a broken ru. +frame_subsampling_factor=3 +langs="default" # list of language names + +# The amount of extra left/right context we put in the egs. 
Note: this could +# easily be zero, since we're not using a recurrent topology, but we put in a +# little extra context so that we have more room to play with the configuration +# without re-dumping egs. +egs_extra_left_context=5 +egs_extra_right_context=5 + +# The number of chunks (of length: see $chunk_width above) that we group +# together for each "speaker" (actually: pseudo-speaker, since we may have +# to group multiple speaker together in some cases). +chunks_per_group=4 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
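# Aside -- a rough, untested sketch (it assumes $tree_dir as set earlier in this
# script): after stage 12 the two-level tree can be sanity-checked before going
# further, e.g.
#   tree-info $tree_dir/tree | grep num-pdfs        # leaf count used below
#   [ -f $tree_dir/tree.map ] && echo "tree.map present (two-level tree)"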
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+ # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). 
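# Aside -- a rough sketch (assumes $dir as set earlier in this script): one way
# to confirm that the 256-dim output of the bottom nnet really matches the
# input dim declared just below is to inspect the raw nnet, e.g.
#   nnet3-info $dir/init/bottom.raw | grep -E 'output-node|linear_bottleneck'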
+ cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
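# Aside -- hedged note on the learning_rate_factor line a few lines below: it
# pipes a python-2 style "print" statement into python, which fails with a
# SyntaxError if the default python is python 3.  An untested sketch of a form
# that should work under either version:
#   learning_rate_factor=$(python -c "print(0.5/$xent_regularize)")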
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+ # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). 
+ cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
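# Aside -- rough sketch (assumes $tree_dir and $dir as set earlier in this
# script): after the adaptation model is initialized below, the leaf counts of
# the tree and of the model should agree; a quick check is something like
#   tree-info $tree_dir/tree | grep num-pdfs
#   nnet3-am-info $dir/init/default.mdl | grep num-pdfs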
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+ # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). 
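# Aside -- hedged note on the config below: unlike the time-stride=1 layers in
# the earlier setups, these tdnnf layers use time-stride=3.  If I'm reading the
# tdnnf-layer splicing right, each layer then adds about 3 frames of context on
# each side rather than 1, i.e. roughly
#   8 layers * 3 frames/side = +-24 frames for the top model (vs. +-8 before)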
+ cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
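# Aside -- rough sketch: once a few of these tuning variants have been decoded,
# their best WERs can be compared in one go with utils/best_wer.sh, e.g.
#   for d in exp/chaina/tdnn1?_sp; do
#     grep WER $d/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh
#   done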
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+ # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 2900 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). 
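# If those two dimensions ever get out of sync, nnet3-init will not catch it here,
# since the bottom and top nnets are compiled separately; an optional quick check,
# assuming nnet3-info's usual listing of components with their dimensions, is
# something like:
#   nnet3-info $dir/init/bottom.raw | grep linear_bottleneck
# and confirming the output dim there matches the input dim=256 of the 'default'
# config below.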
+ cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l $lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
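# One portability note on the learning_rate_factor line below: the
# 'echo "print 0.5/$xent_regularize" | python' idiom relies on python2-style
# print (and on 'python' being python2).  A python3-safe alternative, not what
# the script as written uses, would be:
#   learning_rate_factor=$(python3 -c "print(0.5 / $xent_regularize)")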
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1f). About 0.5% better. +# 1g is as 1c2 but using MeanOnlyTransform. Better!! 
+ + +# grep WER exp/chaina/tdnn1h_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 16.22 [ 3266 / 20138, 297 ins, 463 del, 2506 sub ] exp/chaina/tdnn1h_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# a09:s5: grep WER exp/chaina/tdnn1h_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 14.29 [ 2877 / 20138, 275 ins, 398 del, 2204 sub ] exp/chaina/tdnn1h_sp/decode_dev_clean_2_tgsmall/wer_12_0.0 + +# a09:s5: grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 17.08 [ 3439 / 20138, 361 ins, 467 del, 2611 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 14.68 [ 2956 / 20138, 243 ins, 519 del, 2194 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_12_0.5 +# +# vs. the baseline: +# grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 16.30 [ 3282 / 20138, 323 ins, 458 del, 2501 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# a09:s5: grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 15.88 [ 3197 / 20138, 296 ins, 462 del, 2439 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_12_0.0 + +# 1c2 is as 1c but changing num-epochs from + +# 1c is a sanity check that the baseline setup is working well; +# we're simply making the transform a NoOpTransform, so the two decoding +# passes should give almost the same results. + + +# grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 18.27 [ 3679 / 20138, 334 ins, 565 del, 2780 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_13_0.0 +#a09:s5: grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 18.09 [ 3643 / 20138, 324 ins, 552 del, 2767 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_15_0.0 + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +srand=0 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1h # affix for the TDNN directory name +tree_affix=b +train_stage=-10 +get_egs_stage=-10 +common_egs_dir=exp/chaina/tdnn1f_sp/egs + +# training chunk-options +chunk_width=140 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.1 +bottom_subsampling_factor=3 +frame_subsampling_factor=3 +langs="default" # list of language names + +# The amount of extra left/right context we put in the egs. Note: this could +# easily be zero, since we're not using a recurrent topology, but we put in a +# little extra context so that we have more room to play with the configuration +# without re-dumping egs. +egs_extra_left_context=5 +egs_extra_right_context=5 + +# The number of chunks (of length: see $chunk_width above) that we group +# together for each "speaker" (actually: pseudo-speaker, since we may have +# to group multiple speaker together in some cases). +chunks_per_group=4 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 2900 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). 
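# The config below defines paired output heads: 'output' / 'output-xent'
# (speaker adapted) and 'output-si' / 'output-si-xent' (speaker independent).
# The SI heads are what the first, speaker-independent decoding pass uses;
# steps/chaina/decode_si.sh (added later in this patch) selects them like this:
#   nnet3-am-copy --edits='remove-output-nodes name=output; rename-node old-name=output-si new-name=output' \
#     $model_dir/${lang}.mdl - | ...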
+ cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1f) +# 1g is as 1c2 but using MeanOnlyTransform. Better!! + +# a09:s5: grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 17.08 [ 3439 / 20138, 361 ins, 467 del, 2611 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 14.68 [ 2956 / 20138, 243 ins, 519 del, 2194 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_12_0.5 +# +# vs. the baseline: +# grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 16.30 [ 3282 / 20138, 323 ins, 458 del, 2501 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# a09:s5: grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 15.88 [ 3197 / 20138, 296 ins, 462 del, 2439 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_12_0.0 + +# 1c2 is as 1c but changing num-epochs from + +# 1c is a sanity check that the baseline setup is working well; +# we're simply making the transform a NoOpTransform, so the two decoding +# passes should give almost the same results. 
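# (The 1c results just below bear this out: 18.27% WER for the .si pass
# vs. 18.09% for the adapted pass, i.e. only a marginal difference.)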
+ + +# grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 18.27 [ 3679 / 20138, 334 ins, 565 del, 2780 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_13_0.0 +#a09:s5: grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 18.09 [ 3643 / 20138, 324 ins, 552 del, 2767 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_15_0.0 + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +srand=0 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1i # affix for the TDNN directory name +tree_affix=b +train_stage=-10 +get_egs_stage=-10 +common_egs_dir=exp/chaina/tdnn1f_sp/egs + +# training chunk-options +chunk_width=140 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.1 +bottom_subsampling_factor=3 +frame_subsampling_factor=3 +langs="default" # list of language names + +# The amount of extra left/right context we put in the egs. Note: this could +# easily be zero, since we're not using a recurrent topology, but we put in a +# little extra context so that we have more room to play with the configuration +# without re-dumping egs. +egs_extra_left_context=5 +egs_extra_right_context=5 + +# The number of chunks (of length: see $chunk_width above) that we group +# together for each "speaker" (actually: pseudo-speaker, since we may have +# to group multiple speaker together in some cases). +chunks_per_group=4 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 2900 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. 
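# A note on the common_egs_dir=exp/chaina/tdnn1f_sp/egs setting near the top of
# this script: it points --egs.dir at the egs already dumped by the earlier 1f
# run, so the relatively expensive egs-dumping step is shared across these
# tuning variants.  That only makes sense while the chunk width, extra context
# and frame-subsampling settings stay compatible; if you have not run the
# earlier experiment, leaving common_egs_dir empty should make the training
# stage dump its own egs.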
+mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < $dir/init/info.txt < $dir/init/info.txt +fi + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 1f) +# 1g is as 1c2 but using MeanOnlyTransform. Better!! + +# a09:s5: grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 17.08 [ 3439 / 20138, 361 ins, 467 del, 2611 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# grep WER exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 14.68 [ 2956 / 20138, 243 ins, 519 del, 2194 sub ] exp/chaina/tdnn1g_sp/decode_dev_clean_2_tgsmall/wer_12_0.5 +# +# vs. 
the baseline: +# grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 16.30 [ 3282 / 20138, 323 ins, 458 del, 2501 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall.si/wer_11_0.0 +# a09:s5: grep WER exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 15.88 [ 3197 / 20138, 296 ins, 462 del, 2439 sub ] exp/chaina/tdnn1c2_sp/decode_dev_clean_2_tgsmall/wer_12_0.0 + +# 1c2 is as 1c but changing num-epochs from + +# 1c is a sanity check that the baseline setup is working well; +# we're simply making the transform a NoOpTransform, so the two decoding +# passes should give almost the same results. + + +# grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_* | utils/best_wer.sh +# %WER 18.27 [ 3679 / 20138, 334 ins, 565 del, 2780 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall.si/wer_13_0.0 +#a09:s5: grep WER exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_* | utils/best_wer.sh +# %WER 18.09 [ 3643 / 20138, 324 ins, 552 del, 2767 sub ] exp/chaina/tdnn1c_sp/decode_dev_clean_2_tgsmall/wer_15_0.0 + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +srand=0 +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1j # affix for the TDNN directory name +tree_affix=b +train_stage=-10 +get_egs_stage=-10 +common_egs_dir=exp/chaina/tdnn1f_sp/egs + +# training chunk-options +chunk_width=140 +dropout_schedule='0,0@0.20,0.3@0.50,0' +xent_regularize=0.1 +bottom_subsampling_factor=3 +frame_subsampling_factor=3 +langs="default" # list of language names + +# The amount of extra left/right context we put in the egs. Note: this could +# easily be zero, since we're not using a recurrent topology, but we put in a +# little extra context so that we have more room to play with the configuration +# without re-dumping egs. +egs_extra_left_context=5 +egs_extra_right_context=5 + +# The number of chunks (of length: see $chunk_width above) that we group +# together for each "speaker" (actually: pseudo-speaker, since we may have +# to group multiple speaker together in some cases). +chunks_per_group=4 + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + # This will be a two-level tree (with the smaller number of leaves specified + # by the '--num-clusters' option); this is needed by the adaptation framework + # search below for 'tree.map' + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --num-clusters 200 \ + --frame-subsampling-factor ${frame_subsampling_factor} \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 2900 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +# $dir/configs will contain xconfig and config files for the initial +# models. It's a scratch space used by this script but not by +# scripts called from here. +mkdir -p $dir/configs/ +# $dir/init will contain the initial models +mkdir -p $dir/init/ + +l2=0.03 +tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" +tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" +linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" +prefinal_opts="l2-regularize=0.03" +output_opts="l2-regularize=0.015" +num_leaves=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') +learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + +if [ $stage -le 13 ]; then + echo "$0: creating top neural net using the xconfig parser"; + + cat < $dir/configs/bottom.xconfig + input dim=40 name=input + + batchnorm-component name=input-batchnorm + + relu-batchnorm-dropout-layer name=tdnn1 $tdnn_opts dim=768 input=Append(-1,0,1) + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=0 + # this 'batchnorm-layer' has an affine component but no nonlinearlity + linear-component name=linear_bottleneck dim=256 l2-regularize=$l2 + batchnorm-component name=linear_bottleneck_bn + output name=output input=linear_bottleneck_bn +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/bottom.xconfig \ + --config-file-out $dir/configs/bottom.config + nnet3-init --srand=$srand $dir/configs/bottom.config $dir/init/bottom.raw +fi + +if [ $stage -le 14 ]; then + echo "$0: creating adaptation model/transform" + + # note: 'default' corresponds to the language name (we use 'default' since this + # is not really a multilingual setup. + # Note: the bottleneck dimension of 256 specified in the bottom.nnet must match + # with the dimension of this transform (256). + cat < $dir/configs/default.xconfig + input name=input dim=256 + linear-component $linear_opts name=linear_from_input dim=768 + tdnnf-layer name=tdnnf1 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf2 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf3 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf4 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf5 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf6 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf7 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=1 + linear-component name=prefinal-l dim=192 $linear_opts + + # adding the output layer for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_leaves $output_opts + # .. 
and its speaker-independent version + prefinal-layer name=prefinal-chain-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si include-log-softmax=false dim=$num_leaves $output_opts + + # adding the output layer for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts + # .. and its speaker-independent version + prefinal-layer name=prefinal-xent-si input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-si-xent dim=$num_leaves learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_config.py --xconfig-file $dir/configs/default.xconfig \ + --config-file-out $dir/configs/default.config + nnet3-init --srand=$srand $dir/configs/default.config - | \ + nnet3-am-init $tree_dir/final.mdl - $dir/init/default.mdl +fi + + +if [ $stage -le 16 ]; then + # Work out the model's total effective left and right context (in the + # feature frame-sampling rate). + # The following script is equivalent to doing something like the + # following: + # cat > $dir/init/info.txt < " + echo "e.g.: steps/chaina/compute_embeddings.sh --nj 8 \\" + echo " data/test_eval92_hires exp/chaina/tdnn1_sp/final exp/nnet3/tdnn1_sp/data/final/test_eval92_hires" + echo "Output will be in /output.scp" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --iter # Iteration of model to decode; default is final." + exit 1; +fi + +data=$1 +model_dir=$2 +dir=$3 + +mkdir -p $dir/log + +# convert $dir to absolute pathname +fdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}` + +for f in $model_dir/bottom.raw $model_dir/info.txt $data/feats.scp; do + if [ ! -f $f ]; then + echo "$0: no such file $f" + exit 1 + fi +done + + +sdata=$data/split$nj; +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + + +bottom_subsampling_factor=$(awk '/^bottom_subsampling_factor/ {print $2}' <$model_dir/info.txt) +if ! [ $bottom_subsampling_factor -gt 0 ]; then + echo "$0: error getting bottom_subsampling_factor from $model_dir/info.txt" + exit 1 +fi + + + +if [ $stage -le 1 ]; then + $cmd JOB=1:$nj $dir/log/compute.JOB.log \ + nnet3-compute --use-gpu=no \ + --frame-subsampling-factor=$bottom_subsampling_factor \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + $model_dir/bottom.raw scp:$sdata/JOB/feats.scp \ + "ark:|copy-feats --compress=$compress ark:- ark,scp:$dir/output.JOB.ark,$dir/output.JOB.scp" +fi + +for n in $(seq $nj); do + cat $dir/output.$n.scp +done > $dir/output.scp + +exit 0; diff --git a/egs/wsj/s5/steps/chaina/decode.sh b/egs/wsj/s5/steps/chaina/decode.sh new file mode 100755 index 00000000000..df7b627f8c8 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/decode.sh @@ -0,0 +1,155 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does the speaker-dependent pass of decoding with a 'chaina' model, +# including getting the speaker-dependent transforms and dumping lattices. + + +# Begin configuration section. +stage=1 + +acwt=1.0 # Just a default value, used for adaptation and beam-pruning.. 
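# ('chain'-style models are trained with an acoustic scale of 1.0, so 1.0,
# rather than the ~0.1 typical of conventional systems, is the natural default
# here; the 10x scaling needed for scoring is applied afterwards via
# post_decode_acwt below.)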
+post_decode_acwt=10.0 # This is typically used in 'chain' systems to scale + # acoustics by 10 so the regular scoring script works OK + # (since it evaluates the LM scale at integer values, + # typically close to 10). We make this the default in + # order to make scoring easier, but you should remember + # when using the lattices, that this has been done. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +silence_weight=0.01 # We weight down the posteriors of silence (needs to be tuned). +lattice_beam=6.0 # Beam we use in lattice generation. +iter=final +num_threads=1 # if >1, will use nnet3-latgen-faster-parallel + +scoring_opts= +skip_diagnostics=false +skip_scoring=false +# we may later add extra-{left,right}-context options, but these might be +# problematic. +extra_left_context=0 +extra_right_context=0 +minimize=false +lang=default +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; +set -e -u + +if [ $# -ne 6 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/chaina/decode.sh --nj 8 \\" + echo " data/test exp/chaina/tdnn1a_sp/graph_bg exp/chaina/tdnn1a_sp/final" + echo " exp/chaina/tdnn1a_sp/data/test exp/chaina/tdnn1a_sp/decode_test_bg.si exp/chaina/tdnn1a_sp/decode_test_bg" + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --lattice-beam # Lattice pruning beam; default 6.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --use-gpu # default: false. If true, we recommend" + echo " # to use large --num-threads as the graph" + echo " # search becomes the limiting factor." + exit 1; +fi + + +data=$1 +graphdir=$2 +model_dir=$3 +embedding_dir=$4 +si_dir=$5 +dir=$6 + + +mkdir -p $dir/log + +for f in $graphdir/HCLG.fst $data/utt2spk $model_dir/$lang.mdl $model_dir/$lang.ada \ + $model_dir/info.txt $embedding_dir/output.scp $si_dir/lat.1.gz $si_dir/num_jobs; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + + +nj=$(cat $si_dir/num_jobs) +echo $nj > $dir/num_jobs +sdata=$data/split$nj; +silphonelist=$(cat $graphdir/phones/silence.csl) +frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$model_dir/info.txt) +bottom_subsampling_factor=$(awk '/^bottom_subsampling_factor/ {print $2}' <$model_dir/info.txt) +top_subsampling_factor=$[frame_subsampling_factor/bottom_subsampling_factor] + + +## Now get the first-pass fMLLR transforms. +if [ $stage -le 1 ]; then + echo "$0: getting speaker-dependent transforms" + # The --acoustic-scale=0.1 is to reverse the --post-decode-acwt (default: 10) + # that we used when dumping the SI lattices (this was for scoring + # convenience). 
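# In outline, the pipeline below goes: SI lattices -> per-frame posteriors
# (lattice-to-post, with --acoustic-scale=0.1 undoing the 10x post-decode
# scaling) -> silence frames down-weighted (weight-silence-post) ->
# transition-level posteriors mapped to pdf level (post-to-pdf-post) ->
# nnet3-adapt get-transforms, which estimates one transform per speaker (via
# spk2utt) over the dumped bottom-nnet embeddings in $embedding_dir/output.scp
# and writes it to trans.JOB.ark for use in the adapted decoding pass below.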
+ $cmd JOB=1:$nj $dir/log/get_transform.JOB.log \ + gunzip -c $si_dir/lat.JOB.gz \| \ + lattice-to-post --acoustic-scale=0.1 ark:- ark:- \| \ + weight-silence-post $silence_weight $silphonelist $model_dir/${lang}.mdl ark:- ark:- \| \ + post-to-pdf-post $model_dir/${lang}.mdl ark:- ark:- \| \ + nnet3-adapt --verbose=2 --frame-subsampling-factor=$top_subsampling_factor \ + get-transforms $model_dir/${lang}.ada ark:$sdata/JOB/spk2utt \ + "scp:filter_scp.pl $sdata/JOB/utt2spk $embedding_dir/output.scp|" \ + ark,s,cs:- ark:$dir/trans.JOB.ark +fi + +if [ $num_threads -gt 1 ]; then + thread_string="-parallel --num-threads=$num_threads" + queue_opt="--num-threads $num_threads" +else + thread_string= + queue_opt= +fi + +if [ $stage -le 2 ]; then + $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster$thread_string \ + --frame-subsampling-factor=$top_subsampling_factor \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt \ + $model_dir/${lang}.mdl \ + $graphdir/HCLG.fst \ + "ark:filter_scp.pl $sdata/JOB/utt2spk $embedding_dir/output.scp | transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB.ark scp:- ark:-|" \ + "ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + + +if [ $stage -le 3 ]; then + if ! $skip_diagnostics ; then + [ ! -z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" --model $model_dir/${lang}.mdl $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 4 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." && exit 1; + echo "score best paths" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/egs/wsj/s5/steps/chaina/decode_si.sh b/egs/wsj/s5/steps/chaina/decode_si.sh new file mode 100755 index 00000000000..f21d82f6278 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/decode_si.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). +# Apache 2.0. + +# This script does the speaker-independent pass of decoding with a 'chaina' model, + + +# Begin configuration section. +stage=1 +nj=4 # number of decoding jobs. +acwt=1.0 # Just a default value, used for adaptation and beam-pruning.. +post_decode_acwt=10.0 # This is typically used in 'chain' systems to scale + # acoustics by 10 so the regular scoring script works OK + # (since it evaluates the LM scale at integer values, + # typically close to 10). We make this the default in + # order to make scoring easier, but you should remember + # when using the lattices, that this has been done. +cmd=run.pl +beam=15.0 +frames_per_chunk=50 +max_active=7000 +min_active=200 +lattice_beam=6.0 # Beam we use in lattice generation. +iter=final +num_threads=1 # if >1, will use nnet3-latgen-faster-parallel + +scoring_opts= +skip_diagnostics=false +skip_scoring=false +# we may later add extra-{left,right}-context options, but these might be +# problematic. 
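# Note on how this script relates to steps/chaina/decode.sh (added above in
# this patch): the lat.*.gz produced here are what decode.sh reads via its
# $si_dir (5th) argument in order to estimate the speaker transforms.  A
# typical sequence, with hypothetical directory names modelled on the usage
# examples above, would be roughly:
#   steps/chaina/compute_embeddings.sh data/test exp/chaina/tdnn1a_sp/final exp/chaina/tdnn1a_sp/data/test
#   steps/chaina/decode_si.sh data/test exp/chaina/tdnn1a_sp/graph_bg exp/chaina/tdnn1a_sp/final \
#     exp/chaina/tdnn1a_sp/data/test exp/chaina/tdnn1a_sp/decode_test_bg.si
#   steps/chaina/decode.sh data/test exp/chaina/tdnn1a_sp/graph_bg exp/chaina/tdnn1a_sp/final \
#     exp/chaina/tdnn1a_sp/data/test exp/chaina/tdnn1a_sp/decode_test_bg.si exp/chaina/tdnn1a_sp/decode_test_bg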
+extra_left_context=0 +extra_right_context=0 +minimize=false +lang=default +# End configuration section. + +echo "$0 $@" # Print the command line for logging + +[ -f ./path.sh ] && . ./path.sh; # source the path. +. utils/parse_options.sh || exit 1; +set -e -u + +if [ $# -ne 5 ]; then + echo "Usage: $0 [options] " + echo "e.g.: steps/chaina/decode.sh --nj 8 \\" + echo " data/test exp/chaina/tdnn1a_sp/graph_bg exp/chaina/tdnn1a_sp/final" + echo " exp/chaina/tdnn1a_sp/data/test exp/chaina/tdnn1a_sp/decode_test_bg" + echo "Main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd # Command to run in parallel with" + echo " --beam # Decoding beam; default 15.0" + echo " --lattice-beam # Lattice pruning beam; default 6.0" + echo " --iter # Iteration of model to decode; default is final." + echo " --scoring-opts # options to local/score.sh" + echo " --num-threads # number of threads to use, default 1." + echo " --use-gpu # default: false. If true, we recommend" + echo " # to use large --num-threads as the graph" + echo " # search becomes the limiting factor." + exit 1; +fi + + +data=$1 +graphdir=$2 +model_dir=$3 +embedding_dir=$4 +dir=$5 + + +mkdir -p $dir/log + +for f in $graphdir/HCLG.fst $data/utt2spk $model_dir/$lang.mdl $model_dir/$lang.ada \ + $model_dir/info.txt $embedding_dir/output.scp; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +if [ $num_threads -gt 1 ]; then + thread_string="-parallel --num-threads=$num_threads" + queue_opt="--num-threads $num_threads" +else + thread_string= + queue_opt= +fi + +sdata=$data/split$nj; +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; +echo $nj > $dir/num_jobs + + +frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$model_dir/info.txt) +bottom_subsampling_factor=$(awk '/^bottom_subsampling_factor/ {print $2}' <$model_dir/info.txt) +top_subsampling_factor=$[frame_subsampling_factor/bottom_subsampling_factor] + + +# We need to use the output named 'output-si' from the model, since this the speaker independent +# decoding pass. +model="nnet3-am-copy --edits='remove-output-nodes name=output; rename-node old-name=output-si new-name=output' $model_dir/${lang}.mdl -|" + +if [ $stage -le 1 ]; then + $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \ + nnet3-latgen-faster$thread_string \ + --frame-subsampling-factor=$top_subsampling_factor \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --minimize=$minimize --max-active=$max_active --min-active=$min_active --beam=$beam \ + --lattice-beam=$lattice_beam --acoustic-scale=$acwt --allow-partial=true \ + --word-symbol-table=$graphdir/words.txt \ + "$model" \ + $graphdir/HCLG.fst \ + "scp:filter_scp.pl $sdata/JOB/utt2spk $embedding_dir/output.scp|" \ + "ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + + +if [ $stage -le 2 ]; then + if ! $skip_diagnostics ; then + [ ! -z $iter ] && iter_opt="--iter $iter" + steps/diagnostic/analyze_lats.sh --cmd "$cmd" --model $model_dir/${lang}.mdl $graphdir $dir + fi +fi + + +# The output of this script is the files "lat.*.gz"-- we'll rescore this at +# different acoustic scales to get the final output. +if [ $stage -le 3 ]; then + if ! $skip_scoring ; then + [ ! -x local/score.sh ] && \ + echo "Not scoring because local/score.sh does not exist or not executable." 
&& exit 1; + echo "score best paths" + local/score.sh $scoring_opts --cmd "$cmd" $data $graphdir $dir + echo "score confidence and timing with sclite" + fi +fi +echo "Decoding done." +exit 0; diff --git a/egs/wsj/s5/steps/chaina/get_model_context.sh b/egs/wsj/s5/steps/chaina/get_model_context.sh new file mode 100755 index 00000000000..7abf1f6e3b5 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/get_model_context.sh @@ -0,0 +1,134 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script computes the total left and right context needed for example (eg) +# creation from a set of 'chaina' models. +# See the usage message for more information about input and output formats. + +# Begin configuration section. +frame_subsampling_factor=1 # The total frame subsampling factor of the bottom + # + top model, i.e. the relative difference in + # frame rate between the input of the bottom model + # and the output of the top model. Would normally + # be 3. +bottom_subsampling_factor=1 # The frame subsampling factor of the bottom + # (feature-extracting) model only. Must be a + # divisor of frame_subsampling_factor. Would + # normally be 1 or 3. + +langs=default # the list of languages. This script checks that + # in the dir (first arg to the script), each + # language exists as $lang.mdl, and it warns if + # any model files appear (which might indicate a + # script bug). +# End configuration section + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + cat 1>&2 < +This script works out some acoustic-context-related information, +and writes it, long with the options provided to the script, +to the provided. An example of what +output-info-file> might contain after this script is called, is: +langs default +frame_subsampling_factor 3 +bottom_subsampling_factor 3 +model_left_context 22 +model_right_context 22 + + e.g.: $0 --frame-subsampling-factor 3 --bottom-subsampling-factor 3 + --langs 'default' exp/chaina/tdnn1a_sp/0 exp/chaina/tdnn1a_sp/0/info.txt + + Options: + --frame-subsampling-factor # (default: 1) Total frame subsampling factor of + # both models combined, i.e. ratio of + # frame rate of input features vs. + # alignments and decoding (e.g. 3). + --bottom-subsampling-factor # (default: 1) Controls the frequency at which + # the output of the bottom model is + # evaluated, and the interpretation of frame + # offsets in the top config file. Must be a + # divisor of --frame-subsampling-factor + --langs # The list of languages (must be in quotes, + # to be parsed as a single arg). May be + # 'default' or e.g. 'english french' +EOF + exit 1; +fi + + +dir=$1 +info_file=$2 + +# die on error or undefined variable. +set -e -u + +if [ ! -d $dir ]; then + echo 1>&2 "$0: expected directory $dir to exist" + exit 1 +fi + +if [ -z $langs ]; then + echo 1>&2 "$0: list of languages (--langs option) is empty" + exit 1 +fi + +if ! [ $frame_subsampling_factor -ge 1 ] || \ + ! [ $bottom_subsampling_factor -ge 1 ] || \ + ! [ $[frame_subsampling_factor%bottom_subsampling_factor] -eq 0 ]; then + echo 1>&2 "$0: there was a problem with the options --frame-subsampling-factor=$frame_subsampling_factor --bottom-subsampling-factor=$bottom_subsampling_factor" + exit 1 +fi + +mkdir -p $dir/temp + +if [ ! 
-s $dir/bottom.raw ]; then + echo 1>&2 "$0: expected file $dir/bottom.raw to exist and be nonempty" + exit 1 +fi + +nnet3-info $dir/bottom.raw > $dir/temp/bottom.info +bottom_left_context=$(grep '^left-context:' $dir/temp/bottom.info | awk '{print $2}') +bottom_right_context=$(grep '^right-context:' $dir/temp/bottom.info | awk '{print $2}') + +max_top_left_context=0 +max_top_right_context=0 + + +for lang in $langs; do + if [ ! -s $dir/$lang.mdl ]; then + echo 1>&2 "$0: expected file $dir/$lang.mdl to exist and be nonempty (check --langs option)" + exit 1 + fi + nnet3-am-info $dir/$lang.mdl > $dir/temp/$lang.info + this_left_context=$(grep '^left-context:' $dir/temp/$lang.info | awk '{print $2}') + this_right_context=$(grep '^right-context:' $dir/temp/$lang.info | awk '{print $2}') + if [ $this_left_context -gt $max_top_left_context ]; then + max_top_left_context=$this_left_context + fi + if [ $this_right_context -gt $max_top_right_context ]; then + max_top_right_context=$this_right_context + fi +done + +left_context=$[bottom_left_context+(max_top_left_context*bottom_subsampling_factor)] +right_context=$[bottom_right_context+(max_top_right_context*bottom_subsampling_factor)] + + +cat >$info_file <=0, right-context for last chunk of an utterance. +right_context_final=-1 # if >=0, right-context for last chunk of an utterance. + +compress=true # set this to false to disable compression (e.g. if you want to + # see whether results are affected). Note: if the features on + # disk were originally compressed, nnet3-chain-get-egs will dump + # compressed features regardless (since there is no further loss + # in that case). + +lang=default # the language name. will usually be 'default' in single-language + # setups. Requires because it's part of the name of some of + # the input files. + +right_tolerance= # chain right tolerance == max label delay. Only relevant if + # constrained=true. At frame rate of alignments. Code + # default is 5. +left_tolerance= # chain left tolerance (versus alignments from lattices). + # Only relevant if constrained=true. At frame rate of + # alignments. Code default is 5. + +stage=0 +max_jobs_run=40 # This should be set to the maximum number of + # nnet3-chain-get-egs jobs you are comfortable to run in + # parallel; you can increase it if your disk speed is + # greater and you have more machines. + + +srand=0 # rand seed for nnet3-chain-get-egs, nnet3-chain-copy-egs and nnet3-chain-shuffle-egs + +lattice_lm_scale= # If supplied, the graph/lm weight of the lattices will be + # used (with this scale) in generating supervisions + # This is 0 by default for conventional supervised training, + # but may be close to 1 for the unsupervised part of the data + # in semi-supervised training. The optimum is usually + # 0.5 for unsupervised data. +lattice_prune_beam= # If supplied, the lattices will be pruned to this beam, + # before being used to get supervisions. + +acwt=0.1 # For pruning. Should be, for instance, 1.0 for chain lattices. +deriv_weights_scp= + +# end configuration section + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; + + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train exp/chaina/tdnn1a_sp exp/tri3_lats exp/chaina/tdnn1a_sp/raw_egs" + echo "" + echo "From , 0/.mdl (for the transition-model), .tree (the tree), " + echo " den_fsts/.den.fst, and den_fsts/.normalization.fst (the normalization " + echo " FST, derived from the denominator FST echo are read (where is specified" + echo " by the --lang option (its default values is 'default')" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options (alternative to this" + echo " # command line)" + echo " --max-jobs-run # The maximum number of jobs you want to run in" + echo " # parallel (increase this only if you have good disk and" + echo " # network speed). default=6" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --frame-subsampling-factor # factor by which num-frames at nnet output is reduced " + echo " --lang # Name of the language, determines names of some inputs." + echo " --frames-per-chunk # number of supervised frames per chunk on disk" + echo " # ... may be a comma separated list, but we advise a single" + echo " # number in most cases, due to interaction with the need " + echo " # to group egs from the same speaker into groups." + echo " --left-context # Number of frames on left side to append for feature input" + echo " --right-context # Number of frames on right side to append for feature input" + echo " --left-context-initial # Left-context for first chunk of an utterance" + echo " --right-context-final # Right-context for last chunk of an utterance" + echo " --lattice-lm-scale # If supplied, the graph/lm weight of the lattices will be " + echo " # used (with this scale) in generating supervisions" + echo " --lattice-prune-beam # If supplied, the lattices will be pruned to this beam, " + echo " # before being used to get supervisions." + echo " --acwt # Acoustic scale -- should be acoustic scale at which the " + echo " # supervision lattices are to be interpreted. Affects pruning" + echo " --deriv-weights-scp # If supplied, adds per-frame weights to the supervision." + echo " # (e.g., might be relevant for unsupervised training)." + echo " --stage # Used to run this script from somewhere in" + echo " # the middle." + exit 1; +fi + +data=$1 +chaindir=$2 +latdir=$3 +dir=$4 + +tree=$chaindir/${lang}.tree +trans_mdl=$chaindir/init/${lang}.mdl # contains the transition model and a nnet, but + # we won't be making use of the nnet part. +normalization_fst=$chaindir/den_fsts/${lang}.normalization.fst +den_fst=$chaindir/den_fsts/${lang}.den.fst + +for f in $data/feats.scp $latdir/lat.1.gz $latdir/final.mdl \ + $tree $trans_mdl $normalization_fst $den_fst; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +nj=$(cat $latdir/num_jobs) || exit 1 +if [ -f $latdir/per_utt ]; then + sdata=$data/split${nj}utt + utils/split_data.sh --per-utt $data $nj +else + sdata=$data/split$nj + utils/split_data.sh $data $nj +fi + +mkdir -p $dir/log $dir/misc + +cp $tree $dir/misc/ +copy-transition-model $trans_mdl $dir/misc/${lang}.trans_mdl +cp $normalization_fst $den_fst $dir/misc/ +cp $data/utt2spk $dir/misc/ +if [ -f $data/utt2uniq ]; then + cp $data/utt2uniq $dir/misc/ +elif [ -f $dir/misc/utt2uniq ]; then + rm $dir/misc/utt2uniq +fi + +if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. 
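+  # (Illustrative note, not a command this script runs: the $dir/storage
+  #  directory is typically created beforehand with a site-specific command
+  #  along the lines of
+  #    utils/create_split_dir.pl /export/b0{1,2}/$USER/kaldi-data/egs/... $dir/storage
+  #  so that the cegs.*.ark files written below end up spread across filesystems.)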
+ echo "$0: creating data links" + utils/create_data_link.pl $(for x in $(seq $nj); do echo $dir/cegs.$x.ark; done) +fi + + +lats_rspecifier="ark:gunzip -c $latdir/lat.JOB.gz |" +if [ ! -z $lattice_prune_beam ]; then + if [ "$lattice_prune_beam" == "0" ] || [ "$lattice_prune_beam" == "0.0" ]; then + lats_rspecifier="$lats_rspecifier lattice-1best --acoustic-scale=$acwt ark:- ark:- |" + else + lats_rspecifier="$lats_rspecifier lattice-prune --acoustic-scale=$acwt --beam=$lattice_prune_beam ark:- ark:- |" + fi +fi + +egs_opts="--long-key=true --left-context=$left_context --right-context=$right_context --num-frames=$frames_per_chunk --frame-subsampling-factor=$frame_subsampling_factor --compress=$compress" +[ $left_context_initial -ge 0 ] && egs_opts="$egs_opts --left-context-initial=$left_context_initial" +[ $right_context_final -ge 0 ] && egs_opts="$egs_opts --right-context-final=$right_context_final" + +[ ! -z "$deriv_weights_scp" ] && egs_opts="$egs_opts --deriv-weights-rspecifier=scp:$deriv_weights_scp" + + +chain_supervision_all_opts="--lattice-input=true --frame-subsampling-factor=$alignment_subsampling_factor" +[ ! -z $right_tolerance ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --right-tolerance=$right_tolerance" + +[ ! -z $left_tolerance ] && \ + chain_supervision_all_opts="$chain_supervision_all_opts --left-tolerance=$left_tolerance" + +if ! $constrained; then + # e2e supervision + chain_supervision_all_opts="$chain_supervision_all_opts --convert-to-pdfs=false" + egs_opts="$egs_opts --transition-model=$chaindir/0.trans_mdl" +fi + +if [ ! -z "$lattice_lm_scale" ]; then + chain_supervision_all_opts="$chain_supervision_all_opts --lm-scale=$lattice_lm_scale" + + normalization_fst_scale=$(perl -e " + if ($lattice_lm_scale >= 1.0 || $lattice_lm_scale < 0) { + print STDERR \"Invalid --lattice-lm-scale $lattice_lm_scale\"; exit(1); + } + print (1.0 - $lattice_lm_scale);") || exit 1 + egs_opts="$egs_opts --normalization-fst-scale=$normalization_fst_scale" +fi + +if [ $stage -le 0 ]; then + $cmd --max-jobs-run $max_jobs_run JOB=1:$nj $dir/log/get_egs.JOB.log \ + lattice-align-phones --replace-output-symbols=true $latdir/final.mdl \ + "$lats_rspecifier" ark:- \| \ + chain-get-supervision $chain_supervision_all_opts \ + $dir/misc/${lang}.tree $dir/misc/${lang}.trans_mdl ark:- ark:- \| \ + nnet3-chain-get-egs $ivector_opts --srand=\$[JOB+$srand] $egs_opts \ + "$normalization_fst" scp:$sdata/JOB/feats.scp ark,s,cs:- \ + ark,scp:$dir/cegs.JOB.ark,$dir/cegs.JOB.scp || exit 1; +fi + + +if [ $stage -le 1 ]; then + frames_and_chunks=$(for n in $(seq $nj); do cat $dir/log/get_egs.$n.log; done | \ + perl -e '$nc=0; $nf=0; while() { + if (m/Split .+ into (\d+) chunks/) { $this_nc = $1; } + if (m/Average chunk length was (\d+) frames/) { $nf += $1 * $this_nc; $nc += $this_nc; } + } print "$nf $nc"; ') + num_frames=$(echo $frames_and_chunks | awk '{print $1}') + num_chunks=$(echo $frames_and_chunks | awk '{print $2}') + frames_per_chunk_avg=$[num_frames/num_chunks] + feat_dim=$(feat-to-dim scp:$sdata/1/feats.scp -) + num_leaves=$(tree-info $tree | awk '/^num-pdfs/ {print $2}') + if [ $left_context_initial -lt 0 ]; then + left_context_initial=$left_context + fi + if [ $right_context_final -lt 0 ]; then + right_context_final=$right_context + fi + + cat >$dir/info.txt < $dir/all.scp +fi + +echo "$0: Finished preparing raw egs" diff --git a/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py new file mode 100755 index 
00000000000..a4e8a44c1cd --- /dev/null +++ b/egs/wsj/s5/steps/chaina/internal/choose_egs_to_merge.py @@ -0,0 +1,364 @@ +#!/usr/bin/env python3 + +# Copyright 2018 Johns Hopkins University (author: Daniel Povey) +# Copyright 2018 Hossein Hadian + +# License: Apache 2.0. + +import os +import argparse +import sys +import re +import logging +import traceback +import random + +sys.path.insert(0, 'steps') + +logger = logging.getLogger('libs') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) +logger.info('Starting choose_egs_to_merge.py') + + + + +def get_args(): + parser = argparse.ArgumentParser(description="Chooses groups of examples to merge into groups " + "of size given by the --chunks-per-group option, based on speaker " + "information (preferentially, chunks from the same utterance " + "and, if possible, the same speaker, get combined into " + "groups). This script also computes a held-out subset of...", + epilog="E.g. " + sys.argv[0] + "*** TODO *** ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument('--random-seed', type=int, + default = 123, help='Random seed.') + parser.add_argument("--chunks-per-group", type=int, default=4, + help="Number of chunks per speaker in the final egs (actually " + "means the number of chunks per group of chunks, and they are " + "only preferentially taken from the same speaker.") + parser.add_argument("--num-repeats", type=int, default=1, + help="The number of times the data is to be repeated. Must divide " + "--chunks-per-group. Suggest to try only 1 or 2. The idea " + "is to divide chunks into groups in different ways, to give " + "more variety to the egs (since the adaptation information " + "will differ.") + parser.add_argument("--heldout-data-selection-proportion", type=float, + default=0.2, + help="This parameter governs the selection of the heldout " + "subset and the statistically matched training subset. " + "It does not affect the size of that subset, but only " + "affects what pool the examples are drawb from. " + "Smaller values of this mean that the heldout groups " + "will be preferentially drawn from groups that " + "'contaminate' the least number of other groups, " + "and so require the least data to be removed from the " + "training set. Setting this to 1.0 would mean that " + "the heldout subset is drawn completely at random " + "(which might be more wasteful of training data, but " + "gives a selection that's statistically more " + "representative).") + parser.add_argument("--num-heldout-groups", type=int, default=200, + help="Number of utterance groups " + "that will go in the heldout subset (and in the " + "statistically matched training subset)") + parser.add_argument("--utt2uniq", type=str, default='', + help="File used in setups with data " + "augmentation, that maps from utterance-ids to the " + "pre-augmentation utterance-id. The reason it's needed " + "is to ensure that the heldout set is properly held " + "out (i.e., that different versions of those utterances " + "weren't trained on. If not specified, we assume the " + "identity map.") + parser.add_argument("--scp-in", type=str, required=True, + help="The scp file in, likely containing chain egs. 
The " + "keys are expected to be of the form: " + "'-----v1', " + "where the left_context, num_frames and right_context are required to be the " + "same in order for keys to be in a group (note: it's best if the " + "--extra-left-context-initial and --extra-right-context-final options " + "are not used, and if the --frames-per-chunk is a single number, in " + "order to prevent this constraint from splitting up the utterances from " + "a single speaker") + parser.add_argument("--training-data-out", type=str, required=True, + help="The output file containing the chunks that are to be grouped; each " + "line will contain --chunks-per-group (e.g. 4) rxfilenames, obtained " + "from the second field of the input --scp-in file.") + parser.add_argument("--heldout-subset-out", type=str, required=True, + help="This is the name of the file to which the heldout data subset " + "will be written; the format is the same as --training-data-out.") + parser.add_argument("--training-subset-out", type=str, required=True, + help="This is the name of the file to which the statistically matched " + "(to --heldout-subset-out) set of training data will be written") + + print(sys.argv, file=sys.stderr) + args = parser.parse_args() + + return args + + +""" +Notes on plan for how to implement this (we can keep this as documentation, but +we'll maybe move some of it around when things get implemented). + +This is a rather simple plan and we might later implement something more +sophisticated that does a better job of keeping chunks from the same utterance +or the same speaker together. + +Basically we rely on the fact that the input utterances come in in sorted order +(so utterances from adjacent speakers will naturally be together. + +We read the entries in the input scp file as a list, keeping them in the order +they were in the input (which will naturally keep together chunks from the +same utterance and utterances from the same speaker, since the raw egs were +not randomized). We split that list into distinct sub-lists, each with a unique value +of --. In the normal case +there will be just one such sub-list. + +In the case where --chunks-per-group=4 and --num-repeats=1, the groups of +chunks would then just be (and we do this for each of the sub-lists): +the first 4 chunks; the second 4 chunks; and so on. In the case where +--chunks-per-group=4 and --num-repeats=2, we'd obtain the groups as above, then +we'd discard the first 2 chunks of each sub-list and repeat the process, giving +us twice the original number of groups. If you want you can just +assert that --num-repeats is either 1 or 2 for now; higher values don't +really make sense with the current approach for choosing groups. + +Once we have the groups as above, we need to figure out the subset of +size --num-heldout-groups which will be chosen to appear in the output +file --heldout-subset-out. We'll also be choosing another subset of +the same size to appear in the file --training-subset-out; and we'll +be excluding some groups from the output --training-data-out (any +utterances that appeared in --heldout-subset-out, or which were linked +with such utterances via the --utt2uniq map, will be excluded). + +The way we choose the groups to appear in --heldout-subset-out is as follows. +Firstly: in cases where the utt2uniq file is undefined, treat it as the identity +map. We are given list of groups. 
We compute, for each group, the set of +utterances represented in it, and from that, the set of "uniq" values (a "uniq" +value is a string, representing a pre-augmentation utterance-id). For each +"uniq" value, we will compute the set of group-ids in which it was represented. +For a given group, we take the union of all those sets for its "uniq" value, and +remove its own group-id; this gives us the set of other groups that share a +pre-augmentation utterance in common with this group. This set might be empty +only in the case where there was no augmentation and --num-repeats=1, and some +particular utterance had been split into exactly 4 chunks which all ended up in +the same group. + +From the information above we can sort the groups by the number of groups we'd +have to hold out if we were to put that group in the heldout set. Then if, say, +--heldout-data-selection-proportion=0.2, we take the bottom 20% of groups by +this measure, meaning the groups which will cause less training data to have to +be held out. This is the set from which we'll select the heldout data and the +matched subset of training data. Call this the "candidate set". We first +choose --num-heldout-groups groups from the candidate set. This is the heldout +subset. From the heldout subset we compute the set of "uniq" values represented, +and we remove from the training set any groups which share those "uniq" values. + +Next we need to choose the matched subset of training examples. The way we do +this is that we choose --num-heldout-groups from the "candidate set", after +excluding groups that were in the heldout subset or which were removed from the +training set because they contained "uniq" values in common with those in the +heldout set. If this fails because there were too few groups in the candidate +set, just double --heldout-data-selection-proportion and retry. Make sure to do +something sensible in the case where the dataset is too tiny to choose the +requested heldout set size (i.e. print an informative error message before +dying). + +""" + +class Chunk: + """ This is a data structure for a chunk. A chunk is a single entry + of the --scp-in file. + 'eg' second field of --scp-in file + """ + def __init__(self, scp_line): + result = re.match("^(.*)-(\d+)-(\d+)-(\d+)-(\d+)-v1\s+(.*)$", scp_line) + self.utt_id, first_frame, left_context, num_frames, right_context, self.eg = result.groups() + self.chunk_id = self.utt_id + '-' + first_frame + self.context_structure = '-'.join((left_context, num_frames, right_context)) + def __repr__(self): + return '{}-{} {}'.format(self.chunk_id, self.context_structure, self.eg) + + +def read_all_chunks(scp_file): + """ Loads all the lines of the --scp-in file as chunk objects. + """ + chunks = [] + with open(scp_file, 'r', encoding='latin-1') as f: + for line in f: + try: + chunks.append(Chunk(line.strip())) + except: + logger.error('Bad line: ' + line.strip()) + raise + return chunks + +def load_utt2uniq(filename): + """ Loads the --utt2uniq file as a dict. + """ + utt2uniq = {} + with open(filename, 'r', encoding='latin-1') as f: + for line in f: + uttid, base_uttid = line.strip().split() + utt2uniq[uttid] = base_uttid + return utt2uniq + +def write_egs(filename, group_indexes, all_groups): + """ Writes the output egs, i.e. the second field of + the --scp-in file for specific chunks specified by `group_indexes`. 
+ """ + with open(filename, 'w', encoding='latin-1') as f: + for group_index in group_indexes: + for chunk in all_groups[group_index]: + f.write('{}\n'.format(chunk.eg)) + + + +def choose_egs(args): + """ The main part of the program. + """ + random.seed(args.random_seed) + logger.info('Set random seed to {}.'.format(args.random_seed)) + all_chunks = read_all_chunks(args.scp_in) + logger.info('Loaded {} chunks.'.format(len(all_chunks))) + + chunk_to_sublist = {} + for chunk in all_chunks: + if chunk.context_structure not in chunk_to_sublist: + chunk_to_sublist[chunk.context_structure] = [chunk] + else: + chunk_to_sublist[chunk.context_structure].append(chunk) + + logger.info('Created {} sub-lists with uniqe context ' + 'structure.'.format(len(chunk_to_sublist))) + + + assert(args.num_repeats == 1 or args.num_repeats == 2) + groups = [] # All groups from all sub-lists + for context_structure in sorted(chunk_to_sublist.keys()): + sublist = chunk_to_sublist[context_structure] + logger.info('Processing chunks with context ' + 'structure: {}'.format(context_structure)) + num_groups = (len(sublist) + + args.chunks_per_group - 1) // args.chunks_per_group + for i in range(num_groups): + group = sublist[i * args.chunks_per_group : (i + 1) * args.chunks_per_group] + groups.append(group) + if args.num_repeats == 2: + shift = args.chunks_per_group // 2 + group = sublist[i * args.chunks_per_group + shift : + (i + 1) * args.chunks_per_group + shift] + if group: + groups.append(group) + + logger.info('Created a total of {} groups.'.format(len(groups))) + + utt2uniq = {} + if args.utt2uniq: + utt2uniq = load_utt2uniq(args.utt2uniq) + logger.info('Loaded utt2uniq file with {} entries.'.format(len(utt2uniq))) + else: + logger.info('--utt2uniq not specified; using identity map.') + + + uniq_to_groups = {} # uniq to set of groups that include it + for i, group in enumerate(groups): + for chunk in group: + uniq = utt2uniq.get(chunk.utt_id, chunk.utt_id) + if uniq not in uniq_to_groups: + uniq_to_groups[uniq] = set([i]) + else: + uniq_to_groups[uniq].add(i) + + logger.info('Computed uniq-to-groups for {} uniqs. Average number of ' + 'groups representing a uniq is ' + '{}'.format(len(uniq_to_groups), + sum([len(g) for g in uniq_to_groups.values()]) / + len(uniq_to_groups))) + + # This is indexed by group-index (same len as groups). other_groups[i] is + # the set of other groups which share some utterance with group i. + other_groups = [set() for g in groups] + for i, group in enumerate(groups): + for chunk in group: + uniq = utt2uniq.get(chunk.utt_id, chunk.utt_id) + other_groups_this_uniq = uniq_to_groups[uniq] + other_groups[i].update(other_groups_this_uniq) + + for i, other in enumerate(other_groups): # Remove self + other.remove(i) + + # 'group_shared_size' is a list of pairs (i, n) where i is group-index and + # n is the number of groups that we'd + # have to hold out if we were to put that group in the heldout set. 
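+    # (Illustrative example, not computed by the code: with three groups where
+    # group 0 shares a pre-augmentation utterance only with group 1, we get
+    # other_groups == [{1}, {0}, set()] and group_shared_size ==
+    # [(0, 1), (1, 1), (2, 0)]; after the sort below, group 2 comes first,
+    # i.e. it is the cheapest group to hold out, because holding it out forces
+    # nothing else to be removed from the training set.)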
+ group_shared_size = [(i, len(other)) for i, other in enumerate(other_groups)] + # Sort it on n: + group_shared_size.sort(key=lambda tup: tup[1]) + + total_num_groups = len(groups) + training_set = set(range(total_num_groups)) # All groups + candidate_set_size = int(args.heldout_data_selection_proportion + * total_num_groups) + logger.info('Initial candidate set size: {}'.format(candidate_set_size)) + if args.num_heldout_groups > candidate_set_size: + logger.error('args.heldout_data_selection_proportion is too small or ' + 'there are too few groups.') + sys.exit(1) + + candidate_set = set([tup[0] for tup in group_shared_size[:candidate_set_size]]) + heldout_list = random.sample(candidate_set, args.num_heldout_groups) + + + # Remove all the heldout groups (and any other groups sharing some utterance + # with them) from both the candidate set and the training set + for group_index in heldout_list: + for shared_group_index in other_groups[group_index]: + candidate_set.discard(shared_group_index) + training_set.discard(shared_group_index) + candidate_set.discard(group_index) + training_set.discard(group_index) + + logger.info('Candidate set size after removing heldout ' + 'groups: {}'.format(len(candidate_set))) + if args.num_heldout_groups > len(candidate_set): + logger.warn('Not enough groups left in the candidate set. Doubling it.') + candidate_set = set([tup[0] for tup in + group_shared_size[:candidate_set_size * 2]]) + for group_index in heldout_list: + for shared_group_index in other_groups[group_index]: + candidate_set.discard(shared_group_index) + candidate_set.discard(group_index) + logger.info('Candidate set size after doubling and removing heldout ' + 'groups: {}'.format(len(candidate_set))) + if args.num_heldout_groups > len(candidate_set): + logger.error('args.heldout_data_selection_proportion is too small ' + 'or there are too few groups. Not enough groups left.') + sys.exit(1) + + train_subset_list = random.sample(candidate_set, args.num_heldout_groups) + + + # Write the outputs: + write_egs(args.training_data_out, training_set, groups) + write_egs(args.heldout_subset_out, heldout_list, groups) + write_egs(args.training_subset_out, train_subset_list, groups) + + +def main(): + try: + args = get_args() + choose_egs(args) + except Exception as e: + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py b/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py new file mode 100755 index 00000000000..c1e9a04179b --- /dev/null +++ b/egs/wsj/s5/steps/chaina/internal/get_train_schedule.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 + +# Copyright 2019 Johns Hopkins University (author: Daniel Povey) +# Copyright Hossein Hadian + + +# Apache 2.0. + +""" This script outputs information about a neural net training schedule, + to be used by ../train.sh, in the form of lines that can be selected + and sourced by the shell. +""" + +import argparse +import sys + +sys.path.insert(0, 'steps') +import libs.nnet3.train.common as common_train_lib +import libs.common as common_lib + +def get_args(): + parser = argparse.ArgumentParser( + description="""Output training schedule information to be consumed by ../train.sh""", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument("--frame-subsampling-factor", type=int, default=3, + help="""Frame subsampling factor for the combined model + (bottom+top), will normally be 3. 
Required here in order + to deal with frame-shifted versions of the input.""") + parser.add_argument("--initial-effective-lrate", + type=float, + dest='initial_effective_lrate', default=0.001, + help="""Effective learning rate used on the first iteration, + determines schedule via geometric interpolation with + --final-effective-lrate. Actual learning rate is + this times the num-jobs on that iteration.""") + parser.add_argument("--final-effective-lrate", type=float, + dest='final_effective_lrate', default=0.0001, + help="""Learning rate used on the final iteration, see + --initial-effective-lrate for more documentation.""") + parser.add_argument("--num-jobs-initial", type=int, default=1, + help="""Number of parallel neural net jobs to use at + the start of training""") + parser.add_argument("--num-jobs-final", type=int, default=1, + help="""Number of parallel neural net jobs to use at + the end of training. Would normally + be >= --num-jobs-initial""") + parser.add_argument("--num-epochs", type=float, default=4.0, + help="""The number of epochs to train for. + Note: the 'real' number of times we see each + utterance is this number times --frame-subsampling-factor + (to cover frame-shifted copies of the data), times + the value of --num-repeats given to process_egs.sh, + times any factor arising from data augmentation.""") + parser.add_argument("--dropout-schedule", type=str, + help="""Use this to specify the dropout schedule (how the dropout probability varies + with time, 0 == no dropout). You specify a piecewise + linear function on the domain [0,1], where 0 is the + start and 1 is the end of training; the + function-argument (x) rises linearly with the amount of + data you have seen, not iteration number (this improves + invariance to num-jobs-{initial-final}). E.g. '0,0.2,0' + means 0 at the start; 0.2 after seeing half the data; + and 0 at the end. You may specify the x-value of + selected points, e.g. '0,0.2@0.25,0' means that the 0.2 + dropout-proportion is reached a quarter of the way + through the data. The start/end x-values are at + x=0/x=1, and other unspecified x-values are interpolated + between known x-values. You may specify different rules + for different component-name patterns using + 'pattern1=func1 pattern2=func2', e.g. 'relu*=0,0.1,0 + lstm*=0,0.2,0'. More general should precede less + general patterns, as they are applied sequentially.""") + + parser.add_argument("--num-scp-files", type=int, default=0, required=True, + help="""The number of .scp files in the egs dir.""") + parser.add_argument("--schedule-out", type=str, required=True, + help="""Output file containing the training schedule. The output + is lines, one per training iteration. + Each line (one per iteration) is a list of ;-separated commands setting shell + variables. Currently the following variables are set: + iter, num_jobs, inv_num_jobs, scp_indexes, frame_shifts, dropout_opt, lrate. 
+ """) + + print(sys.argv, file=sys.stderr) + args = parser.parse_args() + + return args + +def get_schedules(args): + num_scp_files_expanded = args.num_scp_files * args.frame_subsampling_factor + num_scp_files_to_process = int(args.num_epochs * num_scp_files_expanded) + num_scp_files_processed = 0 + num_iters = ((num_scp_files_to_process * 2) + // (args.num_jobs_initial + args.num_jobs_final)) + + with open(args.schedule_out, 'w', encoding='latin-1') as ostream: + for iter in range(num_iters): + current_num_jobs = int(0.5 + args.num_jobs_initial + + (args.num_jobs_final - args.num_jobs_initial) + * float(iter) / num_iters) + # as a special case, for iteration zero we use just one job + # regardless of the --num-jobs-initial and --num-jobs-final. This + # is because the model averaging does not work reliably for a + # freshly initialized model. + if iter == 0: + current_num_jobs = 1 + + lrate = common_train_lib.get_learning_rate(iter, current_num_jobs, + num_iters, + num_scp_files_processed, + num_scp_files_to_process, + args.initial_effective_lrate, + args.final_effective_lrate) + + if args.dropout_schedule == "": + args.dropout_schedule = None + dropout_edit_option = common_train_lib.get_dropout_edit_option( + args.dropout_schedule, + float(num_scp_files_processed) / max(1, (num_scp_files_to_process - args.num_jobs_final)), + iter) + + frame_shifts = [] + egs = [] + for job in range(1, current_num_jobs + 1): + # k is a zero-based index that we will derive the other indexes from. + k = num_scp_files_processed + job - 1 + # work out the 1-based scp index. + scp_index = (k % args.num_scp_files) + 1 + # previous : frame_shift = (k/num_scp_files) % frame_subsampling_factor + frame_shift = ((scp_index + k // args.num_scp_files) + % args.frame_subsampling_factor) + + # Instead of frame shifts like [0, 1, 2], we make them more like + # [0, 1, -1]. This is clearer in intent, and keeps the + # supervision starting at frame zero, which IIRC is a + # requirement somewhere in the 'chaina' code. + if frame_shift > (args.frame_subsampling_factor // 2): + frame_shift = frame_shift - args.frame_subsampling_factor + + frame_shifts.append(str(frame_shift)) + egs.append(str(scp_index)) + + + print("""iter={iter}; num_jobs={nj}; inv_num_jobs={nj_inv}; scp_indexes=(pad {indexes}); frame_shifts=(pad {shifts}); dropout_opt="{opt}"; lrate={lrate}""".format( + iter=iter, nj=current_num_jobs, nj_inv=(1.0 / current_num_jobs), + indexes = ' '.join(egs), shifts=' '.join(frame_shifts), + opt=dropout_edit_option, lrate=lrate), file=ostream) + num_scp_files_processed = num_scp_files_processed + current_num_jobs + + +def main(): + args = get_args() + get_schedules(args) + +if __name__ == "__main__": + main() diff --git a/egs/wsj/s5/steps/chaina/process_egs.sh b/egs/wsj/s5/steps/chaina/process_egs.sh new file mode 100755 index 00000000000..e8d8cfeab4e --- /dev/null +++ b/egs/wsj/s5/steps/chaina/process_egs.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script takes nnet examples dumped by steps/chaina/get_raw_egs.sh and +# combines the chunks into groups by speaker (to the extent possible; it may +# need to combine speakers in some cases), locally randomizes the result, and +# dumps the resulting egs to disk. Chunks of these will later be globally +# randomized (at the scp level) by steps/chaina/randomize_egs.sh + + +# Begin configuration section. 
+cmd=run.pl +chunks_per_group=4 +num_repeats=2 # number of times we repeat the same chunks with different + # grouping. Recommend 1 or 2; must divide chunks_per_group +compress=true # set this to false to disable compression (e.g. if you want to see whether + # results are affected). + + +num_heldout_groups=200 # The number of groups (i.e. groups of chunks) that + # will go in the held-out set and the train subset + # (heldout_subset.scp and train_subset.scp). The real + # point of train_subset.scp, and the reason we can't + # just use a subset of train.scp, is that it contains + # egs that are statistically comparable to + # heldout_subset.scp, so their prob can be + # meaningfully compared with those from + # heldout_subset.scp. Note: the number (e.g. 200) is + # *after* merging chunks into groups of size + # $chunks_per_group. + + +shuffle_buffer_size=5000 # Size of buffer (containing grouped egs) to use + # for random shuffle. + +stage=0 +nj=5 # the number of parallel jobs to run. +srand=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 --chunks-per-group 4 exp/chaina/tdnn1a_sp/raw_egs exp/chaina/tdnn1a_sp/processed_egs" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options (alternative to this" + echo " # command line)" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --chunks-per-group # Number of chunks (preferentially, from a single speaker" + echo " # to combine into each example. This grouping of" + echo " # egs is part of the 'chaina' framework; the adaptation" + echo " # parameters will be estimated from these groups." + echo " --num-repeats # Number of times we group the same chunks into different" + echo " # groups. For now only the values 1 and 2 are" + echo " # recommended, due to the very simple way we choose" + echo " # the groups (it's consecutive)." + echo " --nj # Number of jobs to run in parallel. Usually quite a" + echo " # small number, as we'll be limited by disk access" + echo " # speed." + echo " --compress # True if you want the egs to be compressed" + echo " # (e.g. you may set to false for debugging purposes, to" + echo " # check that the compression is not hurting)." + echo " --num-heldout-egs # Number of egs to put in train_subset.scp and heldout_subset.scp." + echo " # These will be used for diagnostics. Note: this number is" + echo " # the number of grouped egs, after merging --chunks-per-group" + echo " # chunks into a single eg." + echo " # ... may be a comma separated list, but we advise a single" + echo " # number in most cases, due to interaction with the need " + echo " # to group egs from the same speaker into groups." + echo " --stage # Used to run this script from somewhere in" + echo " # the middle." + exit 1; +fi + +raw_egs_dir=$1 +dir=$2 + +# die on error or undefined variable. +set -e -u + +if ! 
steps/chaina/validate_raw_egs.sh $raw_egs_dir; then + echo "$0: failed to validate input directory $raw_egs_dir" + exit 1 +fi + + +mkdir -p $dir/temp $dir/log + + +if [ $stage -le 0 ]; then + echo "$0: choosing egs to merge" + + utt2uniq_opt= + [ -f $raw_egs_dir/misc/utt2uniq ] && utt2uniq_opt="--utt2uniq=$raw_egs_dir/misc/utt2uniq" + + $cmd $dir/log/choose_egs_to_merge.log steps/chaina/internal/choose_egs_to_merge.py \ + --chunks-per-group=$chunks_per_group \ + --num-repeats=$num_repeats \ + --num-heldout-groups=$num_heldout_groups \ + $utt2uniq_opt \ + --scp-in=$raw_egs_dir/all.scp \ + --training-data-out=$dir/temp/train.list \ + --heldout-subset-out=$dir/temp/heldout_subset.list \ + --training-subset-out=$dir/temp/train_subset.list +fi + +if [ $stage -le 1 ]; then + + for name in heldout_subset train_subset; do + echo "$0: merging and shuffling $name egs" + + # Linearize these lists and add keys to make it an scp format. + awk '{for (n=1;n<=NF;n++) { count++; print count, $n; }}' <$dir/temp/${name}.list >$dir/temp/${name}.scp + + $cmd $dir/log/merge_${name}_egs.log \ + nnet3-chain-merge-egs --minibatch-size=$chunks_per_group --compress=$compress \ + scp:$dir/temp/${name}.scp ark:- \| \ + nnet3-chain-shuffle-egs --srand=$srand ark:- ark,scp:$dir/${name}.ark,$dir/${name}.scp + done + + # Split up the training list into multiple smaller lists, as it could be long. + utils/split_scp.pl $dir/temp/train.list $(for j in $(seq $nj); do echo $dir/temp/train.$j.list; done) + # Linearize these lists and add keys to make them in scp format; + # nnet3-chain-merge-egs will merge the right groups, it's deterministic + # and we specified --minibatch-size=$chunks_per_group. + for j in $(seq $nj); do + awk '{for (n=1;n<=NF;n++) { count++; print count, $n; }}' <$dir/temp/train.$j.list >$dir/temp/train.$j.scp + done + + if [ -e $dir/storage ]; then + # Make soft links to storage directories, if distributing this way.. See + # utils/create_split_dir.pl. + echo "$0: creating data links" + utils/create_data_link.pl $(for j in $(seq $nj); do echo $dir/train.$j.ark; done) || true + fi + + $cmd JOB=1:$nj $dir/log/merge_train_egs.JOB.log \ + nnet3-chain-merge-egs --compress=$compress --minibatch-size=$chunks_per_group \ + scp:$dir/temp/train.JOB.scp ark:- \| \ + nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size \ + --srand=\$[JOB+$srand] ark:- ark,scp:$dir/train.JOB.ark,$dir/train.JOB.scp + # the awk command is to ensure unique ids for each group. + cat $(for j in $(seq $nj); do echo $dir/train.$j.scp; done) | awk '{printf("%09d %s\n", NR, $2);}' > $dir/train.scp +fi + + +cat $raw_egs_dir/info.txt | awk -v num_repeats=$num_repeats \ + -v chunks_per_group=$chunks_per_group ' + /^dir_type / { print "dir_type processed_chaina_egs"; next; } + /^num_input_frames / { print "num_input_frames "$2 * num_repeats; next; } # approximate; ignores held-out egs. 
+ /^num_chunks / { print "num_chunks " $2 * num_repeats; next; } + {print;} + END{print "chunks_per_group " chunks_per_group; print "num_repeats " num_repeats;}' >$dir/info.txt + +# # Note: the info.txt will actually look like the following, in general, +# # taking into account the fields present in the info.txt in the source dir: +# dir_type processed_chaina_egs +# num_input_frames $num_frames +# num_chunks $num_chunks +# lang $lang +# feat_dim $feat_dim +# num_leaves $num_leaves +# frames_per_chunk $frames_per_chunk +# frames_per_chunk_avg $frames_per_chunk_avg +# left_context $left_context +# left_context_initial $left_context_initial +# right_context $right_context +# right_context_final $right_context_final +# chunks_per_group $chunks_per_group + + +if ! cat $dir/info.txt | awk '{if (NF == 1) exit(1);}'; then + echo "$0: we failed to obtain at least one of the fields in $dir/info.txt" + exit 1 +fi + +cp -r $raw_egs_dir/misc/ $dir/ + + +echo "$0: Finished processing egs" diff --git a/egs/wsj/s5/steps/chaina/randomize_egs.sh b/egs/wsj/s5/steps/chaina/randomize_egs.sh new file mode 100755 index 00000000000..943d383c571 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/randomize_egs.sh @@ -0,0 +1,194 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script takes nnet examples dumped by steps/chaina/process_egs.sh, +# globally randomizes the egs, and divides into multiple .scp files. This is +# the form of egs which is consumed by the training script. All this is done +# only by manipulating the contents of .scp files. To keep locality of disk +# access, we only randomize blocks of egs (e.g. blocks containing 128 groups of +# sequences). This doesn't defeat randomization, because both process_egs.sh +# and the training script use nnet3-shuffle-egs to do more local randomization. + +# Later on, we'll have a multilingual/multi-input-dir version fo this script +# that combines egs from various data sources and possibly multiple languages. +# This version assumes there is just one language. + +# Begin configuration section. +cmd=run.pl + +groups_per_block=128 # The 'groups' are the egs in the scp file from + # process_egs.sh, containing '--chunks-per-group' sequences + # each. + +frames_per_job=3000000 # The number of frames of data we want to process per + # training job (will determine how long each job takes, + # and the frequency of model averaging. This was + # previously called --frames-per-iter, but + # --frames-per-job is clearer as each job does this + # many. + +num_groups_combine=1000 # the number of groups from the training set that we + # randomly choose as input to nnet3-chain-combine; + # these will go to combine.scp. train_subset.scp and + # heldout_subset.scp are, for now, just copied over + # from the input. + +# Later we may provide a mechanism to change the language name; for now we +# just copy it from the input. + + +srand=0 +stage=0 + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 --frames-per-job 2000000 exp/chaina/tdnn1a_sp/processed_egs exp/chaina/tdnn1a_sp/egs" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options (alternative to this" + echo " # command line)" + echo " --cmd (utils/run.pl;utils/queue.pl ) # how to run jobs." + echo " --groups-per-block # The number of groups (i.e. 
previously merged egs" + echo " # containing --chunks-per-group chunks) to to consider " + echo " # as one block, where whole blocks are randomized;" + echo " # smaller means more complete randomization but less" + echo " # local disk access." + echo " --frames-per-job # The number of input frames (not counting context)" + echo " # that we aim to have in each scp file after" + echo " # randomization and splitting." + echo " --num-groups-combine # The number of randomly chosen groups to" + echo " # put in the subset in 'combine.scp' which will" + echo " # be used in nnet3-chaina-combine to decide which" + echo " # models to average over." + echo " --stage # Used to run this script from somewhere in" + echo " # the middle." + echo " --srand # Random seed, affects randomization." + exit 1; +fi + +processed_egs_dir=$1 +dir=$2 + +# die on error or undefined variable. +set -e -u + +if ! steps/chaina/validate_processed_egs.sh $processed_egs_dir; then + echo "$0: could not validate input directory $processed_egs_dir" + exit 1 +fi + +# Work out how many groups per job and how many frames per job we'll have + +info_in=$processed_egs_dir/info.txt + +frames_per_group_avg=$(awk '/^frames_per_chunk_avg/ { fpc=$2; } /^chunks_per_group/ { print int(fpc * $2); }' $info_in) +if ! [ $frames_per_group_avg -gt 0 ]; then + echo "$0: error getting frames per group."; +fi + +num_groups=$(wc -l <$processed_egs_dir/train.scp) + +num_scp_files=$[(frames_per_group_avg*num_groups + frames_per_job/2) / frames_per_job] +[ $num_scp_files -eq 0 ] && num_scp_files=1 + +frames_per_scp_file=$[(frames_per_group_avg * num_groups) / num_scp_files] +groups_per_scp_file=$[ num_groups / num_scp_files] + + +mkdir -p $dir/temp + +if [ -d $dir/misc ]; then + rm -r $dir/misc +fi + +mkdir -p $dir/misc +cp $processed_egs_dir/misc/* $dir/misc + + +# We want to globally randomize the order of these blocks of (e.g.) 128 lines of +# the input train.scp, and then split up into $num_scp_files groups. we could +# do this in a specially-written python script, but instead we do it with a +# combination of existing Kaldi and UNIX utilities. + +awk -v gpb=$groups_per_block \ + '{block=sprintf("%05d", NR / gpb); group_id=$1; print group_id, block;}' \ + <$processed_egs_dir/train.scp >$dir/temp/key2block + +# get list of blocks +awk '{print $2}' <$dir/temp/key2block | uniq > $dir/temp/blocks +# get randomized-order list of blocks +utils/shuffle_list.pl --srand "$srand" <$dir/temp/blocks > $dir/temp/blocks_rand +# Map block-ids to randomized-order block-ids +paste $dir/temp/blocks $dir/temp/blocks_rand > $dir/temp/block2rand + + +# The following command first maps block-ids to randomized-order block-ids, then +# sorts the keys by these randomized-order block-ids while otherwise maintaining +# stable sorting (-s) which keeps the keys in the blocks in the same order. +utils/apply_map.pl -f 2 $dir/temp/block2rand <$dir/temp/key2block | \ + sort -k2 -s > $dir/temp/key2block_rand + + +# The following command just changes the order of train.scp to +# match the order in key2block_rand (which has the order of blocks +# of lines randomly moved around). +awk '{print $1, $1}' $dir/temp/key2block_rand | \ + utils/apply_map.pl -f 2 $processed_egs_dir/train.scp \ + >$dir/temp/train.scp_rand + + +# The following command splits up $dir/temp/train.scp_rand (the randomized-order +# version of train.scp), while keeping distinct blocks in separate scp files, +# thanks to the --utt2spk option. 
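+# (Worked example with illustrative values: with groups_per_block=2 and six
+#  group-ids g1..g6 in train.scp, key2block assigns the blocks {g1}, {g2,g3},
+#  {g4,g5}, {g6}; block 00000 comes out one group short because awk truncates
+#  NR/gpb.  If the shuffled block order happens to put the {g2,g3} block first,
+#  the stable sort produces the group order g2 g3 ... with the original order
+#  preserved inside each block, i.e. whole blocks move around but their
+#  contents are not reshuffled.)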
+utils/split_scp.pl --utt2spk=$dir/temp/key2block_rand \ + $dir/temp/train.scp_rand \ + $(for i in $(seq $num_scp_files); do echo $dir/train.$i.scp; done) + + +cp $processed_egs_dir/heldout_subset.scp $processed_egs_dir/train_subset.scp $dir/ + + +# note: there is only one language in $processed_egs_dir (any +# merging would be done at the randomization stage but that is not supported yet). + +lang=$(awk '/^lang / { print $2; }' <$processed_egs_dir/info.txt) + +# We'll store info files per language, containing the part of the information +# that is language-specific, plus a single global info.txt containing stuff that +# is not language specific. +# This will get more complicated once we actually support multiple languages, +# and when we allow multiple input processed egs dirs for the same language. + +grep -v -E '^dir_type|^lang|^feat_dim' <$processed_egs_dir/info.txt | \ + cat <(echo "dir_type randomized_chaina_egs") - > $dir/info_$lang.txt + + +cat <$dir/info.txt +dir_type randomized_chaina_egs +num_scp_files $num_scp_files +langs $lang +frames_per_scp_file $frames_per_scp_file +groups_per_scp_file $groups_per_scp_file +EOF +# frames_per_job, after rounding, becomes frames_per_scp_file. + +# note: frames_per_chunk_avg will be present in the info.txt file as well as +# the per-language files. +grep -E '^feat_dim|^frames_per_chunk_avg' <$processed_egs_dir/info.txt >>$dir/info.txt + + + +if ! cat $dir/info.txt | awk '{if (NF == 1) exit(1);}'; then + echo "$0: we failed to obtain at least one of the fields in $dir/info.txt" + exit 1 +fi + + +echo "$0: Finished randomizing egs" diff --git a/egs/wsj/s5/steps/chaina/train.sh b/egs/wsj/s5/steps/chaina/train.sh new file mode 100755 index 00000000000..0bfefd43b21 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/train.sh @@ -0,0 +1,329 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. + + +# Begin configuration section +stage=0 +leaky_hmm_coefficient=0.1 +xent_regularize=0.1 +apply_deriv_weights=false # you might want to set this to true in unsupervised training + # scenarios. +memory_compression_level=2 # Enables us to use larger minibatch size than we + # otherwise could, but may not be optimal for speed + # (--> set to 0 if you have plenty of memory. +dropout_schedule= +srand=0 +max_param_change=1.0 # we use a smaller than normal default (it's normally + # 2.0), because there are two models (bottom and top). +use_gpu=yes # can be "yes", "no", "optional", "wait" + +common_opts= # Options passed through to nnet3-chaina-train and nnet3-chaina-combine + +top_unadapted_weight=0.5 +bottom_unadapted_weight=0.5 + +num_epochs=4.0 # Note: each epoch may actually contain multiple repetitions of + # the data, for various reasons: + # using the --num-repeats option in process_egs.sh + # data augmentation + # different data shifts (this includes 3 different shifts + # of the data if frame_subsampling_factor=3 (see $dir/init/info.txt) + +num_jobs_initial=1 +num_jobs_final=1 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +groups_per_minibatch=32 # This is how you set the minibatch size. Note: if + # chunks_per_group=4, this would mean 128 chunks per + # minibatch. + +max_iters_combine=80 +max_models_combine=20 +diagnostic_period=5 # Get diagnostics every this-many iterations + +shuffle_buffer_size=1000 # This "buffer_size" variable controls randomization of the groups + # on each iter. 
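+# Illustrative arithmetic (num_scp_files and frame_subsampling_factor are
+# assumed values here; they are read at run time from the egs dir and from
+# $dir/init/info.txt): with the defaults above (num_epochs=4.0, one job
+# throughout), num_scp_files=10 and frame_subsampling_factor=3,
+# get_train_schedule.py will produce int(4.0 * 10 * 3) * 2 / (1 + 1) = 120
+# iterations, each of which processes one train.*.scp file with one of the
+# three frame shifts.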
+ + + + +# End configuration section + + + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + + +if [ $# != 2 ]; then + echo "Usage: $0 [options] " + echo " e.g.: $0 exp/chaina/tdnn1a_sp/egs exp/chaina/tdnn1a_sp" + echo "" + echo " TODO: more documentation" + exit 1 +fi + +egs_dir=$1 +dir=$2 + +set -e -u # die on failed command or undefined variable + +steps/chaina/validate_randomized_egs.sh $egs_dir + +for f in $dir/init/info.txt $dir/init/bottom.raw; do + if [ ! -f $f ]; then + echo "$0: expected file $f to exist" + exit 1 + fi +done + + +frame_subsampling_factor=$(awk '/^frame_subsampling_factor/ {print $2}' <$dir/init/info.txt) +bottom_subsampling_factor=$(awk '/^bottom_subsampling_factor/ {print $2}' <$dir/init/info.txt) + +if ! [ $[frame_subsampling_factor%bottom_subsampling_factor] == 0 ]; then + echo "$0: bad subsampling factors in $dir/init/info.txt" + exit 1 +fi + +num_scp_files=$(awk '/^num_scp_files/ {print $2}' <$egs_dir/info.txt) + +steps/chaina/internal/get_train_schedule.py \ + --frame-subsampling-factor=$frame_subsampling_factor \ + --num-jobs-initial=$num_jobs_initial \ + --num-jobs-final=$num_jobs_final \ + --num-epochs=$num_epochs \ + --dropout-schedule="$dropout_schedule" \ + --num-scp-files=$num_scp_files \ + --frame-subsampling-factor=$frame_subsampling_factor \ + --initial-effective-lrate=$initial_effective_lrate \ + --final-effective-lrate=$final_effective_lrate \ + --schedule-out=$dir/schedule.txt + + + +if [ "$use_gpu" != "no" ]; then gpu_cmd_opt="--gpu 1"; else gpu_cmd_opt=""; fi + +num_iters=$(wc -l <$dir/schedule.txt) + +echo "$0: will train for $num_epochs epochs = $num_iters iterations" + +# source the 1st line of schedule.txt in the shell; this sets +# lrate and dropout_opt, among other variables. +. <(head -n 1 $dir/schedule.txt) +langs=$(awk '/^langs/ { $1=""; print; }' <$dir/init/info.txt) + +mkdir -p $dir/log + +# Copy models with initial learning rate and dropout options from $dir/init to $dir/0 +mkdir -p $dir/0 +run.pl $dir/log/init_bottom_model.log \ + nnet3-copy --learning-rate=$lrate $dropout_opt $dir/init/bottom.raw $dir/0/bottom.raw +for lang in $langs; do + run.pl $dir/log/init_model_$lang.log \ + nnet3-am-copy --learning-rate=$lrate $dropout_opt $dir/init/$lang.mdl $dir/0/$lang.mdl +done + + +x=0 +if [ $stage -gt $x ]; then x=$stage; fi + +while [ $x -lt $num_iters ]; do + # Source some variables fromm schedule.txt. The effect will be something + # like the following: + # iter=0; num_jobs=2; inv_num_jobs=0.5; scp_indexes=(pad 1 2); frame_shifts=(pad 1 2); dropout_opt="--edits='set-dropout-proportion name=* proportion=0.0'" lrate=0.002 + . <(grep "^iter=$x;" $dir/schedule.txt) + + echo "$0: training, iteration $x, num-jobs is $num_jobs" + + next_x=$[$x+1] + model_in_dir=$dir/$x + if [ ! -f $model_in_dir/bottom.raw ]; then + echo "$0: expected $model_in_dir/bottom.raw to exist" + exit 1 + fi + den_fst_dir=$egs_dir/misc + transform_dir=$dir/init + model_out_dir=$dir/${next_x} + + + # for the first 4 iterations, plus every $diagnostic_period iterations, launch + # some diagnostic processes. 
We don't do this on iteration 0, because + # the batchnorm stats wouldn't be ready + if [ $x -gt 0 ] && [ $[x%diagnostic_period] -eq 0 -o $x -lt 5 ]; then + + [ -f $dir/$x/.error_diagnostic ] && rm $dir/$x/.error_diagnostic + for name in train heldout; do + $cmd $gpu_cmd_opt $dir/log/diagnostic_${name}.$x.log \ + nnet3-chaina-train --use-gpu=$use_gpu \ + --bottom.train=false --bottom.dropout-test-mode=true \ + --top.train=false --top.dropout-test-mode=true \ + --leaky-hmm-coefficient=$leaky_hmm_coefficient \ + --bottom-subsampling-factor=$bottom_subsampling_factor \ + --xent-regularize=$xent_regularize \ + --print-interval=10 \ + $model_in_dir $den_fst_dir $transform_dir \ + "ark:nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch scp:$egs_dir/${name}_subset.scp ark:-|" \ + || touch $dir/$x/.error_diagnostic & + done + fi + + if [ -d $dir/$next_x ]; then + echo "$0: removing previous contents of $dir/$next_x" + rm -r $dir/$next_x + fi + mkdir -p $dir/$next_x + + for j in $(seq $num_jobs); do + scp_index=${scp_indexes[$j]} + frame_shift=${frame_shifts[$j]} + + $cmd $gpu_cmd_opt $dir/log/train.$x.$j.log \ + nnet3-chaina-train --job-id=$j --use-gpu=$use_gpu --apply-deriv-weights=$apply_deriv_weights \ + --leaky-hmm-coefficient=$leaky_hmm_coefficient --xent-regularize=$xent_regularize \ + --bottom-subsampling-factor=$bottom_subsampling_factor \ + --top.unadapted-weight=$top_unadapted_weight --bottom.unadapted-weight=$bottom_unadapted_weight \ + --print-interval=10 --max-param-change=$max_param_change \ + --l2-regularize-factor=$inv_num_jobs --optimization.memory-compression-level=$memory_compression_level \ + $model_in_dir $den_fst_dir $transform_dir \ + "ark:nnet3-chain-copy-egs --frame-shift=$frame_shift scp:$egs_dir/train.$scp_index.scp ark:- | nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:- | nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch ark:- ark:-|" \ + $model_out_dir || touch $dir/$next_x/.error & + done + wait + if [ -f $dir/$next_x/.error ]; then + echo "$0: error detected training on iteration $x" + exit 1 + fi + # First average the bottom models + models=$(for j in $(seq $num_jobs); do echo $dir/$next_x/bottom.$j.raw; done) + run.pl $dir/log/average.$x.log \ + nnet3-average $models - \| \ + nnet3-copy --learning-rate=$lrate $dropout_opt - $dir/$next_x/bottom.raw + rm $models + for lang in $langs; do + models=$dir/$next_x/$lang.*.raw + run.pl $dir/log/average_${lang}.$x.log \ + nnet3-average $models - \| \ + nnet3-am-copy --set-raw-nnet=- --learning-rate=$lrate $dropout_opt $dir/$iter/$lang.mdl $dir/$next_x/$lang.mdl + rm $models + done + wait + [ -f $dir/$x/.error_diagnostic ] && echo "$0: error getting diagnostics on iter $x" && exit 1; + + $cmd $dir/log/progress_bottom.$x.log \ + nnet3-show-progress $dir/$x/bottom.raw $dir/$next_x/bottom.raw '&&' \ + nnet3-info $dir/$next_x/bottom.raw || touch $dir/$next_x/.error & + for lang in $langs; do + $cmd $dir/log/progress_${lang}.$x.log \ + nnet3-show-progress $dir/$x/$lang.mdl $dir/$next_x/$lang.mdl '&&' \ + nnet3-am-info $dir/$next_x/$lang.mdl || touch $dir/$next_x/.error & + done + [ -f $dir/$next_x/.error ] && echo "$0: error getting progress logs" && exit 1; + + # TODO: cleanup + x=$[x+1] +done + + +if [ $stage -le $num_iters ]; then + echo "$0: doing model combination" + if [ -d $dir/final ]; then + echo "$0: removing previous contents of $dir/final" + rm -r $dir/final + fi + mkdir -p $dir/final + den_fst_dir=$egs_dir/misc + + [ $max_models_combine -gt $[num_iters/2] ] && 
max_models_combine=$[num_iters/2]; + input_model_dirs=$(for x in $(seq $[num_iters+1-max_models_combine] $num_iters); do echo $dir/$x; done) + output_model_dir=$dir/final + transform_dir=$dir/init + + $cmd $gpu_cmd_opt $dir/log/combine.log \ + nnet3-chaina-combine --use-gpu=$use_gpu \ + --leaky-hmm-coefficient=$leaky_hmm_coefficient \ + --bottom-subsampling-factor=$bottom_subsampling_factor \ + --print-interval=10 \ + $input_model_dirs $den_fst_dir $transform_dir \ + "ark:nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch scp:$egs_dir/train_subset.scp ark:-|" \ + $dir/final +fi + + +if [ $stage -le $[num_iters+1] ]; then + # Now accumulate the class-dependent mean (and variance) stats of the + # adaptation model, which will be needed for decoding. We remove the map that + # had reduced the num-classes from several thousand to (e.g.) 200, because we + # are now estimating the means on a larger set of data and we're not concerned + # about noisy estimates. + mkdir -p $dir/transforms_unmapped + # Note: the plan was to add the option --remove-pdf-map=true to the 'copy' + # command below (to use the full number of pdf-ids as classes in test time), + # but it seemed to degrade the objective function, based on diagnostics. + # We'll look into this later. + for lang in $langs; do + run.pl $dir/log/copy_transform_${lang}.log \ + nnet3-adapt copy $dir/init/${lang}.ada $dir/transforms_unmapped/${lang}.ada + done + den_fst_dir=$egs_dir/misc + transform_dir=$dir/init + + num_jobs=$num_scp_files + [ $num_jobs -gt 4 ] && num_jobs=4 # there are so few params to estimate that + # more than 4 jobs would be a waste. + + $cmd $gpu_cmd_opt JOB=1:$num_jobs $dir/log/acc_target_model.JOB.log \ + nnet3-chaina-train --job-id=JOB --use-gpu=$use_gpu \ + --bottom-subsampling-factor=$bottom_subsampling_factor \ + --print-interval=10 \ + --bottom.train=false --bottom.dropout-test-mode=true --bottom.batchnorm-test-mode=true \ + --top.train=false --top.dropout-test-mode=true --top.batchnorm-test-mode=true \ + --adaptation-model-accumulate=true \ + $dir/final $den_fst_dir $dir/transforms_unmapped \ + "ark:nnet3-chain-shuffle-egs --buffer-size=$shuffle_buffer_size scp:$egs_dir/train.JOB.scp ark:- | nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch ark:- ark:-|" \ + $dir/final + + for lang in $langs; do + stats=$dir/final/${lang}.*.ada + run.pl $dir/log/estimate_target_model_${lang}.log \ + nnet3-adapt estimate $stats $dir/final/${lang}.ada + rm $stats + done +fi + +if [ $stage -le $[num_iters+2] ]; then + # Accumulate some final diagnostics. The difference with the last iteration's + # diagnostics is that we use test-mode for the adaptation model (i.e. a target + # model computed from all the data, not just one minibatch). 
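+  # Concretely, the diagnostic command below passes --adaptation-test-mode=true and
+  # uses $dir/final (which now contains the estimated per-language .ada files) as the
+  # transform directory, instead of $dir/init as in the per-iteration diagnostics.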
+ [ -f $dir/final/.error_diagnostic ] && rm $dir/final/.error_diagnostic + for name in train heldout; do + den_fst_dir=$egs_dir/misc + $cmd $gpu_cmd_opt $dir/log/diagnostic_${name}.final.log \ + nnet3-chaina-train --use-gpu=$use_gpu \ + --bottom-subsampling-factor=$bottom_subsampling_factor \ + --bottom.train=false --bottom.dropout-test-mode=true \ + --top.train=false --top.dropout-test-mode=true \ + --adaptation-test-mode=true \ + --leaky-hmm-coefficient=$leaky_hmm_coefficient \ + --xent-regularize=$xent_regularize \ + --print-interval=10 \ + $dir/final $den_fst_dir $dir/final \ + "ark:nnet3-chain-merge-egs --minibatch-size=$groups_per_minibatch scp:$egs_dir/${name}_subset.scp ark:-|" \ + || touch $dir/final/.error_diagnostic & + done + wait + if [ -f $dir/final/.error_diagnostic ]; then + echo "$0: error getting final diagnostic information" + exit 1 + fi + cp $dir/init/info.txt $dir/final/ +fi + + +transform_dir=$dir/init + +echo "$0: done" +exit 0 diff --git a/egs/wsj/s5/steps/chaina/validate_processed_egs.sh b/egs/wsj/s5/steps/chaina/validate_processed_egs.sh new file mode 100755 index 00000000000..d928642dff9 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/validate_processed_egs.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script validates a directory containing 'processed' egs for 'chaina' +# training, i.e. the output of process_egs.sh. It also helps to document the +# expectations on such a directory. + + +if [ -f path.sh ]; then . ./path.sh; fi + + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/chaina/tdnn1a_sp/processed_egs" + echo "" + echo "Validates that the processed-egs dir has the expected format" +fi + +dir=$1 + +# Note: the .ark files are not actually consumed directly downstream (only via +# the top-level .scp files), but we check them anyway for now. +for f in $dir/train.scp $dir/info.txt \ + $dir/heldout_subset.{ark,scp} $dir/train_subset.{ark,scp} \ + $dir/train.1.scp $dir/train.1.ark; do + if ! [ -f $f -a -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi +done + + +if [ $(awk '/^dir_type/ { print $2; }' <$dir/info.txt) != "processed_chaina_egs" ]; then + grep dir_type $dir/info.txt + echo "$0: dir_type should be processed_chaina_egs in $dir/info.txt" + exit 1 +fi + +lang=$(awk '/^lang / {print $2; }' <$dir/info.txt) + +for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do + if ! [ -f $f -a -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi +done + +echo "$0: sucessfully validated processed egs in $dir" diff --git a/egs/wsj/s5/steps/chaina/validate_randomized_egs.sh b/egs/wsj/s5/steps/chaina/validate_randomized_egs.sh new file mode 100755 index 00000000000..1eebc144347 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/validate_randomized_egs.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script validates a directory containing 'randomized' egs for 'chaina' +# training, i.e. the output of randomize_egs.sh (this is the final form of the +# egs which is consumed by the training script). It also helps to document the +# expectations on such a directory. + + +if [ -f path.sh ]; then . 
./path.sh; fi + + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/chaina/tdnn1a_sp/egs" + echo "" + echo "Validates that the final (randomized) egs dir has the expected format" +fi + +dir=$1 + +# Note: the .ark files are not actually consumed directly downstream (only via +# the top-level .scp files), but we check them anyway for now. +for f in $dir/train.1.scp $dir/info.txt \ + $dir/heldout_subset.scp $dir/train_subset.scp; do + if ! [ -f $f -a -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi +done + + +if [ $(awk '/^dir_type/ { print $2; }' <$dir/info.txt) != "randomized_chaina_egs" ]; then + grep dir_type $dir/info.txt + echo "$0: dir_type should be randomized_chaina_egs in $dir/info.txt" + exit 1 +fi + +langs=$(awk '/^langs / {$1 = ""; print; }' <$dir/info.txt) +num_scp_files=$(awk '/^num_scp_files / { print $2; }' <$dir/info.txt) + +if [ -z "$langs" ]; then + echo "$0: expecting the list of languages to be nonempty in $dir/info.txt" + exit 1 +fi + +for lang in $langs; do + for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst} $dir/info_${lang}.txt; do + if ! [ -f $f -a -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi + done +done + +for i in $(seq $num_scp_files); do + if ! [ -s $dir/train.$i.scp ]; then + echo "$0: expected file $dir/train.$i.scp to exist and be nonempty." + exit 1 + fi +done + + +echo "$0: sucessfully validated randomized egs in $dir" diff --git a/egs/wsj/s5/steps/chaina/validate_raw_egs.sh b/egs/wsj/s5/steps/chaina/validate_raw_egs.sh new file mode 100755 index 00000000000..5e15bc0c897 --- /dev/null +++ b/egs/wsj/s5/steps/chaina/validate_raw_egs.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# Copyright 2019 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. +# +# This script validates a directory containing 'raw' egs for 'chaina' training. +# It also helps to document the expectations on such a directory. + + + +if [ -f path.sh ]; then . ./path.sh; fi + + +if [ $# != 1 ]; then + echo "Usage: $0 " + echo " e.g.: $0 exp/chaina/tdnn1a_sp/raw_egs" + echo "" + echo "Validates that the raw-egs dir has the expected format" +fi + +dir=$1 + +for f in $dir/all.scp $dir/cegs.1.ark $dir/info.txt \ + $dir/misc/utt2spk; do + if ! [ -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi +done + + +if [ $(awk '/^dir_type/ { print $2; }' <$dir/info.txt) != "raw_chaina_egs" ]; then + grep dir_type $dir/info.txt + echo "$0: dir_type should be raw_chaina_egs in $dir/info.txt" + exit 1 +fi + +lang=$(awk '/^lang / {print $2; }' <$dir/info.txt) + +for f in $dir/misc/$lang.{trans_mdl,normalization.fst,den.fst}; do + if ! [ -s $f ]; then + echo "$0: expected file $f to exist and be nonempty." + exit 1 + fi +done + +echo "$0: sucessfully validated raw egs in $dir" diff --git a/egs/wsj/s5/steps/diagnostic/analyze_lats.sh b/egs/wsj/s5/steps/diagnostic/analyze_lats.sh index df1a6d64801..6b6091e8684 100755 --- a/egs/wsj/s5/steps/diagnostic/analyze_lats.sh +++ b/egs/wsj/s5/steps/diagnostic/analyze_lats.sh @@ -9,6 +9,7 @@ # begin configuration section. iter=final cmd=run.pl +model= acwt=0.1 #end configuration section. @@ -22,6 +23,10 @@ if [ $# -ne 2 ]; then echo " Options:" echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." echo " --acwt # Acoustic scale for getting best-path (default: 0.1)" + echo " --iter # default: final; affects model location if --model" + echo " # not specified." 
+ echo " --model # Name of .mdl file (if not specified, defaults" + echo " # to /../.mdl if not specified." echo "e.g.:" echo "$0 data/lang exp/tri4b/decode_dev" echo "This script writes some diagnostics to /log/alignments.log" @@ -31,7 +36,9 @@ fi lang=$1 dir=$2 -model=$dir/../${iter}.mdl +if [ -z $model ]; then + model=$dir/../${iter}.mdl +fi for f in $lang/words.txt $model $dir/lat.1.gz $dir/num_jobs; do [ ! -f $f ] && echo "$0: expecting file $f to exist" && exit 1; diff --git a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py index 0ad93e5977d..d890f8007e6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/dropout_schedule.py @@ -186,9 +186,22 @@ def _get_component_dropout(dropout_schedule, data_fraction): def _get_dropout_proportions(dropout_schedule, data_fraction): """Returns dropout proportions based on the dropout_schedule for the - fraction of data seen at this stage of training. + fraction of data seen at this stage of training. Returns a list of + pairs (pattern, dropout_proportion); for instance, it might return + the list ['*', 0.625] meaning a dropout proportion of 0.625 is to + be applied to all dropout components. + Returns None if dropout_schedule is None. + dropout_schedule might be (in the sample case using the default pattern of + '*'): '0.1,0.5@0.5,0.1', meaning a piecewise linear function that starts at + 0.1 when data_fraction=0.0, rises to 0.5 when data_fraction=0.5, and falls + again to 0.1 when data_fraction=1.0. It can also contain space-separated + items of the form 'pattern=schedule', for instance: + '*=0.0,0.5,0.0 lstm.*=0.0,0.3@0.75,0.0' + The more specific patterns should go later, otherwise they will be overridden + by the less specific patterns' commands. + Calls _get_component_dropout() for the different component name patterns in dropout_schedule. @@ -198,6 +211,7 @@ def _get_dropout_proportions(dropout_schedule, data_fraction): See _self_test() for examples. data_fraction: The fraction of data seen until this stage of training. + """ if dropout_schedule is None: return None @@ -210,14 +224,21 @@ def _get_dropout_proportions(dropout_schedule, data_fraction): return dropout_proportions -def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): - """Return an nnet3-copy --edits line to modify raw_model_string to - set dropout proportions according to dropout_proportions. + +def get_dropout_edit_option(dropout_schedule, data_fraction, iter_): + """Return an option to be passed to nnet3-copy (or nnet3-am-copy) + that will set the appropriate dropout proportion. If no dropout + is being used (dropout_schedule is None), returns the empty + string, otherwise returns something like + "--edits='set-dropout-proportion name=* proportion=0.625'" Arguments: dropout_schedule: Value for the --trainer.dropout-schedule option. See help for --trainer.dropout-schedule. See _self_test() for examples. + data_fraction: real number in [0,1] that says how far along + in training we are. + iter_: iteration number (needed for debug printing only) See ReadEditConfig() in nnet3/nnet-utils.h to see how set-dropout-proportion directive works. 
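[Editor's aside, not part of the patch: the short sketch below illustrates the piecewise-linear interpolation that a single-pattern schedule string such as '0.1,0.5@0.5,0.1' describes. It is not the parser this module actually uses; the function name dropout_at is made up, and it assumes every intermediate value carries an explicit '@data_fraction'.]

def dropout_at(schedule, data_fraction):
    # First value applies at data_fraction=0.0, last at 1.0; intermediate
    # entries look like 'value@fraction'.
    entries = schedule.split(',')
    points = [(0.0, float(entries[0]))]
    for entry in entries[1:-1]:
        value, frac = entry.split('@')
        points.append((float(frac), float(value)))
    points.append((1.0, float(entries[-1])))
    # Find the segment containing data_fraction and interpolate linearly.
    for (x0, y0), (x1, y1) in zip(points, points[1:]):
        if x0 <= data_fraction <= x1:
            return y0 + (y1 - y0) * (data_fraction - x0) / (x1 - x0)
    raise ValueError("data_fraction must be in [0, 1]")

# For example, dropout_at('0.1,0.5@0.5,0.1', 0.75) returns 0.3.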
@@ -241,9 +262,39 @@ def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): if _debug_dropout: logger.info("On iteration %d, %s", iter_, ', '.join(dropout_info)) - return ("""nnet3-copy --edits='{edits}' - - |""".format( - edits=";".join(edit_config_lines))) + return "--edits='{0}'".format(";".join(edit_config_lines)) + + +def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): + """Return an nnet3-copy --edits line to modify raw_model_string to + set dropout proportions according to dropout_proportions. + E.g. if _dropout_proportions(dropout_schedule, data_fraction) + returns [('*', 0.625)], this will return the string: + "nnet3-copy --edits='set-dropout-proportion name=* proportion=0.625'" + + This is a wrapper of the function get_dropout_edit_option which + gets the --edits option; this function just adds the nnet3-copy + and its arguments. + + Arguments: + dropout_schedule: Value for the --trainer.dropout-schedule option. + See help for --trainer.dropout-schedule. + See _self_test() for examples. + data_fraction: real number in [0,1] that says how far along + in training we are. + iter_: iteration number (needed for debug printing only) + + See ReadEditConfig() in nnet3/nnet-utils.h to see how + set-dropout-proportion directive works. + """ + + edit_option = get_dropout_edit_option(dropout_schedule, data_fraction, iter_) + + if edit_option == "": + return "" + else: + return ("nnet3-copy {0} - - |".format(edit_option)) def _self_test(): """Run self-test. diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 5ac2ed59003..b540423e3cd 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -27,6 +27,7 @@ 'relu-batchnorm-layer' : xlayers.XconfigBasicLayer, 'relu-batchnorm-so-layer' : xlayers.XconfigBasicLayer, 'batchnorm-so-relu-layer' : xlayers.XconfigBasicLayer, + 'batchnorm-layer' : xlayers.XconfigBasicLayer, 'sigmoid-layer' : xlayers.XconfigBasicLayer, 'tanh-layer' : xlayers.XconfigBasicLayer, 'fixed-affine-layer' : xlayers.XconfigFixedAffineLayer, diff --git a/egs/wsj/s5/steps/nnet3/chain/align_lats.sh b/egs/wsj/s5/steps/nnet3/chain/align_lats.sh new file mode 100755 index 00000000000..ed10735245d --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/chain/align_lats.sh @@ -0,0 +1,149 @@ +#!/bin/bash +# Copyright 2012 Brno University of Technology (Author: Karel Vesely) +# 2013 Johns Hopkins University (Author: Daniel Povey) +# 2015 Vijayaditya Peddinti +# 2016 Vimal Manohar +# 2017 Pegah Ghahremani +# Apache 2.0 + +# Computes training alignments using nnet3 DNN, with output to lattices. + +# Begin configuration section. +nj=4 +cmd=run.pl +stage=-1 +# Begin configuration. +scale_opts="--transition-scale=1.0 --self-loop-scale=1.0" +acoustic_scale=1.0 +post_decode_acwt=10.0 +beam=20 +iter=final +frames_per_chunk=50 +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 +online_ivector_dir= +graphs_scp= +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: $0 " + echo "e.g.: $0 data/train data/lang exp/nnet4 exp/nnet4_ali" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 
+ exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +mkdir -p $dir/log +echo $nj > $dir/num_jobs +sdata=$data/split${nj} +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || \ + split_data.sh $data $nj || exit 1; + +extra_files= +if [ ! -z "$online_ivector_dir" ]; then + steps/nnet2/check_ivectors_compatible.sh $srcdir $online_ivector_dir || exit 1 + extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" +fi + +for f in $srcdir/tree $srcdir/${iter}.mdl $data/feats.scp $lang/L.fst $extra_files; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +cp $srcdir/{tree,${iter}.mdl} $dir || exit 1; + +utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; +## Set up features. Note: these are different from the normal features +## because we have one rspecifier that has the features for the entire +## training set, not separate ones for each batch. +echo "$0: feature type is raw" + +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null + +feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" + +ivector_opts= +if [ ! -z "$online_ivector_dir" ]; then + ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; + ivector_opts="--online-ivectors=scp:$online_ivector_dir/ivector_online.scp --online-ivector-period=$ivector_period" +fi + +echo "$0: aligning data in $data using model from $srcdir, putting alignments in $dir" + +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor) + frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" + cp $srcdir/frame_subsampling_factor $dir + if [ "$frame_subsampling_factor" -gt 1 ] && \ + [ "$scale_opts" == "--transition-scale=1.0 --self-loop-scale=0.1" ]; then + echo "$0: frame-subsampling-factor is not 1 (so likely a chain system)," + echo "... but the scale opts are the defaults. You probably want" + echo "--scale-opts '--transition-scale=1.0 --self-loop-scale=1.0'" + sleep 1 + fi +fi + +if [ ! -z "$graphs_scp" ]; then + if [ ! -f $graphs_scp ]; then + echo "Could not find graphs $graphs_scp" && exit 1 + fi + tra="scp:utils/filter_scp.pl $sdata/JOB/utt2spk $graphs_scp |" + prog=compile-train-graphs-fsts +else + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + prog=compile-train-graphs +fi + +if [ $stage -le 0 ]; then + ## because nnet3-latgen-faster doesn't support adding the transition-probs to the + ## graph itself, we need to bake them into the compiled graphs. This means we can't reuse previously compiled graphs, + ## because the other scripts write them without transition probs. + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + $prog --read-disambig-syms=$lang/phones/disambig.int \ + $scale_opts \ + $dir/tree $srcdir/${iter}.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1 +fi + +if [ $stage -le 1 ]; then + # Warning: nnet3-latgen-faster doesn't support a retry-beam so you may get more + # alignment errors (however, it does have a default min-active=200 so this + # will tend to reduce alignment errors). + # --allow_partial=false makes sure we reach the end of the decoding graph. 
+ # --word-determinize=false makes sure we retain the alternative pronunciations of + # words (including alternatives regarding optional silences). + # --lattice-beam=$beam keeps all the alternatives that were within the beam, + # it means we do no pruning of the lattice (lattices from a training transcription + # will be small anyway). + $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \ + nnet3-latgen-faster --acoustic-scale=$acoustic_scale $ivector_opts $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --beam=$beam --lattice-beam=$beam \ + --allow-partial=false --word-determinize=false \ + $srcdir/${iter}.mdl "ark:gunzip -c $dir/fsts.JOB.gz |" \ + "$feats" "ark:|lattice-copy --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" || exit 1; +fi + +echo "$0: done generating lattices from training transcripts." diff --git a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh index 757963f13a7..6fcbc472412 100755 --- a/egs/wsj/s5/steps/nnet3/chain/build_tree.sh +++ b/egs/wsj/s5/steps/nnet3/chain/build_tree.sh @@ -27,6 +27,11 @@ leftmost_questions_truncate=-1 # note: this option is deprecated and has no eff tree_stats_opts= cluster_phones_opts= repeat_frames=false +num_clusters= # e.g. 200; can be used if you want a 2-level tree, and + # in that case the file tree.map will be output, which + # maps from the leaves to (effectively) clusters of + # leaves. We'll also output the file num_clusters which is + # the number of these clusters (normally == the option). # End configuration section. echo "$0 $@" # Print the command line for logging @@ -58,6 +63,13 @@ if [ $# != 5 ]; then echo " --frame-subsampling-factor # Factor (e.g. 3) controlling frame subsampling" echo " # at the neural net output, so the frame rate at" echo " # the output is less than at the input." + echo " --alignment-subsampling-factor # Factor controlling subsampling of the input alignment." + echo " # By default it equal to the frame-subsampling-factor," + echo " # but (e.g.) if you use a low-frame-rate system to" + echo " # generate alignments, you might want to set this to 1." + echo " --num-clusters # Default: none. E.g. 200; can be used if you want" + echo " # a 2-level tree. Used in 'chaina' setup. The file" + echo " # tree.map will be output in this case." exit 1; fi @@ -168,11 +180,28 @@ if [ $stage -le -3 ] && $train_tree; then compile-questions $context_opts $lang/topo \ $dir/questions.int $dir/questions.qst || exit 1; - echo "$0: Building the tree" - $cmd $dir/log/build_tree.log \ - build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ - --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ - $dir/questions.qst $lang/topo $dir/tree || exit 1; + if [ -z "$num_clusters" ]; then + # normal case: single tree. + echo "$0: Building the tree" + $cmd $dir/log/build_tree.log \ + build-tree $context_opts --verbose=1 --max-leaves=$numleaves \ + --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree || exit 1; + else + if ! 
[ $num_clusters -lt $numleaves ]; then + echo "$0: --num-clusters=$num_clusters must be less than num-leaves=$numleaves" + exit 1; + fi + $cmd $dir/log/build_tree.log \ + build-tree-two-level $context_opts --verbose=1 \ + --max-leaves-first=$num_clusters --max-leaves-second=$numleaves \ + $dir/treeacc $lang/phones/roots.int \ + $dir/questions.qst $lang/topo $dir/tree \ + "|copy-int-vector --binary=false - $dir/tree.map" || exit 1; + num_clusters_effective=$(cat $dir/tree.map awk '{nc=0; for(n=2;n=nc) nc=1+$n; }END{print nc}') + echo $num_clusters_effective >$dir/num_clusters + echo "$0: you requested --num-clusters=$num_clusters, you got 2nd-level tree num-leaves=$num_clusters_effective" + fi fi if [ $stage -le -2 ]; then diff --git a/egs/wsj/s5/steps/nnet3/compute_output.sh b/egs/wsj/s5/steps/nnet3/compute_output.sh index e55f705043b..1f61e97876e 100755 --- a/egs/wsj/s5/steps/nnet3/compute_output.sh +++ b/egs/wsj/s5/steps/nnet3/compute_output.sh @@ -35,6 +35,7 @@ if [ $# -ne 3 ]; then echo "e.g.: steps/nnet3/compute_output.sh --nj 8 \\" echo "--online-ivector-dir exp/nnet3/ivectors_test_eval92 \\" echo " data/test_eval92_hires exp/nnet3/tdnn exp/nnet3/tdnn/output" + echo "Output will be in /output.scp" echo "main options (for others, see top of script file)" echo " --config # config containing options" echo " --nj # number of parallel jobs" diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 14dda2bd457..adf686fa10e 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -19,7 +19,7 @@ min_active=200 ivector_scale=1.0 lattice_beam=8.0 # Beam we use in lattice generation. iter=final -num_threads=1 # if >1, will use gmm-latgen-faster-parallel +num_threads=1 # if >1, will use nnet3-latgen-faster-parallel use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. # In that case it is recommended to set num-threads to a large # number, e.g. 20 if you have that many free CPU slots on a GPU diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_config.py b/egs/wsj/s5/steps/nnet3/xconfig_to_config.py new file mode 100755 index 00000000000..e234ea732d4 --- /dev/null +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_config.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 + +# Copyright 2016-2018 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2017 Google Inc. (vpeddinti@google.com) +# Apache 2.0. + +# we're using python 3.x style print but want it to work in python 2.x, + +import argparse +import os +import sys +from collections import defaultdict + +sys.path.insert(0, 'steps/') +# the following is in case we weren't running this from the normal directory. +sys.path.insert(0, os.path.realpath(os.path.dirname(sys.argv[0])) + '/') + +import libs.nnet3.xconfig.parser as xparser +import libs.common as common_lib + + +def get_args(): + # we add compulsory arguments as named arguments for readability + parser = argparse.ArgumentParser( + description="Reads an xconfig file and creates config files " + "for neural net creation and training", + epilog='Search egs/*/*/local/{nnet3,chain}/*sh for examples') + parser.add_argument('--xconfig-file', required=True, + help='Filename of input xconfig file') + parser.add_argument('--existing-model', + help='Filename of previously trained neural net ' + '(e.g. final.mdl) which is useful in case of ' + 'using nodes from list of component-nodes in ' + 'already trained model ' + 'to generate new config file for new model.' 
+ 'The context info is also generated using ' + 'a model generated by adding final.config ' + 'to the existing model.' + 'e.g. In Transfer learning: generate new model using ' + 'component nodes in existing model.') + parser.add_argument('--config-file-out', required=True, + help='Filename to write nnet config file.'); + parser.add_argument('--nnet-edits', type=str, default=None, + action=common_lib.NullstrToNoneAction, + help="""This option is useful in case the network you + are creating does not have an output node called + 'output' (e.g. for multilingual setups). You can set + this to an edit-string like: 'rename-node old-name=xxx + new-name=output' if node xxx plays the role of the + output node in this network. This is only used for + computing the left/right context.""") + + print(' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + + return args + + + +def write_config_file(config_file_out, all_layers): + # config_basename_to_lines is map from the basename of the + # config, as a string (i.e. 'ref', 'all', 'init') to a list of + # strings representing lines to put in the config file. + config_basename_to_lines = defaultdict(list) + + for layer in all_layers: + try: + pairs = layer.get_full_config() + for config_basename, line in pairs: + config_basename_to_lines[config_basename].append(line) + except Exception as e: + print("{0}: error producing config lines from xconfig " + "line '{1}': error was: {2}".format(sys.argv[0], + str(layer), repr(e)), + file=sys.stderr) + # we use raise rather than raise(e) as using a blank raise + # preserves the backtrace + raise + + with open(config_file_out, 'w') as f: + print('# This file was created by the command:\n' + '# {0} '.format(sys.argv), file=f) + lines = config_basename_to_lines['final'] + for line in lines: + print(line, file=f) + + +def main(): + args = get_args() + existing_layers = [] + if args.existing_model is not None: + existing_layers = xparser.get_model_component_info(args.existing_model) + all_layers = xparser.read_xconfig_file(args.xconfig_file, existing_layers) + write_config_file(args.config_file_out, all_layers) + + +if __name__ == '__main__': + main() + + +# test: +# (echo 'input dim=40 name=input'; echo 'output name=output input=Append(-1,0,1)') >xconfig; steps/nnet3/xconfig_to_config.py --xconfig-file=xconfig --config-file-out=foo diff --git a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py index f025eb5b343..4d96ef5db43 100755 --- a/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py +++ b/egs/wsj/s5/steps/nnet3/xconfig_to_configs.py @@ -39,8 +39,13 @@ def get_args(): 'to the existing model.' 'e.g. In Transfer learning: generate new model using ' 'component nodes in existing model.') - parser.add_argument('--config-dir', required=True, - help='Directory to write config files and variables') + parser.add_argument('--config-dir', required=False, + help='Directory to write config files and variables; either ' + 'this or --config-out must be specified.') + parser.add_argument('--config-out', required=False, + help='Filename to write nnet config file. This is the ' + 'simplified interface that does not support lda-layer. 
' + 'Either this or --config-dir must be supplied.') parser.add_argument('--nnet-edits', type=str, default=None, action=common_lib.NullstrToNoneAction, help="""This option is useful in case the network you @@ -141,7 +146,7 @@ def write_expanded_xconfig_files(config_dir, all_layers): def get_config_headers(): """ This function returns a map from config-file basename - e.g. 'init', 'ref', 'layer1' to a documentation string that goes + e.g. 'init', 'ref', 'final' to a documentation string that goes at the top of the file. """ # resulting dict will default to the empty string for any config files not @@ -230,6 +235,41 @@ def write_config_files(config_dir, all_layers): raise +# This is an alternative to 'write_config_files' where a single output +# file is desired (would correspond to the output 'final.config' in the +# normal setup). In this case, things like LDA and presoftmax are not +# supported. +def write_single_config_file(config_file_out, all_layers): + # config_basename_to_lines is map from the basename of the + # config, as a string (i.e. 'ref', 'all', 'init') to a list of + # strings representing lines to put in the config file. + config_basename_to_lines = defaultdict(list) + + config_basename_to_header = get_config_headers() + + for layer in all_layers: + try: + pairs = layer.get_full_config() + for config_basename, line in pairs: + config_basename_to_lines[config_basename].append(line) + except Exception as e: + print("{0}: error producing config lines from xconfig " + "line '{1}': error was: {2}".format(sys.argv[0], + str(layer), repr(e)), + file=sys.stderr) + # we use raise rather than raise(e) as using a blank raise + # preserves the backtrace + raise + + + with open(config_file_out, 'w') as f: + header = config_basename_to_header['final'] + print(header, file=f) + lines = config_basename_to_lines['final'] + for line in lines: + print(line, file=f) + + def add_nnet_context_info(config_dir, nnet_edits=None, existing_model=None): """Create the 'vars' file that specifies model_left_context, etc.""" diff --git a/src/Makefile b/src/Makefile index 1b37ebce745..737a26338ca 100644 --- a/src/Makefile +++ b/src/Makefile @@ -6,16 +6,16 @@ SHELL := /bin/bash SUBDIRS = base matrix util feat tree gmm transform \ - fstext hmm lm decoder lat kws cudamatrix nnet \ + fstext hmm lm decoder lat kws cudamatrix adapt nnet \ bin fstbin gmmbin fgmmbin featbin \ - nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 rnnlm chain nnet3bin nnet2bin kwsbin \ - ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin + nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 nnet3a rnnlm chain nnet3bin nnet2bin kwsbin \ + ivector ivectorbin online2 online2bin lmbin chainbin rnnlmbin nnet3abin MEMTESTDIRS = base matrix util feat tree gmm transform \ - fstext hmm lm decoder lat nnet kws chain \ + fstext hmm lm decoder lat nnet kws chain nnet3a \ bin fstbin gmmbin fgmmbin featbin \ nnetbin latbin sgmm2 nnet2 nnet3 rnnlm nnet2bin nnet3bin sgmm2bin kwsbin \ - ivector ivectorbin online2 online2bin lmbin + ivector ivectorbin online2 online2bin lmbin nnet3abin CUDAMEMTESTDIR = cudamatrix @@ -150,7 +150,7 @@ $(EXT_SUBDIRS) : mklibdir ext_depend ### Dependency list ### # this is necessary for correct parallel compilation #1)The tools depend on all the libraries -bin fstbin gmmbin fgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin: \ +bin fstbin gmmbin fgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin rnnlmbin nnet3abin: \ base 
matrix util feat tree gmm transform sgmm2 fstext hmm \ lm decoder lat cudamatrix nnet nnet2 nnet3 ivector chain kws online2 rnnlm @@ -169,9 +169,11 @@ lm: base util matrix fstext decoder: base util matrix gmm hmm tree transform lat lat: base util hmm tree matrix cudamatrix: base util matrix +adapt: base util matrix hmm cudamatrix nnet: base util hmm tree matrix cudamatrix nnet2: base util matrix lat gmm hmm tree transform cudamatrix nnet3: base util matrix lat gmm hmm tree transform cudamatrix chain fstext +nnet3a: base util matrix lat gmm hmm tree transform cudamatrix adapt nnet3 chain fstext rnnlm: base util matrix cudamatrix nnet3 lm hmm chain: lat hmm tree fstext matrix cudamatrix util base ivector: base util matrix transform tree gmm diff --git a/src/adapt/Makefile b/src/adapt/Makefile new file mode 100644 index 00000000000..25c016b4e6d --- /dev/null +++ b/src/adapt/Makefile @@ -0,0 +1,19 @@ +all: + +include ../kaldi.mk + +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + +TESTFILES = differentiable-fmllr-test differentiable-transform-test + +OBJFILES = differentiable-fmllr.o differentiable-transform-itf.o \ + generic-transform.o differentiable-transform.o + +LIBNAME = kaldi-adapt + +ADDLIBS = ../cudamatrix/kaldi-cudamatrix.a ../hmm/kaldi-hmm.a \ + ../matrix/kaldi-matrix.a ../util/kaldi-util.a \ + ../base/kaldi-base.a + +include ../makefiles/default_rules.mk diff --git a/src/adapt/differentiable-fmllr-test.cc b/src/adapt/differentiable-fmllr-test.cc new file mode 100644 index 00000000000..86f3b924418 --- /dev/null +++ b/src/adapt/differentiable-fmllr-test.cc @@ -0,0 +1,639 @@ +// adapt/differentiable-fmllr-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "adapt/differentiable-fmllr.h" +#include "matrix/sp-matrix.h" + +namespace kaldi { +namespace differentiable_transform { + + + +// Test derivatives produced by the Estimator object for K. +void TestCoreFmllrEstimatorKDeriv( + BaseFloat gamma, + const Matrix &G, + const Matrix &K, + const Matrix &A, + CoreFmllrEstimator *estimator) { + + int32 num_directions = 4; + Vector expected_changes(num_directions), + actual_changes(num_directions); + + int32 dim = G.NumRows(); + BaseFloat epsilon = 1.0e-03 * gamma; + Matrix A_deriv(dim, dim); + // A_deriv defines the objective function: a random linear function in A. + A_deriv.SetRandn(); + A_deriv.Add(0.1); // Introduce some asymmetry. 
+ + Matrix G_deriv(dim, dim), + K_deriv(dim, dim); + estimator->Backward(A_deriv, &G_deriv, &K_deriv); + + for (int32 i = 0; i < num_directions; i++) { + Matrix K_new(dim, dim); + K_new.SetRandn(); + K_new.Scale(epsilon); + expected_changes(i) = TraceMatMat(K_new, K_deriv, kTrans); + K_new.AddMat(1.0, K); + FmllrEstimatorOptions opts; + Matrix A_new(dim, dim); + CoreFmllrEstimator estimator2(opts, gamma, G, K_new, &A_new); + estimator2.Forward(); + A_new.AddMat(-1.0, A); + // compute the change in our random linear objective function defined by + // A_deriv, that would be produced by taking some small random change in K + // and computing the A that results from that. + actual_changes(i) = TraceMatMat(A_new, A_deriv, kTrans); + } + + KALDI_LOG << "Expected changes: " << expected_changes + << ", actual changes: " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + +// Test derivatives produced by the Estimator object for G. +void TestCoreFmllrEstimatorGDeriv( + BaseFloat gamma, + const Matrix &G, + const Matrix &K, + const Matrix &A, + CoreFmllrEstimator *estimator) { + + int32 num_directions = 4; + Vector expected_changes(num_directions), + actual_changes(num_directions); + + int32 dim = G.NumRows(); + BaseFloat epsilon = 1.0e-03 * gamma; + Matrix A_deriv(dim, dim); + // A_deriv defines the objective function: a random linear function in A. + A_deriv.SetRandn(); + A_deriv.Add(0.1); // Introduce some asymmetry. + + Matrix G_deriv(dim, dim), + K_deriv(dim, dim); + estimator->Backward(A_deriv, &G_deriv, &K_deriv); + + KALDI_ASSERT(G_deriv.IsSymmetric()); + + for (int32 i = 0; i < num_directions; i++) { + Matrix G_new(dim, dim); + { + SpMatrix s(dim); + s.SetRandn(); + G_new.CopyFromSp(s); + } + G_new.Scale(epsilon); + expected_changes(i) = TraceMatMat(G_new, G_deriv, kTrans); + G_new.AddMat(1.0, G); + FmllrEstimatorOptions opts; + Matrix A_new(dim, dim); + CoreFmllrEstimator estimator2(opts, gamma, G_new, K, &A_new); + estimator2.Forward(); + A_new.AddMat(-1.0, A); + // compute the change in our random linear objective function defined by + // A_deriv, that would be produced by taking some small random change in K + // and computing the A that results from that. + actual_changes(i) = TraceMatMat(A_new, A_deriv, kTrans); + } + + KALDI_LOG << "Expected changes: " << expected_changes + << ", actual changes: " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. 
" + << actual_changes; + } +} + + + +void UnitTestCoreFmllrEstimatorSimple() { + int32 dim = RandInt(10, 20); + BaseFloat gamma = RandInt(5, 10); + Matrix G(dim, dim), + K(dim, dim), A(dim, dim, kUndefined); + G.AddToDiag(1.234 * gamma); + K.AddToDiag(0.234 * gamma); + FmllrEstimatorOptions opts; + CoreFmllrEstimator estimator(opts, gamma, G, K, &A); + BaseFloat objf_impr = estimator.Forward(); + KALDI_LOG << "A is " << A; + KALDI_ASSERT(A.IsUnit(0.01)); + KALDI_ASSERT(fabs(objf_impr) < 0.01); + for (int32 i = 0; i < 5; i++) { + TestCoreFmllrEstimatorKDeriv(gamma, G, K, A, &estimator); + TestCoreFmllrEstimatorGDeriv(gamma, G, K, A, &estimator); + } +} + +static void InitRandNonsingular(MatrixBase *M) { + do { + M->SetRandn(); + } while (M->Cond() > 50.0); +} + + +void UnitTestCoreFmllrEstimatorGeneral() { + int32 dim = RandInt(10, 20); + BaseFloat gamma = RandInt(5, 10); + Matrix G(dim, dim), + K(dim, dim), A(dim, dim, kUndefined); + + { + // make sure G is symmetric and +ve definite. + Matrix A(dim, dim + 10); + A.SetRandn(); + G.AddMatMat(gamma, A, kNoTrans, A, kTrans, 0.0); + } + + InitRandNonsingular(&K); + K.Scale(gamma); + FmllrEstimatorOptions opts; + CoreFmllrEstimator estimator(opts, gamma, G, K, &A); + BaseFloat objf_impr = estimator.Forward(); + KALDI_LOG << "A is " << A << ", objf impr is " << objf_impr; + for (int32 i = 0; i < 5; i++) { + TestCoreFmllrEstimatorKDeriv(gamma, G, K, A, &estimator); + TestCoreFmllrEstimatorGDeriv(gamma, G, K, A, &estimator); + } +} + +void TestGaussianEstimatorDerivs(const MatrixBase &feats, + const Posterior &post, + const FmllrEstimatorOptions &opts, + GaussianEstimator *g) { + int32 n = 4; // number of delta-params we use. + Vector expected_changes(n), + actual_changes(n); + + // if !test_mean_deriv, then we test the var deriv. + bool test_mean_deriv = (RandInt(0, 1) == 0); + + int32 num_classes = g->NumClasses(), dim = g->Dim(); + + Matrix mean_derivs(num_classes, dim); + Vector var_derivs(num_classes); + if (test_mean_deriv) { + KALDI_LOG << "Testing mean derivs."; + mean_derivs.SetRandn(); + } else { + KALDI_LOG << "Testing var derivs."; + var_derivs.SetRandn(); + var_derivs.Add(0.2); // Nonzero mean makes the test easier to pass + } + g->AddToOutputDerivs(mean_derivs, var_derivs); + Matrix feats_deriv(feats.NumRows(), feats.NumCols()); + g->AccStatsBackward(feats, post, &feats_deriv); + + BaseFloat epsilon = 1.0e-04; + + for (int32 i = 0; i < n; i++) { + Matrix new_feats(feats.NumRows(), + feats.NumCols()); + new_feats.SetRandn(); + new_feats.Scale(epsilon); + + expected_changes(i) = TraceMatMat(feats_deriv, new_feats, kTrans); + + new_feats.AddMat(1.0, feats); + + GaussianEstimator g2(num_classes, dim); + g2.AccStats(new_feats, post); + g2.Estimate(opts); + + actual_changes(i) = + TraceMatMat(mean_derivs, g2.GetMeans(), kTrans) - + TraceMatMat(mean_derivs, g->GetMeans(), kTrans) + + VecVec(var_derivs, g2.GetVars()) - + VecVec(var_derivs, g->GetVars()); + } + KALDI_LOG << "Actual changes are " << actual_changes + << " vs. predicted " << expected_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. 
" + << actual_changes; + } +} + +void TestFmllrEstimatorMeanDerivs(const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + const MatrixBase &mu(g.GetMeans()); + const VectorBase &s(g.GetVars()); + + int32 T = feats.NumRows(), dim = feats.NumCols(), + num_classes = mu.NumRows(); + + FmllrEstimatorOptions opts; + + FmllrEstimator f(opts, mu, s); + + Matrix adapted_feats(T, dim, kUndefined); + BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); + KALDI_LOG << "Forward objf-impr per frame (with same features) is " + << objf_impr; + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. + + f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + const MatrixBase &mu_deriv = f.GetMeanDeriv(); + + // measure the accuracy of the deriv in 4 random directions. + int32 n = 4; + BaseFloat epsilon = 1.0e-04; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Matrix new_mu(num_classes, dim, kUndefined), + new_adapted_feats(T, dim, kUndefined); + new_mu.SetRandn(); + // adding a systematic component helps the test to succeed in low precision. + for (int32 c = 0; c < num_classes; c++) { + new_mu.Row(c).Add(0.1 * RandInt(-1, 1)); + } + new_mu.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_mu, mu_deriv, kTrans); + new_mu.AddMat(1.0, mu); + FmllrEstimator f2(opts, new_mu, s); + f2.ForwardCombined(feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + +void TestFmllrEstimatorVarDerivs(const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + const MatrixBase &mu(g.GetMeans()); + const VectorBase &s(g.GetVars()); + + int32 T = feats.NumRows(), dim = feats.NumCols(), + num_classes = mu.NumRows(); + + FmllrEstimatorOptions opts; + + FmllrEstimator f(opts, mu, s); + + Matrix adapted_feats(T, dim, kUndefined); + BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); + KALDI_LOG << "Forward objf-impr per frame (with same features) is " + << objf_impr; + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + // Adding a systematic component to the derivative makes the test easier + // to pass, as the derivs are less random. 
+ adapted_feats_deriv.AddMat(0.1, feats); + + f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + const VectorBase &s_deriv = f.GetVarDeriv(); + + // measure the accuracy of the deriv in 10 random directions + int32 n = 10; + BaseFloat epsilon = 0.001; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Vector new_s(num_classes, kUndefined); + Matrix new_adapted_feats(T, dim, kUndefined); + new_s.SetRandn(); + new_s.Scale(epsilon); + expected_changes(i) = VecVec(new_s, s_deriv); + new_s.AddVec(1.0, s); + FmllrEstimator f2(opts, mu, new_s); + f2.ForwardCombined(feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + + +void TestFmllrEstimatorSequence(const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + // Do two fMLLR's in a row and see if the change in objf decreases. + + int32 T = feats.NumRows(), dim = feats.NumCols(); + const MatrixBase &mu(g.GetMeans()); + const VectorBase &s(g.GetVars()); + FmllrEstimatorOptions opts; + + FmllrEstimator f(opts, mu, s); + + Matrix adapted_feats(T, dim, kUndefined); + BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); + KALDI_LOG << "Forward objf-impr per frame (first time) is " + << objf_impr; + + + Matrix adapted_feats2(T, dim, kUndefined); + FmllrEstimator f2(opts, mu, s); + BaseFloat objf_impr2 = f.ForwardCombined(adapted_feats, post, &adapted_feats2); + KALDI_LOG << "Forward objf-impr per frame (second time) is " + << objf_impr2; +} + +void TestFmllrEstimatorFeatDerivs(const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + int32 T = feats.NumRows(), dim = feats.NumCols(); + const MatrixBase &mu(g.GetMeans()); + const VectorBase &s(g.GetVars()); + + FmllrEstimatorOptions opts; + + FmllrEstimator f(opts, mu, s); + + Matrix adapted_feats(T, dim, kUndefined); + BaseFloat objf_impr = f.ForwardCombined(feats, post, &adapted_feats); + KALDI_LOG << "Forward objf-impr per frame (with same features) is " + << objf_impr; + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. + + f.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + // measure the accuracy of the deriv in 4 random directions. + int32 n = 4; + BaseFloat epsilon = 1.0e-03; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Matrix new_feats(T, dim, kUndefined), + new_adapted_feats(T, dim, kUndefined); + new_feats.SetRandn(); + new_feats.Add(RandGauss()); // will help to test whether the indirect + // part of the derivative is accurate. 
+ new_feats.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_feats, feats_deriv, kTrans); + new_feats.AddMat(1.0, feats); + FmllrEstimator f2(opts, mu, s); + f2.ForwardCombined(new_feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + + +void TestMeanOnlyTransformEstimatorMeanDerivs( + const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + const MatrixBase &mu(g.GetMeans()); + + int32 T = feats.NumRows(), dim = feats.NumCols(), + num_classes = mu.NumRows(); + + MeanOnlyTransformEstimator m(mu); + + Matrix adapted_feats(T, dim, kUndefined); + m.ForwardCombined(feats, post, &adapted_feats); + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. + + m.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + const MatrixBase &mu_deriv = m.GetMeanDeriv(); + + // measure the accuracy of the deriv in 4 random directions. + int32 n = 4; + BaseFloat epsilon = 1.0e-03; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Matrix new_mu(num_classes, dim, kUndefined), + new_adapted_feats(T, dim, kUndefined); + new_mu.SetRandn(); + // adding a systematic component helps the test to succeed in low precision. + for (int32 c = 0; c < num_classes; c++) { + new_mu.Row(c).Add(0.1 * RandInt(-1, 1)); + } + new_mu.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_mu, mu_deriv, kTrans); + new_mu.AddMat(1.0, mu); + MeanOnlyTransformEstimator m2(new_mu); + m2.ForwardCombined(feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + + +void TestMeanOnlyTransformEstimatorFeatDerivs( + const MatrixBase &feats, + const Posterior &post, + const GaussianEstimator &g) { + int32 T = feats.NumRows(), dim = feats.NumCols(); + const MatrixBase &mu(g.GetMeans()); + + + MeanOnlyTransformEstimator m(mu); + + Matrix adapted_feats(T, dim, kUndefined); + m.ForwardCombined(feats, post, &adapted_feats); + + // adapted_feats_deriv is the deriv of a random objective function + // w.r.t the output (adapted) features. + Matrix adapted_feats_deriv(T, dim), + feats_deriv(T, dim); + adapted_feats_deriv.SetRandn(); + adapted_feats_deriv.Add(0.1); // Introduce some asymmetry. 
+ + m.BackwardCombined(feats, post, adapted_feats_deriv, &feats_deriv); + + KALDI_LOG << "2-norm of adapted_feats_deriv is " + << adapted_feats_deriv.FrobeniusNorm() + << ", of feats_deriv is " + << feats_deriv.FrobeniusNorm(); + + // measure the accuracy of the deriv in 4 random directions. + int32 n = 4; + BaseFloat epsilon = 1.0e-03; + Vector expected_changes(n), actual_changes(n); + for (int32 i = 0; i < n; i++) { + Matrix new_feats(T, dim, kUndefined), + new_adapted_feats(T, dim, kUndefined); + new_feats.SetRandn(); + new_feats.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_feats, feats_deriv, kTrans); + new_feats.AddMat(1.0, feats); + MeanOnlyTransformEstimator m2(mu); + m2.ForwardCombined(new_feats, post, &new_adapted_feats); + actual_changes(i) = + TraceMatMat(new_adapted_feats, adapted_feats_deriv, kTrans) - + TraceMatMat(adapted_feats, adapted_feats_deriv, kTrans); + } + KALDI_LOG << "Expected changes are " << expected_changes + << " vs. actual " << actual_changes; + if (!expected_changes.ApproxEqual(actual_changes, 0.1)) { + KALDI_ERR << "Expected and actual changes differ too much: " + << expected_changes << " vs. " + << actual_changes; + } +} + + +void UnitTestGaussianAndEstimators() { + // It's important that the number of classes be greater than the dimension, or + // we would get a low-rank K. + int32 num_classes = RandInt(30, 40), + dim = RandInt(10, 20), + num_frames = RandInt(20 * num_classes, 40 * num_classes); + + GaussianEstimator g(num_classes, dim); + + Matrix feats(num_frames, dim); + feats.SetRandn(); + feats.Add(0.2); // Nonzero offset tests certain aspects of the code better. + Posterior post(num_frames); + for (int32 t = 0; t < num_frames; t++) { + int32 n = RandInt(0, 2); + for (int32 j = 0; j < n; j++) { + int32 i = RandInt(0, num_classes - 1); + BaseFloat p = 0.25 * RandInt(1, 5); + post[t].push_back(std::pair(i, p)); + } + } + g.AccStats(feats, post); + FmllrEstimatorOptions opts; + // avoid setting variance_sharing_weight to 1.0; it's hard for the tests to + // succeed then, and there are valid reasons for that + opts.variance_sharing_weight = 0.25 * RandInt(0, 2); + g.Estimate(opts); + KALDI_LOG << "Means are: " + << g.GetMeans() << ", vars are: " + << g.GetVars(); + + TestGaussianEstimatorDerivs(feats, post, opts, &g); + + if (RandInt(0, 1) == 0) { + opts.smoothing_count = 500.0; + } + + { // test FmllrEstimator + TestFmllrEstimatorSequence(feats, post, g); + TestFmllrEstimatorMeanDerivs(feats, post, g); + TestFmllrEstimatorFeatDerivs(feats, post, g); + TestFmllrEstimatorVarDerivs(feats, post, g); + } + + { // test MeanOnlyTransformEstimator. 
+ TestMeanOnlyTransformEstimatorMeanDerivs(feats, post, g); + TestMeanOnlyTransformEstimatorFeatDerivs(feats, post, g); + } + + + + +} + + + +} // namespace kaldi +} // namespace differentiable_transform + + + +int main() { + using namespace kaldi::differentiable_transform; + + for (int32 i = 0; i < 50; i++) { + UnitTestCoreFmllrEstimatorSimple(); + UnitTestCoreFmllrEstimatorGeneral(); + UnitTestGaussianAndEstimators(); + } + std::cout << "Test OK.\n"; +} diff --git a/src/adapt/differentiable-fmllr.cc b/src/adapt/differentiable-fmllr.cc new file mode 100644 index 00000000000..faabc7b1496 --- /dev/null +++ b/src/adapt/differentiable-fmllr.cc @@ -0,0 +1,888 @@ +// adapt/differentiable-fmllr.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "adapt/differentiable-fmllr.h" +#include "matrix/matrix-functions.h" + +namespace kaldi { +namespace differentiable_transform { + + +void FmllrEstimatorOptions::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, singular_value_relative_floor); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, variance_floor); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, variance_sharing_weight); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, smoothing_count); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, smoothing_between_class_factor); + WriteToken(os, binary, ""); +} + +void FmllrEstimatorOptions::Read(std::istream &is, bool binary) { + ExpectToken(is, binary, ""); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &singular_value_relative_floor); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &variance_floor); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &variance_sharing_weight); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &smoothing_count); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &smoothing_between_class_factor); + ExpectToken(is, binary, ""); +} + +void FmllrEstimatorOptions::ReadFromConfig(ConfigLine *config_line) { + config_line->GetValue("singular-value-relative-floor", + &singular_value_relative_floor); + config_line->GetValue("variance-floor", &variance_floor); + config_line->GetValue("variance-sharing-weight", &variance_sharing_weight); + config_line->GetValue("smoothing-count", &smoothing_count); + config_line->GetValue("smoothing-between-class-factor", + &smoothing_between_class_factor); +} + + +CoreFmllrEstimator::CoreFmllrEstimator( + const FmllrEstimatorOptions &opts, + BaseFloat gamma, + const MatrixBase &G, + const MatrixBase &K, + MatrixBase *A): + opts_(opts), gamma_(gamma), + G_(G), K_(K), A_(A) { + KALDI_ASSERT(opts.singular_value_relative_floor > 0.0 && + gamma > 0.0 && G.NumRows() == K.NumRows() && + K.NumRows() == K.NumCols() && + 
SameDim(K, *A)); +} + + +BaseFloat CoreFmllrEstimator::Forward() { + ComputeH(); + ComputeL(); + ComputeB(); + ComputeA(); + return ComputeObjfChange(); +} + +void CoreFmllrEstimator::ComputeH() { + int32 dim = G_.NumRows(); + bool symmetric = true; + G_rescaler_.Init(&G_, symmetric); + BaseFloat *G_singular_values = G_rescaler_.InputSingularValues(); + + { + SubVector v(G_singular_values, dim); + BaseFloat floor = v.Max() * opts_.singular_value_relative_floor; + KALDI_ASSERT(floor > 0.0); + MatrixIndexT num_floored = 0; + v.ApplyFloor(floor, &num_floored); + if (num_floored > 0.0) + KALDI_WARN << num_floored << " out of " << dim + << " singular values floored in G matrix."; + } + BaseFloat *H_singular_values = G_rescaler_.OutputSingularValues(), + *H_singular_value_derivs = G_rescaler_.OutputSingularValueDerivs(); + // We don't have to worry about elements of G_singular_values being zero, + // since we floored them above. + for (int32 i = 0; i < dim; i++) { + H_singular_values[i] = 1.0 / std::sqrt(G_singular_values[i]); + // The following expression is equivalent to + // -0.5 * pow(G_singular_values[i], -1.5), + // which is the derivative of lambda^{-0.5} w.r.t lambda. + // (lambda, here, is G_singular_values[i]). + H_singular_value_derivs[i] = -0.5 * (H_singular_values[i] / + G_singular_values[i]); + } + H_.Resize(dim, dim, kUndefined); + G_rescaler_.GetOutput(&H_); +} + +void CoreFmllrEstimator::ComputeL() { + int32 dim = G_.NumRows(); + L_.Resize(dim, dim); + L_.AddMatMat(1.0, K_, kNoTrans, H_, kNoTrans, 0.0); +} + +// Compute B = F(L), where F is the +// function that takes the singular values of L, puts them through the function +// f(lamba) = (lambda + sqrt(lambda^2 + 4 gamma)) / 2. +void CoreFmllrEstimator::ComputeB() { + int32 dim = L_.NumRows(); + bool symmetric = false; + L_rescaler_.Init(&L_, symmetric); + BaseFloat *lambda = L_rescaler_.InputSingularValues(); + { // This block deals with flooring lambda to avoid zero values. + SubVector v(lambda, dim); + BaseFloat floor = v.Max() * opts_.singular_value_relative_floor; + KALDI_ASSERT(floor > 0.0); + MatrixIndexT num_floored = 0; + v.ApplyFloor(floor, &num_floored); + static int num_warned = 100; + if (num_floored > 0.0 && num_warned > 0) + KALDI_WARN << num_floored << " out of " << dim + << " singular values floored in L matrix." + << (--num_warned == 0 ? " Will not warn again." : ""); + } + // f is where we put f(lambda). + // f_prime is where we put f'(lambda) (the function-derivative of f w.r.t + // lambda). + BaseFloat *f = L_rescaler_.OutputSingularValues(), + *f_prime = L_rescaler_.OutputSingularValueDerivs(); + + BaseFloat gamma = gamma_; + for (int32 i = 0; i < dim; i++) { + BaseFloat lambda_i = lambda[i]; + f[i] = (lambda_i + std::sqrt(lambda_i * lambda_i + 4.0 * gamma)) / 2.0; + f_prime[i] = (1.0 + lambda_i / + std::sqrt(lambda_i * lambda_i + 4.0 * gamma)) / 2.0; + } + B_.Resize(dim, dim, kUndefined); + L_rescaler_.GetOutput(&B_); +} + +void CoreFmllrEstimator::ComputeA() { + A_->SetZero(); // Make sure there are no NaN's. + A_->AddMatMat(1.0, B_, kNoTrans, H_, kNoTrans, 0.0); +} + +BaseFloat CoreFmllrEstimator::ComputeObjfChange() { + // we are computing the objective-function improvement from estimating + // A (we'll later compute the improvement from estimating the offset b). + // This is the equation which, from the writeup, is: + // \gamma log |A| + tr(A^T K) - tr(K) + // + 1/2 tr(G) - 1/2 tr(B B^T). + // and we note that log |A| = log |B| + log |G^{-0.5}| = log |B| -0.5 log |G|. 
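+  // (Since G is symmetric positive definite, its singular values equal its
+  // eigenvalues, so log|G| and tr(G) below are obtained by summing the logs /
+  // values of its singular values; likewise |det B| and tr(B B^T) are obtained
+  // from the singular values of B.)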
+  // Here, |.| actually means the absolute value of the determinant.
+
+  int32 dim = L_.NumRows();
+  double logdet_g = 0.0, logdet_b = 0.0, tr_b_bt = 0.0, tr_g = 0.0;
+  BaseFloat *G_singular_values = G_rescaler_.InputSingularValues(),
+      *B_singular_values = L_rescaler_.OutputSingularValues();
+  for (int32 i = 0; i < dim; i++) {
+    // we have already ensured that G_singular_values[i] > 0.
+    logdet_g += Log(G_singular_values[i]);
+    tr_g += G_singular_values[i];
+    logdet_b += Log(B_singular_values[i]);
+    tr_b_bt += B_singular_values[i] * B_singular_values[i];
+  }
+
+  double logdet_A = logdet_b - 0.5 * logdet_g,
+      tr_at_k = TraceMatMat(*A_, K_, kTrans),
+      tr_k = K_.Trace();
+
+  return BaseFloat(
+      gamma_ * logdet_A + tr_at_k - tr_k + 0.5 * tr_g - 0.5 * tr_b_bt);
+}
+
+void CoreFmllrEstimator::BackpropA(const MatrixBase<BaseFloat> &A_deriv,
+                                   MatrixBase<BaseFloat> *B_deriv,
+                                   MatrixBase<BaseFloat> *H_deriv) {
+  B_deriv->AddMatMat(1.0, A_deriv, kNoTrans, H_, kTrans, 0.0);
+  H_deriv->AddMatMat(1.0, B_, kTrans, A_deriv, kNoTrans, 0.0);
+}
+
+void CoreFmllrEstimator::BackpropL(const MatrixBase<BaseFloat> &L_deriv,
+                                   MatrixBase<BaseFloat> *K_deriv,
+                                   MatrixBase<BaseFloat> *H_deriv) {
+  K_deriv->AddMatMat(1.0, L_deriv, kNoTrans, H_, kTrans, 0.0);
+  H_deriv->AddMatMat(1.0, K_, kTrans, L_deriv, kNoTrans, 1.0);
+}
+
+
+void CoreFmllrEstimator::Backward(const MatrixBase<BaseFloat> &A_deriv,
+                                  Matrix<BaseFloat> *G_deriv,
+                                  Matrix<BaseFloat> *K_deriv) {
+  KALDI_ASSERT(SameDim(A_deriv, *A_) && SameDim(A_deriv, *G_deriv)
+               && SameDim(*G_deriv, *K_deriv));
+  int32 dim = A_->NumRows();
+  Matrix<BaseFloat> B_deriv(dim, dim), H_deriv(dim, dim),
+      L_deriv(dim, dim);
+  BackpropA(A_deriv, &B_deriv, &H_deriv);
+  // Backprop through the operation B = F(L).
+  L_rescaler_.ComputeInputDeriv(B_deriv, &L_deriv);
+  BackpropL(L_deriv, K_deriv, &H_deriv);
+  // Backprop through the operation H = G^{-0.5}.
+  G_rescaler_.ComputeInputDeriv(H_deriv, G_deriv);
+
+  { // Make sure G_deriv is symmetric. Use H_deriv as a temporary.
+    H_deriv.CopyFromMat(*G_deriv);
+    G_deriv->AddMat(1.0, H_deriv, kTrans);
+    G_deriv->Scale(0.5);
+  }
+}
+
+
+GaussianEstimator::GaussianEstimator(int32 num_classes, int32 feature_dim):
+    gamma_(num_classes),
+    m_(num_classes, feature_dim),
+    v_(num_classes),
+    variance_floor_(-1), variance_sharing_weight_(-1) {
+  // the floor and weight are actually set later on, in Estimate().
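+  // (They are initialized to -1 in the initializer list above simply as a
+  // "not set yet" marker; real values are copied from the options when
+  // Estimate() is called.)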
+ KALDI_ASSERT(num_classes > 0 && feature_dim > 0); +} + +void GaussianEstimator::AccStats(const MatrixBase &feats, + const SubPosterior &post) { + KALDI_ASSERT(static_cast(post.size()) == feats.NumRows()); + int32 T = feats.NumRows(), + num_classes = m_.NumRows(); + for (int32 t = 0; t < T; t++) { + SubVector feat(feats, t); + const std::vector > &this_post = post[t]; + auto iter2 = this_post.begin(), + end2 = this_post.end(); + for (; iter2 != end2; ++iter2) { + int32 i = iter2->first; + KALDI_ASSERT(i >= 0 && i < num_classes && + "Posteriors and adaptation model mismatch"); + BaseFloat p = iter2->second; + gamma_(i) += p; + SubVector this_m(m_, i); + this_m.AddVec(p, feat); + v_(i) += p * VecVec(feat, feat); + } + } +} + +void GaussianEstimator::Estimate(const FmllrEstimatorOptions &opts) { + variance_floor_ = opts.variance_floor; + variance_sharing_weight_ = opts.variance_sharing_weight; + KALDI_ASSERT(variance_floor_ > 0.0 && + variance_sharing_weight_ >= 0.0 && + variance_sharing_weight_ <= 1.0); + KALDI_ASSERT(mu_.NumRows() == 0 && + "You cannot call Estimate() twice."); + int32 num_classes = m_.NumRows(), dim = m_.NumCols(); + + mu_ = m_; + s_.Resize(num_classes, kUndefined); + t_.Resize(num_classes, kUndefined); + for (int32 i = 0; i < num_classes; i++) { + BaseFloat gamma_i = gamma_(i); + if (gamma_i < 1.0e-10) { + // the i'th row of mu will already be zero. + s_(i) = variance_floor_; + } else { + SubVector mu_i(mu_, i); + // We already copied m_ to mu_. + mu_i.Scale(1.0 / gamma_i); + s_(i) = std::max(variance_floor_, + v_(i) / (gamma_i * dim) - VecVec(mu_i, mu_i) / dim); + } + } + + // apply variance_sharing_weight_. + BaseFloat gamma = gamma_.Sum(), + s = VecVec(gamma_, s_) / gamma, + f = variance_sharing_weight_; + KALDI_ASSERT(gamma != 0.0 && + "You cannot call Estimate() with no stats."); + for (int32 i = 0; i < num_classes; i++) { + t_(i) = (BaseFloat(1.0) - f) * s_(i) + f * s; + } + { BaseFloat sum = mu_.Sum(); KALDI_ASSERT(sum - sum == 0); } // TEMP + + // Clear the stats, which won't be needed any longer. + m_.Resize(0, 0); + v_.Resize(0); +} + +void GaussianEstimator::AddToOutputDerivs( + const MatrixBase &mean_derivs, + const VectorBase &var_derivs) { + KALDI_ASSERT(SameDim(mean_derivs, mu_) && + var_derivs.Dim() == t_.Dim()); + int32 num_classes = mean_derivs.NumRows(), + dim = mean_derivs.NumCols(); + BaseFloat f = variance_sharing_weight_, + variance_floor = variance_floor_, + gamma = gamma_.Sum(); + KALDI_ASSERT(gamma > 0.0); + if (m_bar_.NumRows() == 0) { + // This is the first time this function was called. + m_bar_.Resize(num_classes, dim); + v_bar_.Resize(num_classes); + } + + const VectorBase &t_bar(var_derivs); + const MatrixBase &mu_bar(mean_derivs); + BaseFloat s_bar = f * t_bar.Sum(); + for (int32 i = 0; i < num_classes; i++) { + SubVector m_bar_i(m_bar_, i); + BaseFloat gamma_i = gamma_(i); + if (gamma_i > 1.0e-10) { + if (s_(i) != variance_floor) { + BaseFloat s_bar_i = (BaseFloat(1.0) - f) * t_bar(i) + s_bar * gamma_i / gamma; + v_bar_(i) += s_bar_i / (gamma_i * dim); + m_bar_i.AddVec(-2.0 * s_bar_i / (gamma_i * dim), mu_.Row(i)); + } + m_bar_i.AddVec(1.0 / gamma_i, mu_bar.Row(i)); + } + } +} + +int32 GaussianEstimator::Dim() const { + // One of these two will be nonempty. 
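+  // (m_ holds the raw stats and is cleared inside Estimate(), which is also
+  // where mu_ is first sized, so exactly one of the two is nonempty at any
+  // given time; whichever it is gives the feature dimension.)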
+ return std::max(m_.NumCols(), mu_.NumCols()); +} + +void GaussianEstimator::AccStatsBackward( + const MatrixBase &feats, + const SubPosterior &post, + const MatrixBase *feats_deriv) { + // The equation we're implementing is: + // \bar{x}_t = \sum_i \gamma_{t,i} (\bar{m}_i + 2\bar{v}_i x_t) + // See the comment in the header: + // "Notes on implementation of GaussianEstimator". + int32 T = feats.NumRows(); + KALDI_ASSERT(static_cast(post.size() == T) && + SameDim(feats, *feats_deriv)); + for (int32 t = 0; t < T; t++) { + SubVector feat(feats, t), + feat_deriv(*feats_deriv, t); + const std::vector > &this_post = post[t]; + auto iter2 = this_post.begin(), + end2 = this_post.end(); + for (; iter2 != end2; ++iter2) { + int32 i = iter2->first; + BaseFloat p = iter2->second; + SubVector m_bar_i(m_bar_, i); + feat_deriv.AddVec(p, m_bar_i); + feat_deriv.AddVec(p * 2.0 * v_bar_(i), feat); + } + } +} + +void GaussianEstimator::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + gamma_.Write(os, binary); + m_.Write(os, binary); + v_.Write(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, variance_floor_); + WriteBasicType(os, binary, variance_sharing_weight_); + WriteToken(os, binary, ""); + mu_.Write(os, binary); + WriteToken(os, binary, ""); + t_.Write(os, binary); + WriteToken(os, binary, ""); +} + +void GaussianEstimator::Add(const GaussianEstimator &other) { + gamma_.AddVec(1.0, other.gamma_); + m_.AddMat(1.0, other.m_); + v_.AddVec(1.0, other.v_); +} + + +void GaussianEstimator::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + gamma_.Read(is, binary); + m_.Read(is, binary); + v_.Read(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &variance_floor_); + ReadBasicType(is, binary, &variance_sharing_weight_); + ExpectToken(is, binary, ""); + mu_.Read(is, binary); + ExpectToken(is, binary, ""); + t_.Read(is, binary); + ExpectToken(is, binary, ""); +} + + +FmllrEstimator::FmllrEstimator(const FmllrEstimatorOptions &opts, + const MatrixBase &mu, + const VectorBase &s): + opts_(opts), mu_(mu), s_(s), estimator_(NULL) { + int32 num_classes = mu_.NumRows(), dim = mu_.NumCols(); + opts_.Check(); + + gamma_.Resize(num_classes); + raw_G_.Resize(dim, dim); + z_.Resize(num_classes, dim); +} + +void FmllrEstimator::AccStats(const MatrixBase &feats, + const SubPosterior &post) { + KALDI_ASSERT(static_cast(post.size() == feats.NumRows())); + int32 num_classes = mu_.NumRows(), + dim = mu_.NumCols(), + T = feats.NumRows(); + + // Use temporaries for the stats and later add them to the stats in the class; + // this will reduce roundoff errors if this function is called more than once. 
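+  //
+  // In the notation of the writeup, the quantities accumulated here are:
+  //   gamma_i        = \sum_t \gamma_{t,i}             (class counts, -> gamma_)
+  //   z_i            = \sum_t \gamma_{t,i} x_t         (linear stats, -> z_)
+  //   \hat{\gamma}_t = \sum_i \gamma_{t,i} / s_i       (per-frame weights)
+  //   raw_G          = \sum_t \hat{\gamma}_t x_t x_t^T (quadratic stats, -> raw_G_)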
+ Vector gamma_hat_t(T, kUndefined), + gamma(num_classes); + + for (int32 t = 0; t < T; t++) { + auto iter = post[t].begin(), end = post[t].end(); + SubVector x_t(feats, t); + BaseFloat this_gamma_hat_t = 0.0; + for (; iter != end; ++iter) { + int32 i = iter->first; + KALDI_ASSERT(i >= 0 && i < num_classes && + "Posteriors and adaptation model mismatch"); + BaseFloat gamma_ti = iter->second, + gamma_hat_ti = gamma_ti / s_(i); + SubVector z_i(z_, i); + z_i.AddVec(gamma_ti, x_t); + gamma(i) += gamma_ti; + this_gamma_hat_t += gamma_hat_ti; + } + gamma_hat_t(t) = this_gamma_hat_t; + } + gamma_.AddVec(1.0, gamma); + + SpMatrix G(dim); + int32 rows_per_chunk = 100; + for (int32 offset = 0; offset < T; offset += rows_per_chunk) { + int32 n_frames = std::min(rows_per_chunk, feats.NumRows() - offset); + SubMatrix feats_part(feats, offset, n_frames, 0, dim); + SubVector gamma_hat_t_part(gamma_hat_t, offset, n_frames); + // the 0.0 value for beta means we don't double-count stats. + G.AddMat2Vec(1.0, feats_part, kTrans, gamma_hat_t_part, 0.0); + raw_G_.AddSp(1.0, G); + } +} + + +BaseFloat FmllrEstimator::Estimate() { + int32 dim = mu_.NumCols(); + BaseFloat gamma_tot = gamma_.Sum(); + KALDI_ASSERT(gamma_tot > 0.0 && + "You cannot call Estimate() with zero stats."); + + Vector s_inv(s_); + s_inv.InvertElements(); + + // compute \hat{\gamma} = \sum_i \gamma_i / s_i + gamma_hat_tot_ = VecVec(gamma_, s_inv); + + // compute n = (1/\hat{\gamma}) \sum_i (1/s_i) z_i + n_.Resize(dim); + n_.AddMatVec(1.0 / gamma_hat_tot_, z_, kTrans, s_inv, 0.0); + + { // Set m = 1/\hat{\gamma} \sum_i (\gamma_i / s_i) \mu_i. + Vector s_inv_gamma(s_inv); + s_inv_gamma.MulElements(gamma_); + m_.Resize(dim); + m_.AddMatVec(1.0 / gamma_hat_tot_, mu_, kTrans, s_inv_gamma, 0.0); + } + + + { // Set K := \sum_i (1/s_i) \mu_i z_i^T - \hat{\gamma} m n^T + Matrix mu_s(mu_); + mu_s.MulRowsVec(s_inv); + K_.Resize(dim, dim); + K_.AddMatMat(1.0, mu_s, kTrans, z_, kNoTrans, 0.0); + K_.AddVecVec(-gamma_hat_tot_, m_, n_); + } + + // In AccStats(), we did raw_G := \sum_t \hat{\gamma}_t x_t x_t^T. + // Now we do: G = raw_G - \hat{\gamma} n n^T + G_ = raw_G_; + G_.AddVecVec(-gamma_hat_tot_, n_, n_); + KALDI_ASSERT(G_.IsSymmetric(0.0001)); + + A_.Resize(dim, dim, kUndefined); + + BaseFloat gamma_tot_smoothed = gamma_tot; + { + /* + Add smoothing counts to gamma_tot, K_ and G_. This prevents the matrix + from diverging too far from the identity, and ensures more reasonable + transform values when counts are small or dimensions large. We can ignore + this smoothing for computing derivatives, because it happens that it + doesn't affect anything; the quantities gamma_, K_ and G_ are never + consumed in the backprop phase, and the expressions for the derivatives + w.r.t. these quantities don't change from adding an extra term. + */ + gamma_tot_smoothed = gamma_tot + opts_.smoothing_count; + BaseFloat s = opts_.smoothing_between_class_factor; + K_.AddToDiag(opts_.smoothing_count * s); + G_.AddToDiag(opts_.smoothing_count * (1.0 + s)); + } + // Compute A_. + estimator_ = new CoreFmllrEstimator(opts_, gamma_tot_smoothed, G_, K_, &A_); + // A_impr will be the objective-function improvement from estimating A + // (vs. the unit matrix), divided by gamma_tot. Note: the likelihood of the + // 'fake data' we used for the smoothing could only have been made worse by + // estimating this transform, so dividing the total objf-impr by gamma_tot + // (rather than gamma_tot_smoothed, if different) will still be an + // underestimate of the actual improvement. 
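+  // The value finally returned by Estimate() is A_impr + b_impr, where
+  // b_impr = 0.5 * b^T b * \hat{\gamma} / \gamma is the corresponding
+  // per-frame improvement from the offset b, computed further below.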
+ BaseFloat A_impr = (1.0 / gamma_tot) * estimator_->Forward(); + + // Compute b = m - A n. + b_ = m_; + b_.AddMatVec(-1.0, A_, kNoTrans, n_, 1.0); + + // b_impr is the amount of objective-function improvement from estimating b + // (vs. the default value), divided by the total-count gamma_tot. See section + // 'diagnostics' in the document. + // Note: we aren't doing any smoothing for the offset term. + BaseFloat b_impr = (0.5 * VecVec(b_, b_) * gamma_hat_tot_) / gamma_tot; + return A_impr + b_impr; +} + +bool FmllrEstimator::IsEstimated() const { + return A_.NumRows() != 0; +} + +void FmllrEstimator::AdaptFeatures(const MatrixBase &feats, + MatrixBase *adapted_feats) const { + KALDI_ASSERT(A_.NumRows() != 0 && "You cannot call AdaptFeatures before " + "calling Estimate()."); + KALDI_ASSERT(SameDim(feats, *adapted_feats)); + adapted_feats->CopyRowsFromVec(b_); + adapted_feats->AddMatMat(1.0, feats, kNoTrans, A_, kTrans, 1.0); +} + + +void FmllrEstimator::AdaptFeaturesBackward( + const MatrixBase &feats, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv) { + KALDI_ASSERT(SameDim(feats, adapted_feats_deriv) && + SameDim(feats, *feats_deriv) && + G_bar_.NumRows() == 0); + int32 rows_per_chunk = 100; + if (feats.NumRows() > rows_per_chunk) { + // Break it up into 100-frame chunks and recurse. This will reduce roundoff + // error due to the way we work with temporaries. + for (int32 offset = 0; offset < feats.NumRows(); offset += rows_per_chunk) { + int32 n = std::min(rows_per_chunk, feats.NumRows() - offset); + SubMatrix feats_deriv_part = feats_deriv->RowRange(offset, n); + AdaptFeaturesBackward(feats.RowRange(offset, n), + adapted_feats_deriv.RowRange(offset, n), + &feats_deriv_part); + } + return; + } + + // in the writeup: \bar{x}_t <-- A^T \bar{y}_t. + // In this implementation, x_t corresponds to a + // row vector in feats and feats_deriv, so everything is + // transposed to: + // \bar{x}_t^T <--- \bar{y}_t^T A. + feats_deriv->AddMatMat(1.0, adapted_feats_deriv, kNoTrans, + A_, kNoTrans, 1.0); + + // We use temporaries below to possibly reduce roundoff error. + // It's not clear whether this would make a difference-- it depends + // how the BLAS we're using was implemented. 
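+  // (The Swap()/Add pattern below means that on the first call the temporary
+  // simply becomes b_bar_ / A_bar_, while on subsequent calls we add to the
+  // existing accumulators; this is what allows AdaptFeaturesBackward() to be
+  // called more than once.)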
+ int32 dim = mu_.NumCols(); + // \bar{b} = \sum_t \bar{y}_t + Vector b_bar(dim); + b_bar.AddRowSumMat(1.0, adapted_feats_deriv); + if (b_bar_.Dim() == 0) + b_bar_.Swap(&b_bar); + else + b_bar_.AddVec(1.0, b_bar); + // \bar{A} <-- \sum_t \bar{y}_t x_t^T + Matrix A_bar(dim, dim); + A_bar.AddMatMat(1.0, adapted_feats_deriv, kTrans, feats, kNoTrans, 0.0); + if (A_bar_.NumRows() == 0) + A_bar_.Swap(&A_bar); + else + A_bar_.AddMat(1.0, A_bar); +} + +void FmllrEstimator::EstimateBackward() { + KALDI_ASSERT(G_bar_.NumRows() == 0 && + "You cannot call EstimateBackward() twice."); + KALDI_ASSERT(A_bar_.NumRows() != 0 && + "You must call AdaptFeaturesBackward() before calling " + "EstimateBackward()."); + + Vector s_inv(s_); + s_inv.InvertElements(); + Vector s_inv_gamma(s_inv); + s_inv_gamma.MulElements(gamma_); + + // do \bar{A} -= \bar{b} n^T + A_bar_.AddVecVec(-1.0, b_bar_, n_); + + int32 num_classes = mu_.NumRows(), dim = mu_.NumCols(); + G_bar_.Resize(dim, dim); + K_bar_.Resize(dim, dim); + estimator_->Backward(A_bar_, &G_bar_, &K_bar_); + delete estimator_; + estimator_ = NULL; + KALDI_ASSERT(G_bar_.IsSymmetric()); + + // \bar{n} = - (A^T \bar{b} + 2\bar{G} n + \bar{K}^T m) + n_bar_.Resize(dim); + n_bar_.AddMatVec(-1.0, A_, kTrans, b_bar_, 0.0); + n_bar_.AddMatVec(-2.0 * gamma_hat_tot_, G_bar_, kNoTrans, n_, 1.0); + n_bar_.AddMatVec(-1.0 * gamma_hat_tot_, K_bar_, kTrans, m_, 1.0); + + + // \bar{m} = \bar{b} - \hat{\gamma} \bar{K} n + m_bar_ = b_bar_; + m_bar_.AddMatVec(-gamma_hat_tot_, K_bar_, kNoTrans, n_, 1.0); + + // \bar{z}_i = (1/s_i) \bar{K}^T \mu_i + 1/(s_i \hat{\gamma}) \bar{n} + z_bar_.Resize(num_classes, dim); + // set \bar{z}_i := \bar{K}^T \mu_i. It's transposed below. + z_bar_.AddMatMat(1.0, mu_, kNoTrans, K_bar_, kNoTrans, 0.0); + // \bar{z}_i += 1/\hat{\gamma} \bar{n} + z_bar_.AddVecToRows(1.0 / gamma_hat_tot_, n_bar_); + // \bar{z}_i /= s_i + z_bar_.MulRowsVec(s_inv); + + // \bar{\hat{\gamma}} = - n^T \bar{G} n - m^t \bar{K} n + // - \frac{1}{\hat{\gamma}} (n^T \bar{n} + m^T \bar{m}) + gamma_hat_tot_bar_ = -1.0 * VecMatVec(n_, G_bar_, n_) + - VecMatVec(m_, K_bar_, n_) + - (1.0 / gamma_hat_tot_) * (VecVec(n_, n_bar_) + VecVec(m_, m_bar_)); + + // Set \bar{mu}_i = (1/s_i) \bar{K} z_i + (\gamma_i / (s_i \hat{\gamma})) \bar{m} + mu_bar_.Resize(num_classes, dim); + mu_bar_.AddMatMat(1.0, z_, kNoTrans, K_bar_, kTrans, 0.0); + mu_bar_.MulRowsVec(s_inv); + mu_bar_.AddVecVec(1.0 / gamma_hat_tot_, s_inv_gamma, m_bar_); + + // Add all terms in \bar{s}_i except the one involving \bar{\hat{\gamma}}_t. + // The full equation (also present in the header) is: + // \bar{s}_i = -(1 / s_i^2) * ( + // \mu_i^T \bar{K} z_i + (1 / \hat{\gamma}) \z_i^T \bar{n} + // + (\gamma_i / \hat{\gamma}) \mu_i^T \bar{m} + \gamma_i \hat{\gamma} + // + \sum_t \gamma_{t,i} \bar{\hat{\gamma}}_t ) + // Noticing that some expressions in it are common with \bar{\mu}_i, this can + // be simplified to: + // \bar{s}_i = (-1/s_i) \mu_i^T \bar{\mu}_i + // - (1/s_i^2) * ((1 / \hat{\gamma}) \z_i^T \bar{n} + \gamma_i \hat{\gamma} + // + \sum_t \gamma_{t,i} \bar{\hat{\gamma}}_t ) + s_bar_.Resize(num_classes); + // do s_bar_ -= (1 / \hat{\gamma}) \z_i^T \bar{n}. We'll later multiply by 1/s_i^2. 
+ s_bar_.AddMatVec(-1.0 / gamma_hat_tot_, z_, kNoTrans, n_bar_, 0.0); + // do s_bar_(i) -= \gamma_i \bar{\hat{\gamma}} + s_bar_.AddVec(-1.0 * gamma_hat_tot_bar_, gamma_); + // do s_bar_(i) *= 1/s_i + s_bar_.MulElements(s_inv); + // do s_bar_(i) -= \mu_i^T \bar{\mu}_i + s_bar_.AddDiagMatMat(-1.0, mu_, kNoTrans, mu_bar_, kTrans, 1.0); + // do s_bar_(i) *= 1/s_i + s_bar_.MulElements(s_inv); + // OK, s_bar_ is now set up with all but the last term. It remains only to do: + // \bar{s}_i += (-1/s_i^2) \sum_t \gamma_{t,i} \bar{\hat{\gamma}}_t ) +} + +void FmllrEstimator::AccStatsBackward( + const MatrixBase &feats, + const SubPosterior &post, + MatrixBase *feats_deriv) { + KALDI_ASSERT(static_cast(post.size() == feats.NumRows())); + int32 T = feats.NumRows(), num_classes = mu_.NumRows(); + + // Use temporaries for s_bar_, to reduce roundoff error. + Vector s_bar(num_classes); + for (int32 t = 0; t < T; t++) { + auto iter = post[t].begin(), end = post[t].end(); + SubVector x_t(feats, t), + x_bar_t(*feats_deriv, t); + BaseFloat gamma_hat_t = 0.0; + for (; iter != end; ++iter) { + int32 i = iter->first; + BaseFloat gamma_ti = iter->second, + gamma_hat_ti = gamma_ti / s_(i); + gamma_hat_t += gamma_hat_ti; + SubVector z_bar_i(z_bar_, i); + // \bar{x}_t += \gamma_{t,i} \bar{z}_i + x_bar_t.AddVec(gamma_ti, z_bar_i); + } + double gamma_hat_bar_t = VecMatVec(x_t, G_bar_, x_t); + + // \bar{x}_t += 2 \hat{\gamma}_t \bar{G} x_t + x_bar_t.AddMatVec(2.0 * gamma_hat_t, G_bar_, kNoTrans, x_t, 1.0); + + for (iter = post[t].begin(); iter != end; ++iter) { + int32 i = iter->first; + BaseFloat gamma_ti = iter->second; + SubVector mu_i(mu_, i); + // \bar{s}_i -= \frac{1}{s_i^2} \gamma_{t,i} \bar{\hat{\gamma}}_t + s_bar(i) -= 1.0 / (s_(i) * s_(i)) * gamma_ti * gamma_hat_bar_t; + } + if (t == T - 1 || (t > 0 && t % 200 == 0)) { + s_bar_.AddVec(1.0, s_bar); + if (t < T - 1) + s_bar.SetZero(); + } + } +} + +BaseFloat FmllrEstimator::ForwardCombined( + const MatrixBase &feats, + const SubPosterior &post, + MatrixBase *adapted_feats) { + AccStats(feats, post); + BaseFloat ans = Estimate(); + AdaptFeatures(feats, adapted_feats); + return ans; +} + +void FmllrEstimator::BackwardCombined( + const MatrixBase &feats, + const SubPosterior &post, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv) { + AdaptFeaturesBackward(feats, adapted_feats_deriv, feats_deriv); + EstimateBackward(); + AccStatsBackward(feats, post, feats_deriv); +} + +FmllrEstimator::~FmllrEstimator() { + delete estimator_; // in case Estimate() was never called. +} + + +MeanOnlyTransformEstimator::MeanOnlyTransformEstimator( + const MatrixBase &mu): mu_(mu) { + int32 num_classes = mu_.NumRows(), + dim = mu_.NumCols(); + gamma_.Resize(num_classes); + input_sum_.Resize(dim); +} + +void MeanOnlyTransformEstimator::AccStats(const MatrixBase &feats, + const SubPosterior &post) { + int32 T = feats.NumRows(), + num_classes = mu_.NumRows(); + KALDI_ASSERT(static_cast(post.size()) == T); + + for (int32 t = 0; t < T; t++) { + BaseFloat gamma_t = 0.0; // Total weight for this frame. 
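+    // Only two statistics are needed for the mean-only transform: the
+    // per-class counts gamma_(i) and the weighted feature sum
+    // input_sum_ = \sum_t \gamma_t x_t, where \gamma_t is the total
+    // posterior weight on frame t (accumulated into gamma_t below).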
+ auto iter = post[t].begin(), end = post[t].end(); + for (; iter != end; ++iter) { + int32 i = iter->first; + KALDI_ASSERT(i >= 0 && i < num_classes && + "Posteriors and adaptation model mismatch"); + BaseFloat gamma_ti = iter->second; + gamma_t += gamma_ti; + gamma_(i) += gamma_ti; + } + SubVector feat(feats, t); + KALDI_ASSERT(gamma_t >= 0); + input_sum_.AddVec(gamma_t, feat); + } +} + + +void MeanOnlyTransformEstimator::Estimate() { + double tot_gamma = gamma_.Sum(); + int32 dim = mu_.NumCols(); + if (tot_gamma <= 0.0) + KALDI_ERR << "You cannot call Estimate() if total count is zero."; + Vector gamma_float(gamma_); + Vector expected_mean(dim); + expected_mean.AddMatVec(1.0 / tot_gamma, mu_, kTrans, gamma_float, 0.0); + // basically: offset_ = expected_mean - observed_mean, + // where observed_mean = input_sum_ / tot_gamma. + offset_ = expected_mean; + offset_.AddVec(-1.0 / tot_gamma, input_sum_); + output_deriv_sum_.Resize(dim); +} + +bool MeanOnlyTransformEstimator::IsEstimated() const { + return offset_.Dim() != 0; +} + +void MeanOnlyTransformEstimator::AdaptFeatures( + const MatrixBase &feats, + MatrixBase *adapted_feats) const { + adapted_feats->CopyRowsFromVec(offset_); + adapted_feats->AddMat(1.0, feats); +} + +void MeanOnlyTransformEstimator::AdaptFeaturesBackward( + const MatrixBase &feats, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv) { + int32 dim = mu_.NumCols(); + Vector output_deriv_sum(dim); + output_deriv_sum.AddRowSumMat(1.0, adapted_feats_deriv); + output_deriv_sum_.AddVec(1.0, output_deriv_sum); + feats_deriv->AddMat(1.0, adapted_feats_deriv); +} + +void MeanOnlyTransformEstimator::EstimateBackward() { + int32 num_classes = mu_.NumRows(), dim = mu_.NumCols(); + mu_bar_.Resize(num_classes, dim); + Vector gamma(gamma_), + output_deriv_sum(output_deriv_sum_); + BaseFloat gamma_tot = gamma_.Sum(); + KALDI_ASSERT(gamma_tot > 0.0); + mu_bar_.AddVecVec(1.0 / gamma_tot, gamma, output_deriv_sum); + + x_deriv_ = output_deriv_sum; + x_deriv_.Scale(-1.0 / gamma_tot); +} + + +void MeanOnlyTransformEstimator::AccStatsBackward( + const MatrixBase &feats, + const SubPosterior &post, + MatrixBase *feats_deriv) { + + int32 T = feats.NumRows(); + // tot_weight will be the total weight of the posteriors in 'post' + // for each frame. + Vector tot_weight(T, kUndefined); + for (int32 t = 0; t < T; t++) { + BaseFloat gamma_t = 0.0; // Total weight for this frame. 
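+    // Each frame's derivative gets a contribution of gamma_t * x_deriv_,
+    // where gamma_t is the total posterior weight on frame t and x_deriv_ was
+    // set in EstimateBackward() to -(1/\gamma) \sum_t \bar{y}_t; the per-frame
+    // weights are gathered into tot_weight and applied via AddVecVec() below.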
+ auto iter = post[t].begin(), end = post[t].end(); + for (; iter != end; ++iter) + gamma_t += iter->second; + tot_weight(t) = gamma_t; + } + feats_deriv->AddVecVec(1.0, tot_weight, x_deriv_); +} + +void MeanOnlyTransformEstimator::ForwardCombined( + const MatrixBase &feats, + const SubPosterior &post, + MatrixBase *adapted_feats) { + AccStats(feats, post); + Estimate(); + AdaptFeatures(feats, adapted_feats); +} + +void MeanOnlyTransformEstimator::BackwardCombined( + const MatrixBase &feats, + const SubPosterior &post, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv) { + AdaptFeaturesBackward(feats, adapted_feats_deriv, feats_deriv); + EstimateBackward(); + AccStatsBackward(feats, post, feats_deriv); +} + + +} // namespace differentiable_transform +} // namespace kaldi diff --git a/src/adapt/differentiable-fmllr.h b/src/adapt/differentiable-fmllr.h new file mode 100644 index 00000000000..c15175752a1 --- /dev/null +++ b/src/adapt/differentiable-fmllr.h @@ -0,0 +1,974 @@ +// adapt/differentiable-fmllr.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_TRANSFORM_DIFFERENTIABLE_FMLLR_H_ +#define KALDI_TRANSFORM_DIFFERENTIABLE_FMLLR_H_ + +#include + +#include "base/kaldi-common.h" +#include "util/kaldi-table.h" +#include "util/kaldi-holder.h" +#include "hmm/posterior.h" +#include "matrix/matrix-functions.h" +#include "matrix/matrix-common.h" +#include "matrix/sp-matrix.h" + +namespace kaldi { +namespace differentiable_transform { + + +// This header contains some utilities for implementing differentiable fMLLR. +// Since it is fairly complicated, we aren't putting all the implementation +// details in class FmllrTransform (in differentiable-transform.h), but +// segregating most of the technical stuff to this file. This also +// allows us to separate out the testing of individual components. +// The reference for things in this header is +// http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf. +// The notation we are using corresponds to the notation used in +// the "Summary" section of that document. + + + + +/** + With reference to the notation in + http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf, + this class implements the operation that takes G and K as input (and the + count gamma), and produces A. This has been separated into its own object + for purposes of testability. + */ +struct FmllrEstimatorOptions { + + // singular_value_relative_floor is floor that we apply on the + // singular values of the inputs G and K, to ensure that no NaN's are + // generated in the forward pass and to prevent the derivatives + // in the backprop from becoming undefined. It affects both + // the forward and backward computations. A warning will be printed + // if this floor actually had an effect. 
+ // Must be greater than zero (to avoid the possibility of generating + // NaN's). + BaseFloat singular_value_relative_floor; + + + // Floor for (spherical) variances; will be passed to class GaussianEstimator + // when estimating means and variances. + BaseFloat variance_floor; + + // A value in the range [0, 1] which dictates to what extent the variances are + // shared. 0 means not shared at all, 1 means completely shared. Shared + // means the variance is a weighted average of variances, weighted by count of + // that class. This is consumed by class GaussianEstimator. + BaseFloat variance_sharing_weight; + + // A count value of 'fake' counts that we add to the stats G, K and lambda + // during estimation, namely: + // lambda += smoothing_count + // K += smoothing_count * smoothing_between_class_factor * I + // G += smoothing_count * I. + // Interpretable as a number of frames. This prevents things going crazy + // when the amount of data is small. + BaseFloat smoothing_count; + + // A factor that says how large the assumed between-class covariance matrix + // is, relative to the within-class covariance matrix. Should be >= 0. In + // the limit as it approaches zero, the smoothing will only penalize scaling + // of the space, but not rotations. This is likely not a good thing, so a + // value greater than zero will probably be desired. + BaseFloat smoothing_between_class_factor; + + FmllrEstimatorOptions(): + singular_value_relative_floor(0.001), + variance_floor(0.0001), + variance_sharing_weight(0.1), + smoothing_count(0.0), + smoothing_between_class_factor(0.25) { } + + void Check() { + KALDI_ASSERT(singular_value_relative_floor > 0.0 && + singular_value_relative_floor < 0.1 && + (variance_floor > 0.0 || variance_sharing_weight > 0.0) && + variance_floor >= 0.0 && + variance_sharing_weight >= 0.0 && + variance_sharing_weight <= 1.0); + } + + void Write(std::ostream &os, bool binary) const; + void Read(std::istream &is, bool binary); + + // This will set any options in this class that it can find in 'config_line'. + void ReadFromConfig(ConfigLine *config_line); + +}; + + +/** + Class CoreFmllrEstimator takes care of the core parts of the fMLLR estimation: + with reference to the notation in + http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf, + it accepts the statistics G and K and the count gamma, and it + computes the fMLLR transform matrix A, and allows you to backprop through + that computation. The reason why we have broken it out as its own class, + is for testability and to limit the complexity of any one class. + + The end-user may want to use class FmllrEstimator instead. + + */ +class CoreFmllrEstimator { + public: + /** + Constructor. Does not do any real work. This class will store + references/pointers to G, K and A, so you need to make sure that + those quantities exist for the lifetime of this object. + + @param [in] opts Options class; see its definition for details. Will be copied + in the constructor. + @param [in] gamma The total data-count (often this will be the number of frames). + @param [in] G A symmetric matrix containing the quadratic + stats for estimating A. This the sum of outer products + of the input features, after mean subtraction, and + weighted by the inverse-variance factor s_i. Must be + positive definite for this computation to be well + defined. + @param [in] K A matrix containing the linear stats for estimating A. 
+ This is a sum of outer products of the means with the + input features, with mean subtraction and inverse-variance + weighting. Must not have more than one zero singular value + for this computation to be well defined. + @param [in] A We mark this as an input parameter but it is the location + where the output of this computation will be placed when + you call Forward(). May be undefined (e.g., NaN) on + entry. You must not change the value of A between + calling Forward() and calling Backward(). + */ + CoreFmllrEstimator(const FmllrEstimatorOptions &opts, + BaseFloat gamma, + const MatrixBase &G, + const MatrixBase &K, + MatrixBase *A); + + /** + Does the forward pass of estimation. Writes to the location + 'A' that was passed to the constructor. + + Returns the objective-function improvement per frame, as compared + with what the objective-function would be with unit A. This is not + normalized by the number of frames. + */ + BaseFloat Forward(); + + /** + Does the backward pass. Note: it is permissible to call + Backward() any number of times, it does not have to be called + exactly once. + + @param [in] A_deriv The derivative of the objective + function (say, f) w.r.t. the output A (which was passed as a + pointer to the constructor). + @param [out] G_deriv A pointer to a location where the + derivative df/dG will be written. Will be added to, so + should contain zero (or some other defined value) + at input. + @param [out] K_deriv A pointer to a location where the + derivative df/dK will be written (so the i,j'th + element is the derivative w.r.t. the i,j'th element + of the input matrix K. + */ + void Backward(const MatrixBase &A_deriv, + Matrix *G_deriv, + Matrix *K_deriv); + + private: + // Computes H = G^{-0.5} + void ComputeH(); + // Compute L = K H + void ComputeL(); + // Compute B = F(L), where F is the + // function that takes the singular values of L, puts them through the function + // f(lamba) = (lambda + sqrt(lambda^2 + 4 gamma)) / 2. + void ComputeB(); + // Computes A = B H. + void ComputeA(); + + + // Backprops through the operation "A = B H". B_deriv and H_deriv + // must be free of NaN and inf on entry. + void BackpropA(const MatrixBase &A_deriv, + MatrixBase *B_deriv, + MatrixBase *H_deriv); + + // Backprops through the function "L = K H".. + // K_deriv must be free of NaN and inf on entry, but otherwise + // its value is ignored. H_deriv is added to by this function. + void BackpropL(const MatrixBase &L_deriv, + MatrixBase *K_deriv, + MatrixBase *H_deriv); + + // returns the objective-function change (vs. A being the unit matrix) from + // this estimation. + BaseFloat ComputeObjfChange(); + + FmllrEstimatorOptions opts_; + BaseFloat gamma_; + const MatrixBase &G_; + const MatrixBase &K_; + MatrixBase *A_; + + // H = G^{-0.5} is symmetric. + Matrix H_; + // L = K H. + Matrix L_; + // B = F(L) is the result of applying SvdRescaler with + // the function f(lambda) = ((lambda + sqrt(lambda^2 + 4 gamma)) / 2) + Matrix B_; + + // Object that helps us to compute, and to backprop through the + // computation of, H = G^{-0.5}. + SvdRescaler G_rescaler_; + + // Object that helps us to compute, and to backprop through the computation + // of: B = F(L), where F is the function that takes the singular values of L, + // puts them through the function f(lamba) = (lambda + sqrt(lambda^2 + 4 + // gamma)) / 2. 
+ SvdRescaler L_rescaler_; + +}; + + + +/** + Class GaussianEstimator allows you to estimate means and (spherical) variances + from features and posteriors, and to later backprop through that process if + needed. + + It is intended for use during training of the neural net, for use on + individual minibatches: it uses BaseFloat for the accumulators, which might + lead to excessive roundoff if you had a large amount of data. We'll later on + create a separate mechanism for accumulating stats over all the data, given + the full tree. + + The normal usage pattern would be: + - Construct the object. + - Call AccStats() for each sequence. + - Call Estimate() + - Call GetMeans() and GetVars() to obtain the means and vars, and do + something with them, e.g. compute some kind of objective, from which + you would obtain derivatives w.r.t. those means and vars. + - Call SetOutputDerivs() to tell this class what those derivatives w.r.t. + the means and vars are. + - Call AccStatsBackward() for each sequence to propagate the derivatives + back to the features that were used to estimate the means and vars. + */ +class GaussianEstimator { + public: + GaussianEstimator(int32 num_classes, int32 feature_dim); + + GaussianEstimator(const GaussianEstimator &other) = default; + + int32 NumClasses() const { return gamma_.Dim(); } + + int32 Dim() const; + + // Accumulate statistics (you can call this multiple times of needed). + // It does: for each t, and for each pair (i, f) in post[t], accumulate stats + // from feats.Row(t) with class i and weight f. + // May not be called after Estimate() is called. + // + // @param [in] feats The input features, of dimension + // num-frames by feature-dimension + // @param [in] post The posteriors, which can be thought of as a + // vector > >. + // Its size() must equal feats.NumRows(). + void AccStats(const MatrixBase &feats, + const SubPosterior &post); + + // You call this once after calling AccStats() one or more times. + // It estimates the model means and variances. + // See the members 'variance_floor' and 'variance_sharing_weight' + // of the options class. + void Estimate(const FmllrEstimatorOptions &opts); + + // Returns true if Estimate() has previously been called, i.e. if + // the means and variances have been computed. + bool IsEstimated() const; + + // Returns the means, in a matrix of dimension num_classes by dim. Must not + // be called if ! IsEstimated(). + const MatrixBase &GetMeans() const { return mu_; } + + // Returns the 's' quantities, which are the scalar factors on the (spherical) + // variances. Must not be called if ! IsEstimated(). The + // variance for class i will actually be s_i I, where s_i is an element of + // this vector. + const VectorBase &GetVars() const { return t_; } + + // You call this to add something the derivatives df/dmeans and df/dvars-- the + // derivatives of the objective function f w.r.t. those quantities. You might + // call this once or several times. Doing this allows you to backprop through + // the estimation of the means and variances, back to the features. This must + // only be called after previously calling Estimate(). This function writes + // to v_bar_ and m_bar_. + void AddToOutputDerivs(const MatrixBase &mean_derivs, + const VectorBase &var_derivs); + + + // This function, which must only be called after AddToOutputDerivs() has been + // called at least once, propagates the derivative back to the features. For + // purposes of this backpropagation, the posteriors are treated as constants. 
+ // @param [in] feats The features, which must be the same + // as you provided to one of the calls to + // AccStats(). dimension is num-frames by + // feature-dimension. + // @param [in] post The posteriors, as provided to AccStats(). + // Its size() must equal feats.NumRows(). + // @param [in,out] feats_deriv The derivative of the objective + // function w.r.t. the input features. + // This function will *add to* feats_deriv, + // so it must have a well-defined value on + // entry. + void AccStatsBackward(const MatrixBase &feats, + const SubPosterior &post, + const MatrixBase *feats_deriv); + + + void Write(std::ostream &os, bool binary) const; + void Read(std::istream &is, bool binary); + + // Adds any statistics in gamma_, m_ and v_ from 'other' to *this. + // Used when summing adaptation-model statistics over multiple + // jobs. Requires that '*this' and 'other' have identical + // structure. + void Add(const GaussianEstimator &other); + + private: + /* + Notes on implementation of GaussianEstimator. + Using Latex notation. + + We are estimating means \mu_i and variance-factors s_i (these + are scales on unit variances). Later we'll apply a kind of + interpolation with the global average variance, controlled + by variance_sharing_weight_, and we'll call the variances that + we finally output t_i. + + We formulate the sufficient statistics as: + the counts \gamma_i, the mean stats m_i and the (scalar) + variance stats v_i: + + \gamma_i = \sum_t \gamma_{t,i} + m_i = \sum_t \gamma_{t,i} x_t + v_i = \sum_t \gamma_{t,i} x_t^T x_t + The estimation procedure is: + \mu_i = \frac{m_i}{\gamma_i}, or 0 if \gamma_i is 0. + s_i = variance_floor if \gamma_i = 0, else: + max(variance_floor, (v_i/\gamma_i - \mu_i^T \mu_i) / dim) + where dim is the feature dimension; and another form more convenient for backprop: + = variance_floor if \gamma_i = 0, else: + max(variance_floor, v_i/(dim * \gamma_i) - m_i^T m_i / (dim * \gamma_i^2)) + + We write \bar{foo} for a derivative of the objective function w.r.t. foo. + We are provided by the user with with \bar{\mu}_i and \bar{s}_i, when they + call SetOutputDerivs(); and we aim to compute \bar{m}_i and \bar{v}_i, which + are the derivs w.r.t. the raw statistics. This is done as follows: + \bar{m}_i = 0 if \gamma_i is 0, otherwise: + \frac{\bar{\mu}_i}{\gamma_i} - (\frac{2\bar{s}_i m_i}{dim \gamma_i^2} + if s_i > variance_floor, else 0) + = or 0 if \gamma_i is 0, otherwise: + \frac{\bar{\mu}_i}{\gamma_i} - (\frac{2\bar{s}_i \mu_i}{dim \gamma_i} + if s_i > variance_floor, else 0) + \bar{v}_i = 0 if \gamma_i is 0 or s_i equals variance_floor, otherwise: + \frac{\bar{s}_i}{dim * \gamma_i} + \bar{x}_t = \sum_i \gamma_{t,i} (\bar{m}_i + 2\bar{v}_i x_t) + + + If 'variance_sharing_weight' != 0.0, then we need to modify the above. + Let the variance-floored version of the variance be t_i. + Write variance_sharing_weight as f (with 0 <= f <= 1), and let + \gamma = \sum_i \gamma_i. + Define the weighted-average variance: + s = \sum_i \frac{\gamma_i}{\gamma} s_i + and the partly-shared output variance is: + t_i = (1-f) s_i + f s. + For the backprop: If the user supplies derivatives \bar{t}_i, then: + \bar{s} = f \sum_i \bar{t}_i + \bar{s}_i = (1-f) \bar{t}_i + \frac{\gamma_i}{\gamma} \bar{s}. + */ + + + // gamma_, of dimension num_classes, contains the raw count statistics \gamma_i. + // It's added to when you call AccStats(). + Vector gamma_; + // m_ is the raw mean statistics (feature times soft-count); it's of dimension + // num_classes by feat_dim. 
+ Matrix m_; + // v_ is the raw variance statistics (inner-product-of-feature times soft-count); + // it's of dimension num_classes. + Vector v_; + + // variance_floor_ and variance_sharing_weight_ are copies of the + // corresponding variables in class FmllrEstimatorOptions; they are set when + // Estimate() is called. They are temporaries, not permanent members. + BaseFloat variance_floor_; + BaseFloat variance_sharing_weight_; + + // mu_ is the estimated means, which is set up when you call Estimate(). + Matrix mu_; + // s_ is the variances, after flooring by variance_floor_ but before + // applying variance_sharing_weight_. + Vector s_; + // t_ is the smoothed or maybe totally averaged-over-all-classes variances, + // derived from t as specified by variance_sharing_weight_. + Vector t_; + + // v_bar_, of dimension num_classes, contains \bar{v}_i. It's only set up + // after you call SetOutputDerivs(). + Vector v_bar_; + // m_bar_, of dimension num_classes by feature_dim, contains \bar{m}_i. + // It's only set up after you call SetOutputDerivs(). + Matrix m_bar_; + + +}; + + + +/** + Class FmllrEstimator encapsulates the whole of the fMLLR computation- for + a single speaker. See + http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf + for a description of what is being implemented here. + + This class is suitable for use in training, where you want to backprop + through the computation; and also in test time (but not for the online + scenario; we may later rewrite a version that's optimized for that, or modify + this class to handle that). + + This class would normally be used as follows: + - Construct an instance of the class (probably for a particular speaker on + a particular minibatch). + + Then, either: + + - Call AccStats() one or more times. + - Call Estimate(). + - Call AdaptFeatures() one or more times to get the output features. + - Do something with those output features that (if you are training) + gives you some kind of objective-function derivative w.r.t. those + features. Then if you are training, do what's below: + - Call AdaptFeaturesBackward() one or more times to get part of the + derivative w.r.t. the input features. Note: the calls to AdaptFeatures() + and AdaptFeaturesBackward() may be interleaved, since the call to + AdaptFeatures() does not modify the object. + - Call EstimateBackward() + - Call AccStatsBackward() one or more times to get the part of the + derivative w.r.t. the input features that comes from the effect + on the transform itself. + - Make use of the calls GetMeanDeriv() and GetVarDeriv() to + account for the effect of the features on the class means and + variances (these will be passed to class GaussianEstimator, + and eventually to the features). + + Or: if there is only one training sequence, you can use the + simplified interface: after calling the constructor, + + - call ForwardCombined() + - call BackwardCombined() + - Make use of the calls GetMeanDeriv() and GetVarDeriv() to + account for the effect of the features on the class means and + variances, with the help of class GaussianEstimator. +*/ +class FmllrEstimator { + public: + /** + Constructor. + @param [in] opts Options class. This class makes a copy. + @param [in] mu Class means, probably as output by class + GaussianEstimator. This class maintains a + reference to this object, so you should ensure + that it exists for the lifetime of this object. + @param [in] s Scaling factors for spherical class + variances, probably as output by class + GaussianEstimator. 
As with mu, we store + a reference to it, so don't destroy or + change it as long as this class instance exists. + */ + FmllrEstimator(const FmllrEstimatorOptions &opts, + const MatrixBase &mu, + const VectorBase &s); + + + /** + Accumulate statistics to estimate the fMLLR transform. + @param [in] feats The feature matrix. A row of it would be called + x_t in the writeup in + http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf. + @param [in] post The posteriors. post.size() must equal feats.NumRows(). + Each element of post is a list of pairs (i, p) where + i is the class label and p is the soft-count. + */ + void AccStats(const MatrixBase &feats, + const SubPosterior &post); + + + /** + Estimate the fMLLR transform parameters A and b. Returns the + objective-function improvement compared with A = I, b = 0, divided by the + total count as returned by TotalCount(). + + You are allowed to call this multiple times (e.g. call AccStats(), call + Estimate(), call AccStats(), call Estimate() again). + */ + BaseFloat Estimate(); + + // Return true if Estimate() has previously been called. + bool IsEstimated() const; + + /// Returns the total count of the posteriors accumulated so far. + BaseFloat TotalCount() { return gamma_.Sum(); } + + /// Return the linear parameter matrix. Adapted features are + /// y_t = A x_t + b. You won't necessarily need to + /// call this, you can use ComputeAdaptedFeatures() intead. + const MatrixBase &GetLinearParams() const { return A_; } + + /// Return the bias term b. + const VectorBase &GetBiasParams() const { return b_; } + + /// Computes the adapted features y_t = A x_t + b. + /// feats (x) and adapted_feats (y) must have the same dimension. Must + /// only be called after Estimate() has been called. + /// 'adapted_feats' may contain NaN's on entry. + void AdaptFeatures(const MatrixBase &feats, + MatrixBase *adapted_feats) const; + + /** + This is the backward pass corresponding to the function AdaptFeatures(). + It propagates back only part of the derivative-- not including the part + that's due to how the transform changes when the features change. It + also accumulates within this class instance the derivative w.r.t. + A and b. You are expected to later call EstimateBackward() and + AccStatsBackward() to propagate the part of the derivative that comes from + the effect on the transform, back to the input features. + + See also AccStatsBackward(). + @param [in] feats The features (x) that were the original input to + AdaptFeatures(). + @param [in] adapted_feats_deriv The derivative \bar{y} w.r.t. the output (y) + that was the result of calling AdaptFeatures(). Must + have the same size as feat. + @param [in,out] feats_deriv The derivative w.r.t. 'feats'; this function + *adds* to it. + */ + void AdaptFeaturesBackward(const MatrixBase &feats, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv); + + /** + This is the backward pass corresponding to Estimate(). You call this after + calling AdaptFeaturesBackward() one or more times (which will accumulate + the derivative w.r.t. A and B). It backpropagates through the core + estimation procedure of fMLLR, in preparation for you calling + AccStatsBackward(). + */ + void EstimateBackward(); + + + // Returns the derivative w.r.t. the class means 'mu' that were supplied to the + // constructor. Must not be called until EstimateBackward() and + // AccStatsBackward() have been called. + const MatrixBase &GetMeanDeriv() const { return mu_bar_; } + // Returns the derivative w.r.t. 
the variance factors 's' that were supplied + // to the constructor. Must not be called until EstimateBackward() and + // AccStatsBackward() have been called. + const VectorBase &GetVarDeriv() const { return s_bar_; } + + /** + This is the backward pass corresponding to AccStats(). You call this after + calling EstimateBackward(). It computes the part of the derivative w.r.t. + 'feats' that comes from the effect on the transform parameters. You will + normally have previously called AdaptFeaturesBackward() on these same + features. + @param [in] feats The features as given to AccStats() + @param [in] post The posteriors as given to AccStats() + @param [in,out] feats_deriv This function *adds* to feats_deriv. + It adds the terms in \bar{x}_t that arise from + the derivative w.r.t. the transform parameters. The + "direct" term \bar{x}_t = A^T \bar{y}_t will have + previously been added by AdaptFeaturesBackward(). + */ + void AccStatsBackward(const MatrixBase &feats, + const SubPosterior &post, + MatrixBase *feats_deriv); + + /** + Combines AccStats(), Estimate() and AdaptFeatures() in one call; + for use when there is only one sequence. Returns the objective-function + improvement (per soft-count). + @param [in] feats The features we're estimating the fMLLR parameters from + @param [in] post The posteriors corresponding to 'feats + @param [out] adapted_feats A matrix the same size as 'feats', to which + the adapted features will be written. May contain + NaNs at entry. + */ + BaseFloat ForwardCombined(const MatrixBase &feats, + const SubPosterior &post, + MatrixBase *adapted_feats); + /** + Combines AdaptFeaturesBackward(), EstimateBackward(), and + AccStatsBackward(); for use when there is only one sequence. + Note: 'feats_deriv' is *added* to so must be defined at entry. + */ + void BackwardCombined(const MatrixBase &feats, + const SubPosterior &post, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv); + + ~FmllrEstimator(); + private: + + + ///////////// Fixed quantities passed in in the constructor /////////// + + // The options. + FmllrEstimatorOptions opts_; + // The means. A reference to an object owned elsewhere. + const MatrixBase &mu_; + // The variance factors (the variances are s_(i) times I). A reference to an + // object owned elsewhere. + const VectorBase &s_; + + ///////////// Quantities that are accumulated in AccStats() /////////// + + // Counts per class; dimension is num_classes. Added to when AccStats() is + // called. gamma_(i) corresponds to \gamma_i in the write up; it's + // \gamma_i = \sum_t gamma_{t,i} + Vector gamma_; + + // This contains one term in G_, namely: + // (\sum_t \hat{\gamma}_t x_t x_t^T ) + Matrix raw_G_; + + // This is of dimension num_classes by dim (same as mu_). It contains + // the weighted sums of the input data, for each class: + // z_i = \sum_t \gamma_{t,i} x_i. + Matrix z_; + + + /////////// Quantities that are computed when Estimate() is called //////// + + + // This contains + // G = (\sum_t \hat{\gamma}_t x_t x_t^T ) - \hat{\gamma} n n^T. + // It is computed as raw_G_ - \hat{\gamma} n n^T. + // We use two separate variables to make it easier to call Estimate() + // more than once without things getting confused. + Matrix G_; + + // gamma_hat_tot_ is the total of gamma_(i) / s_(i), i.e. + // \hat{\gamma} = \sum_i gamma_i / s_i. 
+ BaseFloat gamma_hat_tot_; + + // After Estimate() is called, this will be the quantity: + // n = \frac{1}{\hat{\gamma}} \sum_i (1/s_i) z_i + Vector n_; + + // The weighted-average of the means: + // m = \frac{1}{\hat{\gamma}} \sum_i (\gamma_i/s_i) \mu_i + Vector m_; + + // This contains + // K = (\sum_i (1/s_i) \mu_i z_i^T) - \hat{\gamma} m n^T + Matrix K_; + + // The parameter matrix + Matrix A_; + // The offset term + Vector b_; + // The object we use to estimate A and b, and to backprop through that + // process. + CoreFmllrEstimator *estimator_; + + ////////// Quantities that are accumulated in AdaptFeaturesBackward() //////// + + // The derivative w.r.t. A. This is set when AdaptFeaturesBackward() is called, + // to: + // \bar{A} = \sum_t \bar{y}_t x_t^T + // and then when EstimateBackward() is called, we add the term from the estimation + // of b, which is: + // \bar{A} -= \bar{b} n^T + Matrix A_bar_; + + // The derivative w.r.t. b. This is set when AdaptFeaturesBackward() is called, + // to: \bar{b} = \sum_t \bar{y}_t. + Vector b_bar_; + + ////////// Quantities that are computed in EstimateBackward() //////// + + // The derivative w.r.t. G; computed by 'estimator_' + Matrix G_bar_; + // The derivative w.r.t. K; computed by 'estimator_'. + Matrix K_bar_; + + // The derivative w.r.t. n: + // \bar{n} = -A^T \bar{b} - 2\hat{\gamma} \bar{G} n - \hat{\gamma} \bar{K}^T m + Vector n_bar_; + + // The derivative w.r.t. m: + // \bar{m} = \bar{b} - \hat{\gamma} \bar{K} n + Vector m_bar_; + + // The derivative w.r.t the z_i quantities. The i'th row is: + // \bar{z}_i = (1/s_i) \bar{K}^T \mu_i + 1/(s_i \hat{\gamma}) \bar{n} + Matrix z_bar_; + + // gamma_hat_tot_bar_ is \bar{\hat{\gamma}} in the writeup; + // it's: + // \bar{\hat{\gamma}} = - n^T \bar{G} n - m^t \bar{K} n + // - \frac{1}{\hat{\gamma}} (n^T \bar{n} + m^T \bar{m}) + BaseFloat gamma_hat_tot_bar_; + + // The i'th row contains the derivative w.r.t mu_i. + // This is: + // \bar{\mu}_i = (1/s_i) \bar{K} z_i + (\gamma_i / (s_i \hat{\gamma})) \bar{m} + Matrix mu_bar_; + + //////////// Quantities that are written to in AccStatsBackward() /////////// + + // s_bar_(i) contains the derivative w.r.t the variance factor s_i, + // which we write in the writeup as \bar{s}_i. + // It is: + // \bar{s}_i = -(1 / s_i^2) * ( + // \mu_i^T \bar{K} z_i + (1 / \hat{\gamma}) \z_i^T \bar{n} + // + (\gamma_i / \hat{\gamma}) \mu_i^T \bar{m} + \gamma_i \bar{\hat{\gamma}} + // + \sum_t \gamma_{t,i} \bar{\hat{\gamma}}_t ) + // where + // \bar{\hat{\gamma}}_t = x_t^T \bar{G} x_t . + // Note: we add all but the first terms during Estimate(), and only the one + // with \sum_t in it in AccStatsBackward. + Vector s_bar_; + + // There is another quantity that's updated by AccStatsBackward(), which is + // \bar{x}_t, the derivative w.r.t. x_t. AccStatsBackward() does not include + // the term \bar{x}_t = A^T \bar{y}_t. But it does include the rest of the + // terms, doing: + // \bar{x}_t += 2 \hat{\gamma}_t \bar{G} x_t + // + \sum_i \gamma_{t,i} \bar{z}_i + // There is no member variable for this; it's a temporary. + +}; + + +/* MeanOnlyTransformEstimator is like a highly simplified version of + FmllrEstimator, where the transform is just y_t = x_t + b. + There are class means but the variances are assumed to be all + unit. (This is equivalent to assuming that they are all identical + with an arbitrary value; the value doesn't actually affect the + learned offset so we assume they are unit). 
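   (Concretely: if m is the posterior-weighted average of the class means and n is
   the posterior-weighted average of the input features x_t, then the maximum-likelihood
   offset under unit variances is b = m - n, and the adapted features are
   y_t = x_t + b.  This is a sketch inferred from the description here, not a
   statement taken from the writeup.)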
+ + The equations involved are like an extremly simplified version + of what we do in class FmllrEstimator, with m as a weighted + average of the means and n as a weighted average of the input + features. The weights come from the posterior information you + supply. + + This object has a similar interface to class FmllrEstimator. + + This class would normally be used as follows: + - Construct an instance of the class (probably for a particular speaker on + a particular minibatch). + + Then, either: + + - Call AccStats() one or more times. + - Call Estimate(). + - Call AdaptFeatures() one or more times to get the output features. + - Do something with those output features that (if you are training) + gives you some kind of objective-function derivative w.r.t. those + features. Then if you are training, do what's below: + - Call AdaptFeaturesBackward() one or more times to get part of the + derivative w.r.t. the input features. Note: the calls to AdaptFeatures() + and AdaptFeaturesBackward() may be interleaved, since the call to + AdaptFeatures() does not modify the object. + - Call EstimateBackward() + - Call AccStatsBackward() one or more times to get the part of the + derivative w.r.t. the input features that comes from the effect + on the transform itself. + - Make use of the calls GetMeanDeriv() and GetVarDeriv() to + account for the effect of the features on the class means and + variances (these will be passed to class GaussianEstimator, + and eventually to the features). + + Or: if there is only one training sequence, you can use the + simplified interface: after calling the constructor, + + - call ForwardCombined() + - call BackwardCombined() + - Make use of the call GetMeanDeriv() to account for the effect of the + features on the class means and variances, with the help of class + GaussianEstimator. + */ +class MeanOnlyTransformEstimator { + public: + /** + Constructor. + @param [in] mu Class means, probably as output by class + GaussianEstimator. This class maintains a + reference to this object, so you should ensure + that it exists for the lifetime of this object. + You can ignore the variances from class + GaussianEstimator; they are not used. + */ + MeanOnlyTransformEstimator(const MatrixBase &mu); + + /** + Accumulate statistics to estimate the fMLLR transform. + @param [in] feats The feature matrix. A row of it would be called + x_t in the writeup in + http://www.danielpovey.com/files/2018_differentiable_fmllr.pdf. + @param [in] post The posteriors. post.size() must equal feats.NumRows(). + Each element of post is a list of pairs (i, p) where + i is the class label and p is the soft-count. + */ + void AccStats(const MatrixBase &feats, + const SubPosterior &post); + + /** + Estimate the parameter (the offset). Requires the total count to be + nonzero. You are allowed to call this multiple times (e.g. call + AccStats(), call Estimate(), call AccStats(), call Estimate() again). + */ + void Estimate(); + + // Returns true if Estimate() has previously been called. + bool IsEstimated() const; + + BaseFloat TotalCount() { return gamma_.Sum(); } + + /// Return the bias term b. + const VectorBase &GetOffset() const { return offset_; } + + /// Computes the adapted features y_t = x_t + b. + /// feats (x) and adapted_feats (y) must have the same dimension. Must + /// only be called after Estimate() has been called. + /// 'adapted_feats' may contain NaN's on entry. 
+ void AdaptFeatures(const MatrixBase &feats, + MatrixBase *adapted_feats) const; + + + /** + This is the backward pass corresponding to the function AdaptFeatures(). + It propagates back only part of the derivative-- not including the part + that's due to how the offset changes when the features change. It + also accumulates within this class instance the derivative w.r.t. the + offset. + See also AccStatsBackward(). + + @param [in] feats The features (x) that were the original input to + AdaptFeatures(). + @param [in] adapted_feats_deriv The derivative \bar{y} w.r.t. the output (y) + that was the result of calling AdaptFeatures(). Must + have the same size as feat. + @param [in,out] feats_deriv The derivative w.r.t. 'feats'; this function + *adds* to it. + */ + void AdaptFeaturesBackward(const MatrixBase &feats, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv); + + /** + Backward pass corresponding to Estimate(). Should be called after + you've called AdaptFeatures() on all utterances. Computes the + derivatives w.r.t. the mean. */ + void EstimateBackward(); + + /** + Returns the derivative w.r.t. the class means 'mu' that were supplied to + the constructor. Must not be called until EstimateBackward() has been + called. */ + const MatrixBase &GetMeanDeriv() const { return mu_bar_; } + + /** + This is the backward pass corresponding to AccStats(). You call this after + calling EstimateBackward(). It computes the part of the derivative w.r.t. + 'feats' that comes from the effect on the transform parameters. You will + normally have previously called AdaptFeaturesBackward() on these same + features. + @param [in] feats The features as given to AccStats() + @param [in,out] feats_deriv This function *adds* to feats_deriv. + It adds the terms in \bar{x}_t that arise from + the derivative w.r.t. the offset b. + */ + void AccStatsBackward(const MatrixBase &feats, + const SubPosterior &post, + MatrixBase *feats_deriv); + + + /** + Combines AccStats(), Estimate() and AdaptFeatures() in one call; + for use when there is only one sequence. + @param [in] feats The features we're estimating the fMLLR parameters from + @param [in] post The posteriors corresponding to 'feats + @param [out] adapted_feats A matrix the same size as 'feats', to which + the adapted features will be written. May contain + NaNs at entry. + */ + void ForwardCombined(const MatrixBase &feats, + const SubPosterior &post, + MatrixBase *adapted_feats); + /** + Combines AdaptFeaturesBackward(), EstimateBackward(), and + AccStatsBackward(); for use when there is only one sequence. + Note: 'feats_deriv' is *added* to so must be defined at entry. + */ + void BackwardCombined(const MatrixBase &feats, + const SubPosterior &post, + const MatrixBase &adapted_feats_deriv, + MatrixBase *feats_deriv); + + private: + // The means, one row per class. A reference to an object owned elsewhere. + const MatrixBase &mu_; + + // The counts per class + Vector gamma_; + // The total of the input features, weighted by total posterior. + Vector input_sum_; + + // The offset. + Vector offset_; + + // The total of the derivative w.r.t. the output. + Vector output_deriv_sum_; + + // The derivative w.r.t. each row of the input features-- i.e. the part of the + // derivative that comes from the effect via the offset. This equals + // (-1 / total-count) * output_deriv_sum_. + Vector x_deriv_; + + // The derivative w.r.t. mu: + // (1/gamma_tot) gamma_ . output_deriv_sum_^T. 
+ Matrix mu_bar_; +}; + + +} // namespace differentiable_transform +} // namespace kaldi + +#endif // KALDI_TRANSFORM_DIFFERENTIABLE_FMLLR_H_ diff --git a/src/adapt/differentiable-transform-itf.cc b/src/adapt/differentiable-transform-itf.cc new file mode 100644 index 00000000000..e9c490c943d --- /dev/null +++ b/src/adapt/differentiable-transform-itf.cc @@ -0,0 +1,198 @@ +// adapt/differentiable-transform-itf.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "adapt/differentiable-transform-itf.h" +#include "adapt/generic-transform.h" +#include "adapt/differentiable-transform.h" + +namespace kaldi { +namespace differentiable_transform { + + +// static +DifferentiableTransform* DifferentiableTransform::ReadNew( + std::istream &is, bool binary) { + + std::string token; + ReadToken(is, binary, &token); // e.g. "" + token.erase(0, 1); // erase "<". + token.erase(token.length()-1); // erase ">". + DifferentiableTransform *ans = NewTransformOfType(token); + if (!ans) + KALDI_ERR << "Unknown DifferentialbeTransform type " << token + << " (maybe you should recompile?)"; + ans->Read(is, binary); + return ans; +} + +// static +DifferentiableTransform* DifferentiableTransform::NewTransformOfType( + const std::string &type) { + if (type.size() > 2 && type[type.size() - 1] == '>') { + std::string new_type(type); + if (new_type[0] == '<') + new_type.erase(0, 1); // erase "<" + new_type.erase(new_type.size() - 1); // erase ">". + return NewTransformOfType(new_type); + } + + if (type == "NoOpTransform") { + return new NoOpTransform(); + } else if (type == "FmllrTransform") { + return new FmllrTransform(); + } else if (type == "MeanOnlyTransform") { + return new MeanOnlyTransform(); + } else if (type == "SequenceTransform") { + return new SequenceTransform(); + } else if (type == "AppendTransform") { + return new AppendTransform(); + } else { + // Calling code will throw an error. + return NULL; + } +} + + +void DifferentiableTransform::TestingForwardBatch( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const { + int32 dim = input.NumCols(), + num_frames = input.NumRows(), + chunks_per_spk = num_chunks / num_spk, + frames_per_chunk = num_frames / num_chunks; + + // Just copy to CPU for now. 
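  // Note on the layout (see the documentation of TrainingForward() in
  // differentiable-transform-itf.h): the 't' index has the larger stride, so row
  // (t * num_chunks + chunk) of 'input' holds frame t of chunk 'chunk'.  A single chunk
  // therefore occupies rows {chunk, chunk + num_chunks, chunk + 2*num_chunks, ...},
  // which is why the SubMatrix objects below start at RowData(chunk) and use a row
  // stride of Stride() * num_chunks, and why SubPosterior is given 'chunk' as its
  // offset and 'num_chunks' as its stride.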
+ Matrix input_cpu(input); + Matrix output_cpu(num_frames, dim, kUndefined); + + for (int32 s = 0; s < num_spk; s++) { + SpeakerStatsItf *stats = this->GetEmptySpeakerStats(); + for (int32 chunk = s * chunks_per_spk; + chunk < (s + 1) * chunks_per_spk; chunk++) { + SubMatrix this_input(input_cpu.RowData(chunk), + frames_per_chunk, dim, + input_cpu.Stride() * num_chunks); + SubPosterior this_posteriors(posteriors, + chunk, // offset + frames_per_chunk, // num_frames + num_chunks); // stride + this->TestingAccumulate(this_input, this_posteriors, stats); + } + stats->Estimate(); + for (int32 chunk = s * chunks_per_spk; + chunk < (s + 1) * chunks_per_spk; chunk++) { + SubMatrix this_input(input_cpu.RowData(chunk), + frames_per_chunk, dim, + input_cpu.Stride() * num_chunks), + this_output(output_cpu.RowData(chunk), + frames_per_chunk, dim, + output_cpu.Stride() * num_chunks); + /* + // The following testing code was temporarily present to test + // GetTransformAsMatrix().. + if (GetVerboseLevel() >= 3 && RandInt(0, 1) == 0) { + Matrix transform(dim, dim + 1, kUndefined); + this->GetTransformAsMatrix(*stats, &transform); + SubMatrix linear_part(transform, 0, dim, 0, dim); + Vector offset(dim); + offset.CopyColFromMat(transform, dim); + this_output.CopyRowsFromVec(offset); + this_output.AddMatMat(1.0, this_input, kNoTrans, + linear_part, kTrans, 1.0); + } else */ + this->TestingForward(this_input, *stats, &this_output); + } + delete stats; + } + output->CopyFromMat(output_cpu); +} + +// static +DifferentiableTransform* DifferentiableTransform::ReadFromConfig( + std::istream &is, int32 num_classes) { + std::vector lines; + ReadConfigLines(is, &lines); + std::vector config_lines; + ParseConfigLines(lines, &config_lines); + if (config_lines.empty()) + KALDI_ERR << "Config file is empty."; + std::string transform_type = config_lines[0].FirstToken(); + DifferentiableTransform *transform = NewTransformOfType(transform_type); + if (transform == NULL) + KALDI_ERR << "Parsing config file, could not find transform of type " + << transform_type; + int32 pos = transform->InitFromConfig(0, &config_lines); + if (pos != static_cast(config_lines.size())) + KALDI_ERR << "Found junk at end of config file, starting with line " + << pos << ": " << config_lines[pos].WholeLine(); + KALDI_ASSERT(num_classes > 0); + transform->SetNumClasses(num_classes); + return transform; +} + +int32 DifferentiableTransformMapped::NumPdfs() const { + if (pdf_map.empty()) + return transform->NumClasses(); + else + return static_cast(pdf_map.size()); +} + +void DifferentiableTransformMapped::Read(std::istream &is, bool binary) { + if (transform) + delete transform; + transform = DifferentiableTransform::ReadNew(is, binary); + ReadIntegerVector(is, binary, &pdf_map); + Check(); +} + +void DifferentiableTransformMapped::Write(std::ostream &os, bool binary) const { + Check(); + transform->Write(os, binary); + WriteIntegerVector(os, binary, pdf_map); +} + + +void DifferentiableTransformMapped::Check() const { + KALDI_ASSERT(transform != NULL && + (pdf_map.empty() || + 1 + *std::max_element(pdf_map.begin(), pdf_map.end()) == + transform->NumClasses())); +} + +std::string DifferentiableTransformMapped::Info() const { + KALDI_ASSERT(transform != NULL); + std::ostringstream os; + os << "dim=" << transform->Dim() << std::endl + << "num-classes=" << transform->NumClasses() << std::endl + << "num-pdfs=" << NumPdfs() << std::endl; + return os.str(); +} + +DifferentiableTransformMapped::DifferentiableTransformMapped( + const 
DifferentiableTransformMapped &other): pdf_map(other.pdf_map) { + if (other.transform == NULL) transform = NULL; + else transform = other.transform->Copy(); +} + + +} // namespace differentiable_transform +} // namespace kaldi diff --git a/src/adapt/differentiable-transform-itf.h b/src/adapt/differentiable-transform-itf.h new file mode 100644 index 00000000000..e2842cf6af0 --- /dev/null +++ b/src/adapt/differentiable-transform-itf.h @@ -0,0 +1,444 @@ +// adapt/differentiable-transform-itf.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_ITF_H_ +#define KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_ITF_H_ + +#include +#include "base/kaldi-common.h" +#include "matrix/kaldi-matrix.h" +#include "cudamatrix/cu-matrix.h" +#include "util/text-utils.h" +#include "hmm/posterior.h" + + +namespace kaldi { +namespace differentiable_transform { + +class MinibatchInfoItf { + public: + virtual ~MinibatchInfoItf() { } +}; + + +class SpeakerStatsItf { + public: + // Does any estimation that is required-- you call this after accumulating + // stats and before calling TestingForward(). You'll normally want to + // override this, unless your object requires no estimation. + virtual void Estimate() { } + + virtual ~SpeakerStatsItf() { } +}; + + + +/** + This class is for speaker-dependent feature-space transformations -- + principally various varieties of fMLLR, including mean-only, diagonal and + block-diagonal versions -- which are intended for placement in the bottleneck + of a neural net. So code-wise, we'd have: bottom neural net, then transform, + then top neural net. The transform is designed to be differentiable, i.e. it + can be used during training to propagate derivatives from the top neural net + down to the bottom neural net. The reason this is non-trivial (i.e. why it's + not just a matrix multiplication) is that the value of the transform itself + depends on the features, and also on the speaker-independent statistics for + each class (i.e. the mean and variance), which also depend on the features + sicne we estimate them from the same minibatch. + You can view this as an extension of things like BatchNorm, except the + interface is more complicated because there is a dependence on the per-frame + class labels. + + The class labels we'll use here will probably be derived from some kind of + minimal tree, with hundreds instead of thousands of states. Part of the + reason for using a smaller number of states is that, to make the thing + properly differentiable during training, we need to use a small enough number + of states that we can obtain a reasonable estimate for the mean and (spherical) + variance of a Gaussian for each one in training time. 
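   To make the workflow described below concrete, here is a rough sketch of one
   training step and of test-time adaptation for one speaker.  The variable names,
   the surrounding nnet3 code, and the <BaseFloat> template arguments are
   illustrative assumptions, not part of this interface:

     // Training time, once per minibatch (e.g. 32 speakers x 4 chunks each):
     int32 num_spk = 32, num_chunks = 128;
     CuMatrix<BaseFloat> adapted(feats.NumRows(), feats.NumCols(), kUndefined),
         adapted_deriv(feats.NumRows(), feats.NumCols()),   // filled by the top network's backprop
         feats_deriv(feats.NumRows(), feats.NumCols());     // zero; TrainingBackward() adds to it.
     MinibatchInfoItf *info =
         transform->TrainingForward(feats, num_chunks, num_spk, post, &adapted);
     // ... run the 'top' network on 'adapted', backprop to obtain 'adapted_deriv' ...
     transform->TrainingBackward(feats, adapted_deriv, num_chunks, num_spk,
                                 post, info, &feats_deriv);  // takes ownership of 'info'.

     // Test time, once per speaker (features and posteriors on CPU):
     SpeakerStatsItf *stats = transform->GetEmptySpeakerStats();
     transform->TestingAccumulate(spk_feats,
                                  SubPosterior(spk_post, 0, spk_feats.NumRows(), 1),
                                  stats);
     stats->Estimate();
     Matrix<BaseFloat> adapted_spk(spk_feats.NumRows(), spk_feats.NumCols(), kUndefined);
     transform->TestingForward(spk_feats, *stats, &adapted_spk);
     delete stats;
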
As you can see in
+   http://isl.anthropomatik.kit.edu/pdf/Nguyen2017.pdf, it's generally better
+   for this kind of thing to use "simple target models" for adaptation rather than
+   very complex models.
+
+   Note: for training utterances we'll generally get the class labels used for
+   adaptation in a supervised manner, either by aligning a previous system like
+   a GMM system, or-- more likely-- from the (soft) posteriors of the
+   numerator graphs.  In test time, we'll usually be getting these class labels
+   from some kind of unsupervised process.
+
+   Because we tend to train neural nets on fairly small fixed-size chunks
+   (e.g. 1.5 seconds), and transforms like fMLLR don't tend to work very well
+   until you have about 5 seconds of data, we will usually be arranging those
+   chunks into groups where all members of the group come from the same
+   speaker.  So, for instance, instead of 128 totally separate chunks, we might
+   have 4 chunks per speaker and 32 speakers.
+
+   The basic pattern of usage of class DifferentiableTransform is this:
+
+    - Initialize the object prior to training, e.g. with InitFromConfig().
+
+    - Use this object to jointly train the 'bottom' (feature-extracting) and
+      'top' (ASR) network.  This involves functions TrainingForward() and
+      TrainingBackward() of this object; the posteriors used for that might be
+      dumped with the 'egs' (e.g. come from a GMM system), or might be derived
+      from the alignment of the numerator lattices in chain training.  Any
+      class means that must be estimated would be estimated on each minibatch
+      (we'll try to keep the minibatches as large as possible, and may use
+      tricks like using bigger minibatch sizes for the bottom
+      (feature-extracting) network and smaller ones for the top one, to save
+      memory).  At this stage, this object will most likely only contain
+      configuration information and not any kind of data-dependent statistics.
+
+    - Use some reasonable-sized subset of training data to accumulate more
+      reliable statistics for the target model using Accumulate() followed
+      by Estimate().  If NumFinalIterations() is more than one you may need
+      to do this in a short loop.
+
+    - In test time, for each speaker you'll:
+      - call GetEmptySpeakerStats() to get an object to store adaptation statistics
+        for your speaker.
+      - Obtain some class-level posteriors somehow (could come from an initial
+        decoding pass on all the data, or from the final decoding pass on the
+        part of the data you've seen up till now).  Use these to call
+        TestingAccumulate() to accumulate speaker stats.
+      - Call TestingForward() with the speaker-stats object to get
+        adapted features.
+
+
+ */
+class DifferentiableTransform {
+ public:
+
+  /// Return the dimension of the features this operates on.
+  virtual int32 Dim() const = 0;
+
+  /// Return the number of classes in the model used for adaptation.  These
+  /// will probably correspond to the leaves of a small tree, so they would
+  /// be pdf-ids.  This model only keeps track of the number of classes,
+  /// it does not contain any information about what they mean.  The
+  /// integers in the objects of type Posterior provided to this class
+  /// are expected to contain numbers from 0 to NumClasses() - 1.
+  int32 NumClasses() const { return num_classes_; }
+
+
+  /// This can be used to change the number of classes.  It would normally be
+  /// used, if at all, after the model is trained and prior to calling
+  /// Accumulate(), in case you want to use a more detailed model (e.g.
the + /// normal-size tree instead of the small one that we use during training). + /// Child classes may want to override this, in case they need to do + /// something more than just set this variable. + virtual void SetNumClasses(int32 num_classes) { num_classes_ = num_classes; } + + /** + This is the function you call in training time, for the forward + pass; it adapts the features. By "training time" here, we + assume you are training the 'bottom' neural net, that produces + the features in 'input'; if you were not training it, it would + be the same as test time as far as this function is concerned. + + @param [in] input The original, un-adapted features; these + will typically be output by a neural net, the 'bottom' net in our + terminology. This will correspond to a whole minibatch, + consisting of multiple speakers and multiple sequences (chunks) + per speaker. Caution: in the input and + output features, and the posteriors, the 't' has the larger + stride than the minibatch-index 'n', so the order is: + first frame of all sequences; then the second frame of + all sequences; and so on. This is the default order in + nnet3; see operator < of nnet3::Index. + @param [in] num_chunks The number of individual sequences + (e.g., chunks of speech) represented in 'input'. + input.NumRows() will equal num_sequences times the number + of time frames. + @param [in] num_spk The number of speakers. Must be greater than one, and + must divide num_chunks. The number of chunks per speaker + must be the same for all speakers (it will equal num_chunks / num_spk), + and the chunks for a speaker must be consecutively numbered. + @param [in] posteriors (note: this is a vector of vector of + pair). This provides, in 'soft-count' + form, the class supervision information that is used for the + adaptation. posteriors.size() will be equal to input.NumRows(), + and the ordering of its elements is the same as the ordering + of the rows of input (i.e. the 't' has the larger stride). + There is no assumption that the posteriors sum to one; + this allows you to do things like silence weighting. But + the posteriors are expected to be nonnegative. + @param [out] output The adapted output. This matrix should have the + same dimensions as 'input'. It does not have to be free of + NaNs when you call this function. + @return This function returns either NULL or an object of type + DifferentiableTransform*, which is expected to later be given + to the function TrainingBackward(). It will store + any information that needs to be remembered for the backward + phase. + */ + virtual MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const = 0; + + /** + This does the backpropagation, during the training pass. + + @param [in] input The original input (pre-transform) features that + were given to TrainingForward(). + @param [in] output_deriv The derivative of the objective function + (that we are backpropagating) w.r.t. the output. + @param [in] num_chunks,num_spk,posteriors + See TrainingForward() for information + about these arguments; they should be the same + values. + @param [in] minibatch_info The pointer returned by the corresponding + call to TrainingForward() (may be NULL). This function + takes ownership of the pointer. If for some reason the + backward pass was not done, the caller will likely + want to delete it themselves. + @param [in,out] input_deriv The derivative at the input, i.e. 
+ dF/d(input), where F is the function we are + evaluating. Must have the same dimension as + 'input'. The derivative is *added* to here. + This is useful because generally we will also + be training (perhaps with less weight) on + the unadapted features, in order to prevent them + from deviating too far from the adapted ones + and to allow the same model to be used for the + first pass. + */ + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const = 0; + + + /** + Returns the number of times you have to (call Accumulate() on a subset + of data, then call Estimate()) + */ + virtual int32 NumFinalIterations() = 0; + + /** + This will typically be called sequentially, minibatch by minibatch, + for a subset of training data, after training the neural nets, + followed by a call to Estimate(). Accumulate() stores statistics + that are used by Estimate(). This process is analogous to + computing the final stats in BatchNorm, in preparation for testing. + In practice it will be doing things like computing per-class means + and variances. + + @param [in] final_iter An iteration number in the range + [0, NumFinalIterations()]. In many cases there will + be only one iteration so this will just be zero. + + The input parameters are the same as the same-named parameters to + TrainingForward(); please refer to the documentation there. + */ + virtual void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) = 0; + + // Adds any stats accumulated via Accumulate() that are present in 'other' to + // 'this'. Used when summing adaptation-model statistics across multiple + // jobs. + virtual void Add(const DifferentiableTransform &other) = 0; + + // To be called after repeated calls to Accumulate(), does any estimation that + // is required in training time (normally per-speaker means and possibly + // variances. + // @param [in] final_iter An iteration number in the range + // [0, NumFinalIterations()]. In many cases there will + // be only one iteration so this will just be zero. + virtual void Estimate(int32 final_iter) = 0; + + // Returns an object representing sufficient statistics for estimating a + // speaker-dependent transform. This object will initially have zero + // counts in its statistics. It will represent the stats for a single + // speaker. + virtual SpeakerStatsItf *GetEmptySpeakerStats() const = 0; + + + // Accumulate statistics for a segment of test data, storing them in the + // object 'speaker_stats'. There is no assumption that the soft-counts in + // 'posteriors' are positive; this allows you to change your mind about the + // traceback, in test-time, by subtracting the stats that you no longer want + // to use. + virtual void TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const = 0; + + + // Applies the transformation implied by the statistics in 'speaker_stats' to + // 'input', storing in the result in 'output'. You must have done any estimation + // procedure that is required first, by calling Estimate() on the speaker-stats + // object. 'output' may contain NaN's at entry. 
+ virtual void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const = 0; + + + // This function outputs the speaker-specific transformation in a matrix form + // with an offset, i.e., a matrix of dimension Dim() by Dim() + 1 where + // the last column represents the offset term (the same way Kaldi represents + // LDA and fMLLR transforms as matrices. + // The 'speaker_stats' object must have had Estimate() called on it. + // 'transform' must be of dimension Dim() by Dim() + 1; it may contain + // NaN's at entry. + virtual void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const = 0; + + + // TestingForwardBatch() combines GetEmptySpeakerStats(), TestingAccumulate() and + // TestingForward(). It has a default implementation. It is a convenience + // function that may be useful during training under some circumstances, e.g. + // when you want to train only the top network. + virtual void TestingForwardBatch( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const; + + // Copies transform (deep copy). + virtual DifferentiableTransform* Copy() const = 0; + + // Return the type of this transform. E.g. "NoOpTransform". + virtual std::string Type() const = 0; + + /* + Initialize this object from the config line at position 'cur_pos' of the + vector 'config_lines'. This function may end up reading more lines than + one, if this is a transform type that contains other transforms. + + @param [in] cur_pos The starting position in config_lines; required + to be in the range [0, config_lines->size() - 1]. + The Type() of this object must match the first token + (function FirstToken()) of that ConfigLine. + @param [in,out] config_lines Config lines to be read. It's non-const + because the process of reading them has effects on + the lines themselves (the ConfigLine object keeps + track of which configuration values have been read). + @return Returns the next position to be read. Will be in the range + [cur_pos + 1, config_lines->size()]; if it's equal to + config_lines->size(), it means we're done. + */ + virtual int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) = 0; + + // Returns a new transform of the given type e.g. "NoOpTransform" + // or NULL if no such component type exists. If angle brackets are + // present, e.g. "", this function will detect and + // remove them. + static DifferentiableTransform *NewTransformOfType(const std::string &type); + + // Reads a differentiable transform from a config file (this function parses + // the file and reads a single DifferentiableTransform object from it). Note: + // since DifferentiableTransform objects can contain others, the file may + // contain many lines. Throws exception if it did not succeed-- including + // if the config file had junk at the end that was not parsed. + static DifferentiableTransform *ReadFromConfig(std::istream &is, + int32 num_classes); + + + + // Write transform to stream + virtual void Write(std::ostream &os, bool binary) const = 0; + + // Reads transform from stream (normally you would previously have created + // the transform object of the correct type using ReadNew(). + virtual void Read(std::istream &is, bool binary) = 0; + + // Read transform from stream (works out its type). Dies on error. 
+ // This will be used when reading in objects that have been written with + // the Write() function, since you won't know the type of the object + // beforehand. + static DifferentiableTransform* ReadNew(std::istream &is, bool binary); + + DifferentiableTransform(): num_classes_(-1) { } + + virtual ~DifferentiableTransform() { } + protected: + DifferentiableTransform(const DifferentiableTransform &other): + num_classes_(other.num_classes_) { } + + int32 num_classes_; +}; + + +/** + struct DifferentiableTransformMapped is just a holder of an object of type + DifferentiableTransform and a vector representing a map from + pdf-ids to classes. + + This map (if present) will be obtained from the binary build-tree-two-level, + and will map from tree leaves to a smaller number of classes (e.g. 200), so + that we can reasonably estimate the class means from a single minibatch + during training. The contents of 'pdf_map' should be in the range [0, + transform->NumClases() - 1]. + + */ +struct DifferentiableTransformMapped { + DifferentiableTransform *transform; + std::vector pdf_map; + + // This function returns pdf_map.size() if pdf_map is nonempty; otherwise + // it returns transform->NumClasses(). + int32 NumPdfs() const; + + void Read(std::istream &is, bool binary); + + void Write(std::ostream &os, bool binary) const; + + // Returns a string something like: + // dim=256 + // num-classes=200 + // num-pdfs=6391 + // ... in future we will likely add more information, but for now you can get it by + // copying to text form and looking at it directly. + // the "num-classes" is transform->NumClasses(), and "num-pdfs" is + // pdf_map.size() if pdf_map is nonempty; else, transform->NumClasses(). + std::string Info() const; + + // Check that the dimensions are consistent, i.e. pdf_map.empty() or + // transform->NumClasses() == max-element-in-pdf_map + 1. + void Check() const; + + DifferentiableTransformMapped(): transform(NULL) {} + + ~DifferentiableTransformMapped() { delete transform; } + + // Copy constructor + DifferentiableTransformMapped(const DifferentiableTransformMapped &other); +}; + + +} // namespace differentiable_transform +} // namespace kaldi + +#endif // KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ diff --git a/src/adapt/differentiable-transform-test.cc b/src/adapt/differentiable-transform-test.cc new file mode 100644 index 00000000000..8ad9ee7dcfa --- /dev/null +++ b/src/adapt/differentiable-transform-test.cc @@ -0,0 +1,281 @@ +// adapt/differentiable-transform-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "adapt/differentiable-transform.h" +#include "matrix/sp-matrix.h" + +namespace kaldi { +namespace differentiable_transform { + +// This function writes a random configuration file of dimension +// 'dim' (or a random dimension if dim == -1) to 'os'. 
+void WriteRandomConfigOfDim(std::ostream &os, int32 dim) { + // nonrandom_dim is a randomly chosen dimension if dim == -1, + // else it's dim. + int32 actual_dim = (dim == -1 ? RandInt(10, 20) : dim); + int32 i, num_transforms = RandInt(1, 3); + + while (true) { + // we loop here in case we hit a case we don't want to handle. + // We give more cases to the non-recursive transforms to ensure + // the expected size of the config file is finite. + switch(RandInt(0, 7)) { + case 0: + os << "NoOpTransform dim=" << actual_dim << "\n"; + return; + case 1: case 2: case 3: + os << "FmllrTransform dim=" << actual_dim << " smoothing-count=" + << 100.0 * RandInt(0, 2) << "\n"; + return; + case 4: case 5: + os << "MeanOnlyTransform dim=" << actual_dim << "\n"; + return; + case 6: + if (dim != -1) // complicated to ensure a given dim for AppendTransform. + continue; + os << "AppendTransform num-transforms=" << num_transforms << "\n"; + for (i = 0; i < num_transforms; i++) + WriteRandomConfigOfDim(os, -1); + return; + case 7: + os << "SequenceTransform num-transforms=" << num_transforms << "\n"; + for (i = 0; i < num_transforms; i++) + WriteRandomConfigOfDim(os, actual_dim); + return; + } + } + +} + +// This function writes a random configuration file to 'os'. +void WriteRandomConfigFile(std::ostream &os) { + WriteRandomConfigOfDim(os, -1); +} + + + +void UnitTestReadFromConfig() { + using namespace kaldi; + using namespace kaldi::differentiable_transform; + + for (int32 i = 0; i < 100; i++) { + std::ostringstream os; + WriteRandomConfigFile(os); + std::istringstream is(os.str()); + int32 num_classes = RandInt(20, 30); + DifferentiableTransform *transform = + DifferentiableTransform::ReadFromConfig(is, num_classes); + KALDI_ASSERT(transform != NULL); + delete transform; + } +} + +// Creates a random mean per class and adds it to the features, weighted +// according to the posteriors. It makes the tests more realistic, if +// there are systematic differences between the classes. +void AddRandomMeanOffsets(BaseFloat scale, + int32 num_classes, + const Posterior &post, + CuMatrix *feats) { + int32 T = feats->NumRows(), dim = feats->NumCols(); + CuMatrix class_means(num_classes, dim); + class_means.SetRandn(); + class_means.Scale(scale); + for (int32 t = 0; t < T; t++) { + auto iter = post[t].begin(), end = post[t].end(); + BaseFloat tot_post = 0.0; + for (; iter != end; ++iter) + tot_post += iter->second; + for (iter = post[t].begin(); iter != end; ++iter) { + int32 i = iter->first; + BaseFloat p = iter->second / tot_post; + feats->Row(t).AddVec(p, class_means.Row(i)); + } + } +} + +void GetRandomPosterior(int32 num_frames, int32 num_classes, + Posterior *post) { + post->resize(num_frames); + for (int32 t = 0; t < num_frames; t++) { + for (int32 i = 0; i < 3; i++) { + if (RandInt(0, 1) == 0) { + (*post)[t].push_back(std::pair( + RandInt(0, num_classes - 1), 0.1 + RandUniform())); + } + } + } + +} + +void TestTraining(DifferentiableTransform *transform) { + // test that the training process runs. + int32 dim = transform->Dim(), + num_classes = transform->NumClasses(), + num_frames = RandInt(200, 300), + num_spk = RandInt(2, 10), + chunks_per_spk = RandInt(1, 4), + num_rows = num_frames * num_spk * chunks_per_spk; + CuMatrix input_feats(num_rows, dim), + output_feats(num_rows, dim, kUndefined), + output_deriv(num_rows, dim, kUndefined), + input_deriv(num_rows, dim); + + // This is to verify that TrainingBackward() adds to, rather than + // setting to, the input deriv. 
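  // (The check works as follows: we pre-load input_deriv with a random matrix, run the
  //  backward pass, and then subtract the same random matrix again afterwards; if
  //  TrainingBackward() had overwritten input_deriv rather than added to it, the
  //  derivative comparison below would fail.)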
+ CuMatrix random_input_deriv(num_rows, dim); + random_input_deriv.SetRandn(); + input_deriv.AddMat(1.0, random_input_deriv); + + input_feats.SetRandn(); + output_deriv.SetRandn(); + Posterior post; + GetRandomPosterior(num_rows, num_classes, &post); + AddRandomMeanOffsets(10.0, num_classes, post, &input_feats); + + int32 num_chunks = num_spk * chunks_per_spk; + MinibatchInfoItf *info = + transform->TrainingForward(input_feats, num_chunks, num_spk, post, + &output_feats); + CuMatrix diff(input_feats); + diff.AddMat(-1.0, output_feats); + KALDI_LOG << "Difference in features (relative) is " + << (diff.FrobeniusNorm() / input_feats.FrobeniusNorm()); + + + transform->TrainingBackward(input_feats, output_deriv, num_chunks, + num_spk, post, info, &input_deriv); + // testing that TrainingBackward adds to the input deriv. + input_deriv.AddMat(-1.0, random_input_deriv); + + int32 n = 5; + Vector expected_changes(n), observed_changes(n); + BaseFloat epsilon = 1.0e-03; + for (int32 i = 0; i < n; i++) { + CuMatrix new_input_feats(num_rows, dim), + new_output_feats(num_rows, dim, kUndefined); + new_input_feats.SetRandn(); + new_input_feats.Scale(epsilon); + expected_changes(i) = TraceMatMat(new_input_feats, input_deriv, kTrans); + new_input_feats.AddMat(1.0, input_feats); + MinibatchInfoItf *info2 = + transform->TrainingForward(new_input_feats, num_chunks, num_spk, + post, &new_output_feats); + delete info2; + new_output_feats.AddMat(-1.0, output_feats); + observed_changes(i) = TraceMatMat(new_output_feats, output_deriv, kTrans); + } + KALDI_LOG << "Expected changes: " << expected_changes + << ", observed changes: " << observed_changes; + KALDI_ASSERT(expected_changes.ApproxEqual(observed_changes, 0.15)); + + { + // Test that if we do Accumulate() and Estimate() on the same data we + // trained on, and then TestingForwardBatch(), we get the same answer + // as during training. Note: this may not be true for all examples + // including SequenceTransform, due to how we treat the last of the + // transforms specially. + + int32 num_final_iters = transform->NumFinalIterations(); + for (int32 i = 0; i < num_final_iters; i++) { + transform->Accumulate(i, input_feats, num_chunks, num_spk, post); + // transform->Add(*transform); // Just check Add() does not crash. + // it does crash but because of AddVec() failing on this == other.. its ok. + transform->Estimate(i); + } + CuMatrix output_feats2(output_feats.NumRows(), + output_feats.NumCols(), kUndefined); + transform->TestingForwardBatch(input_feats, num_chunks, num_spk, post, + &output_feats2); + output_feats2.AddMat(-1.0, output_feats); + BaseFloat rel_diff = (output_feats2.FrobeniusNorm() / + output_feats.FrobeniusNorm()); + KALDI_LOG << "Difference in features train vs. test (relative) is " + << rel_diff; + if (rel_diff > 0.001) { + KALDI_WARN << "Make sure this config would not be equivalent train " + "vs. test (see config printed above)."; + } + } +} + + +void UnitTestTraining() { + for (int32 i = 0; i < 100; i++) { + std::ostringstream os; + WriteRandomConfigFile(os); + std::istringstream is(os.str()); + int32 num_classes = RandInt(20, 30); + DifferentiableTransform *transform = + DifferentiableTransform::ReadFromConfig(is, num_classes); + KALDI_LOG << "Config is: " << os.str(); + KALDI_ASSERT(transform != NULL); + if (os.str().find("smoothing-count=0") == std::string::npos) { + // Don't do this test if smoothing-count is zero: it can + // fail but it doesn't indicate a real problem. 
+ TestTraining(transform); + } + delete transform; + } +} + + +void UnitTestIo() { + for (int32 i = 0; i < 100; i++) { + std::ostringstream os; + WriteRandomConfigFile(os); + std::istringstream is(os.str()); + int32 num_classes = RandInt(20, 30); + DifferentiableTransform *transform = + DifferentiableTransform::ReadFromConfig(is, num_classes); + KALDI_ASSERT(transform != NULL); + + std::ostringstream os2; + bool binary = (RandInt(0,1) == 0); + transform->Write(os2, binary); + + std::istringstream is2(os2.str()); + + DifferentiableTransform *transform2 = + DifferentiableTransform::ReadNew(is2, binary); + std::ostringstream os3; + transform2->Write(os3, binary); + KALDI_ASSERT(os2.str() == os3.str()); + delete transform; + delete transform2; + } +} + + + +} // namespace kaldi +} // namespace differentiable_transform + + + +int main() { + using namespace kaldi::differentiable_transform; + kaldi::SetVerboseLevel(3); + for (int32 i = 0; i < 3; i++) { + UnitTestReadFromConfig(); + UnitTestIo(); + UnitTestTraining(); + } + std::cout << "Test OK.\n"; +} diff --git a/src/adapt/differentiable-transform.cc b/src/adapt/differentiable-transform.cc new file mode 100644 index 00000000000..bcaf356e695 --- /dev/null +++ b/src/adapt/differentiable-transform.cc @@ -0,0 +1,624 @@ +// adapt/differentiable-transform.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "adapt/differentiable-transform.h" + + +// This header contains the 'base-cases' of DifferentiableTransform: namely, +// FmllrTransform and MeanOnlyTransform. See also generic-transform.h where +// sequence, append and no-op types are defined. 
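For orientation, here is the kind of config text that ReadFromConfig() (defined in
differentiable-transform-itf.cc above) parses, and that the random-config code in
differentiable-transform-test.cc generates; the particular transforms and values are
made up for illustration:

  SequenceTransform num-transforms=2
  MeanOnlyTransform dim=40
  FmllrTransform dim=40 smoothing-count=100.0

It could be read with something like:

  std::istringstream is(config_string);
  DifferentiableTransform *transform =
      DifferentiableTransform::ReadFromConfig(is, num_classes);
  // ... train / use the transform ...
  delete transform;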
+namespace kaldi { +namespace differentiable_transform { + +FmllrMinibatchInfo::FmllrMinibatchInfo( + int32 num_classes, int32 dim, int32 num_speakers): + target_model(num_classes, dim), + estimators(num_speakers, NULL) { } + +FmllrMinibatchInfo::~FmllrMinibatchInfo() { + for (size_t i = 0; i < estimators.size(); i++) + delete estimators[i]; +} + + +void FmllrSpeakerStats::Estimate() { + BaseFloat objf_impr = estimator.Estimate(); + KALDI_VLOG(1) << "Objective function improvement per frame is " << objf_impr; +} + + +int32 FmllrTransform::InitFromConfig( + int32 cur_pos, + std::vector *config_lines) { + KALDI_ASSERT(cur_pos < int32(config_lines->size())); + ConfigLine *line = &((*config_lines)[cur_pos]); + KALDI_ASSERT(line->FirstToken() == Type()); + + if (!line->GetValue("dim", &dim_) || dim_ <= 0) + KALDI_ERR << "Dimension 'dim' must be specified for FmllrTransform, config " + "line is: " << line->WholeLine(); + fmllr_opts_.ReadFromConfig(line); + if (line->HasUnusedValues()) + KALDI_ERR << "Some configuration values were not used: '" + << line->UnusedValues() << "', in line: " + << line->WholeLine(); + return cur_pos + 1; +} + + +FmllrTransform::FmllrTransform(const FmllrTransform &other): + DifferentiableTransform(other), + dim_(other.dim_), fmllr_opts_(other.fmllr_opts_), + target_model_(other.target_model_ == NULL ? NULL : + new GaussianEstimator(*other.target_model_)) { } + +DifferentiableTransform *FmllrTransform::Copy() const { + return new FmllrTransform(*this); +} + +void FmllrTransform::Add(const DifferentiableTransform &other_in) { + const FmllrTransform *other = dynamic_cast(&other_in); + if (target_model_ && other->target_model_) + target_model_->Add(*(other->target_model_)); +} + +void FmllrTransform::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_classes_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + fmllr_opts_.Write(os, binary); + if (target_model_ != NULL) { + WriteToken(os, binary, ""); + target_model_->Write(os, binary); + } else { + WriteToken(os, binary, ""); + } + WriteToken(os, binary, ""); +} + +void FmllrTransform::Read(std::istream &is, bool binary) { + delete target_model_; + target_model_ = NULL; + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &num_classes_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &dim_); + fmllr_opts_.Read(is, binary); + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + target_model_ = new GaussianEstimator(num_classes_, dim_); + target_model_->Read(is, binary); + } // else "". + ExpectToken(is, binary, ""); +} + + +MinibatchInfoItf* FmllrTransform::TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const { + int32 num_classes = num_classes_, + dim = dim_, num_frames = input.NumRows(); + KALDI_ASSERT(SameDim(input, *output) && input.NumCols() == dim && + int32(posteriors.size()) == input.NumRows()); + KALDI_ASSERT(num_chunks % num_spk == 0 && num_spk > 1 && + num_frames % num_chunks == 0); + int32 chunks_per_spk = num_chunks / num_spk, + frames_per_chunk = num_frames / num_chunks; + + FmllrMinibatchInfo *ans = new FmllrMinibatchInfo(num_classes, + dim, num_spk); + + // The input is in CuMatrix, i.e. it's on the GPU if we're using a GPU. 
For + // now we just transfer everything to CPU, which of course is not optimal; we + // may later implement some of the deeper parts of this on GPU if the methods + // turn out to be effective. + Matrix input_cpu(input), + output_cpu(num_frames, dim, kUndefined); + + // First estimate the target model (Gaussian means and spherical variances). + ans->target_model.AccStats(input_cpu, posteriors); + ans->target_model.Estimate(fmllr_opts_); + + for (int32 s = 0; s < num_spk; s++) + ans->estimators[s] = new FmllrEstimator(fmllr_opts_, + ans->target_model.GetMeans(), + ans->target_model.GetVars()); + + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix this_input(input_cpu.RowData(chunk), + frames_per_chunk, // num-rows + dim, // num-cols + input_cpu.Stride() * num_chunks); // stride + SubPosterior this_posteriors(posteriors, + chunk, // offset + frames_per_chunk, // num_frames + num_chunks); // stride + ans->estimators[speaker]->AccStats(this_input, this_posteriors); + } + BaseFloat objf_impr = 0.0; + for (int32 s = 0; s < num_spk; s++) { + BaseFloat this_impr = ans->estimators[s]->Estimate(); + objf_impr += this_impr / num_spk; + } + // objf_impr is now the average objective-function improvement per frame. + // We will later find a better way to display this. + KALDI_LOG << "Objective function improvement per frame is " + << objf_impr; + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix + this_input(input_cpu.RowData(chunk), frames_per_chunk, dim, + input_cpu.Stride() * num_chunks), + this_output(output_cpu.RowData(chunk), + frames_per_chunk, dim, output_cpu.Stride() * num_chunks); + ans->estimators[speaker]->AdaptFeatures(this_input, &this_output); + } + output->CopyFromMat(output_cpu); + return ans; +} + +void FmllrTransform::TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const { + FmllrMinibatchInfo *info = dynamic_cast(minibatch_info); + KALDI_ASSERT(info != NULL && "Wrong type of minibatch info supplied."); + + int32 dim = dim_, num_frames = input.NumRows(); + KALDI_ASSERT(SameDim(input, output_deriv) && input.NumCols() == dim && + SameDim(input, *input_deriv) && + int32(posteriors.size()) == input.NumRows()); + KALDI_ASSERT(num_chunks % num_spk == 0 && num_spk > 1 && + num_frames % num_chunks == 0); + int32 chunks_per_spk = num_chunks / num_spk, + frames_per_chunk = num_frames / num_chunks; + + // For now we just transfer everything to the CPU. 
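  // The backward pass below mirrors the forward pass in reverse:
  //  (1) per chunk, AdaptFeaturesBackward() accumulates \bar{A} and \bar{b} and adds the
  //      'direct' term A^T \bar{y}_t to the input derivative;
  //  (2) per speaker, EstimateBackward() backprops through the estimation of A and b;
  //  (3) per chunk, AccStatsBackward() adds the part of the input derivative that comes
  //      from the effect of the features on the transform;
  //  (4) the resulting derivatives w.r.t. the class means and variances are then given to
  //      the target model, whose AccStatsBackward() adds its own contribution.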
+ Matrix input_cpu(input), + output_deriv_cpu(output_deriv), + input_deriv_cpu(num_frames, dim); + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix this_input( + input_cpu.RowData(chunk), frames_per_chunk, + dim, input_cpu.Stride() * num_chunks), + this_output_deriv(output_deriv_cpu.RowData(chunk), + frames_per_chunk, dim, + output_deriv_cpu.Stride() * num_chunks), + this_input_deriv(input_deriv_cpu.RowData(chunk), + frames_per_chunk, dim, + input_deriv_cpu.Stride() * num_chunks); + info->estimators[speaker]->AdaptFeaturesBackward( + this_input, this_output_deriv, &this_input_deriv); + } + + for (int32 s = 0; s < num_spk; s++) + info->estimators[s]->EstimateBackward(); + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix this_input( + input_cpu.RowData(chunk), frames_per_chunk, + dim, input_cpu.Stride() * num_chunks), + this_output_deriv(output_deriv_cpu.RowData(chunk), + frames_per_chunk, dim, + output_deriv_cpu.Stride() * num_chunks), + this_input_deriv(input_deriv_cpu.RowData(chunk), + frames_per_chunk, dim, + input_deriv_cpu.Stride() * num_chunks); + SubPosterior this_posteriors(posteriors, chunk, + frames_per_chunk, num_chunks); + info->estimators[speaker]->AccStatsBackward( + this_input, this_posteriors, &this_input_deriv); + } + + for (int32 s = 0; s < num_spk; s++) + info->target_model.AddToOutputDerivs(info->estimators[s]->GetMeanDeriv(), + info->estimators[s]->GetVarDeriv()); + + info->target_model.AccStatsBackward(input_cpu, posteriors, &input_deriv_cpu); + // These TrainingBackward() functions are all supposed to add to the + // 'input_deriv'. + CuMatrix input_deriv_temp(input_deriv->NumRows(), + input_deriv->NumCols(), + kUndefined); + input_deriv_temp.CopyFromMat(input_deriv_cpu); + input_deriv->AddMat(1.0, input_deriv_temp); + + delete info; +} + + +void FmllrTransform::Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) { + KALDI_ASSERT(final_iter == 0); + if (target_model_ == NULL) + target_model_ = new GaussianEstimator(num_classes_, dim_); + Matrix input_cpu(input); + target_model_->AccStats(input_cpu, posteriors); +} + + +void FmllrTransform::Estimate(int32 final_iter) { + KALDI_ASSERT(final_iter == 0 && target_model_ != NULL); + target_model_->Estimate(fmllr_opts_); +} + + +SpeakerStatsItf *FmllrTransform::GetEmptySpeakerStats() const { + KALDI_ASSERT(target_model_ != NULL && + target_model_->GetMeans().NumRows() != 0 && + "You're trying to do adaptation with speaker transforms on " + "which you haven't done the final phase of training."); + return new FmllrSpeakerStats(fmllr_opts_, target_model_->GetMeans(), + target_model_->GetVars()); +} + +void FmllrTransform::TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const { + FmllrSpeakerStats *stats = dynamic_cast( + speaker_stats); + KALDI_ASSERT(stats != NULL && "Wrong type of speaker stats supplied."); + stats->estimator.AccStats(input, posteriors); +} + +void FmllrTransform::TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const { + const FmllrSpeakerStats *stats = dynamic_cast( + &speaker_stats); + KALDI_ASSERT(stats != NULL && "Wrong type of speaker stats supplied."); + KALDI_ASSERT(stats->estimator.IsEstimated() && + "You can't call TestingForward() without calling Estimate() on " + "the speaker stats."); 
+ stats->estimator.AdaptFeatures(input, output); +} + +void FmllrTransform::GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const { + const FmllrSpeakerStats *stats = dynamic_cast( + &speaker_stats); + int32 dim = Dim(); + KALDI_ASSERT(transform->NumRows() == dim && transform->NumCols() == dim + 1); + transform->ColRange(0, dim).CopyFromMat(stats->estimator.GetLinearParams()); + transform->CopyColFromVec(stats->estimator.GetBiasParams(), dim); +} + +FmllrTransform::~FmllrTransform() { + delete target_model_; +} + + +MeanOnlyTransformMinibatchInfo::MeanOnlyTransformMinibatchInfo( + int32 num_classes, int32 dim, int32 num_speakers): + target_model(num_classes, dim), + estimators(num_speakers, NULL) { } + +MeanOnlyTransformMinibatchInfo::~MeanOnlyTransformMinibatchInfo() { + for (size_t i = 0; i < estimators.size(); i++) + delete estimators[i]; +} + + +int32 MeanOnlyTransform::InitFromConfig( + int32 cur_pos, + std::vector *config_lines) { + KALDI_ASSERT(cur_pos < int32(config_lines->size())); + ConfigLine *line = &((*config_lines)[cur_pos]); + KALDI_ASSERT(line->FirstToken() == Type()); + + if (!line->GetValue("dim", &dim_) || dim_ <= 0) + KALDI_ERR << "Dimension 'dim' must be specified for MeanOnlyTransform, config " + "line is: " << line->WholeLine(); + if (line->HasUnusedValues()) + KALDI_ERR << "Some configuration values were not used: '" + << line->UnusedValues() << "', in line: " + << line->WholeLine(); + return cur_pos + 1; +} + +MeanOnlyTransform::MeanOnlyTransform(const MeanOnlyTransform &other): + DifferentiableTransform(other), + dim_(other.dim_), target_model_(other.target_model_ == NULL ? NULL : + new GaussianEstimator(*other.target_model_)) { } + + +void MeanOnlyTransform::Add(const DifferentiableTransform &other_in) { + const MeanOnlyTransform *other = + dynamic_cast(&other_in); + if (target_model_ && other->target_model_) + target_model_->Add(*(other->target_model_)); +} + +void MeanOnlyTransform::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_classes_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + if (target_model_ != NULL) { + WriteToken(os, binary, ""); + target_model_->Write(os, binary); + } else { + WriteToken(os, binary, ""); + } + WriteToken(os, binary, ""); +} + +void MeanOnlyTransform::Read(std::istream &is, bool binary) { + delete target_model_; + target_model_ = NULL; + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &num_classes_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &dim_); + std::string tok; + ReadToken(is, binary, &tok); + if (tok == "") { + target_model_ = new GaussianEstimator(num_classes_, dim_); + target_model_->Read(is, binary); + } // else "". 
+ ExpectToken(is, binary, ""); +} + + +MinibatchInfoItf* MeanOnlyTransform::TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const { + int32 num_classes = num_classes_, + dim = dim_, num_frames = input.NumRows(); + KALDI_ASSERT(SameDim(input, *output) && input.NumCols() == dim && + int32(posteriors.size()) == input.NumRows()); + KALDI_ASSERT(num_chunks % num_spk == 0 && num_spk > 1 && + num_frames % num_chunks == 0); + int32 chunks_per_spk = num_chunks / num_spk, + frames_per_chunk = num_frames / num_chunks; + + MeanOnlyTransformMinibatchInfo *ans = new MeanOnlyTransformMinibatchInfo(num_classes, + dim, num_spk); + + // The input is in CuMatrix, i.e. it's on the GPU if we're using a GPU. For + // now we just transfer everything to CPU, which of course is not optimal; we + // may later implement some of the deeper parts of this on GPU if the methods + // turn out to be effective. + Matrix input_cpu(input), + output_cpu(num_frames, dim, kUndefined); + + // First estimate the target model (Gaussian means and spherical variances). + // We use the default options: they only affect the variances, which we won't + // be using. + ans->target_model.AccStats(input_cpu, posteriors); + FmllrEstimatorOptions default_opts; + ans->target_model.Estimate(default_opts); + + for (int32 s = 0; s < num_spk; s++) + ans->estimators[s] = new MeanOnlyTransformEstimator( + ans->target_model.GetMeans()); + + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix this_input(input_cpu.RowData(chunk), + frames_per_chunk, // num-rows + dim, // num-cols + input_cpu.Stride() * num_chunks); // stride + SubPosterior this_posteriors(posteriors, + chunk, // offset + frames_per_chunk, // num_frames + num_chunks); // stride + ans->estimators[speaker]->AccStats(this_input, this_posteriors); + } + for (int32 s = 0; s < num_spk; s++) + ans->estimators[s]->Estimate(); + + for (int32 chunk = 0; chunk < num_chunks; chunk++) { + int32 speaker = chunk / chunks_per_spk; + SubMatrix + this_input(input_cpu.RowData(chunk), frames_per_chunk, dim, + input_cpu.Stride() * num_chunks), + this_output(output_cpu.RowData(chunk), + frames_per_chunk, dim, output_cpu.Stride() * num_chunks); + ans->estimators[speaker]->AdaptFeatures(this_input, &this_output); + } + output->CopyFromMat(output_cpu); + return ans; +} + + +DifferentiableTransform *MeanOnlyTransform::Copy() const { + return new MeanOnlyTransform(*this); +} + +void MeanOnlyTransform::TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const { + MeanOnlyTransformMinibatchInfo *info = + dynamic_cast(minibatch_info); + KALDI_ASSERT(info != NULL && "Wrong type of minibatch info supplied."); + + int32 dim = dim_, num_frames = input.NumRows(); + KALDI_ASSERT(SameDim(input, output_deriv) && input.NumCols() == dim && + SameDim(input, *input_deriv) && + int32(posteriors.size()) == input.NumRows()); + KALDI_ASSERT(num_chunks % num_spk == 0 && num_spk > 1 && + num_frames % num_chunks == 0); + int32 chunks_per_spk = num_chunks / num_spk, + frames_per_chunk = num_frames / num_chunks; + + // For now we just transfer everything to the CPU. 
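[Editor's note] The strided per-chunk views constructed just below rely on the minibatch row layout being "frame 0 of every chunk, then frame 1 of every chunk, ...", so chunk c owns rows c, c + num_chunks, c + 2*num_chunks, ..., and belongs to speaker c / chunks_per_spk. The following self-contained toy program (arbitrary sizes, no Kaldi types) illustrates exactly the indexing that RowData(chunk) plus a row stride of Stride() * num_chunks implements.

// Toy illustration of the chunk/speaker layout assumed by the strided
// sub-matrices below; all numbers are arbitrary.
#include <cstdio>
#include <vector>

int main() {
  const int num_spk = 2, num_chunks = 4, frames_per_chunk = 3;
  const int chunks_per_spk = num_chunks / num_spk;
  const int num_frames = num_chunks * frames_per_chunk;

  // Row r of the minibatch holds frame (r / num_chunks) of chunk (r % num_chunks).
  std::vector<int> chunk_of_row(num_frames), frame_of_row(num_frames);
  for (int r = 0; r < num_frames; r++) {
    chunk_of_row[r] = r % num_chunks;
    frame_of_row[r] = r / num_chunks;
  }

  // A "view" of one chunk starts at row `chunk` and advances num_chunks rows
  // at a time.
  for (int chunk = 0; chunk < num_chunks; chunk++) {
    int speaker = chunk / chunks_per_spk;
    std::printf("chunk %d (speaker %d) owns rows:", chunk, speaker);
    for (int t = 0; t < frames_per_chunk; t++) {
      int row = chunk + t * num_chunks;
      std::printf(" %d", row);
      // Sanity check: the row really belongs to this chunk and frame index.
      if (chunk_of_row[row] != chunk || frame_of_row[row] != t) return 1;
    }
    std::printf("\n");
  }
  return 0;
}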
+  Matrix<BaseFloat> input_cpu(input),
+      output_deriv_cpu(output_deriv),
+      input_deriv_cpu(num_frames, dim);
+
+  for (int32 chunk = 0; chunk < num_chunks; chunk++) {
+    int32 speaker = chunk / chunks_per_spk;
+    SubMatrix<BaseFloat> this_input(
+        input_cpu.RowData(chunk), frames_per_chunk,
+        dim, input_cpu.Stride() * num_chunks),
+        this_output_deriv(output_deriv_cpu.RowData(chunk),
+                          frames_per_chunk, dim,
+                          output_deriv_cpu.Stride() * num_chunks),
+        this_input_deriv(input_deriv_cpu.RowData(chunk),
+                         frames_per_chunk, dim,
+                         input_deriv_cpu.Stride() * num_chunks);
+    info->estimators[speaker]->AdaptFeaturesBackward(
+        this_input, this_output_deriv, &this_input_deriv);
+  }
+
+  for (int32 s = 0; s < num_spk; s++)
+    info->estimators[s]->EstimateBackward();
+
+  for (int32 chunk = 0; chunk < num_chunks; chunk++) {
+    int32 speaker = chunk / chunks_per_spk;
+    SubMatrix<BaseFloat> this_input(
+        input_cpu.RowData(chunk), frames_per_chunk,
+        dim, input_cpu.Stride() * num_chunks),
+        this_output_deriv(output_deriv_cpu.RowData(chunk),
+                          frames_per_chunk, dim,
+                          output_deriv_cpu.Stride() * num_chunks),
+        this_input_deriv(input_deriv_cpu.RowData(chunk),
+                         frames_per_chunk, dim,
+                         input_deriv_cpu.Stride() * num_chunks);
+    SubPosterior this_posteriors(posteriors, chunk,
+                                 frames_per_chunk, num_chunks);
+    info->estimators[speaker]->AccStatsBackward(
+        this_input, this_posteriors, &this_input_deriv);
+  }
+
+  for (int32 s = 0; s < num_spk; s++) {
+    Vector<BaseFloat> var_derivs(num_classes_);  // zero.
+    info->target_model.AddToOutputDerivs(info->estimators[s]->GetMeanDeriv(),
+                                         var_derivs);
+  }
+
+  info->target_model.AccStatsBackward(input_cpu, posteriors, &input_deriv_cpu);
+  // These TrainingBackward() functions are all supposed to add to the
+  // 'input_deriv'.
+  CuMatrix<BaseFloat> input_deriv_temp(input_deriv->NumRows(),
+                                       input_deriv->NumCols(),
+                                       kUndefined);
+  input_deriv_temp.CopyFromMat(input_deriv_cpu);
+  input_deriv->AddMat(1.0, input_deriv_temp);
+  delete info;
+}
+
+
+void MeanOnlyTransform::Accumulate(
+    int32 final_iter,
+    const CuMatrixBase<BaseFloat> &input,
+    int32 num_chunks,
+    int32 num_spk,
+    const Posterior &posteriors) {
+  KALDI_ASSERT(final_iter == 0);
+  if (target_model_ == NULL)
+    target_model_ = new GaussianEstimator(num_classes_, dim_);
+  Matrix<BaseFloat> input_cpu(input);
+  target_model_->AccStats(input_cpu, posteriors);
+}
+
+void MeanOnlyTransform::Estimate(int32 final_iter) {
+  KALDI_ASSERT(final_iter == 0 && target_model_ != NULL);
+  // The options only affect the estimates of the variance, which we don't use
+  // here, so we use the default options.
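[Editor's note] The actual MeanOnlyTransformEstimator lives in differentiable-fmllr.h and is not shown in this patch; purely as an illustration of what a per-speaker mean offset does, the toy program below computes an offset under the simplest assumption I could make: choose the offset that makes the posterior-weighted mean of the adapted features match the posterior-weighted combination of the speaker-independent class means. Every name, number, and the closed form itself is an assumption for illustration, not a statement about the real estimator. (MeanOnlyTransform::Estimate() continues below with the default FmllrEstimatorOptions mentioned in the comment above.)

// Toy sketch (1-D features, 2 classes); NOT the real MeanOnlyTransformEstimator.
// Assumption for illustration: pick offset b so the posterior-weighted mean of
// (x + b) equals the posterior-weighted combination of the class means mu[c].
#include <cstdio>
#include <vector>

int main() {
  const int num_classes = 2;
  double mu[num_classes] = {0.0, 4.0};          // speaker-independent class means
  std::vector<double> feats = {1.0, 2.0, 5.0};  // one speaker's (1-D) features
  // post[t][c] = soft count of class c on frame t.
  std::vector<std::vector<double> > post = {
    {0.9, 0.1}, {0.8, 0.2}, {0.1, 0.9} };

  double tot_gamma = 0.0, weighted_mu = 0.0, weighted_x = 0.0;
  for (size_t t = 0; t < feats.size(); t++) {
    for (int c = 0; c < num_classes; c++) {
      tot_gamma   += post[t][c];
      weighted_mu += post[t][c] * mu[c];
      weighted_x  += post[t][c] * feats[t];
    }
  }
  double offset = (weighted_mu - weighted_x) / tot_gamma;
  std::printf("offset = %.3f\n", offset);
  for (size_t t = 0; t < feats.size(); t++)
    std::printf("frame %zu: %.2f -> %.2f\n", t, feats[t], feats[t] + offset);
  return 0;
}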
+ FmllrEstimatorOptions default_opts; + target_model_->Estimate(default_opts); +} + + + +SpeakerStatsItf *MeanOnlyTransform::GetEmptySpeakerStats() const { + KALDI_ASSERT(target_model_ != NULL && + target_model_->GetMeans().NumRows() != 0 && + "You're trying to do adaptation with speaker transforms on " + "which you haven't done the final phase of training."); + return new MeanOnlyTransformSpeakerStats(target_model_->GetMeans()); +} + +void MeanOnlyTransform::TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const { + MeanOnlyTransformSpeakerStats *stats = dynamic_cast( + speaker_stats); + KALDI_ASSERT(stats != NULL && "Wrong type of speaker stats supplied."); + stats->estimator.AccStats(input, posteriors); +} + +void MeanOnlyTransform::TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const { + const MeanOnlyTransformSpeakerStats *stats = dynamic_cast( + &speaker_stats); + KALDI_ASSERT(stats != NULL && "Wrong type of speaker stats supplied."); + KALDI_ASSERT(stats->estimator.IsEstimated() && + "You can't call TestingForward() without calling Estimate() on " + "the speaker stats."); + stats->estimator.AdaptFeatures(input, output); +} + +void MeanOnlyTransform::GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const { + const MeanOnlyTransformSpeakerStats *stats = + dynamic_cast(&speaker_stats); + int32 dim = Dim(); + KALDI_ASSERT(transform->NumRows() == dim && transform->NumCols() == dim + 1); + transform->SetUnit(); + transform->CopyColFromVec(stats->estimator.GetOffset(), dim); +} + +MeanOnlyTransform::~MeanOnlyTransform() { + delete target_model_; +} + + + +} // namespace differentiable_transform +} // namespace kaldi diff --git a/src/adapt/differentiable-transform.h b/src/adapt/differentiable-transform.h new file mode 100644 index 00000000000..c3abb1bbb96 --- /dev/null +++ b/src/adapt/differentiable-transform.h @@ -0,0 +1,289 @@ +// adapt/differentiable-transform.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ +#define KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ + +#include + +#include "base/kaldi-common.h" +#include "matrix/kaldi-matrix.h" +#include "cudamatrix/cu-matrix.h" +#include "adapt/differentiable-transform-itf.h" +#include "adapt/differentiable-fmllr.h" + + +// This header contains the 'base-cases' of DifferentiableTransform: namely, +// FmllrTransform and MeanOnlyTransform. See also generic-transform.h where +// sequence, append and no-op types are defined. 
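[Editor's note] Both GetTransformAsMatrix() implementations above export the per-speaker transform as a dim x (dim+1) affine matrix [A | b]: FmllrTransform fills A with the linear parameters and the last column with the bias, while MeanOnlyTransform sets A to the identity and the last column to the offset. Applying such a matrix means y = A x + b, i.e. multiplying by the matrix after appending a 1 to x. A small self-contained sketch with made-up numbers:

// Applying a dim x (dim+1) affine transform [A | b] to a feature vector:
// y = A * x + b.  Plain C++, toy numbers; not part of this patch.
#include <cstdio>
#include <vector>

int main() {
  const int dim = 2;
  // Row-major dim x (dim+1) matrix; the last column is the bias b.
  double transform[dim][dim + 1] = {
    {1.0, 0.5, -2.0},   // row 0: A(0,0) A(0,1) b(0)
    {0.0, 2.0,  1.0} }; // row 1: A(1,0) A(1,1) b(1)
  std::vector<double> x = {3.0, 4.0}, y(dim, 0.0);

  for (int r = 0; r < dim; r++) {
    for (int c = 0; c < dim; c++)
      y[r] += transform[r][c] * x[c];
    y[r] += transform[r][dim];   // bias column
  }
  std::printf("y = (%.1f, %.1f)\n", y[0], y[1]);  // prints (3.0, 9.0)
  return 0;
}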
+namespace kaldi { +namespace differentiable_transform { + + +/** + This is a version of the transform class that implements fMLLR (with + spherical variances, to make the update equations non-iterative); see + differentiable-fmllr.h where the core parts of this are implemented, + this provides the interface compatible with DifferentiableTransform. + + Please see the comments in class DifferentiableTransform (in + differentiable-transform-itf.h) for the meaning and usage of the various + interface functions and their parameters. +*/ +class FmllrTransform: public DifferentiableTransform { + public: + int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) override; + + + int32 Dim() const override { return dim_; } + + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override; + + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const override; + + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override; + + void Estimate(int32 final_iter) override; + + int32 NumFinalIterations() override { return 1; } + + SpeakerStatsItf *GetEmptySpeakerStats() const override; + + void TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const override; + + void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const override; + + void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const override; + + FmllrTransform(const FmllrTransform &other); + + FmllrTransform(): target_model_(NULL) { } + + std::string Type() const override { return "FmllrTransform"; } + + DifferentiableTransform* Copy() const override; + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + + void Add(const DifferentiableTransform &other) override; + + ~FmllrTransform(); + private: + int32 dim_; + + FmllrEstimatorOptions fmllr_opts_; + + // Note: this target model is only for consumption in test time; it is + // produced right at the end of training when Accumulate() and Estimate() are + // called. We allocate it the first time Accumulate() is called. In training + // time the corresponding stats are esimated minibatch by minibatch, not via + // this member (which is why we don't expect to have that many classes in + // training time). At the end of training we'll accumulate stats here in + // Accumulate(), and Estimate() will estimate it. + GaussianEstimator *target_model_; +}; + +class FmllrMinibatchInfo: public MinibatchInfoItf { + public: + + FmllrMinibatchInfo(int32 num_classes, int32 dim, int32 num_speakers); + + GaussianEstimator target_model; + + // One estimator of Fmllr per speaker. Make them pointers so we don't have to + // implement self-constructor for class FmllrEstimator. + std::vector estimators; + + ~FmllrMinibatchInfo(); +}; + +class FmllrSpeakerStats: public SpeakerStatsItf { + public: + // Caution: this object maintains references to mu and s, so it's not a good + // idea to let the target-model (which lives in the FmllrTransform object) be + // deleted during the lifetime of this object. 
+ FmllrSpeakerStats(const FmllrEstimatorOptions &opts, + const MatrixBase &mu, + const VectorBase &s): + estimator(opts, mu, s) { } + + void Estimate() override; + + FmllrEstimator estimator; + + ~FmllrSpeakerStats() { } +}; + +/** + This version of the transform class does a mean normalization: adding an + offset to its input so that the difference (per speaker) of the transformed + class means from the speaker-independent class means is minimized. + This is like a mean-only fMLLR with fixed (say, unit) covariance model. + */ +class MeanOnlyTransform: public DifferentiableTransform { + public: + /* + Example config line: + + MeanOnlyTransform dim=100 + */ + int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) override; + + + int32 Dim() const override { return dim_; } + + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override; + + virtual void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const override; + + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override; + + void Estimate(int32 final_iter) override; + + int32 NumFinalIterations() override { return 1; } + + SpeakerStatsItf *GetEmptySpeakerStats() const override; + + void TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const override; + + void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const override; + + void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const override; + + MeanOnlyTransform(const MeanOnlyTransform &other); + + MeanOnlyTransform(): target_model_(NULL) { } + + std::string Type() const override { return "MeanOnlyTransform"; } + + DifferentiableTransform* Copy() const override; + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + + void Add(const DifferentiableTransform &other) override; + + ~MeanOnlyTransform(); + private: + int32 dim_; + + // Note: this target model is only for consumption in test time; it is + // produced right at the end of training when Accumulate() and Estimate() are + // called. We allocate it the first time Accumulate() is called. In training + // time the corresponding stats are esimated minibatch by minibatch, not via + // this member (which is why we don't expect to have that many classes in + // training time). At the end of training we'll accumulate stats here in + // Accumulate(), and Estimate() will estimate it. + GaussianEstimator *target_model_; +}; + +class MeanOnlyTransformMinibatchInfo: public MinibatchInfoItf { + public: + + MeanOnlyTransformMinibatchInfo(int32 num_classes, int32 dim, + int32 num_speakers); + + GaussianEstimator target_model; + + // One estimator of offset per speaker. Make them pointers so we don't have to + // implement self-constructor for class FmllrEstimator. 
+ std::vector estimators; + + ~MeanOnlyTransformMinibatchInfo(); +}; + +class MeanOnlyTransformSpeakerStats: public SpeakerStatsItf { + public: + // Caution: this object maintains a reference to mu, so it's not a good idea + // to let the target-model (which lives in the FmllrTransform object) be + // deleted during the lifetime of this object. + MeanOnlyTransformSpeakerStats(const MatrixBase &mu): + estimator(mu) { } + + void Estimate() override { estimator.Estimate(); } + + MeanOnlyTransformEstimator estimator; + + ~MeanOnlyTransformSpeakerStats() { } +}; + + + + + +} // namespace differentiable_transform +} // namespace kaldi + +#endif // KALDI_TRANSFORM_DIFFERENTIABLE_TRANSFORM_H_ diff --git a/src/adapt/generic-transform.cc b/src/adapt/generic-transform.cc new file mode 100644 index 00000000000..c2c73aefe85 --- /dev/null +++ b/src/adapt/generic-transform.cc @@ -0,0 +1,616 @@ +// adapt/generic-transform.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "adapt/differentiable-transform-itf.h" +#include "adapt/generic-transform.h" + +namespace kaldi { +namespace differentiable_transform { + + +int32 NoOpTransform::InitFromConfig( + int32 cur_pos, + std::vector *config_lines) { + KALDI_ASSERT(cur_pos < int32(config_lines->size())); + ConfigLine *line = &((*config_lines)[cur_pos]); + KALDI_ASSERT(line->FirstToken() == Type()); + if (!line->GetValue("dim", &dim_) || dim_ <= 0) + KALDI_ERR << "Dimension 'dim' must be specified for NoOpTransform, config " + "line is: " << line->WholeLine(); + if (line->HasUnusedValues()) + KALDI_ERR << "Some configuration values were not used: '" + << line->UnusedValues() << "', in line: " + << line->WholeLine(); + return cur_pos + 1; +} + + +void NoOpTransform::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_classes_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); +} + +void NoOpTransform::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &num_classes_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); +} + + +int32 SequenceTransform::InitFromConfig( + int32 cur_pos, + std::vector *config_lines) { + KALDI_ASSERT(cur_pos < int32(config_lines->size()) && + transforms_.empty()); + ConfigLine *line = &((*config_lines)[cur_pos]); + KALDI_ASSERT(line->FirstToken() == Type()); + int32 num_transforms = -1; + if (!line->GetValue("num-transforms", &num_transforms) || + num_transforms <= 0) + KALDI_ERR << "Config value num-transforms must be specified for " + "SequenceTransform, line is: " << line->WholeLine(); + if (line->HasUnusedValues()) + KALDI_ERR << "Some configuration values were not used: '" + << 
line->UnusedValues() << "', in line: " + << line->WholeLine(); + cur_pos++; + + int32 dim = 0; + for (int32 i = 0; i < num_transforms; i++) { + if (cur_pos >= int32(config_lines->size())) + KALDI_ERR << "Config file lacks enough lines for SequenceTransform."; + ConfigLine *other_line = &((*config_lines)[cur_pos]); + std::string transform_type = other_line->FirstToken(); + DifferentiableTransform *transform = NewTransformOfType(transform_type); + if (transform == NULL) + KALDI_ERR << "Could not find transform of type " << transform_type; + cur_pos = transform->InitFromConfig(cur_pos, config_lines); + if (i == 0) { + dim = transform->Dim(); + } else if (dim != transform->Dim()) { + KALDI_ERR << "Transforms used in SequenceTransform have inconsistent dim: " + << dim << " vs " << transform->Dim(); + } + transforms_.push_back(transform); + } + return cur_pos; +} + + +SequenceTransform::SequenceTransform(const SequenceTransform &other): + DifferentiableTransform(other), + transforms_(other.transforms_.size(), NULL) { + for (size_t i = 0; i < other.transforms_.size(); i++) + transforms_[i] = other.transforms_[i]->Copy(); +} + + +void SequenceTransform::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_classes_); + WriteToken(os, binary, ""); + int32 num_transforms = transforms_.size(); + WriteBasicType(os, binary, num_transforms); + for (int32 i = 0; i < num_transforms; i++) + transforms_[i]->Write(os, binary); + WriteToken(os, binary, ""); +} + +void SequenceTransform::Read(std::istream &is, bool binary) { + while (!transforms_.empty()) { + delete transforms_.back(); + transforms_.pop_back(); + } + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &num_classes_); + ExpectToken(is, binary, ""); + int32 num_transforms; + ReadBasicType(is, binary, &num_transforms); + for (int32 i = 0; i < num_transforms; i++) { + std::string tok; + ReadToken(is, binary, &tok); + DifferentiableTransform *transform; + if (!(transform = NewTransformOfType(tok))) + KALDI_ERR << "Expected the name of a transform, got " + << tok << " (maybe you should recompile?)"; + transform->Read(is, binary); + transforms_.push_back(transform); + } + ExpectToken(is, binary, ""); +} + +void SequenceTransform::Add(const DifferentiableTransform &other_in) { + const SequenceTransform *other = dynamic_cast( + &other_in); + KALDI_ASSERT(transforms_.size() == other->transforms_.size()); + for (size_t i = 0; i < transforms_.size(); i++) + transforms_[i]->Add(*(other->transforms_[i])); +} + +int32 SequenceTransform::Dim() const { + size_t num_transforms = transforms_.size(); + KALDI_ASSERT(num_transforms > 0); + return transforms_[0]->Dim(); +} + +void SequenceTransform::SetNumClasses(int32 num_classes) { + KALDI_ASSERT(num_classes > 0); + num_classes_ = num_classes; + for (size_t i = 0; i < transforms_.size(); i++) { + transforms_[i]->SetNumClasses(num_classes); + } +} + +SequenceTransform::~SequenceTransform() { + for (size_t i = 0; i < transforms_.size(); i++) + delete transforms_[i]; +} + +MinibatchInfoItf* SequenceTransform::TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const { + KALDI_ASSERT(SameDim(input, *output) && + !transforms_.empty()); + SequenceMinibatchInfo *ans = new SequenceMinibatchInfo(); + + const CuMatrixBase *last_output = &input; + CuMatrixBase *this_output; + + ans->outputs.resize(transforms_.size() - 1); + + for (size_t i 
= 0; i < transforms_.size(); i++) { + if (i + 1 == transforms_.size()) { + this_output = output; + } else { + // not the final transform. + ans->outputs[i].Resize(output->NumRows(), output->NumCols(), kUndefined); + this_output = &(ans->outputs[i]); + } + ans->info_vec.push_back(transforms_[i]->TrainingForward( + *last_output, num_chunks, num_spk, posteriors, this_output)); + last_output = this_output; + } + return ans; +} + +void SequenceTransform::TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const { + KALDI_ASSERT(SameDim(input, output_deriv) && SameDim(input, *input_deriv)); + + SequenceMinibatchInfo *info = dynamic_cast(minibatch_info); + KALDI_ASSERT(info != NULL && "Mismatched MinibatchInfo type?"); + + CuMatrix temp_deriv(input.NumRows(), + input.NumCols()); + int32 num_transforms = transforms_.size(); + KALDI_ASSERT(num_transforms > 0); + + const CuMatrixBase *cur_output_deriv = &output_deriv; + + for (int32 i = num_transforms - 1; i >= 0; i--) { + const CuMatrixBase *cur_input = (i == 0 ? &input : + &(info->outputs[i-1])); + CuMatrixBase *cur_input_deriv; + if (i == 0) { + cur_input_deriv = input_deriv; + } else if (i == num_transforms - 1) { + cur_input_deriv = &temp_deriv; + } else { + // this matrix is no longer needed, store the intermediate deriv here. + cur_input_deriv = &(info->outputs[i]); + cur_input_deriv->SetZero(); + } + transforms_[i]->TrainingBackward(*cur_input, *cur_output_deriv, + num_chunks, num_spk, posteriors, + info->info_vec[i], cur_input_deriv); + info->info_vec[i] = NULL; // Prevent it from being deleted twice. + cur_output_deriv = cur_input_deriv; + } + delete info; // This function took ownership. +} + +int32 SequenceTransform::NumFinalIterations() { + int32 ans = 0; + for (size_t i = 0; i < transforms_.size(); i++) + ans += transforms_[i]->NumFinalIterations(); + return ans; +} + +void SequenceTransform::Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) { + CuMatrix temp; + const CuMatrixBase *cur_input = &input; + + int32 prev_final_iters = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 nf = transforms_[i]->NumFinalIterations(); + if (final_iter < prev_final_iters + nf) { + transforms_[i]->Accumulate(final_iter - prev_final_iters, + *cur_input, num_chunks, num_spk, + posteriors); + return; + } else { + KALDI_ASSERT(i + 1 < transforms_.size()); + // We have to propagate the features through this transform. 
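[Editor's note] The reverse-order loop in SequenceTransform::TrainingBackward() above is a straightforward application of the chain rule through the composed transforms, which is also why the intermediate outputs are cached in SequenceMinibatchInfo::outputs during the forward pass. A scalar toy version of the same pattern, with made-up functions:

// Scalar toy version of the pattern in SequenceTransform::TrainingBackward():
// run the transforms forward in order, then push the derivative back through
// them in reverse order.  Toy functions, not Kaldi code.
#include <cstdio>

int main() {
  // Two "transforms": f1(x) = 2x + 1, f2(h) = h * h.
  double x = 3.0;
  double h = 2.0 * x + 1.0;      // forward through transform 1 -> 7
  double y = h * h;              // forward through transform 2 -> 49

  double dy = 1.0;               // derivative of the objective w.r.t. y
  double dh = dy * 2.0 * h;      // backward through transform 2 -> 14
  double dx = dh * 2.0;          // backward through transform 1 -> 28

  std::printf("y = %.1f, dy/dx = %.1f\n", y, dx);  // 49.0 and 28.0
  return 0;
}

(SequenceTransform::Accumulate() continues below, propagating the features through the earlier transforms with TestingForwardBatch() as the comment above says.)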
+ CuMatrix this_output(input.NumRows(), input.NumCols(), + kUndefined); + transforms_[i]->TestingForwardBatch(*cur_input, num_chunks, num_spk, + posteriors, &this_output); + temp.Swap(&this_output); + cur_input = &temp; + } + prev_final_iters += nf; + } + KALDI_ERR << "final_iter out of range."; +} + +void SequenceTransform::Estimate(int32 final_iter) { + CuMatrix temp; + + int32 prev_final_iters = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 nf = transforms_[i]->NumFinalIterations(); + if (final_iter < prev_final_iters + nf) { + transforms_[i]->Estimate(final_iter - prev_final_iters); + return; + } + prev_final_iters += nf; + } + KALDI_ERR << "final_iter out of range."; +} + +void SequenceTransform::TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const { + transforms_.back()->TestingAccumulate(input, posteriors, + speaker_stats); +} + +void SequenceTransform::TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const { + transforms_.back()->TestingForward(input, speaker_stats, output); +} + +void SequenceTransform::GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const { + transforms_.back()->GetTransformAsMatrix(speaker_stats, transform); +} + + +SequenceMinibatchInfo::~SequenceMinibatchInfo() { + for (size_t i = 0; i < info_vec.size(); i++) + delete info_vec[i]; +} + + + +int32 AppendTransform::InitFromConfig( + int32 cur_pos, + std::vector *config_lines) { + KALDI_ASSERT(cur_pos < int32(config_lines->size()) && + transforms_.empty()); + ConfigLine *line = &((*config_lines)[cur_pos]); + KALDI_ASSERT(line->FirstToken() == Type()); + int32 num_transforms = -1; + if (!line->GetValue("num-transforms", &num_transforms) || + num_transforms <= 0) + KALDI_ERR << "Config value num-transforms must be specified for " + "AppendTransform, line is: " << line->WholeLine(); + if (line->HasUnusedValues()) + KALDI_ERR << "Some configuration values were not used: '" + << line->UnusedValues() << "', in line: " + << line->WholeLine(); + cur_pos++; + + for (int32 i = 0; i < num_transforms; i++) { + if (cur_pos >= int32(config_lines->size())) + KALDI_ERR << "Config file lacks enough lines for AppendTransform."; + ConfigLine *other_line = &((*config_lines)[cur_pos]); + std::string transform_type = other_line->FirstToken(); + DifferentiableTransform *transform = NewTransformOfType(transform_type); + if (transform == NULL) + KALDI_ERR << "Could not find transform of type " << transform_type; + cur_pos = transform->InitFromConfig(cur_pos, config_lines); + transforms_.push_back(transform); + } + return cur_pos; +} + + + +AppendTransform::AppendTransform(const AppendTransform &other): + DifferentiableTransform(other), + transforms_(other.transforms_.size(), NULL) { + for (size_t i = 0; i < other.transforms_.size(); i++) + transforms_[i] = other.transforms_[i]->Copy(); +} + + + +void AppendTransform::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, num_classes_); + WriteToken(os, binary, ""); + int32 num_transforms = transforms_.size(); + WriteBasicType(os, binary, num_transforms); + for (int32 i = 0; i < num_transforms; i++) + transforms_[i]->Write(os, binary); + WriteToken(os, binary, ""); +} + +void AppendTransform::Read(std::istream &is, bool binary) { + while (!transforms_.empty()) { + delete transforms_.back(); + transforms_.pop_back(); + } + 
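[Editor's note] The bookkeeping in SequenceTransform::Accumulate()/Estimate() above maps a single global final-iteration index onto (which sub-transform, which local iteration) by walking the per-transform NumFinalIterations() counts, which is why SequenceTransform's total is their sum (whereas AppendTransform, later in this file, takes the max because its blocks are estimated in parallel). A toy version of that mapping, with hypothetical counts:

// Toy version of the final_iter bookkeeping in SequenceTransform; counts are
// made up.  Not part of this patch.
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical NumFinalIterations() values of three sub-transforms.
  std::vector<int> num_final_iters = {1, 2, 1};  // 4 final iterations in total

  int total = 0;
  for (size_t i = 0; i < num_final_iters.size(); i++)
    total += num_final_iters[i];

  for (int final_iter = 0; final_iter < total; final_iter++) {
    int prev = 0;
    for (size_t i = 0; i < num_final_iters.size(); i++) {
      if (final_iter < prev + num_final_iters[i]) {
        std::printf("global iter %d -> transform %zu, local iter %d\n",
                    final_iter, i, final_iter - prev);
        break;
      }
      prev += num_final_iters[i];
    }
  }
  return 0;
}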
ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &num_classes_); + ExpectToken(is, binary, ""); + int32 num_transforms; + ReadBasicType(is, binary, &num_transforms); + for (int32 i = 0; i < num_transforms; i++) { + std::string tok; + ReadToken(is, binary, &tok); + DifferentiableTransform *transform; + if (!(transform = NewTransformOfType(tok))) + KALDI_ERR << "Expected the name of a transform, got " + << tok << " (maybe you should recompile?)"; + transform->Read(is, binary); + transforms_.push_back(transform); + } + ExpectToken(is, binary, ""); +} + +void AppendTransform::Add(const DifferentiableTransform &other_in) { + const AppendTransform *other = dynamic_cast( + &other_in); + KALDI_ASSERT(transforms_.size() == other->transforms_.size()); + for (size_t i = 0; i < transforms_.size(); i++) + transforms_[i]->Add(*(other->transforms_[i])); +} + +int32 AppendTransform::Dim() const { + size_t num_transforms = transforms_.size(); + KALDI_ASSERT(num_transforms > 0); + int32 ans = 0; + for (size_t i = 0; i < num_transforms; i++) + ans += transforms_[i]->Dim(); + return ans; +} + +void AppendTransform::SetNumClasses(int32 num_classes) { + num_classes_ = num_classes; + for (size_t i = 0; i < transforms_.size(); i++) { + transforms_[i]->SetNumClasses(num_classes); + } +} + +AppendTransform::~AppendTransform() { + for (size_t i = 0; i < transforms_.size(); i++) + delete transforms_[i]; +} + + +MinibatchInfoItf* AppendTransform::TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const { + KALDI_ASSERT(input.NumCols() == Dim() && + SameDim(input, *output)); + AppendMinibatchInfo *ans = new AppendMinibatchInfo(); + int32 dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_dim = transforms_[i]->Dim(); + CuSubMatrix input_part = input.ColRange(dim_offset, this_dim), + output_part = output->ColRange(dim_offset, this_dim); + ans->info_vec.push_back(transforms_[i]->TrainingForward( + input_part, num_chunks, num_spk, posteriors, &output_part)); + dim_offset += this_dim; + } + KALDI_ASSERT(dim_offset == input.NumCols()); + return ans; +} + +void AppendTransform::TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const { + AppendMinibatchInfo *info = dynamic_cast(minibatch_info); + KALDI_ASSERT(info != NULL && "Mismatched MinibatchInfo type?"); + + int32 dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_dim = transforms_[i]->Dim(); + CuSubMatrix input_part = input.ColRange(dim_offset, this_dim), + output_deriv_part = output_deriv.ColRange(dim_offset, this_dim), + input_deriv_part = input_deriv->ColRange(dim_offset, this_dim); + transforms_[i]->TrainingBackward( + input_part, output_deriv_part, num_chunks, num_spk, + posteriors, info->info_vec[i], &input_deriv_part); + info->info_vec[i] = NULL; // Prevent it from being deleted twice. + dim_offset += this_dim; + } + KALDI_ASSERT(dim_offset == input.NumCols()); + delete info; // This function took ownership. 
+} + +int32 AppendTransform::NumFinalIterations() { + int32 ans = 0; + for (size_t i = 0; i < transforms_.size(); i++) + ans = std::max(ans, transforms_[i]->NumFinalIterations()); + return ans; +} + + +void AppendTransform::Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) { + int32 num_final_iters = 0, + dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_nf = transforms_[i]->NumFinalIterations(), + this_dim = transforms_[i]->Dim(); + if (final_iter < this_nf) + transforms_[i]->Accumulate(final_iter, + input.ColRange(dim_offset, this_dim), + num_chunks, num_spk, posteriors); + if (this_nf > num_final_iters) + num_final_iters = this_nf; + dim_offset += this_dim; + } + KALDI_ASSERT(final_iter >= 0 && final_iter < num_final_iters); +} + +void AppendTransform::Estimate(int32 final_iter) { + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_nf = transforms_[i]->NumFinalIterations(); + if (final_iter < this_nf) { + transforms_[i]->Estimate(final_iter); + } + } +} + +AppendMinibatchInfo::~AppendMinibatchInfo() { + for (size_t i = 0; i < info_vec.size(); i++) + delete info_vec[i]; +} + +SpeakerStatsItf* AppendTransform::GetEmptySpeakerStats() const { + AppendSpeakerStats *ans = new AppendSpeakerStats(); + for (size_t i = 0; i < transforms_.size(); i++) + ans->stats.push_back(transforms_[i]->GetEmptySpeakerStats()); + return ans; +} + +void AppendTransform::TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const { + AppendSpeakerStats *stats = dynamic_cast(speaker_stats); + KALDI_ASSERT(stats != NULL && stats->stats.size() == transforms_.size() && + "Wrong type of stats supplied to AppendTransform."); + int32 dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_dim = transforms_[i]->Dim(); + SubMatrix input_part = input.ColRange(dim_offset, this_dim); + transforms_[i]->TestingAccumulate(input_part, posteriors, + stats->stats[i]); + dim_offset += this_dim; + } + KALDI_ASSERT(dim_offset == input.NumCols()); +} + + +void AppendTransform::TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const { + const AppendSpeakerStats *stats = + dynamic_cast(&speaker_stats); + KALDI_ASSERT(stats != NULL && stats->stats.size() == transforms_.size() && + "Wrong type of stats supplied to AppendTransform."); + int32 dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_dim = transforms_[i]->Dim(); + SubMatrix input_part = input.ColRange(dim_offset, this_dim), + output_part = output->ColRange(dim_offset, this_dim); + transforms_[i]->TestingForward(input_part, *(stats->stats[i]), + &output_part); + dim_offset += this_dim; + } + KALDI_ASSERT(dim_offset == input.NumCols()); +} + +void AppendTransform::GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const { + int32 dim = Dim(); + KALDI_ASSERT(transform->NumRows() == dim && transform->NumCols() == dim + 1); + // first make sure the off-diagonal elements are zero. 
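[Editor's note] In matrix terms, the assembly that the code just below performs (zeroing the matrix, copying each sub-transform's dim_i x (dim_i+1) block onto the diagonal, then moving each block's bias column into the overall last column) produces a block-diagonal [A | b]. A toy sketch with two blocks and made-up numbers:

// Toy sketch of assembling one dim x (dim+1) affine transform from two
// per-block transforms [A1|b1] (d1 x (d1+1)) and [A2|b2] (d2 x (d2+1)),
// mirroring AppendTransform::GetTransformAsMatrix().  Not part of this patch.
#include <cstdio>
#include <vector>

int main() {
  const int d1 = 1, d2 = 2, dim = d1 + d2;
  double block1[d1][d1 + 1] = { {2.0, 0.5} };             // [A1 | b1]
  double block2[d2][d2 + 1] = { {1.0, 0.0, -1.0},
                                {0.0, 3.0,  2.0} };       // [A2 | b2]

  std::vector<std::vector<double> > out(dim, std::vector<double>(dim + 1, 0.0));

  // Block 1 occupies rows/cols [0, d1); its bias goes to the overall last column.
  for (int r = 0; r < d1; r++) {
    for (int c = 0; c < d1; c++) out[r][c] = block1[r][c];
    out[r][dim] = block1[r][d1];
  }
  // Block 2 occupies rows/cols [d1, dim); its bias also goes to the last column.
  for (int r = 0; r < d2; r++) {
    for (int c = 0; c < d2; c++) out[d1 + r][d1 + c] = block2[r][c];
    out[d1 + r][dim] = block2[r][d2];
  }

  for (int r = 0; r < dim; r++) {
    for (int c = 0; c <= dim; c++) std::printf("%5.1f ", out[r][c]);
    std::printf("\n");
  }
  return 0;
}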
+ transform->SetZero(); + const AppendSpeakerStats *stats = + dynamic_cast(&speaker_stats); + KALDI_ASSERT(stats != NULL && stats->stats.size() == transforms_.size() && + "Wrong type of stats supplied to AppendTransform."); + int32 dim_offset = 0; + for (size_t i = 0; i < transforms_.size(); i++) { + int32 this_dim = transforms_[i]->Dim(); + SubMatrix transform_part(*transform, dim_offset, this_dim, + dim_offset, this_dim + 1); + transforms_[i]->GetTransformAsMatrix(*(stats->stats[i]), &transform_part); + if (i + 1 < transforms_.size()) { + int32 current_offset_column = dim_offset + this_dim, + required_offset_column = dim; + for (int32 r = dim_offset; r < dim_offset + this_dim; r++) { + (*transform)(r, required_offset_column) = (*transform)(r, current_offset_column); + (*transform)(r, current_offset_column) = BaseFloat(0.0); + } + } + dim_offset += this_dim; + } + KALDI_ASSERT(dim_offset == Dim()); +} + +void AppendSpeakerStats::Estimate() { + for (size_t i = 0; i < stats.size(); i++) + stats[i]->Estimate(); +} + +AppendSpeakerStats::~AppendSpeakerStats() { + for (size_t i = 0; i < stats.size(); i++) + delete stats[i]; +} + + +} // namespace differentiable_transform +} // namespace kaldi diff --git a/src/adapt/generic-transform.h b/src/adapt/generic-transform.h new file mode 100644 index 00000000000..9b7933b69af --- /dev/null +++ b/src/adapt/generic-transform.h @@ -0,0 +1,333 @@ +// adapt/generic-transform.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#ifndef KALDI_TRANSFORM_GENERIC_TRANSFORM_H_ +#define KALDI_TRANSFORM_GENERIC_TRANSFORM_H_ + +#include +#include "base/kaldi-common.h" +#include "matrix/kaldi-matrix.h" +#include "cudamatrix/cu-matrix.h" +#include "adapt/differentiable-transform-itf.h" + +// This header contains 'generic' forms of differentiable transform, which allow +// you to append more basic transforms together or concatenate them dimension-wise. +// Also it includes a no-op transform. + +namespace kaldi { +namespace differentiable_transform { + + +/** + This is a version of the transform class that does nothing. It's potentially + useful for situations where you want to apply speaker normalization to some + dimensions of the feature vector but not to others. 
+ */ +class NoOpTransform: public DifferentiableTransform { + public: + + int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) override; + + int32 Dim() const override { return dim_; } + + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override { + output->CopyFromMat(input); + return NULL; + } + void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const override { + KALDI_ASSERT(minibatch_info == NULL); + input_deriv->AddMat(1.0, output_deriv); + } + + int32 NumFinalIterations() override { return 0; } + + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override { } + + + SpeakerStatsItf *GetEmptySpeakerStats() const override { + return new SpeakerStatsItf(); + } + + void TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const override { } + + void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const override { + output->CopyFromMat(input); + } + + void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const override { transform->SetUnit(); } + + void Estimate(int32 final_iter) override { } + + NoOpTransform(): dim_(-1) { } + + NoOpTransform(const NoOpTransform &other): + DifferentiableTransform(other), + dim_(other.dim_) { } + + DifferentiableTransform* Copy() const override { + return new NoOpTransform(*this); + } + + std::string Type() const override { return "NoOpTransform"; } + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + + void Add(const DifferentiableTransform &other) override { } + private: + int32 dim_; +}; + + +/** + This is a version of the transform class that does a sequence of other + transforms, specified by other instances of the DifferentiableTransform + interface. For instance: fMLLR followed by another fMLLR, or mean normalization + followed by fMLLR. The reason this might make sense is that you'd get a better + estimate of the speaker-adapted class means if you do some kind of speaker + normalization before estimating those class means. + + Caution: the framework currently implicitly assumes that the + final one of the supplied transforms subsumes the previous ones + (as in fMLLR subsumes mean subtraction, or fMLLR subsumes a previous + fMLLR of the same dimension). This means that in test time the + first of the two transforms may be ignored and only the second one + performed. This is in order to keep a single-pass adaptation framework + in test time. The sequence of transforms still makes a difference + because it affects how we compute the adaptation model (i.e., it's + more like a speaker-adapted model than a speaker independent model, + to use traditional ASR terminology). 
+ */ +class SequenceTransform: public DifferentiableTransform { + public: + int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) override; + + int32 Dim() const override; + void SetNumClasses(int32 num_classes) override; + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override; + void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const override; + + int32 NumFinalIterations() override; + + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override; + + void Estimate(int32 final_iter) override; + + SpeakerStatsItf *GetEmptySpeakerStats() const override { + // See comment at the top of this class for an explanation. + return transforms_.back()->GetEmptySpeakerStats(); + } + + void TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const override; + + void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const override; + + void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const override; + + SequenceTransform(const SequenceTransform &other); + + SequenceTransform() { } + + DifferentiableTransform* Copy() const override { + return new SequenceTransform(*this); + } + + std::string Type() const override { return "SequenceTransform"; } + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + + void Add(const DifferentiableTransform &other) override; + + ~SequenceTransform() override; + private: + std::vector transforms_; +}; + +// This is the type actually returned by TrainingForward() for SequenceTransform. +// It contains a list of other MinibatchInfo, together with the outputs for all +// but the last call. +class SequenceMinibatchInfo: public MinibatchInfoItf { + public: + std::vector info_vec; + // outputs.size() will be info.size() - 1. + std::vector > outputs; + + ~SequenceMinibatchInfo() override; +}; + + +class AppendSpeakerStats: public SpeakerStatsItf { + public: + AppendSpeakerStats() { } + + std::vector stats; + + void Estimate() override; + + ~AppendSpeakerStats(); +}; + +/** + This is a version of the transform class that consists of a number of other + transforms, appended dimension-wise, so its feature dimension is the sum of + the dimensions of the constituent transforms-- e.g. this could be used to + implement block-diagonal fMLLR, or a structure where some dimensions are + adapted and some are not. 
+ */ +class AppendTransform: public DifferentiableTransform { + public: + int32 InitFromConfig(int32 cur_pos, + std::vector *config_lines) override; + + int32 Dim() const override; + void SetNumClasses(int32 num_classes) override; + MinibatchInfoItf* TrainingForward( + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + CuMatrixBase *output) const override; + void TrainingBackward( + const CuMatrixBase &input, + const CuMatrixBase &output_deriv, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors, + MinibatchInfoItf *minibatch_info, + CuMatrixBase *input_deriv) const override; + + int32 NumFinalIterations() override; + + void Accumulate( + int32 final_iter, + const CuMatrixBase &input, + int32 num_chunks, + int32 num_spk, + const Posterior &posteriors) override; + + SpeakerStatsItf *GetEmptySpeakerStats() const override; + + void TestingAccumulate( + const MatrixBase &input, + const SubPosterior &posteriors, + SpeakerStatsItf *speaker_stats) const override; + + void TestingForward( + const MatrixBase &input, + const SpeakerStatsItf &speaker_stats, + MatrixBase *output) const override; + + void GetTransformAsMatrix( + const SpeakerStatsItf &speaker_stats, + MatrixBase *transform) const override; + + void Estimate(int32 final_iter) override; + + AppendTransform(const AppendTransform &other); + + AppendTransform() { } + + DifferentiableTransform* Copy() const override { + return new AppendTransform(*this); + } + + std::string Type() const override { return "AppendTransform"; } + + void Write(std::ostream &os, bool binary) const override; + + void Read(std::istream &is, bool binary) override; + + ~AppendTransform(); + + void Add(const DifferentiableTransform &other) override; + private: + std::vector transforms_; +}; + + +// This is the type created by TrainingForward() for AppendTransform. +// It just contains a list of other MinibatchInfo. +class AppendMinibatchInfo: public MinibatchInfoItf { + public: + std::vector info_vec; + + ~AppendMinibatchInfo() override; +}; + + +} // namespace differentiable_transform +} // namespace kaldi + +#endif // KALDI_TRANSFORM_GENERIC_TRANSFORM_H_ diff --git a/src/base/io-funcs.h b/src/base/io-funcs.h index 6c2b690f54c..0144e71f987 100644 --- a/src/base/io-funcs.h +++ b/src/base/io-funcs.h @@ -108,7 +108,7 @@ namespace kaldi { it doesn't throw. It's useful if a class can have various forms based on typedefs and virtual classes, and wants to know which version to read. - ReadToken allow the caller to obtain the next token. PeekToken works just + ReadToken allows the caller to obtain the next token. PeekToken works just like ReadToken, but seeks back to the beginning of the token. A subsequent call to ReadToken will read the same token again. This is useful when different object types are written to the same file; using PeekToken one can diff --git a/src/chain/chain-generic-numerator.cc b/src/chain/chain-generic-numerator.cc index d3a114242c2..7453568913a 100644 --- a/src/chain/chain-generic-numerator.cc +++ b/src/chain/chain-generic-numerator.cc @@ -209,9 +209,33 @@ BaseFloat GenericNumeratorComputation::AlphaRemainingFrames(int seq, return log_prob_product + log_scale_product; } +/* This function converts the pdf occupation probabilties (computed + using Forward-Backward on the numerator graph) to posteriors. 
+ "derivs" is frames_per_sequence by pdf_index_size (i.e., indices.size()) +*/ +static void ConvertDerivsToPosterior(const MatrixBase &derivs, + const std::vector &indices, + int32 pdf_stride, + int32 frames_per_sequence, + int32 num_sequences, + Posterior *post) { + post->resize(frames_per_sequence * num_sequences); + for (size_t t = 0; t < derivs.NumRows(); ++t) + for (int32 n = 0; n < derivs.NumCols(); ++n) { + BaseFloat posterior = Exp(derivs(t, n)); + if (posterior != 0.0) { + int32 seq = indices[n] / pdf_stride; + int32 pdfid = indices[n] % pdf_stride; + (*post)[t * num_sequences + seq].push_back( + std::make_pair(pdfid, posterior)); + } + } +} + bool GenericNumeratorComputation::ForwardBackward( BaseFloat *total_loglike, - CuMatrixBase *nnet_output_deriv) { + CuMatrixBase *nnet_output_deriv, + Posterior *numerator_post) { KALDI_ASSERT(total_loglike != NULL); KALDI_ASSERT(nnet_output_deriv != NULL); KALDI_ASSERT(nnet_output_deriv->NumCols() == nnet_output_.NumCols()); @@ -243,6 +267,10 @@ bool GenericNumeratorComputation::ForwardBackward( if (GetVerboseLevel() >= 1) ok = ok && CheckValues(seq, probs, alpha, beta, derivs); } + if (numerator_post) + ConvertDerivsToPosterior(derivs, index_to_pdf_, nnet_output_.Stride(), + supervision_.frames_per_sequence, + num_sequences, numerator_post); // Transfer and add the derivatives to the values in the matrix AddSpecificPdfsIndirect(&derivs, index_to_pdf_, nnet_output_deriv); *total_loglike = partial_loglike; diff --git a/src/chain/chain-generic-numerator.h b/src/chain/chain-generic-numerator.h index fc5e00b2c63..2becfd56051 100644 --- a/src/chain/chain-generic-numerator.h +++ b/src/chain/chain-generic-numerator.h @@ -33,6 +33,7 @@ #include "lat/kaldi-lattice.h" #include "matrix/kaldi-matrix.h" #include "hmm/transition-model.h" +#include "hmm/posterior.h" #include "chain/chain-supervision.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-array.h" @@ -121,7 +122,8 @@ class GenericNumeratorComputation { // nnet output w.r.t. the (log-prob times supervision_.weight times // deriv_weight) to 'nnet_output_deriv'. bool ForwardBackward(BaseFloat *total_loglike, - CuMatrixBase *nnet_output_deriv); + CuMatrixBase *nnet_output_deriv, + Posterior *numerator_post = NULL); BaseFloat ComputeObjf(); private: diff --git a/src/chain/chain-numerator.cc b/src/chain/chain-numerator.cc index 139d28bdd77..caba37023a7 100644 --- a/src/chain/chain-numerator.cc +++ b/src/chain/chain-numerator.cc @@ -146,9 +146,29 @@ BaseFloat NumeratorComputation::Forward() { return tot_log_prob_ * supervision_.weight; } +/* This function converts the pdf occupation probabilties (computed + using Forward-Backward on the numerator graph) to posteriors. 
+*/ +static void ConvertDerivsToPosterior( + const Vector &derivs, + const std::vector &nnet_output_indexes, + int32 nnet_output_rows, + Posterior *post) { + post->resize(nnet_output_rows); + for (size_t i = 0; i < nnet_output_indexes.size(); ++i) { + if (derivs(i) != 0.0) { + int32 row = nnet_output_indexes[i].first; + int32 pdfid = nnet_output_indexes[i].second; + (*post)[row].push_back( + std::make_pair(pdfid, derivs(i))); + } + } +} + void NumeratorComputation::Backward( - CuMatrixBase *nnet_output_deriv) { + CuMatrixBase *nnet_output_deriv, + Posterior *numerator_post) { const fst::StdVectorFst &fst = supervision_.fst; int32 num_states = fst.NumStates(); log_beta_.Resize(num_states, kUndefined); @@ -201,6 +221,13 @@ void NumeratorComputation::Backward( KALDI_WARN << "Disagreement in forward/backward log-probs: " << tot_log_prob_backward << " vs. " << tot_log_prob_; + if (numerator_post) { + std::vector nnet_output_indexes_cpu; + nnet_output_indexes_.CopyToVec(&nnet_output_indexes_cpu); + ConvertDerivsToPosterior(nnet_logprob_derivs_, nnet_output_indexes_cpu, + nnet_output_.NumRows(), numerator_post); + } + // copy this data to GPU. CuVector nnet_logprob_deriv_cuda; nnet_logprob_deriv_cuda.Swap(&nnet_logprob_derivs_); diff --git a/src/chain/chain-numerator.h b/src/chain/chain-numerator.h index 15cb31e0571..63cb186fde8 100644 --- a/src/chain/chain-numerator.h +++ b/src/chain/chain-numerator.h @@ -32,6 +32,7 @@ #include "lat/kaldi-lattice.h" #include "matrix/kaldi-matrix.h" #include "hmm/transition-model.h" +#include "hmm/posterior.h" #include "chain/chain-supervision.h" #include "cudamatrix/cu-matrix.h" #include "cudamatrix/cu-array.h" @@ -78,7 +79,8 @@ class NumeratorComputation { // Does the backward computation and (efficiently) adds the derivative of the // nnet output w.r.t. the (log-prob times supervision_.weight times // deriv_weight) to 'nnet_output_deriv'. - void Backward(CuMatrixBase *nnet_output_deriv); + void Backward(CuMatrixBase *nnet_output_deriv, + Posterior *numerator_post = NULL); private: @@ -143,4 +145,3 @@ class NumeratorComputation { } // namespace kaldi #endif // KALDI_CHAIN_CHAIN_NUMERATOR_H_ - diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index f8a2c1d11cc..be727d333d2 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -720,17 +720,16 @@ Supervision::Supervision(const Supervision &other): void MergeSupervisionE2e(const std::vector &input, Supervision *output_supervision) { KALDI_ASSERT(!input.empty()); - KALDI_ASSERT(input[0]->e2e_fsts.size() == 1); *output_supervision = *(input[0]); output_supervision->e2e_fsts.reserve(input.size()); int32 frames_per_sequence = output_supervision->frames_per_sequence, num_seqs = input.size(); for (int32 i = 1; i < num_seqs; i++) { - output_supervision->num_sequences++; - KALDI_ASSERT(input[i]->e2e_fsts.size() == 1); + output_supervision->num_sequences += input[i]->num_sequences; KALDI_ASSERT(input[i]->frames_per_sequence == frames_per_sequence); - output_supervision->e2e_fsts.push_back(input[i]->e2e_fsts[0]); + for (int32 j = 0; j < input[i]->num_sequences; ++j) + output_supervision->e2e_fsts.push_back(input[i]->e2e_fsts[j]); } output_supervision->alignment_pdfs.clear(); // The program nnet3-chain-acc-lda-stats works on un-merged egs, @@ -766,7 +765,7 @@ void MergeSupervision(const std::vector &input, // append src.fst to output_supervision->fst. 
// the complexity here is O(V1 + E1) fst::Concat(src.fst, &output_supervision->fst); - output_supervision->num_sequences++; + output_supervision->num_sequences += src.num_sequences; } else { KALDI_ERR << "Mismatch weight or frames_per_sequence between inputs"; } diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 6b4a7b593c2..c4637c9cb86 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -28,7 +28,6 @@ namespace kaldi { namespace chain { - void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, const Supervision &supervision, @@ -37,7 +36,8 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrix *xent_output_deriv) { + CuMatrix *xent_output_deriv, + Posterior *numerator_post = NULL) { BaseFloat num_logprob_weighted, den_logprob_weighted; bool denominator_ok = true; bool numerator_ok = true; @@ -77,12 +77,14 @@ void ComputeChainObjfAndDerivE2e(const ChainTrainingOptions &opts, // the numerator object, as well as the returned logprob. if (xent_output_deriv) { numerator_ok = numerator.ForwardBackward(&num_logprob_weighted, - xent_output_deriv); + xent_output_deriv, + numerator_post); if (numerator_ok && nnet_output_deriv) nnet_output_deriv->AddMat(1.0, *xent_output_deriv); } else if (nnet_output_deriv) { numerator_ok = numerator.ForwardBackward(&num_logprob_weighted, - nnet_output_deriv); + nnet_output_deriv, + numerator_post); } else { num_logprob_weighted = numerator.ComputeObjf(); } @@ -146,11 +148,26 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrix *xent_output_deriv) { + CuMatrix *xent_output_deriv, + Posterior *numerator_post) { + if (!nnet_output_deriv && !xent_output_deriv && numerator_post) { + // To compute the posteriors, we will need to compute the numerator + // derivatives first (and to compute them, at least one of the *_deriv + // arguments should be non-NULL). 
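[Editor's note] The numerator_post being plumbed through here uses Kaldi's Posterior type: one vector per output frame, each holding (pdf-id, weight) pairs, with frames ordered as frame 0 of every sequence, then frame 1, and so on, which is why ConvertDerivsToPosterior indexes entry t * num_sequences + seq. A self-contained toy illustration of that layout (fake pdf-ids, plain std types):

// Toy illustration of the layout of the Posterior written to numerator_post.
// Plain C++, made-up contents; not part of this patch.
#include <cstdio>
#include <utility>
#include <vector>

typedef std::vector<std::vector<std::pair<int, float> > > ToyPosterior;

int main() {
  const int num_sequences = 2, frames_per_sequence = 3;
  ToyPosterior post(num_sequences * frames_per_sequence);

  // Give frame t of sequence seq a single fake pdf-id with weight 1.0.
  for (int t = 0; t < frames_per_sequence; t++)
    for (int seq = 0; seq < num_sequences; seq++)
      post[t * num_sequences + seq].push_back(
          std::make_pair(100 * seq + t /* fake pdf-id */, 1.0f));

  // Read back the posteriors of sequence 1 only.
  const int seq = 1;
  for (int t = 0; t < frames_per_sequence; t++) {
    const std::vector<std::pair<int, float> > &frame =
        post[t * num_sequences + seq];
    std::printf("seq %d, frame %d: pdf %d, weight %.1f\n",
                seq, t, frame[0].first, frame[0].second);
  }
  return 0;
}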
+ CuMatrix xent_deriv; + // Recurse + ComputeChainObjfAndDeriv(opts, den_graph, supervision, + nnet_output, objf, l2_term, + weight, nnet_output_deriv, + &xent_deriv, numerator_post); + return; + } + if (!supervision.e2e_fsts.empty()) { ComputeChainObjfAndDerivE2e(opts, den_graph, supervision, nnet_output, objf, l2_term, - weight, nnet_output_deriv, xent_output_deriv); + weight, nnet_output_deriv, + xent_output_deriv, numerator_post); return; } @@ -190,11 +207,11 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, num_logprob_weighted = numerator.Forward(); if (xent_output_deriv) { - numerator.Backward(xent_output_deriv); + numerator.Backward(xent_output_deriv, numerator_post); if (nnet_output_deriv) nnet_output_deriv->AddMat(1.0, *xent_output_deriv); } else if (nnet_output_deriv) { - numerator.Backward(nnet_output_deriv); + numerator.Backward(nnet_output_deriv, numerator_post); } } diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index 6ea70b5ca41..d66c3c18900 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -28,10 +28,12 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "fstext/fstext-lib.h" +#include "hmm/posterior.h" #include "tree/context-dep.h" #include "lat/kaldi-lattice.h" #include "matrix/kaldi-matrix.h" #include "hmm/transition-model.h" +#include "hmm/posterior.h" #include "chain/chain-den-graph.h" #include "chain/chain-supervision.h" @@ -99,7 +101,7 @@ struct ChainTrainingOptions { example; you'll want to divide it by 'tot_weight' before displaying it. @param [out] l2_term The l2 regularization term in the objective function, if - the --l2-regularize option is used. To be added to 'o + the --l2-regularize option is used (else will be set to 0.0). @param [out] weight The weight to normalize the objective function by; equals supervision.weight * supervision.num_sequences * supervision.frames_per_sequence. @@ -115,6 +117,12 @@ struct ChainTrainingOptions { peak memory use). xent_output_deriv will be used in the cross-entropy regularization code; it is also used in computing the cross-entropy objective value. + @param [out] numerator_post If non-NULL, then the posterior from the numerator + forward-backward will be written here (note: it won't be + scaled by the supervision weight). The order is the + same as the input (i.e., frame 0 for all sequences, + then frame 1, etc). This is intended for + use in the adaptation framework used in "chaina" training. */ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, @@ -124,7 +132,8 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrix *xent_output_deriv = NULL); + CuMatrix *xent_output_deriv = NULL, + Posterior *numerator_post = NULL); diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index a3222d2285f..2accefc57fa 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -98,7 +98,7 @@ int main(int argc, char *argv[]) { "Usage: nnet3-chain-combine [options] ... 
\n" "\n" "e.g.:\n" - " nnet3-combine den.fst 35.raw 36.raw 37.raw 38.raw ark:valid.cegs final.raw\n"; + " nnet3-chain-combine den.fst 35.raw 36.raw 37.raw 38.raw ark:valid.cegs final.raw\n"; bool binary_write = true; int32 max_objective_evaluations = 30; @@ -113,7 +113,7 @@ int main(int argc, char *argv[]) { "maximum number of objective evaluations in order to figure " "out the best number of models to combine. It helps to speedup " "if the number of models provided to this binary is quite " - "large (e.g. several hundred)."); + "large (e.g. several hundred)."); po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); po.Register("batchnorm-test-mode", &batchnorm_test_mode, diff --git a/src/chainbin/nnet3-chain-copy-egs.cc b/src/chainbin/nnet3-chain-copy-egs.cc index 0117fe2200f..60fb70bd1c7 100644 --- a/src/chainbin/nnet3-chain-copy-egs.cc +++ b/src/chainbin/nnet3-chain-copy-egs.cc @@ -91,7 +91,7 @@ void FilterExample(int32 min_input_t, if (io.name == "input") { min_t = min_input_t; max_t = max_input_t; - + const std::vector &indexes_in = io.indexes; std::vector indexes_out; indexes_out.reserve(indexes_in.size()); @@ -124,22 +124,88 @@ void FilterExample(int32 min_input_t, } } +/** + This function extends the left/right input context by adding + necessary indexes (and feature rows) for the NnetIo named "input". + First/last frame will be duplicated to add left/right context respectively. + */ +void ExtendContext(NnetChainExample *eg, + int32 n_stride, + int32 min_input_t, + int32 max_input_t, + int32 extend_left_context, + int32 extend_right_context) { + // process the inputs + for (size_t i = 0; i < eg->inputs.size(); i++) { + NnetIo &io = eg->inputs[i]; + if (io.name == "input") { + // Assume t_stride = 1 (since it's input) + std::vector &indexes = io.indexes; + KALDI_ASSERT(indexes.size() < 2 || indexes[0].t + 1 == indexes[1].t); + // The input indexes are not re-ordered. The order is: all frames of first + // sequence, then all frames of 2nd seq, ... + indexes.resize(indexes.size() + n_stride * (extend_left_context + + extend_right_context)); + KALDI_ASSERT(indexes.size() == n_stride * + (max_input_t - min_input_t + 1)); + + for (int32 n = 0, i = 0; n < n_stride; ++n) { + for (int32 t = min_input_t; t <= max_input_t; ++t, ++i) { + indexes[i].t = t; + indexes[i].n = n; + } + } + + Matrix features_out(indexes.size(), io.features.NumCols()); + Matrix features_in; + io.features.GetMatrix(&features_in); -/** Returns true if the "eg" contains just a single example, meaning - that all the "n" values in the indexes are zero, and the example - has NnetIo members named both "input" and "output" + int32 original_min_t = min_input_t + extend_left_context, + original_max_t = max_input_t - extend_right_context; + // For each "n", duplicate the first frame to extend left context, + // then copy the features, then duplicate the last frame to extend right + // context. 
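+      // Worked example (hypothetical numbers): with extend_left_context = 2,
+      // extend_right_context = 1 and input rows [r0 r1 r2 r3] for one
+      // sequence, the output rows for that sequence become
+      // [r0 r0 r0 r1 r2 r3 r3]: the first row is repeated twice on the left
+      // and the last row once on the right.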
+ int32 i_in = 0, i_out = 0; + for (int32 n = 0; n < n_stride; ++n) { + // Duplicate frame i_in, "extend_left_context" times + for (int32 j = 0; j < extend_left_context; ++j, ++i_out) + features_out.Row(i_out).CopyFromVec(features_in.Row(i_in)); + + for (int32 t = original_min_t; t <= original_max_t; ++t, ++i_out, ++i_in) + features_out.Row(i_out).CopyFromVec(features_in.Row(i_in)); + + // Duplicate frame i_in - 1, "extend_right_context" times + for (int32 j = 0; j < extend_right_context; ++j, ++i_out) + features_out.Row(i_out).CopyFromVec(features_in.Row(i_in - 1)); + + } + KALDI_ASSERT(i_in == features_in.NumRows()); + KALDI_ASSERT(i_out == features_out.NumRows()); + + GeneralMatrix features_out_gmat; + features_out_gmat.SwapFullMatrix(&features_out); + io.features = features_out_gmat; + } + } +} + +/** Counts the number of single examples in "eg", which is equal to + the maximum "n" value in the indexes plus 1. + If the example does not have both "input" and "output" NnetIo members, + this function will exit the program with an error. Also computes the minimum and maximum "t" values in the "input" and "output" NnetIo members. */ -bool ContainsSingleExample(const NnetChainExample &eg, - int32 *min_input_t, - int32 *max_input_t, - int32 *min_output_t, - int32 *max_output_t) { +static int32 CountSingleExamples(const NnetChainExample &eg, + int32 *min_input_t, + int32 *max_input_t, + int32 *min_output_t, + int32 *max_output_t) { bool done_input = false, done_output = false; int32 num_indexes_input = eg.inputs.size(); int32 num_indexes_output = eg.outputs.size(); + int32 max_n = 0; for (int32 i = 0; i < num_indexes_input; i++) { const NnetIo &input = eg.inputs[i]; std::vector::const_iterator iter = input.indexes.begin(), @@ -152,23 +218,12 @@ bool ContainsSingleExample(const NnetChainExample &eg, int32 this_t = iter->t; min_t = std::min(min_t, this_t); max_t = std::max(max_t, this_t); - if (iter->n != 0) { - KALDI_WARN << "Example does not contain just a single example; " - << "too late to do frame selection or reduce context."; - return false; - } + if (iter->n > max_n) + max_n = iter->n; } done_input = true; *min_input_t = min_t; *max_input_t = max_t; - } else { - for (; iter != end; ++iter) { - if (iter->n != 0) { - KALDI_WARN << "Example does not contain just a single example; " - << "too late to do frame selection or reduce context."; - return false; - } - } } } @@ -184,34 +239,22 @@ bool ContainsSingleExample(const NnetChainExample &eg, int32 this_t = iter->t; min_t = std::min(min_t, this_t); max_t = std::max(max_t, this_t); - if (iter->n != 0) { - KALDI_WARN << "Example does not contain just a single example; " - << "too late to do frame selection or reduce context."; - return false; - } + // max_n must be the same for all io's (either input or output). + KALDI_ASSERT(iter->n <= max_n + && "Mismatched 'n' values. 
Partially merged?"); } done_output = true; *min_output_t = min_t; *max_output_t = max_t; - } else { - for (; iter != end; ++iter) { - if (iter->n != 0) { - KALDI_WARN << "Example does not contain just a single example; " - << "too late to do frame selection or reduce context."; - return false; - } - } } } - if (!done_input) { - KALDI_WARN << "Example does not have any input named 'input'"; - return false; - } - if (!done_output) { - KALDI_WARN << "Example does not have any output named 'output'"; - return false; - } - return true; + if (!done_input) + KALDI_ERR << "Example does not have any input named 'input'"; + + if (!done_output) + KALDI_ERR << "Example does not have any output named 'output'"; + + return max_n + 1; } // calculate the frame_subsampling_factor @@ -221,47 +264,49 @@ void CalculateFrameSubsamplingFactor(const NnetChainExample &eg, - eg.outputs[0].indexes[0].t; } +/* This function adds or removes context for the examples inside + "eg" (which can contain just a single example or it can be a + merged-eg which would contain more than one example). Addition or + removal of context is determined by comparing "left_context" with + the observed left context of "eg" (the same goes for right context): + if it's more, it'll extend input context by duplicating the first (or last, + for right context) frame. Otherwise, it'll remove the extra context from + both inputs and outputs in "eg". Note that when extending context, only the + "input" io will be modified (the "output" io will remain the same). + */ void ModifyChainExampleContext(int32 left_context, int32 right_context, const int32 frame_subsampling_factor, - NnetChainExample *eg) { - static bool warned_left = false, warned_right = false; + NnetChainExample *eg, + int32 *left_context_extension, + int32 *right_context_extension) { int32 min_input_t, max_input_t, - min_output_t, max_output_t; - if (!ContainsSingleExample(*eg, &min_input_t, &max_input_t, - &min_output_t, &max_output_t)) - KALDI_ERR << "Too late to perform frame selection/context reduction on " - << "these examples (already merged?)"; - if (left_context != -1) { + min_output_t, max_output_t; + *left_context_extension = 0; + *right_context_extension = 0; + // Example stride really means "n" stride (of the NnetIo's) + int32 example_stride = CountSingleExamples(*eg, &min_input_t, &max_input_t, + &min_output_t, &max_output_t); + if (left_context >= 0) { int32 observed_left_context = min_output_t - min_input_t; - if (!warned_left && observed_left_context < left_context) { - warned_left = true; - KALDI_WARN << "You requested --left-context=" << left_context - << ", but example only has left-context of " - << observed_left_context - << " (will warn only once; this may be harmless if " - "using any --*left-context-initial options)"; - } - min_input_t = std::max(min_input_t, min_output_t - left_context); + if (left_context > observed_left_context) // Extend + *left_context_extension = left_context - observed_left_context; + // Adjust min input t + min_input_t = min_output_t - left_context; } - if (right_context != -1) { + if (right_context >= 0) { int32 observed_right_context = max_input_t - max_output_t; - - if (right_context != -1) { - if (!warned_right && observed_right_context < right_context) { - warned_right = true; - KALDI_WARN << "You requested --right-context=" << right_context - << ", but example only has right-context of " - << observed_right_context - << " (will warn only once; this may be harmless if " - "using any --*right-context-final options."; - } - 
max_input_t = std::min(max_input_t, max_output_t + right_context); - } + if (right_context > observed_right_context) // Extend + *right_context_extension = right_context - observed_right_context; + max_input_t = max_output_t + right_context; } + FilterExample(min_input_t, max_input_t, min_output_t, max_output_t, eg); + if (*left_context_extension > 0 || *right_context_extension > 0) + ExtendContext(eg, example_stride, min_input_t, max_input_t, + *left_context_extension, *right_context_extension); } // ModifyChainExampleContext } // namespace nnet3 @@ -348,6 +393,8 @@ int main(int argc, char *argv[]) { exclude_names.push_back(std::string("ivector")); int64 num_read = 0, num_written = 0, num_err = 0; + int64 num_left_context_extensions = 0, num_right_context_extensions = 0, + total_left_context_extension = 0, total_right_context_extension = 0; for (; !example_reader.Done(); example_reader.Next(), num_read++) { const std::string &key = example_reader.Key(); NnetChainExample &eg = example_reader.Value(); @@ -367,7 +414,7 @@ int main(int argc, char *argv[]) { weight = egs_weight_reader.Value(key); ScaleSupervisionWeight(weight, &eg); } - + if (!eg_output_name_rspecifier.empty()) { if (!output_name_reader.HasKey(key)) { KALDI_WARN << "No new output-name for example key " << key; @@ -377,13 +424,25 @@ int main(int argc, char *argv[]) { std::string new_output_name = output_name_reader.Value(key); RenameOutputs(new_output_name, &eg); } - + if (frame_shift != 0) ShiftChainExampleTimes(frame_shift, exclude_names, &eg); - if (left_context != -1 || right_context != -1) + if (left_context != -1 || right_context != -1) { + int32 right_context_extension, left_context_extension; ModifyChainExampleContext(left_context, right_context, - frame_subsampling_factor, &eg); - + frame_subsampling_factor, &eg, + &left_context_extension, + &right_context_extension); + if (left_context_extension > 0) { + num_left_context_extensions++; + total_left_context_extension += left_context_extension; + } + if (right_context_extension > 0) { + num_right_context_extensions++; + total_right_context_extension += right_context_extension; + } + } + for (int32 c = 0; c < count; c++) { int32 index = (random ? Rand() : num_written) % num_outputs; example_writers[index]->Write(key, eg); @@ -394,6 +453,16 @@ int main(int argc, char *argv[]) { delete example_writers[i]; KALDI_LOG << "Read " << num_read << " neural-network training examples, wrote " << num_written; + if (num_left_context_extensions > 0) + KALDI_LOG << "Left context was extended for " + << num_left_context_extensions << " examples, by an average of " + << (1.0 * total_left_context_extension / + num_left_context_extensions) << " frames"; + if (num_right_context_extensions > 0) + KALDI_LOG << "Right context was extended for " + << num_right_context_extensions << " examples, by an average of " + << (1.0 * total_right_context_extension + / num_right_context_extensions) << " frames."; return (num_written == 0 ? 
1 : 0); } catch(const std::exception &e) { std::cerr << e.what() << '\n'; diff --git a/src/chainbin/nnet3-chain-get-egs.cc b/src/chainbin/nnet3-chain-get-egs.cc index 1032b7e2125..23291eac167 100644 --- a/src/chainbin/nnet3-chain-get-egs.cc +++ b/src/chainbin/nnet3-chain-get-egs.cc @@ -95,7 +95,7 @@ static bool ProcessFile(const TransitionModel *trans_mdl, const VectorBase *deriv_weights, int32 supervision_length_tolerance, const std::string &utt_id, - bool compress, + bool compress, bool long_key, UtteranceSplitter *utt_splitter, NnetChainExampleWriter *example_writer) { KALDI_ASSERT(supervision.num_sequences == 1); @@ -228,9 +228,14 @@ static bool ProcessFile(const TransitionModel *trans_mdl, nnet_chain_eg.Compress(); std::ostringstream os; - os << utt_id << "-" << chunk.first_frame; + if (long_key) + os << utt_id + << "-" << chunk.first_frame << "-" << chunk.left_context + << "-" << chunk.num_frames << "-" << chunk.right_context << "-v1"; + else // key is - + os << utt_id << "-" << chunk.first_frame; - std::string key = os.str(); // key is - + std::string key = os.str(); example_writer->Write(key, nnet_chain_eg); } @@ -265,7 +270,7 @@ int main(int argc, char *argv[]) { "Note: the --frame-subsampling-factor option must be the same as given to\n" "chain-get-supervision.\n"; - bool compress = true; + bool compress = true, long_key = false; int32 length_tolerance = 100, online_ivector_period = 1, supervision_length_tolerance = 1; @@ -281,9 +286,9 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); po.Register("compress", &compress, "If true, write egs with input features " "in compressed format (recommended). Update: this is now " - "only relevant if the features being read are un-compressed; " - "if already compressed, we keep we same compressed format when " - "dumping-egs."); + "only relevant if the features being read are uncompressed; " + "if already compressed, we keep the same compressed format when " + "dumping egs."); po.Register("ivectors", &online_ivector_rspecifier, "Alias for " "--online-ivectors option, for back compatibility"); po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier of " @@ -311,6 +316,8 @@ int main(int argc, char *argv[]) { "Filename of transition model to read; should only be supplied " "if you want 'unconstrained' egs, and if you supplied " "--convert-to-pdfs=false to chain-get-supervision."); + po.Register("long-key", &long_key, "If true, a long format will be used " + "for the key, which encodes context info, etc."); eg_config.Register(&po); @@ -426,7 +433,7 @@ int main(int argc, char *argv[]) { if (!ProcessFile(trans_mdl_ptr, normalization_fst, feats, online_ivector_feats, online_ivector_period, supervision, deriv_weights, supervision_length_tolerance, - key, compress, + key, compress, long_key, &utt_splitter, &example_writer)) num_err++; } diff --git a/src/chainbin/nnet3-chain-merge-egs.cc b/src/chainbin/nnet3-chain-merge-egs.cc index a3686d2fc30..926cda76cf3 100644 --- a/src/chainbin/nnet3-chain-merge-egs.cc +++ b/src/chainbin/nnet3-chain-merge-egs.cc @@ -64,7 +64,16 @@ int main(int argc, char *argv[]) { ChainExampleMerger merger(merging_config, &example_writer); for (; !example_reader.Done(); example_reader.Next()) { const NnetChainExample &cur_eg = example_reader.Value(); - merger.AcceptExample(new NnetChainExample(cur_eg)); + NnetChainExample *cur_eg_copy = new NnetChainExample(cur_eg); + if (merging_config.use_query_string) { + std::string key = example_reader.Key(); + int pos = key.find('?'); + if (pos != 
std::string::npos) { + std::string query = key.substr(pos + 1, key.size() - pos - 1); + cur_eg_copy->bucket = query; + } + } + merger.AcceptExample(cur_eg_copy); } // the merger itself prints the necessary diagnostics. merger.Finish(); diff --git a/src/featbin/select-feats.cc b/src/featbin/select-feats.cc index c10f0c64ed5..284902f782e 100644 --- a/src/featbin/select-feats.cc +++ b/src/featbin/select-feats.cc @@ -37,7 +37,9 @@ int main(int argc, char *argv[]) { "command cut -f ...\n" "Usage: select-feats \n" " e.g. select-feats 0,24-22,3-12 scp:feats.scp ark,scp:feat-red.ark,feat-red.scp\n" - "See also copy-feats, extract-feature-segments, subset-feats, subsample-feats\n"; + "See also copy-feats, extract-feature-segments, subset-feats, subsample-feats\n" + "Note: this command should no longer be needed in most cases, as it can be done\n" + "more efficiently at the script level; see the script utils/data/limit_feature_dim.sh"; ParseOptions po(usage); diff --git a/src/gmmbin/gmm-est-fmllr.cc b/src/gmmbin/gmm-est-fmllr.cc index 9f8dfd89143..e0702c4fcf8 100644 --- a/src/gmmbin/gmm-est-fmllr.cc +++ b/src/gmmbin/gmm-est-fmllr.cc @@ -195,4 +195,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/hmm/posterior.h b/src/hmm/posterior.h index e153c249740..3b8016ac712 100644 --- a/src/hmm/posterior.h +++ b/src/hmm/posterior.h @@ -51,6 +51,42 @@ typedef std::vector > > Posterior; typedef std::vector > > > GaussPost; +/// This class allows you to select a sub-vector of Posteriors, possibly with a +/// stride, without copying them elsewhere. SubPosterior is to Posterior as +/// SubVector is to Vector. (Note: Posterior is actually a typedef to +/// std::vector > >. +/// We can add a non-const interface later if needed. +class SubPosterior { + public: + SubPosterior(const Posterior &post): + num_frames_(post.size()), stride_(1), data_( + num_frames_ == 0 ? NULL : &(post[0])) { } + SubPosterior(const Posterior &post, size_t offset, + size_t num_frames, size_t stride = 1): + num_frames_(num_frames), stride_(stride), + data_(num_frames_ == 0 ? NULL : &(post[offset])) { + KALDI_ASSERT(stride > 0 && post.size() > offset + (num_frames-1) * stride); + } + SubPosterior(const SubPosterior &post, size_t offset, + size_t num_frames, size_t stride = 1): + num_frames_(num_frames), stride_(stride * post.stride_), + data_(num_frames_ == 0 ? NULL : post.data_ + (offset * post.stride_)) { + KALDI_ASSERT(offset + num_frames * (stride - 1) < post.num_frames_); + } + size_t size() const { return num_frames_; } + const std::vector > &operator[] (size_t i) const { + KALDI_PARANOID_ASSERT(i < num_frames_); + return data_[i * stride_]; + } + SubPosterior(const SubPosterior &other) = default; + private: + size_t num_frames_; + size_t stride_; + const std::vector > *data_; +}; + + + // PosteriorHolder is a holder for Posterior, which is // std::vector > > // This is used for storing posteriors of transition id's for an diff --git a/src/hmm/transition-model.h b/src/hmm/transition-model.h index e453c24f9cb..c41ec2e7b32 100644 --- a/src/hmm/transition-model.h +++ b/src/hmm/transition-model.h @@ -251,6 +251,7 @@ class TransitionModel { /// compare the transition probabilities. bool Compatible(const TransitionModel &other) const; + TransitionModel(const TransitionModel &other) = default; private: void MleUpdateShared(const Vector &stats, const MleTransitionUpdateConfig &cfg, @@ -321,7 +322,8 @@ class TransitionModel { /// of pdfs). 
int32 num_pdfs_; - KALDI_DISALLOW_COPY_AND_ASSIGN(TransitionModel); + // Disallow assignment by making it private; this won't be defined. + TransitionModel &operator=(const TransitionModel &other); }; inline int32 TransitionModel::TransitionIdToPdf(int32 trans_id) const { diff --git a/src/matrix/Makefile b/src/matrix/Makefile index e39be1ffec9..2fcf62fcb69 100644 --- a/src/matrix/Makefile +++ b/src/matrix/Makefile @@ -10,7 +10,7 @@ include ../kaldi.mk # you can uncomment matrix-lib-speed-test if you want to do the speed tests. -TESTFILES = matrix-lib-test sparse-matrix-test #matrix-lib-speed-test +TESTFILES = matrix-lib-test sparse-matrix-test matrix-functions-test #matrix-lib-speed-test OBJFILES = kaldi-matrix.o kaldi-vector.o packed-matrix.o sp-matrix.o tp-matrix.o \ matrix-functions.o qr.o srfft.o compressed-matrix.o \ diff --git a/src/matrix/kaldi-matrix.h b/src/matrix/kaldi-matrix.h index 11a5e08b15d..d7ee8eb388f 100644 --- a/src/matrix/kaldi-matrix.h +++ b/src/matrix/kaldi-matrix.h @@ -531,6 +531,10 @@ class MatrixBase { * positive semi-definite (check_thresh controls how stringent the check is; * set it to 2 to ensure it won't ever complain, but it will zero out negative * dimensions in your matrix. + * + * Caution: if you want the eigenvalues, it may make more sense to convert to + * SpMatrix and use Eig() function there, which uses eigenvalue decomposition + * directly rather than SVD. */ void SymPosSemiDefEig(VectorBase *s, MatrixBase *P, Real check_thresh = 0.001); diff --git a/src/matrix/matrix-functions-test.cc b/src/matrix/matrix-functions-test.cc new file mode 100644 index 00000000000..203892a54e3 --- /dev/null +++ b/src/matrix/matrix-functions-test.cc @@ -0,0 +1,184 @@ +// matrix/matrix-functions-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) +// 2018 Institute of Acoustics, CAS (Gaofeng Cheng) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "matrix/matrix-functions.h" +#include "matrix/kaldi-vector.h" +#include "matrix/kaldi-matrix.h" +#include "matrix/sp-matrix.h" + +namespace kaldi { + +void SvdRescalerTestIdentity() { + // this tests the case where f() is the identity function. + int32 dim = 10; + Matrix mat(dim, dim); + if (RandInt(0, 1) == 0) + mat.SetRandn(); + // else zero. 
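+  // With f(lambda) = lambda and f'(lambda) = 1 (set below), F(A) should
+  // reproduce A exactly and the backpropagated derivative should pass through
+  // unchanged; that is what the two AssertEqual checks below verify.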
+ bool symmetric = false; + + SvdRescaler sc; + sc.Init(&mat, symmetric); + + BaseFloat *lambda = sc.InputSingularValues(), + *f_lambda= sc.OutputSingularValues(), + *fprime_lambda = sc.OutputSingularValueDerivs(); + for (int32 i = 0; i < dim; i++) { + f_lambda[i] = lambda[i]; + fprime_lambda[i] = 1.0; + } + Matrix output(dim, dim, kUndefined); + sc.GetOutput(&output); + AssertEqual(mat, output, 0.001); + Matrix output_deriv(dim, dim, kUndefined), + input_deriv(dim, dim); + output_deriv.SetRandn(); + sc.ComputeInputDeriv(output_deriv, &input_deriv); + KALDI_LOG << output_deriv << input_deriv; + AssertEqual(output_deriv, input_deriv); +} + +void SvdRescalerTestPowerDiag() { + // this tests the case where f() is a power function with random exponent, + // and the matrix is diagonal. + int32 dim = 10; + BaseFloat power = 0.25 * RandInt(0, 4); + bool symmetric = (RandInt(0, 1) == 0); + Matrix mat(dim, dim); + for (int32 i = 0; i < dim; i++) { + mat(i, i) = 0.25 * RandInt(0, 10); + // if power < 1.0, we can't allow zero diagonal + // elements, or the derivatives would be undefined. + if (power < 1.0 && mat(i, i) == 0.0) + mat(i, i) = 0.333; + } + + SvdRescaler sc; + sc.Init(&mat, symmetric); + + BaseFloat *lambda = sc.InputSingularValues(), + *f_lambda= sc.OutputSingularValues(), + *fprime_lambda = sc.OutputSingularValueDerivs(); + for (int32 i = 0; i < dim; i++) { + f_lambda[i] = pow(lambda[i], power); + fprime_lambda[i] = power * pow(lambda[i], power - 1.0); + } + Matrix output(dim, dim, kUndefined); + sc.GetOutput(&output); + KALDI_ASSERT(mat.IsDiagonal(0.001)); + Matrix output_deriv(dim, dim, kUndefined), + input_deriv(dim, dim); + output_deriv.SetRandn(); + sc.ComputeInputDeriv(output_deriv, &input_deriv); + + for (int32 i = 0; i < dim; i++) { + BaseFloat oderiv = output_deriv(i, i), + ideriv = input_deriv(i, i), + x = mat(i, i), + df = power * pow(x, power - 1.0); + AssertEqual(ideriv, oderiv * df); + } +} + + +void SvdRescalerTestExp() { + // this tests the case where f() is the exponential function, and the matrix + // is an arbitrary matrix. + int32 dim = 10; + //bool symmetric = (RandInt(0, 1) == 0); + bool symmetric = false; + BaseFloat exp_scale = 0.2 * RandInt(0, 5); + + Matrix mat(dim, dim); + + if (symmetric) { + SpMatrix s(dim); + s.SetRandn(); + mat.CopyFromSp(s); + } else { + mat.SetRandn(); + } + + KALDI_LOG << "Matrix sum is " << mat.Sum(); + + SvdRescaler sc; + sc.Init(&mat, symmetric); + BaseFloat *lambda = sc.InputSingularValues(), + *f_lambda= sc.OutputSingularValues(), + *fprime_lambda = sc.OutputSingularValueDerivs(); + for (int32 i = 0; i < dim; i++) { + f_lambda[i] = exp(exp_scale * lambda[i]); + fprime_lambda[i] = exp_scale * exp(exp_scale * lambda[i]); + } + Matrix output(dim, dim, kUndefined); + sc.GetOutput(&output); + Matrix output_deriv(dim, dim, kUndefined), + input_deriv(dim, dim); + output_deriv.SetRandn(); + sc.ComputeInputDeriv(output_deriv, &input_deriv); + + + // use random directions to test the accuracy of the derivatives. 
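+  // For each random direction delta (scaled by a small epsilon), the change
+  // in the objective tr(output_deriv^T F(A)) predicted from input_deriv is
+  // sum_ij delta(i,j) * input_deriv(i,j); the actual change is measured by
+  // re-running the rescaler on mat + delta. To first order the two should
+  // agree, which the final AssertEqual checks (tolerance 0.01).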
+ int32 n = 4; + Vector expected_change(n), actual_change(n); + BaseFloat epsilon = 0.001; + for (int32 k = 0; k < n; k++) { + Matrix delta(dim, dim); + if (symmetric) { + SpMatrix s(dim); + s.SetRandn(); + delta.CopyFromSp(s); + } else { + delta.SetRandn(); + } + delta.Scale(epsilon); + expected_change(k) = TraceMatMat(delta, input_deriv, kTrans); + delta.AddMat(1.0, mat); + SvdRescaler sc2(&delta, symmetric); + BaseFloat *lambda = sc2.InputSingularValues(), + *f_lambda= sc2.OutputSingularValues(), + *fprime_lambda = sc2.OutputSingularValueDerivs(); + for (int32 i = 0; i < dim; i++) { + f_lambda[i] = exp(exp_scale * lambda[i]); + fprime_lambda[i] = exp_scale * exp(exp_scale * lambda[i]); + } + Matrix output_perturbed(dim, dim); + sc2.GetOutput(&output_perturbed); + actual_change(k) = TraceMatMat(output_deriv, output_perturbed, kTrans) - + TraceMatMat(output_deriv, output, kTrans); + } + KALDI_LOG << "Matrix sum is " << mat.Sum(); + KALDI_LOG << "Predicted " << expected_change + << " vs. actual " << actual_change; + AssertEqual(expected_change, actual_change, 0.01); +} + + + +} // namespace kaldi + +int main() { + for (int32 i = 0; i < 10; i++) { + kaldi::SvdRescalerTestIdentity(); + kaldi::SvdRescalerTestPowerDiag(); + kaldi::SvdRescalerTestExp(); + } + std::cout << "Test OK.\n"; +} diff --git a/src/matrix/matrix-functions.cc b/src/matrix/matrix-functions.cc index 496c09f5344..7a222026010 100644 --- a/src/matrix/matrix-functions.cc +++ b/src/matrix/matrix-functions.cc @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation; Go Vivace Inc.; Jan Silovsky // Yanmin Qian; Saarland University; Johns Hopkins University (Author: Daniel Povey) +// Gaofeng Cheng (Institute of Acoustics, Chinese Academy of Sciences) // See ../../COPYING for clarification regarding multiple authors // @@ -769,5 +770,115 @@ void AddOuterProductPlusMinus(double alpha, MatrixBase *plus, MatrixBase *minus); +void SvdRescaler::Init(const MatrixBase *A, bool symmetric) { + KALDI_ASSERT(A->NumRows() == A->NumCols()); + A_ = A; + symmetric_ = symmetric; + int32 dim = A->NumRows(); + lambdas_.Resize(3, dim, kUndefined); + U_.Resize(dim, dim, kUndefined); + SubVector lambda(lambdas_, 0); + if (symmetric) { + // the following constructor will check that A is actually symmetric. + SpMatrix A_sym(*A_, kTakeMeanAndCheck); + A_sym.Eig(&lambda, &U_); + } else { + Vt_.Resize(dim, dim, kUndefined); + A_->Svd(&lambda, &U_, &Vt_); + } +} + +BaseFloat *SvdRescaler::InputSingularValues() { + return lambdas_.RowData(0); +} + +BaseFloat *SvdRescaler::OutputSingularValues() { + return lambdas_.RowData(1); +} + +BaseFloat *SvdRescaler::OutputSingularValueDerivs() { + return lambdas_.RowData(2); +} + +void SvdRescaler::GetOutput(MatrixBase *output) { + int32 dim = A_->NumRows(); + SubVector f_lambda(lambdas_, 1); // f(lambda) in the writeup. + if (symmetric_) { + SpMatrix S(dim); + S.AddMat2Vec(1.0, U_, kNoTrans, f_lambda, 0.0); + output->CopyFromSp(S); + } else { + Matrix U_tmp(U_); + U_tmp.MulColsVec(f_lambda); + output->SetZero(); + output->AddMatMat(1.0, U_tmp, kNoTrans, Vt_, kNoTrans, 0.0); + } +} + +void SvdRescaler::ComputeInputDeriv(const MatrixBase &output_deriv, + MatrixBase *input_deriv) const { + int32 dim = A_->NumRows(); + KALDI_ASSERT(output_deriv.NumRows() == dim && output_deriv.NumCols() == dim && + input_deriv->NumRows() == dim && input_deriv->NumCols() == dim); + // input_deriv is \bar{A} in the writeup. + input_deriv->SetZero(); + + // \bar{D} in the writeup; see class declaration. 
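+  // Concretely, bar_d = U^T * output_deriv * V (or U^T * output_deriv * U in
+  // the symmetric case), i.e. the derivative w.r.t. the output expressed in
+  // the bases of the decomposition; this is what the AddMatMatMat calls just
+  // below compute.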
+ Matrix bar_d(dim, dim); + if (!symmetric_) + bar_d.AddMatMatMat(1.0, U_, kTrans, output_deriv, kNoTrans, Vt_, kTrans, 0.0); + else + bar_d.AddMatMatMat(1.0, U_, kTrans, output_deriv, kNoTrans, U_, kNoTrans, 0.0); + + Matrix bar_lambda(dim, dim); + + const BaseFloat *lambda = lambdas_.RowData(0), // elements \lambda_i + *f_lambda = lambdas_.RowData(1), // elements f(\lambda_i) + *f_lambda_deriv = lambdas_.RowData(2); // elements f'(lambda_i) + + // we use doubles in the computations below, to avoid underflow if any floating + // point values were extremely close to zero (e.g., denormal) + for(int32 i = 0; i < dim; i++) { + double lambda_i = lambda[i], lambda2_i = lambda_i * lambda_i, + d_i = f_lambda[i]; + for(int32 j = 0; j < dim; j++) { + double lambda_j = lambda[j], lambda2_j = lambda_j * lambda_j, + d_j = f_lambda[j], bar_d_ij = bar_d(i, j), + bar_d_ji = bar_d(j, i), bar_lambda_ij; + + if (i == j) { + bar_lambda_ij = bar_d_ij * f_lambda_deriv[i]; + } else if (std::abs(lambda_i - lambda_j) > 1.0e-03 * std::abs(lambda_i)) { + // if lambda_i and lambda_j are not (relatively) too close in value (which + // implies that at least one them is nonzero).. + bar_lambda_ij = bar_d_ij * ((lambda_i * d_i - lambda_j * d_j) / + (lambda2_i - lambda2_j)) + + bar_d_ji * ((lambda_j * d_i - lambda_i * d_j) / + (lambda2_i - lambda2_j)); + } else if (lambda_i != 0) { + // If we reached here, it implies they are both nonzero, but extremely + // close in value. + // lambda is the average of the two lambdas. + // Assume f'(lambda) is the average of the two derivatives. + double lambda = 0.5 * (lambda_i + lambda_j), + f_prime_lambda = 0.5 * (f_lambda_deriv[i] + f_lambda_deriv[j]), + d = 0.5 * (d_i + d_j); + bar_lambda_ij = bar_d_ij * ((lambda * f_prime_lambda + d) / (2.0 * lambda)) + + bar_d_ji * ((lambda * f_prime_lambda - d) / (2.0 * lambda)); + } else { + // both zero. + KALDI_ASSERT(lambda_i == 0 && lambda_j == 0); + bar_lambda_ij = bar_d_ij * f_lambda_deriv[i]; + } + bar_lambda(i, j) = bar_lambda_ij; + } + } + if (!symmetric_) + input_deriv->AddMatMatMat(1.0, U_, kNoTrans, bar_lambda, kNoTrans, + Vt_, kNoTrans, 0.0); + else + input_deriv->AddMatMatMat(1.0, U_, kNoTrans, bar_lambda, kNoTrans, + U_, kTrans, 0.0); +} } // end namespace kaldi diff --git a/src/matrix/matrix-functions.h b/src/matrix/matrix-functions.h index ca50ddda7c8..2b3ec8133e9 100644 --- a/src/matrix/matrix-functions.h +++ b/src/matrix/matrix-functions.h @@ -164,6 +164,126 @@ inline void AssertSameDim(const MatrixBase &mat1, const MatrixBase } +/* + This class allows you to compute the class of function described in + http://www.danielpovey.com/files/2018_svd_derivative.pdf + and to backprop through that computation. + Short summary: it allows you to apply some kind of scalar function + to the singular values of a square matrix, reconstruct it, and then + backprop through that operation. + + This class is quite general-purpose in the sense that you can + provide any scalar function; but in order to avoid things like + passing function-pointers around, we had give it a rather clunky + interface. The way you are supposed to use it is as follows + (to give an example): + + Matrix A(...); // set it somehow. + SvdRescaler rescaler(A); + const VectorBase &lambda_in = A.InputSingularValues(); + VectorBase &lambda_out = *(A.OutputSingularValues()); + VectorBase &lambda_out_deriv = *(A.OutputSingularValueDerivs()); + for (int32 i = 0; i < lambda_in.size(); i++) { + // compute the scalar function and its derivative for the singular + // values. 
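+       // (some_func / some_func_deriv stand for whatever scalar function you
+       // want to apply; for instance, flooring small singular values with
+       // some_func(x) = max(x, epsilon) and
+       // some_func_deriv(x) = (x > epsilon ? 1 : 0) would be one possible
+       // choice. This is only an illustration, not part of the interface.)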
+       lambda_out(i) = some_func(lambda_in(i));
+       lambda_out_deriv(i) = some_func_deriv(lambda_in(i));
+     }
+     Matrix B(A.NumRows(), A.NumCols(), kUndefined);
+     rescaler.GetOutput(&B);
+     // Do something with B.
+     Matrix B_deriv(...);  // Get the derivative w.r.t. B
+                           // somehow.
+     Matrix A_deriv(A.NumRows(), A.NumCols());  // Get the derivative w.r.t. A.
+
+
+ */
+class SvdRescaler {
+ public:
+  /*
+    Constructor.
+    'A' is the input matrix. See class-level documentation above for
+    more information.
+
+    If 'symmetric' is set to true, then the user is asserting that A is
+    symmetric, and that that symmetric structure needs to be preserved in the
+    output. In this case, we use code for the symmetric eigenvalue problem to
+    do the decomposition instead of the SVD. I.e. decompose A = P diag(s) P^T
+    instead of A = U diag(s) V^T, using SpMatrix::Eig(). You can view this as a
+    special case of SVD.
+  */
+  SvdRescaler(const MatrixBase *A, bool symmetric) {
+    Init(A, symmetric);
+  }
+
+  // Constructor that takes no args. In this case you are supposed to
+  // call Init().
+  SvdRescaler() { }
+
+  // An alternative to the constructor that takes args. Should only be called
+  // directly after initializing the object with no args. Warning: this object
+  // keeps a reference to this matrix, so don't modify it during the lifetime
+  // of this object.
+  // A is required to be square.
+  void Init(const MatrixBase *A, bool symmetric);
+
+  // Return a pointer to the singular values of A, which will have been
+  // computed in the constructor.
+  // The reason why this is not const is that there may be
+  // situations where you discover that the input matrix has some very small
+  // singular values, and you want to (say) floor them somehow and reconstruct,
+  // and have the derivatives be valid assuming you had given that 'repaired'
+  // matrix A as input. Modifying the elements of this vector gives you
+  // a way to do that, although currently this class doesn't provide a way
+  // for you to access that 'fixed-up' A directly.
+  // We hope you know what you are doing if you modify these singular values.
+  BaseFloat *InputSingularValues();
+
+  // Returns a pointer to a place that you can write the
+  // modified singular values f(lambda).
+  BaseFloat *OutputSingularValues();
+
+  // Returns a pointer to a place that you can write the
+  // values of f'(lambda) (the function-derivative of f).
+  BaseFloat *OutputSingularValueDerivs();
+
+  // Outputs F(A) to 'output', which must have the correct size.
+  // It's OK if 'output' contains NaNs on entry.
+  // Before calling this, you must have set the values in
+  // 'OutputSingularValues()'.
+  void GetOutput(MatrixBase *output);
+
+  // Computes the derivative of some function g w.r.t. the input A,
+  // given that dg/d(output) is provided in 'output_deriv'.
+  // This derivative is *added* to 'input_deriv', so you need
+  // to zero 'input_deriv' or otherwise set it, beforehand.
+  // It is acceptable to call ComputeInputDeriv (with possibly different
+  // values of 'output_deriv' and 'input_deriv') as many times as you want,
+  // on the same object.
+  void ComputeInputDeriv(const MatrixBase &output_deriv,
+                         MatrixBase *input_deriv) const;
+
+ protected:
+  // the input matrix A. Owned by the user but will not be changed by them
+  // during the lifetime of this object.
+  const MatrixBase *A_;
+  bool symmetric_;
+  // U_ is present regardless of whether symmetric_ is true. It is the
+  // left part of the decomposition A = U diag(s) V^T.
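+  // (In the symmetric case, Init() puts the eigenvectors P of the
+  // decomposition A = P diag(s) P^T here, computed with SpMatrix::Eig()
+  // rather than a full SVD.)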
+  Matrix U_;
+  // Vt_ is only present if symmetric_ is false. Otherwise, we
+  // assume that Vt_ equals U_.
+  Matrix Vt_;
+
+  // a matrix containing three rows, and num-cols equal to the num-rows of the
+  // square matrix A_.
+  // row 0 is 'lambda_in' (the input singular values; or the input eigenvalues,
+  // in the symmetric case).
+  // row 1 is 'lambda_out' (the output singular values, i.e. f(lambda)),
+  // row 2 is 'lambda_out_deriv' (the function-derivative f'(lambda)).
+  Matrix lambdas_;
+};
+
 /// @} end of "addtogroup matrix_funcs_misc"
 } // end namespace kaldi
diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile
index 5e67211c3a7..66177559218 100644
--- a/src/nnet3/Makefile
+++ b/src/nnet3/Makefile
@@ -41,6 +41,6 @@ ADDLIBS = ../chain/kaldi-chain.a ../cudamatrix/kaldi-cudamatrix.a \
           ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
           ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
           ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
-          ../base/kaldi-base.a
+          ../base/kaldi-base.a

 include ../makefiles/default_rules.mk
diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc
index c627bb1032a..517c63e394e 100644
--- a/src/nnet3/nnet-chain-example.cc
+++ b/src/nnet3/nnet-chain-example.cc
@@ -33,13 +33,18 @@ void NnetChainSupervision::Write(std::ostream &os, bool binary) const {
   supervision.Write(os, binary);
   WriteToken(os, binary, "");
   deriv_weights.Write(os, binary);
+  if (chunks_per_group != 1) {
+    WriteToken(os, binary, "");
+    WriteBasicType(os, binary, chunks_per_group);
+  }
   WriteToken(os, binary, "");
 }

 bool NnetChainSupervision::operator == (const NnetChainSupervision &other) const {
   return name == other.name && indexes == other.indexes &&
       supervision == other.supervision &&
-      deriv_weights.ApproxEqual(other.deriv_weights);
+      deriv_weights.ApproxEqual(other.deriv_weights) &&
+      chunks_per_group == other.chunks_per_group;
 }

 void NnetChainSupervision::Read(std::istream &is, bool binary) {
@@ -47,17 +52,17 @@ void NnetChainSupervision::Read(std::istream &is, bool binary) {
   ReadToken(is, binary, &name);
   ReadIndexVector(is, binary, &indexes);
   supervision.Read(is, binary);
-  std::string token;
-  ReadToken(is, binary, &token);
-  // in the future this back-compatibility code can be reworked.
-  if (token != "") {
-    KALDI_ASSERT(token == "" || token == "");
-    if (token == "")
-      ReadVectorAsChar(is, binary, &deriv_weights);
-    else
-      deriv_weights.Read(is, binary);
-    ExpectToken(is, binary, "");
+  // If the following fails, you may be using much older egs that are no longer
+  // supported to be read by the current code -> re-dump the egs.
+ ExpectToken(is, binary, ""); + deriv_weights.Read(is, binary); + if (PeekToken(is, binary) == 'C') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &chunks_per_group); + } else { + chunks_per_group = 1; } + ExpectToken(is, binary, ""); CheckDim(); } @@ -75,6 +80,8 @@ void NnetChainSupervision::CheckDim() const { frame_skip = indexes[supervision.num_sequences].t - first_frame, num_sequences = supervision.num_sequences, frames_per_sequence = supervision.frames_per_sequence; + KALDI_ASSERT(chunks_per_group > 0 && + num_sequences % chunks_per_group == 0); int32 k = 0; for (int32 i = 0; i < frames_per_sequence; i++) { for (int32 j = 0; j < num_sequences; j++,k++) { @@ -93,13 +100,15 @@ NnetChainSupervision::NnetChainSupervision(const NnetChainSupervision &other): name(other.name), indexes(other.indexes), supervision(other.supervision), - deriv_weights(other.deriv_weights) { CheckDim(); } + deriv_weights(other.deriv_weights), + chunks_per_group(other.chunks_per_group) { CheckDim(); } void NnetChainSupervision::Swap(NnetChainSupervision *other) { name.swap(other->name); indexes.swap(other->indexes); supervision.Swap(&(other->supervision)); deriv_weights.Swap(&(other->deriv_weights)); + std::swap(chunks_per_group, other->chunks_per_group); if (RandInt(0, 5) == 0) CheckDim(); } @@ -112,7 +121,8 @@ NnetChainSupervision::NnetChainSupervision( int32 frame_skip): name(name), supervision(supervision), - deriv_weights(deriv_weights) { + deriv_weights(deriv_weights), + chunks_per_group(1) { // note: this will set the 'x' index to zero. indexes.resize(supervision.num_sequences * supervision.frames_per_sequence); @@ -177,6 +187,7 @@ void NnetChainExample::Read(std::istream &is, bool binary) { void NnetChainExample::Swap(NnetChainExample *other) { inputs.swap(other->inputs); outputs.swap(other->outputs); + std::swap(bucket, other->bucket); } void NnetChainExample::Compress() { @@ -211,6 +222,14 @@ static void MergeSupervision( &output_supervision); output->supervision.Swap(&output_supervision); + int32 example_stride = 0; + for (auto &index: inputs[0]->indexes) + if (index.n > example_stride) + example_stride = index.n; + example_stride++; + + KALDI_ASSERT(example_stride == inputs[0]->supervision.num_sequences); + output->indexes.clear(); output->indexes.reserve(num_indexes); for (int32 n = 0; n < num_inputs; n++) { @@ -223,8 +242,8 @@ static void MergeSupervision( // change the 'n' index to correspond to the index into 'input'. // Each example gets a different 'n' value, starting from 0. for (; iter != end; ++iter) { - KALDI_ASSERT(iter->n == 0 && "Merging already-merged chain egs"); - iter->n = n; + KALDI_ASSERT(iter->n < example_stride); + iter->n += n * example_stride; } } KALDI_ASSERT(output->indexes.size() == num_indexes); @@ -249,6 +268,7 @@ static void MergeSupervision( } } } + output->chunks_per_group = example_stride; output->CheckDim(); } @@ -350,6 +370,30 @@ void GetChainComputationRequest(const Nnet &nnet, KALDI_ERR << "No outputs in computation request."; } + +// Returns the frame subsampling factor, which is the difference between the +// first 't' value we encounter in 'indexes', and the next 't' value that is +// different from the first 't'. It will typically be 3. +// This function will crash if it could not figure it out (e.g. because +// 'indexes' was empty or had only one element). 
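+// For example, if the 't' values in 'indexes' are 0, 0, 0, 0, 3, 3, 3, 3, ...
+// (frame 0 of every sequence, then frame 3 of every sequence, as in a merged
+// eg), the factor returned is 3.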
+static int32 GetFrameSubsamplingFactor(const std::vector &indexes) { + + auto iter = indexes.begin(), end = indexes.end(); + int32 cur_t_value; + if (iter != end) { + cur_t_value = iter->t; + ++iter; + } + for (; iter != end; ++iter) { + if (iter->t != cur_t_value) { + KALDI_ASSERT(iter->t > cur_t_value); + return iter->t - cur_t_value; + } + } + KALDI_ERR << "Error getting frame subsampling factor"; + return 0; // Shouldn't be reached, this is to avoid compiler warnings. +} + void ShiftChainExampleTimes(int32 frame_shift, const std::vector &exclude_names, NnetChainExample *eg) { @@ -377,10 +421,7 @@ void ShiftChainExampleTimes(int32 frame_shift, sup_end = eg->outputs.end(); for (; sup_iter != sup_end; ++sup_iter) { std::vector &indexes = sup_iter->indexes; - KALDI_ASSERT(indexes.size() >= 2 && indexes[0].n == indexes[1].n && - indexes[0].x == indexes[1].x); - int32 frame_subsampling_factor = indexes[1].t - indexes[0].t; - KALDI_ASSERT(frame_subsampling_factor > 0); + int32 frame_subsampling_factor = GetFrameSubsamplingFactor(indexes); // We need to shift by a multiple of frame_subsampling_factor. // Round to the closest multiple. @@ -401,12 +442,13 @@ size_t NnetChainExampleStructureHasher::operator () ( const NnetChainExample &eg) const noexcept { // these numbers were chosen at random from a list of primes. NnetIoStructureHasher io_hasher; + StringHasher string_hasher; size_t size = eg.inputs.size(), ans = size * 35099; + ans += string_hasher(eg.bucket); for (size_t i = 0; i < size; i++) ans = ans * 19157 + io_hasher(eg.inputs[i]); for (size_t i = 0; i < eg.outputs.size(); i++) { const NnetChainSupervision &sup = eg.outputs[i]; - StringHasher string_hasher; IndexVectorHasher indexes_hasher; ans = ans * 17957 + string_hasher(sup.name) + indexes_hasher(sup.indexes); @@ -417,6 +459,8 @@ size_t NnetChainExampleStructureHasher::operator () ( bool NnetChainExampleStructureCompare::operator () ( const NnetChainExample &a, const NnetChainExample &b) const { + if (a.bucket != b.bucket) + return false; NnetIoStructureCompare io_compare; if (a.inputs.size() != b.inputs.size() || a.outputs.size() != b.outputs.size()) @@ -499,6 +543,8 @@ void ChainExampleMerger::WriteMinibatch( MergeChainExamples(config_.compress, egs, &merged_eg); std::ostringstream key; key << "merged-" << (num_egs_written_++) << "-" << minibatch_size; + if (!(*egs)[0].bucket.empty()) + key << "?" << (*egs)[0].bucket; writer_->Write(key.str(), merged_eg); } diff --git a/src/nnet3/nnet-chain-example.h b/src/nnet3/nnet-chain-example.h index 187bb4ef3a3..eb6846fa4d2 100644 --- a/src/nnet3/nnet-chain-example.h +++ b/src/nnet3/nnet-chain-example.h @@ -60,7 +60,10 @@ struct NnetChainSupervision { std::vector indexes; - /// The supervision object, containing the FST. + /// The supervision object, containing the FST; its members are + /// weight, num_sequences, frames_per_sequence, label_dim, fst, + /// e2e_fsts (for e2e examples only); alignment_pdfs (which is required + /// only for nnet3-chain-acc-lda-stats). chain::Supervision supervision; /// This is a vector of per-frame weights, required to be between 0 and 1, @@ -76,6 +79,14 @@ struct NnetChainSupervision { /// to disk compactly as unsigned char. Vector deriv_weights; + /// This will be 1 in normal cases, but in the 'chaina' code (chain training + /// with adaptation) it will be set to the number of chunks/sequences per + /// group in this minibatch (the chunks from a particular group are expected + /// to come from the same speaker). 
For example if it's 4, then we are + /// asserting that sequences n=0 through 3 all come from the same speaker, n=4 + /// through 7 all come from the same speaker, and so on. + int32 chunks_per_group; + // Use default assignment operator NnetChainSupervision() { } @@ -118,6 +129,12 @@ struct NnetChainExample { /// be just one member with name == "output". std::vector outputs; + /// This relates to the '--use-query-string' option for merging. Examples + /// with different values of 'bucket' won't be merged together. Note that + /// this member variable is not written or read (in the Write/Read functions) + /// as it's not a permanent part of an eg. It's only used in the merging code. + std::string bucket; + void Write(std::ostream &os, bool binary) const; void Read(std::istream &is, bool binary); @@ -270,7 +287,7 @@ class ChainExampleMerger { std::vector, NnetChainExampleStructureHasher, NnetChainExampleStructureCompare> MapType; -MapType eg_to_egs_; + MapType eg_to_egs_; }; diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index a798cb597f5..d9562887817 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -33,6 +33,7 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, compiler_(*nnet, opts_.nnet_config.optimize_config, opts_.nnet_config.compiler_config), num_minibatches_processed_(0), + max_change_stats_(*nnet), srand_seed_(RandInt(0, 100000)) { if (opts.nnet_config.zero_component_stats) ZeroComponentStats(nnet); @@ -41,9 +42,6 @@ NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, opts.nnet_config.backstitch_training_interval > 0); delta_nnet_ = nnet_->Copy(); ScaleNnet(0.0, delta_nnet_); - const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); - num_max_change_per_component_applied_.resize(num_updatable, 0); - num_max_change_global_applied_ = 0; if (opts.nnet_config.read_cache != "") { bool binary; @@ -111,17 +109,19 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, this->ProcessOutputs(false, eg, &computer); computer.Run(); - // If relevant, add in the part of the gradient that comes from L2 - // regularization. + // If relevant, add in the part of the gradient that comes from + // parameter-level L2 regularization. ApplyL2Regularization(*nnet_, GetNumNvalues(eg.inputs, false) * nnet_config.l2_regularize_factor, delta_nnet_); // Updates the parameters of nnet - bool success = UpdateNnetWithMaxChange(*delta_nnet_, - nnet_config.max_param_change, 1.0, 1.0 - nnet_config.momentum, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, + nnet_config.max_param_change, + 1.0, 1.0 - nnet_config.momentum, nnet_, + &max_change_stats_); // Scale down the batchnorm stats (keeps them fresh... this affects what // happens when we use the model with batchnorm test-mode set). 
@@ -176,9 +176,10 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, } // Updates the parameters of nnet - UpdateNnetWithMaxChange(*delta_nnet_, - nnet_config.max_param_change, max_change_scale, scale_adding, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + UpdateNnetWithMaxChange( + *delta_nnet_, nnet_config.max_param_change, + max_change_scale, scale_adding, nnet_, + &max_change_stats_); if (is_backstitch_step1) { // The following will only do something if we have a LinearComponent or @@ -276,41 +277,10 @@ bool NnetChainTrainer::PrintTotalStats() const { const ObjectiveFunctionInfo &info = iter->second; ans = info.PrintTotalStats(name) || ans; } - PrintMaxChangeStats(); + max_change_stats_.Print(*nnet_); return ans; } -void NnetChainTrainer::PrintMaxChangeStats() const { - KALDI_ASSERT(delta_nnet_ != NULL); - const NnetTrainerOptions &nnet_config = opts_.nnet_config; - int32 i = 0; - for (int32 c = 0; c < delta_nnet_->NumComponents(); c++) { - Component *comp = delta_nnet_->GetComponent(c); - if (comp->Properties() & kUpdatableComponent) { - UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - << "UpdatableComponent; change this code."; - if (num_max_change_per_component_applied_[i] > 0) - KALDI_LOG << "For " << delta_nnet_->GetComponentName(c) - << ", per-component max-change was enforced " - << (100.0 * num_max_change_per_component_applied_[i]) / - (num_minibatches_processed_ * - (nnet_config.backstitch_training_scale == 0.0 ? 1.0 : - 1.0 + 1.0 / nnet_config.backstitch_training_interval)) - << " \% of the time."; - i++; - } - } - if (num_max_change_global_applied_ > 0) - KALDI_LOG << "The global max-change was enforced " - << (100.0 * num_max_change_global_applied_) / - (num_minibatches_processed_ * - (nnet_config.backstitch_training_scale == 0.0 ? 1.0 : - 1.0 + 1.0 / nnet_config.backstitch_training_interval)) - << " \% of the time."; -} - NnetChainTrainer::~NnetChainTrainer() { if (opts_.nnet_config.write_cache != "") { Output ko(opts_.nnet_config.write_cache, opts_.nnet_config.binary_write_cache); diff --git a/src/nnet3/nnet-chain-training.h b/src/nnet3/nnet-chain-training.h index 5bf6a3f6fce..bc5143491ac 100644 --- a/src/nnet3/nnet-chain-training.h +++ b/src/nnet3/nnet-chain-training.h @@ -64,10 +64,6 @@ class NnetChainTrainer { // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; - // Prints out the max-change stats (if nonzero): the percentage of time that - // per-component max-change and global max-change were enforced. - void PrintMaxChangeStats() const; - ~NnetChainTrainer(); private: // The internal function for doing one step of conventional SGD training. @@ -88,11 +84,8 @@ class NnetChainTrainer { chain::DenominatorGraph den_graph_; Nnet *nnet_; - Nnet *delta_nnet_; // Only used if momentum != 0.0 or max-param-change != - // 0.0. nnet representing accumulated parameter-change - // (we'd call this gradient_nnet_, but due to - // natural-gradient update, it's better to consider it as - // a delta-parameter nnet. + Nnet *delta_nnet_; // stores the change to the parameters on each training + // iteration. CachingOptimizingCompiler compiler_; // This code supports multiple output layers, even though in the @@ -101,8 +94,7 @@ class NnetChainTrainer { int32 num_minibatches_processed_; // stats for max-change. 
- std::vector num_max_change_per_component_applied_; - int32 num_max_change_global_applied_; + MaxChangeStats max_change_stats_; unordered_map objf_info_; diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 1ff7daa01d1..53859e9b03c 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -81,9 +81,9 @@ ComponentPrecomputedIndexes* ComponentPrecomputedIndexes::NewComponentPrecompute // static Component* Component::ReadNew(std::istream &is, bool binary) { std::string token; - ReadToken(is, binary, &token); // e.g. "". - token.erase(0, 1); // erase "<". - token.erase(token.length()-1); // erase ">". + ReadToken(is, binary, &token); // e.g. "". + token.erase(0, 1); // erase "<". + token.erase(token.length() - 1); // erase ">". Component *ans = NewComponentOfType(token); if (!ans) KALDI_ERR << "Unknown component type " << token; diff --git a/src/nnet3/nnet-diagnostics.h b/src/nnet3/nnet-diagnostics.h index 61e2ed18e1d..8e88794d022 100644 --- a/src/nnet3/nnet-diagnostics.h +++ b/src/nnet3/nnet-diagnostics.h @@ -61,7 +61,7 @@ struct NnetComputeProbOptions { // constructor of NnetComputeProb that takes a pointer to the nnet, and the // stats will be stored there. bool store_component_stats; - + bool compute_per_dim_accuracy; NnetOptimizeOptions optimize_config; @@ -186,18 +186,18 @@ class NnetComputeProb { @param [out] tot_weight The sum of the values in the supervision matrix @param [out] tot_accuracy The total accuracy, equal to the sum over all row indexes r such that the maximum column index of row r of - supervision and nnet_output is the same, of the sum of + supervision and nnet_output is the same, of the sum of the r'th row of supervision (i.e. the row's weight). @param [out] tot_weight_vec If non-NULL, we write to this location the counts per-class in the supervision matrix. - This is expected to have the same dimension as the - corresponding output in the network. - @param [out] tot_accuracy_vec If non-NULL, we write to this location - the accuracy per-class. For index j, - the value is equal to the sum - over all row indexes r such that the maximum column index + This is expected to have the same dimension as the + corresponding output in the network. + @param [out] tot_accuracy_vec If non-NULL, we write to this location + the accuracy per-class. For index j, + the value is equal to the sum + over all row indexes r such that the maximum column index of row r of supervision is j and nnet_output is also j, - of the sum of the r'th row of supervision + of the sum of the r'th row of supervision (i.e. the row's weight) */ diff --git a/src/nnet3/nnet-example-utils.cc b/src/nnet3/nnet-example-utils.cc index cc5fe3cc050..f837ce27c66 100644 --- a/src/nnet3/nnet-example-utils.cc +++ b/src/nnet3/nnet-example-utils.cc @@ -81,7 +81,13 @@ static void GetIoSizes(const std::vector &src, } - +static int32 FindMaxNValue(const NnetIo &io) { + int32 max_n = 0; + for (auto &index: io.indexes) + if (index.n > max_n) + max_n = index.n; + return max_n; +} // Do the final merging of NnetIo, once we have obtained the names, dims and // sizes for each feature/supervision type. 
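+// When the inputs are themselves already-merged egs, the 'n' values are
+// renumbered as n_new = n_old + (index of the source eg) * example_stride,
+// where example_stride is one plus the largest 'n' found in the first input
+// (see FindMaxNValue above), so the chunks of each source eg stay adjacent
+// in 'n'.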
@@ -98,6 +104,9 @@ static void MergeIo(const std::vector &src, // The features in the different NnetIo in the Indexes across all examples std::vector > output_lists(num_feats); + // This is 1 for single examples and larger than 1 for already-merged egs, and + // it must be the same for all io's across all examples: + int32 example_stride = FindMaxNValue(src[0].io[0]) + 1; // Initialize the merged_eg merged_eg->io.clear(); merged_eg->io.resize(num_feats); @@ -137,11 +146,8 @@ static void MergeIo(const std::vector &src, std::vector::iterator output_iter = output_io.indexes.begin(); // Set the n index to be different for each of the original examples. for (int32 i = this_offset; i < this_offset + this_size; i++) { - // we could easily support merging already-merged egs, but I don't see a - // need for it right now. - KALDI_ASSERT(output_iter[i].n == 0 && - "Merging already-merged egs? Not currentlysupported."); - output_iter[i].n = n; + KALDI_ASSERT(output_iter[i].n < example_stride); + output_iter[i].n += n * example_stride; } this_offset += this_size; // note: this_offset is a reference. } @@ -357,7 +363,8 @@ UtteranceSplitter::~UtteranceSplitter() { KALDI_LOG << "Split " << total_num_utterances_ << " utts, with " << "total length " << total_input_frames_ << " frames (" << (total_input_frames_ / 360000.0) << " hours assuming " - << "100 frames per second)"; + << "100 frames per second) into " << total_num_chunks_ + << " chunks."; float average_chunk_length = total_frames_in_chunks_ * 1.0 / total_num_chunks_, overlap_percent = total_frames_overlap_ * 100.0 / total_input_frames_, output_percent = total_frames_in_chunks_ * 100.0 / total_input_frames_, @@ -556,7 +563,7 @@ bool UtteranceSplitter::LengthsMatch(const std::string &utt, int32 length_tolerance) const { int32 sf = config_.frame_subsampling_factor, expected_supervision_length = (utterance_length + sf - 1) / sf; - if (std::abs(supervision_length - expected_supervision_length) + if (std::abs(supervision_length - expected_supervision_length) <= length_tolerance) { return true; } else { diff --git a/src/nnet3/nnet-example-utils.h b/src/nnet3/nnet-example-utils.h index 52b2ebbf904..0553eeb3d82 100644 --- a/src/nnet3/nnet-example-utils.h +++ b/src/nnet3/nnet-example-utils.h @@ -88,7 +88,6 @@ struct ExampleGenerationConfig { int32 frame_subsampling_factor; std::string num_frames_str; - // The following parameters are derived parameters, computed by // ComputeDerived(). @@ -325,12 +324,14 @@ class ExampleMergingConfig { std::string measure_output_frames; // for back-compatibility, not used. std::string minibatch_size; std::string discard_partial_minibatches; // for back-compatibility, not used. + bool use_query_string; ExampleMergingConfig(const char *default_minibatch_size = "256"): compress(false), measure_output_frames("deprecated"), minibatch_size(default_minibatch_size), - discard_partial_minibatches("deprecated") { } + discard_partial_minibatches("deprecated"), + use_query_string(false) { } void Register(OptionsItf *po) { po->Register("compress", &compress, "If true, compress the output examples " @@ -354,6 +355,14 @@ class ExampleMergingConfig { "--minibatch-size=128=64:128,256/256=32:64,128. Egs are given " "minibatch-sizes based on the specified eg-size closest to " "their actual size."); + po->Register("use-query-string", &use_query_string, "If true, the part of " + "the key name after the final '?' 
in the string (if one " + "is present) will be required to match when determining " + "which egs may be merged (so only egs with the same text " + "after the '?' will be merged), and the key used in the " + "output will end with the same query string, including " + "the '?'. An example query string is: " + "'?lang=english&tw=0.5&bw=1.0'"); } diff --git a/src/nnet3/nnet-parse-test.cc b/src/nnet3/nnet-parse-test.cc index babdbbdcb0e..5ae4917dba6 100644 --- a/src/nnet3/nnet-parse-test.cc +++ b/src/nnet3/nnet-parse-test.cc @@ -23,193 +23,6 @@ namespace kaldi { namespace nnet3 { -void UnitTestConfigLineParse() { - std::string str; - { - ConfigLine cfl; - str = "a-b xx=yyy foo=bar baz=123 ba=1:2"; - bool status = cfl.ParseLine(str); - KALDI_ASSERT(status && cfl.FirstToken() == "a-b"); - - KALDI_ASSERT(cfl.HasUnusedValues()); - std::string str_value; - KALDI_ASSERT(cfl.GetValue("xx", &str_value)); - KALDI_ASSERT(str_value == "yyy"); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(cfl.GetValue("foo", &str_value)); - KALDI_ASSERT(str_value == "bar"); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(!cfl.GetValue("xy", &str_value)); - KALDI_ASSERT(cfl.GetValue("baz", &str_value)); - KALDI_ASSERT(str_value == "123"); - - std::vector int_values; - KALDI_ASSERT(!cfl.GetValue("xx", &int_values)); - KALDI_ASSERT(cfl.GetValue("baz", &int_values)); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(int_values.size() == 1 && int_values[0] == 123); - KALDI_ASSERT(cfl.GetValue("ba", &int_values)); - KALDI_ASSERT(int_values.size() == 2 && int_values[0] == 1 && int_values[1] == 2); - KALDI_ASSERT(!cfl.HasUnusedValues()); - } - - { - ConfigLine cfl; - str = "a-b baz=x y z pp = qq ab =cd ac= bd"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "a-b baz=x y z pp = qq ab=cd ac=bd"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "foo-bar"; - KALDI_ASSERT(cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "foo-bar a=b c d f=g"; - std::string value; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "foo-bar" && - cfl.GetValue("a", &value) && value == "b c d" && - cfl.GetValue("f", &value) && value == "g" && - !cfl.HasUnusedValues()); - } - { - ConfigLine cfl; - str = "zzz a=b baz"; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "zzz" && - cfl.UnusedValues() == "a=b baz"); - } - { - ConfigLine cfl; - str = "xxx a=b baz "; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.UnusedValues() == "a=b baz"); - } - { - ConfigLine cfl; - str = "xxx a=b =c"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "xxx baz='x y z' pp=qq ab=cd ac=bd"; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "xxx"); - std::string str_value; - KALDI_ASSERT(cfl.GetValue("baz", &str_value)); - KALDI_ASSERT(str_value == "x y z"); - KALDI_ASSERT(cfl.GetValue("pp", &str_value)); - KALDI_ASSERT(str_value == "qq"); - KALDI_ASSERT(cfl.UnusedValues() == "ab=cd ac=bd"); - KALDI_ASSERT(cfl.GetValue("ab", &str_value)); - KALDI_ASSERT(str_value == "cd"); - KALDI_ASSERT(cfl.UnusedValues() == "ac=bd"); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(cfl.GetValue("ac", &str_value)); - KALDI_ASSERT(str_value == "bd"); - KALDI_ASSERT(!cfl.HasUnusedValues()); - } - - { - ConfigLine cfl; - str = "x baz= pp = qq flag=t "; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = " x baz= pp=qq flag=t "; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "x"); - - std::string str_value; - KALDI_ASSERT(cfl.GetValue("baz", &str_value)); - 
KALDI_ASSERT(str_value == ""); - KALDI_ASSERT(cfl.GetValue("pp", &str_value)); - KALDI_ASSERT(str_value == "qq"); - KALDI_ASSERT(cfl.HasUnusedValues()); - KALDI_ASSERT(cfl.GetValue("flag", &str_value)); - KALDI_ASSERT(str_value == "t"); - KALDI_ASSERT(!cfl.HasUnusedValues()); - - bool bool_value = false; - KALDI_ASSERT(cfl.GetValue("flag", &bool_value)); - KALDI_ASSERT(bool_value); - } - - { - ConfigLine cfl; - str = "xx _baz=a -pp=qq"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "xx 0baz=a pp=qq"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "xx -baz=a pp=qq"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = "xx _baz'=a pp=qq"; - KALDI_ASSERT(!cfl.ParseLine(str)); - } - { - ConfigLine cfl; - str = " baz=g"; - KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == ""); - bool flag; - KALDI_ASSERT(!cfl.GetValue("baz", &flag)); - } - { - ConfigLine cfl; - str = "xx _baz1=a pp=qq"; - KALDI_ASSERT(cfl.ParseLine(str)); - - std::string str_value; - KALDI_ASSERT(cfl.GetValue("_baz1", &str_value)); - } -} - -void UnitTestReadConfig() { - std::string str = "a-b alpha=aa beta=\"b b\"# String test\n" - "a-b beta2='b c' beta3=bd # \n" - "a-b gamma=1:2:3:4 # Int Vector test\n" - " a-b de1ta=f # Bool + Integer in key Comment test delta=t \n" - "a-b _epsilon=-1 # Int Vector test _epsilon=1 \n" - "a-b zet-_a=0.15 theta=1.1# Float, -, _ test\n" - "a-b quoted='a b c' # quoted string\n" - "a-b quoted2=\"d e 'a b=c' f\" # string quoted with double quotes"; - - std::istringstream is(str); - std::vector lines; - ReadConfigLines(is, &lines); - KALDI_ASSERT(lines.size() == 8); - - ConfigLine cfl; - for (size_t i = 0; i < lines.size(); i++) { - KALDI_ASSERT(cfl.ParseLine(lines[i]) && cfl.FirstToken() == "a-b"); - if (i == 1) { - KALDI_ASSERT(cfl.GetValue("beta2", &str) && str == "b c"); - } - if (i == 4) { - KALDI_ASSERT(cfl.GetValue("_epsilon", &str) && str == "-1"); - } - if (i == 5) { - BaseFloat float_val = 0; - KALDI_ASSERT(cfl.GetValue("zet-_a", &float_val) && ApproxEqual(float_val, 0.15)); - } - if (i == 6) { - KALDI_ASSERT(cfl.GetValue("quoted", &str) && str == "a b c"); - } - if (i == 7) { - KALDI_ASSERT(cfl.GetValue("quoted2", &str) && str == "d e 'a b=c' f"); - } - } -} void UnitTestDescriptorTokenize() { std::vector lines; @@ -281,8 +94,6 @@ int main() { using namespace kaldi; using namespace kaldi::nnet3; - UnitTestConfigLineParse(); - UnitTestReadConfig(); UnitTestDescriptorTokenize(); UnitTestSummarizeVector(); UnitTestNameMatchesPattern(); diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc index a51bba21484..17dec23e7c1 100644 --- a/src/nnet3/nnet-parse.cc +++ b/src/nnet3/nnet-parse.cc @@ -27,353 +27,6 @@ namespace kaldi { namespace nnet3 { - -bool ConfigLine::ParseLine(const std::string &line) { - data_.clear(); - whole_line_ = line; - if (line.size() == 0) return false; // Empty line - size_t pos = 0, size = line.size(); - while (isspace(line[pos]) && pos < size) pos++; - if (pos == size) - return false; // whitespace-only line - size_t first_token_start_pos = pos; - // first get first_token_. - while (!isspace(line[pos]) && pos < size) { - if (line[pos] == '=') { - // If the first block of non-whitespace looks like "foo-bar=...", - // then we ignore it: there is no initial token, and FirstToken() - // is empty. 
- pos = first_token_start_pos; - break; - } - pos++; - } - first_token_ = std::string(line, first_token_start_pos, pos - first_token_start_pos); - // first_token_ is expected to be either empty or something like - // "component-node", which actually is a slightly more restrictive set of - // strings than IsValidName() checks for this is a convenient way to check it. - if (!first_token_.empty() && !IsValidName(first_token_)) - return false; - - while (pos < size) { - if (isspace(line[pos])) { - pos++; - continue; - } - - // OK, at this point we know that we are pointing at nonspace. - size_t next_equals_sign = line.find_first_of("=", pos); - if (next_equals_sign == pos || next_equals_sign == std::string::npos) { - // we're looking for something like 'key=value'. If there is no equals sign, - // or it's not preceded by something, it's a parsing failure. - return false; - } - std::string key(line, pos, next_equals_sign - pos); - if (!IsValidName(key)) return false; - - // handle any quotes. we support key='blah blah' or key="foo bar". - // no escaping is supported. - if (line[next_equals_sign+1] == '\'' || line[next_equals_sign+1] == '"') { - char my_quote = line[next_equals_sign+1]; - size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2); - if (next_quote == std::string::npos) { // no matching quote was found. - KALDI_WARN << "No matching quote for " << my_quote << " in config line '" - << line << "'"; - return false; - } else { - std::string value(line, next_equals_sign + 2, - next_quote - next_equals_sign - 2); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = next_quote + 1; - continue; - } - } else { - // we want to be able to parse something like "... input=Offset(a, -1) foo=bar": - // in general, config values with spaces in them, even without quoting. - - size_t next_next_equals_sign = line.find_first_of("=", next_equals_sign + 1), - terminating_space = size; - - if (next_next_equals_sign != std::string::npos) { // found a later equals sign. 
- size_t preceding_space = line.find_last_of(" \t", next_next_equals_sign); - if (preceding_space != std::string::npos && - preceding_space > next_equals_sign) - terminating_space = preceding_space; - } - while (isspace(line[terminating_space - 1]) && terminating_space > 0) - terminating_space--; - - std::string value(line, next_equals_sign + 1, - terminating_space - (next_equals_sign + 1)); - data_.insert(std::make_pair(key, std::make_pair(value, false))); - pos = terminating_space; - } - } - return true; -} - -bool ConfigLine::GetValue(const std::string &key, std::string *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - *value = (it->second).first; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToReal((it->second).first, value)) - return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, int32 *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!ConvertStringToInteger((it->second).first, value)) - return false; - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, std::vector *value) { - KALDI_ASSERT(value != NULL); - value->clear(); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if (!SplitStringToIntegers((it->second).first, ":,", true, value)) { - // KALDI_WARN << "Bad option " << (it->second).first; - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::GetValue(const std::string &key, bool *value) { - KALDI_ASSERT(value != NULL); - std::map >::iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (it->first == key) { - if ((it->second).first.size() == 0) return false; - switch (((it->second).first)[0]) { - case 'F': - case 'f': - *value = false; - break; - case 'T': - case 't': - *value = true; - break; - default: - return false; - } - (it->second).second = true; - return true; - } - } - return false; -} - -bool ConfigLine::HasUnusedValues() const { - std::map >::const_iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) return true; - } - return false; -} - -std::string ConfigLine::UnusedValues() const { - std::string unused_str; - std::map >::const_iterator it = data_.begin(); - for (; it != data_.end(); ++it) { - if (!(it->second).second) { - if (unused_str == "") - unused_str = it->first + "=" + (it->second).first; - else - unused_str += " " + it->first + "=" + (it->second).first; - } - } - return unused_str; -} - -// This is like ExpectToken but for two tokens, and it -// will either accept token1 and then token2, or just token2. -// This is useful in Read functions where the first token -// may already have been consumed. 
-void ExpectOneOrTwoTokens(std::istream &is, bool binary, - const std::string &token1, - const std::string &token2) { - KALDI_ASSERT(token1 != token2); - std::string temp; - ReadToken(is, binary, &temp); - if (temp == token1) { - ExpectToken(is, binary, token2); - } else { - if (temp != token2) { - KALDI_ERR << "Expecting token " << token1 << " or " << token2 - << " but got " << temp; - } - } -} - -// static -bool ParseFromString(const std::string &name, std::string *string, - int32 *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - if (!ConvertStringToInteger(split_string[i].substr(len), param)) - KALDI_ERR << "Bad option " << split_string[i]; - *string = ""; - // Set "string" to all the pieces but the one we used. - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - -bool ParseFromString(const std::string &name, std::string *string, - bool *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - std::string b = split_string[i].substr(len); - if (b.empty()) - KALDI_ERR << "Bad option " << split_string[i]; - if (b[0] == 'f' || b[0] == 'F') *param = false; - else if (b[0] == 't' || b[0] == 'T') *param = true; - else - KALDI_ERR << "Bad option " << split_string[i]; - *string = ""; - // Set "string" to all the pieces but the one we used. - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - -bool ParseFromString(const std::string &name, std::string *string, - BaseFloat *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - if (!ConvertStringToReal(split_string[i].substr(len), param)) - KALDI_ERR << "Bad option " << split_string[i]; - *string = ""; - // Set "string" to all the pieces but the one we used. - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - -bool ParseFromString(const std::string &name, std::string *string, - std::string *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - *param = split_string[i].substr(len); - - // Set "string" to all the pieces but the one we used. 
- *string = ""; - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - -bool ParseFromString(const std::string &name, std::string *string, - std::vector *param) { - std::vector split_string; - SplitStringToVector(*string, " \t", true, - &split_string); - std::string name_equals = name + "="; // the name and then the equals sign. - size_t len = name_equals.length(); - - for (size_t i = 0; i < split_string.size(); i++) { - if (split_string[i].compare(0, len, name_equals) == 0) { - if (!SplitStringToIntegers(split_string[i].substr(len), ":,", - false, param)) - KALDI_ERR << "Bad option " << split_string[i]; - *string = ""; - // Set "string" to all the pieces but the one we used. - for (size_t j = 0; j < split_string.size(); j++) { - if (j != i) { - if (!string->empty()) *string += " "; - *string += split_string[j]; - } - } - return true; - } - } - return false; -} - bool DescriptorTokenize(const std::string &input, std::vector *tokens) { KALDI_ASSERT(tokens != NULL); @@ -422,32 +75,6 @@ bool DescriptorTokenize(const std::string &input, return true; } -bool IsValidName(const std::string &name) { - if (name.size() == 0) return false; - for (size_t i = 0; i < name.size(); i++) { - if (i == 0 && !isalpha(name[i]) && name[i] != '_') - return false; - if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.') - return false; - } - return true; -} - -void ReadConfigLines(std::istream &is, - std::vector *lines) { - KALDI_ASSERT(lines != NULL); - std::string line; - while (std::getline(is, line)) { - if (line.size() == 0) continue; - size_t start = line.find_first_not_of(" \t"); - size_t end = line.find_first_of('#'); - if (start == std::string::npos || start == end) continue; - end = line.find_last_not_of(" \t", end - 1); - KALDI_ASSERT(end >= start); - lines->push_back(line.substr(start, end - start + 1)); - } -} - std::string ErrorContext(std::istream &is) { if (!is.good()) return "end of line"; char buf[21]; diff --git a/src/nnet3/nnet-parse.h b/src/nnet3/nnet-parse.h index a073a54f7e0..0fc19d51f6c 100644 --- a/src/nnet3/nnet-parse.h +++ b/src/nnet3/nnet-parse.h @@ -26,103 +26,6 @@ namespace kaldi { namespace nnet3 { -/** - This class is responsible for parsing input like - hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' baz="a b c d='a b' e" - and giving you access to the fields, in this case - - FirstToken() == "hi-there", and key->value pairs: - - xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123", - bing->"a b c", baz->"a b c d='a b' e" - - The first token is optional, if the line started with a key-value pair then - FirstValue() will be empty. - - Note: it can parse value fields with space inside them only if they are free of the '=' - character. If values are going to contain the '=' character, you need to quote them - with either single or double quotes. - - Key values may contain -_a-zA-Z0-9, but must begin with a-zA-Z_. - */ -class ConfigLine { - public: - // Tries to parse the line as a config-file line. Returns false - // if it could not for some reason, e.g. parsing failure. In most cases - // prints no warnings; the user should do this. Does not expect comments. - bool ParseLine(const std::string &line); - - // the GetValue functions are overloaded for various types. They return true - // if the key exists with value that can be converted to that type, and false - // otherwise. 
They also mark the key-value pair as having been read. It is - // not an error to read values twice. - bool GetValue(const std::string &key, std::string *value); - bool GetValue(const std::string &key, BaseFloat *value); - bool GetValue(const std::string &key, int32 *value); - // Values may be separated by ":" or by ",". - bool GetValue(const std::string &key, std::vector *value); - bool GetValue(const std::string &key, bool *value); - - bool HasUnusedValues() const; - /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one - /// of the GetValue() functions. - std::string UnusedValues() const; - - const std::string &FirstToken() const { return first_token_; } - - const std::string WholeLine() { return whole_line_; } - // use default assignment operator and copy constructor. - private: - std::string whole_line_; - // the first token of the line, e.g. if line is - // foo-bar baz=bing - // then first_token_ would be "foo-bar". - std::string first_token_; - - // data_ maps from key to (value, is-this-value-consumed?). - std::map > data_; - -}; - -// Note: the ParseFromString functions are to be removed after we switch over to -// using the ConfigLine mechanism. - - -/// \file nnet-parse.h -/// This header contains a few parsing-related functions that are used -/// while reading parsing neural network files and config files. - -/// Function used in Init routines. Suppose name=="foo", if "string" has a -/// field like foo=12, this function will set "param" to 12 and remove that -/// element from "string". It returns true if the parameter was read. -bool ParseFromString(const std::string &name, std::string *string, - int32 *param); - -/// This version of ParseFromString is for parameters of type BaseFloat. -bool ParseFromString(const std::string &name, std::string *string, - BaseFloat *param); - -/// This version of ParseFromString is for parameters of type bool, which can -/// appear as any string beginning with f, F, t or T. -bool ParseFromString(const std::string &name, std::string *string, - bool *param); - -/// This version of ParseFromString is for parsing strings. (these -/// should not contain space). -bool ParseFromString(const std::string &name, std::string *string, - std::string *param); - -/// This version of ParseFromString handles colon-separated or comma-separated -/// lists of integers. -bool ParseFromString(const std::string &name, std::string *string, - std::vector *param); - -/// This function is like ExpectToken but for two tokens, and it will either -/// accept token1 and then token2, or just token2. This is useful in Read -/// functions where the first token may already have been consumed. -void ExpectOneOrTwoTokens(std::istream &is, bool binary, - const std::string &token1, - const std::string &token2); /** This function tokenizes input when parsing Descriptor configuration @@ -142,32 +45,6 @@ void ExpectOneOrTwoTokens(std::istream &is, bool binary, bool DescriptorTokenize(const std::string &input, std::vector *tokens); -/// Returns true if 'name' would be a valid name for a component or node in a -/// Nnet. This is a nonempty string beginning with A-Za-z_, and containing only -/// '-', '_', '.', A-Z, a-z, or 0-9. -bool IsValidName(const std::string &name); - - -/** - This function reads in a config file and *appends* its contents to a vector of - lines; it is responsible for removing comments (anything after '#') and - stripping out any lines that contain only whitespace after comment removal. 
- */ -void ReadConfigLines(std::istream &is, - std::vector *lines); - - -/** - This function converts config-lines from a simple sequence of strings - as output by ReadConfigLines(), into a sequence of first-tokens and - name-value pairs. The general format is: - "command-type bar=baz xx=yyy" - etc., although there are subtleties as to what exactly is allowed, see - documentation for class ConfigLine for details. - This function will die if there was a parsing failure. - */ -void ParseConfigLines(const std::vector &lines, - std::vector *config_lines); /* Returns true if name 'name' matches pattern 'pattern'. The pattern diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 0acaa5c2008..b4563c7a2c3 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -30,6 +30,7 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config, nnet_(nnet), compiler_(*nnet, config_.optimize_config, config_.compiler_config), num_minibatches_processed_(0), + max_change_stats_(*nnet), srand_seed_(RandInt(0, 100000)) { if (config.zero_component_stats) ZeroComponentStats(nnet); @@ -38,9 +39,6 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config, config.backstitch_training_interval > 0); delta_nnet_ = nnet_->Copy(); ScaleNnet(0.0, delta_nnet_); - const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); - num_max_change_per_component_applied_.resize(num_updatable, 0); - num_max_change_global_applied_ = 0; if (config_.read_cache != "") { bool binary; @@ -111,9 +109,9 @@ void NnetTrainer::TrainInternal(const NnetExample &eg, delta_nnet_); // Update the parameters of nnet - bool success = UpdateNnetWithMaxChange(*delta_nnet_, config_.max_param_change, - 1.0, 1.0 - config_.momentum, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, config_.max_param_change, + 1.0, 1.0 - config_.momentum, nnet_, &max_change_stats_); // Scale down the batchnorm stats (keeps them fresh... this affects what // happens when we use the model with batchnorm test-mode set). @@ -167,9 +165,10 @@ void NnetTrainer::TrainInternalBackstitch(const NnetExample &eg, } // Updates the parameters of nnet - UpdateNnetWithMaxChange(*delta_nnet_, config_.max_param_change, + UpdateNnetWithMaxChange( + *delta_nnet_, config_.max_param_change, max_change_scale, scale_adding, nnet_, - &num_max_change_per_component_applied_, &num_max_change_global_applied_); + &max_change_stats_); if (is_backstitch_step1) { // The following will only do something if we have a LinearComponent or @@ -236,40 +235,10 @@ bool NnetTrainer::PrintTotalStats() const { bool ok = info.PrintTotalStats(name); ans = ans || ok; } - PrintMaxChangeStats(); + max_change_stats_.Print(*nnet_); return ans; } -void NnetTrainer::PrintMaxChangeStats() const { - KALDI_ASSERT(delta_nnet_ != NULL); - int32 i = 0; - for (int32 c = 0; c < delta_nnet_->NumComponents(); c++) { - Component *comp = delta_nnet_->GetComponent(c); - if (comp->Properties() & kUpdatableComponent) { - UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - << "UpdatableComponent; change this code."; - if (num_max_change_per_component_applied_[i] > 0) - KALDI_LOG << "For " << delta_nnet_->GetComponentName(c) - << ", per-component max-change was enforced " - << (100.0 * num_max_change_per_component_applied_[i]) / - (num_minibatches_processed_ * - (config_.backstitch_training_scale == 0.0 ? 
1.0 : - 1.0 + 1.0 / config_.backstitch_training_interval)) - << " \% of the time."; - i++; - } - } - if (num_max_change_global_applied_ > 0) - KALDI_LOG << "The global max-change was enforced " - << (100.0 * num_max_change_global_applied_) / - (num_minibatches_processed_ * - (config_.backstitch_training_scale == 0.0 ? 1.0 : - 1.0 + 1.0 / config_.backstitch_training_interval)) - << " \% of the time."; -} - void ObjectiveFunctionInfo::UpdateStats( const std::string &output_name, int32 minibatches_per_phase, diff --git a/src/nnet3/nnet-training.h b/src/nnet3/nnet-training.h index fffc621930a..f09649d1506 100644 --- a/src/nnet3/nnet-training.h +++ b/src/nnet3/nnet-training.h @@ -26,6 +26,7 @@ #include "nnet3/nnet-compute.h" #include "nnet3/nnet-optimize.h" #include "nnet3/nnet-example-utils.h" +#include "nnet3/nnet-utils.h" namespace kaldi { namespace nnet3 { @@ -81,7 +82,7 @@ struct NnetTrainerOptions { opts->Register("l2-regularize-factor", &l2_regularize_factor, "Factor that " "affects the strength of l2 regularization on model " "parameters. The primary way to specify this type of " - "l2 regularization is via the 'l2-regularize'" + "l2 regularization is via the 'l2-regularize' " "configuration value at the config-file level. " " --l2-regularize-factor will be multiplied by the component-level " "l2-regularize values and can be used to correct for effects " @@ -187,10 +188,6 @@ class NnetTrainer { // Prints out the final stats, and return true if there was a nonzero count. bool PrintTotalStats() const; - // Prints out the max-change stats (if nonzero): the percentage of time that - // per-component max-change and global max-change were enforced. - void PrintMaxChangeStats() const; - ~NnetTrainer(); private: // The internal function for doing one step of conventional SGD training. @@ -220,8 +217,7 @@ class NnetTrainer { int32 num_minibatches_processed_; // stats for max-change. 
- std::vector num_max_change_per_component_applied_; - int32 num_max_change_global_applied_; + MaxChangeStats max_change_stats_; unordered_map objf_info_; diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index e020f8fc6a7..61da1d7f6a9 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -2173,5 +2173,47 @@ void ApplyL2Regularization(const Nnet &nnet, } +bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, + BaseFloat max_param_change, + BaseFloat max_change_scale, + BaseFloat scale, Nnet *nnet, + MaxChangeStats *stats) { + bool ans = UpdateNnetWithMaxChange( + delta_nnet, max_param_change, max_change_scale, + scale, nnet, + &(stats->num_max_change_per_component_applied), + &(stats->num_max_change_global_applied)); + stats->num_minibatches_processed++; + return ans; +} + + +void MaxChangeStats::Print(const Nnet &nnet) const { + int32 i = 0; + for (int32 c = 0; c < nnet.NumComponents(); c++) { + const Component *comp = nnet.GetComponent(c); + if (comp->Properties() & kUpdatableComponent) { + const UpdatableComponent *uc = dynamic_cast( + comp); + if (uc == NULL) + KALDI_ERR << "Updatable component does not inherit from class " + << "UpdatableComponent; change this code."; + if (num_max_change_per_component_applied[i] > 0) + KALDI_LOG << "For " << nnet.GetComponentName(c) + << ", per-component max-change was enforced " + << ((100.0 * num_max_change_per_component_applied[i]) / + num_minibatches_processed) + << " \% of the time."; + i++; + } + } + if (num_max_change_global_applied > 0) + KALDI_LOG << "The global max-change was enforced " + << ((100.0 * num_max_change_global_applied) / + num_minibatches_processed) + << "\% of the time."; +} + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 787bd228a38..a5d17eb0437 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -331,13 +331,13 @@ void ReadEditConfig(std::istream &config_file, Nnet *nnet); \code Nnet temp_nnet(delta_nnet); - ScaleNnet(1.0 / max_change_scale, &temp_nnet); - [ Scale down parameters for each component of temp_nnet as needed so - their Euclidean norms do not exceed their per-component max-changes ] + ScaleNnet(scale, &temp_nnet); + [ Scale down parameters for each component of temp_nnet as needed so + their Euclidean norms do not exceed (their per-component max-changes + each multiplied by max_change_scale) ] [ Scale down temp_nnet as needed so its Euclidean norm does not exceed - the global max-change ] - ScaleNnet(max_change_scale, &temp_nnet); // undo the previous scaling. - AddNnet(temp_nnet, scale, nnet); + the global max-change times max_change_scale ] + AddNnet(temp_nnet, 1.0, nnet); \endcode @param [in] delta_nnet The copy of '*nnet' neural network that contains @@ -361,7 +361,8 @@ void ReadEditConfig(std::istream &config_file, Nnet *nnet); max-change, and 'max_change_scale * max_param_change' as the global max-change). @param [in] scale This value, which will normally be 1.0, is a scaling - factor used when adding to 'nnet', applied after any max-changes. + factor used when adding to 'nnet', which is (conceptually) + applied before any max-changes. It is provided for backstitch-related purposes. @param [in,out] nnet The nnet which we add to. 
@param [out] num_max_change_per_component_applied We add to the elements of @@ -377,6 +378,17 @@ bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, num_max_change_per_component_applied, int32 *num_max_change_global_applied); +struct MaxChangeStats; + +// This overloaded version of UpdateNnetWithMaxChange() is a convenience +// wrapper for when you have a MaxChangeStats object to keep track +// of how many times the max-change was applied. See documentation above. +bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, + BaseFloat max_param_change, + BaseFloat max_change_scale, + BaseFloat scale, Nnet *nnet, + MaxChangeStats *stats); + /** This function is used as part of the regular training workflow, prior to @@ -513,6 +525,24 @@ int32 GetNumNvalues(const std::vector &io_vec, bool exhaustive); +struct MaxChangeStats { + int32 num_max_change_global_applied; + int32 num_minibatches_processed; + std::vector num_max_change_per_component_applied; + + MaxChangeStats(const Nnet &nnet): + num_max_change_global_applied(0), + num_minibatches_processed(0), + num_max_change_per_component_applied(NumUpdatableComponents(nnet), 0) { } + + // Prints the max-change stats. Usually will be called at the end + // of the program. The nnet is only needed for structural information, + // to work out the component names. + void Print(const Nnet &nnet) const; +}; + + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3a/Makefile b/src/nnet3a/Makefile new file mode 100644 index 00000000000..5410c54f525 --- /dev/null +++ b/src/nnet3a/Makefile @@ -0,0 +1,23 @@ +all: + +# This directory contains code related to the adaptation +# framework in ../adapt, for nnet3 and (principally) chain +# training. + +include ../kaldi.mk + +TESTFILES = nnet-chaina-utils-test nnet-chaina-training-test + +OBJFILES = nnet-chaina-training.o nnet-chaina-utils.o + +LIBNAME = kaldi-nnet3a + +ADDLIBS = ../fstext/kaldi-fstext.a ../chain/kaldi-chain.a \ + ../nnet3/kaldi-nnet3.a ../adapt/kaldi-adapt.a \ + ../cudamatrix/kaldi-cudamatrix.a ../hmm/kaldi-hmm.a \ + ../transform/kaldi-transform.a ../tree/kaldi-tree.a \ + ../hmm/kaldi-hmm.a ../gmm/kaldi-gmm.a ../lat/kaldi-lat.a \ + ../matrix/kaldi-matrix.a ../util/kaldi-util.a \ + ../base/kaldi-base.a + +include ../makefiles/default_rules.mk diff --git a/src/nnet3a/nnet-chaina-training-test.cc b/src/nnet3a/nnet-chaina-training-test.cc new file mode 100644 index 00000000000..c570ba29340 --- /dev/null +++ b/src/nnet3a/nnet-chaina-training-test.cc @@ -0,0 +1,44 @@ +// nnet3/nnet-chaina-training-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "nnet3a/nnet-chaina-training.h" + +namespace kaldi { +namespace nnet3 { + + +void UnitTestCompile() { + // just testing the compilation works, i.e. 
that all member functions are + // defined + NnetChainaTrainingOptions config; + NnetChainaModels models(true, false, false, "a", "b", "c"); + NnetChainaTrainer trainer(config, &models); +} + + +} // namespace nnet3 +} // namespace kaldi + +int main() { + using namespace kaldi; + using namespace kaldi::nnet3; + SetVerboseLevel(2); + // KALDI_LOG << "Tests succeeded."; + return 0; +} diff --git a/src/nnet3a/nnet-chaina-training.cc b/src/nnet3a/nnet-chaina-training.cc new file mode 100644 index 00000000000..340b4dece7d --- /dev/null +++ b/src/nnet3a/nnet-chaina-training.cc @@ -0,0 +1,1225 @@ +// nnet3/nnet-chaina-training.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "nnet3/nnet-utils.h" +#include "nnet3a/nnet-chaina-training.h" +#include "nnet3a/nnet-chaina-utils.h" + +namespace kaldi { +namespace nnet3 { + +NnetChainaModels::NnetChainaModels( + const NnetChainaTrainingOptions &opts, + const std::string &model_dir, + const std::string &den_fst_dir, + const std::string &transform_dir): + opts_(opts), + model_dir_(model_dir), + den_fst_dir_(den_fst_dir), + transform_dir_(transform_dir) { + std::string bottom_nnet_name; // model_dir/bottom.raw + GetPathname(model_dir, "bottom", "raw", &bottom_nnet_name); + ReadKaldiObject(bottom_nnet_name, &bottom_nnet_); + ComputeSimpleNnetContext(bottom_nnet_, + &bottom_nnet_left_context_, + &bottom_nnet_right_context_); + bool is_top_nnet = false; + InitializeNnet(is_top_nnet, &bottom_nnet_); +} + +void NnetChainaModels::InitializeNnet( + bool is_top_nnet, Nnet *nnet) const { + const NnetChainaTrainingPerModelOptions &bottom_or_top_opts = + (is_top_nnet ? opts_.top : opts_.bottom); + + // we could change that condition later if it turns out to be a problem. + if (bottom_or_top_opts.batchnorm_test_mode) + SetBatchnormTestMode(true, nnet); + if (bottom_or_top_opts.dropout_test_mode) + SetDropoutTestMode(true, nnet); + if (!bottom_or_top_opts.train && bottom_or_top_opts.batchnorm_test_mode) { + // The following is for efficiency in evaluating the nnet; + // it may combine certain component types. + CollapseModel(CollapseModelConfig(), nnet); + } +} + +NnetChainaModels::LanguageInfo::LanguageInfo( + const NnetChainaModels::LanguageInfo &other): + trans_model(other.trans_model), + am_nnet(other.am_nnet), + den_fst(other.den_fst), + transform(other.transform) { } + + +// This code is related to UpdateNnetMovingAverage() in nnet3-chain-combine.cc. +void NnetChainaModels::InterpolateWith( + BaseFloat new_model_weight, + const std::string &model_dir) { + KALDI_ASSERT(new_model_weight > 0.0 && new_model_weight < 1.0); + + std::string bottom_filename; + GetPathname(model_dir, "bottom", "raw", &bottom_filename); + Nnet bottom_nnet; // we don't need the transition model, and the reading code + // is capable of ignoring it. 
+ ReadKaldiObject(bottom_filename, &bottom_nnet); + bool is_top_nnet = false; + InitializeNnet(is_top_nnet, &bottom_nnet); + ScaleNnet(1.0 - new_model_weight, &bottom_nnet_); + AddNnet(bottom_nnet, new_model_weight, &bottom_nnet_); + for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { + const std::string &lang = iter->first; + LanguageInfo *info = iter->second; + std::string model_filename; + GetPathname(model_dir, lang, "mdl", &model_filename); + Nnet top_nnet; // we don't need the transition model, and the reading code + // is capable of ignoring it. + ReadKaldiObject(model_filename, &top_nnet); + is_top_nnet = true; + InitializeNnet(is_top_nnet, &top_nnet); + Nnet &stored_nnet = info->am_nnet.GetNnet(); + ScaleNnet(1.0 - new_model_weight, &stored_nnet); + AddNnet(top_nnet, new_model_weight, &stored_nnet); + } +} + + +NnetChainaModels::NnetChainaModels(const NnetChainaModels &other): + opts_(other.opts_), + model_dir_(other.model_dir_), + den_fst_dir_(other.den_fst_dir_), + transform_dir_(other.transform_dir_), + bottom_nnet_(other.bottom_nnet_), + bottom_nnet_left_context_(other.bottom_nnet_left_context_), + bottom_nnet_right_context_(other.bottom_nnet_right_context_) { + for (auto iter = other.lang_info_.begin(); + iter != other.lang_info_.end(); ++iter) { + const std::string &lang = iter->first; + LanguageInfo *info = iter->second; + lang_info_[lang] = new LanguageInfo(*info); + } +} + + + +void NnetChainaModels::GetPathname(const std::string &dir, + const std::string &name, + const std::string &suffix, + std::string *pathname) { + std::ostringstream str; + str << dir << '/' << name << '.' << suffix; + *pathname = str.str(); +} + +void NnetChainaModels::GetPathname(const std::string &dir, + const std::string &name, + int32 job_id, + const std::string &suffix, + std::string *pathname) { + std::ostringstream str; + str << dir << '/' << name << '.' << job_id << '.' 
<< suffix; + *pathname = str.str(); +} + +NnetChainaModels::LanguageInfo *NnetChainaModels::GetInfoForLang( + const std::string &lang) { + auto iter = lang_info_.find(lang); + if (iter != lang_info_.end()) { + return iter->second; + } else { + LanguageInfo *info = new LanguageInfo(); + + std::string model_filename, den_fst_filename, transform_filename; + GetPathname(model_dir_, lang, "mdl", &model_filename); + GetPathname(den_fst_dir_, lang, "den.fst", &den_fst_filename); + GetPathname(transform_dir_, lang, "ada", &transform_filename); + + { + bool binary; + Input ki(model_filename, &binary); + info->trans_model.Read(ki.Stream(), binary); + info->am_nnet.Read(ki.Stream(), binary); + Nnet &nnet = info->am_nnet.GetNnet(); + bool is_top_nnet = true; + InitializeNnet(is_top_nnet, &nnet); + } + ReadFstKaldi(den_fst_filename, &(info->den_fst)); + ReadKaldiObject(transform_filename, &(info->transform)); + lang_info_[lang] = info; + return info; + } +} + +Nnet* NnetChainaModels::GetBottomNnet() { + return &bottom_nnet_; +} + + +AmNnetSimple* NnetChainaModels::GetNnetForLang( + const std::string &language_name) { + LanguageInfo *info = GetInfoForLang(language_name); + return &(info->am_nnet); +} + +TransitionModel* NnetChainaModels::GetTransitionModelForLang( + const std::string &language_name) { + LanguageInfo *info = GetInfoForLang(language_name); + return &(info->trans_model); +} + +fst::StdVectorFst* NnetChainaModels::GetDenFstForLang( + const std::string &language_name) { + LanguageInfo *info = GetInfoForLang(language_name); + return &(info->den_fst); +} + +Nnet* NnetChainaModels::GetRawNnetForLang( + const std::string &language_name) { + LanguageInfo *info = GetInfoForLang(language_name); + return &(info->am_nnet.GetNnet()); +} + +differentiable_transform::DifferentiableTransformMapped* +NnetChainaModels::GetTransformForLang( + const std::string &language_name) { + LanguageInfo *info = GetInfoForLang(language_name); + return &(info->transform); +} + + + +void NnetChainaModels::Write(const std::string &model_out_dir, + bool binary, int32 job_id) { + std::ostringstream ss; + if (opts_.bottom.train) { + ss << "bottom nnet and "; + std::string bottom_model_name; + GetPathname(model_out_dir, "bottom", job_id, "raw", &bottom_model_name); + WriteKaldiObject(bottom_nnet_, bottom_model_name, binary); + } + if (opts_.top.train) { + ss << "nnets for languages "; + for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { + const std::string &lang_name = iter->first; + ss << lang_name << " "; + LanguageInfo *info = iter->second; + { + // we write it as a 'raw' model without the TransitionModel or + // the AmNnetSimple wrapper, since we can reconstruct those parts + // from the previous iter's model. 
+ std::string top_model_name; + GetPathname(model_out_dir, lang_name, job_id, "raw", &top_model_name); + WriteKaldiObject(info->am_nnet.GetNnet(), top_model_name, binary); + } + } + } + if (opts_.adaptation_model_accumulate) { + ss << "adaptation-model stats for languages "; + for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { + const std::string &lang_name = iter->first; + ss << lang_name << " "; + LanguageInfo *info = iter->second; + { + std::string transform_name; + GetPathname(model_out_dir, lang_name, job_id, "ada", &transform_name); + WriteKaldiObject(info->transform, transform_name, binary); + } + } + } + KALDI_LOG << "Wrote " << ss.str() << "to " << model_out_dir; +} + + +void NnetChainaModels::WriteCombinedModels(const std::string &model_out_dir, + bool binary) { + + std::string bottom_model_name; + GetPathname(model_out_dir, "bottom", "raw", &bottom_model_name); + WriteKaldiObject(bottom_nnet_, bottom_model_name, binary); + + std::ostringstream ss; + for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) { + const std::string &lang_name = iter->first; + ss << lang_name << " "; + LanguageInfo *info = iter->second; + std::string top_model_name; + GetPathname(model_out_dir, lang_name, "mdl", &top_model_name); + + Output ko(top_model_name, binary); + info->trans_model.Write(ko.Stream(), binary); + info->am_nnet.Write(ko.Stream(), binary); + } + KALDI_LOG << "Wrote bottom.raw and .mdl files for languages:" + << ss.str() << "to: " << model_out_dir; +} + +NnetChainaModels::~NnetChainaModels() { + for (auto iter = lang_info_.begin(); iter != lang_info_.end(); ++iter) + delete iter->second; +} + +NnetChainaTopTrainer::NnetChainaTopTrainer( + const std::string &lang_name, + const NnetChainaTrainingOptions &config, + const fst::StdVectorFst &den_fst, + const differentiable_transform::DifferentiableTransformMapped &transform, + Nnet *nnet): + lang_name_(lang_name), + opts_(config), + den_graph_(den_fst, nnet->OutputDim("output")), + transform_(transform), + compiler_(*nnet, opts_.nnet_config.optimize_config, + opts_.nnet_config.compiler_config), + nnet_(nnet), + delta_nnet_(nnet->Copy()), + num_minibatches_processed_(0), + max_change_stats_(*nnet) { + + config.Check(); + + if (opts_.nnet_config.zero_component_stats && + !opts_.top.batchnorm_test_mode) + ZeroComponentStats(nnet); + + ScaleNnet(0.0, delta_nnet_); + if (opts_.nnet_config.read_cache != "") { + // It would be complicated to implement, as there are various top nnets + // and they would all try to read and write the same cache files. 
+ // To implement this, the best way would be to + KALDI_WARN << "The read-cache options are not currently supported here."; + } + KALDI_ASSERT(opts_.nnet_config.momentum >= 0.0); +} + + +NnetChainaTopTrainer::ComputationStructure::ComputationStructure( + bool adapted, + bool train_model, + bool need_input_deriv, + int32 num_sequences, + int32 frames_per_sequence_in, + int32 frames_per_sequence_out, + int32 first_input_t, + int32 top_subsampling_factor): + adapted(adapted), train_model(train_model), + need_input_deriv(need_input_deriv), num_sequences(num_sequences), + frames_per_sequence_in(frames_per_sequence_in), + frames_per_sequence_out(frames_per_sequence_out), + first_input_t(first_input_t), + top_subsampling_factor(top_subsampling_factor) { } + + +NnetChainaBottomTrainer::ComputationStructure::ComputationStructure( + bool train_model, + int32 num_sequences, + int32 frames_per_sequence_in, + int32 frames_per_sequence_out, + int32 first_input_t, + int32 first_output_t): + train_model(train_model), + num_sequences(num_sequences), + frames_per_sequence_in(frames_per_sequence_in), + frames_per_sequence_out(frames_per_sequence_out), + first_input_t(first_input_t), + first_output_t(first_output_t) { } + + +void NnetChainaTopTrainer::ConsolidateMemory() { + ::kaldi::nnet3::ConsolidateMemory(nnet_); + ::kaldi::nnet3::ConsolidateMemory(delta_nnet_); +} + + +std::shared_ptr NnetChainaTopTrainer::GetComputation( + const ComputationStructure &s) { + { + auto iter = computation_map_.find(s); + if (iter != computation_map_.end()) + return iter->second; + } + int32 num_sequences = s.num_sequences, + frames_per_sequence_in = s.frames_per_sequence_in, + frames_per_sequence_out = s.frames_per_sequence_out, + first_input_t = s.first_input_t, + first_output_t = 0, + top_subsampling_factor = s.top_subsampling_factor; + + if (nnet_->InputDim("input") < 0 || + nnet_->OutputDim("output") < 0 || + nnet_->OutputDim("output-si") < 0 || + nnet_->OutputDim("output-xent") < 0 || + nnet_->OutputDim("output-si-xent") < 0) { + KALDI_ERR << "Top neural net for chaina training must have an input called " + "'input' and outputs called 'output', 'output-xent', 'output-si', and " + "'output-si-xent'."; + } + + ComputationRequest request; + request.need_model_derivative = s.train_model; + // It's probably harmless to store stats unless we have batchorm components in + // test mode. + request.store_component_stats = !opts_.top.batchnorm_test_mode; + request.inputs.resize(1); + request.inputs[0].name = "input"; + request.inputs[0].indexes.resize(frames_per_sequence_in * num_sequences); + request.inputs[0].has_deriv = s.need_input_deriv; + // The inputs are in the order: the first frame of all sequences; the second + // frame of all sequences; and so on. + auto iter = request.inputs[0].indexes.begin(); + for (int32 t = first_input_t; + t < first_input_t + frames_per_sequence_in; ++t) { + for (int32 n = 0; n < num_sequences; ++n,++iter) { + iter->n = n; + iter->t = t; + // the x values will already be 0, thanks to the default constructor of + // Index(). + } + } + // The outputs are also in the order: the first frame of all sequences; + // the second frame of all sequences; and so on. + request.outputs.resize(2); + request.outputs[0].name = (s.adapted ? 
"output" : "output-si"); + request.outputs[0].has_deriv = opts_.top.train; + request.outputs[0].indexes.resize(frames_per_sequence_out * num_sequences); + int32 t_stride_out = top_subsampling_factor; + iter = request.outputs[0].indexes.begin(); + for (int32 t = first_output_t; + t < first_output_t + frames_per_sequence_out * t_stride_out; + t += t_stride_out) { + for (int32 n = 0; n < num_sequences; ++n,++iter) { + iter->n = n; + iter->t = t; + } + } + request.outputs[1].has_deriv = opts_.top.train; + request.outputs[1].name = (s.adapted ? "output-xent" : "output-si-xent"); + request.outputs[1].indexes = request.outputs[0].indexes; + std::shared_ptr computation = compiler_.Compile( + request); + computation_map_[s] = computation; + return computation; +} + +bool NnetChainaTopTrainer::TrainUnadapted( + const CuMatrixBase &input, + const NnetComputation &computation, + const chain::Supervision &supervision, + bool need_model_deriv, + const CuVectorBase &deriv_weights, + Posterior *posterior, + CuMatrix *input_deriv) { + + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + + // note: because we give the 1st arg (nnet_) as a pointer to the + // constructor of 'computer', it will use that copy of the nnet to + // store stats. + NnetComputer computer(nnet_config.compute_config, computation, + nnet_, delta_nnet_); + + // Give the inputs to the computer object. + CuMatrix input_copy(input); + computer.AcceptInput("input", &input_copy); + // Do the forward propagation. + computer.Run(); + + const CuMatrixBase + &output = computer.GetOutput("output-si"), + &output_xent = computer.GetOutput("output-si-xent"); + // It's not optimal that we compute these derivatives even when we're not + // training, but the 'compute-prob' phase doesn't dominate. + CuMatrix output_deriv(output.NumRows(), + output.NumCols(), + kUndefined), + output_xent_deriv; + + // Note: we normally turn the chain l2 regularization (which is l2 on the + // output of the nnet) off now, since parameter-level l2 regularization seems + // to work better. So expect 'tot_l2_term' to be zero. + BaseFloat tot_objf, tot_l2_term, tot_weight; + + ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, + supervision, output, + &tot_objf, &tot_l2_term, &tot_weight, + &output_deriv, &output_xent_deriv, + posterior); + + if (!(tot_objf - tot_objf == 0.0)) { + // A NaN or inf was encountered in the objective computation. + // The input_deriv won't be used, so no need to set it. + // Un-freeze the natural gradient and return. + return false; + } + + { + // this block computes and keeps track of the cross-entropy objective. + // at this point, xent_deriv is posteriors derived from the numerator + // computation. note, xent_objf has a factor of '.supervision.weight', + // which is also included in 'tot_weight'. + BaseFloat xent_objf = TraceMatMat(output_xent, output_xent_deriv, kTrans); + output_si_xent_objf_.UpdateStats(lang_name_ + ":output-si-xent", + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, xent_objf); + } + + + if (opts_.apply_deriv_weights && deriv_weights.Dim() != 0) { + output_deriv.MulRowsVec(deriv_weights); + output_xent_deriv.MulRowsVec(deriv_weights); + } + + output_si_objf_.UpdateStats(lang_name_ + ":output-si", + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, tot_objf, tot_l2_term); + + if (input_deriv == NULL && !need_model_deriv) + return true; + + // Freeze the natural gradient. 
We don't want to update the NG scatter + // matrices on this data because we'll next be running the same nnet on the + // speaker-adapted version of the same data, and it would violate the + // independence assumptions needed for NG to work if we updated them. + //if (need_model_deriv) + // FreezeNaturalGradient(true, delta_nnet_); + + computer.AcceptInput("output-si", &output_deriv); + + output_xent_deriv.Scale(opts_.chain_config.xent_regularize); + computer.AcceptInput("output-si-xent", &output_xent_deriv); + + // Do the backprop. + computer.Run(); + + if (input_deriv != NULL) + computer.GetOutputDestructive("input", input_deriv); + + //if (need_model_deriv) // Un-freeze the natural gradient. + // FreezeNaturalGradient(false, delta_nnet_); + + // We'll wait until after the adapted pass to call UpdateNnetWithMaxChange(). + // Training the model on these features in between the two passes would leave + // a strong memory of this minibatch in the model's parameters which could + // cause weird effects. + return true; +} + +bool NnetChainaTopTrainer::TrainAdapted( + const NnetComputation &computation, + const chain::Supervision &supervision, + BaseFloat model_training_scale, + const CuVectorBase &deriv_weights, + CuMatrix *input, + CuMatrix *input_deriv) { + + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + + // note: because we give the 1st arg (nnet_) as a pointer to the + // constructor of 'computer', it will use that copy of the nnet to + // store stats. + NnetComputer computer(nnet_config.compute_config, computation, + nnet_, delta_nnet_); + + // give the input to the computer object. + computer.AcceptInput("input", input); + // Do the forward computation + computer.Run(); + + const CuMatrixBase + &output = computer.GetOutput("output"), + &output_xent = computer.GetOutput("output-xent"); + CuMatrix output_deriv(output.NumRows(), + output.NumCols(), + kUndefined), + output_xent_deriv; + + // Note: we don't normally use the l2 term any more; parameter-level + // regularization seems to work better than regularization of the + // nnet output. + BaseFloat tot_objf, tot_l2_term, tot_weight; + + ComputeChainObjfAndDeriv(opts_.chain_config, den_graph_, + supervision, output, + &tot_objf, &tot_l2_term, &tot_weight, + &output_deriv, &output_xent_deriv); + + if (!(tot_objf - tot_objf == 0.0)) { + // A NaN or inf was encountered in the objective computation. the input_deriv + // won't be used by the calling code, so no need to set it. + return false; + } + + { + // this block computes and keeps track of the cross-entropy objective. + // at this point, xent_deriv is posteriors derived from the numerator + // computation. note, xent_objf has a factor of '.supervision.weight' + BaseFloat xent_objf = TraceMatMat(output_xent, output_xent_deriv, kTrans); + output_xent_objf_.UpdateStats(lang_name_ + ":output-xent", + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, xent_objf); + } + output_objf_.UpdateStats(lang_name_ + ":output", + opts_.nnet_config.print_interval, + num_minibatches_processed_, + tot_weight, tot_objf, tot_l2_term); + + if (input_deriv == NULL && model_training_scale == 0.0) + return true; + + if (opts_.apply_deriv_weights && deriv_weights.Dim() != 0) { + output_deriv.MulRowsVec(deriv_weights); + output_xent_deriv.MulRowsVec(deriv_weights); + } + + computer.AcceptInput("output", &output_deriv); + output_xent_deriv.Scale(opts_.chain_config.xent_regularize); + computer.AcceptInput("output-xent", &output_xent_deriv); + + // Do the backprop. 
+ computer.Run(); + + if (input_deriv != NULL) + computer.GetOutputDestructive("input", input_deriv); + + if (model_training_scale != 0.0) { + // If we're actually training the top model... + + // If relevant, add in the part of the gradient that comes from L2 + // regularization. The factor of (1.0 + opts_.unadapted_top_weight) + // is to make it proportional to the magnitude of the derivative. + ApplyL2Regularization( + *nnet_, + supervision.num_sequences * opts_.nnet_config.l2_regularize_factor * + (1.0 + opts_.top.unadapted_weight), + delta_nnet_); + + // Update the parameters of nnet. + // Note: normally, momentum is 0.0. + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, + nnet_config.max_param_change, + model_training_scale, + model_training_scale * (1.0 - nnet_config.momentum), + nnet_, &max_change_stats_); + + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when, later on, we use the model with batchnorm test-mode set). + ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + + // The following will only do something if we have a LinearComponent + // or AffineComponent with orthonormal-constraint set to a nonzero value. + ConstrainOrthonormal(nnet_); + + if (success) + ScaleNnet(nnet_config.momentum, delta_nnet_); + else + ScaleNnet(0.0, delta_nnet_); + return success; + } else { + return true; + } +} + + +bool NnetChainaTopTrainer::Train(const CuMatrixBase&lt;BaseFloat&gt; &input, + int32 num_sequences, + int32 num_groups, + int32 first_input_t, + int32 top_subsampling_factor, + const VectorBase&lt;BaseFloat&gt; &deriv_weights_in, + const chain::Supervision &supervision, + BaseFloat model_training_scale, + CuMatrix&lt;BaseFloat&gt; *input_deriv) { + // note: if opts_.top.train is false, model_training_scale will have been + // already set to zero. + KALDI_ASSERT(input.NumRows() != 0 && input.NumRows() % num_sequences == 0); + int32 frames_per_sequence_in = input.NumRows() / num_sequences, + frames_per_sequence_out = supervision.frames_per_sequence; + + bool adapted = false; + ComputationStructure structure( + adapted, (model_training_scale != 0.0), (input_deriv != NULL), + num_sequences, frames_per_sequence_in, frames_per_sequence_out, + first_input_t, top_subsampling_factor); + + // Will be the numerator posterior from the unadapted pass, which will be + // padded with l/r context and used to estimate the adapted features. + Posterior post; + + CuVector&lt;BaseFloat&gt; deriv_weights; + if (opts_.apply_deriv_weights) + deriv_weights = deriv_weights_in; + + + bool need_unadapted_model_deriv = + (model_training_scale * opts_.top.unadapted_weight) != 0.0; + + std::shared_ptr&lt;const NnetComputation&gt; computation_unadapted = + GetComputation(structure); + bool success = TrainUnadapted( + input, *computation_unadapted, supervision, + need_unadapted_model_deriv, + deriv_weights, &post, input_deriv); + + if (!success) { + num_minibatches_processed_++; + return false; + } + + // Scale down the model derivatives from the unadapted pass. + if (need_unadapted_model_deriv && opts_.top.unadapted_weight != 1.0) + ScaleNnet(opts_.top.unadapted_weight, delta_nnet_); + + if (input_deriv && opts_.bottom.unadapted_weight != 1.0) { + // Apply the scale from --unadapted-bottom-weight. We'll supply the other + // factor that comes from the language-specific bottom_weight ("bw") + // to UpdateNnetWithMaxChange() later on when we train the bottom nnet.
+ input_deriv->Scale(opts_.bottom.unadapted_weight); + } + + Posterior post_padded(input.NumRows()); + ConvertPosterior(post, num_sequences, first_input_t, + top_subsampling_factor, + transform_.pdf_map, + transform_.transform->NumClasses(), + &post_padded); + + if (opts_.adaptation_model_accumulate) { + // We will later add a way to handle iteration indexes >0, which is needed + // when the adaptation model contains cascaded transforms, but 0 is the + // normal case. + int32 accumulate_iter = 0; + transform_.transform->Accumulate(accumulate_iter, input, + num_sequences, num_groups, + post_padded); + return true; // We don't be evaluating the adapted version of the top model + } + + + structure.adapted = true; + std::shared_ptr computation_adapted = + GetComputation(structure); + + CuMatrix adapted_input(input.NumRows(), input.NumCols(), + kUndefined), + adapted_input_deriv; + + using namespace differentiable_transform; + MinibatchInfoItf *minibatch_info = NULL; + if (!opts_.adaptation_test_mode) { + minibatch_info = transform_.transform->TrainingForward( + input, num_sequences, num_groups, post_padded, &adapted_input); + } else { + transform_.transform->TestingForwardBatch( + input, num_sequences, num_groups, post_padded, &adapted_input); + } + + success = TrainAdapted( + *computation_adapted, supervision, + model_training_scale, deriv_weights, + &adapted_input, + (input_deriv != NULL ? &adapted_input_deriv : NULL)); + + num_minibatches_processed_++; + if (!success) + return false; + + if (input_deriv == NULL) + delete minibatch_info; + else { + transform_.transform->TrainingBackward(input, adapted_input_deriv, + num_sequences, num_groups, post_padded, + minibatch_info, input_deriv); + } + return true; +} + + +/** + This helper function for ConvertPosterior() converts from pdf-ids to + cluster-ids using the map provided in pdf_map, if it is nonempty. + If pdf_map is empty, it just copies the pairs over unchanged. + */ +static inline void ConvertPosteriorElement( + const std::vector &pdf_map, + int32 num_classes, + const std::vector > &post_elem_in, + std::vector > *post_elem_out) { + if (pdf_map.empty()) { + *post_elem_out = post_elem_in; + if (!post_elem_in.empty()) { + // We just check the first int32-- this is a spot-check that the + // pdf-ids are in the correct range. 
+ KALDI_ASSERT(post_elem_in[0].first < num_classes); + } + } else { + int32 num_classes_in = pdf_map.size(); + size_t num_pairs = post_elem_in.size(); + post_elem_out->resize(num_pairs); + for (size_t i =0; i < num_pairs; i++) { + int32 pdf_id = post_elem_in[i].first; + BaseFloat weight = post_elem_in[i].second; + KALDI_ASSERT(pdf_id < num_classes_in); + int32 cluster_id = pdf_map[pdf_id]; + KALDI_ASSERT(cluster_id < num_classes); + (*post_elem_out)[i].first = cluster_id; + (*post_elem_out)[i].second = weight; + } + } +} + +void ConvertPosterior( + const Posterior &post_at_output, + int32 num_sequences, + int32 first_input_t, + int32 top_subsampling_factor, + const std::vector &pdf_map, + int32 num_classes, + Posterior *post_at_input) { + int32 output_post_size = post_at_output.size(), + input_post_size = post_at_input->size(), + s = top_subsampling_factor; + KALDI_ASSERT(input_post_size % num_sequences == 0 && + output_post_size % num_sequences == 0 && + input_post_size >= (output_post_size - 1) * top_subsampling_factor && + top_subsampling_factor > 0); + int32 num_frames_out = output_post_size / num_sequences, + num_frames_in = input_post_size / num_sequences, + last_input_t = first_input_t + (num_frames_in - 1), + first_output_t = 0, + last_output_t = first_output_t + s * (num_frames_out - 1); + + int32 half_s = s / 2; // note: this will round down, which is intended. + + for (int32 t_in = first_input_t; t_in <= last_input_t; t_in++) { + // find the corresponding output frame by rounding t to the closest + // t that's a multiple of top_subsampling_factor (rounding down in + // case of ties). We do this by adding half_s and rounding down. + int32 t_out = s * DivideRoundingDown(t_in + half_s, s); + if (t_out >= first_output_t && t_out <= last_output_t) { + for (int32 n = 0; n < num_sequences; n++) { + int32 input_index = num_sequences * (t_in - first_input_t) + n, + output_index = num_sequences * ((t_out - first_output_t) / s) + n; + ConvertPosteriorElement(pdf_map, num_classes, + post_at_output[output_index], + &((*post_at_input)[input_index])); + } + } + // else just leave the input posterior for this frame empty. This will + // happen for most of the frames that were added for left and right context. + } +} + +BaseFloat NnetChainaTopTrainer::GetTotalObjf(bool adapted, BaseFloat *weight) const { + const ObjectiveFunctionInfo &objf = + (adapted ? 
output_objf_ : output_si_objf_); + *weight = objf.tot_weight; + return objf.tot_objf; +} + +bool NnetChainaTopTrainer::PrintTotalStats() const { + bool ans = false; + if (output_si_objf_.PrintTotalStats(lang_name_ + ":output-si")) + ans = true; + if (output_objf_.PrintTotalStats(lang_name_ + ":output")) + ans = true; + if (output_si_xent_objf_.PrintTotalStats(lang_name_ + ":output-si-xent")) + ans = true; + if (output_xent_objf_.PrintTotalStats(lang_name_ + ":output-xent")) + ans = true; + KALDI_LOG << "Max-change stats for language " + << lang_name_ << ":"; + max_change_stats_.Print(*nnet_); + return ans; +} + + +NnetChainaTopTrainer::~NnetChainaTopTrainer() { + delete delta_nnet_; +} + +void NnetChainaBottomTrainer::ConsolidateMemory() { + ::kaldi::nnet3::ConsolidateMemory(nnet_); + ::kaldi::nnet3::ConsolidateMemory(delta_nnet_); +} + +NnetComputer* NnetChainaBottomTrainer::Forward( + int32 num_sequences, + int32 first_input_t, + int32 first_output_t, + int32 frames_per_sequence_out, + bool train_model, + CuMatrix *input, + CuMatrix *output) { + KALDI_ASSERT(input->NumRows() != 0 && input->NumRows() % num_sequences == 0); + int32 frames_per_sequence_in = input->NumRows() / num_sequences; + ComputationStructure s(train_model, + num_sequences, + frames_per_sequence_in, + frames_per_sequence_out, + first_input_t, first_output_t); + // Note: this will be cached in the unordered_map owned by this class, so we + // don't have to worry about it being deleted before we're done with the + // NnetComputer object. + std::shared_ptr computation = GetComputation(s); + + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + NnetComputer *computer = new NnetComputer(nnet_config.compute_config, + *computation, nnet_, delta_nnet_); + computer->AcceptInput("input", input); + computer->Run(); + if (!train_model) { + computer->GetOutputDestructive("output", output); + delete computer; + return NULL; + } else { + *output = computer->GetOutput("output"); + return computer; + } +} + + +void NnetChainaBottomTrainer::Backward(BaseFloat model_training_scale, + int32 num_sequences, + NnetComputer *computer, + CuMatrix *output_deriv) { + // if model_training_scale was 0.0, this function should not have been called. + KALDI_ASSERT(model_training_scale > 0.0); + computer->AcceptInput("output", output_deriv); + computer->Run(); + + delete computer; + + const NnetTrainerOptions &nnet_config = opts_.nnet_config; + + + // If relevant, add in the part of the gradient that comes from L2 + // regularization. The factor of (1.0 + opts_.unadapted_bottom_weight) + // is to make it proportional to the magnitude of the derivative. + ApplyL2Regularization( + *nnet_, + num_sequences * opts_.nnet_config.l2_regularize_factor * + (1.0 + opts_.bottom.unadapted_weight), + delta_nnet_); + + + // we may later provide a way to set a different max-change for the bottom + // nnet than on the top nnet. + // Note: normally, momentum is 0.0. + bool success = UpdateNnetWithMaxChange( + *delta_nnet_, + nnet_config.max_param_change, + model_training_scale, + model_training_scale * (1.0 - nnet_config.momentum), + nnet_, + &max_change_stats_); + + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when, later on, we use the model with batchnorm test-mode set). + ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + + // The following will only do something if we have a LinearComponent + // or AffineComponent with orthonormal-constraint set to a nonzero value. 
+ ConstrainOrthonormal(nnet_); + + if (success) + ScaleNnet(nnet_config.momentum, delta_nnet_); + else + ScaleNnet(0.0, delta_nnet_); + + static bool warned_momentum = false; + if (model_training_scale != 1.0 && nnet_config.momentum != 0.0 && + !warned_momentum) { + KALDI_WARN << "Momentum does not interact correctly with top_weight or " + "bottom_weight values. Will not warn again."; + warned_momentum = true; + } + num_minibatches_processed_++; +} + + +NnetChainaBottomTrainer::NnetChainaBottomTrainer( + const NnetChainaTrainingOptions &opts, + Nnet *nnet): + opts_(opts), + nnet_(nnet), + delta_nnet_(nnet->Copy()), + compiler_(*nnet, opts_.nnet_config.optimize_config, + opts_.nnet_config.compiler_config), + max_change_stats_(*nnet) { + if (opts_.nnet_config.zero_component_stats && + !opts_.bottom.batchnorm_test_mode) + ZeroComponentStats(nnet); + ScaleNnet(0.0, delta_nnet_); + if (opts_.nnet_config.read_cache != "") { + // It would be complicated to implement, as there are various top nnets + // and they would all try to read and write the same cache files. + // To implement this, the best way would be to + KALDI_WARN << "The read-cache options are not currently supported."; + } + KALDI_ASSERT(opts_.nnet_config.momentum >= 0.0 && + opts_.nnet_config.max_param_change >= 0.0 && + opts_.bottom_subsampling_factor >= 1); +} + +std::shared_ptr NnetChainaBottomTrainer::GetComputation( + const ComputationStructure &s) { + { // Check in the cache, in case we already handled this computation. + auto iter = computation_map_.find(s); + if (iter != computation_map_.end()) + return iter->second; + } + + if (!opts_.bottom.train) { + KALDI_ASSERT(!s.train_model); + } + + int32 num_sequences = s.num_sequences, + frames_per_sequence_in = s.frames_per_sequence_in, + frames_per_sequence_out = s.frames_per_sequence_out, + first_input_t = s.first_input_t, + first_output_t = s.first_output_t; + + if (nnet_->InputDim("input") < 0 || + nnet_->OutputDim("output") < 0) { + KALDI_ERR << "Bottom neural net for chaina training must have an input " + "called 'input' and an output called 'output'."; + } + + ComputationRequest request; + request.need_model_derivative = s.train_model; + // It's probably safe to store component-level stats, unless the + // batchnorm is in test mode. + request.store_component_stats = !opts_.bottom.batchnorm_test_mode; + request.inputs.resize(1); + request.inputs[0].name = "input"; + request.inputs[0].indexes.resize(frames_per_sequence_in * num_sequences); + // The inputs are in the order: all frames of sequence 0; then all frames of + // sequence 1; and so on. This is how the example-merging code does it, since + // it's more convenient when dealing with compressed matrices. + auto iter = request.inputs[0].indexes.begin(); + for (int32 n = 0; n < num_sequences; n++) { + for (int32 t = first_input_t; + t < first_input_t + frames_per_sequence_in; ++t,++iter) { + iter->n = n; + iter->t = t; + } + } + // ... but the outputs are in the order: the first frame of all sequences; + // the second frame of all sequences; and so on. 
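+  // E.g. with num_sequences = 2 and bottom_subsampling_factor = 3, the output
+  // indexes will be ordered (n=0,t=first_output_t), (n=1,t=first_output_t),
+  // (n=0,t=first_output_t+3), (n=1,t=first_output_t+3), and so on.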
+ request.outputs.resize(1); + request.outputs[0].name = "output"; + request.outputs[0].has_deriv = s.train_model; + request.outputs[0].indexes.resize(frames_per_sequence_out * num_sequences); + int32 t_stride_out = opts_.bottom_subsampling_factor; + iter = request.outputs[0].indexes.begin(); + for (int32 t = first_output_t; + t < first_output_t + frames_per_sequence_out * t_stride_out; + t += t_stride_out) { + for (int32 n = 0; n < num_sequences; ++n,++iter) { + iter->n = n; + iter->t = t; + } + } + std::shared_ptr computation = compiler_.Compile( + request); + computation_map_[s] = computation; + return computation; +} + +void NnetChainaBottomTrainer::PrintTotalStats() const { + KALDI_LOG << "Max-change stats for bottom nnet:"; + max_change_stats_.Print(*nnet_); +} +NnetChainaBottomTrainer::~NnetChainaBottomTrainer() { + delete delta_nnet_; +} + + +void NnetChainaTrainer::GetContextInfo( + const std::string &lang, + int32 *bottom_left_context, + int32 *bottom_right_context, + int32 *top_left_context, + int32 *top_right_context) { + +} + +BaseFloat NnetChainaTrainer::GetTotalObjf( + bool adapted, BaseFloat *weight) const { + *weight = 0.0; + BaseFloat tot_objf = 0.0; + for (auto iter = top_trainers_.begin(); iter != top_trainers_.end(); + ++iter) { + BaseFloat this_weight; + tot_objf += iter->second->GetTotalObjf(adapted, &this_weight); + *weight += this_weight; + } + return tot_objf; +} + +bool NnetChainaTrainer::PrintTotalStats() const { + bottom_trainer_.PrintTotalStats(); + bool ans = false; + for (auto iter = top_trainers_.begin(); iter != top_trainers_.end(); + ++iter) + if (iter->second->PrintTotalStats()) + ans = true; + return ans; +} + +NnetChainaTrainer::NnetChainaTrainer( + const NnetChainaTrainingOptions &config, + NnetChainaModels *models): + opts_(config), + models_(models), + bottom_trainer_(opts_, models->GetBottomNnet()) { + ComputeSimpleNnetContext(*models->GetBottomNnet(), + &bottom_left_context_, + &bottom_right_context_); +} + + +NnetChainaTopTrainer* NnetChainaTrainer::GetTopTrainerForLang( + const std::string &lang) { + auto iter = top_trainers_.find(lang); + if (iter != top_trainers_.end()) + return iter->second; + NnetChainaTopTrainer *ans = + new NnetChainaTopTrainer( + lang, opts_, + *(models_->GetDenFstForLang(lang)), + *(models_->GetTransformForLang(lang)), + models_->GetRawNnetForLang(lang)); + top_trainers_[lang] = ans; + return ans; +} + +// 'key' might be something like "afsdadsfds12345?lang=english&tw=1.0&bw=0.5" +// expressing how much we want this eg to be used to train the top, and bottom, +// models respectively. +void NnetChainaTrainer::Train(const std::string &key, + const NnetChainExample &eg) { + size_t num_top_trainers = top_trainers_.size(); + std::string lang_name = "default"; + // 'top_weight' is a weight on the derivatives and max-change + // when training the top model, 'bottom_weight' is the same + // for the bottom model. 
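+  // E.g. for the example key above ("...?lang=english&tw=1.0&bw=0.5"),
+  // lang_name becomes "english", top_weight 1.0 and bottom_weight 0.5; keys
+  // without such a query string leave the defaults below unchanged.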
+ BaseFloat top_weight = 1.0, + bottom_weight = 1.0; + ParseFromQueryString(key, "lang", &lang_name); + ParseFromQueryString(key, "tw", &top_weight); + ParseFromQueryString(key, "bw", &bottom_weight); + KALDI_ASSERT(top_weight >= 0.0 && bottom_weight >= 0.0); + + if (!opts_.bottom.train) + bottom_weight = 0.0; + if (!opts_.top.train) + top_weight = 0.0; + + int32 num_sequences, chunks_per_group, first_input_t, + num_input_frames, num_output_frames, + frame_subsampling_factor, + eg_left_context, eg_right_context; + FindChainaExampleStructure(eg, &num_sequences, &chunks_per_group, + &first_input_t, + &num_input_frames, &num_output_frames, + &frame_subsampling_factor, + &eg_left_context, &eg_right_context); + KALDI_ASSERT(num_sequences % chunks_per_group == 0); + int32 num_groups = num_sequences / chunks_per_group; + + AmNnetSimple *top_am_nnet = models_->GetNnetForLang(lang_name); + int32 top_left_context = top_am_nnet->LeftContext(), + top_right_context = top_am_nnet->RightContext(); + + int32 first_embedding_t, + num_embedding_frames; + ComputeEmbeddingTimes(first_input_t, num_input_frames, num_output_frames, + frame_subsampling_factor, + opts_.bottom_subsampling_factor, + bottom_left_context_, bottom_right_context_, + top_left_context, top_right_context, + opts_.keep_embedding_context, + &first_embedding_t, &num_embedding_frames); + + const GeneralMatrix &eg_input = eg.inputs[0].features; + CuMatrix cu_input(eg_input.NumRows(), eg_input.NumCols(), + kUndefined), + cu_embedding; + eg_input.CopyToMat(&cu_input); + bool train_bottom_nnet = bottom_weight != 0.0; + KALDI_ASSERT(cu_input.NumRows() == num_input_frames * num_sequences); + + NnetComputer *computer = bottom_trainer_.Forward( + num_sequences, first_input_t, + first_embedding_t, num_embedding_frames, + train_bottom_nnet, + &cu_input, &cu_embedding); + + int32 b = opts_.bottom_subsampling_factor, + first_embedding_t_subsampled = first_embedding_t / b, + top_subsampling_factor = frame_subsampling_factor / b; + + NnetChainaTopTrainer *top_trainer = GetTopTrainerForLang(lang_name); + + CuMatrix cu_embedding_deriv; + if (train_bottom_nnet) + cu_embedding_deriv.Resize(cu_embedding.NumRows(), cu_embedding.NumCols()); + + + bool success = top_trainer->Train(cu_embedding, num_sequences, + num_groups, + first_embedding_t_subsampled, + top_subsampling_factor, + eg.outputs[0].deriv_weights, + eg.outputs[0].supervision, + top_weight, + (train_bottom_nnet ? + &cu_embedding_deriv : NULL)); + + if (success && train_bottom_nnet) { + bottom_trainer_.Backward(bottom_weight, num_sequences, computer, + &cu_embedding_deriv); + } else { + delete computer; // if it's NULL, this will do nothing. + } + + if (top_trainers_.size() != num_top_trainers) { + // Move any permanently held bits of GPU memory to low addresses, to reduce + // fragmentation. 
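+    // (This condition only holds on minibatches where GetTopTrainerForLang()
+    // created a new top trainer above, i.e. the first time each language is
+    // seen.)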
+ bottom_trainer_.ConsolidateMemory(); + top_trainer->ConsolidateMemory(); + } + +} + + +NnetChainaTrainer::~NnetChainaTrainer() { + for (auto iter = top_trainers_.begin(); iter != top_trainers_.end(); + ++iter) + delete iter->second; +} + + + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3a/nnet-chaina-training.h b/src/nnet3a/nnet-chaina-training.h new file mode 100644 index 00000000000..559fb9dfba4 --- /dev/null +++ b/src/nnet3a/nnet-chaina-training.h @@ -0,0 +1,970 @@ +// nnet3a/nnet-chaina-training.h + +// Copyright 2015-2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET3_NNET_CHAINA_TRAINING_H_ +#define KALDI_NNET3_NNET_CHAINA_TRAINING_H_ + +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-computation.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-optimize.h" +#include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-training.h" +#include "nnet3/am-nnet-simple.h" +#include "chain/chain-training.h" +#include "chain/chain-den-graph.h" +#include "adapt/differentiable-transform-itf.h" + +namespace kaldi { +namespace nnet3 { + + +// This contains the subset of options that you can set for the bottom and the +// top model separately. They are set, for instance, as --bottom.train=false, +// or --top.dropout-test-mode=true. +struct NnetChainaTrainingPerModelOptions { + BaseFloat unadapted_weight; + bool train; + bool dropout_test_mode; + bool batchnorm_test_mode; + + NnetChainaTrainingPerModelOptions(): + unadapted_weight(0.5), + train(true), + dropout_test_mode(false), batchnorm_test_mode(false) { } + + + void Register(OptionsItf *opts) { + opts->Register("unadapted-weight", &unadapted_weight, + "Scale that is applied to the derivatives arising from the " + "unadapted pass of model evaluation, when training " + "Affects how much we prioritize the unadapted " + "features for neural nnet training."); + opts->Register("train", &train, + "Set this to false to disable training for this model."); + opts->Register("dropout-test-mode", &dropout_test_mode, + "Setting this option sets test mode on any dropout components. " + "Will persist in the model written out, if it's being trained."); + opts->Register("batchnorm-test-mode", &batchnorm_test_mode, + "Setting this option sets test mode on any batch-norm " + "(or batch-norm-like) components. 
"); + } + void Check() const { + KALDI_ASSERT(!(train && batchnorm_test_mode)); + KALDI_ASSERT(unadapted_weight >= 0.0); + } +}; + + +struct NnetChainaTrainingOptions { + NnetTrainerOptions nnet_config; + chain::ChainTrainingOptions chain_config; + NnetChainaTrainingPerModelOptions top; + NnetChainaTrainingPerModelOptions bottom; + bool apply_deriv_weights; + int32 bottom_subsampling_factor; + bool keep_embedding_context; + bool adaptation_model_accumulate; + bool adaptation_test_mode; + + NnetChainaTrainingOptions(): + apply_deriv_weights(true), + bottom_subsampling_factor(1), + keep_embedding_context(true), + adaptation_model_accumulate(false), + adaptation_test_mode(false) { } + + void Register(OptionsItf *opts) { + nnet_config.Register(opts); + chain_config.Register(opts); + ParseOptions top_opts("top", opts); + top.Register(&top_opts); // Register with prefix "top". + ParseOptions bottom_opts("bottom", opts); + bottom.Register(&bottom_opts); // Register with prefix "bottom". + + opts->Register("apply-deriv-weights", &apply_deriv_weights, + "If true, apply the per-frame derivative weights stored with " + "the example"); + opts->Register("bottom-subsampling-factor", &bottom_subsampling_factor, + "Determines the frequency at which we subsample the " + "embeddings from the bottom nnet. Implicitly, the " + "subsampling factor in the top nnet is the overall " + "--frame-subsampling-factor (determined when we dumped " + "the egs) divided by this value."); + opts->Register("keep-embedding-context", &keep_embedding_context, + "If true, we compute as much left/right context of the " + "embedding vectors (the output of the bottom nnet) as is " + "possible given the provided input features in the eg. " + "You'll generally only want this to be true " + "if the top network is recurrent or otherwise has " + "optional dependencies (for example: if it uses " + "StatisticsExtractionComponent, IfDefined(), Failover(), " + "etc.)."); + opts->Register("adaptation-model-accumulate", &adaptation_model_accumulate, + "Set this to true if you want to accumulate stats for " + "the adaptation model (i.e., its class-dependent means). " + "This will normally be done just once after training the " + "model, and will cause the adaptation objects to be " + "written out to . If this option is given, " + "the speaker adapted pass of the top model, and training " + "of the top or bottom model, will not be done; and we " + "expect --bottom-model-test-mode=true and " + "--top-model-test-mode=true to be set."); + opts->Register("adaptation-test-mode", &adaptation_test_mode, + "If true, use test mode for the adaptation model, which " + "means we'll use previously computed target models " + "rather than ones estimated from the minibatch. Training of " + "the bottom model is currently not supported in this case " + "(and, in any case, is likely undesirable)."); + } + void Check() const { + KALDI_ASSERT(bottom_subsampling_factor > 0); + top.Check(); + bottom.Check(); + } +}; + + +/** + This class, intended to mostly be accessed by NnetChainaTrainer, handles the + logic of reading the models and their corresponding denominator FSTs from + disk, and of writing out the corresponding (raw) trained models when + this iteration of training has finished. + + The reason this is not entirely trivial is that we want to make it easy to + support the multilingual case. 
In this case there is one 'bottom' model (the + embedding extractor) but there may be multiple 'top' models, each with their + associated transition model and denominator FST, containing their own + langauge name. We use a directory to organize these. + */ +class NnetChainaModels { + public: + /** + Constructor to which you pass the model directory and the den-fst + directory. The directory structure is: + /bottom.raw + should exist, and then for each language name (e.g. "english"), the following + files should exist: + /english.mdl /english.den.fst /english.ada + There is no requirement that all these directories be distinct. + + In practice, the language name will be either "default", in the + typical (monolingual) setup, or it might be arbitrary strings + representing languages such as "english", "french", and so on. + In general the language can be any string containing ASCII letters, numbers + or underscores. + + The models and denominator FSTs will only be read when they are actually + required, so languages that are not used by a particular job (e.g. because + they were not represented in the egs) will not actually be read. + + @param [in] opts Training options; needed to know which models + we should write out, and whether to set test mode + on models when reading them in. + @param [in] model_dir Directory where we'll find bottom.raw, and + .mdl for each language present in the egs + (the will be worked out from the key name from + "...?lang=xxx" in the key when reading the egs, + see ParseFromQueryString() in nnet-chain-utils.h. + @param [in] den_fst_dir Directory where we'll find the denominator + FST .den.fst for each language present in + the egs. + @param [in] transform_dir Directory where we'll find the + transforms (of type DifferentiableTransformItf), + as files .ada for each language present + in the egs. + */ + NnetChainaModels(const NnetChainaTrainingOptions &opts, + const std::string &model_dir, + const std::string &den_fst_dir, + const std::string &transform_dir); + + // Copy constructor + NnetChainaModels(const NnetChainaModels &other); + + + /* + This interpolates the (top and bottom) models stored here with the one in + 'model_dir', giving a weight 0 < new_model_weight < 1 to the new models. + All models currently loaded will be looked for (this depends what + languages were present in the egs), so you need to actually use this + object for training or objective evaluation before calling this function + on it. + */ + void InterpolateWith( + BaseFloat new_model_weight, + const std::string &model_dir); + + + Nnet* GetBottomNnet(); + + /** + Returns the AmNnetSimple object corresponding to a given language + name (e.g. "default", "english", "french"). Note: the model + file /.mdl will contain a TransitionModel and an + AmNnetSimple object + */ + AmNnetSimple *GetNnetForLang(const std::string &language_name); + + TransitionModel *GetTransitionModelForLang( + const std::string &language_name); + + + fst::StdVectorFst *GetDenFstForLang(const std::string &language_name); + + // This convenience function returns the Nnet object in the + // AmNnetSimple object returned by 'GetNnetForLang'. 
+ Nnet *GetRawNnetForLang(const std::string &language_name); + + differentiable_transform::DifferentiableTransformMapped *GetTransformForLang( + const std::string &language_name); + + // Writes out the following files: + // /bottom..raw (if opts_.bottom.train) + // and, for each language that we accessed, + // /..raw (if opts_.top.train) + // /..ada (if opts_.adaptation_model_accumulate) + // + // Thus, this writes out any models that we trained. There is no + // corresponding Read() function. + void Write(const std::string &model_out_dir, + bool binary, + int32 job_id); + + // This is a version of Write() is specialized for use by the + // model-combination code; it differs from the Write() above in + // that it writes out all models we have (ignoring whether or not + // they were trained), and it writes out the 'top' models as + // .mdl files (including the transition models). + void WriteCombinedModels(const std::string &model_out_dir, + bool binary); + + + ~NnetChainaModels(); + private: + // This function sets "pathname" to the string: + // /. + void GetPathname(const std::string &dir, + const std::string &name, + const std::string &suffix, + std::string *pathname); + + // If job_id is >= 0, then this version of GetPathname() sets "pathname" to + // the string: + // /.. + // otherwise (job_id < 0) it sets it to + // /. + void GetPathname(const std::string &dir, + const std::string &name, + int32 job_id, + const std::string &suffix, + std::string *pathname); + + // struct LanguageInfo contains the data that is stored per language. + struct LanguageInfo { + // am_nnet comes from /.mdl, which also + // stores a TransitionModel. + TransitionModel trans_model; + AmNnetSimple am_nnet; + // den_fst comes from /.den.fst + fst::StdVectorFst den_fst; + // transform comes from /.ada + differentiable_transform::DifferentiableTransformMapped transform; + LanguageInfo() { } + // Copy constructor + LanguageInfo(const LanguageInfo &other); + }; + + // Depending on opts_, this function may zero the component stats, set test + // mode for batchnorm and/or dropout components, and do model-collapsing. + void InitializeNnet(bool is_top_nnet, Nnet *nnet) const; + + // get the LanguageInfo* for this language, creating it (and reading its + // contents from disk) if it does not already exist. + LanguageInfo *GetInfoForLang(const std::string &lang); + + + const NnetChainaTrainingOptions &opts_; + // Directory where models are located. + std::string model_dir_; + // Directory where denominator FSTs are located. + std::string den_fst_dir_; + // Directory where transforms (type: DifferentiableTransformMapped) are located. + std::string transform_dir_; + + // This corresponds to /bottom.raw. + Nnet bottom_nnet_; + // The left and right context of bottom_nnet_. + int32 bottom_nnet_left_context_; + int32 bottom_nnet_right_context_; + + std::unordered_map lang_info_; +}; + + +/** + This object, which has a similar function to NnetChainTrainer, trains the + 'top' model for a single language and (optionally) outputs the derivatives + required to obtain the 'bottom' model. + */ +class NnetChainaTopTrainer { + public: + /** + Constructor. + @param [in] lang_name The name of the language this corresponds to + (needed for diagnostics). E.g. "default", + "english". + @param [in] config Options class + @param [in] den_fst The denominator FST for this language + @param [in] transform The transform object which will be used to produce adapted + features after the first pass of training. 
+ @param [in,out] nnet The neural net we are training. Expected to have + outputs called "output-si" (speaker-independent + output), "output", "output-si-xent", "output-xent", + and an input called "input". This class does not + take ownership of the pointer, but it will modify + its parameters (and stored statistics) during + training. + */ + NnetChainaTopTrainer( + const std::string &lang_name, + const NnetChainaTrainingOptions &config, + const fst::StdVectorFst &den_fst, + const differentiable_transform::DifferentiableTransformMapped &transform, + Nnet *nnet); + + /** Train on one minibatch. + @param [in] input The input (unadapted) features, most likely the embeddings + that are the output of the 'bottom' nnet. Assumed to form a + regular grid with the 't' value having higher stride, so the + first 'num_sequences' rows would correspond to the + lowest-numbered frames for all sequences, and so on. + @param [in] num_sequences The number of sequences/chunks represented + in 'input' (a.k.a. the minibatch size). Actually this must + be equal to supervision.num_sequences, but it's easier for + reasons of clarity and documentation to repeat it here. + @param [in] num_groups The total number of groups of chunks (you + can think of these as the same as speakers). Must be >1, and must divide + num_sequences. The number of sequences per speaker + must be the same for all speakers (it will equal num_sequences / num_groups), + and the sequences for a speaker must be consecutively numbered. + @param [in] first_input_t The 't' value corresponding to the first + input frame (will normally be a negative number, + corresponding to the left context we are giving to the + 'top' model, since we renumber to ensure that the sequences + have 't' values starting from 0). The 't' values at the + input will be consecutive, and the number of frames per + sequence will equal input.NumRows() / num_sequences. Note: + if the embeddings are computed at a lower frame rate than + the original features, we renumber things to make the + embeddings consecutive. + @param [in] top_subsampling_factor The subsampling factor of the top network + (which will equal the frame subsampling factor implicit in the original + egs that we read, divided by bottom_subsampling_factor). E.g. this + might frequently be 1 or 3. The frames at the output of the 'top' + nnet are evaluated for 't' values that are multiples of + 'top_subsampling_factor', starting from t=0. + @param [in] deriv_weights Per-frame weights that will be applied to the derivatives + w.r.t. the objective function. Dimension is expected to be either + input.NumRows(), or zero (in which case it is treated the same as a + vector containing all ones). + @param [in] supervision The chain supervision object representing the objective + function at the output. Its num_sequences must equal the + num_sequences passed into this function as a separate argument. + @param [in] model_training_scale A scale we'll apply to the parameter changes + and max-change values when taking any step. This will be + referred to elsewhere as top_weight, or "tw" when present in + keys of egs in scp files; we'll have a separately specifiable + weight for the bottom nnet. If this is zero, we won't be training + the top model on this eg at all. + @param [out] input_deriv If non-NULL, the derivative of the objective function + w.r.t. the input features will be written to here (this function + will set it using Swap(), so you don't need to correctly size it). 
+ @return Returns true if it successfully trained on this minbiatch, + false on error (e.g. if a NaN was generated, which should + not really happen). + */ + bool Train(const CuMatrixBase &input, + int32 num_sequences, + int32 num_groups, + int32 first_input_t, + int32 top_subsampling_factor, + const VectorBase &deriv_weights, + const chain::Supervision &supervision, + BaseFloat model_training_scale, + CuMatrix *input_deriv = NULL); + + // Prints out the final stats, and return true if there was a nonzero count. + bool PrintTotalStats() const; + + + // Returns the total objective-function value for the adapted computation (if + // adapted == true), or the unadapted/speaker-independent computation + // otherwise, with the corresponding weight (which can be interpreted as a + // frame count) written to 'weight'. The returned value would normally be + // divided by 'weight' before being displayed. + BaseFloat GetTotalObjf(bool adapted, BaseFloat *weight) const; + + + // Calls kaldi::nnet3::ConsolidateMemory() on nnet_ and delta_nnet_; we do + // this after the first minibatch of training, to reduce fragmentation. + void ConsolidateMemory(); + + ~NnetChainaTopTrainer(); + private: + + // We use this as an index with which to look up computations, kind of like a + // lookaside buffer; it avoids creating a much larger structure with large + // vectors of Indexes in it. + struct ComputationStructure { + bool adapted; + bool train_model; + bool need_input_deriv; + int32 num_sequences; + int32 frames_per_sequence_in; + int32 frames_per_sequence_out; + int32 first_input_t; + int32 top_subsampling_factor; + inline bool operator == (const ComputationStructure &other) const { + return adapted == other.adapted && + train_model == other.train_model && + need_input_deriv == other.need_input_deriv && + num_sequences == other.num_sequences && + frames_per_sequence_in == other.frames_per_sequence_in && + frames_per_sequence_out == other.frames_per_sequence_out && + first_input_t == other.first_input_t && + top_subsampling_factor == other.top_subsampling_factor; + }; + ComputationStructure (const ComputationStructure &other) = default; + ComputationStructure &operator = ( + const ComputationStructure &other) = default; + /** + Constructor. + @param [in] adapted True if we want the outputs from "output" and + "output-xent", and false if we want the outputs from + "output-si" and "output-si-xent". + @param [in] train_model True if we will be training the acoustic + model with this example. + @param [in] need_input_deriv True if we need the derivative w.r.t. + the features that are the input to this computation. + @param [in] num_sequences The number of sequences in this minibatch + (a.k.a. the minibatch size). + @param [in] frames_per_sequence_in The number of frames for each sequence + of input features. They are assumed to be consecutively + numbered. + @param [in] frames_per_sequence_out The 'frames_per_sequence' in + the ChainSupervision object, i.e. the length of the + output sequences of the computation. + @param [in] first_input_t The first 't' value in the input + sequence; will normally be negative (corresponding to + the negative of the number of frames of left context). + @param [in] top_subsampling_factor Frame subsampling factor at the + output; e.g., 3 would mean we are evaluating the output + at frames t=0, t=3, and so on. 
+ */ + ComputationStructure(bool adapted, + bool train_model, + bool need_input_deriv, + int32 num_sequences, + int32 frames_per_sequence_in, + int32 frames_per_sequence_out, + int32 first_input_t, + int32 top_subsampling_factor); + }; + struct ComputationHasher { + inline size_t operator() (const ComputationStructure &s) const { + return (s.adapted ? 33 : 0) + + (s.train_model ? 333 : 0) + + size_t(s.num_sequences) + + 10 * size_t(s.frames_per_sequence_in) + + 100 * size_t(s.frames_per_sequence_out) + + 1000 * size_t(s.first_input_t) + + 10000 * size_t(s.top_subsampling_factor); + } + }; + + // This is a faster lookup mechanism for the computation than + // is provided by the compiler's inherent caching. + std::unordered_map, + ComputationHasher> computation_map_; + + // This wraps the call to the compiler. See constructor + // of struct ComputationStructure for more documentation. + std::shared_ptr GetComputation( + const ComputationStructure &s); + + + /** + This does the training on the unadapted branch ("si" / speaker-independent) + of the neural net. + @param [in] input The input features, as supplied to Train(). Order + of rows is: the first frame of all sequences; the + second frame of all sequences; and so on. + @param [in] computation The computation corresponding to the unadapted + branch of the nnet. + @param [in] supervision The chain supervision object. The nnet output + dimensions are worked out from this, as well as + using this object to compute the objective function. + @param [in] need_model_deriv True if we are training on this minibatch, + on the unadapted data-- i.e. if we need to compute + the model derivative. + @param [in] deriv_weights Weights to be applied to the derivatives for the + corresponding frames of the output (order is: + first frame for all sequences; second frame for + all sequences, etc.). May be stored with the + egs. If this is the empty vector or + --apply-deriv-weights=false, they won't be + appplied. + @param [out] posterior The posteriors from the numerator forward-backward + on the adaptation model will be written to here. + The number of frames will be the number of frames in + the output sequence (supervision.frames_per_sequence), + and the order is: all sequences' frame 0; then all + sequences' frame 1; and so on. + @param [out] input_deriv Derivative w.r.t. the input features; this will + be set via Swap(), if it is not NULL. Any weight to + (be applied e.g. opts_.unadapted_bottom_weight), + should be applied by the caller. + @return Returns true if the training went through successfully + (it should very rarely return false, e.g. if a NaN was generated). + */ + bool TrainUnadapted(const CuMatrixBase &input, + const NnetComputation &computation, + const chain::Supervision &supervision, + bool need_model_deriv, + const CuVectorBase &deriv_weights, + Posterior *posterior, + CuMatrix *input_deriv); + + /** + Does the adapted pass of training. + @param [in] computation The adapted version of the + computation (this one uses the outputs + "output" and "output-xent" instead of + "output-si" and "output-si-xent". + @param [in] supervision The chain supervision + object, containing information derived + from the numerator lattices. + @param [in] model_training_scale A scale we'll apply to the parameter changes + and max-change values when taking any step. This will be + referred to elsewhere as top_weight, or "tw" when present in + keys of egs in scp files; we'll have a separately specifiable + weight for the bottom nnet. 
If this is zero, we won't be training + the top model on this eg at all. + @param [in] deriv_weights Weights to be applied to the derivatives for the + corresponding frames of the output (order is: + first frame for all sequences; second frame for + all sequences, etc.). May be stored with the + egs. If this is the empty vector or + --apply-deriv-weights=false, they won't be + appplied. + @param [in] input The adapted input features. Provided as a non-const + pointer because it is consumed destructively (via Swap()). + @param [in,out] input_deriv If non-NULL, the + feature derivative w.r.t. the [speaker-adapted] input + features will be written to this location. It's + done via Swap(), so it doesn't have to be correctly + sized on entry. + @return + */ + bool TrainAdapted(const NnetComputation &computation, + const chain::Supervision &supervision, + BaseFloat model_training_scale, + const CuVectorBase &deriv_weights, + CuMatrix *input, + CuMatrix *input_deriv); + + // This function increments num_minibatches_processed_, but before + // doing so, if it notices that it is zero it makes certain calls + // to ConsolidateMemory() + void IncrementNumMinibatches(); + + std::string lang_name_; + + const NnetChainaTrainingOptions &opts_; + chain::DenominatorGraph den_graph_; + const differentiable_transform::DifferentiableTransformMapped &transform_; + CachingOptimizingCompiler compiler_; + + + Nnet *nnet_; + Nnet *delta_nnet_; // stores the change to the parameters on each training + // iteration. + + // These objects keep track of the objective-function values for the 4 + // outputs. We have the regular output (sequence objective) and the 'xent' + // output for cross-entropy regularization, and there are speaker independent + // (si) versions of those outputs also. + ObjectiveFunctionInfo output_si_objf_; + ObjectiveFunctionInfo output_si_xent_objf_; + ObjectiveFunctionInfo output_objf_; + ObjectiveFunctionInfo output_xent_objf_; + + // Number of minibatches processed. Note: we actually train the nnet twice + // per minibatch, because there are the speaker-independent and + // speaker-dependent passes. + int32 num_minibatches_processed_; + + // stats for max-change. This combines both speaker-independent and + // speaker-adapted phases of training, since we compute the gradient summed + // over both passes (with the unadapted derivatives weighted by + // opts_.unadapted_top_weight) before updating the model. + MaxChangeStats max_change_stats_; +}; + + + +/** + This object, which has a similar function to NnetChainTrainer, takes care of + evaluating and possibly training the 'bottom' model. +*/ +class NnetChainaBottomTrainer { + public: + /** + Constructor. + @param [in] opts Options class. This class maintains a reference to it, + so don't delete it. + @param [in,out] nnet The neural net we are training. Expected (for now) + to have an input called 'input' (corresponding to + the original input features and an output called + 'output' (corresponding to the embeddings). + */ + NnetChainaBottomTrainer(const NnetChainaTrainingOptions &opts, + Nnet *nnet); + + /** Train on one minibatch. + @param [in] num_sequences The number of sequences/chunks represented + in 'input' (a.k.a. the minibatch size). + @param [in] first_input_t The 't' value corresponding to the first input + frame (will normally be a negative number). The 't' values at + the input will be consecutive, and the number of frames per sequence + will equal input.NumRows() / num_sequences. 
Note: if the embeddings + are computed at a lower frame rate than the original features, we + renumber things to make the embeddings consecutive. + (Note: bottom_subsampling_factor was passed in in the constructor). + @param [in] first_output_t The 't' value corresponding to the first output + frame (will normally be a negative number, corresponding to the left + context we are giving to the 'top' model, since we assume that the + sequences have 't' values starting from 0). The 't' values at + the output will be separated by the 'bottom_subsampling_factor' + which was given to the constructor. (We'll renumber them + by dividing them by 'bottom_subsampling_factor' before giving + them to the 'top' network. + @param [in] frames_per_sequence_out The number of output frames per sequence. + This is determined by the context of the top and bottom nnets + and the "keep_embedding_context" config value. + @param [in] train_model True if we'll be training the bottom model + for this eg. If this is false, a backward pass will not be. + needed, and this function will return NULL + @param [in] input The input features, most likely raw MFCC or filterbank + features. A pointer, since it is consumed destructively + (via 'swap'). + @param [out] output The output will be written to here. Does not have + to be correctly sized (we'll copy using Swap()). + @return Returns the NnetComputer object that we did the computation with, + if train_model == true (otherwise, returns NULL). + The user should either pass this into Backward(), or delete it. + */ + NnetComputer* Forward(int32 num_sequences, + int32 first_input_t, + int32 first_output_t, + int32 frames_per_sequence_out, + bool train_model, + CuMatrix *input, + CuMatrix *output); + + + /** + Does the backward pass, which will do model training. This should only be + called if the bottom nnet needs to be trained. + @param [in] model_training_scale A scale we'll apply to the parameter changes, + l2 term and max-change values when taking the step.. This will be + referred to elsewhere as bottom_weight, or "bw" when present in + keys of egs in scp files; we'll have a separately specifiable + weight for the top nnet. If this is zero, we won't be training + the top model on this eg at all (and we'll expect 'false' to + have been passed in for the 'train_model' arg on the corresponding + call to Forward()). + @param [in] num_sequences The number of sequences (chunks) we had in this + minibatch-- needed for the application of l2. + @param [in] computer The computer object returned from the + forward pass. This function takes ownership of it and + will delete it when done with it. + @param [in] output_deriv The derivative w.r.t. the output of + the forward pass. It is consumed destructively + by this function. + + */ + void Backward(BaseFloat model_training_scale, + int32 num_sequences, + NnetComputer *computer, + CuMatrix *output_deriv); + + // Prints the max-change stats for the bottom nnet. + void PrintTotalStats() const; + + // Calls kaldi::nnet3::ConsolidateMemory() on nnet_ and delta_nnet_; we do + // this after the first minibatch of training, to reduce fragmentation. + void ConsolidateMemory(); + + ~NnetChainaBottomTrainer(); + private: + + // We use this as an index with which to look up computations, kind of like a + // lookaside buffer; it avoids creating a much larger structure with large + // vectors of Indexes in it. 
+ struct ComputationStructure { + bool train_model; + int32 num_sequences; + int32 frames_per_sequence_in; + int32 frames_per_sequence_out; + int32 first_input_t; + int32 first_output_t; + inline bool operator == (const ComputationStructure &other) const { + return train_model == other.train_model && + num_sequences == other.num_sequences && + frames_per_sequence_in == other.frames_per_sequence_in && + frames_per_sequence_out == other.frames_per_sequence_out && + first_input_t == other.first_input_t && + first_output_t == other.first_output_t; + }; + ComputationStructure (const ComputationStructure &other) = default; + ComputationStructure &operator = ( + const ComputationStructure &other) = default; + /** + Constructor. + @param [in] train_model True if we are going to train the bottom model. + @param [in] need_input_deriv True if we need the derivative w.r.t. + the features that are the input to this computation. + @param [in] num_sequences The number of sequences in this minibatch + (a.k.a. the minibatch size). + @param [in] frames_per_sequence_in The number of frames for each sequence + of input features. They are assumed to be consecutively + numbered. + @param [in] frames_per_sequence_out The 'frames_per_sequence' in + the ChainSupervision object, i.e. the length of the + output sequences of the computation. + @param [in] first_input_t The first 't' value in the input + sequence; will normally be negative (corresponding to + the negative of the number of frames of left context). + */ + ComputationStructure(bool train_model, + int32 num_sequences, + int32 frames_per_sequence_in, + int32 frames_per_sequence_out, + int32 first_input_t, + int32 first_output_t); + }; + struct ComputationHasher { + inline size_t operator() (const ComputationStructure &s) const { + return size_t(s.num_sequences) + + 10 * size_t(s.frames_per_sequence_in) + + 100 * size_t(s.frames_per_sequence_out) + + 1000 * size_t(s.first_input_t) + + 10000 * size_t(s.first_output_t); + } + }; + + // This is a faster lookup mechanism for the computation than + // is provided by the compiler's inherent caching. + std::unordered_map, + ComputationHasher> computation_map_; + + // This wraps the call to the compiler. See constructor + // of struct ComputationStructure for more documentation. + std::shared_ptr GetComputation( + const ComputationStructure &s); + + + + /** + Converts the format of the posterior from how it is at the output of the + network to how it is at the input (i.e. in the embedding space). + Basically, this will consist of padding with empty posteriors for the + "context frames", and possibly upsampling the posteriors (by just repeating + each one for, say, 3 frames, if top_subsampling_factor == 3). + + The number of frames per sequence at the output will equal + post_at_output.size() / num_sequences, and the number of frames per + sequence at the input will equal post_at_inptu->size() / num_sequences + (note: this means 'post_at_input is expected to be appropriately sized + when this function is called). + */ + void ConvertPosterior(const Posterior &post_at_output, + int32 num_sequences, + int32 first_input_t, + int32 top_subsampling_factor, + Posterior *post_at_input); + + const NnetChainaTrainingOptions opts_; + + Nnet *nnet_; + Nnet *delta_nnet_; // stores the change to the parameters on each training + // iteration. + + CachingOptimizingCompiler compiler_; + + // Number of minibatches processed. 
+ int32 num_minibatches_processed_; + + // stats for max-change + MaxChangeStats max_change_stats_; +}; + + + +/** + This class is for single-threaded training of neural nets using the 'chain' + model and our adaptation framework +*/ +class NnetChainaTrainer { + public: + /** + Constructor + @param [in] config Options class + @param [in] models Object that provides access to the models and + denominator FSTs, indexed as appropriate by language-id. + */ + NnetChainaTrainer(const NnetChainaTrainingOptions &config, + NnetChainaModels *models); + + /* Train on one minibatch. + @param [in] key The key the example had in the archive. This is + used to work out the language name. + @param [in] eg The example we are training on. It is expected + to have an input named 'input' (the features) and an + output named 'output' (containing the chain supervision + object). We'll make use of the chunks_per_group member + of the NnetChainSupervision object, which is not used + outside the 'chaina' framework. + */ + void Train(const std::string &key, + const NnetChainExample &eg); + + // Prints out the final stats, and return true if there was a nonzero count. + bool PrintTotalStats() const; + + // Returns the total objective-function value, summed over all languages + // present, for the adapted computation (if adapted == true), or the + // unadapted/speaker-independent computation otherwise, with the corresponding + // weight (which can be interpreted as a frame count) written to 'weight'. + // The returned value would normally be divided by 'weight' before being + // displayed. + BaseFloat GetTotalObjf(bool adapted, BaseFloat *weight) const; + + // Prints out the max-change stats (if nonzero): the percentage of time that + // per-component max-change and global max-change were enforced. + void PrintMaxChangeStats() const; + + ~NnetChainaTrainer(); + + private: + + void GetContextInfo(const std::string &lang, + int32 *bottom_left_context, + int32 *bottom_right_context, + int32 *top_left_context, + int32 *top_right_context); + + + NnetChainaTopTrainer *GetTopTrainerForLang(const std::string &lang); + + + const NnetChainaTrainingOptions &opts_; + // pointer to object owned outside this class. + NnetChainaModels *models_; + + // left and right context of bottom model. + int32 bottom_left_context_; + int32 bottom_right_context_; + + NnetChainaBottomTrainer bottom_trainer_; + // map from language name (e.g. "default", "english", "french") to + // the object that trains the corresponding 'top' nnet. + std::unordered_map top_trainers_; +}; + + +/** + This utility function, used in training and test-time adaptation code, + converts the format of the posterior from how it is at the output of the + top network to how it is at the input (i.e. in the embedding space). + Basically, this will consist of padding with empty posteriors for the + "context frames", and possibly upsampling the posteriors (by just repeating + each one for, say, 3 frames, if top_subsampling_factor == 3). The + rule we'll use is: copy the posterior from the output frame that + is closest in numbering, rounding down in case of ties (i.e., for even + subsampling factor). + + @param [in] post_at_output The posterior that needs to be padded, + consisting of 'num_sequences' sequences, each with 't' + values starting at zero, at multiples of + 'top_subsampling_factor', and with number of 't' values + determined by: num_frames_out = post_at_output.size() / + num_sequences. 
The 't' has the larger stride than the + minibatch index 'n', so it's: frame t=0 of all sequences, + then frame t=1*top_subsampling_factor of all sequences, + and so on. + @param [in] num_sequences The number of sequences/chunks + @param [in] first_input_t The first 't' value at the input, for which + we need a posterior for (note: negative 't' values will + get zero posterior). Implicitly, first_output_t = 0. + The number of input frames is worked out as + post_at_input->size() / num_sequences; the 't' values + at the input are assumed to be consecutive. + @param [in] top_subsampling_factor The number of frames with which + 't' values at the output are separated. + @param [in] pdf_map This is either the empty vector (meaning: + the DifferentiableTransform object deals with pdf-ids + directly), or it is a map from pdf-ids to cluster-ids. + This would actually be obtained from build-tree-two-level + after building a two-level tree, and it would be stored + in the .ada object. The actual class labels that + the DifferentiableTransform object deals with, will + be the values stored in 'pfd_map' (i.e. these cluster-ids). + @param [in] num_classes Provided for checking purposes only: the + number of classes that the DifferentiableTransform object + expects. If pdf_map is empty we expect this to be the + same as the number of pdf-ids (and the ints in + post_at_output to be in the range [0, num_classes - 1]). + If pdf_map is nonempty, we expect this to be the same + as the maximum element in pdf_map, plus one. + @param [out] post_at_input The posterior after padding and possibly + subsampling. Should have the correct size but its + elements are expected to be empty at entry. Like + post_at_output, the 't' has the larger stride than + the minibatch-index 'n'. +*/ +void ConvertPosterior(const Posterior &post_at_output, + int32 num_sequences, + int32 first_input_t, + int32 top_subsampling_factor, + const std::vector &pdf_map, + int32 num_classes, + Posterior *post_at_input); + + + +} // namespace nnet3 +} // namespace kaldi + +#endif // KALDI_NNET3_NNET_CHAINA_TRAINING_H_ diff --git a/src/nnet3a/nnet-chaina-utils-test.cc b/src/nnet3a/nnet-chaina-utils-test.cc new file mode 100644 index 00000000000..6dd9a942ad7 --- /dev/null +++ b/src/nnet3a/nnet-chaina-utils-test.cc @@ -0,0 +1,57 @@ +// nnet3/nnet-chaina-utils-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
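+// This program contains unit tests for the query-string parsing helpers
+// declared in nnet3a/nnet-chaina-utils.h (see ParseFromQueryString()).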
+ +#include "nnet3a/nnet-chaina-utils.h" + +namespace kaldi { +namespace nnet3 { + +void UnitTestParseFromQueryString(){ + std::string value; + KALDI_ASSERT(ParseFromQueryString("abc", "d", &value) == false); + KALDI_ASSERT(ParseFromQueryString("abc?e=f", "d", &value) == false); + KALDI_ASSERT(ParseFromQueryString("abc?d=f", "d", &value) == true && + value == "f"); + KALDI_ASSERT(ParseFromQueryString("abc?dd=f", "d", &value) == false); + KALDI_ASSERT(ParseFromQueryString("abc?dd=f&d=gab", "d", &value) == true && + value == "gab"); + KALDI_ASSERT(ParseFromQueryString("abc?d=f&dd=gab", "d", &value) == true && + value == "f"); + KALDI_ASSERT(ParseFromQueryString("abc?d=f&ex=fda&dd=gab", "ex", &value) == true && + value == "fda"); + + + BaseFloat f; + KALDI_ASSERT(ParseFromQueryString("abc?d=f&ex=1.0&dd=gab", "ex", &f) == true && + f == 1.0); + KALDI_ASSERT(ParseFromQueryString("abc?d=f&ex=1.0&dd=gab", "e", &f) == false); +} + +} // namespace nnet3 +} // namespace kaldi + +int main() { + using namespace kaldi; + using namespace kaldi::nnet3; + SetVerboseLevel(2); + UnitTestParseFromQueryString(); + KALDI_LOG << "Tests succeeded."; + + return 0; +} diff --git a/src/nnet3a/nnet-chaina-utils.cc b/src/nnet3a/nnet-chaina-utils.cc new file mode 100644 index 00000000000..a83097395de --- /dev/null +++ b/src/nnet3a/nnet-chaina-utils.cc @@ -0,0 +1,186 @@ +// nnet3/nnet-chaina-utils.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "nnet3/nnet-utils.h" +#include "nnet3a/nnet-chaina-utils.h" + +namespace kaldi { +namespace nnet3 { + +void FindChainaExampleStructure(const NnetChainExample &eg, + int32 *num_sequences, + int32 *chunks_per_spk, + int32 *first_input_t, + int32 *num_input_frames, + int32 *num_output_frames, + int32 *frame_subsampling_factor, + int32 *eg_left_context, + int32 *eg_right_context) { + if (eg.inputs.size() != 1 || + eg.inputs[0].name != "input") + KALDI_ERR << "Expected eg to have exactly one input, named 'input'"; + + if (eg.outputs.size() != 1 || + eg.outputs[0].name != "output") + KALDI_ERR << "Expected eg to have exactly one output, named 'output'"; + + + const NnetChainSupervision &supervision = eg.outputs[0]; + *num_sequences = supervision.supervision.num_sequences; + *chunks_per_spk = supervision.chunks_per_group; + + KALDI_ASSERT(supervision.indexes.size() % *num_sequences == 0 && + !supervision.indexes.empty()); + KALDI_ASSERT(supervision.indexes[0] == Index() && + "Expected first index to have t=0,n=0,x=0"); + // We expect t to have the larger stride. 
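+  // i.e. the ordering is (t=0,n=0), (t=0,n=1), ..., (t=0,n=num_sequences-1),
+  // then the same block for the next output 't'; hence the second index should
+  // have n == 1 (and the same 't' as the first).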
+ KALDI_ASSERT(supervision.indexes[1].n == 1 && + "Supervision is in an unexpected order"); + Index last_output_index = supervision.indexes.back(); + KALDI_ASSERT(last_output_index.n == *num_sequences - 1); + *num_output_frames = int32(supervision.indexes.size()) / *num_sequences; + int32 last_output_t = last_output_index.t; + KALDI_ASSERT(last_output_t % (*num_output_frames - 1) == 0); + *frame_subsampling_factor = last_output_t / (*num_output_frames - 1); + + + const NnetIo &input_io = eg.inputs[0]; + *first_input_t = input_io.indexes[0].t; + if (input_io.indexes[1].t != *first_input_t + 1) { + KALDI_ERR << "Input indexes are in the wrong order or not consecutive: " + << input_io.indexes[1].t << " != " << (*first_input_t) << " + 1"; + } + Index last_input_index = input_io.indexes.back(); + KALDI_ASSERT(last_input_index.n == *num_sequences - 1); + int32 last_input_t = last_input_index.t; + *num_input_frames = last_input_t + 1 - *first_input_t; + + *eg_left_context = -(*first_input_t); + *eg_right_context = last_input_t - last_output_t; +} + + +bool ParseFromQueryString(const std::string &string, + const std::string &key_name, + std::string *value) { + size_t question_mark_location = string.find_last_of("?"); + if (question_mark_location == std::string::npos) + return false; + std::string key_name_plus_equals = key_name + "="; + // the following do/while and the initialization of key_name_location is a + // little convoluted. We want to find "key_name_plus_equals" but if we find + // it and it's not preceded by '?' or '&' then it's part of a longer key and we + // need to ignore it and see if there's a next one. + size_t key_name_location = question_mark_location; + do { + key_name_location = string.find(key_name_plus_equals, + key_name_location + 1); + } while (key_name_location != std::string::npos && + key_name_location != question_mark_location + 1 && + string[key_name_location - 1] != '&'); + + if (key_name_location == std::string::npos) + return false; + size_t value_location = key_name_location + key_name_plus_equals.length(); + size_t next_ampersand = string.find_first_of("&", value_location); + size_t value_len; + if (next_ampersand == std::string::npos) + value_len = std::string::npos; // will mean "rest of string" + else + value_len = next_ampersand - value_location; + *value = string.substr(value_location, value_len); + return true; +} + + +bool ParseFromQueryString(const std::string &string, + const std::string &key_name, + BaseFloat *value) { + std::string s; + if (!ParseFromQueryString(string, key_name, &s)) + return false; + bool ans = ConvertStringToReal(s, value); + if (!ans) + KALDI_ERR << "For key " << key_name << ", expected float but found '" + << s << "', in string: " << string; + return true; +} + + +bool ComputeEmbeddingTimes(int32 first_input_t, + int32 num_input_frames, + int32 num_output_frames, + int32 frame_subsampling_factor, + int32 bottom_subsampling_factor, + int32 bottom_left_context, + int32 bottom_right_context, + int32 top_left_context, + int32 top_right_context, + bool keep_embedding_context, + int32 *first_embedding_t, + int32 *num_embedding_frames) { + KALDI_ASSERT(num_input_frames > 0 && num_output_frames > 0 && + first_input_t <= 0 && frame_subsampling_factor > 0); + KALDI_ASSERT(bottom_subsampling_factor > 0 && + frame_subsampling_factor % bottom_subsampling_factor == 0); + KALDI_ASSERT(bottom_left_context >= 0 && bottom_right_context >= 0 && + top_left_context >= 0 && top_right_context >= 0); + + // below '_subsampled' means after dividing the 
't' values by + // 'bottom_subsampling_factor'. + // Note: implicitly, the first frame required at the output is t=0. + int32 first_required_embedding_t_subsampled = -top_left_context, + last_required_embedding_t_subsampled = + num_output_frames - 1 + top_right_context; + + int32 first_computable_embedding_t = first_input_t + bottom_left_context, + last_computable_embedding_t = + first_input_t + num_input_frames - 1 - bottom_right_context; + + int32 b = bottom_subsampling_factor; + + // By adding b - 1 and doing division that rounds down (towards negative + // infinity, we effectively round up when computing + // first_computable_embedding_t / b, which is appropriate because + // we need the first multiple of b that's actually computable. + int32 first_computable_embedding_t_subsampled = + DivideRoundingDown(first_computable_embedding_t + b - 1, b), + last_computable_embedding_t_subsampled = + DivideRoundingDown(last_computable_embedding_t, b); + if (first_computable_embedding_t_subsampled > first_required_embedding_t_subsampled || + last_computable_embedding_t_subsampled < last_required_embedding_t_subsampled) { + KALDI_WARN << "The training examples have insufficient context vs. the models."; + return false; + } + if (keep_embedding_context) { + *first_embedding_t = first_computable_embedding_t_subsampled * b; + *num_embedding_frames = 1 + last_computable_embedding_t_subsampled - + first_computable_embedding_t_subsampled; + } else { + *first_embedding_t = first_required_embedding_t_subsampled * b; + *num_embedding_frames = 1 + last_required_embedding_t_subsampled - + first_required_embedding_t_subsampled; + } + return true; +} + + + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3a/nnet-chaina-utils.h b/src/nnet3a/nnet-chaina-utils.h new file mode 100644 index 00000000000..4f028a4af0b --- /dev/null +++ b/src/nnet3a/nnet-chaina-utils.h @@ -0,0 +1,182 @@ +// nnet3a/nnet-chaina-utils.h + +// Copyright 2015-2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET3_NNET_CHAINA_UTILS_H_ +#define KALDI_NNET3_NNET_CHAINA_UTILS_H_ + +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-computation.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-optimize.h" +#include "nnet3/nnet-chain-example.h" +#include "nnet3/nnet-training.h" +#include "chain/chain-training.h" +#include "chain/chain-den-graph.h" + +namespace kaldi { +namespace nnet3 { + +/** + This function works out certain structural information from an example for + 'chaina' (adapted chain) training. It assumes (and spot-checks) that the eg + has a single input, called 'input', with a regular structure where the 'n' + has the highest stride so it's: all frames for sequence 0; all frames for + sequence 1; and so on. 
It will raise an exception if the example does not, + in some respect, have the expected structure. + + @param [in] The example we are getting the structural information from + @param [out] num_sequences The number of sequences/chunks (actually just + the num_sequences in the eg.supervision object). + @param [out] chunks_per_spk The number of chunks per speaker + (just eg.chunks_per_spk) + @param [out] first_input_t The lowest numbered 't' value in the inputs. + Usually will be negative. This function requires the + input 't' values to be consecutive, and will crash + if they are not. + @param [out] num_input_frames The number of input frames. The last input + 't' value will be first_input_t + num_input_frames - 1. + @param [out] num_output_frames The number of output frames (which are + assumed to start from t=0 and to be spaced by + 'frame_subsampling_factor. + @param [out] frame_subsampling_factor The spacing on the output frames, + equal to the amount of subsampling that happens + between the input and the output (this will + later be factorized as: + frame_subsampling_factor = + bottom_subsampling_factor * top_subsampling_factor. + @param [out] eg_left_context Just as a convenience, this function outputs + the left-context in the example, which equals + first_output_t - first_input_t = -first_input_t. + @param [out] eg_right_context Again just as a convenience, this function + outputs the right-context of the example, which + equals last_input_t - last_output_t = + (first_input_t + num_input_frames - 1) - + (first_output_t + num_output_frames - 1) * frame_subsampling_factor + (note: first_output_t is zero). +*/ +void FindChainaExampleStructure(const NnetChainExample &eg, + int32 *num_sequences, + int32 *chunks_per_spk, + int32 *first_input_t, + int32 *num_input_frames, + int32 *num_output_frames, + int32 *frame_subsampling_factor, + int32 *eg_left_context, + int32 *eg_right_context); + +/** + This function computes some info about which frames we need to compute the + embeddings for (i.e. which frames we need to request at the output of the + bottom nnet). It will print a warning and return false if the egs had + insufficient context to compute what is requested. + + @param [in] first_input_t The first 't' value for the input that + is provided to the bottom nnet. + @param [in] num_input_frames The number of input frames provided to + the bottom nnet; these are assumed to be consecutive. + @param [in] num_output_frames The number of output frames that we + need to compute the output for (this will be + the sequence_length in the chain supervision object). + @param [in] frame_subsampling_factor The factor by which we + subsample to get the final output (includes subsampling + in both the bottom and top nnet). + @param [in] bottom_subsampling_factor The amount of subsampling + for getting the embeddings (i.e. the embeddings + are obtained at t = multiples of this value.) + Must be >0 and divide frame_subsampling_factor. + This must be provided and can't be worked out from + the nnets, because the top nnet uses a different frame + numbering-- i.e. we divide the 't' values by + 'bottom_subsampling_factor' so that the inputs to the + top nnet are consecutive. This will make it easier + to apply the top nnet separately from binaries. 
+ @param [in] bottom_left_context The num-frames of left-context that the + bottom nnet requires + @param [in] bottom_right_context The num-frames of right-context that the + bottom nnet requires + @param [in] top_left_context The num-frames of left-context that the + top nnet requires. Note: this is *after* dividing the + 't' values by bottom_subsampling_factor, so the number + top_left_context * bottom_subsampling_factor can be used + to compute the total left-context that we need to put in + the egs. + @param [in] top_right_context The num-frames of right-context that the + top nnet requires. See docs for top_left_context for more + info RE frame subsampling + @param [in] keep_embedding_context True if we want to compute as + many frames of the embedding as we can given the amount + of available left context in the input. This will be + usually be set to true if the top nnet is recurrent or + can otherwise consume extra context. + @param [out] first_embedding_t First 't' value of the embedding. CAUTION: + this is in the original frame numbering (the one we use + for the bottom nnet), and will be a multiple of + 'bottom_subsampling_factor'. You need to divide by + 'bottom_subsampling_factor' to get the 't' value used + at the input of the top nnet. + @param [out] num_embedding_frames The number of embedding frames that + we are computing. + @return Returns true if it could successfully compute the output, + and false if it could not because of insufficient input + context. + */ +bool ComputeEmbeddingTimes(int32 first_input_t, + int32 num_input_frames, + int32 num_output_frames, + int32 frame_subsampling_factor, + int32 bottom_subsampling_factor, + int32 bottom_left_context, + int32 bottom_right_context, + int32 top_left_context, + int32 top_right_context, + bool keep_embedding_context, + int32 *first_embedding_t, + int32 *num_embedding_frames); + + +/** + This function parses a string value from a 'url-like' string (which is probably actually + a key value from an scp file). The general format this function handles is: + iiiiiiiiiiiiiiiiiii?aaa=xxxx&bbb=yyyy + where the only 'special characters' are '?' and '&'. This is modeled after a query + string in HTML. This function searches for a key name with the value 'key_name', + (e.g. 'aaa' or 'bbb' in the example), and if it exists, sets `value` to that value + (e.g. 'xxxx' or 'yyyy' in the example. If the string `string` has no '?' in it, + or the key name `key_name` is not present, this function returns false; otherwise, + it returns true and sets `value` to that value. + +*/ +bool ParseFromQueryString(const std::string &string, + const std::string &key_name, + std::string *value); + + +// This overloaded version of ParseFromQueryString()is for where a float value +// is required. If the key is present but cannot be turned into a float, it +// will raise an error. 
+bool ParseFromQueryString(const std::string &string, + const std::string &key, + BaseFloat *f); + + + +} // namespace nnet3 +} // namespace kaldi + +#endif // KALDI_NNET3_NNET_CHAINA_UTILS_H_ diff --git a/src/nnet3a/notes.update b/src/nnet3a/notes.update new file mode 100644 index 00000000000..c01aa208d50 --- /dev/null +++ b/src/nnet3a/notes.update @@ -0,0 +1,392 @@ +=== + + Meta-info for dumping egs: + only really need tree,trans_mdl,normalization.fst,den.fst + + +=== + + +Things needed per language in order to dump raw egs: + + Configuration values: + - left and right acoustic context + - frames_per_eg, frames_overlap_per_eg, frame_subsampling_factor, + alignment_subsampling_factor, constrained, compress, left_tolerance, + right_tolerance, lattice_lm_scale, lattice_prune_beam, acwt + + (Also: left_context_initial,right_context_initial... although + the use of these will make it harder to deal with setups with + little data per speaker). + + - tree, tree.map, 0.trans_mdl, normalization.fst (and probably den.fst + so we can save it in the egs dir). + + - Format of raw egs dir (we'll likely delete this right after creation): + + info.txt: + + dir_type raw_chaina_egs + num_chunks 120000 + num_leaves 6543 + frames_per_chunk 140,110,100 + num_input_frames_tot 432143218 + left_context 10 + left_context_initial 10 + right_context 10 + right_context_initial 10 + + ... various configuration values here... + ... need utt2spk and utt2uniq file ... + + # note: tree.map is optional, since the egs don't depend on it, but it will generally + # be generated with the tree. + misc/{tree,tree.map,0.trans_mdl,normalization.fst,den.fst} + + egs.scp will contain encodings like: + -----v1 + + - Format of merged-egs dir + + dir_type merged_chaina_egs + chunks_per_spk 4 + .. otherwise like raw one. misc/ directory contains similar things. + + - Format of final-egs dir (might be merged). + info.txt: + dir_type final_chain_egs + langs english french + num_input_frames_tot 432143218 + num_scp_files 24 + frames_per_scp_file 143241 + chunks_per_spk 4 + num_chunks xxxx + + den_fsts/ -> lang.fst + norm_fsts/ -> lang.fst + trees/lang.tree, ?lang.tree.map + trans_models/lang.trans_mdl + + + - Format of chain-training-input dir: + + - Two purposes: as input to the model training, and (if single language) as input for getting the egs? + + Need: + - the input models (bottom and top-per-language), the input .ada objects + - The trees per language? + - Options and the like + - List of languages + - Left and right context required for egs + - extra left/right context??? + + +================= + + + + - contains egs.N.{scp,ark}, which might be links to files in the storage dir. + + egs.scp + +The + + + + + +-- Extend nnet3-chain-copy-egs, to supply at least a minimum context in input features by + duplicating frames as needed. E.g. + --extend-left-context=12 --extend-right-context=10 + +============= +Plans for binaries. + + nnet3-adapt --init|--copy|--adapt + +================ + + steps/chaina/init_den_fst.sh + make den.fst, normalization.fst + + # Maybe just use nnet3-init in the scripts, to initialize the nnets, and + # copy them where they are needed. + + + # What's needed in a chain dir? + 0/bottom.raw,lang.mdl,lang.ada + + + steps/chaina/init_chain_dir.sh + make den.fst, normalization.fst, + bottom.config, top.config, + bottom.raw, top.raw + + init.config, init.raw, 0.trans_mdl, + final.config (but not 0.raw yet, might need egs first). + + +============ + +nnet3-get-egs? + ... 
Make sure the length info and left/right context of each eg is included in the id? + - when we merge, + + steps/chaina/get_raw_egs.sh + + -- need to decide utts-per-spk-max in validation data? do it in process_egs. + + + ... takes options like --utts-per-spk-max --num-utts-subset --frames-per-job + (prev. frames-per-iter), --chunks-per-group (e.g. 4) + + steps/chaina/process_egs.sh [options] + + [shouldn't need any info not already in raw_egs dir, I hope. We'll later have a + multilingual version of this script]. + + steps/chaina/process_egs.sh [options] + + +======== + Monolingual case (training): + + README.txt + bottom.raw default.ada default.mdl default.den + info -> mfcc.config?? Or other config? + info.txt? + frame_subsampling_factor1 + frame_subsampling_factor2 + frame_subsampling_factor +.. we'll need to pass in chain opts such as: + +[for chain objective] + --leaky-hmm-coefficient +[for the neural nets]: + --max-param-change-{bottom,top} + --print-interval + --l2-regularize-factor (use same one). + --train-bottom-nnet {true,false} + +==== + nnet3-copy-egs: maybe introduce an option to extend context? + +=== + +prepare_egs.sh... + - merging into speaker groups. done by python script. Originally we'll dump with: + + utterance-id-{num_frames_out}-{frame_subsampling_factor}-{left_context}-{right_context} + + - so the number of input frames would be + ((num_frames_out - 1) * frame_subsampling_factor) + 1 + left_context + right_context + + + utterance-id-{num_frames_out}-{frame_subsampling_factor}-{left_context}-{right_context} + +=== + +nnet3-chain-merge-egs --keep-distinct + + + ?aaa=xxxx&bbb=yyyy + + + + +``` +This copies nnet3+chain training examples from input to output, merging them +into composite examples. The --minibatch-size option controls how many egs +are merged into a single output eg. + +Usage: nnet3-chain-merge-egs [options] +e.g. +nnet3-chain-merge-egs --minibatch-size=128 ark:1.cegs ark:- | nnet3-chain-train-simple ... +See also nnet3-chain-copy-egs +``` + + + + BUT, we don't want to do this on minibatches + + +==== + - Merging egs: will already have merged into speaker groups in prepare_egs. + - Output names? output --> output-xent. + - Input names? Just input. (May add ivector later but I hope not to have to). + - Could modify nnet3-merge-egs to parse the keys and get weights and output + names (to keep the output names distinct and to incorporate the weights). + + -- Initially, in nnet3*get-egs, we'll dump with: + + utterance-id-{num_frames}-{left_context}-{right_context} + + We'll use that info, together with the speaker-id and utt2uniq information, to + merge chunks together into groups (preferably by utterance; if not, by speaker) + in process_egs.sh (the choice of which egs to merge will be done in python). + +====== + +====== + later, when + + The merging script will decide the key for the merged egs. + + process_egs.sh will dump these as archives *and* scp files, but they will now + be in groups of chunks_per_spk (e.g. 4). The language name will be added as the + last-but-two field in the key; we'll set it to 'default' by default, but it may + be changed in merge_egs.sh. The last two fields will be (1) a weight to be incorporated + just before the final merge (by nnet3-chain-merge-egs with the --interpret-keys + option), and (2) a weight to propagate back to the bottom network (if you want a + particular language to have less of an effect on the bottom network). 
+ + So the keys at the input to the final merge will be of the form: + {language-name}-{egs-weight}-{bottom-nnet-weight} + + And the keys at the output of the final merge would be of the form: + {language-name}-{bottom-nnet-weight}-0-0 + The 'egs-weight' (which becomes weight in the chain supervision objects, + which is a scale on the objective function) will already have been set + in the ChainSupervision object. + The 0 and 0 becom + + + + + info/chunks_per_spk + + We may also have a combine_egs.sh script which can combine egs from multiple + sources (assuming they have the same chunks_per_spk), and can assign them + to different language names if needed. + +==== + + Merging already-merged chain egs + + This is something that I am going to need for the new adaptation framework I am + working on. Currently in nnet-example-utils.cc and nnet-chain-example.cc, the + example-merging code does not support merging already-merged egs (search for already-merged). + This is something that I'm going to need to be supported at least in NnetChainExample, and + this would also need to be supported, I think, in the NnetExample merging code, since + I think the chain example merging code supports that code. If it would be helpful in + implementation, you may assume that all the egs to be merged have the same number + of 'n' values (e.g. it might be 4; it's the number of chunks per speaker that we use + for adaptation). + + After the examples have been merged I'd like a variable as follows to be set in + the NnetChainSupervision object: +``` + // This will be 1 in normal cases, but in the 'chaina' code (chain training + // with adaptation) it will be set to the number of chunks per speaker in + // this minibatch. For example if it's 4, then we are asserting that + // sequences n=0 through 3 all come from the same speaker, n=4 through 7 + // all come from the same speaker, and so on. + int32 chunks_per_spk; +``` +Please make sure this is 1 by default (e.g. in the constructor), that the +on-disk format stays the same when it's 1 (e.g. only write it if it's not 1) to +minimize code-version compatibility headaches; and only set it to +a value other than 1 when merging chain supervision objects that were +already merged (you can check that the sizes of the things being merged match). +We may later introduce such a variable in the NnetSupervision object, but +it's not needed just yet. + +This PR can go to my svd_draft branch in my personal repo, as it's part of +that project. +==== + +Interpreting keys when merging nnet and chain examples + +This is a change that will need to be made to nnet3-chain-merge-egs binary to support +the new adaptation framework. @hhadian, again, please get to this when you can but +it is not urgent at all. If someone else feels like they want to do it that's OK +with me too as long as you don't just sit on it without making progress, but please +have @hhadian check the code. +In ExampleMergingConfig, please add a new boolean config value, default false, registered +as follows: + + po->Register("interpret-keys", &interpret_keys, "If true, require the keys " + "on the example to end in something of the form -xxxxxx-yyy " + "where xxxxxx is a string with only letters, numbers and _, " + "which will be interpreted as a language-name (e.g. \"default\"," + "\"english\", \"french\"), and yyy is a floating point weight " + "e.g. 1.0, to be applied to the example. 
If the weight is not " + "1.0, then any NnetIo objects with names matching \"output\" and any chain " + "supervision objects will have their weights multiplied by this " + "weight. In addition, the merging will keep distinct language-names " + "distinct, and will ensure that the output keys end in -xxxxxx " + "where xxxxxx is the language-name. This is intended to support " + "the \"chaina\" adaptation framework.") + +and please make any implementation changes required to support it. When +weighting chain supervision objects, just multiply the 'weight' field in the +ChainSupervision object. I think when weighting NnetIo objects you can just +scale the GeneralMatrix, although I'm not sure if there is a generic way to do +that. (This probably only really makes sense with sparse supervision intended +to represent posterior probabilities in xent setups). Do this before merging; I +believe the chain merging code already checks for weight equality but you'll +have to also make sure it checks for network-name equality and encodes the +network name in the output key. I believe the output keys are currently not +really inspected so back compatibility won't be important. Also please make +sure there is a convenience function that makes it easy to extract the "xxxxxx" +network-name suffix from a chain example key; this will be needed in the +training code. + + + + +==== + + + info needed + ?den.fst? + + frame_subsampling_factor1 + frame_subsampling_factor2 + frame_subsampling_factor = their product. + + + separately: different den.fst's? one den.fst? +==== + Multilingual case (training): + + bottom.raw english.ada english.mdl <-- output vs. output_libri, output_wsj. No, will be too complicated (?) + ... just support one name. + spanish.ada spanish.mdl + +0.ada top.mdl + + +when randomizing + +we'll merge in a controlled way, e.g. nnet3-merge-egs --fixed +=== + + --bottom-subsampling-factor is the subsampling in the bottom + model (the feature extractor). frame-subsampling-factor + divided by this is the amount of subsampling in the top + model. In the training code we'll work this out from + the 't' values in the chain supervision object, and + the top network will actually run at the reduced frame + rate. + --keep-embedding-context is true if the top network is + recurrent and therefore we need to keep as much extra + context as possible in the features. + +How to work out the computations: + + We get the number of n values and the first and last 't' values in the input; + check they are contiguous. + + We get the number of 't' values in the output (the chain supervision) and + their spacing; this is interpreted as the frame-subsampling-factor, which is + not passed directly to nnet3-chaina-train. + + We are given the --bottom-subsampling-factor and (boolean) + top_network_is_recurrent. + + We work out the left-context and right-context of the bottom and + top networks. We first use this on the top network to work out, at the + top-network frame rate, the 't' values needed at the input + (e.g. frames -10 through 159 assuming the chunk size is 150 and + the network takes +-10 frames of context). 
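+
+  Worked example of the context computation (illustrative numbers only, not
+  taken from any real setup): suppose frame_subsampling_factor =
+  bottom_subsampling_factor = 3 (so the top network runs at the embedding frame
+  rate), the chunk has 150 output frames, and the top network needs +-10 frames
+  of context at that rate (the "-10 through 159" example above); also assume,
+  purely for illustration, that the bottom network needs +-12 frames of context
+  at the input frame rate.  Then the egs must supply input 't' values -42
+  through 489, and ComputeEmbeddingTimes() (declared in
+  nnet3a/nnet-chaina-utils.h) gives embeddings at t = -30, -27, ..., 477:
+
+```
+// Sketch only; the numbers are made up to match the worked example above.
+#include "base/kaldi-common.h"
+#include "nnet3a/nnet-chaina-utils.h"
+
+int main() {
+  using namespace kaldi;
+  using namespace kaldi::nnet3;
+  int32 first_embedding_t, num_embedding_frames;
+  bool ok = ComputeEmbeddingTimes(
+      /*first_input_t*/ -42, /*num_input_frames*/ 532,
+      /*num_output_frames*/ 150, /*frame_subsampling_factor*/ 3,
+      /*bottom_subsampling_factor*/ 3,
+      /*bottom_left_context*/ 12, /*bottom_right_context*/ 12,
+      /*top_left_context*/ 10, /*top_right_context*/ 10,
+      /*keep_embedding_context*/ false,
+      &first_embedding_t, &num_embedding_frames);
+  // The required embedding range at the subsampled rate is [-10, 159], i.e.
+  // t = -30 ... 477 at the input frame rate, in steps of 3 (170 frames).
+  KALDI_ASSERT(ok && first_embedding_t == -30 && num_embedding_frames == 170);
+  return 0;
+}
+```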
diff --git a/src/nnet3abin/Makefile b/src/nnet3abin/Makefile new file mode 100644 index 00000000000..224c45a5bcd --- /dev/null +++ b/src/nnet3abin/Makefile @@ -0,0 +1,26 @@ + +all: +EXTRA_CXXFLAGS = -Wno-sign-compare +include ../kaldi.mk + +LDFLAGS += $(CUDA_LDFLAGS) +LDLIBS += $(CUDA_LDLIBS) + +BINFILES = nnet3-adapt nnet3-chaina-train nnet3-chaina-combine + +OBJFILES = + +# Add this dependency to force cuda-compiled.o to be rebuilt when we reconfigure. +cuda-compiled.o: ../kaldi.mk + +TESTFILES = + +ADDLIBS = ../nnet3a/kaldi-nnet3a.a ../adapt/kaldi-adapt.a ../nnet3/kaldi-nnet3.a \ + ../chain/kaldi-chain.a \ + ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \ + ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ + ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a + +include ../makefiles/default_rules.mk diff --git a/src/nnet3abin/nnet3-adapt.cc b/src/nnet3abin/nnet3-adapt.cc new file mode 100644 index 00000000000..8bd6570bf6f --- /dev/null +++ b/src/nnet3abin/nnet3-adapt.cc @@ -0,0 +1,265 @@ +// nnet3abin/nnet3-adapt.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3/nnet-nnet.h" +#include "hmm/transition-model.h" +#include "adapt/differentiable-transform-itf.h" +#include "nnet3a/nnet-chaina-training.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + using namespace kaldi::differentiable_transform; + typedef kaldi::int32 int32; + + const char *usage = + "This binary supports various modes that manipulate transform objects for\n" + "the nnet3a/chaina adaptation framework. See patterns below\n" + "\n" + "Usage: nnet3-adapt [options] init [] \n" + "(e.g.: nnet3-adapt --num-classes=201 init init.aconfig 0.ada)\n" + " or: nnet3-adapt init init.aconfig tree.map 0.ada\n" + " or: nnet3-adapt [options] copy \n" + "(e.g.: nnet3-adapt copy --binary=false 0.ada 0.txt)\n" + " or: nnet3-adapt info \n" + "(e.g.: nnet3-adapt info 0.ada\n" + " or: nnet3-adapt estimate ... \n" + " .. which sums stats and calls Estimate(), to get the final class-dependent means... \n" + "(e.g. nnet3-adapt estimate foo/final/default.{1,2,3,4,5,6}.ada foo/final/default.ada\n" + " or: nnet3-adapt [options] get-transforms \n" + " ... 
which estimates and dumps speaker-specific transforms as matrices, which\n" + " could be applied to the features with transform-feats; if you want\n" + " utterance-specific transforms, make spk2utt a one-to-one map.\n" + " is a wspecifier where matrices will be written.\n" + "(e.g.: nnet3-adapt final.ada spk2utt ark:- ark:feats.scp ark:1.trans)\n" + "\n" + "See also: nnet3-chaina-train\n"; + + bool binary_write = true; + bool remove_pdf_map = false; + int32 num_classes = -1; + int32 iter = 0; + int32 frame_subsampling_factor = 1; + + ParseOptions po(usage); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("num-classes", &num_classes, + "For 'init' command: number of classes the transform will " + "use (required if is not supplied)."); + po.Register("remove-pdf-map", &remove_pdf_map, + "For the 'copy' command: if true, the pdf_map will be " + "removed so that the transform will be based on " + "pdf-ids."); + po.Register("iter", &iter, "Only for the 'estimate' command: iteration " + "of estimation, will always be 0 in most setups."); + po.Register("frame-subsampling-factor", &frame_subsampling_factor, + "Factor by which the posteriors we read are subsampled relative " + "to the features (only for the get-transforms command). " + "Will correspond to the top-subsampling-factor," + "which, in chaina scripts, refers to frame_subsampling_factor " + "divided by bottom_subsampling_factor"); + + po.Read(argc, argv); + + + if (po.GetOptArg(1) == "init" && po.NumArgs() == 3) { + // This block does the "init" command where the tree.map was not provided. + if (num_classes <= 0) + KALDI_ERR << "The --num-classes option is required with the " + "'init' command."; + std::string config_rxfilename = po.GetArg(2), + transform_wxfilename = po.GetArg(3); + bool binary_in; // should be false. + Input ki(config_rxfilename, &binary_in); + DifferentiableTransformMapped transform; + + transform.transform = DifferentiableTransform::ReadFromConfig( + ki.Stream(), num_classes); + + WriteKaldiObject(transform, transform_wxfilename, binary_write); + return 0; + } else if (po.GetOptArg(1) == "init" && po.NumArgs() == 4) { + // This block does the "init" command where the tree.map was provided. + std::string config_rxfilename = po.GetArg(2), + tree_map_rxfilename = po.GetArg(3), + transform_wxfilename = po.GetArg(4); + + DifferentiableTransformMapped transform; + { // This block reads transform.pdf_map and sets up num_classes. + bool binary_in; + Input ki(tree_map_rxfilename, &binary_in); + ReadIntegerVector(ki.Stream(), binary_in, &(transform.pdf_map)); + if (transform.pdf_map.empty()) + KALDI_ERR << "Expected to be nonempty vector."; + int32 expected_num_classes = + 1 + *std::max_element(transform.pdf_map.begin(), + transform.pdf_map.end()); + if (num_classes > 0 && num_classes != expected_num_classes) + KALDI_ERR << "The --num-classes given via the option " << num_classes + << " differs from the expected value given the tree-map: " + << expected_num_classes; + num_classes = expected_num_classes; + } + + bool binary_in; // should be false. 
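+      // (the .aconfig file is a text config, so no binary header is expected;
+      // ReadFromConfig() below parses the text stream.)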
+ Input ki(config_rxfilename, &binary_in); + transform.transform = DifferentiableTransform::ReadFromConfig( + ki.Stream(), num_classes); + WriteKaldiObject(transform, transform_wxfilename, binary_write); + return 0; + } else if (po.GetOptArg(1) == "info" && po.NumArgs() == 2) { + std::string transform_rxfilename = po.GetArg(2); + DifferentiableTransformMapped transform; + ReadKaldiObject(transform_rxfilename, &transform); + std::cout << transform.Info(); + return 0; + } else if (po.GetOptArg(1) == "copy" && po.NumArgs() == 3) { + std::string transform_rxfilename = po.GetArg(2), + transform_wxfilename = po.GetArg(3); + DifferentiableTransformMapped transform; + ReadKaldiObject(transform_rxfilename, &transform); + if (remove_pdf_map) { + if (transform.pdf_map.empty()) { + KALDI_WARN << "--remove-pdf-map option: transform does not have a pdf-map."; + } else { + transform.transform->SetNumClasses(transform.pdf_map.size()); + transform.pdf_map.clear(); + } + } + WriteKaldiObject(transform, transform_wxfilename, binary_write); + return 0; + } else if (po.GetOptArg(1) == "estimate" && po.NumArgs() >= 3) { + DifferentiableTransformMapped transform; + std::string transform_rxfilename = po.GetArg(2); + ReadKaldiObject(transform_rxfilename, &transform); + for (int32 i = 3; i < po.NumArgs(); i++) { + std::string other_transform_rxfilename = po.GetArg(i); + DifferentiableTransformMapped other_transform; + ReadKaldiObject(other_transform_rxfilename, &other_transform); + // sum the stats. + transform.transform->Add(*(other_transform.transform)); + } + transform.transform->Estimate(iter); + std::string transform_wxfilename = po.GetArg(po.NumArgs()); + WriteKaldiObject(transform, transform_wxfilename, binary_write); + return 0; + } else if (po.GetOptArg(1) == "get-transforms" && po.NumArgs() == 6) { + std::string transform_rxfilename = po.GetArg(2), + spk2utt_rspecifier = po.GetArg(3), + feats_rspecifier = po.GetArg(4), + post_rspecifier = po.GetArg(5), + transforms_wspecifier = po.GetArg(6); + + DifferentiableTransformMapped transform; + ReadKaldiObject(transform_rxfilename, &transform); + SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); + RandomAccessPosteriorReader post_reader(post_rspecifier); + RandomAccessBaseFloatMatrixReader feature_reader(feats_rspecifier); + BaseFloatMatrixWriter transform_writer(transforms_wspecifier); + int32 num_done = 0, num_no_post = 0, num_other_error = 0; + + for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { + std::unique_ptr stats( + transform.transform->GetEmptySpeakerStats()); + std::string spk = spk2utt_reader.Key(); + bool got_stats = false; + const std::vector &uttlist = spk2utt_reader.Value(); + for (size_t i = 0; i < uttlist.size(); i++) { + std::string utt = uttlist[i]; + if (!feature_reader.HasKey(utt)) { + KALDI_WARN << "Did not find features for utterance " << utt; + num_other_error++; + continue; + } + if (!post_reader.HasKey(utt)) { + KALDI_WARN << "Did not find posteriors for utterance " << utt; + num_no_post++; + continue; + } + const Matrix &feats = feature_reader.Value(utt); + const Posterior &post_in = post_reader.Value(utt); + Posterior post_upsampled(feats.NumRows()); + const Posterior *post_to_use = NULL; + if (frame_subsampling_factor != 1 || !transform.pdf_map.empty()) { + ConvertPosterior( + post_in, 1, 0, frame_subsampling_factor, transform.pdf_map, + transform.transform->NumClasses(), &post_upsampled); + post_to_use = &post_upsampled; + } else { + KALDI_ASSERT(post_in.size() == size_t(feats.NumRows()) && + "Mismatch in 
posterior vs. feats dimension"); + post_to_use = &post_in; + } + transform.transform->TestingAccumulate(feats, *post_to_use, stats.get()); + got_stats = true; + num_done++; + } + if (!got_stats) { + KALDI_WARN << "Got no stats for speaker " << spk; + } else { + stats->Estimate(); + int32 dim = transform.transform->Dim(); + Matrix transform_mat(dim, dim + 1, kUndefined); + transform.transform->GetTransformAsMatrix(*stats, &transform_mat); + transform_writer.Write(spk, transform_mat); + } + } + KALDI_LOG << "Done " << num_done << " files, " << num_no_post + << " with no posts, " << num_other_error << " with other errors."; + return (num_done != 0 && num_done > (num_no_post + num_other_error)) ? 0 : 1; + } else { + po.PrintUsage(); + exit(1); + } + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + +/* +Test script: + +cat < >& keys_and_egs, + NnetChainaModels *models) { + KALDI_ASSERT(!opts.top.train && !opts.bottom.train); + NnetChainaTrainer trainer(opts, models); + size_t num_egs = keys_and_egs.size(); + for (size_t i = 0; i < num_egs; i++) { + trainer.Train(keys_and_egs[i].first, keys_and_egs[i].second); + } + BaseFloat weight, adapted_objf, unadapted_objf; + adapted_objf = trainer.GetTotalObjf(true, &weight); + adapted_objf /= weight; + unadapted_objf = trainer.GetTotalObjf(false, &weight); + unadapted_objf /= weight; + BaseFloat ans = unadapted_objf_weight * unadapted_objf + + (1.0 - unadapted_objf_weight) * adapted_objf; + KALDI_LOG << "When averaging " << num_models_averaged + << " models, objf values (unadapted/si,adapted) " + << unadapted_objf << ", " << adapted_objf + << ", interpolated = " << ans << "; over " + << weight << " frames."; + return ans; +} + +void ReadExamples( + const std::string &egs_rspecifier, + std::vector > *keys_and_egs) { + keys_and_egs->reserve(10000); // reserve a lot of space to minimize the chance of + // reallocation. + SequentialNnetChainExampleReader example_reader(egs_rspecifier); + for (; !example_reader.Done(); example_reader.Next()) { + size_t i = keys_and_egs->size(); + keys_and_egs->resize(i + 1); + keys_and_egs->back().first = example_reader.Key(); + keys_and_egs->back().second.Swap(&(example_reader.Value())); + } + KALDI_LOG << "Read " << keys_and_egs->size() << " examples."; + KALDI_ASSERT(!keys_and_egs->empty()); +} + + +} // namespace nnet3 +} // namespace kaldi + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + using namespace kaldi::chain; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "This program does the final model-combination stage of 'chaina'\n" + "acoustic training: it averages over the last n models, where the\n" + "'n' is chosen (by this program) based on maximizing the objective\n" + "function on the data given to it. It maximizes the average of the\n" + "speaker-independent and speaker-dependent versions of the 'chain'\n" + "objective values.\n" + "This program is intended to be used with a GPU.\n" + "\n" + "Usage: nnet3-chaina-combine [options] ... \\\n" + " \n" + "\n" + " should contain bottom.raw, and .mdl for each language \n" + " (these will be averaged over a range of indexes including N, e.g. 
just modelN, or\n" + " modelN with model(N-1), and so on).\n" + " should contain .den.fst for each language \n" + " should contain .ada for each language \n" + " is a place to where bottom.mdl and .mdl for each language\n" + " that was seen in the egs, will be written (for , see the --job-id option).\n"; + + + int32 srand_seed = 0; + bool binary_write = true; + std::string use_gpu = "yes"; + NnetChainaTrainingOptions chaina_opts; + chaina_opts.top.train = false; + chaina_opts.bottom.train = false; + chaina_opts.top.dropout_test_mode = true; + chaina_opts.bottom.dropout_test_mode = true; + // But leave the batchnorm test-modes at false. + + // Setting batchnorm_stats_scale to 1.0 means it won't scale down the + // batchnorm stats as it goes (the default is 0.8), so they will all be + // remembered. Note: each time we initialize and use the trainer object, in + // GetObjectiveFunction, it will call ZeroComponentStats() for both the + // bottom and top models (assuming the options are the defaults), so only + // the stats from the most recent run will be present. + chaina_opts.nnet_config.batchnorm_stats_scale = 1.0; + + BaseFloat unadapted_objf_weight = 0.5; + + ParseOptions po(usage); + po.Register("srand", &srand_seed, "Seed for random number generator "); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + po.Register("unadapted-weight", &unadapted_objf_weight, + "The weight we give to the unadapted version of the objective function " + "when evaluating the goodness of models (the adapted objective gets " + "1 minus this value as its weight)"); + + + chaina_opts.Register(&po); + RegisterCuAllocatorOptions(&po); + + po.Read(argc, argv); + + srand(srand_seed); + + if (po.NumArgs() < 5) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + int32 n = po.NumArgs() - 4; // n is the number of models we have + // available to average. + + std::string last_model_in_dir = po.GetArg(n), + den_fst_dir = po.GetArg(n + 1), + transform_dir = po.GetArg(n + 2), + egs_rspecifier = po.GetArg(n + 3), + model_out_dir = po.GetOptArg(n + 4); + + NnetChainaModels models(chaina_opts, + last_model_in_dir, den_fst_dir, + transform_dir); + + + std::vector > keys_and_egs; + ReadExamples(egs_rspecifier, &keys_and_egs); + + // first evaluates the objective using the last model. 
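+    // Each subsequent pass folds one earlier model in with weight 1/num_models,
+    // so after k passes 'models' holds the simple average of the last k models
+    // (this assumes InterpolateWith(1/k, ...) forms
+    //  (1 - 1/k) * current + (1/k) * newly-read models).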
+ int32 best_num_to_combine = -1; + BaseFloat best_objf = -std::numeric_limits::infinity(), + single_model_objf; + + std::unique_ptr best_models; + + for (int32 num_models = 1; num_models <= n; num_models++) { + if (num_models > 1) + models.InterpolateWith(1.0 / num_models, po.GetArg(n + 1 - num_models)); + BaseFloat objf = GetObjectiveFunction(chaina_opts, unadapted_objf_weight, + num_models, keys_and_egs, &models); + if (objf > best_objf || num_models == 1) { + best_objf = objf; + best_models = std::unique_ptr( + new NnetChainaModels(models)); + best_num_to_combine = num_models; + if (num_models == 1) + single_model_objf = objf; + } + if (num_models > best_num_to_combine + 4 && num_models < n) + KALDI_LOG << "Stopping the search early as it looks like we found " + "the best combination"; + } + + KALDI_LOG << "Best objective function was " << best_objf << " with " + << best_num_to_combine << " models."; + KALDI_LOG << "About to recompute objective function with batchnorm in " + "test-mode:\n"; + chaina_opts.top.batchnorm_test_mode = true; + chaina_opts.bottom.batchnorm_test_mode = true; + + BaseFloat test_mode_objf = + GetObjectiveFunction(chaina_opts, unadapted_objf_weight, + best_num_to_combine, + keys_and_egs, + best_models.get()); + KALDI_LOG << "Objf with test-mode batchnorm was " << test_mode_objf + << " (vs. " << best_objf << " without test mode)"; + + KALDI_LOG << "Combination changed the objective from " + << single_model_objf << " with only the final model, to " + << best_objf << " with " << best_num_to_combine + << " models."; + + best_models->WriteCombinedModels(model_out_dir, binary_write); + +#if HAVE_CUDA==1 + CuDevice::Instantiate().PrintProfile(); +#endif + return 0; + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/nnet3abin/nnet3-chaina-train.cc b/src/nnet3abin/nnet3-chaina-train.cc new file mode 100644 index 00000000000..f6f98b6ffd3 --- /dev/null +++ b/src/nnet3abin/nnet3-chaina-train.cc @@ -0,0 +1,116 @@ +// nnet3bin/nnet3-chaina-train.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3a/nnet-chaina-training.h" +#include "cudamatrix/cu-allocator.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + using namespace kaldi::chain; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Train nnet3+chaina (i.e. chain + adaptation framework) neural network.\n" + "Minibatches are to be created by nnet3-chain-merge-egs in\n" + "the input pipeline. 
This training program is single-threaded (best to\n" + "use it with a GPU).\n" + "\n" + "Usage: nnet3-chaina-train [options] \n" + " []\n" + "\n" + " should contain bottom.raw, and .mdl for each language \n" + " should contain .den.fst for each language \n" + " should contain .ada for each language \n" + " is a place to where bottom..raw and ..raw for each language\n" + " that was seen in the egs, will be written (for , see the --job-id option).\n" + " If it is not specified, the trained models will not be written (e.g. when you are using\n" + " --bottom-model-test-mode=true --top-model-test-mode=true and only want diagnostics).\n"; + + + int32 srand_seed = 0; + bool binary_write = true; + std::string use_gpu = "yes"; + NnetChainaTrainingOptions chaina_opts; + int32 job_id = 0; + + ParseOptions po(usage); + po.Register("srand", &srand_seed, "Seed for random number generator "); + po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + po.Register("job-id", &job_id, + "Job identifier, helps to determine pathnames of models written " + "to ."); + + chaina_opts.Register(&po); + RegisterCuAllocatorOptions(&po); + + po.Read(argc, argv); + + srand(srand_seed); + + if (po.NumArgs() < 4 || po.NumArgs() > 5) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + bool ok; + + std::string model_in_dir = po.GetArg(1), + den_fst_dir = po.GetArg(2), + transform_dir = po.GetArg(3), + egs_rspecifier = po.GetArg(4), + model_out_dir = po.GetOptArg(5); + + NnetChainaModels models(chaina_opts, + model_in_dir, den_fst_dir, transform_dir); + + { + NnetChainaTrainer trainer(chaina_opts, &models); + + SequentialNnetChainExampleReader example_reader(egs_rspecifier); + + for (; !example_reader.Done(); example_reader.Next()) + trainer.Train(example_reader.Key(), + example_reader.Value()); + + ok = trainer.PrintTotalStats(); + } + if (po.NumArgs() == 5) + models.Write(model_out_dir, binary_write, job_id); + +#if HAVE_CUDA==1 + CuDevice::Instantiate().PrintProfile(); +#endif + return (ok ? 
0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} diff --git a/src/transform/Makefile b/src/transform/Makefile index a265db6ac37..194f362f11a 100644 --- a/src/transform/Makefile +++ b/src/transform/Makefile @@ -15,6 +15,6 @@ OBJFILES = regression-tree.o regtree-mllr-diag-gmm.o lda-estimate.o \ LIBNAME = kaldi-transform ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/util/text-utils-test.cc b/src/util/text-utils-test.cc index 5bfe4cb24d0..3b58f4f1dd1 100644 --- a/src/util/text-utils-test.cc +++ b/src/util/text-utils-test.cc @@ -2,6 +2,7 @@ // Copyright 2009-2011 Microsoft Corporation // 2017 Johns Hopkins University (author: Daniel Povey) +// 2015 Vimal Manohar (Johns Hopkins University) // See ../../COPYING for clarification regarding multiple authors // @@ -324,6 +325,193 @@ void TestStringsApproxEqual() { KALDI_ASSERT(!StringsApproxEqual("x 1.0 y", "x 1.0001 y", 4)); } +void UnitTestConfigLineParse() { + std::string str; + { + ConfigLine cfl; + str = "a-b xx=yyy foo=bar baz=123 ba=1:2"; + bool status = cfl.ParseLine(str); + KALDI_ASSERT(status && cfl.FirstToken() == "a-b"); + + KALDI_ASSERT(cfl.HasUnusedValues()); + std::string str_value; + KALDI_ASSERT(cfl.GetValue("xx", &str_value)); + KALDI_ASSERT(str_value == "yyy"); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(cfl.GetValue("foo", &str_value)); + KALDI_ASSERT(str_value == "bar"); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(!cfl.GetValue("xy", &str_value)); + KALDI_ASSERT(cfl.GetValue("baz", &str_value)); + KALDI_ASSERT(str_value == "123"); + + std::vector int_values; + KALDI_ASSERT(!cfl.GetValue("xx", &int_values)); + KALDI_ASSERT(cfl.GetValue("baz", &int_values)); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(int_values.size() == 1 && int_values[0] == 123); + KALDI_ASSERT(cfl.GetValue("ba", &int_values)); + KALDI_ASSERT(int_values.size() == 2 && int_values[0] == 1 && int_values[1] == 2); + KALDI_ASSERT(!cfl.HasUnusedValues()); + } + + { + ConfigLine cfl; + str = "a-b baz=x y z pp = qq ab =cd ac= bd"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "a-b baz=x y z pp = qq ab=cd ac=bd"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "foo-bar"; + KALDI_ASSERT(cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "foo-bar a=b c d f=g"; + std::string value; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "foo-bar" && + cfl.GetValue("a", &value) && value == "b c d" && + cfl.GetValue("f", &value) && value == "g" && + !cfl.HasUnusedValues()); + } + { + ConfigLine cfl; + str = "zzz a=b baz"; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "zzz" && + cfl.UnusedValues() == "a=b baz"); + } + { + ConfigLine cfl; + str = "xxx a=b baz "; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.UnusedValues() == "a=b baz"); + } + { + ConfigLine cfl; + str = "xxx a=b =c"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "xxx baz='x y z' pp=qq ab=cd ac=bd"; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "xxx"); + std::string str_value; + KALDI_ASSERT(cfl.GetValue("baz", &str_value)); + KALDI_ASSERT(str_value == "x y z"); + KALDI_ASSERT(cfl.GetValue("pp", &str_value)); + KALDI_ASSERT(str_value == "qq"); + KALDI_ASSERT(cfl.UnusedValues() == "ab=cd ac=bd"); + KALDI_ASSERT(cfl.GetValue("ab", &str_value)); + 
KALDI_ASSERT(str_value == "cd"); + KALDI_ASSERT(cfl.UnusedValues() == "ac=bd"); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(cfl.GetValue("ac", &str_value)); + KALDI_ASSERT(str_value == "bd"); + KALDI_ASSERT(!cfl.HasUnusedValues()); + } + + { + ConfigLine cfl; + str = "x baz= pp = qq flag=t "; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = " x baz= pp=qq flag=t "; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == "x"); + + std::string str_value; + KALDI_ASSERT(cfl.GetValue("baz", &str_value)); + KALDI_ASSERT(str_value == ""); + KALDI_ASSERT(cfl.GetValue("pp", &str_value)); + KALDI_ASSERT(str_value == "qq"); + KALDI_ASSERT(cfl.HasUnusedValues()); + KALDI_ASSERT(cfl.GetValue("flag", &str_value)); + KALDI_ASSERT(str_value == "t"); + KALDI_ASSERT(!cfl.HasUnusedValues()); + + bool bool_value = false; + KALDI_ASSERT(cfl.GetValue("flag", &bool_value)); + KALDI_ASSERT(bool_value); + } + + { + ConfigLine cfl; + str = "xx _baz=a -pp=qq"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "xx 0baz=a pp=qq"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "xx -baz=a pp=qq"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = "xx _baz'=a pp=qq"; + KALDI_ASSERT(!cfl.ParseLine(str)); + } + { + ConfigLine cfl; + str = " baz=g"; + KALDI_ASSERT(cfl.ParseLine(str) && cfl.FirstToken() == ""); + bool flag; + KALDI_ASSERT(!cfl.GetValue("baz", &flag)); + } + { + ConfigLine cfl; + str = "xx _baz1=a pp=qq"; + KALDI_ASSERT(cfl.ParseLine(str)); + + std::string str_value; + KALDI_ASSERT(cfl.GetValue("_baz1", &str_value)); + } +} + +void UnitTestReadConfig() { + std::string str = "a-b alpha=aa beta=\"b b\"# String test\n" + "a-b beta2='b c' beta3=bd # \n" + "a-b gamma=1:2:3:4 # Int Vector test\n" + " a-b de1ta=f # Bool + Integer in key Comment test delta=t \n" + "a-b _epsilon=-1 # Int Vector test _epsilon=1 \n" + "a-b zet-_a=0.15 theta=1.1# Float, -, _ test\n" + "a-b quoted='a b c' # quoted string\n" + "a-b quoted2=\"d e 'a b=c' f\" # string quoted with double quotes"; + + std::istringstream is(str); + std::vector lines; + ReadConfigLines(is, &lines); + KALDI_ASSERT(lines.size() == 8); + + ConfigLine cfl; + for (size_t i = 0; i < lines.size(); i++) { + KALDI_ASSERT(cfl.ParseLine(lines[i]) && cfl.FirstToken() == "a-b"); + if (i == 1) { + KALDI_ASSERT(cfl.GetValue("beta2", &str) && str == "b c"); + } + if (i == 4) { + KALDI_ASSERT(cfl.GetValue("_epsilon", &str) && str == "-1"); + } + if (i == 5) { + BaseFloat float_val = 0; + KALDI_ASSERT(cfl.GetValue("zet-_a", &float_val) && ApproxEqual(float_val, 0.15)); + } + if (i == 6) { + KALDI_ASSERT(cfl.GetValue("quoted", &str) && str == "a b c"); + } + if (i == 7) { + KALDI_ASSERT(cfl.GetValue("quoted2", &str) && str == "d e 'a b=c' f"); + } + } +} } // end namespace kaldi @@ -344,5 +532,7 @@ int main() { TestNan(); TestInf(); TestInf(); + UnitTestConfigLineParse(); + UnitTestReadConfig(); std::cout << "Test OK\n"; } diff --git a/src/util/text-utils.cc b/src/util/text-utils.cc index 200e3ad9327..bbf38ecc5cc 100644 --- a/src/util/text-utils.cc +++ b/src/util/text-utils.cc @@ -340,4 +340,252 @@ bool StringsApproxEqual(const std::string &a, } +bool ConfigLine::ParseLine(const std::string &line) { + data_.clear(); + whole_line_ = line; + if (line.size() == 0) return false; // Empty line + size_t pos = 0, size = line.size(); + while (isspace(line[pos]) && pos < size) pos++; + if (pos == size) + return false; // whitespace-only line + size_t first_token_start_pos = pos; + // first 
+  while (!isspace(line[pos]) && pos < size) {
+    if (line[pos] == '=') {
+      // If the first block of non-whitespace looks like "foo-bar=...",
+      // then we ignore it: there is no initial token, and FirstToken()
+      // is empty.
+      pos = first_token_start_pos;
+      break;
+    }
+    pos++;
+  }
+  first_token_ = std::string(line, first_token_start_pos, pos - first_token_start_pos);
+  // first_token_ is expected to be either empty or something like
+  // "component-node", which actually is a slightly more restrictive set of
+  // strings than IsValidName() checks for, but this is a convenient way to check it.
+  if (!first_token_.empty() && !IsValidName(first_token_))
+    return false;
+
+  while (pos < size) {
+    if (isspace(line[pos])) {
+      pos++;
+      continue;
+    }
+
+    // OK, at this point we know that we are pointing at nonspace.
+    size_t next_equals_sign = line.find_first_of("=", pos);
+    if (next_equals_sign == pos || next_equals_sign == std::string::npos) {
+      // we're looking for something like 'key=value'. If there is no equals sign,
+      // or it's not preceded by something, it's a parsing failure.
+      return false;
+    }
+    std::string key(line, pos, next_equals_sign - pos);
+    if (!IsValidName(key)) return false;
+
+    // handle any quotes. we support key='blah blah' or key="foo bar".
+    // no escaping is supported.
+    if (line[next_equals_sign+1] == '\'' || line[next_equals_sign+1] == '"') {
+      char my_quote = line[next_equals_sign+1];
+      size_t next_quote = line.find_first_of(my_quote, next_equals_sign + 2);
+      if (next_quote == std::string::npos) {  // no matching quote was found.
+        KALDI_WARN << "No matching quote for " << my_quote << " in config line '"
+                   << line << "'";
+        return false;
+      } else {
+        std::string value(line, next_equals_sign + 2,
+                          next_quote - next_equals_sign - 2);
+        data_.insert(std::make_pair(key, std::make_pair(value, false)));
+        pos = next_quote + 1;
+        continue;
+      }
+    } else {
+      // we want to be able to parse something like "... input=Offset(a, -1) foo=bar":
+      // in general, config values with spaces in them, even without quoting.
+
+      size_t next_next_equals_sign = line.find_first_of("=", next_equals_sign + 1),
+          terminating_space = size;
+
+      if (next_next_equals_sign != std::string::npos) {  // found a later equals sign.
+        size_t preceding_space = line.find_last_of(" \t", next_next_equals_sign);
+        if (preceding_space != std::string::npos &&
+            preceding_space > next_equals_sign)
+          terminating_space = preceding_space;
+      }
+      while (isspace(line[terminating_space - 1]) && terminating_space > 0)
+        terminating_space--;
+
+      std::string value(line, next_equals_sign + 1,
+                        terminating_space - (next_equals_sign + 1));
+      data_.insert(std::make_pair(key, std::make_pair(value, false)));
+      pos = terminating_space;
+    }
+  }
+  return true;
+}
+
+bool ConfigLine::GetValue(const std::string &key, std::string *value) {
+  KALDI_ASSERT(value != NULL);
+  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
+  for (; it != data_.end(); ++it) {
+    if (it->first == key) {
+      *value = (it->second).first;
+      (it->second).second = true;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ConfigLine::GetValue(const std::string &key, BaseFloat *value) {
+  KALDI_ASSERT(value != NULL);
+  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
+  for (; it != data_.end(); ++it) {
+    if (it->first == key) {
+      if (!ConvertStringToReal((it->second).first, value))
+        return false;
+      (it->second).second = true;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ConfigLine::GetValue(const std::string &key, int32 *value) {
+  KALDI_ASSERT(value != NULL);
+  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
+  for (; it != data_.end(); ++it) {
+    if (it->first == key) {
+      if (!ConvertStringToInteger((it->second).first, value))
+        return false;
+      (it->second).second = true;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ConfigLine::GetValue(const std::string &key, std::vector<int32> *value) {
+  KALDI_ASSERT(value != NULL);
+  value->clear();
+  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
+  for (; it != data_.end(); ++it) {
+    if (it->first == key) {
+      if (!SplitStringToIntegers((it->second).first, ":,", true, value)) {
+        // KALDI_WARN << "Bad option " << (it->second).first;
+        return false;
+      }
+      (it->second).second = true;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ConfigLine::GetValue(const std::string &key, bool *value) {
+  KALDI_ASSERT(value != NULL);
+  std::map<std::string, std::pair<std::string, bool> >::iterator it = data_.begin();
+  for (; it != data_.end(); ++it) {
+    if (it->first == key) {
+      if ((it->second).first.size() == 0) return false;
+      switch (((it->second).first)[0]) {
+        case 'F':
+        case 'f':
+          *value = false;
+          break;
+        case 'T':
+        case 't':
+          *value = true;
+          break;
+        default:
+          return false;
+      }
+      (it->second).second = true;
+      return true;
+    }
+  }
+  return false;
+}
+
+bool ConfigLine::HasUnusedValues() const {
+  std::map<std::string, std::pair<std::string, bool> >::const_iterator it = data_.begin();
+  for (; it != data_.end(); ++it) {
+    if (!(it->second).second) return true;
+  }
+  return false;
+}
+
+std::string ConfigLine::UnusedValues() const {
+  std::string unused_str;
+  std::map<std::string, std::pair<std::string, bool> >::const_iterator it = data_.begin();
+  for (; it != data_.end(); ++it) {
+    if (!(it->second).second) {
+      if (unused_str == "")
+        unused_str = it->first + "=" + (it->second).first;
+      else
+        unused_str += " " + it->first + "=" + (it->second).first;
+    }
+  }
+  return unused_str;
+}
+
+// This is like ExpectToken but for two tokens, and it
+// will either accept token1 and then token2, or just token2.
+// This is useful in Read functions where the first token
+// may already have been consumed.
+void ExpectOneOrTwoTokens(std::istream &is, bool binary,
+                          const std::string &token1,
+                          const std::string &token2) {
+  KALDI_ASSERT(token1 != token2);
+  std::string temp;
+  ReadToken(is, binary, &temp);
+  if (temp == token1) {
+    ExpectToken(is, binary, token2);
+  } else {
+    if (temp != token2) {
+      KALDI_ERR << "Expecting token " << token1 << " or " << token2
+                << " but got " << temp;
+    }
+  }
+}
+
+
+bool IsValidName(const std::string &name) {
+  if (name.size() == 0) return false;
+  for (size_t i = 0; i < name.size(); i++) {
+    if (i == 0 && !isalpha(name[i]) && name[i] != '_')
+      return false;
+    if (!isalnum(name[i]) && name[i] != '_' && name[i] != '-' && name[i] != '.')
+      return false;
+  }
+  return true;
+}
+
+void ReadConfigLines(std::istream &is,
+                     std::vector<std::string> *lines) {
+  KALDI_ASSERT(lines != NULL);
+  std::string line;
+  while (std::getline(is, line)) {
+    if (line.size() == 0) continue;
+    size_t start = line.find_first_not_of(" \t");
+    size_t end = line.find_first_of('#');
+    if (start == std::string::npos || start == end) continue;
+    end = line.find_last_not_of(" \t", end - 1);
+    KALDI_ASSERT(end >= start);
+    lines->push_back(line.substr(start, end - start + 1));
+  }
+}
+
+void ParseConfigLines(const std::vector<std::string> &lines,
+                      std::vector<ConfigLine> *config_lines) {
+  config_lines->resize(lines.size());
+  for (size_t i = 0; i < lines.size(); i++) {
+    bool ret = (*config_lines)[i].ParseLine(lines[i]);
+    if (!ret) {
+      KALDI_ERR << "Error parsing config line: " << lines[i];
+    }
+  }
+}
+
+
 }  // end namespace kaldi
diff --git a/src/util/text-utils.h b/src/util/text-utils.h
index 7bc20957672..02f4bf483fc 100644
--- a/src/util/text-utils.h
+++ b/src/util/text-utils.h
@@ -183,6 +183,98 @@ bool StringsApproxEqual(const std::string &a,
                         const std::string &b,
                         int32 decimal_places_check = 2);
+
+/**
+   This class is responsible for parsing input like
+    hi-there xx=yyy a=b c empty= f-oo=Append(bar, sss) ba_z=123 bing='a b c' baz="a b c d='a b' e"
+   and giving you access to the fields, in this case
+
+   FirstToken() == "hi-there", and key->value pairs:
+
+   xx->yyy, a->"b c", empty->"", f-oo->"Append(bar, sss)", ba_z->"123",
+   bing->"a b c", baz->"a b c d='a b' e"
+
+   The first token is optional; if the line started with a key-value pair then
+   FirstToken() will be empty.
+
+   Note: it can parse value fields with spaces inside them only if they are free of the '='
+   character.  If values are going to contain the '=' character, you need to quote them
+   with either single or double quotes.
+
+   Keys may contain -_a-zA-Z0-9, but must begin with a-zA-Z_.
+ */
+class ConfigLine {
+ public:
+  // Tries to parse the line as a config-file line.  Returns false
+  // if it could not for some reason, e.g. parsing failure.  In most cases
+  // prints no warnings; the user should do this.  Does not expect comments.
+  bool ParseLine(const std::string &line);
+
+  // the GetValue functions are overloaded for various types.  They return true
+  // if the key exists with value that can be converted to that type, and false
+  // otherwise.  They also mark the key-value pair as having been read.  It is
+  // not an error to read values twice.
+  bool GetValue(const std::string &key, std::string *value);
+  bool GetValue(const std::string &key, BaseFloat *value);
+  bool GetValue(const std::string &key, int32 *value);
+  // Values may be separated by ":" or by ",".
+  bool GetValue(const std::string &key, std::vector<int32> *value);
+  bool GetValue(const std::string &key, bool *value);
+
+  bool HasUnusedValues() const;
+  /// returns e.g. foo=bar xxx=yyy if foo and xxx were not consumed by one
+  /// of the GetValue() functions.
+  std::string UnusedValues() const;
+
+  const std::string &FirstToken() const { return first_token_; }
+
+  const std::string WholeLine() { return whole_line_; }
+  // use default assignment operator and copy constructor.
+ private:
+  std::string whole_line_;
+  // the first token of the line, e.g. if the line is
+  //   foo-bar baz=bing
+  // then first_token_ would be "foo-bar".
+  std::string first_token_;
+
+  // data_ maps from key to (value, is-this-value-consumed?).
+  std::map<std::string, std::pair<std::string, bool> > data_;
+
+};
+
+/// This function is like ExpectToken but for two tokens, and it will either
+/// accept token1 and then token2, or just token2.  This is useful in Read
+/// functions where the first token may already have been consumed.
+void ExpectOneOrTwoTokens(std::istream &is, bool binary,
+                          const std::string &token1,
+                          const std::string &token2);
+
+
+/**
+   This function reads in a config file and *appends* its contents to a vector of
+   lines; it is responsible for removing comments (anything after '#') and
+   stripping out any lines that contain only whitespace after comment removal.
+ */
+void ReadConfigLines(std::istream &is,
+                     std::vector<std::string> *lines);
+
+
+/**
+   This function converts config-lines from a simple sequence of strings
+   as output by ReadConfigLines(), into a sequence of first-tokens and
+   name-value pairs.  The general format is:
+     "command-type bar=baz xx=yyy"
+   etc., although there are subtleties as to what exactly is allowed; see the
+   documentation for class ConfigLine for details.
+   This function will die if there was a parsing failure.
+ */
+void ParseConfigLines(const std::vector<std::string> &lines,
+                      std::vector<ConfigLine> *config_lines);
+
+
+/// Returns true if 'name' would be a valid name for a component or node in a
+/// nnet3 Nnet.  This is a nonempty string beginning with A-Za-z_, and containing only
+/// '-', '_', '.', A-Z, a-z, or 0-9.
+bool IsValidName(const std::string &name);
 } // namespace kaldi
diff --git a/tools/config/common_path.sh b/tools/config/common_path.sh
index 9a7ae2d9b29..a8d454b3c06 100644
--- a/tools/config/common_path.sh
+++ b/tools/config/common_path.sh
@@ -22,4 +22,5 @@ ${KALDI_ROOT}/src/rnnlmbin:\
 ${KALDI_ROOT}/src/sgmm2bin:\
 ${KALDI_ROOT}/src/sgmmbin:\
 ${KALDI_ROOT}/src/tfrnnlmbin:\
+${KALDI_ROOT}/src/nnet3abin:\
 $PATH
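
Usage sketch (not part of the patch above): the following minimal program illustrates how the ConfigLine / ReadConfigLines / ParseConfigLines API added to src/util/text-utils.{h,cc} could be used by calling code. The first token "affine-layer" and the keys dim/scale/splice are made-up names for illustration only, not options of any real component; the snippet only assumes it is compiled and linked against kaldi-util in the usual way.

// config_line_example.cc -- illustrative sketch, see note above.
#include <sstream>
#include <string>
#include <vector>
#include "base/kaldi-common.h"
#include "util/text-utils.h"

int main() {
  using namespace kaldi;

  // A config "line" is a first token followed by key=value pairs; quoted
  // values may contain spaces and '=' characters.
  ConfigLine cfl;
  if (!cfl.ParseLine("affine-layer dim=512 scale=0.5 splice='-1,0,1'"))
    KALDI_ERR << "Could not parse config line";

  int32 dim;
  BaseFloat scale;
  std::vector<int32> splice;  // GetValue() splits on ':' or ','.
  KALDI_ASSERT(cfl.FirstToken() == "affine-layer");
  KALDI_ASSERT(cfl.GetValue("dim", &dim) && dim == 512);
  KALDI_ASSERT(cfl.GetValue("scale", &scale));
  KALDI_ASSERT(cfl.GetValue("splice", &splice) && splice.size() == 3);

  // Anything not consumed by GetValue() shows up here; this is how a caller
  // can flag misspelled or unsupported options.
  if (cfl.HasUnusedValues())
    KALDI_ERR << "Unexpected options: " << cfl.UnusedValues();

  // Reading a whole config: ReadConfigLines() strips comments and blank
  // lines; ParseConfigLines() turns each remaining line into a ConfigLine.
  std::istringstream config("layer1 dim=256  # a comment\nlayer2 dim=128\n");
  std::vector<std::string> lines;
  std::vector<ConfigLine> config_lines;
  ReadConfigLines(config, &lines);
  ParseConfigLines(lines, &config_lines);
  KALDI_ASSERT(config_lines.size() == 2 &&
               config_lines[1].FirstToken() == "layer2");
  return 0;
}

The consumed/unused bookkeeping is the main design point: each GetValue() call marks its key as read, so after a component has pulled out the options it understands, HasUnusedValues()/UnusedValues() give it a cheap way to report leftover (likely misspelled) keys instead of silently ignoring them.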