#!/bin/bash

# This is on the full training set (with duplicates removed). This version of
# the recipe runs on GPUs. We assume you have 8 GPU cards. You have to use
# --num-threads 1 so it will use the version of the code that can use GPUs
# (the -parallel training code cannot use GPUs unless we make further
# modifications, as the CUDA model assumes a single thread per GPU context,
# and we're not currently set up to create multiple GPU contexts). We assume
# the queue is set up as at JHU (or as in the "Kluster" project on
# Sourceforge), where "gpu" is a consumable resource that you can set to the
# number of GPU cards a machine has.

. cmd.sh

(
  if [ ! -f exp/nnet5c_gpu/final.mdl ]; then
    steps/nnet2/train_tanh.sh --cmd "$decode_cmd" --parallel-opts "-l gpu=1" --io-opts "-tc 5" \
      --num-threads 1 --minibatch-size 512 --max-change 40.0 --mix-up 20000 --samples-per-iter 300000 \
      --num-epochs 10 --num-epochs-extra 3 --initial-learning-rate 0.0067 --final-learning-rate 0.00067 \
      --num-jobs-nnet 10 --num-hidden-layers 5 --hidden-layer-dim 1536 \
      data/train_nodup data/lang exp/tri4b exp/nnet5c_gpu || exit 1;
  fi

  for lm_suffix in tg fsh_tgpr; do
    steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 30 \
      --config conf/decode.config --transform-dir exp/tri4b/decode_eval2000_sw1_${lm_suffix} \
      exp/tri4b/graph_sw1_${lm_suffix} data/eval2000 exp/nnet5c_gpu/decode_eval2000_sw1_${lm_suffix} &
  done
)
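
# A rough sketch (not part of the original recipe) of the setup this script
# assumes: cmd.sh is expected to define $train_cmd and $decode_cmd, typically
# wrapping Kaldi's queue.pl; the exact queue options are site-specific, e.g.
#
#   # in cmd.sh (illustrative values only; adjust for your cluster):
#   export train_cmd="queue.pl"
#   export decode_cmd="queue.pl"
#
# On an SGE-style grid, "gpu" would be declared as a consumable resource and
# set on each host to the number of GPU cards it has, so that the
# --parallel-opts "-l gpu=1" option above reserves one card per training job.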