#This script is not really supposed to be run directly.
#Instead, it should be sourced from the decoding script.
#It makes many assumptions about the existence of certain environment
#variables as well as a certain directory structure.

# Names of the variables this script relies on. The optional list is the same
# for every dataset kind; only the supervised kind additionally requires an
# RTTM file.
optional_variables="my_subset_ecf"
case "${dataset_kind}" in
  supervised)
    mandatory_variables="my_ecf_file my_kwlist_file my_rttm_file"
    ;;
  *)
    mandatory_variables="my_ecf_file my_kwlist_file"
    ;;
esac

# check_variables_are_set is not defined in this file; presumably the sourcing
# script provides it and it inspects the mandatory_variables and
# optional_variables lists -- TODO confirm against the caller.
check_variables_are_set

if [ "$dataset_kind" == "shadow" ]; then
  # Multiple keyword lists are not supported for the shadow set system,
  # so there is nothing to set up here.
  true

elif [ ! -f "$dataset_dir/.done.kws.fullvocab" ]; then
  # One-time setup of the "fullvocab" KWS task. The .done.kws.fullvocab marker
  # touched at the end of this branch makes subsequent runs skip it.
  # This works for both supervised and unsupervised dataset kinds; only the
  # supervised kind passes an RTTM file.
  kws_flags=()
  if [ "$dataset_kind" == "supervised" ]; then
    kws_flags+=(--rttm-file "$my_rttm_file")
  fi
  # my_subset_ecf is an optional true/false switch. Compare it as a string
  # instead of executing its value: an unset or empty value would expand to an
  # empty command, which succeeds and would wrongly enable --subset-ecf.
  if [ "${my_subset_ecf:-false}" == "true" ]; then
    kws_flags+=(--subset-ecf "$my_data_list")
  fi

  # We could come up with some bogus naming scheme, but as long as the audio
  # file names can tell us the IARPA language id, we use that: it is taken
  # from the third '_'-separated field of the first entry in the audio dir.
  langid=$(ls -1 "$my_data_dir/audio/" | head -n 1 | cut -d '_' -f 3)

  # NB: we assume the default KWS search has already been done and "borrow"
  # its rttm and ecf files. The ecf file could easily be generated here, but
  # the RTTM assumes the decoding has already been done.
  # Ideally, these files should be generated here!

  # The keyword list is built on the fly from data/lang/words.txt: entries
  # containing '<' or '#' are dropped, and every remaining word becomes a
  # keyword whose id embeds the language id and the word's integer id.
  # langid is passed with -v so that it is treated as data and is never
  # parsed as part of the awk program or the printf format string.
  local/kws_setup.sh --kwlist-wordlist true "${kws_flags[@]}" \
    --extraid fullvocab "$my_ecf_file" \
    <(grep -v -F -e "<" -e "#" data/lang/words.txt |
      awk -v langid="$langid" \
        '{ printf "KWID%s-FULLVOCAB-%05d %s\n", langid, $2, $1 }') \
    data/lang "$dataset_dir" || exit 1

  # Record the task in the list of extra KWS tasks; sort -u keeps the list
  # free of duplicates when this setup is repeated.
  echo fullvocab >> "$dataset_dir/extra_kws_tasks"
  sort -u "$dataset_dir/extra_kws_tasks" -o "$dataset_dir/extra_kws_tasks"
  touch "$dataset_dir/.done.kws.fullvocab"
fi