#!/usr/bin/env bash
# Pipeline: collect per-stream URL lists (*.Urls.Txt), scrape metadata,
# download each stream, convert it to 16 kHz mono WAV, run Kaldi
# speech-to-text, then remove the temporary files.
# Depends on sibling scripts: MetadataScraper.Bsh, GraKaldiClient.sh,
# PrepareTranscription.Bsh (semantics assumed from their names — confirm).

shopt -s nullglob  # unmatched globs expand to nothing, not the literal pattern

echo "Getting streams IDs ..."
PATTERN='{"contentId":ID_PATTERN,"width":1200,"height":1920}'
#IDS=$(wget -e robots=off -q -O - "icm.tv.pionier.net.pl"|grep OnMaterialClick|grep "$ID.Urls.Txt
echo ""
# NOTE(review): the fetch loop above is commented out (and truncated), but its
# closing 'done' was left live, which aborted the whole script with
# "syntax error near unexpected token `done'". Commented out to match; restore
# it together with the loop header when re-enabling the fetch.
#done
echo "Got streams URLs ..."

# Drop empty *.Urls.Txt files (streams that yielded no usable URL).
# The original parsed 'ls -la' and matched the size column with grep "0 ",
# which also deleted files whose size merely ENDS in 0 (e.g. 120 bytes);
# 'test -s' (non-empty) is exact.
for F in *.Urls.Txt; do
  [ -s "$F" ] || rm -- "$F"
done
echo "useless ids and jsons removed"

# A stream ID is the filename up to the first dot: 1234.Urls.Txt -> 1234.
IDS=()
for F in *.Urls.Txt; do
  IDS+=("${F%%.*}")
done
echo "Clean ids list will be used. "

echo "Getting streams metadatas ..."
for ID in "${IDS[@]}"; do
  bash MetadataScraper.Bsh "$ID"
done
echo "Got streams metadatas ..."

echo "Getting streams as MP4s and converting mp4s to wavs ..."
# TODO if U need fetch all streams then remove all '#' in this section/for
for ID in "${IDS[@]}"; do
  I=0
  # Only the first URL of each list is fetched; the session ids embedded in
  # later URLs expire before we reach them (see note in the Kaldi section).
  # This assignment was commented out in the original, leaving $URL unset
  # when lynx ran — restored to match the Kaldi loop below.
  URL=$(head -n 1 "$ID.Urls.Txt")
  # for URL in $(cat "$ID.Urls.Txt"); do
  #   I=$((I + 1))
  lynx --dump "$URL" > "$ID.$I.mp4"
  # 16 kHz, mono, 16-bit PCM — the input format Kaldi expects.
  ffmpeg -i "$ID.$I.mp4" -acodec pcm_s16le -ac 1 -ar 16000 "$ID.$I.wav"
  # echo "$URL" > "$ID.usedUrl.txt"
  rm -- "$ID.$I.mp4"
  # done
done
echo "Got streams as MP4s and converted mp4s to wavs ..."

echo "Getting Speech2Txt by Kaldi ..."
# as above, those loops are needed to avoid url/session id expired exception.
for ID in "${IDS[@]}"; do
  I=0
  URL=$(head -n 1 "$ID.Urls.Txt")
  # for URL in $(cat "$ID.Urls.Txt"); do
  #   I=$((I + 1))
  bash GraKaldiClient.sh "$ID.$I"
  rm -- "$ID.$I.wav"
  bash PrepareTranscription.Bsh "$ID.$I" > "$ID.text"
  rm -- "$ID.$I.TextGrid" "$ID.$I.xml"
  # done
done

echo "removing all temporary data."
rm -f -- *.Urls.Txt
echo "All done, check directory listing: "