#!/bin/bash
#
# This script transforms the Japanese Vowels UCI KDD dataset into ARFF files.
#
# FracPete

# Prints the command-line help on stdout.
# Globals read: NAMES, TASK, DATA, SIZES, OUTPUT (shown as the current
# defaults of the respective options).
usage()
{
   # one argument per output line; the empty strings produce the blank lines
   printf '%s\n' \
      "" \
      "usage: ${0##*/} -n <names> -d <data> -o <output> [-r <relation>] [-h]" \
      "       [-t <task>] [-s <sizes>]" \
      "" \
      "transforms the Japanese Vowels UCI KDD dataset into ARFF files" \
      "" \
      " -h   this help" \
      " -n   <names>" \
      "      the html file containing the names of the dataset" \
      "      default: $NAMES" \
      " -t   <task>" \
      "      the html file containing the task for the dataset" \
      "      default: $TASK" \
      " -d   <data>" \
      "      the file containing the actual data of the dataset" \
      "      default: $DATA" \
      " -s   <sizes>" \
      "      the file containing the sizes for the dataset" \
      "      default: $SIZES" \
      " -o   <output>" \
      "      the name of the generated ARFF file" \
      "      default: $OUTPUT" \
      " -r   <relation>" \
      "      the name of the relation, if none specified assembled from the filename" \
      ""
}

# defaults; each of the first six can be overridden by the matching option
NAMES='JapaneseVowels.data.html'
TASK='JapaneseVowels.task.html'
DATA='JapaneseVowels.data'
SIZES='size_JapaneseVowels.data'
OUTPUT='JapaneseVowels.arff'
RELATION=''
CLASSTYPE='nominal'
CLASSINDEX='first'

# interpret parameters
# The leading ':' of the option string switches getopts to silent error
# reporting, so an unknown option and a missing option argument both end up
# in the catch-all branch (usage, exit status 1).
while getopts ":hn:d:o:r:s:t:" flag; do
   case "$flag" in
      h) usage; exit 0 ;;
      n) NAMES="$OPTARG" ;;
      t) TASK="$OPTARG" ;;
      d) DATA="$OPTARG" ;;
      s) SIZES="$OPTARG" ;;
      o) OUTPUT="$OPTARG" ;;
      r) RELATION="$OPTARG" ;;
      *) usage; exit 1 ;;
   esac
done

# everything provided?
# The names, task, data and sizes files are all read unconditionally further
# down, so every one of them has to exist. The expansions are quoted so that
# a path containing whitespace or glob characters is tested as one word.
# Each problem is reported on stderr before the usage text is shown; the exit
# status for invalid input stays 2.
MISSING=0
for INPUT in "$NAMES" "$TASK" "$DATA" "$SIZES"
do
   if [ ! -f "$INPUT" ]
   then
      echo "${0##*/}: input file '$INPUT' not found" >&2
      MISSING=1
   fi
done

if [ -z "$OUTPUT" ]
then
   echo "${0##*/}: no output file given" >&2
   MISSING=1
fi

if [ "$MISSING" -ne 0 ]
then
   usage >&2
   exit 2
fi

# assemble relation name if necessary: an unset or empty RELATION is replaced
# by a name built from the names file and the data file
: "${RELATION:=NAMES=${NAMES}-DATA=${DATA}}"

# some parameters
ATTS="${OUTPUT}.ATTS"

# actual script
echo
echo "Processing '$NAMES'"
echo "and        '$TASK'"
echo "and        '$DATA'"
echo "and        '$SIZES'."
echo "Output can be found in '$OUTPUT'."
echo

# 1. names etc. as comment
# The names and task pages are rendered to plain text with lynx and every
# line is prefixed with "% ", which marks it as a comment in the ARFF file.
# All file names are quoted so that paths containing whitespace or glob
# characters work. The group truncates the output file once and writes the
# whole comment header through a single redirection.
echo "Comments..."
{
   echo "%%%%%%%%%%%%%%%%%%%%"
   echo "% Data-Description %"
   echo "%%%%%%%%%%%%%%%%%%%%"
   echo "%"
   lynx --dump "$NAMES" | sed 's/^/% /'
   echo "%%%%%%%%%%%%%%%%%%%%"
   echo "% Task-Description %"
   echo "%%%%%%%%%%%%%%%%%%%%"
   lynx --dump "$TASK" | sed 's/^/% /'
} > "$OUTPUT"
# the group's status is that of the last sed unless the redirection itself
# failed, in which case nothing was run; a missing file is the reliable sign
# that the output could not be created
if [ ! -f "$OUTPUT" ]
then
   echo "${0##*/}: cannot write to '$OUTPUT'" >&2
   exit 3
fi
#../class.sh -t $CLASSTYPE -i $CLASSINDEX >> $OUTPUT

# 2. header
# 2a. relation
echo "Relation..."
{
   echo
   echo "@relation $RELATION"
} >> "$OUTPUT"

# 2b. preprocess data
echo "Preprocessing data..."
# The intermediate pieces are kept in a private scratch directory rather than
# in the current directory, so unrelated files matching xx* are never deleted
# or mixed into the output. The EXIT trap removes the directory on every exit
# path, including the error exits below.
WORKDIR=$(mktemp -d "${TMPDIR:-/tmp}/${0##*/}.XXXXXX") || {
   echo "${0##*/}: cannot create temporary directory" >&2
   exit 3
}
trap 'rm -rf -- "$WORKDIR"' EXIT

# split into files: a new piece starts on the line after each empty line
if ! csplit --digits=3 --quiet --elide-empty-files --prefix="$WORKDIR/xx" \
      "$DATA" '/^$/+1' '{*}'
then
   echo "${0##*/}: cannot split '$DATA'" >&2
   exit 3
fi

# add speaker, utterance and frame to front
# (speaker,utterance_no,frame_no,REST)
# Speakers 1-9 get the nominal labels A-I right here; a speaker number beyond
# that has no label and is kept as a number.
SPEAKERS=(A B C D E F G H I)
SPEAKERNO=1
FILENO=0
# $(cat ...) is intentionally unquoted: the sizes file is a whitespace
# separated list (blanks or newlines) with one utterance count per speaker
for i in $(cat "$SIZES")
do
   # a non-numeric entry would silently evaluate to 0 in the arithmetic loop
   case "$i" in
      ''|*[!0-9]*)
         echo "${0##*/}: invalid utterance count '$i' in '$SIZES'" >&2
         exit 3
         ;;
   esac

   echo -n "- Speaker $SPEAKERNO: "
   LABEL=${SPEAKERS[SPEAKERNO - 1]:-$SPEAKERNO}
   for ((n = 1; n <= i; n++))
   do
      # info
      echo -n "."

      # create filename (same numbering scheme csplit uses for its pieces)
      FILE=$(printf '%s/xx%.3d' "$WORKDIR" "$FILENO")

      if [ -f "$FILE" ]
      then
         # One pass per utterance: drop empty lines, number the remaining
         # lines (frame no), strip one trailing blank, turn blanks and colons
         # into commas and put "speaker,utterance,frame," in front.
         awk -v prefix="$LABEL,$n" '
            $0 == "" { next }
            {
               frame++
               sub(/ $/, "")
               gsub(/[ :]/, ",")
               print prefix "," frame "," $0
            }
         ' "$FILE" > "$WORKDIR/current" && mv -f -- "$WORKDIR/current" "$FILE"
      else
         echo >&2
         echo "${0##*/}: warning: '$DATA' has no block for utterance $n of speaker $SPEAKERNO" >&2
      fi

      # next file
      FILENO=$((FILENO + 1))
   done

   # next speaker
   SPEAKERNO=$((SPEAKERNO + 1))
   echo
done

# pieces beyond the last counted utterance were not converted above
if [ -e "$(printf '%s/xx%.3d' "$WORKDIR" "$FILENO")" ]
then
   echo "${0##*/}: warning: '$DATA' contains more blocks than '$SIZES' accounts for; the surplus is copied unconverted" >&2
fi

# 2c. attributes
echo "Attributes..."
{
   echo
   echo "@attribute speaker {A,B,C,D,E,F,G,H,I}"
   echo "@attribute utterance INTEGER"
   echo "@attribute frame INTEGER"
   for ((i = 1; i <= 12; i++))
   do
      echo "@attribute coefficient$i REAL"
   done
} >> "$OUTPUT"

# 3. data
echo "Data..."
{
   echo
   echo "@data"
} >> "$OUTPUT"
for i in "$WORKDIR"/xx*
do
   # an unmatched glob stays literal; skip it instead of failing in cat
   [ -e "$i" ] || continue
   echo -n "."
   cat -- "$i" >> "$OUTPUT"
done
echo

# 4. clean up
echo "Clean up..."
rm -rf -- "$WORKDIR"
trap - EXIT

# finished
echo
echo "Finished!"
echo

