#!/bin/bash

# This script extracts the number of attributes from ARFF files
# Run with -h to display the help.
#
# $Revision: 1.1.1.1 $
# FracPete

# Prints the command line help to stdout.
# Globals: DIR (read) - displayed as the default of the -d option
function usage()
{
   # The here-doc delimiter is unquoted on purpose: ${0##*/} (the script's
   # base name) and $DIR are expanded inside the text.
   cat <<EOF

usage: ${0##*/} -f <filename> | -d <dirname> [-o <filename>] [-t] [-h]

Retrieves the number of attributes from ARFF files

 -h   this help
 -f   <filename>
      the file to check
 -d   <dirname>
      checks all ARFF files in the given directory
      default: $DIR
 -o   <filename>
      the name of the file to store the output in
 -t   whether to organize the data for one dataset in a single line,
      otherwise the count is printed for each file

EOF
}

# Counts the entries of the datasets file, ignoring empty lines and lines
# starting with '#'.
# Globals: DATASETS (read)  - path of the datasets file
#          TMP      (write) - the number of entries
function count_datasets()
{
   # grep -c counts the selected lines itself; two -e patterns replace the
   # GNU-only "\|" alternation, and the quoted path survives whitespace.
   TMP=$(grep -c -v -e '^$' -e '^#' -- "$DATASETS")

   # grep prints no count when the file cannot be read; report 0 in that case
   TMP=${TMP:-0}
}

# Fetches one entry of the datasets file; empty lines and lines starting
# with '#' are not counted.
# Globals: DATASETS (read)  - path of the datasets file
#          LINE     (read)  - 1-based index of the entry to fetch
#          TMP      (write) - the entry found at that index
function get_line()
{
   # keep the first LINE entries and take the last one of those; the quoted
   # path survives whitespace and the two -e patterns avoid GNU-only "\|"
   TMP=$(grep -v -e '^$' -e '^#' -- "$DATASETS" | head -n "$LINE" | tail -n 1)
}

# Counts the attribute declarations (lines starting with "@attribute ",
# matched case-insensitively) of an ARFF file.
# Globals: TMPFILE (read)  - the file to inspect
#          TMP     (write) - the number of attributes, or "-" if TMPFILE is
#                            not an existing regular file
function count_attributes()
{
   # Quoting matters here: with an empty TMPFILE an unquoted "[ -f ]" would be
   # true and grep would then wait for input on stdin.
   if [ -f "$TMPFILE" ]
   then
      TMP=$(grep -c -i -- '^@attribute ' "$TMPFILE")
      # grep prints no count when the file cannot be read
      TMP=${TMP:-0}
   else
      TMP="-"
   fi
}

# Prints "<name>: <count>" for one ARFF file, where <name> is the file's base
# name with every ".arff" removed (case-insensitively) and <count> is the
# result of count_attributes.
# Globals: TMPFILE (read)  - the file to process
#          OUTPUT  (read)  - if non-empty, the line is also stored in this file
#          LINE    (write) - the dataset name derived from TMPFILE
#          TMP     (write) - set by count_attributes
#          PROCESS_FILE_OUTPUT (read/write) - the output file that was already
#                            started by an earlier call
function process_file()
{
   LINE=$(printf '%s\n' "${TMPFILE##*/}" | sed 's/\.arff//ig')
   count_attributes
   if [ "$OUTPUT" != "" ]
   then
      # Truncate the output file only on the first call for it and append
      # afterwards; a plain "tee" on every call would leave just the line of
      # the last processed file in the output.
      if [ "$PROCESS_FILE_OUTPUT" != "$OUTPUT" ]
      then
         : > "$OUTPUT"
         PROCESS_FILE_OUTPUT=$OUTPUT
      fi
      printf '%s: %s\n' "$LINE" "$TMP" | tee -a "$OUTPUT"
   else
      printf '%s: %s\n' "$LINE" "$TMP"
   fi
}

# Removes the temporary datasets file; a missing file is not an error.
# Globals: DATASETS (read) - path of the temporary file
function clean_up()
{
   # quoted and terminated with "--" so that a path containing whitespace or
   # starting with '-' can neither be split nor be taken for an option
   rm -f -- "$DATASETS"
}

# variables
# Directory containing this script. dirname yields "." when $0 has no
# directory part (e.g. "bash script.sh"); an empty ROOT would make the paths
# below resolve relative to the filesystem root.
ROOT=$(dirname -- "$0")
# single file to check (-f); empty means "check a directory"
FILE=""
# directory with the ARFF files to check (-d)
DIR="$ROOT/../tmp/"
# temporary file holding the dataset names in table mode
DATASETS="$ROOT/../tmp/_datasets"
# "yes" prints one combined line per dataset (-t)
TABLE="no"
# optional file receiving a copy of the output (-o)
OUTPUT=""

# interpret the command line parameters
# (-f and -d are mutually exclusive: whichever comes last resets the other;
# anything getopts does not hand back as one of the known letters ends up in
# the default branch, which shows the help and exits with an error)
while getopts ":htf:d:o:" flag
do
   case "$flag" in
      f)
         FILE="$OPTARG"
         DIR=""
         ;;
      d)
         DIR="$OPTARG"
         FILE=""
         ;;
      o)
         OUTPUT="$OPTARG"
         ;;
      t)
         TABLE="yes"
         ;;
      h)
         usage
         exit 0
         ;;
      *)
         usage
         exit 1
         ;;
   esac
done

# Prints one line per dataset with the attribute counts of its plain, "-mi"
# and "-remi" ARFF file in DIR ("-" where count_attributes reports a missing
# file). Statistics files (base name containing "-stat") are ignored.
# Globals: DIR, DATASETS (read)
#          TMPFILE, LINE, TMP (write) - used to talk to the helper functions
function print_table()
{
   local arff name total i plain mi remi

   echo "Dataset: RELAGGS/Joiner/REMILK"

   # Collect the distinct dataset names. The shell glob replaces parsing the
   # output of ls, so names containing whitespace stay intact.
   for arff in "$DIR"/*.arff
   do
      # an unmatched glob is handed over literally -> nothing to do
      [ -e "$arff" ] || continue
      name=${arff##*/}
      case "$name" in
         *-stat*) continue ;;
      esac
      name=${name%.arff}
      case "$name" in
         *-remi) name=${name%-remi} ;;
         *-mi)   name=${name%-mi} ;;
      esac
      printf '%s\n' "$name"
   done | sort -u > "$DATASETS"

   count_datasets
   total=$TMP

   for ((i = 1; i <= total; i++))
   do
      LINE=$i
      get_line
      name=$TMP

      TMPFILE="$DIR/$name.arff";      count_attributes; plain=$TMP
      TMPFILE="$DIR/$name-mi.arff";   count_attributes; mi=$TMP
      TMPFILE="$DIR/$name-remi.arff"; count_attributes; remi=$TMP

      printf '%s: %s/%s/%s\n' "$name" "$plain" "$mi" "$remi"
   done
}

# Runs process_file for every ARFF file in DIR, skipping statistics files.
# Globals: DIR (read), TMPFILE (write)
function print_files()
{
   local arff

   for arff in "$DIR"/*.arff
   do
      # an unmatched glob is handed over literally -> nothing to do
      [ -e "$arff" ] || continue

      # statistics file? -> continue
      # (only the file name is tested, so a directory whose name contains
      # "-stat" does not hide all of its files)
      case "${arff##*/}" in
         *-stat*) continue ;;
      esac

      TMPFILE=$arff
      process_file
   done
}

# Prints the report selected by FILE and TABLE to stdout.
function report()
{
   if [ "$FILE" != "" ]
   then
      # only one file
      TMPFILE=$FILE
      process_file
   elif [ "$TABLE" = "yes" ]
   then
      print_table
   else
      print_files
   fi
}

# Generates the report, stores a copy in OUTPUT if requested and removes the
# temporary files afterwards.
function main()
{
   if [ "$OUTPUT" != "" ]
   then
      # The output file is written by a single tee for the whole report, so
      # no line overwrites an earlier one. OUTPUT is cleared inside the
      # subshell only, which keeps process_file from writing the file itself.
      ( OUTPUT=""; report ) | tee "$OUTPUT"
   else
      report
   fi

   # clean up
   clean_up
}

main

