#!/usr/bin/env python
import optparse
import pandas as pd
import sys
import python_csv.utils

def readCL():
    """Parse the command line.

    Options:
        -f/--infile     input path; defaults to the ``sys.stdin`` object
        -n/--no_header  flag: the input has no header row
        -d/--delimiter  field delimiter; defaults to ","

    Up to two positional arguments name the columns to analyse.

    Returns:
        A tuple ``(infile, col1, col2, no_header, delimiter)``. ``col1`` and
        ``col2`` are ``None`` when not supplied; ``no_header`` is ``None``
        when the flag is absent and ``True`` when it is present.

    Exits (via ``parser.error``) when more than two positional arguments are
    given; previously that case fell through every branch and crashed with an
    UnboundLocalError on return.
    """
    usagestr = "%prog -f infile col1 [col2]"
    parser = optparse.OptionParser(usage=usagestr)
    parser.add_option("-f", "--infile", default=sys.stdin)
    parser.add_option("-n", "--no_header", action="store_true")
    parser.add_option("-d", "--delimiter", default=",")
    options, args = parser.parse_args()

    if len(args) > 2:
        parser.error(
            "expected at most two column arguments, got {0}".format(len(args)))

    # Pad with None so that missing columns come back as None.
    col1, col2 = (list(args) + [None, None])[:2]

    return options.infile, col1, col2, options.no_header, options.delimiter


def str_is_float(var):
    """Return True if ``float(var)`` succeeds, False otherwise.

    Only the exceptions ``float()`` raises for unconvertible input are
    treated as "not a float"; anything else (e.g. KeyboardInterrupt)
    propagates instead of being silently swallowed.
    """
    try:
        float(var)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


def one_col(col):
    """Write the value counts of ``col`` to stdout as CSV.

    The output has a ``val,cnt`` header row followed by one row per distinct
    value, in the order produced by ``value_counts()``.
    """
    counts = col.value_counts()
    table = counts.reset_index()
    table.to_csv(sys.stdout, header=["val", "cnt"], index=False)


def two_col(col1, col2):
    """Print a comparison report for two columns to stdout.

    The report has up to three parts:

    1. Set overlap of the distinct values ("venn diagram breakdown"):
       sizes of a-b, the intersection, and b-a. Always printed.
    2. If every value in both columns converts with ``float()``: standard
       deviations, the correlation between the columns, and the two
       least-squares regression lines (b on a, and a on b).
    3. If both columns have fewer than 15 distinct values: a crosstab.

    Every ``print`` call takes a single argument, so the output is identical
    under the Python 2 print statement and the Python 3 print function.

    Args:
        col1: first column ("a"), a pandas Series.
        col2: second column ("b"), a pandas Series.
    """
    vals1 = set(col1.unique())
    vals2 = set(col2.unique())
    v1 = str(len(vals1.difference(vals2)))
    v2 = str(len(vals1.intersection(vals2)))
    v3 = str(len(vals2.difference(vals1)))
    print("venn diagram breakdown")
    print("|a-b|: " + v1)
    print("|a^b|: " + v2)
    print("|b-a|: " + v3)
    print("----")

    # Numeric statistics only make sense when every value parses as a float.
    if all(col1.apply(str_is_float)) and all(col2.apply(str_is_float)):
        col1_float = col1.astype(float)
        col2_float = col2.astype(float)
        sd1 = col1_float.std()
        sd2 = col2_float.std()
        corr = col1_float.corr(col2_float)
        print("Statistics:")
        print("sd1: {:f}".format(sd1))
        print("sd2: {:f}".format(sd2))
        print("R: {:f}".format(corr))
        print("----")

        # Simple linear regression from summary statistics:
        # slope = R * sd_y / sd_x, intercept = mean_y - slope * mean_x.
        reg_coeff1 = corr * sd2 / float(sd1)
        reg_coeff2 = corr * sd1 / float(sd2)
        mean1 = col1_float.mean()
        mean2 = col2_float.mean()
        intercept1 = mean2 - reg_coeff1 * mean1
        intercept2 = mean1 - reg_coeff2 * mean2
        print("Linear regression:")
        print("b = {}a + {}".format(reg_coeff1, intercept1))
        print("a = {}b + {}".format(reg_coeff2, intercept2))
        print("---")

    # Only tabulate when the table stays small enough to read.
    if len(vals1) < 15 and len(vals2) < 15:
        print(pd.crosstab(col1, col2))


    
def select_column(dat, name):
    """Return the column of ``dat`` identified by the command-line argument ``name``.

    ``name`` is first tried as a column label as-is. If that fails and it is
    all digits, it is tried as an integer label, which is how pandas labels
    columns when the input is read with ``header=None``. Exits with an error
    message when no such column exists.
    """
    if name in dat.columns:
        return dat[name]
    if name.isdigit() and int(name) in dat.columns:
        return dat[int(name)]
    sys.exit("ERROR: column \"{0}\" not found in input".format(name))


if __name__ == "__main__":
    f_in, col1, col2, no_header, delimiter = readCL()
    # NOTE(review): presumably makes writing to a closed pipe (e.g. `| head`)
    # exit quietly -- confirm against python_csv.utils.
    python_csv.utils.fix_broken_pipe()
    if not no_header:
        dat = pd.read_csv(f_in, dtype="object", delimiter=delimiter)
        hdr = ",".join(dat.columns)
        sys.stderr.write("WARNING: using first line of input, \"{hdr}\", as header. If file doesn't have a header use -n option.".format(hdr=hdr) + "\n")
    else:
        dat = pd.read_csv(f_in, dtype="object", header=None, delimiter=delimiter)

    if len(dat.columns) == 1:
        # Single-column input: nothing to choose.
        one_col(dat.iloc[:, 0])
    elif col1 and not col2:
        # Summarise the requested column (previously always the first one).
        one_col(select_column(dat, col1))
    elif len(dat.columns) == 2:
        two_col(dat.iloc[:, 0], dat.iloc[:, 1])
    elif col1 and col2:
        two_col(select_column(dat, col1), select_column(dat, col2))
    else:
        # Not a one- or two-column input and no columns were named.
        sys.exit("ERROR: input has {0} columns; specify one or two columns "
                 "to analyze".format(len(dat.columns)))
