#!/usr/bin/env python
import sys
import optparse
import csv
import re
import itertools

def readCL():
    """Parse the command line and return the run configuration.

    Returns an 8-tuple, in this order: input file object, begin code,
    grep code, process code, end code, exceptions-allowed flag,
    delimiter, and the path given to --set.

    Input is sys.stdin unless -f/--infile names a file to open.  A
    delimiter spelled "TAB" or as a literal backslash followed by 't' is
    converted to a real tab character.
    """
    parser = optparse.OptionParser(usage="%prog")
    parser.add_option("-f", "--infile")
    parser.add_option("-b", "--begin_code")
    parser.add_option("-g", "--grep_code")
    parser.add_option("-p", "--process_code", default="")
    parser.add_option("-e", "--end_code")
    parser.add_option("-d", "--delimiter", default=",")
    parser.add_option("--exceptions_allowed", action="store_true")
    parser.add_option("--set", help="load a file with no header, storing each line as an element of a set")

    opts, _positional = parser.parse_args()

    use_stdin = not opts.infile
    # Warn when we fall back to stdin but it is an interactive terminal.
    if sys.stdin.isatty() and use_stdin:
        warning = "WARNING: pawk using /dev/stdin as default input file (-f) but nothing seems to be piped in..."
        sys.stderr.write(warning + "\n")

    f_in = sys.stdin if use_stdin else open(opts.infile)

    # Both spellings of "tab" that survive shell quoting map to '\t'.
    if opts.delimiter in ("TAB", "\\t"):
        opts.delimiter = '\t'

    return (
        f_in,
        opts.begin_code,
        opts.grep_code,
        opts.process_code,
        opts.end_code,
        opts.exceptions_allowed,
        opts.delimiter,
        opts.set,
    )



#####pyindent code#####
def pyindent(string):
    """Expand pawk shorthand code into newline-separated, indented Python.

    The input is cut into lines on '\\n', each line is handed to
    _pyindent_iter, and the resulting statements are joined back together
    with '\\n'.
    """
    source_lines = string.split('\n')
    indented_lines = _pyindent_iter(source_lines)
    return '\n'.join(indented_lines)

def _pyindent_iter(line_iter):
    """Yield indented Python statements for each shorthand input line.

    Every input line is cut into statements by _split and lambda headers
    are re-attached to their bodies by _paste_lambdas.  For example

        for i in range(10): if i % 2==0: print(i); end; end; print(10)

    is first cut into

        ["for i in range(10):", "if i % 2==0:", "print(i);", "end;", "end;", "print(10)"]

    and then emitted one statement per yielded string, using these rules:

    * a statement ending in ':' indents everything after it by one level;
    * the marker "end;" closes one level and is not emitted itself;
    * a statement starting with the keyword elif / else / except is emitted
      one level shallower (it then re-opens a level through its own ':').

    The indent level is carried across input lines.
    """
    # \b keeps identifiers that merely start with a keyword (elsewhere,
    # exceptions, else_value, ...) from being mistaken for the keyword.
    dedent_keyword = re.compile(r"^(?:elif|else|except)\b")
    indent_level = 0
    for line in line_iter:
        statements = list(_paste_lambdas(_split(line)))
        for stmt in statements:
            if stmt == "end;":
                indent_level -= 1
                continue
            if dedent_keyword.search(stmt):
                indent_level -= 1
            # A negative level multiplies to the empty string, i.e. column 0.
            yield ("    " * indent_level + stmt)
            if stmt.endswith(":"):
                indent_level += 1

def _split(s):
    """
    read a string representing python code and 
    'print "echo; echo;"'
    """
    out_list = []
    cur_substring = ""
    in_string_type = None
    for las, cur, nex in _threewise(s):
        cur_substring += cur
        if not in_string_type:
            if cur == '"' or cur == "'":
                # out_list.append((cur_substring,in_string_type))
                in_string_type = cur
                # cur_substring = cur
            elif (cur == ":" and nex == " "):
                out_list.append(cur_substring.strip())
                cur_substring = ""
            elif (cur == ";"):
                out_list.append(cur_substring.strip())
                cur_substring = ""
        else:
            if (cur == '"' or cur == "'") and las != "\\":
                #out_list.append((cur_substring,in_string_type))
                in_string_type = None
                #cur_substring = ""
    if cur_substring:
        out_list.append(cur_substring.strip())
    return out_list

        
def _paste_lambdas(match_list):
    """don't want newline after 'lambda x:'
    """
    for las, cur, nex  in _threewise(match_list):
        #TODO: replace with regex of exactly the characters allowed in python variable names (instead of strictly alphanumeric)?
        regex = "lambda[ 0-9A-Za-z]*:$"
        if las and re.findall(regex,las):
            continue
        elif re.findall(regex,cur):
            yield cur + " " + nex
        else:
            yield cur

def _threewise(iterable):
    """s -> (None, s0, s1), (s0, s1, s2), ... (sn-1, sn, None)
    example:
    for (las, cur, nex) in threewise(l):
    """
    a, b, c = itertools.tee(iterable,3)
    def prepend(val, l):
        yield val
        for i in l: yield i
    def postpend(val, l):
        for i in l: yield i
        yield val
    next(c,None)
    for _xa, _xb, _xc in itertools.izip(prepend(None,a), b, postpend(None,c)):
        yield (_xa, _xb, _xc)
            
        
#####end pyindent#####





def process_cut_csv(i,delim=","):
    """Expand a delimited field specification into a list.

    i is a string such as "1,3-5,name"; it is split on delim (a comma by
    default) and every piece is expanded by process_cut_list.

    Returns the expanded list, or None when i is empty or None.
    """
    if not i:
        return None
    # Split on the caller's delimiter rather than a hard-coded comma.
    return list(process_cut_list(i.split(delim)))

def process_cut_list(l, delim=","):
    """Expand field specifiers, yielding ints and names.

    For every string in l:

    * "A-B" with non-negative integers A and B yields each int from A to
      B inclusive (nothing when A > B);
    * a string consisting only of digits yields that int;
    * anything else (for example a column name, including names that
      contain '-') is yielded unchanged.

    delim is accepted for signature compatibility and is not used.
    """
    # Only "<digits>-<digits>" is a range; a bare '-' test would send names
    # like "first-name" into int() and raise ValueError.
    range_spec = re.compile(r"^\s*(\d+)\s*-\s*(\d+)\s*$")
    for item in l:
        bounds = range_spec.match(item)
        if bounds:
            first, last = int(bounds.group(1)), int(bounds.group(2))
            for number in range(first, last + 1):
                yield number
        elif re.match(r"^\d+$", item):  # same digit test as str_is_int
            yield int(item)
        else:
            yield item

def str_is_int(var):
    """Return True when the string var consists solely of digits.

    Signs, decimal points, surrounding spaces and the empty string all
    give False.
    """
    digit_runs = re.findall(r"^\d+$", var)
    return bool(digit_runs)

def is_int(var):
    """Return True when var is an integer object (int, or long on Python 2).

    This is an isinstance check, so subclasses of int count as well;
    strings of digits and floats do not.
    """
    try:
        integer_types = (int, long)  # Python 2 has a separate long type
    except NameError:
        integer_types = (int,)       # Python 3: int is unbounded, no long
    return isinstance(var, integer_types)

def str_is_float(var):
    """Return True when float(var) succeeds, False otherwise.

    Values that float() accepts but that are not ordinary numbers, such as
    "nan" and "inf", therefore also give True.
    """
    try:
        float(var)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparsable
        # text; OverflowError: an integer too large for a float.
        return False
    return True

    
#new csvlist and raw function
def csvlist_and_raw(f_in, delimiter):
    """Yield (raw_line, fields) for every line of f_in.

    raw_line is the line with trailing newline characters removed, and
    fields is that same line parsed by the csv module using delimiter.
    Each line is parsed on its own, so a quoted field cannot span
    several input lines.
    """
    for line in f_in:
        line = line.rstrip("\n")
        # Builtin next() works on Python 2.6+ and Python 3, unlike the
        # Python 2-only reader.next() method.
        yield line, next(csv.reader([line], delimiter=delimiter))




def write_line(rout):
    """Write the sequence rout to stdout as one comma-separated CSV row.

    The row is terminated with a single '\\n'.
    """
    out = csv.writer(sys.stdout, lineterminator='\n')
    out.writerow(rout)

def proc_field(f):
    """Return int(f) when f converts to an int, otherwise f unchanged.

    The conversion is whatever int() does, so surrounding whitespace in a
    numeric string is ignored and a float argument is truncated.
    """
    try:
        return int(f)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: non-numeric
        # text or NaN; OverflowError: infinity.
        return f

def gen_grep_code(grep_code):
    """Expand the /regex/ shorthand into a Python filter expression.

    When grep_code has the form /PATTERN/, return the source of an
    expression that runs re.findall with PATTERN over the row's fields
    joined by commas.  Any other value (including None and "") is
    returned unchanged.

    The generated expression refers to the name ``r``, which process()
    binds to the list of parsed fields of the current row.  (In process()
    the name ``l`` is the raw line string, and joining a string would put
    a comma between every character.)
    """
    if not grep_code:
        return grep_code
    shorthand = re.match(r"^/(.*)/$", grep_code)
    if not shorthand:
        return grep_code
    pattern = shorthand.group(1)
    # repr() produces a correctly escaped literal, so quotes and
    # backslash sequences such as \b reach re.findall intact.
    return 're.findall({0!r},",".join(r))'.format(pattern)

# @profile
def process(f_in, begin_code, grep_code, process_code, end_code, exceptions_allowed, delimiter,load_set):
    """Run the begin / filter / per-row / end code over every line of f_in.

    Parameters:
        f_in: iterable of input lines.
        begin_code: source executed once before the first line, or None.
        grep_code: source of an expression evaluated per row; rows for
            which it is false are skipped.  None or "" keeps every row.
        process_code: source executed for every row that passes the filter.
        end_code: source executed once after the last line, or None.
        exceptions_allowed: when true, a row whose filter or process code
            raises is skipped and one warning is written to stderr for the
            first such row; when false the exception propagates.
        delimiter: field delimiter handed to csvlist_and_raw.
        load_set: path of a file whose stripped lines are loaded into the
            set ``s``, or None.

    The user code runs through exec/eval without an explicit namespace, so
    it refers to this function's own names: ``i`` (0-based row index),
    ``l`` (raw line), ``r`` (list of fields) and, with load_set, ``s``.
    NOTE(review): this depends on exec/eval being able to see the
    function's local names; confirm on the interpreter version in use.

    Rows that pass the filter are written with write_line only when no
    process_code was given.
    """
    # These names are not read below, but exec'd user code can see them,
    # so they are kept.
    hdr = None
    has_exceptions = False
    has_printed_incomplete_line = False
    #jtrigg@20160102 try out only writing when there's no -p option
    do_write = process_code
    if load_set:
        # Close the set file as soon as it has been read.
        with open(load_set) as _set_file:
            s = set(_entry.strip() for _entry in _set_file)

    # Compile every snippet up front, so a syntax error in any of them is
    # reported before begin_code has run.
    if begin_code:
        begin_code = compile(begin_code,'','exec')
    if grep_code:
        grep_code = compile(grep_code,'','eval')
    if process_code:
        process_code = compile(process_code,'','exec')
    if end_code:
        end_code = compile(end_code,'','exec')
    if begin_code:
        exec(begin_code)

    for i,(l,_csvlist) in enumerate(csvlist_and_raw(f_in, delimiter = delimiter)):
        r = _csvlist
        try:
            if grep_code and not eval(grep_code):
                continue
            if process_code:
                exec(process_code)
        except Exception:
            # Exception rather than a bare except: Ctrl-C and sys.exit()
            # must still stop the run when exceptions are allowed.
            if not exceptions_allowed:
                raise
            if not has_exceptions:
                sys.stderr.write("WARNING: exception" + '\n')
                has_exceptions = True
            continue
        if not do_write:
            write_line(r)

    if end_code:
        exec(end_code)
            

if __name__ == "__main__":
    f_in, begin_code, grep_code, process_code, end_code, exceptions_allowed, delimiter, load_set = readCL()
    # Restoring the default SIGPIPE action solves the 'Broken pipe' error
    # when piping script output into head.  Platforms without SIGPIPE
    # fail the import, and the handler is simply not installed there.
    try:
        from signal import signal, SIGPIPE, SIG_DFL
    except ImportError:
        pass
    else:
        signal(SIGPIPE,SIG_DFL)

    if begin_code:
        begin_code = pyindent(begin_code)
    if process_code:
        process_code = pyindent(process_code)
    # NOTE(review): end_code is the only snippet not passed through
    # pyindent -- confirm whether that is intended.

    if grep_code:
        # Expand the /.*/ shorthand BEFORE pyindent: pyindent cuts its
        # input at ';' and ': ', which would break up a pattern containing
        # them so it would no longer be recognised as /.../.
        stripped_grep = grep_code.strip()
        expanded_grep = gen_grep_code(stripped_grep)
        if expanded_grep != stripped_grep:
            grep_code = expanded_grep
        else:
            grep_code = pyindent(grep_code)

    process(f_in,begin_code,grep_code,process_code,end_code,exceptions_allowed,delimiter,load_set)
