#!/usr/bin/env python
import sys
import argparse
import csv
import re
import itertools
import io
from jtutils import is_int, str_is_int, str_is_float, to_days, to_years, rand, GroupBy, threewise
from collections import Counter

def readCL():
    """Parse the command line and open the input stream.

    Reads options from sys.argv via argparse. When no --infile is given the
    input is sys.stdin (a warning is written to stderr if stdin is a
    terminal, since nothing appears to be piped in).

    The words "TAB" and the two-character sequence backslash-t are accepted
    as aliases for a real tab character, for both --delimiter and
    --output_delimiter.

    Returns:
        A 10-tuple: (f_in, begin_code, grep_code, process_code, end_code,
        exceptions_allowed, delimiter, output_delimiter, multiline, set).
        f_in is an open file object; the caller owns it.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-f","--infile")
    parser.add_argument("-b","--begin_code",nargs="*")
    parser.add_argument("-g","--grep_code")
    parser.add_argument("-p","--process_code", nargs="*", default="")
    parser.add_argument("-e","--end_code", nargs="*")
    parser.add_argument("-d","--delimiter", default=",")
    parser.add_argument("--output_delimiter",default=",")
    parser.add_argument("-m","--multiline",action="store_true",help="allow the incoming csv to have multiline fields, ie newlines in the middle of fields")
    parser.add_argument("--exceptions_allowed", action="store_true")
    parser.add_argument("--set", help="load a file with no header, storing each line as an element of a set")

    args = parser.parse_args()

    if sys.stdin.isatty() and not args.infile:
        sys.stderr.write("WARNING: pawk using /dev/stdin as default input file (-f) but nothing seems to be piped in..." + "\n")

    if not args.infile:
        f_in = sys.stdin
    else:
        f_in = open(args.infile)

    # A literal tab is awkward to type in a shell, so accept readable aliases.
    # Apply them to the output delimiter as well as the input delimiter so
    # the two options behave the same way.
    tab_aliases = ("TAB", "\\t")
    if args.delimiter in tab_aliases:
        args.delimiter = '\t'
    if args.output_delimiter in tab_aliases:
        args.output_delimiter = '\t'

    return f_in, args.begin_code, args.grep_code, args.process_code, args.end_code, args.exceptions_allowed, args.delimiter, args.output_delimiter, args.multiline, args.set



#####pyindent code#####
def pyindent(string):
    """Expand one-line pseudo-python into newline-separated, indented source.

    The text is cut on newlines, every line is run through _pyindent_iter,
    and the resulting lines are glued back together with newlines.
    """
    source_lines = string.split('\n')
    indented_lines = _pyindent_iter(source_lines)
    return '\n'.join(indented_lines)

def _pyindent_iter(line_iter):
    """Yield properly indented python lines for each one-line snippet.

    Every input line is cut into statements by _split (with lambda bodies
    re-attached by _paste_lambdas) and each statement is emitted on its own
    line, indented four spaces per open block.

    Example input line:
        for i in range(10): if i % 2==0: print i; end; end; print 10
    is cut into
        ["for i in range(10):", "if i % 2==0:", "print i;", "end;", "end;", "print 10"]

    Rules:
      * a statement ending in ":" opens a block (indent += 1 afterwards);
      * the pseudo-statement "end;" closes a block and is not emitted;
      * "elif", "else", "except" and "finally" headers are emitted one level
        shallower, since they continue the enclosing compound statement.

    The indent level carries over from one input line to the next.
    """
    # \b keeps identifiers that merely start with a keyword (e.g. "else_count")
    # from being treated as clause headers.
    continuation_header = re.compile(r"(elif|else|except|finally)\b")
    indent_level = 0
    for raw_line in line_iter:
        statements = list(_paste_lambdas(_split(raw_line)))
        for stmt in statements:
            if stmt == "end;":
                indent_level -= 1
                continue
            if continuation_header.match(stmt):
                indent_level -= 1
            yield ("    "*indent_level + stmt)
            if re.search(":$", stmt):
                indent_level += 1

def _split(s):
    """
    read a string representing python code and
    'print "echo; echo;"'
    """
    out_list = []
    cur_substring = ""
    in_string_type = None
    for las, cur, nex in threewise(s):
        cur_substring += cur
        if not in_string_type:
            if cur == '"' or cur == "'":
                # out_list.append((cur_substring,in_string_type))
                in_string_type = cur
                # cur_substring = cur
            elif (cur == ":" and nex == " "):
                out_list.append(cur_substring.strip())
                cur_substring = ""
            elif (cur == ";"):
                out_list.append(cur_substring.strip())
                cur_substring = ""
        else:
            if (cur == '"' or cur == "'") and las != "\\":
                #out_list.append((cur_substring,in_string_type))
                in_string_type = None
                #cur_substring = ""
    if cur_substring:
        out_list.append(cur_substring.strip())
    return out_list


def _paste_lambdas(match_list):
    """don't want newline after 'lambda x:'
    """
    for las, cur, nex  in threewise(match_list):
        #TODO: replace with regex of exactly the characters allowed in python variable names (instead of strictly alphanumeric)?
        regex = "lambda[ 0-9A-Za-z]*:$"
        if las and re.findall(regex,las):
            continue
        elif re.findall(regex,cur):
            yield cur + " " + nex
        else:
            yield cur

#####end pyindent#####


#csvlist_and_raw: generator yielding a tuple with both the unprocessed row text and the parsed list of fields
def csvlist_and_raw(f_in, delimiter, multiline=False):
    """Yield (raw_text, fields) for every record read from f_in.

    Args:
        f_in: iterable of text lines (e.g. an open file or sys.stdin).
        delimiter: field separator.
        multiline: when true, parse with csv.reader so quoted fields may
            contain newlines; otherwise each physical line is one record
            and is parsed with csv2row.

    Yields:
        tuple: (l, row). In single-line mode l is the input line with
        leading/trailing "\\r" and "\\n" removed. In multiline mode the
        original text is not available, so l is the row re-serialized by
        csv.writer (including the writer's line terminator).
    """
    #default max field size of ~131k crashes at times
    csv.field_size_limit(sys.maxsize)
    if multiline:
        # csv.writer produces text on Python 3 and byte strings on Python 2,
        # so pick the in-memory buffer type that accepts what it writes.
        buffer_type = io.StringIO if sys.version_info[0] >= 3 else io.BytesIO
        reader = csv.reader(f_in, delimiter=delimiter)
        for row in reader:
            output = buffer_type()
            wr = csv.writer(output, delimiter=delimiter)
            wr.writerow(row)
            l = output.getvalue()
            yield l,row
    else:
        for l in f_in:
            l = l.strip("\r\n")
            row = csv2row(l, delimiter=delimiter)
            yield l, row

def csv2row(csv_string, delimiter=",", quote_char='"'):
    """Parse one csv line into a list of fields.

    The line is first cut on every delimiter; some of the resulting pieces
    then have to be recombined, because a quoted field may itself contain
    the delimiter. E.g. '"a,b",c,d' has 4 pieces ('"a', 'b"', 'c', 'd') but
    only three fields.

    Behavior (chosen so that row2csv(csv2row(line)) reproduces the line):
      * a quoted field that spans several pieces is rejoined with the
        delimiter, loses its surrounding quotes and has doubled quotes
        collapsed to single ones;
      * a piece that is completely quoted on its own, e.g. '"a""b"', is kept
        verbatim, quotes included;
      * any other piece is kept verbatim;
      * if an opening quote is never closed, the pieces collected so far are
        emitted as separate fields with the opening quote restored.

    Args:
        csv_string: one line of csv text, without its line terminator.
        delimiter: field separator.
        quote_char: quoting character; assumed to be a single character.

    Returns:
        list of str: the fields of the line.
    """
    q = re.escape(quote_char)
    # Interior of a quoted field: non-quote characters or doubled quotes only.
    interior = "([^{q}]|{q}{q})*".format(q=q)
    # The patterns depend only on quote_char, so build them once per call
    # instead of once per piece.
    #starts and ends with a single quote -- all other quotes come in pairs
    #@example: "a""b"
    #@example: """"
    #@example: ""
    re_start_and_end = re.compile("^{q}{body}{q}$".format(q=q, body=interior))
    #ends with a single quote -- all other quotes come in pairs
    #@example: abcd"
    #@example: ""cd"
    re_end = re.compile("^{body}{q}$".format(q=q, body=interior))
    #starts with a single quote -- all other quotes come in pairs
    #@example: "abcd
    #@example: "ab""
    re_start = re.compile("^{q}{body}$".format(q=q, body=interior))
    doubled_quote = quote_char * 2

    row = []
    #pieces of the quoted field currently being built up
    current_field = []
    inquote = False

    for p in csv_string.split(delimiter):
        if inquote:
            if re_end.search(p):
                inquote = False
                current_field.append(p[:-1])
                # Rejoin with the delimiter that was actually split on and
                # unescape the quote character that is actually in use.
                joined = delimiter.join(current_field)
                row.append(joined.replace(doubled_quote, quote_char))
                current_field = []
            else:
                current_field.append(p)
        elif re_start_and_end.search(p):
            row.append(p)
        elif re_start.search(p):
            inquote = True
            current_field.append(p[1:])
        else:
            row.append(p)
    if inquote:
        #unterminated quote: give the opening quote back to the first piece
        row.append(quote_char + current_field[0])
        row += current_field[1:]
    return row

def row2csv(rout, delimiter = ",", quote_char='"'):
    """Serialize a row into one line of csv text (no line terminator).

    Every element is converted with str(). An element is quoted only when
    it contains the delimiter; in that case each quote_char inside it is
    doubled and the element is wrapped in quote_char. All other elements
    are written verbatim.

    Args:
        rout: iterable of field values.
        delimiter: field separator.
        quote_char: quoting character.

    Returns:
        str: the joined line.
    """
    def add_quotes(r):
        if delimiter in r:
            # Escape the quote character actually used for wrapping.
            r = r.replace(quote_char, quote_char * 2)
            return quote_char + r + quote_char
        else:
            return r
    return delimiter.join([add_quotes(str(r)) for r in rout])

def write_line(rout, delimiter = ","):
    """Write one row to stdout as a csv line followed by a newline.

    The row is serialized by row2csv using the given delimiter.
    """
    out = sys.stdout
    out.write(row2csv(rout, delimiter) + "\n")

#"[""Rusted hardware and rivets"",""Raw hem detail""]"

def proc_field(f):
    """Return f converted with int() when that works, otherwise f unchanged.

    Note that int() truncates floats, so proc_field(3.7) returns 3.

    Only conversion failures (ValueError, TypeError, OverflowError) are
    treated as "not an integer"; other exceptions such as KeyboardInterrupt
    propagate.
    """
    try:
        return int(f)
    except (ValueError, TypeError, OverflowError):
        return f

def gen_grep_code(grep_code):
    """Expand the awk-style /regex/ shorthand into a python expression.

    When grep_code has the form "/PATTERN/" it is replaced by the source
    text of an expression that searches the raw input line, which the
    row loop exposes under the name l, for PATTERN with re.findall.
    Anything else (including None or an empty string) is returned unchanged.

    The pattern is embedded with repr() so that quotes and backslashes in
    it reach the regex engine exactly as typed (e.g. \\b stays a word
    boundary instead of becoming a backspace character).

    Args:
        grep_code: python expression source, "/PATTERN/", or a falsy value.

    Returns:
        The (possibly rewritten) expression source.
    """
    if not grep_code:
        return grep_code
    shorthand = re.findall("^/(.*)/$",grep_code)
    if shorthand:
        pattern = shorthand[0]
        # l is already the line as a string; joining it with "," would
        # interleave commas between its characters.
        grep_code = "re.findall({0!r}, l)".format(pattern)
    return grep_code

# @profile
def process(f_in, begin_code, grep_code, process_code, end_code, exceptions_allowed, delimiter, output_delimiter, multiline, load_set):
    """Run the begin / grep / process / end snippets over every input row.

    Flow:
      1. If load_set is given, read that file into a set named s (one
         stripped line per element).
      2. Compile the snippets: begin/process/end in 'exec' mode, grep in
         'eval' mode. Falsy values are left untouched.
      3. Execute every begin snippet once.
      4. For each (raw line, parsed row) pair from csvlist_and_raw:
         skip the row when grep_code evaluates falsy, otherwise execute
         every process snippet. When no process code was supplied the row
         is written with write_line using output_delimiter.
      5. Execute every end snippet once.

    The snippets are run with exec()/eval() without explicit namespaces, so
    they execute in this function's scope and can read its local names
    (e.g. i, l, r, s). Renaming locals here therefore changes what user
    snippets can refer to.

    Args:
        f_in: input passed straight to csvlist_and_raw.
        begin_code: iterable of source strings, or a falsy value.
        grep_code: source string of an expression, or a falsy value.
        process_code: iterable of source strings, or a falsy value.
        end_code: iterable of source strings, or a falsy value.
        exceptions_allowed: when true, a row whose grep/process code raises
            is skipped (with a single warning on stderr) instead of
            aborting.
        delimiter: input delimiter, passed to csvlist_and_raw.
        output_delimiter: delimiter passed to write_line.
        multiline: passed to csvlist_and_raw.
        load_set: path of a file to load into the set s, or a falsy value.
    """
    # hdr and has_printed_incomplete_line are not used by this function itself.
    # NOTE(review): they may be relied on by user snippets -- confirm before removing.
    hdr = None
    has_exceptions = False
    has_printed_incomplete_line = False
    #jtrigg@20160102 try out only writing when there's no -p option
    # do_write = process_code and ("print" in process_code or "write_line" in process_code)
    # Despite the name, a truthy do_write *suppresses* the automatic
    # write_line below: rows are echoed only when no process code was given.
    do_write = process_code
    if load_set:
        # The file object opened here is never explicitly closed.
        s = set(l.strip() for l in open(load_set))

    # Compile each snippet once up front instead of once per row.
    if begin_code:
        begin_code = [compile(code,'','exec') for code in begin_code]
    if grep_code:
        grep_code = compile(grep_code,'','eval')
    if process_code:
        process_code = [compile(code,'','exec') for code in process_code]
    if end_code:
        end_code = [compile(code,'','exec') for code in end_code]
    if begin_code:
        for code in begin_code:
            # NOTE(review): whether names assigned by a snippet are still
            # visible to later exec() calls depends on the Python version --
            # confirm on the interpreter in use.
            exec(code)


    # i: 0-based record index, l: raw record text, r: list of parsed fields.
    for i,(l,_csvlist) in enumerate(csvlist_and_raw(f_in, delimiter, multiline=multiline)):
        r = _csvlist
        try:
            # print r,process_code
            if grep_code and not eval(grep_code):
                continue
            if process_code:
                for code in process_code:
                    exec(code)
        except:
            # Bare except: anything raised by the grep/process code lands here.
            if not exceptions_allowed:
                raise
            else:
                # Warn only once, then silently skip every failing row.
                if not has_exceptions:
                    sys.stderr.write("WARNING: exception" + '\n')
                    has_exceptions = True
                continue
        if not do_write:
            write_line(r, output_delimiter)

    if end_code:
        for code in end_code:
            exec(code)


def test_csv_identity():
    """Check that row2csv(csv2row(line)) gives back each sample line unchanged.

    The samples cover quoted delimiters, doubled quotes, fully quoted
    fields, stray quotes inside a field and unterminated quotes.
    """
    samples = (
        'a,",",b',
        '["Rusted hardware and rivets","Raw hem detail"]',
        '"[""Rusted hardware and rivets"",""Raw hem detail""]"',
        '"["Rusted hardware and rivets","Raw hem detail"]"',
        '""""',
        '"a","b","c"',
        '"a"b",c,d',
        '"""',
        '""a,"',
    )
    for line in samples:
        round_tripped = row2csv(csv2row(line))
        assert round_tripped == line

if __name__ == "__main__":
    # Script entry point: parse the command line, turn the one-line snippets
    # into indented python source, then stream the input through process().
    f_in, begin_code, grep_code, process_code, end_code, exceptions_allowed, delimiter, output_delimiter, multiline, load_set = readCL()
    #following two lines solve 'Broken pipe' error when piping
    #script output into head
    from signal import signal, SIGPIPE, SIG_DFL
    signal(SIGPIPE,SIG_DFL)

    if begin_code:
        begin_code = [pyindent(c) for c in begin_code]
    if grep_code:
        #preprocess /.*/ syntax
        # Expand the regex shorthand *before* pyindent sees it: pyindent cuts
        # on ";" and ": ", which would break a pattern such as /error: /
        # across lines so that it is no longer recognised as /.../.
        expanded = gen_grep_code(grep_code)
        if expanded == grep_code:
            # Not a /regex/ shorthand: treat it as ordinary snippet code.
            grep_code = pyindent(grep_code)
        else:
            # Generated expression is already valid python; leave it as is.
            grep_code = expanded
    if process_code:
        process_code = [pyindent(c) for c in process_code]
    if end_code:
        end_code = [pyindent(c) for c in end_code]
    process(f_in,begin_code,grep_code,process_code,end_code,exceptions_allowed,delimiter,output_delimiter,multiline,load_set)
