"""Statement Processor - Main orchestrator for the processing pipeline.

This module provides the StatementProcessor class that coordinates all
components of the statement processing pipeline.
"""

import logging
from enum import Enum
from pathlib import Path
from typing import List, Optional

import pandas as pd

from statement_processor.analytics.vendor_clusterer import VendorClusterer
from statement_processor.core import auto_detect_parser, get_parser_by_name
from statement_processor.extraction import PDFScanner, PdfMarkdownExtractor
from statement_processor.models import ProcessingResult, Statement, Transaction
from statement_processor.validation import StatementValidator, ValidationError

logger = logging.getLogger(__name__)


class ExtractionMode(Enum):
    """Extraction mode for PDF processing.

    Each member's value is the lowercase string form of its name.
    """

    # NOTE(review): StatementProcessor in this module stores the selected mode
    # but none of its visible methods branch on it - confirm where STRUCTURED
    # is actually wired up.
    TEXT = "text"  # PdfMarkdownExtractor + parser (regex-based)
    STRUCTURED = "structured"  # StructuredExtractor (Pydantic templates)


class StatementProcessor:
    """Orchestrates the full statement processing pipeline.

    This class coordinates all components to:
    1. Scan directories for PDF files
    2. Extract text from PDFs using Docling
    3. Parse transactions using auto-detected or specified parser
    4. Validate each parsed statement
    5. Cluster transactions by vendor and export results
    """

    def __init__(
        self,
        input_dirs: List[str],
        output_dir: str,
        debug_dir: Optional[str] = None,
        extraction_mode: ExtractionMode = ExtractionMode.TEXT,
        parser_name: Optional[str] = None,
        strict: bool = True,
        tolerance: float = 0.0,
    ):
        """Initialize processor with configuration.

        Args:
            input_dirs: Input directory paths containing PDF statements. Any
                iterable of paths is accepted; a single ``str`` or ``Path``
                is treated as a one-element list.
            output_dir: Output directory path for CSV files
            debug_dir: Optional debug directory for intermediate files
            extraction_mode: Extraction mode - TEXT or STRUCTURED
            parser_name: Optional parser name to use (auto-detect if None)
            strict: If True, fail on validation errors (default True)
            tolerance: Tolerance for validation comparisons in dollars (default 0)
        """
        # A lone path would otherwise be iterated character by character,
        # yielding one bogus "directory" per letter.
        if isinstance(input_dirs, (str, Path)):
            input_dirs = [input_dirs]

        self._input_dirs = [Path(d) for d in input_dirs]
        self._output_dir = Path(output_dir)
        self._debug_dir = Path(debug_dir) if debug_dir else None
        # NOTE(review): the mode is stored, but no method visible in this
        # class reads it; extraction always goes through the markdown
        # extractor created below - confirm STRUCTURED is handled elsewhere.
        self._extraction_mode = extraction_mode
        self._parser_name = parser_name
        self._strict = strict
        self._tolerance = tolerance

        self._markdown_extractor = PdfMarkdownExtractor(debug_dir=self._debug_dir)
        self._vendor_clusterer = VendorClusterer()
        self._validator = StatementValidator(tolerance=tolerance, strict=strict)

        # Count the materialized list so generators and other non-sized
        # iterables work as input.
        logger.info(
            "StatementProcessor initialized with %d input directories",
            len(self._input_dirs),
        )

    @property
    def input_dirs(self) -> List[Path]:
        """Get the list of input directories.

        Returns:
            The ``Path`` objects built from the ``input_dirs`` constructor
            argument. This is the processor's own list, not a copy.
        """
        return self._input_dirs

    @property
    def output_dir(self) -> Path:
        """Get the output directory.

        Returns:
            The ``Path`` built from the ``output_dir`` constructor argument.
        """
        return self._output_dir

    def process(self) -> ProcessingResult:
        """Run full processing pipeline and return results.

        Each input directory is processed independently: a failure in one
        directory is recorded in ``result.errors`` and does not stop the
        others. When any transactions were parsed, a vendor summary is
        written to ``vendor_amounts.csv`` inside the output directory; an
        export failure is likewise recorded as an error rather than raised.

        Returns:
            ProcessingResult with statistics and any errors encountered
        """
        result = ProcessingResult()
        all_dataframes: List[pd.DataFrame] = []
        all_statements: List[Statement] = []

        for input_dir in self._input_dirs:
            logger.info("Processing directory: %s", input_dir)

            try:
                df, statements, errors = self._process_directory(input_dir)
            except Exception as e:
                error_msg = f"Failed to process directory {input_dir}: {e}"
                # logger.exception keeps the traceback for diagnosis
                logger.exception(error_msg)
                result.errors.append(error_msg)
                continue

            if not df.empty:
                all_dataframes.append(df)
            all_statements.extend(statements)
            result.errors.extend(errors)

        # Combine all datasets
        if all_dataframes:
            combined_df = self._combine_datasets(all_dataframes)
            logger.info("Combined %d total transactions", len(combined_df))
        else:
            combined_df = pd.DataFrame()
            logger.warning("No transactions found in any directory")

        # Export vendor summary
        if not combined_df.empty:
            try:
                # Make sure the destination exists before the exporter writes
                self._output_dir.mkdir(parents=True, exist_ok=True)
                vendor_df = self._vendor_clusterer.summarize(combined_df)
                vendor_path = self._output_dir / "vendor_amounts.csv"
                self._vendor_clusterer.export(vendor_df, vendor_path)
                logger.info(
                    "Exported %d vendor clusters to %s", len(vendor_df), vendor_path
                )
            except Exception as e:
                error_msg = f"Failed to export vendor CSV: {e}"
                logger.exception(error_msg)
                result.errors.append(error_msg)

            result.total_transactions = len(combined_df)

        # Calculate file statistics
        result.statements = all_statements
        result.total_files = sum(
            self._count_pdf_files(d) for d in self._input_dirs
        )
        result.successful_files = len(all_statements)
        result.failed_files = result.total_files - result.successful_files

        logger.info(
            "Processing complete: %s/%s files, %s transactions",
            result.successful_files,
            result.total_files,
            result.total_transactions,
        )

        return result

    def _count_pdf_files(self, input_dir: Path) -> int:
        """Count the PDF files in one input directory for the statistics.

        A directory that fails validation, or whose scan raises, counts as
        zero files so that the already-assembled result is never lost at the
        reporting stage.

        Args:
            input_dir: Directory to scan

        Returns:
            Number of files reported by the scanner, or 0 if the directory
            cannot be scanned
        """
        try:
            scanner = PDFScanner(str(input_dir))
            if not scanner.validate_directory():
                return 0
            return len(scanner.scan())
        except Exception as e:
            logger.warning("Could not count PDF files in %s: %s", input_dir, e)
            return 0

    def _process_directory(
        self, input_dir: Path
    ) -> tuple[pd.DataFrame, List[Statement], List[str]]:
        """Process a single directory.

        Scans the directory for PDFs, extracts markdown from them, then
        parses and validates each extracted document. A file that fails at
        any step is reported in the returned error list and contributes
        neither a Statement nor any transaction rows.

        Args:
            input_dir: Path to directory containing PDF files

        Returns:
            Tuple of (DataFrame, list of Statements, list of error messages).
            The DataFrame holds one row per parsed transaction, built from
            each transaction's ``to_dict()``, and is empty when nothing was
            parsed.
        """
        scanner = PDFScanner(str(input_dir))
        if not scanner.validate_directory():
            error_msg = f"Invalid directory: {input_dir}"
            logger.error(error_msg)
            return pd.DataFrame(), [], [error_msg]

        pdf_files = scanner.scan()
        if not pdf_files:
            return pd.DataFrame(), [], []

        # Still a function-scope import, but resolved once per directory
        # rather than once per file inside the loop.
        from statement_processor.models import StatementMetadata

        statements: List[Statement] = []
        errors: List[str] = []
        transactions: List[dict] = []

        # Extract markdown from PDFs
        extracted_docs = self._markdown_extractor.extract(pdf_files)

        # Parse transactions from each PDF
        for pdf_path, doc in extracted_docs.items():
            if not doc.markdown:
                error_msg = f"No text extracted from {pdf_path}"
                logger.warning(error_msg)
                errors.append(error_msg)
                continue

            try:
                parser = self._select_parser(doc.markdown)

                # Parse transactions
                parsed_transactions, metadata = parser.parse_with_metadata(
                    doc.markdown, pdf_path.name
                )

                # The parser may hand back metadata as a plain dict or as an
                # already-built object; normalize the dict case.
                stmt_metadata = (
                    StatementMetadata(**metadata)
                    if isinstance(metadata, dict)
                    else metadata
                )
                statement = Statement(
                    source_file=pdf_path.name,
                    metadata=stmt_metadata,
                    transactions=[
                        Transaction(
                            date=t.date,
                            description=t.description,
                            amount=t.amount,
                            source_file=pdf_path.name,
                        )
                        for t in parsed_transactions
                    ],
                )

                # Validate statement
                self._validator.validate(statement)

                # Convert every row before committing anything, so a failure
                # here cannot leave a half-recorded file behind.
                rows = [tx.to_dict() for tx in parsed_transactions]
                tx_count = len(parsed_transactions)

            except ValidationError as ve:
                # Skip this file on validation failure in strict mode
                error_msg = str(ve)
                logger.error(error_msg)
                errors.append(error_msg)
                continue
            except Exception as e:
                error_msg = f"Failed to parse {pdf_path.name}: {e}"
                # logger.exception keeps the traceback for diagnosis
                logger.exception(error_msg)
                errors.append(error_msg)
                continue

            statements.append(statement)
            transactions.extend(rows)
            logger.info(
                "Parsed %s transactions from %s", tx_count, pdf_path.name
            )

        df = pd.DataFrame(transactions) if transactions else pd.DataFrame()
        return df, statements, errors

    def _select_parser(self, markdown: str):
        """Return the parser to use for one document.

        Uses the parser named at construction time when one was given,
        otherwise auto-detects from the document text.

        Args:
            markdown: Extracted markdown text of the document

        Returns:
            The parser object to call ``parse_with_metadata`` on

        Raises:
            ValueError: If the named parser does not exist or no parser
                could be auto-detected
        """
        if self._parser_name:
            parser = get_parser_by_name(self._parser_name)
            if not parser:
                raise ValueError(f"Parser not found: {self._parser_name}")
            return parser

        parser = auto_detect_parser(markdown)
        if not parser:
            raise ValueError("No parser found for document")
        return parser

    def _combine_datasets(self, dataframes: List[pd.DataFrame]) -> pd.DataFrame:
        """Combine multiple DataFrames into one.

        Args:
            dataframes: List of DataFrames to combine

        Returns:
            Combined DataFrame
        """
        if not dataframes:
            return pd.DataFrame()

        non_empty_dfs = [df for df in dataframes if not df.empty]

        if not non_empty_dfs:
            return pd.DataFrame()

        if len(non_empty_dfs) == 1:
            return non_empty_dfs[0].copy()

        return pd.concat(non_empty_dfs, ignore_index=True)
