"""VendorClusterer for grouping transactions by vendor.

This module provides the VendorClusterer class that orchestrates clustering
strategies to group transactions by merchant/vendor.
"""

import logging
from pathlib import Path
from typing import List

import pandas as pd

from statement_processor.analytics.cluster_runner import ClusterRunner
from statement_processor.analytics.clustering import ClusteringStrategy
from statement_processor.analytics.strategies.exact_match import ExactMatchStrategy
from statement_processor.analytics.strategies.regex_vendor import RegexVendorStrategy
from statement_processor.analytics.vendor_cluster import VendorCluster
from statement_processor.analytics.vendor_patterns import VENDOR_PATTERNS

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class VendorClusterer:
    """Clusters transactions by vendor using the ClusterRunner framework.

    This class orchestrates clustering strategies to group transactions by
    merchant/vendor. ExactMatchStrategy is always registered as the baseline;
    a RegexVendorStrategy built from VENDOR_PATTERNS is registered ahead of
    it unless disabled, and callers may register further strategies.

    Attributes:
        _runner: ClusterRunner instance managing registered strategies.
        _threshold: Minimum confidence threshold for cluster membership.
    """

    # Single source of truth for the summary schema, so the empty and the
    # populated frames returned by summarize() cannot drift apart.
    _SUMMARY_COLUMNS = (
        "vendor_name",
        "transaction_count",
        "total_amount",
        "earliest_date",
        "latest_date",
    )

    def __init__(self, threshold: float = 0.7, use_regex_patterns: bool = True) -> None:
        """Initialize with clustering threshold.

        Args:
            threshold: Probability threshold for grouping (default 0.7).
            use_regex_patterns: Whether to load patterns from vendor_patterns
                (default True). When enabled, the regex strategy is
                registered before ExactMatchStrategy.
        """
        self._runner = ClusterRunner()
        self._threshold = threshold

        if use_regex_patterns:
            self._runner.register_strategy(self._build_regex_strategy(), weight=1.0)

        self._runner.register_strategy(ExactMatchStrategy(), weight=1.0)

    def _build_regex_strategy(self) -> RegexVendorStrategy:
        """Build RegexVendorStrategy from VENDOR_PATTERNS config.

        Returns:
            RegexVendorStrategy with every (pattern, vendor name) entry of
            VENDOR_PATTERNS added to it.
        """
        strategy = RegexVendorStrategy()
        for pattern, vendor_name in VENDOR_PATTERNS.items():
            strategy.add_pattern(pattern, vendor_name)
        # Lazy %-style arguments: the message is only rendered when DEBUG
        # logging is actually enabled.
        logger.debug("Loaded %d vendor patterns", len(VENDOR_PATTERNS))
        return strategy

    def register_strategy(
        self, strategy: ClusteringStrategy, weight: float = 1.0
    ) -> None:
        """Register an additional vendor-matching strategy.

        Args:
            strategy: Strategy conforming to ClusteringStrategy interface.
            weight: Weight for this strategy in combination (default 1.0).
        """
        self._runner.register_strategy(strategy, weight)

    def cluster(self, df: pd.DataFrame) -> List[VendorCluster]:
        """Cluster transactions by vendor.

        Only clusters whose label starts with ``"vendor:"`` are considered,
        and a cluster with no indices above the threshold is skipped.

        Args:
            df: DataFrame with columns: date, description, amount.

        Returns:
            List of VendorCluster objects, one per identified vendor. An
            empty DataFrame yields an empty list without running any strategy.
        """
        if df.empty:
            return []

        transaction_clusters = self._runner.run(df, min_confidence=self._threshold)

        result: List[VendorCluster] = []
        for tc in transaction_clusters:
            if not tc.label.startswith("vendor:"):
                continue

            indices = tc.indices_above_threshold(self._threshold)
            if not indices:
                continue

            # Prefer the name supplied in the cluster metadata; otherwise
            # fall back to the description of the first member row.
            vendor_name = tc.metadata.get("vendor_name")
            if vendor_name is None:
                vendor_name = df.loc[indices[0], "description"]

            cluster_df = df.loc[indices]
            dates = cluster_df["date"]

            result.append(
                VendorCluster(
                    vendor_name=vendor_name,
                    transaction_indices=indices,
                    transaction_count=len(indices),
                    total_amount=round(float(cluster_df["amount"].sum()), 2),
                    earliest_date=str(dates.min()),
                    latest_date=str(dates.max()),
                )
            )

        return result

    def summarize(self, df: pd.DataFrame) -> pd.DataFrame:
        """Cluster transactions and return summary DataFrame.

        Args:
            df: DataFrame with columns: date, description, amount.

        Returns:
            DataFrame with columns: vendor_name, transaction_count,
            total_amount, earliest_date, latest_date. When no clusters are
            found the frame has these columns and no rows.
        """
        columns = list(self._SUMMARY_COLUMNS)
        clusters = self.cluster(df)

        if not clusters:
            return pd.DataFrame(columns=columns)

        # Each summary column is read from the VendorCluster attribute of
        # the same name.
        rows = [{col: getattr(c, col) for col in columns} for c in clusters]
        return pd.DataFrame(rows, columns=columns)

    def export(self, summary_df: pd.DataFrame, output_path: Path) -> None:
        """Export vendor summary to CSV file.

        Missing parent directories are created. The file is written as
        UTF-8 without the DataFrame index.

        Args:
            summary_df: DataFrame from summarize() method.
            output_path: Path to output CSV file. A plain string path is
                also accepted and converted to a Path.
        """
        # Coerce so callers passing a str do not fail on ``.parent``.
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        summary_df.to_csv(output_path, index=False, encoding="utf-8")
        logger.info("Exported vendor summary to %s", output_path)
