"""RegexVendorStrategy for grouping transactions by regex pattern matching.

This strategy allows defining vendor patterns using regular expressions,
enabling matching of transactions with variable suffixes like:
- "Audible*501CY3Y03 Amzn.com/bill NJ"
- "Audible*CY7IP98G3 Amzn.com/bill NJ"

All matching transactions are grouped under a canonical vendor name.
"""

import logging
import re
from dataclasses import dataclass
from typing import Dict, List, Tuple

import pandas as pd

from statement_processor.analytics.clustering import ClusteringStrategy, TransactionCluster

logger = logging.getLogger(__name__)


@dataclass
class VendorPattern:
    """A regex pattern for matching vendor transactions.

    The pattern is validated at construction time by compiling it with the
    given flags; the compiled object itself is not stored on the instance.

    Attributes:
        pattern: Regex pattern to match against transaction descriptions.
        vendor_name: Canonical vendor name for matched transactions.
        flags: Optional regex flags (default: case-insensitive).

    Raises:
        ValueError: If ``pattern`` is not a valid regular expression. The
            underlying ``re.error`` is attached as ``__cause__``.
    """

    pattern: str
    vendor_name: str
    flags: int = re.IGNORECASE

    def __post_init__(self) -> None:
        """Validate the regex by compiling it; the result is discarded."""
        try:
            re.compile(self.pattern, self.flags)
        except re.error as e:
            # Chain explicitly so the traceback shows the regex error as the
            # direct cause rather than as an incidental error during handling.
            raise ValueError(f"Invalid regex pattern '{self.pattern}': {e}") from e


class RegexVendorStrategy(ClusteringStrategy):
    """Groups transactions matching regex patterns under canonical vendor names.

    This strategy is useful for vendors with variable transaction descriptions,
    such as subscription services that append order IDs or reference numbers.

    Patterns are tried in the order they were added; the first pattern that
    matches a transaction's description decides its vendor.

    Example:
        strategy = RegexVendorStrategy()
        strategy.add_pattern(r"Audible\\*.*", "Audible")
        strategy.add_pattern(r"NETFLIX\\.COM.*", "Netflix")
    """

    def __init__(self) -> None:
        """Initialize with empty pattern list."""
        self._patterns: List[VendorPattern] = []
        # Keyed by (pattern, flags): the same pattern text registered with
        # different flags must not share (or overwrite) a compiled regex.
        self._compiled: Dict[Tuple[str, int], re.Pattern] = {}

    @property
    def name(self) -> str:
        """Return the strategy name.

        Returns:
            The string "regex_vendor".
        """
        return "regex_vendor"

    def add_pattern(
        self,
        pattern: str,
        vendor_name: str,
        flags: int = re.IGNORECASE,
    ) -> None:
        """Add a vendor pattern for matching.

        Args:
            pattern: Regex pattern to match transaction descriptions.
            vendor_name: Canonical vendor name for matched transactions.
            flags: Regex flags (default: case-insensitive).

        Raises:
            ValueError: If the pattern is invalid regex.
        """
        # Constructing the VendorPattern validates the regex (raises ValueError)
        # before anything is registered.
        vp = VendorPattern(pattern=pattern, vendor_name=vendor_name, flags=flags)
        self._patterns.append(vp)
        self._compiled[(pattern, flags)] = re.compile(pattern, flags)
        logger.debug("Added vendor pattern: '%s' -> '%s'", pattern, vendor_name)

    def _matcher_for(self, vp: VendorPattern) -> re.Pattern:
        """Return the compiled regex for ``vp``, compiling and caching on a miss."""
        key = (vp.pattern, vp.flags)
        compiled = self._compiled.get(key)
        if compiled is None:
            compiled = re.compile(vp.pattern, vp.flags)
            self._compiled[key] = compiled
        return compiled

    def cluster(self, transactions: pd.DataFrame) -> List[TransactionCluster]:
        """Group transactions by regex pattern matching.

        Each row's ``description`` value is converted with ``str()`` and
        searched (``re.search``, so a match anywhere in the text counts)
        against the registered patterns in insertion order. A row joins the
        cluster of the first pattern that matches, with membership weight 1.0;
        rows that match no pattern are left out.

        Args:
            transactions: DataFrame with columns: date, description, amount.
                Only ``description`` is read here; if the column is absent,
                every description is treated as the empty string.

        Returns:
            List of TransactionCluster objects, one per vendor name that had
            at least one match, keyed by the DataFrame index labels.
            Empty list if no patterns defined or no matches found.
        """
        if transactions.empty or not self._patterns:
            return []

        # Resolve every compiled regex once, outside the per-row loop.
        matchers = [(self._matcher_for(vp), vp.vendor_name) for vp in self._patterns]

        vendor_matches: Dict[str, Dict[int, float]] = {}
        for idx, row in transactions.iterrows():
            description = str(row.get("description", ""))

            for compiled, vendor_name in matchers:
                if compiled.search(description):
                    vendor_matches.setdefault(vendor_name, {})[idx] = 1.0
                    break  # First matching pattern wins

        # Entries in vendor_matches are only created when a row matched, so
        # every membership dict is non-empty.
        return [
            TransactionCluster(
                memberships=memberships,
                label=f"vendor:{vendor_name}",
                metadata={
                    "vendor_name": vendor_name,
                    "match_strategy": "regex",
                },
            )
            for vendor_name, memberships in vendor_matches.items()
        ]
