"""Tests for VendorClusterer class."""

import pandas as pd
import pytest
from pathlib import Path
import tempfile

from hypothesis import given, settings, HealthCheck
from hypothesis import strategies as st

from statement_processor.analytics.vendor_clusterer import VendorClusterer
from statement_processor.analytics.vendor_cluster import VendorCluster


class TestVendorClusterer:
    """Example-based tests for VendorClusterer.

    Covers ``cluster``, ``summarize``, ``register_strategy`` and ``export``
    using small hand-written DataFrames with ``date``, ``description`` and
    ``amount`` columns.
    """

    # Column order asserted for every DataFrame returned by ``summarize``.
    # Kept in one place so the empty and non-empty cases cannot drift apart.
    SUMMARY_COLUMNS = [
        "vendor_name",
        "transaction_count",
        "total_amount",
        "earliest_date",
        "latest_date",
    ]

    def test_empty_dataframe_returns_empty_list(self):
        """Empty input returns empty cluster list."""
        clusterer = VendorClusterer()
        df = pd.DataFrame(columns=["date", "description", "amount"])

        result = clusterer.cluster(df)

        assert result == []

    def test_cluster_groups_identical_descriptions(self):
        """Transactions with same description are grouped together."""
        # Disable regex patterns to test pure exact matching
        clusterer = VendorClusterer(use_regex_patterns=False)
        df = pd.DataFrame({
            "date": ["2024-01-01", "2024-01-15", "2024-02-01"],
            "description": ["AMAZON.COM", "AMAZON.COM", "NETFLIX"],
            "amount": [50.0, 30.0, 15.99],
        })

        clusters = clusterer.cluster(df)

        assert len(clusters) == 2
        vendor_names = {c.vendor_name for c in clusters}
        assert vendor_names == {"AMAZON.COM", "NETFLIX"}

    def test_cluster_calculates_correct_totals(self):
        """Cluster total_amount equals sum of transaction amounts."""
        clusterer = VendorClusterer()
        df = pd.DataFrame({
            "date": ["2024-01-01", "2024-01-15", "2024-02-01"],
            "description": ["AMAZON.COM", "AMAZON.COM", "AMAZON.COM"],
            "amount": [50.0, 30.0, 20.0],
        })

        clusters = clusterer.cluster(df)

        assert len(clusters) == 1
        # Compare with a tolerance: the total is a float sum, so the test
        # should not depend on the exact order of accumulation.
        assert clusters[0].total_amount == pytest.approx(100.0)
        assert clusters[0].transaction_count == 3

    def test_cluster_calculates_correct_date_range(self):
        """Cluster date range spans earliest to latest transaction."""
        clusterer = VendorClusterer()
        # Rows are deliberately out of chronological order.
        df = pd.DataFrame({
            "date": ["2024-03-15", "2024-01-01", "2024-02-10"],
            "description": ["VENDOR", "VENDOR", "VENDOR"],
            "amount": [10.0, 20.0, 30.0],
        })

        clusters = clusterer.cluster(df)

        assert len(clusters) == 1
        assert clusters[0].earliest_date == "2024-01-01"
        assert clusters[0].latest_date == "2024-03-15"

    def test_summarize_returns_dataframe_with_correct_columns(self):
        """Summarize returns DataFrame with expected columns."""
        clusterer = VendorClusterer()
        df = pd.DataFrame({
            "date": ["2024-01-01"],
            "description": ["VENDOR"],
            "amount": [100.0],
        })

        summary = clusterer.summarize(df)

        assert list(summary.columns) == self.SUMMARY_COLUMNS

    def test_summarize_empty_returns_empty_dataframe_with_columns(self):
        """Empty input returns empty DataFrame with correct columns."""
        clusterer = VendorClusterer()
        df = pd.DataFrame(columns=["date", "description", "amount"])

        summary = clusterer.summarize(df)

        assert len(summary) == 0
        assert list(summary.columns) == self.SUMMARY_COLUMNS

    def test_register_strategy_adds_to_runner(self):
        """Additional strategies can be registered without raising."""
        from statement_processor.analytics.clustering import ClusteringStrategy

        class DummyStrategy(ClusteringStrategy):
            """Minimal strategy that never produces any cluster."""

            @property
            def name(self) -> str:
                return "dummy"

            def cluster(self, transactions):
                return []

        clusterer = VendorClusterer()

        # The test passes when registration completes without an exception;
        # any error raised here fails the test on its own.
        clusterer.register_strategy(DummyStrategy(), weight=0.5)

    def test_export_creates_csv_file(self):
        """Export writes CSV file with correct content."""
        clusterer = VendorClusterer()
        summary_df = pd.DataFrame({
            "vendor_name": ["AMAZON"],
            "transaction_count": [3],
            "total_amount": [100.0],
            "earliest_date": ["2024-01-01"],
            "latest_date": ["2024-03-01"],
        })

        # The temporary directory (and the CSV in it) is removed on exit.
        with tempfile.TemporaryDirectory() as tmpdir:
            output_path = Path(tmpdir) / "vendor_summary.csv"
            clusterer.export(summary_df, output_path)

            assert output_path.exists()
            # Round-trip through pandas to check the written content.
            loaded = pd.read_csv(output_path)
            assert len(loaded) == 1
            assert loaded.iloc[0]["vendor_name"] == "AMAZON"


# Hypothesis strategies for generating transaction data.

# Inclusive bounds of the generated transaction dates.
_EARLIEST_DAY = pd.Timestamp("2020-01-01").date()
_LATEST_DAY = pd.Timestamp("2025-12-31").date()

# Dates are rendered as "YYYY-MM-DD" strings, the format used by the
# example-based tests in this module.
date_strategy = st.dates(min_value=_EARLIEST_DAY, max_value=_LATEST_DAY).map(
    lambda day: day.strftime("%Y-%m-%d")
)

# A fixed pool of vendor names: sampling from it generates faster than
# building arbitrary text.
_VENDOR_NAMES = [
    "AMAZON.COM", "NETFLIX", "SPOTIFY", "WALMART", "TARGET",
    "COSTCO", "STARBUCKS", "UBER", "LYFT", "DOORDASH",
    "GRUBHUB", "APPLE.COM", "GOOGLE", "MICROSOFT", "ADOBE",
    "VENDOR_A", "VENDOR_B", "VENDOR_C", "VENDOR_D", "VENDOR_E",
]
description_strategy = st.sampled_from(_VENDOR_NAMES)

# Finite amounts between 0.01 and 10000.0, rounded to two decimals.
_raw_amounts = st.floats(
    min_value=0.01,
    max_value=10000.0,
    allow_nan=False,
    allow_infinity=False,
)
amount_strategy = _raw_amounts.map(lambda value: round(value, 2))

# One transaction row: a dict with the three columns the tests feed to
# pd.DataFrame.
transaction_strategy = st.fixed_dictionaries(
    {
        "date": date_strategy,
        "description": description_strategy,
        "amount": amount_strategy,
    }
)


class TestVendorClustererProperties:
    """Property-based checks of the statistics reported by VendorClusterer.

    **Feature: vendor-analysis, Property 1: Cluster Summary Correctness**
    """

    @given(st.lists(transaction_strategy, min_size=1, max_size=20))
    @settings(max_examples=100, suppress_health_check=[HealthCheck.too_slow])
    def test_cluster_summary_correctness(self, transactions):
        """Property test: every cluster's summary matches its own rows.

        **Feature: vendor-analysis, Property 1: Cluster Summary Correctness**
        **Validates: Requirements 2.1, 2.2, 2.3**

        Each returned cluster is checked against the rows of the input
        DataFrame selected by its ``transaction_indices``:
        - transaction_count is the number of those rows
        - total_amount is the sum of their amounts (compared at 2 decimals)
        - earliest_date is their minimum date
        - latest_date is their maximum date
        """
        frame = pd.DataFrame(transactions)

        for group in VendorClusterer().cluster(frame):
            # Rows of the input that the clusterer assigned to this cluster.
            row_labels = group.transaction_indices
            members = frame.loc[row_labels]

            # Count must agree with the number of member rows.
            member_count = len(row_labels)
            assert group.transaction_count == member_count, (
                f"transaction_count ({group.transaction_count}) != "
                f"len(transaction_indices) ({member_count})"
            )

            # Totals are compared after rounding to two decimals so that
            # float summation order does not matter.
            expected_total = round(members["amount"].sum(), 2)
            actual_total = round(group.total_amount, 2)
            assert actual_total == expected_total, (
                f"total_amount ({actual_total}) != sum of amounts ({expected_total})"
            )

            # Dates are "YYYY-MM-DD" strings, so string min/max is also
            # chronological min/max.
            dates = members["date"]

            expected_earliest = str(dates.min())
            assert group.earliest_date == expected_earliest, (
                f"earliest_date ({group.earliest_date}) != "
                f"min date ({expected_earliest})"
            )

            expected_latest = str(dates.max())
            assert group.latest_date == expected_latest, (
                f"latest_date ({group.latest_date}) != "
                f"max date ({expected_latest})"
            )
