"""Markdown table parser for financial statement transactions.

This parser handles markdown content (typically from Docling PDF extraction)
where transactions appear as markdown tables. It uses regex patterns to
parse transaction rows with date, description, and amount columns.

This is the default fallback parser that works with many statement formats.
"""

import logging
import re
from datetime import datetime
from typing import List, Optional

from statement_processor.core.base_parser import BaseStatementParser
from statement_processor.core.base_transaction import BaseTransaction
from statement_processor.models import Transaction, Statement, StatementMetadata
from statement_processor.parsing_utils import (
    normalize_date,
    normalize_date_with_year,
    parse_amount,
    parse_currency_value,
    extract_statement_year,
)

logger = logging.getLogger(__name__)


class MarkdownTableParser(BaseStatementParser):
    """Parses transactions from markdown tables.

    Works with markdown content extracted by Docling from PDF statements.
    Handles common table formats with date, description, and amount columns.

    This parser is designed to work with many statement formats and serves
    as the default fallback when no specific parser is available.
    """

    def __init__(
        self,
        date_format: str = "%m/%d/%Y",
        default_year: Optional[int] = None,
        strict_validation: bool = False,
    ):
        """Initialize parser with configuration.

        Args:
            date_format: Expected date format string
            default_year: Default year for date parsing (defaults to current year)
            strict_validation: If True, raise errors on validation failures
        """
        self._date_format = date_format
        self._default_year = default_year or datetime.now().year
        self._strict_validation = strict_validation

    @property
    def name(self) -> str:
        """Return the parser name."""
        return "markdown_table"

    def can_parse(self, raw_text: str) -> bool:
        """Check if this parser can handle the document.

        Args:
            raw_text: Extracted text content.

        Returns:
            True if the document contains parseable table structures.
        """
        has_tables = "|" in raw_text and "---" in raw_text
        has_dates = bool(re.search(r"\d{1,2}/\d{1,2}", raw_text))
        has_amounts = bool(re.search(r"\$[\d,]+\.\d{2}", raw_text))
        return has_tables and has_dates and has_amounts

    def parse(self, raw_text: str, source_file: str) -> List[BaseTransaction]:
        """Return only the transactions found in ``raw_text``.

        Delegates to ``parse_statement`` and discards everything except the
        resulting statement's ``transactions`` attribute.

        Args:
            raw_text: Extracted text content.
            source_file: Source identifier passed through to ``parse_statement``.

        Returns:
            The ``transactions`` of the parsed statement.
        """
        return self.parse_statement(raw_text, source_file).transactions

    def parse_statement(self, raw_text: str, source_file: str) -> Statement:
        """Parse raw text into a complete Statement object with metadata.

        If ``extract_statement_year`` finds a year in the text, that year is
        used as the default year while this document is parsed. The override
        is scoped to this call: the instance's configured default year is
        restored afterwards, so a year detected in one document cannot leak
        into a later document parsed with the same parser instance.

        Args:
            raw_text: Extracted text content.
            source_file: Source identifier stored on the statement and passed
                to the transaction parsing step.

        Returns:
            A ``Statement`` built from the source file, the parsed metadata
            and the parsed transactions.
        """
        configured_year = self._default_year
        statement_year = extract_statement_year(raw_text)
        if statement_year:
            # Row parsing reads self._default_year, so override it temporarily.
            self._default_year = statement_year

        try:
            metadata = self._parse_metadata(raw_text)
            transactions = self._parse_transactions(raw_text, source_file)
        finally:
            # Always undo the per-document override, even if parsing raised.
            self._default_year = configured_year

        logger.info(
            "Parsed %d transactions from %s", len(transactions), source_file
        )

        return Statement(
            source_file=source_file,
            metadata=metadata,
            transactions=transactions,
        )

    def parse_with_metadata(
        self, raw_text: str, source_file: str
    ) -> tuple[List[BaseTransaction], dict]:
        """Parse the document and return its transactions plus metadata.

        Args:
            raw_text: Extracted text content.
            source_file: Source identifier passed through to ``parse_statement``.

        Returns:
            A 2-tuple of the statement's ``transactions`` and the result of
            calling ``model_dump()`` on the statement's metadata.
        """
        parsed = self.parse_statement(raw_text, source_file)
        transactions = parsed.transactions
        metadata_dump = parsed.metadata.model_dump()
        return transactions, metadata_dump

    def _parse_transactions(
        self, raw_text: str, source_file: str
    ) -> List[Transaction]:
        """Parse transactions from raw text."""
        transactions: List[Transaction] = []
        lines = raw_text.split("\n")

        for line in lines:
            line = line.strip()
            if not line.startswith("|"):
                continue
            if line.startswith("|--") or line.startswith("|-"):
                continue

            transaction = self._parse_table_row(line, source_file)
            if transaction:
                transactions.append(transaction)

        return transactions

    def _parse_table_row(
        self, line: str, source_file: str
    ) -> Optional[Transaction]:
        """Parse a markdown table row into a Transaction."""
        try:
            parts = [p.strip() for p in line.split("|")]
            if parts and parts[0] == "":
                parts = parts[1:]
            if parts and parts[-1] == "":
                parts = parts[:-1]

            if len(parts) < 4:
                return None

            trans_date_str = parts[0]
            posted_date_str = parts[1] if len(parts) > 1 else ""
            description = parts[2]
            amount_str = parts[-1]

            # Skip header rows
            if "Transaction date" in trans_date_str or "Date posted" in trans_date_str:
                return None
            if "Description" in description and "Amount" in amount_str:
                return None

            if self._is_merged_row(trans_date_str, amount_str):
                logger.warning(f"Skipping merged row: {line[:80]}...")
                return None

            if self._is_partial_description(description):
                logger.warning(f"Skipping partial description: '{description}'")
                return None

            transaction_date = (
                normalize_date_with_year(trans_date_str, self._default_year)
                if trans_date_str
                else None
            )
            posted_date = (
                normalize_date_with_year(posted_date_str, self._default_year)
                if posted_date_str
                else None
            )

            primary_date = transaction_date or posted_date
            if not primary_date:
                return None

            amount = parse_amount(amount_str)
            if amount is None:
                return None

            return Transaction(
                date=primary_date,
                transaction_date=transaction_date,
                posted_date=posted_date,
                description=description,
                amount=amount,
                source_file=source_file,
            )

        except Exception as e:
            logger.warning(f"Failed to parse transaction line: {line} - {e}")
            return None

    def _is_merged_row(self, date_str: str, amount_str: str) -> bool:
        """Detect if a row contains multiple merged transactions."""
        date_pattern = r"\d{1,2}/\d{1,2}(?:/\d{2,4})?\s+\d{1,2}/\d{1,2}"
        if re.search(date_pattern, date_str):
            return True
        amount_pattern = r"\$[\d,]+\.?\d*-?\s+\$[\d,]+\.?\d*"
        if re.search(amount_pattern, amount_str):
            return True
        return False

    def _is_partial_description(self, description: str) -> bool:
        """Detect if a description is likely a partial/malformed extraction."""
        if not description:
            return False
        desc_upper = description.upper().strip()
        location_only = re.match(r"^[A-Z\s]+\s+[A-Z]{2}$", desc_upper)
        if location_only and len(desc_upper.split()) <= 3:
            return True
        phone_only = re.match(r"^\d{3}-\d{3}-\d{4}\s+[A-Z]{2}$", desc_upper)
        if phone_only:
            return True
        return False

    def _parse_metadata(self, raw_text: str) -> StatementMetadata:
        """Extract statement-level fields from the text with regex searches.

        All searches are case-insensitive. Fields whose pattern does not
        match are left at whatever ``StatementMetadata()`` initialised them to.

        Populated when found: ``account_number_last4``, the statement period
        start/end, ``payment_due_date``, ``minimum_payment_due``, and in
        ``extra`` either ``"purchases"`` or, failing that, ``"new_charges"``.

        Args:
            raw_text: Extracted text content.

        Returns:
            A ``StatementMetadata`` carrying every field that matched.
        """

        def find(pattern: str) -> Optional[re.Match]:
            # Every metadata lookup is a case-insensitive search over the text.
            return re.search(pattern, raw_text, re.IGNORECASE)

        metadata = StatementMetadata()

        if account := find(r"Account\s+number\s+ending\s+in[.\s]*(\d{4})"):
            metadata.account_number_last4 = account.group(1)

        if period := find(
            r"Statement\s+period[:\s]+(\d{1,2}/\d{1,2}/\d{2,4})\s*[-–]\s*(\d{1,2}/\d{1,2}/\d{2,4})"
        ):
            period_start, period_end = period.group(1, 2)
            metadata.statement_period_start = normalize_date(period_start)
            metadata.statement_period_end = normalize_date(period_end)

        if due_date := find(
            r"Payment\s+due\s+date[:\s]*(\d{1,2}/\d{1,2}/\d{2,4})"
        ):
            metadata.payment_due_date = normalize_date(due_date.group(1))

        if min_payment := find(
            r"Minimum\s+(?:revolving\s+)?payment\s+due[:\s]*\$?([\d,]+\.?\d*)"
        ):
            metadata.minimum_payment_due = parse_currency_value(
                min_payment.group(1)
            )

        # Purchases total, kept for validation.
        # First choice is the table layout: | + Purchases | $2,155.61 |
        if table_purchases := find(r"\+\s*Purchases\s*\|\s*\$?([\d,]+\.?\d*)"):
            metadata.extra["purchases"] = parse_currency_value(
                table_purchases.group(1)
            )

        # Second choice is the inline layout: Purchases: $2,155.61
        if "purchases" not in metadata.extra and (
            inline_purchases := find(r"Purchases[:\s]+\$?([\d,]+\.?\d*)")
        ):
            metadata.extra["purchases"] = parse_currency_value(
                inline_purchases.group(1)
            )

        # Last resort: "New Charges" / "Total New Charges", stored under its
        # own key rather than as "purchases".
        if "purchases" not in metadata.extra and (
            new_charges := find(
                r"(?:Total\s+)?New\s+Charges[:\s]*\+?\s*\$?([\d,]+\.?\d*)"
            )
        ):
            metadata.extra["new_charges"] = parse_currency_value(
                new_charges.group(1)
            )

        return metadata
