"""
FotMob Scraper
Scrapes match links from FotMob for La Liga 2017-2018 season based on specific dates and teams.
"""

import pandas as pd
from bs4 import BeautifulSoup
import re
import time
import unicodedata
from typing import Dict, Optional, List
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException

# Configure logging.
# NOTE: basicConfig() runs at import time, so importing this module configures
# the root logger (INFO level, timestamped format) as a side effect.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
# Module-level logger shared by every method of FotMobScraper.
logger = logging.getLogger(__name__)

# --- Constants & polite crawling settings ---
# Browser-like request headers. Within FotMobScraper only the "User-Agent"
# entry is read (it is passed to headless Chrome as a command-line argument);
# the remaining entries are stored on the instance but not otherwise used there.
REQUEST_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) "
        "Gecko/20100101 Firefox/131.0"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "DNT": "1",
}
# Pause, in seconds, between consecutive page loads (listing pages and match pages).
REQUEST_SLEEP_SECONDS = 6


class FotMobScraper:
    """Scraper for FotMob match links"""

    def __init__(self):
        """
        Initialize the scraper.

        Args:
            headers: Request headers to use
            sleep_seconds: Seconds to wait between requests
        """
        self.headers = REQUEST_HEADERS
        self.sleep_seconds = REQUEST_SLEEP_SECONDS

    def _normalize_team_name(self, team: str) -> str:
        """
        Normalize team name for comparison.

        Args:
            team: Team name to normalize

        Returns:
            Normalized team name
        """
        normalized = team.strip().lower().replace("_", " ").replace("-", " ")
        # Remove extra spaces
        return " ".join(normalized.split())

    def _get_team_variations(self, team: str) -> list:
        """
        Generate search variations for team names.
        Returns full name first, then each individual word.

        Args:
            team: Team name

        Returns:
            List of possible team name variations
        """
        normalized = self._normalize_team_name(team)
        words = normalized.split()

        variations = [normalized]  # Full name first

        # Add each individual word
        if len(words) > 1:
            variations.extend(words)

        return variations

    def _teams_match(self, team1: str, team2: str) -> bool:
        """
        Check if two team names match, considering variations.

        Args:
            team1: First team name (normalized)
            team2: Second team name (normalized)

        Returns:
            True if teams match
        """
        # Exact match
        if team1 == team2:
            return True

        # Get variations for both teams
        variations1 = self._get_team_variations(team1)
        variations2 = self._get_team_variations(team2)

        # Check if any variation matches
        for v1 in variations1:
            for v2 in variations2:
                if v1 == v2:
                    return True

        return False

    def parse_en_date_to_iso(self, date_str: str) -> Optional[str]:
        """
        Convert English date phrases like:
        - 'Friday, August 11, 2023'
        - 'Fri, Aug 11, 2023'
        - 'August 11, 2023'
        - 'Aug 11th, 2023'
        - 'Friday August 11 2023'
        into 'YYYY-MM-DD'. Returns None if it cannot parse.
        """
        if not date_str:
            return None

        s = date_str.strip()

        # 1) Normalize: remove commas, collapse spaces, lowercase
        s = s.replace(",", " ")
        s = re.sub(r"\s+", " ", s).strip().lower()

        # 2) Remove weekday if present (full or abbrev) at the start
        weekdays = {
            "monday",
            "tuesday",
            "wednesday",
            "thursday",
            "friday",
            "saturday",
            "sunday",
            "mon",
            "tue",
            "tues",
            "wed",
            "thu",
            "thur",
            "thurs",
            "fri",
            "sat",
            "sun",
        }
        tokens = s.split()

        if tokens and tokens[0] in weekdays:
            tokens = tokens[1:]  # drop leading weekday
            # some people put another comma-equivalent gap—already normalized

        if not tokens:
            return None

        # 3) Month name mapping (long + short)
        months = {
            "january": "01",
            "february": "02",
            "march": "03",
            "april": "04",
            "may": "05",
            "june": "06",
            "july": "07",
            "august": "08",
            "september": "09",
            "october": "10",
            "november": "11",
            "december": "12",
            "jan": "01",
            "feb": "02",
            "mar": "03",
            "apr": "04",
            "may": "05",
            "jun": "06",
            "jul": "07",
            "aug": "08",
            "sep": "09",
            "sept": "09",
            "oct": "10",
            "nov": "11",
            "dec": "12",
        }

        # 4) Helper to strip ordinal suffixes: 1st, 2nd, 3rd, 4th...
        def _strip_ordinal(t: str) -> str:
            return re.sub(r"(?<=\d)(st|nd|rd|th)$", "", t)

        # 5) Try patterns:
        #    (A) Month Day Year   e.g., 'august 11 2023' / 'aug 11th 2023'
        #    (B) Day Month Year   (por si acaso aparece así) '11 august 2023'
        # We’ll scan tokens to find whichever matches.

        # (A) Month-first
        try:
            # find first month token
            mi = next(i for i, t in enumerate(tokens) if t in months)
            # expect day right after, then year
            if mi + 2 < len(tokens):
                day_tok = _strip_ordinal(tokens[mi + 1])
                year_tok = tokens[mi + 2]
                if day_tok.isdigit() and year_tok.isdigit() and int(year_tok) >= 1900:
                    m = months[tokens[mi]]
                    d = f"{int(day_tok):02d}"
                    y = year_tok
                    return f"{y}-{m}-{d}"
        except StopIteration:
            pass

        # (B) Day-first
        # find first numeric day, then a month token, then a 4-digit year
        for i, t in enumerate(tokens):
            tt = _strip_ordinal(t)
            if tt.isdigit() and 1 <= int(tt) <= 31:
                # look ahead for month and year
                for j in range(i + 1, len(tokens)):
                    if tokens[j] in months:
                        # find year after month
                        for k in range(j + 1, len(tokens)):
                            if tokens[k].isdigit() and int(tokens[k]) >= 1900:
                                d = f"{int(tt):02d}"
                                m = months[tokens[j]]
                                y = tokens[k]
                                return f"{y}-{m}-{d}"
                        break  # month found but no year after—stop inner search

        return None

    def _dates_match(self, date1: str, date2: str) -> bool:
        """
        Check if two dates match, handling different formats.

        Args:
            date1: First date (YYYY-MM-DD format)
            date2: Second date (can be Spanish format or YYYY-MM-DD)

        Returns:
            True if dates match
        """
        # Parse date2 if it's in Spanish format
        date2_parsed = self.parse_en_date_to_iso(date2)
        if date1 == date2_parsed:
            return True
        return False

    def _fetch_page(self, page: int, fotmob_url: str) -> Optional[BeautifulSoup]:
        """
        Fetch a single page from FotMob using Selenium.
        """
        driver = None
        try:
            # Format URL with page number
            url = fotmob_url.format(page=page)

            logger.info(f"Fetching page {page}: {url}")

            # Setup Chrome options
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")
            chrome_options.add_argument(f"user-agent={self.headers['User-Agent']}")

            # Create driver
            driver = webdriver.Chrome(options=chrome_options)

            # Load page
            driver.get(url)

            # Wait for section elements to load
            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.TAG_NAME, "section"))
                )
            except TimeoutException:
                logger.warning("Timeout waiting for sections")

            # Additional wait for JavaScript to finish
            time.sleep(3)

            # Scroll to trigger lazy loading
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

            # Get page source and parse
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")

            # logger.info(f"HTML length: {len(html)} characters")

            # # Check if we have match sections
            # match_sections = soup.find_all("section", class_=lambda x: x and "LeagueMatches" in str(x))
            # logger.info(f"Found {len(match_sections)} LeagueMatches sections")

            # if not match_sections:
            #     logger.info(f"Page {page} has no match sections")
            #     return None

            return soup

        except Exception as e:
            logger.error(f"Error fetching page {page}: {e}")
            import traceback

            logger.error(traceback.format_exc())
            return None

        finally:
            if driver:
                driver.quit()

    def _extract_date_sections(
        self, soup: BeautifulSoup, target_dates: list
    ) -> Dict[str, List[Dict]]:
        """
        Extract all match links for specific dates from a page.

        Args:
            soup: BeautifulSoup object of the page
            target_dates: List of dates to look for (YYYY-MM-DD format)

        Returns:
            Dictionary mapping date -> list of match info dicts
        """
        sections_by_date = {}

        # Find all sections with "LeagueMatchesSection"
        all_sections = soup.find_all(
            "section", class_=lambda x: x and "LeagueMatchesSection" in str(x)
        )

        for section in all_sections:
            # Look for header with date
            header = section.find("h3", class_=lambda x: x and "Header" in str(x))

            if not header:
                continue

            # Get date text from header
            date_text = header.get_text(strip=True)

            # Check if this date matches any target date
            matched_date = None
            for target_date in target_dates:
                if self._dates_match(target_date, date_text):
                    matched_date = target_date
                    break

            if not matched_date:
                continue

            logger.info(
                f"✓ Found section for date: {date_text} (matched to {matched_date})"
            )

            # Get all match links in this section
            match_links = section.find_all(
                "a", class_=lambda x: x and "MatchWrapper" in str(x)
            )

            logger.info(f"   Found {len(match_links)} matches in this section")

            # Extract info from each match link
            for link in match_links:
                match_url = link.get("href")
                if not match_url:
                    continue

                # Extract team names
                team_spans = link.find_all(
                    "span", class_=lambda x: x and "TeamName" in str(x)
                )

                if len(team_spans) >= 2:
                    home_team_text = team_spans[0].get_text(strip=True)
                    away_team_text = team_spans[1].get_text(strip=True)

                    # Make full URL
                    if match_url.startswith("/"):
                        match_url = f"https://www.fotmob.com{match_url}"

                    match_info = {
                        "date": matched_date,
                        "home_team": home_team_text,
                        "away_team": away_team_text,
                        "home_team_norm": self._normalize_team_name(home_team_text),
                        "away_team_norm": self._normalize_team_name(away_team_text),
                        "url": match_url,
                    }

                    # Add to the date's list
                    if matched_date not in sections_by_date:
                        sections_by_date[matched_date] = []
                    sections_by_date[matched_date].append(match_info)

                    logger.debug(f"      • {home_team_text} vs {away_team_text}")

        return sections_by_date

    def find_matches(
        self, target_matches: pd.DataFrame, fotmob_url: str, max_pages: int = 60
    ) -> pd.DataFrame:
        """
        Find match URLs for specific team matchups and dates.

        New Strategy:
        1. Extract all unique dates from target matches
        2. Scrape pages until all dates are found (or max_pages reached)
        3. For each date, search all target matches with all name variations
        """
        logger.info(f"Starting search for {len(target_matches)} target matches")

        # Normalize target matches
        target_matches = target_matches.copy()
        target_matches["home_team_norm"] = target_matches["home_team"].apply(
            self._normalize_team_name
        )
        target_matches["away_team_norm"] = target_matches["away_team"].apply(
            self._normalize_team_name
        )
        target_matches["url"] = None
        target_matches["found"] = False

        # STEP 1: Extract all unique dates
        unique_dates = set(target_matches["date"].unique())
        logger.info(f"\n{'='*60}")
        logger.info(f"📅 Unique dates to search: {len(unique_dates)}")
        for date in sorted(unique_dates):
            logger.info(f"   - {date}")
        logger.info(f"{'='*60}\n")

        # STEP 2: Scrape pages until all dates are found
        logger.info("🕷️ Scraping pages and organizing by date...")
        sections_by_date = {}  # {date: [list of match links]}
        dates_found = set()

        for page in range(max_pages):
            soup = self._fetch_page(page, fotmob_url)

            if soup is None:
                logger.info(f"No more pages (stopped at page {page})")
                break

            # Extract sections for target dates
            page_sections = self._extract_date_sections(soup, unique_dates)

            # Merge into sections_by_date
            for date, matches in page_sections.items():
                if date not in sections_by_date:
                    sections_by_date[date] = []
                sections_by_date[date].extend(matches)
                dates_found.add(date)

            # CHECK: Have we found all dates?
            if dates_found == unique_dates:
                logger.info(
                    f"\n✅ All {len(unique_dates)} dates found! Stopping at page {page}"
                )
                break

            # Log progress
            remaining_dates = unique_dates - dates_found
            if remaining_dates:
                logger.info(
                    f"📊 Progress: {len(dates_found)}/{len(unique_dates)} dates found. Still looking for: {sorted(remaining_dates)}"
                )

            # Sleep between pages
            if page < max_pages - 1:
                time.sleep(self.sleep_seconds)

        # Check if we found all dates
        missing_dates = unique_dates - dates_found
        if missing_dates:
            logger.warning(
                f"⚠️ Could not find matches for dates: {sorted(missing_dates)}"
            )

        # Report what we found
        logger.info(f"\n{'='*60}")
        logger.info("📊 Matches found by date:")
        for date in sorted(sections_by_date.keys()):
            logger.info(f"   {date}: {len(sections_by_date[date])} matches")
        logger.info(f"{'='*60}\n")

        # STEP 3: Match target matches with found matches
        logger.info("🔍 Matching target matches with scraped data...")

        for idx, target in target_matches.iterrows():
            if target["found"]:
                continue

            target_date = target["date"]
            target_home = target["home_team_norm"]
            target_away = target["away_team_norm"]

            logger.info(f"\n{'='*60}")
            logger.info(
                f"Searching: {target['home_team']} vs {target['away_team']} on {target_date}"
            )
            logger.info(f"{'='*60}")

            # Get all matches for this date
            date_matches = sections_by_date.get(target_date, [])

            if not date_matches:
                logger.warning(f"❌ No matches found for date {target_date}")
                continue

            logger.info(f"Found {len(date_matches)} matches on {target_date}")

            # Try to find the match
            match_found = self._find_match_in_list(
                date_matches,
                target_home,
                target_away,
                target["home_team"],
                target["away_team"],
            )

            if match_found:
                target_matches.at[idx, "url"] = match_found["url"]
                target_matches.at[idx, "found"] = True
                logger.info(
                    f"✅ FOUND: {match_found['home_team']} vs {match_found['away_team']}"
                )
            else:
                logger.warning(f"❌ NOT FOUND")

        # Final report
        found_count = target_matches["found"].sum()
        logger.info(f"\n{'='*60}")
        logger.info(f"RESULTS: {found_count}/{len(target_matches)} matches found")
        logger.info(f"{'='*60}")

        return target_matches

    def _find_match_in_list(
        self,
        date_matches: List[Dict],
        target_home_norm: str,
        target_away_norm: str,
        target_home_original: str,
        target_away_original: str,
    ) -> Optional[Dict]:
        """
        Find a specific match in a list of matches using team name variations.
        Tries both normal and reversed order.

        Args:
            date_matches: List of match info dicts for a specific date
            target_home_norm: Normalized home team name
            target_away_norm: Normalized away team name
            target_home_original: Original home team name (for logging)
            target_away_original: Original away team name (for logging)

        Returns:
            Match info dict if found, None otherwise
        """
        # Try normal order first (home vs away)
        for match in date_matches:
            match_home = match["home_team_norm"]
            match_away = match["away_team_norm"]

            if self._teams_match(match_home, target_home_norm) and self._teams_match(
                match_away, target_away_norm
            ):
                logger.info(
                    f"✓ Match found (normal order): {match['home_team']} vs {match['away_team']}"
                )
                return match

        # Try reversed order (away vs home)
        logger.debug("Trying reversed order...")
        for match in date_matches:
            match_home = match["home_team_norm"]
            match_away = match["away_team_norm"]

            if self._teams_match(match_home, target_away_norm) and self._teams_match(
                match_away, target_home_norm
            ):
                logger.info(
                    f"✓ Match found (REVERSED order): {match['home_team']} vs {match['away_team']}"
                )
                return match

        # Not found - log details for debugging
        logger.debug(
            f"Match not found. Looking for: {target_home_original} vs {target_away_original}"
        )
        logger.debug(f"Available matches on this date:")
        for match in date_matches:
            logger.debug(f"   • {match['home_team']} vs {match['away_team']}")

        return None

    def scrape_injuries_from_match(self, match_url: str) -> Dict[str, List[str]]:
        """
        Scrape injured players from a FotMob match page.
        """
        driver = None
        try:
            logger.info(f"Scraping injuries from: {match_url}")

            # Setup Chrome options
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")
            chrome_options.add_argument(f"user-agent={self.headers['User-Agent']}")

            # Create driver
            driver = webdriver.Chrome(options=chrome_options)

            # Load page
            driver.get(match_url)

            # Wait for content to load
            time.sleep(5)

            # Scroll to load all content
            for _ in range(3):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

            # Get page source
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")

            # Find ALL sections with "4pi1g3-BenchesContainer"
            all_4pi1g3_sections = soup.find_all(
                "section",
                class_=lambda x: x
                and "4pi1g3" in str(x)
                and "BenchesContainer" in str(x),
            )
            logger.info(
                f"Found {len(all_4pi1g3_sections)} sections with '4pi1g3-BenchesContainer'"
            )

            if len(all_4pi1g3_sections) < 3:
                logger.warning(
                    f"Expected at least 3 '4pi1g3' sections, found {len(all_4pi1g3_sections)}"
                )
                return {"home_injuries": [], "away_injuries": []}

            # The LAST 4pi1g3 section is the injuries container
            injuries_container = all_4pi1g3_sections[-1]
            logger.info("✓ Found injuries container (last 4pi1g3 section)")

            # Find the two UL elements inside (first = home, second = away)
            bench_uls = injuries_container.find_all(
                "ul", class_=lambda x: x and "BenchContainer" in str(x)
            )

            logger.info(f"Found {len(bench_uls)} UL elements with player lists")

            if len(bench_uls) < 2:
                logger.warning(f"Expected 2 UL elements, found {len(bench_uls)}")
                return {"home_injuries": [], "away_injuries": []}

            # Initialize injuries lists
            home_injuries = []
            away_injuries = []

            # Process each UL (0 = home, 1 = away)
            for idx, ul in enumerate(bench_uls[:2]):
                team_injuries = []

                # Find all player links
                player_links = ul.find_all(
                    "a", href=lambda x: x and "/players/" in str(x)
                )

                logger.info(
                    f"Team {idx} ({'home' if idx == 0 else 'away'}): Found {len(player_links)} player links"
                )

                for player_link in player_links:
                    href = player_link.get("href", "")

                    if "/players/" in href:
                        # URL: /es/players/1184081/francisco-mwepu
                        url_parts = href.rstrip("/").split("/")
                        player_slug = url_parts[-1]
                        # Replace hyphens with underscores and keep as lowercase or title case
                        player_name = player_slug.replace("-", "_")

                        if player_name not in team_injuries:
                            team_injuries.append(player_name)
                            logger.info(f"  ✓ {player_name}")

                # Assign to home or away
                if idx == 0:
                    home_injuries = team_injuries
                elif idx == 1:
                    away_injuries = team_injuries

            logger.info(f"Results - Home: {home_injuries}, Away: {away_injuries}")

            return {"home_injuries": home_injuries, "away_injuries": away_injuries}

        except Exception as e:
            logger.error(f"Error scraping injuries: {e}")
            import traceback

            logger.error(traceback.format_exc())
            return {"home_injuries": [], "away_injuries": []}

        finally:
            if driver:
                driver.quit()

    def _slugify_name(self, name: str) -> str:
        s = str(name or "").strip().lower()
        s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("utf-8")
        s = re.sub(r"[^a-z0-9]+", "_", s)
        return s.strip("_")

    def scrape_all_injuries(
        self, target_matches: pd.DataFrame, fotmob_url: str
    ) -> list[str]:
        """
        Scrape injuries for all matches and return a unique list of injured player slugs.

        Args:
            target_matches: DataFrame minimal para find_matches(...)
            fotmob_url: base URL de FotMob

        Returns:
            list[str]: lista de jugadores lesionados (únicos) en formato slug (e.g., 'francisco_mwepu')
        """
        matches_df = self.find_matches(target_matches, fotmob_url)

        logger.info(f"Starting to scrape injuries for {len(matches_df)} matches")

        injured_all: list[str] = []

        for idx, row in matches_df.iterrows():
            match_url = row.get("url")
            if pd.isna(match_url) or not match_url:
                logger.warning(f"Row {idx}: No URL found, skipping")
                continue

            logger.info(f"\n{'='*60}")
            logger.info(
                f"Processing match {idx + 1}/{len(matches_df)}: {row.get('home_team')} vs {row.get('away_team')}"
            )
            logger.info(f"{'='*60}")

            try:
                injuries_data = self.scrape_injuries_from_match(match_url)
            except Exception as e:
                logger.exception(f"Error scraping injuries for {match_url}: {e}")
                continue

            # Acumular ambos lados
            for key in ("home_injuries", "away_injuries"):
                players = injuries_data.get(key) or []
                for p in players:
                    if not p:
                        continue
                    injured_all.append(self._slugify_name(p))

            # cortesía entre requests
            if idx < len(matches_df) - 1:
                time.sleep(self.sleep_seconds)

        # Quitar duplicados preservando orden
        seen = set()
        injured_unique: list[str] = []
        for x in injured_all:
            if x not in seen:
                seen.add(x)
                injured_unique.append(x)

        return injured_unique
