import re
from typing import Optional

from bs4 import (
    BeautifulSoup,
    Comment,
    Declaration,
    Doctype,
    NavigableString,
    ProcessingInstruction,
    Tag,
)


class HTMLToMarkdownParser:
    """
    Convert HTML to Markdown using BeautifulSoup.

    The converter first strips non-content markup (scripts, styles, elements
    carrying navigation/advertising class names, empty elements), then walks
    the remaining tree recursively and emits Markdown for headings,
    paragraphs, links, images, lists, emphasis, code and horizontal rules.
    Unknown tags are transparent: only their children are rendered.

    Whitespace in ordinary text is collapsed the way a browser would collapse
    it, while the content of ``<pre>`` blocks is kept verbatim.
    """

    # Tags that should be removed entirely
    REMOVE_TAGS = [
        'script', 'style', 'iframe', 'noscript',
        # Next.js specific tags
        'next-route-announcer', '__next', '__next-route-announcer__'
    ]

    # Classes that often indicate non-content areas
    REMOVE_CLASSES = [
        'nav', 'footer', 'header', 'sidebar', 'advertisement', 'ad',
        # Next.js specific classes
        'next-error', 'next-route-announcer', '__next-route-announcer__'
    ]

    # Tags that are meaningful even though they never contain text; the
    # empty-element cleanup must not delete them (or their wrappers).
    PRESERVE_EMPTY_TAGS = ['img', 'hr', 'br']

    # Bare class names on <code>/<pre> that directly name a language
    KNOWN_LANGUAGES = frozenset([
        'python', 'javascript', 'java', 'cpp', 'ruby', 'php', 'html', 'css'
    ])

    # Indentation added per nesting level of a list. Four spaces nest
    # correctly under both "* " and "1. " markers.
    LIST_INDENT = '    '

    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self):
        # Current nesting depth of <ul>/<ol> while a conversion is running
        self._list_depth = 0

    def convert(self, html: str) -> str:
        """
        Convert HTML string to Markdown.

        Args:
            html: HTML string to convert. ``None``, empty and
                whitespace-only input yield an empty string.

        Returns:
            Converted Markdown string with blank-line runs collapsed and
            surrounding whitespace removed.
        """
        if not html or not html.strip():
            return ''

        # Start from a clean state even if a previous run was interrupted
        self._list_depth = 0

        # First clean the HTML
        soup = self._clean_html(html)

        # Prefer an explicit main content area, fall back to the whole document
        main_content = (
            soup.find('main') or
            soup.find('article') or
            soup.find('div', class_='content') or
            soup
        )

        return self._normalize_output(self._process_tag(main_content))

    def _clean_html(self, html: str) -> "BeautifulSoup":
        """
        Parse *html* and remove elements that carry no content.

        Removes every tag listed in ``REMOVE_TAGS``, every element with a
        class listed in ``REMOVE_CLASSES``, and elements without any text.
        Elements in ``PRESERVE_EMPTY_TAGS``, their ancestors, and anything
        inside ``<pre>`` are kept even when they contain no text.

        Returns:
            The cleaned ``BeautifulSoup`` document.
        """
        soup = BeautifulSoup(html, 'html.parser')

        # Remove unwanted tags
        for tag in self.REMOVE_TAGS:
            for element in soup.find_all(tag):
                self._discard(element)

        # Remove elements with unwanted classes
        for class_name in self.REMOVE_CLASSES:
            for element in soup.find_all(class_=class_name):
                self._discard(element)

        # Remove empty elements
        for element in soup.find_all():
            # find_all() returned a snapshot; an ancestor may already have
            # taken this element down with it.
            if getattr(element, 'decomposed', False):
                continue
            # Images, rules and line breaks have no text but are content.
            if (element.name in self.PRESERVE_EMPTY_TAGS
                    or element.find(self.PRESERVE_EMPTY_TAGS) is not None):
                continue
            # Whitespace inside preformatted blocks is significant.
            if element.name == 'pre' or element.find_parent('pre') is not None:
                continue
            if not element.get_text(strip=True):
                element.decompose()

        return soup

    @staticmethod
    def _discard(element) -> None:
        """Decompose *element* unless it was already destroyed with a parent."""
        if not getattr(element, 'decomposed', False):
            element.decompose()

    def _process_tag(self, element: "Tag") -> str:
        """
        Render one node (tag or text) and, recursively, its children.

        Args:
            element: A BeautifulSoup tag or string node.

        Returns:
            The Markdown fragment for the node. Block-level fragments are
            padded with blank lines; ``convert`` collapses the surplus.
        """
        if isinstance(element, NavigableString):
            # Comments, doctypes and similar nodes subclass NavigableString
            # but are not document text.
            if isinstance(element, (Comment, Declaration, Doctype,
                                    ProcessingInstruction)):
                return ''
            # Collapse whitespace instead of stripping it, so the spaces
            # separating text from neighbouring inline elements survive.
            return self._WHITESPACE_RE.sub(' ', str(element))

        tag_name = element.name or ''

        # Handle block elements
        if tag_name in ('p', 'div'):
            inner = self._process_children(element).strip()
            return f"\n\n{inner}\n\n" if inner else ''

        # Handle headings
        if tag_name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            level = int(tag_name[1])
            # A heading must stay on a single line
            title = ' '.join(self._process_children(element).split())
            return f"\n\n{'#' * level} {title}\n\n" if title else ''

        # Handle links
        if tag_name == 'a':
            href = element.get('href', '')
            inner = self._process_children(element)
            if not href:
                # An anchor without a target is just text
                return inner
            return self._wrap_inline(inner, '[', f"]({href})")

        # Handle images
        if tag_name == 'img':
            alt = element.get('alt', '')
            src = element.get('src', '')
            return f"![{alt}]({src})"

        # Handle lists
        if tag_name in ('ul', 'ol'):
            return self._process_list(element, ordered=(tag_name == 'ol'))

        # Handle emphasis
        if tag_name in ('em', 'i'):
            return self._wrap_inline(self._process_children(element), '_', '_')

        # Handle strong/bold
        if tag_name in ('strong', 'b'):
            return self._wrap_inline(self._process_children(element), '**', '**')

        # Handle code blocks
        if tag_name == 'pre':
            code_element = element.find('code')
            source = code_element if code_element is not None else element
            language = self._detect_language(code_element, element)
            # Take the text verbatim: markup inside code must not be turned
            # into Markdown and indentation must be preserved.
            code = source.get_text().strip('\n')
            if not code.strip():
                return ''
            return f"\n\n```{language}\n{code}\n```\n\n"

        # Handle inline code
        if tag_name == 'code':
            code = ' '.join(element.get_text().split())
            return f"`{code}`" if code else ''

        # Handle horizontal rules
        if tag_name == 'hr':
            return "\n\n---\n\n"

        # Handle line breaks
        if tag_name == 'br':
            return "\n"

        # Default case: process children
        return self._process_children(element)

    def _process_list(self, element: "Tag", ordered: bool) -> str:
        """
        Render a ``<ul>``/``<ol>`` element, including nested lists.

        Only direct ``<li>`` children become items. Nested lists are
        indented by ``LIST_INDENT`` per level.
        """
        self._list_depth += 1
        try:
            indent = self.LIST_INDENT * (self._list_depth - 1)
            items = []
            for number, item in enumerate(
                    element.find_all('li', recursive=False), start=1):
                marker = f"{number}." if ordered else '*'
                content = self._process_children(item).strip()
                items.append(f"{indent}{marker} {content}")
        finally:
            # Restore the depth even if a child fails to render
            self._list_depth -= 1

        if not items:
            return ''
        # A nested list continues its parent item; a top-level list is a
        # block of its own and needs blank lines around it.
        gap = '\n' if self._list_depth else '\n\n'
        return gap + '\n'.join(items) + gap

    def _process_children(self, element: "Tag") -> str:
        """Process all children of a tag and concatenate their Markdown."""
        return ''.join(self._process_tag(child) for child in element.children)

    @classmethod
    def _detect_language(cls, *elements) -> str:
        """
        Return the code language named by the ``class`` of the given elements.

        Recognises ``language-xxx`` / ``lang-xxx`` prefixes (keeping everything
        after the prefix, so ``language-objective-c`` yields ``objective-c``)
        and the bare names in ``KNOWN_LANGUAGES``. ``None`` entries are
        skipped. Returns ``''`` when nothing matches.
        """
        for element in elements:
            if element is None:
                continue
            for class_name in element.get('class', []):
                for prefix in ('language-', 'lang-'):
                    if class_name.startswith(prefix):
                        return class_name[len(prefix):]
                if class_name in cls.KNOWN_LANGUAGES:
                    return class_name
        return ''

    @staticmethod
    def _wrap_inline(inner: str, opener: str, closer: str) -> str:
        """
        Surround *inner* with inline Markdown delimiters.

        Leading/trailing whitespace is moved outside the delimiters because
        delimiters adjacent to whitespace are not recognised as emphasis.
        Content that is empty or whitespace-only is returned without
        delimiters (as at most a single space).
        """
        core = inner.strip()
        if not core:
            return ' ' if inner else ''
        lead = ' ' if inner[0].isspace() else ''
        trail = ' ' if inner[-1].isspace() else ''
        return f"{lead}{opener}{core}{closer}{trail}"

    @staticmethod
    def _normalize_output(markdown: str) -> str:
        """
        Tidy the raw Markdown produced by the tree walk.

        Outside fenced code blocks, trailing whitespace is removed from every
        line and runs of blank lines are collapsed to a single blank line.
        Lines inside fenced code blocks are left untouched.
        """
        lines = []
        in_fence = False
        for line in markdown.split('\n'):
            if line.startswith('```'):
                in_fence = not in_fence
                lines.append(line)
            elif in_fence:
                lines.append(line)
            else:
                line = line.rstrip()
                if not line and lines and not lines[-1]:
                    continue  # already have a blank line here
                lines.append(line)
        return '\n'.join(lines).strip()


def html_to_markdown(html: str) -> str:
    """
    Render an HTML document or fragment as Markdown text.

    Convenience wrapper: creates a fresh ``HTMLToMarkdownParser``, runs its
    ``convert`` method on *html* and trims leading and trailing whitespace
    from the result.

    Args:
        html: HTML markup to convert

    Returns:
        The Markdown produced by the parser, without surrounding whitespace

    Example:
        >>> html_to_markdown('<h1>Hello</h1>')
        '# Hello'
    """
    markdown = HTMLToMarkdownParser().convert(html)
    return markdown.strip()
