"""PDF Markdown Extractor - Extracts markdown-formatted text from PDF files using Docling."""

import logging
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem

from statement_processor.models import ExtractedDocument

logger = logging.getLogger(__name__)


class PdfMarkdownExtractor:
    """Extracts markdown-formatted text and structured data from PDF files using Docling.

    The extractor converts PDF documents to markdown, exports every detected
    table to a pandas DataFrame, and (when a debug directory is configured)
    writes page/element images, markdown, doctags, a confidence report and
    visualization images for inspection.

    Note:
        Debug artifacts are named after the PDF file stem, so two inputs that
        share a stem (for example ``a/report.pdf`` and ``b/report.pdf``) write
        to the same debug files.
    """

    def __init__(
        self, debug_dir: Optional[Path] = None, use_threading: bool = True
    ) -> None:
        """Initialize the Docling document converter.

        Args:
            debug_dir: Optional directory for debug output files. Passing a
                value enables page/picture image generation and a higher image
                scale. The directory is created if it does not exist.
            use_threading: Whether to use the threaded PDF pipeline instead of
                the default one.
        """
        # Coerce to Path so a plain string argument also works with the
        # pathlib operations used for debug output.
        self._debug_dir = Path(debug_dir) if debug_dir is not None else None
        self._debug_mode = self._debug_dir is not None

        # Options shared by both pipeline flavours; image generation is only
        # enabled in debug mode.
        shared_options = {
            "accelerator_options": AcceleratorOptions(
                device=AcceleratorDevice.AUTO,
            ),
            "do_ocr": False,
            "do_table_structure": True,
            "generate_page_images": self._debug_mode,
            "generate_picture_images": self._debug_mode,
            "images_scale": 2.0 if self._debug_mode else 1.0,
        }

        if use_threading:
            # Imported lazily so the threaded pipeline modules are only
            # required when threading is actually requested.
            from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
            from docling.pipeline.threaded_standard_pdf_pipeline import ThreadedStandardPdfPipeline

            pdf_format_option = PdfFormatOption(
                pipeline_cls=ThreadedStandardPdfPipeline,
                pipeline_options=ThreadedPdfPipelineOptions(
                    **shared_options,
                    ocr_batch_size=4,
                    layout_batch_size=32,
                    table_batch_size=4,
                ),
            )
        else:
            pdf_format_option = PdfFormatOption(
                pipeline_options=PdfPipelineOptions(**shared_options),
            )

        self._converter = DocumentConverter(
            format_options={InputFormat.PDF: pdf_format_option}
        )

        if self._debug_dir is not None:
            self._debug_dir.mkdir(parents=True, exist_ok=True)

    def extract(self, pdf_paths: List[Path]) -> Dict[Path, ExtractedDocument]:
        """Extract markdown and structured data from PDF files.

        Failures are never raised to the caller: a document that cannot be
        processed is mapped to an ``ExtractedDocument`` carrying only its
        source file name.

        Args:
            pdf_paths: List of PDF file paths to process.

        Returns:
            Dictionary mapping file paths to ExtractedDocument objects. An
            empty input list yields an empty dictionary.
        """
        if not pdf_paths:
            return {}

        results: Dict[Path, ExtractedDocument] = {}
        total = len(pdf_paths)

        logger.info(f"Starting PDF extraction for {total} file(s)...")

        try:
            conv_results = self._converter.convert_all(
                pdf_paths,
                raises_on_error=False,
            )

            for i, conv_result in enumerate(conv_results, 1):
                source_path = Path(conv_result.input.file)
                logger.info(f"[{i}/{total}] Extracted: {source_path.name}")

                try:
                    extracted = self._build_extracted_document(source_path, conv_result)
                except Exception as e:
                    logger.warning(f"[{i}/{total}] Failed to process {source_path}: {e}")
                    results[source_path] = ExtractedDocument(source_file=source_path.name)
                    continue

                results[source_path] = extracted
                logger.info(
                    f"[{i}/{total}] Got {len(extracted.markdown):,} chars from {source_path.name}"
                )

                # Debug output is best-effort: a failure here must not discard
                # the extraction result that was just stored.
                if self._debug_dir is not None:
                    try:
                        self._write_debug_output(source_path, conv_result)
                    except Exception as e:
                        logger.warning(
                            f"[{i}/{total}] Failed to write debug output for {source_path.name}: {e}"
                        )

        except Exception as e:
            logger.exception(f"Batch conversion failed: {e}")
            # The conversion results are consumed inside this try block, so the
            # failure may surface after some files were already processed.
            # Only fill in the paths that have no result yet instead of
            # overwriting successful extractions.
            for path in pdf_paths:
                if path not in results:
                    results[path] = ExtractedDocument(source_file=path.name)

        successful = sum(1 for doc in results.values() if doc.markdown)
        logger.info(f"Extraction complete: {successful}/{total} files successful")

        return results

    @staticmethod
    def _build_extracted_document(source_path: Path, conv_result) -> ExtractedDocument:
        """Build an ExtractedDocument (markdown plus tables) from one conversion result."""
        doc = conv_result.document
        markdown_text = doc.export_to_markdown()

        tables: List[pd.DataFrame] = []
        table_names: List[str] = []

        for table in doc.tables:
            # A table that fails to export is skipped; the rest are kept.
            try:
                df = table.export_to_dataframe(doc)
            except Exception as e:
                logger.warning(f"Failed to export table: {e}")
                continue
            tables.append(df)
            table_names.append("table")

        return ExtractedDocument(
            markdown=markdown_text,
            tables=tables,
            table_names=table_names,
            source_file=source_path.name,
        )

    def _write_debug_output(self, source_path: Path, conv_result) -> None:
        """Write comprehensive debug output files for a processed document.

        Each artifact kind (images, markdown, doctags, report, visualizations)
        is written independently; a failure in one is logged as a warning and
        does not prevent the others. Does nothing when no debug directory is
        configured.

        Args:
            source_path: Original PDF file path.
            conv_result: Docling conversion result object.

        Raises:
            OSError: If the debug sub-directories cannot be created.
        """
        if not self._debug_dir:
            return

        stem = source_path.stem
        doc = conv_result.document

        # Create subdirectories for organized output
        images_dir = self._debug_dir / "images" / stem
        markdown_dir = self._debug_dir / "markdown"
        doctags_dir = self._debug_dir / "doctags"
        reports_dir = self._debug_dir / "reports"
        viz_dir = self._debug_dir / "visualizations" / stem

        for d in (images_dir, markdown_dir, doctags_dir, reports_dir, viz_dir):
            d.mkdir(parents=True, exist_ok=True)

        self._save_page_images(doc, images_dir, stem)
        self._save_element_images(doc, images_dir, stem)
        self._save_markdown(doc, markdown_dir, stem)
        self._save_doctags(doc, doctags_dir, stem)
        self._save_confidence_report(conv_result, reports_dir, source_path)
        self._save_visualizations(doc, viz_dir, stem)

    @staticmethod
    def _save_page_images(doc, images_dir: Path, stem: str) -> None:
        """Save the rendered image of every page that has one as ``page-<no>.png``."""
        try:
            for page_no, page in doc.pages.items():
                if page.image and page.image.pil_image:
                    page_image_path = images_dir / f"page-{page_no}.png"
                    with page_image_path.open("wb") as fp:
                        page.image.pil_image.save(fp, format="PNG")
                    logger.debug(f"Saved page image: {page_image_path}")
        except Exception as e:
            logger.warning(f"Failed to save page images for {stem}: {e}")

    @staticmethod
    def _save_element_images(doc, images_dir: Path, stem: str) -> None:
        """Save an image for every table and picture item in the document.

        Files are numbered per kind in iteration order (``table-1.png``,
        ``picture-1.png``, ...). An element without an image, or one whose
        image cannot be written, is skipped with a warning so the remaining
        elements are still saved.
        """
        try:
            counters = {"table": 0, "picture": 0}
            for element, _level in doc.iterate_items():
                if isinstance(element, TableItem):
                    kind = "table"
                elif isinstance(element, PictureItem):
                    kind = "picture"
                else:
                    continue

                counters[kind] += 1
                element_image_path = images_dir / f"{kind}-{counters[kind]}.png"

                try:
                    # Fetch the image before opening the file so a missing
                    # image does not leave an empty PNG behind.
                    element_image = element.get_image(doc)
                    if element_image is None:
                        logger.warning(
                            f"No image available for {element_image_path.name} in {stem}"
                        )
                        continue
                    with element_image_path.open("wb") as fp:
                        element_image.save(fp, "PNG")
                except Exception as e:
                    logger.warning(
                        f"Failed to save {kind} image {element_image_path.name} for {stem}: {e}"
                    )
                    continue

                logger.debug(f"Saved {kind} image: {element_image_path}")

            logger.info(
                f"Saved images for {stem}: {len(doc.pages)} pages, "
                f"{counters['table']} tables, {counters['picture']} pictures"
            )
        except Exception as e:
            logger.warning(f"Failed to save element images for {stem}: {e}")

    @staticmethod
    def _save_markdown(doc, markdown_dir: Path, stem: str) -> None:
        """Save the markdown export as ``<stem>.md`` (images are saved separately)."""
        try:
            md_output_path = markdown_dir / f"{stem}.md"
            md_output_path.write_text(doc.export_to_markdown(), encoding="utf-8")
            logger.debug(f"Saved markdown to: {md_output_path}")
        except Exception as e:
            logger.warning(f"Failed to save markdown for {stem}: {e}")

    @staticmethod
    def _save_doctags(doc, doctags_dir: Path, stem: str) -> None:
        """Save the doctags export of the document."""
        try:
            # NOTE(review): the content is produced by save_as_doctags; the
            # ".json" suffix may be misleading - confirm before renaming, the
            # name is kept so existing debug tooling keeps finding the file.
            doctags_path = doctags_dir / f"{stem}.doctags.json"
            doc.save_as_doctags(doctags_path)
            logger.debug(f"Saved doctags to: {doctags_path}")
        except Exception as e:
            logger.warning(f"Failed to save doctags for {stem}: {e}")

    @staticmethod
    def _save_confidence_report(conv_result, reports_dir: Path, source_path: Path) -> None:
        """Write the string form of the conversion confidence to ``<stem>_report.txt``."""
        stem = source_path.stem
        try:
            report_path = reports_dir / f"{stem}_report.txt"
            with report_path.open("w", encoding="utf-8") as fp:
                fp.write(f"Confidence Report for: {source_path.name}\n")
                fp.write("=" * 60 + "\n\n")
                fp.write(str(conv_result.confidence))
            logger.debug(f"Saved confidence report to: {report_path}")
        except Exception as e:
            logger.warning(f"Failed to save confidence report for {stem}: {e}")

    @staticmethod
    def _save_visualizations(doc, viz_dir: Path, stem: str) -> None:
        """Save reading-order and key-value visualization images per page.

        Each visualization mode is generated independently so that a failure
        in one mode does not prevent the other from being written.
        """
        # (viz_mode, extra keyword arguments, label used in log messages)
        viz_specs = (
            ("reading_order", {"show_branch_numbering": True}, "reading order"),
            ("key_value", {"show_cell_id": True}, "key-value"),
        )

        for viz_mode, extra_kwargs, label in viz_specs:
            try:
                viz_images = doc.get_visualization(
                    show_label=True,
                    viz_mode=viz_mode,
                    **extra_kwargs,
                )
                for page_num, viz_image in viz_images.items():
                    if viz_image is not None:
                        viz_path = viz_dir / f"page{page_num}_{viz_mode}.png"
                        viz_image.save(str(viz_path))
                        logger.debug(f"Saved {label} visualization: {viz_path}")
            except Exception as e:
                logger.warning(f"Could not generate visualization images for {stem}: {e}")
