from typing import Any

from haystack import component, logging

from llama_cpp import Llama

# Module-level logger, obtained through the `logging` module imported from Haystack
# and named after this module.
logger = logging.getLogger(__name__)


@component
class LlamaCppGenerator:
    """
    Provides an interface to generate text using LLM via llama.cpp.

    [llama.cpp](https://github.com/ggml-org/llama.cpp) is a project written in C/C++ for efficient inference of LLMs.
    It employs the quantized GGUF format, suitable for running these models on standard machines (even without GPUs).

    Usage example:
    ```python
    from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator
    generator = LlamaCppGenerator(model="zephyr-7b-beta.Q4_0.gguf", n_ctx=2048, n_batch=512)
    generator.warm_up()

    print(generator.run("Who is the best American actor?", generation_kwargs={"max_tokens": 128}))
    # {'replies': ['John Cusack'], 'meta': [{"object": "text_completion", ...}]}
    ```
    """

    def __init__(
        self,
        model: str,
        n_ctx: int | None = 0,
        n_batch: int | None = 512,
        model_kwargs: dict[str, Any] | None = None,
        generation_kwargs: dict[str, Any] | None = None,
    ):
        """
        :param model: The path of a quantized model for text generation, for example, "zephyr-7b-beta.Q4_0.gguf".
            If the model path is also specified in the `model_kwargs`, this parameter will be ignored.
        :param n_ctx: The number of tokens in the context. When set to 0, the context will be taken from the model.
        :param n_batch: Prompt processing maximum batch size.
        :param model_kwargs: Dictionary containing keyword arguments used to initialize the LLM for text generation.
            These keyword arguments provide fine-grained control over the model loading.
            In case of duplication, these kwargs override `model`, `n_ctx`, and `n_batch` init parameters.
            The dictionary passed in is copied and never modified.
            For more information on the available kwargs, see
            [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__).
        :param generation_kwargs:  A dictionary containing keyword arguments to customize text generation.
            For more information on the available kwargs, see
            [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_completion).
        """

        # Work on shallow copies: the defaults below are written into `model_kwargs`, and
        # writing them into the caller's dict would leak this instance's
        # model_path/n_ctx/n_batch into any other component built from the same dict.
        model_kwargs = dict(model_kwargs or {})
        generation_kwargs = dict(generation_kwargs or {})

        # check if the model_kwargs contain the essential parameters
        # otherwise, populate them with values from init parameters
        model_kwargs.setdefault("model_path", model)
        model_kwargs.setdefault("n_ctx", n_ctx)
        model_kwargs.setdefault("n_batch", n_batch)

        self.model_path = model
        self.n_ctx = n_ctx
        self.n_batch = n_batch
        self.model_kwargs = model_kwargs
        self.generation_kwargs = generation_kwargs
        # The Llama instance is created lazily in warm_up(); None means "not loaded yet".
        self.model: Llama | None = None

    def warm_up(self):
        """
        Load the model by instantiating `Llama` with `self.model_kwargs`.

        Does nothing if the model has already been loaded.
        """
        if self.model is None:
            self.model = Llama(**self.model_kwargs)

    @component.output_types(replies=list[str], meta=list[dict[str, Any]])
    def run(
        self, prompt: str, generation_kwargs: dict[str, Any] | None = None
    ) -> dict[str, list[str] | list[dict[str, Any]]]:
        """
        Run the text generation model on the given prompt.

        :param prompt: the prompt to be sent to the generative model.
        :param generation_kwargs:  A dictionary containing keyword arguments to customize text generation.
            Keys given here take precedence over the `generation_kwargs` passed at initialization.
            For more information on the available kwargs, see
            [llama.cpp documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.create_completion).
        :returns: A dictionary with the following keys:
            - `replies`: the list of replies generated by the model.
            - `meta`: metadata about the request.
            Both lists are empty when `prompt` is empty.
        :raises RuntimeError: If the model has not been loaded with `warm_up()`.
        :raises ValueError: If the completion call returns something other than a dictionary.
        """
        if self.model is None:
            error_msg = "The model has not been loaded. Please call warm_up() before running."
            raise RuntimeError(error_msg)

        if not prompt:
            # Nothing to generate; still return every declared output so the result
            # shape matches the `output_types` declaration above.
            return {"replies": [], "meta": []}

        # merge generation kwargs from init method with those from run method
        updated_generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}

        output = self.model.create_completion(prompt=prompt, **updated_generation_kwargs)
        if not isinstance(output, dict):
            msg = f"Expected a dictionary response, got a different object: {output}"
            raise ValueError(msg)

        # Only the text of the first choice is exposed as a reply.
        replies = [output["choices"][0]["text"]]

        # `meta` carries a shallow copy of the whole response dictionary.
        return {"replies": replies, "meta": [dict(output)]}
