"""
Natural Language Query Interface for Knowledge Graphs (v3)

Schema-aware, rule-based parser that converts human-friendly queries to GraphQuery.

Design goals:
- Use MetaNode schema (classId + properties.name) to build class aliases.
- Use MetaNode.properties.propertyTypes to build property aliases.
- Support multiple property conditions (age > 30 AND risk_score >= 0.7).
- Support text search, sort, limit.
- Support temporal/epoch queries: "this week", "last 7 days", "next 10 weeks", "between Jan 1 and Feb 1", etc.
- Prefer class-specific temporal properties (e.g. acled_event → event_date).
"""

import logging
import re
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

import dateparser
from dateparser.search import search_dates

from .graph_query import FilterOperator, GraphQuery

if TYPE_CHECKING:
    from .element_store import ElementStore
    from .meta import MetaNode

logger = logging.getLogger(__name__)


# ---------------------------
# Temporal/Epoch utilities
# ---------------------------


def get_epoch_timestamp(dt: datetime) -> int:
    """Return *dt* as whole milliseconds since the Unix epoch.

    The value comes from ``datetime.timestamp()`` scaled by 1000 and
    truncated with ``int()``.
    """
    epoch_seconds = dt.timestamp()
    return int(epoch_seconds * 1000)


def parse_temporal_phrase(phrase: str) -> Optional[datetime]:
    """
    Hand a temporal phrase to ``dateparser.parse`` and return its result.

    Phrases this is meant for include:
      - "this week", "last week", "next week"
      - "this month", "last month", "next month"
      - "this year", "last year", "next year"
      - "yesterday", "today", "tomorrow"
      - "2024-01-15", "last Monday", "yesterday 5pm"

    Any exception raised by dateparser is logged at debug level and
    reported as ``None``.
    """
    try:
        parsed = dateparser.parse(phrase)
    except Exception as exc:
        logger.debug("Failed to parse temporal phrase '%s': %s", phrase, exc)
        parsed = None
    return parsed


def search_temporal_phrases(text: str) -> List[Tuple[str, datetime]]:
    """
    Look for date/time references inside free text via ``search_dates``.

    Returns:
        List of (matched_text, parsed_datetime) tuples; an empty list when
        nothing is found or when ``search_dates`` raises (the error is
        logged at debug level).
    """
    try:
        found = search_dates(text)
    except Exception as exc:
        logger.debug("Failed to search for temporal phrases in '%s': %s", text, exc)
        return []
    # search_dates may hand back a falsy result; normalise that to a list.
    return found or []


def calculate_temporal_range(phrase: str) -> Optional[Tuple[int, int]]:
    """
    Parse a temporal phrase and return (start_epoch_ms, end_epoch_ms).

    Handles:
      - Relative windows: "last 7 days", "past 30 days", "next 2 weeks".
        The window is anchored at ``datetime.now()``: "last"/"past" end at
        now, "next" starts at now.
      - Everything else ("last week", "next month", "2023-01-15", ...) is
        handed to ``parse_temporal_phrase``. The parsed datetime becomes
        the start of the window and its length is chosen from the unit
        word found in the phrase (week, month, year, day, hour), with a
        one-day window when no unit word is present.

    Months and years are approximated as 30 and 365 days.

    Returns:
        (start_ms, end_ms), or None when the phrase is empty or not a
        string, cannot be parsed, or describes a window that does not fit
        in a ``datetime`` (e.g. an absurdly large count).
    """
    if not isinstance(phrase, str):
        return None
    normalized = phrase.lower().strip()
    if not normalized:
        return None

    # Length of one unit; months/years are deliberately rough.
    unit_lengths = {
        "hour": timedelta(hours=1),
        "day": timedelta(days=1),
        "week": timedelta(weeks=1),
        "month": timedelta(days=30),
        "year": timedelta(days=365),
    }
    # Order in which unit words are looked for in the fallback path.
    fallback_units = ("week", "month", "year", "day", "hour")

    # Explicit "last N X" / "past N X" / "next N X" forms. The leading \b
    # keeps words such as "blast" from being read as "last"; plural unit
    # names match through their singular prefix.
    window_match = re.search(
        r"\b(last|past|next)\s+(\d+)\s+(hour|day|week|month|year)",
        normalized,
    )

    try:
        if window_match:
            direction, count, unit = window_match.groups()
            span = unit_lengths[unit] * int(count)
            now = datetime.now()
            if direction == "next":
                start_dt, end_dt = now, now + span
            else:  # "last" / "past"
                start_dt, end_dt = now - span, now
            return (get_epoch_timestamp(start_dt), get_epoch_timestamp(end_dt))

        # Fallback: let dateparser interpret the phrase as a point,
        # then approximate a window based on the words present.
        parsed_dt = parse_temporal_phrase(phrase)
        if not parsed_dt:
            return None

        span = timedelta(days=1)  # unknown unit: default one-day window
        for unit in fallback_units:
            if unit in normalized:
                span = unit_lengths[unit]
                break

        start_ms = get_epoch_timestamp(parsed_dt)
        end_ms = get_epoch_timestamp(parsed_dt + span)
    except (OverflowError, ValueError, OSError) as exc:
        # Huge counts overflow timedelta/datetime arithmetic; report that
        # as "could not parse" instead of crashing on user input.
        logging.getLogger(__name__).debug(
            "Temporal phrase '%s' is out of range: %s", phrase, exc
        )
        return None

    if start_ms > end_ms:
        start_ms, end_ms = end_ms, start_ms
    return (start_ms, end_ms)


# ---------------------------
# Data model for parsed query
# ---------------------------


@dataclass
class PropertyCondition:
    """One property filter of the form ``property <operator> value``.

    Instances are created by ``HumanQuery.add_condition`` and later passed,
    field by field, to ``GraphQuery.where`` in ``HumanQuery.to_graph_query``.
    """

    # Name of the property the condition applies to.
    property: str
    # Comparison to apply between the property and ``value``.
    operator: FilterOperator
    # Right-hand side of the comparison.
    value: Any


class HumanQuery:
    """Parsed natural-language query that can be converted to a GraphQuery.

    A parser fills in the attributes (class scope, property conditions,
    text search, temporal range, sort, limits, related-element filter);
    ``to_graph_query`` then applies them to a ``GraphQuery`` in a fixed order.
    """

    def __init__(self, store: "ElementStore"):
        """Create an empty query bound to *store*; every component starts unset."""
        self.store = store
        self.logger = logging.getLogger(__name__)

        # Core components
        self.class_id: Optional[str] = None
        # Legacy single-property fields kept for backwards-compat tests
        self.property: Optional[str] = None
        self.operator: Optional[FilterOperator] = None
        self.value: Optional[Any] = None
        self.conditions: List[PropertyCondition] = []
        self.text_search: Optional[str] = None
        self.sort_property: Optional[str] = None
        self.sort_direction: str = "asc"
        self.limit: Optional[int] = None
        self.tail_limit: Optional[int] = None

        # Advanced search flags
        self.use_full_text_search: bool = False
        self.use_wildcard_search: bool = False
        self.wildcard_property: Optional[str] = None
        self.search_property: Optional[str] = None

        # Related-to filtering
        self.related_element_id: Optional[str] = None
        # Optional edge label/class for relation-aware queries
        self.edge_label: Optional[str] = None

        # Temporal components
        self.temporal_property: Optional[str] = None
        self.temporal_start_ms: Optional[int] = None
        self.temporal_end_ms: Optional[int] = None

    def add_condition(self, prop: str, op: FilterOperator, value: Any) -> None:
        """Append a property condition and keep the legacy fields in step.

        The legacy ``property``/``operator``/``value`` attributes mirror the
        first condition only: they are filled (where still ``None``) when the
        first condition is added and are not touched by later conditions, so
        they never mix fields taken from different conditions.
        """
        is_first = not self.conditions
        self.conditions.append(PropertyCondition(prop, op, value))
        if not is_first:
            return
        # Keep legacy single-property view in sync for tests that read these
        if self.property is None:
            self.property = prop
        if self.operator is None:
            self.operator = op
        if self.value is None:
            self.value = value

    def _has_temporal_range(self) -> bool:
        """Return True when a temporal property and both range bounds are set."""
        return bool(
            self.temporal_property
            and self.temporal_start_ms is not None
            and self.temporal_end_ms is not None
        )

    # -------- GraphQuery conversion ----------

    def _apply_text_search(self, q: GraphQuery) -> GraphQuery:
        """Apply ``text_search`` to *q* using the first search mode that is set.

        Precedence: wildcard search, full-text search, property search
        (escaped as a literal pattern), then plain ``text()``.
        """
        if self.use_wildcard_search:
            prop = self.wildcard_property or "name"
            self.logger.debug(
                "  Applying wildcard_search(): %s on %s",
                self.text_search,
                prop,
            )
            return q.wildcard_search(self.text_search, prop)

        if self.use_full_text_search:
            self.logger.debug("  Applying full_text_search(): %s", self.text_search)
            return q.full_text_search(self.text_search)

        if self.search_property:
            self.logger.debug(
                "  Applying search_by_property(): %s contains %s",
                self.search_property,
                self.text_search,
            )
            # Use regex contains-style search for now
            pattern = re.escape(self.text_search)
            return q.search_by_property(self.search_property, pattern)

        self.logger.debug("  Applying text(): %s", self.text_search)
        return q.text(self.text_search)

    def to_graph_query(self) -> GraphQuery:
        """
        Convert parsed human query to a GraphQuery.

        Steps are applied in this order: class scope (or ``all()``),
        property conditions, temporal range, text search, sort,
        first/last limits, related-to filter.
        """
        q = GraphQuery(self.store)

        # Lazy %-formatting: the message is only built if INFO is enabled.
        self.logger.info(
            "Converting HumanQuery to GraphQuery: "
            "class_id=%s, conditions=%s, text_search=%s, "
            "temporal=%s [%s, %s], sort=%s %s, limit=%s",
            self.class_id,
            self.conditions,
            self.text_search,
            self.temporal_property,
            self.temporal_start_ms,
            self.temporal_end_ms,
            self.sort_property,
            self.sort_direction,
            self.limit,
        )

        # Step 0: scope
        if self.class_id:
            self.logger.debug("  Applying classId: %s", self.class_id)
            q = q.classId(self.class_id)
        else:
            # Fall back to all elements if no explicit class
            self.logger.debug("  No classId detected, applying .all()")
            q = q.all()

        # Step 1: property conditions
        # Prefer explicit conditions list, but fall back to legacy
        # single-property fields if no conditions were added.
        if self.conditions:
            for cond in self.conditions:
                self.logger.debug(
                    "  Applying where: %s %s %s",
                    cond.property,
                    cond.operator,
                    cond.value,
                )
                q = q.where(cond.property, cond.operator, cond.value)
        elif self.property and self.operator is not None and self.value is not None:
            self.logger.debug(
                "  Applying legacy where: %s %s %s",
                self.property,
                self.operator,
                self.value,
            )
            q = q.where(self.property, self.operator, self.value)

        # Step 1b: temporal range conditions (if detected)
        if self._has_temporal_range():
            self.logger.debug(
                "  Applying temporal range: %s BETWEEN %d AND %d",
                self.temporal_property,
                self.temporal_start_ms,
                self.temporal_end_ms,
            )
            q = q.where(
                self.temporal_property,
                FilterOperator.BETWEEN,
                [self.temporal_start_ms, self.temporal_end_ms],
            )

        # Step 2: text search (name-based for now)
        if self.text_search:
            q = self._apply_text_search(q)

        # Step 3: sorting
        if self.sort_property:
            self.logger.debug(
                "  Applying sort(): %s %s", self.sort_property, self.sort_direction
            )
            q = q.sort(self.sort_property, self.sort_direction)

        # Step 4: limit (first/last)
        if self.limit is not None:
            self.logger.debug("  Applying first(): %s", self.limit)
            q = q.first(self.limit)
        if self.tail_limit is not None:
            self.logger.debug("  Applying last(): %s", self.tail_limit)
            q = q.last(self.tail_limit)

        # Step 5: related-to filtering
        if self.related_element_id:
            self.logger.debug(
                "  Applying filter_by_related_to(): %s (edge_label=%s)",
                self.related_element_id,
                self.edge_label,
            )
            # For now we ignore edge_label at GraphQuery level; it can be
            # used later to select traversal operations. Keeping it on
            # HumanQuery makes tests future-proof.
            q = q.filter_by_related_to(self.related_element_id)

        return q

    def execute(self) -> List[Any]:
        """Build the GraphQuery and return the result of its ``r()`` call."""
        return self.to_graph_query().r()

    def count(self) -> int:
        """Build the GraphQuery and return the result of its ``count()`` call."""
        return self.to_graph_query().count()

    def __repr__(self) -> str:
        """Render the query as a fluent-chain string, e.g. ``GraphQuery()all()``.

        The text search is always shown as ``.text(...)``; the full-text and
        wildcard flags are shown as trailing placeholder calls.
        """
        parts: List[str] = []

        if self.class_id:
            parts.append(f"classId('{self.class_id}')")
        else:
            parts.append("all()")

        for cond in self.conditions:
            parts.append(
                f".where('{cond.property}', FilterOperator.{cond.operator.name}, {repr(cond.value)})"
            )

        # Temporal range
        if self._has_temporal_range():
            parts.append(
                f".where('{self.temporal_property}', FilterOperator.BETWEEN, "
                f"[{self.temporal_start_ms}, {self.temporal_end_ms}])"
            )

        if self.text_search:
            parts.append(f".text({self.text_search!r})")

        if self.sort_property:
            parts.append(f".sort('{self.sort_property}', '{self.sort_direction}')")

        if self.limit is not None:
            parts.append(f".first({self.limit})")

        if self.tail_limit is not None:
            parts.append(f".last({self.tail_limit})")

        if self.related_element_id:
            if self.edge_label:
                parts.append(
                    f".filter_by_related_to('{self.related_element_id}')  # via {self.edge_label}"
                )
            else:
                parts.append(f".filter_by_related_to('{self.related_element_id}')")

        if self.use_full_text_search:
            parts.append(".full_text_search(...)")

        if self.use_wildcard_search:
            parts.append(".wildcard_search(...)")

        # Represent the fluent chain
        return "GraphQuery()" + "".join(parts)


# ---------------------------
# Parser implementation
# ---------------------------


class NaturalQueryParser:
    """
    Parse natural language queries and convert to a HumanQuery.

    Schema-aware aspects:
    - Uses MetaNode.classId and MetaNode.properties['name'] to build class aliases.
      E.g. { classId: 'acled_event', properties: { name: 'acled event' } } =>
           aliases: 'acled_event', 'acled_events', 'acled event', 'acled events'
    - Uses MetaNode.properties.propertyTypes to build property aliases based on key/label.
    - Builds per-class default temporal properties (e.g. 'acled_event' → 'event_date').
    """

    def __init__(
        self,
        store: "ElementStore",
        class_aliases: Optional[Dict[str, str]] = None,
        property_aliases: Optional[Dict[str, str]] = None,
    ):
        self.store = store
        self.logger = logging.getLogger(__name__)

        # Build class aliases from MetaNodes + caller-provided extras
        self.class_aliases: Dict[str, str] = self._build_class_aliases(
            class_aliases or {}
        )

        # Build property aliases (meta + builtins + caller extras)
        self.property_aliases: Dict[str, str] = self._build_property_aliases(
            property_aliases or {}
        )

        # Build map of option labels to (property_key, canonical_option_value)
        # for quick lookup when parsing option-based natural queries.
        self.option_label_map: Dict[str, Tuple[str, str]] = (
            self._build_option_label_map()
        )

        # Build edge alias map from edge MetaNodes (type == "edge") so we can
        # interpret phrases like "works at" or "assigned to".
        self.edge_aliases: Dict[str, str] = self._build_edge_aliases()

        # Class-specific default temporal property (e.g. acled_event -> event_date)
        self.class_temporal_defaults: Dict[str, str] = (
            self._build_class_temporal_defaults()
        )

        # Precompile patterns
        self._compile_patterns()

    # ---------------- schema helpers ----------------

    def _get_meta_nodes(self) -> List["MetaNode"]:
        """
        Try to retrieve MetaNodes from the store in a schema-agnostic way.
        Adjust this if your ElementStore exposes them differently.
        """
        # Common case: store.meta_nodes is a dict of MetaNode
        meta_nodes = getattr(self.store, "meta_nodes", None)
        if isinstance(meta_nodes, dict) and meta_nodes:
            return list(meta_nodes.values())

        # Alternative: store.meta.nodes
        meta = getattr(self.store, "meta", None)
        if meta is not None:
            nodes = getattr(meta, "nodes", None)
            if isinstance(nodes, dict) and nodes:
                return list(nodes.values())

        # Fallback: scan store.elements for NodeTypes.META / MetaNode instances
        from .base_element import NodeTypes  # local import to avoid cycles

        try:
            from .meta import MetaNode  # type: ignore
        except Exception:  # pragma: no cover - defensive import
            MetaNode = ()  # type: ignore

        elements = getattr(self.store, "elements", None)
        element_values = None

        if isinstance(elements, dict):
            element_values = elements.values()
        elif elements is not None:
            if hasattr(elements, "values"):
                element_values = elements.values()
            elif hasattr(elements, "elements") and isinstance(elements.elements, dict):
                element_values = elements.elements.values()

        if element_values is not None:
            meta_list: List[MetaNode] = []
            for el in element_values:
                if "MetaNode" in type(el).__name__:
                    meta_list.append(el)  # type: ignore[arg-type]
                    continue

                el_type = getattr(el, "type", None)
                if el_type == NodeTypes.META:
                    meta_list.append(el)  # type: ignore[arg-type]

            if meta_list:
                return meta_list

        return []

    def _build_class_aliases(self, extra_aliases: Dict[str, str]) -> Dict[str, str]:
        """
        Build class alias map using MetaNodes + optional user aliases.

        For a MetaNode like:
          classId='person', properties.name='Person'
        we generate aliases:
          'person', 'persons', 'person' label variants
        and special-case: 'people' -> 'person'.
        """
        aliases: Dict[str, str] = {}

        for meta in self._get_meta_nodes():
            class_id = getattr(meta, "classId", None) or getattr(meta, "class_id", None)
            if not class_id:
                continue

            canonical = str(class_id)
            canonical_lower = canonical.lower()

            props = getattr(meta, "properties", None)
            display_name = None
            if isinstance(props, dict):
                display_name = props.get("name") or props.get("label")

            names = set()

            # From classId itself
            names.add(canonical_lower)
            if not canonical_lower.endswith("s"):
                names.add(canonical_lower + "s")

            # From display name
            if display_name:
                dn = str(display_name).strip()
                dn_lower = dn.lower()
                names.add(dn_lower)
                if not dn_lower.endswith("s"):
                    names.add(dn_lower + "s")

            for alias in names:
                aliases.setdefault(alias, canonical)

        # Irregular plural: person -> people / persons
        for canon in set(aliases.values()):
            if canon.lower() == "person":
                aliases.setdefault("people", canon)
                aliases.setdefault("persons", canon)

        # Merge user-provided aliases (explicit overrides)
        for k, v in extra_aliases.items():
            aliases[k.lower()] = v

        return aliases

    def _build_edge_aliases(self) -> Dict[str, str]:
        """Build alias map for edge class_ids based on edge MetaNodes.

        For each MetaNode with properties.type == "edge", we look at:
          - its class_id (canonical edge class id)
          - its properties.name (primary label)
          - any aliases list on the meta properties object (if present)

        and generate lowercased phrases that can be used in natural
        queries, e.g. "works at" -> "employment".
        """
        aliases: Dict[str, str] = {}

        for meta in self._get_meta_nodes():
            class_id = getattr(meta, "classId", None) or getattr(meta, "class_id", None)
            if not class_id:
                continue

            props = getattr(meta, "properties", None)
            meta_type = getattr(props, "type", None) if props is not None else None
            if meta_type != "edge":
                continue

            canonical = str(class_id)

            # Primary edge label/name on the MetaNode
            name_val = None
            if isinstance(props, dict):
                name_val = props.get("name") or props.get("label")
                aliases_list = props.get("aliases") or []
            else:
                name_val = getattr(props, "name", None) or getattr(props, "label", None)
                aliases_list = getattr(props, "aliases", None) or []

            candidates = set()

            if name_val:
                nm = str(name_val).strip().lower()
                if nm:
                    candidates.add(nm)

            for a in aliases_list:
                if not a:
                    continue
                av = str(a).strip().lower()
                if av:
                    candidates.add(av)

            for phrase in candidates:
                aliases.setdefault(phrase, canonical)

        return aliases

    def _build_option_label_map(self) -> Dict[str, Tuple[str, str]]:
        """Build lookup from option label to (property_key, canonical_option_value).

        This lets us interpret phrases like "disorder type Political violence" by
        matching the trailing text against known option labels for that property.
        """
        mapping: Dict[str, Tuple[str, str]] = {}

        for meta in self._get_meta_nodes():
            from .meta import (  # local import to avoid cycles
                MetaNodeProperties,
                MetaPropertyType,
            )

            props_obj = getattr(meta, "properties", None)

            # Handle both dict-like and MetaNodeProperties
            if isinstance(props_obj, dict):
                prop_types = props_obj.get("propertyTypes") or props_obj.get(
                    "property_types"
                )
            elif isinstance(props_obj, MetaNodeProperties):
                prop_types = getattr(props_obj, "property_types", None)
            else:
                prop_types = None

            if not isinstance(prop_types, dict):
                continue

            for prop_id, pdef in prop_types.items():
                # Support both MetaPropertyType objects and plain dicts
                if isinstance(pdef, MetaPropertyType):
                    key = (getattr(pdef, "key", None) or prop_id or "").strip()
                    ptype = str(getattr(pdef, "type", "") or "").lower()
                    options = getattr(pdef, "options", None) or []
                elif isinstance(pdef, dict):
                    key = (pdef.get("key") or prop_id or "").strip()
                    ptype = str(pdef.get("type") or "").lower()
                    options = pdef.get("options") or []
                else:
                    continue

                if not key:
                    continue

                if "options" not in ptype:
                    continue

                for opt in options:
                    if not opt:
                        continue
                    canon = str(opt)
                    mapping[canon.lower()] = (key, canon)

        return mapping

    def _build_property_aliases(self, extra_aliases: Dict[str, str]) -> Dict[str, str]:
        """
        Build global property alias map using MetaNode propertyTypes,
        plus some generic builtins and user-provided overrides.
        """
        aliases: Dict[str, str] = {}

        # 1) Built-in generic mappings
        builtin = {
            "age": "age",
            "years": "age",
            "risk": "risk_score",
            "risk score": "risk_score",
            "employees": "employees",
            "employee count": "employees",
            "name": "name",
            "title": "title",
            "date": "date",
            "event date": "event_date",
            "year": "year",
            "fatalities": "fatalities",
            "deaths": "fatalities",
            "killed": "fatalities",
            "country": "country",
            "region": "region",
        }
        aliases.update({k.lower(): v for k, v in builtin.items()})

        # 2) From meta.propertyTypes (including options and aliases)
        for meta in self._get_meta_nodes():
            from .meta import (  # local import to avoid cycles
                MetaNodeProperties,
                MetaPropertyType,
            )

            props_obj = getattr(meta, "properties", None)

            # Handle both dict-like and MetaNodeProperties
            if isinstance(props_obj, dict):
                prop_types = props_obj.get("propertyTypes") or props_obj.get(
                    "property_types"
                )
            elif isinstance(props_obj, MetaNodeProperties):
                prop_types = getattr(props_obj, "property_types", None)
            else:
                prop_types = None

            if not isinstance(prop_types, dict):
                continue

            for prop_id, pdef in prop_types.items():
                # Support both MetaPropertyType objects and plain dicts
                if isinstance(pdef, MetaPropertyType):
                    key = (getattr(pdef, "key", None) or prop_id or "").strip()
                    label = (getattr(pdef, "label", "") or "").strip()
                    aliases_list = getattr(pdef, "aliases", None) or []
                elif isinstance(pdef, dict):
                    key = (pdef.get("key") or prop_id or "").strip()
                    label = (pdef.get("label") or "").strip()
                    aliases_list = pdef.get("aliases") or []
                else:
                    continue

                if not key:
                    continue

                canonical = key  # what we actually use in GraphQuery

                # Build candidate aliases for the PROPERTY NAME
                candidates = set()

                # key itself, in various forms
                key_lower = key.lower()
                candidates.add(key_lower)
                candidates.add(key_lower.replace("_", " "))
                if key_lower.endswith(" "):
                    candidates.add(key_lower.rstrip())
                if not key_lower.endswith("s"):
                    candidates.add(key_lower + "s")
                    candidates.add(key_lower.replace("_", " ") + "s")

                # label variants
                if label:
                    lbl = label.strip()
                    lbl_lower = lbl.lower()
                    candidates.add(lbl_lower)
                    if not lbl_lower.endswith("s"):
                        candidates.add(lbl_lower + "s")

                # explicit property aliases (MetaPropertyType.aliases)
                for alias_val in aliases_list:
                    if not alias_val:
                        continue
                    av = str(alias_val).strip().lower()
                    if not av:
                        continue
                    candidates.add(av)

                for c in candidates:
                    if c:
                        aliases.setdefault(c, canonical)

        # 3) User-provided overrides
        for k, v in extra_aliases.items():
            aliases[k.lower()] = v

        return aliases

    def _build_class_temporal_defaults(self) -> Dict[str, str]:
        """
        For each MetaNode, pick a "best guess" temporal property.

        Heuristic scoring based on propertyTypes key/label:
          - "event_date", "timestamp" → highest
          - contain "date" → high
          - contain "time" or "start" → medium
          - "year" → lower
        """
        defaults: Dict[str, str] = {}

        for meta in self._get_meta_nodes():
            class_id = getattr(meta, "classId", None) or getattr(meta, "class_id", None)
            if not class_id:
                continue

            props = getattr(meta, "properties", None)
            if not isinstance(props, dict):
                continue

            prop_types = props.get("propertyTypes") or props.get("property_types")
            if not isinstance(prop_types, dict):
                continue

            best_prop = None
            best_score = -1

            for prop_id, pdef in prop_types.items():
                if not isinstance(pdef, dict):
                    continue

                key = (pdef.get("key") or prop_id or "").strip()
                label = (pdef.get("label") or "").strip()
                if not key:
                    continue

                text = (key + " " + label).lower()

                score = 0
                if "event_date" in text:
                    score += 10
                if "timestamp" in text or "epoch" in text:
                    score += 9
                if "date" in text:
                    score += 8
                if "time" in text:
                    score += 6
                if "start" in text:
                    score += 5
                if "year" in text:
                    score += 4

                if score > best_score:
                    best_score = score
                    best_prop = key

            if best_prop:
                defaults[class_id] = best_prop

        return defaults

    # ---------------- pattern compilation ----------------

    def _compile_patterns(self) -> None:
        # Explicit numeric comparisons: "risk_score >= 0.7", "fatalities > 10"
        self.numeric_pattern = re.compile(
            r"(?P<prop>[a-z_][a-z0-9_]*)\s*"
            r"(?P<op>>=|<=|>|<|=|==)\s*"
            r"(?P<value>[0-9]+(?:\.[0-9]+)?)",
            re.IGNORECASE,
        )

        # Linguistic comparisons: "older than 30", "younger than 20"
        self.gt_pattern = re.compile(
            r"(older|greater|more|larger|higher|above|exceeds?)\s+than\s+"
            r"(?P<value>[0-9]+(?:\.[0-9]+)?)",
            re.IGNORECASE,
        )
        self.lt_pattern = re.compile(
            r"(younger|less|smaller|lower|below|under)\s+than\s+"
            r"(?P<value>[0-9]+(?:\.[0-9]+)?)",
            re.IGNORECASE,
        )

        # Sorting: "sorted by name", "order by risk_score desc"
        self.sort_pattern = re.compile(
            r"(?:sorted?|order)\s+(?:by\s+)?(?P<prop>[a-z_][a-z0-9_]*)"
            r"(?:\s+(?P<dir>ascending|descending|asc|desc))?",
            re.IGNORECASE,
        )

        # Limit: "top 10", "first 5", "limit 20"
        self.limit_pattern = re.compile(
            r"(?:first|top|limit|take|at\s+most)\s+(?P<n>[0-9]+)",
            re.IGNORECASE,
        )

        # Text search: quotes optional, keep original case via original string
        self.text_pattern = re.compile(
            r"(?:containing|contains|with\s+name|search\s+for|looking\s+for)\s+"
            r"(?:the\s+)?(?P<quote>['\"]?)(?P<text>.+?)(?P=quote)\b",
            re.IGNORECASE,
        )

        # Temporal patterns: "this week", "last 7 days", "next 10 weeks", etc.
        self.temporal_pattern = re.compile(
            r"(this|last|next|past)\s+(?:(\d+)\s+)?"
            r"(week|month|year|day|days|weeks|months|years)",
            re.IGNORECASE,
        )

        # "between X and Y" (date ranges)
        self.between_pattern = re.compile(
            r"between\s+(?P<a>.+?)\s+and\s+(?P<b>.+)",
            re.IGNORECASE,
        )

        # Temporal property detection keywords
        self.temporal_property_keywords = {
            "created",
            "updated",
            "modified",
            "published",
            "date",
            "time",
            "timestamp",
            "epoch",
            "since",
            "from",
            "until",
            "before",
            "after",
        }

    # ---------------- public API ----------------

    def parse(self, query_string: str) -> HumanQuery:
        """
        Parse a natural language query string into a HumanQuery.

        The query is run through a fixed sequence of rule-based extractors,
        each filling the corresponding fields of the returned ``HumanQuery``:

        1. class detection (a leading "top N" / "first N" / "at most N" is
           ignored while looking for the class)
        2. text search, including the "search by name", "full text search"
           and "wildcard search" forms
        3. numeric/comparison conditions, "but not <term>" exclusions, the
           legacy "with more than N employees" / "with role X" phrases and
           generic "with/where <property> <value>" equality clauses
        4. temporal range
        5. sort
        6. limit and "last N" tail limit
        7. "related to <id>" / "colleagues of <id>", optionally
           "via <edge phrase>"

        Examples:
            "all people older than 30"
            "top 10 acled events with fatalities > 10 sorted by event_date desc"
            "acled events in the last 7 days"
            "events between 2023-01-01 and 2023-03-01"
            "articles created last week"

        Args:
            query_string: The raw query text.

        Returns:
            A ``HumanQuery`` constructed with this parser's store and
            populated with whatever the rules above could extract.
        """
        self.logger.info("Parsing natural query: %s", query_string)

        hq = HumanQuery(self.store)

        # ``original`` keeps case for text terms; ``normalized`` is what all
        # keyword/regex rules run against.
        original = query_string.strip()
        normalized = original.lower()

        # 1. class detection
        # For queries starting with limit phrases like "top 3 people",
        # it's easier to ignore the leading limiter when detecting class.
        class_target = normalized
        m_lead_limit = re.match(r"^(top|first|at\s+most)\s+\d+\s+(.*)$", normalized)
        if m_lead_limit and m_lead_limit.group(2):
            class_target = m_lead_limit.group(2)

        hq.class_id = self._extract_class(class_target)
        # Heuristic fallback: if the query mentions events + disorder options, force class
        if not hq.class_id and "events" in normalized and "disorder" in normalized:
            hq.class_id = "event"
        self.logger.debug("  class_id=%s", hq.class_id)

        # 2. text / advanced search (uses original for case/punctuation)
        hq.text_search = self._extract_text_search(original)
        # Advanced patterns
        if "search by name" in normalized:
            # e.g. "people search by name bob"
            hq.search_property = "name"
            # Take last token as the term if no quoted text
            if not hq.text_search:
                term = original.split()[-1]
                hq.text_search = term
        elif "full text search" in normalized:
            hq.use_full_text_search = True
            # Optional property scope: "full text search X in prop1 prop2"
            scope_match = re.search(
                r"full text search\s+(?P<term>.+?)(?:\s+in\s+(?P<props>[a-z_\s,]+))?$",
                normalized,
            )
            if scope_match:
                term = scope_match.group("term")
                props_raw = scope_match.group("props")
                if term:
                    hq.text_search = term.strip()
                # For now, we just store the first scoped property as
                # search_property for consistency; multi-prop support
                # can be added later via options.
                if props_raw:
                    first_prop = props_raw.replace(",", " ").split()[0]
                    hq.search_property = first_prop
            elif not hq.text_search:
                # Fallback: e.g. "people full text search developer"
                parts = original.split("full text search", 1)
                if len(parts) > 1:
                    hq.text_search = parts[1].strip()
        elif "wildcard search" in normalized:
            hq.use_wildcard_search = True
            # pattern then optional "on <property>"
            m = re.search(
                r"wildcard search\s+([^\s]+)(?:\s+on\s+([a-z_]+))?", normalized
            )
            if m:
                pattern = m.group(1)
                prop = m.group(2) or "name"
                hq.text_search = pattern
                hq.wildcard_property = prop
        self.logger.debug("  text_search=%s", hq.text_search)

        # 3. numeric / comparison conditions
        for prop, op, val in self._extract_conditions(normalized):
            hq.add_condition(prop, op, val)

        # Negative text filter: "containing bob but not alice"
        neg_match = re.search(r"but not\s+([a-z0-9_ ]+)", normalized)
        if neg_match:
            # Store negative term as a NOT-contains condition on name
            neg_term = neg_match.group(1).strip()
            if neg_term:
                # Use a regex that matches any string NOT containing neg_term
                pattern = f"^(?!.*{re.escape(neg_term)}).*$"
                hq.add_condition(
                    self._normalize_property_name("name"),
                    FilterOperator.REGEX,
                    pattern,
                )

                # For patterns like "people containing an but not bob", also
                # ensure we have a positive text search term if one was
                # specified earlier with "containing".
                if "containing" in normalized and not hq.text_search:
                    # Extract the first term after "containing".
                    m_pos = re.search(r"containing\s+([a-z0-9_]+)", normalized)
                    if m_pos:
                        hq.text_search = m_pos.group(1)

        self.logger.debug("  conditions=%s", hq.conditions)

        # Legacy single comparison support for phrases like
        # "with more than 100 employees" or "with role senior developer"
        if "with more than" in normalized and "employees" in normalized:
            emp_prop = self._normalize_property_name("employees")
            m = re.search(r"more than\s+(\d+)", normalized)
            if m:
                emp_val = int(m.group(1))
                # Legacy single-field view
                hq.property = emp_prop
                hq.operator = FilterOperator.GT
                hq.value = emp_val
                # Structured condition
                hq.add_condition(emp_prop, FilterOperator.GT, emp_val)

        if "with role" in normalized:
            hq.property = self._normalize_property_name("role")
            hq.operator = FilterOperator.EQ
            # Everything after "with role" is taken as the value.
            role_part = normalized.split("with role", 1)[1].strip()
            hq.value = role_part

        # 3b. generic "with/where <property> <value>" pattern for simple
        # equality filters, including option-type properties.
        # First try an option-aware interpretation: if the trailing text
        # matches a known option label, use the corresponding property and
        # canonical option value.

        option_handled = False
        clause_pattern = re.compile(
            r"(?:with|where)\s+(?!more\s+than|less\s+than)([a-z_ ]+?)\s+"
            r"([a-z0-9_][a-z0-9_ ]*)(?=(?:\s+(?:with|where))|$)",
            re.IGNORECASE,
        )
        for clause in clause_pattern.finditer(normalized):
            raw_prop = clause.group(1).strip()
            raw_val = clause.group(2).strip()
            if not raw_prop or not raw_val:
                continue

            # Exact option label match.
            opt_key = raw_val.lower()
            if opt_key in self.option_label_map:
                opt_prop, canon_val = self.option_label_map[opt_key]
                hq.add_condition(opt_prop, FilterOperator.EQ, canon_val)
                option_handled = True
                continue

            # Option label appearing at the end of the captured value.
            matched = False
            for known_opt, (opt_prop, canon_val) in self.option_label_map.items():
                if opt_key.endswith(known_opt):
                    hq.add_condition(opt_prop, FilterOperator.EQ, canon_val)
                    option_handled = True
                    matched = True
                    break
            if matched:
                continue

            # Plain equality; recover the value's original casing from the
            # un-normalized query.
            prop_name = self._normalize_property_name(raw_prop)
            value_match = re.search(re.escape(raw_val), original, re.IGNORECASE)
            value_text = value_match.group(0).strip() if value_match else raw_val
            hq.add_condition(prop_name, FilterOperator.EQ, value_text)

        if not option_handled:
            bare_opt_match = re.search(
                r"disorder(?:\s+type)?\s+([a-z0-9_ ]+)", normalized
            )
            if bare_opt_match:
                opt_val = bare_opt_match.group(1).strip().lower()
                if opt_val in self.option_label_map:
                    opt_prop, canon_val = self.option_label_map[opt_val]
                    hq.add_condition(opt_prop, FilterOperator.EQ, canon_val)
                    option_handled = True

        # 3c. Fallback text-search-only pattern for "containing X but not Y"
        # when no text_search was set earlier (e.g. missing quotes).
        if (
            hq.class_id == "person"
            and "containing" in normalized
            and "but not" in normalized
            and not hq.text_search
        ):
            m_pos = re.search(r"containing\s+([a-z0-9_]+)", normalized)
            if m_pos:
                hq.text_search = m_pos.group(1)

        # 4. temporal conditions (use class_id for better defaults)
        temporal_prop, start_ms, end_ms = self._extract_temporal_range(
            normalized, class_id=hq.class_id
        )
        hq.temporal_property = temporal_prop
        hq.temporal_start_ms = start_ms
        hq.temporal_end_ms = end_ms
        self.logger.debug(
            "  temporal_property=%s, range=[%s, %s]", temporal_prop, start_ms, end_ms
        )

        # 5. sort
        sort_prop, sort_dir = self._extract_sort(normalized)
        hq.sort_property = sort_prop
        hq.sort_direction = sort_dir
        self.logger.debug("  sort=%s %s", sort_prop, sort_dir)

        # 6. limit / take-first / take-last
        hq.limit = self._extract_limit(normalized)

        # Explicit "last N" pattern, e.g. "last 5 people". A number followed
        # by a time unit ("last 7 days") belongs to the temporal range from
        # step 4 and must not be turned into a tail limit. The \b after the
        # digits stops the regex from backtracking to a shorter number.
        last_match = re.search(
            r"last\s+(?P<n>\d+)\b(?!\s+(?:minute|hour|day|week|month|year)s?\b)",
            normalized,
        )
        if last_match:
            hq.tail_limit = int(last_match.group("n"))

        self.logger.debug("  limit=%s, tail_limit=%s", hq.limit, hq.tail_limit)

        # 7. related-to patterns and aliases
        # Basic pattern: "elements related to person 1" -> element id "1"
        rel_match = re.search(r"related to [a-z_]*\s*(\d+)", normalized)
        if not rel_match:
            # Alias: "colleagues of person 1"
            rel_match = re.search(r"colleagues of [a-z_]*\s*(\d+)", normalized)
        if rel_match:
            hq.related_element_id = rel_match.group(1)

        # Extended pattern: "related to node 1 via works at"
        rel_via_match = re.search(
            r"related to [a-z_]*\s*(\d+)\s+via\s+(.+)$", normalized
        )
        if rel_via_match:
            hq.related_element_id = rel_via_match.group(1)
            raw_edge_phrase = rel_via_match.group(2).strip().lower()
            # Direct lookup in edge_aliases; we can later add fuzzy matching
            if raw_edge_phrase in self.edge_aliases:
                hq.edge_label = self.edge_aliases[raw_edge_phrase]

        self.logger.debug(
            "  related_element_id=%s, edge_label=%s",
            hq.related_element_id,
            hq.edge_label,
        )

        return hq

    # ---------------- extraction helpers ----------------

    def _extract_property(self, normalized: str) -> Optional[str]:
        """Extract the primary property mentioned in a simple comparison phrase.

        Designed to satisfy older tests that expect a single property
        from queries like "people older than 30" or
        "organizations with more than 100 employees".
        """
        # First look for explicit numeric comparison: "<prop> >= 10"
        num_match = self.numeric_pattern.search(normalized)
        if num_match:
            raw_prop = num_match.group("prop").strip()
            return self._normalize_property_name(raw_prop)

        # Special-case linguistic patterns
        if "older than" in normalized or "younger than" in normalized:
            return self._normalize_property_name("age")

        # Fallback: scan for known property aliases in the text
        for alias, canonical in self.property_aliases.items():
            if re.search(rf"\b{re.escape(alias)}\b", normalized):
                return canonical

        return None

    def _extract_operator_and_value(
        self, normalized: str
    ) -> Tuple[Optional[FilterOperator], Optional[Any]]:
        """Return the first comparison operator and numeric value in the query.

        Legacy helper retained for backwards compatibility with existing
        tests; new code should use ``_extract_conditions`` instead.

        An explicit comparison ("fatalities > 10") wins over the linguistic
        forms; among those, a "greater than"-style phrase is tried before a
        "less than"-style one. Values containing a "." become ``float``,
        all others ``int``.

        Returns:
            ``(operator, value)``, or ``(None, None)`` when nothing matched.
        """
        explicit = self.numeric_pattern.search(normalized)
        if explicit:
            symbol_to_operator = {
                ">": FilterOperator.GT,
                "<": FilterOperator.LT,
                ">=": FilterOperator.GTE,
                "<=": FilterOperator.LTE,
                "=": FilterOperator.EQ,
                "==": FilterOperator.EQ,
            }
            raw = explicit.group("value")
            number: Any = float(raw) if "." in raw else int(raw)
            operator = symbol_to_operator.get(explicit.group("op"), FilterOperator.EQ)
            return operator, number

        # Linguistic "older than 30" / "younger than 20"
        linguistic_rules = (
            (self.gt_pattern, FilterOperator.GT),
            (self.lt_pattern, FilterOperator.LT),
        )
        for pattern, operator in linguistic_rules:
            hit = pattern.search(normalized)
            if hit:
                raw = hit.group("value")
                return operator, (float(raw) if "." in raw else int(raw))

        return None, None

    def _extract_class(self, normalized: str) -> Optional[str]:
        """
        Determine classId from natural language.

        Strategy:
        - Match known aliases (built from MetaNodes).
        - If still nothing, try direct match against store classIds.
        - As last fallback, pick a non-stopword token and try plural-stripping.
        """
        # 1) Alias-based from MetaNodes
        for alias, class_id in self.class_aliases.items():
            if re.search(rf"\b{re.escape(alias)}\b", normalized):
                return class_id

        # 2) Direct match against classIds in elements
        unique_classes = {e.class_id for e in self.store.elements.values()}
        lower_map = {c.lower(): c for c in unique_classes}
        for key_lower, actual in lower_map.items():
            patterns = {key_lower}
            if not key_lower.endswith("s"):
                patterns.add(key_lower + "s")
            else:
                patterns.add(key_lower.rstrip("s"))
            if key_lower.lower() == "person":
                patterns.add("people")

            for pat in patterns:
                if re.search(rf"\b{re.escape(pat)}\b", normalized):
                    return actual

        # 3) Heuristic token fallback
        tokens = re.findall(r"[a-z]+", normalized)
        stopwords = {
            "all",
            "show",
            "me",
            "the",
            "of",
            "with",
            "that",
            "are",
            "and",
            "or",
            "in",
        }

        candidate = None
        for tok in tokens:
            if tok in stopwords:
                continue

            candidate = tok

            if tok in self.class_aliases:
                return self.class_aliases[tok]

            singular = tok[:-1] if tok.endswith("s") else tok
            if tok == "people":
                singular = "person"

            if singular in self.class_aliases:
                return self.class_aliases[singular]

            if singular in lower_map:
                return lower_map[singular]

        if candidate:
            singular = candidate[:-1] if candidate.endswith("s") else candidate
            if candidate == "people":
                singular = "person"
            if singular in lower_map:
                return lower_map[singular]
            return singular

        return None

    def _normalize_property_name(self, raw: str) -> str:
        """Map natural property phrase to graph property id."""
        key = raw.strip().lower()
        if key in self.property_aliases:
            return self.property_aliases[key]
        # simple plural stripping
        if key.endswith("s") and key[:-1] in self.property_aliases:
            return self.property_aliases[key[:-1]]
        return key.replace(" ", "_")

    def _extract_conditions(
        self, normalized: str
    ) -> List[Tuple[str, FilterOperator, Any]]:
        """
        Extract one or more property conditions from the query.

        Supports:
            - 'risk_score >= 0.7'
            - 'fatalities > 10'
            - 'age > 30'
            - 'older than 30' (implies age)
            - 'younger than 20' (implies age)

        Every explicit "<prop> <op> <number>" comparison in the text yields a
        condition. For the linguistic forms only the first "greater than"
        style phrase and the first "less than" style phrase are considered,
        and both are mapped to the age property. Neither is emitted when the
        query mentions "employees", because the number then refers to that
        property rather than to age.

        Args:
            normalized: Lower-cased query text.

        Returns:
            A list of ``(property, operator, value)`` tuples; values
            containing a "." are ``float``, all others ``int``.
        """
        op_map = {
            ">": FilterOperator.GT,
            "<": FilterOperator.LT,
            ">=": FilterOperator.GTE,
            "<=": FilterOperator.LTE,
            "=": FilterOperator.EQ,
            "==": FilterOperator.EQ,
        }
        conditions: List[Tuple[str, FilterOperator, Any]] = []

        # 1) explicit numeric patterns: "<prop> >= 10"
        for match in self.numeric_pattern.finditer(normalized):
            prop = self._normalize_property_name(match.group("prop").strip())
            value_str = match.group("value")
            value: Any = float(value_str) if "." in value_str else int(value_str)
            op = op_map.get(match.group("op"), FilterOperator.EQ)
            conditions.append((prop, op, value))

        # 2) linguistic "older than 30" / "younger than 20"
        # Only treat these as age conditions for generic phrases that are
        # not obviously tied to a specific property such as employees. The
        # guard applies to both directions so "less than 50 employees" does
        # not produce a spurious age filter.
        if "employees" not in normalized:
            linguistic_rules = (
                (self.gt_pattern, FilterOperator.GT),
                (self.lt_pattern, FilterOperator.LT),
            )
            for pattern, operator in linguistic_rules:
                hit = pattern.search(normalized)
                if not hit:
                    continue
                val_str = hit.group("value")
                value = float(val_str) if "." in val_str else int(val_str)
                conditions.append(
                    (self._normalize_property_name("age"), operator, value)
                )

        return conditions

    def _extract_text_search(self, original: str) -> Optional[str]:
        """
        Extract text search term (keep original case).
        """
        match = self.text_pattern.search(original)
        if match:
            text = match.group("text").strip()
            return text or None

        # Fallback: bare quoted phrase e.g. "acled events in 'Gaza'"
        quote_match = re.search(r"'([^']+)'|\"([^\"]+)\"", original)
        if quote_match:
            text = quote_match.group(1) or quote_match.group(2)
            return text.strip() or None

        return None

    def _extract_temporal_range(
        self,
        normalized: str,
        class_id: Optional[str],
    ) -> Tuple[Optional[str], Optional[int], Optional[int]]:
        """
        Extract temporal range from query.

        Tried in order:
          1. "between X and Y" - both sides must parse as dates; otherwise
             the query falls through to the next rules.
          2. A relative phrase matched by ``self.temporal_pattern``
             ("this week", "last 7 days", ...).
          3. The first date-like phrase found by ``search_temporal_phrases``,
             accepted only if it contains a digit or a known time word.

        Whenever a range is found, the property it applies to is the one
        detected in the text, else the class-specific default, else
        "timestamp". This applies to all three forms.

        Args:
            normalized: Lower-cased query text.
            class_id: Class the query targets, used to pick a default
                temporal property; may be None.

        Returns:
            (temporal_property, start_epoch_ms, end_epoch_ms), or
            (None, None, None) when the query has no temporal range.
        """

        def resolve_property() -> str:
            # Shared by every branch so a range never comes back without a
            # property to apply it to.
            detected = self._detect_temporal_property(normalized, class_id)
            if detected:
                return detected
            if class_id and class_id in self.class_temporal_defaults:
                return self.class_temporal_defaults[class_id]
            return "timestamp"

        # 1) "between X and Y" form
        between_match = self.between_pattern.search(normalized)
        if between_match:
            dt_a = parse_temporal_phrase(between_match.group("a").strip())
            dt_b = parse_temporal_phrase(between_match.group("b").strip())
            if dt_a and dt_b:
                # Order the epoch values rather than the datetimes: comparing
                # a naive datetime with a timezone-aware one raises TypeError.
                start_ms, end_ms = sorted(
                    (get_epoch_timestamp(dt_a), get_epoch_timestamp(dt_b))
                )
                return resolve_property(), start_ms, end_ms

        # 2) relative phrases: "this week", "last 7 days", etc.
        temporal_match = self.temporal_pattern.search(normalized)
        phrase = None
        if temporal_match:
            phrase = temporal_match.group(0)
        else:
            # 3) Fallback: any parsable date/time phrase in the text
            found = search_temporal_phrases(normalized)
            if found:
                candidate = found[0][0]
                candidate_lower = candidate.strip().lower()
                recognized_keywords = {
                    "today",
                    "yesterday",
                    "tomorrow",
                    "day",
                    "days",
                    "week",
                    "weeks",
                    "month",
                    "months",
                    "year",
                    "years",
                    "hour",
                    "hours",
                }
                # Only trust the hit when it looks like an actual date/time
                # reference (has a digit or a time word).
                if re.search(r"\d", candidate_lower) or any(
                    keyword in candidate_lower for keyword in recognized_keywords
                ):
                    phrase = candidate

        if not phrase:
            return None, None, None

        range_tuple = calculate_temporal_range(phrase)
        if not range_tuple:
            return None, None, None

        start_ms, end_ms = range_tuple
        return resolve_property(), start_ms, end_ms

    def _detect_temporal_property(
        self,
        normalized: str,
        class_id: Optional[str],
    ) -> Optional[str]:
        """
        Detect which property the temporal range applies to.

        Looks for property names before/near temporal phrases:
        - "event_date last week" -> "event_date"
        - "created this month" -> "created"
        - "timestamp past 30 days" -> "timestamp"
        If not found, falls back to class-specific defaults, then generic.
        """
        property_pattern = re.compile(
            r"(?P<prop>[a-z_][a-z0-9_]*)\s+"
            r"(?:from\s+)?(this|last|next|past)\s+(?:\d+\s+)?"
            r"(week|month|year|day|days|weeks|months|years)",
            re.IGNORECASE,
        )

        match = property_pattern.search(normalized)
        if match:
            prop_name = match.group("prop")
            normalized_prop = self._normalize_property_name(prop_name)
            return normalized_prop

        # Look for standalone temporal property keywords
        for keyword in self.temporal_property_keywords:
            if re.search(rf"\b{keyword}\b", normalized):
                if keyword in {"created", "updated", "modified", "published"}:
                    return keyword
                if keyword in {"timestamp", "epoch", "date", "time"}:
                    return keyword

        # Class-specific default
        if class_id and class_id in self.class_temporal_defaults:
            return self.class_temporal_defaults[class_id]

        return None

    def _extract_sort(self, normalized: str) -> Tuple[Optional[str], str]:
        match = self.sort_pattern.search(normalized)
        if not match:
            return None, "asc"

        raw_prop = match.group("prop")
        raw_dir = match.group("dir") or "asc"
        direction = "desc" if raw_dir.lower().startswith("desc") else "asc"

        return self._normalize_property_name(raw_prop), direction

    def _extract_limit(self, normalized: str) -> Optional[int]:
        match = self.limit_pattern.search(normalized)
        if not match:
            return None
        try:
            return int(match.group("n"))
        except ValueError:
            return None


# ---------- Convenience helper ----------


def query(store: "ElementStore", natural_query: str) -> HumanQuery:
    """
    One-call shortcut: build a parser for ``store`` and parse the query.

    A new ``NaturalQueryParser`` is created on every call.

    Example:
        hq = query(store, "top 10 acled events with fatalities > 10 in the last 7 days")
        results = hq.execute()
    """
    return NaturalQueryParser(store).parse(natural_query)
