# Mirror of https://github.com/langgenius/dify.git
# Synced 2026-03-25 14:01:06 -04:00
import logging
|
|
from abc import ABC, abstractmethod
|
|
from collections.abc import Mapping
|
|
from typing import Any
|
|
|
|
from core.evaluation.entities.evaluation_entity import (
|
|
CustomizedMetrics,
|
|
EvaluationCategory,
|
|
EvaluationItemInput,
|
|
EvaluationItemResult,
|
|
EvaluationMetric,
|
|
)
|
|
from dify_graph.node_events.base import NodeRunResult
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class BaseEvaluationInstance(ABC):
    """Abstract base class for evaluation framework adapters.

    Subclasses bind a concrete evaluation framework by implementing the
    category-specific ``evaluate_*`` hooks and ``get_supported_metrics``.
    ``evaluate_with_customized_workflow`` is a shared concrete
    implementation that delegates scoring to a published workflow.
    """

    @abstractmethod
    def evaluate_llm(
        self,
        items: list[EvaluationItemInput],
        metric_names: list[str],
        model_provider: str,
        model_name: str,
        tenant_id: str,
    ) -> list[EvaluationItemResult]:
        """Evaluate LLM outputs using the configured framework.

        Args:
            items: Test-data items to evaluate.
            metric_names: Names of the metrics to compute.
            model_provider: Provider of the model the framework uses.
            model_name: Name of the model the framework uses.
            tenant_id: Tenant scope.

        Returns:
            Evaluation results for the input items.
        """
        ...

    @abstractmethod
    def evaluate_retrieval(
        self,
        items: list[EvaluationItemInput],
        metric_names: list[str],
        model_provider: str,
        model_name: str,
        tenant_id: str,
    ) -> list[EvaluationItemResult]:
        """Evaluate retrieval quality using the configured framework.

        Parameters and return value mirror :meth:`evaluate_llm`.
        """
        ...

    @abstractmethod
    def evaluate_agent(
        self,
        items: list[EvaluationItemInput],
        metric_names: list[str],
        model_provider: str,
        model_name: str,
        tenant_id: str,
    ) -> list[EvaluationItemResult]:
        """Evaluate agent outputs using the configured framework.

        Parameters and return value mirror :meth:`evaluate_llm`.
        """
        ...

    @abstractmethod
    def get_supported_metrics(self, category: EvaluationCategory) -> list[str]:
        """Return the list of supported metric names for a given evaluation category."""
        ...

    def evaluate_with_customized_workflow(
        self,
        node_run_result_mapping_list: list[dict[str, NodeRunResult]],
        customized_metrics: CustomizedMetrics,
        tenant_id: str,
    ) -> list[EvaluationItemResult]:
        """Evaluate using a published workflow as the evaluator.

        The evaluator workflow's output variables are treated as metrics:
        each output variable name becomes a metric name, and its value
        becomes the score.

        Args:
            node_run_result_mapping_list: One mapping per test-data item,
                where each mapping is ``{node_id: NodeRunResult}`` from the
                target execution.
            customized_metrics: Contains ``evaluation_workflow_id`` (the
                published evaluator workflow) and ``input_fields`` (value
                sources for the evaluator's input variables).
            tenant_id: Tenant scope.

        Returns:
            A list of ``EvaluationItemResult`` with metrics extracted from
            the evaluator workflow's output variables.

        Raises:
            ValueError: If ``evaluation_workflow_id`` is missing, the app
                cannot be found in the tenant, or no published workflow
                exists for the app.
        """
        # NOTE(review): imported locally, presumably to avoid circular
        # imports at module load time — confirm before hoisting to the top.
        from sqlalchemy.orm import Session

        from core.app.apps.workflow.app_generator import WorkflowAppGenerator
        from core.app.entities.app_invoke_entities import InvokeFrom
        from core.evaluation.runners import get_service_account_for_app
        from models.engine import db
        from models.model import App
        from services.workflow_service import WorkflowService

        workflow_id = customized_metrics.evaluation_workflow_id
        if not workflow_id:
            raise ValueError("customized_metrics must contain 'evaluation_workflow_id' for customized evaluator")

        # Load the evaluator workflow resources using a dedicated session.
        # expire_on_commit=False keeps `app` and `service_account` usable
        # after this session closes (they are referenced below).
        with Session(db.engine, expire_on_commit=False) as session, session.begin():
            app = session.query(App).filter_by(id=workflow_id, tenant_id=tenant_id).first()
            if not app:
                raise ValueError(f"Evaluation workflow app {workflow_id} not found in tenant {tenant_id}")
            service_account = get_service_account_for_app(session, workflow_id)

        workflow_service = WorkflowService()
        published_workflow = workflow_service.get_published_workflow(app_model=app)
        if not published_workflow:
            raise ValueError(f"No published workflow found for evaluation app {workflow_id}")

        eval_results: list[EvaluationItemResult] = []
        for idx, node_run_result_mapping in enumerate(node_run_result_mapping_list):
            # A failure on one item must not abort the batch: on error we
            # log and record a metric-less result for that index instead.
            try:
                workflow_inputs = self._build_workflow_inputs(
                    customized_metrics.input_fields,
                    node_run_result_mapping,
                )

                generator = WorkflowAppGenerator()
                response: Mapping[str, Any] = generator.generate(
                    app_model=app,
                    workflow=published_workflow,
                    user=service_account,
                    args={"inputs": workflow_inputs},
                    invoke_from=InvokeFrom.SERVICE_API,
                    streaming=False,
                    call_depth=0,
                )

                metrics = self._extract_workflow_metrics(response)
                eval_results.append(
                    EvaluationItemResult(
                        index=idx,
                        metrics=metrics,
                    )
                )
            except Exception:
                logger.exception(
                    "Customized evaluator failed for item %d with workflow %s",
                    idx,
                    workflow_id,
                )
                eval_results.append(EvaluationItemResult(index=idx))

        return eval_results

    @staticmethod
    def _build_workflow_inputs(
        input_fields: dict[str, Any],
        node_run_result_mapping: dict[str, NodeRunResult],
    ) -> dict[str, Any]:
        """Build customized workflow inputs by resolving value sources.

        Each entry in ``input_fields`` maps a workflow input variable name
        to its value source, which can be:

        - **Constant**: a plain string without ``{{#…#}}`` used as-is.
        - **Expression**: a string containing one or more
          ``{{#node_id.output_key#}}`` selectors (same format as
          ``VariableTemplateParser``) resolved from
          ``node_run_result_mapping``.
        """
        from dify_graph.nodes.base.variable_template_parser import REGEX as VARIABLE_REGEX

        workflow_inputs: dict[str, Any] = {}

        for field_name, value_source in input_fields.items():
            if not isinstance(value_source, str):
                # Non-string values (numbers, bools, dicts) are used directly.
                workflow_inputs[field_name] = value_source
                continue

            # Check if the entire value is a single expression.
            full_match = VARIABLE_REGEX.fullmatch(value_source)
            if full_match:
                # A full-expression value keeps the resolved value's
                # original type (may be a dict, list, number, etc.).
                workflow_inputs[field_name] = resolve_variable_selector(
                    full_match.group(1),
                    node_run_result_mapping,
                )
            elif VARIABLE_REGEX.search(value_source):
                # Mixed template: interpolate all expressions as strings.
                workflow_inputs[field_name] = VARIABLE_REGEX.sub(
                    lambda m: str(resolve_variable_selector(m.group(1), node_run_result_mapping)),
                    value_source,
                )
            else:
                # Plain constant — no expression markers.
                workflow_inputs[field_name] = value_source

        return workflow_inputs

    @staticmethod
    def _extract_workflow_metrics(
        response: Mapping[str, object],
    ) -> list[EvaluationMetric]:
        """Extract evaluation metrics from workflow output variables.

        Expects ``response["data"]["outputs"]`` to be a dict; each string
        key becomes a metric name and its value the metric value. Returns
        an empty list (with a warning) on any shape mismatch.
        """
        metrics: list[EvaluationMetric] = []

        data = response.get("data")
        if not isinstance(data, Mapping):
            logger.warning("Unexpected workflow response format: missing 'data' dict")
            return metrics

        outputs = data.get("outputs")
        if not isinstance(outputs, dict):
            logger.warning("Unexpected workflow response format: 'outputs' is not a dict")
            return metrics

        for key, raw_value in outputs.items():
            # Non-string keys cannot serve as metric names; skip them.
            if not isinstance(key, str):
                continue
            metrics.append(EvaluationMetric(name=key, value=raw_value))

        return metrics
|
|
|
|
|
|
def resolve_variable_selector(
    selector_raw: str,
    node_run_result_mapping: dict[str, NodeRunResult],
) -> object:
    """
    Resolve a ``#node_id.output_key#`` selector against node run results.

    The selector is split into a node id and a (possibly nested) output
    key path. Any resolution failure is logged and yields ``""`` rather
    than raising.
    """
    segments = selector_raw.strip("#").split(".")

    # A valid selector needs a node id plus at least one output key.
    if len(segments) < 2:
        logger.warning(
            "Selector '%s' must have at least node_id.output_key",
            selector_raw,
        )
        return ""

    node_id, *path_keys = segments

    run_result = node_run_result_mapping.get(node_id)
    if not run_result or not run_result.outputs:
        logger.warning(
            "Selector '%s': node '%s' not found or has no outputs",
            selector_raw,
            node_id,
        )
        return ""

    # Walk the key path, descending one mapping level per segment.
    value: object = run_result.outputs
    for segment in path_keys:
        if not isinstance(value, Mapping):
            logger.warning(
                "Selector '%s': cannot traverse into non-dict value at key '%s'",
                selector_raw,
                segment,
            )
            return ""
        candidate = value.get(segment)
        if candidate is None:
            logger.warning(
                "Selector '%s': key '%s' not found in node '%s' outputs",
                selector_raw,
                segment,
                node_id,
            )
            return ""
        value = candidate

    return "" if value is None else value
|