# Mirror of https://github.com/langgenius/dify.git
# Synced 2026-03-25 14:01:06 -04:00
import logging
|
|
from abc import ABC, abstractmethod
|
|
from collections.abc import Mapping
|
|
from typing import Any
|
|
|
|
from core.evaluation.entities.evaluation_entity import (
|
|
CustomizedMetrics,
|
|
EvaluationCategory,
|
|
EvaluationItemInput,
|
|
EvaluationItemResult,
|
|
EvaluationMetric,
|
|
)
|
|
from dify_graph.node_events.base import NodeRunResult
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class BaseEvaluationInstance(ABC):
    """Abstract base class for evaluation framework adapters.

    Subclasses bind a concrete evaluation framework by implementing the
    category-specific ``evaluate_*`` hooks and ``get_supported_metrics``.
    ``evaluate_with_customized_workflow`` is a shared concrete
    implementation that delegates scoring to a published workflow.
    """

    @abstractmethod
    def evaluate_llm(
        self,
        items: list[EvaluationItemInput],
        metric_names: list[str],
        model_provider: str,
        model_name: str,
        tenant_id: str,
    ) -> list[EvaluationItemResult]:
        """Evaluate LLM outputs using the configured framework.

        Args:
            items: Test-data items to evaluate.
            metric_names: Names of the metrics to compute.
            model_provider: Provider of the model the framework uses.
            model_name: Name of the model the framework uses.
            tenant_id: Tenant scope.

        Returns:
            Evaluation results for the input items.
        """
        ...

    @abstractmethod
    def evaluate_retrieval(
        self,
        items: list[EvaluationItemInput],
        metric_names: list[str],
        model_provider: str,
        model_name: str,
        tenant_id: str,
    ) -> list[EvaluationItemResult]:
        """Evaluate retrieval quality using the configured framework.

        Parameters and return value mirror :meth:`evaluate_llm`.
        """
        ...

    @abstractmethod
    def evaluate_agent(
        self,
        items: list[EvaluationItemInput],
        metric_names: list[str],
        model_provider: str,
        model_name: str,
        tenant_id: str,
    ) -> list[EvaluationItemResult]:
        """Evaluate agent outputs using the configured framework.

        Parameters and return value mirror :meth:`evaluate_llm`.
        """
        ...

    @abstractmethod
    def get_supported_metrics(self, category: EvaluationCategory) -> list[str]:
        """Return the list of supported metric names for a given evaluation category."""
        ...

    def evaluate_with_customized_workflow(
        self,
        node_run_result_mapping_list: list[dict[str, NodeRunResult]],
        customized_metrics: CustomizedMetrics,
        tenant_id: str,
    ) -> list[EvaluationItemResult]:
        """Evaluate using a published workflow as the evaluator.

        The evaluator workflow's output variables are treated as metrics:
        each output variable name becomes a metric name, and its value
        becomes the score.

        Args:
            node_run_result_mapping_list: One mapping per test-data item,
                where each mapping is ``{node_id: NodeRunResult}`` from the
                target execution.
            customized_metrics: Contains ``evaluation_workflow_id`` (the
                published evaluator workflow) and ``input_fields`` (value
                sources for the evaluator's input variables).
            tenant_id: Tenant scope.

        Returns:
            A list of ``EvaluationItemResult`` with metrics extracted from
            the evaluator workflow's output variables.

        Raises:
            ValueError: If ``evaluation_workflow_id`` is missing, the app
                cannot be found in the tenant, or no published workflow
                exists for the app.
        """
        # NOTE(review): imported locally, presumably to avoid circular
        # imports at module load time — confirm before hoisting to the top.
        from sqlalchemy.orm import Session

        from core.app.apps.workflow.app_generator import WorkflowAppGenerator
        from core.app.entities.app_invoke_entities import InvokeFrom
        from core.evaluation.runners import get_service_account_for_app
        from models.engine import db
        from models.model import App
        from services.workflow_service import WorkflowService

        workflow_id = customized_metrics.evaluation_workflow_id
        if not workflow_id:
            raise ValueError("customized_metrics must contain 'evaluation_workflow_id' for customized evaluator")

        # Load the evaluator workflow resources using a dedicated session.
        # expire_on_commit=False keeps `app` and `service_account` usable
        # after this session closes (they are referenced below).
        with Session(db.engine, expire_on_commit=False) as session, session.begin():
            app = session.query(App).filter_by(id=workflow_id, tenant_id=tenant_id).first()
            if not app:
                raise ValueError(f"Evaluation workflow app {workflow_id} not found in tenant {tenant_id}")
            service_account = get_service_account_for_app(session, workflow_id)

        workflow_service = WorkflowService()
        published_workflow = workflow_service.get_published_workflow(app_model=app)
        if not published_workflow:
            raise ValueError(f"No published workflow found for evaluation app {workflow_id}")

        eval_results: list[EvaluationItemResult] = []
        for idx, node_run_result_mapping in enumerate(node_run_result_mapping_list):
            # A failure on one item must not abort the batch: on error we
            # log and record a metric-less result for that index instead.
            try:
                workflow_inputs = self._build_workflow_inputs(
                    customized_metrics.input_fields,
                    node_run_result_mapping,
                )

                generator = WorkflowAppGenerator()
                response: Mapping[str, Any] = generator.generate(
                    app_model=app,
                    workflow=published_workflow,
                    user=service_account,
                    args={"inputs": workflow_inputs},
                    invoke_from=InvokeFrom.SERVICE_API,
                    streaming=False,
                    call_depth=0,
                )

                metrics = self._extract_workflow_metrics(response)
                eval_results.append(
                    EvaluationItemResult(
                        index=idx,
                        metrics=metrics,
                    )
                )
            except Exception:
                logger.exception(
                    "Customized evaluator failed for item %d with workflow %s",
                    idx,
                    workflow_id,
                )
                eval_results.append(EvaluationItemResult(index=idx))

        return eval_results

    @staticmethod
    def _build_workflow_inputs(
        input_fields: dict[str, Any],
        node_run_result_mapping: dict[str, NodeRunResult],
    ) -> dict[str, Any]:
        """Build customized workflow inputs by resolving value sources.

        Each entry in ``input_fields`` maps a workflow input variable name
        to its value source, which can be:

        - **Constant**: a plain string without ``{{#…#}}`` used as-is.
        - **Expression**: a string containing one or more
          ``{{#node_id.output_key#}}`` selectors (same format as
          ``VariableTemplateParser``) resolved from
          ``node_run_result_mapping``.
        """
        from dify_graph.nodes.base.variable_template_parser import REGEX as VARIABLE_REGEX

        workflow_inputs: dict[str, Any] = {}

        for field_name, value_source in input_fields.items():
            if not isinstance(value_source, str):
                # Non-string values (numbers, bools, dicts) are used directly.
                workflow_inputs[field_name] = value_source
                continue

            # Check if the entire value is a single expression.
            full_match = VARIABLE_REGEX.fullmatch(value_source)
            if full_match:
                # A full-expression value keeps the resolved value's
                # original type (may be a dict, list, number, etc.).
                workflow_inputs[field_name] = resolve_variable_selector(
                    full_match.group(1),
                    node_run_result_mapping,
                )
            elif VARIABLE_REGEX.search(value_source):
                # Mixed template: interpolate all expressions as strings.
                workflow_inputs[field_name] = VARIABLE_REGEX.sub(
                    lambda m: str(resolve_variable_selector(m.group(1), node_run_result_mapping)),
                    value_source,
                )
            else:
                # Plain constant — no expression markers.
                workflow_inputs[field_name] = value_source

        return workflow_inputs

    @staticmethod
    def _extract_workflow_metrics(
        response: Mapping[str, object],
    ) -> list[EvaluationMetric]:
        """Extract evaluation metrics from workflow output variables.

        Expects ``response["data"]["outputs"]`` to be a dict; each string
        key becomes a metric name and its value the metric value. Returns
        an empty list (with a warning) on any shape mismatch.
        """
        metrics: list[EvaluationMetric] = []

        data = response.get("data")
        if not isinstance(data, Mapping):
            logger.warning("Unexpected workflow response format: missing 'data' dict")
            return metrics

        outputs = data.get("outputs")
        if not isinstance(outputs, dict):
            logger.warning("Unexpected workflow response format: 'outputs' is not a dict")
            return metrics

        for key, raw_value in outputs.items():
            # Non-string keys cannot serve as metric names; skip them.
            if not isinstance(key, str):
                continue
            metrics.append(EvaluationMetric(name=key, value=raw_value))

        return metrics
|
|
|
|
|
|
def resolve_variable_selector(
    selector_raw: str,
    node_run_result_mapping: dict[str, NodeRunResult],
) -> object:
    """
    Resolve a ``#node_id.output_key#`` selector against node run results.

    The selector is split into a node id and a (possibly nested) output
    key path. Any resolution failure is logged and yields ``""`` rather
    than raising.
    """
    segments = selector_raw.strip("#").split(".")

    # A valid selector needs a node id plus at least one output key.
    if len(segments) < 2:
        logger.warning(
            "Selector '%s' must have at least node_id.output_key",
            selector_raw,
        )
        return ""

    node_id, *path_keys = segments

    run_result = node_run_result_mapping.get(node_id)
    if not run_result or not run_result.outputs:
        logger.warning(
            "Selector '%s': node '%s' not found or has no outputs",
            selector_raw,
            node_id,
        )
        return ""

    # Walk the key path, descending one mapping level per segment.
    value: object = run_result.outputs
    for segment in path_keys:
        if not isinstance(value, Mapping):
            logger.warning(
                "Selector '%s': cannot traverse into non-dict value at key '%s'",
                selector_raw,
                segment,
            )
            return ""
        candidate = value.get(segment)
        if candidate is None:
            logger.warning(
                "Selector '%s': key '%s' not found in node '%s' outputs",
                selector_raw,
                segment,
                node_id,
            )
            return ""
        value = candidate

    return "" if value is None else value
|