mirror of
https://github.com/langgenius/dify.git
synced 2026-05-14 10:00:53 -04:00
Co-authored-by: jyong <718720800@qq.com> Co-authored-by: Yansong Zhang <916125788@qq.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: hj24 <mambahj24@gmail.com> Co-authored-by: hj24 <huangjian@dify.ai> Co-authored-by: Joel <iamjoel007@gmail.com> Co-authored-by: Stephen Zhou <38493346+hyoban@users.noreply.github.com> Co-authored-by: CodingOnStar <hanxujiang@dify.com> Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
281 lines
11 KiB
Python
281 lines
11 KiB
Python
import json
|
|
from enum import StrEnum
|
|
from typing import Any
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
from core.evaluation.entities.judgment_entity import JudgmentConfig, JudgmentResult
|
|
|
|
|
|
class EvaluationCategory(StrEnum):
|
|
LLM = "llm"
|
|
RETRIEVAL = "knowledge_retrieval"
|
|
AGENT = "agent"
|
|
WORKFLOW = "workflow"
|
|
SNIPPET = "snippet"
|
|
KNOWLEDGE_BASE = "knowledge_base"
|
|
|
|
|
|
class EvaluationMetricName(StrEnum):
|
|
"""Canonical metric names shared across all evaluation frameworks.
|
|
|
|
Each framework maps these names to its own internal implementation.
|
|
A framework that does not support a given metric should log a warning
|
|
and skip it rather than raising an error.
|
|
|
|
── LLM / general text-quality metrics ──────────────────────────────────
|
|
FAITHFULNESS
|
|
Measures whether every claim in the model's response is grounded in
|
|
the provided retrieved context. A high score means the answer
|
|
contains no hallucinated content — each statement can be traced back
|
|
to a passage in the context.
|
|
Required fields: user_input, response, retrieved_contexts.
|
|
|
|
ANSWER_RELEVANCY
|
|
Measures how well the model's response addresses the user's question.
|
|
A high score means the answer stays on-topic; a low score indicates
|
|
irrelevant content or a failure to answer the actual question.
|
|
Required fields: user_input, response.
|
|
|
|
ANSWER_CORRECTNESS
|
|
Measures the factual accuracy and completeness of the model's answer
|
|
relative to a ground-truth reference. It combines semantic similarity
|
|
with key-fact coverage, so both meaning and content matter.
|
|
Required fields: user_input, response, reference (expected_output).
|
|
|
|
SEMANTIC_SIMILARITY
|
|
Measures the cosine similarity between the model's response and the
|
|
reference answer in an embedding space. It evaluates whether the two
|
|
texts convey the same meaning, independent of factual correctness.
|
|
Required fields: response, reference (expected_output).
|
|
|
|
── Retrieval-quality metrics ────────────────────────────────────────────
|
|
CONTEXT_PRECISION
|
|
Measures the proportion of retrieved context chunks that are actually
|
|
relevant to the question (precision). A high score means the retrieval
|
|
pipeline returns little noise.
|
|
Required fields: user_input, reference, retrieved_contexts.
|
|
|
|
CONTEXT_RECALL
|
|
Measures the proportion of ground-truth information that is covered by
|
|
the retrieved context chunks (recall). A high score means the retrieval
|
|
pipeline does not miss important supporting evidence.
|
|
Required fields: user_input, reference, retrieved_contexts.
|
|
|
|
CONTEXT_RELEVANCE
|
|
Measures how relevant each individual retrieved chunk is to the query.
|
|
Similar to CONTEXT_PRECISION but evaluated at the chunk level rather
|
|
than against a reference answer.
|
|
Required fields: user_input, retrieved_contexts.
|
|
|
|
── Agent-quality metrics ────────────────────────────────────────────────
|
|
TOOL_CORRECTNESS
|
|
Measures the correctness of the tool calls made by the agent during
|
|
task execution — both the choice of tool and the arguments passed.
|
|
A high score means the agent's tool-use strategy matches the expected
|
|
behavior.
|
|
Required fields: actual tool calls vs. expected tool calls.
|
|
|
|
TASK_COMPLETION
|
|
Measures whether the agent ultimately achieves the user's stated goal.
|
|
It evaluates the reasoning chain, intermediate steps, and final output
|
|
holistically; a high score means the task was fully accomplished.
|
|
Required fields: user_input, actual_output.
|
|
"""
|
|
|
|
# LLM / general text-quality metrics
|
|
FAITHFULNESS = "faithfulness"
|
|
ANSWER_RELEVANCY = "answer_relevancy"
|
|
ANSWER_CORRECTNESS = "answer_correctness"
|
|
SEMANTIC_SIMILARITY = "semantic_similarity"
|
|
|
|
# Retrieval-quality metrics
|
|
CONTEXT_PRECISION = "context_precision"
|
|
CONTEXT_RECALL = "context_recall"
|
|
CONTEXT_RELEVANCE = "context_relevance"
|
|
|
|
# Agent-quality metrics
|
|
TOOL_CORRECTNESS = "tool_correctness"
|
|
TASK_COMPLETION = "task_completion"
|
|
|
|
|
|
# Per-category canonical metric lists used by get_supported_metrics().
LLM_METRIC_NAMES: list[EvaluationMetricName] = [
    EvaluationMetricName.FAITHFULNESS,  # Every claim is grounded in context; no hallucinations
    EvaluationMetricName.ANSWER_RELEVANCY,  # Response stays on-topic and addresses the question
    EvaluationMetricName.ANSWER_CORRECTNESS,  # Factual accuracy and completeness vs. reference
    EvaluationMetricName.SEMANTIC_SIMILARITY,  # Semantic closeness to the reference answer
]
|
|
|
|
# Canonical metrics for the retrieval category.
RETRIEVAL_METRIC_NAMES: list[EvaluationMetricName] = [
    EvaluationMetricName.CONTEXT_PRECISION,  # Fraction of retrieved chunks that are relevant (precision)
    EvaluationMetricName.CONTEXT_RECALL,  # Fraction of ground-truth info covered by retrieval (recall)
    EvaluationMetricName.CONTEXT_RELEVANCE,  # Per-chunk relevance to the query
]
|
|
|
|
# Canonical metrics for the agent category.
AGENT_METRIC_NAMES: list[EvaluationMetricName] = [
    EvaluationMetricName.TOOL_CORRECTNESS,  # Correct tool selection and arguments
    EvaluationMetricName.TASK_COMPLETION,  # Whether the agent fully achieves the user's goal
]
|
|
|
|
# Canonical metrics for the workflow category: the LLM text-quality metrics
# without SEMANTIC_SIMILARITY.
WORKFLOW_METRIC_NAMES: list[EvaluationMetricName] = [
    EvaluationMetricName.FAITHFULNESS,
    EvaluationMetricName.ANSWER_RELEVANCY,
    EvaluationMetricName.ANSWER_CORRECTNESS,
]
|
|
|
|
# Metric string value -> node-type string, derived from the per-category lists.
# Keys are the plain ``.value`` strings, not enum members.
# NOTE(review): the retrieval node type is hyphenated ("knowledge-retrieval"),
# unlike EvaluationCategory.RETRIEVAL ("knowledge_retrieval") — presumably it
# follows the workflow node-type naming; confirm against callers.
METRIC_NODE_TYPE_MAPPING: dict[str, str] = {
    **{m.value: "llm" for m in LLM_METRIC_NAMES},
    **{m.value: "knowledge-retrieval" for m in RETRIEVAL_METRIC_NAMES},
    **{m.value: "agent" for m in AGENT_METRIC_NAMES},
}
|
|
|
|
# Metric -> value-type string. Keys are EvaluationMetricName members; every
# canonical metric is currently declared as "number". The entries are listed
# explicitly so a future metric with a different value type is a one-line change.
METRIC_VALUE_TYPE_MAPPING: dict[str, str] = {
    EvaluationMetricName.FAITHFULNESS: "number",
    EvaluationMetricName.ANSWER_RELEVANCY: "number",
    EvaluationMetricName.ANSWER_CORRECTNESS: "number",
    EvaluationMetricName.SEMANTIC_SIMILARITY: "number",
    EvaluationMetricName.CONTEXT_PRECISION: "number",
    EvaluationMetricName.CONTEXT_RECALL: "number",
    EvaluationMetricName.CONTEXT_RELEVANCE: "number",
    EvaluationMetricName.TOOL_CORRECTNESS: "number",
    EvaluationMetricName.TASK_COMPLETION: "number",
}
|
|
|
|
|
|
class NodeInfo(BaseModel):
    """Identifies a node by its id, type string and title."""

    node_id: str
    # Field name shadows the ``type`` builtin inside the class body only; it is
    # part of the serialized schema, so it must keep this name.
    type: str
    title: str
|
|
|
|
|
|
class EvaluationMetric(BaseModel):
    """A single named metric value, optionally associated with a node."""

    name: str
    # Untyped on purpose: the value's concrete type is not constrained here.
    value: Any
    # Free-form extra data; defaults to a fresh empty dict per instance.
    details: dict[str, Any] = Field(default_factory=dict)
    # The node this metric belongs to, when there is one.
    node_info: NodeInfo | None = None
|
|
|
|
|
|
class EvaluationItemInput(BaseModel):
    """One item handed to an evaluator: inputs, produced output and optional references."""

    index: int
    inputs: dict[str, Any]
    output: str
    # Reference answer, when one is available.
    expected_output: str | None = None
    # Context passages, when available (presumably retrieved contexts — confirm with callers).
    context: list[str] | None = None
|
|
|
|
|
|
class EvaluationDatasetInput(BaseModel):
    """Parsed dataset row used throughout evaluation execution.

    ``expected_output`` keeps backward compatibility with the original
    single-reference template. When users upload node-specific reference
    columns such as ``LLM 1 : expected_output``, they are stored in
    ``expected_outputs`` and resolved by node title at execution time.
    """

    index: int
    inputs: dict[str, Any]
    # Generic (single-reference) expected output.
    expected_output: str | None = None
    # Node-specific expected outputs, keyed by node title.
    expected_outputs: dict[str, str] = Field(default_factory=dict)

    def get_expected_output_for_node(self, node_title: str | None) -> str | None:
        """Return the best matching reference answer for the given node title.

        Resolution order:

        1. the entry of ``expected_outputs`` keyed by ``node_title``;
        2. the generic ``expected_output``;
        3. the sole entry of ``expected_outputs`` when it holds exactly one;
        4. ``None``.

        Args:
            node_title: Title of the node being evaluated; ``None`` or an empty
                string skips the node-specific lookup.

        Returns:
            The resolved reference answer, or ``None`` when nothing matches.
        """
        if node_title and node_title in self.expected_outputs:
            return self.expected_outputs[node_title]

        if self.expected_output is not None:
            return self.expected_output

        # With a single node-specific reference there is no ambiguity, so use it
        # even though the title did not match.
        if len(self.expected_outputs) == 1:
            return next(iter(self.expected_outputs.values()))

        return None

    def serialize_expected_output(self) -> str | None:
        """Serialize references for persistence and API responses.

        Single-reference datasets stay unchanged, while multi-node references
        are stored as JSON so history/detail APIs can still expose the full
        uploaded payload without changing the database schema.

        Returns:
            ``expected_output`` as-is (possibly ``None``) when there are no
            node-specific references; otherwise a JSON object string with
            sorted keys, holding the node-specific references plus the generic
            one under the ``"expected_output"`` key when it is set.
        """
        if not self.expected_outputs:
            # Covers both "single reference" (returned unchanged) and
            # "no reference at all" (``None``).
            return self.expected_output

        serialized_expected_outputs = dict(self.expected_outputs)
        if self.expected_output is not None:
            # NOTE(review): a node literally titled "expected_output" would
            # override the generic reference here, because later keys win.
            serialized_expected_outputs = {"expected_output": self.expected_output, **serialized_expected_outputs}

        return json.dumps(serialized_expected_outputs, ensure_ascii=False, sort_keys=True)

    def iter_expected_output_columns(self) -> list[tuple[str, str]]:
        """Return uploaded expected-output columns in display order.

        The generic ``expected_output`` column comes first (when set), followed
        by one ``"<node title> : expected_output"`` column per node-specific
        reference, in the insertion order of ``expected_outputs``.

        Returns:
            A list of ``(column_name, value)`` pairs.
        """
        columns: list[tuple[str, str]] = []
        if self.expected_output is not None:
            columns.append(("expected_output", self.expected_output))

        columns.extend(
            (f"{node_title} : expected_output", value) for node_title, value in self.expected_outputs.items()
        )

        return columns
|
|
|
|
|
|
class EvaluationItemResult(BaseModel):
    """Result for one evaluated item: output, metrics, judgment and optional error."""

    index: int
    actual_output: str | None = None
    # Each default below is built per instance, so instances never share state.
    metrics: list[EvaluationMetric] = Field(default_factory=list)
    metadata: dict[str, Any] = Field(default_factory=dict)
    # Defaults to a JudgmentResult constructed with no arguments.
    judgment: JudgmentResult = Field(default_factory=JudgmentResult)
    # Error message, when one was recorded for this item.
    error: str | None = None
|
|
|
|
|
|
class DefaultMetric(BaseModel):
    """Selection of a built-in metric together with the nodes it applies to."""

    # Metric name (presumably an EvaluationMetricName value — confirm with callers).
    metric: str
    # Defaults to an empty string when not supplied.
    value_type: str = ""
    # Required: there is no default, so callers must always pass the node list.
    node_info_list: list[NodeInfo]
|
|
|
|
|
|
class CustomizedMetricOutputField(BaseModel):
    """An output field of a customized metric: a variable name and its value type."""

    variable: str
    value_type: str
|
|
|
|
|
|
class CustomizedMetrics(BaseModel):
    """Configuration of customized metrics backed by an evaluation workflow."""

    evaluation_workflow_id: str
    # Free-form mapping; its schema is not constrained by this model.
    input_fields: dict[str, Any]
    output_fields: list[CustomizedMetricOutputField]
|
|
|
|
|
|
class EvaluationConfigData(BaseModel):
    """Structured data for saving evaluation configuration."""

    # Model name and provider default to empty strings when not supplied.
    evaluation_model: str = ""
    evaluation_model_provider: str = ""
    default_metrics: list[DefaultMetric] = Field(default_factory=list)
    # Optional sections; ``None`` means the section was not configured.
    customized_metrics: CustomizedMetrics | None = None
    judgment_config: JudgmentConfig | None = None
|
|
|
|
|
|
class EvaluationRunRequest(EvaluationConfigData):
    """Request body for starting an evaluation run.

    Inherits every configuration field from ``EvaluationConfigData`` and adds
    the required ``file_id``.
    """

    # Required (presumably the id of the uploaded dataset file — confirm with callers).
    file_id: str
|
|
|
|
|
|
class EvaluationRunData(BaseModel):
    """Serializable data for Celery task."""

    evaluation_run_id: str
    tenant_id: str
    target_type: str
    target_id: str
    # Unlike EvaluationConfigData, provider and model are required here.
    evaluation_model_provider: str
    evaluation_model: str
    default_metrics: list[DefaultMetric] = Field(default_factory=list)
    customized_metrics: CustomizedMetrics | None = None
    judgment_config: JudgmentConfig | None = None
    # Parsed dataset rows to evaluate; required.
    input_list: list[EvaluationDatasetInput]
|