feat: Implement customized evaluation in BaseEvaluationInstance.

This commit is contained in:
FFXN
2026-03-05 14:30:39 +08:00
parent 7149af3dac
commit b160dce4db
8 changed files with 71 additions and 58 deletions

View File

@@ -21,7 +21,7 @@ class BaseEvaluationInstance(ABC):
def evaluate_llm(
self,
items: list[EvaluationItemInput],
metrics_config: dict,
default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
@@ -33,7 +33,7 @@ class BaseEvaluationInstance(ABC):
def evaluate_retrieval(
self,
items: list[EvaluationItemInput],
metrics_config: dict,
default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
@@ -45,7 +45,7 @@ class BaseEvaluationInstance(ABC):
def evaluate_agent(
self,
items: list[EvaluationItemInput],
metrics_config: dict,
default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
@@ -57,7 +57,7 @@ class BaseEvaluationInstance(ABC):
def evaluate_workflow(
self,
items: list[EvaluationItemInput],
metrics_config: dict,
default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
@@ -74,7 +74,7 @@ class BaseEvaluationInstance(ABC):
self,
items: list[EvaluationItemInput],
results: list[EvaluationItemResult],
metrics_config: dict,
customized_metrics: dict[str, Any],
tenant_id: str,
) -> list[EvaluationItemResult]:
"""Evaluate using a published workflow as the evaluator.
@@ -86,8 +86,8 @@ class BaseEvaluationInstance(ABC):
Args:
items: Evaluation items with inputs, expected_output, context.
results: Results from Phase 1 (with actual_output populated).
metrics_config: Must contain ``workflow_id`` pointing to a
published WORKFLOW-type App.
customized_metrics: Must contain ``evaluation_workflow_id``
pointing to a published WORKFLOW-type App.
tenant_id: Tenant scope.
Returns:
@@ -103,10 +103,10 @@ class BaseEvaluationInstance(ABC):
from models.model import App
from services.workflow_service import WorkflowService
workflow_id = metrics_config.get("workflow_id")
workflow_id = customized_metrics.get("evaluation_workflow_id")
if not workflow_id:
raise ValueError(
"metrics_config must contain 'workflow_id' for customized evaluator"
"customized_metrics must contain 'evaluation_workflow_id' for customized evaluator"
)
# Load the evaluator workflow resources using a dedicated session

View File

@@ -42,51 +42,51 @@ class RagasEvaluator(BaseEvaluationInstance):
def evaluate_llm(
self,
items: list[EvaluationItemInput],
metrics_config: dict,
default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
) -> list[EvaluationItemResult]:
return self._evaluate(items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.LLM)
return self._evaluate(items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.LLM)
def evaluate_retrieval(
self,
items: list[EvaluationItemInput],
metrics_config: dict,
default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
) -> list[EvaluationItemResult]:
return self._evaluate(
items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.RETRIEVAL
items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.RETRIEVAL
)
def evaluate_agent(
self,
items: list[EvaluationItemInput],
metrics_config: dict,
default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
) -> list[EvaluationItemResult]:
return self._evaluate(items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.AGENT)
return self._evaluate(items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.AGENT)
def evaluate_workflow(
self,
items: list[EvaluationItemInput],
metrics_config: dict,
default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
) -> list[EvaluationItemResult]:
return self._evaluate(
items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.WORKFLOW
items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.WORKFLOW
)
def _evaluate(
self,
items: list[EvaluationItemInput],
metrics_config: dict,
default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
@@ -98,7 +98,12 @@ class RagasEvaluator(BaseEvaluationInstance):
string similarity if RAGAS import fails.
"""
model_wrapper = DifyModelWrapper(model_provider, model_name, tenant_id)
requested_metrics = metrics_config.get("metrics", self.get_supported_metrics(category))
# Extract metric names from default_metrics list; each item has a "metric" key.
requested_metrics = (
[m["metric"] for m in default_metrics if "metric" in m]
if default_metrics
else self.get_supported_metrics(category)
)
try:
return self._evaluate_with_ragas(items, requested_metrics, model_wrapper, category)
@@ -116,11 +121,6 @@ class RagasEvaluator(BaseEvaluationInstance):
"""Evaluate using RAGAS library."""
from ragas import evaluate as ragas_evaluate
from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
Faithfulness,
ResponseRelevancy,
)
# Build RAGAS dataset
samples = []

View File

@@ -1,5 +1,6 @@
import logging
from typing import Any, Mapping, Union
from collections.abc import Mapping
from typing import Any
from sqlalchemy.orm import Session
@@ -9,7 +10,7 @@ from core.evaluation.entities.evaluation_entity import (
EvaluationItemResult,
)
from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
from models.model import App, AppMode
from models.model import App
logger = logging.getLogger(__name__)
@@ -29,8 +30,8 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
) -> EvaluationItemResult:
"""Execute agent app and collect response with tool call information."""
from core.app.apps.agent_chat.app_generator import AgentChatAppGenerator
from core.evaluation.runners import get_service_account_for_app
from core.app.entities.app_invoke_entities import InvokeFrom
from core.evaluation.runners import get_service_account_for_app
app = self.session.query(App).filter_by(id=target_id).first()
if not app:
@@ -67,7 +68,7 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
self,
items: list[EvaluationItemInput],
results: list[EvaluationItemResult],
metrics_config: dict,
default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
@@ -90,7 +91,7 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
)
evaluated = self.evaluation_instance.evaluate_agent(
merged_items, metrics_config, model_provider, model_name, tenant_id
merged_items, default_metrics, model_provider, model_name, tenant_id
)
# Merge metrics back preserving metadata

View File

@@ -51,7 +51,7 @@ class BaseEvaluationRunner(ABC):
self,
items: list[EvaluationItemInput],
results: list[EvaluationItemResult],
metrics_config: dict,
default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
@@ -66,9 +66,10 @@ class BaseEvaluationRunner(ABC):
target_id: str,
target_type: str,
items: list[EvaluationItemInput],
metrics_config: dict,
model_provider: str,
model_name: str,
default_metrics: list[dict[str, Any]],
customized_metrics: dict[str, Any] | None = None,
model_provider: str = "",
model_name: str = "",
judgment_config: JudgmentConfig | None = None,
) -> list[EvaluationItemResult]:
"""Orchestrate target execution + metric evaluation + judgment for all items."""
@@ -106,13 +107,15 @@ class BaseEvaluationRunner(ABC):
if successful_items and successful_results:
try:
if _is_customized_evaluation(metrics_config):
if customized_metrics is not None:
# Customized workflow evaluation — target-type agnostic
evaluated_results = self._evaluate_customized(
successful_items, successful_results, metrics_config, tenant_id,
successful_items, successful_results, customized_metrics, tenant_id,
)
else:
# Framework-specific evaluation — delegate to subclass
evaluated_results = self.evaluate_metrics(
successful_items, successful_results, metrics_config,
successful_items, successful_results, default_metrics,
model_provider, model_name, tenant_id,
)
# Merge evaluated metrics back into results
@@ -153,12 +156,18 @@ class BaseEvaluationRunner(ABC):
self,
items: list[EvaluationItemInput],
results: list[EvaluationItemResult],
metrics_config: dict,
customized_metrics: dict[str, Any],
tenant_id: str,
) -> list[EvaluationItemResult]:
"""Delegate to the instance's customized workflow evaluator."""
"""Delegate to the instance's customized workflow evaluator.
Unlike the framework path (which merges ``actual_output`` into
``context``), here we pass ``results`` directly — the instance's
``evaluate_with_customized_workflow()`` reads ``actual_output``
from each ``EvaluationItemResult``.
"""
evaluated = self.evaluation_instance.evaluate_with_customized_workflow(
items, results, metrics_config, tenant_id,
items, results, customized_metrics, tenant_id,
)
# Merge metrics back preserving actual_output and metadata from Phase 1
@@ -180,7 +189,6 @@ class BaseEvaluationRunner(ABC):
final_results.append(result)
return final_results
@staticmethod
def _apply_judgment(
results: list[EvaluationItemResult],
@@ -225,8 +233,3 @@ class BaseEvaluationRunner(ABC):
result.model_copy(update={"judgment": judgment_result})
)
return judged_results
def _is_customized_evaluation(metrics_config: dict[str, Any]) -> bool:
"""Check if metrics_config indicates a customized workflow evaluation."""
return bool(metrics_config.get("workflow_id"))

View File

@@ -1,5 +1,6 @@
import logging
from typing import Any, Mapping, Union
from collections.abc import Mapping
from typing import Any, Union
from sqlalchemy.orm import Session
@@ -30,8 +31,8 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
"""Execute the App/Snippet with the given inputs and collect the response."""
from core.app.apps.completion.app_generator import CompletionAppGenerator
from core.app.apps.workflow.app_generator import WorkflowAppGenerator
from core.evaluation.runners import get_service_account_for_app
from core.app.entities.app_invoke_entities import InvokeFrom
from core.evaluation.runners import get_service_account_for_app
from services.workflow_service import WorkflowService
app = self.session.query(App).filter_by(id=target_id).first()
@@ -89,7 +90,7 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
self,
items: list[EvaluationItemInput],
results: list[EvaluationItemResult],
metrics_config: dict,
default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
@@ -98,7 +99,7 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
# Merge actual_output into items for evaluation
merged_items = self._merge_results_into_items(items, results)
return self.evaluation_instance.evaluate_llm(
merged_items, metrics_config, model_provider, model_name, tenant_id
merged_items, default_metrics, model_provider, model_name, tenant_id
)
@staticmethod

View File

@@ -58,7 +58,7 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
self,
items: list[EvaluationItemInput],
results: list[EvaluationItemResult],
metrics_config: dict,
default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
@@ -80,7 +80,7 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
)
evaluated = self.evaluation_instance.evaluate_retrieval(
merged_items, metrics_config, model_provider, model_name, tenant_id
merged_items, default_metrics, model_provider, model_name, tenant_id
)
# Merge metrics back into original results (preserve actual_output and metadata)

View File

@@ -1,5 +1,6 @@
import logging
from typing import Any, Mapping
from collections.abc import Mapping
from typing import Any
from sqlalchemy.orm import Session
@@ -29,8 +30,8 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
) -> EvaluationItemResult:
"""Execute workflow and collect outputs."""
from core.app.apps.workflow.app_generator import WorkflowAppGenerator
from core.evaluation.runners import get_service_account_for_app
from core.app.entities.app_invoke_entities import InvokeFrom
from core.evaluation.runners import get_service_account_for_app
from services.workflow_service import WorkflowService
app = self.session.query(App).filter_by(id=target_id).first()
@@ -68,7 +69,7 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
self,
items: list[EvaluationItemInput],
results: list[EvaluationItemResult],
metrics_config: dict,
default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
@@ -91,7 +92,7 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
)
evaluated = self.evaluation_instance.evaluate_workflow(
merged_items, metrics_config, model_provider, model_name, tenant_id
merged_items, default_metrics, model_provider, model_name, tenant_id
)
# Merge metrics back preserving metadata

View File

@@ -1,8 +1,6 @@
import io
import json
import logging
from configs import dify_config
from models.model import UploadFile
from typing import Any
from celery import shared_task
@@ -10,6 +8,7 @@ from openpyxl import Workbook
from openpyxl.styles import Alignment, Border, Font, PatternFill, Side
from openpyxl.utils import get_column_letter
from configs import dify_config
from core.evaluation.entities.evaluation_entity import (
EvaluationCategory,
EvaluationItemResult,
@@ -23,6 +22,7 @@ from core.evaluation.runners.workflow_evaluation_runner import WorkflowEvaluatio
from extensions.ext_database import db
from libs.datetime_utils import naive_utc_now
from models.evaluation import EvaluationRun, EvaluationRunStatus
from models.model import UploadFile
logger = logging.getLogger(__name__)
@@ -86,6 +86,7 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
customized_metrics=run_data.customized_metrics,
model_provider=run_data.evaluation_model_provider,
model_name=run_data.evaluation_model,
judgment_config=run_data.judgment_config,
)
# Compute summary metrics
@@ -210,7 +211,13 @@ def _generate_result_xlsx(
input_keys.append(key)
# Build headers
headers = ["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + ["overall_score", "error"]
headers = (
["index"]
+ input_keys
+ ["expected_output", "actual_output"]
+ all_metric_names
+ ["overall_score", "error"]
)
# Write header row
for col_idx, header in enumerate(headers, start=1):