From b160dce4db7462014ea19cf5d70534f16dcd6737 Mon Sep 17 00:00:00 2001 From: FFXN Date: Thu, 5 Mar 2026 14:30:39 +0800 Subject: [PATCH] feat: Implement customized evaluation in BaseEvaluationInstance. --- .../evaluation/base_evaluation_instance.py | 18 +++++----- .../frameworks/ragas/ragas_evaluator.py | 30 ++++++++-------- .../runners/agent_evaluation_runner.py | 11 +++--- .../runners/base_evaluation_runner.py | 35 ++++++++++--------- .../runners/llm_evaluation_runner.py | 9 ++--- .../runners/retrieval_evaluation_runner.py | 4 +-- .../runners/workflow_evaluation_runner.py | 9 ++--- api/tasks/evaluation_task.py | 13 +++++-- 8 files changed, 71 insertions(+), 58 deletions(-) diff --git a/api/core/evaluation/base_evaluation_instance.py b/api/core/evaluation/base_evaluation_instance.py index 4b714d2017..3a5b853c22 100644 --- a/api/core/evaluation/base_evaluation_instance.py +++ b/api/core/evaluation/base_evaluation_instance.py @@ -21,7 +21,7 @@ class BaseEvaluationInstance(ABC): def evaluate_llm( self, items: list[EvaluationItemInput], - metrics_config: dict, + default_metrics: list[dict[str, Any]], model_provider: str, model_name: str, tenant_id: str, @@ -33,7 +33,7 @@ class BaseEvaluationInstance(ABC): def evaluate_retrieval( self, items: list[EvaluationItemInput], - metrics_config: dict, + default_metrics: list[dict[str, Any]], model_provider: str, model_name: str, tenant_id: str, @@ -45,7 +45,7 @@ class BaseEvaluationInstance(ABC): def evaluate_agent( self, items: list[EvaluationItemInput], - metrics_config: dict, + default_metrics: list[dict[str, Any]], model_provider: str, model_name: str, tenant_id: str, @@ -57,7 +57,7 @@ class BaseEvaluationInstance(ABC): def evaluate_workflow( self, items: list[EvaluationItemInput], - metrics_config: dict, + default_metrics: list[dict[str, Any]], model_provider: str, model_name: str, tenant_id: str, @@ -74,7 +74,7 @@ class BaseEvaluationInstance(ABC): self, items: list[EvaluationItemInput], results: list[EvaluationItemResult], - metrics_config: dict, + customized_metrics: dict[str, Any], tenant_id: str, ) -> list[EvaluationItemResult]: """Evaluate using a published workflow as the evaluator. @@ -86,8 +86,8 @@ class BaseEvaluationInstance(ABC): Args: items: Evaluation items with inputs, expected_output, context. results: Results from Phase 1 (with actual_output populated). - metrics_config: Must contain ``workflow_id`` pointing to a - published WORKFLOW-type App. + customized_metrics: Must contain ``evaluation_workflow_id`` + pointing to a published WORKFLOW-type App. tenant_id: Tenant scope. Returns: @@ -103,10 +103,10 @@ class BaseEvaluationInstance(ABC): from models.model import App from services.workflow_service import WorkflowService - workflow_id = metrics_config.get("workflow_id") + workflow_id = customized_metrics.get("evaluation_workflow_id") if not workflow_id: raise ValueError( - "metrics_config must contain 'workflow_id' for customized evaluator" + "customized_metrics must contain 'evaluation_workflow_id' for customized evaluator" ) # Load the evaluator workflow resources using a dedicated session diff --git a/api/core/evaluation/frameworks/ragas/ragas_evaluator.py b/api/core/evaluation/frameworks/ragas/ragas_evaluator.py index ceab472cfc..31f08639fd 100644 --- a/api/core/evaluation/frameworks/ragas/ragas_evaluator.py +++ b/api/core/evaluation/frameworks/ragas/ragas_evaluator.py @@ -42,51 +42,51 @@ class RagasEvaluator(BaseEvaluationInstance): def evaluate_llm( self, items: list[EvaluationItemInput], - metrics_config: dict, + default_metrics: list[dict[str, Any]], model_provider: str, model_name: str, tenant_id: str, ) -> list[EvaluationItemResult]: - return self._evaluate(items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.LLM) + return self._evaluate(items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.LLM) def evaluate_retrieval( self, items: list[EvaluationItemInput], - metrics_config: dict, + default_metrics: list[dict[str, Any]], model_provider: str, model_name: str, tenant_id: str, ) -> list[EvaluationItemResult]: return self._evaluate( - items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.RETRIEVAL + items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.RETRIEVAL ) def evaluate_agent( self, items: list[EvaluationItemInput], - metrics_config: dict, + default_metrics: list[dict[str, Any]], model_provider: str, model_name: str, tenant_id: str, ) -> list[EvaluationItemResult]: - return self._evaluate(items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.AGENT) + return self._evaluate(items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.AGENT) def evaluate_workflow( self, items: list[EvaluationItemInput], - metrics_config: dict, + default_metrics: list[dict[str, Any]], model_provider: str, model_name: str, tenant_id: str, ) -> list[EvaluationItemResult]: return self._evaluate( - items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.WORKFLOW + items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.WORKFLOW ) def _evaluate( self, items: list[EvaluationItemInput], - metrics_config: dict, + default_metrics: list[dict[str, Any]], model_provider: str, model_name: str, tenant_id: str, @@ -98,7 +98,12 @@ class RagasEvaluator(BaseEvaluationInstance): string similarity if RAGAS import fails. """ model_wrapper = DifyModelWrapper(model_provider, model_name, tenant_id) - requested_metrics = metrics_config.get("metrics", self.get_supported_metrics(category)) + # Extract metric names from default_metrics list; each item has a "metric" key. + requested_metrics = ( + [m["metric"] for m in default_metrics if "metric" in m] + if default_metrics + else self.get_supported_metrics(category) + ) try: return self._evaluate_with_ragas(items, requested_metrics, model_wrapper, category) @@ -116,11 +121,6 @@ class RagasEvaluator(BaseEvaluationInstance): """Evaluate using RAGAS library.""" from ragas import evaluate as ragas_evaluate from ragas.dataset_schema import EvaluationDataset, SingleTurnSample - from ragas.llms import LangchainLLMWrapper - from ragas.metrics import ( - Faithfulness, - ResponseRelevancy, - ) # Build RAGAS dataset samples = [] diff --git a/api/core/evaluation/runners/agent_evaluation_runner.py b/api/core/evaluation/runners/agent_evaluation_runner.py index 2f0bc210a5..90ed0a2590 100644 --- a/api/core/evaluation/runners/agent_evaluation_runner.py +++ b/api/core/evaluation/runners/agent_evaluation_runner.py @@ -1,5 +1,6 @@ import logging -from typing import Any, Mapping, Union +from collections.abc import Mapping +from typing import Any from sqlalchemy.orm import Session @@ -9,7 +10,7 @@ from core.evaluation.entities.evaluation_entity import ( EvaluationItemResult, ) from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner -from models.model import App, AppMode +from models.model import App logger = logging.getLogger(__name__) @@ -29,8 +30,8 @@ class AgentEvaluationRunner(BaseEvaluationRunner): ) -> EvaluationItemResult: """Execute agent app and collect response with tool call information.""" from core.app.apps.agent_chat.app_generator import AgentChatAppGenerator - from core.evaluation.runners import get_service_account_for_app from core.app.entities.app_invoke_entities import InvokeFrom + from core.evaluation.runners import get_service_account_for_app app = self.session.query(App).filter_by(id=target_id).first() if not app: @@ -67,7 +68,7 @@ class AgentEvaluationRunner(BaseEvaluationRunner): self, items: list[EvaluationItemInput], results: list[EvaluationItemResult], - metrics_config: dict, + default_metrics: list[dict[str, Any]], model_provider: str, model_name: str, tenant_id: str, @@ -90,7 +91,7 @@ class AgentEvaluationRunner(BaseEvaluationRunner): ) evaluated = self.evaluation_instance.evaluate_agent( - merged_items, metrics_config, model_provider, model_name, tenant_id + merged_items, default_metrics, model_provider, model_name, tenant_id ) # Merge metrics back preserving metadata diff --git a/api/core/evaluation/runners/base_evaluation_runner.py b/api/core/evaluation/runners/base_evaluation_runner.py index 24fbea5ee6..c88de0169e 100644 --- a/api/core/evaluation/runners/base_evaluation_runner.py +++ b/api/core/evaluation/runners/base_evaluation_runner.py @@ -51,7 +51,7 @@ class BaseEvaluationRunner(ABC): self, items: list[EvaluationItemInput], results: list[EvaluationItemResult], - metrics_config: dict, + default_metrics: list[dict[str, Any]], model_provider: str, model_name: str, tenant_id: str, @@ -66,9 +66,10 @@ class BaseEvaluationRunner(ABC): target_id: str, target_type: str, items: list[EvaluationItemInput], - metrics_config: dict, - model_provider: str, - model_name: str, + default_metrics: list[dict[str, Any]], + customized_metrics: dict[str, Any] | None = None, + model_provider: str = "", + model_name: str = "", judgment_config: JudgmentConfig | None = None, ) -> list[EvaluationItemResult]: """Orchestrate target execution + metric evaluation + judgment for all items.""" @@ -106,13 +107,15 @@ class BaseEvaluationRunner(ABC): if successful_items and successful_results: try: - if _is_customized_evaluation(metrics_config): + if customized_metrics is not None: + # Customized workflow evaluation — target-type agnostic evaluated_results = self._evaluate_customized( - successful_items, successful_results, metrics_config, tenant_id, + successful_items, successful_results, customized_metrics, tenant_id, ) else: + # Framework-specific evaluation — delegate to subclass evaluated_results = self.evaluate_metrics( - successful_items, successful_results, metrics_config, + successful_items, successful_results, default_metrics, model_provider, model_name, tenant_id, ) # Merge evaluated metrics back into results @@ -153,12 +156,18 @@ class BaseEvaluationRunner(ABC): self, items: list[EvaluationItemInput], results: list[EvaluationItemResult], - metrics_config: dict, + customized_metrics: dict[str, Any], tenant_id: str, ) -> list[EvaluationItemResult]: - """Delegate to the instance's customized workflow evaluator.""" + """Delegate to the instance's customized workflow evaluator. + + Unlike the framework path (which merges ``actual_output`` into + ``context``), here we pass ``results`` directly — the instance's + ``evaluate_with_customized_workflow()`` reads ``actual_output`` + from each ``EvaluationItemResult``. + """ evaluated = self.evaluation_instance.evaluate_with_customized_workflow( - items, results, metrics_config, tenant_id, + items, results, customized_metrics, tenant_id, ) # Merge metrics back preserving actual_output and metadata from Phase 1 @@ -180,7 +189,6 @@ class BaseEvaluationRunner(ABC): final_results.append(result) return final_results - @staticmethod def _apply_judgment( results: list[EvaluationItemResult], @@ -225,8 +233,3 @@ class BaseEvaluationRunner(ABC): result.model_copy(update={"judgment": judgment_result}) ) return judged_results - - -def _is_customized_evaluation(metrics_config: dict[str, Any]) -> bool: - """Check if metrics_config indicates a customized workflow evaluation.""" - return bool(metrics_config.get("workflow_id")) diff --git a/api/core/evaluation/runners/llm_evaluation_runner.py b/api/core/evaluation/runners/llm_evaluation_runner.py index 896c14acf0..7c92322557 100644 --- a/api/core/evaluation/runners/llm_evaluation_runner.py +++ b/api/core/evaluation/runners/llm_evaluation_runner.py @@ -1,5 +1,6 @@ import logging -from typing import Any, Mapping, Union +from collections.abc import Mapping +from typing import Any, Union from sqlalchemy.orm import Session @@ -30,8 +31,8 @@ class LLMEvaluationRunner(BaseEvaluationRunner): """Execute the App/Snippet with the given inputs and collect the response.""" from core.app.apps.completion.app_generator import CompletionAppGenerator from core.app.apps.workflow.app_generator import WorkflowAppGenerator - from core.evaluation.runners import get_service_account_for_app from core.app.entities.app_invoke_entities import InvokeFrom + from core.evaluation.runners import get_service_account_for_app from services.workflow_service import WorkflowService app = self.session.query(App).filter_by(id=target_id).first() @@ -89,7 +90,7 @@ class LLMEvaluationRunner(BaseEvaluationRunner): self, items: list[EvaluationItemInput], results: list[EvaluationItemResult], - metrics_config: dict, + default_metrics: list[dict[str, Any]], model_provider: str, model_name: str, tenant_id: str, @@ -98,7 +99,7 @@ class LLMEvaluationRunner(BaseEvaluationRunner): # Merge actual_output into items for evaluation merged_items = self._merge_results_into_items(items, results) return self.evaluation_instance.evaluate_llm( - merged_items, metrics_config, model_provider, model_name, tenant_id + merged_items, default_metrics, model_provider, model_name, tenant_id ) @staticmethod diff --git a/api/core/evaluation/runners/retrieval_evaluation_runner.py b/api/core/evaluation/runners/retrieval_evaluation_runner.py index 285e2c33b6..3949dc4ed9 100644 --- a/api/core/evaluation/runners/retrieval_evaluation_runner.py +++ b/api/core/evaluation/runners/retrieval_evaluation_runner.py @@ -58,7 +58,7 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner): self, items: list[EvaluationItemInput], results: list[EvaluationItemResult], - metrics_config: dict, + default_metrics: list[dict[str, Any]], model_provider: str, model_name: str, tenant_id: str, @@ -80,7 +80,7 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner): ) evaluated = self.evaluation_instance.evaluate_retrieval( - merged_items, metrics_config, model_provider, model_name, tenant_id + merged_items, default_metrics, model_provider, model_name, tenant_id ) # Merge metrics back into original results (preserve actual_output and metadata) diff --git a/api/core/evaluation/runners/workflow_evaluation_runner.py b/api/core/evaluation/runners/workflow_evaluation_runner.py index 1f580eda2b..9508251f56 100644 --- a/api/core/evaluation/runners/workflow_evaluation_runner.py +++ b/api/core/evaluation/runners/workflow_evaluation_runner.py @@ -1,5 +1,6 @@ import logging -from typing import Any, Mapping +from collections.abc import Mapping +from typing import Any from sqlalchemy.orm import Session @@ -29,8 +30,8 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner): ) -> EvaluationItemResult: """Execute workflow and collect outputs.""" from core.app.apps.workflow.app_generator import WorkflowAppGenerator - from core.evaluation.runners import get_service_account_for_app from core.app.entities.app_invoke_entities import InvokeFrom + from core.evaluation.runners import get_service_account_for_app from services.workflow_service import WorkflowService app = self.session.query(App).filter_by(id=target_id).first() @@ -68,7 +69,7 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner): self, items: list[EvaluationItemInput], results: list[EvaluationItemResult], - metrics_config: dict, + default_metrics: list[dict[str, Any]], model_provider: str, model_name: str, tenant_id: str, @@ -91,7 +92,7 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner): ) evaluated = self.evaluation_instance.evaluate_workflow( - merged_items, metrics_config, model_provider, model_name, tenant_id + merged_items, default_metrics, model_provider, model_name, tenant_id ) # Merge metrics back preserving metadata diff --git a/api/tasks/evaluation_task.py b/api/tasks/evaluation_task.py index b74c4fae46..b61a7f6399 100644 --- a/api/tasks/evaluation_task.py +++ b/api/tasks/evaluation_task.py @@ -1,8 +1,6 @@ import io import json import logging -from configs import dify_config -from models.model import UploadFile from typing import Any from celery import shared_task @@ -10,6 +8,7 @@ from openpyxl import Workbook from openpyxl.styles import Alignment, Border, Font, PatternFill, Side from openpyxl.utils import get_column_letter +from configs import dify_config from core.evaluation.entities.evaluation_entity import ( EvaluationCategory, EvaluationItemResult, @@ -23,6 +22,7 @@ from core.evaluation.runners.workflow_evaluation_runner import WorkflowEvaluatio from extensions.ext_database import db from libs.datetime_utils import naive_utc_now from models.evaluation import EvaluationRun, EvaluationRunStatus +from models.model import UploadFile logger = logging.getLogger(__name__) @@ -86,6 +86,7 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None: customized_metrics=run_data.customized_metrics, model_provider=run_data.evaluation_model_provider, model_name=run_data.evaluation_model, + judgment_config=run_data.judgment_config, ) # Compute summary metrics @@ -210,7 +211,13 @@ def _generate_result_xlsx( input_keys.append(key) # Build headers - headers = ["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + ["overall_score", "error"] + headers = ( + ["index"] + + input_keys + + ["expected_output", "actual_output"] + + all_metric_names + + ["overall_score", "error"] + ) # Write header row for col_idx, header in enumerate(headers, start=1):