From b160dce4db7462014ea19cf5d70534f16dcd6737 Mon Sep 17 00:00:00 2001
From: FFXN <lizy@dify.ai>
Date: Thu, 5 Mar 2026 14:30:39 +0800
Subject: [PATCH] feat: Implement customized evaluation in
 BaseEvaluationInstance.

---
 .../evaluation/base_evaluation_instance.py    | 18 +++++-----
 .../frameworks/ragas/ragas_evaluator.py       | 30 ++++++++--------
 .../runners/agent_evaluation_runner.py        | 11 +++---
 .../runners/base_evaluation_runner.py         | 35 ++++++++++---------
 .../runners/llm_evaluation_runner.py          |  9 ++---
 .../runners/retrieval_evaluation_runner.py    |  4 +--
 .../runners/workflow_evaluation_runner.py     |  9 ++---
 api/tasks/evaluation_task.py                  | 13 +++++--
 8 files changed, 71 insertions(+), 58 deletions(-)

diff --git a/api/core/evaluation/base_evaluation_instance.py b/api/core/evaluation/base_evaluation_instance.py
index 4b714d2017..3a5b853c22 100644
--- a/api/core/evaluation/base_evaluation_instance.py
+++ b/api/core/evaluation/base_evaluation_instance.py
@@ -21,7 +21,7 @@ class BaseEvaluationInstance(ABC):
     def evaluate_llm(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -33,7 +33,7 @@ class BaseEvaluationInstance(ABC):
     def evaluate_retrieval(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -45,7 +45,7 @@ class BaseEvaluationInstance(ABC):
     def evaluate_agent(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -57,7 +57,7 @@ class BaseEvaluationInstance(ABC):
     def evaluate_workflow(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -74,7 +74,7 @@ class BaseEvaluationInstance(ABC):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        customized_metrics: dict[str, Any],
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
         """Evaluate using a published workflow as the evaluator.
@@ -86,8 +86,8 @@ class BaseEvaluationInstance(ABC):
         Args:
             items: Evaluation items with inputs, expected_output, context.
             results: Results from Phase 1 (with actual_output populated).
-            metrics_config: Must contain ``workflow_id`` pointing to a
-                published WORKFLOW-type App.
+            customized_metrics: Must contain ``evaluation_workflow_id``
+                pointing to a published WORKFLOW-type App.
             tenant_id: Tenant scope.
 
         Returns:
@@ -103,10 +103,10 @@ class BaseEvaluationInstance(ABC):
         from models.model import App
         from services.workflow_service import WorkflowService
 
-        workflow_id = metrics_config.get("workflow_id")
+        workflow_id = customized_metrics.get("evaluation_workflow_id")
         if not workflow_id:
             raise ValueError(
-                "metrics_config must contain 'workflow_id' for customized evaluator"
+                "customized_metrics must contain 'evaluation_workflow_id' for customized evaluator"
             )
 
         # Load the evaluator workflow resources using a dedicated session
diff --git a/api/core/evaluation/frameworks/ragas/ragas_evaluator.py b/api/core/evaluation/frameworks/ragas/ragas_evaluator.py
index ceab472cfc..31f08639fd 100644
--- a/api/core/evaluation/frameworks/ragas/ragas_evaluator.py
+++ b/api/core/evaluation/frameworks/ragas/ragas_evaluator.py
@@ -42,51 +42,51 @@ class RagasEvaluator(BaseEvaluationInstance):
     def evaluate_llm(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
-        return self._evaluate(items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.LLM)
+        return self._evaluate(items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.LLM)
 
     def evaluate_retrieval(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
         return self._evaluate(
-            items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.RETRIEVAL
+            items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.RETRIEVAL
         )
 
     def evaluate_agent(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
-        return self._evaluate(items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.AGENT)
+        return self._evaluate(items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.AGENT)
 
     def evaluate_workflow(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
         return self._evaluate(
-            items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.WORKFLOW
+            items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.WORKFLOW
         )
 
     def _evaluate(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -98,7 +98,12 @@ class RagasEvaluator(BaseEvaluationInstance):
         string similarity if RAGAS import fails.
         """
         model_wrapper = DifyModelWrapper(model_provider, model_name, tenant_id)
-        requested_metrics = metrics_config.get("metrics", self.get_supported_metrics(category))
+        # Extract metric names from default_metrics list; each item has a "metric" key.
+        requested_metrics = (
+            [m["metric"] for m in default_metrics if "metric" in m]
+            if default_metrics
+            else self.get_supported_metrics(category)
+        )
 
         try:
             return self._evaluate_with_ragas(items, requested_metrics, model_wrapper, category)
@@ -116,11 +121,6 @@ class RagasEvaluator(BaseEvaluationInstance):
         """Evaluate using RAGAS library."""
         from ragas import evaluate as ragas_evaluate
         from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
-        from ragas.llms import LangchainLLMWrapper
-        from ragas.metrics import (
-            Faithfulness,
-            ResponseRelevancy,
-        )
 
         # Build RAGAS dataset
         samples = []
diff --git a/api/core/evaluation/runners/agent_evaluation_runner.py b/api/core/evaluation/runners/agent_evaluation_runner.py
index 2f0bc210a5..90ed0a2590 100644
--- a/api/core/evaluation/runners/agent_evaluation_runner.py
+++ b/api/core/evaluation/runners/agent_evaluation_runner.py
@@ -1,5 +1,6 @@
 import logging
-from typing import Any, Mapping, Union
+from collections.abc import Mapping
+from typing import Any
 
 from sqlalchemy.orm import Session
 
@@ -9,7 +10,7 @@ from core.evaluation.entities.evaluation_entity import (
     EvaluationItemResult,
 )
 from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
-from models.model import App, AppMode
+from models.model import App
 
 logger = logging.getLogger(__name__)
 
@@ -29,8 +30,8 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
     ) -> EvaluationItemResult:
         """Execute agent app and collect response with tool call information."""
         from core.app.apps.agent_chat.app_generator import AgentChatAppGenerator
-        from core.evaluation.runners import get_service_account_for_app
         from core.app.entities.app_invoke_entities import InvokeFrom
+        from core.evaluation.runners import get_service_account_for_app
 
         app = self.session.query(App).filter_by(id=target_id).first()
         if not app:
@@ -67,7 +68,7 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -90,7 +91,7 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
             )
 
         evaluated = self.evaluation_instance.evaluate_agent(
-            merged_items, metrics_config, model_provider, model_name, tenant_id
+            merged_items, default_metrics, model_provider, model_name, tenant_id
         )
 
         # Merge metrics back preserving metadata
diff --git a/api/core/evaluation/runners/base_evaluation_runner.py b/api/core/evaluation/runners/base_evaluation_runner.py
index 24fbea5ee6..c88de0169e 100644
--- a/api/core/evaluation/runners/base_evaluation_runner.py
+++ b/api/core/evaluation/runners/base_evaluation_runner.py
@@ -51,7 +51,7 @@ class BaseEvaluationRunner(ABC):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -66,9 +66,10 @@ class BaseEvaluationRunner(ABC):
         target_id: str,
         target_type: str,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
-        model_provider: str,
-        model_name: str,
+        default_metrics: list[dict[str, Any]],
+        customized_metrics: dict[str, Any] | None = None,
+        model_provider: str = "",
+        model_name: str = "",
         judgment_config: JudgmentConfig | None = None,
     ) -> list[EvaluationItemResult]:
         """Orchestrate target execution + metric evaluation + judgment for all items."""
@@ -106,13 +107,15 @@ class BaseEvaluationRunner(ABC):
 
         if successful_items and successful_results:
             try:
-                if _is_customized_evaluation(metrics_config):
+                if customized_metrics is not None:
+                    # Customized workflow evaluation — target-type agnostic
                     evaluated_results = self._evaluate_customized(
-                        successful_items, successful_results, metrics_config, tenant_id,
+                        successful_items, successful_results, customized_metrics, tenant_id,
                     )
                 else:
+                    # Framework-specific evaluation — delegate to subclass
                     evaluated_results = self.evaluate_metrics(
-                        successful_items, successful_results, metrics_config,
+                        successful_items, successful_results, default_metrics,
                         model_provider, model_name, tenant_id,
                     )
                 # Merge evaluated metrics back into results
@@ -153,12 +156,18 @@ class BaseEvaluationRunner(ABC):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        customized_metrics: dict[str, Any],
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
-        """Delegate to the instance's customized workflow evaluator."""
+        """Delegate to the instance's customized workflow evaluator.
+
+        Unlike the framework path (which merges ``actual_output`` into
+        ``context``), here we pass ``results`` directly — the instance's
+        ``evaluate_with_customized_workflow()`` reads ``actual_output``
+        from each ``EvaluationItemResult``.
+        """
         evaluated = self.evaluation_instance.evaluate_with_customized_workflow(
-            items, results, metrics_config, tenant_id,
+            items, results, customized_metrics, tenant_id,
         )
 
         # Merge metrics back preserving actual_output and metadata from Phase 1
@@ -180,7 +189,6 @@ class BaseEvaluationRunner(ABC):
                 final_results.append(result)
         return final_results
 
-
     @staticmethod
     def _apply_judgment(
         results: list[EvaluationItemResult],
@@ -225,8 +233,3 @@ class BaseEvaluationRunner(ABC):
                 result.model_copy(update={"judgment": judgment_result})
             )
         return judged_results
-
-
-def _is_customized_evaluation(metrics_config: dict[str, Any]) -> bool:
-    """Check if metrics_config indicates a customized workflow evaluation."""
-    return bool(metrics_config.get("workflow_id"))
diff --git a/api/core/evaluation/runners/llm_evaluation_runner.py b/api/core/evaluation/runners/llm_evaluation_runner.py
index 896c14acf0..7c92322557 100644
--- a/api/core/evaluation/runners/llm_evaluation_runner.py
+++ b/api/core/evaluation/runners/llm_evaluation_runner.py
@@ -1,5 +1,6 @@
 import logging
-from typing import Any, Mapping, Union
+from collections.abc import Mapping
+from typing import Any, Union
 
 from sqlalchemy.orm import Session
 
@@ -30,8 +31,8 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
         """Execute the App/Snippet with the given inputs and collect the response."""
         from core.app.apps.completion.app_generator import CompletionAppGenerator
         from core.app.apps.workflow.app_generator import WorkflowAppGenerator
-        from core.evaluation.runners import get_service_account_for_app
         from core.app.entities.app_invoke_entities import InvokeFrom
+        from core.evaluation.runners import get_service_account_for_app
         from services.workflow_service import WorkflowService
 
         app = self.session.query(App).filter_by(id=target_id).first()
@@ -89,7 +90,7 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -98,7 +99,7 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
         # Merge actual_output into items for evaluation
         merged_items = self._merge_results_into_items(items, results)
         return self.evaluation_instance.evaluate_llm(
-            merged_items, metrics_config, model_provider, model_name, tenant_id
+            merged_items, default_metrics, model_provider, model_name, tenant_id
         )
 
     @staticmethod
diff --git a/api/core/evaluation/runners/retrieval_evaluation_runner.py b/api/core/evaluation/runners/retrieval_evaluation_runner.py
index 285e2c33b6..3949dc4ed9 100644
--- a/api/core/evaluation/runners/retrieval_evaluation_runner.py
+++ b/api/core/evaluation/runners/retrieval_evaluation_runner.py
@@ -58,7 +58,7 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -80,7 +80,7 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
             )
 
         evaluated = self.evaluation_instance.evaluate_retrieval(
-            merged_items, metrics_config, model_provider, model_name, tenant_id
+            merged_items, default_metrics, model_provider, model_name, tenant_id
         )
 
         # Merge metrics back into original results (preserve actual_output and metadata)
diff --git a/api/core/evaluation/runners/workflow_evaluation_runner.py b/api/core/evaluation/runners/workflow_evaluation_runner.py
index 1f580eda2b..9508251f56 100644
--- a/api/core/evaluation/runners/workflow_evaluation_runner.py
+++ b/api/core/evaluation/runners/workflow_evaluation_runner.py
@@ -1,5 +1,6 @@
 import logging
-from typing import Any, Mapping
+from collections.abc import Mapping
+from typing import Any
 
 from sqlalchemy.orm import Session
 
@@ -29,8 +30,8 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
     ) -> EvaluationItemResult:
         """Execute workflow and collect outputs."""
         from core.app.apps.workflow.app_generator import WorkflowAppGenerator
-        from core.evaluation.runners import get_service_account_for_app
         from core.app.entities.app_invoke_entities import InvokeFrom
+        from core.evaluation.runners import get_service_account_for_app
         from services.workflow_service import WorkflowService
 
         app = self.session.query(App).filter_by(id=target_id).first()
@@ -68,7 +69,7 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -91,7 +92,7 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
             )
 
         evaluated = self.evaluation_instance.evaluate_workflow(
-            merged_items, metrics_config, model_provider, model_name, tenant_id
+            merged_items, default_metrics, model_provider, model_name, tenant_id
         )
 
         # Merge metrics back preserving metadata
diff --git a/api/tasks/evaluation_task.py b/api/tasks/evaluation_task.py
index b74c4fae46..b61a7f6399 100644
--- a/api/tasks/evaluation_task.py
+++ b/api/tasks/evaluation_task.py
@@ -1,8 +1,6 @@
 import io
 import json
 import logging
-from configs import dify_config
-from models.model import UploadFile
 from typing import Any
 
 from celery import shared_task
@@ -10,6 +8,7 @@ from openpyxl import Workbook
 from openpyxl.styles import Alignment, Border, Font, PatternFill, Side
 from openpyxl.utils import get_column_letter
 
+from configs import dify_config
 from core.evaluation.entities.evaluation_entity import (
     EvaluationCategory,
     EvaluationItemResult,
@@ -23,6 +22,7 @@ from core.evaluation.runners.workflow_evaluation_runner import WorkflowEvaluatio
 from extensions.ext_database import db
 from libs.datetime_utils import naive_utc_now
 from models.evaluation import EvaluationRun, EvaluationRunStatus
+from models.model import UploadFile
 
 logger = logging.getLogger(__name__)
 
@@ -86,6 +86,7 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
         customized_metrics=run_data.customized_metrics,
         model_provider=run_data.evaluation_model_provider,
         model_name=run_data.evaluation_model,
+        judgment_config=run_data.judgment_config,
     )
 
     # Compute summary metrics
@@ -210,7 +211,13 @@ def _generate_result_xlsx(
                 input_keys.append(key)
 
     # Build headers
-    headers = ["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + ["overall_score", "error"]
+    headers = (
+        ["index"]
+        + input_keys
+        + ["expected_output", "actual_output"]
+        + all_metric_names
+        + ["overall_score", "error"]
+    )
 
     # Write header row
     for col_idx, header in enumerate(headers, start=1):