feat: Implement customized evaluation in BaseEvaluationInstance.

This commit is contained in:
FFXN
2026-03-05 14:30:39 +08:00
parent 7149af3dac
commit b160dce4db
8 changed files with 71 additions and 58 deletions

View File

@@ -21,7 +21,7 @@ class BaseEvaluationInstance(ABC):
def evaluate_llm(
self,
items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
@@ -33,7 +33,7 @@ class BaseEvaluationInstance(ABC):
def evaluate_retrieval(
self,
items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
@@ -45,7 +45,7 @@ class BaseEvaluationInstance(ABC):
def evaluate_agent(
self,
items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
@@ -57,7 +57,7 @@ class BaseEvaluationInstance(ABC):
def evaluate_workflow(
self,
items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
model_provider: str,
model_name: str,
tenant_id: str,
@@ -74,7 +74,7 @@ class BaseEvaluationInstance(ABC):
self,
items: list[EvaluationItemInput],
results: list[EvaluationItemResult],
-        metrics_config: dict,
+        customized_metrics: dict[str, Any],
tenant_id: str,
) -> list[EvaluationItemResult]:
"""Evaluate using a published workflow as the evaluator.
@@ -86,8 +86,8 @@ class BaseEvaluationInstance(ABC):
Args:
items: Evaluation items with inputs, expected_output, context.
results: Results from Phase 1 (with actual_output populated).
-            metrics_config: Must contain ``workflow_id`` pointing to a
-                published WORKFLOW-type App.
+            customized_metrics: Must contain ``evaluation_workflow_id``
+                pointing to a published WORKFLOW-type App.
tenant_id: Tenant scope.
Returns:
@@ -103,10 +103,10 @@ class BaseEvaluationInstance(ABC):
from models.model import App
from services.workflow_service import WorkflowService
-        workflow_id = metrics_config.get("workflow_id")
+        workflow_id = customized_metrics.get("evaluation_workflow_id")
if not workflow_id:
raise ValueError(
-                "metrics_config must contain 'workflow_id' for customized evaluator"
+                "customized_metrics must contain 'evaluation_workflow_id' for customized evaluator"
)
# Load the evaluator workflow resources using a dedicated session