feat: Add evaluation result details when querying the workflow run log.

This commit is contained in:
FFXN
2026-03-19 11:45:11 +08:00
parent 87dd0d80e7
commit 6b7b4e40cf
6 changed files with 116 additions and 18 deletions

View File

@@ -14,6 +14,7 @@ workflow_app_log_partial_fields = {
"id": fields.String,
"workflow_run": fields.Nested(workflow_run_for_log_fields, attribute="workflow_run", allow_null=True),
"details": fields.Raw(attribute="details"),
"evaluation": fields.Raw(attribute="evaluation", default=None),
"created_from": fields.String,
"created_by_role": fields.String,
"created_by_account": fields.Nested(simple_account_fields, attribute="created_by_account", allow_null=True),

View File

@@ -78,6 +78,7 @@ def upgrade():
"evaluation_run_items",
sa.Column("id", models.types.StringUUID(), nullable=False),
sa.Column("evaluation_run_id", models.types.StringUUID(), nullable=False),
sa.Column("workflow_run_id", models.types.StringUUID(), nullable=True),
sa.Column("item_index", sa.Integer(), nullable=False),
sa.Column("inputs", models.types.LongText(), nullable=True),
sa.Column("expected_output", models.types.LongText(), nullable=True),
@@ -95,10 +96,12 @@ def upgrade():
batch_op.create_index(
"evaluation_run_item_index_idx", ["evaluation_run_id", "item_index"], unique=False
)
batch_op.create_index("evaluation_run_item_workflow_run_idx", ["workflow_run_id"], unique=False)
def downgrade():
    """Revert the upgrade: drop all evaluation_run_items indexes, then the table."""
    # Indexes must be removed before the table itself is dropped.
    index_names = (
        "evaluation_run_item_workflow_run_idx",
        "evaluation_run_item_index_idx",
        "evaluation_run_item_run_idx",
    )
    with op.batch_alter_table("evaluation_run_items", schema=None) as batch_op:
        for index_name in index_names:
            batch_op.drop_index(index_name)
    op.drop_table("evaluation_run_items")

View File

@@ -28,6 +28,7 @@ class EvaluationTargetType(StrEnum):
SNIPPETS = "snippets"
KNOWLEDGE_BASE = "knowledge_base"
class EvaluationConfiguration(Base):
"""Stores evaluation configuration for each target (App or Snippet)."""
@@ -132,10 +133,12 @@ class EvaluationRunItem(Base):
sa.PrimaryKeyConstraint("id", name="evaluation_run_item_pkey"),
sa.Index("evaluation_run_item_run_idx", "evaluation_run_id"),
sa.Index("evaluation_run_item_index_idx", "evaluation_run_id", "item_index"),
sa.Index("evaluation_run_item_workflow_run_idx", "workflow_run_id"),
)
id: Mapped[str] = mapped_column(StringUUID, default=lambda: str(uuidv7()))
evaluation_run_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
workflow_run_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
item_index: Mapped[int] = mapped_column(Integer, nullable=False)
inputs: Mapped[str | None] = mapped_column(LongText, nullable=True)

View File

@@ -571,7 +571,7 @@ class EvaluationService:
target_id: str,
input_list: list[EvaluationDatasetInput],
max_workers: int = 5,
) -> list[dict[str, NodeRunResult]]:
) -> tuple[list[dict[str, NodeRunResult]], list[str | None]]:
"""Execute the evaluation target for every test-data item in parallel.
:param tenant_id: Workspace / tenant ID.
@@ -579,9 +579,11 @@ class EvaluationService:
:param target_id: ID of the App or CustomizedSnippet.
:param input_list: All test-data items parsed from the dataset.
:param max_workers: Maximum number of parallel worker threads.
:return: Ordered list of ``{node_id: NodeRunResult}`` mappings. The
*i*-th element corresponds to ``input_list[i]``. If a target
execution fails, the corresponding element is an empty dict.
:return: Tuple of (node_results, workflow_run_ids).
node_results: ordered list of ``{node_id: NodeRunResult}`` mappings;
the *i*-th element corresponds to ``input_list[i]``.
workflow_run_ids: ordered list of workflow_run_id strings (or None)
for each input item.
"""
from concurrent.futures import ThreadPoolExecutor
@@ -589,13 +591,12 @@ class EvaluationService:
flask_app: Flask = current_app._get_current_object() # type: ignore
def _worker(item: EvaluationDatasetInput) -> dict[str, NodeRunResult]:
def _worker(item: EvaluationDatasetInput) -> tuple[dict[str, NodeRunResult], str | None]:
with flask_app.app_context():
from models.engine import db
with Session(db.engine, expire_on_commit=False) as thread_session:
try:
# 1. Execute target (workflow app / snippet)
response = cls._run_single_target(
session=thread_session,
target_type=target_type,
@@ -603,7 +604,6 @@ class EvaluationService:
item=item,
)
# 2. Extract workflow_run_id from the blocking response
workflow_run_id = cls._extract_workflow_run_id(response)
if not workflow_run_id:
logger.warning(
@@ -611,34 +611,38 @@ class EvaluationService:
item.index,
target_id,
)
return {}
return {}, None
# 3. Query per-node execution results from DB
return cls._query_node_run_results(
node_results = cls._query_node_run_results(
session=thread_session,
tenant_id=tenant_id,
app_id=target_id,
workflow_run_id=workflow_run_id,
)
return node_results, workflow_run_id
except Exception:
logger.exception(
"Target execution failed for item %d (target=%s)",
item.index,
target_id,
)
return {}
return {}, None
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(_worker, item) for item in input_list]
ordered_results: list[dict[str, NodeRunResult]] = []
ordered_workflow_run_ids: list[str | None] = []
for future in futures:
try:
ordered_results.append(future.result())
node_result, wf_run_id = future.result()
ordered_results.append(node_result)
ordered_workflow_run_ids.append(wf_run_id)
except Exception:
logger.exception("Unexpected error collecting target execution result")
ordered_results.append({})
ordered_workflow_run_ids.append(None)
return ordered_results
return ordered_results, ordered_workflow_run_ids
@classmethod
def _run_single_target(

View File

@@ -19,17 +19,28 @@ class LogView:
"""Lightweight wrapper for WorkflowAppLog with computed details.
- Exposes `details_` for marshalling to `details` in API response
- Exposes `evaluation_` for marshalling evaluation metrics in API response
- Proxies all other attributes to the underlying `WorkflowAppLog`
"""
def __init__(self, log: WorkflowAppLog, details: dict | None):
def __init__(
self,
log: WorkflowAppLog,
details: dict | None,
evaluation: list[dict] | None = None,
):
"""Wrap *log*, attaching precomputed ``details`` and optional ``evaluation`` metrics."""
# Underlying WorkflowAppLog; unknown attribute reads are proxied to it via __getattr__.
self.log = log
# Trailing underscore avoids clashing with the proxied log attributes.
self.details_ = details
self.evaluation_ = evaluation
@property
def details(self) -> dict | None:
# Exposed for marshalling to the `details` field in the API response.
return self.details_
@property
def evaluation(self) -> list[dict] | None:
# Exposed for marshalling evaluation metrics in the API response;
# None when the run was not part of a completed evaluation.
return self.evaluation_
def __getattr__(self, name):
# Only invoked when normal attribute lookup fails on the wrapper:
# delegate everything else to the wrapped WorkflowAppLog.
return getattr(self.log, name)
@@ -159,12 +170,20 @@ class WorkflowAppService:
# Execute query and get items
if detail:
rows = session.execute(offset_stmt).all()
items = [
LogView(log, {"trigger_metadata": self.handle_trigger_metadata(app_model.tenant_id, meta_val)})
logs_with_details = [
(log, {"trigger_metadata": self.handle_trigger_metadata(app_model.tenant_id, meta_val)})
for log, meta_val in rows
]
else:
items = [LogView(log, None) for log in session.scalars(offset_stmt).all()]
logs_with_details = [(log, None) for log in session.scalars(offset_stmt).all()]
workflow_run_ids = [log.workflow_run_id for log, _ in logs_with_details]
eval_map = self._batch_query_evaluation_metrics(session, workflow_run_ids)
items = [
LogView(log, details, evaluation=eval_map.get(log.workflow_run_id))
for log, details in logs_with_details
]
return {
"page": page,
"limit": limit,
@@ -246,6 +265,45 @@ class WorkflowAppService:
"data": items,
}
@staticmethod
def _batch_query_evaluation_metrics(
    session: Session,
    workflow_run_ids: list[str],
) -> dict[str, list[dict[str, Any]]]:
    """Return evaluation metrics keyed by workflow_run_id.

    Only returns metrics from completed evaluation runs. If a workflow
    run was not part of any evaluation (or the evaluation has not
    completed), it will be absent from the result dict.

    :param session: Active SQLAlchemy session used for the lookup.
    :param workflow_run_ids: Candidate workflow run IDs; falsy entries
        (None / empty string) are ignored.
    :return: Mapping of workflow_run_id to a flattened list of metric
        dicts, concatenated across all matching evaluation run items.
    """
    from models.evaluation import EvaluationRun, EvaluationRunItem, EvaluationRunStatus

    # Filter out falsy IDs up front; this also covers the empty-input
    # case, so a single check suffices before hitting the database.
    non_null_ids = [wid for wid in workflow_run_ids if wid]
    if not non_null_ids:
        return {}

    stmt = (
        select(EvaluationRunItem.workflow_run_id, EvaluationRunItem.metrics)
        .join(EvaluationRun, EvaluationRun.id == EvaluationRunItem.evaluation_run_id)
        .where(
            EvaluationRunItem.workflow_run_id.in_(non_null_ids),
            EvaluationRun.status == EvaluationRunStatus.COMPLETED,
        )
    )
    rows = session.execute(stmt).all()

    # A workflow run may appear in several evaluation run items, so the
    # parsed metric lists are concatenated per run.
    result: dict[str, list[dict[str, Any]]] = {}
    for wf_run_id, metrics_json in rows:
        if wf_run_id and metrics_json:
            # `metrics` is persisted as a JSON-encoded list of dicts.
            result.setdefault(wf_run_id, []).extend(json.loads(metrics_json))
    return result
def handle_trigger_metadata(self, tenant_id: str, meta_val: str | None) -> dict[str, Any]:
metadata: dict[str, Any] | None = self._safe_json_loads(meta_val)
if not metadata:

View File

@@ -89,7 +89,7 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
)
else:
evaluation_service = EvaluationService()
node_run_result_mapping_list: list[dict[str, NodeRunResult]] = evaluation_service.execute_targets(
node_run_result_mapping_list, workflow_run_ids = evaluation_service.execute_targets(
tenant_id=run_data.tenant_id,
target_type=run_data.target_type,
target_id=run_data.target_id,
@@ -102,6 +102,13 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
node_run_result_mapping_list=node_run_result_mapping_list,
)
_backfill_workflow_run_ids(
session=session,
evaluation_run_id=run_data.evaluation_run_id,
input_list=run_data.input_list,
workflow_run_ids=workflow_run_ids,
)
# Compute summary metrics
metrics_summary = _compute_metrics_summary(results, run_data.judgment_config)
@@ -235,6 +242,28 @@ def _execute_retrieval_test(
return results
def _backfill_workflow_run_ids(
    session: Any,
    evaluation_run_id: str,
    input_list: list[EvaluationDatasetInput],
    workflow_run_ids: list[str | None],
) -> None:
    """Set ``workflow_run_id`` on items that were created by the runner.

    :param session: Active DB session; committed once at the end.
    :param evaluation_run_id: Parent evaluation run whose items are updated.
    :param input_list: Dataset inputs; ``item.index`` identifies each row.
    :param workflow_run_ids: One workflow_run_id (or None) per input item,
        positionally aligned with ``input_list``.
    """
    from models.evaluation import EvaluationRunItem

    # Load all items of this run with a single query instead of one
    # query per input item (avoids an N+1 query pattern). setdefault
    # preserves the original first-match semantics should duplicate
    # item_index values ever exist.
    items_by_index: dict[int, EvaluationRunItem] = {}
    for run_item in session.query(EvaluationRunItem).filter_by(
        evaluation_run_id=evaluation_run_id
    ):
        items_by_index.setdefault(run_item.item_index, run_item)

    for item, wf_run_id in zip(input_list, workflow_run_ids):
        if not wf_run_id:
            continue
        run_item = items_by_index.get(item.index)
        if run_item:
            run_item.workflow_run_id = wf_run_id
    session.commit()
def _mark_run_failed(session: Any, run_id: str, error: str) -> None:
"""Mark an evaluation run as failed."""
try: