feat: Add evaluation result details when querying the workflow run log.

This commit is contained in:
FFXN
2026-03-19 11:45:11 +08:00
parent 87dd0d80e7
commit 6b7b4e40cf
6 changed files with 116 additions and 18 deletions

View File

@@ -14,6 +14,7 @@ workflow_app_log_partial_fields = {
"id": fields.String,
"workflow_run": fields.Nested(workflow_run_for_log_fields, attribute="workflow_run", allow_null=True),
"details": fields.Raw(attribute="details"),
"evaluation": fields.Raw(attribute="evaluation", default=None),
"created_from": fields.String,
"created_by_role": fields.String,
"created_by_account": fields.Nested(simple_account_fields, attribute="created_by_account", allow_null=True),

View File

@@ -78,6 +78,7 @@ def upgrade():
"evaluation_run_items",
sa.Column("id", models.types.StringUUID(), nullable=False),
sa.Column("evaluation_run_id", models.types.StringUUID(), nullable=False),
sa.Column("workflow_run_id", models.types.StringUUID(), nullable=True),
sa.Column("item_index", sa.Integer(), nullable=False),
sa.Column("inputs", models.types.LongText(), nullable=True),
sa.Column("expected_output", models.types.LongText(), nullable=True),
@@ -95,10 +96,12 @@ def upgrade():
batch_op.create_index(
"evaluation_run_item_index_idx", ["evaluation_run_id", "item_index"], unique=False
)
batch_op.create_index("evaluation_run_item_workflow_run_idx", ["workflow_run_id"], unique=False)
def downgrade():
    """Revert the upgrade: drop all evaluation_run_items indexes, then the table."""
    # Indexes must be removed before the table itself is dropped.
    index_names = (
        "evaluation_run_item_workflow_run_idx",
        "evaluation_run_item_index_idx",
        "evaluation_run_item_run_idx",
    )
    with op.batch_alter_table("evaluation_run_items", schema=None) as batch_op:
        for index_name in index_names:
            batch_op.drop_index(index_name)
    op.drop_table("evaluation_run_items")

View File

@@ -28,6 +28,7 @@ class EvaluationTargetType(StrEnum):
SNIPPETS = "snippets"
KNOWLEDGE_BASE = "knowledge_base"
class EvaluationConfiguration(Base):
"""Stores evaluation configuration for each target (App or Snippet)."""
@@ -132,10 +133,12 @@ class EvaluationRunItem(Base):
sa.PrimaryKeyConstraint("id", name="evaluation_run_item_pkey"),
sa.Index("evaluation_run_item_run_idx", "evaluation_run_id"),
sa.Index("evaluation_run_item_index_idx", "evaluation_run_id", "item_index"),
sa.Index("evaluation_run_item_workflow_run_idx", "workflow_run_id"),
)
id: Mapped[str] = mapped_column(StringUUID, default=lambda: str(uuidv7()))
evaluation_run_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
workflow_run_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
item_index: Mapped[int] = mapped_column(Integer, nullable=False)
inputs: Mapped[str | None] = mapped_column(LongText, nullable=True)

View File

@@ -571,7 +571,7 @@ class EvaluationService:
target_id: str,
input_list: list[EvaluationDatasetInput],
max_workers: int = 5,
) -> list[dict[str, NodeRunResult]]:
) -> tuple[list[dict[str, NodeRunResult]], list[str | None]]:
"""Execute the evaluation target for every test-data item in parallel.
:param tenant_id: Workspace / tenant ID.
@@ -579,9 +579,11 @@ class EvaluationService:
:param target_id: ID of the App or CustomizedSnippet.
:param input_list: All test-data items parsed from the dataset.
:param max_workers: Maximum number of parallel worker threads.
:return: Ordered list of ``{node_id: NodeRunResult}`` mappings. The
*i*-th element corresponds to ``input_list[i]``. If a target
execution fails, the corresponding element is an empty dict.
:return: Tuple of (node_results, workflow_run_ids).
node_results: ordered list of ``{node_id: NodeRunResult}`` mappings;
the *i*-th element corresponds to ``input_list[i]``.
workflow_run_ids: ordered list of workflow_run_id strings (or None)
for each input item.
"""
from concurrent.futures import ThreadPoolExecutor
@@ -589,13 +591,12 @@ class EvaluationService:
flask_app: Flask = current_app._get_current_object() # type: ignore
def _worker(item: EvaluationDatasetInput) -> dict[str, NodeRunResult]:
def _worker(item: EvaluationDatasetInput) -> tuple[dict[str, NodeRunResult], str | None]:
with flask_app.app_context():
from models.engine import db
with Session(db.engine, expire_on_commit=False) as thread_session:
try:
# 1. Execute target (workflow app / snippet)
response = cls._run_single_target(
session=thread_session,
target_type=target_type,
@@ -603,7 +604,6 @@ class EvaluationService:
item=item,
)
# 2. Extract workflow_run_id from the blocking response
workflow_run_id = cls._extract_workflow_run_id(response)
if not workflow_run_id:
logger.warning(
@@ -611,34 +611,38 @@ class EvaluationService:
item.index,
target_id,
)
return {}
return {}, None
# 3. Query per-node execution results from DB
return cls._query_node_run_results(
node_results = cls._query_node_run_results(
session=thread_session,
tenant_id=tenant_id,
app_id=target_id,
workflow_run_id=workflow_run_id,
)
return node_results, workflow_run_id
except Exception:
logger.exception(
"Target execution failed for item %d (target=%s)",
item.index,
target_id,
)
return {}
return {}, None
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(_worker, item) for item in input_list]
ordered_results: list[dict[str, NodeRunResult]] = []
ordered_workflow_run_ids: list[str | None] = []
for future in futures:
try:
ordered_results.append(future.result())
node_result, wf_run_id = future.result()
ordered_results.append(node_result)
ordered_workflow_run_ids.append(wf_run_id)
except Exception:
logger.exception("Unexpected error collecting target execution result")
ordered_results.append({})
ordered_workflow_run_ids.append(None)
return ordered_results
return ordered_results, ordered_workflow_run_ids
@classmethod
def _run_single_target(

View File

@@ -19,17 +19,28 @@ class LogView:
"""Lightweight wrapper for WorkflowAppLog with computed details.
- Exposes `details_` for marshalling to `details` in API response
- Exposes `evaluation_` for marshalling evaluation metrics in API response
- Proxies all other attributes to the underlying `WorkflowAppLog`
"""
def __init__(self, log: WorkflowAppLog, details: dict | None):
def __init__(
self,
log: WorkflowAppLog,
details: dict | None,
evaluation: list[dict] | None = None,
):
"""Wrap *log*, attaching precomputed ``details`` and optional ``evaluation`` metrics."""
# Underlying WorkflowAppLog; unknown attribute reads are proxied to it via __getattr__.
self.log = log
# Trailing underscore avoids clashing with the proxied log attributes.
self.details_ = details
self.evaluation_ = evaluation
@property
def details(self) -> dict | None:
# Exposed for marshalling to the `details` field in the API response.
return self.details_
@property
def evaluation(self) -> list[dict] | None:
# Exposed for marshalling evaluation metrics in the API response;
# None when the run was not part of a completed evaluation.
return self.evaluation_
def __getattr__(self, name):
# Only invoked when normal attribute lookup fails on the wrapper:
# delegate everything else to the wrapped WorkflowAppLog.
return getattr(self.log, name)
@@ -159,12 +170,20 @@ class WorkflowAppService:
# Execute query and get items
if detail:
rows = session.execute(offset_stmt).all()
items = [
LogView(log, {"trigger_metadata": self.handle_trigger_metadata(app_model.tenant_id, meta_val)})
logs_with_details = [
(log, {"trigger_metadata": self.handle_trigger_metadata(app_model.tenant_id, meta_val)})
for log, meta_val in rows
]
else:
items = [LogView(log, None) for log in session.scalars(offset_stmt).all()]
logs_with_details = [(log, None) for log in session.scalars(offset_stmt).all()]
workflow_run_ids = [log.workflow_run_id for log, _ in logs_with_details]
eval_map = self._batch_query_evaluation_metrics(session, workflow_run_ids)
items = [
LogView(log, details, evaluation=eval_map.get(log.workflow_run_id))
for log, details in logs_with_details
]
return {
"page": page,
"limit": limit,
@@ -246,6 +265,45 @@ class WorkflowAppService:
"data": items,
}
@staticmethod
def _batch_query_evaluation_metrics(
    session: Session,
    workflow_run_ids: list[str],
) -> dict[str, list[dict[str, Any]]]:
    """Return evaluation metrics keyed by workflow_run_id.

    Only returns metrics from completed evaluation runs. If a workflow
    run was not part of any evaluation (or the evaluation has not
    completed), it will be absent from the result dict.

    :param session: Active SQLAlchemy session used for the lookup.
    :param workflow_run_ids: Candidate workflow run IDs; falsy entries
        (None / empty string) are ignored.
    :return: Mapping of workflow_run_id to a flattened list of metric
        dicts, concatenated across all matching evaluation run items.
    """
    from models.evaluation import EvaluationRun, EvaluationRunItem, EvaluationRunStatus

    # Filter out falsy IDs up front; this also covers the empty-input
    # case, so a single check suffices before hitting the database.
    non_null_ids = [wid for wid in workflow_run_ids if wid]
    if not non_null_ids:
        return {}

    stmt = (
        select(EvaluationRunItem.workflow_run_id, EvaluationRunItem.metrics)
        .join(EvaluationRun, EvaluationRun.id == EvaluationRunItem.evaluation_run_id)
        .where(
            EvaluationRunItem.workflow_run_id.in_(non_null_ids),
            EvaluationRun.status == EvaluationRunStatus.COMPLETED,
        )
    )
    rows = session.execute(stmt).all()

    # A workflow run may appear in several evaluation run items, so the
    # parsed metric lists are concatenated per run.
    result: dict[str, list[dict[str, Any]]] = {}
    for wf_run_id, metrics_json in rows:
        if wf_run_id and metrics_json:
            # `metrics` is persisted as a JSON-encoded list of dicts.
            result.setdefault(wf_run_id, []).extend(json.loads(metrics_json))
    return result
def handle_trigger_metadata(self, tenant_id: str, meta_val: str | None) -> dict[str, Any]:
metadata: dict[str, Any] | None = self._safe_json_loads(meta_val)
if not metadata:

View File

@@ -89,7 +89,7 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
)
else:
evaluation_service = EvaluationService()
node_run_result_mapping_list: list[dict[str, NodeRunResult]] = evaluation_service.execute_targets(
node_run_result_mapping_list, workflow_run_ids = evaluation_service.execute_targets(
tenant_id=run_data.tenant_id,
target_type=run_data.target_type,
target_id=run_data.target_id,
@@ -102,6 +102,13 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
node_run_result_mapping_list=node_run_result_mapping_list,
)
_backfill_workflow_run_ids(
session=session,
evaluation_run_id=run_data.evaluation_run_id,
input_list=run_data.input_list,
workflow_run_ids=workflow_run_ids,
)
# Compute summary metrics
metrics_summary = _compute_metrics_summary(results, run_data.judgment_config)
@@ -235,6 +242,28 @@ def _execute_retrieval_test(
return results
def _backfill_workflow_run_ids(
    session: Any,
    evaluation_run_id: str,
    input_list: list[EvaluationDatasetInput],
    workflow_run_ids: list[str | None],
) -> None:
    """Set ``workflow_run_id`` on items that were created by the runner.

    :param session: Active DB session; committed once at the end.
    :param evaluation_run_id: Parent evaluation run whose items are updated.
    :param input_list: Dataset inputs; ``item.index`` identifies each row.
    :param workflow_run_ids: One workflow_run_id (or None) per input item,
        positionally aligned with ``input_list``.
    """
    from models.evaluation import EvaluationRunItem

    # Load all items of this run with a single query instead of one
    # query per input item (avoids an N+1 query pattern). setdefault
    # preserves the original first-match semantics should duplicate
    # item_index values ever exist.
    items_by_index: dict[int, EvaluationRunItem] = {}
    for run_item in session.query(EvaluationRunItem).filter_by(
        evaluation_run_id=evaluation_run_id
    ):
        items_by_index.setdefault(run_item.item_index, run_item)

    for item, wf_run_id in zip(input_list, workflow_run_ids):
        if not wf_run_id:
            continue
        run_item = items_by_index.get(item.index)
        if run_item:
            run_item.workflow_run_id = wf_run_id
    session.commit()
def _mark_run_failed(session: Any, run_id: str, error: str) -> None:
"""Mark an evaluation run as failed."""
try: