mirror of
https://github.com/langgenius/dify.git
synced 2026-03-25 05:01:18 -04:00
feat: Add evaluation result details when querying the workflow run log.
This commit is contained in:
@@ -14,6 +14,7 @@ workflow_app_log_partial_fields = {
|
||||
"id": fields.String,
|
||||
"workflow_run": fields.Nested(workflow_run_for_log_fields, attribute="workflow_run", allow_null=True),
|
||||
"details": fields.Raw(attribute="details"),
|
||||
"evaluation": fields.Raw(attribute="evaluation", default=None),
|
||||
"created_from": fields.String,
|
||||
"created_by_role": fields.String,
|
||||
"created_by_account": fields.Nested(simple_account_fields, attribute="created_by_account", allow_null=True),
|
||||
|
||||
@@ -78,6 +78,7 @@ def upgrade():
|
||||
"evaluation_run_items",
|
||||
sa.Column("id", models.types.StringUUID(), nullable=False),
|
||||
sa.Column("evaluation_run_id", models.types.StringUUID(), nullable=False),
|
||||
sa.Column("workflow_run_id", models.types.StringUUID(), nullable=True),
|
||||
sa.Column("item_index", sa.Integer(), nullable=False),
|
||||
sa.Column("inputs", models.types.LongText(), nullable=True),
|
||||
sa.Column("expected_output", models.types.LongText(), nullable=True),
|
||||
@@ -95,10 +96,12 @@ def upgrade():
|
||||
batch_op.create_index(
|
||||
"evaluation_run_item_index_idx", ["evaluation_run_id", "item_index"], unique=False
|
||||
)
|
||||
batch_op.create_index("evaluation_run_item_workflow_run_idx", ["workflow_run_id"], unique=False)
|
||||
|
||||
|
||||
def downgrade():
|
||||
with op.batch_alter_table("evaluation_run_items", schema=None) as batch_op:
|
||||
batch_op.drop_index("evaluation_run_item_workflow_run_idx")
|
||||
batch_op.drop_index("evaluation_run_item_index_idx")
|
||||
batch_op.drop_index("evaluation_run_item_run_idx")
|
||||
op.drop_table("evaluation_run_items")
|
||||
|
||||
@@ -28,6 +28,7 @@ class EvaluationTargetType(StrEnum):
|
||||
SNIPPETS = "snippets"
|
||||
KNOWLEDGE_BASE = "knowledge_base"
|
||||
|
||||
|
||||
class EvaluationConfiguration(Base):
|
||||
"""Stores evaluation configuration for each target (App or Snippet)."""
|
||||
|
||||
@@ -132,10 +133,12 @@ class EvaluationRunItem(Base):
|
||||
sa.PrimaryKeyConstraint("id", name="evaluation_run_item_pkey"),
|
||||
sa.Index("evaluation_run_item_run_idx", "evaluation_run_id"),
|
||||
sa.Index("evaluation_run_item_index_idx", "evaluation_run_id", "item_index"),
|
||||
sa.Index("evaluation_run_item_workflow_run_idx", "workflow_run_id"),
|
||||
)
|
||||
|
||||
id: Mapped[str] = mapped_column(StringUUID, default=lambda: str(uuidv7()))
|
||||
evaluation_run_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
|
||||
workflow_run_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
|
||||
|
||||
item_index: Mapped[int] = mapped_column(Integer, nullable=False)
|
||||
inputs: Mapped[str | None] = mapped_column(LongText, nullable=True)
|
||||
|
||||
@@ -571,7 +571,7 @@ class EvaluationService:
|
||||
target_id: str,
|
||||
input_list: list[EvaluationDatasetInput],
|
||||
max_workers: int = 5,
|
||||
) -> list[dict[str, NodeRunResult]]:
|
||||
) -> tuple[list[dict[str, NodeRunResult]], list[str | None]]:
|
||||
"""Execute the evaluation target for every test-data item in parallel.
|
||||
|
||||
:param tenant_id: Workspace / tenant ID.
|
||||
@@ -579,9 +579,11 @@ class EvaluationService:
|
||||
:param target_id: ID of the App or CustomizedSnippet.
|
||||
:param input_list: All test-data items parsed from the dataset.
|
||||
:param max_workers: Maximum number of parallel worker threads.
|
||||
:return: Ordered list of ``{node_id: NodeRunResult}`` mappings. The
|
||||
*i*-th element corresponds to ``input_list[i]``. If a target
|
||||
execution fails, the corresponding element is an empty dict.
|
||||
:return: Tuple of (node_results, workflow_run_ids).
|
||||
node_results: ordered list of ``{node_id: NodeRunResult}`` mappings;
|
||||
the *i*-th element corresponds to ``input_list[i]``.
|
||||
workflow_run_ids: ordered list of workflow_run_id strings (or None)
|
||||
for each input item.
|
||||
"""
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
@@ -589,13 +591,12 @@ class EvaluationService:
|
||||
|
||||
flask_app: Flask = current_app._get_current_object() # type: ignore
|
||||
|
||||
def _worker(item: EvaluationDatasetInput) -> dict[str, NodeRunResult]:
|
||||
def _worker(item: EvaluationDatasetInput) -> tuple[dict[str, NodeRunResult], str | None]:
|
||||
with flask_app.app_context():
|
||||
from models.engine import db
|
||||
|
||||
with Session(db.engine, expire_on_commit=False) as thread_session:
|
||||
try:
|
||||
# 1. Execute target (workflow app / snippet)
|
||||
response = cls._run_single_target(
|
||||
session=thread_session,
|
||||
target_type=target_type,
|
||||
@@ -603,7 +604,6 @@ class EvaluationService:
|
||||
item=item,
|
||||
)
|
||||
|
||||
# 2. Extract workflow_run_id from the blocking response
|
||||
workflow_run_id = cls._extract_workflow_run_id(response)
|
||||
if not workflow_run_id:
|
||||
logger.warning(
|
||||
@@ -611,34 +611,38 @@ class EvaluationService:
|
||||
item.index,
|
||||
target_id,
|
||||
)
|
||||
return {}
|
||||
return {}, None
|
||||
|
||||
# 3. Query per-node execution results from DB
|
||||
return cls._query_node_run_results(
|
||||
node_results = cls._query_node_run_results(
|
||||
session=thread_session,
|
||||
tenant_id=tenant_id,
|
||||
app_id=target_id,
|
||||
workflow_run_id=workflow_run_id,
|
||||
)
|
||||
return node_results, workflow_run_id
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"Target execution failed for item %d (target=%s)",
|
||||
item.index,
|
||||
target_id,
|
||||
)
|
||||
return {}
|
||||
return {}, None
|
||||
|
||||
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
||||
futures = [executor.submit(_worker, item) for item in input_list]
|
||||
ordered_results: list[dict[str, NodeRunResult]] = []
|
||||
ordered_workflow_run_ids: list[str | None] = []
|
||||
for future in futures:
|
||||
try:
|
||||
ordered_results.append(future.result())
|
||||
node_result, wf_run_id = future.result()
|
||||
ordered_results.append(node_result)
|
||||
ordered_workflow_run_ids.append(wf_run_id)
|
||||
except Exception:
|
||||
logger.exception("Unexpected error collecting target execution result")
|
||||
ordered_results.append({})
|
||||
ordered_workflow_run_ids.append(None)
|
||||
|
||||
return ordered_results
|
||||
return ordered_results, ordered_workflow_run_ids
|
||||
|
||||
@classmethod
|
||||
def _run_single_target(
|
||||
|
||||
@@ -19,17 +19,28 @@ class LogView:
|
||||
"""Lightweight wrapper for WorkflowAppLog with computed details.
|
||||
|
||||
- Exposes `details_` for marshalling to `details` in API response
|
||||
- Exposes `evaluation_` for marshalling evaluation metrics in API response
|
||||
- Proxies all other attributes to the underlying `WorkflowAppLog`
|
||||
"""
|
||||
|
||||
def __init__(self, log: WorkflowAppLog, details: dict | None):
|
||||
def __init__(
|
||||
self,
|
||||
log: WorkflowAppLog,
|
||||
details: dict | None,
|
||||
evaluation: list[dict] | None = None,
|
||||
):
|
||||
self.log = log
|
||||
self.details_ = details
|
||||
self.evaluation_ = evaluation
|
||||
|
||||
@property
|
||||
def details(self) -> dict | None:
|
||||
return self.details_
|
||||
|
||||
@property
|
||||
def evaluation(self) -> list[dict] | None:
|
||||
return self.evaluation_
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self.log, name)
|
||||
|
||||
@@ -159,12 +170,20 @@ class WorkflowAppService:
|
||||
# Execute query and get items
|
||||
if detail:
|
||||
rows = session.execute(offset_stmt).all()
|
||||
items = [
|
||||
LogView(log, {"trigger_metadata": self.handle_trigger_metadata(app_model.tenant_id, meta_val)})
|
||||
logs_with_details = [
|
||||
(log, {"trigger_metadata": self.handle_trigger_metadata(app_model.tenant_id, meta_val)})
|
||||
for log, meta_val in rows
|
||||
]
|
||||
else:
|
||||
items = [LogView(log, None) for log in session.scalars(offset_stmt).all()]
|
||||
logs_with_details = [(log, None) for log in session.scalars(offset_stmt).all()]
|
||||
|
||||
workflow_run_ids = [log.workflow_run_id for log, _ in logs_with_details]
|
||||
eval_map = self._batch_query_evaluation_metrics(session, workflow_run_ids)
|
||||
|
||||
items = [
|
||||
LogView(log, details, evaluation=eval_map.get(log.workflow_run_id))
|
||||
for log, details in logs_with_details
|
||||
]
|
||||
return {
|
||||
"page": page,
|
||||
"limit": limit,
|
||||
@@ -246,6 +265,45 @@ class WorkflowAppService:
|
||||
"data": items,
|
||||
}
|
||||
|
||||
@staticmethod
def _batch_query_evaluation_metrics(
    session: Session,
    workflow_run_ids: list[str],
) -> dict[str, list[dict[str, Any]]]:
    """Return evaluation metrics keyed by workflow_run_id.

    Only metrics from completed evaluation runs are returned. If a workflow
    run was not part of any evaluation (or the evaluation has not
    completed), it is absent from the result dict.

    A metrics payload that is not valid JSON, or that does not decode to a
    list, is logged and skipped so that a single bad row cannot fail the
    whole lookup.

    :param session: Database session used to execute the lookup.
    :param workflow_run_ids: Workflow run IDs to look up; falsy entries
        (``None`` / empty string) are ignored.
    :return: Mapping of workflow_run_id to the concatenated metric entries
        of every matching evaluation run item.
    """
    import logging

    from models.evaluation import EvaluationRun, EvaluationRunItem, EvaluationRunStatus

    # Drop falsy IDs and de-duplicate (order-preserving) so the IN clause
    # carries each ID only once.
    unique_ids = list(dict.fromkeys(wid for wid in workflow_run_ids if wid))
    if not unique_ids:
        return {}

    stmt = (
        select(EvaluationRunItem.workflow_run_id, EvaluationRunItem.metrics)
        .join(EvaluationRun, EvaluationRun.id == EvaluationRunItem.evaluation_run_id)
        .where(
            EvaluationRunItem.workflow_run_id.in_(unique_ids),
            EvaluationRun.status == EvaluationRunStatus.COMPLETED,
        )
    )

    result: dict[str, list[dict[str, Any]]] = {}
    for wf_run_id, metrics_json in session.execute(stmt).all():
        if not (wf_run_id and metrics_json):
            continue

        try:
            parsed = json.loads(metrics_json)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError subclass.
            logging.getLogger(__name__).warning(
                "Skipping malformed evaluation metrics for workflow run %s",
                wf_run_id,
                exc_info=True,
            )
            continue

        if not isinstance(parsed, list):
            # list.extend() on a dict or str would silently add its keys or
            # characters, so refuse anything that is not a list.
            logging.getLogger(__name__).warning(
                "Skipping non-list evaluation metrics for workflow run %s",
                wf_run_id,
            )
            continue

        # A workflow run may appear in several evaluation run items; their
        # metric entries are concatenated under the same key.
        result.setdefault(wf_run_id, []).extend(parsed)

    return result
|
||||
|
||||
def handle_trigger_metadata(self, tenant_id: str, meta_val: str | None) -> dict[str, Any]:
|
||||
metadata: dict[str, Any] | None = self._safe_json_loads(meta_val)
|
||||
if not metadata:
|
||||
|
||||
@@ -89,7 +89,7 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
|
||||
)
|
||||
else:
|
||||
evaluation_service = EvaluationService()
|
||||
node_run_result_mapping_list: list[dict[str, NodeRunResult]] = evaluation_service.execute_targets(
|
||||
node_run_result_mapping_list, workflow_run_ids = evaluation_service.execute_targets(
|
||||
tenant_id=run_data.tenant_id,
|
||||
target_type=run_data.target_type,
|
||||
target_id=run_data.target_id,
|
||||
@@ -102,6 +102,13 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
|
||||
node_run_result_mapping_list=node_run_result_mapping_list,
|
||||
)
|
||||
|
||||
_backfill_workflow_run_ids(
|
||||
session=session,
|
||||
evaluation_run_id=run_data.evaluation_run_id,
|
||||
input_list=run_data.input_list,
|
||||
workflow_run_ids=workflow_run_ids,
|
||||
)
|
||||
|
||||
# Compute summary metrics
|
||||
metrics_summary = _compute_metrics_summary(results, run_data.judgment_config)
|
||||
|
||||
@@ -235,6 +242,28 @@ def _execute_retrieval_test(
|
||||
return results
|
||||
|
||||
|
||||
def _backfill_workflow_run_ids(
    session: Any,
    evaluation_run_id: str,
    input_list: list[EvaluationDatasetInput],
    workflow_run_ids: list[str | None],
) -> None:
    """Set ``workflow_run_id`` on items that were created by the runner.

    ``input_list`` and ``workflow_run_ids`` are paired positionally; inputs
    whose workflow run ID is falsy are left untouched. All affected
    ``EvaluationRunItem`` rows are loaded with one query (instead of one
    query per input) and updated in memory, then the session is committed.
    Every row of the run that matches an input's ``item_index`` is updated.

    :param session: Database session used for the lookup and the commit.
    :param evaluation_run_id: ID of the evaluation run whose items are updated.
    :param input_list: Dataset inputs; each item's ``index`` is matched
        against ``EvaluationRunItem.item_index``.
    :param workflow_run_ids: Workflow run ID (or ``None``) for each input,
        in the same order as ``input_list``.
    """
    from models.evaluation import EvaluationRunItem

    # item_index -> workflow_run_id for inputs that actually produced a run.
    # If an index repeats, the last pairing wins, as in a sequential update.
    run_id_by_index = {
        item.index: wf_run_id
        for item, wf_run_id in zip(input_list, workflow_run_ids)
        if wf_run_id
    }

    if run_id_by_index:
        run_items = (
            session.query(EvaluationRunItem)
            .filter(
                EvaluationRunItem.evaluation_run_id == evaluation_run_id,
                EvaluationRunItem.item_index.in_(list(run_id_by_index)),
            )
            .all()
        )
        for run_item in run_items:
            run_item.workflow_run_id = run_id_by_index[run_item.item_index]

    # Commit unconditionally, even when there was nothing to backfill.
    session.commit()
|
||||
|
||||
|
||||
def _mark_run_failed(session: Any, run_id: str, error: str) -> None:
|
||||
"""Mark an evaluation run as failed."""
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user