mirror of
https://github.com/langgenius/dify.git
synced 2026-03-10 01:01:37 -04:00
feat: Implement customized evaluation in BaseEvaluationInstance.
This commit is contained in:
@@ -1,14 +1,21 @@
|
||||
import json
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Mapping
|
||||
from typing import Any
|
||||
|
||||
from core.evaluation.entities.evaluation_entity import (
|
||||
EvaluationCategory,
|
||||
EvaluationItemInput,
|
||||
EvaluationItemResult,
|
||||
EvaluationMetric,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseEvaluationInstance(ABC):
|
||||
"""Abstract base class for evaluation framework adapters."""
|
||||
"""Abstract base class for evaluation framework adapters. """
|
||||
|
||||
@abstractmethod
|
||||
def evaluate_llm(
|
||||
@@ -62,3 +69,172 @@ class BaseEvaluationInstance(ABC):
|
||||
def get_supported_metrics(self, category: EvaluationCategory) -> list[str]:
|
||||
"""Return the list of supported metric names for a given evaluation category."""
|
||||
...
|
||||
|
||||
def evaluate_with_customized_workflow(
|
||||
self,
|
||||
items: list[EvaluationItemInput],
|
||||
results: list[EvaluationItemResult],
|
||||
metrics_config: dict,
|
||||
tenant_id: str,
|
||||
) -> list[EvaluationItemResult]:
|
||||
"""Evaluate using a published workflow as the evaluator.
|
||||
|
||||
The evaluator workflow's output variables are treated as metrics:
|
||||
each output variable name becomes a metric name, and its value
|
||||
becomes the score.
|
||||
|
||||
Args:
|
||||
items: Evaluation items with inputs, expected_output, context.
|
||||
results: Results from Phase 1 (with actual_output populated).
|
||||
metrics_config: Must contain ``workflow_id`` pointing to a
|
||||
published WORKFLOW-type App.
|
||||
tenant_id: Tenant scope.
|
||||
|
||||
Returns:
|
||||
A list of ``EvaluationItemResult`` with metrics extracted from
|
||||
the workflow outputs.
|
||||
"""
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from core.app.apps.workflow.app_generator import WorkflowAppGenerator
|
||||
from core.app.entities.app_invoke_entities import InvokeFrom
|
||||
from core.evaluation.runners import get_service_account_for_app
|
||||
from models.engine import db
|
||||
from models.model import App
|
||||
from services.workflow_service import WorkflowService
|
||||
|
||||
workflow_id = metrics_config.get("workflow_id")
|
||||
if not workflow_id:
|
||||
raise ValueError(
|
||||
"metrics_config must contain 'workflow_id' for customized evaluator"
|
||||
)
|
||||
|
||||
# Load the evaluator workflow resources using a dedicated session
|
||||
with Session(db.engine, expire_on_commit=False) as session, session.begin():
|
||||
app = session.query(App).filter_by(
|
||||
id=workflow_id, tenant_id=tenant_id
|
||||
).first()
|
||||
if not app:
|
||||
raise ValueError(
|
||||
f"Evaluation workflow app {workflow_id} not found in tenant {tenant_id}"
|
||||
)
|
||||
service_account = get_service_account_for_app(session, workflow_id)
|
||||
|
||||
workflow_service = WorkflowService()
|
||||
published_workflow = workflow_service.get_published_workflow(app_model=app)
|
||||
if not published_workflow:
|
||||
raise ValueError(
|
||||
f"No published workflow found for evaluation app {workflow_id}"
|
||||
)
|
||||
|
||||
result_by_index = {r.index: r for r in results}
|
||||
eval_results: list[EvaluationItemResult] = []
|
||||
for item in items:
|
||||
result = result_by_index.get(item.index)
|
||||
try:
|
||||
workflow_inputs = self._build_workflow_inputs(item, result)
|
||||
|
||||
generator = WorkflowAppGenerator()
|
||||
response: Mapping[str, Any] = generator.generate(
|
||||
app_model=app,
|
||||
workflow=published_workflow,
|
||||
user=service_account,
|
||||
args={"inputs": workflow_inputs},
|
||||
invoke_from=InvokeFrom.SERVICE_API,
|
||||
streaming=False,
|
||||
)
|
||||
|
||||
metrics = self._extract_workflow_metrics(response)
|
||||
eval_results.append(
|
||||
EvaluationItemResult(
|
||||
index=item.index,
|
||||
metrics=metrics,
|
||||
metadata={
|
||||
"workflow_response": _safe_serialize(response),
|
||||
},
|
||||
)
|
||||
)
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"Customized evaluator failed for item %d with workflow %s",
|
||||
item.index,
|
||||
workflow_id,
|
||||
)
|
||||
eval_results.append(EvaluationItemResult(index=item.index))
|
||||
|
||||
return eval_results
|
||||
|
||||
@staticmethod
|
||||
def _build_workflow_inputs(
|
||||
item: EvaluationItemInput,
|
||||
result: EvaluationItemResult | None,
|
||||
) -> dict[str, Any]:
|
||||
"""Build workflow input dict from evaluation data.
|
||||
|
||||
Maps evaluation data to conventional workflow input variable names:
|
||||
- ``actual_output``: The target's actual output (from ``result``).
|
||||
- ``expected_output``: The expected/reference output.
|
||||
- ``inputs``: The original evaluation inputs as a JSON string.
|
||||
- ``context``: All context strings joined by newlines.
|
||||
"""
|
||||
workflow_inputs: dict[str, Any] = {}
|
||||
|
||||
if result and result.actual_output:
|
||||
workflow_inputs["actual_output"] = result.actual_output
|
||||
|
||||
if item.expected_output:
|
||||
workflow_inputs["expected_output"] = item.expected_output
|
||||
|
||||
if item.inputs:
|
||||
workflow_inputs["inputs"] = json.dumps(item.inputs, ensure_ascii=False)
|
||||
|
||||
if item.context:
|
||||
workflow_inputs["context"] = "\n\n".join(item.context)
|
||||
|
||||
return workflow_inputs
|
||||
|
||||
@staticmethod
|
||||
def _extract_workflow_metrics(
|
||||
response: Mapping[str, Any],
|
||||
) -> list[EvaluationMetric]:
|
||||
"""Extract evaluation metrics from workflow output variables.
|
||||
|
||||
Each output variable is treated as a metric. The variable name
|
||||
becomes the metric name, and its value becomes the score.
|
||||
Non-numeric values are recorded with ``score=0.0`` and the raw
|
||||
value stored in ``details``.
|
||||
"""
|
||||
metrics: list[EvaluationMetric] = []
|
||||
|
||||
data = response.get("data", {})
|
||||
if not isinstance(data, Mapping):
|
||||
logger.warning("Unexpected workflow response format: missing 'data' dict")
|
||||
return metrics
|
||||
|
||||
outputs = data.get("outputs", {})
|
||||
if not isinstance(outputs, Mapping):
|
||||
logger.warning(
|
||||
"Unexpected workflow response format: 'outputs' is not a dict"
|
||||
)
|
||||
return metrics
|
||||
|
||||
for key, value in outputs.items():
|
||||
try:
|
||||
score = float(value)
|
||||
metrics.append(EvaluationMetric(name=key, score=score))
|
||||
except (TypeError, ValueError):
|
||||
metrics.append(
|
||||
EvaluationMetric(
|
||||
name=key, score=0.0, details={"raw_value": value}
|
||||
)
|
||||
)
|
||||
|
||||
return metrics
|
||||
|
||||
|
||||
def _safe_serialize(response: Mapping[str, Any]) -> dict[str, Any]:
|
||||
"""Safely serialize workflow response for metadata storage."""
|
||||
try:
|
||||
return dict(response)
|
||||
except Exception:
|
||||
return {"raw": str(response)}
|
||||
|
||||
@@ -6,7 +6,6 @@ from pydantic import BaseModel
|
||||
class EvaluationFrameworkEnum(StrEnum):
|
||||
RAGAS = "ragas"
|
||||
DEEPEVAL = "deepeval"
|
||||
CUSTOMIZED = "customized"
|
||||
NONE = "none"
|
||||
|
||||
|
||||
@@ -18,8 +17,3 @@ class BaseEvaluationConfig(BaseModel):
|
||||
class RagasConfig(BaseEvaluationConfig):
|
||||
"""RAGAS-specific configuration."""
|
||||
pass
|
||||
|
||||
|
||||
class CustomizedEvaluatorConfig(BaseEvaluationConfig):
|
||||
"""Configuration for the customized workflow-based evaluator."""
|
||||
pass
|
||||
|
||||
@@ -1,40 +1,112 @@
|
||||
"""Judgment condition entities for evaluation metric assessment.
|
||||
|
||||
Key concepts:
|
||||
- **value_source**: Where the comparison target comes from.
|
||||
- "constant": a literal value supplied by the user (e.g. threshold "0.8").
|
||||
- "variable": a named field from the evaluation target's runtime data
|
||||
(inputs, actual_output, expected_output). The ``value`` field holds the
|
||||
variable key; the actual comparison value is resolved at evaluation time.
|
||||
- **condition_type**: Determines operator semantics and type coercion.
|
||||
- "string": string operators (contains, is, start with, …).
|
||||
- "number": numeric operators (>, <, =, ≠, ≥, ≤).
|
||||
- "datetime": temporal operators (before, after).
|
||||
|
||||
Typical usage:
|
||||
judgment_config = JudgmentConfig(
|
||||
logical_operator="and",
|
||||
conditions=[
|
||||
JudgmentCondition(metric_name="faithfulness", comparison_operator=">", value="0.8"),
|
||||
JudgmentCondition(metric_name="answer_relevancy", comparison_operator="≥", value="0.7"),
|
||||
JudgmentCondition(
|
||||
metric_name="faithfulness",
|
||||
comparison_operator=">",
|
||||
value="0.8",
|
||||
condition_type="number",
|
||||
),
|
||||
JudgmentCondition(
|
||||
metric_name="output",
|
||||
comparison_operator="contains",
|
||||
value="expected_output",
|
||||
value_source="variable",
|
||||
condition_type="string",
|
||||
),
|
||||
],
|
||||
)
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
from enum import StrEnum
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from core.workflow.utils.condition.entities import SupportedComparisonOperator
|
||||
|
||||
class JudgmentValueSource(StrEnum):
|
||||
"""Where the comparison target value comes from."""
|
||||
|
||||
CONSTANT = "constant"
|
||||
VARIABLE = "variable"
|
||||
|
||||
|
||||
class JudgmentConditionType(StrEnum):
|
||||
"""Category of the condition, controls operator semantics and type coercion."""
|
||||
|
||||
STRING = "string"
|
||||
NUMBER = "number"
|
||||
DATETIME = "datetime"
|
||||
|
||||
|
||||
# Supported comparison operators for judgment conditions.
|
||||
JudgmentComparisonOperator = Literal[
|
||||
# string
|
||||
"contains",
|
||||
"not contains",
|
||||
"start with",
|
||||
"end with",
|
||||
"is",
|
||||
"is not",
|
||||
"empty",
|
||||
"not empty",
|
||||
"in",
|
||||
"not in",
|
||||
# number
|
||||
"=",
|
||||
"≠",
|
||||
">",
|
||||
"<",
|
||||
"≥",
|
||||
"≤",
|
||||
# datetime
|
||||
"before",
|
||||
"after",
|
||||
# universal
|
||||
"null",
|
||||
"not null",
|
||||
]
|
||||
|
||||
|
||||
class JudgmentCondition(BaseModel):
|
||||
"""A single judgment condition that checks one metric value.
|
||||
|
||||
Attributes:
|
||||
metric_name: The name of the evaluation metric to check
|
||||
(must match an EvaluationMetric.name in the results).
|
||||
comparison_operator: The comparison operator to apply
|
||||
(reuses the same operator set as workflow condition branches).
|
||||
value: The expected/threshold value to compare against.
|
||||
For numeric operators (>, <, =, etc.), this should be a numeric string.
|
||||
For string operators (contains, is, etc.), this should be a string.
|
||||
metric_name: The name of the evaluation metric to check (left side).
|
||||
Must match an EvaluationMetric.name in the results.
|
||||
comparison_operator: The comparison operator to apply.
|
||||
value: The comparison target (right side).
|
||||
- When value_source is "constant": the literal threshold/expected value.
|
||||
- When value_source is "variable": the variable key name to look up
|
||||
from the evaluation target's runtime data.
|
||||
For unary operators (empty, null, etc.), this can be None.
|
||||
value_source: Where the comparison value comes from.
|
||||
"constant" (default) for user-supplied literals,
|
||||
"variable" for references to evaluation target data.
|
||||
condition_type: Controls type coercion and which operators are valid.
|
||||
"string" (default), "number", or "datetime".
|
||||
"""
|
||||
|
||||
metric_name: str
|
||||
comparison_operator: SupportedComparisonOperator
|
||||
comparison_operator: JudgmentComparisonOperator
|
||||
value: str | Sequence[str] | None = None
|
||||
value_source: JudgmentValueSource = JudgmentValueSource.CONSTANT
|
||||
condition_type: JudgmentConditionType = JudgmentConditionType.STRING
|
||||
|
||||
|
||||
class JudgmentConfig(BaseModel):
|
||||
@@ -56,7 +128,7 @@ class JudgmentConditionResult(BaseModel):
|
||||
Attributes:
|
||||
metric_name: Which metric was checked.
|
||||
comparison_operator: The operator that was applied.
|
||||
expected_value: The threshold/expected value from the condition config.
|
||||
expected_value: The resolved comparison value (after variable resolution).
|
||||
actual_value: The actual metric value that was evaluated.
|
||||
passed: Whether this individual condition passed.
|
||||
error: Error message if the condition evaluation failed.
|
||||
|
||||
@@ -25,14 +25,6 @@ class EvaluationFrameworkConfigMap(collections.UserDict[str, dict[str, Any]]):
|
||||
}
|
||||
case EvaluationFrameworkEnum.DEEPEVAL:
|
||||
raise NotImplementedError("DeepEval adapter is not yet implemented.")
|
||||
case EvaluationFrameworkEnum.CUSTOMIZED:
|
||||
from core.evaluation.entities.config_entity import CustomizedEvaluatorConfig
|
||||
from core.evaluation.frameworks.customized.customized_evaluator import CustomizedEvaluator
|
||||
|
||||
return {
|
||||
"config_class": CustomizedEvaluatorConfig,
|
||||
"evaluator_class": CustomizedEvaluator,
|
||||
}
|
||||
case _:
|
||||
raise ValueError(f"Unknown evaluation framework: {framework}")
|
||||
|
||||
|
||||
@@ -1,267 +0,0 @@
|
||||
"""Customized workflow-based evaluator.
|
||||
|
||||
Uses a published workflow as the evaluation strategy. The target's actual output,
|
||||
expected output, original inputs, and context are passed as workflow inputs.
|
||||
The workflow's output variables are treated as evaluation metrics.
|
||||
|
||||
The evaluation workflow_id is provided per evaluation run via
|
||||
metrics_config["workflow_id"].
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from collections.abc import Mapping
|
||||
from typing import Any
|
||||
|
||||
from core.evaluation.base_evaluation_instance import BaseEvaluationInstance
|
||||
from core.evaluation.entities.config_entity import CustomizedEvaluatorConfig
|
||||
from core.evaluation.entities.evaluation_entity import (
|
||||
EvaluationCategory,
|
||||
EvaluationItemInput,
|
||||
EvaluationItemResult,
|
||||
EvaluationMetric,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CustomizedEvaluator(BaseEvaluationInstance):
|
||||
"""Evaluate using a published workflow."""
|
||||
|
||||
def __init__(self, config: CustomizedEvaluatorConfig):
|
||||
self.config = config
|
||||
|
||||
def evaluate_llm(
|
||||
self,
|
||||
items: list[EvaluationItemInput],
|
||||
metrics_config: dict,
|
||||
model_provider: str,
|
||||
model_name: str,
|
||||
tenant_id: str,
|
||||
) -> list[EvaluationItemResult]:
|
||||
return self._evaluate_with_workflow(items, metrics_config, tenant_id)
|
||||
|
||||
def evaluate_retrieval(
|
||||
self,
|
||||
items: list[EvaluationItemInput],
|
||||
metrics_config: dict,
|
||||
model_provider: str,
|
||||
model_name: str,
|
||||
tenant_id: str,
|
||||
) -> list[EvaluationItemResult]:
|
||||
return self._evaluate_with_workflow(items, metrics_config, tenant_id)
|
||||
|
||||
def evaluate_agent(
|
||||
self,
|
||||
items: list[EvaluationItemInput],
|
||||
metrics_config: dict,
|
||||
model_provider: str,
|
||||
model_name: str,
|
||||
tenant_id: str,
|
||||
) -> list[EvaluationItemResult]:
|
||||
return self._evaluate_with_workflow(items, metrics_config, tenant_id)
|
||||
|
||||
def evaluate_workflow(
|
||||
self,
|
||||
items: list[EvaluationItemInput],
|
||||
metrics_config: dict,
|
||||
model_provider: str,
|
||||
model_name: str,
|
||||
tenant_id: str,
|
||||
) -> list[EvaluationItemResult]:
|
||||
return self._evaluate_with_workflow(items, metrics_config, tenant_id)
|
||||
|
||||
def get_supported_metrics(self, category: EvaluationCategory) -> list[str]:
|
||||
"""Metrics are dynamic and defined by the evaluation workflow outputs.
|
||||
|
||||
Return an empty list since available metrics depend on the specific
|
||||
workflow chosen at runtime.
|
||||
"""
|
||||
return []
|
||||
|
||||
def _evaluate_with_workflow(
|
||||
self,
|
||||
items: list[EvaluationItemInput],
|
||||
metrics_config: dict,
|
||||
tenant_id: str,
|
||||
) -> list[EvaluationItemResult]:
|
||||
"""Run the evaluation workflow for each item and extract metric scores.
|
||||
|
||||
Args:
|
||||
items: Evaluation items with inputs, expected_output, and context
|
||||
(context typically contains the target's actual_output, merged
|
||||
by the Runner's evaluate_metrics method).
|
||||
metrics_config: Must contain "workflow_id" pointing to a published
|
||||
WORKFLOW-type App.
|
||||
tenant_id: Tenant scope for database and workflow execution.
|
||||
|
||||
Returns:
|
||||
List of EvaluationItemResult with metrics extracted from workflow outputs.
|
||||
|
||||
Raises:
|
||||
ValueError: If workflow_id is missing from metrics_config or the
|
||||
workflow/app cannot be found.
|
||||
"""
|
||||
workflow_id = metrics_config.get("workflow_id")
|
||||
if not workflow_id:
|
||||
raise ValueError(
|
||||
"metrics_config must contain 'workflow_id' for customized evaluator"
|
||||
)
|
||||
|
||||
app, workflow, service_account = self._load_workflow_resources(workflow_id, tenant_id)
|
||||
|
||||
results: list[EvaluationItemResult] = []
|
||||
for item in items:
|
||||
try:
|
||||
result = self._evaluate_single_item(app, workflow, service_account, item)
|
||||
results.append(result)
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"Customized evaluator failed for item %d with workflow %s",
|
||||
item.index,
|
||||
workflow_id,
|
||||
)
|
||||
results.append(EvaluationItemResult(index=item.index))
|
||||
return results
|
||||
|
||||
def _evaluate_single_item(
|
||||
self,
|
||||
app: Any,
|
||||
workflow: Any,
|
||||
service_account: Any,
|
||||
item: EvaluationItemInput,
|
||||
) -> EvaluationItemResult:
|
||||
"""Run the evaluation workflow for a single item.
|
||||
|
||||
Builds workflow inputs from the item data and executes the workflow
|
||||
in non-streaming mode. Extracts metrics from the workflow's output
|
||||
variables.
|
||||
"""
|
||||
from core.app.apps.workflow.app_generator import WorkflowAppGenerator
|
||||
from core.app.entities.app_invoke_entities import InvokeFrom
|
||||
|
||||
workflow_inputs = self._build_workflow_inputs(item)
|
||||
|
||||
generator = WorkflowAppGenerator()
|
||||
response: Mapping[str, Any] = generator.generate(
|
||||
app_model=app,
|
||||
workflow=workflow,
|
||||
user=service_account,
|
||||
args={"inputs": workflow_inputs},
|
||||
invoke_from=InvokeFrom.SERVICE_API,
|
||||
streaming=False,
|
||||
)
|
||||
|
||||
metrics = self._extract_metrics(response)
|
||||
return EvaluationItemResult(
|
||||
index=item.index,
|
||||
metrics=metrics,
|
||||
metadata={"workflow_response": self._safe_serialize(response)},
|
||||
)
|
||||
|
||||
def _load_workflow_resources(
|
||||
self, workflow_id: str, tenant_id: str
|
||||
) -> tuple[Any, Any, Any]:
|
||||
"""Load the evaluation workflow App, its published workflow, and a service account.
|
||||
|
||||
Args:
|
||||
workflow_id: The App ID of the evaluation workflow.
|
||||
tenant_id: Tenant scope.
|
||||
|
||||
Returns:
|
||||
Tuple of (app, workflow, service_account).
|
||||
|
||||
Raises:
|
||||
ValueError: If the app or published workflow cannot be found.
|
||||
"""
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from core.evaluation.runners import get_service_account_for_app
|
||||
from models.engine import db
|
||||
from models.model import App
|
||||
from services.workflow_service import WorkflowService
|
||||
|
||||
with Session(db.engine, expire_on_commit=False) as session, session.begin():
|
||||
app = session.query(App).filter_by(id=workflow_id, tenant_id=tenant_id).first()
|
||||
if not app:
|
||||
raise ValueError(
|
||||
f"Evaluation workflow app {workflow_id} not found in tenant {tenant_id}"
|
||||
)
|
||||
|
||||
service_account = get_service_account_for_app(session, workflow_id)
|
||||
|
||||
workflow_service = WorkflowService()
|
||||
published_workflow = workflow_service.get_published_workflow(app_model=app)
|
||||
if not published_workflow:
|
||||
raise ValueError(
|
||||
f"No published workflow found for evaluation app {workflow_id}"
|
||||
)
|
||||
|
||||
return app, published_workflow, service_account
|
||||
|
||||
@staticmethod
|
||||
def _build_workflow_inputs(item: EvaluationItemInput) -> dict[str, Any]:
|
||||
"""Build workflow input dict from an evaluation item.
|
||||
|
||||
Maps evaluation data to conventional workflow input variable names:
|
||||
- actual_output: The target's actual output (from context[0] if available)
|
||||
- expected_output: The expected/reference output
|
||||
- inputs: The original evaluation inputs as JSON string
|
||||
- context: All context strings joined by newlines
|
||||
|
||||
"""
|
||||
workflow_inputs: dict[str, Any] = {}
|
||||
|
||||
# The actual_output is typically the first element in context
|
||||
# (merged by the Runner's evaluate_metrics method)
|
||||
if item.context:
|
||||
workflow_inputs["actual_output"] = item.context[0] if len(item.context) == 1 else "\n\n".join(item.context)
|
||||
|
||||
if item.expected_output:
|
||||
workflow_inputs["expected_output"] = item.expected_output
|
||||
|
||||
if item.inputs:
|
||||
workflow_inputs["inputs"] = json.dumps(item.inputs, ensure_ascii=False)
|
||||
|
||||
if item.context and len(item.context) > 1:
|
||||
workflow_inputs["context"] = "\n\n".join(item.context)
|
||||
|
||||
return workflow_inputs
|
||||
|
||||
@staticmethod
|
||||
def _extract_metrics(response: Mapping[str, Any]) -> list[EvaluationMetric]:
|
||||
"""Extract evaluation metrics from workflow output variables.
|
||||
|
||||
Each output variable is treated as a metric.
|
||||
"""
|
||||
metrics: list[EvaluationMetric] = []
|
||||
|
||||
data = response.get("data", {})
|
||||
if not isinstance(data, Mapping):
|
||||
logger.warning("Unexpected workflow response format: missing 'data' dict")
|
||||
return metrics
|
||||
|
||||
outputs = data.get("outputs", {})
|
||||
if not isinstance(outputs, Mapping):
|
||||
logger.warning("Unexpected workflow response format: 'outputs' is not a dict")
|
||||
return metrics
|
||||
|
||||
for key, value in outputs.items():
|
||||
try:
|
||||
score = float(value)
|
||||
metrics.append(EvaluationMetric(name=key, score=score))
|
||||
except (TypeError, ValueError):
|
||||
metrics.append(
|
||||
EvaluationMetric(name=key, score=0.0, details={"raw_value": value})
|
||||
)
|
||||
|
||||
return metrics
|
||||
|
||||
@staticmethod
|
||||
def _safe_serialize(response: Mapping[str, Any]) -> dict[str, Any]:
|
||||
"""Safely serialize workflow response for metadata storage."""
|
||||
try:
|
||||
return dict(response)
|
||||
except Exception:
|
||||
return {"raw": str(response)}
|
||||
@@ -5,21 +5,47 @@ Reuses the core comparison engine from the workflow condition system
|
||||
(core.workflow.utils.condition.processor._evaluate_condition) to ensure
|
||||
consistent operator semantics across the platform.
|
||||
|
||||
The processor is intentionally decoupled from evaluation frameworks
|
||||
(RAGAS / Customized) and runners. It operates on plain ``dict`` mappings
|
||||
and can be invoked from any context.
|
||||
|
||||
Typical usage::
|
||||
|
||||
metrics = {"faithfulness": 0.85, "answer_relevancy": 0.6}
|
||||
variables = {"expected_output": "Hello World", "created_at": "2025-01-01T00:00:00"}
|
||||
config = JudgmentConfig(
|
||||
logical_operator="and",
|
||||
conditions=[
|
||||
JudgmentCondition(metric_name="faithfulness", comparison_operator=">",
|
||||
value="0.8", condition_type="number"),
|
||||
JudgmentCondition(metric_name="output", comparison_operator="contains",
|
||||
value="expected_output", value_source="variable",
|
||||
condition_type="string"),
|
||||
],
|
||||
)
|
||||
result = JudgmentProcessor.evaluate(metrics, config, variable_values=variables)
|
||||
"""
|
||||
|
||||
import logging
|
||||
from collections.abc import Sequence
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from core.evaluation.entities.judgment_entity import (
|
||||
JudgmentCondition,
|
||||
JudgmentConditionResult,
|
||||
JudgmentConditionType,
|
||||
JudgmentConfig,
|
||||
JudgmentResult,
|
||||
JudgmentValueSource,
|
||||
)
|
||||
from core.workflow.utils.condition.processor import _evaluate_condition
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Operators that do not need a comparison value (unary operators).
|
||||
_UNARY_OPERATORS = frozenset({"null", "not null", "empty", "not empty"})
|
||||
|
||||
|
||||
class JudgmentProcessor:
|
||||
|
||||
@@ -27,13 +53,17 @@ class JudgmentProcessor:
|
||||
def evaluate(
|
||||
metric_values: dict[str, Any],
|
||||
config: JudgmentConfig,
|
||||
variable_values: dict[str, Any] | None = None,
|
||||
) -> JudgmentResult:
|
||||
"""Evaluate all judgment conditions against the given metric values.
|
||||
|
||||
Args:
|
||||
metric_values: Mapping of metric name to its value
|
||||
(e.g. {"faithfulness": 0.85, "status": "success"}).
|
||||
metric_values: Mapping of metric name → metric value
|
||||
(e.g. ``{"faithfulness": 0.85, "status": "success"}``).
|
||||
config: The judgment configuration with logical_operator and conditions.
|
||||
variable_values: Optional mapping of variable name → value, used when
|
||||
a condition's ``value_source`` is ``"variable"``. Typically built
|
||||
from the evaluation target's inputs / outputs.
|
||||
|
||||
Returns:
|
||||
JudgmentResult with overall pass/fail and per-condition details.
|
||||
@@ -49,7 +79,7 @@ class JudgmentProcessor:
|
||||
|
||||
for condition in config.conditions:
|
||||
result = JudgmentProcessor._evaluate_single_condition(
|
||||
metric_values, condition
|
||||
metric_values, condition, variable_values
|
||||
)
|
||||
condition_results.append(result)
|
||||
|
||||
@@ -66,6 +96,7 @@ class JudgmentProcessor:
|
||||
condition_results=condition_results,
|
||||
)
|
||||
|
||||
# All conditions evaluated
|
||||
if config.logical_operator == "and":
|
||||
final_passed = all(r.passed for r in condition_results)
|
||||
else:
|
||||
@@ -81,31 +112,21 @@ class JudgmentProcessor:
|
||||
def _evaluate_single_condition(
|
||||
metric_values: dict[str, Any],
|
||||
condition: JudgmentCondition,
|
||||
variable_values: dict[str, Any] | None = None,
|
||||
) -> JudgmentConditionResult:
|
||||
"""Evaluate a single judgment condition against the metric values.
|
||||
"""Evaluate a single judgment condition.
|
||||
|
||||
Looks up the metric by name, then delegates to the workflow condition
|
||||
engine for the actual comparison.
|
||||
|
||||
Args:
|
||||
metric_values: Mapping of metric name to its value.
|
||||
condition: The condition to evaluate.
|
||||
|
||||
Returns:
|
||||
JudgmentConditionResult with pass/fail and details.
|
||||
Steps:
|
||||
1. Look up the metric value (left side) by ``metric_name``.
|
||||
2. Resolve the comparison value (right side) — either a constant
|
||||
or a variable reference.
|
||||
3. Dispatch to the correct type handler (string / number / datetime).
|
||||
"""
|
||||
metric_name = condition.metric_name
|
||||
actual_value = metric_values.get(metric_name)
|
||||
|
||||
# Handle metric not found
|
||||
if actual_value is None and condition.comparison_operator not in (
|
||||
"null",
|
||||
"not null",
|
||||
"empty",
|
||||
"not empty",
|
||||
"exists",
|
||||
"not exists",
|
||||
):
|
||||
# Handle metric not found — skip for unary operators that work on None
|
||||
if actual_value is None and condition.comparison_operator not in _UNARY_OPERATORS:
|
||||
return JudgmentConditionResult(
|
||||
metric_name=metric_name,
|
||||
comparison_operator=condition.comparison_operator,
|
||||
@@ -115,17 +136,44 @@ class JudgmentProcessor:
|
||||
error=f"Metric '{metric_name}' not found in evaluation results",
|
||||
)
|
||||
|
||||
# Resolve the comparison value (right side)
|
||||
try:
|
||||
passed = _evaluate_condition(
|
||||
operator=condition.comparison_operator,
|
||||
value=actual_value,
|
||||
expected=condition.value,
|
||||
resolved_value = JudgmentProcessor._resolve_comparison_value(
|
||||
condition, variable_values
|
||||
)
|
||||
except ValueError as e:
|
||||
return JudgmentConditionResult(
|
||||
metric_name=metric_name,
|
||||
comparison_operator=condition.comparison_operator,
|
||||
expected_value=condition.value,
|
||||
actual_value=actual_value,
|
||||
passed=False,
|
||||
error=str(e),
|
||||
)
|
||||
|
||||
# Dispatch to the appropriate type handler
|
||||
try:
|
||||
match condition.condition_type:
|
||||
case JudgmentConditionType.DATETIME:
|
||||
passed = _evaluate_datetime_condition(
|
||||
actual_value, condition.comparison_operator, resolved_value
|
||||
)
|
||||
case JudgmentConditionType.NUMBER:
|
||||
passed = _evaluate_number_condition(
|
||||
actual_value, condition.comparison_operator, resolved_value
|
||||
)
|
||||
case _: # STRING (default) — delegate to workflow engine
|
||||
passed = _evaluate_condition(
|
||||
operator=condition.comparison_operator,
|
||||
value=actual_value,
|
||||
expected=resolved_value,
|
||||
)
|
||||
|
||||
return JudgmentConditionResult(
|
||||
metric_name=metric_name,
|
||||
comparison_operator=condition.comparison_operator,
|
||||
expected_value=resolved_value,
|
||||
actual_value=actual_value,
|
||||
passed=passed,
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -137,8 +185,184 @@ class JudgmentProcessor:
|
||||
return JudgmentConditionResult(
|
||||
metric_name=metric_name,
|
||||
comparison_operator=condition.comparison_operator,
|
||||
expected_value=condition.value,
|
||||
expected_value=resolved_value,
|
||||
actual_value=actual_value,
|
||||
passed=False,
|
||||
error=str(e),
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _resolve_comparison_value(
|
||||
condition: JudgmentCondition,
|
||||
variable_values: dict[str, Any] | None,
|
||||
) -> str | Sequence[str] | None:
|
||||
"""Resolve the right-side comparison value.
|
||||
|
||||
For ``value_source == "constant"``, returns ``condition.value`` as-is.
|
||||
For ``value_source == "variable"``, looks up ``condition.value`` (as a key)
|
||||
in ``variable_values`` and returns the resolved value (converted to string
|
||||
for compatibility with the comparison engine).
|
||||
|
||||
Raises:
|
||||
ValueError: If the variable cannot be resolved.
|
||||
"""
|
||||
if condition.value_source == JudgmentValueSource.CONSTANT:
|
||||
return condition.value
|
||||
|
||||
# Variable resolution
|
||||
if condition.value is None:
|
||||
raise ValueError("Variable name (value) must be provided when value_source is 'variable'")
|
||||
|
||||
if not variable_values:
|
||||
raise ValueError(
|
||||
f"Cannot resolve variable '{condition.value}': no variable values provided"
|
||||
)
|
||||
|
||||
var_key = condition.value if isinstance(condition.value, str) else str(condition.value)
|
||||
if var_key not in variable_values:
|
||||
raise ValueError(
|
||||
f"Variable '{var_key}' not found in evaluation target data. "
|
||||
f"Available variables: {list(variable_values.keys())}"
|
||||
)
|
||||
|
||||
resolved = variable_values[var_key]
|
||||
# Convert to string for the comparison engine, unless it's already
|
||||
# a str/Sequence[str]/None which the engine expects.
|
||||
if resolved is None:
|
||||
return None
|
||||
if isinstance(resolved, str):
|
||||
return resolved
|
||||
if isinstance(resolved, Sequence) and all(isinstance(v, str) for v in resolved):
|
||||
return resolved
|
||||
return str(resolved)
|
||||
|
||||
|
||||
_DATETIME_FORMATS = [
|
||||
"%Y-%m-%dT%H:%M:%S",
|
||||
"%Y-%m-%dT%H:%M:%S.%f",
|
||||
"%Y-%m-%dT%H:%M:%SZ",
|
||||
"%Y-%m-%dT%H:%M:%S.%fZ",
|
||||
"%Y-%m-%dT%H:%M:%S%z",
|
||||
"%Y-%m-%d %H:%M:%S",
|
||||
"%Y-%m-%d",
|
||||
]
|
||||
|
||||
|
||||
def _parse_datetime(value: object) -> datetime:
|
||||
"""Parse a value into a datetime object.
|
||||
|
||||
Accepts datetime instances, numeric timestamps (int/float), and common
|
||||
ISO 8601 string formats.
|
||||
|
||||
Raises:
|
||||
ValueError: If the value cannot be parsed as a datetime.
|
||||
"""
|
||||
if isinstance(value, datetime):
|
||||
return value
|
||||
if isinstance(value, (int, float)):
|
||||
return datetime.fromtimestamp(value)
|
||||
if not isinstance(value, str):
|
||||
raise ValueError(f"Cannot parse '{value}' (type={type(value).__name__}) as datetime")
|
||||
|
||||
for fmt in _DATETIME_FORMATS:
|
||||
try:
|
||||
return datetime.strptime(value, fmt)
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
raise ValueError(
|
||||
f"Cannot parse datetime string '{value}'. "
|
||||
f"Supported formats: ISO 8601, 'YYYY-MM-DD HH:MM:SS', 'YYYY-MM-DD', or numeric timestamp."
|
||||
)
|
||||
|
||||
|
||||
def _evaluate_datetime_condition(
    actual: object,
    operator: str,
    expected: object,
) -> bool:
    """Evaluate a datetime comparison between *actual* and *expected*.

    Supports the universal unary operators (``null`` / ``not null`` /
    ``empty`` / ``not empty``), the word operators ``before`` / ``after`` /
    ``is`` / ``is not``, and the symbolic operators ``=`` / ``≠`` / ``>`` /
    ``<`` / ``≥`` / ``≤``.

    Args:
        actual: The actual metric value (left side).
        operator: The comparison operator.
        expected: The expected/threshold value (right side).

    Returns:
        True when the condition holds.

    Raises:
        ValueError: If a value cannot be parsed as a datetime, the expected
            value is missing, or the operator is unknown.
    """
    # Unary operators never need a parsed datetime.
    if operator == "null":
        return actual is None
    if operator == "not null":
        return actual is not None
    if operator == "empty":
        return not actual
    if operator == "not empty":
        return bool(actual)

    # A missing left side can never satisfy a binary comparison.
    if actual is None:
        return False

    left = _parse_datetime(actual)
    right = _parse_datetime(expected) if expected is not None else None

    if right is None:
        raise ValueError(f"Expected datetime value is required for operator '{operator}'")

    if operator in ("before", "<"):
        return left < right
    if operator in ("after", ">"):
        return left > right
    if operator in ("=", "is"):
        return left == right
    if operator in ("≠", "is not"):
        return left != right
    if operator == "≥":
        return left >= right
    if operator == "≤":
        return left <= right
    raise ValueError(f"Unsupported datetime operator: '{operator}'")
|
||||
|
||||
def _evaluate_number_condition(
    actual: object,
    operator: str,
    expected: object,
) -> bool:
    """Evaluate a numeric comparison between *actual* and *expected*.

    Coerces both sides into the forms the workflow condition engine expects
    (numeric left side, string right side) before delegating, which avoids
    string-vs-number comparison pitfalls such as comparing a float metric
    0.85 against a string threshold "0.8".

    For unary operators (null, not null, empty, not empty) the values are
    delegated without any coercion.

    Raises:
        ValueError: If the actual value cannot be converted to a number.
    """
    # Unary operators need no numeric coercion at all.
    if operator in _UNARY_OPERATORS:
        return _evaluate_condition(operator=operator, value=actual, expected=expected)

    # A missing left side can never satisfy a binary comparison.
    if actual is None:
        return False

    # Left side: the engine needs a real number.
    if not isinstance(actual, (int, float)):
        try:
            actual = float(actual)
        except (TypeError, ValueError) as exc:
            raise ValueError(f"Cannot convert actual value '{actual}' to number") from exc

    # Right side: the engine's _normalize_numeric_values handles str → float,
    # so hand it a string representation.
    if expected is not None and not isinstance(expected, str):
        expected = str(expected)

    return _evaluate_condition(operator=operator, value=actual, expected=expected)
|
||||
|
||||
@@ -1,6 +1,17 @@
|
||||
"""Base evaluation runner.
|
||||
|
||||
Orchestrates the evaluation lifecycle in four phases:
|
||||
1. execute_target — run the target and collect actual outputs (abstract)
|
||||
2. evaluate_metrics — compute metrics via framework or customized workflow
|
||||
3. apply_judgment — evaluate pass/fail judgment conditions on metrics
|
||||
4. persist — save results to the database
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
@@ -21,14 +32,16 @@ class BaseEvaluationRunner(ABC):
|
||||
"""Abstract base class for evaluation runners.
|
||||
|
||||
Runners are responsible for executing the target (App/Snippet/Retrieval)
|
||||
to collect actual outputs, then delegating to the evaluation instance
|
||||
for metric computation, and optionally applying judgment conditions.
|
||||
to collect actual outputs, then computing evaluation metrics, optionally
|
||||
applying judgment conditions, and persisting results.
|
||||
|
||||
Execution phases:
|
||||
1. execute_target — run the target and collect actual outputs
|
||||
2. evaluate_metrics — compute evaluation metrics via the framework
|
||||
3. apply_judgment — evaluate pass/fail judgment conditions on metrics
|
||||
4. persist — save results to the database
|
||||
Built-in capabilities (implemented in this base class):
|
||||
- Customized workflow dispatch (``_evaluate_customized``)
|
||||
- Judgment condition evaluation (``_apply_judgment``)
|
||||
|
||||
Subclass responsibilities:
|
||||
- ``execute_target`` — target-specific execution logic
|
||||
- ``evaluate_metrics`` — framework-specific metric computation (RAGAS etc.)
|
||||
"""
|
||||
|
||||
def __init__(self, evaluation_instance: BaseEvaluationInstance, session: Session):
|
||||
@@ -56,7 +69,18 @@ class BaseEvaluationRunner(ABC):
|
||||
model_name: str,
|
||||
tenant_id: str,
|
||||
) -> list[EvaluationItemResult]:
|
||||
"""Compute evaluation metrics on the collected results."""
|
||||
"""Compute evaluation metrics on the collected results.
|
||||
|
||||
Called only when the evaluation is NOT using a customized workflow
|
||||
(i.e. ``metrics_config`` does not contain ``workflow_id``).
|
||||
|
||||
Implementations should:
|
||||
1. Merge ``actual_output`` from ``results`` into the ``context``
|
||||
field of each ``EvaluationItemInput``.
|
||||
2. Call ``self.evaluation_instance.evaluate_xxx()`` with the
|
||||
merged items.
|
||||
3. Return updated results with metrics populated.
|
||||
"""
|
||||
...
|
||||
|
||||
def run(
|
||||
@@ -71,9 +95,7 @@ class BaseEvaluationRunner(ABC):
|
||||
model_name: str,
|
||||
judgment_config: JudgmentConfig | None = None,
|
||||
) -> list[EvaluationItemResult]:
|
||||
"""Orchestrate target execution + metric evaluation + judgment for all items.
|
||||
|
||||
"""
|
||||
"""Orchestrate target execution + metric evaluation + judgment for all items."""
|
||||
evaluation_run = self.session.query(EvaluationRun).filter_by(id=evaluation_run_id).first()
|
||||
if not evaluation_run:
|
||||
raise ValueError(f"EvaluationRun {evaluation_run_id} not found")
|
||||
@@ -108,9 +130,18 @@ class BaseEvaluationRunner(ABC):
|
||||
|
||||
if successful_items and successful_results:
|
||||
try:
|
||||
evaluated_results = self.evaluate_metrics(
|
||||
successful_items, successful_results, metrics_config, model_provider, model_name, tenant_id
|
||||
)
|
||||
if _is_customized_evaluation(metrics_config):
|
||||
# Customized workflow evaluation — target-type agnostic,
|
||||
# handled via BaseEvaluationInstance.evaluate_with_customized_workflow().
|
||||
evaluated_results = self._evaluate_customized(
|
||||
successful_items, successful_results, metrics_config, tenant_id,
|
||||
)
|
||||
else:
|
||||
# Framework-specific evaluation — delegate to subclass
|
||||
evaluated_results = self.evaluate_metrics(
|
||||
successful_items, successful_results, metrics_config,
|
||||
model_provider, model_name, tenant_id,
|
||||
)
|
||||
# Merge evaluated metrics back into results
|
||||
evaluated_by_index = {r.index: r for r in evaluated_results}
|
||||
for i, result in enumerate(results):
|
||||
@@ -121,7 +152,7 @@ class BaseEvaluationRunner(ABC):
|
||||
|
||||
# Phase 3: Apply judgment conditions on metrics
|
||||
if judgment_config and judgment_config.conditions:
|
||||
results = self._apply_judgment(results, judgment_config)
|
||||
results = self._apply_judgment(results, items, judgment_config)
|
||||
|
||||
# Phase 4: Persist individual items
|
||||
for result in results:
|
||||
@@ -145,27 +176,101 @@ class BaseEvaluationRunner(ABC):
|
||||
|
||||
return results
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Customized workflow evaluation dispatch
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _evaluate_customized(
    self,
    items: list[EvaluationItemInput],
    results: list[EvaluationItemResult],
    metrics_config: dict,
    tenant_id: str,
) -> list[EvaluationItemResult]:
    """Run the customized-workflow evaluator and merge its metrics back.

    The instance's ``evaluate_with_customized_workflow()`` reads
    ``actual_output`` straight from each ``EvaluationItemResult``, so —
    unlike the framework path, which merges ``actual_output`` into
    ``context`` — ``results`` are passed through unmodified. The evaluator's
    metrics, metadata, and error are then merged onto the Phase 1 results,
    preserving each item's original ``actual_output`` and metadata.
    """
    evaluated = self.evaluation_instance.evaluate_with_customized_workflow(
        items, results, metrics_config, tenant_id,
    )

    by_index = {entry.index: entry for entry in evaluated}

    merged: list[EvaluationItemResult] = []
    for original in results:
        scored = by_index.get(original.index)
        if scored is None:
            # No evaluator output for this item — keep the Phase 1 result.
            merged.append(original)
            continue
        merged.append(
            EvaluationItemResult(
                index=original.index,
                actual_output=original.actual_output,
                metrics=scored.metrics,
                # Evaluator metadata wins on key collisions, matching the
                # right-hand-side precedence of dict unpacking.
                metadata={**original.metadata, **scored.metadata},
                error=scored.error,
            )
        )
    return merged
|
||||
# ------------------------------------------------------------------
|
||||
# Judgment (target-type agnostic)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
@staticmethod
def _apply_judgment(
    results: list[EvaluationItemResult],
    items: list[EvaluationItemInput],
    judgment_config: JudgmentConfig,
) -> list[EvaluationItemResult]:
    """Apply judgment conditions to each result's metrics.

    Builds a metric_name → value mapping from each result's metrics, and a
    variable_values dict from the evaluation target's runtime data (inputs,
    actual_output, expected_output, context) for variable-type conditions,
    then delegates to ``JudgmentProcessor.evaluate``. Results with errors
    or without metrics are passed through unjudged.

    Args:
        results: Per-item results (with metrics populated).
        items: The original evaluation inputs, matched to results by index.
        judgment_config: The judgment conditions to evaluate.

    Returns:
        New results with the ``judgment`` field populated where applicable.
    """
    items_by_index = {item.index: item for item in items}
    judged_results: list[EvaluationItemResult] = []

    for result in results:
        # Failed items (or items with no metrics) cannot be judged — keep as-is.
        if result.error is not None or not result.metrics:
            judged_results.append(result)
            continue

        metric_values: dict[str, object] = {m.name: m.score for m in result.metrics}

        # Build variable pool from the evaluation target's runtime data.
        # These variables can be referenced in conditions with value_source="variable".
        item_input = items_by_index.get(result.index)
        variable_values: dict[str, object] = {}
        if item_input:
            variable_values.update(item_input.inputs)
            if item_input.expected_output is not None:
                variable_values["expected_output"] = item_input.expected_output
            if item_input.context:
                variable_values["context"] = "; ".join(item_input.context)
        # actual_output comes from the result, not the input item, so it is
        # exposed even when the input item cannot be found by index.
        if result.actual_output is not None:
            variable_values["actual_output"] = result.actual_output

        judgment_result = JudgmentProcessor.evaluate(
            metric_values, judgment_config, variable_values=variable_values
        )

        judged_results.append(
            result.model_copy(update={"judgment": judgment_result})
        )
    return judged_results
|
||||
|
||||
|
||||
def _is_customized_evaluation(metrics_config: dict[str, Any]) -> bool:
|
||||
"""Check if metrics_config indicates a customized workflow evaluation.
|
||||
|
||||
The convention is that ``metrics_config["workflow_id"]`` is present
|
||||
when a user-defined workflow should be used for evaluation.
|
||||
"""
|
||||
return bool(metrics_config.get("workflow_id"))
|
||||
|
||||
Reference in New Issue
Block a user