feat: Implement customized evaluation in BaseEvaluationInstance.
@@ -21,7 +21,7 @@ class BaseEvaluationInstance(ABC):
     def evaluate_llm(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -33,7 +33,7 @@ class BaseEvaluationInstance(ABC):
     def evaluate_retrieval(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -45,7 +45,7 @@ class BaseEvaluationInstance(ABC):
     def evaluate_agent(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -57,7 +57,7 @@ class BaseEvaluationInstance(ABC):
     def evaluate_workflow(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -74,7 +74,7 @@ class BaseEvaluationInstance(ABC):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        customized_metrics: dict[str, Any],
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
         """Evaluate using a published workflow as the evaluator.
@@ -86,8 +86,8 @@ class BaseEvaluationInstance(ABC):
         Args:
             items: Evaluation items with inputs, expected_output, context.
             results: Results from Phase 1 (with actual_output populated).
-            metrics_config: Must contain ``workflow_id`` pointing to a
-                published WORKFLOW-type App.
+            customized_metrics: Must contain ``evaluation_workflow_id``
+                pointing to a published WORKFLOW-type App.
             tenant_id: Tenant scope.

         Returns:
@@ -103,10 +103,10 @@ class BaseEvaluationInstance(ABC):
         from models.model import App
         from services.workflow_service import WorkflowService

-        workflow_id = metrics_config.get("workflow_id")
+        workflow_id = customized_metrics.get("evaluation_workflow_id")
         if not workflow_id:
             raise ValueError(
-                "metrics_config must contain 'workflow_id' for customized evaluator"
+                "customized_metrics must contain 'evaluation_workflow_id' for customized evaluator"
             )

         # Load the evaluator workflow resources using a dedicated session
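To make the new contract concrete, here is a minimal usage sketch; it is not part of the diff, and the variable names and item/result field names are assumptions based on the docstring above:

    # Hypothetical caller of the customized path (names illustrative).
    customized_metrics = {"evaluation_workflow_id": published_workflow_app_id}
    evaluated = instance.evaluate_with_customized_workflow(
        items=items,              # inputs, expected_output, context
        results=phase1_results,   # actual_output populated in Phase 1
        customized_metrics=customized_metrics,
        tenant_id=tenant_id,
    )
    # Passing a dict without "evaluation_workflow_id" raises:
    # ValueError: customized_metrics must contain 'evaluation_workflow_id' for customized evaluator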
@@ -42,51 +42,51 @@ class RagasEvaluator(BaseEvaluationInstance):
     def evaluate_llm(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
-        return self._evaluate(items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.LLM)
+        return self._evaluate(items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.LLM)

     def evaluate_retrieval(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
         return self._evaluate(
-            items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.RETRIEVAL
+            items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.RETRIEVAL
         )

     def evaluate_agent(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
-        return self._evaluate(items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.AGENT)
+        return self._evaluate(items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.AGENT)

     def evaluate_workflow(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
         return self._evaluate(
-            items, metrics_config, model_provider, model_name, tenant_id, EvaluationCategory.WORKFLOW
+            items, default_metrics, model_provider, model_name, tenant_id, EvaluationCategory.WORKFLOW
         )

     def _evaluate(
         self,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -98,7 +98,12 @@ class RagasEvaluator(BaseEvaluationInstance):
         string similarity if RAGAS import fails.
         """
         model_wrapper = DifyModelWrapper(model_provider, model_name, tenant_id)
-        requested_metrics = metrics_config.get("metrics", self.get_supported_metrics(category))
+        # Extract metric names from default_metrics list; each item has a "metric" key.
+        requested_metrics = (
+            [m["metric"] for m in default_metrics if "metric" in m]
+            if default_metrics
+            else self.get_supported_metrics(category)
+        )

         try:
             return self._evaluate_with_ragas(items, requested_metrics, model_wrapper, category)
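The `default_metrics` list replaces the old `metrics_config["metrics"]` lookup; each entry is a dict carrying at least a "metric" key. A small illustration with made-up metric entries:

    default_metrics = [
        {"metric": "faithfulness", "weight": 0.5},
        {"metric": "response_relevancy"},
        {"threshold": 0.7},  # no "metric" key, so the comprehension skips it
    ]
    requested_metrics = [m["metric"] for m in default_metrics if "metric" in m]
    # -> ["faithfulness", "response_relevancy"]
    # An empty default_metrics falls back to self.get_supported_metrics(category).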
@@ -116,11 +121,6 @@ class RagasEvaluator(BaseEvaluationInstance):
         """Evaluate using RAGAS library."""
         from ragas import evaluate as ragas_evaluate
         from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
         from ragas.llms import LangchainLLMWrapper
         from ragas.metrics import (
             Faithfulness,
             ResponseRelevancy,
         )

         # Build RAGAS dataset
         samples = []
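For orientation, a sketch of where those imports lead, assuming the ragas 0.2+ single-turn API; the sample values are invented, and the exact wiring inside `_evaluate_with_ragas` may differ:

    # `model_wrapper` is the DifyModelWrapper built above, assumed here to be
    # LangChain-compatible so LangchainLLMWrapper can adapt it for RAGAS.
    evaluator_llm = LangchainLLMWrapper(model_wrapper)
    sample = SingleTurnSample(
        user_input="What is Dify?",                                       # item input
        response="Dify is an LLM app platform.",                          # actual_output
        retrieved_contexts=["Dify is an open-source LLM app platform."],  # context
        reference="Dify is an open-source LLM application platform.",     # expected_output
    )
    dataset = EvaluationDataset(samples=[sample])
    scores = ragas_evaluate(
        dataset=dataset,
        metrics=[Faithfulness(), ResponseRelevancy()],
        llm=evaluator_llm,
    )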
@@ -1,5 +1,6 @@
 import logging
-from typing import Any, Mapping, Union
+from collections.abc import Mapping
+from typing import Any

 from sqlalchemy.orm import Session

@@ -9,7 +10,7 @@ from core.evaluation.entities.evaluation_entity import (
     EvaluationItemResult,
 )
 from core.evaluation.runners.base_evaluation_runner import BaseEvaluationRunner
-from models.model import App, AppMode
+from models.model import App

 logger = logging.getLogger(__name__)

@@ -29,8 +30,8 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
     ) -> EvaluationItemResult:
         """Execute agent app and collect response with tool call information."""
         from core.app.apps.agent_chat.app_generator import AgentChatAppGenerator
-        from core.evaluation.runners import get_service_account_for_app
+        from core.app.entities.app_invoke_entities import InvokeFrom
+        from core.evaluation.runners import get_service_account_for_app

         app = self.session.query(App).filter_by(id=target_id).first()
         if not app:
@@ -67,7 +68,7 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -90,7 +91,7 @@ class AgentEvaluationRunner(BaseEvaluationRunner):
         )

         evaluated = self.evaluation_instance.evaluate_agent(
-            merged_items, metrics_config, model_provider, model_name, tenant_id
+            merged_items, default_metrics, model_provider, model_name, tenant_id
         )

         # Merge metrics back preserving metadata
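The `merged_items` passed to `evaluate_agent` come from a `_merge_results_into_items` helper (referenced by name in the LLM runner below; its body is outside this diff). Conceptually it copies each Phase 1 `actual_output` onto the matching input item, roughly like this sketch with assumed field names:

    def _merge_results_into_items(items, results):
        # Pair each input with its Phase 1 result and attach actual_output,
        # so evaluators see inputs, expected_output, and actual_output together.
        return [
            item.model_copy(update={"actual_output": result.actual_output})
            for item, result in zip(items, results)
        ]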
@@ -51,7 +51,7 @@ class BaseEvaluationRunner(ABC):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -66,9 +66,10 @@ class BaseEvaluationRunner(ABC):
         target_id: str,
         target_type: str,
         items: list[EvaluationItemInput],
-        metrics_config: dict,
-        model_provider: str,
-        model_name: str,
+        default_metrics: list[dict[str, Any]],
+        customized_metrics: dict[str, Any] | None = None,
+        model_provider: str = "",
+        model_name: str = "",
         judgment_config: JudgmentConfig | None = None,
     ) -> list[EvaluationItemResult]:
         """Orchestrate target execution + metric evaluation + judgment for all items."""
@@ -106,13 +107,15 @@ class BaseEvaluationRunner(ABC):

         if successful_items and successful_results:
             try:
-                if _is_customized_evaluation(metrics_config):
+                if customized_metrics is not None:
                     # Customized workflow evaluation — target-type agnostic
                     evaluated_results = self._evaluate_customized(
-                        successful_items, successful_results, metrics_config, tenant_id,
+                        successful_items, successful_results, customized_metrics, tenant_id,
                     )
                 else:
                     # Framework-specific evaluation — delegate to subclass
                     evaluated_results = self.evaluate_metrics(
-                        successful_items, successful_results, metrics_config,
+                        successful_items, successful_results, default_metrics,
                         model_provider, model_name, tenant_id,
                     )
                 # Merge evaluated metrics back into results
@@ -153,12 +156,18 @@ class BaseEvaluationRunner(ABC):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        customized_metrics: dict[str, Any],
         tenant_id: str,
     ) -> list[EvaluationItemResult]:
-        """Delegate to the instance's customized workflow evaluator."""
+        """Delegate to the instance's customized workflow evaluator.
+
+        Unlike the framework path (which merges ``actual_output`` into
+        ``context``), here we pass ``results`` directly — the instance's
+        ``evaluate_with_customized_workflow()`` reads ``actual_output``
+        from each ``EvaluationItemResult``.
+        """
         evaluated = self.evaluation_instance.evaluate_with_customized_workflow(
-            items, results, metrics_config, tenant_id,
+            items, results, customized_metrics, tenant_id,
         )

         # Merge metrics back preserving actual_output and metadata from Phase 1
@@ -180,7 +189,6 @@ class BaseEvaluationRunner(ABC):
             final_results.append(result)
         return final_results

-
     @staticmethod
     def _apply_judgment(
         results: list[EvaluationItemResult],
@@ -225,8 +233,3 @@ class BaseEvaluationRunner(ABC):
                 result.model_copy(update={"judgment": judgment_result})
             )
         return judged_results
-
-
-def _is_customized_evaluation(metrics_config: dict[str, Any]) -> bool:
-    """Check if metrics_config indicates a customized workflow evaluation."""
-    return bool(metrics_config.get("workflow_id"))
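Taken together, `run()` now dispatches on the presence of `customized_metrics` instead of sniffing `metrics_config` for a `workflow_id` (the deleted module-level `_is_customized_evaluation()` helper above). A hedged caller sketch; the provider/model values are placeholders and parts of the signature (such as `tenant_id` handling) sit outside this diff:

    # Framework path: default_metrics drives the built-in evaluator.
    runner.run(
        target_id=app_id,
        target_type="agent",
        items=items,
        default_metrics=[{"metric": "faithfulness"}],
        model_provider="openai",   # placeholder; now optional (defaults to "")
        model_name="gpt-4o",       # placeholder
    )

    # Customized path: any non-None dict routes to the workflow evaluator.
    runner.run(
        target_id=app_id,
        target_type="agent",
        items=items,
        default_metrics=[],
        customized_metrics={"evaluation_workflow_id": workflow_app_id},
    )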
@@ -1,5 +1,6 @@
 import logging
-from typing import Any, Mapping, Union
+from collections.abc import Mapping
+from typing import Any, Union

 from sqlalchemy.orm import Session

@@ -30,8 +31,8 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
         """Execute the App/Snippet with the given inputs and collect the response."""
         from core.app.apps.completion.app_generator import CompletionAppGenerator
         from core.app.apps.workflow.app_generator import WorkflowAppGenerator
-        from core.evaluation.runners import get_service_account_for_app
+        from core.app.entities.app_invoke_entities import InvokeFrom
+        from core.evaluation.runners import get_service_account_for_app
         from services.workflow_service import WorkflowService

         app = self.session.query(App).filter_by(id=target_id).first()
@@ -89,7 +90,7 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -98,7 +99,7 @@ class LLMEvaluationRunner(BaseEvaluationRunner):
         # Merge actual_output into items for evaluation
         merged_items = self._merge_results_into_items(items, results)
         return self.evaluation_instance.evaluate_llm(
-            merged_items, metrics_config, model_provider, model_name, tenant_id
+            merged_items, default_metrics, model_provider, model_name, tenant_id
         )

     @staticmethod

@@ -58,7 +58,7 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -80,7 +80,7 @@ class RetrievalEvaluationRunner(BaseEvaluationRunner):
         )

         evaluated = self.evaluation_instance.evaluate_retrieval(
-            merged_items, metrics_config, model_provider, model_name, tenant_id
+            merged_items, default_metrics, model_provider, model_name, tenant_id
         )

         # Merge metrics back into original results (preserve actual_output and metadata)
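The merge-back step noted in the comment presumably mirrors the `model_copy` pattern used by `_apply_judgment` above; a sketch, with the `metrics` field name assumed:

    # Keep Phase 1 actual_output/metadata, overwrite only the metric scores.
    final_results = [
        original.model_copy(update={"metrics": scored.metrics})
        for original, scored in zip(results, evaluated)
    ]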
@@ -1,5 +1,6 @@
 import logging
-from typing import Any, Mapping
+from collections.abc import Mapping
+from typing import Any

 from sqlalchemy.orm import Session

@@ -29,8 +30,8 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
     ) -> EvaluationItemResult:
         """Execute workflow and collect outputs."""
         from core.app.apps.workflow.app_generator import WorkflowAppGenerator
-        from core.evaluation.runners import get_service_account_for_app
+        from core.app.entities.app_invoke_entities import InvokeFrom
+        from core.evaluation.runners import get_service_account_for_app
         from services.workflow_service import WorkflowService

         app = self.session.query(App).filter_by(id=target_id).first()
@@ -68,7 +69,7 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
         self,
         items: list[EvaluationItemInput],
         results: list[EvaluationItemResult],
-        metrics_config: dict,
+        default_metrics: list[dict[str, Any]],
         model_provider: str,
         model_name: str,
         tenant_id: str,
@@ -91,7 +92,7 @@ class WorkflowEvaluationRunner(BaseEvaluationRunner):
         )

         evaluated = self.evaluation_instance.evaluate_workflow(
-            merged_items, metrics_config, model_provider, model_name, tenant_id
+            merged_items, default_metrics, model_provider, model_name, tenant_id
         )

         # Merge metrics back preserving metadata
@@ -1,8 +1,6 @@
 import io
 import json
 import logging
-from configs import dify_config
-from models.model import UploadFile
 from typing import Any

 from celery import shared_task
@@ -10,6 +8,7 @@ from openpyxl import Workbook
 from openpyxl.styles import Alignment, Border, Font, PatternFill, Side
 from openpyxl.utils import get_column_letter

+from configs import dify_config
 from core.evaluation.entities.evaluation_entity import (
     EvaluationCategory,
     EvaluationItemResult,
@@ -23,6 +22,7 @@ from core.evaluation.runners.workflow_evaluation_runner import WorkflowEvaluatio
 from extensions.ext_database import db
 from libs.datetime_utils import naive_utc_now
 from models.evaluation import EvaluationRun, EvaluationRunStatus
+from models.model import UploadFile

 logger = logging.getLogger(__name__)

@@ -86,6 +86,7 @@ def _execute_evaluation(session: Any, run_data: EvaluationRunData) -> None:
+            customized_metrics=run_data.customized_metrics,
             model_provider=run_data.evaluation_model_provider,
             model_name=run_data.evaluation_model,
             judgment_config=run_data.judgment_config,
         )

         # Compute summary metrics
@@ -210,7 +211,13 @@ def _generate_result_xlsx(
             input_keys.append(key)

     # Build headers
-    headers = ["index"] + input_keys + ["expected_output", "actual_output"] + all_metric_names + ["overall_score", "error"]
+    headers = (
+        ["index"]
+        + input_keys
+        + ["expected_output", "actual_output"]
+        + all_metric_names
+        + ["overall_score", "error"]
+    )

     # Write header row
     for col_idx, header in enumerate(headers, start=1):
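The reflowed `headers` expression is purely cosmetic; with illustrative inputs it builds the same flat list as before:

    input_keys = ["query", "top_k"]        # example dataset input columns
    all_metric_names = ["faithfulness"]    # example metric column
    headers = (
        ["index"]
        + input_keys
        + ["expected_output", "actual_output"]
        + all_metric_names
        + ["overall_score", "error"]
    )
    # -> ["index", "query", "top_k", "expected_output", "actual_output",
    #     "faithfulness", "overall_score", "error"]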