# dify/api/models/evaluation.py

from __future__ import annotations

import json
from datetime import datetime
from enum import StrEnum
from typing import Any

import sqlalchemy as sa
from sqlalchemy import DateTime, Float, Integer, String, Text, func
from sqlalchemy.orm import Mapped, mapped_column

from libs.uuid_utils import uuidv7

from .base import Base
from .types import LongText, StringUUID


class EvaluationRunStatus(StrEnum):
    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"


class EvaluationTargetType(StrEnum):
    APP = "app"
    SNIPPETS = "snippets"


class EvaluationConfiguration(Base):
    """Stores evaluation configuration for each target (App or Snippet).

    The unique constraint allows at most one row per
    ``(tenant_id, target_type, target_id)``. ``metrics_config`` and
    ``judgement_conditions`` are text columns holding JSON; use the ``*_dict``
    properties to read and write them as dictionaries.
    """

    __tablename__ = "evaluation_configurations"
    # NOTE(review): the plain index below covers the same columns as the unique constraint.
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="evaluation_configuration_pkey"),
        sa.Index("evaluation_configuration_target_idx", "tenant_id", "target_type", "target_id"),
        sa.UniqueConstraint("tenant_id", "target_type", "target_id", name="evaluation_configuration_unique"),
    )

    # Identity of the row and of the target it configures.
    id: Mapped[str] = mapped_column(StringUUID, default=lambda: str(uuidv7()))
    tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
    target_type: Mapped[str] = mapped_column(String(20), nullable=False)
    target_id: Mapped[str] = mapped_column(StringUUID, nullable=False)

    # Evaluation settings; the two LongText columns hold JSON-encoded text.
    evaluation_model_provider: Mapped[str | None] = mapped_column(String(255), nullable=True)
    evaluation_model: Mapped[str | None] = mapped_column(String(255), nullable=True)
    metrics_config: Mapped[str | None] = mapped_column(LongText, nullable=True)
    judgement_conditions: Mapped[str | None] = mapped_column(LongText, nullable=True)

    # Audit columns.
    created_by: Mapped[str] = mapped_column(StringUUID, nullable=False)
    updated_by: Mapped[str] = mapped_column(StringUUID, nullable=False)
    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())
    updated_at: Mapped[datetime] = mapped_column(
        DateTime,
        nullable=False,
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),
    )

    @property
    def metrics_config_dict(self) -> dict[str, Any]:
        """Return ``metrics_config`` decoded from JSON, or ``{}`` when it is unset or empty."""
        raw = self.metrics_config
        return json.loads(raw) if raw else {}

    @metrics_config_dict.setter
    def metrics_config_dict(self, value: dict[str, Any]) -> None:
        """Store *value* in ``metrics_config`` as a JSON string."""
        encoded = json.dumps(value)
        self.metrics_config = encoded

    @property
    def judgement_conditions_dict(self) -> dict[str, Any]:
        """Return ``judgement_conditions`` decoded from JSON, or ``{}`` when it is unset or empty."""
        raw = self.judgement_conditions
        return json.loads(raw) if raw else {}

    @judgement_conditions_dict.setter
    def judgement_conditions_dict(self, value: dict[str, Any]) -> None:
        """Store *value* in ``judgement_conditions`` as a JSON string."""
        encoded = json.dumps(value)
        self.judgement_conditions = encoded

    def __repr__(self) -> str:
        """Return a short debug representation with the id and the target."""
        return f"<EvaluationConfiguration(id={self.id}, target={self.target_type}:{self.target_id})>"


class EvaluationRun(Base):
    """Stores each evaluation run record.

    A run references its target (``tenant_id``, ``target_type``, ``target_id``) and
    the configuration row it was started from (``evaluation_config_id``), and keeps
    item counters from which ``progress`` is derived. ``metrics_summary`` is a text
    column holding JSON; use ``metrics_summary_dict`` to read and write it.
    """

    __tablename__ = "evaluation_runs"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="evaluation_run_pkey"),
        sa.Index("evaluation_run_target_idx", "tenant_id", "target_type", "target_id"),
        sa.Index("evaluation_run_status_idx", "tenant_id", "status"),
    )

    id: Mapped[str] = mapped_column(StringUUID, default=lambda: str(uuidv7()))
    tenant_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
    target_type: Mapped[str] = mapped_column(String(20), nullable=False)
    target_id: Mapped[str] = mapped_column(StringUUID, nullable=False)
    evaluation_config_id: Mapped[str] = mapped_column(StringUUID, nullable=False)

    status: Mapped[str] = mapped_column(
        String(20), nullable=False, default=EvaluationRunStatus.PENDING
    )
    dataset_file_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)
    result_file_id: Mapped[str | None] = mapped_column(StringUUID, nullable=True)

    # Item counters. The Python-side default of 0 is applied when the row is
    # inserted, not when the object is constructed.
    total_items: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    completed_items: Mapped[int] = mapped_column(Integer, nullable=False, default=0)
    failed_items: Mapped[int] = mapped_column(Integer, nullable=False, default=0)

    metrics_summary: Mapped[str | None] = mapped_column(LongText, nullable=True)
    error: Mapped[str | None] = mapped_column(Text, nullable=True)

    celery_task_id: Mapped[str | None] = mapped_column(String(255), nullable=True)

    created_by: Mapped[str] = mapped_column(StringUUID, nullable=False)
    started_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
    completed_at: Mapped[datetime | None] = mapped_column(DateTime, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=func.current_timestamp()
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime, nullable=False, server_default=func.current_timestamp(), onupdate=func.current_timestamp()
    )

    @property
    def metrics_summary_dict(self) -> dict[str, Any]:
        """Return ``metrics_summary`` decoded from JSON, or ``{}`` when it is unset or empty."""
        if self.metrics_summary:
            return json.loads(self.metrics_summary)
        return {}

    @metrics_summary_dict.setter
    def metrics_summary_dict(self, value: dict[str, Any]) -> None:
        """Store *value* in ``metrics_summary`` as a JSON string."""
        self.metrics_summary = json.dumps(value)

    @property
    def progress(self) -> float:
        """Return ``(completed_items + failed_items) / total_items``.

        Returns ``0.0`` when ``total_items`` is zero or has not been populated yet.
        """
        # On an instance that has not been flushed the counters may still be None,
        # because column defaults are filled in at INSERT time. Treat None as zero
        # instead of failing on ``None + None``.
        total = self.total_items or 0
        if total == 0:
            return 0.0
        processed = (self.completed_items or 0) + (self.failed_items or 0)
        return processed / total

    def __repr__(self) -> str:
        """Return a short debug representation with the id and status."""
        return f"<EvaluationRun(id={self.id}, status={self.status})>"


class EvaluationRunItem(Base):
    """Stores per-row evaluation results.

    Each row belongs to one run (``evaluation_run_id``) and carries its position in
    that run (``item_index``). ``inputs``, ``metrics`` and ``metadata_json`` are
    text columns holding JSON, exposed through the read-only ``inputs_dict``,
    ``metrics_list`` and ``metadata_dict`` properties.
    """

    __tablename__ = "evaluation_run_items"
    __table_args__ = (
        sa.PrimaryKeyConstraint("id", name="evaluation_run_item_pkey"),
        sa.Index("evaluation_run_item_run_idx", "evaluation_run_id"),
        sa.Index("evaluation_run_item_index_idx", "evaluation_run_id", "item_index"),
    )

    id: Mapped[str] = mapped_column(StringUUID, default=lambda: str(uuidv7()))
    evaluation_run_id: Mapped[str] = mapped_column(StringUUID, nullable=False)

    # Position of the item within its run, followed by the item's payload columns.
    item_index: Mapped[int] = mapped_column(Integer, nullable=False)
    inputs: Mapped[str | None] = mapped_column(LongText, nullable=True)
    expected_output: Mapped[str | None] = mapped_column(LongText, nullable=True)
    context: Mapped[str | None] = mapped_column(LongText, nullable=True)
    actual_output: Mapped[str | None] = mapped_column(LongText, nullable=True)

    # Evaluation outcome columns.
    metrics: Mapped[str | None] = mapped_column(LongText, nullable=True)
    judgment: Mapped[str | None] = mapped_column(LongText, nullable=True)
    metadata_json: Mapped[str | None] = mapped_column(LongText, nullable=True)
    error: Mapped[str | None] = mapped_column(Text, nullable=True)

    overall_score: Mapped[float | None] = mapped_column(Float, nullable=True)

    created_at: Mapped[datetime] = mapped_column(DateTime, nullable=False, server_default=func.current_timestamp())

    @property
    def inputs_dict(self) -> dict[str, Any]:
        """Return ``inputs`` decoded from JSON, or ``{}`` when it is unset or empty."""
        raw = self.inputs
        if not raw:
            return {}
        return json.loads(raw)

    @property
    def metrics_list(self) -> list[dict[str, Any]]:
        """Return ``metrics`` decoded from JSON, or ``[]`` when it is unset or empty."""
        raw = self.metrics
        if not raw:
            return []
        return json.loads(raw)

    @property
    def metadata_dict(self) -> dict[str, Any]:
        """Return ``metadata_json`` decoded from JSON, or ``{}`` when it is unset or empty."""
        raw = self.metadata_json
        if not raw:
            return {}
        return json.loads(raw)

    def __repr__(self) -> str:
        """Return a short debug representation with the id, run id and item index."""
        return f"<EvaluationRunItem(id={self.id}, run={self.evaluation_run_id}, index={self.item_index})>"