feat(web): dataset evaluation configuration

2026-06-02 07:00:48 -04:00 · 2026-04-09 15:34:59 +08:00
parent ccc8a5f278
commit e52dbd49be
8 changed files with 111 additions and 21 deletions
--- a/web/app/components/evaluation/tests/index.spec.tsx
+++ b/web/app/components/evaluation/tests/index.spec.tsx
@@ -54,7 +54,7 @@ describe('Evaluation', () => {

    mockUseAvailableEvaluationMetrics.mockReturnValue({
      data: {
-        metrics: ['answer-correctness', 'faithfulness'],
+        metrics: ['answer-correctness', 'faithfulness', 'context-precision', 'context-recall', 'context-relevance'],
      },
      isLoading: false,
    })
@@ -240,12 +240,34 @@ describe('Evaluation', () => {
    expect(screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' })).toBeDisabled()
  })

+  it('should render selected pipeline metrics from config with the default threshold input', () => {
+    mockUseEvaluationConfig.mockReturnValue({
+      data: {
+        evaluation_model: null,
+        evaluation_model_provider: null,
+        metrics_config: {
+          default_metrics: [{
+            metric: 'context-precision',
+          }],
+          customized_metrics: null,
+        },
+        judgement_conditions: null,
+      },
+    })
+
+    render(<Evaluation resourceType="datasets" resourceId="dataset-2" />)
+
+    expect(screen.getByText('Context Precision')).toBeInTheDocument()
+    expect(screen.getByDisplayValue('0.85')).toBeInTheDocument()
+  })
+
  it('should enable pipeline batch actions after selecting a judge model and metric', () => {
    render(<Evaluation resourceType="datasets" resourceId="dataset-2" />)

    fireEvent.click(screen.getByRole('button', { name: 'select-model' }))
    fireEvent.click(screen.getByRole('button', { name: /Context Precision/i }))

+    expect(screen.getByDisplayValue('0.85')).toBeInTheDocument()
    expect(screen.getByRole('button', { name: 'evaluation.batch.downloadTemplate' })).toBeEnabled()
    expect(screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' })).toBeEnabled()
  })
--- a/web/app/components/evaluation/tests/store.spec.ts
+++ b/web/app/components/evaluation/tests/store.spec.ts
@@ -181,6 +181,7 @@ describe('evaluation store', () => {
    expect(hydratedState.judgeModelId).toBe('openai::gpt-4o-mini')
    expect(hydratedState.metrics).toHaveLength(2)
    expect(hydratedState.metrics[0].optionId).toBe('faithfulness')
+    expect(hydratedState.metrics[0].threshold).toBe(0.85)
    expect(hydratedState.metrics[0].nodeInfoList).toEqual([
      { node_id: 'node-1', title: 'Retriever', type: 'retriever' },
    ])
--- a/web/app/components/evaluation/components/layout/pipeline-evaluation.tsx
+++ b/web/app/components/evaluation/components/layout/pipeline-evaluation.tsx
@@ -6,6 +6,7 @@ import { useTranslation } from 'react-i18next'
 import Button from '@/app/components/base/button'
 import { toast } from '@/app/components/base/ui/toast'
 import { useDocLink } from '@/context/i18n'
+import { useAvailableEvaluationMetrics } from '@/service/use-evaluation'
 import { getEvaluationMockConfig } from '../../mock'
 import { isEvaluationRunnable, useEvaluationResource, useEvaluationStore } from '../../store'
 import JudgeModelSelector from '../judge-model-selector'
@@ -24,8 +25,10 @@ const PipelineEvaluation = ({
  const ensureResource = useEvaluationStore(state => state.ensureResource)
  const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric)
  const removeMetric = useEvaluationStore(state => state.removeMetric)
+  const updateMetricThreshold = useEvaluationStore(state => state.updateMetricThreshold)
  const setUploadedFileName = useEvaluationStore(state => state.setUploadedFileName)
  const runBatchTest = useEvaluationStore(state => state.runBatchTest)
+  const { data: availableMetricsData } = useAvailableEvaluationMetrics()
  const resource = useEvaluationResource(resourceType, resourceId)
  const fileInputRef = useRef<HTMLInputElement>(null)
  const config = getEvaluationMockConfig(resourceType)
@@ -34,6 +37,12 @@ const PipelineEvaluation = ({
      .filter(metric => metric.kind === 'builtin')
      .map(metric => [metric.optionId, metric]),
  ), [resource.metrics])
+  const availableMetricIds = useMemo(() => new Set(availableMetricsData?.metrics ?? []), [availableMetricsData?.metrics])
+  const availableBuiltinMetrics = useMemo(() => {
+    return config.builtinMetrics.filter(metric =>
+      availableMetricIds.has(metric.id) || builtinMetricMap.has(metric.id),
+    )
+  }, [availableMetricIds, builtinMetricMap, config.builtinMetrics])
  const isConfigReady = !!resource.judgeModelId && builtinMetricMap.size > 0
  const isRunnable = isEvaluationRunnable(resource)

@@ -107,15 +116,21 @@ const PipelineEvaluation = ({
            <section>
              <InlineSectionHeader title={t('metrics.title')} tooltip={t('metrics.description')} />
              <div className="mt-1 space-y-0.5">
-                {config.builtinMetrics.map(metric => (
-                  <PipelineMetricItem
-                    key={metric.id}
-                    metric={metric}
-                    selected={builtinMetricMap.has(metric.id)}
-                    disabledCondition
-                    onToggle={() => handleToggleMetric(metric.id)}
-                  />
-                ))}
+                {availableBuiltinMetrics.map((metric) => {
+                  const selectedMetric = builtinMetricMap.get(metric.id)
+
+                  return (
+                    <PipelineMetricItem
+                      key={metric.id}
+                      metric={metric}
+                      selected={!!selectedMetric}
+                      threshold={selectedMetric?.threshold}
+                      disabledCondition
+                      onToggle={() => handleToggleMetric(metric.id)}
+                      onThresholdChange={value => updateMetricThreshold(resourceType, resourceId, selectedMetric?.id ?? '', value)}
+                    />
+                  )
+                })}
              </div>
            </section>

--- a/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx
+++ b/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx
@@ -1,15 +1,20 @@
 'use client'

 import type { MetricOption } from '../../types'
+import { useTranslation } from 'react-i18next'
 import Checkbox from '@/app/components/base/checkbox'
+import Input from '@/app/components/base/input'
 import { Tooltip, TooltipContent, TooltipTrigger } from '@/app/components/base/ui/tooltip'
 import { cn } from '@/utils/classnames'
+import { DEFAULT_PIPELINE_METRIC_THRESHOLD } from '../../store-utils'

 type PipelineMetricItemProps = {
  metric: MetricOption
  selected: boolean
  onToggle: () => void
  disabledCondition: boolean
+  threshold?: number
+  onThresholdChange: (value: number) => void
 }

 const PipelineMetricItem = ({
@@ -17,7 +22,11 @@ const PipelineMetricItem = ({
  selected,
  onToggle,
  disabledCondition,
+  threshold = DEFAULT_PIPELINE_METRIC_THRESHOLD,
+  onThresholdChange,
 }: PipelineMetricItemProps) => {
+  const { t } = useTranslation('evaluation')
+
  return (
    <div className="flex items-center justify-between gap-3 px-1 py-1">
      <button
@@ -41,16 +50,38 @@ const PipelineMetricItem = ({
        </Tooltip>
      </button>

-      <button
-        type="button"
-        disabled={disabledCondition}
-        className={cn(
-          'system-xs-medium text-text-tertiary',
-          disabledCondition && 'cursor-not-allowed text-components-button-secondary-accent-text-disabled',
-        )}
-      >
-        + Condition
-      </button>
+      {selected
+        ? (
+            <div className="flex items-center gap-2">
+              <span className="system-xs-medium text-text-accent">{t('pipeline.passIf')}</span>
+              <div className="w-[52px]">
+                <Input
+                  value={String(threshold)}
+                  type="number"
+                  min={0}
+                  max={1}
+                  step={0.01}
+                  onChange={(event) => {
+                    const parsedValue = Number.parseFloat(event.target.value) // unlike Number(''), yields NaN for a blank or invalid field
+                    if (Number.isFinite(parsedValue)) // skip NaN/Infinity so a cleared field does not overwrite the threshold with 0
+                      onThresholdChange(Math.min(1, Math.max(0, parsedValue))) // clamp to the input's min/max of 0..1
+                  }}
+                />
+              </div>
+            </div>
+          )
+        : (
+            <button
+              type="button"
+              disabled={disabledCondition}
+              className={cn(
+                'system-xs-medium text-text-tertiary',
+                disabledCondition && 'cursor-not-allowed text-components-button-secondary-accent-text-disabled',
+              )}
+            >
+              + Condition
+            </button>
+          )}
    </div>
  )
 }
--- a/web/app/components/evaluation/store-utils.ts
+++ b/web/app/components/evaluation/store-utils.ts
@@ -24,6 +24,8 @@ import { encodeModelSelection } from './utils'

 type EvaluationStoreResources = Record<string, EvaluationResourceState>

+export const DEFAULT_PIPELINE_METRIC_THRESHOLD = 0.85
+
 const createId = (prefix: string) => `${prefix}-${Math.random().toString(36).slice(2, 10)}`

 const humanizeMetricId = (metricId: string) => {
@@ -213,7 +215,11 @@ export function getConditionValue(
  return typeof previousValue === 'string' ? previousValue : null
 }

-export function createBuiltinMetric(metric: MetricOption, nodeInfoList: NodeInfo[] = []): EvaluationMetric {
+export function createBuiltinMetric(
+  metric: MetricOption,
+  nodeInfoList: NodeInfo[] = [],
+  threshold = DEFAULT_PIPELINE_METRIC_THRESHOLD,
+): EvaluationMetric {
  return {
    id: createId('metric'),
    optionId: metric.id,
@@ -221,6 +227,7 @@ export function createBuiltinMetric(metric: MetricOption, nodeInfoList: NodeInfo
    label: metric.label,
    description: metric.description,
    badges: metric.badges,
+    threshold,
    nodeInfoList,
  }
 }
--- a/web/app/components/evaluation/store.ts
+++ b/web/app/components/evaluation/store.ts
@@ -32,6 +32,7 @@ type EvaluationStore = {
  hydrateResource: (resourceType: EvaluationResourceType, resourceId: string, config: EvaluationConfig) => void
  setJudgeModel: (resourceType: EvaluationResourceType, resourceId: string, judgeModelId: string) => void
  addBuiltinMetric: (resourceType: EvaluationResourceType, resourceId: string, optionId: string, nodeInfoList?: NodeInfo[]) => void
+  updateMetricThreshold: (resourceType: EvaluationResourceType, resourceId: string, metricId: string, threshold: number) => void
  addCustomMetric: (resourceType: EvaluationResourceType, resourceId: string) => void
  removeMetric: (resourceType: EvaluationResourceType, resourceId: string, metricId: string) => void
  setCustomMetricWorkflow: (
@@ -126,6 +127,17 @@ export const useEvaluationStore = create<EvaluationStore>((set, get) => ({
      }
    })
  },
+  updateMetricThreshold: (resourceType, resourceId, metricId, threshold) => {
+    set(state => ({
+      resources: updateResourceState(state.resources, resourceType, resourceId, resource => ({
+        ...resource,
+        metrics: updateMetric(resource.metrics, metricId, metric => ({
+          ...metric,
+          threshold,
+        })),
+      })),
+    }))
+  },
  addCustomMetric: (resourceType, resourceId) => {
    set(state => ({
      resources: updateResourceState(state.resources, resourceType, resourceId, resource => ({
--- a/web/app/components/evaluation/types.ts
+++ b/web/app/components/evaluation/types.ts
@@ -82,6 +82,7 @@ export type EvaluationMetric = {
  label: string
  description: string
  badges: string[]
+  threshold?: number
  nodeInfoList?: NodeInfo[]
  customConfig?: CustomMetricConfig
 }
--- a/web/i18n/en-US/evaluation.json
+++ b/web/i18n/en-US/evaluation.json
@@ -92,6 +92,7 @@
  "metrics.showMore": "Show more",
  "metrics.title": "Metrics",
  "metrics.update": "Update",
+  "pipeline.passIf": "Pass if \u2265",
  "pipeline.uploadAndRun": "Upload & Run Test",
  "results.empty": "No evaluation results yet.",
  "title": "Evaluation"