feat(web): dataset evaluation configure

This commit is contained in:
JzoNg
2026-04-09 15:34:59 +08:00
parent ccc8a5f278
commit e52dbd49be
8 changed files with 111 additions and 21 deletions

View File

@@ -54,7 +54,7 @@ describe('Evaluation', () => {
mockUseAvailableEvaluationMetrics.mockReturnValue({
data: {
metrics: ['answer-correctness', 'faithfulness'],
metrics: ['answer-correctness', 'faithfulness', 'context-precision', 'context-recall', 'context-relevance'],
},
isLoading: false,
})
@@ -240,12 +240,34 @@ describe('Evaluation', () => {
expect(screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' })).toBeDisabled()
})
it('should render selected pipeline metrics from config with the default threshold input', () => {
  // Saved configuration that selects a single built-in metric and supplies no
  // threshold of its own.
  const savedConfig = {
    evaluation_model: null,
    evaluation_model_provider: null,
    metrics_config: {
      default_metrics: [{ metric: 'context-precision' }],
      customized_metrics: null,
    },
    judgement_conditions: null,
  }
  mockUseEvaluationConfig.mockReturnValue({ data: savedConfig })

  render(<Evaluation resourceType="datasets" resourceId="dataset-2" />)

  // The metric row must be present ...
  const metricLabel = screen.getByText('Context Precision')
  expect(metricLabel).toBeInTheDocument()
  // ... together with a threshold input showing 0.85.
  const thresholdInput = screen.getByDisplayValue('0.85')
  expect(thresholdInput).toBeInTheDocument()
})
it('should enable pipeline batch actions after selecting a judge model and metric', () => {
  render(<Evaluation resourceType="datasets" resourceId="dataset-2" />)

  // Pick a judge model first, then switch the metric on; the metric button is
  // looked up only after the model click, as in the original sequence.
  const selectModelButton = screen.getByRole('button', { name: 'select-model' })
  fireEvent.click(selectModelButton)
  const metricToggle = screen.getByRole('button', { name: /Context Precision/i })
  fireEvent.click(metricToggle)

  // Selecting the metric reveals its threshold input with the value 0.85.
  const thresholdInput = screen.getByDisplayValue('0.85')
  expect(thresholdInput).toBeInTheDocument()

  // Both batch actions become available once model and metric are chosen.
  const downloadTemplateButton = screen.getByRole('button', { name: 'evaluation.batch.downloadTemplate' })
  const uploadAndRunButton = screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' })
  expect(downloadTemplateButton).toBeEnabled()
  expect(uploadAndRunButton).toBeEnabled()
})

View File

@@ -181,6 +181,7 @@ describe('evaluation store', () => {
expect(hydratedState.judgeModelId).toBe('openai::gpt-4o-mini')
expect(hydratedState.metrics).toHaveLength(2)
expect(hydratedState.metrics[0].optionId).toBe('faithfulness')
expect(hydratedState.metrics[0].threshold).toBe(0.85)
expect(hydratedState.metrics[0].nodeInfoList).toEqual([
{ node_id: 'node-1', title: 'Retriever', type: 'retriever' },
])

View File

@@ -6,6 +6,7 @@ import { useTranslation } from 'react-i18next'
import Button from '@/app/components/base/button'
import { toast } from '@/app/components/base/ui/toast'
import { useDocLink } from '@/context/i18n'
import { useAvailableEvaluationMetrics } from '@/service/use-evaluation'
import { getEvaluationMockConfig } from '../../mock'
import { isEvaluationRunnable, useEvaluationResource, useEvaluationStore } from '../../store'
import JudgeModelSelector from '../judge-model-selector'
@@ -24,8 +25,10 @@ const PipelineEvaluation = ({
const ensureResource = useEvaluationStore(state => state.ensureResource)
const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric)
const removeMetric = useEvaluationStore(state => state.removeMetric)
const updateMetricThreshold = useEvaluationStore(state => state.updateMetricThreshold)
const setUploadedFileName = useEvaluationStore(state => state.setUploadedFileName)
const runBatchTest = useEvaluationStore(state => state.runBatchTest)
const { data: availableMetricsData } = useAvailableEvaluationMetrics()
const resource = useEvaluationResource(resourceType, resourceId)
const fileInputRef = useRef<HTMLInputElement>(null)
const config = getEvaluationMockConfig(resourceType)
@@ -34,6 +37,12 @@ const PipelineEvaluation = ({
.filter(metric => metric.kind === 'builtin')
.map(metric => [metric.optionId, metric]),
), [resource.metrics])
const availableMetricIds = useMemo(() => new Set(availableMetricsData?.metrics ?? []), [availableMetricsData?.metrics])
const availableBuiltinMetrics = useMemo(() => {
return config.builtinMetrics.filter(metric =>
availableMetricIds.has(metric.id) || builtinMetricMap.has(metric.id),
)
}, [availableMetricIds, builtinMetricMap, config.builtinMetrics])
const isConfigReady = !!resource.judgeModelId && builtinMetricMap.size > 0
const isRunnable = isEvaluationRunnable(resource)
@@ -107,15 +116,21 @@ const PipelineEvaluation = ({
<section>
<InlineSectionHeader title={t('metrics.title')} tooltip={t('metrics.description')} />
<div className="mt-1 space-y-0.5">
{config.builtinMetrics.map(metric => (
<PipelineMetricItem
key={metric.id}
metric={metric}
selected={builtinMetricMap.has(metric.id)}
disabledCondition
onToggle={() => handleToggleMetric(metric.id)}
/>
))}
{availableBuiltinMetrics.map((metric) => {
const selectedMetric = builtinMetricMap.get(metric.id)
return (
<PipelineMetricItem
key={metric.id}
metric={metric}
selected={!!selectedMetric}
threshold={selectedMetric?.threshold}
disabledCondition
onToggle={() => handleToggleMetric(metric.id)}
onThresholdChange={value => updateMetricThreshold(resourceType, resourceId, selectedMetric?.id ?? '', value)}
/>
)
})}
</div>
</section>

View File

@@ -1,15 +1,20 @@
'use client'
import type { MetricOption } from '../../types'
import { useTranslation } from 'react-i18next'
import Checkbox from '@/app/components/base/checkbox'
import Input from '@/app/components/base/input'
import { Tooltip, TooltipContent, TooltipTrigger } from '@/app/components/base/ui/tooltip'
import { cn } from '@/utils/classnames'
import { DEFAULT_PIPELINE_METRIC_THRESHOLD } from '../../store-utils'
/** Props accepted by `PipelineMetricItem`. */
type PipelineMetricItemProps = {
/** Metric option this row represents. */
metric: MetricOption
/** When true the row shows the threshold input; otherwise it shows the "+ Condition" button. */
selected: boolean
/** Toggle callback supplied by the parent. NOTE(review): presumably fired by the row's main button; that wiring is not visible here — confirm. */
onToggle: () => void
/** Passed to the `disabled` attribute of the "+ Condition" button and switches it to its disabled styling. */
disabledCondition: boolean
/** Value shown in the threshold input; falls back to `DEFAULT_PIPELINE_METRIC_THRESHOLD` when omitted. */
threshold?: number
/** Called with the numeric value parsed from the threshold input when parsing succeeds. */
onThresholdChange: (value: number) => void
}
const PipelineMetricItem = ({
@@ -17,7 +22,11 @@ const PipelineMetricItem = ({
selected,
onToggle,
disabledCondition,
threshold = DEFAULT_PIPELINE_METRIC_THRESHOLD,
onThresholdChange,
}: PipelineMetricItemProps) => {
const { t } = useTranslation('evaluation')
return (
<div className="flex items-center justify-between gap-3 px-1 py-1">
<button
@@ -41,16 +50,38 @@ const PipelineMetricItem = ({
</Tooltip>
</button>
<button
type="button"
disabled={disabledCondition}
className={cn(
'system-xs-medium text-text-tertiary',
disabledCondition && 'cursor-not-allowed text-components-button-secondary-accent-text-disabled',
)}
>
+ Condition
</button>
{selected
? (
<div className="flex items-center gap-2">
<span className="system-xs-medium text-text-accent">{t('pipeline.passIf')}</span>
<div className="w-[52px]">
<Input
value={String(threshold)}
type="number"
min={0}
max={1}
step={0.01}
onChange={(event) => {
  // Forward the edited threshold to the parent only when the field holds a
  // real number. `Number('')` evaluates to 0 rather than NaN, and a number
  // input reports '' both when it is cleared and while its text is not yet
  // a valid number, so blank values are skipped explicitly; otherwise
  // clearing the field would silently set the threshold to 0.
  const rawValue = event.target.value
  if (rawValue.trim() === '')
    return
  const parsedValue = Number(rawValue)
  // isFinite also rejects NaN, so unparsable text never reaches the parent.
  if (Number.isFinite(parsedValue))
    onThresholdChange(parsedValue)
}}
/>
</div>
</div>
)
: (
<button
type="button"
disabled={disabledCondition}
className={cn(
'system-xs-medium text-text-tertiary',
disabledCondition && 'cursor-not-allowed text-components-button-secondary-accent-text-disabled',
)}
>
+ Condition
</button>
)}
</div>
)
}

View File

@@ -24,6 +24,8 @@ import { encodeModelSelection } from './utils'
type EvaluationStoreResources = Record<string, EvaluationResourceState>
/**
 * Threshold used for a built-in pipeline metric when no explicit value is
 * supplied: it is the default for the `threshold` parameter of
 * `createBuiltinMetric` and for the `threshold` prop of `PipelineMetricItem`.
 */
export const DEFAULT_PIPELINE_METRIC_THRESHOLD = 0.85
const createId = (prefix: string) => `${prefix}-${Math.random().toString(36).slice(2, 10)}`
const humanizeMetricId = (metricId: string) => {
@@ -213,7 +215,11 @@ export function getConditionValue(
return typeof previousValue === 'string' ? previousValue : null
}
export function createBuiltinMetric(metric: MetricOption, nodeInfoList: NodeInfo[] = []): EvaluationMetric {
export function createBuiltinMetric(
metric: MetricOption,
nodeInfoList: NodeInfo[] = [],
threshold = DEFAULT_PIPELINE_METRIC_THRESHOLD,
): EvaluationMetric {
return {
id: createId('metric'),
optionId: metric.id,
@@ -221,6 +227,7 @@ export function createBuiltinMetric(metric: MetricOption, nodeInfoList: NodeInfo
label: metric.label,
description: metric.description,
badges: metric.badges,
threshold,
nodeInfoList,
}
}

View File

@@ -32,6 +32,7 @@ type EvaluationStore = {
hydrateResource: (resourceType: EvaluationResourceType, resourceId: string, config: EvaluationConfig) => void
setJudgeModel: (resourceType: EvaluationResourceType, resourceId: string, judgeModelId: string) => void
addBuiltinMetric: (resourceType: EvaluationResourceType, resourceId: string, optionId: string, nodeInfoList?: NodeInfo[]) => void
updateMetricThreshold: (resourceType: EvaluationResourceType, resourceId: string, metricId: string, threshold: number) => void
addCustomMetric: (resourceType: EvaluationResourceType, resourceId: string) => void
removeMetric: (resourceType: EvaluationResourceType, resourceId: string, metricId: string) => void
setCustomMetricWorkflow: (
@@ -126,6 +127,17 @@ export const useEvaluationStore = create<EvaluationStore>((set, get) => ({
}
})
},
updateMetricThreshold: (resourceType, resourceId, metricId, threshold) => {
  // Write a new threshold onto one metric of the addressed resource. The
  // resource and its metric list are rebuilt through the shared
  // `updateResourceState` / `updateMetric` helpers rather than mutated.
  // NOTE(review): presumably `updateMetric` applies the updater only to the
  // entry whose id equals `metricId` — confirm against its definition.
  set((state) => {
    const resources = updateResourceState(state.resources, resourceType, resourceId, (resource) => {
      const metrics = updateMetric(resource.metrics, metricId, metric => ({ ...metric, threshold }))
      return { ...resource, metrics }
    })
    return { resources }
  })
},
addCustomMetric: (resourceType, resourceId) => {
set(state => ({
resources: updateResourceState(state.resources, resourceType, resourceId, resource => ({

View File

@@ -82,6 +82,7 @@ export type EvaluationMetric = {
label: string
description: string
badges: string[]
threshold?: number
nodeInfoList?: NodeInfo[]
customConfig?: CustomMetricConfig
}

View File

@@ -92,6 +92,7 @@
"metrics.showMore": "Show more",
"metrics.title": "Metrics",
"metrics.update": "Update",
"pipeline.passIf": "Pass if \u2265",
"pipeline.uploadAndRun": "Upload & Run Test",
"results.empty": "No evaluation results yet.",
"title": "Evaluation"