From e52dbd49beae877e98d47ae2dbb76107c5617be3 Mon Sep 17 00:00:00 2001 From: JzoNg Date: Thu, 9 Apr 2026 15:34:59 +0800 Subject: [PATCH] feat(web): dataset evaluation configure --- .../evaluation/__tests__/index.spec.tsx | 24 ++++++++- .../evaluation/__tests__/store.spec.ts | 1 + .../components/layout/pipeline-evaluation.tsx | 33 ++++++++---- .../pipeline/pipeline-metric-item.tsx | 51 +++++++++++++++---- web/app/components/evaluation/store-utils.ts | 9 +++- web/app/components/evaluation/store.ts | 12 +++++ web/app/components/evaluation/types.ts | 1 + web/i18n/en-US/evaluation.json | 1 + 8 files changed, 111 insertions(+), 21 deletions(-) diff --git a/web/app/components/evaluation/__tests__/index.spec.tsx b/web/app/components/evaluation/__tests__/index.spec.tsx index 302df31c9d..e2da9b196f 100644 --- a/web/app/components/evaluation/__tests__/index.spec.tsx +++ b/web/app/components/evaluation/__tests__/index.spec.tsx @@ -54,7 +54,7 @@ describe('Evaluation', () => { mockUseAvailableEvaluationMetrics.mockReturnValue({ data: { - metrics: ['answer-correctness', 'faithfulness'], + metrics: ['answer-correctness', 'faithfulness', 'context-precision', 'context-recall', 'context-relevance'], }, isLoading: false, }) @@ -240,12 +240,34 @@ describe('Evaluation', () => { expect(screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' })).toBeDisabled() }) + it('should render selected pipeline metrics from config with the default threshold input', () => { + mockUseEvaluationConfig.mockReturnValue({ + data: { + evaluation_model: null, + evaluation_model_provider: null, + metrics_config: { + default_metrics: [{ + metric: 'context-precision', + }], + customized_metrics: null, + }, + judgement_conditions: null, + }, + }) + + render() + + expect(screen.getByText('Context Precision')).toBeInTheDocument() + expect(screen.getByDisplayValue('0.85')).toBeInTheDocument() + }) + it('should enable pipeline batch actions after selecting a judge model and metric', () => { render() fireEvent.click(screen.getByRole('button', { name: 'select-model' })) fireEvent.click(screen.getByRole('button', { name: /Context Precision/i })) + expect(screen.getByDisplayValue('0.85')).toBeInTheDocument() expect(screen.getByRole('button', { name: 'evaluation.batch.downloadTemplate' })).toBeEnabled() expect(screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' })).toBeEnabled() }) diff --git a/web/app/components/evaluation/__tests__/store.spec.ts b/web/app/components/evaluation/__tests__/store.spec.ts index 9415e8430a..3814b15219 100644 --- a/web/app/components/evaluation/__tests__/store.spec.ts +++ b/web/app/components/evaluation/__tests__/store.spec.ts @@ -181,6 +181,7 @@ describe('evaluation store', () => { expect(hydratedState.judgeModelId).toBe('openai::gpt-4o-mini') expect(hydratedState.metrics).toHaveLength(2) expect(hydratedState.metrics[0].optionId).toBe('faithfulness') + expect(hydratedState.metrics[0].threshold).toBe(0.85) expect(hydratedState.metrics[0].nodeInfoList).toEqual([ { node_id: 'node-1', title: 'Retriever', type: 'retriever' }, ]) diff --git a/web/app/components/evaluation/components/layout/pipeline-evaluation.tsx b/web/app/components/evaluation/components/layout/pipeline-evaluation.tsx index c605c53e17..32254e1e6d 100644 --- a/web/app/components/evaluation/components/layout/pipeline-evaluation.tsx +++ b/web/app/components/evaluation/components/layout/pipeline-evaluation.tsx @@ -6,6 +6,7 @@ import { useTranslation } from 'react-i18next' import Button from '@/app/components/base/button' import { toast } from '@/app/components/base/ui/toast' import { useDocLink } from '@/context/i18n' +import { useAvailableEvaluationMetrics } from '@/service/use-evaluation' import { getEvaluationMockConfig } from '../../mock' import { isEvaluationRunnable, useEvaluationResource, useEvaluationStore } from '../../store' import JudgeModelSelector from '../judge-model-selector' @@ -24,8 +25,10 @@ const PipelineEvaluation = ({ const ensureResource = useEvaluationStore(state => state.ensureResource) const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric) const removeMetric = useEvaluationStore(state => state.removeMetric) + const updateMetricThreshold = useEvaluationStore(state => state.updateMetricThreshold) const setUploadedFileName = useEvaluationStore(state => state.setUploadedFileName) const runBatchTest = useEvaluationStore(state => state.runBatchTest) + const { data: availableMetricsData } = useAvailableEvaluationMetrics() const resource = useEvaluationResource(resourceType, resourceId) const fileInputRef = useRef(null) const config = getEvaluationMockConfig(resourceType) @@ -34,6 +37,12 @@ const PipelineEvaluation = ({ .filter(metric => metric.kind === 'builtin') .map(metric => [metric.optionId, metric]), ), [resource.metrics]) + const availableMetricIds = useMemo(() => new Set(availableMetricsData?.metrics ?? []), [availableMetricsData?.metrics]) + const availableBuiltinMetrics = useMemo(() => { + return config.builtinMetrics.filter(metric => + availableMetricIds.has(metric.id) || builtinMetricMap.has(metric.id), + ) + }, [availableMetricIds, builtinMetricMap, config.builtinMetrics]) const isConfigReady = !!resource.judgeModelId && builtinMetricMap.size > 0 const isRunnable = isEvaluationRunnable(resource) @@ -107,15 +116,21 @@ const PipelineEvaluation = ({
- {config.builtinMetrics.map(metric => ( - handleToggleMetric(metric.id)} - /> - ))} + {availableBuiltinMetrics.map((metric) => { + const selectedMetric = builtinMetricMap.get(metric.id) + + return ( + handleToggleMetric(metric.id)} + onThresholdChange={value => updateMetricThreshold(resourceType, resourceId, selectedMetric?.id ?? '', value)} + /> + ) + })}
diff --git a/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx b/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx index 14c4a3c726..e535da2204 100644 --- a/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx +++ b/web/app/components/evaluation/components/pipeline/pipeline-metric-item.tsx @@ -1,15 +1,20 @@ 'use client' import type { MetricOption } from '../../types' +import { useTranslation } from 'react-i18next' import Checkbox from '@/app/components/base/checkbox' +import Input from '@/app/components/base/input' import { Tooltip, TooltipContent, TooltipTrigger } from '@/app/components/base/ui/tooltip' import { cn } from '@/utils/classnames' +import { DEFAULT_PIPELINE_METRIC_THRESHOLD } from '../../store-utils' type PipelineMetricItemProps = { metric: MetricOption selected: boolean onToggle: () => void disabledCondition: boolean + threshold?: number + onThresholdChange: (value: number) => void } const PipelineMetricItem = ({ @@ -17,7 +22,11 @@ const PipelineMetricItem = ({ selected, onToggle, disabledCondition, + threshold = DEFAULT_PIPELINE_METRIC_THRESHOLD, + onThresholdChange, }: PipelineMetricItemProps) => { + const { t } = useTranslation('evaluation') + return (
- + {selected + ? ( +
+ {t('pipeline.passIf')} +
+ { + const parsedValue = Number(event.target.value) + if (!Number.isNaN(parsedValue)) + onThresholdChange(parsedValue) + }} + /> +
+
+ ) + : ( + + )}
) } diff --git a/web/app/components/evaluation/store-utils.ts b/web/app/components/evaluation/store-utils.ts index c1acc1300c..b1c6baa4de 100644 --- a/web/app/components/evaluation/store-utils.ts +++ b/web/app/components/evaluation/store-utils.ts @@ -24,6 +24,8 @@ import { encodeModelSelection } from './utils' type EvaluationStoreResources = Record +export const DEFAULT_PIPELINE_METRIC_THRESHOLD = 0.85 + const createId = (prefix: string) => `${prefix}-${Math.random().toString(36).slice(2, 10)}` const humanizeMetricId = (metricId: string) => { @@ -213,7 +215,11 @@ export function getConditionValue( return typeof previousValue === 'string' ? previousValue : null } -export function createBuiltinMetric(metric: MetricOption, nodeInfoList: NodeInfo[] = []): EvaluationMetric { +export function createBuiltinMetric( + metric: MetricOption, + nodeInfoList: NodeInfo[] = [], + threshold = DEFAULT_PIPELINE_METRIC_THRESHOLD, +): EvaluationMetric { return { id: createId('metric'), optionId: metric.id, @@ -221,6 +227,7 @@ export function createBuiltinMetric(metric: MetricOption, nodeInfoList: NodeInfo label: metric.label, description: metric.description, badges: metric.badges, + threshold, nodeInfoList, } } diff --git a/web/app/components/evaluation/store.ts b/web/app/components/evaluation/store.ts index 61e1f773bb..5774dd5934 100644 --- a/web/app/components/evaluation/store.ts +++ b/web/app/components/evaluation/store.ts @@ -32,6 +32,7 @@ type EvaluationStore = { hydrateResource: (resourceType: EvaluationResourceType, resourceId: string, config: EvaluationConfig) => void setJudgeModel: (resourceType: EvaluationResourceType, resourceId: string, judgeModelId: string) => void addBuiltinMetric: (resourceType: EvaluationResourceType, resourceId: string, optionId: string, nodeInfoList?: NodeInfo[]) => void + updateMetricThreshold: (resourceType: EvaluationResourceType, resourceId: string, metricId: string, threshold: number) => void addCustomMetric: (resourceType: EvaluationResourceType, resourceId: string) => void removeMetric: (resourceType: EvaluationResourceType, resourceId: string, metricId: string) => void setCustomMetricWorkflow: ( @@ -126,6 +127,17 @@ export const useEvaluationStore = create((set, get) => ({ } }) }, + updateMetricThreshold: (resourceType, resourceId, metricId, threshold) => { + set(state => ({ + resources: updateResourceState(state.resources, resourceType, resourceId, resource => ({ + ...resource, + metrics: updateMetric(resource.metrics, metricId, metric => ({ + ...metric, + threshold, + })), + })), + })) + }, addCustomMetric: (resourceType, resourceId) => { set(state => ({ resources: updateResourceState(state.resources, resourceType, resourceId, resource => ({ diff --git a/web/app/components/evaluation/types.ts b/web/app/components/evaluation/types.ts index 936c47365e..411c28d540 100644 --- a/web/app/components/evaluation/types.ts +++ b/web/app/components/evaluation/types.ts @@ -82,6 +82,7 @@ export type EvaluationMetric = { label: string description: string badges: string[] + threshold?: number nodeInfoList?: NodeInfo[] customConfig?: CustomMetricConfig } diff --git a/web/i18n/en-US/evaluation.json b/web/i18n/en-US/evaluation.json index 41dff238e3..0ba7fe44cd 100644 --- a/web/i18n/en-US/evaluation.json +++ b/web/i18n/en-US/evaluation.json @@ -92,6 +92,7 @@ "metrics.showMore": "Show more", "metrics.title": "Metrics", "metrics.update": "Update", + "pipeline.passIf": "Pass if \u2265", "pipeline.uploadAndRun": "Upload & Run Test", "results.empty": "No evaluation results yet.", "title": "Evaluation"