From 73d95245f8897966ea7d1533ba1ce2fe837ddbd5 Mon Sep 17 00:00:00 2001 From: JzoNg Date: Thu, 9 Apr 2026 13:44:29 +0800 Subject: [PATCH] feat(web): dataset evaluation layout --- .../evaluation/__tests__/index.spec.tsx | 42 ++- .../components/judge-model-selector.tsx | 11 +- .../components/non-pipeline-evaluation.tsx | 62 ++++ .../components/pipeline-evaluation.tsx | 346 ++++++++++++++++++ web/app/components/evaluation/index.tsx | 63 +--- web/app/components/evaluation/mock.ts | 26 +- web/i18n/en-US/evaluation.json | 11 + 7 files changed, 506 insertions(+), 55 deletions(-) create mode 100644 web/app/components/evaluation/components/non-pipeline-evaluation.tsx create mode 100644 web/app/components/evaluation/components/pipeline-evaluation.tsx diff --git a/web/app/components/evaluation/__tests__/index.spec.tsx b/web/app/components/evaluation/__tests__/index.spec.tsx index 8666c8b96d..748ce5981f 100644 --- a/web/app/components/evaluation/__tests__/index.spec.tsx +++ b/web/app/components/evaluation/__tests__/index.spec.tsx @@ -16,9 +16,23 @@ vi.mock('@/app/components/header/account-setting/model-provider-page/hooks', () })) vi.mock('@/app/components/header/account-setting/model-provider-page/model-selector', () => ({ - default: ({ defaultModel }: { defaultModel?: { provider: string, model: string } }) => ( -
- {defaultModel ? `${defaultModel.provider}:${defaultModel.model}` : 'empty'} + default: ({ + defaultModel, + onSelect, + }: { + defaultModel?: { provider: string, model: string } + onSelect: (model: { provider: string, model: string }) => void + }) => ( +
+
+ {defaultModel ? `${defaultModel.provider}:${defaultModel.model}` : 'empty'} +
+
), })) @@ -208,4 +222,26 @@ describe('Evaluation', () => { expect(screen.getByText('LLM 4')).toBeInTheDocument() expect(screen.getByRole('button', { name: 'evaluation.metrics.showLess' })).toBeInTheDocument() }) + + it('should render the pipeline-specific layout without auto-selecting a judge model', () => { + render() + + expect(screen.getByTestId('evaluation-model-selector')).toHaveTextContent('empty') + expect(screen.getByText('evaluation.history.title')).toBeInTheDocument() + expect(screen.getByText('Context Precision')).toBeInTheDocument() + expect(screen.getByText('Context Recall')).toBeInTheDocument() + expect(screen.getByText('Context Relevance')).toBeInTheDocument() + expect(screen.getByText('evaluation.results.empty')).toBeInTheDocument() + expect(screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' })).toBeDisabled() + }) + + it('should enable pipeline batch actions after selecting a judge model and metric', () => { + render() + + fireEvent.click(screen.getByRole('button', { name: 'select-model' })) + fireEvent.click(screen.getByRole('button', { name: /Context Precision/i })) + + expect(screen.getByRole('button', { name: 'evaluation.batch.downloadTemplate' })).toBeEnabled() + expect(screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' })).toBeEnabled() + }) }) diff --git a/web/app/components/evaluation/components/judge-model-selector.tsx b/web/app/components/evaluation/components/judge-model-selector.tsx index e0514012fe..8f9ee4aff6 100644 --- a/web/app/components/evaluation/components/judge-model-selector.tsx +++ b/web/app/components/evaluation/components/judge-model-selector.tsx @@ -8,17 +8,22 @@ import ModelSelector from '@/app/components/header/account-setting/model-provide import { useEvaluationResource, useEvaluationStore } from '../store' import { decodeModelSelection, encodeModelSelection } from '../utils' +type JudgeModelSelectorProps = EvaluationResourceProps & { + autoSelectFirst?: boolean +} + const JudgeModelSelector = ({ resourceType, resourceId, -}: EvaluationResourceProps) => { + autoSelectFirst = true, +}: JudgeModelSelectorProps) => { const { data: modelList } = useModelList(ModelTypeEnum.textGeneration) const resource = useEvaluationResource(resourceType, resourceId) const setJudgeModel = useEvaluationStore(state => state.setJudgeModel) const selectedModel = decodeModelSelection(resource.judgeModelId) useEffect(() => { - if (resource.judgeModelId || !modelList.length) + if (!autoSelectFirst || resource.judgeModelId || !modelList.length) return const firstProvider = modelList[0] @@ -27,7 +32,7 @@ const JudgeModelSelector = ({ return setJudgeModel(resourceType, resourceId, encodeModelSelection(firstProvider.provider, firstModel.model)) - }, [modelList, resource.judgeModelId, resourceId, resourceType, setJudgeModel]) + }, [autoSelectFirst, modelList, resource.judgeModelId, resourceId, resourceType, setJudgeModel]) return ( { + const { t } = useTranslation('evaluation') + const { t: tCommon } = useTranslation('common') + const docLink = useDocLink() + + return ( +
+
+
+ + {t('description')} + {' '} + + {tCommon('operation.learnMore')} + + + )} + descriptionClassName="max-w-[700px]" + /> +
+ +
+ +
+
+
+ +
+ +
+
+ +
+ +
+
+ ) +} + +export default NonPipelineEvaluation diff --git a/web/app/components/evaluation/components/pipeline-evaluation.tsx b/web/app/components/evaluation/components/pipeline-evaluation.tsx new file mode 100644 index 0000000000..4464b85a73 --- /dev/null +++ b/web/app/components/evaluation/components/pipeline-evaluation.tsx @@ -0,0 +1,346 @@ +'use client' + +import type { EvaluationResourceProps, MetricOption } from '../types' +import { useEffect, useMemo, useRef, useState } from 'react' +import { useTranslation } from 'react-i18next' +import Badge from '@/app/components/base/badge' +import Button from '@/app/components/base/button' +import Checkbox from '@/app/components/base/checkbox' +import Input from '@/app/components/base/input' +import { toast } from '@/app/components/base/ui/toast' +import { Tooltip, TooltipContent, TooltipTrigger } from '@/app/components/base/ui/tooltip' +import { useDocLink } from '@/context/i18n' +import { cn } from '@/utils/classnames' +import { getEvaluationMockConfig } from '../mock' +import { isEvaluationRunnable, useEvaluationResource, useEvaluationStore } from '../store' +import JudgeModelSelector from './judge-model-selector' +import SectionHeader, { InlineSectionHeader } from './section-header' + +type PipelineMetricItemProps = { + metric: MetricOption + selected: boolean + onToggle: () => void + disabledCondition: boolean +} + +const PipelineMetricItem = ({ + metric, + selected, + onToggle, + disabledCondition, +}: PipelineMetricItemProps) => { + return ( +
+ + + +
+ ) +} + +const PipelineHistoryTable = ({ + resourceType, + resourceId, +}: EvaluationResourceProps) => { + const { t } = useTranslation('evaluation') + const resource = useEvaluationResource(resourceType, resourceId) + const [query, setQuery] = useState('') + const statusLabels = { + running: t('batch.status.running'), + success: t('batch.status.success'), + failed: t('batch.status.failed'), + } + + const filteredRecords = useMemo(() => { + const keyword = query.trim().toLowerCase() + if (!keyword) + return resource.batchRecords + + return resource.batchRecords.filter(record => + record.fileName.toLowerCase().includes(keyword) + || record.summary.toLowerCase().includes(keyword), + ) + }, [query, resource.batchRecords]) + + return ( +
+
+
{t('history.title')}
+
+ setQuery(event.target.value)} + /> +
+
+ +
+
+
+
+ {t('history.columns.time')} +
+
{t('history.columns.creator')}
+
{t('history.columns.version')}
+
{t('history.columns.status')}
+
+
+ +
+ {filteredRecords.length > 0 && ( +
+ {filteredRecords.map(record => ( +
+
{record.startedAt}
+
{t('history.creatorYou')}
+
{t('history.latestVersion')}
+
+ + {record.status === 'running' + ? ( + + + ) + : statusLabels[record.status]} + +
+
+ +
+
+ ))} +
+ )} + + {filteredRecords.length === 0 && ( +
+
+ )} +
+
+
+
+ ) +} + +const PipelineResultsPanel = () => { + const { t } = useTranslation('evaluation') + + return ( +
+
+
+
+ ) +} + +const PipelineEvaluation = ({ + resourceType, + resourceId, +}: EvaluationResourceProps) => { + const { t } = useTranslation('evaluation') + const { t: tCommon } = useTranslation('common') + const docLink = useDocLink() + const ensureResource = useEvaluationStore(state => state.ensureResource) + const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric) + const removeMetric = useEvaluationStore(state => state.removeMetric) + const setUploadedFileName = useEvaluationStore(state => state.setUploadedFileName) + const runBatchTest = useEvaluationStore(state => state.runBatchTest) + const resource = useEvaluationResource(resourceType, resourceId) + const fileInputRef = useRef(null) + const config = getEvaluationMockConfig(resourceType) + const builtinMetricMap = useMemo(() => new Map( + resource.metrics + .filter(metric => metric.kind === 'builtin') + .map(metric => [metric.optionId, metric]), + ), [resource.metrics]) + const isConfigReady = !!resource.judgeModelId && builtinMetricMap.size > 0 + const isRunnable = isEvaluationRunnable(resource) + + useEffect(() => { + ensureResource(resourceType, resourceId) + }, [ensureResource, resourceId, resourceType]) + + const handleToggleMetric = (metricId: string) => { + const selectedMetric = builtinMetricMap.get(metricId) + if (selectedMetric) { + removeMetric(resourceType, resourceId, selectedMetric.id) + return + } + + addBuiltinMetric(resourceType, resourceId, metricId) + } + + const handleDownloadTemplate = () => { + const content = ['case_id,input,expected', '1,Example input,Example output'].join('\n') + const link = document.createElement('a') + link.href = `data:text/csv;charset=utf-8,${encodeURIComponent(content)}` + link.download = config.templateFileName + link.click() + } + + const handleUploadAndRun = () => { + if (!isRunnable) { + toast.warning(t('batch.validation')) + return + } + + fileInputRef.current?.click() + } + + return ( +
+
+
+ + {t('description')} + {' '} + + {tCommon('operation.learnMore')} + + + )} + /> +
+ +
+
+
+ +
+ +
+
+ +
+ +
+ {config.builtinMetrics.map(metric => ( + handleToggleMetric(metric.id)} + /> + ))} +
+
+ +
+ + +
+ + { + const file = event.target.files?.[0] + if (!file) + return + + setUploadedFileName(resourceType, resourceId, file.name) + runBatchTest(resourceType, resourceId) + event.target.value = '' + }} + /> +
+
+ +
+ + +
+ +
+ +
+
+ ) +} + +export default PipelineEvaluation diff --git a/web/app/components/evaluation/index.tsx b/web/app/components/evaluation/index.tsx index b031cd7afb..5806bf140a 100644 --- a/web/app/components/evaluation/index.tsx +++ b/web/app/components/evaluation/index.tsx @@ -2,67 +2,34 @@ import type { EvaluationResourceProps } from './types' import { useEffect } from 'react' -import { useTranslation } from 'react-i18next' -import { useDocLink } from '@/context/i18n' -import BatchTestPanel from './components/batch-test-panel' -import ConditionsSection from './components/conditions-section' -import JudgeModelSelector from './components/judge-model-selector' -import MetricSection from './components/metric-section' -import SectionHeader, { InlineSectionHeader } from './components/section-header' +import NonPipelineEvaluation from './components/non-pipeline-evaluation' +import PipelineEvaluation from './components/pipeline-evaluation' import { useEvaluationStore } from './store' const Evaluation = ({ resourceType, resourceId, }: EvaluationResourceProps) => { - const { t } = useTranslation('evaluation') - const { t: tCommon } = useTranslation('common') - const docLink = useDocLink() const ensureResource = useEvaluationStore(state => state.ensureResource) useEffect(() => { ensureResource(resourceType, resourceId) }, [ensureResource, resourceId, resourceType]) - return ( -
-
-
- - {t('description')} - {' '} - - {tCommon('operation.learnMore')} - - - )} - descriptionClassName="max-w-[700px]" - /> -
- -
- -
-
-
- -
- -
-
+ if (resourceType === 'pipeline') { + return ( + + ) + } -
- -
-
+ return ( + ) } diff --git a/web/app/components/evaluation/mock.ts b/web/app/components/evaluation/mock.ts index 598e5ee675..341711e567 100644 --- a/web/app/components/evaluation/mock.ts +++ b/web/app/components/evaluation/mock.ts @@ -69,6 +69,30 @@ const builtinMetrics: MetricOption[] = [ }, ] +const pipelineBuiltinMetrics: MetricOption[] = [ + { + id: 'context-precision', + label: 'Context Precision', + description: 'Measures whether retrieved chunks stay tightly aligned to the request.', + group: 'quality', + badges: ['Retrieval'], + }, + { + id: 'context-recall', + label: 'Context Recall', + description: 'Checks whether the retrieval result includes the evidence needed to answer.', + group: 'quality', + badges: ['Retrieval'], + }, + { + id: 'context-relevance', + label: 'Context Relevance', + description: 'Scores how useful the retrieved context is for downstream generation.', + group: 'quality', + badges: ['Retrieval'], + }, +] + const workflowOptions = [ { id: 'workflow-precision-review', @@ -139,7 +163,7 @@ export const getEvaluationMockConfig = (resourceType: EvaluationResourceType): E if (resourceType === 'pipeline') { return { judgeModels, - builtinMetrics, + builtinMetrics: pipelineBuiltinMetrics, workflowOptions, fieldOptions: pipelineFields, templateFileName: 'pipeline-evaluation-template.csv', diff --git a/web/i18n/en-US/evaluation.json b/web/i18n/en-US/evaluation.json index 2749ce3fc5..41dff238e3 100644 --- a/web/i18n/en-US/evaluation.json +++ b/web/i18n/en-US/evaluation.json @@ -47,6 +47,15 @@ "conditions.title": "Judgment Conditions", "conditions.valuePlaceholder": "Enter a value", "description": "Configure automated testing to grade your application's performance.", + "history.columns.creator": "Creator", + "history.columns.status": "Status", + "history.columns.time": "Time", + "history.columns.version": "Version", + "history.creatorYou": "You", + "history.empty": "No test history yet", + "history.latestVersion": "Latest", + "history.searchPlaceholder": "Search", + "history.title": "Test History", "judgeModel.description": "Choose the model used to score your evaluation results.", "judgeModel.title": "Judge Model", "metrics.add": "Add Metric", @@ -83,5 +92,7 @@ "metrics.showMore": "Show more", "metrics.title": "Metrics", "metrics.update": "Update", + "pipeline.uploadAndRun": "Upload & Run Test", + "results.empty": "No evaluation results yet.", "title": "Evaluation" }