feat(web): dataset evaluation layout

This commit is contained in:
JzoNg
2026-04-09 13:44:29 +08:00
parent fb91984fcb
commit 73d95245f8
7 changed files with 506 additions and 55 deletions

View File

@@ -16,9 +16,23 @@ vi.mock('@/app/components/header/account-setting/model-provider-page/hooks', ()
}))
vi.mock('@/app/components/header/account-setting/model-provider-page/model-selector', () => ({
default: ({ defaultModel }: { defaultModel?: { provider: string, model: string } }) => (
<div data-testid="evaluation-model-selector">
{defaultModel ? `${defaultModel.provider}:${defaultModel.model}` : 'empty'}
default: ({
defaultModel,
onSelect,
}: {
defaultModel?: { provider: string, model: string }
onSelect: (model: { provider: string, model: string }) => void
}) => (
<div>
<div data-testid="evaluation-model-selector">
{defaultModel ? `${defaultModel.provider}:${defaultModel.model}` : 'empty'}
</div>
<button
type="button"
onClick={() => onSelect({ provider: 'openai', model: 'gpt-4o-mini' })}
>
select-model
</button>
</div>
),
}))
@@ -208,4 +222,26 @@ describe('Evaluation', () => {
expect(screen.getByText('LLM 4')).toBeInTheDocument()
expect(screen.getByRole('button', { name: 'evaluation.metrics.showLess' })).toBeInTheDocument()
})
// Pipeline resources get a dedicated layout: the judge model must be chosen
// explicitly (no auto-selection), the test-history panel and retrieval-focused
// metrics are shown, and batch actions start disabled.
it('should render the pipeline-specific layout without auto-selecting a judge model', () => {
render(<Evaluation resourceType="pipeline" resourceId="dataset-1" />)
// Mocked selector renders 'empty' when no defaultModel is passed.
expect(screen.getByTestId('evaluation-model-selector')).toHaveTextContent('empty')
expect(screen.getByText('evaluation.history.title')).toBeInTheDocument()
expect(screen.getByText('Context Precision')).toBeInTheDocument()
expect(screen.getByText('Context Recall')).toBeInTheDocument()
expect(screen.getByText('Context Relevance')).toBeInTheDocument()
expect(screen.getByText('evaluation.results.empty')).toBeInTheDocument()
expect(screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' })).toBeDisabled()
})
// Selecting a judge model (via the mocked selector button) plus at least one
// metric is the readiness condition that enables the batch actions.
it('should enable pipeline batch actions after selecting a judge model and metric', () => {
render(<Evaluation resourceType="pipeline" resourceId="dataset-2" />)
fireEvent.click(screen.getByRole('button', { name: 'select-model' }))
fireEvent.click(screen.getByRole('button', { name: /Context Precision/i }))
expect(screen.getByRole('button', { name: 'evaluation.batch.downloadTemplate' })).toBeEnabled()
expect(screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' })).toBeEnabled()
})
})

View File

@@ -8,17 +8,22 @@ import ModelSelector from '@/app/components/header/account-setting/model-provide
import { useEvaluationResource, useEvaluationStore } from '../store'
import { decodeModelSelection, encodeModelSelection } from '../utils'
type JudgeModelSelectorProps = EvaluationResourceProps & {
autoSelectFirst?: boolean
}
const JudgeModelSelector = ({
resourceType,
resourceId,
}: EvaluationResourceProps) => {
autoSelectFirst = true,
}: JudgeModelSelectorProps) => {
const { data: modelList } = useModelList(ModelTypeEnum.textGeneration)
const resource = useEvaluationResource(resourceType, resourceId)
const setJudgeModel = useEvaluationStore(state => state.setJudgeModel)
const selectedModel = decodeModelSelection(resource.judgeModelId)
useEffect(() => {
if (resource.judgeModelId || !modelList.length)
if (!autoSelectFirst || resource.judgeModelId || !modelList.length)
return
const firstProvider = modelList[0]
@@ -27,7 +32,7 @@ const JudgeModelSelector = ({
return
setJudgeModel(resourceType, resourceId, encodeModelSelection(firstProvider.provider, firstModel.model))
}, [modelList, resource.judgeModelId, resourceId, resourceType, setJudgeModel])
}, [autoSelectFirst, modelList, resource.judgeModelId, resourceId, resourceType, setJudgeModel])
return (
<ModelSelector

View File

@@ -0,0 +1,62 @@
'use client'
import type { EvaluationResourceProps } from '../types'
import { useTranslation } from 'react-i18next'
import { useDocLink } from '@/context/i18n'
import BatchTestPanel from './batch-test-panel'
import ConditionsSection from './conditions-section'
import JudgeModelSelector from './judge-model-selector'
import MetricSection from './metric-section'
import SectionHeader, { InlineSectionHeader } from './section-header'
/**
 * Evaluation layout for non-pipeline resources: a scrollable configuration
 * column (judge model, metrics, judgment conditions) next to a batch-test
 * panel. Purely presentational — all state lives in the child sections.
 */
const NonPipelineEvaluation = ({
  resourceType,
  resourceId,
}: EvaluationResourceProps) => {
  const { t } = useTranslation('evaluation')
  const { t: tCommon } = useTranslation('common')
  const docLink = useDocLink()

  // Page description with an external "learn more" documentation link.
  const descriptionNode = (
    <>
      {t('description')}
      {' '}
      <a
        className="text-text-accent"
        href={docLink()}
        target="_blank"
        rel="noopener noreferrer"
      >
        {tCommon('operation.learnMore')}
      </a>
    </>
  )

  return (
    <div className="flex h-full min-h-0 flex-col bg-background-default xl:flex-row">
      {/* Configuration column (scrolls independently). */}
      <div className="min-h-0 flex-1 overflow-y-auto">
        <div className="flex min-h-full max-w-[748px] flex-col px-6 py-4">
          <SectionHeader
            title={t('title')}
            description={descriptionNode}
            descriptionClassName="max-w-[700px]"
          />
          <section className="max-w-[700px] py-4">
            <InlineSectionHeader title={t('judgeModel.title')} tooltip={t('judgeModel.description')} />
            <div className="mt-1.5">
              <JudgeModelSelector resourceType={resourceType} resourceId={resourceId} />
            </div>
          </section>
          <div className="max-w-[700px] border-b border-divider-subtle" />
          <MetricSection resourceType={resourceType} resourceId={resourceId} />
          <div className="max-w-[700px] border-b border-divider-subtle" />
          <ConditionsSection resourceType={resourceType} resourceId={resourceId} />
        </div>
      </div>
      {/* Batch test panel: fixed height when stacked, fixed width side-by-side. */}
      <div className="h-[420px] shrink-0 border-t border-divider-subtle xl:h-auto xl:w-[450px] xl:border-t-0 xl:border-l">
        <BatchTestPanel resourceType={resourceType} resourceId={resourceId} />
      </div>
    </div>
  )
}

export default NonPipelineEvaluation

View File

@@ -0,0 +1,346 @@
'use client'
import type { EvaluationResourceProps, MetricOption } from '../types'
import { useEffect, useMemo, useRef, useState } from 'react'
import { useTranslation } from 'react-i18next'
import Badge from '@/app/components/base/badge'
import Button from '@/app/components/base/button'
import Checkbox from '@/app/components/base/checkbox'
import Input from '@/app/components/base/input'
import { toast } from '@/app/components/base/ui/toast'
import { Tooltip, TooltipContent, TooltipTrigger } from '@/app/components/base/ui/tooltip'
import { useDocLink } from '@/context/i18n'
import { cn } from '@/utils/classnames'
import { getEvaluationMockConfig } from '../mock'
import { isEvaluationRunnable, useEvaluationResource, useEvaluationStore } from '../store'
import JudgeModelSelector from './judge-model-selector'
import SectionHeader, { InlineSectionHeader } from './section-header'
type PipelineMetricItemProps = {
metric: MetricOption
selected: boolean
onToggle: () => void
disabledCondition: boolean
}
// Single selectable built-in metric row: a checkbox toggle with a description
// tooltip on the left, and a per-metric "add condition" action on the right.
// The whole left side is one button so clicking the label toggles selection.
const PipelineMetricItem = ({
metric,
selected,
onToggle,
disabledCondition,
}: PipelineMetricItemProps) => {
return (
<div className="flex items-center justify-between gap-3 px-1 py-1">
<button
type="button"
className="flex min-w-0 items-center gap-2 text-left"
onClick={onToggle}
>
{/* Checkbox is display-only here; the wrapping button handles the click. */}
<Checkbox checked={selected} />
<span className="truncate system-sm-medium text-text-secondary">{metric.label}</span>
<Tooltip>
<TooltipTrigger
render={(
<span className="flex h-4 w-4 items-center justify-center text-text-quaternary">
<span aria-hidden="true" className="i-ri-question-line h-3.5 w-3.5" />
</span>
)}
/>
<TooltipContent>
{metric.description}
</TooltipContent>
</Tooltip>
</button>
{/* NOTE(review): "+ Condition" is a hardcoded English label while every
    sibling string goes through i18n — presumably this needs an
    'evaluation' namespace key; confirm and wire up t() when the
    condition flow lands. The button also has no onClick yet. */}
<button
type="button"
disabled={disabledCondition}
className={cn(
'system-xs-medium text-text-tertiary',
disabledCondition && 'cursor-not-allowed text-components-button-secondary-accent-text-disabled',
)}
>
+ Condition
</button>
</div>
)
}
// Batch-test history for a pipeline resource: a searchable table of
// resource.batchRecords with time / creator / version / status columns and a
// per-row actions button. Shows an empty state when there are no (matching)
// records.
const PipelineHistoryTable = ({
resourceType,
resourceId,
}: EvaluationResourceProps) => {
const { t } = useTranslation('evaluation')
const resource = useEvaluationResource(resourceType, resourceId)
// Free-text filter over file name and summary (see filteredRecords below).
const [query, setQuery] = useState('')
// Status → translated label; 'running' additionally gets a spinner in the cell.
const statusLabels = {
running: t('batch.status.running'),
success: t('batch.status.success'),
failed: t('batch.status.failed'),
}
// Case-insensitive substring match; an empty/whitespace query shows everything.
const filteredRecords = useMemo(() => {
const keyword = query.trim().toLowerCase()
if (!keyword)
return resource.batchRecords
return resource.batchRecords.filter(record =>
record.fileName.toLowerCase().includes(keyword)
|| record.summary.toLowerCase().includes(keyword),
)
}, [query, resource.batchRecords])
return (
<div className="flex min-h-0 flex-1 flex-col">
<div className="flex items-center justify-between gap-3 px-6 pt-4 pb-2">
<div className="system-xl-semibold text-text-primary">{t('history.title')}</div>
<div className="w-[160px] shrink-0 sm:w-[200px]">
<Input
value={query}
showLeftIcon
placeholder={t('history.searchPlaceholder')}
onChange={event => setQuery(event.target.value)}
/>
</div>
</div>
<div className="min-h-0 flex-1 px-4 pb-4">
<div className="flex h-full min-h-0 flex-col overflow-hidden rounded-lg border border-effects-highlight bg-background-default">
{/* Header row; the grid template must stay in sync with the body rows below. */}
<div className="grid grid-cols-[minmax(0,1.8fr)_80px_80px_80px_40px] rounded-t-lg bg-background-section px-2 py-1">
<div className="flex items-center gap-1 px-2 system-xs-medium-uppercase text-text-tertiary">
<span>{t('history.columns.time')}</span>
{/* NOTE(review): arrow icon is decorative — sorting is not implemented here. */}
<span aria-hidden="true" className="i-ri-arrow-down-line h-3 w-3" />
</div>
<div className="px-2 system-xs-medium-uppercase text-text-tertiary">{t('history.columns.creator')}</div>
<div className="px-2 system-xs-medium-uppercase text-text-tertiary">{t('history.columns.version')}</div>
<div className="px-2 text-center system-xs-medium-uppercase text-text-tertiary">{t('history.columns.status')}</div>
<div />
</div>
<div className="min-h-0 flex-1 overflow-y-auto">
{filteredRecords.length > 0 && (
<div className="divide-y divide-divider-subtle">
{filteredRecords.map(record => (
<div
key={record.id}
className="grid grid-cols-[minmax(0,1.8fr)_80px_80px_80px_40px] items-center px-2 py-2"
>
<div className="truncate px-2 system-sm-regular text-text-secondary">{record.startedAt}</div>
{/* NOTE(review): creator and version are placeholder labels, not record data. */}
<div className="truncate px-2 system-sm-regular text-text-secondary">{t('history.creatorYou')}</div>
<div className="truncate px-2 system-sm-regular text-text-secondary">{t('history.latestVersion')}</div>
<div className="flex justify-center px-2">
<Badge
className={cn(
record.status === 'failed' && 'badge-warning',
record.status === 'success' && 'badge-accent',
)}
>
{record.status === 'running'
? (
<span className="flex items-center gap-1">
<span aria-hidden="true" className="i-ri-loader-4-line h-3 w-3 animate-spin" />
{statusLabels.running}
</span>
)
: statusLabels[record.status]}
</Badge>
</div>
<div className="flex justify-center">
<button
type="button"
className="flex h-6 w-6 items-center justify-center rounded-md text-text-quaternary hover:bg-state-base-hover"
aria-label={record.summary}
>
<span aria-hidden="true" className="i-ri-more-2-line h-4 w-4" />
</button>
</div>
</div>
))}
</div>
)}
{/* Empty state: shown both for "no history yet" and "no search matches". */}
{filteredRecords.length === 0 && (
<div className="flex h-full min-h-[321px] flex-col items-center justify-center gap-2 px-4 text-center">
<span aria-hidden="true" className="i-ri-history-line h-5 w-5 text-text-quaternary" />
<div className="system-sm-medium text-text-quaternary">{t('history.empty')}</div>
</div>
)}
</div>
</div>
</div>
</div>
)
}
// Placeholder panel shown in the results area until a batch test has produced
// evaluation results.
const PipelineResultsPanel = () => {
  const { t } = useTranslation('evaluation')
  const emptyMessage = t('results.empty')

  return (
    <div className="flex min-h-[360px] flex-1 items-center justify-center xl:min-h-0">
      <div className="flex flex-col items-center gap-4 px-4 text-center">
        <span aria-hidden="true" className="i-ri-file-list-3-line h-12 w-12 text-text-quaternary" />
        <div className="system-md-medium text-text-quaternary">{emptyMessage}</div>
      </div>
    </div>
  )
}
// Pipeline-specific evaluation screen. Left column: config (judge model with
// auto-select disabled, built-in metric checklist, template download, upload &
// run) above the test-history table. Right column: results placeholder.
// Config state lives in the evaluation store, keyed by (resourceType, resourceId).
const PipelineEvaluation = ({
resourceType,
resourceId,
}: EvaluationResourceProps) => {
const { t } = useTranslation('evaluation')
const { t: tCommon } = useTranslation('common')
const docLink = useDocLink()
const ensureResource = useEvaluationStore(state => state.ensureResource)
const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric)
const removeMetric = useEvaluationStore(state => state.removeMetric)
const setUploadedFileName = useEvaluationStore(state => state.setUploadedFileName)
const runBatchTest = useEvaluationStore(state => state.runBatchTest)
const resource = useEvaluationResource(resourceType, resourceId)
// Hidden file input, triggered programmatically by the Upload & Run button.
const fileInputRef = useRef<HTMLInputElement>(null)
const config = getEvaluationMockConfig(resourceType)
// Selected built-in metrics keyed by optionId — presumably optionId matches
// the config metric ids used for lookup in handleToggleMetric; confirm in store.
const builtinMetricMap = useMemo(() => new Map(
resource.metrics
.filter(metric => metric.kind === 'builtin')
.map(metric => [metric.optionId, metric]),
), [resource.metrics])
// Batch actions unlock once a judge model and at least one metric are chosen.
const isConfigReady = !!resource.judgeModelId && builtinMetricMap.size > 0
const isRunnable = isEvaluationRunnable(resource)
useEffect(() => {
ensureResource(resourceType, resourceId)
}, [ensureResource, resourceId, resourceType])
// Checkbox toggle: remove the stored metric if already selected, add otherwise.
const handleToggleMetric = (metricId: string) => {
const selectedMetric = builtinMetricMap.get(metricId)
if (selectedMetric) {
removeMetric(resourceType, resourceId, selectedMetric.id)
return
}
addBuiltinMetric(resourceType, resourceId, metricId)
}
// Client-side CSV template download via a data: URL — no server round-trip.
const handleDownloadTemplate = () => {
const content = ['case_id,input,expected', '1,Example input,Example output'].join('\n')
const link = document.createElement('a')
link.href = `data:text/csv;charset=utf-8,${encodeURIComponent(content)}`
link.download = config.templateFileName
link.click()
}
// Validate runnability before opening the file picker; warn via toast if not.
const handleUploadAndRun = () => {
if (!isRunnable) {
toast.warning(t('batch.validation'))
return
}
fileInputRef.current?.click()
}
return (
<div className="flex h-full min-h-0 flex-col bg-background-default xl:flex-row">
{/* Left column: configuration + history. */}
<div className="flex min-h-0 flex-col border-b border-divider-subtle bg-background-default xl:w-[450px] xl:shrink-0 xl:border-r xl:border-b-0">
<div className="px-6 pt-4 pb-2">
<SectionHeader
title={t('title')}
description={(
<>
{t('description')}
{' '}
<a
className="text-text-accent"
href={docLink()}
target="_blank"
rel="noopener noreferrer"
>
{tCommon('operation.learnMore')}
</a>
</>
)}
/>
</div>
<div className="px-6 pt-3 pb-4">
<div className="space-y-3">
<section>
<InlineSectionHeader title={t('judgeModel.title')} tooltip={t('judgeModel.description')} />
<div className="mt-1">
{/* autoSelectFirst={false}: pipelines require an explicit judge-model choice. */}
<JudgeModelSelector
resourceType={resourceType}
resourceId={resourceId}
autoSelectFirst={false}
/>
</div>
</section>
<section>
<InlineSectionHeader title={t('metrics.title')} tooltip={t('metrics.description')} />
<div className="mt-1 space-y-0.5">
{config.builtinMetrics.map(metric => (
<PipelineMetricItem
key={metric.id}
metric={metric}
selected={builtinMetricMap.has(metric.id)}
disabledCondition
onToggle={() => handleToggleMetric(metric.id)}
/>
))}
</div>
</section>
<div className="flex gap-2 pt-2">
<Button
className="flex-1 justify-center"
variant="secondary"
disabled={!isConfigReady}
onClick={handleDownloadTemplate}
>
<span aria-hidden="true" className="mr-1 i-ri-file-excel-2-line h-4 w-4" />
{t('batch.downloadTemplate')}
</Button>
<Button
className="flex-1 justify-center"
variant="primary"
disabled={!isConfigReady}
onClick={handleUploadAndRun}
>
{t('pipeline.uploadAndRun')}
</Button>
</div>
<input
ref={fileInputRef}
hidden
type="file"
accept=".csv,.xlsx"
onChange={(event) => {
const file = event.target.files?.[0]
if (!file)
return
setUploadedFileName(resourceType, resourceId, file.name)
runBatchTest(resourceType, resourceId)
// Reset so picking the same file again still fires onChange.
event.target.value = ''
}}
/>
</div>
</div>
<div className="border-t border-divider-subtle" />
<PipelineHistoryTable
resourceType={resourceType}
resourceId={resourceId}
/>
</div>
{/* Right column: results area (placeholder until results exist). */}
<div className="min-h-0 flex-1 bg-background-default">
<PipelineResultsPanel />
</div>
</div>
)
}
export default PipelineEvaluation

View File

@@ -2,67 +2,34 @@
import type { EvaluationResourceProps } from './types'
import { useEffect } from 'react'
import { useTranslation } from 'react-i18next'
import { useDocLink } from '@/context/i18n'
import BatchTestPanel from './components/batch-test-panel'
import ConditionsSection from './components/conditions-section'
import JudgeModelSelector from './components/judge-model-selector'
import MetricSection from './components/metric-section'
import SectionHeader, { InlineSectionHeader } from './components/section-header'
import NonPipelineEvaluation from './components/non-pipeline-evaluation'
import PipelineEvaluation from './components/pipeline-evaluation'
import { useEvaluationStore } from './store'
const Evaluation = ({
resourceType,
resourceId,
}: EvaluationResourceProps) => {
const { t } = useTranslation('evaluation')
const { t: tCommon } = useTranslation('common')
const docLink = useDocLink()
const ensureResource = useEvaluationStore(state => state.ensureResource)
useEffect(() => {
ensureResource(resourceType, resourceId)
}, [ensureResource, resourceId, resourceType])
return (
<div className="flex h-full min-h-0 flex-col bg-background-default xl:flex-row">
<div className="min-h-0 flex-1 overflow-y-auto">
<div className="flex min-h-full max-w-[748px] flex-col px-6 py-4">
<SectionHeader
title={t('title')}
description={(
<>
{t('description')}
{' '}
<a
className="text-text-accent"
href={docLink()}
target="_blank"
rel="noopener noreferrer"
>
{tCommon('operation.learnMore')}
</a>
</>
)}
descriptionClassName="max-w-[700px]"
/>
<section className="max-w-[700px] py-4">
<InlineSectionHeader title={t('judgeModel.title')} tooltip={t('judgeModel.description')} />
<div className="mt-1.5">
<JudgeModelSelector resourceType={resourceType} resourceId={resourceId} />
</div>
</section>
<div className="max-w-[700px] border-b border-divider-subtle" />
<MetricSection resourceType={resourceType} resourceId={resourceId} />
<div className="max-w-[700px] border-b border-divider-subtle" />
<ConditionsSection resourceType={resourceType} resourceId={resourceId} />
</div>
</div>
if (resourceType === 'pipeline') {
return (
<PipelineEvaluation
resourceType={resourceType}
resourceId={resourceId}
/>
)
}
<div className="h-[420px] shrink-0 border-t border-divider-subtle xl:h-auto xl:w-[450px] xl:border-t-0 xl:border-l">
<BatchTestPanel resourceType={resourceType} resourceId={resourceId} />
</div>
</div>
return (
<NonPipelineEvaluation
resourceType={resourceType}
resourceId={resourceId}
/>
)
}

View File

@@ -69,6 +69,30 @@ const builtinMetrics: MetricOption[] = [
},
]
// Built-in metric catalogue for pipeline resources: retrieval-quality checks
// (precision / recall / relevance of retrieved context). Ids should stay
// stable — metric selections are looked up by these ids elsewhere.
const pipelineBuiltinMetrics: MetricOption[] = [
{
id: 'context-precision',
label: 'Context Precision',
description: 'Measures whether retrieved chunks stay tightly aligned to the request.',
group: 'quality',
badges: ['Retrieval'],
},
{
id: 'context-recall',
label: 'Context Recall',
description: 'Checks whether the retrieval result includes the evidence needed to answer.',
group: 'quality',
badges: ['Retrieval'],
},
{
id: 'context-relevance',
label: 'Context Relevance',
description: 'Scores how useful the retrieved context is for downstream generation.',
group: 'quality',
badges: ['Retrieval'],
},
]
const workflowOptions = [
{
id: 'workflow-precision-review',
@@ -139,7 +163,7 @@ export const getEvaluationMockConfig = (resourceType: EvaluationResourceType): E
if (resourceType === 'pipeline') {
return {
judgeModels,
builtinMetrics,
builtinMetrics: pipelineBuiltinMetrics,
workflowOptions,
fieldOptions: pipelineFields,
templateFileName: 'pipeline-evaluation-template.csv',

View File

@@ -47,6 +47,15 @@
"conditions.title": "Judgment Conditions",
"conditions.valuePlaceholder": "Enter a value",
"description": "Configure automated testing to grade your application's performance.",
"history.columns.creator": "Creator",
"history.columns.status": "Status",
"history.columns.time": "Time",
"history.columns.version": "Version",
"history.creatorYou": "You",
"history.empty": "No test history yet",
"history.latestVersion": "Latest",
"history.searchPlaceholder": "Search",
"history.title": "Test History",
"judgeModel.description": "Choose the model used to score your evaluation results.",
"judgeModel.title": "Judge Model",
"metrics.add": "Add Metric",
@@ -83,5 +92,7 @@
"metrics.showMore": "Show more",
"metrics.title": "Metrics",
"metrics.update": "Update",
"pipeline.uploadAndRun": "Upload & Run Test",
"results.empty": "No evaluation results yet.",
"title": "Evaluation"
}