diff --git a/web/app/components/evaluation/__tests__/index.spec.tsx b/web/app/components/evaluation/__tests__/index.spec.tsx
index 8666c8b96d..748ce5981f 100644
--- a/web/app/components/evaluation/__tests__/index.spec.tsx
+++ b/web/app/components/evaluation/__tests__/index.spec.tsx
@@ -16,9 +16,23 @@ vi.mock('@/app/components/header/account-setting/model-provider-page/hooks', ()
}))
vi.mock('@/app/components/header/account-setting/model-provider-page/model-selector', () => ({
-  default: ({ defaultModel }: { defaultModel?: { provider: string, model: string } }) => (
-    <div data-testid="evaluation-model-selector">
-      {defaultModel ? `${defaultModel.provider}:${defaultModel.model}` : 'empty'}
+  default: ({
+    defaultModel,
+    onSelect,
+  }: {
+    defaultModel?: { provider: string, model: string }
+    onSelect: (model: { provider: string, model: string }) => void
+  }) => (
+    <div data-testid="evaluation-model-selector">
+      <button
+        type="button"
+        onClick={() => onSelect({ provider: 'mock-provider', model: 'mock-model' })}
+      >
+        select-model
+      </button>
+      {defaultModel ? `${defaultModel.provider}:${defaultModel.model}` : 'empty'}
+    </div>
   ),
}))
@@ -208,4 +222,26 @@ describe('Evaluation', () => {
expect(screen.getByText('LLM 4')).toBeInTheDocument()
expect(screen.getByRole('button', { name: 'evaluation.metrics.showLess' })).toBeInTheDocument()
})
+
+ it('should render the pipeline-specific layout without auto-selecting a judge model', () => {
+    render(<Evaluation resourceType="pipeline" resourceId="pipeline-1" />)
+
+ expect(screen.getByTestId('evaluation-model-selector')).toHaveTextContent('empty')
+ expect(screen.getByText('evaluation.history.title')).toBeInTheDocument()
+ expect(screen.getByText('Context Precision')).toBeInTheDocument()
+ expect(screen.getByText('Context Recall')).toBeInTheDocument()
+ expect(screen.getByText('Context Relevance')).toBeInTheDocument()
+ expect(screen.getByText('evaluation.results.empty')).toBeInTheDocument()
+ expect(screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' })).toBeDisabled()
+ })
+
+ it('should enable pipeline batch actions after selecting a judge model and metric', () => {
+    render(<Evaluation resourceType="pipeline" resourceId="pipeline-1" />)
+
+ fireEvent.click(screen.getByRole('button', { name: 'select-model' }))
+ fireEvent.click(screen.getByRole('button', { name: /Context Precision/i }))
+
+ expect(screen.getByRole('button', { name: 'evaluation.batch.downloadTemplate' })).toBeEnabled()
+ expect(screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' })).toBeEnabled()
+ })
})
diff --git a/web/app/components/evaluation/components/judge-model-selector.tsx b/web/app/components/evaluation/components/judge-model-selector.tsx
index e0514012fe..8f9ee4aff6 100644
--- a/web/app/components/evaluation/components/judge-model-selector.tsx
+++ b/web/app/components/evaluation/components/judge-model-selector.tsx
@@ -8,17 +8,22 @@ import ModelSelector from '@/app/components/header/account-setting/model-provide
import { useEvaluationResource, useEvaluationStore } from '../store'
import { decodeModelSelection, encodeModelSelection } from '../utils'
+type JudgeModelSelectorProps = EvaluationResourceProps & {
+ autoSelectFirst?: boolean
+}
+
const JudgeModelSelector = ({
resourceType,
resourceId,
-}: EvaluationResourceProps) => {
+ autoSelectFirst = true,
+}: JudgeModelSelectorProps) => {
const { data: modelList } = useModelList(ModelTypeEnum.textGeneration)
const resource = useEvaluationResource(resourceType, resourceId)
const setJudgeModel = useEvaluationStore(state => state.setJudgeModel)
const selectedModel = decodeModelSelection(resource.judgeModelId)
useEffect(() => {
- if (resource.judgeModelId || !modelList.length)
+ if (!autoSelectFirst || resource.judgeModelId || !modelList.length)
return
const firstProvider = modelList[0]
@@ -27,7 +32,7 @@ const JudgeModelSelector = ({
return
setJudgeModel(resourceType, resourceId, encodeModelSelection(firstProvider.provider, firstModel.model))
- }, [modelList, resource.judgeModelId, resourceId, resourceType, setJudgeModel])
+ }, [autoSelectFirst, modelList, resource.judgeModelId, resourceId, resourceType, setJudgeModel])
return (
{
+ const { t } = useTranslation('evaluation')
+ const { t: tCommon } = useTranslation('common')
+ const docLink = useDocLink()
+
+ return (
+
+ )
+}
+
+export default NonPipelineEvaluation
diff --git a/web/app/components/evaluation/components/pipeline-evaluation.tsx b/web/app/components/evaluation/components/pipeline-evaluation.tsx
new file mode 100644
index 0000000000..4464b85a73
--- /dev/null
+++ b/web/app/components/evaluation/components/pipeline-evaluation.tsx
@@ -0,0 +1,346 @@
+'use client'
+
+import type { EvaluationResourceProps, MetricOption } from '../types'
+import { useEffect, useMemo, useRef, useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import Badge from '@/app/components/base/badge'
+import Button from '@/app/components/base/button'
+import Checkbox from '@/app/components/base/checkbox'
+import Input from '@/app/components/base/input'
+import { toast } from '@/app/components/base/ui/toast'
+import { Tooltip, TooltipContent, TooltipTrigger } from '@/app/components/base/ui/tooltip'
+import { useDocLink } from '@/context/i18n'
+import { cn } from '@/utils/classnames'
+import { getEvaluationMockConfig } from '../mock'
+import { isEvaluationRunnable, useEvaluationResource, useEvaluationStore } from '../store'
+import JudgeModelSelector from './judge-model-selector'
+import SectionHeader, { InlineSectionHeader } from './section-header'
+
+type PipelineMetricItemProps = {
+ metric: MetricOption
+ selected: boolean
+ onToggle: () => void
+ disabledCondition: boolean
+}
+
+const PipelineMetricItem = ({
+ metric,
+ selected,
+ onToggle,
+ disabledCondition,
+}: PipelineMetricItemProps) => {
+ return (
+
+
+
+
+
+ )
+}
+
+const PipelineHistoryTable = ({
+ resourceType,
+ resourceId,
+}: EvaluationResourceProps) => {
+ const { t } = useTranslation('evaluation')
+ const resource = useEvaluationResource(resourceType, resourceId)
+ const [query, setQuery] = useState('')
+ const statusLabels = {
+ running: t('batch.status.running'),
+ success: t('batch.status.success'),
+ failed: t('batch.status.failed'),
+ }
+
+ const filteredRecords = useMemo(() => {
+ const keyword = query.trim().toLowerCase()
+ if (!keyword)
+ return resource.batchRecords
+
+ return resource.batchRecords.filter(record =>
+ record.fileName.toLowerCase().includes(keyword)
+ || record.summary.toLowerCase().includes(keyword),
+ )
+ }, [query, resource.batchRecords])
+
+ return (
+
+
+
{t('history.title')}
+
+ setQuery(event.target.value)}
+ />
+
+
+
+
+
+
+
+ {t('history.columns.time')}
+
+
+
{t('history.columns.creator')}
+
{t('history.columns.version')}
+
{t('history.columns.status')}
+
+
+
+
+ {filteredRecords.length > 0 && (
+
+ {filteredRecords.map(record => (
+
+
{record.startedAt}
+
{t('history.creatorYou')}
+
{t('history.latestVersion')}
+
+
+ {record.status === 'running'
+ ? (
+
+
+ {statusLabels.running}
+
+ )
+ : statusLabels[record.status]}
+
+
+
+
+
+
+ ))}
+
+ )}
+
+ {filteredRecords.length === 0 && (
+
+
+
{t('history.empty')}
+
+ )}
+
+
+
+
+ )
+}
+
+const PipelineResultsPanel = () => {
+ const { t } = useTranslation('evaluation')
+
+ return (
+
+
+
+
{t('results.empty')}
+
+
+ )
+}
+
+const PipelineEvaluation = ({
+ resourceType,
+ resourceId,
+}: EvaluationResourceProps) => {
+ const { t } = useTranslation('evaluation')
+ const { t: tCommon } = useTranslation('common')
+ const docLink = useDocLink()
+ const ensureResource = useEvaluationStore(state => state.ensureResource)
+ const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric)
+ const removeMetric = useEvaluationStore(state => state.removeMetric)
+ const setUploadedFileName = useEvaluationStore(state => state.setUploadedFileName)
+ const runBatchTest = useEvaluationStore(state => state.runBatchTest)
+ const resource = useEvaluationResource(resourceType, resourceId)
+ const fileInputRef = useRef(null)
+ const config = getEvaluationMockConfig(resourceType)
+ const builtinMetricMap = useMemo(() => new Map(
+ resource.metrics
+ .filter(metric => metric.kind === 'builtin')
+ .map(metric => [metric.optionId, metric]),
+ ), [resource.metrics])
+ const isConfigReady = !!resource.judgeModelId && builtinMetricMap.size > 0
+ const isRunnable = isEvaluationRunnable(resource)
+
+ useEffect(() => {
+ ensureResource(resourceType, resourceId)
+ }, [ensureResource, resourceId, resourceType])
+
+ const handleToggleMetric = (metricId: string) => {
+ const selectedMetric = builtinMetricMap.get(metricId)
+ if (selectedMetric) {
+ removeMetric(resourceType, resourceId, selectedMetric.id)
+ return
+ }
+
+ addBuiltinMetric(resourceType, resourceId, metricId)
+ }
+
+ const handleDownloadTemplate = () => {
+ const content = ['case_id,input,expected', '1,Example input,Example output'].join('\n')
+ const link = document.createElement('a')
+ link.href = `data:text/csv;charset=utf-8,${encodeURIComponent(content)}`
+ link.download = config.templateFileName
+ link.click()
+ }
+
+ const handleUploadAndRun = () => {
+ if (!isRunnable) {
+ toast.warning(t('batch.validation'))
+ return
+ }
+
+ fileInputRef.current?.click()
+ }
+
+ return (
+
+
+
+
+
+
+
+
+
+
+
+ {config.builtinMetrics.map(metric => (
+
handleToggleMetric(metric.id)}
+ />
+ ))}
+
+
+
+
+
+
+
+
+
{
+ const file = event.target.files?.[0]
+ if (!file)
+ return
+
+ setUploadedFileName(resourceType, resourceId, file.name)
+ runBatchTest(resourceType, resourceId)
+ event.target.value = ''
+ }}
+ />
+
+
+
+
+
+
+
+
+
+
+ )
+}
+
+export default PipelineEvaluation
diff --git a/web/app/components/evaluation/index.tsx b/web/app/components/evaluation/index.tsx
index b031cd7afb..5806bf140a 100644
--- a/web/app/components/evaluation/index.tsx
+++ b/web/app/components/evaluation/index.tsx
@@ -2,67 +2,34 @@
import type { EvaluationResourceProps } from './types'
import { useEffect } from 'react'
-import { useTranslation } from 'react-i18next'
-import { useDocLink } from '@/context/i18n'
-import BatchTestPanel from './components/batch-test-panel'
-import ConditionsSection from './components/conditions-section'
-import JudgeModelSelector from './components/judge-model-selector'
-import MetricSection from './components/metric-section'
-import SectionHeader, { InlineSectionHeader } from './components/section-header'
+import NonPipelineEvaluation from './components/non-pipeline-evaluation'
+import PipelineEvaluation from './components/pipeline-evaluation'
import { useEvaluationStore } from './store'
const Evaluation = ({
resourceType,
resourceId,
}: EvaluationResourceProps) => {
- const { t } = useTranslation('evaluation')
- const { t: tCommon } = useTranslation('common')
- const docLink = useDocLink()
const ensureResource = useEvaluationStore(state => state.ensureResource)
useEffect(() => {
ensureResource(resourceType, resourceId)
}, [ensureResource, resourceId, resourceType])
- return (
-
-
+ if (resourceType === 'pipeline') {
+ return (
+
+ )
+ }
-
-
-
-
+ return (
+
)
}
diff --git a/web/app/components/evaluation/mock.ts b/web/app/components/evaluation/mock.ts
index 598e5ee675..341711e567 100644
--- a/web/app/components/evaluation/mock.ts
+++ b/web/app/components/evaluation/mock.ts
@@ -69,6 +69,30 @@ const builtinMetrics: MetricOption[] = [
},
]
+const pipelineBuiltinMetrics: MetricOption[] = [
+ {
+ id: 'context-precision',
+ label: 'Context Precision',
+ description: 'Measures whether retrieved chunks stay tightly aligned to the request.',
+ group: 'quality',
+ badges: ['Retrieval'],
+ },
+ {
+ id: 'context-recall',
+ label: 'Context Recall',
+ description: 'Checks whether the retrieval result includes the evidence needed to answer.',
+ group: 'quality',
+ badges: ['Retrieval'],
+ },
+ {
+ id: 'context-relevance',
+ label: 'Context Relevance',
+ description: 'Scores how useful the retrieved context is for downstream generation.',
+ group: 'quality',
+ badges: ['Retrieval'],
+ },
+]
+
const workflowOptions = [
{
id: 'workflow-precision-review',
@@ -139,7 +163,7 @@ export const getEvaluationMockConfig = (resourceType: EvaluationResourceType): E
if (resourceType === 'pipeline') {
return {
judgeModels,
- builtinMetrics,
+ builtinMetrics: pipelineBuiltinMetrics,
workflowOptions,
fieldOptions: pipelineFields,
templateFileName: 'pipeline-evaluation-template.csv',
diff --git a/web/i18n/en-US/evaluation.json b/web/i18n/en-US/evaluation.json
index 2749ce3fc5..41dff238e3 100644
--- a/web/i18n/en-US/evaluation.json
+++ b/web/i18n/en-US/evaluation.json
@@ -47,6 +47,15 @@
"conditions.title": "Judgment Conditions",
"conditions.valuePlaceholder": "Enter a value",
"description": "Configure automated testing to grade your application's performance.",
+ "history.columns.creator": "Creator",
+ "history.columns.status": "Status",
+ "history.columns.time": "Time",
+ "history.columns.version": "Version",
+ "history.creatorYou": "You",
+ "history.empty": "No test history yet",
+ "history.latestVersion": "Latest",
+ "history.searchPlaceholder": "Search",
+ "history.title": "Test History",
"judgeModel.description": "Choose the model used to score your evaluation results.",
"judgeModel.title": "Judge Model",
"metrics.add": "Add Metric",
@@ -83,5 +92,7 @@
"metrics.showMore": "Show more",
"metrics.title": "Metrics",
"metrics.update": "Update",
+ "pipeline.uploadAndRun": "Upload & Run Test",
+ "results.empty": "No evaluation results yet.",
"title": "Evaluation"
}