mirror of
https://github.com/langgenius/dify.git
synced 2026-04-12 09:00:19 -04:00
feat(web): dataset evaluation configure
This commit is contained in:
@@ -54,7 +54,7 @@ describe('Evaluation', () => {
|
||||
|
||||
mockUseAvailableEvaluationMetrics.mockReturnValue({
|
||||
data: {
|
||||
metrics: ['answer-correctness', 'faithfulness'],
|
||||
metrics: ['answer-correctness', 'faithfulness', 'context-precision', 'context-recall', 'context-relevance'],
|
||||
},
|
||||
isLoading: false,
|
||||
})
|
||||
@@ -240,12 +240,34 @@ describe('Evaluation', () => {
|
||||
expect(screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' })).toBeDisabled()
|
||||
})
|
||||
|
||||
it('should render selected pipeline metrics from config with the default threshold input', () => {
|
||||
mockUseEvaluationConfig.mockReturnValue({
|
||||
data: {
|
||||
evaluation_model: null,
|
||||
evaluation_model_provider: null,
|
||||
metrics_config: {
|
||||
default_metrics: [{
|
||||
metric: 'context-precision',
|
||||
}],
|
||||
customized_metrics: null,
|
||||
},
|
||||
judgement_conditions: null,
|
||||
},
|
||||
})
|
||||
|
||||
render(<Evaluation resourceType="datasets" resourceId="dataset-2" />)
|
||||
|
||||
expect(screen.getByText('Context Precision')).toBeInTheDocument()
|
||||
expect(screen.getByDisplayValue('0.85')).toBeInTheDocument()
|
||||
})
|
||||
|
||||
it('should enable pipeline batch actions after selecting a judge model and metric', () => {
|
||||
render(<Evaluation resourceType="datasets" resourceId="dataset-2" />)
|
||||
|
||||
fireEvent.click(screen.getByRole('button', { name: 'select-model' }))
|
||||
fireEvent.click(screen.getByRole('button', { name: /Context Precision/i }))
|
||||
|
||||
expect(screen.getByDisplayValue('0.85')).toBeInTheDocument()
|
||||
expect(screen.getByRole('button', { name: 'evaluation.batch.downloadTemplate' })).toBeEnabled()
|
||||
expect(screen.getByRole('button', { name: 'evaluation.pipeline.uploadAndRun' })).toBeEnabled()
|
||||
})
|
||||
|
||||
@@ -181,6 +181,7 @@ describe('evaluation store', () => {
|
||||
expect(hydratedState.judgeModelId).toBe('openai::gpt-4o-mini')
|
||||
expect(hydratedState.metrics).toHaveLength(2)
|
||||
expect(hydratedState.metrics[0].optionId).toBe('faithfulness')
|
||||
expect(hydratedState.metrics[0].threshold).toBe(0.85)
|
||||
expect(hydratedState.metrics[0].nodeInfoList).toEqual([
|
||||
{ node_id: 'node-1', title: 'Retriever', type: 'retriever' },
|
||||
])
|
||||
|
||||
@@ -6,6 +6,7 @@ import { useTranslation } from 'react-i18next'
|
||||
import Button from '@/app/components/base/button'
|
||||
import { toast } from '@/app/components/base/ui/toast'
|
||||
import { useDocLink } from '@/context/i18n'
|
||||
import { useAvailableEvaluationMetrics } from '@/service/use-evaluation'
|
||||
import { getEvaluationMockConfig } from '../../mock'
|
||||
import { isEvaluationRunnable, useEvaluationResource, useEvaluationStore } from '../../store'
|
||||
import JudgeModelSelector from '../judge-model-selector'
|
||||
@@ -24,8 +25,10 @@ const PipelineEvaluation = ({
|
||||
const ensureResource = useEvaluationStore(state => state.ensureResource)
|
||||
const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric)
|
||||
const removeMetric = useEvaluationStore(state => state.removeMetric)
|
||||
const updateMetricThreshold = useEvaluationStore(state => state.updateMetricThreshold)
|
||||
const setUploadedFileName = useEvaluationStore(state => state.setUploadedFileName)
|
||||
const runBatchTest = useEvaluationStore(state => state.runBatchTest)
|
||||
const { data: availableMetricsData } = useAvailableEvaluationMetrics()
|
||||
const resource = useEvaluationResource(resourceType, resourceId)
|
||||
const fileInputRef = useRef<HTMLInputElement>(null)
|
||||
const config = getEvaluationMockConfig(resourceType)
|
||||
@@ -34,6 +37,12 @@ const PipelineEvaluation = ({
|
||||
.filter(metric => metric.kind === 'builtin')
|
||||
.map(metric => [metric.optionId, metric]),
|
||||
), [resource.metrics])
|
||||
const availableMetricIds = useMemo(() => new Set(availableMetricsData?.metrics ?? []), [availableMetricsData?.metrics])
|
||||
const availableBuiltinMetrics = useMemo(() => {
|
||||
return config.builtinMetrics.filter(metric =>
|
||||
availableMetricIds.has(metric.id) || builtinMetricMap.has(metric.id),
|
||||
)
|
||||
}, [availableMetricIds, builtinMetricMap, config.builtinMetrics])
|
||||
const isConfigReady = !!resource.judgeModelId && builtinMetricMap.size > 0
|
||||
const isRunnable = isEvaluationRunnable(resource)
|
||||
|
||||
@@ -107,15 +116,21 @@ const PipelineEvaluation = ({
|
||||
<section>
|
||||
<InlineSectionHeader title={t('metrics.title')} tooltip={t('metrics.description')} />
|
||||
<div className="mt-1 space-y-0.5">
|
||||
{config.builtinMetrics.map(metric => (
|
||||
<PipelineMetricItem
|
||||
key={metric.id}
|
||||
metric={metric}
|
||||
selected={builtinMetricMap.has(metric.id)}
|
||||
disabledCondition
|
||||
onToggle={() => handleToggleMetric(metric.id)}
|
||||
/>
|
||||
))}
|
||||
{availableBuiltinMetrics.map((metric) => {
|
||||
const selectedMetric = builtinMetricMap.get(metric.id)
|
||||
|
||||
return (
|
||||
<PipelineMetricItem
|
||||
key={metric.id}
|
||||
metric={metric}
|
||||
selected={!!selectedMetric}
|
||||
threshold={selectedMetric?.threshold}
|
||||
disabledCondition
|
||||
onToggle={() => handleToggleMetric(metric.id)}
|
||||
onThresholdChange={value => updateMetricThreshold(resourceType, resourceId, selectedMetric?.id ?? '', value)}
|
||||
/>
|
||||
)
|
||||
})}
|
||||
</div>
|
||||
</section>
|
||||
|
||||
|
||||
@@ -1,15 +1,20 @@
|
||||
'use client'
|
||||
|
||||
import type { MetricOption } from '../../types'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import Checkbox from '@/app/components/base/checkbox'
|
||||
import Input from '@/app/components/base/input'
|
||||
import { Tooltip, TooltipContent, TooltipTrigger } from '@/app/components/base/ui/tooltip'
|
||||
import { cn } from '@/utils/classnames'
|
||||
import { DEFAULT_PIPELINE_METRIC_THRESHOLD } from '../../store-utils'
|
||||
|
||||
type PipelineMetricItemProps = {
|
||||
metric: MetricOption
|
||||
selected: boolean
|
||||
onToggle: () => void
|
||||
disabledCondition: boolean
|
||||
threshold?: number
|
||||
onThresholdChange: (value: number) => void
|
||||
}
|
||||
|
||||
const PipelineMetricItem = ({
|
||||
@@ -17,7 +22,11 @@ const PipelineMetricItem = ({
|
||||
selected,
|
||||
onToggle,
|
||||
disabledCondition,
|
||||
threshold = DEFAULT_PIPELINE_METRIC_THRESHOLD,
|
||||
onThresholdChange,
|
||||
}: PipelineMetricItemProps) => {
|
||||
const { t } = useTranslation('evaluation')
|
||||
|
||||
return (
|
||||
<div className="flex items-center justify-between gap-3 px-1 py-1">
|
||||
<button
|
||||
@@ -41,16 +50,38 @@ const PipelineMetricItem = ({
|
||||
</Tooltip>
|
||||
</button>
|
||||
|
||||
<button
|
||||
type="button"
|
||||
disabled={disabledCondition}
|
||||
className={cn(
|
||||
'system-xs-medium text-text-tertiary',
|
||||
disabledCondition && 'cursor-not-allowed text-components-button-secondary-accent-text-disabled',
|
||||
)}
|
||||
>
|
||||
+ Condition
|
||||
</button>
|
||||
{selected
|
||||
? (
|
||||
<div className="flex items-center gap-2">
|
||||
<span className="system-xs-medium text-text-accent">{t('pipeline.passIf')}</span>
|
||||
<div className="w-[52px]">
|
||||
<Input
|
||||
value={String(threshold)}
|
||||
type="number"
|
||||
min={0}
|
||||
max={1}
|
||||
step={0.01}
|
||||
onChange={(event) => {
|
||||
const parsedValue = Number(event.target.value)
|
||||
if (!Number.isNaN(parsedValue))
|
||||
onThresholdChange(parsedValue)
|
||||
}}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
: (
|
||||
<button
|
||||
type="button"
|
||||
disabled={disabledCondition}
|
||||
className={cn(
|
||||
'system-xs-medium text-text-tertiary',
|
||||
disabledCondition && 'cursor-not-allowed text-components-button-secondary-accent-text-disabled',
|
||||
)}
|
||||
>
|
||||
+ Condition
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
@@ -24,6 +24,8 @@ import { encodeModelSelection } from './utils'
|
||||
|
||||
type EvaluationStoreResources = Record<string, EvaluationResourceState>
|
||||
|
||||
export const DEFAULT_PIPELINE_METRIC_THRESHOLD = 0.85
|
||||
|
||||
const createId = (prefix: string) => `${prefix}-${Math.random().toString(36).slice(2, 10)}`
|
||||
|
||||
const humanizeMetricId = (metricId: string) => {
|
||||
@@ -213,7 +215,11 @@ export function getConditionValue(
|
||||
return typeof previousValue === 'string' ? previousValue : null
|
||||
}
|
||||
|
||||
export function createBuiltinMetric(metric: MetricOption, nodeInfoList: NodeInfo[] = []): EvaluationMetric {
|
||||
export function createBuiltinMetric(
|
||||
metric: MetricOption,
|
||||
nodeInfoList: NodeInfo[] = [],
|
||||
threshold = DEFAULT_PIPELINE_METRIC_THRESHOLD,
|
||||
): EvaluationMetric {
|
||||
return {
|
||||
id: createId('metric'),
|
||||
optionId: metric.id,
|
||||
@@ -221,6 +227,7 @@ export function createBuiltinMetric(metric: MetricOption, nodeInfoList: NodeInfo
|
||||
label: metric.label,
|
||||
description: metric.description,
|
||||
badges: metric.badges,
|
||||
threshold,
|
||||
nodeInfoList,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -32,6 +32,7 @@ type EvaluationStore = {
|
||||
hydrateResource: (resourceType: EvaluationResourceType, resourceId: string, config: EvaluationConfig) => void
|
||||
setJudgeModel: (resourceType: EvaluationResourceType, resourceId: string, judgeModelId: string) => void
|
||||
addBuiltinMetric: (resourceType: EvaluationResourceType, resourceId: string, optionId: string, nodeInfoList?: NodeInfo[]) => void
|
||||
updateMetricThreshold: (resourceType: EvaluationResourceType, resourceId: string, metricId: string, threshold: number) => void
|
||||
addCustomMetric: (resourceType: EvaluationResourceType, resourceId: string) => void
|
||||
removeMetric: (resourceType: EvaluationResourceType, resourceId: string, metricId: string) => void
|
||||
setCustomMetricWorkflow: (
|
||||
@@ -126,6 +127,17 @@ export const useEvaluationStore = create<EvaluationStore>((set, get) => ({
|
||||
}
|
||||
})
|
||||
},
|
||||
updateMetricThreshold: (resourceType, resourceId, metricId, threshold) => {
|
||||
set(state => ({
|
||||
resources: updateResourceState(state.resources, resourceType, resourceId, resource => ({
|
||||
...resource,
|
||||
metrics: updateMetric(resource.metrics, metricId, metric => ({
|
||||
...metric,
|
||||
threshold,
|
||||
})),
|
||||
})),
|
||||
}))
|
||||
},
|
||||
addCustomMetric: (resourceType, resourceId) => {
|
||||
set(state => ({
|
||||
resources: updateResourceState(state.resources, resourceType, resourceId, resource => ({
|
||||
|
||||
@@ -82,6 +82,7 @@ export type EvaluationMetric = {
|
||||
label: string
|
||||
description: string
|
||||
badges: string[]
|
||||
threshold?: number
|
||||
nodeInfoList?: NodeInfo[]
|
||||
customConfig?: CustomMetricConfig
|
||||
}
|
||||
|
||||
@@ -92,6 +92,7 @@
|
||||
"metrics.showMore": "Show more",
|
||||
"metrics.title": "Metrics",
|
||||
"metrics.update": "Update",
|
||||
"pipeline.passIf": "Pass if \u2265",
|
||||
"pipeline.uploadAndRun": "Upload & Run Test",
|
||||
"results.empty": "No evaluation results yet.",
|
||||
"title": "Evaluation"
|
||||
|
||||
Reference in New Issue
Block a user