chore(web): remove mock data of evaluation
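This change deletes the web evaluation module's `mock.ts` and every `getEvaluationMockConfig` call site. Metric options now come from API data (`useDefaultEvaluationMetrics`, `useAvailableEvaluationMetrics`) or are synthesized from the metric id via `resolveMetricOption`, and the two per-resource strings the mock still supplied move into constant records in `store-utils`. A condensed sketch of that replacement pattern, with the record values copied from the diff below:

```ts
// Per-resource constants replace the old mock config object.
type EvaluationResourceType = 'apps' | 'snippets' | 'datasets'

export const EVALUATION_TEMPLATE_FILE_NAMES: Record<EvaluationResourceType, string> = {
  apps: 'workflow-evaluation-template.csv',
  snippets: 'snippet-evaluation-template.csv',
  datasets: 'pipeline-evaluation-template.csv',
}

// Before: getEvaluationMockConfig(resourceType).templateFileName
// After: a direct record lookup, no mock module involved.
const fileName: string = EVALUATION_TEMPLATE_FILE_NAMES.datasets // 'pipeline-evaluation-template.csv'
```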
@@ -404,6 +404,45 @@ describe('Evaluation', () => {
     expect(screen.getByText('evaluation.metrics.noNodesInWorkflow')).toBeInTheDocument()
   })
 
+  it('should add a node from a dynamically returned metric option', () => {
+    mockUseDefaultEvaluationMetrics.mockReturnValue({
+      data: {
+        default_metrics: [
+          {
+            metric: 'answer-correctness',
+            value_type: 'number',
+            node_info_list: [
+              { node_id: 'node-answer', title: 'Answer Node', type: 'llm' },
+            ],
+          },
+          {
+            metric: 'context-precision',
+            value_type: 'number',
+            node_info_list: [
+              { node_id: 'node-context', title: 'Context Node', type: 'knowledge-retrieval' },
+            ],
+          },
+        ],
+      },
+      isLoading: false,
+    })
+
+    renderWithQueryClient(<Evaluation resourceType="apps" resourceId="app-dynamic-metric" />)
+
+    fireEvent.click(screen.getByRole('button', { name: 'evaluation.metrics.add' }))
+    fireEvent.click(screen.getByTestId('evaluation-metric-node-context-precision-node-context'))
+
+    const metrics = useEvaluationStore.getState().resources['apps:app-dynamic-metric']!.metrics
+    expect(metrics).toHaveLength(1)
+    expect(metrics[0]).toMatchObject({
+      optionId: 'context-precision',
+      label: 'Context Precision',
+      nodeInfoList: [
+        { node_id: 'node-context', title: 'Context Node', type: 'knowledge-retrieval' },
+      ],
+    })
+  })
+
   it('should render the global empty state when no metrics are available', () => {
     mockUseDefaultEvaluationMetrics.mockReturnValue({
       data: {
@@ -1,5 +1,4 @@
 import type { EvaluationConfig } from '@/types/evaluation'
-import { getEvaluationMockConfig } from '../mock'
 import {
   getAllowedOperators,
   isCustomMetricConfigured,
@@ -8,6 +7,12 @@ import {
 } from '../store'
 import { buildEvaluationConfigPayload, buildEvaluationRunRequest } from '../store-utils'
 
+const customWorkflow = {
+  id: 'workflow-precision-review',
+  appId: 'custom-workflow-app-id',
+  name: 'Precision Review Workflow',
+}
+
 describe('evaluation store', () => {
   beforeEach(() => {
     useEvaluationStore.setState({ resources: {}, initialResources: {} })
@@ -17,7 +22,6 @@ describe('evaluation store', () => {
     const resourceType = 'apps'
     const resourceId = 'app-1'
     const store = useEvaluationStore.getState()
-    const config = getEvaluationMockConfig(resourceType)
 
     store.ensureResource(resourceType, resourceId)
     store.addCustomMetric(resourceType, resourceId)
@@ -27,9 +31,9 @@ describe('evaluation store', () => {
     expect(isCustomMetricConfigured(initialMetric!)).toBe(false)
 
     store.setCustomMetricWorkflow(resourceType, resourceId, initialMetric!.id, {
-      workflowId: config.workflowOptions[0].id,
-      workflowAppId: 'custom-workflow-app-id',
-      workflowName: config.workflowOptions[0].label,
+      workflowId: customWorkflow.id,
+      workflowAppId: customWorkflow.appId,
+      workflowName: customWorkflow.name,
     })
     store.syncCustomMetricMappings(resourceType, resourceId, initialMetric!.id, ['query'])
     store.syncCustomMetricOutputs(resourceType, resourceId, initialMetric!.id, [{
@@ -44,8 +48,8 @@ describe('evaluation store', () => {
 
     const configuredMetric = useEvaluationStore.getState().resources['apps:app-1'].metrics.find(metric => metric.id === initialMetric!.id)
     expect(isCustomMetricConfigured(configuredMetric!)).toBe(true)
-    expect(configuredMetric!.customConfig!.workflowAppId).toBe('custom-workflow-app-id')
-    expect(configuredMetric!.customConfig!.workflowName).toBe(config.workflowOptions[0].label)
+    expect(configuredMetric!.customConfig!.workflowAppId).toBe(customWorkflow.appId)
+    expect(configuredMetric!.customConfig!.workflowName).toBe(customWorkflow.name)
     expect(configuredMetric!.customConfig!.outputs).toEqual([{ id: 'score', valueType: 'number' }])
   })
 
@@ -71,12 +75,11 @@ describe('evaluation store', () => {
     const resourceType = 'apps'
     const resourceId = 'app-2'
     const store = useEvaluationStore.getState()
-    const config = getEvaluationMockConfig(resourceType)
 
     store.ensureResource(resourceType, resourceId)
-    store.addBuiltinMetric(resourceType, resourceId, config.builtinMetrics[1].id)
+    store.addBuiltinMetric(resourceType, resourceId, 'faithfulness')
 
-    const addedMetric = useEvaluationStore.getState().resources['apps:app-2'].metrics.find(metric => metric.optionId === config.builtinMetrics[1].id)
+    const addedMetric = useEvaluationStore.getState().resources['apps:app-2'].metrics.find(metric => metric.optionId === 'faithfulness')
     expect(addedMetric).toBeDefined()
 
     store.removeMetric(resourceType, resourceId, addedMetric!.id)
@@ -88,8 +91,7 @@ describe('evaluation store', () => {
     const resourceType = 'apps'
     const resourceId = 'app-4'
     const store = useEvaluationStore.getState()
-    const config = getEvaluationMockConfig(resourceType)
-    const metricId = config.builtinMetrics[0].id
+    const metricId = 'answer-correctness'
 
     store.ensureResource(resourceType, resourceId)
     store.addBuiltinMetric(resourceType, resourceId, metricId, [
@@ -115,10 +117,9 @@ describe('evaluation store', () => {
     const resourceType = 'apps'
     const resourceId = 'app-conditions'
     const store = useEvaluationStore.getState()
-    const config = getEvaluationMockConfig(resourceType)
 
     store.ensureResource(resourceType, resourceId)
-    store.addBuiltinMetric(resourceType, resourceId, config.builtinMetrics[0].id, [
+    store.addBuiltinMetric(resourceType, resourceId, 'answer-correctness', [
       { node_id: 'node-answer', title: 'Answer Node', type: 'llm' },
     ])
     store.setConditionLogicalOperator(resourceType, resourceId, 'or')
@@ -137,27 +138,26 @@ describe('evaluation store', () => {
     const resourceType = 'apps'
     const resourceId = 'app-condition-selector'
     const store = useEvaluationStore.getState()
-    const config = getEvaluationMockConfig(resourceType)
 
     store.ensureResource(resourceType, resourceId)
     store.addCustomMetric(resourceType, resourceId)
 
     const customMetric = useEvaluationStore.getState().resources['apps:app-condition-selector'].metrics.find(metric => metric.kind === 'custom-workflow')!
     store.setCustomMetricWorkflow(resourceType, resourceId, customMetric.id, {
-      workflowId: config.workflowOptions[0].id,
-      workflowAppId: 'custom-workflow-app-id',
-      workflowName: config.workflowOptions[0].label,
+      workflowId: customWorkflow.id,
+      workflowAppId: customWorkflow.appId,
+      workflowName: customWorkflow.name,
     })
     store.syncCustomMetricOutputs(resourceType, resourceId, customMetric.id, [{
       id: 'reason',
       valueType: 'string',
     }])
 
-    store.addCondition(resourceType, resourceId, [config.workflowOptions[0].id, 'reason'])
+    store.addCondition(resourceType, resourceId, [customWorkflow.id, 'reason'])
 
     const condition = useEvaluationStore.getState().resources['apps:app-condition-selector'].judgmentConfig.conditions[0]
 
-    expect(condition.variableSelector).toEqual([config.workflowOptions[0].id, 'reason'])
+    expect(condition.variableSelector).toEqual([customWorkflow.id, 'reason'])
     expect(condition.comparisonOperator).toBe('contains')
     expect(condition.value).toBeNull()
   })
@@ -166,16 +166,15 @@ describe('evaluation store', () => {
     const resourceType = 'apps'
     const resourceId = 'app-3'
     const store = useEvaluationStore.getState()
-    const config = getEvaluationMockConfig(resourceType)
 
     store.ensureResource(resourceType, resourceId)
     store.addCustomMetric(resourceType, resourceId)
 
     const customMetric = useEvaluationStore.getState().resources['apps:app-3'].metrics.find(metric => metric.kind === 'custom-workflow')!
     store.setCustomMetricWorkflow(resourceType, resourceId, customMetric.id, {
-      workflowId: config.workflowOptions[0].id,
-      workflowAppId: 'custom-workflow-app-id',
-      workflowName: config.workflowOptions[0].label,
+      workflowId: customWorkflow.id,
+      workflowAppId: customWorkflow.appId,
+      workflowName: customWorkflow.name,
     })
     store.syncCustomMetricOutputs(resourceType, resourceId, customMetric.id, [{
       id: 'reason',
@@ -185,7 +184,7 @@ describe('evaluation store', () => {
 
     const condition = useEvaluationStore.getState().resources['apps:app-3'].judgmentConfig.conditions[0]
 
-    store.updateConditionMetric(resourceType, resourceId, condition.id, [config.workflowOptions[0].id, 'reason'])
+    store.updateConditionMetric(resourceType, resourceId, condition.id, [customWorkflow.id, 'reason'])
     store.updateConditionValue(resourceType, resourceId, condition.id, 'needs follow-up')
     store.updateConditionOperator(resourceType, resourceId, condition.id, 'empty')
 
@@ -1,7 +1,7 @@
 import type { EvaluationResourceProps } from '../../types'
 import { Button } from '@langgenius/dify-ui/button'
 import { useTranslation } from 'react-i18next'
-import { getEvaluationMockConfig } from '../../mock'
+import { EVALUATION_TEMPLATE_FILE_NAMES } from '../../store-utils'
 import InputFieldsRequirements from './input-fields/input-fields-requirements'
 import UploadRunPopover from './input-fields/upload-run-popover'
 import { useInputFieldsActions } from './input-fields/use-input-fields-actions'
@@ -19,7 +19,6 @@ const InputFieldsTab = ({
   isRunnable,
 }: InputFieldsTabProps) => {
   const { t } = useTranslation('evaluation')
-  const config = getEvaluationMockConfig(resourceType)
   const { inputFields, isInputFieldsLoading } = usePublishedInputFields(resourceType, resourceId)
   const actions = useInputFieldsActions({
     resourceType,
@@ -28,7 +27,7 @@ const InputFieldsTab = ({
     isInputFieldsLoading,
     isPanelReady,
     isRunnable,
-    templateFileName: config.templateFileName,
+    templateFileName: EVALUATION_TEMPLATE_FILE_NAMES[resourceType],
   })
 
   return (
@@ -13,7 +13,7 @@ type SelectorMetricSectionProps = {
   isShowingAllNodes: boolean
   onToggleExpanded: () => void
   onToggleShowAllNodes: () => void
-  onToggleNodeSelection: (metricId: string, nodeInfo: MetricSelectorSection['visibleNodes'][number]) => void
+  onToggleNodeSelection: (metric: MetricSelectorSection['metric'], nodeInfo: MetricSelectorSection['visibleNodes'][number]) => void
   t: TFunction<'evaluation'>
 }
 
@@ -93,7 +93,7 @@ const SelectorMetricSection = ({
         'flex w-full items-center gap-1 rounded-md px-3 py-1.5 text-left transition-colors hover:bg-state-base-hover-alt',
         isAdded && 'opacity-50',
       )}
-      onClick={() => onToggleNodeSelection(metric.id, nodeInfo)}
+      onClick={() => onToggleNodeSelection(metric, nodeInfo)}
     >
       <div className="flex min-w-0 flex-1 items-center gap-2.5 pr-1">
         <BlockIcon
@@ -5,13 +5,11 @@ import { useMemo } from 'react'
 import { useTranslation } from 'react-i18next'
 import { useDefaultEvaluationMetrics } from '@/service/use-evaluation'
 import { getTranslatedMetricDescription } from '../../default-metric-descriptions'
-import { getEvaluationMockConfig } from '../../mock'
 import { useEvaluationResource, useEvaluationStore } from '../../store'
 import {
   buildMetricOption,
   dedupeNodeInfoList,
   getDefaultMetricNodeInfoMap,
-  normalizeMetricValueType,
 } from './utils'
 
 type UseMetricSelectorDataOptions = {
@@ -25,7 +23,7 @@ type UseMetricSelectorDataResult = {
   builtinMetricMap: BuiltinMetricMap
   filteredSections: MetricSelectorSection[]
   isRemoteLoading: boolean
-  toggleNodeSelection: (metricId: string, nodeInfo: NodeInfo) => void
+  toggleNodeSelection: (metric: MetricOption, nodeInfo: NodeInfo) => void
 }
 
 export const useMetricSelectorData = ({
@@ -35,7 +33,6 @@ export const useMetricSelectorData = ({
   resourceId,
 }: UseMetricSelectorDataOptions): UseMetricSelectorDataResult => {
   const { t } = useTranslation('evaluation')
-  const config = getEvaluationMockConfig(resourceType)
   const metrics = useEvaluationResource(resourceType, resourceId).metrics
   const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric)
   const removeMetric = useEvaluationStore(state => state.removeMetric)
@@ -53,25 +50,15 @@ export const useMetricSelectorData = ({
   const nodeInfoMap = useMemo(() => getDefaultMetricNodeInfoMap(defaultMetrics), [defaultMetrics])
 
   const resolvedMetrics = useMemo(() => {
-    const metricsMap = new Map(config.builtinMetrics.map(metric => [metric.id, metric] as const))
-
     return defaultMetrics
       .map((defaultMetric) => {
         if (!defaultMetric.metric)
           return null
 
-        const configMetric = metricsMap.get(defaultMetric.metric)
-        if (configMetric) {
-          return {
-            ...configMetric,
-            valueType: normalizeMetricValueType(defaultMetric.value_type),
-          }
-        }
-
         return buildMetricOption(defaultMetric.metric, defaultMetric.value_type)
       })
       .filter((metric): metric is MetricOption => !!metric)
-  }, [config.builtinMetrics, defaultMetrics])
+  }, [defaultMetrics])
 
   const filteredSections = useMemo(() => {
     const keyword = query.trim().toLowerCase()
@@ -120,7 +107,8 @@ export const useMetricSelectorData = ({
     }).filter((section): section is MetricSelectorSection => !!section)
   }, [nodeInfoMap, query, resolvedMetrics, t])
 
-  const toggleNodeSelection = (metricId: string, nodeInfo: NodeInfo) => {
+  const toggleNodeSelection = (metric: MetricOption, nodeInfo: NodeInfo) => {
+    const metricId = metric.id
     const addedMetric = builtinMetricMap.get(metricId)
     const currentSelectedNodes = addedMetric?.nodeInfoList ?? []
 
@@ -135,7 +123,7 @@ export const useMetricSelectorData = ({
       return
     }
 
-    addBuiltinMetric(resourceType, resourceId, metricId, nextSelectedNodes)
+    addBuiltinMetric(resourceType, resourceId, metricId, nextSelectedNodes, metric)
   }
 
   return {
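A note on the hunks above: the selector now resolves `MetricOption`s from the remote `default_metrics` payload and hands the whole object through `toggleNodeSelection` into `addBuiltinMetric`, so the store no longer needs a hardcoded option list to look the id up. A minimal sketch of the new call flow; the types mirror the diff, the resource ids are illustrative:

```ts
type NodeInfo = { node_id: string, title: string, type: string }
type MetricOption = {
  id: string
  label: string
  description: string
  valueType: 'string' | 'number' | 'boolean'
}

declare function addBuiltinMetric(
  resourceType: string,
  resourceId: string,
  optionId: string,
  nodeInfoList?: NodeInfo[],
  metricOption?: MetricOption, // new trailing parameter
): void

// The option travels with the call instead of being re-derived in the store.
const toggleNodeSelection = (metric: MetricOption, nodeInfo: NodeInfo) => {
  const metricId = metric.id
  addBuiltinMetric('apps', 'app-1', metricId, [nodeInfo], metric)
}
```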
@@ -4,8 +4,8 @@ import type { EvaluationResourceProps } from '../../types'
 import type { InputField } from '../batch-test-panel/input-fields/input-fields-utils'
 import { Button } from '@langgenius/dify-ui/button'
 import { useTranslation } from 'react-i18next'
-import { getEvaluationMockConfig } from '../../mock'
 import { isEvaluationRunnable, useEvaluationResource } from '../../store'
+import { EVALUATION_TEMPLATE_FILE_NAMES } from '../../store-utils'
 import UploadRunPopover from '../batch-test-panel/input-fields/upload-run-popover'
 import { useInputFieldsActions } from '../batch-test-panel/input-fields/use-input-fields-actions'
@@ -20,7 +20,6 @@ const PipelineBatchActions = ({
 }: EvaluationResourceProps) => {
   const { t } = useTranslation('evaluation')
   const resource = useEvaluationResource(resourceType, resourceId)
-  const config = getEvaluationMockConfig(resourceType)
   const isConfigReady = !!resource.judgeModelId && resource.metrics.some(metric => metric.kind === 'builtin')
   const isRunnable = isEvaluationRunnable(resource)
   const actions = useInputFieldsActions({
@@ -30,7 +29,7 @@ const PipelineBatchActions = ({
     isInputFieldsLoading: false,
     isPanelReady: isConfigReady,
     isRunnable,
-    templateFileName: config.templateFileName,
+    templateFileName: EVALUATION_TEMPLATE_FILE_NAMES[resourceType],
   })
 
   return (
@@ -9,8 +9,8 @@ import { BlockEnum } from '@/app/components/workflow/types'
 import { useDatasetDetailContextWithSelector } from '@/context/dataset-detail'
 import { useAvailableEvaluationMetrics } from '@/service/use-evaluation'
 import { usePublishedPipelineInfo } from '@/service/use-pipeline'
-import { getEvaluationMockConfig } from '../../mock'
 import { useEvaluationResource, useEvaluationStore } from '../../store'
+import { buildMetricOption } from '../metric-selector/utils'
 import { InlineSectionHeader } from '../section-header'
 import PipelineMetricItem from './pipeline-metric-item'
@@ -52,7 +52,6 @@ const PipelineMetricsSection = ({
   const { data: availableMetricsData } = useAvailableEvaluationMetrics()
   const { data: publishedPipeline } = usePublishedPipelineInfo(pipelineId || '')
   const resource = useEvaluationResource(resourceType, resourceId)
-  const config = getEvaluationMockConfig(resourceType)
   const knowledgeIndexNodeInfoList = useMemo(
     () => getKnowledgeIndexNodeInfo(publishedPipeline?.graph.nodes),
     [publishedPipeline?.graph.nodes],
@@ -62,12 +61,14 @@ const PipelineMetricsSection = ({
       .filter(metric => metric.kind === 'builtin')
       .map(metric => [metric.optionId, metric]),
   ), [resource.metrics])
-  const availableMetricIds = useMemo(() => new Set(availableMetricsData?.metrics ?? []), [availableMetricsData?.metrics])
   const availableBuiltinMetrics = useMemo(() => {
-    return config.builtinMetrics.filter(metric =>
-      availableMetricIds.has(metric.id) || builtinMetricMap.has(metric.id),
-    )
-  }, [availableMetricIds, builtinMetricMap, config.builtinMetrics])
+    const metricIds = new Set([
+      ...(availableMetricsData?.metrics ?? []),
+      ...builtinMetricMap.keys(),
+    ])
+
+    return Array.from(metricIds).map(metricId => buildMetricOption(metricId))
+  }, [availableMetricsData?.metrics, builtinMetricMap])
 
   useEffect(() => {
     if (!knowledgeIndexNodeInfoList.length)
@@ -77,7 +78,7 @@ const PipelineMetricsSection = ({
       if (metric.kind !== 'builtin' || isSameNodeInfoList(metric.nodeInfoList, knowledgeIndexNodeInfoList))
         return
 
-      addBuiltinMetric(resourceType, resourceId, metric.optionId, knowledgeIndexNodeInfoList)
+      addBuiltinMetric(resourceType, resourceId, metric.optionId, knowledgeIndexNodeInfoList, metric)
     })
   }, [addBuiltinMetric, knowledgeIndexNodeInfoList, resource.metrics, resourceId, resourceType])
 
@@ -88,7 +89,8 @@ const PipelineMetricsSection = ({
       return
     }
 
-    addBuiltinMetric(resourceType, resourceId, metricId, knowledgeIndexNodeInfoList)
+    const metricOption = availableBuiltinMetrics.find(metric => metric.id === metricId)
+    addBuiltinMetric(resourceType, resourceId, metricId, knowledgeIndexNodeInfoList, metricOption)
   }
 
   return (
@@ -29,29 +29,29 @@ const DEFAULT_METRIC_DESCRIPTION_KEYS = {
 type DefaultMetricDescriptionKey = typeof DEFAULT_METRIC_DESCRIPTION_KEYS[keyof typeof DEFAULT_METRIC_DESCRIPTION_KEYS]
 
 const DEFAULT_METRIC_DESCRIPTIONS: Record<string, DefaultMetricDescription> = {
-  faithfulness: DEFAULT_METRIC_DESCRIPTION.FAITHFULNESS,
-  answer_relevancy: DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY,
-  answer_correctness: DEFAULT_METRIC_DESCRIPTION.ANSWER_CORRECTNESS,
-  semantic_similarity: DEFAULT_METRIC_DESCRIPTION.SEMANTIC_SIMILARITY,
-  context_precision: DEFAULT_METRIC_DESCRIPTION.CONTEXT_PRECISION,
-  context_recall: DEFAULT_METRIC_DESCRIPTION.CONTEXT_RECALL,
-  context_relevance: DEFAULT_METRIC_DESCRIPTION.CONTEXT_RELEVANCE,
-  tool_correctness: DEFAULT_METRIC_DESCRIPTION.TOOL_CORRECTNESS,
-  task_completion: DEFAULT_METRIC_DESCRIPTION.TASK_COMPLETION,
-  relevance: DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY,
+  'faithfulness': DEFAULT_METRIC_DESCRIPTION.FAITHFULNESS,
+  'answer-relevancy': DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY,
+  'answer-correctness': DEFAULT_METRIC_DESCRIPTION.ANSWER_CORRECTNESS,
+  'semantic-similarity': DEFAULT_METRIC_DESCRIPTION.SEMANTIC_SIMILARITY,
+  'context-precision': DEFAULT_METRIC_DESCRIPTION.CONTEXT_PRECISION,
+  'context-recall': DEFAULT_METRIC_DESCRIPTION.CONTEXT_RECALL,
+  'context-relevance': DEFAULT_METRIC_DESCRIPTION.CONTEXT_RELEVANCE,
+  'tool-correctness': DEFAULT_METRIC_DESCRIPTION.TOOL_CORRECTNESS,
+  'task-completion': DEFAULT_METRIC_DESCRIPTION.TASK_COMPLETION,
+  'relevance': DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY,
 }
 
 const DEFAULT_METRIC_DESCRIPTION_I18N_KEYS: Record<string, DefaultMetricDescriptionKey> = {
-  faithfulness: DEFAULT_METRIC_DESCRIPTION_KEYS.FAITHFULNESS,
-  answer_relevancy: DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY,
-  answer_correctness: DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_CORRECTNESS,
-  semantic_similarity: DEFAULT_METRIC_DESCRIPTION_KEYS.SEMANTIC_SIMILARITY,
-  context_precision: DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_PRECISION,
-  context_recall: DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RECALL,
-  context_relevance: DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RELEVANCE,
-  tool_correctness: DEFAULT_METRIC_DESCRIPTION_KEYS.TOOL_CORRECTNESS,
-  task_completion: DEFAULT_METRIC_DESCRIPTION_KEYS.TASK_COMPLETION,
-  relevance: DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY,
+  'faithfulness': DEFAULT_METRIC_DESCRIPTION_KEYS.FAITHFULNESS,
+  'answer-relevancy': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY,
+  'answer-correctness': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_CORRECTNESS,
+  'semantic-similarity': DEFAULT_METRIC_DESCRIPTION_KEYS.SEMANTIC_SIMILARITY,
+  'context-precision': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_PRECISION,
+  'context-recall': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RECALL,
+  'context-relevance': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RELEVANCE,
+  'tool-correctness': DEFAULT_METRIC_DESCRIPTION_KEYS.TOOL_CORRECTNESS,
+  'task-completion': DEFAULT_METRIC_DESCRIPTION_KEYS.TASK_COMPLETION,
+  'relevance': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY,
 }
 
 const normalizeMetricId = (metricId: string) => metricId.trim().toLowerCase().replace(/_/g, '-')
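The rekeying above works together with the `normalizeMetricId` helper on the last context line: ids that arrive from the API in snake_case are folded onto the same kebab-case keys before lookup. A quick illustration, following the regex shown in the diff:

```ts
const normalizeMetricId = (metricId: string) =>
  metricId.trim().toLowerCase().replace(/_/g, '-')

normalizeMetricId('answer_correctness') // 'answer-correctness'
normalizeMetricId(' Context_Precision ') // 'context-precision'
```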
@@ -1,179 +0,0 @@
-import type {
-  EvaluationFieldOption,
-  EvaluationMockConfig,
-  EvaluationResourceType,
-  MetricOption,
-} from './types'
-import { getDefaultMetricDescription } from './default-metric-descriptions'
-
-const judgeModels = [
-  {
-    id: 'gpt-4.1-mini',
-    label: 'GPT-4.1 mini',
-    provider: 'OpenAI',
-  },
-  {
-    id: 'claude-3-7-sonnet',
-    label: 'Claude 3.7 Sonnet',
-    provider: 'Anthropic',
-  },
-  {
-    id: 'gemini-2.0-flash',
-    label: 'Gemini 2.0 Flash',
-    provider: 'Google',
-  },
-]
-
-const builtinMetrics: MetricOption[] = [
-  {
-    id: 'answer-correctness',
-    label: 'Answer Correctness',
-    description: getDefaultMetricDescription('answer-correctness'),
-    valueType: 'number',
-  },
-  {
-    id: 'faithfulness',
-    label: 'Faithfulness',
-    description: getDefaultMetricDescription('faithfulness'),
-    valueType: 'number',
-  },
-  {
-    id: 'relevance',
-    label: 'Relevance',
-    description: getDefaultMetricDescription('relevance'),
-    valueType: 'number',
-  },
-  {
-    id: 'latency',
-    label: 'Latency',
-    description: 'Captures runtime responsiveness for the full execution path.',
-    valueType: 'number',
-  },
-  {
-    id: 'token-usage',
-    label: 'Token Usage',
-    description: 'Tracks prompt and completion token consumption for the run.',
-    valueType: 'number',
-  },
-  {
-    id: 'tool-success-rate',
-    label: 'Tool Success Rate',
-    description: 'Measures whether each required tool invocation finishes without failure.',
-    valueType: 'number',
-  },
-]
-
-const pipelineBuiltinMetrics: MetricOption[] = [
-  {
-    id: 'context-precision',
-    label: 'Context Precision',
-    description: getDefaultMetricDescription('context-precision'),
-    valueType: 'number',
-  },
-  {
-    id: 'context-recall',
-    label: 'Context Recall',
-    description: getDefaultMetricDescription('context-recall'),
-    valueType: 'number',
-  },
-  {
-    id: 'context-relevance',
-    label: 'Context Relevance',
-    description: getDefaultMetricDescription('context-relevance'),
-    valueType: 'number',
-  },
-]
-
-const workflowOptions = [
-  {
-    id: 'workflow-precision-review',
-    label: 'Precision Review Workflow',
-    description: 'Custom evaluator for nuanced quality review.',
-    targetVariables: [
-      { id: 'query', label: 'query' },
-      { id: 'answer', label: 'answer' },
-      { id: 'reference', label: 'reference' },
-    ],
-  },
-  {
-    id: 'workflow-risk-review',
-    label: 'Risk Review Workflow',
-    description: 'Custom evaluator for policy and escalation checks.',
-    targetVariables: [
-      { id: 'input', label: 'input' },
-      { id: 'output', label: 'output' },
-    ],
-  },
-]
-
-const workflowFields: EvaluationFieldOption[] = [
-  { id: 'app.input.query', label: 'Query', group: 'App Input', type: 'string' },
-  { id: 'app.input.locale', label: 'Locale', group: 'App Input', type: 'enum', options: [{ value: 'en-US', label: 'en-US' }, { value: 'zh-Hans', label: 'zh-Hans' }] },
-  { id: 'app.output.answer', label: 'Answer', group: 'App Output', type: 'string' },
-  { id: 'app.output.score', label: 'Score', group: 'App Output', type: 'number' },
-  { id: 'system.has_context', label: 'Has Context', group: 'System', type: 'boolean' },
-]
-
-const pipelineFields: EvaluationFieldOption[] = [
-  { id: 'dataset.input.document_id', label: 'Document ID', group: 'Dataset', type: 'string' },
-  { id: 'dataset.input.chunk_count', label: 'Chunk Count', group: 'Dataset', type: 'number' },
-  { id: 'retrieval.output.hit_rate', label: 'Hit Rate', group: 'Retrieval', type: 'number' },
-  { id: 'retrieval.output.source', label: 'Source', group: 'Retrieval', type: 'enum', options: [{ value: 'bm25', label: 'BM25' }, { value: 'hybrid', label: 'Hybrid' }] },
-  { id: 'pipeline.output.published', label: 'Published', group: 'Output', type: 'boolean' },
-]
-
-const snippetFields: EvaluationFieldOption[] = [
-  { id: 'snippet.input.blog_url', label: 'Blog URL', group: 'Snippet Input', type: 'string' },
-  { id: 'snippet.input.platforms', label: 'Platforms', group: 'Snippet Input', type: 'string' },
-  { id: 'snippet.output.content', label: 'Generated Content', group: 'Snippet Output', type: 'string' },
-  { id: 'snippet.output.length', label: 'Output Length', group: 'Snippet Output', type: 'number' },
-  { id: 'system.requires_review', label: 'Requires Review', group: 'System', type: 'boolean' },
-]
-
-export const getEvaluationMockConfig = (resourceType: EvaluationResourceType): EvaluationMockConfig => {
-  if (resourceType === 'datasets') {
-    return {
-      judgeModels,
-      builtinMetrics: pipelineBuiltinMetrics,
-      workflowOptions,
-      fieldOptions: pipelineFields,
-      templateFileName: 'pipeline-evaluation-template.csv',
-      batchRequirements: [
-        'Include one row per retrieval scenario.',
-        'Provide the expected source or target chunk for each case.',
-        'Keep numeric metrics in plain number format.',
-      ],
-      historySummaryLabel: 'Pipeline evaluation batch',
-    }
-  }
-
-  if (resourceType === 'snippets') {
-    return {
-      judgeModels,
-      builtinMetrics,
-      workflowOptions,
-      fieldOptions: snippetFields,
-      templateFileName: 'snippet-evaluation-template.csv',
-      batchRequirements: [
-        'Include one row per snippet execution case.',
-        'Provide the expected final content or acceptance rule.',
-        'Keep optional fields empty when not used.',
-      ],
-      historySummaryLabel: 'Snippet evaluation batch',
-    }
-  }
-
-  return {
-    judgeModels,
-    builtinMetrics,
-    workflowOptions,
-    fieldOptions: workflowFields,
-    templateFileName: 'workflow-evaluation-template.csv',
-    batchRequirements: [
-      'Include one row per workflow test case.',
-      'Provide both user input and expected answer when available.',
-      'Keep boolean columns as true or false.',
-    ],
-    historySummaryLabel: 'Workflow evaluation batch',
-  }
-}
@@ -21,7 +21,6 @@ import type {
   NodeInfo,
 } from '@/types/evaluation'
 import { getDefaultMetricDescription } from './default-metric-descriptions'
-import { getEvaluationMockConfig } from './mock'
 import {
   buildConditionMetricOptions,
   decodeModelSelection,
@@ -34,6 +33,19 @@ import {
 type EvaluationStoreResources = Record<string, EvaluationResourceState>
 
 export const DEFAULT_PIPELINE_METRIC_THRESHOLD = 0.85
+export const EVALUATION_TEMPLATE_FILE_NAMES: Record<EvaluationResourceType, string> = {
+  apps: 'workflow-evaluation-template.csv',
+  snippets: 'snippet-evaluation-template.csv',
+  datasets: 'pipeline-evaluation-template.csv',
+}
+
+const BATCH_HISTORY_SUMMARY_LABELS: Record<EvaluationResourceType, string> = {
+  apps: 'Workflow evaluation batch',
+  snippets: 'Snippet evaluation batch',
+  datasets: 'Pipeline evaluation batch',
+}
+
+const PIPELINE_METRIC_IDS = new Set(['context-precision', 'context-recall', 'context-relevance'])
 
 const PIPELINE_LOGICAL_OPERATOR: JudgmentConfig['logicalOperator'] = 'and'
 
@@ -47,9 +59,8 @@ const humanizeMetricId = (metricId: string) => {
     .join(' ')
 }
 
-const resolveMetricOption = (resourceType: EvaluationResourceType, metricId: string): MetricOption => {
-  const config = getEvaluationMockConfig(resourceType)
-  return config.builtinMetrics.find(metric => metric.id === metricId) ?? {
+export const resolveMetricOption = (metricId: string): MetricOption => {
+  return {
     id: metricId,
     label: humanizeMetricId(metricId),
     description: getDefaultMetricDescription(metricId),
@@ -57,13 +68,11 @@ const resolveMetricOption = (resourceType: EvaluationResourceType, metricId: string): MetricOption => {
   }
 }
 
-const pipelineMetricIds = new Set(getEvaluationMockConfig('datasets').builtinMetrics.map(metric => metric.id))
-
 const isPipelineResourceType = (resourceType: EvaluationResourceType) => resourceType === 'datasets'
 
 const isPipelineResourceState = (resource: EvaluationResourceState) => {
   return resource.metrics.length > 0
-    && resource.metrics.every(metric => metric.kind === 'builtin' && pipelineMetricIds.has(metric.optionId))
+    && resource.metrics.every(metric => metric.kind === 'builtin' && PIPELINE_METRIC_IDS.has(metric.optionId))
 }
 
 const normalizeNodeInfoList = (value: NodeInfo[] | undefined): NodeInfo[] => {
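With the mock list gone, `resolveMetricOption` always synthesizes the option from the id; the label comes from `humanizeMetricId`, of which only the closing `.join(' ')` is visible above. A sketch of the assumed behavior, consistent with the new test expecting the label 'Context Precision' for the id 'context-precision':

```ts
// Assumed shape of humanizeMetricId, inferred from the visible `.join(' ')`
// tail and the label asserted in the tests; not the verbatim implementation.
const humanizeMetricId = (metricId: string) =>
  metricId
    .split('-')
    .map(word => word.charAt(0).toUpperCase() + word.slice(1))
    .join(' ')

humanizeMetricId('context-precision') // 'Context Precision'
```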
@@ -88,10 +97,7 @@ const normalizeNodeInfoList = (value: NodeInfo[] | undefined): NodeInfo[] => {
     .filter((item): item is NodeInfo => !!item)
 }
 
-const normalizeDefaultMetrics = (
-  resourceType: EvaluationResourceType,
-  value: EvaluationDefaultMetric[] | null | undefined,
-): EvaluationMetric[] => {
+const normalizeDefaultMetrics = (value: EvaluationDefaultMetric[] | null | undefined): EvaluationMetric[] => {
   if (!value?.length)
     return []
 
@@ -101,7 +107,7 @@ const normalizeDefaultMetrics = (
       if (!metricId)
         return null
 
-      const metricOption = resolveMetricOption(resourceType, metricId)
+      const metricOption = resolveMetricOption(metricId)
       return createBuiltinMetric(metricOption, normalizeNodeInfoList(item.node_info_list ?? []))
     })
     .filter((item): item is EvaluationMetric => !!item)
@@ -455,7 +461,7 @@ export const buildStateFromEvaluationConfig = (
   resourceType: EvaluationResourceType,
   config: EvaluationConfig,
 ): EvaluationResourceState => {
-  const defaultMetrics = normalizeDefaultMetrics(resourceType, config.default_metrics)
+  const defaultMetrics = normalizeDefaultMetrics(config.default_metrics)
   const customMetrics = isPipelineResourceType(resourceType) ? [] : normalizeCustomMetric(config.customized_metrics)
   const metrics = isPipelineResourceType(resourceType)
     ? normalizePipelineMetrics(config, defaultMetrics)
@@ -652,14 +658,12 @@ export const createBatchTestRecord = (
   resourceType: EvaluationResourceType,
   uploadedFileName: string | null | undefined,
 ): BatchTestRecord => {
-  const config = getEvaluationMockConfig(resourceType)
-
   return {
     id: createId('batch'),
-    fileName: uploadedFileName ?? config.templateFileName,
+    fileName: uploadedFileName ?? EVALUATION_TEMPLATE_FILE_NAMES[resourceType],
     status: 'running',
     startedAt: new Date().toLocaleTimeString(),
-    summary: config.historySummaryLabel,
+    summary: BATCH_HISTORY_SUMMARY_LABELS[resourceType],
   }
 }
@@ -2,11 +2,11 @@ import type {
   ComparisonOperator,
   EvaluationResourceState,
   EvaluationResourceType,
+  MetricOption,
 } from './types'
 import type { EvaluationConfig, NodeInfo } from '@/types/evaluation'
 import { isEqual } from 'es-toolkit/predicate'
 import { create } from 'zustand'
-import { getEvaluationMockConfig } from './mock'
 import {
   buildConditionItem,
   buildInitialState,
@@ -20,6 +20,7 @@ import {
   isCustomMetricConfigured as isCustomMetricConfiguredFromUtils,
   isEvaluationRunnable as isEvaluationRunnableFromUtils,
   requiresConditionValue as requiresConditionValueFromUtils,
+  resolveMetricOption,
   syncCustomMetricMappings as syncCustomMetricMappingsFromUtils,
   syncJudgmentConfigWithMetrics,
   updateMetric,
@@ -35,7 +36,7 @@ type EvaluationStore = {
   resetResourceConfig: (resourceType: EvaluationResourceType, resourceId: string) => void
   markResourceConfigSaved: (resourceType: EvaluationResourceType, resourceId: string) => void
   setJudgeModel: (resourceType: EvaluationResourceType, resourceId: string, judgeModelId: string) => void
-  addBuiltinMetric: (resourceType: EvaluationResourceType, resourceId: string, optionId: string, nodeInfoList?: NodeInfo[]) => void
+  addBuiltinMetric: (resourceType: EvaluationResourceType, resourceId: string, optionId: string, nodeInfoList?: NodeInfo[], metricOption?: MetricOption) => void
   updateMetricThreshold: (resourceType: EvaluationResourceType, resourceId: string, metricId: string, threshold: number) => void
   addCustomMetric: (resourceType: EvaluationResourceType, resourceId: string) => void
   removeMetric: (resourceType: EvaluationResourceType, resourceId: string, metricId: string) => void
@@ -214,11 +215,8 @@ export const useEvaluationStore = create<EvaluationStore>((set, get) => ({
       })),
     }))
   },
-  addBuiltinMetric: (resourceType, resourceId, optionId, nodeInfoList = []) => {
-    const option = getEvaluationMockConfig(resourceType).builtinMetrics.find(metric => metric.id === optionId)
-    if (!option)
-      return
-
+  addBuiltinMetric: (resourceType, resourceId, optionId, nodeInfoList = [], metricOption) => {
+    const option = metricOption ?? resolveMetricOption(optionId)
     set((state) => {
       return {
         resources: updateResourceState(state.resources, resourceType, resourceId, (currentResource) => {
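The practical effect of this hunk: adding a builtin metric can no longer silently no-op. The old code returned early when the id was missing from the mock list; now a caller-supplied option wins, and any other id falls back to a synthesized option. Sketched under the signatures shown in this diff:

```ts
type MetricOption = { id: string, label: string, description: string, valueType: string }
declare function resolveMetricOption(metricId: string): MetricOption

// Caller-provided option (e.g. built from API data) takes precedence;
// otherwise one is synthesized, so unknown ids still produce a usable metric.
const pickOption = (optionId: string, metricOption?: MetricOption): MetricOption =>
  metricOption ?? resolveMetricOption(optionId)
```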
@@ -17,8 +17,6 @@ export type MetricKind = 'builtin' | 'custom-workflow'
 
 export type BatchTestTab = 'input-fields' | 'history'
 
-export type FieldType = 'string' | 'number' | 'boolean' | 'enum'
-
 export type ConditionMetricValueType = 'string' | 'number' | 'boolean'
 
 export type ComparisonOperator
@@ -41,12 +39,6 @@ export type ComparisonOperator
   | 'is null'
   | 'is not null'
 
-export type JudgeModelOption = {
-  id: string
-  label: string
-  provider: string
-}
-
 export type MetricOption = {
   id: string
   label: string
@@ -54,27 +46,6 @@ export type MetricOption = {
   valueType: ConditionMetricValueType
 }
 
-export type EvaluationWorkflowOption = {
-  id: string
-  label: string
-  description: string
-  targetVariables: Array<{
-    id: string
-    label: string
-  }>
-}
-
-export type EvaluationFieldOption = {
-  id: string
-  label: string
-  group: string
-  type: FieldType
-  options?: Array<{
-    value: string
-    label: string
-  }>
-}
-
 export type CustomMetricMapping = {
   id: string
   inputVariableId: string | null
@@ -147,13 +118,3 @@ export type EvaluationResourceState = {
   selectedRunId: string | null
   batchRecords: BatchTestRecord[]
 }
-
-export type EvaluationMockConfig = {
-  judgeModels: JudgeModelOption[]
-  builtinMetrics: MetricOption[]
-  workflowOptions: EvaluationWorkflowOption[]
-  fieldOptions: EvaluationFieldOption[]
-  templateFileName: string
-  batchRequirements: string[]
-  historySummaryLabel: string
-}