From 08c01c4f3fc63276c59908cffb77196e71f78bf4 Mon Sep 17 00:00:00 2001 From: JzoNg Date: Wed, 29 Apr 2026 15:28:10 +0800 Subject: [PATCH] chore(web): remove mock data of evaluation --- .../evaluation/__tests__/index.spec.tsx | 39 ++++ .../evaluation/__tests__/store.spec.ts | 49 +++-- .../batch-test-panel/input-fields-tab.tsx | 5 +- .../selector-metric-section.tsx | 4 +- .../use-metric-selector-data.ts | 22 +-- .../pipeline/pipeline-batch-actions.tsx | 5 +- .../pipeline/pipeline-metrics-section.tsx | 20 +- .../evaluation/default-metric-descriptions.ts | 40 ++-- web/app/components/evaluation/mock.ts | 179 ------------------ web/app/components/evaluation/store-utils.ts | 38 ++-- web/app/components/evaluation/store.ts | 12 +- web/app/components/evaluation/types.ts | 39 ---- 12 files changed, 131 insertions(+), 321 deletions(-) delete mode 100644 web/app/components/evaluation/mock.ts diff --git a/web/app/components/evaluation/__tests__/index.spec.tsx b/web/app/components/evaluation/__tests__/index.spec.tsx index c060b65e6f..b451223390 100644 --- a/web/app/components/evaluation/__tests__/index.spec.tsx +++ b/web/app/components/evaluation/__tests__/index.spec.tsx @@ -404,6 +404,45 @@ describe('Evaluation', () => { expect(screen.getByText('evaluation.metrics.noNodesInWorkflow')).toBeInTheDocument() }) + it('should add a node from a dynamically returned metric option', () => { + mockUseDefaultEvaluationMetrics.mockReturnValue({ + data: { + default_metrics: [ + { + metric: 'answer-correctness', + value_type: 'number', + node_info_list: [ + { node_id: 'node-answer', title: 'Answer Node', type: 'llm' }, + ], + }, + { + metric: 'context-precision', + value_type: 'number', + node_info_list: [ + { node_id: 'node-context', title: 'Context Node', type: 'knowledge-retrieval' }, + ], + }, + ], + }, + isLoading: false, + }) + + renderWithQueryClient() + + fireEvent.click(screen.getByRole('button', { name: 'evaluation.metrics.add' })) + fireEvent.click(screen.getByTestId('evaluation-metric-node-context-precision-node-context')) + + const metrics = useEvaluationStore.getState().resources['apps:app-dynamic-metric']!.metrics + expect(metrics).toHaveLength(1) + expect(metrics[0]).toMatchObject({ + optionId: 'context-precision', + label: 'Context Precision', + nodeInfoList: [ + { node_id: 'node-context', title: 'Context Node', type: 'knowledge-retrieval' }, + ], + }) + }) + it('should render the global empty state when no metrics are available', () => { mockUseDefaultEvaluationMetrics.mockReturnValue({ data: { diff --git a/web/app/components/evaluation/__tests__/store.spec.ts b/web/app/components/evaluation/__tests__/store.spec.ts index 648eee68d8..0557f52945 100644 --- a/web/app/components/evaluation/__tests__/store.spec.ts +++ b/web/app/components/evaluation/__tests__/store.spec.ts @@ -1,5 +1,4 @@ import type { EvaluationConfig } from '@/types/evaluation' -import { getEvaluationMockConfig } from '../mock' import { getAllowedOperators, isCustomMetricConfigured, @@ -8,6 +7,12 @@ import { } from '../store' import { buildEvaluationConfigPayload, buildEvaluationRunRequest } from '../store-utils' +const customWorkflow = { + id: 'workflow-precision-review', + appId: 'custom-workflow-app-id', + name: 'Precision Review Workflow', +} + describe('evaluation store', () => { beforeEach(() => { useEvaluationStore.setState({ resources: {}, initialResources: {} }) @@ -17,7 +22,6 @@ describe('evaluation store', () => { const resourceType = 'apps' const resourceId = 'app-1' const store = useEvaluationStore.getState() - const config = getEvaluationMockConfig(resourceType) store.ensureResource(resourceType, resourceId) store.addCustomMetric(resourceType, resourceId) @@ -27,9 +31,9 @@ describe('evaluation store', () => { expect(isCustomMetricConfigured(initialMetric!)).toBe(false) store.setCustomMetricWorkflow(resourceType, resourceId, initialMetric!.id, { - workflowId: config.workflowOptions[0].id, - workflowAppId: 'custom-workflow-app-id', - workflowName: config.workflowOptions[0].label, + workflowId: customWorkflow.id, + workflowAppId: customWorkflow.appId, + workflowName: customWorkflow.name, }) store.syncCustomMetricMappings(resourceType, resourceId, initialMetric!.id, ['query']) store.syncCustomMetricOutputs(resourceType, resourceId, initialMetric!.id, [{ @@ -44,8 +48,8 @@ describe('evaluation store', () => { const configuredMetric = useEvaluationStore.getState().resources['apps:app-1'].metrics.find(metric => metric.id === initialMetric!.id) expect(isCustomMetricConfigured(configuredMetric!)).toBe(true) - expect(configuredMetric!.customConfig!.workflowAppId).toBe('custom-workflow-app-id') - expect(configuredMetric!.customConfig!.workflowName).toBe(config.workflowOptions[0].label) + expect(configuredMetric!.customConfig!.workflowAppId).toBe(customWorkflow.appId) + expect(configuredMetric!.customConfig!.workflowName).toBe(customWorkflow.name) expect(configuredMetric!.customConfig!.outputs).toEqual([{ id: 'score', valueType: 'number' }]) }) @@ -71,12 +75,11 @@ describe('evaluation store', () => { const resourceType = 'apps' const resourceId = 'app-2' const store = useEvaluationStore.getState() - const config = getEvaluationMockConfig(resourceType) store.ensureResource(resourceType, resourceId) - store.addBuiltinMetric(resourceType, resourceId, config.builtinMetrics[1].id) + store.addBuiltinMetric(resourceType, resourceId, 'faithfulness') - const addedMetric = useEvaluationStore.getState().resources['apps:app-2'].metrics.find(metric => metric.optionId === config.builtinMetrics[1].id) + const addedMetric = useEvaluationStore.getState().resources['apps:app-2'].metrics.find(metric => metric.optionId === 'faithfulness') expect(addedMetric).toBeDefined() store.removeMetric(resourceType, resourceId, addedMetric!.id) @@ -88,8 +91,7 @@ describe('evaluation store', () => { const resourceType = 'apps' const resourceId = 'app-4' const store = useEvaluationStore.getState() - const config = getEvaluationMockConfig(resourceType) - const metricId = config.builtinMetrics[0].id + const metricId = 'answer-correctness' store.ensureResource(resourceType, resourceId) store.addBuiltinMetric(resourceType, resourceId, metricId, [ @@ -115,10 +117,9 @@ describe('evaluation store', () => { const resourceType = 'apps' const resourceId = 'app-conditions' const store = useEvaluationStore.getState() - const config = getEvaluationMockConfig(resourceType) store.ensureResource(resourceType, resourceId) - store.addBuiltinMetric(resourceType, resourceId, config.builtinMetrics[0].id, [ + store.addBuiltinMetric(resourceType, resourceId, 'answer-correctness', [ { node_id: 'node-answer', title: 'Answer Node', type: 'llm' }, ]) store.setConditionLogicalOperator(resourceType, resourceId, 'or') @@ -137,27 +138,26 @@ describe('evaluation store', () => { const resourceType = 'apps' const resourceId = 'app-condition-selector' const store = useEvaluationStore.getState() - const config = getEvaluationMockConfig(resourceType) store.ensureResource(resourceType, resourceId) store.addCustomMetric(resourceType, resourceId) const customMetric = useEvaluationStore.getState().resources['apps:app-condition-selector'].metrics.find(metric => metric.kind === 'custom-workflow')! store.setCustomMetricWorkflow(resourceType, resourceId, customMetric.id, { - workflowId: config.workflowOptions[0].id, - workflowAppId: 'custom-workflow-app-id', - workflowName: config.workflowOptions[0].label, + workflowId: customWorkflow.id, + workflowAppId: customWorkflow.appId, + workflowName: customWorkflow.name, }) store.syncCustomMetricOutputs(resourceType, resourceId, customMetric.id, [{ id: 'reason', valueType: 'string', }]) - store.addCondition(resourceType, resourceId, [config.workflowOptions[0].id, 'reason']) + store.addCondition(resourceType, resourceId, [customWorkflow.id, 'reason']) const condition = useEvaluationStore.getState().resources['apps:app-condition-selector'].judgmentConfig.conditions[0] - expect(condition.variableSelector).toEqual([config.workflowOptions[0].id, 'reason']) + expect(condition.variableSelector).toEqual([customWorkflow.id, 'reason']) expect(condition.comparisonOperator).toBe('contains') expect(condition.value).toBeNull() }) @@ -166,16 +166,15 @@ describe('evaluation store', () => { const resourceType = 'apps' const resourceId = 'app-3' const store = useEvaluationStore.getState() - const config = getEvaluationMockConfig(resourceType) store.ensureResource(resourceType, resourceId) store.addCustomMetric(resourceType, resourceId) const customMetric = useEvaluationStore.getState().resources['apps:app-3'].metrics.find(metric => metric.kind === 'custom-workflow')! store.setCustomMetricWorkflow(resourceType, resourceId, customMetric.id, { - workflowId: config.workflowOptions[0].id, - workflowAppId: 'custom-workflow-app-id', - workflowName: config.workflowOptions[0].label, + workflowId: customWorkflow.id, + workflowAppId: customWorkflow.appId, + workflowName: customWorkflow.name, }) store.syncCustomMetricOutputs(resourceType, resourceId, customMetric.id, [{ id: 'reason', @@ -185,7 +184,7 @@ describe('evaluation store', () => { const condition = useEvaluationStore.getState().resources['apps:app-3'].judgmentConfig.conditions[0] - store.updateConditionMetric(resourceType, resourceId, condition.id, [config.workflowOptions[0].id, 'reason']) + store.updateConditionMetric(resourceType, resourceId, condition.id, [customWorkflow.id, 'reason']) store.updateConditionValue(resourceType, resourceId, condition.id, 'needs follow-up') store.updateConditionOperator(resourceType, resourceId, condition.id, 'empty') diff --git a/web/app/components/evaluation/components/batch-test-panel/input-fields-tab.tsx b/web/app/components/evaluation/components/batch-test-panel/input-fields-tab.tsx index c3bc295081..5cff6e3dc0 100644 --- a/web/app/components/evaluation/components/batch-test-panel/input-fields-tab.tsx +++ b/web/app/components/evaluation/components/batch-test-panel/input-fields-tab.tsx @@ -1,7 +1,7 @@ import type { EvaluationResourceProps } from '../../types' import { Button } from '@langgenius/dify-ui/button' import { useTranslation } from 'react-i18next' -import { getEvaluationMockConfig } from '../../mock' +import { EVALUATION_TEMPLATE_FILE_NAMES } from '../../store-utils' import InputFieldsRequirements from './input-fields/input-fields-requirements' import UploadRunPopover from './input-fields/upload-run-popover' import { useInputFieldsActions } from './input-fields/use-input-fields-actions' @@ -19,7 +19,6 @@ const InputFieldsTab = ({ isRunnable, }: InputFieldsTabProps) => { const { t } = useTranslation('evaluation') - const config = getEvaluationMockConfig(resourceType) const { inputFields, isInputFieldsLoading } = usePublishedInputFields(resourceType, resourceId) const actions = useInputFieldsActions({ resourceType, @@ -28,7 +27,7 @@ const InputFieldsTab = ({ isInputFieldsLoading, isPanelReady, isRunnable, - templateFileName: config.templateFileName, + templateFileName: EVALUATION_TEMPLATE_FILE_NAMES[resourceType], }) return ( diff --git a/web/app/components/evaluation/components/metric-selector/selector-metric-section.tsx b/web/app/components/evaluation/components/metric-selector/selector-metric-section.tsx index 43d7213ba1..942b1ea1ab 100644 --- a/web/app/components/evaluation/components/metric-selector/selector-metric-section.tsx +++ b/web/app/components/evaluation/components/metric-selector/selector-metric-section.tsx @@ -13,7 +13,7 @@ type SelectorMetricSectionProps = { isShowingAllNodes: boolean onToggleExpanded: () => void onToggleShowAllNodes: () => void - onToggleNodeSelection: (metricId: string, nodeInfo: MetricSelectorSection['visibleNodes'][number]) => void + onToggleNodeSelection: (metric: MetricSelectorSection['metric'], nodeInfo: MetricSelectorSection['visibleNodes'][number]) => void t: TFunction<'evaluation'> } @@ -93,7 +93,7 @@ const SelectorMetricSection = ({ 'flex w-full items-center gap-1 rounded-md px-3 py-1.5 text-left transition-colors hover:bg-state-base-hover-alt', isAdded && 'opacity-50', )} - onClick={() => onToggleNodeSelection(metric.id, nodeInfo)} + onClick={() => onToggleNodeSelection(metric, nodeInfo)} >
void + toggleNodeSelection: (metric: MetricOption, nodeInfo: NodeInfo) => void } export const useMetricSelectorData = ({ @@ -35,7 +33,6 @@ export const useMetricSelectorData = ({ resourceId, }: UseMetricSelectorDataOptions): UseMetricSelectorDataResult => { const { t } = useTranslation('evaluation') - const config = getEvaluationMockConfig(resourceType) const metrics = useEvaluationResource(resourceType, resourceId).metrics const addBuiltinMetric = useEvaluationStore(state => state.addBuiltinMetric) const removeMetric = useEvaluationStore(state => state.removeMetric) @@ -53,25 +50,15 @@ export const useMetricSelectorData = ({ const nodeInfoMap = useMemo(() => getDefaultMetricNodeInfoMap(defaultMetrics), [defaultMetrics]) const resolvedMetrics = useMemo(() => { - const metricsMap = new Map(config.builtinMetrics.map(metric => [metric.id, metric] as const)) - return defaultMetrics .map((defaultMetric) => { if (!defaultMetric.metric) return null - const configMetric = metricsMap.get(defaultMetric.metric) - if (configMetric) { - return { - ...configMetric, - valueType: normalizeMetricValueType(defaultMetric.value_type), - } - } - return buildMetricOption(defaultMetric.metric, defaultMetric.value_type) }) .filter((metric): metric is MetricOption => !!metric) - }, [config.builtinMetrics, defaultMetrics]) + }, [defaultMetrics]) const filteredSections = useMemo(() => { const keyword = query.trim().toLowerCase() @@ -120,7 +107,8 @@ export const useMetricSelectorData = ({ }).filter((section): section is MetricSelectorSection => !!section) }, [nodeInfoMap, query, resolvedMetrics, t]) - const toggleNodeSelection = (metricId: string, nodeInfo: NodeInfo) => { + const toggleNodeSelection = (metric: MetricOption, nodeInfo: NodeInfo) => { + const metricId = metric.id const addedMetric = builtinMetricMap.get(metricId) const currentSelectedNodes = addedMetric?.nodeInfoList ?? [] @@ -135,7 +123,7 @@ export const useMetricSelectorData = ({ return } - addBuiltinMetric(resourceType, resourceId, metricId, nextSelectedNodes) + addBuiltinMetric(resourceType, resourceId, metricId, nextSelectedNodes, metric) } return { diff --git a/web/app/components/evaluation/components/pipeline/pipeline-batch-actions.tsx b/web/app/components/evaluation/components/pipeline/pipeline-batch-actions.tsx index cc1c415bde..cad35dc31b 100644 --- a/web/app/components/evaluation/components/pipeline/pipeline-batch-actions.tsx +++ b/web/app/components/evaluation/components/pipeline/pipeline-batch-actions.tsx @@ -4,8 +4,8 @@ import type { EvaluationResourceProps } from '../../types' import type { InputField } from '../batch-test-panel/input-fields/input-fields-utils' import { Button } from '@langgenius/dify-ui/button' import { useTranslation } from 'react-i18next' -import { getEvaluationMockConfig } from '../../mock' import { isEvaluationRunnable, useEvaluationResource } from '../../store' +import { EVALUATION_TEMPLATE_FILE_NAMES } from '../../store-utils' import UploadRunPopover from '../batch-test-panel/input-fields/upload-run-popover' import { useInputFieldsActions } from '../batch-test-panel/input-fields/use-input-fields-actions' @@ -20,7 +20,6 @@ const PipelineBatchActions = ({ }: EvaluationResourceProps) => { const { t } = useTranslation('evaluation') const resource = useEvaluationResource(resourceType, resourceId) - const config = getEvaluationMockConfig(resourceType) const isConfigReady = !!resource.judgeModelId && resource.metrics.some(metric => metric.kind === 'builtin') const isRunnable = isEvaluationRunnable(resource) const actions = useInputFieldsActions({ @@ -30,7 +29,7 @@ const PipelineBatchActions = ({ isInputFieldsLoading: false, isPanelReady: isConfigReady, isRunnable, - templateFileName: config.templateFileName, + templateFileName: EVALUATION_TEMPLATE_FILE_NAMES[resourceType], }) return ( diff --git a/web/app/components/evaluation/components/pipeline/pipeline-metrics-section.tsx b/web/app/components/evaluation/components/pipeline/pipeline-metrics-section.tsx index 2457b081c3..553083f867 100644 --- a/web/app/components/evaluation/components/pipeline/pipeline-metrics-section.tsx +++ b/web/app/components/evaluation/components/pipeline/pipeline-metrics-section.tsx @@ -9,8 +9,8 @@ import { BlockEnum } from '@/app/components/workflow/types' import { useDatasetDetailContextWithSelector } from '@/context/dataset-detail' import { useAvailableEvaluationMetrics } from '@/service/use-evaluation' import { usePublishedPipelineInfo } from '@/service/use-pipeline' -import { getEvaluationMockConfig } from '../../mock' import { useEvaluationResource, useEvaluationStore } from '../../store' +import { buildMetricOption } from '../metric-selector/utils' import { InlineSectionHeader } from '../section-header' import PipelineMetricItem from './pipeline-metric-item' @@ -52,7 +52,6 @@ const PipelineMetricsSection = ({ const { data: availableMetricsData } = useAvailableEvaluationMetrics() const { data: publishedPipeline } = usePublishedPipelineInfo(pipelineId || '') const resource = useEvaluationResource(resourceType, resourceId) - const config = getEvaluationMockConfig(resourceType) const knowledgeIndexNodeInfoList = useMemo( () => getKnowledgeIndexNodeInfo(publishedPipeline?.graph.nodes), [publishedPipeline?.graph.nodes], @@ -62,12 +61,14 @@ const PipelineMetricsSection = ({ .filter(metric => metric.kind === 'builtin') .map(metric => [metric.optionId, metric]), ), [resource.metrics]) - const availableMetricIds = useMemo(() => new Set(availableMetricsData?.metrics ?? []), [availableMetricsData?.metrics]) const availableBuiltinMetrics = useMemo(() => { - return config.builtinMetrics.filter(metric => - availableMetricIds.has(metric.id) || builtinMetricMap.has(metric.id), - ) - }, [availableMetricIds, builtinMetricMap, config.builtinMetrics]) + const metricIds = new Set([ + ...(availableMetricsData?.metrics ?? []), + ...builtinMetricMap.keys(), + ]) + + return Array.from(metricIds).map(metricId => buildMetricOption(metricId)) + }, [availableMetricsData?.metrics, builtinMetricMap]) useEffect(() => { if (!knowledgeIndexNodeInfoList.length) @@ -77,7 +78,7 @@ const PipelineMetricsSection = ({ if (metric.kind !== 'builtin' || isSameNodeInfoList(metric.nodeInfoList, knowledgeIndexNodeInfoList)) return - addBuiltinMetric(resourceType, resourceId, metric.optionId, knowledgeIndexNodeInfoList) + addBuiltinMetric(resourceType, resourceId, metric.optionId, knowledgeIndexNodeInfoList, metric) }) }, [addBuiltinMetric, knowledgeIndexNodeInfoList, resource.metrics, resourceId, resourceType]) @@ -88,7 +89,8 @@ const PipelineMetricsSection = ({ return } - addBuiltinMetric(resourceType, resourceId, metricId, knowledgeIndexNodeInfoList) + const metricOption = availableBuiltinMetrics.find(metric => metric.id === metricId) + addBuiltinMetric(resourceType, resourceId, metricId, knowledgeIndexNodeInfoList, metricOption) } return ( diff --git a/web/app/components/evaluation/default-metric-descriptions.ts b/web/app/components/evaluation/default-metric-descriptions.ts index b2189b52c2..c989a45d62 100644 --- a/web/app/components/evaluation/default-metric-descriptions.ts +++ b/web/app/components/evaluation/default-metric-descriptions.ts @@ -29,29 +29,29 @@ const DEFAULT_METRIC_DESCRIPTION_KEYS = { type DefaultMetricDescriptionKey = typeof DEFAULT_METRIC_DESCRIPTION_KEYS[keyof typeof DEFAULT_METRIC_DESCRIPTION_KEYS] const DEFAULT_METRIC_DESCRIPTIONS: Record = { - faithfulness: DEFAULT_METRIC_DESCRIPTION.FAITHFULNESS, - answer_relevancy: DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY, - answer_correctness: DEFAULT_METRIC_DESCRIPTION.ANSWER_CORRECTNESS, - semantic_similarity: DEFAULT_METRIC_DESCRIPTION.SEMANTIC_SIMILARITY, - context_precision: DEFAULT_METRIC_DESCRIPTION.CONTEXT_PRECISION, - context_recall: DEFAULT_METRIC_DESCRIPTION.CONTEXT_RECALL, - context_relevance: DEFAULT_METRIC_DESCRIPTION.CONTEXT_RELEVANCE, - tool_correctness: DEFAULT_METRIC_DESCRIPTION.TOOL_CORRECTNESS, - task_completion: DEFAULT_METRIC_DESCRIPTION.TASK_COMPLETION, - relevance: DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY, + 'faithfulness': DEFAULT_METRIC_DESCRIPTION.FAITHFULNESS, + 'answer-relevancy': DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY, + 'answer-correctness': DEFAULT_METRIC_DESCRIPTION.ANSWER_CORRECTNESS, + 'semantic-similarity': DEFAULT_METRIC_DESCRIPTION.SEMANTIC_SIMILARITY, + 'context-precision': DEFAULT_METRIC_DESCRIPTION.CONTEXT_PRECISION, + 'context-recall': DEFAULT_METRIC_DESCRIPTION.CONTEXT_RECALL, + 'context-relevance': DEFAULT_METRIC_DESCRIPTION.CONTEXT_RELEVANCE, + 'tool-correctness': DEFAULT_METRIC_DESCRIPTION.TOOL_CORRECTNESS, + 'task-completion': DEFAULT_METRIC_DESCRIPTION.TASK_COMPLETION, + 'relevance': DEFAULT_METRIC_DESCRIPTION.ANSWER_RELEVANCY, } const DEFAULT_METRIC_DESCRIPTION_I18N_KEYS: Record = { - faithfulness: DEFAULT_METRIC_DESCRIPTION_KEYS.FAITHFULNESS, - answer_relevancy: DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY, - answer_correctness: DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_CORRECTNESS, - semantic_similarity: DEFAULT_METRIC_DESCRIPTION_KEYS.SEMANTIC_SIMILARITY, - context_precision: DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_PRECISION, - context_recall: DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RECALL, - context_relevance: DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RELEVANCE, - tool_correctness: DEFAULT_METRIC_DESCRIPTION_KEYS.TOOL_CORRECTNESS, - task_completion: DEFAULT_METRIC_DESCRIPTION_KEYS.TASK_COMPLETION, - relevance: DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY, + 'faithfulness': DEFAULT_METRIC_DESCRIPTION_KEYS.FAITHFULNESS, + 'answer-relevancy': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY, + 'answer-correctness': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_CORRECTNESS, + 'semantic-similarity': DEFAULT_METRIC_DESCRIPTION_KEYS.SEMANTIC_SIMILARITY, + 'context-precision': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_PRECISION, + 'context-recall': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RECALL, + 'context-relevance': DEFAULT_METRIC_DESCRIPTION_KEYS.CONTEXT_RELEVANCE, + 'tool-correctness': DEFAULT_METRIC_DESCRIPTION_KEYS.TOOL_CORRECTNESS, + 'task-completion': DEFAULT_METRIC_DESCRIPTION_KEYS.TASK_COMPLETION, + 'relevance': DEFAULT_METRIC_DESCRIPTION_KEYS.ANSWER_RELEVANCY, } const normalizeMetricId = (metricId: string) => metricId.trim().toLowerCase().replace(/_/g, '-') diff --git a/web/app/components/evaluation/mock.ts b/web/app/components/evaluation/mock.ts deleted file mode 100644 index 61f6d57b39..0000000000 --- a/web/app/components/evaluation/mock.ts +++ /dev/null @@ -1,179 +0,0 @@ -import type { - EvaluationFieldOption, - EvaluationMockConfig, - EvaluationResourceType, - MetricOption, -} from './types' -import { getDefaultMetricDescription } from './default-metric-descriptions' - -const judgeModels = [ - { - id: 'gpt-4.1-mini', - label: 'GPT-4.1 mini', - provider: 'OpenAI', - }, - { - id: 'claude-3-7-sonnet', - label: 'Claude 3.7 Sonnet', - provider: 'Anthropic', - }, - { - id: 'gemini-2.0-flash', - label: 'Gemini 2.0 Flash', - provider: 'Google', - }, -] - -const builtinMetrics: MetricOption[] = [ - { - id: 'answer-correctness', - label: 'Answer Correctness', - description: getDefaultMetricDescription('answer-correctness'), - valueType: 'number', - }, - { - id: 'faithfulness', - label: 'Faithfulness', - description: getDefaultMetricDescription('faithfulness'), - valueType: 'number', - }, - { - id: 'relevance', - label: 'Relevance', - description: getDefaultMetricDescription('relevance'), - valueType: 'number', - }, - { - id: 'latency', - label: 'Latency', - description: 'Captures runtime responsiveness for the full execution path.', - valueType: 'number', - }, - { - id: 'token-usage', - label: 'Token Usage', - description: 'Tracks prompt and completion token consumption for the run.', - valueType: 'number', - }, - { - id: 'tool-success-rate', - label: 'Tool Success Rate', - description: 'Measures whether each required tool invocation finishes without failure.', - valueType: 'number', - }, -] - -const pipelineBuiltinMetrics: MetricOption[] = [ - { - id: 'context-precision', - label: 'Context Precision', - description: getDefaultMetricDescription('context-precision'), - valueType: 'number', - }, - { - id: 'context-recall', - label: 'Context Recall', - description: getDefaultMetricDescription('context-recall'), - valueType: 'number', - }, - { - id: 'context-relevance', - label: 'Context Relevance', - description: getDefaultMetricDescription('context-relevance'), - valueType: 'number', - }, -] - -const workflowOptions = [ - { - id: 'workflow-precision-review', - label: 'Precision Review Workflow', - description: 'Custom evaluator for nuanced quality review.', - targetVariables: [ - { id: 'query', label: 'query' }, - { id: 'answer', label: 'answer' }, - { id: 'reference', label: 'reference' }, - ], - }, - { - id: 'workflow-risk-review', - label: 'Risk Review Workflow', - description: 'Custom evaluator for policy and escalation checks.', - targetVariables: [ - { id: 'input', label: 'input' }, - { id: 'output', label: 'output' }, - ], - }, -] - -const workflowFields: EvaluationFieldOption[] = [ - { id: 'app.input.query', label: 'Query', group: 'App Input', type: 'string' }, - { id: 'app.input.locale', label: 'Locale', group: 'App Input', type: 'enum', options: [{ value: 'en-US', label: 'en-US' }, { value: 'zh-Hans', label: 'zh-Hans' }] }, - { id: 'app.output.answer', label: 'Answer', group: 'App Output', type: 'string' }, - { id: 'app.output.score', label: 'Score', group: 'App Output', type: 'number' }, - { id: 'system.has_context', label: 'Has Context', group: 'System', type: 'boolean' }, -] - -const pipelineFields: EvaluationFieldOption[] = [ - { id: 'dataset.input.document_id', label: 'Document ID', group: 'Dataset', type: 'string' }, - { id: 'dataset.input.chunk_count', label: 'Chunk Count', group: 'Dataset', type: 'number' }, - { id: 'retrieval.output.hit_rate', label: 'Hit Rate', group: 'Retrieval', type: 'number' }, - { id: 'retrieval.output.source', label: 'Source', group: 'Retrieval', type: 'enum', options: [{ value: 'bm25', label: 'BM25' }, { value: 'hybrid', label: 'Hybrid' }] }, - { id: 'pipeline.output.published', label: 'Published', group: 'Output', type: 'boolean' }, -] - -const snippetFields: EvaluationFieldOption[] = [ - { id: 'snippet.input.blog_url', label: 'Blog URL', group: 'Snippet Input', type: 'string' }, - { id: 'snippet.input.platforms', label: 'Platforms', group: 'Snippet Input', type: 'string' }, - { id: 'snippet.output.content', label: 'Generated Content', group: 'Snippet Output', type: 'string' }, - { id: 'snippet.output.length', label: 'Output Length', group: 'Snippet Output', type: 'number' }, - { id: 'system.requires_review', label: 'Requires Review', group: 'System', type: 'boolean' }, -] - -export const getEvaluationMockConfig = (resourceType: EvaluationResourceType): EvaluationMockConfig => { - if (resourceType === 'datasets') { - return { - judgeModels, - builtinMetrics: pipelineBuiltinMetrics, - workflowOptions, - fieldOptions: pipelineFields, - templateFileName: 'pipeline-evaluation-template.csv', - batchRequirements: [ - 'Include one row per retrieval scenario.', - 'Provide the expected source or target chunk for each case.', - 'Keep numeric metrics in plain number format.', - ], - historySummaryLabel: 'Pipeline evaluation batch', - } - } - - if (resourceType === 'snippets') { - return { - judgeModels, - builtinMetrics, - workflowOptions, - fieldOptions: snippetFields, - templateFileName: 'snippet-evaluation-template.csv', - batchRequirements: [ - 'Include one row per snippet execution case.', - 'Provide the expected final content or acceptance rule.', - 'Keep optional fields empty when not used.', - ], - historySummaryLabel: 'Snippet evaluation batch', - } - } - - return { - judgeModels, - builtinMetrics, - workflowOptions, - fieldOptions: workflowFields, - templateFileName: 'workflow-evaluation-template.csv', - batchRequirements: [ - 'Include one row per workflow test case.', - 'Provide both user input and expected answer when available.', - 'Keep boolean columns as true or false.', - ], - historySummaryLabel: 'Workflow evaluation batch', - } -} diff --git a/web/app/components/evaluation/store-utils.ts b/web/app/components/evaluation/store-utils.ts index b8903442d8..ce2ecf7f18 100644 --- a/web/app/components/evaluation/store-utils.ts +++ b/web/app/components/evaluation/store-utils.ts @@ -21,7 +21,6 @@ import type { NodeInfo, } from '@/types/evaluation' import { getDefaultMetricDescription } from './default-metric-descriptions' -import { getEvaluationMockConfig } from './mock' import { buildConditionMetricOptions, decodeModelSelection, @@ -34,6 +33,19 @@ import { type EvaluationStoreResources = Record export const DEFAULT_PIPELINE_METRIC_THRESHOLD = 0.85 +export const EVALUATION_TEMPLATE_FILE_NAMES: Record = { + apps: 'workflow-evaluation-template.csv', + snippets: 'snippet-evaluation-template.csv', + datasets: 'pipeline-evaluation-template.csv', +} + +const BATCH_HISTORY_SUMMARY_LABELS: Record = { + apps: 'Workflow evaluation batch', + snippets: 'Snippet evaluation batch', + datasets: 'Pipeline evaluation batch', +} + +const PIPELINE_METRIC_IDS = new Set(['context-precision', 'context-recall', 'context-relevance']) const PIPELINE_LOGICAL_OPERATOR: JudgmentConfig['logicalOperator'] = 'and' @@ -47,9 +59,8 @@ const humanizeMetricId = (metricId: string) => { .join(' ') } -const resolveMetricOption = (resourceType: EvaluationResourceType, metricId: string): MetricOption => { - const config = getEvaluationMockConfig(resourceType) - return config.builtinMetrics.find(metric => metric.id === metricId) ?? { +export const resolveMetricOption = (metricId: string): MetricOption => { + return { id: metricId, label: humanizeMetricId(metricId), description: getDefaultMetricDescription(metricId), @@ -57,13 +68,11 @@ const resolveMetricOption = (resourceType: EvaluationResourceType, metricId: str } } -const pipelineMetricIds = new Set(getEvaluationMockConfig('datasets').builtinMetrics.map(metric => metric.id)) - const isPipelineResourceType = (resourceType: EvaluationResourceType) => resourceType === 'datasets' const isPipelineResourceState = (resource: EvaluationResourceState) => { return resource.metrics.length > 0 - && resource.metrics.every(metric => metric.kind === 'builtin' && pipelineMetricIds.has(metric.optionId)) + && resource.metrics.every(metric => metric.kind === 'builtin' && PIPELINE_METRIC_IDS.has(metric.optionId)) } const normalizeNodeInfoList = (value: NodeInfo[] | undefined): NodeInfo[] => { @@ -88,10 +97,7 @@ const normalizeNodeInfoList = (value: NodeInfo[] | undefined): NodeInfo[] => { .filter((item): item is NodeInfo => !!item) } -const normalizeDefaultMetrics = ( - resourceType: EvaluationResourceType, - value: EvaluationDefaultMetric[] | null | undefined, -): EvaluationMetric[] => { +const normalizeDefaultMetrics = (value: EvaluationDefaultMetric[] | null | undefined): EvaluationMetric[] => { if (!value?.length) return [] @@ -101,7 +107,7 @@ const normalizeDefaultMetrics = ( if (!metricId) return null - const metricOption = resolveMetricOption(resourceType, metricId) + const metricOption = resolveMetricOption(metricId) return createBuiltinMetric(metricOption, normalizeNodeInfoList(item.node_info_list ?? [])) }) .filter((item): item is EvaluationMetric => !!item) @@ -455,7 +461,7 @@ export const buildStateFromEvaluationConfig = ( resourceType: EvaluationResourceType, config: EvaluationConfig, ): EvaluationResourceState => { - const defaultMetrics = normalizeDefaultMetrics(resourceType, config.default_metrics) + const defaultMetrics = normalizeDefaultMetrics(config.default_metrics) const customMetrics = isPipelineResourceType(resourceType) ? [] : normalizeCustomMetric(config.customized_metrics) const metrics = isPipelineResourceType(resourceType) ? normalizePipelineMetrics(config, defaultMetrics) @@ -652,14 +658,12 @@ export const createBatchTestRecord = ( resourceType: EvaluationResourceType, uploadedFileName: string | null | undefined, ): BatchTestRecord => { - const config = getEvaluationMockConfig(resourceType) - return { id: createId('batch'), - fileName: uploadedFileName ?? config.templateFileName, + fileName: uploadedFileName ?? EVALUATION_TEMPLATE_FILE_NAMES[resourceType], status: 'running', startedAt: new Date().toLocaleTimeString(), - summary: config.historySummaryLabel, + summary: BATCH_HISTORY_SUMMARY_LABELS[resourceType], } } diff --git a/web/app/components/evaluation/store.ts b/web/app/components/evaluation/store.ts index 9f94e594b0..c0ba492891 100644 --- a/web/app/components/evaluation/store.ts +++ b/web/app/components/evaluation/store.ts @@ -2,11 +2,11 @@ import type { ComparisonOperator, EvaluationResourceState, EvaluationResourceType, + MetricOption, } from './types' import type { EvaluationConfig, NodeInfo } from '@/types/evaluation' import { isEqual } from 'es-toolkit/predicate' import { create } from 'zustand' -import { getEvaluationMockConfig } from './mock' import { buildConditionItem, buildInitialState, @@ -20,6 +20,7 @@ import { isCustomMetricConfigured as isCustomMetricConfiguredFromUtils, isEvaluationRunnable as isEvaluationRunnableFromUtils, requiresConditionValue as requiresConditionValueFromUtils, + resolveMetricOption, syncCustomMetricMappings as syncCustomMetricMappingsFromUtils, syncJudgmentConfigWithMetrics, updateMetric, @@ -35,7 +36,7 @@ type EvaluationStore = { resetResourceConfig: (resourceType: EvaluationResourceType, resourceId: string) => void markResourceConfigSaved: (resourceType: EvaluationResourceType, resourceId: string) => void setJudgeModel: (resourceType: EvaluationResourceType, resourceId: string, judgeModelId: string) => void - addBuiltinMetric: (resourceType: EvaluationResourceType, resourceId: string, optionId: string, nodeInfoList?: NodeInfo[]) => void + addBuiltinMetric: (resourceType: EvaluationResourceType, resourceId: string, optionId: string, nodeInfoList?: NodeInfo[], metricOption?: MetricOption) => void updateMetricThreshold: (resourceType: EvaluationResourceType, resourceId: string, metricId: string, threshold: number) => void addCustomMetric: (resourceType: EvaluationResourceType, resourceId: string) => void removeMetric: (resourceType: EvaluationResourceType, resourceId: string, metricId: string) => void @@ -214,11 +215,8 @@ export const useEvaluationStore = create((set, get) => ({ })), })) }, - addBuiltinMetric: (resourceType, resourceId, optionId, nodeInfoList = []) => { - const option = getEvaluationMockConfig(resourceType).builtinMetrics.find(metric => metric.id === optionId) - if (!option) - return - + addBuiltinMetric: (resourceType, resourceId, optionId, nodeInfoList = [], metricOption) => { + const option = metricOption ?? resolveMetricOption(optionId) set((state) => { return { resources: updateResourceState(state.resources, resourceType, resourceId, (currentResource) => { diff --git a/web/app/components/evaluation/types.ts b/web/app/components/evaluation/types.ts index 57d0a66832..c57d696b79 100644 --- a/web/app/components/evaluation/types.ts +++ b/web/app/components/evaluation/types.ts @@ -17,8 +17,6 @@ export type MetricKind = 'builtin' | 'custom-workflow' export type BatchTestTab = 'input-fields' | 'history' -export type FieldType = 'string' | 'number' | 'boolean' | 'enum' - export type ConditionMetricValueType = 'string' | 'number' | 'boolean' export type ComparisonOperator @@ -41,12 +39,6 @@ export type ComparisonOperator | 'is null' | 'is not null' -export type JudgeModelOption = { - id: string - label: string - provider: string -} - export type MetricOption = { id: string label: string @@ -54,27 +46,6 @@ export type MetricOption = { valueType: ConditionMetricValueType } -export type EvaluationWorkflowOption = { - id: string - label: string - description: string - targetVariables: Array<{ - id: string - label: string - }> -} - -export type EvaluationFieldOption = { - id: string - label: string - group: string - type: FieldType - options?: Array<{ - value: string - label: string - }> -} - export type CustomMetricMapping = { id: string inputVariableId: string | null @@ -147,13 +118,3 @@ export type EvaluationResourceState = { selectedRunId: string | null batchRecords: BatchTestRecord[] } - -export type EvaluationMockConfig = { - judgeModels: JudgeModelOption[] - builtinMetrics: MetricOption[] - workflowOptions: EvaluationWorkflowOption[] - fieldOptions: EvaluationFieldOption[] - templateFileName: string - batchRequirements: string[] - historySummaryLabel: string -}