diff --git a/web/app/components/evaluation/__tests__/index.spec.tsx b/web/app/components/evaluation/__tests__/index.spec.tsx index 1346b363ce..fca39a5e5d 100644 --- a/web/app/components/evaluation/__tests__/index.spec.tsx +++ b/web/app/components/evaluation/__tests__/index.spec.tsx @@ -1,3 +1,5 @@ +import type { ReactNode } from 'react' +import { QueryClient, QueryClientProvider } from '@tanstack/react-query' import { act, fireEvent, render, screen } from '@testing-library/react' import Evaluation from '..' import ConditionsSection from '../components/conditions-section' @@ -6,6 +8,8 @@ import { useEvaluationStore } from '../store' const mockUseAvailableEvaluationMetrics = vi.hoisted(() => vi.fn()) const mockUseEvaluationConfig = vi.hoisted(() => vi.fn()) const mockUseEvaluationNodeInfoMutation = vi.hoisted(() => vi.fn()) +const mockUseSaveEvaluationConfigMutation = vi.hoisted(() => vi.fn()) +const mockUseStartEvaluationRunMutation = vi.hoisted(() => vi.fn()) vi.mock('@/app/components/header/account-setting/model-provider-page/hooks', () => ({ useModelList: () => ({ @@ -42,8 +46,71 @@ vi.mock('@/service/use-evaluation', () => ({ useEvaluationConfig: (...args: unknown[]) => mockUseEvaluationConfig(...args), useAvailableEvaluationMetrics: (...args: unknown[]) => mockUseAvailableEvaluationMetrics(...args), useEvaluationNodeInfoMutation: (...args: unknown[]) => mockUseEvaluationNodeInfoMutation(...args), + useSaveEvaluationConfigMutation: (...args: unknown[]) => mockUseSaveEvaluationConfigMutation(...args), + useStartEvaluationRunMutation: (...args: unknown[]) => mockUseStartEvaluationRunMutation(...args), })) +vi.mock('@/service/use-workflow', () => ({ + useAppWorkflow: () => ({ + data: { + graph: { + nodes: [{ + id: 'start', + data: { + type: 'start', + variables: [{ + variable: 'query', + type: 'text-input', + }], + }, + }], + }, + }, + isLoading: false, + }), +})) + +vi.mock('@/service/use-snippet-workflows', () => ({ + useSnippetPublishedWorkflow: () => ({ + data: { + graph: { + nodes: [{ + id: 'start', + data: { + type: 'start', + variables: [{ + variable: 'query', + type: 'text-input', + }], + }, + }], + }, + }, + isLoading: false, + }), +})) + +const renderWithQueryClient = (ui: ReactNode) => { + const queryClient = new QueryClient({ + defaultOptions: { + queries: { + retry: false, + }, + mutations: { + retry: false, + }, + }, + }) + + return render(ui, { + wrapper: ({ children }: { children: ReactNode }) => ( + + {children} + + ), + }) +} + describe('Evaluation', () => { beforeEach(() => { useEvaluationStore.setState({ resources: {} }) @@ -72,12 +139,24 @@ describe('Evaluation', () => { }) }, }) + mockUseSaveEvaluationConfigMutation.mockReturnValue({ + isPending: false, + mutate: vi.fn(), + }) + mockUseStartEvaluationRunMutation.mockReturnValue({ + isPending: false, + mutate: vi.fn(), + }) }) - it('should search, select metric nodes, and create a batch history record', async () => { - vi.useFakeTimers() + it('should search, select metric nodes, and save evaluation config', () => { + const saveConfig = vi.fn() + mockUseSaveEvaluationConfigMutation.mockReturnValue({ + isPending: false, + mutate: saveConfig, + }) - render() + renderWithQueryClient() expect(screen.getByTestId('evaluation-model-selector')).toHaveTextContent('openai:gpt-4o-mini') @@ -104,17 +183,39 @@ describe('Evaluation', () => { fireEvent.click(screen.getByTestId('evaluation-metric-node-answer-correctness-node-answer')) expect(screen.getAllByText('Answer Correctness').length).toBeGreaterThan(0) - fireEvent.click(screen.getByRole('button', { name: 'evaluation.batch.run' })) - expect(screen.getByText('evaluation.batch.status.running')).toBeInTheDocument() + fireEvent.click(screen.getByRole('button', { name: 'common.operation.save' })) - await act(async () => { - vi.advanceTimersByTime(1300) + expect(saveConfig).toHaveBeenCalledWith({ + params: { + targetType: 'apps', + targetId: 'app-1', + }, + body: { + evaluation_model: 'gpt-4o-mini', + evaluation_model_provider: 'openai', + default_metrics: [ + { + metric: 'faithfulness', + value_type: 'number', + node_info_list: [ + { node_id: 'node-faithfulness', title: 'Retriever Node', type: 'retriever' }, + ], + }, + { + metric: 'answer-correctness', + value_type: 'number', + node_info_list: [ + { node_id: 'node-answer', title: 'Answer Node', type: 'llm' }, + ], + }, + ], + customized_metrics: null, + judgment_config: null, + }, + }, { + onSuccess: expect.any(Function), + onError: expect.any(Function), }) - - expect(screen.getByText('evaluation.batch.status.success')).toBeInTheDocument() - expect(screen.getByText('Workflow evaluation batch')).toBeInTheDocument() - - vi.useRealTimers() }) it('should hide the value row for empty operators', () => { @@ -138,7 +239,7 @@ describe('Evaluation', () => { let rerender: ReturnType['rerender'] act(() => { - ({ rerender } = render()) + ({ rerender } = renderWithQueryClient()) }) expect(screen.getByPlaceholderText('evaluation.conditions.valuePlaceholder')).toBeInTheDocument() @@ -212,7 +313,7 @@ describe('Evaluation', () => { }, }) - render() + renderWithQueryClient() fireEvent.click(screen.getByRole('button', { name: 'evaluation.metrics.add' })) @@ -227,7 +328,7 @@ describe('Evaluation', () => { isLoading: false, }) - render() + renderWithQueryClient() fireEvent.click(screen.getByRole('button', { name: 'evaluation.metrics.add' })) @@ -256,7 +357,7 @@ describe('Evaluation', () => { }, }) - render() + renderWithQueryClient() fireEvent.click(screen.getByRole('button', { name: 'evaluation.metrics.add' })) @@ -270,7 +371,7 @@ describe('Evaluation', () => { }) it('should render the pipeline-specific layout without auto-selecting a judge model', () => { - render() + renderWithQueryClient() expect(screen.getByTestId('evaluation-model-selector')).toHaveTextContent('empty') expect(screen.getByText('evaluation.history.title')).toBeInTheDocument() @@ -294,14 +395,14 @@ describe('Evaluation', () => { }, }) - render() + renderWithQueryClient() expect(screen.getByText('Context Precision')).toBeInTheDocument() expect(screen.getByDisplayValue('0.85')).toBeInTheDocument() }) it('should enable pipeline batch actions after selecting a judge model and metric', () => { - render() + renderWithQueryClient() fireEvent.click(screen.getByRole('button', { name: 'select-model' })) fireEvent.click(screen.getByRole('button', { name: /Context Precision/i })) diff --git a/web/app/components/evaluation/__tests__/store.spec.ts b/web/app/components/evaluation/__tests__/store.spec.ts index eb7cd74b4d..c3b122c18a 100644 --- a/web/app/components/evaluation/__tests__/store.spec.ts +++ b/web/app/components/evaluation/__tests__/store.spec.ts @@ -6,6 +6,7 @@ import { requiresConditionValue, useEvaluationStore, } from '../store' +import { buildEvaluationConfigPayload, buildEvaluationRunRequest } from '../store-utils' describe('evaluation store', () => { beforeEach(() => { @@ -271,4 +272,76 @@ describe('evaluation store', () => { expect(hydratedState.uploadedFileName).toBe('batch.csv') expect(hydratedState.batchRecords).toHaveLength(1) }) + + it('should build an evaluation config save payload from resource state', () => { + const resourceType = 'apps' + const resourceId = 'app-save-config' + const store = useEvaluationStore.getState() + + store.ensureResource(resourceType, resourceId) + store.setJudgeModel(resourceType, resourceId, 'openai::gpt-4o-mini') + store.addBuiltinMetric(resourceType, resourceId, 'faithfulness', [ + { node_id: 'node-faithfulness', title: 'Retriever Node', type: 'retriever' }, + ]) + store.addCustomMetric(resourceType, resourceId) + + const customMetric = useEvaluationStore.getState().resources['apps:app-save-config'].metrics.find(metric => metric.kind === 'custom-workflow')! + store.setCustomMetricWorkflow(resourceType, resourceId, customMetric.id, { + workflowId: 'workflow-precision-review', + workflowAppId: 'evaluation-workflow-app-id', + workflowName: 'Precision Review', + }) + store.syncCustomMetricMappings(resourceType, resourceId, customMetric.id, ['query']) + store.syncCustomMetricOutputs(resourceType, resourceId, customMetric.id, [{ + id: 'score', + valueType: 'number', + }]) + + const syncedMetric = useEvaluationStore.getState().resources['apps:app-save-config'].metrics.find(metric => metric.id === customMetric.id)! + store.updateCustomMetricMapping(resourceType, resourceId, customMetric.id, syncedMetric.customConfig!.mappings[0].id, { + outputVariableId: '{{#node-answer.output#}}', + }) + store.addCondition(resourceType, resourceId, ['workflow-precision-review', 'score']) + + const condition = useEvaluationStore.getState().resources['apps:app-save-config'].judgmentConfig.conditions[0] + store.updateConditionOperator(resourceType, resourceId, condition.id, '≥') + store.updateConditionValue(resourceType, resourceId, condition.id, '0.8') + + const resource = useEvaluationStore.getState().resources['apps:app-save-config'] + const expectedPayload = { + evaluation_model: 'gpt-4o-mini', + evaluation_model_provider: 'openai', + default_metrics: [{ + metric: 'faithfulness', + value_type: 'number', + node_info_list: [ + { node_id: 'node-faithfulness', title: 'Retriever Node', type: 'retriever' }, + ], + }], + customized_metrics: { + evaluation_workflow_id: 'evaluation-workflow-app-id', + input_fields: { + query: '{{#node-answer.output#}}', + }, + output_fields: [{ + variable: 'score', + value_type: 'number', + }], + }, + judgment_config: { + logical_operator: 'and', + conditions: [{ + variable_selector: ['evaluation-workflow-app-id', 'score'], + comparison_operator: '≥', + value: '0.8', + }], + }, + } + + expect(buildEvaluationConfigPayload(resource)).toEqual(expectedPayload) + expect(buildEvaluationRunRequest(resource, 'file-1')).toEqual({ + ...expectedPayload, + file_id: 'file-1', + }) + }) }) diff --git a/web/app/components/evaluation/components/batch-test-panel/index.tsx b/web/app/components/evaluation/components/batch-test-panel/index.tsx index 0736de4f93..0f7226af00 100644 --- a/web/app/components/evaluation/components/batch-test-panel/index.tsx +++ b/web/app/components/evaluation/components/batch-test-panel/index.tsx @@ -2,8 +2,12 @@ import type { BatchTestTab, EvaluationResourceProps } from '../../types' import { useTranslation } from 'react-i18next' +import Button from '@/app/components/base/button' +import { toast } from '@/app/components/base/ui/toast' +import { useSaveEvaluationConfigMutation } from '@/service/use-evaluation' import { cn } from '@/utils/classnames' import { isEvaluationRunnable, useEvaluationResource, useEvaluationStore } from '../../store' +import { buildEvaluationConfigPayload } from '../../store-utils' import { TAB_CLASS_NAME } from '../../utils' import HistoryTab from './history-tab' import InputFieldsTab from './input-fields-tab' @@ -15,20 +19,64 @@ const BatchTestPanel = ({ resourceId, }: EvaluationResourceProps) => { const { t } = useTranslation('evaluation') + const { t: tCommon } = useTranslation('common') const tabLabels: Record = { 'input-fields': t('batch.tabs.input-fields'), 'history': t('batch.tabs.history'), } const resource = useEvaluationResource(resourceType, resourceId) const setBatchTab = useEvaluationStore(state => state.setBatchTab) + const saveConfigMutation = useSaveEvaluationConfigMutation() const isRunnable = isEvaluationRunnable(resource) const isPanelReady = !!resource.judgeModelId && resource.metrics.length > 0 + const handleSave = () => { + if (!isRunnable) { + toast.warning(t('batch.validation')) + return + } + + const body = buildEvaluationConfigPayload(resource) + + if (!body) { + toast.warning(t('batch.validation')) + return + } + + saveConfigMutation.mutate({ + params: { + targetType: resourceType, + targetId: resourceId, + }, + body, + }, { + onSuccess: () => { + toast.success(tCommon('api.saved')) + }, + onError: () => { + toast.error(t('config.saveFailed')) + }, + }) + } + return (
-
{t('batch.title')}
-
{t('batch.description')}
+
+
+
{t('batch.title')}
+
{t('batch.description')}
+
+ +