feat: add OpenAI LLM evaluation step for connector regression tests (#68673)
Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
parent 13c3e1e34f
commit cf1379d105
.github/prompts/regression-evaluation.prompt.yaml (new file, 34 lines)
@@ -0,0 +1,34 @@
name: Regression Report Evaluation
description: Evaluate Airbyte connector regression test reports and return a JSON verdict with reasoning
model: llama3.2:3b
modelParameters:
  temperature: 0.3
messages:
  - role: system
    content: |
      You are an expert at evaluating connector regression test results.
      Your task is to analyze the test report and determine if the regression tests should PASS or FAIL.

      Consider the following criteria:
      1. All test cases should pass (no failed tests)
      2. Record count differences between control and target versions should be minimal or explainable
      3. Message count differences should not indicate data loss or corruption
      4. Stream coverage should be reasonable
      5. Any warnings or errors in test outputs should be evaluated for severity

      Provide your evaluation in the following JSON format:
      {
        "pass": true/false,
        "summary": "A concise 2-3 sentence summary of the evaluation",
        "reasoning": "Detailed reasoning for your pass/fail decision, including specific issues found",
        "severity": "critical/major/minor/none",
        "recommendations": "Any recommendations for addressing issues"
      }

      Be strict but fair in your evaluation. Minor differences are acceptable, but data loss,
      corruption, or test failures should result in a FAIL.
  - role: user
    content: |
      Report:

      {{report_text}}
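
The evaluate_report.py script that consumes this prompt is not part of this diff. As a rough sketch only, a `.prompt.yaml` file like the one above could be rendered and sent to the OpenAI-compatible endpoint the workflow configures (Ollama on 127.0.0.1:11434); this assumes PyYAML and the `openai` client, and the helper name `load_prompt` is illustrative:

```python
# Hypothetical sketch: render the .prompt.yaml above and send it to an
# OpenAI-compatible endpoint (the local Ollama server the workflow starts).
# The real evaluate_report.py is not shown in this diff.
import os
import yaml  # PyYAML
from openai import OpenAI

def load_prompt(path: str, report_text: str) -> dict:
    """Load the prompt file and substitute the {{report_text}} placeholder."""
    with open(path) as f:
        spec = yaml.safe_load(f)
    messages = [
        {
            "role": m["role"],
            "content": m["content"].replace("{{report_text}}", report_text),
        }
        for m in spec["messages"]
    ]
    return {
        "model": os.environ.get("EVAL_MODEL", spec["model"]),
        "temperature": spec["modelParameters"]["temperature"],
        "messages": messages,
    }

if __name__ == "__main__":
    args = load_prompt(".github/prompts/regression-evaluation.prompt.yaml", "<report text>")
    client = OpenAI(
        base_url=os.environ.get("OPENAI_BASE_URL", "http://127.0.0.1:11434/v1"),
        api_key=os.environ.get("OPENAI_API_KEY", "ollama"),
    )
    resp = client.chat.completions.create(**args)
    print(resp.choices[0].message.content)  # expected to contain the JSON verdict
```

The `{{report_text}}` placeholder is filled in before the request, matching the prompt's user message.
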
.github/workflows/run-regression-tests-command.yml (102 lines changed)
@@ -84,7 +84,24 @@ jobs:
    name: Regression Tests
    runs-on: linux-24.04-large # Custom runner, defined in GitHub org settings
    timeout-minutes: 360 # 6 hours
    permissions:
      contents: read
      pull-requests: write
      issues: write
    steps:
      - name: Append start with run link
        id: pr-comment-id
        if: github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
        uses: peter-evans/create-or-update-comment@v4
        with:
          token: ${{ github.token }}
          issue-number: ${{ github.event.inputs.pr }}
          comment-id: ${{ github.event.inputs.comment-id }}
          edit-mode: append
          body: |
            > Starting regression tests (filter: `${{ github.event.inputs.connector_filter || '--modified' }}`)
            > Workflow run: [${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})

      - name: Install Python
        id: install_python
        uses: actions/setup-python@7f4fc3e22c37d6ff65e88745f38bd3157c663f7c # v4.9.1
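
For reference, the comment threading used throughout this workflow (create one status comment, capture its id via the step's `comment-id` output, then append to it from later steps with `edit-mode: append`) maps onto two GitHub REST calls. A hypothetical sketch, not the action's actual code, using the real issue-comments endpoints:

```python
# Conceptual sketch of the flow peter-evans/create-or-update-comment provides:
# create one PR comment up front, remember its id, then append status lines
# to it from later steps. Helper names are illustrative.
import os
import requests

API = "https://api.github.com"
HEADERS = {
    "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
    "Accept": "application/vnd.github+json",
}

def create_comment(repo: str, pr_number: int, body: str) -> int:
    """POST a new issue comment and return its id (the 'pr-comment-id' output)."""
    r = requests.post(f"{API}/repos/{repo}/issues/{pr_number}/comments",
                      headers=HEADERS, json={"body": body})
    r.raise_for_status()
    return r.json()["id"]

def append_to_comment(repo: str, comment_id: int, extra: str) -> None:
    """Fetch the existing comment body and PATCH it with an appended line
    (what edit-mode: append does)."""
    r = requests.get(f"{API}/repos/{repo}/issues/comments/{comment_id}", headers=HEADERS)
    r.raise_for_status()
    body = r.json()["body"] + "\n" + extra
    requests.patch(f"{API}/repos/{repo}/issues/comments/{comment_id}",
                   headers=HEADERS, json={"body": body}).raise_for_status()
```
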
@@ -183,6 +200,7 @@ jobs:
      # forks if the user installs the app into their fork. Until we document this as a clear
      # path, we will have to keep using the PAT.
      - name: Run Regression Tests [WORKFLOW DISPATCH]
        id: run-regression-tests
        if: github.event_name == 'workflow_dispatch' # TODO: consider using the matrix strategy (https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs). See https://github.com/airbytehq/airbyte/pull/37659#discussion_r1583380234 for details.
        uses: ./.github/actions/run-airbyte-ci
        with:

@@ -199,3 +217,87 @@ jobs:
          s3_build_cache_access_key_id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }}
          s3_build_cache_secret_key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }}
          subcommand: connectors ${{ env.USE_LOCAL_CDK_FLAG }} ${{ inputs.connector_filter }} test --only-step connector_live_tests --connector_live_tests.test-suite=regression --connector_live_tests.connection-id=${{ github.event.inputs.connection_id }} --connector_live_tests.pr-url="https://github.com/airbytehq/airbyte/pull/${{ github.event.inputs.pr }}" ${{ env.READ_WITH_STATE_FLAG }} ${{ env.DISABLE_PROXY_FLAG }} ${{ env.STREAM_PARAMS }} ${{ env.CONNECTION_SUBSET }} ${{ env.CONTROL_VERSION }} --global-status-check-context="Regression Tests" --global-status-check-description='Running regression tests'

      - name: Upload regression test report
        if: always() && github.event_name == 'workflow_dispatch'
        uses: actions/upload-artifact@v4
        with:
          name: regression-test-report
          path: /tmp/regression_tests_artifacts/report.html
          if-no-files-found: ignore

      - name: Append regression outcome
        if: always() && github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
        uses: peter-evans/create-or-update-comment@v4
        with:
          token: ${{ github.token }}
          comment-id: ${{ steps.pr-comment-id.outputs.comment-id }}
          edit-mode: append
          body: |
            > Regression tests: ${{ steps.run-regression-tests.outcome == 'success' && '✅ PASSED' || steps.run-regression-tests.outcome == 'failure' && '❌ FAILED' || steps.run-regression-tests.outcome == 'cancelled' && '⚠️ CANCELLED' || steps.run-regression-tests.outcome == 'skipped' && '⏭️ SKIPPED' || '❓ UNKNOWN' }}
            > Report: ${{ hashFiles('/tmp/regression_tests_artifacts/report.html') != '' && 'artifact `regression-test-report` available in the run' || 'not generated' }}

      - name: Install live-tests dependencies for LLM evaluation
        if: always() && github.event_name == 'workflow_dispatch'
        working-directory: airbyte-ci/connectors/live-tests
        run: poetry install

      - name: Install and Start Ollama
        if: always() && github.event_name == 'workflow_dispatch'
        run: |
          curl -fsSL https://ollama.com/install.sh | sh
          ollama serve &
          sleep 5
          ollama pull llama3.2:3b
          echo "Ollama server started and model pulled"
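
The fixed `sleep 5` assumes the Ollama server binds its port within five seconds; on a slow runner, `ollama pull` could race it. A more robust readiness probe would poll the server until it answers, for example (illustrative sketch, not part of the commit; Ollama answers a plain GET on port 11434 once it is up):

```python
# Illustrative readiness probe: poll the local Ollama server instead of a
# fixed sleep. Endpoint and timeout values are assumptions, not from the commit.
import time
import urllib.request

def wait_for_ollama(url: str = "http://127.0.0.1:11434", timeout_s: float = 60.0) -> None:
    """Block until the Ollama HTTP server responds, or raise after timeout_s."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=2) as resp:
                if resp.status == 200:  # root endpoint replies "Ollama is running"
                    return
        except OSError:
            pass  # server not up yet; retry
        time.sleep(1)
    raise TimeoutError(f"Ollama did not become ready within {timeout_s}s")

wait_for_ollama()
```
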
      - name: Evaluate Regression Test Report with LLM
        if: always() && github.event_name == 'workflow_dispatch'
        id: llm-eval
        continue-on-error: true
        working-directory: airbyte-ci/connectors/live-tests
        env:
          OPENAI_API_KEY: ollama
          OPENAI_BASE_URL: http://127.0.0.1:11434/v1
          EVAL_MODEL: llama3.2:3b
        run: |
          set -u
          echo "ran=false" >> "$GITHUB_OUTPUT"
          echo "result=error" >> "$GITHUB_OUTPUT"

          # Find the most recent report.html file in /tmp/regression_tests_artifacts/
          REPORT_PATH=$(find /tmp/regression_tests_artifacts -name "report.html" -type f -printf '%T@ %p\n' | sort -n | tail -1 | cut -f2- -d" ")

          if [ -z "$REPORT_PATH" ]; then
            echo "Error: No report.html found in /tmp/regression_tests_artifacts/" >&2
            echo "## ⚠️ LLM Evaluation Skipped" >> "$GITHUB_STEP_SUMMARY"
            echo "No regression test report found. The tests may have failed to generate a report." >> "$GITHUB_STEP_SUMMARY"
            exit 1
          fi

          echo "Found report at: $REPORT_PATH"
          echo "Running LLM evaluation..."

          # Run the evaluation script
          OUT_JSON="$RUNNER_TEMP/llm_eval.json"
          poetry run python src/live_tests/regression_tests/llm_evaluation/evaluate_report.py \
            --report-path "$REPORT_PATH" \
            --output-json "$OUT_JSON"

          # If we got here, the script exited 0 and produced a judgment
          PASS=$(jq -r '.evaluation.pass' "$OUT_JSON")
          if [ "$PASS" = "true" ]; then RES="pass"; else RES="fail"; fi
          echo "ran=true" >> "$GITHUB_OUTPUT"
          echo "result=$RES" >> "$GITHUB_OUTPUT"
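
The jq query `.evaluation.pass` implies that llm_eval.json wraps the verdict in an `evaluation` object whose fields follow the JSON format requested in the prompt. A small validation sketch of that assumed shape (the authoritative schema lives in evaluate_report.py, which is outside this diff):

```python
# Assumed output shape, inferred from the jq query `.evaluation.pass` and the
# JSON format the prompt requests; the real schema is defined elsewhere.
import json

REQUIRED_KEYS = {"pass", "summary", "reasoning", "severity", "recommendations"}
VALID_SEVERITIES = {"critical", "major", "minor", "none"}

def read_verdict(path: str) -> bool:
    """Return the pass/fail verdict from llm_eval.json, validating its shape."""
    with open(path) as f:
        evaluation = json.load(f)["evaluation"]
    missing = REQUIRED_KEYS - evaluation.keys()
    if missing:
        raise ValueError(f"verdict is missing keys: {sorted(missing)}")
    if evaluation["severity"] not in VALID_SEVERITIES:
        raise ValueError(f"unexpected severity: {evaluation['severity']!r}")
    return bool(evaluation["pass"])

print(read_verdict("llm_eval.json"))  # True maps to the step output result=pass
```
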
      - name: Append LLM outcome
        if: always() && github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
        env:
          EVAL_MODEL: llama3.2:3b
        uses: peter-evans/create-or-update-comment@v4
        with:
          token: ${{ github.token }}
          comment-id: ${{ steps.pr-comment-id.outputs.comment-id }}
          edit-mode: append
          body: |
            > LLM Evaluation: ${{ steps.llm-eval.outputs.ran == 'true' && (steps.llm-eval.outputs.result == 'pass' && '✅ PASS' || steps.llm-eval.outputs.result == 'fail' && '❌ FAIL' || '⚠️ ERROR') || '⚠️ Did not run' }}${{ steps.llm-eval.outputs.ran == 'true' && format(' (model: {0})', env.EVAL_MODEL) || '' }}