
feat: add OpenAI LLM evaluation step for connector regression tests (#68673)

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Aaron ("AJ") Steers
2025-11-20 17:27:25 -08:00
committed by GitHub
parent 13c3e1e34f
commit cf1379d105
7 changed files with 663 additions and 1 deletion


@@ -0,0 +1,34 @@
name: Regression Report Evaluation
description: Evaluate Airbyte connector regression test reports and return a JSON verdict with reasoning
model: llama3.2:3b
modelParameters:
  temperature: 0.3
messages:
  - role: system
    content: |
      You are an expert at evaluating connector regression test results.
      Your task is to analyze the test report and determine if the regression tests should PASS or FAIL.

      Consider the following criteria:
      1. All test cases should pass (no failed tests)
      2. Record count differences between control and target versions should be minimal or explainable
      3. Message count differences should not indicate data loss or corruption
      4. Stream coverage should be reasonable
      5. Any warnings or errors in test outputs should be evaluated for severity

      Provide your evaluation in the following JSON format:
      {
        "pass": true/false,
        "summary": "A concise 2-3 sentence summary of the evaluation",
        "reasoning": "Detailed reasoning for your pass/fail decision, including specific issues found",
        "severity": "critical/major/minor/none",
        "recommendations": "Any recommendations for addressing issues"
      }

      Be strict but fair in your evaluation. Minor differences are acceptable, but data loss,
      corruption, or test failures should result in a FAIL.
  - role: user
    content: |
      Report:

      {{report_text}}
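
Not part of the diff, but for context: a minimal sketch of how a prompt like the one above could be exercised against the same local Ollama endpoint the workflow configures. It assumes the openai Python package is installed, ollama serve is running with llama3.2:3b pulled, and a hypothetical local report.html; SYSTEM_PROMPT stands in for the full system message shown above.

# Illustrative only (not part of this commit): call the local Ollama server through its
# OpenAI-compatible API with the prompt above and parse the JSON verdict it returns.
import json

from openai import OpenAI  # assumes the openai package is available

# The workflow starts `ollama serve` locally and pulls llama3.2:3b before evaluating.
client = OpenAI(base_url="http://127.0.0.1:11434/v1", api_key="ollama")

SYSTEM_PROMPT = "You are an expert at evaluating connector regression test results. ..."  # abbreviated
with open("report.html") as f:  # hypothetical local copy of the regression report
    report_text = f.read()

response = client.chat.completions.create(
    model="llama3.2:3b",
    temperature=0.3,
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": f"Report:\n\n{report_text}"},
    ],
)

# The prompt asks for a JSON object with a boolean "pass" field; a small local model may
# occasionally wrap it in prose, so real code should handle json.JSONDecodeError.
verdict = json.loads(response.choices[0].message.content)
print(verdict["pass"], verdict["summary"])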


@@ -84,7 +84,24 @@ jobs:
    name: Regression Tests
    runs-on: linux-24.04-large # Custom runner, defined in GitHub org settings
    timeout-minutes: 360 # 6 hours
    permissions:
      contents: read
      pull-requests: write
      issues: write
    steps:
      - name: Append start with run link
        id: pr-comment-id
        if: github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
        uses: peter-evans/create-or-update-comment@v4
        with:
          token: ${{ github.token }}
          issue-number: ${{ github.event.inputs.pr }}
          comment-id: ${{ github.event.inputs.comment-id }}
          edit-mode: append
          body: |
            > Starting regression tests (filter: `${{ github.event.inputs.connector_filter || '--modified' }}`)
            > Workflow run: [${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
      - name: Install Python
        id: install_python
        uses: actions/setup-python@7f4fc3e22c37d6ff65e88745f38bd3157c663f7c # v4.9.1
@@ -183,6 +200,7 @@ jobs:
      # forks if the user installs the app into their fork. Until we document this as a clear
      # path, we will have to keep using the PAT.
      - name: Run Regression Tests [WORKFLOW DISPATCH]
        id: run-regression-tests
        if: github.event_name == 'workflow_dispatch' # TODO: consider using the matrix strategy (https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs). See https://github.com/airbytehq/airbyte/pull/37659#discussion_r1583380234 for details.
        uses: ./.github/actions/run-airbyte-ci
        with:
@@ -199,3 +217,87 @@ jobs:
          s3_build_cache_access_key_id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }}
          s3_build_cache_secret_key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }}
          subcommand: connectors ${{ env.USE_LOCAL_CDK_FLAG }} ${{ inputs.connector_filter }} test --only-step connector_live_tests --connector_live_tests.test-suite=regression --connector_live_tests.connection-id=${{ github.event.inputs.connection_id }} --connector_live_tests.pr-url="https://github.com/airbytehq/airbyte/pull/${{ github.event.inputs.pr }}" ${{ env.READ_WITH_STATE_FLAG }} ${{ env.DISABLE_PROXY_FLAG }} ${{ env.STREAM_PARAMS }} ${{ env.CONNECTION_SUBSET }} ${{ env.CONTROL_VERSION }} --global-status-check-context="Regression Tests" --global-status-check-description='Running regression tests'
      - name: Upload regression test report
        if: always() && github.event_name == 'workflow_dispatch'
        uses: actions/upload-artifact@v4
        with:
          name: regression-test-report
          path: /tmp/regression_tests_artifacts/report.html
          if-no-files-found: ignore
      - name: Append regression outcome
        if: always() && github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
        uses: peter-evans/create-or-update-comment@v4
        with:
          token: ${{ github.token }}
          comment-id: ${{ steps.pr-comment-id.outputs.comment-id }}
          edit-mode: append
          body: |
            > Regression tests: ${{ steps.run-regression-tests.outcome == 'success' && '✅ PASSED' || steps.run-regression-tests.outcome == 'failure' && '❌ FAILED' || steps.run-regression-tests.outcome == 'cancelled' && '⚠️ CANCELLED' || steps.run-regression-tests.outcome == 'skipped' && '⏭️ SKIPPED' || '❓ UNKNOWN' }}
            > Report: ${{ hashFiles('/tmp/regression_tests_artifacts/report.html') != '' && 'artifact `regression-test-report` available in the run' || 'not generated' }}
      - name: Install live-tests dependencies for LLM evaluation
        if: always() && github.event_name == 'workflow_dispatch'
        working-directory: airbyte-ci/connectors/live-tests
        run: poetry install
      - name: Install and Start Ollama
        if: always() && github.event_name == 'workflow_dispatch'
        run: |
          curl -fsSL https://ollama.com/install.sh | sh
          ollama serve &
          sleep 5
          ollama pull llama3.2:3b
          echo "Ollama server started and model pulled"
      - name: Evaluate Regression Test Report with LLM
        if: always() && github.event_name == 'workflow_dispatch'
        id: llm-eval
        continue-on-error: true
        working-directory: airbyte-ci/connectors/live-tests
        env:
          OPENAI_API_KEY: ollama
          OPENAI_BASE_URL: http://127.0.0.1:11434/v1
          EVAL_MODEL: llama3.2:3b
        run: |
          set -u
          echo "ran=false" >> "$GITHUB_OUTPUT"
          echo "result=error" >> "$GITHUB_OUTPUT"
          # Find the most recent report.html file in /tmp/regression_tests_artifacts/
          REPORT_PATH=$(find /tmp/regression_tests_artifacts -name "report.html" -type f -printf '%T@ %p\n' | sort -n | tail -1 | cut -f2- -d" ")
          if [ -z "$REPORT_PATH" ]; then
            echo "Error: No report.html found in /tmp/regression_tests_artifacts/" >&2
            echo "## ⚠️ LLM Evaluation Skipped" >> "$GITHUB_STEP_SUMMARY"
            echo "No regression test report found. The tests may have failed to generate a report." >> "$GITHUB_STEP_SUMMARY"
            exit 1
          fi
          echo "Found report at: $REPORT_PATH"
          echo "Running LLM evaluation..."
          # Run the evaluation script
          OUT_JSON="$RUNNER_TEMP/llm_eval.json"
          poetry run python src/live_tests/regression_tests/llm_evaluation/evaluate_report.py \
            --report-path "$REPORT_PATH" \
            --output-json "$OUT_JSON"
          # If we got here, the script exited 0 and produced a judgment
          PASS=$(jq -r '.evaluation.pass' "$OUT_JSON")
          if [ "$PASS" = "true" ]; then RES="pass"; else RES="fail"; fi
          echo "ran=true" >> "$GITHUB_OUTPUT"
          echo "result=$RES" >> "$GITHUB_OUTPUT"
      - name: Append LLM outcome
        if: always() && github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
        env:
          EVAL_MODEL: llama3.2:3b
        uses: peter-evans/create-or-update-comment@v4
        with:
          token: ${{ github.token }}
          comment-id: ${{ steps.pr-comment-id.outputs.comment-id }}
          edit-mode: append
          body: |
            > LLM Evaluation: ${{ steps.llm-eval.outputs.ran == 'true' && (steps.llm-eval.outputs.result == 'pass' && '✅ PASS' || steps.llm-eval.outputs.result == 'fail' && '❌ FAIL' || '⚠️ ERROR') || '⚠️ Did not run' }}${{ steps.llm-eval.outputs.ran == 'true' && format(' (model: {0})', env.EVAL_MODEL) || '' }}
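
The workflow step above treats src/live_tests/regression_tests/llm_evaluation/evaluate_report.py as a black box: it passes --report-path and --output-json, and the jq call then reads .evaluation.pass from the output file. The actual script is part of this commit but not shown in this view; the sketch below is a hypothetical reconstruction of that CLI contract under the same OPENAI_API_KEY / OPENAI_BASE_URL / EVAL_MODEL environment the step sets, not the real implementation.

# Hypothetical skeleton (not the shipped evaluate_report.py): illustrates the
# --report-path / --output-json interface and the .evaluation.pass field the jq call reads.
import argparse
import json
import os
import pathlib

from openai import OpenAI  # assumes the openai package is a live-tests dependency


def main() -> None:
    parser = argparse.ArgumentParser(description="Evaluate a regression test report with an LLM.")
    parser.add_argument("--report-path", required=True, help="Path to report.html")
    parser.add_argument("--output-json", required=True, help="Where to write the verdict JSON")
    args = parser.parse_args()

    report_text = pathlib.Path(args.report_path).read_text()

    # The workflow points OPENAI_BASE_URL at the local Ollama server and sets EVAL_MODEL.
    client = OpenAI(
        base_url=os.environ.get("OPENAI_BASE_URL", "http://127.0.0.1:11434/v1"),
        api_key=os.environ.get("OPENAI_API_KEY", "ollama"),
    )
    response = client.chat.completions.create(
        model=os.environ.get("EVAL_MODEL", "llama3.2:3b"),
        temperature=0.3,
        messages=[
            {"role": "system", "content": "You are an expert at evaluating connector regression test results. ..."},  # abbreviated
            {"role": "user", "content": f"Report:\n\n{report_text}"},
        ],
    )
    evaluation = json.loads(response.choices[0].message.content)

    # The workflow's jq expression expects .evaluation.pass, so nest the verdict under "evaluation".
    pathlib.Path(args.output_json).write_text(json.dumps({"evaluation": evaluation}, indent=2))


if __name__ == "__main__":
    main()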