
feat: add OpenAI LLM evaluation step for connector regression tests (#68673)

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Author: Aaron ("AJ") Steers
Date: 2025-11-20 17:27:25 -08:00 (committed by GitHub)
parent 13c3e1e34f
commit cf1379d105
7 changed files with 663 additions and 1 deletion

File: .github/prompts/regression-evaluation.prompt.yaml (new)

@@ -0,0 +1,34 @@
name: Regression Report Evaluation
description: Evaluate Airbyte connector regression test reports and return a JSON verdict with reasoning
model: llama3.2:3b
modelParameters:
  temperature: 0.3
messages:
  - role: system
    content: |
      You are an expert at evaluating connector regression test results.
      Your task is to analyze the test report and determine if the regression tests should PASS or FAIL.

      Consider the following criteria:
      1. All test cases should pass (no failed tests)
      2. Record count differences between control and target versions should be minimal or explainable
      3. Message count differences should not indicate data loss or corruption
      4. Stream coverage should be reasonable
      5. Any warnings or errors in test outputs should be evaluated for severity

      Provide your evaluation in the following JSON format:
      {
        "pass": true/false,
        "summary": "A concise 2-3 sentence summary of the evaluation",
        "reasoning": "Detailed reasoning for your pass/fail decision, including specific issues found",
        "severity": "critical/major/minor/none",
        "recommendations": "Any recommendations for addressing issues"
      }

      Be strict but fair in your evaluation. Minor differences are acceptable, but data loss,
      corruption, or test failures should result in a FAIL.
  - role: user
    content: |
      Report:

      {{report_text}}

File: .github/workflows/regression_tests.yml

@@ -84,7 +84,24 @@ jobs:
name: Regression Tests
runs-on: linux-24.04-large # Custom runner, defined in GitHub org settings
timeout-minutes: 360 # 6 hours
permissions:
contents: read
pull-requests: write
issues: write
steps:
- name: Append start with run link
id: pr-comment-id
if: github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
uses: peter-evans/create-or-update-comment@v4
with:
token: ${{ github.token }}
issue-number: ${{ github.event.inputs.pr }}
comment-id: ${{ github.event.inputs.comment-id }}
edit-mode: append
body: |
> Starting regression tests (filter: `${{ github.event.inputs.connector_filter || '--modified' }}`)
> Workflow run: [${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
- name: Install Python
id: install_python
uses: actions/setup-python@7f4fc3e22c37d6ff65e88745f38bd3157c663f7c # v4.9.1
@@ -183,6 +200,7 @@ jobs:
# forks if the user installs the app into their fork. Until we document this as a clear
# path, we will have to keep using the PAT.
- name: Run Regression Tests [WORKFLOW DISPATCH]
id: run-regression-tests
if: github.event_name == 'workflow_dispatch' # TODO: consider using the matrix strategy (https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs). See https://github.com/airbytehq/airbyte/pull/37659#discussion_r1583380234 for details.
uses: ./.github/actions/run-airbyte-ci
with:
@@ -199,3 +217,87 @@ jobs:
s3_build_cache_access_key_id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }}
s3_build_cache_secret_key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }}
subcommand: connectors ${{ env.USE_LOCAL_CDK_FLAG }} ${{ inputs.connector_filter }} test --only-step connector_live_tests --connector_live_tests.test-suite=regression --connector_live_tests.connection-id=${{ github.event.inputs.connection_id }} --connector_live_tests.pr-url="https://github.com/airbytehq/airbyte/pull/${{ github.event.inputs.pr }}" ${{ env.READ_WITH_STATE_FLAG }} ${{ env.DISABLE_PROXY_FLAG }} ${{ env.STREAM_PARAMS }} ${{ env.CONNECTION_SUBSET }} ${{ env.CONTROL_VERSION }} --global-status-check-context="Regression Tests" --global-status-check-description='Running regression tests'
- name: Upload regression test report
id: upload-report
if: always() && github.event_name == 'workflow_dispatch'
uses: actions/upload-artifact@v4
with:
name: regression-test-report
path: /tmp/regression_tests_artifacts/report.html
if-no-files-found: ignore
- name: Append regression outcome
if: always() && github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
uses: peter-evans/create-or-update-comment@v4
with:
token: ${{ github.token }}
comment-id: ${{ steps.pr-comment-id.outputs.comment-id }}
edit-mode: append
body: |
> Regression tests: ${{ steps.run-regression-tests.outcome == 'success' && '✅ PASSED' || steps.run-regression-tests.outcome == 'failure' && '❌ FAILED' || steps.run-regression-tests.outcome == 'cancelled' && '⚠️ CANCELLED' || steps.run-regression-tests.outcome == 'skipped' && '⏭️ SKIPPED' || '❓ UNKNOWN' }}
> Report: ${{ steps.upload-report.outputs.artifact-id != '' && 'artifact `regression-test-report` available in the run' || 'not generated' }}
- name: Install live-tests dependencies for LLM evaluation
if: always() && github.event_name == 'workflow_dispatch'
working-directory: airbyte-ci/connectors/live-tests
run: poetry install
- name: Install and Start Ollama
if: always() && github.event_name == 'workflow_dispatch'
run: |
curl -fsSL https://ollama.com/install.sh | sh
ollama serve &
sleep 5
ollama pull llama3.2:3b
echo "Ollama server started and model pulled"
- name: Evaluate Regression Test Report with LLM
if: always() && github.event_name == 'workflow_dispatch'
id: llm-eval
continue-on-error: true
working-directory: airbyte-ci/connectors/live-tests
env:
OPENAI_API_KEY: ollama
OPENAI_BASE_URL: http://127.0.0.1:11434/v1
EVAL_MODEL: llama3.2:3b
run: |
set -u
echo "ran=false" >> "$GITHUB_OUTPUT"
echo "result=error" >> "$GITHUB_OUTPUT"
# Find the most recent report.html file in /tmp/regression_tests_artifacts/
REPORT_PATH=$(find /tmp/regression_tests_artifacts -name "report.html" -type f -printf '%T@ %p\n' | sort -n | tail -1 | cut -f2- -d" ")
if [ -z "$REPORT_PATH" ]; then
echo "Error: No report.html found in /tmp/regression_tests_artifacts/" >&2
echo "## ⚠️ LLM Evaluation Skipped" >> "$GITHUB_STEP_SUMMARY"
echo "No regression test report found. The tests may have failed to generate a report." >> "$GITHUB_STEP_SUMMARY"
exit 1
fi
echo "Found report at: $REPORT_PATH"
echo "Running LLM evaluation..."
# Run the evaluation script
OUT_JSON="$RUNNER_TEMP/llm_eval.json"
poetry run python src/live_tests/regression_tests/llm_evaluation/evaluate_report.py \
--report-path "$REPORT_PATH" \
--output-json "$OUT_JSON"
# If we got here, the script exited 0 and produced a judgment
PASS=$(jq -r '.evaluation.pass' "$OUT_JSON")
if [ "$PASS" = "true" ]; then RES="pass"; else RES="fail"; fi
echo "ran=true" >> "$GITHUB_OUTPUT"
echo "result=$RES" >> "$GITHUB_OUTPUT"
- name: Append LLM outcome
if: always() && github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
env:
EVAL_MODEL: llama3.2:3b
uses: peter-evans/create-or-update-comment@v4
with:
token: ${{ github.token }}
comment-id: ${{ steps.pr-comment-id.outputs.comment-id }}
edit-mode: append
body: |
> LLM Evaluation: ${{ steps.llm-eval.outputs.ran == 'true' && (steps.llm-eval.outputs.result == 'pass' && '✅ PASS' || steps.llm-eval.outputs.result == 'fail' && '❌ FAIL' || '⚠️ ERROR') || '⚠️ Did not run' }}${{ steps.llm-eval.outputs.ran == 'true' && format(' (model: {0})', env.EVAL_MODEL) || '' }}

File: airbyte-ci/connectors/live-tests/poetry.lock

@@ -355,6 +355,29 @@ test = ["coverage (>=5.5)", "equinox", "jax[cpu]", "jaxtyping", "mypy (>=0.800)"
test-tox = ["equinox", "jax[cpu]", "jaxtyping", "mypy (>=0.800)", "numba", "numpy", "pandera", "pygments", "pyright (>=1.1.370)", "pytest (>=4.0.0)", "sphinx", "typing-extensions (>=3.10.0.0)"]
test-tox-coverage = ["coverage (>=5.5)"]
[[package]]
name = "beautifulsoup4"
version = "4.14.2"
description = "Screen-scraping library"
optional = false
python-versions = ">=3.7.0"
groups = ["main"]
files = [
{file = "beautifulsoup4-4.14.2-py3-none-any.whl", hash = "sha256:5ef6fa3a8cbece8488d66985560f97ed091e22bbc4e9c2338508a9d5de6d4515"},
{file = "beautifulsoup4-4.14.2.tar.gz", hash = "sha256:2a98ab9f944a11acee9cc848508ec28d9228abfd522ef0fad6a02a72e0ded69e"},
]
[package.dependencies]
soupsieve = ">1.2"
typing-extensions = ">=4.0.0"
[package.extras]
cchardet = ["cchardet"]
chardet = ["chardet"]
charset-normalizer = ["charset-normalizer"]
html5lib = ["html5lib"]
lxml = ["lxml"]
[[package]]
name = "blessed"
version = "1.20.0"
@@ -979,6 +1002,18 @@ wrapt = ">=1.10,<2"
[package.extras]
dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"]
[[package]]
name = "distro"
version = "1.9.0"
description = "Distro - an OS platform information API"
optional = false
python-versions = ">=3.6"
groups = ["main"]
files = [
{file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"},
{file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"},
]
[[package]]
name = "docker"
version = "6.1.3"
@@ -1943,6 +1978,118 @@ files = [
[package.dependencies]
ansicon = {version = "*", markers = "platform_system == \"Windows\""}
[[package]]
name = "jiter"
version = "0.11.1"
description = "Fast iterable JSON parser."
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "jiter-0.11.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:ed58841a491bbbf3f7c55a6b68fff568439ab73b2cce27ace0e169057b5851df"},
{file = "jiter-0.11.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:499beb9b2d7e51d61095a8de39ebcab1d1778f2a74085f8305a969f6cee9f3e4"},
{file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b87b2821795e28cc990939b68ce7a038edea680a24910bd68a79d54ff3f03c02"},
{file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:83f6fa494d8bba14ab100417c80e70d32d737e805cb85be2052d771c76fcd1f8"},
{file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5fbc6aea1daa2ec6f5ed465f0c5e7b0607175062ceebbea5ca70dd5ddab58083"},
{file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:302288e2edc43174bb2db838e94688d724f9aad26c5fb9a74f7a5fb427452a6a"},
{file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85db563fe3b367bb568af5d29dea4d4066d923b8e01f3417d25ebecd958de815"},
{file = "jiter-0.11.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f1c1ba2b6b22f775444ef53bc2d5778396d3520abc7b2e1da8eb0c27cb3ffb10"},
{file = "jiter-0.11.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:523be464b14f8fd0cc78da6964b87b5515a056427a2579f9085ce30197a1b54a"},
{file = "jiter-0.11.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:25b99b3f04cd2a38fefb22e822e35eb203a2cd37d680dbbc0c0ba966918af336"},
{file = "jiter-0.11.1-cp310-cp310-win32.whl", hash = "sha256:47a79e90545a596bb9104109777894033347b11180d4751a216afef14072dbe7"},
{file = "jiter-0.11.1-cp310-cp310-win_amd64.whl", hash = "sha256:cace75621ae9bd66878bf69fbd4dfc1a28ef8661e0c2d0eb72d3d6f1268eddf5"},
{file = "jiter-0.11.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:9b0088ff3c374ce8ce0168523ec8e97122ebb788f950cf7bb8e39c7dc6a876a2"},
{file = "jiter-0.11.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:74433962dd3c3090655e02e461267095d6c84f0741c7827de11022ef8d7ff661"},
{file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d98030e345e6546df2cc2c08309c502466c66c4747b043f1a0d415fada862b8"},
{file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1d6db0b2e788db46bec2cf729a88b6dd36959af2abd9fa2312dfba5acdd96dcb"},
{file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:55678fbbda261eafe7289165dd2ddd0e922df5f9a1ae46d7c79a5a15242bd7d1"},
{file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a6b74fae8e40497653b52ce6ca0f1b13457af769af6fb9c1113efc8b5b4d9be"},
{file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a55a453f8b035eb4f7852a79a065d616b7971a17f5e37a9296b4b38d3b619e4"},
{file = "jiter-0.11.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2638148099022e6bdb3f42904289cd2e403609356fb06eb36ddec2d50958bc29"},
{file = "jiter-0.11.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:252490567a5d990986f83b95a5f1ca1bf205ebd27b3e9e93bb7c2592380e29b9"},
{file = "jiter-0.11.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d431d52b0ca2436eea6195f0f48528202100c7deda354cb7aac0a302167594d5"},
{file = "jiter-0.11.1-cp311-cp311-win32.whl", hash = "sha256:db6f41e40f8bae20c86cb574b48c4fd9f28ee1c71cb044e9ec12e78ab757ba3a"},
{file = "jiter-0.11.1-cp311-cp311-win_amd64.whl", hash = "sha256:0cc407b8e6cdff01b06bb80f61225c8b090c3df108ebade5e0c3c10993735b19"},
{file = "jiter-0.11.1-cp311-cp311-win_arm64.whl", hash = "sha256:fe04ea475392a91896d1936367854d346724a1045a247e5d1c196410473b8869"},
{file = "jiter-0.11.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:c92148eec91052538ce6823dfca9525f5cfc8b622d7f07e9891a280f61b8c96c"},
{file = "jiter-0.11.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ecd4da91b5415f183a6be8f7158d127bdd9e6a3174138293c0d48d6ea2f2009d"},
{file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7e3ac25c00b9275684d47aa42febaa90a9958e19fd1726c4ecf755fbe5e553b"},
{file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:57d7305c0a841858f866cd459cd9303f73883fb5e097257f3d4a3920722c69d4"},
{file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e86fa10e117dce22c547f31dd6d2a9a222707d54853d8de4e9a2279d2c97f239"},
{file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ae5ef1d48aec7e01ee8420155d901bb1d192998fa811a65ebb82c043ee186711"},
{file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb68e7bf65c990531ad8715e57d50195daf7c8e6f1509e617b4e692af1108939"},
{file = "jiter-0.11.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43b30c8154ded5845fa454ef954ee67bfccce629b2dea7d01f795b42bc2bda54"},
{file = "jiter-0.11.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:586cafbd9dd1f3ce6a22b4a085eaa6be578e47ba9b18e198d4333e598a91db2d"},
{file = "jiter-0.11.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:677cc2517d437a83bb30019fd4cf7cad74b465914c56ecac3440d597ac135250"},
{file = "jiter-0.11.1-cp312-cp312-win32.whl", hash = "sha256:fa992af648fcee2b850a3286a35f62bbbaeddbb6dbda19a00d8fbc846a947b6e"},
{file = "jiter-0.11.1-cp312-cp312-win_amd64.whl", hash = "sha256:88b5cae9fa51efeb3d4bd4e52bfd4c85ccc9cac44282e2a9640893a042ba4d87"},
{file = "jiter-0.11.1-cp312-cp312-win_arm64.whl", hash = "sha256:9a6cae1ab335551917f882f2c3c1efe7617b71b4c02381e4382a8fc80a02588c"},
{file = "jiter-0.11.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:71b6a920a5550f057d49d0e8bcc60945a8da998019e83f01adf110e226267663"},
{file = "jiter-0.11.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0b3de72e925388453a5171be83379549300db01284f04d2a6f244d1d8de36f94"},
{file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc19dd65a2bd3d9c044c5b4ebf657ca1e6003a97c0fc10f555aa4f7fb9821c00"},
{file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d58faaa936743cd1464540562f60b7ce4fd927e695e8bc31b3da5b914baa9abd"},
{file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:902640c3103625317291cb73773413b4d71847cdf9383ba65528745ff89f1d14"},
{file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:30405f726e4c2ed487b176c09f8b877a957f535d60c1bf194abb8dadedb5836f"},
{file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3217f61728b0baadd2551844870f65219ac4a1285d5e1a4abddff3d51fdabe96"},
{file = "jiter-0.11.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b1364cc90c03a8196f35f396f84029f12abe925415049204446db86598c8b72c"},
{file = "jiter-0.11.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:53a54bf8e873820ab186b2dca9f6c3303f00d65ae5e7b7d6bda1b95aa472d646"},
{file = "jiter-0.11.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7e29aca023627b0e0c2392d4248f6414d566ff3974fa08ff2ac8dbb96dfee92a"},
{file = "jiter-0.11.1-cp313-cp313-win32.whl", hash = "sha256:f153e31d8bca11363751e875c0a70b3d25160ecbaee7b51e457f14498fb39d8b"},
{file = "jiter-0.11.1-cp313-cp313-win_amd64.whl", hash = "sha256:f773f84080b667c69c4ea0403fc67bb08b07e2b7ce1ef335dea5868451e60fed"},
{file = "jiter-0.11.1-cp313-cp313-win_arm64.whl", hash = "sha256:635ecd45c04e4c340d2187bcb1cea204c7cc9d32c1364d251564bf42e0e39c2d"},
{file = "jiter-0.11.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d892b184da4d94d94ddb4031296931c74ec8b325513a541ebfd6dfb9ae89904b"},
{file = "jiter-0.11.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa22c223a3041dacb2fcd37c70dfd648b44662b4a48e242592f95bda5ab09d58"},
{file = "jiter-0.11.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:330e8e6a11ad4980cd66a0f4a3e0e2e0f646c911ce047014f984841924729789"},
{file = "jiter-0.11.1-cp313-cp313t-win_amd64.whl", hash = "sha256:09e2e386ebf298547ca3a3704b729471f7ec666c2906c5c26c1a915ea24741ec"},
{file = "jiter-0.11.1-cp313-cp313t-win_arm64.whl", hash = "sha256:fe4a431c291157e11cee7c34627990ea75e8d153894365a3bc84b7a959d23ca8"},
{file = "jiter-0.11.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:0fa1f70da7a8a9713ff8e5f75ec3f90c0c870be6d526aa95e7c906f6a1c8c676"},
{file = "jiter-0.11.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:569ee559e5046a42feb6828c55307cf20fe43308e3ae0d8e9e4f8d8634d99944"},
{file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f69955fa1d92e81987f092b233f0be49d4c937da107b7f7dcf56306f1d3fcce9"},
{file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:090f4c9d4a825e0fcbd0a2647c9a88a0f366b75654d982d95a9590745ff0c48d"},
{file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bbf3d8cedf9e9d825233e0dcac28ff15c47b7c5512fdfe2e25fd5bbb6e6b0cee"},
{file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2aa9b1958f9c30d3d1a558b75f0626733c60eb9b7774a86b34d88060be1e67fe"},
{file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e42d1ca16590b768c5e7d723055acd2633908baacb3628dd430842e2e035aa90"},
{file = "jiter-0.11.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5db4c2486a023820b701a17aec9c5a6173c5ba4393f26662f032f2de9c848b0f"},
{file = "jiter-0.11.1-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:4573b78777ccfac954859a6eff45cbd9d281d80c8af049d0f1a3d9fc323d5c3a"},
{file = "jiter-0.11.1-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:7593ac6f40831d7961cb67633c39b9fef6689a211d7919e958f45710504f52d3"},
{file = "jiter-0.11.1-cp314-cp314-win32.whl", hash = "sha256:87202ec6ff9626ff5f9351507def98fcf0df60e9a146308e8ab221432228f4ea"},
{file = "jiter-0.11.1-cp314-cp314-win_amd64.whl", hash = "sha256:a5dd268f6531a182c89d0dd9a3f8848e86e92dfff4201b77a18e6b98aa59798c"},
{file = "jiter-0.11.1-cp314-cp314-win_arm64.whl", hash = "sha256:5d761f863f912a44748a21b5c4979c04252588ded8d1d2760976d2e42cd8d991"},
{file = "jiter-0.11.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2cc5a3965285ddc33e0cab933e96b640bc9ba5940cea27ebbbf6695e72d6511c"},
{file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b572b3636a784c2768b2342f36a23078c8d3aa6d8a30745398b1bab58a6f1a8"},
{file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ad93e3d67a981f96596d65d2298fe8d1aa649deb5374a2fb6a434410ee11915e"},
{file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a83097ce379e202dcc3fe3fc71a16d523d1ee9192c8e4e854158f96b3efe3f2f"},
{file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7042c51e7fbeca65631eb0c332f90c0c082eab04334e7ccc28a8588e8e2804d9"},
{file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a68d679c0e47649a61df591660507608adc2652442de7ec8276538ac46abe08"},
{file = "jiter-0.11.1-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a1b0da75dbf4b6ec0b3c9e604d1ee8beaf15bc046fff7180f7d89e3cdbd3bb51"},
{file = "jiter-0.11.1-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:69dd514bf0fa31c62147d6002e5ca2b3e7ef5894f5ac6f0a19752385f4e89437"},
{file = "jiter-0.11.1-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:bb31ac0b339efa24c0ca606febd8b77ef11c58d09af1b5f2be4c99e907b11111"},
{file = "jiter-0.11.1-cp314-cp314t-win32.whl", hash = "sha256:b2ce0d6156a1d3ad41da3eec63b17e03e296b78b0e0da660876fccfada86d2f7"},
{file = "jiter-0.11.1-cp314-cp314t-win_amd64.whl", hash = "sha256:f4db07d127b54c4a2d43b4cf05ff0193e4f73e0dd90c74037e16df0b29f666e1"},
{file = "jiter-0.11.1-cp314-cp314t-win_arm64.whl", hash = "sha256:28e4fdf2d7ebfc935523e50d1efa3970043cfaa161674fe66f9642409d001dfe"},
{file = "jiter-0.11.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:baa99c8db49467527658bb479857344daf0a14dff909b7f6714579ac439d1253"},
{file = "jiter-0.11.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:860fe55fa3b01ad0edf2adde1098247ff5c303d0121f9ce028c03d4f88c69502"},
{file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:173dd349d99b6feaf5a25a6fbcaf3489a6f947708d808240587a23df711c67db"},
{file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:14ac1dca837514cc946a6ac2c4995d9695303ecc754af70a3163d057d1a444ab"},
{file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69af47de5f93a231d5b85f7372d3284a5be8edb4cc758f006ec5a1406965ac5e"},
{file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:685f8b3abd3bbd3e06e4dfe2429ff87fd5d7a782701151af99b1fcbd80e31b2b"},
{file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d04afa2d4e5526e54ae8a58feea953b1844bf6e3526bc589f9de68e86d0ea01"},
{file = "jiter-0.11.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1e92b927259035b50d8e11a8fdfe0ebd014d883e4552d37881643fa289a4bcf1"},
{file = "jiter-0.11.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:e7bd8be4fad8d4c5558b7801770cd2da6c072919c6f247cc5336edb143f25304"},
{file = "jiter-0.11.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:121381a77a3c85987f3eba0d30ceaca9116f7463bedeec2fa79b2e7286b89b60"},
{file = "jiter-0.11.1-cp39-cp39-win32.whl", hash = "sha256:160225407f6dfabdf9be1b44e22f06bc293a78a28ffa4347054698bd712dad06"},
{file = "jiter-0.11.1-cp39-cp39-win_amd64.whl", hash = "sha256:028e0d59bcdfa1079f8df886cdaefc6f515c27a5288dec956999260c7e4a7cfd"},
{file = "jiter-0.11.1-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:e642b5270e61dd02265866398707f90e365b5db2eb65a4f30c789d826682e1f6"},
{file = "jiter-0.11.1-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:464ba6d000585e4e2fd1e891f31f1231f497273414f5019e27c00a4b8f7a24ad"},
{file = "jiter-0.11.1-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:055568693ab35e0bf3a171b03bb40b2dcb10352359e0ab9b5ed0da2bf1eb6f6f"},
{file = "jiter-0.11.1-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0c69ea798d08a915ba4478113efa9e694971e410056392f4526d796f136d3fa"},
{file = "jiter-0.11.1-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:0d4d6993edc83cf75e8c6828a8d6ce40a09ee87e38c7bfba6924f39e1337e21d"},
{file = "jiter-0.11.1-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:f78d151c83a87a6cf5461d5ee55bc730dd9ae227377ac6f115b922989b95f838"},
{file = "jiter-0.11.1-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9022974781155cd5521d5cb10997a03ee5e31e8454c9d999dcdccd253f2353f"},
{file = "jiter-0.11.1-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18c77aaa9117510d5bdc6a946baf21b1f0cfa58ef04d31c8d016f206f2118960"},
{file = "jiter-0.11.1.tar.gz", hash = "sha256:849dcfc76481c0ea0099391235b7ca97d7279e0fa4c86005457ac7c88e8b76dc"},
]
[[package]]
name = "jsonschema"
version = "3.2.0"
@@ -2492,6 +2639,34 @@ rsa = ["cryptography (>=3.0.0)"]
signals = ["blinker (>=1.4.0)"]
signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
[[package]]
name = "openai"
version = "1.109.1"
description = "The official Python library for the openai API"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "openai-1.109.1-py3-none-any.whl", hash = "sha256:6bcaf57086cf59159b8e27447e4e7dd019db5d29a438072fbd49c290c7e65315"},
{file = "openai-1.109.1.tar.gz", hash = "sha256:d173ed8dbca665892a6db099b4a2dfac624f94d20a93f46eb0b56aae940ed869"},
]
[package.dependencies]
anyio = ">=3.5.0,<5"
distro = ">=1.7.0,<2"
httpx = ">=0.23.0,<1"
jiter = ">=0.4.0,<1"
pydantic = ">=1.9.0,<3"
sniffio = "*"
tqdm = ">4"
typing-extensions = ">=4.11,<5"
[package.extras]
aiohttp = ["aiohttp", "httpx-aiohttp (>=0.1.8)"]
datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
realtime = ["websockets (>=13,<16)"]
voice-helpers = ["numpy (>=2.0.2)", "sounddevice (>=0.5.1)"]
[[package]]
name = "opentelemetry-api"
version = "1.27.0"
@@ -3864,6 +4039,18 @@ files = [
{file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"},
]
[[package]]
name = "soupsieve"
version = "2.8"
description = "A modern CSS selector implementation for Beautiful Soup."
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c"},
{file = "soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f"},
]
[[package]]
name = "sqlalchemy"
version = "2.0.35"
@@ -4477,4 +4664,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.1"
python-versions = "^3.11,<3.12"
content-hash = "979b7f159a1a284b77b9bec889298c50e3c269149524e647978b517463142e1f"
content-hash = "d6f93822b88fad4aaf0ae5d60932e0e93d9e95aa79fdddc44f9f6045387d3b4c"

File: airbyte-ci/connectors/live-tests/pyproject.toml

@@ -41,6 +41,8 @@ dpath = "^2.1.6"
genson = "^1.3.0"
segment-analytics-python = "^2.3.2"
python-slugify = ">=8.0.4"
beautifulsoup4 = "^4.12.0"
openai = "^1.0.0"
[tool.poetry.group.dev.dependencies]

File: airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/README.md (new)

@@ -0,0 +1,37 @@
# LLM-Based Regression Test Evaluation
Automated evaluation of connector regression test reports using an LLM.
## How It Works
After regression tests complete, the evaluation script extracts text from the HTML report, asks the configured model for a pass/fail verdict, and writes a summary to `GITHUB_STEP_SUMMARY`. An illustrative verdict is shown below.
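For reference, the verdict JSON written via `--output-json` (and read by the workflow with `jq '.evaluation.pass'`) has the shape below; the field values here are illustrative:
```json
{
  "evaluation": {
    "pass": true,
    "summary": "All test cases passed and record counts match between control and target versions.",
    "reasoning": "No failed tests were reported; record and message counts are identical across all streams.",
    "severity": "none",
    "recommendations": "No specific recommendations."
  }
}
```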
## Configuration
**Environment Variables:**
- `OPENAI_API_KEY` - API key (use `ollama` for Ollama)
- `OPENAI_BASE_URL` - Base URL for OpenAI-compatible API (e.g., `http://127.0.0.1:11434/v1` for Ollama)
- `EVAL_MODEL` - Model name (defaults to `gpt-4o`)
**Evaluation Prompt:**
Stored in `.github/prompts/regression-evaluation.prompt.yaml`, following GitHub's prompt-file format. The `{{report_text}}` placeholder is replaced with the extracted report text at evaluation time, as sketched below.
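A minimal sketch of how the placeholder is rendered, mirroring `load_prompt_from_yaml` and `evaluate_with_llm` in `evaluate_report.py` (the `report_text` value here is a stand-in for the extracted report):
```python
import yaml

# Load the GitHub-format prompt file and render the {{report_text}} placeholder.
with open(".github/prompts/regression-evaluation.prompt.yaml", encoding="utf-8") as f:
    prompt = yaml.safe_load(f)

report_text = "...text extracted from report.html..."  # stand-in value
messages = [
    {"role": m["role"], "content": m.get("content", "").replace("{{report_text}}", report_text)}
    for m in prompt["messages"]
]
# `messages` is then passed to client.chat.completions.create(...)
```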
## Local Testing
```bash
# Install Ollama
curl -fsSL https://ollama.com/install.sh | sh
ollama serve &
ollama pull llama3.2:3b
# Set environment
export OPENAI_API_KEY=ollama
export OPENAI_BASE_URL=http://127.0.0.1:11434/v1
export EVAL_MODEL=llama3.2:3b
# Run evaluation
cd airbyte-ci/connectors/live-tests
poetry install
poetry run python src/live_tests/regression_tests/llm_evaluation/evaluate_report.py \
--report-path /path/to/report.html
```
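The script also honors `EVAL_PROMPT_PATH` (see `evaluate_with_llm`) if you want to test a custom prompt file:
```bash
export EVAL_PROMPT_PATH=/path/to/custom.prompt.yaml
```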

File: airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/__init__.py (new)

@@ -0,0 +1 @@
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.

File: airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/evaluate_report.py (new)

@@ -0,0 +1,299 @@
#!/usr/bin/env python3
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
"""
LLM-based evaluation of regression test reports.
This script reads a regression test report (HTML format) and uses an OpenAI-compatible
LLM (the OpenAI API or a local Ollama server) to evaluate the results, make a pass/fail
judgment, and generate a summary.
The summary is written to GITHUB_STEP_SUMMARY for display in GitHub Actions.
"""
import argparse
import json
import os
import sys
from pathlib import Path
from typing import Any
import yaml
from bs4 import BeautifulSoup
from openai import OpenAI
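# Cap the report text sent to the model so the prompt stays within context limits.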
MAX_REPORT_CHARS = 200000
# Default evaluation prompt
EVAL_PROMPT = """You are an expert at evaluating connector regression test results.
Your task is to analyze the test report and determine if the regression tests should PASS or FAIL.
Consider the following criteria:
1. All test cases should pass (no failed tests)
2. Record count differences between control and target versions should be minimal or explainable
3. Message count differences should not indicate data loss or corruption
4. Stream coverage should be reasonable
5. Any warnings or errors in test outputs should be evaluated for severity
Provide your evaluation in the following JSON format:
{
"pass": true/false,
"summary": "A concise 2-3 sentence summary of the evaluation",
"reasoning": "Detailed reasoning for your pass/fail decision, including specific issues found",
"severity": "critical/major/minor/none",
"recommendations": "Any recommendations for addressing issues"
}
Be strict but fair in your evaluation. Minor differences are acceptable, but data loss,
corruption, or test failures should result in a FAIL."""
def load_prompt_from_yaml(yaml_path: Path | None = None) -> tuple[list[dict[str, str]] | None, str | None, dict[str, Any] | None]:
"""
Load prompt from GitHub-format YAML file.
Args:
yaml_path: Path to the .prompt.yaml file. If None, uses default location.
Returns:
Tuple of (messages, model, modelParameters) or (None, None, None) if file not found or invalid
"""
if yaml_path is None:
# Default location: .github/prompts/regression-evaluation.prompt.yaml
github_workspace = os.environ.get("GITHUB_WORKSPACE")
if github_workspace:
yaml_path = Path(github_workspace) / ".github" / "prompts" / "regression-evaluation.prompt.yaml"
else:
script_dir = Path(__file__).parent
# evaluate_report.py sits seven levels below the repo root:
# airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/
repo_root = script_dir.parents[6]
yaml_path = repo_root / ".github" / "prompts" / "regression-evaluation.prompt.yaml"
if not yaml_path.exists():
print(f"Prompt file not found at {yaml_path}, using default hardcoded prompt")
return None, None, None
try:
with open(yaml_path, "r", encoding="utf-8") as f:
prompt_data = yaml.safe_load(f)
messages = prompt_data.get("messages", [])
model = prompt_data.get("model")
model_params = prompt_data.get("modelParameters", {})
if not messages:
print(f"Warning: No messages found in {yaml_path}, using default hardcoded prompt")
return None, None, None
print(f"Loaded prompt from {yaml_path}")
return messages, model, model_params
except Exception as e:
print(f"Warning: Failed to load prompt from {yaml_path}: {e}")
print("Using default hardcoded prompt")
return None, None, None
def load_report_text(html_path: Path) -> str:
"""
Load and convert HTML report to clean text.
Args:
html_path: Path to the report.html file
Returns:
Clean text representation of the report
"""
with open(html_path, "r", encoding="utf-8") as f:
html_content = f.read()
soup = BeautifulSoup(html_content, "html.parser")
for element in soup(["script", "style"]):
element.decompose()
report_text = soup.get_text("\n", strip=True)
report_text = "\n".join(line.strip() for line in report_text.splitlines() if line.strip())
if len(report_text) > MAX_REPORT_CHARS:
original_length = len(report_text)
report_text = report_text[:MAX_REPORT_CHARS]
truncation_note = f"\n\n[Report truncated from {original_length} to {MAX_REPORT_CHARS} characters for evaluation]"
report_text += truncation_note
print(f"Warning: Report truncated from {original_length} to {MAX_REPORT_CHARS} characters")
return report_text
def evaluate_with_llm(report_text: str, prompt: str | None = None, prompt_yaml_path: Path | None = None) -> dict[str, Any]:
"""
Use OpenAI LLM to evaluate the regression test report.
Supports both OpenAI API and Ollama (OpenAI-compatible).
Configure via environment variables:
- OPENAI_API_KEY: API key (use 'ollama' for Ollama)
- OPENAI_BASE_URL: Optional base URL (e.g., http://127.0.0.1:11434/v1 for Ollama)
- EVAL_MODEL: Model name (defaults to gpt-4o, use llama3.2:3b for Ollama)
- EVAL_PROMPT_PATH: Optional path to custom .prompt.yaml file
Args:
report_text: Full text of the report
prompt: Optional custom evaluation prompt string (legacy, overrides YAML)
prompt_yaml_path: Optional path to .prompt.yaml file
Returns:
Dictionary containing evaluation results with 'pass', 'summary', 'reasoning', 'severity', and 'recommendations' keys
Raises:
Exception: If LLM evaluation fails after retry
"""
api_key = os.environ.get("OPENAI_API_KEY")
base_url = os.environ.get("OPENAI_BASE_URL")
model = os.environ.get("EVAL_MODEL", "gpt-4o")
if base_url:
client = OpenAI(api_key=api_key, base_url=base_url)
print(f"Using custom base URL: {base_url}")
else:
client = OpenAI(api_key=api_key)
yaml_messages, yaml_model, yaml_params = load_prompt_from_yaml(prompt_yaml_path)
if yaml_model and not os.environ.get("EVAL_MODEL"):
model = yaml_model
temperature = 0.3
if yaml_params and "temperature" in yaml_params:
temperature = yaml_params["temperature"]
print(f"Using model: {model}")
if prompt is not None:
messages = [
{"role": "system", "content": prompt},
{"role": "user", "content": f"Report:\n\n{report_text}"},
]
elif yaml_messages:
messages = []
for msg in yaml_messages:
content = msg.get("content", "")
content = content.replace("{{report_text}}", report_text)
messages.append({"role": msg["role"], "content": content})
else:
# Fallback to hardcoded EVAL_PROMPT
messages = [
{"role": "system", "content": EVAL_PROMPT},
{"role": "user", "content": f"Report:\n\n{report_text}"},
]
try:
response = client.chat.completions.create(
model=model,
messages=messages,
temperature=temperature,
response_format={"type": "json_object"},
)
evaluation = json.loads(response.choices[0].message.content)
return evaluation
except Exception as e:
error_msg = str(e).lower()
if "response_format" in error_msg or "json_object" in error_msg:
print(f"Warning: JSON response format not supported, retrying without it: {e}")
response = client.chat.completions.create(
model=model,
messages=messages,
temperature=temperature,
)
content = response.choices[0].message.content
evaluation = json.loads(content)
return evaluation
raise
def write_github_summary(evaluation: dict[str, Any], model: str | None = None) -> None:
"""
Write the evaluation summary to GITHUB_STEP_SUMMARY.
Args:
evaluation: LLM evaluation results
model: Model name used for evaluation (optional)
"""
summary_file = os.environ.get("GITHUB_STEP_SUMMARY")
if not summary_file:
print("Warning: GITHUB_STEP_SUMMARY environment variable not set. Writing to stdout instead.")
summary_file = "/dev/stdout"
status_emoji = "" if evaluation["pass"] else ""
model_info = f"model: {model}" if model else "OpenAI-compatible API"
markdown = f"""# {status_emoji} Regression Test Evaluation: {"PASS" if evaluation['pass'] else "FAIL"}
{evaluation['summary']}
{evaluation['reasoning']}
{evaluation.get('recommendations', 'No specific recommendations.')}
---
*This evaluation was generated using {model_info}*
"""
with open(summary_file, "a", encoding="utf-8") as f:
f.write(markdown)
print(f"Summary written to {summary_file}")
def main():
"""Main entry point for the LLM evaluation script."""
parser = argparse.ArgumentParser(description="Evaluate regression test reports using an OpenAI-compatible LLM")
parser.add_argument("--report-path", type=Path, required=True, help="Path to the report.html file")
parser.add_argument("--prompt-file", type=Path, help="Optional path to a custom evaluation prompt file")
parser.add_argument("--output-json", type=Path, help="Optional path to write evaluation results as JSON")
args = parser.parse_args()
if not os.environ.get("OPENAI_API_KEY"):
print("Error: OPENAI_API_KEY environment variable not set", file=sys.stderr)
sys.exit(1)
if not args.report_path.exists():
print(f"Error: Report file not found: {args.report_path}", file=sys.stderr)
sys.exit(1)
print(f"Loading report from: {args.report_path}")
report_text = load_report_text(args.report_path)
print(f"Report loaded: {len(report_text)} characters")
custom_prompt = None
if args.prompt_file and args.prompt_file.exists():
with open(args.prompt_file, "r", encoding="utf-8") as f:
custom_prompt = f.read()
print(f"Using custom prompt from: {args.prompt_file}")
prompt_yaml_path = None
eval_prompt_path = os.environ.get("EVAL_PROMPT_PATH")
if eval_prompt_path:
prompt_yaml_path = Path(eval_prompt_path)
print("Evaluating report with LLM...")
evaluation = evaluate_with_llm(report_text, custom_prompt, prompt_yaml_path)
print(f"\nEvaluation Result: {'PASS' if evaluation['pass'] else 'FAIL'}")
print(f"Summary: {evaluation['summary']}")
model = os.environ.get("EVAL_MODEL", "gpt-4o")
write_github_summary(evaluation, model)
if args.output_json:
output_data = {"evaluation": evaluation}
with open(args.output_json, "w", encoding="utf-8") as f:
json.dump(output_data, f, indent=2)
print(f"Evaluation results written to: {args.output_json}")
sys.exit(0 if evaluation["pass"] else 1)
if __name__ == "__main__":
main()