feat: add OpenAI LLM evaluation step for connector regression tests (#68673)
Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
13c3e1e34f
commit
cf1379d105
34
.github/prompts/regression-evaluation.prompt.yaml
vendored
Normal file
34
.github/prompts/regression-evaluation.prompt.yaml
vendored
Normal file
@@ -0,0 +1,34 @@
|
||||
name: Regression Report Evaluation
|
||||
description: Evaluate Airbyte connector regression test reports and return a JSON verdict with reasoning
|
||||
model: llama3.2:3b
|
||||
modelParameters:
|
||||
temperature: 0.3
|
||||
messages:
|
||||
- role: system
|
||||
content: |
|
||||
You are an expert at evaluating connector regression test results.
|
||||
Your task is to analyze the test report and determine if the regression tests should PASS or FAIL.
|
||||
|
||||
Consider the following criteria:
|
||||
1. All test cases should pass (no failed tests)
|
||||
2. Record count differences between control and target versions should be minimal or explainable
|
||||
3. Message count differences should not indicate data loss or corruption
|
||||
4. Stream coverage should be reasonable
|
||||
5. Any warnings or errors in test outputs should be evaluated for severity
|
||||
|
||||
Provide your evaluation in the following JSON format:
|
||||
{
|
||||
"pass": true/false,
|
||||
"summary": "A concise 2-3 sentence summary of the evaluation",
|
||||
"reasoning": "Detailed reasoning for your pass/fail decision, including specific issues found",
|
||||
"severity": "critical/major/minor/none",
|
||||
"recommendations": "Any recommendations for addressing issues"
|
||||
}
|
||||
|
||||
Be strict but fair in your evaluation. Minor differences are acceptable, but data loss,
|
||||
corruption, or test failures should result in a FAIL.
|
||||
- role: user
|
||||
content: |
|
||||
Report:
|
||||
|
||||
{{report_text}}
|
||||
102
.github/workflows/run-regression-tests-command.yml
vendored
102
.github/workflows/run-regression-tests-command.yml
vendored
@@ -84,7 +84,24 @@ jobs:
|
||||
name: Regression Tests
|
||||
runs-on: linux-24.04-large # Custom runner, defined in GitHub org settings
|
||||
timeout-minutes: 360 # 6 hours
|
||||
permissions:
|
||||
contents: read
|
||||
pull-requests: write
|
||||
issues: write
|
||||
steps:
|
||||
- name: Append start with run link
|
||||
id: pr-comment-id
|
||||
if: github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
|
||||
uses: peter-evans/create-or-update-comment@v4
|
||||
with:
|
||||
token: ${{ github.token }}
|
||||
issue-number: ${{ github.event.inputs.pr }}
|
||||
comment-id: ${{ github.event.inputs.comment-id }}
|
||||
edit-mode: append
|
||||
body: |
|
||||
> Starting regression tests (filter: `${{ github.event.inputs.connector_filter || '--modified' }}`)
|
||||
> Workflow run: [${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
|
||||
|
||||
- name: Install Python
|
||||
id: install_python
|
||||
uses: actions/setup-python@7f4fc3e22c37d6ff65e88745f38bd3157c663f7c # v4.9.1
|
||||
@@ -183,6 +200,7 @@ jobs:
|
||||
# forks if the user installs the app into their fork. Until we document this as a clear
|
||||
# path, we will have to keep using the PAT.
|
||||
- name: Run Regression Tests [WORKFLOW DISPATCH]
|
||||
id: run-regression-tests
|
||||
if: github.event_name == 'workflow_dispatch' # TODO: consider using the matrix strategy (https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs). See https://github.com/airbytehq/airbyte/pull/37659#discussion_r1583380234 for details.
|
||||
uses: ./.github/actions/run-airbyte-ci
|
||||
with:
|
||||
@@ -199,3 +217,87 @@ jobs:
|
||||
s3_build_cache_access_key_id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }}
|
||||
s3_build_cache_secret_key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }}
|
||||
subcommand: connectors ${{ env.USE_LOCAL_CDK_FLAG }} ${{ inputs.connector_filter }} test --only-step connector_live_tests --connector_live_tests.test-suite=regression --connector_live_tests.connection-id=${{ github.event.inputs.connection_id }} --connector_live_tests.pr-url="https://github.com/airbytehq/airbyte/pull/${{ github.event.inputs.pr }}" ${{ env.READ_WITH_STATE_FLAG }} ${{ env.DISABLE_PROXY_FLAG }} ${{ env.STREAM_PARAMS }} ${{ env.CONNECTION_SUBSET }} ${{ env.CONTROL_VERSION }} --global-status-check-context="Regression Tests" --global-status-check-description='Running regression tests'
|
||||
|
||||
- name: Upload regression test report
|
||||
if: always() && github.event_name == 'workflow_dispatch'
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: regression-test-report
|
||||
path: /tmp/regression_tests_artifacts/report.html
|
||||
if-no-files-found: ignore
|
||||
|
||||
- name: Append regression outcome
|
||||
if: always() && github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
|
||||
uses: peter-evans/create-or-update-comment@v4
|
||||
with:
|
||||
token: ${{ github.token }}
|
||||
comment-id: ${{ steps.pr-comment-id.outputs.comment-id }}
|
||||
edit-mode: append
|
||||
body: |
|
||||
> Regression tests: ${{ steps.run-regression-tests.outcome == 'success' && '✅ PASSED' || steps.run-regression-tests.outcome == 'failure' && '❌ FAILED' || steps.run-regression-tests.outcome == 'cancelled' && '⚠️ CANCELLED' || steps.run-regression-tests.outcome == 'skipped' && '⏭️ SKIPPED' || '❓ UNKNOWN' }}
|
||||
> Report: ${{ hashFiles('/tmp/regression_tests_artifacts/report.html') != '' && 'artifact `regression-test-report` available in the run' || 'not generated' }}
|
||||
|
||||
- name: Install live-tests dependencies for LLM evaluation
|
||||
if: always() && github.event_name == 'workflow_dispatch'
|
||||
working-directory: airbyte-ci/connectors/live-tests
|
||||
run: poetry install
|
||||
|
||||
- name: Install and Start Ollama
|
||||
if: always() && github.event_name == 'workflow_dispatch'
|
||||
run: |
|
||||
curl -fsSL https://ollama.com/install.sh | sh
|
||||
ollama serve &
|
||||
sleep 5
|
||||
ollama pull llama3.2:3b
|
||||
echo "Ollama server started and model pulled"
|
||||
|
||||
- name: Evaluate Regression Test Report with LLM
|
||||
if: always() && github.event_name == 'workflow_dispatch'
|
||||
id: llm-eval
|
||||
continue-on-error: true
|
||||
working-directory: airbyte-ci/connectors/live-tests
|
||||
env:
|
||||
OPENAI_API_KEY: ollama
|
||||
OPENAI_BASE_URL: http://127.0.0.1:11434/v1
|
||||
EVAL_MODEL: llama3.2:3b
|
||||
run: |
|
||||
set -u
|
||||
echo "ran=false" >> "$GITHUB_OUTPUT"
|
||||
echo "result=error" >> "$GITHUB_OUTPUT"
|
||||
|
||||
# Find the most recent report.html file in /tmp/regression_tests_artifacts/
|
||||
REPORT_PATH=$(find /tmp/regression_tests_artifacts -name "report.html" -type f -printf '%T@ %p\n' | sort -n | tail -1 | cut -f2- -d" ")
|
||||
|
||||
if [ -z "$REPORT_PATH" ]; then
|
||||
echo "Error: No report.html found in /tmp/regression_tests_artifacts/" >&2
|
||||
echo "## ⚠️ LLM Evaluation Skipped" >> "$GITHUB_STEP_SUMMARY"
|
||||
echo "No regression test report found. The tests may have failed to generate a report." >> "$GITHUB_STEP_SUMMARY"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Found report at: $REPORT_PATH"
|
||||
echo "Running LLM evaluation..."
|
||||
|
||||
# Run the evaluation script
|
||||
OUT_JSON="$RUNNER_TEMP/llm_eval.json"
|
||||
poetry run python src/live_tests/regression_tests/llm_evaluation/evaluate_report.py \
|
||||
--report-path "$REPORT_PATH" \
|
||||
--output-json "$OUT_JSON"
|
||||
|
||||
# If we got here, script exit 0 and produced a judgment
|
||||
PASS=$(jq -r '.evaluation.pass' "$OUT_JSON")
|
||||
if [ "$PASS" = "true" ]; then RES="pass"; else RES="fail"; fi
|
||||
echo "ran=true" >> "$GITHUB_OUTPUT"
|
||||
echo "result=$RES" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Append LLM outcome
|
||||
if: always() && github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
|
||||
env:
|
||||
EVAL_MODEL: llama3.2:3b
|
||||
uses: peter-evans/create-or-update-comment@v4
|
||||
with:
|
||||
token: ${{ github.token }}
|
||||
comment-id: ${{ steps.pr-comment-id.outputs.comment-id }}
|
||||
edit-mode: append
|
||||
body: |
|
||||
> LLM Evaluation: ${{ steps.llm-eval.outputs.ran == 'true' && (steps.llm-eval.outputs.result == 'pass' && '✅ PASS' || steps.llm-eval.outputs.result == 'fail' && '❌ FAIL' || '⚠️ ERROR') || '⚠️ Did not run' }}${{ steps.llm-eval.outputs.ran == 'true' && format(' (model: {0})', env.EVAL_MODEL) || '' }}
|
||||
|
||||
189
airbyte-ci/connectors/live-tests/poetry.lock
generated
189
airbyte-ci/connectors/live-tests/poetry.lock
generated
@@ -355,6 +355,29 @@ test = ["coverage (>=5.5)", "equinox", "jax[cpu]", "jaxtyping", "mypy (>=0.800)"
|
||||
test-tox = ["equinox", "jax[cpu]", "jaxtyping", "mypy (>=0.800)", "numba", "numpy", "pandera", "pygments", "pyright (>=1.1.370)", "pytest (>=4.0.0)", "sphinx", "typing-extensions (>=3.10.0.0)"]
|
||||
test-tox-coverage = ["coverage (>=5.5)"]
|
||||
|
||||
[[package]]
|
||||
name = "beautifulsoup4"
|
||||
version = "4.14.2"
|
||||
description = "Screen-scraping library"
|
||||
optional = false
|
||||
python-versions = ">=3.7.0"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "beautifulsoup4-4.14.2-py3-none-any.whl", hash = "sha256:5ef6fa3a8cbece8488d66985560f97ed091e22bbc4e9c2338508a9d5de6d4515"},
|
||||
{file = "beautifulsoup4-4.14.2.tar.gz", hash = "sha256:2a98ab9f944a11acee9cc848508ec28d9228abfd522ef0fad6a02a72e0ded69e"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
soupsieve = ">1.2"
|
||||
typing-extensions = ">=4.0.0"
|
||||
|
||||
[package.extras]
|
||||
cchardet = ["cchardet"]
|
||||
chardet = ["chardet"]
|
||||
charset-normalizer = ["charset-normalizer"]
|
||||
html5lib = ["html5lib"]
|
||||
lxml = ["lxml"]
|
||||
|
||||
[[package]]
|
||||
name = "blessed"
|
||||
version = "1.20.0"
|
||||
@@ -979,6 +1002,18 @@ wrapt = ">=1.10,<2"
|
||||
[package.extras]
|
||||
dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"]
|
||||
|
||||
[[package]]
|
||||
name = "distro"
|
||||
version = "1.9.0"
|
||||
description = "Distro - an OS platform information API"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"},
|
||||
{file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "docker"
|
||||
version = "6.1.3"
|
||||
@@ -1943,6 +1978,118 @@ files = [
|
||||
[package.dependencies]
|
||||
ansicon = {version = "*", markers = "platform_system == \"Windows\""}
|
||||
|
||||
[[package]]
|
||||
name = "jiter"
|
||||
version = "0.11.1"
|
||||
description = "Fast iterable JSON parser."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "jiter-0.11.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:ed58841a491bbbf3f7c55a6b68fff568439ab73b2cce27ace0e169057b5851df"},
|
||||
{file = "jiter-0.11.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:499beb9b2d7e51d61095a8de39ebcab1d1778f2a74085f8305a969f6cee9f3e4"},
|
||||
{file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b87b2821795e28cc990939b68ce7a038edea680a24910bd68a79d54ff3f03c02"},
|
||||
{file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:83f6fa494d8bba14ab100417c80e70d32d737e805cb85be2052d771c76fcd1f8"},
|
||||
{file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5fbc6aea1daa2ec6f5ed465f0c5e7b0607175062ceebbea5ca70dd5ddab58083"},
|
||||
{file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:302288e2edc43174bb2db838e94688d724f9aad26c5fb9a74f7a5fb427452a6a"},
|
||||
{file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85db563fe3b367bb568af5d29dea4d4066d923b8e01f3417d25ebecd958de815"},
|
||||
{file = "jiter-0.11.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f1c1ba2b6b22f775444ef53bc2d5778396d3520abc7b2e1da8eb0c27cb3ffb10"},
|
||||
{file = "jiter-0.11.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:523be464b14f8fd0cc78da6964b87b5515a056427a2579f9085ce30197a1b54a"},
|
||||
{file = "jiter-0.11.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:25b99b3f04cd2a38fefb22e822e35eb203a2cd37d680dbbc0c0ba966918af336"},
|
||||
{file = "jiter-0.11.1-cp310-cp310-win32.whl", hash = "sha256:47a79e90545a596bb9104109777894033347b11180d4751a216afef14072dbe7"},
|
||||
{file = "jiter-0.11.1-cp310-cp310-win_amd64.whl", hash = "sha256:cace75621ae9bd66878bf69fbd4dfc1a28ef8661e0c2d0eb72d3d6f1268eddf5"},
|
||||
{file = "jiter-0.11.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:9b0088ff3c374ce8ce0168523ec8e97122ebb788f950cf7bb8e39c7dc6a876a2"},
|
||||
{file = "jiter-0.11.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:74433962dd3c3090655e02e461267095d6c84f0741c7827de11022ef8d7ff661"},
|
||||
{file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d98030e345e6546df2cc2c08309c502466c66c4747b043f1a0d415fada862b8"},
|
||||
{file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1d6db0b2e788db46bec2cf729a88b6dd36959af2abd9fa2312dfba5acdd96dcb"},
|
||||
{file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:55678fbbda261eafe7289165dd2ddd0e922df5f9a1ae46d7c79a5a15242bd7d1"},
|
||||
{file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a6b74fae8e40497653b52ce6ca0f1b13457af769af6fb9c1113efc8b5b4d9be"},
|
||||
{file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a55a453f8b035eb4f7852a79a065d616b7971a17f5e37a9296b4b38d3b619e4"},
|
||||
{file = "jiter-0.11.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2638148099022e6bdb3f42904289cd2e403609356fb06eb36ddec2d50958bc29"},
|
||||
{file = "jiter-0.11.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:252490567a5d990986f83b95a5f1ca1bf205ebd27b3e9e93bb7c2592380e29b9"},
|
||||
{file = "jiter-0.11.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d431d52b0ca2436eea6195f0f48528202100c7deda354cb7aac0a302167594d5"},
|
||||
{file = "jiter-0.11.1-cp311-cp311-win32.whl", hash = "sha256:db6f41e40f8bae20c86cb574b48c4fd9f28ee1c71cb044e9ec12e78ab757ba3a"},
|
||||
{file = "jiter-0.11.1-cp311-cp311-win_amd64.whl", hash = "sha256:0cc407b8e6cdff01b06bb80f61225c8b090c3df108ebade5e0c3c10993735b19"},
|
||||
{file = "jiter-0.11.1-cp311-cp311-win_arm64.whl", hash = "sha256:fe04ea475392a91896d1936367854d346724a1045a247e5d1c196410473b8869"},
|
||||
{file = "jiter-0.11.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:c92148eec91052538ce6823dfca9525f5cfc8b622d7f07e9891a280f61b8c96c"},
|
||||
{file = "jiter-0.11.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ecd4da91b5415f183a6be8f7158d127bdd9e6a3174138293c0d48d6ea2f2009d"},
|
||||
{file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7e3ac25c00b9275684d47aa42febaa90a9958e19fd1726c4ecf755fbe5e553b"},
|
||||
{file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:57d7305c0a841858f866cd459cd9303f73883fb5e097257f3d4a3920722c69d4"},
|
||||
{file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e86fa10e117dce22c547f31dd6d2a9a222707d54853d8de4e9a2279d2c97f239"},
|
||||
{file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ae5ef1d48aec7e01ee8420155d901bb1d192998fa811a65ebb82c043ee186711"},
|
||||
{file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb68e7bf65c990531ad8715e57d50195daf7c8e6f1509e617b4e692af1108939"},
|
||||
{file = "jiter-0.11.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43b30c8154ded5845fa454ef954ee67bfccce629b2dea7d01f795b42bc2bda54"},
|
||||
{file = "jiter-0.11.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:586cafbd9dd1f3ce6a22b4a085eaa6be578e47ba9b18e198d4333e598a91db2d"},
|
||||
{file = "jiter-0.11.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:677cc2517d437a83bb30019fd4cf7cad74b465914c56ecac3440d597ac135250"},
|
||||
{file = "jiter-0.11.1-cp312-cp312-win32.whl", hash = "sha256:fa992af648fcee2b850a3286a35f62bbbaeddbb6dbda19a00d8fbc846a947b6e"},
|
||||
{file = "jiter-0.11.1-cp312-cp312-win_amd64.whl", hash = "sha256:88b5cae9fa51efeb3d4bd4e52bfd4c85ccc9cac44282e2a9640893a042ba4d87"},
|
||||
{file = "jiter-0.11.1-cp312-cp312-win_arm64.whl", hash = "sha256:9a6cae1ab335551917f882f2c3c1efe7617b71b4c02381e4382a8fc80a02588c"},
|
||||
{file = "jiter-0.11.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:71b6a920a5550f057d49d0e8bcc60945a8da998019e83f01adf110e226267663"},
|
||||
{file = "jiter-0.11.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0b3de72e925388453a5171be83379549300db01284f04d2a6f244d1d8de36f94"},
|
||||
{file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc19dd65a2bd3d9c044c5b4ebf657ca1e6003a97c0fc10f555aa4f7fb9821c00"},
|
||||
{file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d58faaa936743cd1464540562f60b7ce4fd927e695e8bc31b3da5b914baa9abd"},
|
||||
{file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:902640c3103625317291cb73773413b4d71847cdf9383ba65528745ff89f1d14"},
|
||||
{file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:30405f726e4c2ed487b176c09f8b877a957f535d60c1bf194abb8dadedb5836f"},
|
||||
{file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3217f61728b0baadd2551844870f65219ac4a1285d5e1a4abddff3d51fdabe96"},
|
||||
{file = "jiter-0.11.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b1364cc90c03a8196f35f396f84029f12abe925415049204446db86598c8b72c"},
|
||||
{file = "jiter-0.11.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:53a54bf8e873820ab186b2dca9f6c3303f00d65ae5e7b7d6bda1b95aa472d646"},
|
||||
{file = "jiter-0.11.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7e29aca023627b0e0c2392d4248f6414d566ff3974fa08ff2ac8dbb96dfee92a"},
|
||||
{file = "jiter-0.11.1-cp313-cp313-win32.whl", hash = "sha256:f153e31d8bca11363751e875c0a70b3d25160ecbaee7b51e457f14498fb39d8b"},
|
||||
{file = "jiter-0.11.1-cp313-cp313-win_amd64.whl", hash = "sha256:f773f84080b667c69c4ea0403fc67bb08b07e2b7ce1ef335dea5868451e60fed"},
|
||||
{file = "jiter-0.11.1-cp313-cp313-win_arm64.whl", hash = "sha256:635ecd45c04e4c340d2187bcb1cea204c7cc9d32c1364d251564bf42e0e39c2d"},
|
||||
{file = "jiter-0.11.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d892b184da4d94d94ddb4031296931c74ec8b325513a541ebfd6dfb9ae89904b"},
|
||||
{file = "jiter-0.11.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa22c223a3041dacb2fcd37c70dfd648b44662b4a48e242592f95bda5ab09d58"},
|
||||
{file = "jiter-0.11.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:330e8e6a11ad4980cd66a0f4a3e0e2e0f646c911ce047014f984841924729789"},
|
||||
{file = "jiter-0.11.1-cp313-cp313t-win_amd64.whl", hash = "sha256:09e2e386ebf298547ca3a3704b729471f7ec666c2906c5c26c1a915ea24741ec"},
|
||||
{file = "jiter-0.11.1-cp313-cp313t-win_arm64.whl", hash = "sha256:fe4a431c291157e11cee7c34627990ea75e8d153894365a3bc84b7a959d23ca8"},
|
||||
{file = "jiter-0.11.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:0fa1f70da7a8a9713ff8e5f75ec3f90c0c870be6d526aa95e7c906f6a1c8c676"},
|
||||
{file = "jiter-0.11.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:569ee559e5046a42feb6828c55307cf20fe43308e3ae0d8e9e4f8d8634d99944"},
|
||||
{file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f69955fa1d92e81987f092b233f0be49d4c937da107b7f7dcf56306f1d3fcce9"},
|
||||
{file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:090f4c9d4a825e0fcbd0a2647c9a88a0f366b75654d982d95a9590745ff0c48d"},
|
||||
{file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bbf3d8cedf9e9d825233e0dcac28ff15c47b7c5512fdfe2e25fd5bbb6e6b0cee"},
|
||||
{file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2aa9b1958f9c30d3d1a558b75f0626733c60eb9b7774a86b34d88060be1e67fe"},
|
||||
{file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e42d1ca16590b768c5e7d723055acd2633908baacb3628dd430842e2e035aa90"},
|
||||
{file = "jiter-0.11.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5db4c2486a023820b701a17aec9c5a6173c5ba4393f26662f032f2de9c848b0f"},
|
||||
{file = "jiter-0.11.1-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:4573b78777ccfac954859a6eff45cbd9d281d80c8af049d0f1a3d9fc323d5c3a"},
|
||||
{file = "jiter-0.11.1-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:7593ac6f40831d7961cb67633c39b9fef6689a211d7919e958f45710504f52d3"},
|
||||
{file = "jiter-0.11.1-cp314-cp314-win32.whl", hash = "sha256:87202ec6ff9626ff5f9351507def98fcf0df60e9a146308e8ab221432228f4ea"},
|
||||
{file = "jiter-0.11.1-cp314-cp314-win_amd64.whl", hash = "sha256:a5dd268f6531a182c89d0dd9a3f8848e86e92dfff4201b77a18e6b98aa59798c"},
|
||||
{file = "jiter-0.11.1-cp314-cp314-win_arm64.whl", hash = "sha256:5d761f863f912a44748a21b5c4979c04252588ded8d1d2760976d2e42cd8d991"},
|
||||
{file = "jiter-0.11.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2cc5a3965285ddc33e0cab933e96b640bc9ba5940cea27ebbbf6695e72d6511c"},
|
||||
{file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b572b3636a784c2768b2342f36a23078c8d3aa6d8a30745398b1bab58a6f1a8"},
|
||||
{file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ad93e3d67a981f96596d65d2298fe8d1aa649deb5374a2fb6a434410ee11915e"},
|
||||
{file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a83097ce379e202dcc3fe3fc71a16d523d1ee9192c8e4e854158f96b3efe3f2f"},
|
||||
{file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7042c51e7fbeca65631eb0c332f90c0c082eab04334e7ccc28a8588e8e2804d9"},
|
||||
{file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a68d679c0e47649a61df591660507608adc2652442de7ec8276538ac46abe08"},
|
||||
{file = "jiter-0.11.1-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a1b0da75dbf4b6ec0b3c9e604d1ee8beaf15bc046fff7180f7d89e3cdbd3bb51"},
|
||||
{file = "jiter-0.11.1-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:69dd514bf0fa31c62147d6002e5ca2b3e7ef5894f5ac6f0a19752385f4e89437"},
|
||||
{file = "jiter-0.11.1-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:bb31ac0b339efa24c0ca606febd8b77ef11c58d09af1b5f2be4c99e907b11111"},
|
||||
{file = "jiter-0.11.1-cp314-cp314t-win32.whl", hash = "sha256:b2ce0d6156a1d3ad41da3eec63b17e03e296b78b0e0da660876fccfada86d2f7"},
|
||||
{file = "jiter-0.11.1-cp314-cp314t-win_amd64.whl", hash = "sha256:f4db07d127b54c4a2d43b4cf05ff0193e4f73e0dd90c74037e16df0b29f666e1"},
|
||||
{file = "jiter-0.11.1-cp314-cp314t-win_arm64.whl", hash = "sha256:28e4fdf2d7ebfc935523e50d1efa3970043cfaa161674fe66f9642409d001dfe"},
|
||||
{file = "jiter-0.11.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:baa99c8db49467527658bb479857344daf0a14dff909b7f6714579ac439d1253"},
|
||||
{file = "jiter-0.11.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:860fe55fa3b01ad0edf2adde1098247ff5c303d0121f9ce028c03d4f88c69502"},
|
||||
{file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:173dd349d99b6feaf5a25a6fbcaf3489a6f947708d808240587a23df711c67db"},
|
||||
{file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:14ac1dca837514cc946a6ac2c4995d9695303ecc754af70a3163d057d1a444ab"},
|
||||
{file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69af47de5f93a231d5b85f7372d3284a5be8edb4cc758f006ec5a1406965ac5e"},
|
||||
{file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:685f8b3abd3bbd3e06e4dfe2429ff87fd5d7a782701151af99b1fcbd80e31b2b"},
|
||||
{file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d04afa2d4e5526e54ae8a58feea953b1844bf6e3526bc589f9de68e86d0ea01"},
|
||||
{file = "jiter-0.11.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1e92b927259035b50d8e11a8fdfe0ebd014d883e4552d37881643fa289a4bcf1"},
|
||||
{file = "jiter-0.11.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:e7bd8be4fad8d4c5558b7801770cd2da6c072919c6f247cc5336edb143f25304"},
|
||||
{file = "jiter-0.11.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:121381a77a3c85987f3eba0d30ceaca9116f7463bedeec2fa79b2e7286b89b60"},
|
||||
{file = "jiter-0.11.1-cp39-cp39-win32.whl", hash = "sha256:160225407f6dfabdf9be1b44e22f06bc293a78a28ffa4347054698bd712dad06"},
|
||||
{file = "jiter-0.11.1-cp39-cp39-win_amd64.whl", hash = "sha256:028e0d59bcdfa1079f8df886cdaefc6f515c27a5288dec956999260c7e4a7cfd"},
|
||||
{file = "jiter-0.11.1-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:e642b5270e61dd02265866398707f90e365b5db2eb65a4f30c789d826682e1f6"},
|
||||
{file = "jiter-0.11.1-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:464ba6d000585e4e2fd1e891f31f1231f497273414f5019e27c00a4b8f7a24ad"},
|
||||
{file = "jiter-0.11.1-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:055568693ab35e0bf3a171b03bb40b2dcb10352359e0ab9b5ed0da2bf1eb6f6f"},
|
||||
{file = "jiter-0.11.1-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0c69ea798d08a915ba4478113efa9e694971e410056392f4526d796f136d3fa"},
|
||||
{file = "jiter-0.11.1-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:0d4d6993edc83cf75e8c6828a8d6ce40a09ee87e38c7bfba6924f39e1337e21d"},
|
||||
{file = "jiter-0.11.1-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:f78d151c83a87a6cf5461d5ee55bc730dd9ae227377ac6f115b922989b95f838"},
|
||||
{file = "jiter-0.11.1-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9022974781155cd5521d5cb10997a03ee5e31e8454c9d999dcdccd253f2353f"},
|
||||
{file = "jiter-0.11.1-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18c77aaa9117510d5bdc6a946baf21b1f0cfa58ef04d31c8d016f206f2118960"},
|
||||
{file = "jiter-0.11.1.tar.gz", hash = "sha256:849dcfc76481c0ea0099391235b7ca97d7279e0fa4c86005457ac7c88e8b76dc"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jsonschema"
|
||||
version = "3.2.0"
|
||||
@@ -2492,6 +2639,34 @@ rsa = ["cryptography (>=3.0.0)"]
|
||||
signals = ["blinker (>=1.4.0)"]
|
||||
signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
|
||||
|
||||
[[package]]
|
||||
name = "openai"
|
||||
version = "1.109.1"
|
||||
description = "The official Python library for the openai API"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "openai-1.109.1-py3-none-any.whl", hash = "sha256:6bcaf57086cf59159b8e27447e4e7dd019db5d29a438072fbd49c290c7e65315"},
|
||||
{file = "openai-1.109.1.tar.gz", hash = "sha256:d173ed8dbca665892a6db099b4a2dfac624f94d20a93f46eb0b56aae940ed869"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
anyio = ">=3.5.0,<5"
|
||||
distro = ">=1.7.0,<2"
|
||||
httpx = ">=0.23.0,<1"
|
||||
jiter = ">=0.4.0,<1"
|
||||
pydantic = ">=1.9.0,<3"
|
||||
sniffio = "*"
|
||||
tqdm = ">4"
|
||||
typing-extensions = ">=4.11,<5"
|
||||
|
||||
[package.extras]
|
||||
aiohttp = ["aiohttp", "httpx-aiohttp (>=0.1.8)"]
|
||||
datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
|
||||
realtime = ["websockets (>=13,<16)"]
|
||||
voice-helpers = ["numpy (>=2.0.2)", "sounddevice (>=0.5.1)"]
|
||||
|
||||
[[package]]
|
||||
name = "opentelemetry-api"
|
||||
version = "1.27.0"
|
||||
@@ -3864,6 +4039,18 @@ files = [
|
||||
{file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "soupsieve"
|
||||
version = "2.8"
|
||||
description = "A modern CSS selector implementation for Beautiful Soup."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c"},
|
||||
{file = "soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sqlalchemy"
|
||||
version = "2.0.35"
|
||||
@@ -4477,4 +4664,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = "^3.11,<3.12"
|
||||
content-hash = "979b7f159a1a284b77b9bec889298c50e3c269149524e647978b517463142e1f"
|
||||
content-hash = "d6f93822b88fad4aaf0ae5d60932e0e93d9e95aa79fdddc44f9f6045387d3b4c"
|
||||
|
||||
@@ -41,6 +41,8 @@ dpath = "^2.1.6"
|
||||
genson = "^1.3.0"
|
||||
segment-analytics-python = "^2.3.2"
|
||||
python-slugify = ">=8.0.4"
|
||||
beautifulsoup4 = "^4.12.0"
|
||||
openai = "^1.0.0"
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
# LLM-Based Regression Test Evaluation
|
||||
|
||||
Automated evaluation of connector regression test reports using LLM models.
|
||||
|
||||
## How It Works
|
||||
|
||||
After regression tests complete, this evaluates the HTML report and writes a pass/fail judgment to `GITHUB_STEP_SUMMARY`.
|
||||
|
||||
## Configuration
|
||||
|
||||
**Environment Variables:**
|
||||
- `OPENAI_API_KEY` - API key (use `ollama` for Ollama)
|
||||
- `OPENAI_BASE_URL` - Base URL for OpenAI-compatible API (e.g., `http://127.0.0.1:11434/v1` for Ollama)
|
||||
- `EVAL_MODEL` - Model name (defaults to `gpt-4o`)
|
||||
|
||||
**Evaluation Prompt:**
|
||||
Stored in `.github/prompts/regression-evaluation.prompt.yaml` following GitHub's prompt format. Uses `{{report_text}}` placeholder for dynamic content injection.
|
||||
|
||||
## Local Testing
|
||||
|
||||
```bash
|
||||
# Install Ollama
|
||||
curl -fsSL https://ollama.com/install.sh | sh
|
||||
ollama serve &
|
||||
ollama pull llama3.2:3b
|
||||
|
||||
# Set environment
|
||||
export OPENAI_API_KEY=ollama
|
||||
export OPENAI_BASE_URL=http://127.0.0.1:11434/v1
|
||||
export EVAL_MODEL=llama3.2:3b
|
||||
|
||||
# Run evaluation
|
||||
cd airbyte-ci/connectors/live-tests
|
||||
poetry install
|
||||
poetry run python src/live_tests/regression_tests/llm_evaluation/evaluate_report.py \
|
||||
--report-path /path/to/report.html
|
||||
```
|
||||
@@ -0,0 +1 @@
|
||||
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
||||
@@ -0,0 +1,299 @@
|
||||
#!/usr/bin/env python3
|
||||
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
||||
|
||||
"""
|
||||
LLM-based evaluation of regression test reports.
|
||||
|
||||
This script reads a regression test report (HTML format) and uses OpenAI's LLM
|
||||
to evaluate the results, make a pass/fail judgment, and generate a summary.
|
||||
The summary is written to GITHUB_STEP_SUMMARY for display in GitHub Actions.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
from bs4 import BeautifulSoup
|
||||
from openai import OpenAI
|
||||
|
||||
# Hard cap on the amount of report text sent to the LLM, in characters
# (not tokens); load_report_text truncates beyond this and appends a note.
MAX_REPORT_CHARS = 200000


# Default evaluation prompt
# Fallback system prompt used when no .prompt.yaml file can be loaded
# (see load_prompt_from_yaml). It must stay in sync with the criteria and
# JSON schema described in .github/prompts/regression-evaluation.prompt.yaml.
EVAL_PROMPT = """You are an expert at evaluating connector regression test results.
Your task is to analyze the test report and determine if the regression tests should PASS or FAIL.

Consider the following criteria:
1. All test cases should pass (no failed tests)
2. Record count differences between control and target versions should be minimal or explainable
3. Message count differences should not indicate data loss or corruption
4. Stream coverage should be reasonable
5. Any warnings or errors in test outputs should be evaluated for severity

Provide your evaluation in the following JSON format:
{
    "pass": true/false,
    "summary": "A concise 2-3 sentence summary of the evaluation",
    "reasoning": "Detailed reasoning for your pass/fail decision, including specific issues found",
    "severity": "critical/major/minor/none",
    "recommendations": "Any recommendations for addressing issues"
}

Be strict but fair in your evaluation. Minor differences are acceptable, but data loss,
corruption, or test failures should result in a FAIL."""
|
||||
|
||||
|
||||
def load_prompt_from_yaml(yaml_path: Path | None = None) -> tuple[list[dict[str, str]] | None, str | None, dict[str, Any] | None]:
|
||||
"""
|
||||
Load prompt from GitHub-format YAML file.
|
||||
|
||||
Args:
|
||||
yaml_path: Path to the .prompt.yaml file. If None, uses default location.
|
||||
|
||||
Returns:
|
||||
Tuple of (messages, model, modelParameters) or (None, None, None) if file not found or invalid
|
||||
"""
|
||||
if yaml_path is None:
|
||||
# Default location: .github/prompts/regression-evaluation.prompt.yaml
|
||||
github_workspace = os.environ.get("GITHUB_WORKSPACE")
|
||||
if github_workspace:
|
||||
yaml_path = Path(github_workspace) / ".github" / "prompts" / "regression-evaluation.prompt.yaml"
|
||||
else:
|
||||
script_dir = Path(__file__).parent
|
||||
repo_root = script_dir.parent.parent.parent.parent.parent.parent
|
||||
yaml_path = repo_root / ".github" / "prompts" / "regression-evaluation.prompt.yaml"
|
||||
|
||||
if not yaml_path.exists():
|
||||
print(f"Prompt file not found at {yaml_path}, using default hardcoded prompt")
|
||||
return None, None, None
|
||||
|
||||
try:
|
||||
with open(yaml_path, "r", encoding="utf-8") as f:
|
||||
prompt_data = yaml.safe_load(f)
|
||||
|
||||
messages = prompt_data.get("messages", [])
|
||||
model = prompt_data.get("model")
|
||||
model_params = prompt_data.get("modelParameters", {})
|
||||
|
||||
if not messages:
|
||||
print(f"Warning: No messages found in {yaml_path}, using default hardcoded prompt")
|
||||
return None, None, None
|
||||
|
||||
print(f"Loaded prompt from {yaml_path}")
|
||||
return messages, model, model_params
|
||||
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to load prompt from {yaml_path}: {e}")
|
||||
print("Using default hardcoded prompt")
|
||||
return None, None, None
|
||||
|
||||
|
||||
def load_report_text(html_path: Path) -> str:
    """
    Read an HTML report and reduce it to plain text for LLM consumption.

    Script and style tags are discarded, blank lines are removed, and the
    result is capped at MAX_REPORT_CHARS with a truncation note appended.

    Args:
        html_path: Path to the report.html file

    Returns:
        Clean text representation of the report
    """
    html_content = html_path.read_text(encoding="utf-8")

    soup = BeautifulSoup(html_content, "html.parser")

    # Drop non-content tags before extracting text.
    for tag in soup(["script", "style"]):
        tag.decompose()

    raw_text = soup.get_text("\n", strip=True)

    # Normalize: strip each line and discard the empty ones.
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    report_text = "\n".join(line for line in stripped_lines if line)

    if len(report_text) > MAX_REPORT_CHARS:
        original_length = len(report_text)
        note = f"\n\n[Report truncated from {original_length} to {MAX_REPORT_CHARS} characters for evaluation]"
        report_text = report_text[:MAX_REPORT_CHARS] + note
        print(f"Warning: Report truncated from {original_length} to {MAX_REPORT_CHARS} characters")

    return report_text
|
||||
|
||||
|
||||
def evaluate_with_llm(report_text: str, prompt: str | None = None, prompt_yaml_path: Path | None = None) -> dict[str, Any]:
    """
    Use an OpenAI-compatible LLM to evaluate the regression test report.

    Supports both OpenAI API and Ollama (OpenAI-compatible).
    Configure via environment variables:
    - OPENAI_API_KEY: API key (use 'ollama' for Ollama)
    - OPENAI_BASE_URL: Optional base URL (e.g., http://127.0.0.1:11434/v1 for Ollama)
    - EVAL_MODEL: Model name (defaults to gpt-4o, use llama3.2:3b for Ollama)
    - EVAL_PROMPT_PATH: Optional path to custom .prompt.yaml file

    Args:
        report_text: Full text of the report
        prompt: Optional custom evaluation prompt string (legacy, overrides YAML)
        prompt_yaml_path: Optional path to .prompt.yaml file

    Returns:
        Dictionary containing evaluation results with 'pass', 'summary', 'reasoning',
        'severity', and 'recommendations' keys (as requested from the model; the
        shape is not validated here)

    Raises:
        ValueError: If the model returns an empty completion
        Exception: If LLM evaluation fails after retry
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    base_url = os.environ.get("OPENAI_BASE_URL")
    model = os.environ.get("EVAL_MODEL", "gpt-4o")

    if base_url:
        client = OpenAI(api_key=api_key, base_url=base_url)
        print(f"Using custom base URL: {base_url}")
    else:
        client = OpenAI(api_key=api_key)

    yaml_messages, yaml_model, yaml_params = load_prompt_from_yaml(prompt_yaml_path)

    # Model precedence: EVAL_MODEL env var > YAML `model` key > gpt-4o default.
    if yaml_model and not os.environ.get("EVAL_MODEL"):
        model = yaml_model

    temperature = 0.3
    if yaml_params and "temperature" in yaml_params:
        temperature = yaml_params["temperature"]

    print(f"Using model: {model}")

    # Prompt precedence: explicit `prompt` string (legacy) > YAML messages >
    # hardcoded EVAL_PROMPT fallback.
    if prompt is not None:
        messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": f"Report:\n\n{report_text}"},
        ]
    elif yaml_messages:
        messages = []
        for msg in yaml_messages:
            content = msg.get("content", "")
            # Inject the report into the {{report_text}} placeholder used by
            # the GitHub prompt format.
            content = content.replace("{{report_text}}", report_text)
            messages.append({"role": msg["role"], "content": content})
    else:
        # Fallback to hardcoded EVAL_PROMPT
        messages = [
            {"role": "system", "content": EVAL_PROMPT},
            {"role": "user", "content": f"Report:\n\n{report_text}"},
        ]

    def _parse_evaluation(response) -> dict[str, Any]:
        # `message.content` can be None (e.g. a refusal or tool-call reply);
        # json.loads(None) would raise an opaque TypeError, so fail loudly
        # with a clear message instead.
        content = response.choices[0].message.content
        if content is None:
            raise ValueError(f"Model {model} returned an empty completion; cannot parse evaluation JSON")
        return json.loads(content)

    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            response_format={"type": "json_object"},
        )
        return _parse_evaluation(response)
    except Exception as e:
        # Some OpenAI-compatible servers (older Ollama versions, proxies)
        # reject the structured-output parameter; retry once without it.
        error_msg = str(e).lower()
        if "response_format" in error_msg or "json_object" in error_msg:
            print(f"Warning: JSON response format not supported, retrying without it: {e}")
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
            )
            return _parse_evaluation(response)
        raise
|
||||
|
||||
|
||||
def write_github_summary(evaluation: dict[str, Any], model: str | None = None) -> None:
|
||||
"""
|
||||
Write the evaluation summary to GITHUB_STEP_SUMMARY.
|
||||
|
||||
Args:
|
||||
evaluation: LLM evaluation results
|
||||
model: Model name used for evaluation (optional)
|
||||
"""
|
||||
summary_file = os.environ.get("GITHUB_STEP_SUMMARY")
|
||||
if not summary_file:
|
||||
print("Warning: GITHUB_STEP_SUMMARY environment variable not set. Writing to stdout instead.")
|
||||
summary_file = "/dev/stdout"
|
||||
|
||||
status_emoji = "✅" if evaluation["pass"] else "❌"
|
||||
|
||||
model_info = f"model: {model}" if model else "OpenAI-compatible API"
|
||||
|
||||
markdown = f"""# {status_emoji} Regression Test Evaluation: {"PASS" if evaluation['pass'] else "FAIL"}
|
||||
|
||||
{evaluation['summary']}
|
||||
|
||||
|
||||
{evaluation['reasoning']}
|
||||
|
||||
{evaluation.get('recommendations', 'No specific recommendations.')}
|
||||
|
||||
---
|
||||
*This evaluation was generated using {model_info}*
|
||||
"""
|
||||
|
||||
with open(summary_file, "a", encoding="utf-8") as f:
|
||||
f.write(markdown)
|
||||
|
||||
print(f"Summary written to {summary_file}")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: load a report, evaluate it with the LLM, publish the verdict.

    Exits 0 when the evaluation passes, 1 on a FAIL verdict or a missing
    prerequisite (API key or report file).
    """
    parser = argparse.ArgumentParser(description="Evaluate regression test reports using OpenAI LLM")
    parser.add_argument("--report-path", type=Path, required=True, help="Path to the report.html file")
    parser.add_argument("--prompt-file", type=Path, help="Optional path to a custom evaluation prompt file")
    parser.add_argument("--output-json", type=Path, help="Optional path to write evaluation results as JSON")
    args = parser.parse_args()

    # Fail fast on missing prerequisites before doing any work.
    if not os.environ.get("OPENAI_API_KEY"):
        print("Error: OPENAI_API_KEY environment variable not set", file=sys.stderr)
        sys.exit(1)

    if not args.report_path.exists():
        print(f"Error: Report file not found: {args.report_path}", file=sys.stderr)
        sys.exit(1)

    print(f"Loading report from: {args.report_path}")
    report_text = load_report_text(args.report_path)
    print(f"Report loaded: {len(report_text)} characters")

    # A legacy plain-text prompt file, when given, takes precedence over
    # the YAML prompt inside evaluate_with_llm.
    custom_prompt = None
    if args.prompt_file and args.prompt_file.exists():
        custom_prompt = args.prompt_file.read_text(encoding="utf-8")
        print(f"Using custom prompt from: {args.prompt_file}")

    env_prompt_path = os.environ.get("EVAL_PROMPT_PATH")
    prompt_yaml_path = Path(env_prompt_path) if env_prompt_path else None

    print("Evaluating report with LLM...")
    evaluation = evaluate_with_llm(report_text, custom_prompt, prompt_yaml_path)

    verdict_pass = evaluation["pass"]
    print(f"\nEvaluation Result: {'PASS' if verdict_pass else 'FAIL'}")
    print(f"Summary: {evaluation['summary']}")

    write_github_summary(evaluation, os.environ.get("EVAL_MODEL", "gpt-4o"))

    if args.output_json:
        with open(args.output_json, "w", encoding="utf-8") as f:
            json.dump({"evaluation": evaluation}, f, indent=2)
        print(f"Evaluation results written to: {args.output_json}")

    sys.exit(0 if verdict_pass else 1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user