From cf1379d105501c97ee3bffbb257fc7d2964d5f97 Mon Sep 17 00:00:00 2001 From: "Aaron (\"AJ\") Steers" Date: Thu, 20 Nov 2025 17:27:25 -0800 Subject: [PATCH] feat: add OpenAI LLM evaluation step for connector regression tests (#68673) Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../prompts/regression-evaluation.prompt.yaml | 34 ++ .../run-regression-tests-command.yml | 102 ++++++ airbyte-ci/connectors/live-tests/poetry.lock | 189 ++++++++++- .../connectors/live-tests/pyproject.toml | 2 + .../regression_tests/llm_evaluation/README.md | 37 +++ .../llm_evaluation/__init__.py | 1 + .../llm_evaluation/evaluate_report.py | 299 ++++++++++++++++++ 7 files changed, 663 insertions(+), 1 deletion(-) create mode 100644 .github/prompts/regression-evaluation.prompt.yaml create mode 100644 airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/README.md create mode 100644 airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/__init__.py create mode 100644 airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/evaluate_report.py diff --git a/.github/prompts/regression-evaluation.prompt.yaml b/.github/prompts/regression-evaluation.prompt.yaml new file mode 100644 index 00000000000..411d2e9778b --- /dev/null +++ b/.github/prompts/regression-evaluation.prompt.yaml @@ -0,0 +1,34 @@ +name: Regression Report Evaluation +description: Evaluate Airbyte connector regression test reports and return a JSON verdict with reasoning +model: llama3.2:3b +modelParameters: + temperature: 0.3 +messages: + - role: system + content: | + You are an expert at evaluating connector regression test results. + Your task is to analyze the test report and determine if the regression tests should PASS or FAIL. + + Consider the following criteria: + 1. All test cases should pass (no failed tests) + 2. Record count differences between control and target versions should be minimal or explainable + 3. Message count differences should not indicate data loss or corruption + 4. Stream coverage should be reasonable + 5. Any warnings or errors in test outputs should be evaluated for severity + + Provide your evaluation in the following JSON format: + { + "pass": true/false, + "summary": "A concise 2-3 sentence summary of the evaluation", + "reasoning": "Detailed reasoning for your pass/fail decision, including specific issues found", + "severity": "critical/major/minor/none", + "recommendations": "Any recommendations for addressing issues" + } + + Be strict but fair in your evaluation. Minor differences are acceptable, but data loss, + corruption, or test failures should result in a FAIL. 
+  - role: user
+    content: |
+      Report:
+
+      {{report_text}}
diff --git a/.github/workflows/run-regression-tests-command.yml b/.github/workflows/run-regression-tests-command.yml
index c3d17ece80b..9d5c2a85e05 100644
--- a/.github/workflows/run-regression-tests-command.yml
+++ b/.github/workflows/run-regression-tests-command.yml
@@ -84,7 +84,24 @@ jobs:
     name: Regression Tests
     runs-on: linux-24.04-large # Custom runner, defined in GitHub org settings
     timeout-minutes: 360 # 6 hours
+    permissions:
+      contents: read
+      pull-requests: write
+      issues: write
     steps:
+      - name: Append start message with run link
+        id: pr-comment-id
+        if: github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
+        uses: peter-evans/create-or-update-comment@v4
+        with:
+          token: ${{ github.token }}
+          issue-number: ${{ github.event.inputs.pr }}
+          comment-id: ${{ github.event.inputs.comment-id }}
+          edit-mode: append
+          body: |
+            > Starting regression tests (filter: `${{ github.event.inputs.connector_filter || '--modified' }}`)
+            > Workflow run: [${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
+
       - name: Install Python
         id: install_python
         uses: actions/setup-python@7f4fc3e22c37d6ff65e88745f38bd3157c663f7c # v4.9.1
@@ -183,6 +200,7 @@ jobs:
         # forks if the user installs the app into their fork. Until we document this as a clear
         # path, we will have to keep using the PAT.
       - name: Run Regression Tests [WORKFLOW DISPATCH]
+        id: run-regression-tests
         if: github.event_name == 'workflow_dispatch' # TODO: consider using the matrix strategy (https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs). See https://github.com/airbytehq/airbyte/pull/37659#discussion_r1583380234 for details.
         uses: ./.github/actions/run-airbyte-ci
         with:
@@ -199,3 +217,87 @@ jobs:
           s3_build_cache_access_key_id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }}
           s3_build_cache_secret_key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }}
           subcommand: connectors ${{ env.USE_LOCAL_CDK_FLAG }} ${{ inputs.connector_filter }} test --only-step connector_live_tests --connector_live_tests.test-suite=regression --connector_live_tests.connection-id=${{ github.event.inputs.connection_id }} --connector_live_tests.pr-url="https://github.com/airbytehq/airbyte/pull/${{ github.event.inputs.pr }}" ${{ env.READ_WITH_STATE_FLAG }} ${{ env.DISABLE_PROXY_FLAG }} ${{ env.STREAM_PARAMS }} ${{ env.CONNECTION_SUBSET }} ${{ env.CONTROL_VERSION }} --global-status-check-context="Regression Tests" --global-status-check-description='Running regression tests'
+
+      - name: Upload regression test report
+        if: always() && github.event_name == 'workflow_dispatch'
+        uses: actions/upload-artifact@v4
+        with:
+          name: regression-test-report
+          path: /tmp/regression_tests_artifacts/report.html
+          if-no-files-found: ignore
+
+      - name: Append regression outcome
+        if: always() && github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
+        uses: peter-evans/create-or-update-comment@v4
+        with:
+          token: ${{ github.token }}
+          comment-id: ${{ steps.pr-comment-id.outputs.comment-id }}
+          edit-mode: append
+          body: |
+            > Regression tests: ${{ steps.run-regression-tests.outcome == 'success' && '✅ PASSED' || steps.run-regression-tests.outcome == 'failure' && '❌ FAILED' || steps.run-regression-tests.outcome == 'cancelled' && '⚠️ CANCELLED' || steps.run-regression-tests.outcome == 'skipped' && '⏭️ SKIPPED' || '❓ UNKNOWN' }}
+            > Report: ${{ hashFiles('/tmp/regression_tests_artifacts/report.html') != '' && 'artifact `regression-test-report` available in the run' || 'not generated' }}
+
+      - name: Install live-tests dependencies for LLM evaluation
+        if: always() && github.event_name == 'workflow_dispatch'
+        working-directory: airbyte-ci/connectors/live-tests
+        run: poetry install
+
+      - name: Install and Start Ollama
+        if: always() && github.event_name == 'workflow_dispatch'
+        run: |
+          curl -fsSL https://ollama.com/install.sh | sh
+          ollama serve &
+          sleep 5
+          ollama pull llama3.2:3b
+          echo "Ollama server started and model pulled"
+
+      - name: Evaluate Regression Test Report with LLM
+        if: always() && github.event_name == 'workflow_dispatch'
+        id: llm-eval
+        continue-on-error: true
+        working-directory: airbyte-ci/connectors/live-tests
+        env:
+          OPENAI_API_KEY: ollama
+          OPENAI_BASE_URL: http://127.0.0.1:11434/v1
+          EVAL_MODEL: llama3.2:3b
+        run: |
+          set -u
+          echo "ran=false" >> "$GITHUB_OUTPUT"
+          echo "result=error" >> "$GITHUB_OUTPUT"
+
+          # Find the most recent report.html file in /tmp/regression_tests_artifacts/
+          REPORT_PATH=$(find /tmp/regression_tests_artifacts -name "report.html" -type f -printf '%T@ %p\n' | sort -n | tail -1 | cut -f2- -d" ")
+
+          if [ -z "$REPORT_PATH" ]; then
+            echo "Error: No report.html found in /tmp/regression_tests_artifacts/" >&2
+            echo "## ⚠️ LLM Evaluation Skipped" >> "$GITHUB_STEP_SUMMARY"
+            echo "No regression test report found. The tests may have failed to generate a report." >> "$GITHUB_STEP_SUMMARY"
+            exit 1
+          fi
+
+          echo "Found report at: $REPORT_PATH"
+          echo "Running LLM evaluation..."
+
+          # Run the evaluation script. It exits non-zero on a FAIL verdict, so
+          # tolerate the exit code here and read the verdict from the JSON output instead.
+          OUT_JSON="$RUNNER_TEMP/llm_eval.json"
+          poetry run python src/live_tests/regression_tests/llm_evaluation/evaluate_report.py \
+            --report-path "$REPORT_PATH" \
+            --output-json "$OUT_JSON" || true
+
+          # A judgment was produced only if the JSON artifact exists
+          if [ ! -f "$OUT_JSON" ]; then
+            echo "Error: evaluation did not produce $OUT_JSON" >&2
+            echo "ran=true" >> "$GITHUB_OUTPUT"
+            echo "result=error" >> "$GITHUB_OUTPUT"
+            exit 1
+          fi
+
+          PASS=$(jq -r '.evaluation.pass' "$OUT_JSON")
+          if [ "$PASS" = "true" ]; then RES="pass"; else RES="fail"; fi
+          echo "ran=true" >> "$GITHUB_OUTPUT"
+          echo "result=$RES" >> "$GITHUB_OUTPUT"
+
+      - name: Append LLM outcome
+        if: always() && github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
+        env:
+          EVAL_MODEL: llama3.2:3b
+        uses: peter-evans/create-or-update-comment@v4
+        with:
+          token: ${{ github.token }}
+          comment-id: ${{ steps.pr-comment-id.outputs.comment-id }}
+          edit-mode: append
+          body: |
+            > LLM Evaluation: ${{ steps.llm-eval.outputs.ran == 'true' && (steps.llm-eval.outputs.result == 'pass' && '✅ PASS' || steps.llm-eval.outputs.result == 'fail' && '❌ FAIL' || '⚠️ ERROR') || '⚠️ Did not run' }}${{ steps.llm-eval.outputs.ran == 'true' && format(' (model: {0})', env.EVAL_MODEL) || '' }}
diff --git a/airbyte-ci/connectors/live-tests/poetry.lock b/airbyte-ci/connectors/live-tests/poetry.lock
index 89d6d13dce8..254f971fdab 100644
--- a/airbyte-ci/connectors/live-tests/poetry.lock
+++ b/airbyte-ci/connectors/live-tests/poetry.lock
@@ -355,6 +355,29 @@ test = ["coverage (>=5.5)", "equinox", "jax[cpu]", "jaxtyping", "mypy (>=0.800)"
 test-tox = ["equinox", "jax[cpu]", "jaxtyping", "mypy (>=0.800)", "numba", "numpy", "pandera", "pygments", "pyright (>=1.1.370)", "pytest (>=4.0.0)", "sphinx", "typing-extensions (>=3.10.0.0)"]
 test-tox-coverage = ["coverage (>=5.5)"]
 
+[[package]]
+name = "beautifulsoup4"
+version = "4.14.2"
+description = "Screen-scraping library"
+optional = false
+python-versions = ">=3.7.0"
+groups = ["main"]
+files = [
+    {file = "beautifulsoup4-4.14.2-py3-none-any.whl", hash = "sha256:5ef6fa3a8cbece8488d66985560f97ed091e22bbc4e9c2338508a9d5de6d4515"},
+    {file = "beautifulsoup4-4.14.2.tar.gz", hash = 
"sha256:2a98ab9f944a11acee9cc848508ec28d9228abfd522ef0fad6a02a72e0ded69e"}, +] + +[package.dependencies] +soupsieve = ">1.2" +typing-extensions = ">=4.0.0" + +[package.extras] +cchardet = ["cchardet"] +chardet = ["chardet"] +charset-normalizer = ["charset-normalizer"] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "blessed" version = "1.20.0" @@ -979,6 +1002,18 @@ wrapt = ">=1.10,<2" [package.extras] dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] +[[package]] +name = "distro" +version = "1.9.0" +description = "Distro - an OS platform information API" +optional = false +python-versions = ">=3.6" +groups = ["main"] +files = [ + {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"}, + {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, +] + [[package]] name = "docker" version = "6.1.3" @@ -1943,6 +1978,118 @@ files = [ [package.dependencies] ansicon = {version = "*", markers = "platform_system == \"Windows\""} +[[package]] +name = "jiter" +version = "0.11.1" +description = "Fast iterable JSON parser." +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "jiter-0.11.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:ed58841a491bbbf3f7c55a6b68fff568439ab73b2cce27ace0e169057b5851df"}, + {file = "jiter-0.11.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:499beb9b2d7e51d61095a8de39ebcab1d1778f2a74085f8305a969f6cee9f3e4"}, + {file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b87b2821795e28cc990939b68ce7a038edea680a24910bd68a79d54ff3f03c02"}, + {file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:83f6fa494d8bba14ab100417c80e70d32d737e805cb85be2052d771c76fcd1f8"}, + {file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5fbc6aea1daa2ec6f5ed465f0c5e7b0607175062ceebbea5ca70dd5ddab58083"}, + {file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:302288e2edc43174bb2db838e94688d724f9aad26c5fb9a74f7a5fb427452a6a"}, + {file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85db563fe3b367bb568af5d29dea4d4066d923b8e01f3417d25ebecd958de815"}, + {file = "jiter-0.11.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f1c1ba2b6b22f775444ef53bc2d5778396d3520abc7b2e1da8eb0c27cb3ffb10"}, + {file = "jiter-0.11.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:523be464b14f8fd0cc78da6964b87b5515a056427a2579f9085ce30197a1b54a"}, + {file = "jiter-0.11.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:25b99b3f04cd2a38fefb22e822e35eb203a2cd37d680dbbc0c0ba966918af336"}, + {file = "jiter-0.11.1-cp310-cp310-win32.whl", hash = "sha256:47a79e90545a596bb9104109777894033347b11180d4751a216afef14072dbe7"}, + {file = "jiter-0.11.1-cp310-cp310-win_amd64.whl", hash = "sha256:cace75621ae9bd66878bf69fbd4dfc1a28ef8661e0c2d0eb72d3d6f1268eddf5"}, + {file = "jiter-0.11.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:9b0088ff3c374ce8ce0168523ec8e97122ebb788f950cf7bb8e39c7dc6a876a2"}, + {file = "jiter-0.11.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:74433962dd3c3090655e02e461267095d6c84f0741c7827de11022ef8d7ff661"}, + {file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:6d98030e345e6546df2cc2c08309c502466c66c4747b043f1a0d415fada862b8"}, + {file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1d6db0b2e788db46bec2cf729a88b6dd36959af2abd9fa2312dfba5acdd96dcb"}, + {file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:55678fbbda261eafe7289165dd2ddd0e922df5f9a1ae46d7c79a5a15242bd7d1"}, + {file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a6b74fae8e40497653b52ce6ca0f1b13457af769af6fb9c1113efc8b5b4d9be"}, + {file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a55a453f8b035eb4f7852a79a065d616b7971a17f5e37a9296b4b38d3b619e4"}, + {file = "jiter-0.11.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2638148099022e6bdb3f42904289cd2e403609356fb06eb36ddec2d50958bc29"}, + {file = "jiter-0.11.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:252490567a5d990986f83b95a5f1ca1bf205ebd27b3e9e93bb7c2592380e29b9"}, + {file = "jiter-0.11.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d431d52b0ca2436eea6195f0f48528202100c7deda354cb7aac0a302167594d5"}, + {file = "jiter-0.11.1-cp311-cp311-win32.whl", hash = "sha256:db6f41e40f8bae20c86cb574b48c4fd9f28ee1c71cb044e9ec12e78ab757ba3a"}, + {file = "jiter-0.11.1-cp311-cp311-win_amd64.whl", hash = "sha256:0cc407b8e6cdff01b06bb80f61225c8b090c3df108ebade5e0c3c10993735b19"}, + {file = "jiter-0.11.1-cp311-cp311-win_arm64.whl", hash = "sha256:fe04ea475392a91896d1936367854d346724a1045a247e5d1c196410473b8869"}, + {file = "jiter-0.11.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:c92148eec91052538ce6823dfca9525f5cfc8b622d7f07e9891a280f61b8c96c"}, + {file = "jiter-0.11.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ecd4da91b5415f183a6be8f7158d127bdd9e6a3174138293c0d48d6ea2f2009d"}, + {file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7e3ac25c00b9275684d47aa42febaa90a9958e19fd1726c4ecf755fbe5e553b"}, + {file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:57d7305c0a841858f866cd459cd9303f73883fb5e097257f3d4a3920722c69d4"}, + {file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e86fa10e117dce22c547f31dd6d2a9a222707d54853d8de4e9a2279d2c97f239"}, + {file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ae5ef1d48aec7e01ee8420155d901bb1d192998fa811a65ebb82c043ee186711"}, + {file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb68e7bf65c990531ad8715e57d50195daf7c8e6f1509e617b4e692af1108939"}, + {file = "jiter-0.11.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43b30c8154ded5845fa454ef954ee67bfccce629b2dea7d01f795b42bc2bda54"}, + {file = "jiter-0.11.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:586cafbd9dd1f3ce6a22b4a085eaa6be578e47ba9b18e198d4333e598a91db2d"}, + {file = "jiter-0.11.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:677cc2517d437a83bb30019fd4cf7cad74b465914c56ecac3440d597ac135250"}, + {file = "jiter-0.11.1-cp312-cp312-win32.whl", hash = "sha256:fa992af648fcee2b850a3286a35f62bbbaeddbb6dbda19a00d8fbc846a947b6e"}, + {file = "jiter-0.11.1-cp312-cp312-win_amd64.whl", hash = "sha256:88b5cae9fa51efeb3d4bd4e52bfd4c85ccc9cac44282e2a9640893a042ba4d87"}, + {file = "jiter-0.11.1-cp312-cp312-win_arm64.whl", hash = 
"sha256:9a6cae1ab335551917f882f2c3c1efe7617b71b4c02381e4382a8fc80a02588c"}, + {file = "jiter-0.11.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:71b6a920a5550f057d49d0e8bcc60945a8da998019e83f01adf110e226267663"}, + {file = "jiter-0.11.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0b3de72e925388453a5171be83379549300db01284f04d2a6f244d1d8de36f94"}, + {file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc19dd65a2bd3d9c044c5b4ebf657ca1e6003a97c0fc10f555aa4f7fb9821c00"}, + {file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d58faaa936743cd1464540562f60b7ce4fd927e695e8bc31b3da5b914baa9abd"}, + {file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:902640c3103625317291cb73773413b4d71847cdf9383ba65528745ff89f1d14"}, + {file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:30405f726e4c2ed487b176c09f8b877a957f535d60c1bf194abb8dadedb5836f"}, + {file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3217f61728b0baadd2551844870f65219ac4a1285d5e1a4abddff3d51fdabe96"}, + {file = "jiter-0.11.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b1364cc90c03a8196f35f396f84029f12abe925415049204446db86598c8b72c"}, + {file = "jiter-0.11.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:53a54bf8e873820ab186b2dca9f6c3303f00d65ae5e7b7d6bda1b95aa472d646"}, + {file = "jiter-0.11.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7e29aca023627b0e0c2392d4248f6414d566ff3974fa08ff2ac8dbb96dfee92a"}, + {file = "jiter-0.11.1-cp313-cp313-win32.whl", hash = "sha256:f153e31d8bca11363751e875c0a70b3d25160ecbaee7b51e457f14498fb39d8b"}, + {file = "jiter-0.11.1-cp313-cp313-win_amd64.whl", hash = "sha256:f773f84080b667c69c4ea0403fc67bb08b07e2b7ce1ef335dea5868451e60fed"}, + {file = "jiter-0.11.1-cp313-cp313-win_arm64.whl", hash = "sha256:635ecd45c04e4c340d2187bcb1cea204c7cc9d32c1364d251564bf42e0e39c2d"}, + {file = "jiter-0.11.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d892b184da4d94d94ddb4031296931c74ec8b325513a541ebfd6dfb9ae89904b"}, + {file = "jiter-0.11.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa22c223a3041dacb2fcd37c70dfd648b44662b4a48e242592f95bda5ab09d58"}, + {file = "jiter-0.11.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:330e8e6a11ad4980cd66a0f4a3e0e2e0f646c911ce047014f984841924729789"}, + {file = "jiter-0.11.1-cp313-cp313t-win_amd64.whl", hash = "sha256:09e2e386ebf298547ca3a3704b729471f7ec666c2906c5c26c1a915ea24741ec"}, + {file = "jiter-0.11.1-cp313-cp313t-win_arm64.whl", hash = "sha256:fe4a431c291157e11cee7c34627990ea75e8d153894365a3bc84b7a959d23ca8"}, + {file = "jiter-0.11.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:0fa1f70da7a8a9713ff8e5f75ec3f90c0c870be6d526aa95e7c906f6a1c8c676"}, + {file = "jiter-0.11.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:569ee559e5046a42feb6828c55307cf20fe43308e3ae0d8e9e4f8d8634d99944"}, + {file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f69955fa1d92e81987f092b233f0be49d4c937da107b7f7dcf56306f1d3fcce9"}, + {file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:090f4c9d4a825e0fcbd0a2647c9a88a0f366b75654d982d95a9590745ff0c48d"}, + {file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:bbf3d8cedf9e9d825233e0dcac28ff15c47b7c5512fdfe2e25fd5bbb6e6b0cee"}, + {file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2aa9b1958f9c30d3d1a558b75f0626733c60eb9b7774a86b34d88060be1e67fe"}, + {file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e42d1ca16590b768c5e7d723055acd2633908baacb3628dd430842e2e035aa90"}, + {file = "jiter-0.11.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5db4c2486a023820b701a17aec9c5a6173c5ba4393f26662f032f2de9c848b0f"}, + {file = "jiter-0.11.1-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:4573b78777ccfac954859a6eff45cbd9d281d80c8af049d0f1a3d9fc323d5c3a"}, + {file = "jiter-0.11.1-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:7593ac6f40831d7961cb67633c39b9fef6689a211d7919e958f45710504f52d3"}, + {file = "jiter-0.11.1-cp314-cp314-win32.whl", hash = "sha256:87202ec6ff9626ff5f9351507def98fcf0df60e9a146308e8ab221432228f4ea"}, + {file = "jiter-0.11.1-cp314-cp314-win_amd64.whl", hash = "sha256:a5dd268f6531a182c89d0dd9a3f8848e86e92dfff4201b77a18e6b98aa59798c"}, + {file = "jiter-0.11.1-cp314-cp314-win_arm64.whl", hash = "sha256:5d761f863f912a44748a21b5c4979c04252588ded8d1d2760976d2e42cd8d991"}, + {file = "jiter-0.11.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2cc5a3965285ddc33e0cab933e96b640bc9ba5940cea27ebbbf6695e72d6511c"}, + {file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b572b3636a784c2768b2342f36a23078c8d3aa6d8a30745398b1bab58a6f1a8"}, + {file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ad93e3d67a981f96596d65d2298fe8d1aa649deb5374a2fb6a434410ee11915e"}, + {file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a83097ce379e202dcc3fe3fc71a16d523d1ee9192c8e4e854158f96b3efe3f2f"}, + {file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7042c51e7fbeca65631eb0c332f90c0c082eab04334e7ccc28a8588e8e2804d9"}, + {file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a68d679c0e47649a61df591660507608adc2652442de7ec8276538ac46abe08"}, + {file = "jiter-0.11.1-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a1b0da75dbf4b6ec0b3c9e604d1ee8beaf15bc046fff7180f7d89e3cdbd3bb51"}, + {file = "jiter-0.11.1-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:69dd514bf0fa31c62147d6002e5ca2b3e7ef5894f5ac6f0a19752385f4e89437"}, + {file = "jiter-0.11.1-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:bb31ac0b339efa24c0ca606febd8b77ef11c58d09af1b5f2be4c99e907b11111"}, + {file = "jiter-0.11.1-cp314-cp314t-win32.whl", hash = "sha256:b2ce0d6156a1d3ad41da3eec63b17e03e296b78b0e0da660876fccfada86d2f7"}, + {file = "jiter-0.11.1-cp314-cp314t-win_amd64.whl", hash = "sha256:f4db07d127b54c4a2d43b4cf05ff0193e4f73e0dd90c74037e16df0b29f666e1"}, + {file = "jiter-0.11.1-cp314-cp314t-win_arm64.whl", hash = "sha256:28e4fdf2d7ebfc935523e50d1efa3970043cfaa161674fe66f9642409d001dfe"}, + {file = "jiter-0.11.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:baa99c8db49467527658bb479857344daf0a14dff909b7f6714579ac439d1253"}, + {file = "jiter-0.11.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:860fe55fa3b01ad0edf2adde1098247ff5c303d0121f9ce028c03d4f88c69502"}, + {file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:173dd349d99b6feaf5a25a6fbcaf3489a6f947708d808240587a23df711c67db"}, + {file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:14ac1dca837514cc946a6ac2c4995d9695303ecc754af70a3163d057d1a444ab"}, + {file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69af47de5f93a231d5b85f7372d3284a5be8edb4cc758f006ec5a1406965ac5e"}, + {file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:685f8b3abd3bbd3e06e4dfe2429ff87fd5d7a782701151af99b1fcbd80e31b2b"}, + {file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d04afa2d4e5526e54ae8a58feea953b1844bf6e3526bc589f9de68e86d0ea01"}, + {file = "jiter-0.11.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1e92b927259035b50d8e11a8fdfe0ebd014d883e4552d37881643fa289a4bcf1"}, + {file = "jiter-0.11.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:e7bd8be4fad8d4c5558b7801770cd2da6c072919c6f247cc5336edb143f25304"}, + {file = "jiter-0.11.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:121381a77a3c85987f3eba0d30ceaca9116f7463bedeec2fa79b2e7286b89b60"}, + {file = "jiter-0.11.1-cp39-cp39-win32.whl", hash = "sha256:160225407f6dfabdf9be1b44e22f06bc293a78a28ffa4347054698bd712dad06"}, + {file = "jiter-0.11.1-cp39-cp39-win_amd64.whl", hash = "sha256:028e0d59bcdfa1079f8df886cdaefc6f515c27a5288dec956999260c7e4a7cfd"}, + {file = "jiter-0.11.1-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:e642b5270e61dd02265866398707f90e365b5db2eb65a4f30c789d826682e1f6"}, + {file = "jiter-0.11.1-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:464ba6d000585e4e2fd1e891f31f1231f497273414f5019e27c00a4b8f7a24ad"}, + {file = "jiter-0.11.1-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:055568693ab35e0bf3a171b03bb40b2dcb10352359e0ab9b5ed0da2bf1eb6f6f"}, + {file = "jiter-0.11.1-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0c69ea798d08a915ba4478113efa9e694971e410056392f4526d796f136d3fa"}, + {file = "jiter-0.11.1-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:0d4d6993edc83cf75e8c6828a8d6ce40a09ee87e38c7bfba6924f39e1337e21d"}, + {file = "jiter-0.11.1-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:f78d151c83a87a6cf5461d5ee55bc730dd9ae227377ac6f115b922989b95f838"}, + {file = "jiter-0.11.1-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9022974781155cd5521d5cb10997a03ee5e31e8454c9d999dcdccd253f2353f"}, + {file = "jiter-0.11.1-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18c77aaa9117510d5bdc6a946baf21b1f0cfa58ef04d31c8d016f206f2118960"}, + {file = "jiter-0.11.1.tar.gz", hash = "sha256:849dcfc76481c0ea0099391235b7ca97d7279e0fa4c86005457ac7c88e8b76dc"}, +] + [[package]] name = "jsonschema" version = "3.2.0" @@ -2492,6 +2639,34 @@ rsa = ["cryptography (>=3.0.0)"] signals = ["blinker (>=1.4.0)"] signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] +[[package]] +name = "openai" +version = "1.109.1" +description = "The official Python library for the openai API" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "openai-1.109.1-py3-none-any.whl", hash = "sha256:6bcaf57086cf59159b8e27447e4e7dd019db5d29a438072fbd49c290c7e65315"}, + {file = "openai-1.109.1.tar.gz", hash = 
"sha256:d173ed8dbca665892a6db099b4a2dfac624f94d20a93f46eb0b56aae940ed869"}, +] + +[package.dependencies] +anyio = ">=3.5.0,<5" +distro = ">=1.7.0,<2" +httpx = ">=0.23.0,<1" +jiter = ">=0.4.0,<1" +pydantic = ">=1.9.0,<3" +sniffio = "*" +tqdm = ">4" +typing-extensions = ">=4.11,<5" + +[package.extras] +aiohttp = ["aiohttp", "httpx-aiohttp (>=0.1.8)"] +datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] +realtime = ["websockets (>=13,<16)"] +voice-helpers = ["numpy (>=2.0.2)", "sounddevice (>=0.5.1)"] + [[package]] name = "opentelemetry-api" version = "1.27.0" @@ -3864,6 +4039,18 @@ files = [ {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, ] +[[package]] +name = "soupsieve" +version = "2.8" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c"}, + {file = "soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f"}, +] + [[package]] name = "sqlalchemy" version = "2.0.35" @@ -4477,4 +4664,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11,<3.12" -content-hash = "979b7f159a1a284b77b9bec889298c50e3c269149524e647978b517463142e1f" +content-hash = "d6f93822b88fad4aaf0ae5d60932e0e93d9e95aa79fdddc44f9f6045387d3b4c" diff --git a/airbyte-ci/connectors/live-tests/pyproject.toml b/airbyte-ci/connectors/live-tests/pyproject.toml index 63676a84df2..6f1a882b059 100644 --- a/airbyte-ci/connectors/live-tests/pyproject.toml +++ b/airbyte-ci/connectors/live-tests/pyproject.toml @@ -41,6 +41,8 @@ dpath = "^2.1.6" genson = "^1.3.0" segment-analytics-python = "^2.3.2" python-slugify = ">=8.0.4" +beautifulsoup4 = "^4.12.0" +openai = "^1.0.0" [tool.poetry.group.dev.dependencies] diff --git a/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/README.md b/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/README.md new file mode 100644 index 00000000000..93f84175f1a --- /dev/null +++ b/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/README.md @@ -0,0 +1,37 @@ +# LLM-Based Regression Test Evaluation + +Automated evaluation of connector regression test reports using LLM models. + +## How It Works + +After regression tests complete, this evaluates the HTML report and writes a pass/fail judgment to `GITHUB_STEP_SUMMARY`. + +## Configuration + +**Environment Variables:** +- `OPENAI_API_KEY` - API key (use `ollama` for Ollama) +- `OPENAI_BASE_URL` - Base URL for OpenAI-compatible API (e.g., `http://127.0.0.1:11434/v1` for Ollama) +- `EVAL_MODEL` - Model name (defaults to `gpt-4o`) + +**Evaluation Prompt:** +Stored in `.github/prompts/regression-evaluation.prompt.yaml` following GitHub's prompt format. Uses `{{report_text}}` placeholder for dynamic content injection. 
+ +## Local Testing + +```bash +# Install Ollama +curl -fsSL https://ollama.com/install.sh | sh +ollama serve & +ollama pull llama3.2:3b + +# Set environment +export OPENAI_API_KEY=ollama +export OPENAI_BASE_URL=http://127.0.0.1:11434/v1 +export EVAL_MODEL=llama3.2:3b + +# Run evaluation +cd airbyte-ci/connectors/live-tests +poetry install +poetry run python src/live_tests/regression_tests/llm_evaluation/evaluate_report.py \ + --report-path /path/to/report.html +``` diff --git a/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/__init__.py b/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/__init__.py new file mode 100644 index 00000000000..7f66676b871 --- /dev/null +++ b/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. diff --git a/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/evaluate_report.py b/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/evaluate_report.py new file mode 100644 index 00000000000..ac9f33c8016 --- /dev/null +++ b/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/evaluate_report.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +""" +LLM-based evaluation of regression test reports. + +This script reads a regression test report (HTML format) and uses OpenAI's LLM +to evaluate the results, make a pass/fail judgment, and generate a summary. +The summary is written to GITHUB_STEP_SUMMARY for display in GitHub Actions. +""" + +import argparse +import json +import os +import sys +from pathlib import Path +from typing import Any + +import yaml +from bs4 import BeautifulSoup +from openai import OpenAI + +MAX_REPORT_CHARS = 200000 + +# Default evaluation prompt +EVAL_PROMPT = """You are an expert at evaluating connector regression test results. +Your task is to analyze the test report and determine if the regression tests should PASS or FAIL. + +Consider the following criteria: +1. All test cases should pass (no failed tests) +2. Record count differences between control and target versions should be minimal or explainable +3. Message count differences should not indicate data loss or corruption +4. Stream coverage should be reasonable +5. Any warnings or errors in test outputs should be evaluated for severity + +Provide your evaluation in the following JSON format: +{ + "pass": true/false, + "summary": "A concise 2-3 sentence summary of the evaluation", + "reasoning": "Detailed reasoning for your pass/fail decision, including specific issues found", + "severity": "critical/major/minor/none", + "recommendations": "Any recommendations for addressing issues" +} + +Be strict but fair in your evaluation. Minor differences are acceptable, but data loss, +corruption, or test failures should result in a FAIL.""" + + +def load_prompt_from_yaml(yaml_path: Path | None = None) -> tuple[list[dict[str, str]] | None, str | None, dict[str, Any] | None]: + """ + Load prompt from GitHub-format YAML file. + + Args: + yaml_path: Path to the .prompt.yaml file. If None, uses default location. 
+
+    Returns:
+        Tuple of (messages, model, modelParameters) or (None, None, None) if file not found or invalid
+    """
+    if yaml_path is None:
+        # Default location: .github/prompts/regression-evaluation.prompt.yaml
+        github_workspace = os.environ.get("GITHUB_WORKSPACE")
+        if github_workspace:
+            yaml_path = Path(github_workspace) / ".github" / "prompts" / "regression-evaluation.prompt.yaml"
+        else:
+            script_dir = Path(__file__).parent
+            # Walk up from llm_evaluation/ to the repository root (7 levels:
+            # llm_evaluation -> regression_tests -> live_tests -> src -> live-tests -> connectors -> airbyte-ci -> repo root)
+            repo_root = script_dir.parents[6]
+            yaml_path = repo_root / ".github" / "prompts" / "regression-evaluation.prompt.yaml"
+
+    if not yaml_path.exists():
+        print(f"Prompt file not found at {yaml_path}, using default hardcoded prompt")
+        return None, None, None
+
+    try:
+        with open(yaml_path, "r", encoding="utf-8") as f:
+            prompt_data = yaml.safe_load(f)
+
+        messages = prompt_data.get("messages", [])
+        model = prompt_data.get("model")
+        model_params = prompt_data.get("modelParameters", {})
+
+        if not messages:
+            print(f"Warning: No messages found in {yaml_path}, using default hardcoded prompt")
+            return None, None, None
+
+        print(f"Loaded prompt from {yaml_path}")
+        return messages, model, model_params
+
+    except Exception as e:
+        print(f"Warning: Failed to load prompt from {yaml_path}: {e}")
+        print("Using default hardcoded prompt")
+        return None, None, None
+
+
+def load_report_text(html_path: Path) -> str:
+    """
+    Load and convert HTML report to clean text.
+
+    Args:
+        html_path: Path to the report.html file
+
+    Returns:
+        Clean text representation of the report
+    """
+    with open(html_path, "r", encoding="utf-8") as f:
+        html_content = f.read()
+
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    for element in soup(["script", "style"]):
+        element.decompose()
+
+    report_text = soup.get_text("\n", strip=True)
+
+    report_text = "\n".join(line.strip() for line in report_text.splitlines() if line.strip())
+
+    if len(report_text) > MAX_REPORT_CHARS:
+        original_length = len(report_text)
+        report_text = report_text[:MAX_REPORT_CHARS]
+        truncation_note = f"\n\n[Report truncated from {original_length} to {MAX_REPORT_CHARS} characters for evaluation]"
+        report_text += truncation_note
+        print(f"Warning: Report truncated from {original_length} to {MAX_REPORT_CHARS} characters")
+
+    return report_text
+
+
+def evaluate_with_llm(report_text: str, prompt: str | None = None, prompt_yaml_path: Path | None = None) -> dict[str, Any]:
+    """
+    Use an LLM to evaluate the regression test report.
+
+    Supports both OpenAI API and Ollama (OpenAI-compatible).
+    Configure via environment variables:
+    - OPENAI_API_KEY: API key (use 'ollama' for Ollama)
+    - OPENAI_BASE_URL: Optional base URL (e.g., http://127.0.0.1:11434/v1 for Ollama)
+    - EVAL_MODEL: Model name (defaults to gpt-4o, use llama3.2:3b for Ollama)
+    - EVAL_PROMPT_PATH: Optional path to custom .prompt.yaml file
+
+    Args:
+        report_text: Full text of the report
+        prompt: Optional custom evaluation prompt string (legacy, overrides YAML)
+        prompt_yaml_path: Optional path to .prompt.yaml file
+
+    Returns:
+        Dictionary containing evaluation results with 'pass', 'summary', 'reasoning', 'severity', and 'recommendations' keys
+
+    Raises:
+        Exception: If LLM evaluation fails after retry
+    """
+    api_key = os.environ.get("OPENAI_API_KEY")
+    base_url = os.environ.get("OPENAI_BASE_URL")
+    model = os.environ.get("EVAL_MODEL", "gpt-4o")
+
+    if base_url:
+        client = OpenAI(api_key=api_key, base_url=base_url)
+        print(f"Using custom base URL: {base_url}")
+    else:
+        client = OpenAI(api_key=api_key)
+
+    yaml_messages, yaml_model, yaml_params = load_prompt_from_yaml(prompt_yaml_path)
+
+    if yaml_model and not os.environ.get("EVAL_MODEL"):
+        model = yaml_model
+
+    temperature = 0.3
+    if yaml_params and "temperature" in yaml_params:
+        temperature = yaml_params["temperature"]
+
+    print(f"Using model: {model}")
+
+    if prompt is not None:
+        messages = [
+            {"role": "system", "content": prompt},
+            {"role": "user", "content": f"Report:\n\n{report_text}"},
+        ]
+    elif yaml_messages:
+        messages = []
+        for msg in yaml_messages:
+            content = msg.get("content", "")
+            content = content.replace("{{report_text}}", report_text)
+            messages.append({"role": msg["role"], "content": content})
+    else:
+        # Fallback to hardcoded EVAL_PROMPT
+        messages = [
+            {"role": "system", "content": EVAL_PROMPT},
+            {"role": "user", "content": f"Report:\n\n{report_text}"},
+        ]
+
+    try:
+        response = client.chat.completions.create(
+            model=model,
+            messages=messages,
+            temperature=temperature,
+            response_format={"type": "json_object"},
+        )
+
+        evaluation = json.loads(response.choices[0].message.content)
+        return evaluation
+    except Exception as e:
+        error_msg = str(e).lower()
+        if "response_format" in error_msg or "json_object" in error_msg:
+            print(f"Warning: JSON response format not supported, retrying without it: {e}")
+            response = client.chat.completions.create(
+                model=model,
+                messages=messages,
+                temperature=temperature,
+            )
+            content = response.choices[0].message.content
+            evaluation = json.loads(content)
+            return evaluation
+        raise
+
+
+def write_github_summary(evaluation: dict[str, Any], model: str | None = None) -> None:
+    """
+    Write the evaluation summary to GITHUB_STEP_SUMMARY.
+
+    Args:
+        evaluation: LLM evaluation results
+        model: Model name used for evaluation (optional)
+    """
+    summary_file = os.environ.get("GITHUB_STEP_SUMMARY")
+    if not summary_file:
+        print("Warning: GITHUB_STEP_SUMMARY environment variable not set. Writing to stdout instead.")
+        summary_file = "/dev/stdout"
+
+    status_emoji = "✅" if evaluation["pass"] else "❌"
+
+    model_info = f"model: {model}" if model else "OpenAI-compatible API"
+
+    markdown = f"""# {status_emoji} Regression Test Evaluation: {"PASS" if evaluation['pass'] else "FAIL"}
+
+{evaluation['summary']}
+
+**Reasoning:**
+
+{evaluation['reasoning']}
+
+**Recommendations:**
+
+{evaluation.get('recommendations', 'No specific recommendations.')}
+
+---
+*This evaluation was generated using {model_info}*
+"""
+
+    with open(summary_file, "a", encoding="utf-8") as f:
+        f.write(markdown)
+
+    print(f"Summary written to {summary_file}")
+
+
+def main():
+    """Main entry point for the LLM evaluation script."""
+    parser = argparse.ArgumentParser(description="Evaluate regression test reports using an OpenAI-compatible LLM")
+    parser.add_argument("--report-path", type=Path, required=True, help="Path to the report.html file")
+    parser.add_argument("--prompt-file", type=Path, help="Optional path to a custom evaluation prompt file")
+    parser.add_argument("--output-json", type=Path, help="Optional path to write evaluation results as JSON")
+
+    args = parser.parse_args()
+
+    if not os.environ.get("OPENAI_API_KEY"):
+        print("Error: OPENAI_API_KEY environment variable not set", file=sys.stderr)
+        sys.exit(1)
+
+    if not args.report_path.exists():
+        print(f"Error: Report file not found: {args.report_path}", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Loading report from: {args.report_path}")
+    report_text = load_report_text(args.report_path)
+    print(f"Report loaded: {len(report_text)} characters")
+
+    custom_prompt = None
+    if args.prompt_file and args.prompt_file.exists():
+        with open(args.prompt_file, "r", encoding="utf-8") as f:
+            custom_prompt = f.read()
+        print(f"Using custom prompt from: {args.prompt_file}")
+
+    prompt_yaml_path = None
+    eval_prompt_path = os.environ.get("EVAL_PROMPT_PATH")
+    if eval_prompt_path:
+        prompt_yaml_path = Path(eval_prompt_path)
+
+    print("Evaluating report with LLM...")
+    evaluation = evaluate_with_llm(report_text, custom_prompt, prompt_yaml_path)
+
+    print(f"\nEvaluation Result: {'PASS' if evaluation['pass'] else 'FAIL'}")
+    print(f"Summary: {evaluation['summary']}")
+
+    model = os.environ.get("EVAL_MODEL", "gpt-4o")
+    write_github_summary(evaluation, model)
+
+    if args.output_json:
+        output_data = {"evaluation": evaluation}
+        with open(args.output_json, "w", encoding="utf-8") as f:
+            json.dump(output_data, f, indent=2)
+        print(f"Evaluation results written to: {args.output_json}")
+
+    sys.exit(0 if evaluation["pass"] else 1)
+
+
+if __name__ == "__main__":
+    main()
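+
+# Illustrative shape of the --output-json artifact, as consumed by the workflow's
+# `jq -r '.evaluation.pass'` step (field names come from the evaluation prompt):
+#   {
+#     "evaluation": {
+#       "pass": true,
+#       "summary": "...",
+#       "reasoning": "...",
+#       "severity": "none",
+#       "recommendations": "..."
+#     }
+#   }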