From cf1379d105501c97ee3bffbb257fc7d2964d5f97 Mon Sep 17 00:00:00 2001 From: "Aaron (\"AJ\") Steers" Date: Thu, 20 Nov 2025 17:27:25 -0800 Subject: [PATCH] feat: add OpenAI LLM evaluation step for connector regression tests (#68673) Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .../prompts/regression-evaluation.prompt.yaml | 34 ++ .../run-regression-tests-command.yml | 102 ++++++ airbyte-ci/connectors/live-tests/poetry.lock | 189 ++++++++++- .../connectors/live-tests/pyproject.toml | 2 + .../regression_tests/llm_evaluation/README.md | 37 +++ .../llm_evaluation/__init__.py | 1 + .../llm_evaluation/evaluate_report.py | 299 ++++++++++++++++++ 7 files changed, 663 insertions(+), 1 deletion(-) create mode 100644 .github/prompts/regression-evaluation.prompt.yaml create mode 100644 airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/README.md create mode 100644 airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/__init__.py create mode 100644 airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/evaluate_report.py diff --git a/.github/prompts/regression-evaluation.prompt.yaml b/.github/prompts/regression-evaluation.prompt.yaml new file mode 100644 index 00000000000..411d2e9778b --- /dev/null +++ b/.github/prompts/regression-evaluation.prompt.yaml @@ -0,0 +1,34 @@ +name: Regression Report Evaluation +description: Evaluate Airbyte connector regression test reports and return a JSON verdict with reasoning +model: llama3.2:3b +modelParameters: + temperature: 0.3 +messages: + - role: system + content: | + You are an expert at evaluating connector regression test results. + Your task is to analyze the test report and determine if the regression tests should PASS or FAIL. + + Consider the following criteria: + 1. All test cases should pass (no failed tests) + 2. Record count differences between control and target versions should be minimal or explainable + 3. Message count differences should not indicate data loss or corruption + 4. Stream coverage should be reasonable + 5. Any warnings or errors in test outputs should be evaluated for severity + + Provide your evaluation in the following JSON format: + { + "pass": true/false, + "summary": "A concise 2-3 sentence summary of the evaluation", + "reasoning": "Detailed reasoning for your pass/fail decision, including specific issues found", + "severity": "critical/major/minor/none", + "recommendations": "Any recommendations for addressing issues" + } + + Be strict but fair in your evaluation. Minor differences are acceptable, but data loss, + corruption, or test failures should result in a FAIL. 
+  - role: user
+    content: |
+      Report:
+
+      {{report_text}}
diff --git a/.github/workflows/run-regression-tests-command.yml b/.github/workflows/run-regression-tests-command.yml
index c3d17ece80b..9d5c2a85e05 100644
--- a/.github/workflows/run-regression-tests-command.yml
+++ b/.github/workflows/run-regression-tests-command.yml
@@ -84,7 +84,24 @@ jobs:
     name: Regression Tests
     runs-on: linux-24.04-large # Custom runner, defined in GitHub org settings
     timeout-minutes: 360 # 6 hours
+    permissions:
+      contents: read
+      pull-requests: write
+      issues: write
     steps:
+      - name: Append start message with run link
+        id: pr-comment-id
+        if: github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
+        uses: peter-evans/create-or-update-comment@v4
+        with:
+          token: ${{ github.token }}
+          issue-number: ${{ github.event.inputs.pr }}
+          comment-id: ${{ github.event.inputs.comment-id }}
+          edit-mode: append
+          body: |
+            > Starting regression tests (filter: `${{ github.event.inputs.connector_filter || '--modified' }}`)
+            > Workflow run: [${{ github.run_id }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
+
       - name: Install Python
         id: install_python
         uses: actions/setup-python@7f4fc3e22c37d6ff65e88745f38bd3157c663f7c # v4.9.1
@@ -183,6 +200,7 @@ jobs:
         # forks if the user installs the app into their fork. Until we document this as a clear
         # path, we will have to keep using the PAT.
       - name: Run Regression Tests [WORKFLOW DISPATCH]
+        id: run-regression-tests
         if: github.event_name == 'workflow_dispatch' # TODO: consider using the matrix strategy (https://docs.github.com/en/actions/using-jobs/using-a-matrix-for-your-jobs). See https://github.com/airbytehq/airbyte/pull/37659#discussion_r1583380234 for details.
         uses: ./.github/actions/run-airbyte-ci
         with:
@@ -199,3 +217,87 @@ jobs:
           s3_build_cache_access_key_id: ${{ secrets.SELF_RUNNER_AWS_ACCESS_KEY_ID }}
           s3_build_cache_secret_key: ${{ secrets.SELF_RUNNER_AWS_SECRET_ACCESS_KEY }}
           subcommand: connectors ${{ env.USE_LOCAL_CDK_FLAG }} ${{ inputs.connector_filter }} test --only-step connector_live_tests --connector_live_tests.test-suite=regression --connector_live_tests.connection-id=${{ github.event.inputs.connection_id }} --connector_live_tests.pr-url="https://github.com/airbytehq/airbyte/pull/${{ github.event.inputs.pr }}" ${{ env.READ_WITH_STATE_FLAG }} ${{ env.DISABLE_PROXY_FLAG }} ${{ env.STREAM_PARAMS }} ${{ env.CONNECTION_SUBSET }} ${{ env.CONTROL_VERSION }} --global-status-check-context="Regression Tests" --global-status-check-description='Running regression tests'
+
+      - name: Upload regression test report
+        if: always() && github.event_name == 'workflow_dispatch'
+        uses: actions/upload-artifact@v4
+        with:
+          name: regression-test-report
+          path: /tmp/regression_tests_artifacts/report.html
+          if-no-files-found: ignore
+
+      - name: Append regression outcome
+        if: always() && github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
+        uses: peter-evans/create-or-update-comment@v4
+        with:
+          token: ${{ github.token }}
+          comment-id: ${{ steps.pr-comment-id.outputs.comment-id }}
+          edit-mode: append
+          body: |
+            > Regression tests: ${{ steps.run-regression-tests.outcome == 'success' && '✅ PASSED' || steps.run-regression-tests.outcome == 'failure' && '❌ FAILED' || steps.run-regression-tests.outcome == 'cancelled' && '⚠️ CANCELLED' || steps.run-regression-tests.outcome == 'skipped' && '⏭️ SKIPPED' || '❓ UNKNOWN' }}
+            > Report: ${{ hashFiles('/tmp/regression_tests_artifacts/report.html') != '' && 'artifact `regression-test-report` available in the run' || 'not generated' }}
+
+      - name: Install live-tests dependencies for LLM evaluation
+        if: always() && github.event_name == 'workflow_dispatch'
+        working-directory: airbyte-ci/connectors/live-tests
+        run: poetry install
+
+      - name: Install and Start Ollama
+        if: always() && github.event_name == 'workflow_dispatch'
+        run: |
+          curl -fsSL https://ollama.com/install.sh | sh
+          ollama serve &
+          sleep 5
+          ollama pull llama3.2:3b
+          echo "Ollama server started and model pulled"
+
+      - name: Evaluate Regression Test Report with LLM
+        if: always() && github.event_name == 'workflow_dispatch'
+        id: llm-eval
+        continue-on-error: true
+        working-directory: airbyte-ci/connectors/live-tests
+        env:
+          OPENAI_API_KEY: ollama
+          OPENAI_BASE_URL: http://127.0.0.1:11434/v1
+          EVAL_MODEL: llama3.2:3b
+        run: |
+          set -u
+          echo "ran=false" >> "$GITHUB_OUTPUT"
+          echo "result=error" >> "$GITHUB_OUTPUT"
+
+          # Find the most recent report.html file in /tmp/regression_tests_artifacts/
+          REPORT_PATH=$(find /tmp/regression_tests_artifacts -name "report.html" -type f -printf '%T@ %p\n' | sort -n | tail -1 | cut -f2- -d" ")
+
+          if [ -z "$REPORT_PATH" ]; then
+            echo "Error: No report.html found in /tmp/regression_tests_artifacts/" >&2
+            echo "## ⚠️ LLM Evaluation Skipped" >> "$GITHUB_STEP_SUMMARY"
+            echo "No regression test report found. The tests may have failed to generate a report." >> "$GITHUB_STEP_SUMMARY"
+            exit 1
+          fi
+
+          echo "Found report at: $REPORT_PATH"
+          echo "Running LLM evaluation..."
+
+          # Run the evaluation script. It exits non-zero on a FAIL verdict, so
+          # tolerate the exit code here and read the verdict from the JSON output instead.
+          OUT_JSON="$RUNNER_TEMP/llm_eval.json"
+          poetry run python src/live_tests/regression_tests/llm_evaluation/evaluate_report.py \
+            --report-path "$REPORT_PATH" \
+            --output-json "$OUT_JSON" || true
+
+          # A judgment was produced only if the JSON artifact exists
+          if [ ! -f "$OUT_JSON" ]; then
+            echo "Error: evaluation did not produce $OUT_JSON" >&2
+            echo "ran=true" >> "$GITHUB_OUTPUT"
+            echo "result=error" >> "$GITHUB_OUTPUT"
+            exit 1
+          fi
+
+          PASS=$(jq -r '.evaluation.pass' "$OUT_JSON")
+          if [ "$PASS" = "true" ]; then RES="pass"; else RES="fail"; fi
+          echo "ran=true" >> "$GITHUB_OUTPUT"
+          echo "result=$RES" >> "$GITHUB_OUTPUT"
+
+      - name: Append LLM outcome
+        if: always() && github.event_name == 'workflow_dispatch' && github.event.inputs.pr != ''
+        env:
+          EVAL_MODEL: llama3.2:3b
+        uses: peter-evans/create-or-update-comment@v4
+        with:
+          token: ${{ github.token }}
+          comment-id: ${{ steps.pr-comment-id.outputs.comment-id }}
+          edit-mode: append
+          body: |
+            > LLM Evaluation: ${{ steps.llm-eval.outputs.ran == 'true' && (steps.llm-eval.outputs.result == 'pass' && '✅ PASS' || steps.llm-eval.outputs.result == 'fail' && '❌ FAIL' || '⚠️ ERROR') || '⚠️ Did not run' }}${{ steps.llm-eval.outputs.ran == 'true' && format(' (model: {0})', env.EVAL_MODEL) || '' }}
diff --git a/airbyte-ci/connectors/live-tests/poetry.lock b/airbyte-ci/connectors/live-tests/poetry.lock
index 89d6d13dce8..254f971fdab 100644
--- a/airbyte-ci/connectors/live-tests/poetry.lock
+++ b/airbyte-ci/connectors/live-tests/poetry.lock
@@ -355,6 +355,29 @@ test = ["coverage (>=5.5)", "equinox", "jax[cpu]", "jaxtyping", "mypy (>=0.800)"
 test-tox = ["equinox", "jax[cpu]", "jaxtyping", "mypy (>=0.800)", "numba", "numpy", "pandera", "pygments", "pyright (>=1.1.370)", "pytest (>=4.0.0)", "sphinx", "typing-extensions (>=3.10.0.0)"]
 test-tox-coverage = ["coverage (>=5.5)"]
 
+[[package]]
+name = "beautifulsoup4"
+version = "4.14.2"
+description = "Screen-scraping library"
+optional = false
+python-versions = ">=3.7.0"
+groups = ["main"]
+files = [
+    {file = "beautifulsoup4-4.14.2-py3-none-any.whl", hash = "sha256:5ef6fa3a8cbece8488d66985560f97ed091e22bbc4e9c2338508a9d5de6d4515"},
+    {file = "beautifulsoup4-4.14.2.tar.gz", hash = 
"sha256:2a98ab9f944a11acee9cc848508ec28d9228abfd522ef0fad6a02a72e0ded69e"}, +] + +[package.dependencies] +soupsieve = ">1.2" +typing-extensions = ">=4.0.0" + +[package.extras] +cchardet = ["cchardet"] +chardet = ["chardet"] +charset-normalizer = ["charset-normalizer"] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "blessed" version = "1.20.0" @@ -979,6 +1002,18 @@ wrapt = ">=1.10,<2" [package.extras] dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] +[[package]] +name = "distro" +version = "1.9.0" +description = "Distro - an OS platform information API" +optional = false +python-versions = ">=3.6" +groups = ["main"] +files = [ + {file = "distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2"}, + {file = "distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed"}, +] + [[package]] name = "docker" version = "6.1.3" @@ -1943,6 +1978,118 @@ files = [ [package.dependencies] ansicon = {version = "*", markers = "platform_system == \"Windows\""} +[[package]] +name = "jiter" +version = "0.11.1" +description = "Fast iterable JSON parser." +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "jiter-0.11.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:ed58841a491bbbf3f7c55a6b68fff568439ab73b2cce27ace0e169057b5851df"}, + {file = "jiter-0.11.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:499beb9b2d7e51d61095a8de39ebcab1d1778f2a74085f8305a969f6cee9f3e4"}, + {file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b87b2821795e28cc990939b68ce7a038edea680a24910bd68a79d54ff3f03c02"}, + {file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:83f6fa494d8bba14ab100417c80e70d32d737e805cb85be2052d771c76fcd1f8"}, + {file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5fbc6aea1daa2ec6f5ed465f0c5e7b0607175062ceebbea5ca70dd5ddab58083"}, + {file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:302288e2edc43174bb2db838e94688d724f9aad26c5fb9a74f7a5fb427452a6a"}, + {file = "jiter-0.11.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85db563fe3b367bb568af5d29dea4d4066d923b8e01f3417d25ebecd958de815"}, + {file = "jiter-0.11.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f1c1ba2b6b22f775444ef53bc2d5778396d3520abc7b2e1da8eb0c27cb3ffb10"}, + {file = "jiter-0.11.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:523be464b14f8fd0cc78da6964b87b5515a056427a2579f9085ce30197a1b54a"}, + {file = "jiter-0.11.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:25b99b3f04cd2a38fefb22e822e35eb203a2cd37d680dbbc0c0ba966918af336"}, + {file = "jiter-0.11.1-cp310-cp310-win32.whl", hash = "sha256:47a79e90545a596bb9104109777894033347b11180d4751a216afef14072dbe7"}, + {file = "jiter-0.11.1-cp310-cp310-win_amd64.whl", hash = "sha256:cace75621ae9bd66878bf69fbd4dfc1a28ef8661e0c2d0eb72d3d6f1268eddf5"}, + {file = "jiter-0.11.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:9b0088ff3c374ce8ce0168523ec8e97122ebb788f950cf7bb8e39c7dc6a876a2"}, + {file = "jiter-0.11.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:74433962dd3c3090655e02e461267095d6c84f0741c7827de11022ef8d7ff661"}, + {file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:6d98030e345e6546df2cc2c08309c502466c66c4747b043f1a0d415fada862b8"}, + {file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1d6db0b2e788db46bec2cf729a88b6dd36959af2abd9fa2312dfba5acdd96dcb"}, + {file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:55678fbbda261eafe7289165dd2ddd0e922df5f9a1ae46d7c79a5a15242bd7d1"}, + {file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a6b74fae8e40497653b52ce6ca0f1b13457af769af6fb9c1113efc8b5b4d9be"}, + {file = "jiter-0.11.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a55a453f8b035eb4f7852a79a065d616b7971a17f5e37a9296b4b38d3b619e4"}, + {file = "jiter-0.11.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2638148099022e6bdb3f42904289cd2e403609356fb06eb36ddec2d50958bc29"}, + {file = "jiter-0.11.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:252490567a5d990986f83b95a5f1ca1bf205ebd27b3e9e93bb7c2592380e29b9"}, + {file = "jiter-0.11.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d431d52b0ca2436eea6195f0f48528202100c7deda354cb7aac0a302167594d5"}, + {file = "jiter-0.11.1-cp311-cp311-win32.whl", hash = "sha256:db6f41e40f8bae20c86cb574b48c4fd9f28ee1c71cb044e9ec12e78ab757ba3a"}, + {file = "jiter-0.11.1-cp311-cp311-win_amd64.whl", hash = "sha256:0cc407b8e6cdff01b06bb80f61225c8b090c3df108ebade5e0c3c10993735b19"}, + {file = "jiter-0.11.1-cp311-cp311-win_arm64.whl", hash = "sha256:fe04ea475392a91896d1936367854d346724a1045a247e5d1c196410473b8869"}, + {file = "jiter-0.11.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:c92148eec91052538ce6823dfca9525f5cfc8b622d7f07e9891a280f61b8c96c"}, + {file = "jiter-0.11.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ecd4da91b5415f183a6be8f7158d127bdd9e6a3174138293c0d48d6ea2f2009d"}, + {file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7e3ac25c00b9275684d47aa42febaa90a9958e19fd1726c4ecf755fbe5e553b"}, + {file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:57d7305c0a841858f866cd459cd9303f73883fb5e097257f3d4a3920722c69d4"}, + {file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e86fa10e117dce22c547f31dd6d2a9a222707d54853d8de4e9a2279d2c97f239"}, + {file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ae5ef1d48aec7e01ee8420155d901bb1d192998fa811a65ebb82c043ee186711"}, + {file = "jiter-0.11.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb68e7bf65c990531ad8715e57d50195daf7c8e6f1509e617b4e692af1108939"}, + {file = "jiter-0.11.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43b30c8154ded5845fa454ef954ee67bfccce629b2dea7d01f795b42bc2bda54"}, + {file = "jiter-0.11.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:586cafbd9dd1f3ce6a22b4a085eaa6be578e47ba9b18e198d4333e598a91db2d"}, + {file = "jiter-0.11.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:677cc2517d437a83bb30019fd4cf7cad74b465914c56ecac3440d597ac135250"}, + {file = "jiter-0.11.1-cp312-cp312-win32.whl", hash = "sha256:fa992af648fcee2b850a3286a35f62bbbaeddbb6dbda19a00d8fbc846a947b6e"}, + {file = "jiter-0.11.1-cp312-cp312-win_amd64.whl", hash = "sha256:88b5cae9fa51efeb3d4bd4e52bfd4c85ccc9cac44282e2a9640893a042ba4d87"}, + {file = "jiter-0.11.1-cp312-cp312-win_arm64.whl", hash = 
"sha256:9a6cae1ab335551917f882f2c3c1efe7617b71b4c02381e4382a8fc80a02588c"}, + {file = "jiter-0.11.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:71b6a920a5550f057d49d0e8bcc60945a8da998019e83f01adf110e226267663"}, + {file = "jiter-0.11.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0b3de72e925388453a5171be83379549300db01284f04d2a6f244d1d8de36f94"}, + {file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc19dd65a2bd3d9c044c5b4ebf657ca1e6003a97c0fc10f555aa4f7fb9821c00"}, + {file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d58faaa936743cd1464540562f60b7ce4fd927e695e8bc31b3da5b914baa9abd"}, + {file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:902640c3103625317291cb73773413b4d71847cdf9383ba65528745ff89f1d14"}, + {file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:30405f726e4c2ed487b176c09f8b877a957f535d60c1bf194abb8dadedb5836f"}, + {file = "jiter-0.11.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3217f61728b0baadd2551844870f65219ac4a1285d5e1a4abddff3d51fdabe96"}, + {file = "jiter-0.11.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b1364cc90c03a8196f35f396f84029f12abe925415049204446db86598c8b72c"}, + {file = "jiter-0.11.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:53a54bf8e873820ab186b2dca9f6c3303f00d65ae5e7b7d6bda1b95aa472d646"}, + {file = "jiter-0.11.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:7e29aca023627b0e0c2392d4248f6414d566ff3974fa08ff2ac8dbb96dfee92a"}, + {file = "jiter-0.11.1-cp313-cp313-win32.whl", hash = "sha256:f153e31d8bca11363751e875c0a70b3d25160ecbaee7b51e457f14498fb39d8b"}, + {file = "jiter-0.11.1-cp313-cp313-win_amd64.whl", hash = "sha256:f773f84080b667c69c4ea0403fc67bb08b07e2b7ce1ef335dea5868451e60fed"}, + {file = "jiter-0.11.1-cp313-cp313-win_arm64.whl", hash = "sha256:635ecd45c04e4c340d2187bcb1cea204c7cc9d32c1364d251564bf42e0e39c2d"}, + {file = "jiter-0.11.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d892b184da4d94d94ddb4031296931c74ec8b325513a541ebfd6dfb9ae89904b"}, + {file = "jiter-0.11.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa22c223a3041dacb2fcd37c70dfd648b44662b4a48e242592f95bda5ab09d58"}, + {file = "jiter-0.11.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:330e8e6a11ad4980cd66a0f4a3e0e2e0f646c911ce047014f984841924729789"}, + {file = "jiter-0.11.1-cp313-cp313t-win_amd64.whl", hash = "sha256:09e2e386ebf298547ca3a3704b729471f7ec666c2906c5c26c1a915ea24741ec"}, + {file = "jiter-0.11.1-cp313-cp313t-win_arm64.whl", hash = "sha256:fe4a431c291157e11cee7c34627990ea75e8d153894365a3bc84b7a959d23ca8"}, + {file = "jiter-0.11.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:0fa1f70da7a8a9713ff8e5f75ec3f90c0c870be6d526aa95e7c906f6a1c8c676"}, + {file = "jiter-0.11.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:569ee559e5046a42feb6828c55307cf20fe43308e3ae0d8e9e4f8d8634d99944"}, + {file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f69955fa1d92e81987f092b233f0be49d4c937da107b7f7dcf56306f1d3fcce9"}, + {file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:090f4c9d4a825e0fcbd0a2647c9a88a0f366b75654d982d95a9590745ff0c48d"}, + {file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:bbf3d8cedf9e9d825233e0dcac28ff15c47b7c5512fdfe2e25fd5bbb6e6b0cee"}, + {file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2aa9b1958f9c30d3d1a558b75f0626733c60eb9b7774a86b34d88060be1e67fe"}, + {file = "jiter-0.11.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e42d1ca16590b768c5e7d723055acd2633908baacb3628dd430842e2e035aa90"}, + {file = "jiter-0.11.1-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5db4c2486a023820b701a17aec9c5a6173c5ba4393f26662f032f2de9c848b0f"}, + {file = "jiter-0.11.1-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:4573b78777ccfac954859a6eff45cbd9d281d80c8af049d0f1a3d9fc323d5c3a"}, + {file = "jiter-0.11.1-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:7593ac6f40831d7961cb67633c39b9fef6689a211d7919e958f45710504f52d3"}, + {file = "jiter-0.11.1-cp314-cp314-win32.whl", hash = "sha256:87202ec6ff9626ff5f9351507def98fcf0df60e9a146308e8ab221432228f4ea"}, + {file = "jiter-0.11.1-cp314-cp314-win_amd64.whl", hash = "sha256:a5dd268f6531a182c89d0dd9a3f8848e86e92dfff4201b77a18e6b98aa59798c"}, + {file = "jiter-0.11.1-cp314-cp314-win_arm64.whl", hash = "sha256:5d761f863f912a44748a21b5c4979c04252588ded8d1d2760976d2e42cd8d991"}, + {file = "jiter-0.11.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:2cc5a3965285ddc33e0cab933e96b640bc9ba5940cea27ebbbf6695e72d6511c"}, + {file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b572b3636a784c2768b2342f36a23078c8d3aa6d8a30745398b1bab58a6f1a8"}, + {file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ad93e3d67a981f96596d65d2298fe8d1aa649deb5374a2fb6a434410ee11915e"}, + {file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a83097ce379e202dcc3fe3fc71a16d523d1ee9192c8e4e854158f96b3efe3f2f"}, + {file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7042c51e7fbeca65631eb0c332f90c0c082eab04334e7ccc28a8588e8e2804d9"}, + {file = "jiter-0.11.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a68d679c0e47649a61df591660507608adc2652442de7ec8276538ac46abe08"}, + {file = "jiter-0.11.1-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a1b0da75dbf4b6ec0b3c9e604d1ee8beaf15bc046fff7180f7d89e3cdbd3bb51"}, + {file = "jiter-0.11.1-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:69dd514bf0fa31c62147d6002e5ca2b3e7ef5894f5ac6f0a19752385f4e89437"}, + {file = "jiter-0.11.1-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:bb31ac0b339efa24c0ca606febd8b77ef11c58d09af1b5f2be4c99e907b11111"}, + {file = "jiter-0.11.1-cp314-cp314t-win32.whl", hash = "sha256:b2ce0d6156a1d3ad41da3eec63b17e03e296b78b0e0da660876fccfada86d2f7"}, + {file = "jiter-0.11.1-cp314-cp314t-win_amd64.whl", hash = "sha256:f4db07d127b54c4a2d43b4cf05ff0193e4f73e0dd90c74037e16df0b29f666e1"}, + {file = "jiter-0.11.1-cp314-cp314t-win_arm64.whl", hash = "sha256:28e4fdf2d7ebfc935523e50d1efa3970043cfaa161674fe66f9642409d001dfe"}, + {file = "jiter-0.11.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:baa99c8db49467527658bb479857344daf0a14dff909b7f6714579ac439d1253"}, + {file = "jiter-0.11.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:860fe55fa3b01ad0edf2adde1098247ff5c303d0121f9ce028c03d4f88c69502"}, + {file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:173dd349d99b6feaf5a25a6fbcaf3489a6f947708d808240587a23df711c67db"}, + {file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:14ac1dca837514cc946a6ac2c4995d9695303ecc754af70a3163d057d1a444ab"}, + {file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:69af47de5f93a231d5b85f7372d3284a5be8edb4cc758f006ec5a1406965ac5e"}, + {file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:685f8b3abd3bbd3e06e4dfe2429ff87fd5d7a782701151af99b1fcbd80e31b2b"}, + {file = "jiter-0.11.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d04afa2d4e5526e54ae8a58feea953b1844bf6e3526bc589f9de68e86d0ea01"}, + {file = "jiter-0.11.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1e92b927259035b50d8e11a8fdfe0ebd014d883e4552d37881643fa289a4bcf1"}, + {file = "jiter-0.11.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:e7bd8be4fad8d4c5558b7801770cd2da6c072919c6f247cc5336edb143f25304"}, + {file = "jiter-0.11.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:121381a77a3c85987f3eba0d30ceaca9116f7463bedeec2fa79b2e7286b89b60"}, + {file = "jiter-0.11.1-cp39-cp39-win32.whl", hash = "sha256:160225407f6dfabdf9be1b44e22f06bc293a78a28ffa4347054698bd712dad06"}, + {file = "jiter-0.11.1-cp39-cp39-win_amd64.whl", hash = "sha256:028e0d59bcdfa1079f8df886cdaefc6f515c27a5288dec956999260c7e4a7cfd"}, + {file = "jiter-0.11.1-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:e642b5270e61dd02265866398707f90e365b5db2eb65a4f30c789d826682e1f6"}, + {file = "jiter-0.11.1-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:464ba6d000585e4e2fd1e891f31f1231f497273414f5019e27c00a4b8f7a24ad"}, + {file = "jiter-0.11.1-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:055568693ab35e0bf3a171b03bb40b2dcb10352359e0ab9b5ed0da2bf1eb6f6f"}, + {file = "jiter-0.11.1-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0c69ea798d08a915ba4478113efa9e694971e410056392f4526d796f136d3fa"}, + {file = "jiter-0.11.1-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:0d4d6993edc83cf75e8c6828a8d6ce40a09ee87e38c7bfba6924f39e1337e21d"}, + {file = "jiter-0.11.1-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:f78d151c83a87a6cf5461d5ee55bc730dd9ae227377ac6f115b922989b95f838"}, + {file = "jiter-0.11.1-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9022974781155cd5521d5cb10997a03ee5e31e8454c9d999dcdccd253f2353f"}, + {file = "jiter-0.11.1-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18c77aaa9117510d5bdc6a946baf21b1f0cfa58ef04d31c8d016f206f2118960"}, + {file = "jiter-0.11.1.tar.gz", hash = "sha256:849dcfc76481c0ea0099391235b7ca97d7279e0fa4c86005457ac7c88e8b76dc"}, +] + [[package]] name = "jsonschema" version = "3.2.0" @@ -2492,6 +2639,34 @@ rsa = ["cryptography (>=3.0.0)"] signals = ["blinker (>=1.4.0)"] signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] +[[package]] +name = "openai" +version = "1.109.1" +description = "The official Python library for the openai API" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "openai-1.109.1-py3-none-any.whl", hash = "sha256:6bcaf57086cf59159b8e27447e4e7dd019db5d29a438072fbd49c290c7e65315"}, + {file = "openai-1.109.1.tar.gz", hash = 
"sha256:d173ed8dbca665892a6db099b4a2dfac624f94d20a93f46eb0b56aae940ed869"}, +] + +[package.dependencies] +anyio = ">=3.5.0,<5" +distro = ">=1.7.0,<2" +httpx = ">=0.23.0,<1" +jiter = ">=0.4.0,<1" +pydantic = ">=1.9.0,<3" +sniffio = "*" +tqdm = ">4" +typing-extensions = ">=4.11,<5" + +[package.extras] +aiohttp = ["aiohttp", "httpx-aiohttp (>=0.1.8)"] +datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] +realtime = ["websockets (>=13,<16)"] +voice-helpers = ["numpy (>=2.0.2)", "sounddevice (>=0.5.1)"] + [[package]] name = "opentelemetry-api" version = "1.27.0" @@ -3864,6 +4039,18 @@ files = [ {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, ] +[[package]] +name = "soupsieve" +version = "2.8" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "soupsieve-2.8-py3-none-any.whl", hash = "sha256:0cc76456a30e20f5d7f2e14a98a4ae2ee4e5abdc7c5ea0aafe795f344bc7984c"}, + {file = "soupsieve-2.8.tar.gz", hash = "sha256:e2dd4a40a628cb5f28f6d4b0db8800b8f581b65bb380b97de22ba5ca8d72572f"}, +] + [[package]] name = "sqlalchemy" version = "2.0.35" @@ -4477,4 +4664,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.1" python-versions = "^3.11,<3.12" -content-hash = "979b7f159a1a284b77b9bec889298c50e3c269149524e647978b517463142e1f" +content-hash = "d6f93822b88fad4aaf0ae5d60932e0e93d9e95aa79fdddc44f9f6045387d3b4c" diff --git a/airbyte-ci/connectors/live-tests/pyproject.toml b/airbyte-ci/connectors/live-tests/pyproject.toml index 63676a84df2..6f1a882b059 100644 --- a/airbyte-ci/connectors/live-tests/pyproject.toml +++ b/airbyte-ci/connectors/live-tests/pyproject.toml @@ -41,6 +41,8 @@ dpath = "^2.1.6" genson = "^1.3.0" segment-analytics-python = "^2.3.2" python-slugify = ">=8.0.4" +beautifulsoup4 = "^4.12.0" +openai = "^1.0.0" [tool.poetry.group.dev.dependencies] diff --git a/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/README.md b/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/README.md new file mode 100644 index 00000000000..93f84175f1a --- /dev/null +++ b/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/README.md @@ -0,0 +1,37 @@ +# LLM-Based Regression Test Evaluation + +Automated evaluation of connector regression test reports using LLM models. + +## How It Works + +After regression tests complete, this evaluates the HTML report and writes a pass/fail judgment to `GITHUB_STEP_SUMMARY`. + +## Configuration + +**Environment Variables:** +- `OPENAI_API_KEY` - API key (use `ollama` for Ollama) +- `OPENAI_BASE_URL` - Base URL for OpenAI-compatible API (e.g., `http://127.0.0.1:11434/v1` for Ollama) +- `EVAL_MODEL` - Model name (defaults to `gpt-4o`) + +**Evaluation Prompt:** +Stored in `.github/prompts/regression-evaluation.prompt.yaml` following GitHub's prompt format. Uses `{{report_text}}` placeholder for dynamic content injection. 
+ +## Local Testing + +```bash +# Install Ollama +curl -fsSL https://ollama.com/install.sh | sh +ollama serve & +ollama pull llama3.2:3b + +# Set environment +export OPENAI_API_KEY=ollama +export OPENAI_BASE_URL=http://127.0.0.1:11434/v1 +export EVAL_MODEL=llama3.2:3b + +# Run evaluation +cd airbyte-ci/connectors/live-tests +poetry install +poetry run python src/live_tests/regression_tests/llm_evaluation/evaluate_report.py \ + --report-path /path/to/report.html +``` diff --git a/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/__init__.py b/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/__init__.py new file mode 100644 index 00000000000..7f66676b871 --- /dev/null +++ b/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. diff --git a/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/evaluate_report.py b/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/evaluate_report.py new file mode 100644 index 00000000000..ac9f33c8016 --- /dev/null +++ b/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/llm_evaluation/evaluate_report.py @@ -0,0 +1,299 @@ +#!/usr/bin/env python3 +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. + +""" +LLM-based evaluation of regression test reports. + +This script reads a regression test report (HTML format) and uses OpenAI's LLM +to evaluate the results, make a pass/fail judgment, and generate a summary. +The summary is written to GITHUB_STEP_SUMMARY for display in GitHub Actions. +""" + +import argparse +import json +import os +import sys +from pathlib import Path +from typing import Any + +import yaml +from bs4 import BeautifulSoup +from openai import OpenAI + +MAX_REPORT_CHARS = 200000 + +# Default evaluation prompt +EVAL_PROMPT = """You are an expert at evaluating connector regression test results. +Your task is to analyze the test report and determine if the regression tests should PASS or FAIL. + +Consider the following criteria: +1. All test cases should pass (no failed tests) +2. Record count differences between control and target versions should be minimal or explainable +3. Message count differences should not indicate data loss or corruption +4. Stream coverage should be reasonable +5. Any warnings or errors in test outputs should be evaluated for severity + +Provide your evaluation in the following JSON format: +{ + "pass": true/false, + "summary": "A concise 2-3 sentence summary of the evaluation", + "reasoning": "Detailed reasoning for your pass/fail decision, including specific issues found", + "severity": "critical/major/minor/none", + "recommendations": "Any recommendations for addressing issues" +} + +Be strict but fair in your evaluation. Minor differences are acceptable, but data loss, +corruption, or test failures should result in a FAIL.""" + + +def load_prompt_from_yaml(yaml_path: Path | None = None) -> tuple[list[dict[str, str]] | None, str | None, dict[str, Any] | None]: + """ + Load prompt from GitHub-format YAML file. + + Args: + yaml_path: Path to the .prompt.yaml file. If None, uses default location. 
+
+    Returns:
+        Tuple of (messages, model, modelParameters) or (None, None, None) if file not found or invalid
+    """
+    if yaml_path is None:
+        # Default location: .github/prompts/regression-evaluation.prompt.yaml
+        github_workspace = os.environ.get("GITHUB_WORKSPACE")
+        if github_workspace:
+            yaml_path = Path(github_workspace) / ".github" / "prompts" / "regression-evaluation.prompt.yaml"
+        else:
+            script_dir = Path(__file__).parent
+            # Walk up from llm_evaluation/ to the repository root (7 levels:
+            # llm_evaluation -> regression_tests -> live_tests -> src -> live-tests -> connectors -> airbyte-ci -> repo root)
+            repo_root = script_dir.parents[6]
+            yaml_path = repo_root / ".github" / "prompts" / "regression-evaluation.prompt.yaml"
+
+    if not yaml_path.exists():
+        print(f"Prompt file not found at {yaml_path}, using default hardcoded prompt")
+        return None, None, None
+
+    try:
+        with open(yaml_path, "r", encoding="utf-8") as f:
+            prompt_data = yaml.safe_load(f)
+
+        messages = prompt_data.get("messages", [])
+        model = prompt_data.get("model")
+        model_params = prompt_data.get("modelParameters", {})
+
+        if not messages:
+            print(f"Warning: No messages found in {yaml_path}, using default hardcoded prompt")
+            return None, None, None
+
+        print(f"Loaded prompt from {yaml_path}")
+        return messages, model, model_params
+
+    except Exception as e:
+        print(f"Warning: Failed to load prompt from {yaml_path}: {e}")
+        print("Using default hardcoded prompt")
+        return None, None, None
+
+
+def load_report_text(html_path: Path) -> str:
+    """
+    Load and convert HTML report to clean text.
+
+    Args:
+        html_path: Path to the report.html file
+
+    Returns:
+        Clean text representation of the report
+    """
+    with open(html_path, "r", encoding="utf-8") as f:
+        html_content = f.read()
+
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    for element in soup(["script", "style"]):
+        element.decompose()
+
+    report_text = soup.get_text("\n", strip=True)
+
+    report_text = "\n".join(line.strip() for line in report_text.splitlines() if line.strip())
+
+    if len(report_text) > MAX_REPORT_CHARS:
+        original_length = len(report_text)
+        report_text = report_text[:MAX_REPORT_CHARS]
+        truncation_note = f"\n\n[Report truncated from {original_length} to {MAX_REPORT_CHARS} characters for evaluation]"
+        report_text += truncation_note
+        print(f"Warning: Report truncated from {original_length} to {MAX_REPORT_CHARS} characters")
+
+    return report_text
+
+
+def evaluate_with_llm(report_text: str, prompt: str | None = None, prompt_yaml_path: Path | None = None) -> dict[str, Any]:
+    """
+    Use an LLM to evaluate the regression test report.
+
+    Supports both OpenAI API and Ollama (OpenAI-compatible).
+    Configure via environment variables:
+    - OPENAI_API_KEY: API key (use 'ollama' for Ollama)
+    - OPENAI_BASE_URL: Optional base URL (e.g., http://127.0.0.1:11434/v1 for Ollama)
+    - EVAL_MODEL: Model name (defaults to gpt-4o, use llama3.2:3b for Ollama)
+    - EVAL_PROMPT_PATH: Optional path to custom .prompt.yaml file
+
+    Args:
+        report_text: Full text of the report
+        prompt: Optional custom evaluation prompt string (legacy, overrides YAML)
+        prompt_yaml_path: Optional path to .prompt.yaml file
+
+    Returns:
+        Dictionary containing evaluation results with 'pass', 'summary', 'reasoning', 'severity', and 'recommendations' keys
+
+    Raises:
+        Exception: If LLM evaluation fails after retry
+    """
+    api_key = os.environ.get("OPENAI_API_KEY")
+    base_url = os.environ.get("OPENAI_BASE_URL")
+    model = os.environ.get("EVAL_MODEL", "gpt-4o")
+
+    if base_url:
+        client = OpenAI(api_key=api_key, base_url=base_url)
+        print(f"Using custom base URL: {base_url}")
+    else:
+        client = OpenAI(api_key=api_key)
+
+    yaml_messages, yaml_model, yaml_params = load_prompt_from_yaml(prompt_yaml_path)
+
+    if yaml_model and not os.environ.get("EVAL_MODEL"):
+        model = yaml_model
+
+    temperature = 0.3
+    if yaml_params and "temperature" in yaml_params:
+        temperature = yaml_params["temperature"]
+
+    print(f"Using model: {model}")
+
+    if prompt is not None:
+        messages = [
+            {"role": "system", "content": prompt},
+            {"role": "user", "content": f"Report:\n\n{report_text}"},
+        ]
+    elif yaml_messages:
+        messages = []
+        for msg in yaml_messages:
+            content = msg.get("content", "")
+            content = content.replace("{{report_text}}", report_text)
+            messages.append({"role": msg["role"], "content": content})
+    else:
+        # Fallback to hardcoded EVAL_PROMPT
+        messages = [
+            {"role": "system", "content": EVAL_PROMPT},
+            {"role": "user", "content": f"Report:\n\n{report_text}"},
+        ]
+
+    try:
+        response = client.chat.completions.create(
+            model=model,
+            messages=messages,
+            temperature=temperature,
+            response_format={"type": "json_object"},
+        )
+
+        evaluation = json.loads(response.choices[0].message.content)
+        return evaluation
+    except Exception as e:
+        error_msg = str(e).lower()
+        if "response_format" in error_msg or "json_object" in error_msg:
+            print(f"Warning: JSON response format not supported, retrying without it: {e}")
+            response = client.chat.completions.create(
+                model=model,
+                messages=messages,
+                temperature=temperature,
+            )
+            content = response.choices[0].message.content
+            evaluation = json.loads(content)
+            return evaluation
+        raise
+
+
+def write_github_summary(evaluation: dict[str, Any], model: str | None = None) -> None:
+    """
+    Write the evaluation summary to GITHUB_STEP_SUMMARY.
+
+    Args:
+        evaluation: LLM evaluation results
+        model: Model name used for evaluation (optional)
+    """
+    summary_file = os.environ.get("GITHUB_STEP_SUMMARY")
+    if not summary_file:
+        print("Warning: GITHUB_STEP_SUMMARY environment variable not set. Writing to stdout instead.")
+        summary_file = "/dev/stdout"
+
+    status_emoji = "✅" if evaluation["pass"] else "❌"
+
+    model_info = f"model: {model}" if model else "OpenAI-compatible API"
+
+    markdown = f"""# {status_emoji} Regression Test Evaluation: {"PASS" if evaluation['pass'] else "FAIL"}
+
+{evaluation['summary']}
+
+**Reasoning:**
+
+{evaluation['reasoning']}
+
+**Recommendations:**
+
+{evaluation.get('recommendations', 'No specific recommendations.')}
+
+---
+*This evaluation was generated using {model_info}*
+"""
+
+    with open(summary_file, "a", encoding="utf-8") as f:
+        f.write(markdown)
+
+    print(f"Summary written to {summary_file}")
+
+
+def main():
+    """Main entry point for the LLM evaluation script."""
+    parser = argparse.ArgumentParser(description="Evaluate regression test reports using an OpenAI-compatible LLM")
+    parser.add_argument("--report-path", type=Path, required=True, help="Path to the report.html file")
+    parser.add_argument("--prompt-file", type=Path, help="Optional path to a custom evaluation prompt file")
+    parser.add_argument("--output-json", type=Path, help="Optional path to write evaluation results as JSON")
+
+    args = parser.parse_args()
+
+    if not os.environ.get("OPENAI_API_KEY"):
+        print("Error: OPENAI_API_KEY environment variable not set", file=sys.stderr)
+        sys.exit(1)
+
+    if not args.report_path.exists():
+        print(f"Error: Report file not found: {args.report_path}", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Loading report from: {args.report_path}")
+    report_text = load_report_text(args.report_path)
+    print(f"Report loaded: {len(report_text)} characters")
+
+    custom_prompt = None
+    if args.prompt_file and args.prompt_file.exists():
+        with open(args.prompt_file, "r", encoding="utf-8") as f:
+            custom_prompt = f.read()
+        print(f"Using custom prompt from: {args.prompt_file}")
+
+    prompt_yaml_path = None
+    eval_prompt_path = os.environ.get("EVAL_PROMPT_PATH")
+    if eval_prompt_path:
+        prompt_yaml_path = Path(eval_prompt_path)
+
+    print("Evaluating report with LLM...")
+    evaluation = evaluate_with_llm(report_text, custom_prompt, prompt_yaml_path)
+
+    print(f"\nEvaluation Result: {'PASS' if evaluation['pass'] else 'FAIL'}")
+    print(f"Summary: {evaluation['summary']}")
+
+    model = os.environ.get("EVAL_MODEL", "gpt-4o")
+    write_github_summary(evaluation, model)
+
+    if args.output_json:
+        output_data = {"evaluation": evaluation}
+        with open(args.output_json, "w", encoding="utf-8") as f:
+            json.dump(output_data, f, indent=2)
+        print(f"Evaluation results written to: {args.output_json}")
+
+    sys.exit(0 if evaluation["pass"] else 1)
+
+
+if __name__ == "__main__":
+    main()
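+
+# Illustrative shape of the --output-json artifact, as consumed by the workflow's
+# `jq -r '.evaluation.pass'` step (field names come from the evaluation prompt):
+#   {
+#     "evaluation": {
+#       "pass": true,
+#       "summary": "...",
+#       "reasoning": "...",
+#       "severity": "none",
+#       "recommendations": "..."
+#     }
+#   }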