Fix flaky TestQueryContextLeakage test (#4887)

Changes:
1. Renamed TestQueryContextLeakage -> TestContextCancellationTiming
   - The test was checking cancellation timing, not memory leaks
   - Updated comments to reflect actual purpose

2. Increased timeout from 1ms to 100ms
   - 1ms is too aggressive for CI runners under load
   - 100ms still catches real deadlocks while avoiding flakiness
   - Added detailed comments explaining the timeout choice

3. Added TestNoGoroutineLeaks using goleak
   - Properly tests for actual resource leaks (goroutines)
   - More reliable than memory-based leak detection
   - Uses industry-standard goleak library

The original 1ms timeout caused intermittent CI failures on slower
runners, as context cancellation involves goroutine scheduling that
has no guaranteed sub-millisecond timing.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Nathan Wallace
2025-11-15 21:05:00 +08:00
committed by GitHub
parent 2e5f3fda97
commit cb1fa48173
2 changed files with 39 additions and 11 deletions

4
go.mod
View File

@@ -156,6 +156,7 @@ require (
github.com/spf13/afero v1.14.0 // indirect
github.com/spf13/cast v1.7.1 // indirect
github.com/stevenle/topsort v0.2.0 // indirect
github.com/stretchr/testify v1.10.0
github.com/subosito/gotenv v1.6.0 // indirect
github.com/tklauser/numcpus v0.10.0 // indirect
github.com/tkrajina/go-reflector v0.5.8 // indirect
@@ -189,10 +190,9 @@ require (
gopkg.in/yaml.v3 v3.0.1 // indirect
oras.land/oras-go/v2 v2.5.0 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect
github.com/stretchr/testify v1.10.0 // indirect
)
require github.com/stretchr/testify v1.10.0
require go.uber.org/goleak v1.3.0
require (
cel.dev/expr v0.23.0 // indirect

View File

@@ -4,6 +4,8 @@ import (
"context"
"testing"
"time"
"go.uber.org/goleak"
)
// TestCreatePromptContext tests prompt context creation
@@ -309,10 +311,12 @@ func TestCancelAfterContextAlreadyCancelled(t *testing.T) {
c.cancelActiveQueryIfAny()
}
// TestQueryContextLeakage tests for context leakage
func TestQueryContextLeakage(t *testing.T) {
// TestContextCancellationTiming verifies that context cancellation propagates
// in a reasonable time across many iterations. This stress test helps identify
// timing issues or deadlocks in the cancellation logic.
func TestContextCancellationTiming(t *testing.T) {
if testing.Short() {
t.Skip("Skipping leak test in short mode")
t.Skip("Skipping timing stress test in short mode")
}
c := &InteractiveClient{}
@@ -327,17 +331,17 @@ func TestQueryContextLeakage(t *testing.T) {
c.cancelActiveQuery()
}
// Verify context is cancelled
// Verify context is cancelled within a reasonable timeout
// Using 100ms to avoid flakiness on slower CI runners while still
// catching real deadlocks or cancellation issues
select {
case <-ctx.Done():
// Good
case <-time.After(1 * time.Millisecond):
t.Errorf("Context %d not cancelled", i)
// Good - context was cancelled
case <-time.After(100 * time.Millisecond):
t.Fatalf("Context %d not cancelled within 100ms - possible deadlock or cancellation failure", i)
return
}
}
// If we get here without hanging or OOM, test passes
}
// TestCancelFuncReplacement tests that cancel functions are properly replaced
@@ -397,3 +401,27 @@ func TestCancelFuncReplacement(t *testing.T) {
// Expected - first context remains active
}
}
// TestNoGoroutineLeaks verifies that creating and cancelling query contexts
// doesn't leak goroutines. This uses goleak to detect goroutines that are
// still running after the test completes.
func TestNoGoroutineLeaks(t *testing.T) {
if testing.Short() {
t.Skip("Skipping goroutine leak test in short mode")
}
defer goleak.VerifyNone(t)
c := &InteractiveClient{}
parentCtx := context.Background()
// Create and cancel many contexts to stress test for leaks
for i := 0; i < 1000; i++ {
ctx := c.createQueryContext(parentCtx)
if c.cancelActiveQuery != nil {
c.cancelActiveQuery()
// Wait for cancellation to complete
<-ctx.Done()
}
}
}