Increase the database connection timeout and improve the error message shown when a connection failure occurs. Closes #2377

This commit is contained in:
Binaek Sarkar
2023-01-18 19:56:19 +05:30
committed by kai
parent 8b68c0779f
commit 5dc46c4e09
8 changed files with 100 additions and 54 deletions

View File

@@ -4,6 +4,7 @@ import "time"
// Timeouts and retry/poll intervals.
var (
DashboardServiceStartTimeout = 30 * time.Second
// DBConnectionTimeout bounds how long connection attempts and pings are retried
// (see createMaintenanceClient and WaitForConnection).
// NOTE(review): the name appears twice below because this listing shows both the
// removed (5s) and the added (30s) side of the change; the commit message says the
// timeout is being increased, so only the 30s line should exist in the real file.
DBConnectionTimeout = 5 * time.Second
DBConnectionTimeout = 30 * time.Second
// DBConnectionRetryBackoff is the constant delay between connection attempts
// (see createMaintenanceClient).
DBConnectionRetryBackoff = 200 * time.Millisecond
// ServicePingInterval is the delay between successive pings (see WaitForConnection).
ServicePingInterval = 50 * time.Millisecond
)

View File

@@ -2,10 +2,12 @@ package db_common
import (
"context"
"log"
"time"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/sethvargo/go-retry"
"github.com/turbot/steampipe/pkg/constants"
"github.com/turbot/steampipe/pkg/utils"
)
@@ -35,27 +37,31 @@ func WaitForPool(ctx context.Context, db *pgxpool.Pool) (err error) {
}
}
// NOTE(review): this listing interleaves the previous and the new implementation of
// WaitForConnection (two signatures, two bodies); only one of them can exist in the real file.
//
// Previous implementation:
// WaitForConnection pings the db on every tick of constants.ServicePingInterval and
// returns nil as soon as a ping succeeds. It returns ctx.Err() if ctx is done, and the
// most recent ping error (possibly nil) once constants.DBConnectionTimeout has elapsed.
func WaitForConnection(ctx context.Context, db *pgx.Conn) (err error) {
// New implementation:
// WaitForConnection PINGs the DB - retrying after a constant backoff of
// constants.ServicePingInterval - but only for at most constants.DBConnectionTimeout.
// It returns nil as soon as a ping succeeds, otherwise the error reported by retry.Do.
func WaitForConnection(ctx context.Context, connection *pgx.Conn) (err error) {
utils.LogTime("db.waitForConnection start")
defer utils.LogTime("db.waitForConnection end")
pingTimer := time.NewTicker(constants.ServicePingInterval)
timeoutAt := time.After(constants.DBConnectionTimeout)
defer pingTimer.Stop()
// every ping shares one deadline: DBConnectionTimeout measured from this point
timeoutCtx, cancel := context.WithTimeout(ctx, constants.DBConnectionTimeout)
defer func() {
cancel()
}()
for {
select {
case <-ctx.Done():
return ctx.Err()
case <-pingTimer.C:
err = db.Ping(ctx)
if err == nil {
return
}
case <-timeoutAt:
return
}
// constant backoff between pings, giving up after DBConnectionTimeout
retryBackoff := retry.WithMaxDuration(
constants.DBConnectionTimeout,
retry.NewConstant(constants.ServicePingInterval),
)
// NOTE(review): the callback's own ctx parameter is unused - the ping is issued with timeoutCtx
retryErr := retry.Do(ctx, retryBackoff, func(ctx context.Context) error {
log.Println("[TRACE] Pinging")
pingErr := connection.Ping(timeoutCtx)
if pingErr != nil {
log.Println("[TRACE] Pinging failed -> trying again")
// wrap as retryable so that retry.Do schedules another attempt
return retry.RetryableError(pingErr)
}
return nil
})
return retryErr
}

View File

@@ -7,6 +7,8 @@ import (
"strings"
"github.com/jackc/pgx/v5"
"github.com/pkg/errors"
"github.com/sethvargo/go-retry"
"github.com/turbot/steampipe/pkg/constants"
"github.com/turbot/steampipe/pkg/constants/runtime"
"github.com/turbot/steampipe/pkg/db/db_common"
@@ -104,3 +106,53 @@ func createLocalDbClient(ctx context.Context, opts *CreateDbOptions) (*pgx.Conn,
}
return conn, nil
}
// createMaintenanceClient opens a superuser connection to the maintenance
// database (postgres) of the server listening on localhost at the given port.
//
// It is used in a couple of places:
//  1. During installation, to set up the DBMS with foreign_server, extension et al.
//  2. During service start and stop, to query the DBMS for parameters
//     (connected clients, database name etc.)
//
// This is called immediately after the service process is started, hence all
// special handling related to service startup failures SHOULD be handled here.
//
// Connection attempts are retried with a constant backoff of
// constants.DBConnectionRetryBackoff for at most constants.DBConnectionTimeout.
// Once connected, db_common.WaitForConnection must also succeed; if it does not,
// the connection is closed and its error is returned.
func createMaintenanceClient(ctx context.Context, port int) (*pgx.Conn, error) {
	utils.LogTime("db_local.createMaintenanceClient start")
	defer utils.LogTime("db_local.createMaintenanceClient end")

	// client is populated by the first successful attempt below
	var client *pgx.Conn

	// constant delay between attempts, bounded by an overall maximum duration
	retryPolicy := retry.WithMaxDuration(
		constants.DBConnectionTimeout,
		retry.NewConstant(constants.DBConnectionRetryBackoff),
	)

	// attempt performs a single connection attempt; failures are marked as
	// retryable so that retry.Do keeps trying until the policy gives up
	attempt := func(attemptCtx context.Context) error {
		dsn := fmt.Sprintf("host=localhost port=%d user=%s dbname=postgres sslmode=disable", port, constants.DatabaseSuperUser)
		log.Println("[TRACE] Trying to create maintenance client with: ", dsn)

		established, connectErr := pgx.Connect(attemptCtx, dsn)
		if connectErr != nil {
			log.Println("[TRACE] could not connect:", connectErr)
			return retry.RetryableError(connectErr)
		}

		log.Println("[TRACE] connected to database")
		client = established
		return nil
	}

	if connectErr := retry.Do(ctx, retryPolicy, attempt); connectErr != nil {
		log.Println("[TRACE] could not connect to service")
		return nil, errors.Wrap(connectErr, "connection setup failed")
	}

	// wait for the connection to get established
	// WaitForConnection retries on its own
	if waitErr := db_common.WaitForConnection(ctx, client); waitErr != nil {
		client.Close(ctx)
		log.Println("[TRACE] WaitForConnection timed out")
		return nil, waitErr
	}

	return client, nil
}

View File

@@ -9,16 +9,13 @@ import (
"os"
"os/exec"
"sync"
"time"
"github.com/fatih/color"
"github.com/jackc/pgx/v5"
"github.com/sethvargo/go-retry"
psutils "github.com/shirou/gopsutil/process"
filehelpers "github.com/turbot/go-kit/files"
"github.com/turbot/go-kit/helpers"
"github.com/turbot/steampipe/pkg/constants"
"github.com/turbot/steampipe/pkg/db/db_common"
"github.com/turbot/steampipe/pkg/filepaths"
"github.com/turbot/steampipe/pkg/ociinstaller"
"github.com/turbot/steampipe/pkg/ociinstaller/versionfile"
@@ -367,35 +364,6 @@ func resolveDatabaseName(oldDbName *string) string {
return databaseName
}
// createMaintenanceClient connects to the postgres server using the
// maintenance database (postgres) and superuser.
//
// Up to 5 retries are made, 200ms apart. An attempt succeeds only if the
// connection can be opened AND db_common.WaitForConnection reports it as usable.
// Returns the open connection, or the error from retry.Do if every attempt failed.
func createMaintenanceClient(ctx context.Context, port int) (*pgx.Conn, error) {
	backoff := retry.NewConstant(200 * time.Millisecond)

	var conn *pgx.Conn
	err := retry.Do(ctx, retry.WithMaxRetries(5, backoff), func(ctx context.Context) error {
		connStr := fmt.Sprintf("host=localhost port=%d user=%s dbname=postgres sslmode=disable", port, constants.DatabaseSuperUser)
		log.Println("[TRACE] Connection string: ", connStr)

		utils.LogTime("db_local.createClient connection open start")
		// use the retry context (not context.Background()) so that cancelling
		// the caller's context also aborts an in-flight connection attempt
		connection, err := pgx.Connect(ctx, connStr)
		utils.LogTime("db_local.createClient connection open end")
		if err != nil {
			return retry.RetryableError(err)
		}

		if err := db_common.WaitForConnection(ctx, connection); err != nil {
			// the next attempt opens a fresh connection, so release this one
			// instead of leaking it; the close error is deliberately ignored
			// because the ping failure is the error worth reporting
			_ = connection.Close(ctx)
			return retry.RetryableError(err)
		}

		conn = connection
		return nil
	})
	if err != nil {
		return nil, err
	}
	return conn, nil
}
func startServiceForInstall(port int) (*psutils.Process, error) {
postgresCmd := exec.Command(
getPostgresBinaryExecutablePath(),

View File

@@ -188,7 +188,6 @@ func startDB(ctx context.Context, port int, listen StartListenType, invoker cons
return res.SetError(err)
}
// sometimes connecting to the db immediately after startup results in a dial error - so retry
databaseName, err := getDatabaseName(ctx, port)
if err != nil {
return res.SetError(err)
@@ -311,7 +310,7 @@ func startPostgresProcess(ctx context.Context, port int, listen StartListenType,
func retrieveDatabaseNameFromService(ctx context.Context, port int) (string, error) {
connection, err := createMaintenanceClient(ctx, port)
if err != nil {
return "", err
return "", fmt.Errorf("failed to connect to the database: %v - please try again or reset your steampipe database", err)
}
defer connection.Close(ctx)

View File

@@ -18,7 +18,7 @@ func LoadWorkspacePromptingForVariables(ctx context.Context) (*Workspace, *modco
workspacePath := viper.GetString(constants.ArgModLocation)
t := time.Now()
defer func() {
log.Printf("[TRANCE] Workspace load took %dms\n", time.Since(t).Milliseconds())
log.Printf("[TRACE] Workspace load took %dms\n", time.Since(t).Milliseconds())
}()
w, errAndWarnings := Load(ctx, workspacePath)
if errAndWarnings.GetError() == nil {

View File

@@ -0,0 +1,13 @@
# Stress test for unclean shutdowns. Ten times over: start the steampipe
# service, launch a 10 second query in the background, then SIGKILL steampipe
# and postgres, listing the matching processes after each step.

# Lists every `ps -ef` line that mentions steampipe.
list_steampipe_processes() {
  ps -ef | grep steampipe
}

for i in {1..10}; do
  echo "############################################################### STARTING"
  STEAMPIPE_LOG=trace steampipe service start
  list_steampipe_processes

  # backgrounded query - presumably so that a client is still active when the kill arrives
  STEAMPIPE_LOG=trace steampipe query "select pg_sleep(10)" &

  echo "############################################################### KILLING"
  pkill -9 steampipe
  list_steampipe_processes
  pkill -9 postgres
  list_steampipe_processes

  echo "############################################################### DONE"
done

View File

@@ -0,0 +1,7 @@
# Start/stop cycle test: starts and then stops the steampipe service ten
# times in a row, with trace logging enabled for every steampipe invocation.

# Runs `steampipe service <action>` with STEAMPIPE_LOG=trace.
steampipe_service() {
  STEAMPIPE_LOG=trace steampipe service "$1"
}

for i in {1..10}; do
  echo "############################################################### STARTING"
  steampipe_service start

  echo "############################################################### STOPPING"
  steampipe_service stop

  echo "############################################################### DONE"
done