backend/local: Periodically persist intermediate state snapshots

Terraform Core emits a hook event every time it writes a change into the
in-memory state. Previously the local backend would just copy that into
the transient storage of the state manager, but for most state storage
implementations that doesn't really do anything useful because it just
makes another copy of the state in memory.

We originally added this hook mechanism with the intent of making
Terraform _persist_ the state each time, but we backed that out after
finding that it was a bit too aggressive and was making the state snapshot
history much harder to use in storage systems that can preserve historical
snapshots.

However, sometimes Terraform gets killed mid-apply for whatever reason and
in our previous implementation that meant always losing that transient
state, forcing the user to edit the state manually (or use "import") to
recover a useful state.

In an attempt at finding a sweet spot between these extremes, here we
change the rule so that if an apply runs for longer than 20 seconds then
we'll try to persist the state to the backend in an update that arrives
at least 20 seconds after the first update, and then again for each
additional 20 second period as long as Terraform keeps announcing new
state snapshots.

This also introduces a special interruption mode where if the apply phase
gets interrupted by SIGINT (or equivalent) then the local backend will
try to persist the state immediately in anticipation of a
possibly-imminent SIGKILL, and will then immediately persist any
subsequent state update that arrives until the apply phase is complete.
After interruption Terraform will not start any new operations and will
instead just let any already-running operations run to completion, and so
this will persist the state once per resource instance that is able to
complete before being killed.

This does mean that now long-running applies will generate intermediate
state snapshots where they wouldn't before, but there should still be
considerably fewer snapshots than were created when we were persisting
for each individual state change. We can adjust the 20 second interval
in future commits if we find that this spot isn't as sweet as first
assumed.
This commit is contained in:
Martin Atkins
2023-02-13 17:38:24 -08:00
parent 47fd019dbb
commit f0de9b60c1
10 changed files with 401 additions and 0 deletions

View File

@@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"log"
"time"
"github.com/hashicorp/terraform/internal/backend"
"github.com/hashicorp/terraform/internal/command/views"
@@ -74,6 +75,10 @@ func (b *Local) opApply(
op.ReportResult(runningOp, diags)
return
}
// stateHook uses schemas for when it periodically persists state to the
// persistent storage backend.
stateHook.Schemas = schemas
stateHook.PersistInterval = 20 * time.Second // arbitrary interval that's hopefully a sweet spot
var plan *plans.Plan
// If we weren't given a plan, then we refresh/plan

View File

@@ -1,7 +1,9 @@
package local
import (
"log"
"sync"
"time"
"github.com/hashicorp/terraform/internal/states"
"github.com/hashicorp/terraform/internal/states/statemgr"
@@ -15,6 +17,21 @@ type StateHook struct {
sync.Mutex
StateMgr statemgr.Writer
// If PersistInterval is nonzero then for any new state update after
// the duration has elapsed we'll try to persist a state snapshot
// to the persistent backend too.
// That's only possible if field Schemas is valid, because the
// StateMgr.PersistState function for some backends needs schemas.
PersistInterval time.Duration
// Schemas are the schemas to use when persisting state due to
// PersistInterval. This is ignored if PersistInterval is zero,
// and PersistInterval is ignored if this is nil.
Schemas *terraform.Schemas
lastPersist time.Time
forcePersist bool
}
var _ terraform.Hook = (*StateHook)(nil)
@@ -23,11 +40,56 @@ func (h *StateHook) PostStateUpdate(new *states.State) (terraform.HookAction, er
h.Lock()
defer h.Unlock()
if h.lastPersist.IsZero() {
// The first PostStateUpdate starts the clock for intermediate
// calls to PersistState.
h.lastPersist = time.Now()
}
if h.StateMgr != nil {
if err := h.StateMgr.WriteState(new); err != nil {
return terraform.HookActionHalt, err
}
if mgrPersist, ok := h.StateMgr.(statemgr.Persister); ok && h.PersistInterval != 0 && h.Schemas != nil {
if h.forcePersist || time.Since(h.lastPersist) >= h.PersistInterval {
err := mgrPersist.PersistState(h.Schemas)
if err != nil {
return terraform.HookActionHalt, err
}
h.lastPersist = time.Now()
}
}
}
return terraform.HookActionContinue, nil
}
// Stopping reacts to Terraform being asked to stop by making a best-effort
// attempt to persist the most recently written state snapshot right away.
//
// A stop request may be followed shortly by a hard kill if Terraform does
// not exit quickly enough, so persisting now is intended to reduce the
// recovery work a user faces if that happens mid-apply.
//
// Nothing is done unless StateMgr also implements statemgr.Persister and
// Schemas is non-nil. The hook's mutex is held for the whole call.
func (h *StateHook) Stopping() {
	h.Lock()
	defer h.Unlock()

	persister, canPersist := h.StateMgr.(statemgr.Persister)
	if !canPersist || h.Schemas == nil {
		return
	}

	if err := persister.PersistState(h.Schemas); err != nil {
		// This hook has no way to influence what Terraform Core does next,
		// and persisting here is only best effort, so the failure is
		// recorded in the log for debugging instead of being propagated.
		log.Printf("[ERROR] Failed to persist state after interruption: %s", err)
	}

	// From now on PostStateUpdate should stop waiting for PersistInterval
	// to elapse and instead try to persist each new snapshot it sees, so
	// that as few created or updated objects as possible are lost track of.
	// Terraform Core won't start any new operations once stopped, so at
	// most one more PostStateUpdate call per already-active request is
	// expected.
	h.forcePersist = true
}

View File

@@ -1,8 +1,12 @@
package local
import (
"fmt"
"testing"
"time"
"github.com/google/go-cmp/cmp"
"github.com/hashicorp/terraform/internal/states"
"github.com/hashicorp/terraform/internal/states/statemgr"
"github.com/hashicorp/terraform/internal/terraform"
)
@@ -27,3 +31,125 @@ func TestStateHook(t *testing.T) {
t.Fatalf("bad state: %#v", is.State())
}
}
// TestStateHookStopping exercises StateHook's interval-based persistence and
// the forced-persist mode that Stopping switches on, using
// testPersistentState to record which state manager methods get called.
func TestStateHookStopping(t *testing.T) {
	is := &testPersistentState{}
	hook := &StateHook{
		StateMgr:        is,
		Schemas:         &terraform.Schemas{},
		PersistInterval: 4 * time.Hour,
		lastPersist:     time.Now(),
	}

	s := statemgr.TestFullInitialState()

	// postUpdate announces s to the hook and fails the test immediately if
	// the hook returns an error or any action other than "continue", so
	// that a hook failure is reported as such instead of surfacing later
	// as a confusing state mismatch.
	postUpdate := func() {
		t.Helper()
		action, err := hook.PostStateUpdate(s)
		if err != nil {
			t.Fatalf("unexpected error from PostStateUpdate: %s", err)
		}
		if got, want := action, terraform.HookActionContinue; got != want {
			t.Fatalf("wrong hookaction %#v; want %#v", got, want)
		}
	}

	postUpdate()
	if is.Written == nil || !is.Written.Equal(s) {
		t.Fatalf("mismatching state written")
	}
	if is.Persisted != nil {
		t.Fatalf("persisted too soon")
	}

	// We'll now force lastPersist to be long enough ago that persisting
	// should be due on the next call.
	hook.lastPersist = time.Now().Add(-5 * time.Hour)
	postUpdate()
	if is.Written == nil || !is.Written.Equal(s) {
		t.Fatalf("mismatching state written")
	}
	if is.Persisted == nil || !is.Persisted.Equal(s) {
		t.Fatalf("mismatching state persisted")
	}
	postUpdate()
	if is.Written == nil || !is.Written.Equal(s) {
		t.Fatalf("mismatching state written")
	}
	if is.Persisted == nil || !is.Persisted.Equal(s) {
		t.Fatalf("mismatching state persisted")
	}

	gotLog := is.CallLog
	wantLog := []string{
		// Initial call before we reset lastPersist
		"WriteState",

		// Write and then persist after we reset lastPersist
		"WriteState",
		"PersistState",

		// Final call when persisting wasn't due yet.
		"WriteState",
	}
	if diff := cmp.Diff(wantLog, gotLog); diff != "" {
		t.Fatalf("wrong call log so far\n%s", diff)
	}

	// We'll reset the log now before we try seeing what happens after
	// we use "Stopping".
	is.CallLog = is.CallLog[:0]
	is.Persisted = nil

	hook.Stopping()
	if is.Persisted == nil || !is.Persisted.Equal(s) {
		t.Fatalf("mismatching state persisted")
	}

	is.Persisted = nil
	postUpdate()
	if is.Persisted == nil || !is.Persisted.Equal(s) {
		t.Fatalf("mismatching state persisted")
	}
	is.Persisted = nil
	postUpdate()
	if is.Persisted == nil || !is.Persisted.Equal(s) {
		t.Fatalf("mismatching state persisted")
	}

	gotLog = is.CallLog
	wantLog = []string{
		// "Stopping" immediately persisted
		"PersistState",

		// PostStateUpdate then writes and persists on every call,
		// on the assumption that we're now bailing out after
		// being cancelled and trying to save as much state as we can.
		"WriteState",
		"PersistState",
		"WriteState",
		"PersistState",
	}
	if diff := cmp.Diff(wantLog, gotLog); diff != "" {
		t.Fatalf("wrong call log once in stopping mode\n%s", diff)
	}
}
// testPersistentState is a minimal in-memory state manager for tests. It
// remembers the snapshots handed to it and keeps a log of which of its
// methods were called, so tests can assert on both.
type testPersistentState struct {
	// CallLog holds the name of each recorded method call, in call order.
	CallLog []string

	// Written is the state most recently passed to WriteState.
	Written *states.State

	// Persisted is the value Written had at the most recent successful
	// PersistState call.
	Persisted *states.State
}

// Compile-time checks that testPersistentState provides both the transient
// write and the persist halves of a state manager.
var (
	_ statemgr.Writer    = (*testPersistentState)(nil)
	_ statemgr.Persister = (*testPersistentState)(nil)
)
// WriteState remembers state as the latest written snapshot and appends
// "WriteState" to the call log. It always returns nil.
func (sm *testPersistentState) WriteState(state *states.State) error {
	sm.Written = state
	sm.CallLog = append(sm.CallLog, "WriteState")
	return nil
}
// PersistState marks the latest written snapshot as persisted and appends
// "PersistState" to the call log.
//
// It returns an error, without recording anything, when schemas is nil.
func (sm *testPersistentState) PersistState(schemas *terraform.Schemas) error {
	if schemas == nil {
		return fmt.Errorf("no schemas")
	}

	sm.Persisted = sm.Written
	sm.CallLog = append(sm.CallLog, "PersistState")
	return nil
}