backend/local: Periodically persist intermediate state snapshots

Terraform Core emits a hook event every time it writes a change into the in-memory state. Previously the local backend would just copy that into the transient storage of the state manager, but for most state storage implementations that doesn't really do anything useful because it just makes another copy of the state in memory. We originally added this hook mechanism with the intent of making Terraform _persist_ the state each time, but we backed that out after finding that it was a bit too aggressive and was making the state snapshot history much harder to use in storage systems that can preserve historical snapshots. However, sometimes Terraform gets killed mid-apply for whatever reason and in our previous implementation that meant always losing that transient state, forcing the user to edit the state manually (or use "import") to recover a useful state. In an attempt at finding a sweet spot between these extremes, here we change the rule so that if an apply runs for longer than 20 seconds then we'll try to persist the state to the backend in an update that arrives at least 20 seconds after the first update, and then again for each additional 20 second period as long as Terraform keeps announcing new state snapshots. This also introduces a special interruption mode where if the apply phase gets interrupted by SIGINT (or equivalent) then the local backend will try to persist the state immediately in anticipation of a possibly-imminent SIGKILL, and will then immediately persist any subsequent state update that arrives until the apply phase is complete. After interruption Terraform will not start any new operations and will instead just let any already-running operations run to completion, and so this will persist the state once per resource instance that is able to complete before being killed. This does mean that now long-running applies will generate intermediate state snapshots where they wouldn't before, but there should still be considerably fewer snapshots than were created when we were persisting for each individual state change. We can adjust the 20 second interval in future commits if we find that this spot isn't as sweet as first assumed.
2025-12-25 01:00:16 -05:00 · 2023-02-13 17:38:24 -08:00
parent 47fd019dbb
commit f0de9b60c1
10 changed files with 401 additions and 0 deletions
--- a/internal/backend/local/backend_apply.go
+++ b/internal/backend/local/backend_apply.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"log"
+	"time"

 	"github.com/hashicorp/terraform/internal/backend"
 	"github.com/hashicorp/terraform/internal/command/views"
@@ -74,6 +75,10 @@ func (b *Local) opApply(
 		op.ReportResult(runningOp, diags)
 		return
 	}
+	// stateHook uses schemas for when it periodically persists state to the
+	// persistent storage backend.
+	stateHook.Schemas = schemas
+	stateHook.PersistInterval = 20 * time.Second // arbitrary interval that's hopefully a sweet spot

 	var plan *plans.Plan
 	// If we weren't given a plan, then we refresh/plan
--- a/internal/backend/local/hook_state.go
+++ b/internal/backend/local/hook_state.go
@@ -1,7 +1,9 @@
 package local

 import (
+	"log"
 	"sync"
+	"time"

 	"github.com/hashicorp/terraform/internal/states"
 	"github.com/hashicorp/terraform/internal/states/statemgr"
@@ -15,6 +17,21 @@ type StateHook struct {
 	sync.Mutex

 	StateMgr statemgr.Writer
+
+	// If PersistInterval is nonzero then for any new state update after
+	// the duration has elapsed we'll try to persist a state snapshot
+	// to the persistent backend too.
+	// That's only possible if field Schemas is valid, because the
+	// StateMgr.PersistState function for some backends needs schemas.
+	PersistInterval time.Duration
+
+	// Schemas are the schemas to use when persisting state due to
+	// PersistInterval. This is ignored if PersistInterval is zero,
+	// and PersistInterval is ignored if this is nil.
+	Schemas *terraform.Schemas
+
+	lastPersist  time.Time
+	forcePersist bool
 }

 var _ terraform.Hook = (*StateHook)(nil)
@@ -23,11 +40,56 @@ func (h *StateHook) PostStateUpdate(new *states.State) (terraform.HookAction, er
 	h.Lock()
 	defer h.Unlock()

+	if h.lastPersist.IsZero() {
+		// The first PostStateUpdate starts the clock for intermediate
+		// calls to PersistState.
+		h.lastPersist = time.Now()
+	}
+
 	if h.StateMgr != nil {
 		if err := h.StateMgr.WriteState(new); err != nil {
 			return terraform.HookActionHalt, err
 		}
+		if mgrPersist, ok := h.StateMgr.(statemgr.Persister); ok && h.PersistInterval != 0 && h.Schemas != nil {
+			if h.forcePersist || time.Since(h.lastPersist) >= h.PersistInterval {
+				err := mgrPersist.PersistState(h.Schemas)
+				if err != nil {
+					return terraform.HookActionHalt, err
+				}
+				h.lastPersist = time.Now()
+			}
+		}
 	}

 	return terraform.HookActionContinue, nil
 }
+
+func (h *StateHook) Stopping() {
+	h.Lock()
+	defer h.Unlock()
+
+	// If Terraform has been asked to stop then that might mean that a hard
+	// kill signal will follow shortly in case Terraform doesn't stop
+	// quickly enough, and so we'll try to persist the latest state
+	// snapshot in the hope that it'll give the user less recovery work to
+	// do if they _do_ subsequently hard-kill Terraform during an apply.
+
+	if mgrPersist, ok := h.StateMgr.(statemgr.Persister); ok && h.Schemas != nil {
+		err := mgrPersist.PersistState(h.Schemas)
+		if err != nil {
+			// This hook can't affect Terraform Core's ongoing behavior,
+			// but it's a best effort thing anyway so we'll just emit a
+			// log to aid with debugging.
+			log.Printf("[ERROR] Failed to persist state after interruption: %s", err)
+		}
+
+		// While we're in the stopping phase we'll try to persist every
+		// new state update to maximize every opportunity we get to avoid
+		// losing track of objects that have been created or updated.
+		// Terraform Core won't start any new operations after it's been
+		// stopped, so at most we should see one more PostStateUpdate
+		// call per already-active request.
+		h.forcePersist = true
+	}
+
+}
--- a/internal/backend/local/hook_state_test.go
+++ b/internal/backend/local/hook_state_test.go
@@ -1,8 +1,12 @@
 package local

 import (
+	"fmt"
 	"testing"
+	"time"

+	"github.com/google/go-cmp/cmp"
+	"github.com/hashicorp/terraform/internal/states"
 	"github.com/hashicorp/terraform/internal/states/statemgr"
 	"github.com/hashicorp/terraform/internal/terraform"
 )
@@ -27,3 +31,125 @@ func TestStateHook(t *testing.T) {
 		t.Fatalf("bad state: %#v", is.State())
 	}
 }
+
+func TestStateHookStopping(t *testing.T) {
+	is := &testPersistentState{}
+	hook := &StateHook{
+		StateMgr:        is,
+		Schemas:         &terraform.Schemas{},
+		PersistInterval: 4 * time.Hour,
+		lastPersist:     time.Now(),
+	}
+
+	s := statemgr.TestFullInitialState()
+	action, err := hook.PostStateUpdate(s)
+	if err != nil {
+		t.Fatalf("unexpected error from PostStateUpdate: %s", err)
+	}
+	if got, want := action, terraform.HookActionContinue; got != want {
+		t.Fatalf("wrong hookaction %#v; want %#v", got, want)
+	}
+	if is.Written == nil || !is.Written.Equal(s) {
+		t.Fatalf("mismatching state written")
+	}
+	if is.Persisted != nil {
+		t.Fatalf("persisted too soon")
+	}
+
+	// We'll now force lastPersist to be long enough ago that persisting
+	// should be due on the next call.
+	hook.lastPersist = time.Now().Add(-5 * time.Hour)
+	hook.PostStateUpdate(s)
+	if is.Written == nil || !is.Written.Equal(s) {
+		t.Fatalf("mismatching state written")
+	}
+	if is.Persisted == nil || !is.Persisted.Equal(s) {
+		t.Fatalf("mismatching state persisted")
+	}
+	hook.PostStateUpdate(s)
+	if is.Written == nil || !is.Written.Equal(s) {
+		t.Fatalf("mismatching state written")
+	}
+	if is.Persisted == nil || !is.Persisted.Equal(s) {
+		t.Fatalf("mismatching state persisted")
+	}
+
+	gotLog := is.CallLog
+	wantLog := []string{
+		// Initial call before we reset lastPersist
+		"WriteState",
+
+		// Write and then persist after we reset lastPersist
+		"WriteState",
+		"PersistState",
+
+		// Final call when persisting wasn't due yet.
+		"WriteState",
+	}
+	if diff := cmp.Diff(wantLog, gotLog); diff != "" {
+		t.Fatalf("wrong call log so far\n%s", diff)
+	}
+
+	// We'll reset the log now before we try seeing what happens after
+	// we use "Stopped".
+	is.CallLog = is.CallLog[:0]
+	is.Persisted = nil
+
+	hook.Stopping()
+	if is.Persisted == nil || !is.Persisted.Equal(s) {
+		t.Fatalf("mismatching state persisted")
+	}
+
+	is.Persisted = nil
+	hook.PostStateUpdate(s)
+	if is.Persisted == nil || !is.Persisted.Equal(s) {
+		t.Fatalf("mismatching state persisted")
+	}
+	is.Persisted = nil
+	hook.PostStateUpdate(s)
+	if is.Persisted == nil || !is.Persisted.Equal(s) {
+		t.Fatalf("mismatching state persisted")
+	}
+
+	gotLog = is.CallLog
+	wantLog = []string{
+		// "Stopping" immediately persisted
+		"PersistState",
+
+		// PostStateUpdate then writes and persists on every call,
+		// on the assumption that we're now bailing out after
+		// being cancelled and trying to save as much state as we can.
+		"WriteState",
+		"PersistState",
+		"WriteState",
+		"PersistState",
+	}
+	if diff := cmp.Diff(wantLog, gotLog); diff != "" {
+		t.Fatalf("wrong call log once in stopping mode\n%s", diff)
+	}
+}
+
+type testPersistentState struct {
+	CallLog []string
+
+	Written   *states.State
+	Persisted *states.State
+}
+
+var _ statemgr.Writer = (*testPersistentState)(nil)
+var _ statemgr.Persister = (*testPersistentState)(nil)
+
+func (sm *testPersistentState) WriteState(state *states.State) error {
+	sm.CallLog = append(sm.CallLog, "WriteState")
+	sm.Written = state
+	return nil
+}
+
+func (sm *testPersistentState) PersistState(schemas *terraform.Schemas) error {
+	if schemas == nil {
+		return fmt.Errorf("no schemas")
+	}
+	sm.CallLog = append(sm.CallLog, "PersistState")
+	sm.Persisted = sm.Written
+	return nil
+}