open-vault/physical/raft/fsm_test.go
Vishal Nayak 3e55e79a3f
Autopilot: Server Stabilization, State and Dead Server Cleanup (#10856)
* k8s doc: update for 0.9.1 and 0.8.0 releases (#10825)

* k8s doc: update for 0.9.1 and 0.8.0 releases

* Update website/content/docs/platform/k8s/helm/configuration.mdx

Co-authored-by: Theron Voran <tvoran@users.noreply.github.com>

Co-authored-by: Theron Voran <tvoran@users.noreply.github.com>

* Autopilot initial commit

* Move autopilot related backend implementations to its own file

* Abstract promoter creation

* Add nil check for health

* Add server state oss no-ops

* Config ext stub for oss

* Make way for non-voters

* s/health/state

* s/ReadReplica/NonVoter

* Add synopsis and description

* Remove struct tags from AutopilotConfig

* Use var for config storage path

* Handle nil-config when reading

* Enable testing autopilot by using inmem cluster

* First passing test

* Only report the server as known if it is present in raft config

* Autopilot defaults to on for all existing and new clusters

* Add locking to some functions

* Persist initial config

* Clarify the command usage doc

* Add health metric for each node

* Fix audit logging issue

* Don't set DisablePerformanceStandby to true in test

* Use node id label for health metric

* Log updates to autopilot config

* Less aggressively consume config loading failures

* Return a mutable config

* Return early from known servers if raft config is unable to be pulled

* Update metrics name

* Reduce log level for potentially noisy log

* Add knob to disable autopilot

* Don't persist if default config is in use

* Autopilot: Dead server cleanup (#10857)

* Dead server cleanup

* Initialize channel in any case

* Fix a bunch of tests

* Fix panic

* Add follower locking in heartbeat tracker

* Add LastContactFailureThreshold to config

* Add log when marking node as dead

* Update follower state locking in heartbeat tracker

* Avoid follower states being nil

* Pull test to its own file

* Add execution status to state response

* Optionally enable autopilot in some tests

* Updates

* Added API function to fetch autopilot configuration

* Add test for default autopilot configuration

* Configuration tests

* Add State API test

* Update test

* Added TestClusterOptions.PhysicalFactoryConfig

* Update locking

* Adjust locking in heartbeat tracker

* s/last_contact_failure_threshold/left_server_last_contact_threshold

* Add disabling autopilot as a core config option

* Disable autopilot in some tests

* s/left_server_last_contact_threshold/dead_server_last_contact_threshold

* Set the lastheartbeat of followers to now when setting up active node

* Don't use config defaults from CLI command

* Remove config file support

* Remove HCL test as well

* Persist only supplied config; merge supplied config with default to operate

* Use pointer to structs for storing follower information

* Test update

* Retrieve non voter status from configbucket and set it up when a node comes up

* Manage desired suffrage

* Consider bucket being created already

* Move desired suffrage to its own entry

* s/DesiredSuffrageKey/LocalNodeConfigKey

* s/witnessSuffrage/recordSuffrage

* Fix test compilation

* Handle local node config post a snapshot install

* Commit to storage first; then record suffrage in fsm

* No need to handle the local node config being nil, post snapshot restore

* Reconcile autopilot config when a new leader takes over duty

* Grab fsm lock when recording suffrage

* s/Suffrage/DesiredSuffrage in FollowerState

* Instantiate autopilot only in leader

* Default to old ways in more scenarios

* Make API gracefully handle 404

* Address some feedback

* Make IsDead an atomic.Value

* Simplify follower heartbeat tracking

* Use uber.atomic

* Don't have multiple causes for having autopilot disabled

* Don't remove node from follower states if we fail to remove the dead server

* Autopilot server removals map (#11019)

* Don't remove node from follower states if we fail to remove the dead server

* Use map to track dead server removals

* Use lock and map

* Use delegate lock

* Adjust when to remove entry from map

* Only hold the lock while accessing map

* Fix race

* Don't set default min_quorum

* Fix test

* Ensure follower states is not nil before starting autopilot

* Fix race

Co-authored-by: Jason O'Donnell <2160810+jasonodonnell@users.noreply.github.com>
Co-authored-by: Theron Voran <tvoran@users.noreply.github.com>
2021-03-03 13:59:50 -05:00

162 lines
3.3 KiB
Go

package raft
import (
"context"
fmt "fmt"
"io/ioutil"
"math/rand"
"os"
"sort"
"testing"
"github.com/go-test/deep"
proto "github.com/golang/protobuf/proto"
hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/raft"
"github.com/hashicorp/vault/sdk/physical"
)
// getFSM creates an FSM rooted in a freshly created temporary directory and
// returns it together with that directory's path. The caller owns the
// directory and is expected to remove it when the test finishes. Any setup
// failure aborts the test through t.Fatal.
func getFSM(t testing.TB) (*FSM, string) {
	// Attribute failures reported from here to the calling test.
	t.Helper()

	raftDir, err := ioutil.TempDir("", "vault-raft-")
	if err != nil {
		t.Fatal(err)
	}
	t.Logf("raft dir: %s", raftDir)

	logger := hclog.New(&hclog.LoggerOptions{
		Name:  "raft",
		Level: hclog.Trace,
	})

	fsm, err := NewFSM(raftDir, "", logger)
	if err != nil {
		// The caller never receives raftDir on this path, so its own cleanup
		// cannot run; remove the directory here instead of leaking it.
		os.RemoveAll(raftDir)
		t.Fatal(err)
	}

	return fsm, raftDir
}
// TestFSM_Batching pushes 100 randomly sized batches (0 to 63 logs each) of
// mixed command and configuration logs through ApplyBatch. It checks that
// every log gets a response of the expected type, that the number of stored
// keys matches the number of put operations issued, and that the FSM's latest
// index, term and configuration reflect what was applied.
func TestFSM_Batching(t *testing.T) {
	fsm, dir := getFSM(t)
	defer os.RemoveAll(dir)

	var (
		index uint64
		term  uint64 = 1
	)

	// newLog builds the log entry for position idx and reports how many keys
	// that entry writes (always zero for configuration entries).
	newLog := func(idx uint64) (int, *raft.Log) {
		// Two draws out of ten produce a configuration change, each one under
		// a new term.
		if rand.Intn(10) >= 8 {
			term++
			servers := []raft.Server{
				{
					Address: raft.ServerAddress("test"),
					ID:      raft.ServerID("test"),
				},
			}
			return 0, &raft.Log{
				Index: idx,
				Term:  term,
				Type:  raft.LogConfiguration,
				Data:  raft.EncodeConfiguration(raft.Configuration{Servers: servers}),
			}
		}

		// Otherwise emit a command carrying between 0 and 9 put operations.
		ops := make([]*LogOperation, rand.Intn(10))
		for n := range ops {
			ops[n] = &LogOperation{
				OpType: putOp,
				Key:    fmt.Sprintf("key-%d-%d", idx, n),
				Value:  []byte(fmt.Sprintf("value-%d-%d", idx, n)),
			}
		}

		payload, err := proto.Marshal(&LogData{Operations: ops})
		if err != nil {
			t.Fatal(err)
		}

		return len(ops), &raft.Log{
			Index: idx,
			Term:  term,
			Type:  raft.LogCommand,
			Data:  payload,
		}
	}

	expectedKeys := 0
	for round := 0; round < 100; round++ {
		size := rand.Intn(64)
		batch := make([]*raft.Log, 0, size)
		for len(batch) < size {
			index++
			written, entry := newLog(index)
			expectedKeys += written
			batch = append(batch, entry)
		}

		responses := fsm.ApplyBatch(batch)
		if len(responses) != size {
			t.Fatalf("incorrect response length: got %d expected %d", len(responses), size)
		}

		for _, r := range responses {
			switch r.(type) {
			case *FSMApplyResponse:
				// Expected response type; nothing further to check.
			default:
				t.Fatal("bad response type")
			}
		}
	}

	stored, err := fsm.List(context.Background(), "")
	if err != nil {
		t.Fatal(err)
	}
	if len(stored) != expectedKeys {
		t.Fatalf("incorrect number of keys: got %d expected %d", len(stored), expectedKeys)
	}

	last, cfg := fsm.LatestState()
	if last.Index != index {
		t.Fatalf("bad latest index: got %d expected %d", last.Index, index)
	}
	if last.Term != term {
		t.Fatalf("bad latest term: got %d expected %d", last.Term, term)
	}
	// The term only advances when a configuration log was generated, so a
	// bumped term with no recorded configuration means the update was lost.
	if term > 1 && cfg == nil {
		t.Fatal("config wasn't updated")
	}
}
// TestFSM_List stores two entries ("bar" and "baz") under each of 100
// "foo/<n>/" prefixes, inserted in random order, and verifies that listing
// "foo/" yields exactly one "<n>/" entry per prefix.
func TestFSM_List(t *testing.T) {
	fsm, dir := getFSM(t)
	defer os.RemoveAll(dir)

	ctx := context.Background()

	const count = 100
	expected := make([]string, 0, count)
	for _, n := range rand.Perm(count) {
		for _, leaf := range []string{"bar", "baz"} {
			entry := &physical.Entry{Key: fmt.Sprintf("foo/%d/%s", n, leaf)}
			if err := fsm.Put(ctx, entry); err != nil {
				t.Fatal(err)
			}
		}
		expected = append(expected, fmt.Sprintf("%d/", n))
	}
	sort.Strings(expected)

	listed, err := fsm.List(ctx, "foo/")
	if err != nil {
		t.Fatal(err)
	}
	// Sort both sides so the comparison does not depend on listing order.
	sort.Strings(listed)

	if diff := deep.Equal(expected, listed); len(diff) != 0 {
		t.Fatal(diff)
	}
}