open-vault/physical/raft/fsm_test.go

package raft

import (
	"context"
	"fmt"
	"io/ioutil"
	"math/rand"
	"os"
	"sort"
	"testing"

	"github.com/go-test/deep"
	proto "github.com/golang/protobuf/proto"
	hclog "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/vault/sdk/physical"
)
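
// getFSM is a test helper that builds an FSM rooted in a fresh temporary
// directory, with a trace-level logger. The caller is responsible for
// removing the returned directory.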
func getFSM(t testing.TB) (*FSM, string) {
	raftDir, err := ioutil.TempDir("", "vault-raft-")
	if err != nil {
		t.Fatal(err)
	}
	t.Logf("raft dir: %s", raftDir)

	logger := hclog.New(&hclog.LoggerOptions{
		Name:  "raft",
		Level: hclog.Trace,
	})
	fsm, err := NewFSM(raftDir, "", logger)
	if err != nil {
		t.Fatal(err)
	}

	return fsm, raftDir
}
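
// TestFSM_Batching feeds ApplyBatch randomly sized batches of raft logs,
// mixing command entries with occasional configuration entries, then
// verifies that every written key is listable and that the FSM reports
// the expected latest index, term, and configuration.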
func TestFSM_Batching(t *testing.T) {
	fsm, dir := getFSM(t)
	defer os.RemoveAll(dir)

	var index uint64
	var term uint64 = 1

	// getLog returns a raft log entry for the given index along with the
	// number of keys the entry will write. Roughly 20% of the time it
	// returns a configuration-change entry (which writes no keys) and bumps
	// the term; otherwise it returns a command entry carrying up to nine
	// put operations.
	getLog := func(i uint64) (int, *raft.Log) {
		if rand.Intn(10) >= 8 {
			term++
			return 0, &raft.Log{
				Index: i,
				Term:  term,
				Type:  raft.LogConfiguration,
				Data: raft.EncodeConfiguration(raft.Configuration{
					Servers: []raft.Server{
						{
							Address: raft.ServerAddress("test"),
							ID:      raft.ServerID("test"),
						},
					},
				}),
			}
		}

		command := &LogData{
			Operations: make([]*LogOperation, rand.Intn(10)),
		}
		for j := range command.Operations {
			command.Operations[j] = &LogOperation{
				OpType: putOp,
				Key:    fmt.Sprintf("key-%d-%d", i, j),
				Value:  []byte(fmt.Sprintf("value-%d-%d", i, j)),
			}
		}

		commandBytes, err := proto.Marshal(command)
		if err != nil {
			t.Fatal(err)
		}
		return len(command.Operations), &raft.Log{
			Index: i,
			Term:  term,
			Type:  raft.LogCommand,
			Data:  commandBytes,
		}
	}
	// Apply 100 batches of random size, checking that every log entry in
	// each batch yields an *FSMApplyResponse.
	totalKeys := 0
	for i := 0; i < 100; i++ {
		batchSize := rand.Intn(64)
		batch := make([]*raft.Log, batchSize)
		for j := 0; j < batchSize; j++ {
			var keys int
			index++
			keys, batch[j] = getLog(index)
			totalKeys += keys
		}

		resp := fsm.ApplyBatch(batch)
		if len(resp) != batchSize {
			t.Fatalf("incorrect response length: got %d expected %d", len(resp), batchSize)
		}
		for _, r := range resp {
			if _, ok := r.(*FSMApplyResponse); !ok {
				t.Fatal("bad response type")
			}
		}
	}
	// Every put operation should now be visible, and the FSM should have
	// recorded the latest index, term, and (if any configuration entries
	// were applied) the latest configuration.
	keys, err := fsm.List(context.Background(), "")
	if err != nil {
		t.Fatal(err)
	}
	if len(keys) != totalKeys {
		t.Fatalf("incorrect number of keys: got %d expected %d", len(keys), totalKeys)
	}

	latestIndex, latestConfig := fsm.LatestState()
	if latestIndex.Index != index {
		t.Fatalf("bad latest index: got %d expected %d", latestIndex.Index, index)
	}
	if latestIndex.Term != term {
		t.Fatalf("bad latest term: got %d expected %d", latestIndex.Term, term)
	}
	if latestConfig == nil && term > 1 {
		t.Fatal("config wasn't updated")
	}
}
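
// TestFSM_List inserts entries under "foo/<k>/..." in random order and
// checks that listing the "foo/" prefix returns exactly the expected set
// of child prefixes.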
func TestFSM_List(t *testing.T) {
	fsm, dir := getFSM(t)
	defer os.RemoveAll(dir)

	ctx := context.Background()
	count := 100
	keys := rand.Perm(count)
	var sorted []string
	for _, k := range keys {
		// Each k yields two keys but a single child prefix "<k>/".
		err := fsm.Put(ctx, &physical.Entry{Key: fmt.Sprintf("foo/%d/bar", k)})
		if err != nil {
			t.Fatal(err)
		}
		err = fsm.Put(ctx, &physical.Entry{Key: fmt.Sprintf("foo/%d/baz", k)})
		if err != nil {
			t.Fatal(err)
		}
		sorted = append(sorted, fmt.Sprintf("%d/", k))
	}
	sort.Strings(sorted)

	got, err := fsm.List(ctx, "foo/")
	if err != nil {
		t.Fatal(err)
	}
	sort.Strings(got)
	if diff := deep.Equal(sorted, got); len(diff) > 0 {
		t.Fatal(diff)
	}
}