autopilot: assume nodes we haven't received heartbeats from are running the same version as we are (#17019)

OSS parts of ent PR #3172: assume nodes we haven't received heartbeats from are running the same version as we are.  Failing to provide a version/upgrade_version will result in Autopilot (on ent) demoting those unversioned nodes to non-voters until we receive a heartbeat from them.
This commit is contained in:
Nick Cabatoff 2022-09-06 14:49:04 -04:00 committed by GitHub
parent a034ebfd27
commit 5db952eada
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 57 additions and 1 deletions

3
changelog/17019.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:bug
storage/raft: Nodes no longer get demoted to nonvoter if we don't know their version due to missing heartbeats.
```

View File

@ -126,7 +126,7 @@ func (l *InmemLayer) Dial(addr string, timeout time.Duration, tlsConfig *tls.Con
}
if l.logger.IsDebug() {
l.logger.Debug("dailing connection", "node", l.addr, "remote", addr, "alpn", alpn)
l.logger.Debug("dialing connection", "node", l.addr, "remote", addr, "alpn", alpn)
}
if connectionCh != nil {

View File

@ -16,6 +16,7 @@ import (
"github.com/hashicorp/vault/helper/testhelpers"
"github.com/hashicorp/vault/helper/testhelpers/teststorage"
"github.com/hashicorp/vault/physical/raft"
"github.com/hashicorp/vault/sdk/version"
"github.com/hashicorp/vault/vault"
"github.com/kr/pretty"
testingintf "github.com/mitchellh/go-testing-interface"
@ -412,3 +413,53 @@ func join(t *testing.T, core *vault.TestClusterCore, client *api.Client, cluster
time.Sleep(1 * time.Second)
cluster.UnsealCore(t, core)
}
// TestRaft_VotersStayVoters ensures that autopilot doesn't demote a node just
// because it hasn't been heard from in some time.
//
// The test builds a three-node autopilot-enabled raft cluster in which every
// node reports the current version, waits until no server is reported as a
// non-voter, seals the leader, and then verifies that autopilot state still
// contains no non-voters.
func TestRaft_VotersStayVoters(t *testing.T) {
	cluster := raftCluster(t, &RaftClusterOpts{
		DisableFollowerJoins: true,
		InmemCluster:         true,
		EnableAutopilot:      true,
		PhysicalFactoryConfig: map[string]interface{}{
			"performance_multiplier": "5",
		},
		VersionMap: map[int]string{
			0: version.Version,
			1: version.Version,
			2: version.Version,
		},
	})
	defer cluster.Cleanup()
	testhelpers.WaitForActiveNode(t, cluster)

	client := cluster.Cores[0].Client

	config, err := client.Sys().RaftAutopilotConfiguration()
	require.NoError(t, err)
	joinAndStabilizeAndPromote(t, cluster.Cores[1], client, cluster, config, "core-1", 2)
	joinAndStabilizeAndPromote(t, cluster.Cores[2], client, cluster, config, "core-2", 3)

	// errIfNonVotersExist queries autopilot state through the current value
	// of client (the closure captures the variable, so the reassignment below
	// is observed) and returns an error naming the first server whose status
	// is "non-voter". A failure to fetch the state aborts the test.
	errIfNonVotersExist := func() error {
		t.Helper()
		resp, err := client.Sys().RaftAutopilotState()
		if err != nil {
			t.Fatal(err)
		}
		for k, v := range resp.Servers {
			if v.Status == "non-voter" {
				return fmt.Errorf("node %q is a non-voter", k)
			}
		}
		return nil
	}
	testhelpers.RetryUntil(t, 10*time.Second, errIfNonVotersExist)

	// Core0 is the leader, sealing it will both cause an election - and the
	// new leader won't have seen any heartbeats initially - and create a "down"
	// node that won't be sending heartbeats.
	testhelpers.EnsureCoreSealed(t, cluster.Cores[0])
	// NOTE(review): the fixed pause presumably gives autopilot on the new
	// leader time to act on the missing heartbeats - confirm the duration.
	time.Sleep(30 * time.Second)

	// Core0 is sealed, so ask one of the surviving nodes instead.
	client = cluster.Cores[1].Client
	// The returned error must be asserted; discarding it would let the test
	// pass even when a node has been demoted.
	require.NoError(t, errIfNonVotersExist())
}

View File

@ -24,6 +24,7 @@ import (
"github.com/hashicorp/vault/physical/raft"
"github.com/hashicorp/vault/sdk/helper/jsonutil"
"github.com/hashicorp/vault/sdk/logical"
"github.com/hashicorp/vault/sdk/version"
"github.com/hashicorp/vault/vault/seal"
"github.com/mitchellh/mapstructure"
"golang.org/x/net/http2"
@ -353,6 +354,7 @@ func (c *Core) raftTLSRotatePhased(ctx context.Context, logger hclog.Logger, raf
AppliedIndex: 0,
Term: 0,
DesiredSuffrage: "voter",
SDKVersion: version.GetVersion().Version,
})
}
}