diff --git a/.changelog/17582.txt b/.changelog/17582.txt new file mode 100644 index 000000000..122b9df98 --- /dev/null +++ b/.changelog/17582.txt @@ -0,0 +1,3 @@ +```release-note:feature +cli: `consul operator raft list-peers` command shows the number of commits each follower is trailing the leader by to aid in troubleshooting. +``` diff --git a/agent/consul/operator_raft_endpoint.go b/agent/consul/operator_raft_endpoint.go index f619b611f..7b0bcbc5c 100644 --- a/agent/consul/operator_raft_endpoint.go +++ b/agent/consul/operator_raft_endpoint.go @@ -48,6 +48,12 @@ func (op *Operator) RaftGetConfiguration(args *structs.DCSpecificRequest, reply serverMap[raft.ServerAddress(addr)] = member } + serverIDLastIndexMap := make(map[raft.ServerID]uint64) + + for _, serverState := range op.srv.autopilot.GetState().Servers { + serverIDLastIndexMap[serverState.Server.ID] = serverState.Stats.LastIndex + } + // Fill out the reply. leader := op.srv.raft.Leader() reply.Index = future.Index() @@ -66,6 +72,7 @@ func (op *Operator) RaftGetConfiguration(args *structs.DCSpecificRequest, reply Leader: server.Address == leader, Voter: server.Suffrage == raft.Voter, ProtocolVersion: raftProtocolVersion, + LastIndex: serverIDLastIndexMap[server.ID], } reply.Servers = append(reply.Servers, entry) } diff --git a/agent/consul/operator_raft_endpoint_test.go b/agent/consul/operator_raft_endpoint_test.go index 7ce5b6e94..7242c40e6 100644 --- a/agent/consul/operator_raft_endpoint_test.go +++ b/agent/consul/operator_raft_endpoint_test.go @@ -50,6 +50,13 @@ func TestOperator_RaftGetConfiguration(t *testing.T) { if len(future.Configuration().Servers) != 1 { t.Fatalf("bad: %v", future.Configuration().Servers) } + + serverIDLastIndexMap := make(map[raft.ServerID]uint64) + + for _, serverState := range s1.autopilot.GetState().Servers { + serverIDLastIndexMap[serverState.Server.ID] = serverState.Stats.LastIndex + } + me := future.Configuration().Servers[0] expected := structs.RaftConfigurationResponse{ Servers: []*structs.RaftServer{ @@ -60,6 +67,7 @@ func TestOperator_RaftGetConfiguration(t *testing.T) { Leader: true, Voter: true, ProtocolVersion: "3", + LastIndex: serverIDLastIndexMap[me.ID], }, }, Index: future.Index(), @@ -113,6 +121,10 @@ func TestOperator_RaftGetConfiguration_ACLDeny(t *testing.T) { if len(future.Configuration().Servers) != 1 { t.Fatalf("bad: %v", future.Configuration().Servers) } + serverIDLastIndexMap := make(map[raft.ServerID]uint64) + for _, serverState := range s1.autopilot.GetState().Servers { + serverIDLastIndexMap[serverState.Server.ID] = serverState.Stats.LastIndex + } me := future.Configuration().Servers[0] expected := structs.RaftConfigurationResponse{ Servers: []*structs.RaftServer{ @@ -123,6 +135,7 @@ func TestOperator_RaftGetConfiguration_ACLDeny(t *testing.T) { Leader: true, Voter: true, ProtocolVersion: "3", + LastIndex: serverIDLastIndexMap[me.ID], }, }, Index: future.Index(), diff --git a/agent/structs/operator.go b/agent/structs/operator.go index f5c2b8ac8..05862861e 100644 --- a/agent/structs/operator.go +++ b/agent/structs/operator.go @@ -34,6 +34,9 @@ type RaftServer struct { // it's a non-voting server, which will be added in a future release of // Consul. Voter bool + + // LastIndex is the last log index this server has a record of in its Raft log. + LastIndex uint64 } // RaftConfigurationResponse is returned when querying for the current Raft diff --git a/api/operator_raft.go b/api/operator_raft.go index 393d6fb3c..d72c00c97 100644 --- a/api/operator_raft.go +++ b/api/operator_raft.go @@ -28,6 +28,9 @@ type RaftServer struct { // it's a non-voting server, which will be added in a future release of // Consul. Voter bool + + // LastIndex is the last log index this server has a record of in its Raft log. + LastIndex uint64 } // RaftConfiguration is returned when querying for the current Raft configuration. diff --git a/command/operator/raft/listpeers/operator_raft_list.go b/command/operator/raft/listpeers/operator_raft_list.go index 47bd161fe..29643a87c 100644 --- a/command/operator/raft/listpeers/operator_raft_list.go +++ b/command/operator/raft/listpeers/operator_raft_list.go @@ -70,8 +70,24 @@ func raftListPeers(client *api.Client, stale bool) (string, error) { return "", fmt.Errorf("Failed to retrieve raft configuration: %v", err) } + leaderLastCommitIndex := uint64(0) + serverIdLastIndexMap := make(map[string]uint64) + + for _, raftServer := range reply.Servers { + serverIdLastIndexMap[raftServer.ID] = raftServer.LastIndex + } + + for _, s := range reply.Servers { + if s.Leader { + lastIndex, ok := serverIdLastIndexMap[s.ID] + if ok { + leaderLastCommitIndex = lastIndex + } + } + } + // Format it as a nice table. - result := []string{"Node\x1fID\x1fAddress\x1fState\x1fVoter\x1fRaftProtocol"} + result := []string{"Node\x1fID\x1fAddress\x1fState\x1fVoter\x1fRaftProtocol\x1fCommit Index\x1fTrails Leader By"} for _, s := range reply.Servers { raftProtocol := s.ProtocolVersion @@ -82,8 +98,20 @@ func raftListPeers(client *api.Client, stale bool) (string, error) { if s.Leader { state = "leader" } - result = append(result, fmt.Sprintf("%s\x1f%s\x1f%s\x1f%s\x1f%v\x1f%s", - s.Node, s.ID, s.Address, state, s.Voter, raftProtocol)) + + trailsLeaderByText := "-" + serverLastIndex, ok := serverIdLastIndexMap[s.ID] + if ok { + trailsLeaderBy := leaderLastCommitIndex - serverLastIndex + trailsLeaderByText = fmt.Sprintf("%d commits", trailsLeaderBy) + if s.Leader { + trailsLeaderByText = "-" + } else if trailsLeaderBy == 1 { + trailsLeaderByText = fmt.Sprintf("%d commit", trailsLeaderBy) + } + } + result = append(result, fmt.Sprintf("%s\x1f%s\x1f%s\x1f%s\x1f%v\x1f%s\x1f%v\x1f%s", + s.Node, s.ID, s.Address, state, s.Voter, raftProtocol, serverLastIndex, trailsLeaderByText)) } return columnize.Format(result, &columnize.Config{Delim: string([]byte{0x1f})}), nil diff --git a/command/operator/raft/listpeers/operator_raft_list_test.go b/command/operator/raft/listpeers/operator_raft_list_test.go index 15bd1bfbe..0694e0dd1 100644 --- a/command/operator/raft/listpeers/operator_raft_list_test.go +++ b/command/operator/raft/listpeers/operator_raft_list_test.go @@ -28,7 +28,7 @@ func TestOperatorRaftListPeersCommand(t *testing.T) { a := agent.NewTestAgent(t, ``) defer a.Shutdown() - expected := fmt.Sprintf("%s %s 127.0.0.1:%d leader true 3", + expected := fmt.Sprintf("%s %s 127.0.0.1:%d leader true 3 1 -", a.Config.NodeName, a.Config.NodeID, a.Config.ServerPort) // Test the list-peers subcommand directly diff --git a/website/content/commands/operator/raft.mdx b/website/content/commands/operator/raft.mdx index 857a9ee1a..eaa0081a6 100644 --- a/website/content/commands/operator/raft.mdx +++ b/website/content/commands/operator/raft.mdx @@ -46,10 +46,10 @@ Usage: `consul operator raft list-peers -stale=[true|false]` The output looks like this: ```text -Node ID Address State Voter RaftProtocol -alice 127.0.0.1:8300 127.0.0.1:8300 follower true 2 -bob 127.0.0.2:8300 127.0.0.2:8300 leader true 3 -carol 127.0.0.3:8300 127.0.0.3:8300 follower true 2 +Node ID Address State Voter RaftProtocol Commit Index Trails Leader By +alice 127.0.0.1:8300 127.0.0.1:8300 follower true 2 1167 0 commits +bob 127.0.0.2:8300 127.0.0.2:8300 leader true 3 1167 - +carol 127.0.0.3:8300 127.0.0.3:8300 follower true 2 1159 8 commits ``` `Node` is the node name of the server, as known to Consul, or "(unknown)" if @@ -70,7 +70,7 @@ configuration. - `-stale` - Enables non-leader servers to provide cluster state information. If the cluster is in an outage state without a leader, - we recommend setting this option to `true. + we recommend setting this option to `true`. Default is `false`. ## remove-peer @@ -109,7 +109,7 @@ The return code will indicate success or failure. Corresponding HTTP API Endpoint: [\[POST\] /v1/operator/raft/transfer-leader](/consul/api-docs/operator/raft#transfer-raft-leadership) -This command transfers Raft leadership to another server agent. If an `id` is provided, Consul transfers leadership to the server with that id. +This command transfers Raft leadership to another server agent. If an `id` is provided, Consul transfers leadership to the server with that id. Use this command to change leadership without restarting the leader node, which maintains quorum and workload capacity.