open-nomad/nomad/client_meta_endpoint_test.go
Michael Schurter f8884d8b52
client/metadata: fix crasher caused by AllowStale = false (#16549)
Fixes #16517

Given a 3 Server cluster with at least 1 Client connected to Follower 1:

If a NodeMeta.{Apply,Read} for the Client request is received by
Follower 1 with `AllowStale = false` the Follower will forward the
request to the Leader.

The Leader, not being connected to the target Client, will forward the
RPC to Follower 1.

Follower 1, seeing AllowStale=false, will forward the request to the
Leader.

The Leader, not being connected to... well hoppefully you get the
picture: an infinite loop occurs.
2023-03-20 16:32:32 -07:00

130 lines
3.9 KiB
Go

package nomad
import (
"testing"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/client"
"github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/helper/pointer"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
"github.com/shoenig/test/must"
)
// TestNodeMeta_Forward asserts that Client RPCs do not result in infinite
// loops. For example in a cluster with 1 Leader, 2 Followers, and a Node
// connected to Follower 1:
//
// If a NodeMeta.Apply RPC with AllowStale=false is received by Follower 1, it
// will honor AllowStale=false and forward the request to the Leader.
//
// The Leader will accept the RPC, notice that Follower 1 has a connection to
// the Node, and the Leader will send the request back to Follower 1.
//
// Follower 1, ever respectful of AllowStale=false, will forward it back to the
// Leader.
//
// The Leader, being unable to forward to the Node, will send it back to
// Follower 1.
//
// This argument will continue until one of the Servers runs out of memory or
// patience and stomps away in anger (crashes). Like any good argument the
// ending is never pretty as the Servers will suffer CPU starvation and
// potentially Raft flapping before anyone actually OOMs.
//
// See https://github.com/hashicorp/nomad/issues/16517 for details.
//
// If test fails it will do so spectacularly by consuming all available CPU and
// potentially all available memory. Running it in a VM or container is
// suggested.
func TestNodeMeta_Forward(t *testing.T) {
ci.Parallel(t)
servers := []*Server{}
for i := 0; i < 3; i++ {
s, cleanup := TestServer(t, func(c *Config) {
c.BootstrapExpect = 3
c.NumSchedulers = 0
})
t.Cleanup(cleanup)
servers = append(servers, s)
}
TestJoin(t, servers...)
leader := testutil.WaitForLeaders(t, servers[0].RPC, servers[1].RPC, servers[2].RPC)
followers := []string{}
for _, s := range servers {
if addr := s.config.RPCAddr.String(); addr != leader {
followers = append(followers, addr)
}
}
t.Logf("leader=%s followers=%q", leader, followers)
clients := []*client.Client{}
for i := 0; i < 4; i++ {
c, cleanup := client.TestClient(t, func(c *config.Config) {
// Clients will rebalance across all servers, but try to get them to use
// followers to ensure we don't hit the loop in #16517
c.Servers = followers
})
defer cleanup()
clients = append(clients, c)
}
for _, c := range clients {
testutil.WaitForClient(t, servers[0].RPC, c.NodeID(), c.Region())
}
agentRPCs := []func(string, any, any) error{}
nodeIDs := make([]string, 0, len(clients))
// Build list of agents and node IDs
for _, s := range servers {
agentRPCs = append(agentRPCs, s.RPC)
}
for _, c := range clients {
agentRPCs = append(agentRPCs, c.RPC)
nodeIDs = append(nodeIDs, c.NodeID())
}
region := clients[0].Region()
// Apply metadata to every client through every agent to ensure forwarding
// always works regardless of path taken.
for _, rpc := range agentRPCs {
for _, nodeID := range nodeIDs {
args := &structs.NodeMetaApplyRequest{
// Intentionally don't set QueryOptions.AllowStale to exercise #16517
QueryOptions: structs.QueryOptions{
Region: region,
},
NodeID: nodeID,
Meta: map[string]*string{"testing": pointer.Of("123")},
}
reply := &structs.NodeMetaResponse{}
must.NoError(t, rpc("NodeMeta.Apply", args, reply))
must.MapNotEmpty(t, reply.Meta)
}
}
for _, rpc := range agentRPCs {
for _, nodeID := range nodeIDs {
args := &structs.NodeSpecificRequest{
// Intentionally don't set QueryOptions.AllowStale to exercise #16517
QueryOptions: structs.QueryOptions{
Region: region,
},
NodeID: nodeID,
}
reply := &structs.NodeMetaResponse{}
must.NoError(t, rpc("NodeMeta.Read", args, reply))
must.MapNotEmpty(t, reply.Meta)
must.Eq(t, reply.Meta["testing"], "123")
must.MapNotEmpty(t, reply.Dynamic)
must.Eq(t, *reply.Dynamic["testing"], "123")
}
}
}