Refactor out recocileServerList anon function
Add testing to reconcileServerList and test various server sizes. Test that a percentage of nodes fail their Ping (50% in testing atm)
This commit is contained in:
parent
6609ee5d51
commit
177f64134e
|
@ -295,7 +295,7 @@ func TestClient_RPC_ConsulServerPing(t *testing.T) {
|
||||||
|
|
||||||
// Artificially fail the server in order to rotate the server
|
// Artificially fail the server in order to rotate the server
|
||||||
// list
|
// list
|
||||||
sm.NotifyFailedServer(s)
|
c.serverMgr.NotifyFailedServer(s)
|
||||||
}
|
}
|
||||||
|
|
||||||
if pingCount != numServers {
|
if pingCount != numServers {
|
||||||
|
|
|
@ -278,8 +278,7 @@ func (sm *ServerManager) NumServers() (numServers int) {
|
||||||
// deregistered. Before the newly shuffled server list is saved, the new
|
// deregistered. Before the newly shuffled server list is saved, the new
|
||||||
// remote endpoint is tested to ensure its responsive.
|
// remote endpoint is tested to ensure its responsive.
|
||||||
func (sm *ServerManager) RebalanceServers() {
|
func (sm *ServerManager) RebalanceServers() {
|
||||||
FAILED_SERVER_DURING_REBALANCE:
|
// Obtain a copy of the current serverConfig
|
||||||
// Obtain a copy of the server config
|
|
||||||
sc := sm.getServerConfig()
|
sc := sm.getServerConfig()
|
||||||
|
|
||||||
// Early abort if there is no value to shuffling
|
// Early abort if there is no value to shuffling
|
||||||
|
@ -293,7 +292,7 @@ FAILED_SERVER_DURING_REBALANCE:
|
||||||
// Don't iterate on the list directly, this loop mutates server the
|
// Don't iterate on the list directly, this loop mutates server the
|
||||||
// list.
|
// list.
|
||||||
var foundHealthyServer bool
|
var foundHealthyServer bool
|
||||||
for n := len(sc.servers); n > 0; n-- {
|
for i := 0; i < len(sc.servers); i++ {
|
||||||
// Always test the first server. Failed servers are cycled
|
// Always test the first server. Failed servers are cycled
|
||||||
// while Serf detects the node has failed.
|
// while Serf detects the node has failed.
|
||||||
selectedServer := sc.servers[0]
|
selectedServer := sc.servers[0]
|
||||||
|
@ -303,7 +302,7 @@ FAILED_SERVER_DURING_REBALANCE:
|
||||||
foundHealthyServer = true
|
foundHealthyServer = true
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
sm.logger.Printf("[DEBUG] server manager: pinging server %s failed: %s", selectedServer.String(), err)
|
sm.logger.Printf(`[DEBUG] server manager: pinging server "%s" failed: %s`, selectedServer.String(), err)
|
||||||
|
|
||||||
sc.cycleServer()
|
sc.cycleServer()
|
||||||
}
|
}
|
||||||
|
@ -311,77 +310,96 @@ FAILED_SERVER_DURING_REBALANCE:
|
||||||
// If no healthy servers were found, sleep and wait for Serf to make
|
// If no healthy servers were found, sleep and wait for Serf to make
|
||||||
// the world a happy place again.
|
// the world a happy place again.
|
||||||
if !foundHealthyServer {
|
if !foundHealthyServer {
|
||||||
const backoffDuration = 1 * time.Second
|
sm.logger.Printf("[DEBUG] server manager: No healthy servers during rebalance, aborting")
|
||||||
sm.logger.Printf("[DEBUG] server manager: No servers available, sleeping for %v", backoffDuration)
|
return
|
||||||
|
|
||||||
// Sleep with no locks
|
|
||||||
time.Sleep(backoffDuration)
|
|
||||||
goto FAILED_SERVER_DURING_REBALANCE
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Verify that all servers are present. Use an anonymous func to
|
// Verify that all servers are present
|
||||||
// ensure lock is released when exiting the critical section.
|
if sm.reconcileServerList(&sc) {
|
||||||
reconcileServerLists := func() bool {
|
sm.logger.Printf("[DEBUG] server manager: Rebalanced %d servers, next active server is %s", len(sc.servers), sc.servers[0].String())
|
||||||
sm.serverConfigLock.Lock()
|
} else {
|
||||||
defer sm.serverConfigLock.Unlock()
|
// reconcileServerList failed because Serf removed the server
|
||||||
tmpServerCfg := sm.getServerConfig()
|
// that was at the front of the list that had successfully
|
||||||
|
// been Ping'ed. Between the Ping and reconcile, a Serf
|
||||||
type targetServer struct {
|
// event had shown up removing the node. Prevent an RPC
|
||||||
server *server_details.ServerDetails
|
// timeout by retrying RebalanceServers().
|
||||||
|
//
|
||||||
// 'b' == both
|
// Instead of doing any heroics, "freeze in place" and
|
||||||
// 'o' == original
|
// continue to use the existing connection until the next
|
||||||
// 'n' == new
|
// rebalance occurs.
|
||||||
state byte
|
|
||||||
}
|
|
||||||
mergedList := make(map[server_details.Key]*targetServer)
|
|
||||||
for _, s := range sc.servers {
|
|
||||||
mergedList[*s.Key()] = &targetServer{server: s, state: 'o'}
|
|
||||||
}
|
|
||||||
for _, s := range tmpServerCfg.servers {
|
|
||||||
k := s.Key()
|
|
||||||
_, found := mergedList[*k]
|
|
||||||
if found {
|
|
||||||
mergedList[*k].state = 'b'
|
|
||||||
} else {
|
|
||||||
mergedList[*k] = &targetServer{server: s, state: 'n'}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ensure the selected server has not been removed by Serf
|
|
||||||
selectedServerKey := sc.servers[0].Key()
|
|
||||||
if v, found := mergedList[*selectedServerKey]; found && v.state == 'o' {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add any new servers and remove any old servers
|
|
||||||
for k, v := range mergedList {
|
|
||||||
switch v.state {
|
|
||||||
case 'b':
|
|
||||||
// Do nothing, server exists in both
|
|
||||||
case 'o':
|
|
||||||
// Server has been removed
|
|
||||||
sc.removeServerByKey(&k)
|
|
||||||
case 'n':
|
|
||||||
// Server added
|
|
||||||
sc.servers = append(sc.servers, v.server)
|
|
||||||
default:
|
|
||||||
panic("not implemented")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
sm.saveServerConfig(sc)
|
|
||||||
return true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if !reconcileServerLists() {
|
|
||||||
goto FAILED_SERVER_DURING_REBALANCE
|
|
||||||
}
|
|
||||||
|
|
||||||
sm.logger.Printf("[DEBUG] server manager: Rebalanced %d servers, next active server is %s", len(sc.servers), sc.servers[0].String())
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// reconcileServerList returns true when the first server in serverConfig
|
||||||
|
// exists in the receiver's serverConfig. If true, the merged serverConfig
|
||||||
|
// is stored as the receiver's serverConfig. Returns false if the first
|
||||||
|
// server does not exist in the list (i.e. was removed by Serf during a
|
||||||
|
// PingConsulServer() call. Newly added servers are appended to the list and
|
||||||
|
// other missing servers are removed from the list.
|
||||||
|
func (sm *ServerManager) reconcileServerList(sc *serverConfig) bool {
|
||||||
|
sm.serverConfigLock.Lock()
|
||||||
|
defer sm.serverConfigLock.Unlock()
|
||||||
|
|
||||||
|
// newServerCfg is a serverConfig that has been kept up to date with
|
||||||
|
// Serf node join and node leave events.
|
||||||
|
newServerCfg := sm.getServerConfig()
|
||||||
|
|
||||||
|
// If Serf has removed all nodes, or there is no selected server
|
||||||
|
// (zero nodes in sc), abort early.
|
||||||
|
if len(newServerCfg.servers) == 0 || len(sc.servers) == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
type targetServer struct {
|
||||||
|
server *server_details.ServerDetails
|
||||||
|
|
||||||
|
// 'b' == both
|
||||||
|
// 'o' == original
|
||||||
|
// 'n' == new
|
||||||
|
state byte
|
||||||
|
}
|
||||||
|
mergedList := make(map[server_details.Key]*targetServer, len(sc.servers))
|
||||||
|
for _, s := range sc.servers {
|
||||||
|
mergedList[*s.Key()] = &targetServer{server: s, state: 'o'}
|
||||||
|
}
|
||||||
|
for _, s := range newServerCfg.servers {
|
||||||
|
k := s.Key()
|
||||||
|
_, found := mergedList[*k]
|
||||||
|
if found {
|
||||||
|
mergedList[*k].state = 'b'
|
||||||
|
} else {
|
||||||
|
mergedList[*k] = &targetServer{server: s, state: 'n'}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure the selected server has not been removed by Serf
|
||||||
|
selectedServerKey := sc.servers[0].Key()
|
||||||
|
if v, found := mergedList[*selectedServerKey]; found && v.state == 'o' {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// Append any new servers and remove any old servers
|
||||||
|
for k, v := range mergedList {
|
||||||
|
switch v.state {
|
||||||
|
case 'b':
|
||||||
|
// Do nothing, server exists in both
|
||||||
|
case 'o':
|
||||||
|
// Server has been removed
|
||||||
|
sc.removeServerByKey(&k)
|
||||||
|
case 'n':
|
||||||
|
// Server added
|
||||||
|
sc.servers = append(sc.servers, v.server)
|
||||||
|
default:
|
||||||
|
panic("unknown merge list state")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sm.saveServerConfig(*sc)
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
// RemoveServer takes out an internal write lock and removes a server from
|
// RemoveServer takes out an internal write lock and removes a server from
|
||||||
// the server list.
|
// the server list.
|
||||||
func (sm *ServerManager) RemoveServer(server *server_details.ServerDetails) {
|
func (sm *ServerManager) RemoveServer(server *server_details.ServerDetails) {
|
||||||
|
|
|
@ -4,6 +4,7 @@ import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log"
|
"log"
|
||||||
|
"math/rand"
|
||||||
"os"
|
"os"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
@ -26,10 +27,17 @@ func GetBufferedLogger() *log.Logger {
|
||||||
}
|
}
|
||||||
|
|
||||||
type fauxConnPool struct {
|
type fauxConnPool struct {
|
||||||
|
// failPct between 0.0 and 1.0 == pct of time a Ping should fail
|
||||||
|
failPct float64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *fauxConnPool) PingConsulServer(server *server_details.ServerDetails) (bool, error) {
|
func (cp *fauxConnPool) PingConsulServer(server *server_details.ServerDetails) (bool, error) {
|
||||||
return true, nil
|
var success bool
|
||||||
|
successProb := rand.Float64()
|
||||||
|
if successProb > cp.failPct {
|
||||||
|
success = true
|
||||||
|
}
|
||||||
|
return success, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
type fauxSerf struct {
|
type fauxSerf struct {
|
||||||
|
@ -47,6 +55,14 @@ func testServerManager() (sm *ServerManager) {
|
||||||
return sm
|
return sm
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func testServerManagerFailProb(failPct float64) (sm *ServerManager) {
|
||||||
|
logger := GetBufferedLogger()
|
||||||
|
logger = log.New(os.Stderr, "", log.LstdFlags)
|
||||||
|
shutdownCh := make(chan struct{})
|
||||||
|
sm = New(logger, shutdownCh, &fauxSerf{}, &fauxConnPool{failPct: failPct})
|
||||||
|
return sm
|
||||||
|
}
|
||||||
|
|
||||||
// func (sc *serverConfig) cycleServer() (servers []*server_details.ServerDetails) {
|
// func (sc *serverConfig) cycleServer() (servers []*server_details.ServerDetails) {
|
||||||
func TestServerManagerInternal_cycleServer(t *testing.T) {
|
func TestServerManagerInternal_cycleServer(t *testing.T) {
|
||||||
sm := testServerManager()
|
sm := testServerManager()
|
||||||
|
@ -132,6 +148,119 @@ func TestServerManagerInternal_New(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// func (sm *ServerManager) reconcileServerList(sc *serverConfig) bool {
|
||||||
|
func TestServerManagerInternal_reconcileServerList(t *testing.T) {
|
||||||
|
tests := []int{0, 1, 2, 3, 4, 5, 10, 100}
|
||||||
|
for _, n := range tests {
|
||||||
|
ok, err := test_reconcileServerList(n)
|
||||||
|
if !ok {
|
||||||
|
t.Errorf("Expected %d to pass: %v", n, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func test_reconcileServerList(maxServers int) (bool, error) {
|
||||||
|
// Build a server list, reconcile, verify the missing servers are
|
||||||
|
// missing, the added have been added, and the original server is
|
||||||
|
// present.
|
||||||
|
const failPct = 0.5
|
||||||
|
sm := testServerManagerFailProb(failPct)
|
||||||
|
const numShuffleTests = 100
|
||||||
|
const uniquePassRate = 0.5
|
||||||
|
|
||||||
|
var failedServers, healthyServers []*server_details.ServerDetails
|
||||||
|
for i := 0; i < maxServers; i++ {
|
||||||
|
nodeName := fmt.Sprintf("s%02d", i)
|
||||||
|
|
||||||
|
node := &server_details.ServerDetails{Name: nodeName}
|
||||||
|
// Add 66% of servers to ServerManager
|
||||||
|
if rand.Float64() > 0.33 {
|
||||||
|
sm.AddServer(node)
|
||||||
|
|
||||||
|
// Of healthy servers, (ab)use connPoolPinger to
|
||||||
|
// failPct of the servers for the reconcile. This
|
||||||
|
// allows for the selected server to no longer be
|
||||||
|
// healthy for the reconcile below.
|
||||||
|
if ok, _ := sm.connPoolPinger.PingConsulServer(node); ok {
|
||||||
|
// Will still be present
|
||||||
|
healthyServers = append(healthyServers, node)
|
||||||
|
} else {
|
||||||
|
// Will be missing
|
||||||
|
failedServers = append(failedServers, node)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Will be added from the call to reconcile
|
||||||
|
healthyServers = append(healthyServers, node)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Randomize ServerManager's server list
|
||||||
|
sm.RebalanceServers()
|
||||||
|
selectedServer := sm.FindServer()
|
||||||
|
|
||||||
|
var selectedServerFailed bool
|
||||||
|
for _, s := range failedServers {
|
||||||
|
if selectedServer.Key().Equal(s.Key()) {
|
||||||
|
selectedServerFailed = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update ServerManager's server list to be "healthy" based on Serf.
|
||||||
|
// Reconcile this with origServers, which is shuffled and has a live
|
||||||
|
// connection, but possibly out of date.
|
||||||
|
origServers := sm.getServerConfig()
|
||||||
|
sm.saveServerConfig(serverConfig{servers: healthyServers})
|
||||||
|
|
||||||
|
// This should always succeed with non-zero server lists
|
||||||
|
if !selectedServerFailed && !sm.reconcileServerList(&origServers) &&
|
||||||
|
len(sm.getServerConfig().servers) != 0 &&
|
||||||
|
len(origServers.servers) != 0 {
|
||||||
|
// If the random gods are unfavorable and we end up with zero
|
||||||
|
// length lists, expect things to fail and retry the test.
|
||||||
|
return false, fmt.Errorf("Expected reconcile to succeed: %v %d %d",
|
||||||
|
selectedServerFailed,
|
||||||
|
len(sm.getServerConfig().servers),
|
||||||
|
len(origServers.servers))
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we have zero-length server lists, test succeeded in degenerate
|
||||||
|
// case.
|
||||||
|
if len(sm.getServerConfig().servers) == 0 &&
|
||||||
|
len(origServers.servers) == 0 {
|
||||||
|
// Failed as expected w/ zero length list
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
resultingServerMap := make(map[server_details.Key]bool)
|
||||||
|
for _, s := range sm.getServerConfig().servers {
|
||||||
|
resultingServerMap[*s.Key()] = true
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test to make sure no failed servers are in the ServerManager's
|
||||||
|
// list. Error if there are any failedServers in sc.servers
|
||||||
|
for _, s := range failedServers {
|
||||||
|
_, ok := resultingServerMap[*s.Key()]
|
||||||
|
if ok {
|
||||||
|
return false, fmt.Errorf("Found failed server %v in merged list %v", s, resultingServerMap)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test to make sure all healthy servers are in the healthy list.
|
||||||
|
if len(healthyServers) != len(sm.getServerConfig().servers) {
|
||||||
|
return false, fmt.Errorf("Expected healthy map and servers to match: %d/%d", len(healthyServers), len(healthyServers))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test to make sure all healthy servers are in the resultingServerMap list.
|
||||||
|
for _, s := range healthyServers {
|
||||||
|
_, ok := resultingServerMap[*s.Key()]
|
||||||
|
if !ok {
|
||||||
|
return false, fmt.Errorf("Server %v missing from healthy map after merged lists", s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true, nil
|
||||||
|
}
|
||||||
|
|
||||||
// func (sc *serverConfig) refreshServerRebalanceTimer() {
|
// func (sc *serverConfig) refreshServerRebalanceTimer() {
|
||||||
func TestServerManagerInternal_refreshServerRebalanceTimer(t *testing.T) {
|
func TestServerManagerInternal_refreshServerRebalanceTimer(t *testing.T) {
|
||||||
type clusterSizes struct {
|
type clusterSizes struct {
|
||||||
|
|
Loading…
Reference in a new issue