Merge pull request #1331 from slackpad/f-network-tomography

Adds network tomography features to Consul.
Commit f6c774bec3 by James Phillips, 2015-10-23 16:34:13 -07:00
55 changed files with 4517 additions and 104 deletions


@ -44,6 +44,12 @@ type QueryOptions struct {
// Token is used to provide a per-request ACL token
// which overrides the agent's default token.
Token string
// Near is used to provide a node name that will sort the results
// in ascending order based on the estimated round trip time from
// that node. Setting this to "_agent" will use the agent's node
// for the sort.
Near string
}
// WriteOptions are used to parameterize a write
@ -250,6 +256,9 @@ func (r *request) setQueryOptions(q *QueryOptions) {
if q.Token != "" {
r.params.Set("token", q.Token)
}
if q.Near != "" {
r.params.Set("near", q.Near)
}
}
// durToMsec converts a duration to a millisecond specified string
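As a hedged illustration (not part of this diff), a client could exercise the new Near option through the Go api package roughly as follows; "node1" is a hypothetical node name:

package main

import (
	"fmt"

	"github.com/hashicorp/consul/api"
)

func main() {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		panic(err)
	}
	// Ask the catalog for all nodes, sorted by estimated RTT from "node1".
	nodes, _, err := client.Catalog().Nodes(&api.QueryOptions{Near: "node1"})
	if err != nil {
		panic(err)
	}
	for _, n := range nodes {
		fmt.Println(n.Node, n.Address)
	}
}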


@ -127,6 +127,7 @@ func TestSetQueryOptions(t *testing.T) {
WaitIndex: 1000,
WaitTime: 100 * time.Second,
Token: "12345",
Near: "nodex",
}
r.setQueryOptions(q)
@ -148,6 +149,9 @@ func TestSetQueryOptions(t *testing.T) {
if r.params.Get("token") != "12345" {
t.Fatalf("bad: %v", r.params)
}
if r.params.Get("near") != "nodex" {
t.Fatalf("bad: %v", r.params)
}
}
func TestSetWriteOptions(t *testing.T) {

api/coordinate.go (new file, 66 lines)

@ -0,0 +1,66 @@
package api
import (
"github.com/hashicorp/serf/coordinate"
)
// CoordinateEntry represents a node and its associated network coordinate.
type CoordinateEntry struct {
Node string
Coord *coordinate.Coordinate
}
// CoordinateDatacenterMap represents a datacenter and its associated WAN
// nodes and their associated coordinates.
type CoordinateDatacenterMap struct {
Datacenter string
Coordinates []CoordinateEntry
}
// Coordinate can be used to query the coordinate endpoints
type Coordinate struct {
c *Client
}
// Coordinate returns a handle to the coordinate endpoints
func (c *Client) Coordinate() *Coordinate {
return &Coordinate{c}
}
// Datacenters is used to return the coordinates of all the servers in the WAN
// pool.
func (c *Coordinate) Datacenters() ([]*CoordinateDatacenterMap, error) {
r := c.c.newRequest("GET", "/v1/coordinate/datacenters")
_, resp, err := requireOK(c.c.doRequest(r))
if err != nil {
return nil, err
}
defer resp.Body.Close()
var out []*CoordinateDatacenterMap
if err := decodeBody(resp, &out); err != nil {
return nil, err
}
return out, nil
}
// Nodes is used to return the coordinates of all the nodes in the LAN pool.
func (c *Coordinate) Nodes(q *QueryOptions) ([]*CoordinateEntry, *QueryMeta, error) {
r := c.c.newRequest("GET", "/v1/coordinate/nodes")
r.setQueryOptions(q)
rtt, resp, err := requireOK(c.c.doRequest(r))
if err != nil {
return nil, nil, err
}
defer resp.Body.Close()
qm := &QueryMeta{}
parseQueryMeta(resp, qm)
qm.RequestTime = rtt
var out []*CoordinateEntry
if err := decodeBody(resp, &out); err != nil {
return nil, nil, err
}
return out, qm, nil
}
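For illustration only, a minimal sketch (assuming hypothetical node names "node1" and "node2") that combines this endpoint with serf's coordinate package to estimate an RTT:

package main

import (
	"fmt"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/serf/coordinate"
)

func main() {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		panic(err)
	}
	entries, _, err := client.Coordinate().Nodes(nil)
	if err != nil {
		panic(err)
	}
	// Index the LAN coordinates by node name.
	coords := make(map[string]*coordinate.Coordinate)
	for _, entry := range entries {
		coords[entry.Node] = entry.Coord
	}
	if c1, c2 := coords["node1"], coords["node2"]; c1 != nil && c2 != nil {
		// DistanceTo returns the estimated round trip time as a time.Duration.
		fmt.Printf("estimated rtt: %.3f ms\n", c1.DistanceTo(c2).Seconds()*1000.0)
	}
}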

api/coordinate_test.go (new file, 54 lines)

@ -0,0 +1,54 @@
package api
import (
"fmt"
"testing"
"github.com/hashicorp/consul/testutil"
)
func TestCoordinate_Datacenters(t *testing.T) {
t.Parallel()
c, s := makeClient(t)
defer s.Stop()
coordinate := c.Coordinate()
testutil.WaitForResult(func() (bool, error) {
datacenters, err := coordinate.Datacenters()
if err != nil {
return false, err
}
if len(datacenters) == 0 {
return false, fmt.Errorf("Bad: %v", datacenters)
}
return true, nil
}, func(err error) {
t.Fatalf("err: %s", err)
})
}
func TestCoordinate_Nodes(t *testing.T) {
t.Parallel()
c, s := makeClient(t)
defer s.Stop()
coordinate := c.Coordinate()
testutil.WaitForResult(func() (bool, error) {
_, _, err := coordinate.Nodes(nil)
if err != nil {
return false, err
}
// There's not a good way to populate coordinates without
// waiting for them to calculate and update, so the best
// we can do is call the endpoint and make sure we don't
// get an error.
return true, nil
}, func(err error) {
t.Fatalf("err: %s", err)
})
}


@ -17,6 +17,7 @@ import (
"github.com/hashicorp/consul/consul"
"github.com/hashicorp/consul/consul/state"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/serf/coordinate"
"github.com/hashicorp/serf/serf"
)
@ -191,6 +192,11 @@ func Create(config *Config, logOutput io.Writer) (*Agent, error) {
// Start handling events
go agent.handleEvents()
// Start sending network coordinate to the server.
if !config.DisableCoordinates {
go agent.sendCoordinate()
}
// Write out the PID file if necessary
err = agent.storePid()
if err != nil {
@ -539,6 +545,22 @@ func (a *Agent) WANMembers() []serf.Member {
}
}
// CanServersUnderstandProtocol checks to see if all the servers understand the
// given protocol version.
func (a *Agent) CanServersUnderstandProtocol(version uint8) bool {
numServers, numWhoGrok := 0, 0
members := a.LANMembers()
for _, member := range members {
if member.Tags["role"] == "consul" {
numServers++
if member.ProtocolMax >= version {
numWhoGrok++
}
}
}
return (numServers > 0) && (numWhoGrok == numServers)
}
// StartSync is called once Services and Checks are registered.
// This is called to prevent a race between clients and the anti-entropy routines
func (a *Agent) StartSync() {
@ -556,6 +578,58 @@ func (a *Agent) ResumeSync() {
a.state.Resume()
}
// Returns the coordinate of this node in the local pool (assumes coordinates
// are enabled, so check that before calling).
func (a *Agent) GetCoordinate() (*coordinate.Coordinate, error) {
if a.config.Server {
return a.server.GetLANCoordinate()
} else {
return a.client.GetCoordinate()
}
}
// sendCoordinate is a long-running loop that periodically sends our coordinate
// to the server. Closing the agent's shutdownChannel will cause this to exit.
func (a *Agent) sendCoordinate() {
for {
rate := a.config.SyncCoordinateRateTarget
min := a.config.SyncCoordinateIntervalMin
intv := rateScaledInterval(rate, min, len(a.LANMembers()))
intv = intv + randomStagger(intv)
select {
case <-time.After(intv):
if !a.CanServersUnderstandProtocol(3) {
continue
}
var c *coordinate.Coordinate
var err error
if c, err = a.GetCoordinate(); err != nil {
a.logger.Printf("[ERR] agent: failed to get coordinate: %s", err)
continue
}
// TODO - Consider adding a distance check so we don't send
// an update if the position hasn't changed by more than a
// threshold.
req := structs.CoordinateUpdateRequest{
Datacenter: a.config.Datacenter,
Node: a.config.NodeName,
Coord: c,
WriteRequest: structs.WriteRequest{Token: a.config.ACLToken},
}
var reply struct{}
if err := a.RPC("Coordinate.Update", &req, &reply); err != nil {
a.logger.Printf("[ERR] agent: coordinate update error: %s", err)
continue
}
case <-a.shutdownCh:
return
}
}
}
// persistService saves a service definition to a JSON file in the data dir
func (a *Agent) persistService(service *structs.NodeService) error {
svcPath := filepath.Join(a.config.DataDir, servicesDir, stringHash(service.ID))
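A note on the sendCoordinate loop above: with the defaults added in this change (SyncCoordinateRateTarget of 64 updates/sec and SyncCoordinateIntervalMin of 15s), a 6,400-node LAN yields rateScaledInterval(64, 15s, 6400) = 6400/64 = 100 seconds between updates from each node, keeping the aggregate load on the servers at or below roughly 64 updates per second; clusters smaller than 64 * 15 = 960 nodes bottom out at the 15-second floor, and the random stagger only spreads the updates out further.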


@ -3,6 +3,7 @@ package agent
import (
"fmt"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/serf/coordinate"
"github.com/hashicorp/serf/serf"
"net/http"
"strconv"
@ -11,12 +12,22 @@ import (
type AgentSelf struct {
Config *Config
Coord *coordinate.Coordinate
Member serf.Member
}
func (s *HTTPServer) AgentSelf(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
var c *coordinate.Coordinate
if !s.agent.config.DisableCoordinates {
var err error
if c, err = s.agent.GetCoordinate(); err != nil {
return nil, err
}
}
return AgentSelf{
Config: s.agent.config,
Coord: c,
Member: s.agent.LocalMember(),
}, nil
}


@ -6,6 +6,7 @@ import (
"net/http"
"net/http/httptest"
"os"
"reflect"
"testing"
"time"
@ -81,7 +82,7 @@ func TestHTTPAgentSelf(t *testing.T) {
obj, err := srv.AgentSelf(nil, req)
if err != nil {
t.Fatalf("Err: %v", err)
t.Fatalf("err: %v", err)
}
val := obj.(AgentSelf)
@ -92,6 +93,24 @@ func TestHTTPAgentSelf(t *testing.T) {
if int(val.Config.Ports.SerfLan) != srv.agent.config.Ports.SerfLan {
t.Fatalf("incorrect port: %v", obj)
}
c, err := srv.agent.server.GetLANCoordinate()
if err != nil {
t.Fatalf("err: %v", err)
}
if !reflect.DeepEqual(c, val.Coord) {
t.Fatalf("coordinates are not equal: %v != %v", c, val.Coord)
}
srv.agent.config.DisableCoordinates = true
obj, err = srv.AgentSelf(nil, req)
if err != nil {
t.Fatalf("err: %v", err)
}
val = obj.(AgentSelf)
if val.Coord != nil {
t.Fatalf("should have been nil: %v", val.Coord)
}
}
func TestHTTPAgentMembers(t *testing.T) {


@ -59,6 +59,8 @@ func nextConfig() *Config {
cons.RaftConfig.HeartbeatTimeout = 40 * time.Millisecond
cons.RaftConfig.ElectionTimeout = 40 * time.Millisecond
cons.DisableCoordinates = false
cons.CoordinateUpdatePeriod = 100 * time.Millisecond
return conf
}
@ -1579,3 +1581,51 @@ func TestAgent_purgeCheckState(t *testing.T) {
t.Fatalf("should have removed file")
}
}
func TestAgent_GetCoordinate(t *testing.T) {
check := func(server bool) {
config := nextConfig()
config.Server = server
dir, agent := makeAgent(t, config)
defer os.RemoveAll(dir)
defer agent.Shutdown()
// This doesn't verify the returned coordinate, but it makes
// sure that the agent chooses the correct Serf instance,
// depending on how it's configured as a client or a server.
// If it chooses the wrong one, this will crash.
if _, err := agent.GetCoordinate(); err != nil {
t.Fatalf("err: %s", err)
}
}
check(true)
check(false)
}
func TestAgent_CanServersUnderstandProtocol(t *testing.T) {
config := nextConfig()
dir, agent := makeAgent(t, config)
defer os.RemoveAll(dir)
defer agent.Shutdown()
min := uint8(consul.ProtocolVersionMin)
if !agent.CanServersUnderstandProtocol(min) {
t.Fatalf("should grok %d", min)
}
max := uint8(consul.ProtocolVersionMax)
if !agent.CanServersUnderstandProtocol(max) {
t.Fatalf("should grok %d", max)
}
current := uint8(config.Protocol)
if !agent.CanServersUnderstandProtocol(current) {
t.Fatalf("should grok %d", current)
}
future := max + 1
if agent.CanServersUnderstandProtocol(future) {
t.Fatalf("should not grok %d", future)
}
}


@ -60,6 +60,7 @@ func (s *HTTPServer) CatalogDatacenters(resp http.ResponseWriter, req *http.Requ
func (s *HTTPServer) CatalogNodes(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
// Setup the request
args := structs.DCSpecificRequest{}
s.parseSource(req, &args.Source)
if done := s.parse(resp, req, &args.Datacenter, &args.QueryOptions); done {
return nil, nil
}
@ -90,6 +91,7 @@ func (s *HTTPServer) CatalogServices(resp http.ResponseWriter, req *http.Request
func (s *HTTPServer) CatalogServiceNodes(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
// Set default DC
args := structs.ServiceSpecificRequest{}
s.parseSource(req, &args.Source)
if done := s.parse(resp, req, &args.Datacenter, &args.QueryOptions); done {
return nil, nil
}


@ -2,13 +2,15 @@ package agent
import (
"fmt"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/consul/testutil"
"net/http"
"net/http/httptest"
"os"
"testing"
"time"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/consul/testutil"
"github.com/hashicorp/serf/coordinate"
)
func TestCatalogRegister(t *testing.T) {
@ -204,6 +206,101 @@ func TestCatalogNodes_Blocking(t *testing.T) {
}
}
func TestCatalogNodes_DistanceSort(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
defer srv.Shutdown()
defer srv.agent.Shutdown()
testutil.WaitForLeader(t, srv.agent.RPC, "dc1")
// Register nodes.
args := &structs.RegisterRequest{
Datacenter: "dc1",
Node: "foo",
Address: "127.0.0.1",
}
var out struct{}
if err := srv.agent.RPC("Catalog.Register", args, &out); err != nil {
t.Fatalf("err: %v", err)
}
args = &structs.RegisterRequest{
Datacenter: "dc1",
Node: "bar",
Address: "127.0.0.2",
}
if err := srv.agent.RPC("Catalog.Register", args, &out); err != nil {
t.Fatalf("err: %v", err)
}
// Nobody has coordinates set so this will still return them in the
// order they are indexed.
req, err := http.NewRequest("GET", "/v1/catalog/nodes?dc=dc1&near=foo", nil)
if err != nil {
t.Fatalf("err: %v", err)
}
resp := httptest.NewRecorder()
obj, err := srv.CatalogNodes(resp, req)
if err != nil {
t.Fatalf("err: %v", err)
}
assertIndex(t, resp)
nodes := obj.(structs.Nodes)
if len(nodes) != 3 {
t.Fatalf("bad: %v", obj)
}
if nodes[0].Node != "bar" {
t.Fatalf("bad: %v", nodes)
}
if nodes[1].Node != "foo" {
t.Fatalf("bad: %v", nodes)
}
if nodes[2].Node != srv.agent.config.NodeName {
t.Fatalf("bad: %v", nodes)
}
// Send an update for the node and wait for it to get applied.
arg := structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "foo",
Coord: coordinate.NewCoordinate(coordinate.DefaultConfig()),
}
if err := srv.agent.RPC("Coordinate.Update", &arg, &out); err != nil {
t.Fatalf("err: %v", err)
}
time.Sleep(200 * time.Millisecond)
// Query again and now foo should have moved to the front of the line.
req, err = http.NewRequest("GET", "/v1/catalog/nodes?dc=dc1&near=foo", nil)
if err != nil {
t.Fatalf("err: %v", err)
}
resp = httptest.NewRecorder()
obj, err = srv.CatalogNodes(resp, req)
if err != nil {
t.Fatalf("err: %v", err)
}
assertIndex(t, resp)
nodes = obj.(structs.Nodes)
if len(nodes) != 3 {
t.Fatalf("bad: %v", obj)
}
if nodes[0].Node != "foo" {
t.Fatalf("bad: %v", nodes)
}
if nodes[1].Node != "bar" {
t.Fatalf("bad: %v", nodes)
}
if nodes[2].Node != srv.agent.config.NodeName {
t.Fatalf("bad: %v", nodes)
}
}
func TestCatalogServices(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
@ -289,6 +386,108 @@ func TestCatalogServiceNodes(t *testing.T) {
}
}
func TestCatalogServiceNodes_DistanceSort(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
defer srv.Shutdown()
defer srv.agent.Shutdown()
testutil.WaitForLeader(t, srv.agent.RPC, "dc1")
// Register nodes.
args := &structs.RegisterRequest{
Datacenter: "dc1",
Node: "bar",
Address: "127.0.0.1",
Service: &structs.NodeService{
Service: "api",
Tags: []string{"a"},
},
}
var out struct{}
if err := srv.agent.RPC("Catalog.Register", args, &out); err != nil {
t.Fatalf("err: %v", err)
}
req, err := http.NewRequest("GET", "/v1/catalog/service/api?tag=a", nil)
if err != nil {
t.Fatalf("err: %v", err)
}
args = &structs.RegisterRequest{
Datacenter: "dc1",
Node: "foo",
Address: "127.0.0.2",
Service: &structs.NodeService{
Service: "api",
Tags: []string{"a"},
},
}
if err := srv.agent.RPC("Catalog.Register", args, &out); err != nil {
t.Fatalf("err: %v", err)
}
// Nobody has coordinates set so this will still return them in the
// order they are indexed.
req, err = http.NewRequest("GET", "/v1/catalog/service/api?tag=a&near=foo", nil)
if err != nil {
t.Fatalf("err: %v", err)
}
resp := httptest.NewRecorder()
obj, err := srv.CatalogServiceNodes(resp, req)
if err != nil {
t.Fatalf("err: %v", err)
}
assertIndex(t, resp)
nodes := obj.(structs.ServiceNodes)
if len(nodes) != 2 {
t.Fatalf("bad: %v", obj)
}
if nodes[0].Node != "bar" {
t.Fatalf("bad: %v", nodes)
}
if nodes[1].Node != "foo" {
t.Fatalf("bad: %v", nodes)
}
// Send an update for the node and wait for it to get applied.
arg := structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "foo",
Coord: coordinate.NewCoordinate(coordinate.DefaultConfig()),
}
if err := srv.agent.RPC("Coordinate.Update", &arg, &out); err != nil {
t.Fatalf("err: %v", err)
}
time.Sleep(200 * time.Millisecond)
// Query again and now foo should have moved to the front of the line.
req, err = http.NewRequest("GET", "/v1/catalog/service/api?tag=a&near=foo", nil)
if err != nil {
t.Fatalf("err: %v", err)
}
resp = httptest.NewRecorder()
obj, err = srv.CatalogServiceNodes(resp, req)
if err != nil {
t.Fatalf("err: %v", err)
}
assertIndex(t, resp)
nodes = obj.(structs.ServiceNodes)
if len(nodes) != 2 {
t.Fatalf("bad: %v", obj)
}
if nodes[0].Node != "foo" {
t.Fatalf("bad: %v", nodes)
}
if nodes[1].Node != "bar" {
t.Fatalf("bad: %v", nodes)
}
}
func TestCatalogNodeServices(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)


@ -368,10 +368,25 @@ type Config struct {
AtlasEndpoint string `mapstructure:"atlas_endpoint"`
// AEInterval controls the anti-entropy interval. This is how often
// the agent attempts to reconcile it's local state with the server'
// the agent attempts to reconcile its local state with the server's
// representation of our state. Defaults to every 60s.
AEInterval time.Duration `mapstructure:"-" json:"-"`
// DisableCoordinates controls features related to network coordinates.
DisableCoordinates bool `mapstructure:"disable_coordinates"`
// SyncCoordinateRateTarget controls the rate for sending network
// coordinates to the server, in updates per second. This is the max rate
// that the server supports, so we scale our interval based on the size
// of the cluster to try to achieve this in aggregate at the server.
SyncCoordinateRateTarget float64 `mapstructure:"-" json:"-"`
// SyncCoordinateIntervalMin sets the minimum interval that coordinates
// will be sent to the server. We scale the interval based on the cluster
// size, but below a certain interval it doesn't make sense to send them any
// faster.
SyncCoordinateIntervalMin time.Duration `mapstructure:"-" json:"-"`
// Checks holds the provided check definitions
Checks []*CheckDefinition `mapstructure:"-" json:"-"`
@ -463,14 +478,23 @@ func DefaultConfig() *Config {
},
StatsitePrefix: "consul",
SyslogFacility: "LOCAL0",
Protocol: consul.ProtocolVersionMax,
Protocol: consul.ProtocolVersion2Compatible,
CheckUpdateInterval: 5 * time.Minute,
AEInterval: time.Minute,
ACLTTL: 30 * time.Second,
ACLDownPolicy: "extend-cache",
ACLDefaultPolicy: "allow",
RetryInterval: 30 * time.Second,
RetryIntervalWan: 30 * time.Second,
DisableCoordinates: false,
// SyncCoordinateRateTarget is set based on the rate that we want
// the server to handle as an aggregate across the entire cluster.
// If you update this, you'll need to adjust CoordinateUpdate* in
// the server-side config accordingly.
SyncCoordinateRateTarget: 64.0, // updates / second
SyncCoordinateIntervalMin: 15 * time.Second,
ACLTTL: 30 * time.Second,
ACLDownPolicy: "extend-cache",
ACLDefaultPolicy: "allow",
RetryInterval: 30 * time.Second,
RetryIntervalWan: 30 * time.Second,
}
}
@ -1063,6 +1087,9 @@ func MergeConfig(a, b *Config) *Config {
if b.AtlasEndpoint != "" {
result.AtlasEndpoint = b.AtlasEndpoint
}
if b.DisableCoordinates {
result.DisableCoordinates = true
}
if b.SessionTTLMinRaw != "" {
result.SessionTTLMin = b.SessionTTLMin
result.SessionTTLMinRaw = b.SessionTTLMinRaw


@ -734,6 +734,17 @@ func TestDecodeConfig(t *testing.T) {
t.Fatalf("bad: %#v", config)
}
// Coordinate disable
input = `{"disable_coordinates": true}`
config, err = DecodeConfig(bytes.NewReader([]byte(input)))
if err != nil {
t.Fatalf("err: %s", err)
}
if config.DisableCoordinates != true {
t.Fatalf("bad: coordinates not disabled: %#v", config)
}
// SessionTTLMin
input = `{"session_ttl_min": "5s"}`
config, err = DecodeConfig(bytes.NewReader([]byte(input)))


@ -0,0 +1,66 @@
package agent
import (
"github.com/hashicorp/consul/consul/structs"
"net/http"
"sort"
)
// coordinateDisabled handles all the endpoints when coordinates are not enabled,
// returning an error message.
func coordinateDisabled(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
resp.WriteHeader(401)
resp.Write([]byte("Coordinate support disabled"))
return nil, nil
}
// sorter wraps a coordinate list and implements the sort.Interface to sort by
// node name.
type sorter struct {
coordinates structs.Coordinates
}
// See sort.Interface.
func (s *sorter) Len() int {
return len(s.coordinates)
}
// See sort.Interface.
func (s *sorter) Swap(i, j int) {
s.coordinates[i], s.coordinates[j] = s.coordinates[j], s.coordinates[i]
}
// See sort.Interface.
func (s *sorter) Less(i, j int) bool {
return s.coordinates[i].Node < s.coordinates[j].Node
}
// CoordinateDatacenters returns the WAN nodes in each datacenter, along with
// raw network coordinates.
func (s *HTTPServer) CoordinateDatacenters(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
var out []structs.DatacenterMap
if err := s.agent.RPC("Coordinate.ListDatacenters", struct{}{}, &out); err != nil {
for i := range out {
sort.Sort(&sorter{out[i].Coordinates})
}
return nil, err
}
return out, nil
}
// CoordinateNodes returns the LAN nodes in the given datacenter, along with
// raw network coordinates.
func (s *HTTPServer) CoordinateNodes(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
args := structs.DCSpecificRequest{}
if done := s.parse(resp, req, &args.Datacenter, &args.QueryOptions); done {
return nil, nil
}
var out structs.IndexedCoordinates
defer setMeta(resp, &out.QueryMeta)
if err := s.agent.RPC("Coordinate.ListNodes", &args, &out); err != nil {
sort.Sort(&sorter{out.Coordinates})
return nil, err
}
return out.Coordinates, nil
}


@ -0,0 +1,105 @@
package agent
import (
"net/http"
"net/http/httptest"
"os"
"testing"
"time"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/consul/testutil"
"github.com/hashicorp/serf/coordinate"
)
func TestCoordinate_Datacenters(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
defer srv.Shutdown()
defer srv.agent.Shutdown()
testutil.WaitForLeader(t, srv.agent.RPC, "dc1")
req, err := http.NewRequest("GET", "/v1/coordinate/datacenters", nil)
if err != nil {
t.Fatalf("err: %v", err)
}
resp := httptest.NewRecorder()
obj, err := srv.CoordinateDatacenters(resp, req)
if err != nil {
t.Fatalf("err: %v", err)
}
maps := obj.([]structs.DatacenterMap)
if len(maps) != 1 ||
maps[0].Datacenter != "dc1" ||
len(maps[0].Coordinates) != 1 ||
maps[0].Coordinates[0].Node != srv.agent.config.NodeName {
t.Fatalf("bad: %v", maps)
}
}
func TestCoordinate_Nodes(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
defer srv.Shutdown()
defer srv.agent.Shutdown()
testutil.WaitForLeader(t, srv.agent.RPC, "dc1")
// Register the nodes.
nodes := []string{"foo", "bar"}
for _, node := range nodes {
req := structs.RegisterRequest{
Datacenter: "dc1",
Node: node,
Address: "127.0.0.1",
}
var reply struct{}
if err := srv.agent.RPC("Catalog.Register", &req, &reply); err != nil {
t.Fatalf("err: %s", err)
}
}
// Send some coordinates for a few nodes, waiting a little while for the
// batch update to run.
arg1 := structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "foo",
Coord: coordinate.NewCoordinate(coordinate.DefaultConfig()),
}
var out struct{}
if err := srv.agent.RPC("Coordinate.Update", &arg1, &out); err != nil {
t.Fatalf("err: %v", err)
}
arg2 := structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "bar",
Coord: coordinate.NewCoordinate(coordinate.DefaultConfig()),
}
if err := srv.agent.RPC("Coordinate.Update", &arg2, &out); err != nil {
t.Fatalf("err: %v", err)
}
time.Sleep(200 * time.Millisecond)
// Query back and check the nodes are present and sorted correctly.
req, err := http.NewRequest("GET", "/v1/coordinate/nodes?dc=dc1", nil)
if err != nil {
t.Fatalf("err: %v", err)
}
resp := httptest.NewRecorder()
obj, err := srv.CoordinateNodes(resp, req)
if err != nil {
t.Fatalf("err: %v", err)
}
coordinates := obj.(structs.Coordinates)
if len(coordinates) != 2 ||
coordinates[0].Node != "bar" ||
coordinates[1].Node != "foo" {
t.Fatalf("bad: %v", coordinates)
}
}


@ -9,6 +9,7 @@ import (
func (s *HTTPServer) HealthChecksInState(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
// Set default DC
args := structs.ChecksInStateRequest{}
s.parseSource(req, &args.Source)
if done := s.parse(resp, req, &args.Datacenter, &args.QueryOptions); done {
return nil, nil
}
@ -57,6 +58,7 @@ func (s *HTTPServer) HealthNodeChecks(resp http.ResponseWriter, req *http.Reques
func (s *HTTPServer) HealthServiceChecks(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
// Set default DC
args := structs.ServiceSpecificRequest{}
s.parseSource(req, &args.Source)
if done := s.parse(resp, req, &args.Datacenter, &args.QueryOptions); done {
return nil, nil
}
@ -81,6 +83,7 @@ func (s *HTTPServer) HealthServiceChecks(resp http.ResponseWriter, req *http.Req
func (s *HTTPServer) HealthServiceNodes(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
// Set default DC
args := structs.ServiceSpecificRequest{}
s.parseSource(req, &args.Source)
if done := s.parse(resp, req, &args.Datacenter, &args.QueryOptions); done {
return nil, nil
}


@ -2,13 +2,16 @@ package agent
import (
"fmt"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/consul/testutil"
"net/http"
"net/http/httptest"
"os"
"reflect"
"testing"
"time"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/consul/testutil"
"github.com/hashicorp/serf/coordinate"
)
func TestHealthChecksInState(t *testing.T) {
@ -38,6 +41,87 @@ func TestHealthChecksInState(t *testing.T) {
})
}
func TestHealthChecksInState_DistanceSort(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
defer srv.Shutdown()
defer srv.agent.Shutdown()
testutil.WaitForLeader(t, srv.agent.RPC, "dc1")
args := &structs.RegisterRequest{
Datacenter: "dc1",
Node: "bar",
Address: "127.0.0.1",
Check: &structs.HealthCheck{
Node: "bar",
Name: "node check",
Status: structs.HealthCritical,
},
}
var out struct{}
if err := srv.agent.RPC("Catalog.Register", args, &out); err != nil {
t.Fatalf("err: %v", err)
}
args.Node, args.Check.Node = "foo", "foo"
if err := srv.agent.RPC("Catalog.Register", args, &out); err != nil {
t.Fatalf("err: %v", err)
}
req, err := http.NewRequest("GET", "/v1/health/state/critical?dc=dc1&near=foo", nil)
if err != nil {
t.Fatalf("err: %v", err)
}
resp := httptest.NewRecorder()
obj, err := srv.HealthChecksInState(resp, req)
if err != nil {
t.Fatalf("err: %v", err)
}
assertIndex(t, resp)
nodes := obj.(structs.HealthChecks)
if len(nodes) != 2 {
t.Fatalf("bad: %v", nodes)
}
if nodes[0].Node != "bar" {
t.Fatalf("bad: %v", nodes)
}
if nodes[1].Node != "foo" {
t.Fatalf("bad: %v", nodes)
}
// Send an update for the node and wait for it to get applied.
arg := structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "foo",
Coord: coordinate.NewCoordinate(coordinate.DefaultConfig()),
}
if err := srv.agent.RPC("Coordinate.Update", &arg, &out); err != nil {
t.Fatalf("err: %v", err)
}
time.Sleep(200 * time.Millisecond)
// Query again and now foo should have moved to the front of the line.
resp = httptest.NewRecorder()
obj, err = srv.HealthChecksInState(resp, req)
if err != nil {
t.Fatalf("err: %v", err)
}
assertIndex(t, resp)
nodes = obj.(structs.HealthChecks)
if len(nodes) != 2 {
t.Fatalf("bad: %v", nodes)
}
if nodes[0].Node != "foo" {
t.Fatalf("bad: %v", nodes)
}
if nodes[1].Node != "bar" {
t.Fatalf("bad: %v", nodes)
}
}
func TestHealthNodeChecks(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
@ -110,6 +194,92 @@ func TestHealthServiceChecks(t *testing.T) {
}
}
func TestHealthServiceChecks_DistanceSort(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
defer srv.Shutdown()
defer srv.agent.Shutdown()
testutil.WaitForLeader(t, srv.agent.RPC, "dc1")
// Create a service check
args := &structs.RegisterRequest{
Datacenter: "dc1",
Node: "bar",
Address: "127.0.0.1",
Service: &structs.NodeService{
ID: "test",
Service: "test",
},
Check: &structs.HealthCheck{
Node: "bar",
Name: "test check",
ServiceID: "test",
},
}
var out struct{}
if err := srv.agent.RPC("Catalog.Register", args, &out); err != nil {
t.Fatalf("err: %v", err)
}
args.Node, args.Check.Node = "foo", "foo"
if err := srv.agent.RPC("Catalog.Register", args, &out); err != nil {
t.Fatalf("err: %v", err)
}
req, err := http.NewRequest("GET", "/v1/health/checks/test?dc=dc1&near=foo", nil)
if err != nil {
t.Fatalf("err: %v", err)
}
resp := httptest.NewRecorder()
obj, err := srv.HealthServiceChecks(resp, req)
if err != nil {
t.Fatalf("err: %v", err)
}
assertIndex(t, resp)
nodes := obj.(structs.HealthChecks)
if len(nodes) != 2 {
t.Fatalf("bad: %v", obj)
}
if nodes[0].Node != "bar" {
t.Fatalf("bad: %v", nodes)
}
if nodes[1].Node != "foo" {
t.Fatalf("bad: %v", nodes)
}
// Send an update for the node and wait for it to get applied.
arg := structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "foo",
Coord: coordinate.NewCoordinate(coordinate.DefaultConfig()),
}
if err := srv.agent.RPC("Coordinate.Update", &arg, &out); err != nil {
t.Fatalf("err: %v", err)
}
time.Sleep(200 * time.Millisecond)
// Query again and now foo should have moved to the front of the line.
resp = httptest.NewRecorder()
obj, err = srv.HealthServiceChecks(resp, req)
if err != nil {
t.Fatalf("err: %v", err)
}
assertIndex(t, resp)
nodes = obj.(structs.HealthChecks)
if len(nodes) != 2 {
t.Fatalf("bad: %v", obj)
}
if nodes[0].Node != "foo" {
t.Fatalf("bad: %v", nodes)
}
if nodes[1].Node != "bar" {
t.Fatalf("bad: %v", nodes)
}
}
func TestHealthServiceNodes(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
@ -138,6 +308,92 @@ func TestHealthServiceNodes(t *testing.T) {
}
}
func TestHealthServiceNodes_DistanceSort(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
defer srv.Shutdown()
defer srv.agent.Shutdown()
testutil.WaitForLeader(t, srv.agent.RPC, "dc1")
// Create a service check
args := &structs.RegisterRequest{
Datacenter: "dc1",
Node: "bar",
Address: "127.0.0.1",
Service: &structs.NodeService{
ID: "test",
Service: "test",
},
Check: &structs.HealthCheck{
Node: "bar",
Name: "test check",
ServiceID: "test",
},
}
var out struct{}
if err := srv.agent.RPC("Catalog.Register", args, &out); err != nil {
t.Fatalf("err: %v", err)
}
args.Node, args.Check.Node = "foo", "foo"
if err := srv.agent.RPC("Catalog.Register", args, &out); err != nil {
t.Fatalf("err: %v", err)
}
req, err := http.NewRequest("GET", "/v1/health/service/test?dc=dc1&near=foo", nil)
if err != nil {
t.Fatalf("err: %v", err)
}
resp := httptest.NewRecorder()
obj, err := srv.HealthServiceNodes(resp, req)
if err != nil {
t.Fatalf("err: %v", err)
}
assertIndex(t, resp)
nodes := obj.(structs.CheckServiceNodes)
if len(nodes) != 2 {
t.Fatalf("bad: %v", obj)
}
if nodes[0].Node.Node != "bar" {
t.Fatalf("bad: %v", nodes)
}
if nodes[1].Node.Node != "foo" {
t.Fatalf("bad: %v", nodes)
}
// Send an update for the node and wait for it to get applied.
arg := structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "foo",
Coord: coordinate.NewCoordinate(coordinate.DefaultConfig()),
}
if err := srv.agent.RPC("Coordinate.Update", &arg, &out); err != nil {
t.Fatalf("err: %v", err)
}
time.Sleep(200 * time.Millisecond)
// Query again and now foo should have moved to the front of the line.
resp = httptest.NewRecorder()
obj, err = srv.HealthServiceNodes(resp, req)
if err != nil {
t.Fatalf("err: %v", err)
}
assertIndex(t, resp)
nodes = obj.(structs.CheckServiceNodes)
if len(nodes) != 2 {
t.Fatalf("bad: %v", obj)
}
if nodes[0].Node.Node != "foo" {
t.Fatalf("bad: %v", nodes)
}
if nodes[1].Node.Node != "bar" {
t.Fatalf("bad: %v", nodes)
}
}
func TestHealthServiceNodes_PassingFilter(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)


@ -206,6 +206,14 @@ func (s *HTTPServer) registerHandlers(enableDebug bool) {
s.mux.HandleFunc("/v1/catalog/service/", s.wrap(s.CatalogServiceNodes))
s.mux.HandleFunc("/v1/catalog/node/", s.wrap(s.CatalogNodeServices))
if !s.agent.config.DisableCoordinates {
s.mux.HandleFunc("/v1/coordinate/datacenters", s.wrap(s.CoordinateDatacenters))
s.mux.HandleFunc("/v1/coordinate/nodes", s.wrap(s.CoordinateNodes))
} else {
s.mux.HandleFunc("/v1/coordinate/datacenters", s.wrap(coordinateDisabled))
s.mux.HandleFunc("/v1/coordinate/nodes", s.wrap(coordinateDisabled))
}
s.mux.HandleFunc("/v1/health/node/", s.wrap(s.HealthNodeChecks))
s.mux.HandleFunc("/v1/health/checks/", s.wrap(s.HealthServiceChecks))
s.mux.HandleFunc("/v1/health/state/", s.wrap(s.HealthChecksInState))
@ -485,6 +493,20 @@ func (s *HTTPServer) parseToken(req *http.Request, token *string) {
*token = s.agent.config.ACLToken
}
// parseSource is used to parse the ?near=<node> query parameter, used for
// sorting by RTT based on a source node. We set the source's DC to the target
// DC in the request, if given, or else the agent's DC.
func (s *HTTPServer) parseSource(req *http.Request, source *structs.QuerySource) {
s.parseDC(req, &source.Datacenter)
if node := req.URL.Query().Get("near"); node != "" {
if node == "_agent" {
source.Node = s.agent.config.NodeName
} else {
source.Node = node
}
}
}
// parse is a convenience method for endpoints that need
// to use both parseWait and parseDC.
func (s *HTTPServer) parse(resp http.ResponseWriter, req *http.Request, dc *string, b *structs.QueryOptions) bool {
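For example (illustrative), a request like GET /v1/catalog/nodes?near=_agent asks for the node list sorted by estimated RTT from the handling agent's own node, while ?near=bob uses the named node as the reference point; TestParseSource below exercises both forms.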


@ -337,6 +337,63 @@ func testPrettyPrint(pretty string, t *testing.T) {
}
}
func TestParseSource(t *testing.T) {
dir, srv := makeHTTPServer(t)
defer os.RemoveAll(dir)
defer srv.Shutdown()
defer srv.agent.Shutdown()
// Default is agent's DC and no node (since the user didn't care, then
// just give them the cheapest possible query).
req, err := http.NewRequest("GET",
"/v1/catalog/nodes", nil)
if err != nil {
t.Fatalf("err: %v", err)
}
source := structs.QuerySource{}
srv.parseSource(req, &source)
if source.Datacenter != "dc1" || source.Node != "" {
t.Fatalf("bad: %v", source)
}
// Adding the source parameter should set that node.
req, err = http.NewRequest("GET",
"/v1/catalog/nodes?near=bob", nil)
if err != nil {
t.Fatalf("err: %v", err)
}
source = structs.QuerySource{}
srv.parseSource(req, &source)
if source.Datacenter != "dc1" || source.Node != "bob" {
t.Fatalf("bad: %v", source)
}
// We should follow whatever dc parameter was given so that the node is
// looked up correctly on the receiving end.
req, err = http.NewRequest("GET",
"/v1/catalog/nodes?near=bob&dc=foo", nil)
if err != nil {
t.Fatalf("err: %v", err)
}
source = structs.QuerySource{}
srv.parseSource(req, &source)
if source.Datacenter != "foo" || source.Node != "bob" {
t.Fatalf("bad: %v", source)
}
// The magic "_agent" node name will use the agent's local node name.
req, err = http.NewRequest("GET",
"/v1/catalog/nodes?near=_agent", nil)
if err != nil {
t.Fatalf("err: %v", err)
}
source = structs.QuerySource{}
srv.parseSource(req, &source)
if source.Datacenter != "dc1" || source.Node != srv.agent.config.NodeName {
t.Fatalf("bad: %v", source)
}
}
func TestParseWait(t *testing.T) {
resp := httptest.NewRecorder()
var b structs.QueryOptions


@ -387,6 +387,12 @@ func TestAgentAntiEntropy_Services_WithChecks(t *testing.T) {
}
}
var testRegisterRules = `
service "api" {
policy = "write"
}
`
func TestAgentAntiEntropy_Services_ACLDeny(t *testing.T) {
conf := nextConfig()
conf.ACLDatacenter = "dc1"
@ -796,8 +802,35 @@ func TestAgent_nestedPauseResume(t *testing.T) {
}
var testRegisterRules = `
service "api" {
policy = "write"
func TestAgent_sendCoordinate(t *testing.T) {
conf := nextConfig()
conf.SyncCoordinateRateTarget = 10.0 // updates/sec
conf.SyncCoordinateIntervalMin = 1 * time.Millisecond
conf.ConsulConfig.CoordinateUpdatePeriod = 100 * time.Millisecond
conf.ConsulConfig.CoordinateUpdateBatchSize = 10
conf.ConsulConfig.CoordinateUpdateMaxBatches = 1
dir, agent := makeAgent(t, conf)
defer os.RemoveAll(dir)
defer agent.Shutdown()
testutil.WaitForLeader(t, agent.RPC, "dc1")
// Wait a little while for an update.
time.Sleep(2 * conf.ConsulConfig.CoordinateUpdatePeriod)
// Make sure the coordinate is present.
req := structs.DCSpecificRequest{
Datacenter: agent.config.Datacenter,
}
var reply structs.IndexedCoordinates
if err := agent.RPC("Coordinate.ListNodes", &req, &reply); err != nil {
t.Fatalf("err: %s", err)
}
if len(reply.Coordinates) != 1 {
t.Fatalf("expected a coordinate: %v", reply)
}
coord := reply.Coordinates[0]
if coord.Node != agent.config.NodeName || coord.Coord == nil {
t.Fatalf("bad: %v", coord)
}
}
`


@ -18,14 +18,17 @@ import (
)
const (
// This scale factor means we will add a minute after we
// cross 128 nodes, another at 256, another at 512, etc.
// By 8192 nodes, we will scale up by a factor of 8
// This scale factor means we will add a minute after we cross 128 nodes,
// another at 256, another at 512, etc. By 8192 nodes, we will scale up
// by a factor of 8.
//
// If you update this, you may need to adjust the tuning of
// CoordinateUpdatePeriod and CoordinateUpdateBatchSize.
aeScaleThreshold = 128
)
// aeScale is used to scale the time interval at which anti-entropy
// take place. It is used to prevent saturation as the cluster size grows
// aeScale is used to scale the time interval at which anti-entropy updates take
// place. It is used to prevent saturation as the cluster size grows.
func aeScale(interval time.Duration, n int) time.Duration {
// Don't scale until we cross the threshold
if n <= aeScaleThreshold {
@ -36,6 +39,17 @@ func aeScale(interval time.Duration, n int) time.Duration {
return time.Duration(multiplier) * interval
}
// rateScaledInterval is used to choose an interval to perform an action in order
// to target an aggregate number of actions per second across the whole cluster.
func rateScaledInterval(rate float64, min time.Duration, n int) time.Duration {
interval := time.Duration(float64(time.Second) * float64(n) / rate)
if interval < min {
return min
}
return interval
}
// Returns a random stagger interval between 0 and the duration
func randomStagger(intv time.Duration) time.Duration {
return time.Duration(uint64(rand.Int63()) % uint64(intv))


@ -24,6 +24,29 @@ func TestAEScale(t *testing.T) {
}
}
func TestRateScaledInterval(t *testing.T) {
min := 1 * time.Second
rate := 200.0
if v := rateScaledInterval(rate, min, 0); v != min {
t.Fatalf("Bad: %v", v)
}
if v := rateScaledInterval(rate, min, 100); v != min {
t.Fatalf("Bad: %v", v)
}
if v := rateScaledInterval(rate, min, 200); v != 1*time.Second {
t.Fatalf("Bad: %v", v)
}
if v := rateScaledInterval(rate, min, 1000); v != 5*time.Second {
t.Fatalf("Bad: %v", v)
}
if v := rateScaledInterval(rate, min, 5000); v != 25*time.Second {
t.Fatalf("Bad: %v", v)
}
if v := rateScaledInterval(rate, min, 10000); v != 50*time.Second {
t.Fatalf("Bad: %v", v)
}
}
func TestRandomStagger(t *testing.T) {
intv := time.Minute
for i := 0; i < 10; i++ {

command/rtt.go (new file, 184 lines)

@ -0,0 +1,184 @@
package command
import (
"flag"
"fmt"
"strings"
"github.com/hashicorp/consul/api"
"github.com/hashicorp/serf/coordinate"
"github.com/mitchellh/cli"
)
// RTTCommand is a Command implementation that allows users to query the
// estimated round trip time between nodes using network coordinates.
type RTTCommand struct {
Ui cli.Ui
}
func (c *RTTCommand) Help() string {
helpText := `
Usage: consul rtt [options] node1 [node2]
Estimates the round trip time between two nodes using Consul's network
coordinate model of the cluster.
At least one node name is required. If the second node name isn't given, it
is set to the agent's node name. Note that these are node names as known to
Consul as "consul members" would show, not IP addresses.
By default, the two nodes are assumed to be nodes in the local datacenter
and the LAN coordinates are used. If the -wan option is given, then the WAN
coordinates are used, and the node names must be suffixed by a period and
the datacenter (eg. "myserver.dc1").
It is not possible to measure between LAN coordinates and WAN coordinates
because they are maintained by independent Serf gossip pools, so they are
not compatible.
Options:
-wan Use WAN coordinates instead of LAN coordinates.
-http-addr=127.0.0.1:8500 HTTP address of the Consul agent.
`
return strings.TrimSpace(helpText)
}
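// Illustration (hypothetical node names, not part of the diff): running
// "consul rtt node1 node2" against the local agent prints a line like
// "Estimated node1 <-> node2 rtt: 0.123 ms (using LAN coordinates)", while the
// WAN form is invoked as "consul rtt -wan myserver.dc1 otherserver.dc2".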
func (c *RTTCommand) Run(args []string) int {
var wan bool
cmdFlags := flag.NewFlagSet("rtt", flag.ContinueOnError)
cmdFlags.Usage = func() { c.Ui.Output(c.Help()) }
cmdFlags.BoolVar(&wan, "wan", false, "wan")
httpAddr := HTTPAddrFlag(cmdFlags)
if err := cmdFlags.Parse(args); err != nil {
return 1
}
// They must provide at least one node.
nodes := cmdFlags.Args()
if len(nodes) < 1 || len(nodes) > 2 {
c.Ui.Error("One or two node names must be specified")
c.Ui.Error("")
c.Ui.Error(c.Help())
return 1
}
// Create and test the HTTP client.
conf := api.DefaultConfig()
conf.Address = *httpAddr
client, err := api.NewClient(conf)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error connecting to Consul agent: %s", err))
return 1
}
coordClient := client.Coordinate()
var source string
var coord1, coord2 *coordinate.Coordinate
if wan {
source = "WAN"
// Default the second node to the agent if none was given.
if len(nodes) < 2 {
agent := client.Agent()
self, err := agent.Self()
if err != nil {
c.Ui.Error(fmt.Sprintf("Unable to look up agent info: %s", err))
return 1
}
node, dc := self["Config"]["NodeName"], self["Config"]["Datacenter"]
nodes = append(nodes, fmt.Sprintf("%s.%s", node, dc))
}
// Parse the input nodes.
parts1 := strings.Split(nodes[0], ".")
parts2 := strings.Split(nodes[1], ".")
if len(parts1) != 2 || len(parts2) != 2 {
c.Ui.Error("Node names must be specified as <datacenter>.<node name> with -wan")
return 1
}
node1, dc1 := parts1[0], parts1[1]
node2, dc2 := parts2[0], parts2[1]
// Pull all the WAN coordinates.
dcs, err := coordClient.Datacenters()
if err != nil {
c.Ui.Error(fmt.Sprintf("Error getting coordinates: %s", err))
return 1
}
// See if the requested nodes are in there.
for _, dc := range dcs {
for _, entry := range dc.Coordinates {
if dc.Datacenter == dc1 && entry.Node == node1 {
coord1 = entry.Coord
}
if dc.Datacenter == dc2 && entry.Node == node2 {
coord2 = entry.Coord
}
if coord1 != nil && coord2 != nil {
goto SHOW_RTT
}
}
}
} else {
source = "LAN"
// Default the second node to the agent if none was given.
if len(nodes) < 2 {
agent := client.Agent()
node, err := agent.NodeName()
if err != nil {
c.Ui.Error(fmt.Sprintf("Unable to look up agent info: %s", err))
return 1
}
nodes = append(nodes, node)
}
// Pull all the LAN coordinates.
entries, _, err := coordClient.Nodes(nil)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error getting coordinates: %s", err))
return 1
}
// See if the requested nodes are in there.
for _, entry := range entries {
if entry.Node == nodes[0] {
coord1 = entry.Coord
}
if entry.Node == nodes[1] {
coord2 = entry.Coord
}
if coord1 != nil && coord2 != nil {
goto SHOW_RTT
}
}
}
// Make sure we found both coordinates.
if coord1 == nil {
c.Ui.Error(fmt.Sprintf("Could not find a coordinate for node %q", nodes[0]))
return 1
}
if coord2 == nil {
c.Ui.Error(fmt.Sprintf("Could not find a coordinate for node %q", nodes[1]))
return 1
}
SHOW_RTT:
// Report the round trip time.
dist := fmt.Sprintf("%.3f ms", coord1.DistanceTo(coord2).Seconds()*1000.0)
c.Ui.Output(fmt.Sprintf("Estimated %s <-> %s rtt: %s (using %s coordinates)", nodes[0], nodes[1], dist, source))
return 0
}
func (c *RTTCommand) Synopsis() string {
return "Estimates network round trip time between nodes"
}

command/rtt_test.go (new file, 215 lines)

@ -0,0 +1,215 @@
package command
import (
"fmt"
"strings"
"testing"
"time"
"github.com/hashicorp/consul/command/agent"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/serf/coordinate"
"github.com/mitchellh/cli"
)
func TestRTTCommand_Implements(t *testing.T) {
var _ cli.Command = &RTTCommand{}
}
func TestRTTCommand_Run_BadArgs(t *testing.T) {
ui := new(cli.MockUi)
c := &RTTCommand{Ui: ui}
if code := c.Run([]string{}); code != 1 {
t.Fatalf("expected return code 1, got %d", code)
}
if code := c.Run([]string{"node1", "node2", "node3"}); code != 1 {
t.Fatalf("expected return code 1, got %d", code)
}
if code := c.Run([]string{"-wan", "node1", "node2"}); code != 1 {
t.Fatalf("expected return code 1, got %d", code)
}
if code := c.Run([]string{"-wan", "node1.dc1", "node2"}); code != 1 {
t.Fatalf("expected return code 1, got %d", code)
}
if code := c.Run([]string{"-wan", "node1", "node2.dc1"}); code != 1 {
t.Fatalf("expected return code 1, got %d", code)
}
}
func TestRTTCommand_Run_LAN(t *testing.T) {
updatePeriod := 10 * time.Millisecond
a := testAgentWithConfig(t, func(c *agent.Config) {
c.ConsulConfig.CoordinateUpdatePeriod = updatePeriod
})
defer a.Shutdown()
waitForLeader(t, a.httpAddr)
// Inject some known coordinates.
c1 := coordinate.NewCoordinate(coordinate.DefaultConfig())
c2 := c1.Clone()
c2.Vec[0] = 0.123
dist_str := fmt.Sprintf("%.3f ms", c1.DistanceTo(c2).Seconds()*1000.0)
{
req := structs.CoordinateUpdateRequest{
Datacenter: a.config.Datacenter,
Node: a.config.NodeName,
Coord: c1,
}
var reply struct{}
if err := a.agent.RPC("Coordinate.Update", &req, &reply); err != nil {
t.Fatalf("err: %s", err)
}
}
{
req := structs.RegisterRequest{
Datacenter: a.config.Datacenter,
Node: "dogs",
Address: "127.0.0.2",
}
var reply struct{}
if err := a.agent.RPC("Catalog.Register", &req, &reply); err != nil {
t.Fatalf("err: %s", err)
}
}
{
var reply struct{}
req := structs.CoordinateUpdateRequest{
Datacenter: a.config.Datacenter,
Node: "dogs",
Coord: c2,
}
if err := a.agent.RPC("Coordinate.Update", &req, &reply); err != nil {
t.Fatalf("err: %s", err)
}
}
// Wait for the updates to get flushed to the data store.
time.Sleep(2 * updatePeriod)
// Try two known nodes.
{
ui := new(cli.MockUi)
c := &RTTCommand{Ui: ui}
args := []string{
"-http-addr=" + a.httpAddr,
a.config.NodeName,
"dogs",
}
code := c.Run(args)
if code != 0 {
t.Fatalf("bad: %d: %#v", code, ui.ErrorWriter.String())
}
// Make sure the proper RTT was reported in the output.
expected := fmt.Sprintf("rtt: %s", dist_str)
if !strings.Contains(ui.OutputWriter.String(), expected) {
t.Fatalf("bad: %#v", ui.OutputWriter.String())
}
}
// Default to the agent's node.
{
ui := new(cli.MockUi)
c := &RTTCommand{Ui: ui}
args := []string{
"-http-addr=" + a.httpAddr,
"dogs",
}
code := c.Run(args)
if code != 0 {
t.Fatalf("bad: %d: %#v", code, ui.ErrorWriter.String())
}
// Make sure the proper RTT was reported in the output.
expected := fmt.Sprintf("rtt: %s", dist_str)
if !strings.Contains(ui.OutputWriter.String(), expected) {
t.Fatalf("bad: %#v", ui.OutputWriter.String())
}
}
// Try an unknown node.
{
ui := new(cli.MockUi)
c := &RTTCommand{Ui: ui}
args := []string{
"-http-addr=" + a.httpAddr,
a.config.NodeName,
"nope",
}
code := c.Run(args)
if code != 1 {
t.Fatalf("bad: %d: %#v", code, ui.ErrorWriter.String())
}
}
}
func TestRTTCommand_Run_WAN(t *testing.T) {
a := testAgent(t)
defer a.Shutdown()
waitForLeader(t, a.httpAddr)
node := fmt.Sprintf("%s.%s", a.config.NodeName, a.config.Datacenter)
// We can't easily inject WAN coordinates, so we will just query the
// node with itself.
{
ui := new(cli.MockUi)
c := &RTTCommand{Ui: ui}
args := []string{
"-wan",
"-http-addr=" + a.httpAddr,
node,
node,
}
code := c.Run(args)
if code != 0 {
t.Fatalf("bad: %d: %#v", code, ui.ErrorWriter.String())
}
// Make sure there was some kind of RTT reported in the output.
if !strings.Contains(ui.OutputWriter.String(), "rtt: ") {
t.Fatalf("bad: %#v", ui.OutputWriter.String())
}
}
// Default to the agent's node.
{
ui := new(cli.MockUi)
c := &RTTCommand{Ui: ui}
args := []string{
"-wan",
"-http-addr=" + a.httpAddr,
node,
}
code := c.Run(args)
if code != 0 {
t.Fatalf("bad: %d: %#v", code, ui.ErrorWriter.String())
}
// Make sure there was some kind of RTT reported in the output.
if !strings.Contains(ui.OutputWriter.String(), "rtt: ") {
t.Fatalf("bad: %#v", ui.OutputWriter.String())
}
}
// Try an unknown node.
{
ui := new(cli.MockUi)
c := &RTTCommand{Ui: ui}
args := []string{
"-wan",
"-http-addr=" + a.httpAddr,
node,
"dc1.nope",
}
code := c.Run(args)
if code != 1 {
t.Fatalf("bad: %d: %#v", code, ui.ErrorWriter.String())
}
}
}


@ -114,6 +114,12 @@ func init() {
}, nil
},
"rtt": func() (cli.Command, error) {
return &command.RTTCommand{
Ui: ui,
}, nil
},
"version": func() (cli.Command, error) {
ver := Version
rel := VersionPrerelease


@ -105,8 +105,13 @@ func (c *Catalog) ListDatacenters(args *struct{}, reply *[]string) error {
dcs = append(dcs, dc)
}
// Sort the DCs
// TODO - do we want to control the sort behavior with an argument?
// Sort the DCs by name first, then apply a stable sort by distance.
sort.Strings(dcs)
if err := c.srv.sortDatacentersByDistance(dcs); err != nil {
return err
}
// Return
*reply = dcs
@ -132,7 +137,7 @@ func (c *Catalog) ListNodes(args *structs.DCSpecificRequest, reply *structs.Inde
}
reply.Index, reply.Nodes = index, nodes
return nil
return c.srv.sortNodesByDistanceFrom(args.Source, reply.Nodes)
})
}
@ -189,7 +194,10 @@ func (c *Catalog) ServiceNodes(args *structs.ServiceSpecificRequest, reply *stru
return err
}
reply.Index, reply.ServiceNodes = index, services
return c.srv.filterACL(args.Token, reply)
if err := c.srv.filterACL(args.Token, reply); err != nil {
return err
}
return c.srv.sortNodesByDistanceFrom(args.Source, reply.ServiceNodes)
})
// Provide some metrics


@ -4,7 +4,6 @@ import (
"fmt"
"net/rpc"
"os"
"sort"
"strings"
"testing"
"time"
@ -234,9 +233,7 @@ func TestCatalogListDatacenters(t *testing.T) {
t.Fatalf("err: %v", err)
}
// Sort the dcs
sort.Strings(out)
// The DCs should come out sorted by default.
if len(out) != 2 {
t.Fatalf("bad: %v", out)
}
@ -248,6 +245,57 @@ func TestCatalogListDatacenters(t *testing.T) {
}
}
func TestCatalogListDatacenters_DistanceSort(t *testing.T) {
dir1, s1 := testServer(t)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
codec := rpcClient(t, s1)
defer codec.Close()
dir2, s2 := testServerDC(t, "dc2")
defer os.RemoveAll(dir2)
defer s2.Shutdown()
dir3, s3 := testServerDC(t, "acdc")
defer os.RemoveAll(dir3)
defer s3.Shutdown()
// Try to join
addr := fmt.Sprintf("127.0.0.1:%d",
s1.config.SerfWANConfig.MemberlistConfig.BindPort)
if _, err := s2.JoinWAN([]string{addr}); err != nil {
t.Fatalf("err: %v", err)
}
if _, err := s3.JoinWAN([]string{addr}); err != nil {
t.Fatalf("err: %v", err)
}
testutil.WaitForLeader(t, s1.RPC, "dc1")
var out []string
if err := msgpackrpc.CallWithCodec(codec, "Catalog.ListDatacenters", struct{}{}, &out); err != nil {
t.Fatalf("err: %v", err)
}
// It's super hard to force the Serfs into a known configuration of
// coordinates, so the best we can do is make sure that the sorting
// function is getting called (it's tested extensively in rtt_test.go).
// Since this is relative to dc1, it will be listed first (proving we
// went into the sort fn) and the other two will be sorted by name since
// there are no known coordinates for them.
if len(out) != 3 {
t.Fatalf("bad: %v", out)
}
if out[0] != "dc1" {
t.Fatalf("bad: %v", out)
}
if out[1] != "acdc" {
t.Fatalf("bad: %v", out)
}
if out[2] != "dc2" {
t.Fatalf("bad: %v", out)
}
}
func TestCatalogListNodes(t *testing.T) {
dir1, s1 := testServer(t)
defer os.RemoveAll(dir1)
@ -456,6 +504,94 @@ func TestCatalogListNodes_ConsistentRead(t *testing.T) {
}
}
func TestCatalogListNodes_DistanceSort(t *testing.T) {
dir1, s1 := testServer(t)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
codec := rpcClient(t, s1)
defer codec.Close()
testutil.WaitForLeader(t, s1.RPC, "dc1")
if err := s1.fsm.State().EnsureNode(1, &structs.Node{Node: "aaa", Address: "127.0.0.1"}); err != nil {
t.Fatalf("err: %v", err)
}
if err := s1.fsm.State().EnsureNode(2, &structs.Node{Node: "foo", Address: "127.0.0.2"}); err != nil {
t.Fatalf("err: %v", err)
}
if err := s1.fsm.State().EnsureNode(3, &structs.Node{Node: "bar", Address: "127.0.0.3"}); err != nil {
t.Fatalf("err: %v", err)
}
if err := s1.fsm.State().EnsureNode(4, &structs.Node{Node: "baz", Address: "127.0.0.4"}); err != nil {
t.Fatalf("err: %v", err)
}
// Set all but one of the nodes to known coordinates.
updates := structs.Coordinates{
{"foo", generateCoordinate(2 * time.Millisecond)},
{"bar", generateCoordinate(5 * time.Millisecond)},
{"baz", generateCoordinate(1 * time.Millisecond)},
}
if err := s1.fsm.State().CoordinateBatchUpdate(5, updates); err != nil {
t.Fatalf("err: %v", err)
}
// Query with no given source node, should get the natural order from
// the index.
args := structs.DCSpecificRequest{
Datacenter: "dc1",
}
var out structs.IndexedNodes
testutil.WaitForResult(func() (bool, error) {
msgpackrpc.CallWithCodec(codec, "Catalog.ListNodes", &args, &out)
return len(out.Nodes) == 5, nil
}, func(err error) {
t.Fatalf("err: %v", err)
})
if out.Nodes[0].Node != "aaa" {
t.Fatalf("bad: %v", out)
}
if out.Nodes[1].Node != "bar" {
t.Fatalf("bad: %v", out)
}
if out.Nodes[2].Node != "baz" {
t.Fatalf("bad: %v", out)
}
if out.Nodes[3].Node != "foo" {
t.Fatalf("bad: %v", out)
}
if out.Nodes[4].Node != s1.config.NodeName {
t.Fatalf("bad: %v", out)
}
// Query relative to foo, note that there's no known coordinate for the
// default-added Serf node nor "aaa" so they will go at the end.
args = structs.DCSpecificRequest{
Datacenter: "dc1",
Source: structs.QuerySource{Datacenter: "dc1", Node: "foo"},
}
testutil.WaitForResult(func() (bool, error) {
msgpackrpc.CallWithCodec(codec, "Catalog.ListNodes", &args, &out)
return len(out.Nodes) == 5, nil
}, func(err error) {
t.Fatalf("err: %v", err)
})
if out.Nodes[0].Node != "foo" {
t.Fatalf("bad: %v", out)
}
if out.Nodes[1].Node != "baz" {
t.Fatalf("bad: %v", out)
}
if out.Nodes[2].Node != "bar" {
t.Fatalf("bad: %v", out)
}
if out.Nodes[3].Node != "aaa" {
t.Fatalf("bad: %v", out)
}
if out.Nodes[4].Node != s1.config.NodeName {
t.Fatalf("bad: %v", out)
}
}
func BenchmarkCatalogListNodes(t *testing.B) {
dir1, s1 := testServer(nil)
defer os.RemoveAll(dir1)
@ -714,6 +850,93 @@ func TestCatalogListServiceNodes(t *testing.T) {
}
}
func TestCatalogListServiceNodes_DistanceSort(t *testing.T) {
dir1, s1 := testServer(t)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
codec := rpcClient(t, s1)
defer codec.Close()
args := structs.ServiceSpecificRequest{
Datacenter: "dc1",
ServiceName: "db",
}
var out structs.IndexedServiceNodes
err := msgpackrpc.CallWithCodec(codec, "Catalog.ServiceNodes", &args, &out)
if err == nil || err.Error() != "No cluster leader" {
t.Fatalf("err: %v", err)
}
testutil.WaitForLeader(t, s1.RPC, "dc1")
// Add a few nodes for the associated services.
s1.fsm.State().EnsureNode(1, &structs.Node{Node: "aaa", Address: "127.0.0.1"})
s1.fsm.State().EnsureService(2, "aaa", &structs.NodeService{ID: "db", Service: "db", Tags: []string{"primary"}, Address: "127.0.0.1", Port: 5000})
s1.fsm.State().EnsureNode(3, &structs.Node{Node: "foo", Address: "127.0.0.2"})
s1.fsm.State().EnsureService(4, "foo", &structs.NodeService{ID: "db", Service: "db", Tags: []string{"primary"}, Address: "127.0.0.2", Port: 5000})
s1.fsm.State().EnsureNode(5, &structs.Node{Node: "bar", Address: "127.0.0.3"})
s1.fsm.State().EnsureService(6, "bar", &structs.NodeService{ID: "db", Service: "db", Tags: []string{"primary"}, Address: "127.0.0.3", Port: 5000})
s1.fsm.State().EnsureNode(7, &structs.Node{Node: "baz", Address: "127.0.0.4"})
s1.fsm.State().EnsureService(8, "baz", &structs.NodeService{ID: "db", Service: "db", Tags: []string{"primary"}, Address: "127.0.0.4", Port: 5000})
// Set all but one of the nodes to known coordinates.
updates := structs.Coordinates{
{"foo", generateCoordinate(2 * time.Millisecond)},
{"bar", generateCoordinate(5 * time.Millisecond)},
{"baz", generateCoordinate(1 * time.Millisecond)},
}
if err := s1.fsm.State().CoordinateBatchUpdate(9, updates); err != nil {
t.Fatalf("err: %v", err)
}
// Query with no given source node, should get the natural order from
// the index.
if err := msgpackrpc.CallWithCodec(codec, "Catalog.ServiceNodes", &args, &out); err != nil {
t.Fatalf("err: %v", err)
}
if len(out.ServiceNodes) != 4 {
t.Fatalf("bad: %v", out)
}
if out.ServiceNodes[0].Node != "aaa" {
t.Fatalf("bad: %v", out)
}
if out.ServiceNodes[1].Node != "bar" {
t.Fatalf("bad: %v", out)
}
if out.ServiceNodes[2].Node != "baz" {
t.Fatalf("bad: %v", out)
}
if out.ServiceNodes[3].Node != "foo" {
t.Fatalf("bad: %v", out)
}
// Query relative to foo, note that there's no known coordinate for "aaa"
// so it will go at the end.
args = structs.ServiceSpecificRequest{
Datacenter: "dc1",
ServiceName: "db",
Source: structs.QuerySource{Datacenter: "dc1", Node: "foo"},
}
if err := msgpackrpc.CallWithCodec(codec, "Catalog.ServiceNodes", &args, &out); err != nil {
t.Fatalf("err: %v", err)
}
if len(out.ServiceNodes) != 4 {
t.Fatalf("bad: %v", out)
}
if out.ServiceNodes[0].Node != "foo" {
t.Fatalf("bad: %v", out)
}
if out.ServiceNodes[1].Node != "baz" {
t.Fatalf("bad: %v", out)
}
if out.ServiceNodes[2].Node != "bar" {
t.Fatalf("bad: %v", out)
}
if out.ServiceNodes[3].Node != "aaa" {
t.Fatalf("bad: %v", out)
}
}
func TestCatalogNodeServices(t *testing.T) {
dir1, s1 := testServer(t)
defer os.RemoveAll(dir1)


@ -12,6 +12,7 @@ import (
"time"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/serf/coordinate"
"github.com/hashicorp/serf/serf"
)
@ -138,6 +139,7 @@ func (c *Client) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (
conf.ProtocolVersion = protocolVersionMap[c.config.ProtocolVersion]
conf.RejoinAfterLeave = c.config.RejoinAfterLeave
conf.Merge = &lanMergeDelegate{dc: c.config.Datacenter}
conf.DisableCoordinates = c.config.DisableCoordinates
if err := ensurePath(conf.SnapshotPath, false); err != nil {
return nil, err
}
@ -376,3 +378,9 @@ func (c *Client) Stats() map[string]map[string]string {
}
return stats
}
// GetCoordinate returns the network coordinate of the current node, as
// maintained by Serf.
func (c *Client) GetCoordinate() (*coordinate.Coordinate, error) {
return c.serf.GetCoordinate()
}

View file

@ -32,6 +32,7 @@ func init() {
protocolVersionMap = map[uint8]uint8{
1: 4,
2: 4,
3: 4,
}
}
@ -202,6 +203,24 @@ type Config struct {
// UserEventHandler callback can be used to handle incoming
// user events. This function should not block.
UserEventHandler func(serf.UserEvent)
// DisableCoordinates controls features related to network coordinates.
DisableCoordinates bool
// CoordinateUpdatePeriod controls how long a server batches coordinate
// updates before applying them in a Raft transaction. A larger period
// results in fewer Raft transactions, but the stored coordinates will be
// more stale.
CoordinateUpdatePeriod time.Duration
// CoordinateUpdateBatchSize controls the maximum number of updates a
// server batches before applying them in a Raft transaction.
CoordinateUpdateBatchSize int
// CoordinateUpdateMaxBatches controls the maximum number of batches we
// are willing to apply in one period. After this limit we will issue a
// warning and discard the remaining updates.
CoordinateUpdateMaxBatches int
}
// CheckVersion is used to check if the ProtocolVersion is valid
@ -249,13 +268,21 @@ func DefaultConfig() *Config {
SerfLANConfig: serf.DefaultConfig(),
SerfWANConfig: serf.DefaultConfig(),
ReconcileInterval: 60 * time.Second,
ProtocolVersion: ProtocolVersionMax,
ProtocolVersion: ProtocolVersion2Compatible,
ACLTTL: 30 * time.Second,
ACLDefaultPolicy: "allow",
ACLDownPolicy: "extend-cache",
TombstoneTTL: 15 * time.Minute,
TombstoneTTLGranularity: 30 * time.Second,
SessionTTLMin: 10 * time.Second,
DisableCoordinates: false,
// These are tuned to provide a total throughput of 128 updates
// per second. If you update these, you should update the client-
// side SyncCoordinateRateTarget parameter accordingly.
CoordinateUpdatePeriod: 5 * time.Second,
CoordinateUpdateBatchSize: 128,
CoordinateUpdateMaxBatches: 5,
}
// Increase our reap interval to 3 days instead of 24h.
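As a back-of-the-envelope check on the tuning comment above, here is a small standalone sketch (not part of this change; the variable names simply mirror the config fields) showing how the defaults translate into the advertised 128 updates per second:

package main

import (
    "fmt"
    "time"
)

func main() {
    period := 5 * time.Second // CoordinateUpdatePeriod
    batchSize := 128          // CoordinateUpdateBatchSize
    maxBatches := 5           // CoordinateUpdateMaxBatches

    // A server applies at most batchSize*maxBatches updates per period;
    // anything beyond that is discarded with a warning.
    perPeriod := batchSize * maxBatches
    perSecond := float64(perPeriod) / period.Seconds()
    fmt.Printf("cap: %d updates per %v (%.0f updates/sec)\n", perPeriod, period, perSecond)
}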

View file

@ -0,0 +1,170 @@
package consul
import (
"fmt"
"sort"
"strings"
"sync"
"time"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/serf/coordinate"
)
// Coordinate manages queries and updates for network coordinates.
type Coordinate struct {
// srv is a pointer back to the server.
srv *Server
// updates holds pending coordinate updates for the given nodes.
updates map[string]*coordinate.Coordinate
// updatesLock synchronizes access to the updates map.
updatesLock sync.Mutex
}
// NewCoordinate returns a new Coordinate endpoint.
func NewCoordinate(srv *Server) *Coordinate {
c := &Coordinate{
srv: srv,
updates: make(map[string]*coordinate.Coordinate),
}
go c.batchUpdate()
return c
}
// batchUpdate is a long-running routine that flushes pending coordinates to the
// Raft log in batches.
func (c *Coordinate) batchUpdate() {
for {
select {
case <-time.After(c.srv.config.CoordinateUpdatePeriod):
if err := c.batchApplyUpdates(); err != nil {
c.srv.logger.Printf("[ERR] consul.coordinate: Batch update failed: %v", err)
}
case <-c.srv.shutdownCh:
return
}
}
}
// batchApplyUpdates applies all pending updates to the Raft log in a series of
// batches.
func (c *Coordinate) batchApplyUpdates() error {
// Grab the pending updates and release the lock so we can still handle
// incoming messages.
c.updatesLock.Lock()
pending := c.updates
c.updates = make(map[string]*coordinate.Coordinate)
c.updatesLock.Unlock()
// Enforce the rate limit.
limit := c.srv.config.CoordinateUpdateBatchSize * c.srv.config.CoordinateUpdateMaxBatches
size := len(pending)
if size > limit {
c.srv.logger.Printf("[WARN] consul.coordinate: Discarded %d coordinate updates", size-limit)
size = limit
}
// Transform the map into a slice that we can feed to the Raft log in
// batches.
i := 0
updates := make(structs.Coordinates, size)
for node, coord := range pending {
if i >= size {
break
}
updates[i] = &structs.Coordinate{node, coord}
i++
}
// Apply the updates to the Raft log in batches.
for start := 0; start < size; start += c.srv.config.CoordinateUpdateBatchSize {
end := start + c.srv.config.CoordinateUpdateBatchSize
if end > size {
end = size
}
slice := updates[start:end]
if _, err := c.srv.raftApply(structs.CoordinateBatchUpdateType, slice); err != nil {
return err
}
}
return nil
}
// Update inserts or updates the LAN coordinate of a node.
func (c *Coordinate) Update(args *structs.CoordinateUpdateRequest, reply *struct{}) (err error) {
if done, err := c.srv.forward("Coordinate.Update", args, args, reply); done {
return err
}
// Since this is a coordinate coming from somewhere else, we harden this
// and look for dimensionality problems proactively.
coord, err := c.srv.serfLAN.GetCoordinate()
if err != nil {
return err
}
if !coord.IsCompatibleWith(args.Coord) {
return fmt.Errorf("rejected bad coordinate: %v", args.Coord)
}
// Add the coordinate to the map of pending updates.
c.updatesLock.Lock()
c.updates[args.Node] = args.Coord
c.updatesLock.Unlock()
return nil
}
// ListDatacenters returns the list of datacenters and their respective nodes
// and the raw coordinates of those nodes (if no coordinates are available for
// any of the nodes, the node list may be empty).
func (c *Coordinate) ListDatacenters(args *struct{}, reply *[]structs.DatacenterMap) error {
c.srv.remoteLock.RLock()
defer c.srv.remoteLock.RUnlock()
// Build up a map of all the DCs, sort it first since getDatacenterMaps
// will preserve the order of this list in the output.
dcs := make([]string, 0, len(c.srv.remoteConsuls))
for dc := range c.srv.remoteConsuls {
dcs = append(dcs, dc)
}
sort.Strings(dcs)
maps := c.srv.getDatacenterMaps(dcs)
// Strip the datacenter suffixes from all the node names.
for i := range maps {
suffix := fmt.Sprintf(".%s", maps[i].Datacenter)
for j := range maps[i].Coordinates {
node := maps[i].Coordinates[j].Node
maps[i].Coordinates[j].Node = strings.TrimSuffix(node, suffix)
}
}
*reply = maps
return nil
}
// ListNodes returns the list of nodes with their raw network coordinates (if no
// coordinates are available for a node it won't appear in this list).
func (c *Coordinate) ListNodes(args *structs.DCSpecificRequest, reply *structs.IndexedCoordinates) error {
if done, err := c.srv.forward("Coordinate.ListNodes", args, args, reply); done {
return err
}
state := c.srv.fsm.State()
return c.srv.blockingRPC(&args.QueryOptions,
&reply.QueryMeta,
state.GetQueryWatch("Coordinates"),
func() error {
index, coords, err := state.Coordinates()
if err != nil {
return err
}
reply.Index, reply.Coordinates = index, coords
return nil
})
}

View file

@ -0,0 +1,291 @@
package consul
import (
"fmt"
"math/rand"
"os"
"reflect"
"strings"
"testing"
"time"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/consul/testutil"
"github.com/hashicorp/net-rpc-msgpackrpc"
"github.com/hashicorp/serf/coordinate"
)
// generateRandomCoordinate creates a random coordinate. This mucks with the
// underlying structure directly, so it's not really useful for any particular
// position in the network, but it's a good payload to send through to make
// sure things come out the other side or get stored correctly.
func generateRandomCoordinate() *coordinate.Coordinate {
config := coordinate.DefaultConfig()
coord := coordinate.NewCoordinate(config)
for i := range coord.Vec {
coord.Vec[i] = rand.NormFloat64()
}
coord.Error = rand.NormFloat64()
coord.Adjustment = rand.NormFloat64()
return coord
}
// verifyCoordinatesEqual will compare a and b and fail if they are not exactly
// equal (no floating point fuzz is considered since we are trying to make sure
// we are getting exactly the coordinates we expect, without math on them).
func verifyCoordinatesEqual(t *testing.T, a, b *coordinate.Coordinate) {
if !reflect.DeepEqual(a, b) {
t.Fatalf("coordinates are not equal: %v != %v", a, b)
}
}
func TestCoordinate_Update(t *testing.T) {
name := fmt.Sprintf("Node %d", getPort())
dir1, config1 := testServerConfig(t, name)
defer os.RemoveAll(dir1)
config1.CoordinateUpdatePeriod = 500 * time.Millisecond
config1.CoordinateUpdateBatchSize = 5
config1.CoordinateUpdateMaxBatches = 2
s1, err := NewServer(config1)
if err != nil {
t.Fatal(err)
}
defer s1.Shutdown()
codec := rpcClient(t, s1)
defer codec.Close()
testutil.WaitForLeader(t, s1.RPC, "dc1")
// Register some nodes.
nodes := []string{"node1", "node2"}
for _, node := range nodes {
req := structs.RegisterRequest{
Datacenter: "dc1",
Node: node,
Address: "127.0.0.1",
}
var reply struct{}
if err := msgpackrpc.CallWithCodec(codec, "Catalog.Register", &req, &reply); err != nil {
t.Fatalf("err: %v", err)
}
}
// Send an update for the first node.
arg1 := structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "node1",
Coord: generateRandomCoordinate(),
}
var out struct{}
if err := msgpackrpc.CallWithCodec(codec, "Coordinate.Update", &arg1, &out); err != nil {
t.Fatalf("err: %v", err)
}
// Send an update for the second node.
arg2 := structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "node2",
Coord: generateRandomCoordinate(),
}
if err := msgpackrpc.CallWithCodec(codec, "Coordinate.Update", &arg2, &out); err != nil {
t.Fatalf("err: %v", err)
}
// Make sure the updates did not yet apply because the update period
// hasn't expired.
state := s1.fsm.State()
c, err := state.CoordinateGetRaw("node1")
if err != nil {
t.Fatalf("err: %v", err)
}
if c != nil {
t.Fatalf("should be nil because the update should be batched")
}
c, err = state.CoordinateGetRaw("node2")
if err != nil {
t.Fatalf("err: %v", err)
}
if c != nil {
t.Fatalf("should be nil because the update should be batched")
}
// Send another update for the second node. It should take precedence
// since there will be two updates in the same batch.
arg2.Coord = generateRandomCoordinate()
if err := msgpackrpc.CallWithCodec(codec, "Coordinate.Update", &arg2, &out); err != nil {
t.Fatalf("err: %v", err)
}
// Wait a while and the updates should get picked up.
time.Sleep(2 * s1.config.CoordinateUpdatePeriod)
c, err = state.CoordinateGetRaw("node1")
if err != nil {
t.Fatalf("err: %v", err)
}
if c == nil {
t.Fatalf("should return a coordinate but it's nil")
}
verifyCoordinatesEqual(t, c, arg1.Coord)
c, err = state.CoordinateGetRaw("node2")
if err != nil {
t.Fatalf("err: %v", err)
}
if c == nil {
t.Fatalf("should return a coordinate but it's nil")
}
verifyCoordinatesEqual(t, c, arg2.Coord)
// Register a bunch of additional nodes.
spamLen := s1.config.CoordinateUpdateBatchSize*s1.config.CoordinateUpdateMaxBatches + 1
for i := 0; i < spamLen; i++ {
req := structs.RegisterRequest{
Datacenter: "dc1",
Node: fmt.Sprintf("bogusnode%d", i),
Address: "127.0.0.1",
}
var reply struct{}
if err := msgpackrpc.CallWithCodec(codec, "Catalog.Register", &req, &reply); err != nil {
t.Fatalf("err: %v", err)
}
}
// Now spam some coordinate updates and make sure it starts throwing
// them away if they exceed the batch allowance. Note we have to make
// unique names since these are held in a map by node name.
for i := 0; i < spamLen; i++ {
arg1.Node = fmt.Sprintf("bogusnode%d", i)
arg1.Coord = generateRandomCoordinate()
if err := msgpackrpc.CallWithCodec(codec, "Coordinate.Update", &arg1, &out); err != nil {
t.Fatalf("err: %v", err)
}
}
// Wait a little while for the batch routine to run, then make sure
// exactly one of the updates got dropped (we won't know which one).
time.Sleep(2 * s1.config.CoordinateUpdatePeriod)
numDropped := 0
for i := 0; i < spamLen; i++ {
c, err = state.CoordinateGetRaw(fmt.Sprintf("bogusnode%d", i))
if err != nil {
t.Fatalf("err: %v", err)
}
if c == nil {
numDropped++
}
}
if numDropped != 1 {
t.Fatalf("wrong number of coordinates dropped, %d != 1", numDropped)
}
// Finally, send a coordinate with the wrong dimensionality to make sure
// there are no panics, and that it gets rejected.
arg2.Coord.Vec = make([]float64, 2*len(arg2.Coord.Vec))
err = msgpackrpc.CallWithCodec(codec, "Coordinate.Update", &arg2, &out)
if err == nil || !strings.Contains(err.Error(), "rejected bad coordinate") {
t.Fatalf("should have failed with an error, got %v", err)
}
}
func TestCoordinate_ListDatacenters(t *testing.T) {
dir1, s1 := testServer(t)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
codec := rpcClient(t, s1)
defer codec.Close()
testutil.WaitForLeader(t, s1.RPC, "dc1")
// It's super hard to force the Serfs into a known configuration of
// coordinates, so the best we can do is make sure our own DC shows
// up in the list with the proper coordinates. The guts of the algorithm
// are extensively tested in rtt_test.go using a mock database.
var out []structs.DatacenterMap
if err := msgpackrpc.CallWithCodec(codec, "Coordinate.ListDatacenters", struct{}{}, &out); err != nil {
t.Fatalf("err: %v", err)
}
if len(out) != 1 ||
out[0].Datacenter != "dc1" ||
len(out[0].Coordinates) != 1 ||
out[0].Coordinates[0].Node != s1.config.NodeName {
t.Fatalf("bad: %v", out)
}
c, err := s1.serfWAN.GetCoordinate()
if err != nil {
t.Fatalf("bad: %v", err)
}
verifyCoordinatesEqual(t, c, out[0].Coordinates[0].Coord)
}
func TestCoordinate_ListNodes(t *testing.T) {
dir1, s1 := testServer(t)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
codec := rpcClient(t, s1)
defer codec.Close()
testutil.WaitForLeader(t, s1.RPC, "dc1")
// Register some nodes.
nodes := []string{"foo", "bar", "baz"}
for _, node := range nodes {
req := structs.RegisterRequest{
Datacenter: "dc1",
Node: node,
Address: "127.0.0.1",
}
var reply struct{}
if err := msgpackrpc.CallWithCodec(codec, "Catalog.Register", &req, &reply); err != nil {
t.Fatalf("err: %v", err)
}
}
// Send coordinate updates for a few nodes, waiting a little while for
// the batch update to run.
arg1 := structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "foo",
Coord: generateRandomCoordinate(),
}
var out struct{}
if err := msgpackrpc.CallWithCodec(codec, "Coordinate.Update", &arg1, &out); err != nil {
t.Fatalf("err: %v", err)
}
arg2 := structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "bar",
Coord: generateRandomCoordinate(),
}
if err := msgpackrpc.CallWithCodec(codec, "Coordinate.Update", &arg2, &out); err != nil {
t.Fatalf("err: %v", err)
}
arg3 := structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "baz",
Coord: generateRandomCoordinate(),
}
if err := msgpackrpc.CallWithCodec(codec, "Coordinate.Update", &arg3, &out); err != nil {
t.Fatalf("err: %v", err)
}
time.Sleep(2 * s1.config.CoordinateUpdatePeriod)
// Now query back for all the nodes.
arg := structs.DCSpecificRequest{
Datacenter: "dc1",
}
resp := structs.IndexedCoordinates{}
if err := msgpackrpc.CallWithCodec(codec, "Coordinate.ListNodes", &arg, &resp); err != nil {
t.Fatalf("err: %v", err)
}
if len(resp.Coordinates) != 3 ||
resp.Coordinates[0].Node != "bar" ||
resp.Coordinates[1].Node != "baz" ||
resp.Coordinates[2].Node != "foo" {
t.Fatalf("bad: %v", resp.Coordinates)
}
verifyCoordinatesEqual(t, resp.Coordinates[0].Coord, arg2.Coord) // bar
verifyCoordinatesEqual(t, resp.Coordinates[1].Coord, arg3.Coord) // baz
verifyCoordinatesEqual(t, resp.Coordinates[2].Coord, arg1.Coord) // foo
}

View file

@ -89,6 +89,8 @@ func (c *consulFSM) Apply(log *raft.Log) interface{} {
return c.applyACLOperation(buf[1:], log.Index)
case structs.TombstoneRequestType:
return c.applyTombstoneOperation(buf[1:], log.Index)
case structs.CoordinateBatchUpdateType:
return c.applyCoordinateBatchUpdate(buf[1:], log.Index)
default:
if ignoreUnknown {
c.logger.Printf("[WARN] consul.fsm: ignoring unknown message type (%d), upgrade to newer version", msgType)
@ -246,6 +248,22 @@ func (c *consulFSM) applyTombstoneOperation(buf []byte, index uint64) interface{
}
}
// applyCoordinateBatchUpdate processes a batch of coordinate updates and applies
// them in a single underlying transaction. This interface isn't 1:1 with the outer
// update interface that the coordinate endpoint exposes, so we made it single
// purpose and avoided the opcode convention.
func (c *consulFSM) applyCoordinateBatchUpdate(buf []byte, index uint64) interface{} {
var updates structs.Coordinates
if err := structs.Decode(buf, &updates); err != nil {
panic(fmt.Errorf("failed to decode batch updates: %v", err))
}
defer metrics.MeasureSince([]string{"consul", "fsm", "coordinate", "batch-update"}, time.Now())
if err := c.state.CoordinateBatchUpdate(index, updates); err != nil {
return err
}
return nil
}
func (c *consulFSM) Snapshot() (raft.FSMSnapshot, error) {
defer func(start time.Time) {
c.logger.Printf("[INFO] consul.fsm: snapshot created in %v", time.Now().Sub(start))
@ -343,6 +361,16 @@ func (c *consulFSM) Restore(old io.ReadCloser) error {
return err
}
case structs.CoordinateBatchUpdateType:
var req structs.Coordinates
if err := dec.Decode(&req); err != nil {
return err
}
if err := restore.Coordinates(header.LastIndex, req); err != nil {
return err
}
default:
return fmt.Errorf("Unrecognized msg type: %v", msgType)
}
@ -444,6 +472,21 @@ func (s *consulSnapshot) persistNodes(sink raft.SnapshotSink,
}
}
}
// Save the coordinates separately since they are not part of the
// register request interface. To avoid copying them out, we turn
// them into batches with a single coordinate each.
coords, err := s.state.Coordinates()
if err != nil {
return err
}
for coord := coords.Next(); coord != nil; coord = coords.Next() {
sink.Write([]byte{byte(structs.CoordinateBatchUpdateType)})
updates := structs.Coordinates{coord.(*structs.Coordinate)}
if err := encoder.Encode(&updates); err != nil {
return err
}
}
return nil
}

View file

@ -3,6 +3,7 @@ package consul
import (
"bytes"
"os"
"reflect"
"testing"
"github.com/hashicorp/consul/consul/state"
@ -382,6 +383,20 @@ func TestFSM_SnapshotRestore(t *testing.T) {
t.Fatalf("bad index: %d", idx)
}
updates := structs.Coordinates{
&structs.Coordinate{
Node: "baz",
Coord: generateRandomCoordinate(),
},
&structs.Coordinate{
Node: "foo",
Coord: generateRandomCoordinate(),
},
}
if err := fsm.state.CoordinateBatchUpdate(13, updates); err != nil {
t.Fatalf("err: %s", err)
}
// Snapshot
snap, err := fsm.Snapshot()
if err != nil {
@ -490,6 +505,15 @@ func TestFSM_SnapshotRestore(t *testing.T) {
t.Fatalf("unexpected extra tombstones")
}
}()
// Verify coordinates are restored
_, coords, err := fsm2.state.Coordinates()
if err != nil {
t.Fatalf("err: %s", err)
}
if !reflect.DeepEqual(coords, updates) {
t.Fatalf("bad: %#v", coords)
}
}
func TestFSM_KVSSet(t *testing.T) {
@ -728,6 +752,46 @@ func TestFSM_KVSCheckAndSet(t *testing.T) {
}
}
func TestFSM_CoordinateUpdate(t *testing.T) {
fsm, err := NewFSM(nil, os.Stderr)
if err != nil {
t.Fatalf("err: %v", err)
}
// Register some nodes.
fsm.state.EnsureNode(1, &structs.Node{Node: "node1", Address: "127.0.0.1"})
fsm.state.EnsureNode(2, &structs.Node{Node: "node2", Address: "127.0.0.1"})
// Write a batch of two coordinates.
updates := structs.Coordinates{
&structs.Coordinate{
Node: "node1",
Coord: generateRandomCoordinate(),
},
&structs.Coordinate{
Node: "node2",
Coord: generateRandomCoordinate(),
},
}
buf, err := structs.Encode(structs.CoordinateBatchUpdateType, updates)
if err != nil {
t.Fatalf("err: %v", err)
}
resp := fsm.Apply(makeLog(buf))
if resp != nil {
t.Fatalf("resp: %v", resp)
}
// Read back the two coordinates to make sure they got updated.
_, coords, err := fsm.state.Coordinates()
if err != nil {
t.Fatalf("err: %s", err)
}
if !reflect.DeepEqual(coords, updates) {
t.Fatalf("bad: %#v", coords)
}
}
func TestFSM_SessionCreate_Destroy(t *testing.T) {
fsm, err := NewFSM(nil, os.Stderr)
if err != nil {

View file

@ -30,7 +30,10 @@ func (h *Health) ChecksInState(args *structs.ChecksInStateRequest,
return err
}
reply.Index, reply.HealthChecks = index, checks
return h.srv.filterACL(args.Token, reply)
if err := h.srv.filterACL(args.Token, reply); err != nil {
return err
}
return h.srv.sortNodesByDistanceFrom(args.Source, reply.HealthChecks)
})
}
@ -82,7 +85,10 @@ func (h *Health) ServiceChecks(args *structs.ServiceSpecificRequest,
return err
}
reply.Index, reply.HealthChecks = index, checks
return h.srv.filterACL(args.Token, reply)
if err := h.srv.filterACL(args.Token, reply); err != nil {
return err
}
return h.srv.sortNodesByDistanceFrom(args.Source, reply.HealthChecks)
})
}
@ -115,8 +121,12 @@ func (h *Health) ServiceNodes(args *structs.ServiceSpecificRequest, reply *struc
if err != nil {
return err
}
reply.Index, reply.Nodes = index, nodes
return h.srv.filterACL(args.Token, reply)
if err := h.srv.filterACL(args.Token, reply); err != nil {
return err
}
return h.srv.sortNodesByDistanceFrom(args.Source, reply.Nodes)
})
// Provide some metrics

View file

@ -3,6 +3,7 @@ package consul
import (
"os"
"testing"
"time"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/consul/testutil"
@ -55,6 +56,83 @@ func TestHealth_ChecksInState(t *testing.T) {
}
}
func TestHealth_ChecksInState_DistanceSort(t *testing.T) {
dir1, s1 := testServer(t)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
codec := rpcClient(t, s1)
defer codec.Close()
testutil.WaitForLeader(t, s1.RPC, "dc1")
if err := s1.fsm.State().EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.2"}); err != nil {
t.Fatalf("err: %v", err)
}
if err := s1.fsm.State().EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.3"}); err != nil {
t.Fatalf("err: %v", err)
}
updates := structs.Coordinates{
{"foo", generateCoordinate(1 * time.Millisecond)},
{"bar", generateCoordinate(2 * time.Millisecond)},
}
if err := s1.fsm.State().CoordinateBatchUpdate(3, updates); err != nil {
t.Fatalf("err: %v", err)
}
arg := structs.RegisterRequest{
Datacenter: "dc1",
Node: "foo",
Address: "127.0.0.1",
Check: &structs.HealthCheck{
Name: "memory utilization",
Status: structs.HealthPassing,
},
}
var out struct{}
if err := msgpackrpc.CallWithCodec(codec, "Catalog.Register", &arg, &out); err != nil {
t.Fatalf("err: %v", err)
}
arg.Node = "bar"
if err := msgpackrpc.CallWithCodec(codec, "Catalog.Register", &arg, &out); err != nil {
t.Fatalf("err: %v", err)
}
// Query relative to foo to make sure it shows up first in the list.
var out2 structs.IndexedHealthChecks
inState := structs.ChecksInStateRequest{
Datacenter: "dc1",
State: structs.HealthPassing,
Source: structs.QuerySource{
Datacenter: "dc1",
Node: "foo",
},
}
if err := msgpackrpc.CallWithCodec(codec, "Health.ChecksInState", &inState, &out2); err != nil {
t.Fatalf("err: %v", err)
}
checks := out2.HealthChecks
if len(checks) != 3 {
t.Fatalf("Bad: %v", checks)
}
if checks[0].Node != "foo" {
t.Fatalf("Bad: %v", checks[1])
}
// Now query relative to bar to make sure it shows up first.
inState.Source.Node = "bar"
if err := msgpackrpc.CallWithCodec(codec, "Health.ChecksInState", &inState, &out2); err != nil {
t.Fatalf("err: %v", err)
}
checks = out2.HealthChecks
if len(checks) != 3 {
t.Fatalf("Bad: %v", checks)
}
if checks[0].Node != "bar" {
t.Fatalf("Bad: %v", checks[1])
}
}
func TestHealth_NodeChecks(t *testing.T) {
dir1, s1 := testServer(t)
defer os.RemoveAll(dir1)
@ -142,6 +220,94 @@ func TestHealth_ServiceChecks(t *testing.T) {
}
}
func TestHealth_ServiceChecks_DistanceSort(t *testing.T) {
dir1, s1 := testServer(t)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
codec := rpcClient(t, s1)
defer codec.Close()
testutil.WaitForLeader(t, s1.RPC, "dc1")
if err := s1.fsm.State().EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.2"}); err != nil {
t.Fatalf("err: %v", err)
}
if err := s1.fsm.State().EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.3"}); err != nil {
t.Fatalf("err: %v", err)
}
updates := structs.Coordinates{
{"foo", generateCoordinate(1 * time.Millisecond)},
{"bar", generateCoordinate(2 * time.Millisecond)},
}
if err := s1.fsm.State().CoordinateBatchUpdate(3, updates); err != nil {
t.Fatalf("err: %v", err)
}
arg := structs.RegisterRequest{
Datacenter: "dc1",
Node: "foo",
Address: "127.0.0.1",
Service: &structs.NodeService{
ID: "db",
Service: "db",
},
Check: &structs.HealthCheck{
Name: "db connect",
Status: structs.HealthPassing,
ServiceID: "db",
},
}
var out struct{}
if err := msgpackrpc.CallWithCodec(codec, "Catalog.Register", &arg, &out); err != nil {
t.Fatalf("err: %v", err)
}
arg.Node = "bar"
if err := msgpackrpc.CallWithCodec(codec, "Catalog.Register", &arg, &out); err != nil {
t.Fatalf("err: %v", err)
}
// Query relative to foo to make sure it shows up first in the list.
var out2 structs.IndexedHealthChecks
node := structs.ServiceSpecificRequest{
Datacenter: "dc1",
ServiceName: "db",
Source: structs.QuerySource{
Datacenter: "dc1",
Node: "foo",
},
}
if err := msgpackrpc.CallWithCodec(codec, "Health.ServiceChecks", &node, &out2); err != nil {
t.Fatalf("err: %v", err)
}
checks := out2.HealthChecks
if len(checks) != 2 {
t.Fatalf("Bad: %v", checks)
}
if checks[0].Node != "foo" {
t.Fatalf("Bad: %v", checks)
}
if checks[1].Node != "bar" {
t.Fatalf("Bad: %v", checks)
}
// Now query relative to bar to make sure it shows up first.
node.Source.Node = "bar"
if err := msgpackrpc.CallWithCodec(codec, "Health.ServiceChecks", &node, &out2); err != nil {
t.Fatalf("err: %v", err)
}
checks = out2.HealthChecks
if len(checks) != 2 {
t.Fatalf("Bad: %v", checks)
}
if checks[0].Node != "bar" {
t.Fatalf("Bad: %v", checks)
}
if checks[1].Node != "foo" {
t.Fatalf("Bad: %v", checks)
}
}
func TestHealth_ServiceNodes(t *testing.T) {
dir1, s1 := testServer(t)
defer os.RemoveAll(dir1)
@ -225,6 +391,94 @@ func TestHealth_ServiceNodes(t *testing.T) {
}
}
func TestHealth_ServiceNodes_DistanceSort(t *testing.T) {
dir1, s1 := testServer(t)
defer os.RemoveAll(dir1)
defer s1.Shutdown()
codec := rpcClient(t, s1)
defer codec.Close()
testutil.WaitForLeader(t, s1.RPC, "dc1")
if err := s1.fsm.State().EnsureNode(1, &structs.Node{Node: "foo", Address: "127.0.0.2"}); err != nil {
t.Fatalf("err: %v", err)
}
if err := s1.fsm.State().EnsureNode(2, &structs.Node{Node: "bar", Address: "127.0.0.3"}); err != nil {
t.Fatalf("err: %v", err)
}
updates := structs.Coordinates{
{"foo", generateCoordinate(1 * time.Millisecond)},
{"bar", generateCoordinate(2 * time.Millisecond)},
}
if err := s1.fsm.State().CoordinateBatchUpdate(3, updates); err != nil {
t.Fatalf("err: %v", err)
}
arg := structs.RegisterRequest{
Datacenter: "dc1",
Node: "foo",
Address: "127.0.0.1",
Service: &structs.NodeService{
ID: "db",
Service: "db",
},
Check: &structs.HealthCheck{
Name: "db connect",
Status: structs.HealthPassing,
ServiceID: "db",
},
}
var out struct{}
if err := msgpackrpc.CallWithCodec(codec, "Catalog.Register", &arg, &out); err != nil {
t.Fatalf("err: %v", err)
}
arg.Node = "bar"
if err := msgpackrpc.CallWithCodec(codec, "Catalog.Register", &arg, &out); err != nil {
t.Fatalf("err: %v", err)
}
// Query relative to foo to make sure it shows up first in the list.
var out2 structs.IndexedCheckServiceNodes
req := structs.ServiceSpecificRequest{
Datacenter: "dc1",
ServiceName: "db",
Source: structs.QuerySource{
Datacenter: "dc1",
Node: "foo",
},
}
if err := msgpackrpc.CallWithCodec(codec, "Health.ServiceNodes", &req, &out2); err != nil {
t.Fatalf("err: %v", err)
}
nodes := out2.Nodes
if len(nodes) != 2 {
t.Fatalf("Bad: %v", nodes)
}
if nodes[0].Node.Node != "foo" {
t.Fatalf("Bad: %v", nodes[0])
}
if nodes[1].Node.Node != "bar" {
t.Fatalf("Bad: %v", nodes[1])
}
// Now query relative to bar to make sure it shows up first.
req.Source.Node = "bar"
if err := msgpackrpc.CallWithCodec(codec, "Health.ServiceNodes", &req, &out2); err != nil {
t.Fatalf("err: %v", err)
}
nodes = out2.Nodes
if len(nodes) != 2 {
t.Fatalf("Bad: %v", nodes)
}
if nodes[0].Node.Node != "bar" {
t.Fatalf("Bad: %v", nodes[0])
}
if nodes[1].Node.Node != "foo" {
t.Fatalf("Bad: %v", nodes[1])
}
}
func TestHealth_NodeChecks_FilterACL(t *testing.T) {
dir, token, srv, codec := testACLFilterServer(t)
defer os.RemoveAll(dir)

398
consul/rtt.go Normal file
View file

@ -0,0 +1,398 @@
package consul
import (
"fmt"
"math"
"sort"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/serf/coordinate"
)
// computeDistance returns the distance between the two network coordinates in
// seconds. If either of the coordinates is nil then this will return positive
// infinity.
func computeDistance(a *coordinate.Coordinate, b *coordinate.Coordinate) float64 {
if a == nil || b == nil {
return math.Inf(1.0)
}
return a.DistanceTo(b).Seconds()
}
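A minimal sketch of how computeDistance behaves, assuming the default serf coordinate configuration; exampleComputeDistance is a hypothetical helper, not part of this change:

func exampleComputeDistance() {
    a := coordinate.NewCoordinate(coordinate.DefaultConfig())
    b := coordinate.NewCoordinate(coordinate.DefaultConfig())
    b.Vec[0] = 0.005 // roughly 5 ms from the origin along the first axis

    fmt.Println(computeDistance(a, b))   // ~0.005 seconds (small default height terms add a bit)
    fmt.Println(computeDistance(a, nil)) // +Inf, so nodes with no coordinate sort last
}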
// nodeSorter takes a list of nodes and a parallel vector of distances and
// implements sort.Interface, keeping both structures coherent and sorting by
// distance.
type nodeSorter struct {
Nodes structs.Nodes
Vec []float64
}
// newNodeSorter returns a new sorter for the given source coordinate and set of
// nodes.
func (s *Server) newNodeSorter(c *coordinate.Coordinate, nodes structs.Nodes) (sort.Interface, error) {
state := s.fsm.State()
vec := make([]float64, len(nodes))
for i, node := range nodes {
coord, err := state.CoordinateGetRaw(node.Node)
if err != nil {
return nil, err
}
vec[i] = computeDistance(c, coord)
}
return &nodeSorter{nodes, vec}, nil
}
// See sort.Interface.
func (n *nodeSorter) Len() int {
return len(n.Nodes)
}
// See sort.Interface.
func (n *nodeSorter) Swap(i, j int) {
n.Nodes[i], n.Nodes[j] = n.Nodes[j], n.Nodes[i]
n.Vec[i], n.Vec[j] = n.Vec[j], n.Vec[i]
}
// See sort.Interface.
func (n *nodeSorter) Less(i, j int) bool {
return n.Vec[i] < n.Vec[j]
}
// serviceNodeSorter takes a list of service nodes and a parallel vector of
// distances and implements sort.Interface, keeping both structures coherent and
// sorting by distance.
type serviceNodeSorter struct {
Nodes structs.ServiceNodes
Vec []float64
}
// newServiceNodeSorter returns a new sorter for the given source coordinate and
// set of service nodes.
func (s *Server) newServiceNodeSorter(c *coordinate.Coordinate, nodes structs.ServiceNodes) (sort.Interface, error) {
state := s.fsm.State()
vec := make([]float64, len(nodes))
for i, node := range nodes {
coord, err := state.CoordinateGetRaw(node.Node)
if err != nil {
return nil, err
}
vec[i] = computeDistance(c, coord)
}
return &serviceNodeSorter{nodes, vec}, nil
}
// See sort.Interface.
func (n *serviceNodeSorter) Len() int {
return len(n.Nodes)
}
// See sort.Interface.
func (n *serviceNodeSorter) Swap(i, j int) {
n.Nodes[i], n.Nodes[j] = n.Nodes[j], n.Nodes[i]
n.Vec[i], n.Vec[j] = n.Vec[j], n.Vec[i]
}
// See sort.Interface.
func (n *serviceNodeSorter) Less(i, j int) bool {
return n.Vec[i] < n.Vec[j]
}
// healthCheckSorter takes a list of health checks and a parallel vector of
// distances and implements sort.Interface, keeping both structures coherent and
// sorting by distance.
type healthCheckSorter struct {
Checks structs.HealthChecks
Vec []float64
}
// newHealthCheckSorter returns a new sorter for the given source coordinate and
// set of health checks with nodes.
func (s *Server) newHealthCheckSorter(c *coordinate.Coordinate, checks structs.HealthChecks) (sort.Interface, error) {
state := s.fsm.State()
vec := make([]float64, len(checks))
for i, check := range checks {
coord, err := state.CoordinateGetRaw(check.Node)
if err != nil {
return nil, err
}
vec[i] = computeDistance(c, coord)
}
return &healthCheckSorter{checks, vec}, nil
}
// See sort.Interface.
func (n *healthCheckSorter) Len() int {
return len(n.Checks)
}
// See sort.Interface.
func (n *healthCheckSorter) Swap(i, j int) {
n.Checks[i], n.Checks[j] = n.Checks[j], n.Checks[i]
n.Vec[i], n.Vec[j] = n.Vec[j], n.Vec[i]
}
// See sort.Interface.
func (n *healthCheckSorter) Less(i, j int) bool {
return n.Vec[i] < n.Vec[j]
}
// checkServiceNodeSorter takes a list of service nodes and a parallel vector of
// distances and implements sort.Interface, keeping both structures coherent and
// sorting by distance.
type checkServiceNodeSorter struct {
Nodes structs.CheckServiceNodes
Vec []float64
}
// newCheckServiceNodeSorter returns a new sorter for the given source coordinate
// and set of nodes with health checks.
func (s *Server) newCheckServiceNodeSorter(c *coordinate.Coordinate, nodes structs.CheckServiceNodes) (sort.Interface, error) {
state := s.fsm.State()
vec := make([]float64, len(nodes))
for i, node := range nodes {
coord, err := state.CoordinateGetRaw(node.Node.Node)
if err != nil {
return nil, err
}
vec[i] = computeDistance(c, coord)
}
return &checkServiceNodeSorter{nodes, vec}, nil
}
// See sort.Interface.
func (n *checkServiceNodeSorter) Len() int {
return len(n.Nodes)
}
// See sort.Interface.
func (n *checkServiceNodeSorter) Swap(i, j int) {
n.Nodes[i], n.Nodes[j] = n.Nodes[j], n.Nodes[i]
n.Vec[i], n.Vec[j] = n.Vec[j], n.Vec[i]
}
// See sort.Interface.
func (n *checkServiceNodeSorter) Less(i, j int) bool {
return n.Vec[i] < n.Vec[j]
}
// newSorterByDistanceFrom returns a sorter for the given type.
func (s *Server) newSorterByDistanceFrom(c *coordinate.Coordinate, subj interface{}) (sort.Interface, error) {
switch v := subj.(type) {
case structs.Nodes:
return s.newNodeSorter(c, v)
case structs.ServiceNodes:
return s.newServiceNodeSorter(c, v)
case structs.HealthChecks:
return s.newHealthCheckSorter(c, v)
case structs.CheckServiceNodes:
return s.newCheckServiceNodeSorter(c, v)
default:
panic(fmt.Errorf("Unhandled type passed to newSorterByDistanceFrom: %#v", subj))
}
}
// sortNodesByDistanceFrom is used to sort results from our service catalog based
// on the round trip time from the given source node. Nodes with missing coordinates
// will get stable sorted at the end of the list.
//
// If coordinates are disabled this will be a no-op.
func (s *Server) sortNodesByDistanceFrom(source structs.QuerySource, subj interface{}) error {
// Make it safe to call this without having to check if coordinates are
// disabled first.
if s.config.DisableCoordinates {
return nil
}
// We can't sort if there's no source node.
if source.Node == "" {
return nil
}
// We can't compare coordinates across DCs.
if source.Datacenter != s.config.Datacenter {
return nil
}
// There won't always be a coordinate for the source node. If there's not
// one then we can bail out because there's no meaning for the sort.
state := s.fsm.State()
coord, err := state.CoordinateGetRaw(source.Node)
if err != nil {
return err
}
if coord == nil {
return nil
}
// Do the sort!
sorter, err := s.newSorterByDistanceFrom(coord, subj)
if err != nil {
return err
}
sort.Stable(sorter)
return nil
}
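A hypothetical call site for the sort, mirroring how the endpoints above pass the request's QuerySource straight through; exampleSortCall is illustrative only and not part of this change:

func exampleSortCall(s *Server, nodes structs.Nodes) error {
    // The sort silently becomes a no-op when the source is empty, is in
    // another DC, or has no stored coordinate.
    source := structs.QuerySource{Datacenter: "dc1", Node: "node1"}
    return s.sortNodesByDistanceFrom(source, nodes)
}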
// serfer provides the coordinate information we need from the Server in an
// interface that's easy to mock out for testing. Without this, we'd have to
// do some really painful setup to get good unit test coverage of all the cases.
type serfer interface {
GetDatacenter() string
GetCoordinate() (*coordinate.Coordinate, error)
GetCachedCoordinate(node string) (*coordinate.Coordinate, bool)
GetNodesForDatacenter(dc string) []string
}
// serverSerfer wraps a Server with the serfer interface.
type serverSerfer struct {
server *Server
}
// See serfer.
func (s *serverSerfer) GetDatacenter() string {
return s.server.config.Datacenter
}
// See serfer.
func (s *serverSerfer) GetCoordinate() (*coordinate.Coordinate, error) {
return s.server.serfWAN.GetCoordinate()
}
// See serfer.
func (s *serverSerfer) GetCachedCoordinate(node string) (*coordinate.Coordinate, bool) {
return s.server.serfWAN.GetCachedCoordinate(node)
}
// See serfer.
func (s *serverSerfer) GetNodesForDatacenter(dc string) []string {
s.server.remoteLock.RLock()
defer s.server.remoteLock.RUnlock()
nodes := make([]string, 0)
for _, part := range s.server.remoteConsuls[dc] {
nodes = append(nodes, part.Name)
}
return nodes
}
// sortDatacentersByDistance will sort the given list of DCs based on the
// median RTT to all nodes we know about from the WAN gossip pool. DCs with
// missing coordinates will be stable sorted to the end of the list.
//
// If coordinates are disabled this will be a no-op.
func (s *Server) sortDatacentersByDistance(dcs []string) error {
// Make it safe to call this without having to check if coordinates are
// disabled first.
if s.config.DisableCoordinates {
return nil
}
// Do the sort!
serfer := serverSerfer{s}
return sortDatacentersByDistance(&serfer, dcs)
}
// getDatacenterDistance will return the median round trip time estimate for
// the given DC from the given serfer, in seconds. This will return positive
// infinity if no coordinates are available.
func getDatacenterDistance(s serfer, dc string) (float64, error) {
// If this is the serfer's DC then just bail with zero RTT.
if dc == s.GetDatacenter() {
return 0.0, nil
}
// Otherwise measure from the serfer to the nodes in the other DC.
coord, err := s.GetCoordinate()
if err != nil {
return 0.0, err
}
// Fetch all the nodes in the DC and record their distance, if available.
nodes := s.GetNodesForDatacenter(dc)
subvec := make([]float64, 0, len(nodes))
for _, node := range nodes {
if other, ok := s.GetCachedCoordinate(node); ok {
subvec = append(subvec, computeDistance(coord, other))
}
}
// Compute the median by sorting and taking the middle item.
if len(subvec) > 0 {
sort.Float64s(subvec)
return subvec[len(subvec)/2], nil
}
// Return the default infinity value.
return computeDistance(coord, nil), nil
}
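A sketch of just the median step, assuming a pre-gathered slice of RTT samples; exampleMedian is a hypothetical helper, not part of this change (for an even number of samples it picks the upper-middle value rather than averaging the two middle values):

func exampleMedian(rtts []float64) float64 {
    // No samples means no estimate, mirroring the infinity default above.
    if len(rtts) == 0 {
        return math.Inf(1)
    }
    sort.Float64s(rtts)
    return rtts[len(rtts)/2]
}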
// datacenterSorter takes a list of DC names and a parallel vector of distances
// and implements sort.Interface, keeping both structures coherent and sorting
// by distance.
type datacenterSorter struct {
Names []string
Vec []float64
}
// See sort.Interface.
func (n *datacenterSorter) Len() int {
return len(n.Names)
}
// See sort.Interface.
func (n *datacenterSorter) Swap(i, j int) {
n.Names[i], n.Names[j] = n.Names[j], n.Names[i]
n.Vec[i], n.Vec[j] = n.Vec[j], n.Vec[i]
}
// See sort.Interface.
func (n *datacenterSorter) Less(i, j int) bool {
return n.Vec[i] < n.Vec[j]
}
// sortDatacentersByDistance will sort the given list of DCs based on the
// median RTT to all nodes the given serfer knows about from the WAN gossip
// pool. DCs with missing coordinates will be stable sorted to the end of the
// list.
func sortDatacentersByDistance(s serfer, dcs []string) error {
// Build up a list of median distances to the other DCs.
vec := make([]float64, len(dcs))
for i, dc := range dcs {
rtt, err := getDatacenterDistance(s, dc)
if err != nil {
return err
}
vec[i] = rtt
}
sorter := &datacenterSorter{dcs, vec}
sort.Stable(sorter)
return nil
}
// getDatacenterMaps returns the raw coordinates of all the nodes in the
// given list of DCs (the output list will preserve the incoming order).
func (s *Server) getDatacenterMaps(dcs []string) []structs.DatacenterMap {
serfer := serverSerfer{s}
return getDatacenterMaps(&serfer, dcs)
}
// getDatacenterMaps returns the raw coordinates of all the nodes in the
// given list of DCs (the output list will preserve the incoming order).
func getDatacenterMaps(s serfer, dcs []string) []structs.DatacenterMap {
maps := make([]structs.DatacenterMap, 0, len(dcs))
for _, dc := range dcs {
m := structs.DatacenterMap{Datacenter: dc}
nodes := s.GetNodesForDatacenter(dc)
for _, node := range nodes {
if coord, ok := s.GetCachedCoordinate(node); ok {
entry := &structs.Coordinate{node, coord}
m.Coordinates = append(m.Coordinates, entry)
}
}
maps = append(maps, m)
}
return maps
}

580
consul/rtt_test.go Normal file
View file

@ -0,0 +1,580 @@
package consul
import (
"fmt"
"math"
"net/rpc"
"os"
"sort"
"strings"
"testing"
"time"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/consul/testutil"
"github.com/hashicorp/net-rpc-msgpackrpc"
"github.com/hashicorp/serf/coordinate"
)
// generateCoordinate creates a new coordinate with the given distance from the
// origin.
func generateCoordinate(rtt time.Duration) *coordinate.Coordinate {
coord := coordinate.NewCoordinate(coordinate.DefaultConfig())
coord.Vec[0] = rtt.Seconds()
coord.Height = 0
return coord
}
// verifyNodeSort makes sure the order of the nodes in the slice is the same as
// the expected order, expressed as a comma-separated string.
func verifyNodeSort(t *testing.T, nodes structs.Nodes, expected string) {
vec := make([]string, len(nodes))
for i, node := range nodes {
vec[i] = node.Node
}
actual := strings.Join(vec, ",")
if actual != expected {
t.Fatalf("bad sort: %s != %s", actual, expected)
}
}
// verifyServiceNodeSort makes sure the order of the nodes in the slice is the
// same as the expected order, expressed as a comma-separated string.
func verifyServiceNodeSort(t *testing.T, nodes structs.ServiceNodes, expected string) {
vec := make([]string, len(nodes))
for i, node := range nodes {
vec[i] = node.Node
}
actual := strings.Join(vec, ",")
if actual != expected {
t.Fatalf("bad sort: %s != %s", actual, expected)
}
}
// verifyHealthCheckSort makes sure the order of the nodes in the slice is the
// same as the expected order, expressed as a comma-separated string.
func verifyHealthCheckSort(t *testing.T, checks structs.HealthChecks, expected string) {
vec := make([]string, len(checks))
for i, check := range checks {
vec[i] = check.Node
}
actual := strings.Join(vec, ",")
if actual != expected {
t.Fatalf("bad sort: %s != %s", actual, expected)
}
}
// verifyCheckServiceNodeSort makes sure the order of the nodes in the slice is
// the same as the expected order, expressed as a comma-separated string.
func verifyCheckServiceNodeSort(t *testing.T, nodes structs.CheckServiceNodes, expected string) {
vec := make([]string, len(nodes))
for i, node := range nodes {
vec[i] = node.Node.Node
}
actual := strings.Join(vec, ",")
if actual != expected {
t.Fatalf("bad sort: %s != %s", actual, expected)
}
}
// seedCoordinates uses the client to set up a set of nodes with a specific
// set of distances from the origin. We also include the server so that we
// can wait for the coordinates to get committed to the Raft log.
//
// Here's the layout of the nodes:
//
// node3 node2 node5 node4 node1
// | | | | | | | | | | |
// 0 1 2 3 4 5 6 7 8 9 10 (ms)
//
func seedCoordinates(t *testing.T, codec rpc.ClientCodec, server *Server) {
// Register some nodes.
for i := 0; i < 5; i++ {
req := structs.RegisterRequest{
Datacenter: "dc1",
Node: fmt.Sprintf("node%d", i+1),
Address: "127.0.0.1",
}
var reply struct{}
if err := msgpackrpc.CallWithCodec(codec, "Catalog.Register", &req, &reply); err != nil {
t.Fatalf("err: %v", err)
}
}
// Seed the fixed setup of the nodes.
updates := []structs.CoordinateUpdateRequest{
structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "node1",
Coord: generateCoordinate(10 * time.Millisecond),
},
structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "node2",
Coord: generateCoordinate(2 * time.Millisecond),
},
structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "node3",
Coord: generateCoordinate(1 * time.Millisecond),
},
structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "node4",
Coord: generateCoordinate(8 * time.Millisecond),
},
structs.CoordinateUpdateRequest{
Datacenter: "dc1",
Node: "node5",
Coord: generateCoordinate(3 * time.Millisecond),
},
}
// Apply the updates and wait a while for the batch to get committed to
// the Raft log.
for _, update := range updates {
var out struct{}
if err := msgpackrpc.CallWithCodec(codec, "Coordinate.Update", &update, &out); err != nil {
t.Fatalf("err: %v", err)
}
}
time.Sleep(2 * server.config.CoordinateUpdatePeriod)
}
func TestRtt_sortNodesByDistanceFrom(t *testing.T) {
dir, server := testServer(t)
defer os.RemoveAll(dir)
defer server.Shutdown()
codec := rpcClient(t, server)
defer codec.Close()
testutil.WaitForLeader(t, server.RPC, "dc1")
seedCoordinates(t, codec, server)
nodes := structs.Nodes{
&structs.Node{Node: "apple"},
&structs.Node{Node: "node1"},
&structs.Node{Node: "node2"},
&structs.Node{Node: "node3"},
&structs.Node{Node: "node4"},
&structs.Node{Node: "node5"},
}
// The zero value for the source should not trigger any sorting.
var source structs.QuerySource
if err := server.sortNodesByDistanceFrom(source, nodes); err != nil {
t.Fatalf("err: %v", err)
}
verifyNodeSort(t, nodes, "apple,node1,node2,node3,node4,node5")
// Same for a source in some other DC.
source.Node = "node1"
source.Datacenter = "dc2"
if err := server.sortNodesByDistanceFrom(source, nodes); err != nil {
t.Fatalf("err: %v", err)
}
verifyNodeSort(t, nodes, "apple,node1,node2,node3,node4,node5")
// Same for a source node in our DC that we have no coordinate for.
source.Node = "apple"
source.Datacenter = "dc1"
if err := server.sortNodesByDistanceFrom(source, nodes); err != nil {
t.Fatalf("err: %v", err)
}
verifyNodeSort(t, nodes, "apple,node1,node2,node3,node4,node5")
// Set source to legit values relative to node1 but disable coordinates.
source.Node = "node1"
source.Datacenter = "dc1"
server.config.DisableCoordinates = true
if err := server.sortNodesByDistanceFrom(source, nodes); err != nil {
t.Fatalf("err: %v", err)
}
verifyNodeSort(t, nodes, "apple,node1,node2,node3,node4,node5")
// Now enable coordinates and sort relative to node1, note that apple
// doesn't have any seeded coordinate info so it should end up at the
// end, despite its lexical hegemony.
server.config.DisableCoordinates = false
if err := server.sortNodesByDistanceFrom(source, nodes); err != nil {
t.Fatalf("err: %v", err)
}
verifyNodeSort(t, nodes, "node1,node4,node5,node2,node3,apple")
}
func TestRtt_sortNodesByDistanceFrom_Nodes(t *testing.T) {
dir, server := testServer(t)
defer os.RemoveAll(dir)
defer server.Shutdown()
codec := rpcClient(t, server)
defer codec.Close()
testutil.WaitForLeader(t, server.RPC, "dc1")
seedCoordinates(t, codec, server)
nodes := structs.Nodes{
&structs.Node{Node: "apple"},
&structs.Node{Node: "node1"},
&structs.Node{Node: "node2"},
&structs.Node{Node: "node3"},
&structs.Node{Node: "node4"},
&structs.Node{Node: "node5"},
}
// Now sort relative to node1, note that apple doesn't have any
// seeded coordinate info so it should end up at the end, despite
// its lexical hegemony.
var source structs.QuerySource
source.Node = "node1"
source.Datacenter = "dc1"
if err := server.sortNodesByDistanceFrom(source, nodes); err != nil {
t.Fatalf("err: %v", err)
}
verifyNodeSort(t, nodes, "node1,node4,node5,node2,node3,apple")
// Try another sort from node2. Note that node5 and node3 are the
// same distance away so the stable sort should preserve the order
// they were in from the previous sort.
source.Node = "node2"
source.Datacenter = "dc1"
if err := server.sortNodesByDistanceFrom(source, nodes); err != nil {
t.Fatalf("err: %v", err)
}
verifyNodeSort(t, nodes, "node2,node5,node3,node4,node1,apple")
// Let's exercise the stable sort explicitly to make sure we didn't
// just get lucky.
nodes[1], nodes[2] = nodes[2], nodes[1]
if err := server.sortNodesByDistanceFrom(source, nodes); err != nil {
t.Fatalf("err: %v", err)
}
verifyNodeSort(t, nodes, "node2,node3,node5,node4,node1,apple")
}
func TestRtt_sortNodesByDistanceFrom_ServiceNodes(t *testing.T) {
dir, server := testServer(t)
defer os.RemoveAll(dir)
defer server.Shutdown()
codec := rpcClient(t, server)
defer codec.Close()
testutil.WaitForLeader(t, server.RPC, "dc1")
seedCoordinates(t, codec, server)
nodes := structs.ServiceNodes{
&structs.ServiceNode{Node: "apple"},
&structs.ServiceNode{Node: "node1"},
&structs.ServiceNode{Node: "node2"},
&structs.ServiceNode{Node: "node3"},
&structs.ServiceNode{Node: "node4"},
&structs.ServiceNode{Node: "node5"},
}
// Now sort relative to node1, note that apple doesn't have any
// seeded coordinate info so it should end up at the end, despite
// its lexical hegemony.
var source structs.QuerySource
source.Node = "node1"
source.Datacenter = "dc1"
if err := server.sortNodesByDistanceFrom(source, nodes); err != nil {
t.Fatalf("err: %v", err)
}
verifyServiceNodeSort(t, nodes, "node1,node4,node5,node2,node3,apple")
// Try another sort from node2. Note that node5 and node3 are the
// same distance away so the stable sort should preserve the order
// they were in from the previous sort.
source.Node = "node2"
source.Datacenter = "dc1"
if err := server.sortNodesByDistanceFrom(source, nodes); err != nil {
t.Fatalf("err: %v", err)
}
verifyServiceNodeSort(t, nodes, "node2,node5,node3,node4,node1,apple")
// Let's exercise the stable sort explicitly to make sure we didn't
// just get lucky.
nodes[1], nodes[2] = nodes[2], nodes[1]
if err := server.sortNodesByDistanceFrom(source, nodes); err != nil {
t.Fatalf("err: %v", err)
}
verifyServiceNodeSort(t, nodes, "node2,node3,node5,node4,node1,apple")
}
func TestRtt_sortNodesByDistanceFrom_HealthChecks(t *testing.T) {
dir, server := testServer(t)
defer os.RemoveAll(dir)
defer server.Shutdown()
codec := rpcClient(t, server)
defer codec.Close()
testutil.WaitForLeader(t, server.RPC, "dc1")
seedCoordinates(t, codec, server)
checks := structs.HealthChecks{
&structs.HealthCheck{Node: "apple"},
&structs.HealthCheck{Node: "node1"},
&structs.HealthCheck{Node: "node2"},
&structs.HealthCheck{Node: "node3"},
&structs.HealthCheck{Node: "node4"},
&structs.HealthCheck{Node: "node5"},
}
// Now sort relative to node1, note that apple doesn't have any
// seeded coordinate info so it should end up at the end, despite
// its lexical hegemony.
var source structs.QuerySource
source.Node = "node1"
source.Datacenter = "dc1"
if err := server.sortNodesByDistanceFrom(source, checks); err != nil {
t.Fatalf("err: %v", err)
}
verifyHealthCheckSort(t, checks, "node1,node4,node5,node2,node3,apple")
// Try another sort from node2. Note that node5 and node3 are the
// same distance away so the stable sort should preserve the order
// they were in from the previous sort.
source.Node = "node2"
source.Datacenter = "dc1"
if err := server.sortNodesByDistanceFrom(source, checks); err != nil {
t.Fatalf("err: %v", err)
}
verifyHealthCheckSort(t, checks, "node2,node5,node3,node4,node1,apple")
// Let's exercise the stable sort explicitly to make sure we didn't
// just get lucky.
checks[1], checks[2] = checks[2], checks[1]
if err := server.sortNodesByDistanceFrom(source, checks); err != nil {
t.Fatalf("err: %v", err)
}
verifyHealthCheckSort(t, checks, "node2,node3,node5,node4,node1,apple")
}
func TestRtt_sortNodesByDistanceFrom_CheckServiceNodes(t *testing.T) {
dir, server := testServer(t)
defer os.RemoveAll(dir)
defer server.Shutdown()
codec := rpcClient(t, server)
defer codec.Close()
testutil.WaitForLeader(t, server.RPC, "dc1")
seedCoordinates(t, codec, server)
nodes := structs.CheckServiceNodes{
structs.CheckServiceNode{Node: &structs.Node{Node: "apple"}},
structs.CheckServiceNode{Node: &structs.Node{Node: "node1"}},
structs.CheckServiceNode{Node: &structs.Node{Node: "node2"}},
structs.CheckServiceNode{Node: &structs.Node{Node: "node3"}},
structs.CheckServiceNode{Node: &structs.Node{Node: "node4"}},
structs.CheckServiceNode{Node: &structs.Node{Node: "node5"}},
}
// Now sort relative to node1, note that apple doesn't have any
// seeded coordinate info so it should end up at the end, despite
// its lexical hegemony.
var source structs.QuerySource
source.Node = "node1"
source.Datacenter = "dc1"
if err := server.sortNodesByDistanceFrom(source, nodes); err != nil {
t.Fatalf("err: %v", err)
}
verifyCheckServiceNodeSort(t, nodes, "node1,node4,node5,node2,node3,apple")
// Try another sort from node2. Note that node5 and node3 are the
// same distance away so the stable sort should preserve the order
// they were in from the previous sort.
source.Node = "node2"
source.Datacenter = "dc1"
if err := server.sortNodesByDistanceFrom(source, nodes); err != nil {
t.Fatalf("err: %v", err)
}
verifyCheckServiceNodeSort(t, nodes, "node2,node5,node3,node4,node1,apple")
// Let's exercise the stable sort explicitly to make sure we didn't
// just get lucky.
nodes[1], nodes[2] = nodes[2], nodes[1]
if err := server.sortNodesByDistanceFrom(source, nodes); err != nil {
t.Fatalf("err: %v", err)
}
verifyCheckServiceNodeSort(t, nodes, "node2,node3,node5,node4,node1,apple")
}
// mockNodeMap is keyed by node name and the values are the coordinates of the
// node.
type mockNodeMap map[string]*coordinate.Coordinate
// mockServer is used to provide a serfer interface for unit tests. The key is
// DC, which selects a map from node name to coordinate for that node.
type mockServer map[string]mockNodeMap
// newMockServer is used to generate a serfer interface that presents a known DC
// topology for unit tests. The server is in dc0.
//
// Here's the layout of the nodes:
//
// /---- dc1 ----\ /- dc2 -\ /- dc0 -\
// node2 node1 node3 node1 node1
// | | | | | | | | | | |
// 0 1 2 3 4 5 6 7 8 9 10 (ms)
//
// We also include a node4 in dc1 with no known coordinate, as well as a
// mysterious dcX with no nodes with known coordinates.
//
func newMockServer() *mockServer {
s := make(mockServer)
s["dc0"] = mockNodeMap{
"dc0.node1": generateCoordinate(10 * time.Millisecond),
}
s["dc1"] = mockNodeMap{
"dc1.node1": generateCoordinate(3 * time.Millisecond),
"dc1.node2": generateCoordinate(2 * time.Millisecond),
"dc1.node3": generateCoordinate(5 * time.Millisecond),
"dc1.node4": nil, // no known coordinate
}
s["dc2"] = mockNodeMap{
"dc2.node1": generateCoordinate(8 * time.Millisecond),
}
s["dcX"] = mockNodeMap{
"dcX.node1": nil, // no known coordinate
}
return &s
}
// See serfer.
func (s *mockServer) GetDatacenter() string {
return "dc0"
}
// See serfer.
func (s *mockServer) GetCoordinate() (*coordinate.Coordinate, error) {
return (*s)["dc0"]["dc0.node1"], nil
}
// See serfer.
func (s *mockServer) GetCachedCoordinate(node string) (*coordinate.Coordinate, bool) {
for _, nodes := range *s {
for n, coord := range nodes {
if n == node && coord != nil {
return coord, true
}
}
}
return nil, false
}
// See serfer.
func (s *mockServer) GetNodesForDatacenter(dc string) []string {
nodes := make([]string, 0)
if n, ok := (*s)[dc]; ok {
for name := range n {
nodes = append(nodes, name)
}
}
sort.Strings(nodes)
return nodes
}
func TestRtt_getDatacenterDistance(t *testing.T) {
s := newMockServer()
// The serfer's own DC is always 0 ms away.
if dist, err := getDatacenterDistance(s, "dc0"); err != nil || dist != 0.0 {
t.Fatalf("bad: %v err: %v", dist, err)
}
// Check a DC with no coordinates, which should give positive infinity.
if dist, err := getDatacenterDistance(s, "dcX"); err != nil || dist != math.Inf(1.0) {
t.Fatalf("bad: %v err: %v", dist, err)
}
// Similar for a totally unknown DC.
if dist, err := getDatacenterDistance(s, "acdc"); err != nil || dist != math.Inf(1.0) {
t.Fatalf("bad: %v err: %v", dist, err)
}
// Check the trivial median case (just one node).
if dist, err := getDatacenterDistance(s, "dc2"); err != nil || dist != 0.002 {
t.Fatalf("bad: %v err: %v", dist, err)
}
// Check the more interesting median case, note that there's a mystery
// node4 in there that should be excluded to make the distances sort
// like this:
//
// [0] node3 (0.005), [1] node1 (0.007), [2] node2 (0.008)
//
// So the median should be at index 3 / 2 = 1 -> 0.007.
if dist, err := getDatacenterDistance(s, "dc1"); err != nil || dist != 0.007 {
t.Fatalf("bad: %v err: %v", dist, err)
}
}
func TestRtt_sortDatacentersByDistance(t *testing.T) {
s := newMockServer()
dcs := []string{"acdc", "dc0", "dc1", "dc2", "dcX"}
if err := sortDatacentersByDistance(s, dcs); err != nil {
t.Fatalf("err: %v", err)
}
expected := "dc0,dc2,dc1,acdc,dcX"
if actual := strings.Join(dcs, ","); actual != expected {
t.Fatalf("bad sort: %s != %s", actual, expected)
}
// Make sure the sort is stable and we didn't just get lucky.
dcs = []string{"dcX", "dc0", "dc1", "dc2", "acdc"}
if err := sortDatacentersByDistance(s, dcs); err != nil {
t.Fatalf("err: %v", err)
}
expected = "dc0,dc2,dc1,dcX,acdc"
if actual := strings.Join(dcs, ","); actual != expected {
t.Fatalf("bad sort: %s != %s", actual, expected)
}
}
func TestRtt_getDatacenterMaps(t *testing.T) {
s := newMockServer()
dcs := []string{"dc0", "acdc", "dc1", "dc2", "dcX"}
maps := getDatacenterMaps(s, dcs)
if len(maps) != 5 {
t.Fatalf("bad: %v", maps)
}
if maps[0].Datacenter != "dc0" || len(maps[0].Coordinates) != 1 ||
maps[0].Coordinates[0].Node != "dc0.node1" {
t.Fatalf("bad: %v", maps[0])
}
verifyCoordinatesEqual(t, maps[0].Coordinates[0].Coord,
generateCoordinate(10*time.Millisecond))
if maps[1].Datacenter != "acdc" || len(maps[1].Coordinates) != 0 {
t.Fatalf("bad: %v", maps[1])
}
if maps[2].Datacenter != "dc1" || len(maps[2].Coordinates) != 3 ||
maps[2].Coordinates[0].Node != "dc1.node1" ||
maps[2].Coordinates[1].Node != "dc1.node2" ||
maps[2].Coordinates[2].Node != "dc1.node3" {
t.Fatalf("bad: %v", maps[2])
}
verifyCoordinatesEqual(t, maps[2].Coordinates[0].Coord,
generateCoordinate(3*time.Millisecond))
verifyCoordinatesEqual(t, maps[2].Coordinates[1].Coord,
generateCoordinate(2*time.Millisecond))
verifyCoordinatesEqual(t, maps[2].Coordinates[2].Coord,
generateCoordinate(5*time.Millisecond))
if maps[3].Datacenter != "dc2" || len(maps[3].Coordinates) != 1 ||
maps[3].Coordinates[0].Node != "dc2.node1" {
t.Fatalf("bad: %v", maps[3])
}
verifyCoordinatesEqual(t, maps[3].Coordinates[0].Coord,
generateCoordinate(8*time.Millisecond))
if maps[4].Datacenter != "dcX" || len(maps[4].Coordinates) != 0 {
t.Fatalf("bad: %v", maps[4])
}
}

View file

@ -38,11 +38,11 @@ func (s *Server) lanEventHandler() {
case e := <-s.eventChLAN:
switch e.EventType() {
case serf.EventMemberJoin:
s.nodeJoin(e.(serf.MemberEvent), false)
s.lanNodeJoin(e.(serf.MemberEvent))
s.localMemberEvent(e.(serf.MemberEvent))
case serf.EventMemberLeave, serf.EventMemberFailed:
s.nodeFailed(e.(serf.MemberEvent), false)
s.lanNodeFailed(e.(serf.MemberEvent))
s.localMemberEvent(e.(serf.MemberEvent))
case serf.EventMemberReap:
@ -68,9 +68,9 @@ func (s *Server) wanEventHandler() {
case e := <-s.eventChWAN:
switch e.EventType() {
case serf.EventMemberJoin:
s.nodeJoin(e.(serf.MemberEvent), true)
s.wanNodeJoin(e.(serf.MemberEvent))
case serf.EventMemberLeave, serf.EventMemberFailed:
s.nodeFailed(e.(serf.MemberEvent), true)
s.wanNodeFailed(e.(serf.MemberEvent))
case serf.EventMemberUpdate: // Ignore
case serf.EventMemberReap: // Ignore
case serf.EventUser:
@ -137,19 +137,40 @@ func (s *Server) localEvent(event serf.UserEvent) {
}
}
// nodeJoin is used to handle join events on the both serf clusters
func (s *Server) nodeJoin(me serf.MemberEvent, wan bool) {
// lanNodeJoin is used to handle join events on the LAN pool.
func (s *Server) lanNodeJoin(me serf.MemberEvent) {
for _, m := range me.Members {
ok, parts := isConsulServer(m)
if !ok {
if wan {
s.logger.Printf("[WARN] consul: non-server in WAN pool: %s", m.Name)
}
continue
}
s.logger.Printf("[INFO] consul: adding server %s", parts)
s.logger.Printf("[INFO] consul: adding LAN server %s", parts)
// Check if this server is known
// See if it's configured as part of our DC.
if parts.Datacenter == s.config.Datacenter {
s.localLock.Lock()
s.localConsuls[parts.Addr.String()] = parts
s.localLock.Unlock()
}
// If we're still expecting to bootstrap, we may need to handle this.
if s.config.BootstrapExpect != 0 {
s.maybeBootstrap()
}
}
}
// wanNodeJoin is used to handle join events on the WAN pool.
func (s *Server) wanNodeJoin(me serf.MemberEvent) {
for _, m := range me.Members {
ok, parts := isConsulServer(m)
if !ok {
s.logger.Printf("[WARN] consul: non-server in WAN pool: %s", m.Name)
continue
}
s.logger.Printf("[INFO] consul: adding WAN server %s", parts)
// Search for this node in our existing remotes.
found := false
s.remoteLock.Lock()
existing := s.remoteConsuls[parts.Datacenter]
@ -161,23 +182,11 @@ func (s *Server) nodeJoin(me serf.MemberEvent, wan bool) {
}
}
// Add ot the list if not known
// Add to the list if not known.
if !found {
s.remoteConsuls[parts.Datacenter] = append(existing, parts)
}
s.remoteLock.Unlock()
// Add to the local list as well
if !wan && parts.Datacenter == s.config.Datacenter {
s.localLock.Lock()
s.localConsuls[parts.Addr.String()] = parts
s.localLock.Unlock()
}
// If we still expecting to bootstrap, may need to handle this
if s.config.BootstrapExpect != 0 {
s.maybeBootstrap()
}
}
}
@ -235,14 +244,29 @@ func (s *Server) maybeBootstrap() {
s.config.BootstrapExpect = 0
}
// nodeFailed is used to handle fail events on both the serf clusters
func (s *Server) nodeFailed(me serf.MemberEvent, wan bool) {
// lanNodeFailed is used to handle fail events on the LAN pool.
func (s *Server) lanNodeFailed(me serf.MemberEvent) {
for _, m := range me.Members {
ok, parts := isConsulServer(m)
if !ok {
continue
}
s.logger.Printf("[INFO] consul: removing server %s", parts)
s.logger.Printf("[INFO] consul: removing LAN server %s", parts)
s.localLock.Lock()
delete(s.localConsuls, parts.Addr.String())
s.localLock.Unlock()
}
}
// wanNodeFailed is used to handle fail events on the WAN pool.
func (s *Server) wanNodeFailed(me serf.MemberEvent) {
for _, m := range me.Members {
ok, parts := isConsulServer(m)
if !ok {
continue
}
s.logger.Printf("[INFO] consul: removing WAN server %s", parts)
// Remove the server if known
s.remoteLock.Lock()
@ -264,12 +288,5 @@ func (s *Server) nodeFailed(me serf.MemberEvent, wan bool) {
s.remoteConsuls[parts.Datacenter] = existing
}
s.remoteLock.Unlock()
// Remove from the local list as well
if !wan {
s.localLock.Lock()
delete(s.localConsuls, parts.Addr.String())
s.localLock.Unlock()
}
}
}

View file

@ -19,6 +19,7 @@ import (
"github.com/hashicorp/consul/tlsutil"
"github.com/hashicorp/raft"
"github.com/hashicorp/raft-boltdb"
"github.com/hashicorp/serf/coordinate"
"github.com/hashicorp/serf/serf"
)
@ -27,7 +28,15 @@ import (
// protocol versions.
const (
ProtocolVersionMin uint8 = 1
ProtocolVersionMax = 2
// Version 3 added support for network coordinates but we kept the
// default protocol version at 2 to ease the transition to this new
// feature. A Consul agent speaking version 2 of the protocol will
attempt to send its coordinates to a server that understands version
// 3 or greater.
ProtocolVersion2Compatible = 2
ProtocolVersionMax = 3
)
const (
@ -144,13 +153,14 @@ type Server struct {
// Holds the RPC endpoints
type endpoints struct {
Catalog *Catalog
Health *Health
Status *Status
KVS *KVS
Session *Session
Internal *Internal
ACL *ACL
Catalog *Catalog
Health *Health
Status *Status
KVS *KVS
Session *Session
Internal *Internal
ACL *ACL
Coordinate *Coordinate
}
// NewServer is used to construct a new Consul server from the
@ -306,6 +316,10 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string, w
if err := ensurePath(conf.SnapshotPath, false); err != nil {
return nil, err
}
// Plumb down the enable coordinates flag.
conf.DisableCoordinates = s.config.DisableCoordinates
return serf.Create(conf)
}
@ -396,6 +410,7 @@ func (s *Server) setupRPC(tlsWrap tlsutil.DCWrapper) error {
s.endpoints.Session = &Session{s}
s.endpoints.Internal = &Internal{s}
s.endpoints.ACL = &ACL{s}
s.endpoints.Coordinate = NewCoordinate(s)
// Register the handlers
s.rpcServer.Register(s.endpoints.Status)
@ -405,6 +420,7 @@ func (s *Server) setupRPC(tlsWrap tlsutil.DCWrapper) error {
s.rpcServer.Register(s.endpoints.Session)
s.rpcServer.Register(s.endpoints.Internal)
s.rpcServer.Register(s.endpoints.ACL)
s.rpcServer.Register(s.endpoints.Coordinate)
list, err := net.ListenTCP("tcp", s.config.RPCAddr)
if err != nil {
@ -690,3 +706,13 @@ func (s *Server) Stats() map[string]map[string]string {
}
return stats
}
// GetLANCoordinate returns the coordinate of the server in the LAN gossip pool.
func (s *Server) GetLANCoordinate() (*coordinate.Coordinate, error) {
return s.serfLAN.GetCoordinate()
}
// GetWANCoordinate returns the coordinate of the server in the WAN gossip pool.
func (s *Server) GetWANCoordinate() (*coordinate.Coordinate, error) {
return s.serfWAN.GetCoordinate()
}

View file

@ -66,6 +66,9 @@ func testServerConfig(t *testing.T, NodeName string) (string, *Config) {
config.RaftConfig.ElectionTimeout = 40 * time.Millisecond
config.ReconcileInterval = 100 * time.Millisecond
config.DisableCoordinates = false
config.CoordinateUpdatePeriod = 100 * time.Millisecond
return dir, config
}

View file

@ -29,6 +29,7 @@ func stateStoreSchema() *memdb.DBSchema {
sessionsTableSchema,
sessionChecksTableSchema,
aclsTableSchema,
coordinatesTableSchema,
}
// Add the tables to the root schema
@ -345,3 +346,22 @@ func aclsTableSchema() *memdb.TableSchema {
},
}
}
// coordinatesTableSchema returns a new table schema used for storing
// network coordinates.
func coordinatesTableSchema() *memdb.TableSchema {
return &memdb.TableSchema{
Name: "coordinates",
Indexes: map[string]*memdb.IndexSchema{
"id": &memdb.IndexSchema{
Name: "id",
AllowMissing: false,
Unique: true,
Indexer: &memdb.StringFieldIndex{
Field: "Node",
Lowercase: true,
},
},
},
}
}

View file

@ -8,6 +8,7 @@ import (
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/go-memdb"
"github.com/hashicorp/serf/coordinate"
)
var (
@ -196,6 +197,15 @@ func (s *StateSnapshot) ACLs() (memdb.ResultIterator, error) {
return iter, nil
}
// Coordinates is used to pull all the coordinates from the snapshot.
func (s *StateSnapshot) Coordinates() (memdb.ResultIterator, error) {
iter, err := s.tx.Get("coordinates", "id")
if err != nil {
return nil, err
}
return iter, nil
}
// Restore is used to efficiently manage restoring a large amount of data into
// the state store. It works by doing all the restores inside of a single
// transaction.
@ -299,6 +309,24 @@ func (s *StateRestore) ACL(acl *structs.ACL) error {
return nil
}
// Coordinates is used when restoring from a snapshot. For general inserts, use
// CoordinateBatchUpdate. We do less vetting of the updates here because they
// already got checked on the way in during a batch update.
func (s *StateRestore) Coordinates(idx uint64, updates structs.Coordinates) error {
for _, update := range updates {
if err := s.tx.Insert("coordinates", update); err != nil {
return fmt.Errorf("failed restoring coordinate: %s", err)
}
}
if err := indexUpdateMaxTxn(s.tx, idx, "coordinates"); err != nil {
return fmt.Errorf("failed updating index: %s", err)
}
s.watches.Arm("coordinates")
return nil
}
// maxIndex is a helper used to retrieve the highest known index
// amongst a set of tables in the db.
func (s *StateStore) maxIndex(tables ...string) uint64 {
@ -379,6 +407,8 @@ func (s *StateStore) getWatchTables(method string) []string {
return []string{"sessions"}
case "ACLGet", "ACLList":
return []string{"acls"}
case "Coordinates":
return []string{"coordinates"}
}
panic(fmt.Sprintf("Unknown method %s", method))
@ -583,7 +613,6 @@ func (s *StateStore) deleteNodeTxn(tx *memdb.Txn, idx uint64, nodeID string) err
// Use a watch manager since the inner functions can perform multiple
// ops per table.
watches := NewDumbWatchManager(s.tableWatches)
watches.Arm("nodes")
// Delete all services associated with the node and update the service index.
services, err := tx.Get("services", "node", nodeID)
@ -620,6 +649,21 @@ func (s *StateStore) deleteNodeTxn(tx *memdb.Txn, idx uint64, nodeID string) err
}
}
// Delete any coordinate associated with this node.
coord, err := tx.First("coordinates", "id", nodeID)
if err != nil {
return fmt.Errorf("failed coordinate lookup: %s", err)
}
if coord != nil {
if err := tx.Delete("coordinates", coord); err != nil {
return fmt.Errorf("failed deleting coordinate: %s", err)
}
if err := tx.Insert("index", &IndexEntry{"coordinates", idx}); err != nil {
return fmt.Errorf("failed updating index: %s", err)
}
watches.Arm("coordinates")
}
// Delete the node and update the index.
if err := tx.Delete("nodes", node); err != nil {
return fmt.Errorf("failed deleting node: %s", err)
@ -645,6 +689,7 @@ func (s *StateStore) deleteNodeTxn(tx *memdb.Txn, idx uint64, nodeID string) err
}
}
watches.Arm("nodes")
tx.Defer(func() { watches.Notify() })
return nil
}
@ -2231,3 +2276,84 @@ func (s *StateStore) aclDeleteTxn(tx *memdb.Txn, idx uint64, aclID string) error
tx.Defer(func() { s.tableWatches["acls"].Notify() })
return nil
}
// CoordinateGetRaw queries for the coordinate of the given node. This is an
// unusual state store method because it just returns the raw coordinate or
nil; none of the Raft or node information is returned. This hits the 90%
// internal-to-Consul use case for this data, and this isn't exposed via an
// endpoint, so it doesn't matter that the Raft info isn't available.
func (s *StateStore) CoordinateGetRaw(node string) (*coordinate.Coordinate, error) {
tx := s.db.Txn(false)
defer tx.Abort()
// Pull the full coordinate entry.
coord, err := tx.First("coordinates", "id", node)
if err != nil {
return nil, fmt.Errorf("failed coordinate lookup: %s", err)
}
// Pick out just the raw coordinate.
if coord != nil {
return coord.(*structs.Coordinate).Coord, nil
}
return nil, nil
}
// Coordinates queries for all nodes with coordinates.
func (s *StateStore) Coordinates() (uint64, structs.Coordinates, error) {
tx := s.db.Txn(false)
defer tx.Abort()
// Get the table index.
idx := maxIndexTxn(tx, s.getWatchTables("Coordinates")...)
// Pull all the coordinates.
coords, err := tx.Get("coordinates", "id")
if err != nil {
return 0, nil, fmt.Errorf("failed coordinate lookup: %s", err)
}
var results structs.Coordinates
for coord := coords.Next(); coord != nil; coord = coords.Next() {
results = append(results, coord.(*structs.Coordinate))
}
return idx, results, nil
}
// CoordinateBatchUpdate processes a batch of coordinate updates and applies
// them in a single transaction.
func (s *StateStore) CoordinateBatchUpdate(idx uint64, updates structs.Coordinates) error {
tx := s.db.Txn(true)
defer tx.Abort()
// Upsert the coordinates.
for _, update := range updates {
// Since the cleanup of coordinates is tied to deletion of
// nodes, we silently drop any updates for nodes that we don't
// know about. This might be possible during normal operation
// if we happen to get a coordinate update for a node that
// hasn't been able to add itself to the catalog yet. Since we
// don't carefully sequence this, and since it will fix itself
// on the next coordinate update from that node, we don't return
// an error or log anything.
node, err := tx.First("nodes", "id", update.Node)
if err != nil {
return fmt.Errorf("failed node lookup: %s", err)
}
if node == nil {
continue
}
if err := tx.Insert("coordinates", update); err != nil {
return fmt.Errorf("failed inserting coordinate: %s", err)
}
}
// Update the index.
if err := tx.Insert("index", &IndexEntry{"coordinates", idx}); err != nil {
return fmt.Errorf("failed updating index: %s", err)
}
tx.Defer(func() { s.tableWatches["coordinates"].Notify() })
tx.Commit()
return nil
}

View file

@ -3,6 +3,7 @@ package state
import (
crand "crypto/rand"
"fmt"
"math/rand"
"reflect"
"sort"
"strings"
@ -10,6 +11,7 @@ import (
"time"
"github.com/hashicorp/consul/consul/structs"
"github.com/hashicorp/serf/coordinate"
)
func testUUID() string {
@ -999,17 +1001,28 @@ func TestStateStore_Node_Watches(t *testing.T) {
}
})
// Check that a delete of a node + service + check triggers all three
// tables in one shot.
// Check that a delete of a node + service + check + coordinate triggers
// all tables in one shot.
testRegisterNode(t, s, 4, "node1")
testRegisterService(t, s, 5, "node1", "service1")
testRegisterCheck(t, s, 6, "node1", "service1", "check3", structs.HealthPassing)
updates := structs.Coordinates{
&structs.Coordinate{
Node: "node1",
Coord: generateRandomCoordinate(),
},
}
if err := s.CoordinateBatchUpdate(7, updates); err != nil {
t.Fatalf("err: %s", err)
}
verifyWatch(t, s.getTableWatch("nodes"), func() {
verifyWatch(t, s.getTableWatch("services"), func() {
verifyWatch(t, s.getTableWatch("checks"), func() {
if err := s.DeleteNode(7, "node1"); err != nil {
t.Fatalf("err: %s", err)
}
verifyWatch(t, s.getTableWatch("coordinates"), func() {
if err := s.DeleteNode(7, "node1"); err != nil {
t.Fatalf("err: %s", err)
}
})
})
})
})
@ -4721,3 +4734,291 @@ func TestStateStore_ACL_Watches(t *testing.T) {
restore.Commit()
})
}
// generateRandomCoordinate creates a random coordinate. This mucks with the
// underlying structure directly, so it's not really useful for any particular
// position in the network, but it's a good payload to send through to make
// sure things come out the other side or get stored correctly.
func generateRandomCoordinate() *coordinate.Coordinate {
config := coordinate.DefaultConfig()
coord := coordinate.NewCoordinate(config)
for i := range coord.Vec {
coord.Vec[i] = rand.NormFloat64()
}
coord.Error = rand.NormFloat64()
coord.Adjustment = rand.NormFloat64()
return coord
}
func TestStateStore_Coordinate_Updates(t *testing.T) {
s := testStateStore(t)
// Make sure the coordinates list starts out empty, and that a query for
// a raw coordinate for a nonexistent node doesn't do anything bad.
idx, coords, err := s.Coordinates()
if err != nil {
t.Fatalf("err: %s", err)
}
if idx != 0 {
t.Fatalf("bad index: %d", idx)
}
if coords != nil {
t.Fatalf("bad: %#v", coords)
}
coord, err := s.CoordinateGetRaw("nope")
if err != nil {
t.Fatalf("err: %s", err)
}
if coord != nil {
t.Fatalf("bad: %#v", coord)
}
// Make an update for nodes that don't exist and make sure they get
// ignored.
updates := structs.Coordinates{
&structs.Coordinate{
Node: "node1",
Coord: generateRandomCoordinate(),
},
&structs.Coordinate{
Node: "node2",
Coord: generateRandomCoordinate(),
},
}
if err := s.CoordinateBatchUpdate(1, updates); err != nil {
t.Fatalf("err: %s", err)
}
// Should still be empty, though applying an empty batch does bump
// the table index.
idx, coords, err = s.Coordinates()
if err != nil {
t.Fatalf("err: %s", err)
}
if idx != 1 {
t.Fatalf("bad index: %d", idx)
}
if coords != nil {
t.Fatalf("bad: %#v", coords)
}
// Register the nodes then do the update again.
testRegisterNode(t, s, 1, "node1")
testRegisterNode(t, s, 2, "node2")
if err := s.CoordinateBatchUpdate(3, updates); err != nil {
t.Fatalf("err: %s", err)
}
// Should go through now.
idx, coords, err = s.Coordinates()
if err != nil {
t.Fatalf("err: %s", err)
}
if idx != 3 {
t.Fatalf("bad index: %d", idx)
}
if !reflect.DeepEqual(coords, updates) {
t.Fatalf("bad: %#v", coords)
}
// Also verify the raw coordinate interface.
for _, update := range updates {
coord, err := s.CoordinateGetRaw(update.Node)
if err != nil {
t.Fatalf("err: %s", err)
}
if !reflect.DeepEqual(coord, update.Coord) {
t.Fatalf("bad: %#v", coord)
}
}
// Update the coordinate for one of the nodes.
updates[1].Coord = generateRandomCoordinate()
if err := s.CoordinateBatchUpdate(4, updates); err != nil {
t.Fatalf("err: %s", err)
}
// Verify it got applied.
idx, coords, err = s.Coordinates()
if err != nil {
t.Fatalf("err: %s", err)
}
if idx != 4 {
t.Fatalf("bad index: %d", idx)
}
if !reflect.DeepEqual(coords, updates) {
t.Fatalf("bad: %#v", coords)
}
// And check the raw coordinate version of the same thing.
for _, update := range updates {
coord, err := s.CoordinateGetRaw(update.Node)
if err != nil {
t.Fatalf("err: %s", err)
}
if !reflect.DeepEqual(coord, update.Coord) {
t.Fatalf("bad: %#v", coord)
}
}
}
func TestStateStore_Coordinate_Cleanup(t *testing.T) {
s := testStateStore(t)
// Register a node and update its coordinate.
testRegisterNode(t, s, 1, "node1")
updates := structs.Coordinates{
&structs.Coordinate{
Node: "node1",
Coord: generateRandomCoordinate(),
},
}
if err := s.CoordinateBatchUpdate(2, updates); err != nil {
t.Fatalf("err: %s", err)
}
// Make sure it's in there.
coord, err := s.CoordinateGetRaw("node1")
if err != nil {
t.Fatalf("err: %s", err)
}
if !reflect.DeepEqual(coord, updates[0].Coord) {
t.Fatalf("bad: %#v", coord)
}
// Now delete the node.
if err := s.DeleteNode(3, "node1"); err != nil {
t.Fatalf("err: %s", err)
}
// Make sure the coordinate is gone.
coord, err = s.CoordinateGetRaw("node1")
if err != nil {
t.Fatalf("err: %s", err)
}
if coord != nil {
t.Fatalf("bad: %#v", coord)
}
// Make sure the index got updated.
idx, coords, err := s.Coordinates()
if err != nil {
t.Fatalf("err: %s", err)
}
if idx != 3 {
t.Fatalf("bad index: %d", idx)
}
if coords != nil {
t.Fatalf("bad: %#v", coords)
}
}
func TestStateStore_Coordinate_Snapshot_Restore(t *testing.T) {
s := testStateStore(t)
// Register two nodes and update their coordinates.
testRegisterNode(t, s, 1, "node1")
testRegisterNode(t, s, 2, "node2")
updates := structs.Coordinates{
&structs.Coordinate{
Node: "node1",
Coord: generateRandomCoordinate(),
},
&structs.Coordinate{
Node: "node2",
Coord: generateRandomCoordinate(),
},
}
if err := s.CoordinateBatchUpdate(3, updates); err != nil {
t.Fatalf("err: %s", err)
}
// Snapshot the coordinates.
snap := s.Snapshot()
defer snap.Close()
// Alter the real state store.
trash := structs.Coordinates{
&structs.Coordinate{
Node: "node1",
Coord: generateRandomCoordinate(),
},
&structs.Coordinate{
Node: "node2",
Coord: generateRandomCoordinate(),
},
}
if err := s.CoordinateBatchUpdate(4, trash); err != nil {
t.Fatalf("err: %s", err)
}
// Verify the snapshot.
if idx := snap.LastIndex(); idx != 3 {
t.Fatalf("bad index: %d", idx)
}
iter, err := snap.Coordinates()
if err != nil {
t.Fatalf("err: %s", err)
}
var dump structs.Coordinates
for coord := iter.Next(); coord != nil; coord = iter.Next() {
dump = append(dump, coord.(*structs.Coordinate))
}
if !reflect.DeepEqual(dump, updates) {
t.Fatalf("bad: %#v", dump)
}
// Restore the values into a new state store.
func() {
s := testStateStore(t)
restore := s.Restore()
if err := restore.Coordinates(5, dump); err != nil {
t.Fatalf("err: %s", err)
}
restore.Commit()
// Read the restored coordinates back out and verify that they match.
idx, res, err := s.Coordinates()
if err != nil {
t.Fatalf("err: %s", err)
}
if idx != 5 {
t.Fatalf("bad index: %d", idx)
}
if !reflect.DeepEqual(res, updates) {
t.Fatalf("bad: %#v", res)
}
// Check that the index was updated (note that it got passed
// in during the restore).
if idx := s.maxIndex("coordinates"); idx != 5 {
t.Fatalf("bad index: %d", idx)
}
}()
}
func TestStateStore_Coordinate_Watches(t *testing.T) {
s := testStateStore(t)
testRegisterNode(t, s, 1, "node1")
// Call functions that update the coordinates table and make sure a watch fires
// each time.
verifyWatch(t, s.getTableWatch("coordinates"), func() {
updates := structs.Coordinates{
&structs.Coordinate{
Node: "node1",
Coord: generateRandomCoordinate(),
},
}
if err := s.CoordinateBatchUpdate(2, updates); err != nil {
t.Fatalf("err: %s", err)
}
})
verifyWatch(t, s.getTableWatch("coordinates"), func() {
if err := s.DeleteNode(3, "node1"); err != nil {
t.Fatalf("err: %s", err)
}
})
}

View file

@ -7,6 +7,7 @@ import (
"github.com/hashicorp/consul/acl"
"github.com/hashicorp/go-msgpack/codec"
"github.com/hashicorp/serf/coordinate"
)
var (
@ -31,6 +32,7 @@ const (
SessionRequestType
ACLRequestType
TombstoneRequestType
CoordinateBatchUpdateType
)
const (
@ -182,9 +184,18 @@ func (r *DeregisterRequest) RequestDatacenter() string {
return r.Datacenter
}
// QuerySource is used to pass along information about the source node
// in queries so that we can adjust the response based on its network
// coordinates.
type QuerySource struct {
Datacenter string
Node string
}
// DCSpecificRequest is used to query about a specific DC
type DCSpecificRequest struct {
Datacenter string
Source QuerySource
QueryOptions
}
@ -192,12 +203,13 @@ func (r *DCSpecificRequest) RequestDatacenter() string {
return r.Datacenter
}
// ServiceSpecificRequest is used to query about a specific node
// ServiceSpecificRequest is used to query about a specific service
type ServiceSpecificRequest struct {
Datacenter string
ServiceName string
ServiceTag string
TagFilter bool // Controls tag filtering
Source QuerySource
QueryOptions
}
@ -220,6 +232,7 @@ func (r *NodeSpecificRequest) RequestDatacenter() string {
type ChecksInStateRequest struct {
Datacenter string
State string
Source QuerySource
QueryOptions
}
@ -344,7 +357,7 @@ type HealthCheck struct {
type HealthChecks []*HealthCheck
// CheckServiceNode is used to provide the node, its service
// definition, as well as a HealthCheck that is associated
// definition, as well as a HealthCheck that is associated.
type CheckServiceNode struct {
Node *Node
Service *NodeService
@ -618,6 +631,49 @@ type ACLPolicy struct {
QueryMeta
}
// Coordinate stores a node name with its associated network coordinate.
type Coordinate struct {
Node string
Coord *coordinate.Coordinate
}
type Coordinates []*Coordinate
// IndexedCoordinate is used to represent a single node's coordinate from the state
// store.
type IndexedCoordinate struct {
Coord *coordinate.Coordinate
QueryMeta
}
// IndexedCoordinates is used to represent a list of nodes and their
// corresponding raw coordinates.
type IndexedCoordinates struct {
Coordinates Coordinates
QueryMeta
}
// DatacenterMap is used to represent a list of nodes with their raw coordinates,
// associated with a datacenter.
type DatacenterMap struct {
Datacenter string
Coordinates Coordinates
}
// CoordinateUpdateRequest is used to update the network coordinate of a given
// node.
type CoordinateUpdateRequest struct {
Datacenter string
Node string
Coord *coordinate.Coordinate
WriteRequest
}
// RequestDatacenter returns the datacenter for a given update request.
func (c *CoordinateUpdateRequest) RequestDatacenter() string {
return c.Datacenter
}
// EventFireRequest is used to ask a server to fire
// a Serf event. It is a bit odd, since it doesn't depend on
// the catalog or leader. Any node can respond, so it's not quite

View file

@ -141,7 +141,7 @@ func isConsulServer(m serf.Member) (bool, *serverParts) {
return true, parts
}
// Returns if a member is a consul node. Returns a boo,
// Returns if a member is a consul node. Returns a bool,
// and the datacenter.
func isConsulNode(m serf.Member) (bool, string) {
if m.Tags["role"] != "node" {

View file

@ -1,5 +1,5 @@
source "https://rubygems.org"
ruby "2.2.2"
ruby "2.2.3"
gem "middleman-hashicorp", github: "hashicorp/middleman-hashicorp"

View file

@ -186,6 +186,3 @@ PLATFORMS
DEPENDENCIES
middleman-hashicorp!
BUNDLED WITH
1.10.6

View file

@ -14,17 +14,18 @@ to enable changes without breaking backwards compatibility.
Each endpoint manages a different aspect of Consul:
* [kv](http/kv.html) - Key/Value store
* [acl](http/acl.html) - Access Control Lists
* [agent](http/agent.html) - Consul Agent
* [catalog](http/catalog.html) - Nodes and services
* [health](http/health.html) - Health checks
* [session](http/session.html) - Sessions
* [acl](http/acl.html) - Access Control Lists
* [coordinate](http/coordinate.html) - Network coordinates
* [event](http/event.html) - User Events
* [health](http/health.html) - Health checks
* [kv](http/kv.html) - Key/Value store
* [session](http/session.html) - Sessions
* [status](http/status.html) - Consul system status
* internal - Internal APIs. Purposely undocumented, subject to change.
Each of these is documented in detail at the links above.
Each of these is documented in detail at the links above. Consul also has a number
of internal APIs which are purposely undocumented and subject to change.
## Blocking Queries

View file

@ -163,6 +163,11 @@ It returns a JSON body like this:
"EnableSyslog": false,
"RejoinAfterLeave": false
},
"Coord": {
"Adjustment": 0,
"Error": 1.5,
"Vec": [0,0,0,0,0,0,0,0]
},
"Member": {
"Name": "foobar",
"Addr": "10.1.10.12",

View file

@ -159,6 +159,10 @@ If the API call succeeds a 200 status code is returned.
This endpoint is hit with a GET and is used to return all the
datacenters that are known by the Consul server.
The datacenters will be sorted in ascending order based on the
estimated median round trip time from the server to the servers
in that datacenter.
It returns a JSON body like this:
```javascript
@ -175,6 +179,11 @@ This endpoint is hit with a GET and returns the nodes registered
in a given DC. By default, the datacenter of the agent is queried;
however, the dc can be provided using the "?dc=" query parameter.
Adding the optional "?near=" parameter with a node name will sort
the node list in ascending order based on the estimated round trip
time from that node. Passing "?near=_agent" will use the agent's
node for the sort.
It returns a JSON body like this:
```javascript
@ -226,6 +235,11 @@ The service being queried must be provided on the path. By default
all nodes in that service are returned. However, the list can be filtered
by tag using the "?tag=" query parameter.
Adding the optional "?near=" parameter with a node name will sort
the node list in ascending order based on the estimated round trip
time from that node. Passing "?near=_agent" will use the agent's
node for the sort.
It returns a JSON body like this:
```javascript

View file

@ -0,0 +1,78 @@
---
layout: "docs"
page_title: "Coordinate (HTTP)"
sidebar_current: "docs-agent-http-coordinate"
description: >
The Coordinate endpoint is used to query for the network coordinates for
nodes in the local datacenter as well as Consul servers in the local
datacenter and remote datacenters.
---
# Coordinate HTTP Endpoint
The Coordinate endpoint is used to query for the network coordinates for nodes
in the local datacenter as well as Consul servers in the local datacenter and
remote datacenters.
See the [Network Coordinates](/docs/internals/coordinates.html) internals guide
for more information on how these coordinates are computed, and for details on
how to perform calculations with them.
The following endpoints are supported:
* [`/v1/coordinate/datacenters`](#coordinate_datacenters) : Queries for WAN coordinates of Consul servers
* [`/v1/coordinate/nodes`](#coordinate_nodes) : Queries for LAN coordinates of Consul nodes
### <a name="coordinate_datacenters"></a> /v1/coordinate/datacenters
This endpoint is hit with a GET and returns the WAN network coordinates for
all Consul servers, organized by DCs.
It returns a JSON body like this:
```javascript
[
{
"Datacenter": "dc1",
"Coordinates": [
{
"Node": "agent-one",
"Coord": {
"Adjustment": 0,
"Error": 1.5,
"Height": 0,
"Vec": [0,0,0,0,0,0,0,0]
}
}
]
}
]
```
This endpoint serves data out of the server's local Serf data about the WAN, so
its results may vary as requests are handled by different servers in the
cluster. Also, it does not support blocking queries or any consistency modes.
### <a name="coordinate_nodes"></a> /v1/coordinate/nodes
This endpoint is hit with a GET and returns the LAN network coordinates for
all nodes in a given DC. By default, the datacenter of the agent is queried;
however, the dc can be provided using the "?dc=" query parameter.
It returns a JSON body like this:
```javascript
[
{
"Node": "agent-one",
"Coord": {
"Adjustment": 0,
"Error": 1.5,
"Height": 0,
"Vec": [0,0,0,0,0,0,0,0]
}
}
]
```
This endpoint supports blocking queries and all consistency modes.
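Both endpoints are also exposed through Consul's Go API client. Below is a minimal
sketch, assuming a local agent and the default client configuration; it is
illustrative only and not part of the HTTP reference:

```
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/consul/api"
)

func main() {
	// Connect to the local agent (127.0.0.1:8500 by default).
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// WAN coordinates of the Consul servers, grouped by datacenter.
	dcs, err := client.Coordinate().Datacenters()
	if err != nil {
		log.Fatal(err)
	}
	for _, dc := range dcs {
		fmt.Printf("%s: %d server coordinate(s)\n", dc.Datacenter, len(dc.Coordinates))
	}

	// LAN coordinates of all nodes in the agent's datacenter.
	nodes, _, err := client.Coordinate().Nodes(nil)
	if err != nil {
		log.Fatal(err)
	}
	for _, entry := range nodes {
		fmt.Printf("%s: %v\n", entry.Node, entry.Coord.Vec)
	}
}
```

The `Nodes` call accepts the usual query options, so blocking queries can be
issued through the client as well.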

View file

@ -70,6 +70,11 @@ This endpoint is hit with a GET and returns the checks associated with
the service provided on the path. By default, the datacenter of the agent is queried;
however, the dc can be provided using the "?dc=" query parameter.
Adding the optional "?near=" parameter with a node name will sort
the node list in ascending order based on the estimated round trip
time from that node. Passing "?near=_agent" will use the agent's
node for the sort.
It returns a JSON body like this:
```javascript
@ -95,6 +100,11 @@ This endpoint is hit with a GET and returns the nodes providing
the service indicated on the path. By default, the datacenter of the agent is queried;
however, the dc can be provided using the "?dc=" query parameter.
Adding the optional "?near=" parameter with a node name will sort
the node list in ascending order based on the estimated round trip
time from that node. Passing "?near=_agent" will use the agent's
node for the sort.
By default, all nodes matching the service are returned. The list can be filtered
by tag using the "?tag=" query parameter.
@ -159,6 +169,11 @@ This endpoint is hit with a GET and returns the checks in the
state provided on the path. By default, the datacenter of the agent is queried;
however, the dc can be provided using the "?dc=" query parameter.
Adding the optional "?near=" parameter with a node name will sort
the node list in ascending order based on the estimated round trip
time from that node. Passing "?near=_agent" will use the agent's
node for the sort.
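As an illustrative sketch (not part of the HTTP reference itself), the same sort
can be requested through Consul's Go API client by setting the new `Near` field
on `QueryOptions`; the catalog and health endpoints described above honor it in
the same way:

```
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/consul/api"
)

func main() {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// Checks in any state, sorted by the estimated round trip time from
	// this agent's node.
	checks, _, err := client.Health().State("any", &api.QueryOptions{Near: "_agent"})
	if err != nil {
		log.Fatal(err)
	}
	for _, check := range checks {
		fmt.Printf("%s on %s: %s\n", check.Name, check.Node, check.Status)
	}
}
```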
The supported states are `any`, `unknown`, `passing`, `warning`, or `critical`.
The `any` state is a wildcard that can be used to return all checks.

View file

@ -39,6 +39,7 @@ Available commands are:
members Lists the members of a Consul cluster
monitor Stream logs from a Consul agent
reload Triggers the agent to reload configuration files
rtt Estimates network round trip time between nodes
version Prints the Consul version
watch Watch for changes in Consul
```

View file

@ -0,0 +1,55 @@
---
layout: "docs"
page_title: "Commands: RTT"
sidebar_current: "docs-commands-rtt"
description: >
The rtt command estimates the network round trip time between two nodes.
---
# Consul RTT
Command: `consul rtt`
The `rtt` command estimates the network round trip time between two nodes using
Consul's network coordinate model of the cluster.
See the [Network Coordinates](/docs/internals/coordinates.html) internals guide
for more information on how these coordinates are computed.
## Usage
Usage: `consul rtt [options] node1 [node2]`
At least one node name is required. If the second node name isn't given, it
is set to the agent's node name. Note that these are node names as known to
Consul, as shown by `consul members`, not IP addresses.
The list of available flags are:
* `-wan` - Instructs the command to use WAN coordinates instead of LAN
coordinates. By default, the two nodes are assumed to be nodes in the local
datacenter and the LAN coordinates are used. If the -wan option is given,
then the WAN coordinates are used, and the node names must be suffixed by a period
and the datacenter (e.g. "myserver.dc1"). It is not possible to measure between
LAN coordinates and WAN coordinates, so both nodes must be in the same pool.
* `-http-addr` - Address to the HTTP server of the agent you want to contact
to send this command. If this isn't specified, the command will contact
"127.0.0.1:8500" which is the default HTTP address of a Consul agent.
## Output
If coordinates are available, the command will print the estimated round trip
time between the given nodes:
```
$ consul rtt n1 n2
Estimated n1 <-> n2 rtt: 0.610 ms (using LAN coordinates)
$ consul rtt n2 # Running from n1
Estimated n1 <-> n2 rtt: 0.610 ms (using LAN coordinates)
$ consul rtt -wan n1.dc1 n2.dc2
Estimated n1.dc1 <-> n2.dc2 rtt: 1.275 ms (using WAN coordinates)
```

View file

@ -0,0 +1,101 @@
---
layout: "docs"
page_title: "Network Coordinates"
sidebar_current: "docs-internals-coordinates"
description: |-
Consul uses a network tomography system to compute network coordinates for nodes in the cluster. These coordinates are useful for easily calculating the estimated network round trip time between any two nodes in the cluster. This page documents the details of this system. The core of the network tomography system is based on "Vivaldi: A Decentralized Network Coordinate System", with improvements drawn from several follow-on papers.
---
# Network Coordinates
Consul uses a [network tomography](https://en.wikipedia.org/wiki/Network_tomography)
system to compute network coordinates for nodes in the cluster. These coordinates
allow the network round trip time to be estimated between any two nodes using
a very simple calculation. This allows for many useful applications, such as finding
the service node nearest a requesting node, or failing over to services in the next
closest datacenter.
All of this is provided through the use of the [Serf library](https://www.serfdom.io/).
Serf's network tomography is based on ["Vivaldi: A Decentralized Network Coordinate System"](http://www.cs.ucsb.edu/~ravenben/classes/276/papers/vivaldi-sigcomm04.pdf),
with some enhancements based on other research. There are more details about
[Serf's network coordinates here](https://www.serfdom.io/docs/internals/coordinates.html).
~> **Advanced Topic!** This page covers the technical details of
the internals of Consul. You don't need to know these details to effectively
operate and use Consul. These details are documented here for those who wish
to learn about them without having to go spelunking through the source code.
## Network Coordinates in Consul
Network coordinates manifest in several ways inside Consul:
* The [`consul rtt`](/docs/commands/rtt.html) command can be used to query for the
network round trip time between any two nodes.
* The [Catalog endpoints](/docs/agent/http/catalog.html) and
[Health endpoints](/docs/agent/http/health.html) can sort the results of queries based
on the network round trip time from a given node using a "?near=" parameter.
* The [Coordinate endpoint](/docs/agent/http/coordinate.html) exposes raw network
coordinates for use in other applications.
Consul uses Serf to manage two different gossip pools, one for the LAN with members
of a given datacenter, and one for the WAN which is made up of just the Consul servers
in all datacenters. It's important to note that **network coordinates are not compatible
between these two pools**. LAN coordinates only make sense in calculations with other
LAN coordinates, and WAN coordinates only make sense with other WAN coordinates.
## Working with Coordinates
Computing the estimated network round trip time between any two nodes is simple
once you have their coordinates. Here's a sample coordinate, as returned from the
[Coordinate endpoint](/docs/agent/http/coordinate.html).
```
"Coord": {
"Adjustment": 0.1,
"Error": 1.5,
"Height": 0.02,
"Vec": [0.34,0.68,0.003,0.01,0.05,0.1,0.34,0.06]
}
```
All values are floating point numbers in units of seconds, except for the error
term which isn't used for distance calculations.
Here's a complete example in Go showing how to compute the distance between two
coordinates:
```
import (
"github.com/hashicorp/serf/coordinate"
"math"
"time"
)
func dist(a *coordinate.Coordinate, b *coordinate.Coordinate) time.Duration {
// Coordinates will always have the same dimensionality, so this is
// just a sanity check.
if len(a.Vec) != len(b.Vec) {
panic("dimensions aren't compatible")
}
// Calculate the Euclidean distance plus the heights.
sumsq := 0.0
for i := 0; i < len(a.Vec); i++ {
diff := a.Vec[i] - b.Vec[i]
sumsq += diff * diff
}
rtt := math.Sqrt(sumsq) + a.Height + b.Height
// Apply the adjustment components, guarding against negatives.
adjusted := rtt + a.Adjustment + b.Adjustment
if adjusted > 0.0 {
rtt = adjusted
}
// Go's times are natively nanoseconds, so we convert from seconds.
const secondsToNanoseconds = 1.0e9
return time.Duration(rtt * secondsToNanoseconds)
}
```
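As a usage sketch, the coordinates served by the
[Coordinate endpoint](/docs/agent/http/coordinate.html) can be fed directly into
this calculation. The helper below assumes the `dist` function above is in scope,
uses the Go API client, and additionally imports `fmt` and
`github.com/hashicorp/consul/api`; the node names are placeholders:

```
// estimateRTT pulls the LAN coordinates from the local agent and estimates
// the round trip time between two named nodes using dist above. Coordinates
// may not be available yet for newly joined nodes, so callers should expect
// an error in that case.
func estimateRTT(nodeA, nodeB string) (time.Duration, error) {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		return 0, err
	}

	entries, _, err := client.Coordinate().Nodes(nil)
	if err != nil {
		return 0, err
	}

	// Index the coordinates by node name.
	coords := make(map[string]*coordinate.Coordinate)
	for _, entry := range entries {
		coords[entry.Node] = entry.Coord
	}

	a, b := coords[nodeA], coords[nodeB]
	if a == nil || b == nil {
		return 0, fmt.Errorf("no coordinates for %q and/or %q yet", nodeA, nodeB)
	}
	return dist(a, b), nil
}
```

This mirrors, roughly, what the `consul rtt` command does for the LAN case.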

View file

@ -15,7 +15,7 @@
<li<%= sidebar_current("docs-upgrading-compat") %>>
<a href="/docs/compatibility.html">Compatibility Promise</a>
</li>
</li>
<li<%= sidebar_current("docs-upgrading-specific") %>>
<a href="/docs/upgrade-specific.html">Specific Version Details</a>
@ -38,6 +38,10 @@
<a href="/docs/internals/gossip.html">Gossip Protocol</a>
</li>
<li<%= sidebar_current("docs-internals-coordinates") %>>
<a href="/docs/internals/coordinates.html">Network Coordinates</a>
</li>
<li<%= sidebar_current("docs-internals-sessions") %>>
<a href="/docs/internals/sessions.html">Sessions</a>
</li>
@ -96,8 +100,7 @@
</li>
<li<%= sidebar_current("docs-commands-leave") %>>
<a href="/docs/commands/leave.html">leave</a>
</li>
<a href="/docs/commands/leave.html">leave</a></li>
<li<%= sidebar_current("docs-commands-lock") %>>
<a href="/docs/commands/lock.html">lock</a>
@ -123,6 +126,10 @@
<a href="/docs/commands/reload.html">reload</a>
</li>
<li<%= sidebar_current("docs-commands-rtt") %>>
<a href="/docs/commands/rtt.html">rtt</a>
</li>
<li<%= sidebar_current("docs-commands-watch") %>>
<a href="/docs/commands/watch.html">watch</a>
</li>
@ -143,8 +150,8 @@
<li<%= sidebar_current("docs-agent-http") %>>
<a href="/docs/agent/http.html">HTTP API</a>
<ul class="subnav">
<li<%= sidebar_current("docs-agent-http-kv") %>>
<a href="/docs/agent/http/kv.html">Key/Value store</a>
<li<%= sidebar_current("docs-agent-http-acl") %>>
<a href="/docs/agent/http/acl.html">ACLs</a>
</li>
<li<%= sidebar_current("docs-agent-http-agent") %>>
@ -155,20 +162,24 @@
<a href="/docs/agent/http/catalog.html">Catalog</a>
</li>
<li<%= sidebar_current("docs-agent-http-session") %>>
<a href="/docs/agent/http/session.html">Sessions</a>
<li<%= sidebar_current("docs-agent-http-event") %>>
<a href="/docs/agent/http/event.html">Events</a>
</li>
<li<%= sidebar_current("docs-agent-http-health") %>>
<a href="/docs/agent/http/health.html">Health Checks</a>
</li>
<li<%= sidebar_current("docs-agent-http-acl") %>>
<a href="/docs/agent/http/acl.html">ACLs</a>
<li<%= sidebar_current("docs-agent-http-kv") %>>
<a href="/docs/agent/http/kv.html">Key/Value store</a>
</li>
<li<%= sidebar_current("docs-agent-http-event") %>>
<a href="/docs/agent/http/event.html">Events</a>
<li<%= sidebar_current("docs-agent-http-coordinate") %>>
<a href="/docs/agent/http/coordinate.html">Network Coordinates</a>
</li>
<li<%= sidebar_current("docs-agent-http-session") %>>
<a href="/docs/agent/http/session.html">Sessions</a>
</li>
<li<%= sidebar_current("docs-agent-http-status") %>>