agent: prevent very old servers re-joining a cluster with stale data (#17171)

* agent: configure server lastseen timestamp

Signed-off-by: Dan Bond <danbond@protonmail.com>

* use correct config

Signed-off-by: Dan Bond <danbond@protonmail.com>

* add comments

Signed-off-by: Dan Bond <danbond@protonmail.com>

* use default age in test golden data

Signed-off-by: Dan Bond <danbond@protonmail.com>

* add changelog

Signed-off-by: Dan Bond <danbond@protonmail.com>

* fix runtime test

Signed-off-by: Dan Bond <danbond@protonmail.com>

* agent: add server_metadata

Signed-off-by: Dan Bond <danbond@protonmail.com>

* update comments

Signed-off-by: Dan Bond <danbond@protonmail.com>

* correctly check if metadata file does not exist

Signed-off-by: Dan Bond <danbond@protonmail.com>

* follow instructions for adding new config

Signed-off-by: Dan Bond <danbond@protonmail.com>

* add comments

Signed-off-by: Dan Bond <danbond@protonmail.com>

* update comments

Signed-off-by: Dan Bond <danbond@protonmail.com>

* Update agent/agent.go

Co-authored-by: Dan Upton <daniel@floppy.co>

* agent/config: add validation for duration with min

Signed-off-by: Dan Bond <danbond@protonmail.com>

* docs: add new server_rejoin_age_max config definition

Signed-off-by: Dan Bond <danbond@protonmail.com>

* agent: add unit test for checking server last seen

Signed-off-by: Dan Bond <danbond@protonmail.com>

* agent: log continually for 60s before erroring

Signed-off-by: Dan Bond <danbond@protonmail.com>

* pr comments

Signed-off-by: Dan Bond <danbond@protonmail.com>

* remove unneeded todo

* agent: fix error message

Signed-off-by: Dan Bond <danbond@protonmail.com>

---------

Signed-off-by: Dan Bond <danbond@protonmail.com>
Co-authored-by: Dan Upton <daniel@floppy.co>
This commit is contained in:
Dan Bond 2023-05-15 04:05:47 -07:00 committed by GitHub
parent 91ed8de9f5
commit 6bb7782745
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 385 additions and 46 deletions

3
.changelog/17171.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
agent: add a configurable maximimum age (default: 7 days) to prevent servers re-joining a cluster with stale data
```

View File

@ -7,6 +7,7 @@ import (
"context"
"crypto/tls"
"encoding/json"
"errors"
"fmt"
"io"
"net"
@ -22,8 +23,6 @@ import (
"github.com/armon/go-metrics"
"github.com/armon/go-metrics/prometheus"
"github.com/hashicorp/consul/agent/rpcclient"
"github.com/hashicorp/consul/agent/rpcclient/configentry"
"github.com/hashicorp/go-connlimit"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-memdb"
@ -50,12 +49,13 @@ import (
grpcDNS "github.com/hashicorp/consul/agent/grpc-external/services/dns"
middleware "github.com/hashicorp/consul/agent/grpc-middleware"
"github.com/hashicorp/consul/agent/hcp/scada"
libscada "github.com/hashicorp/consul/agent/hcp/scada"
"github.com/hashicorp/consul/agent/local"
"github.com/hashicorp/consul/agent/proxycfg"
proxycfgglue "github.com/hashicorp/consul/agent/proxycfg-glue"
catalogproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/catalog"
localproxycfg "github.com/hashicorp/consul/agent/proxycfg-sources/local"
"github.com/hashicorp/consul/agent/rpcclient"
"github.com/hashicorp/consul/agent/rpcclient/configentry"
"github.com/hashicorp/consul/agent/rpcclient/health"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/agent/systemd"
@ -575,11 +575,11 @@ func (a *Agent) Start(ctx context.Context) error {
return err
}
// copy over the existing node id, this cannot be
// changed while running anyways but this prevents
// breaking some existing behavior. then overwrite
// the configuration
// Copy over the existing node id. This cannot be
// changed while running, but this prevents
// breaking some existing behavior.
c.NodeID = a.config.NodeID
// Overwrite the configuration.
a.config = c
if err := a.tlsConfigurator.Update(a.config.TLS); err != nil {
@ -625,6 +625,20 @@ func (a *Agent) Start(ctx context.Context) error {
if c.ServerMode {
serverLogger := a.baseDeps.Logger.NamedIntercept(logging.ConsulServer)
// Check for a last seen timestamp and exit if deemed stale before attempting to join
// Serf/Raft or listen for requests.
if err := a.checkServerLastSeen(consul.ReadServerMetadata); err != nil {
deadline := time.Now().Add(time.Minute)
for time.Now().Before(deadline) {
a.logger.Error("startup error", "error", err)
time.Sleep(10 * time.Second)
}
return err
}
// periodically write server metadata to disk.
go a.persistServerMetadata()
incomingRPCLimiter := consul.ConfiguredIncomingRPCLimiter(
&lib.StopChannelContext{StopCh: a.shutdownCh},
serverLogger,
@ -661,7 +675,6 @@ func (a *Agent) Start(ctx context.Context) error {
return fmt.Errorf("failed to start server cert manager: %w", err)
}
}
} else {
a.externalGRPCServer = external.NewServer(
a.logger.Named("grpc.external"),
@ -1094,7 +1107,7 @@ func (a *Agent) listenHTTP() ([]apiServer, error) {
MaxHeaderBytes: a.config.HTTPMaxHeaderBytes,
}
if libscada.IsCapability(l.Addr()) {
if scada.IsCapability(l.Addr()) {
// wrap in http2 server handler
httpServer.Handler = h2c.NewHandler(srv.handler(a.config.EnableDebug), &http2.Server{})
}
@ -1521,6 +1534,8 @@ func newConsulConfig(runtimeCfg *config.RuntimeConfig, logger hclog.Logger) (*co
cfg.Reporting.License.Enabled = runtimeCfg.Reporting.License.Enabled
cfg.ServerRejoinAgeMax = runtimeCfg.ServerRejoinAgeMax
enterpriseConsulConfig(cfg, runtimeCfg)
return cfg, nil
@ -4529,7 +4544,70 @@ func (a *Agent) proxyDataSources() proxycfg.DataSources {
a.fillEnterpriseProxyDataSources(&sources)
return sources
}
// persistServerMetadata periodically writes a server's metadata to a file
// in the configured data directory.
func (a *Agent) persistServerMetadata() {
file := filepath.Join(a.config.DataDir, consul.ServerMetadataFile)
// Create a timer with no initial tick to allow metadata to be written immediately.
t := time.NewTimer(0)
defer t.Stop()
for {
select {
case <-t.C:
// Reset the timer to the larger periodic interval.
t.Reset(1 * time.Hour)
f, err := consul.OpenServerMetadata(file)
if err != nil {
a.logger.Error("failed to open existing server metadata: %w", err)
continue
}
if err := consul.WriteServerMetadata(f); err != nil {
f.Close()
a.logger.Error("failed to write server metadata: %w", err)
continue
}
f.Close()
case <-a.shutdownCh:
return
}
}
}
// checkServerLastSeen is a safety check that only occurs once of startup to prevent old servers
// with stale data from rejoining an existing cluster.
//
// It attempts to read a server's metadata file and check the last seen Unix timestamp against a
// configurable max age. If the metadata file does not exist, we treat this as an initial startup
// and return no error.
//
// Example: if the server recorded a last seen timestamp of now-7d, and we configure a max age
// of 3d, then we should prevent the server from rejoining.
func (a *Agent) checkServerLastSeen(readFn consul.ServerMetadataReadFunc) error {
filename := filepath.Join(a.config.DataDir, consul.ServerMetadataFile)
// Read server metadata file.
md, err := readFn(filename)
if err != nil {
// Return early if it doesn't exist as this likely indicates the server is starting for the first time.
if errors.Is(err, os.ErrNotExist) {
return nil
}
return fmt.Errorf("error reading server metadata: %w", err)
}
maxAge := a.config.ServerRejoinAgeMax
if md.IsLastSeenStale(maxAge) {
return fmt.Errorf("refusing to rejoin cluster because server has been offline for more than the configured server_rejoin_age_max (%s) - consider wiping your data dir", maxAge)
}
return nil
}
func listenerPortKey(svcID structs.ServiceID, checkID structs.CheckID) string {

View File

@ -12,6 +12,7 @@ import (
"crypto/x509"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
mathrand "math/rand"
"net"
@ -6204,6 +6205,70 @@ cloud {
require.NoError(t, err)
}
func TestAgent_checkServerLastSeen(t *testing.T) {
bd := BaseDeps{
Deps: consul.Deps{
Logger: hclog.NewInterceptLogger(nil),
Tokens: new(token.Store),
GRPCConnPool: &fakeGRPCConnPool{},
},
RuntimeConfig: &config.RuntimeConfig{},
Cache: cache.New(cache.Options{}),
}
agent, err := New(bd)
require.NoError(t, err)
// Test that an ErrNotExist OS error is treated as ok.
t.Run("TestReadErrNotExist", func(t *testing.T) {
readFn := func(filename string) (*consul.ServerMetadata, error) {
return nil, os.ErrNotExist
}
err := agent.checkServerLastSeen(readFn)
require.NoError(t, err)
})
// Test that an error reading server metadata is treated as an error.
t.Run("TestReadErr", func(t *testing.T) {
expected := errors.New("read error")
readFn := func(filename string) (*consul.ServerMetadata, error) {
return nil, expected
}
err := agent.checkServerLastSeen(readFn)
require.ErrorIs(t, err, expected)
})
// Test that a server with a 7d old last seen timestamp is treated as an error.
t.Run("TestIsLastSeenStaleErr", func(t *testing.T) {
agent.config.ServerRejoinAgeMax = time.Hour
readFn := func(filename string) (*consul.ServerMetadata, error) {
return &consul.ServerMetadata{
LastSeenUnix: time.Now().Add(-24 * 7 * time.Hour).Unix(),
}, nil
}
err := agent.checkServerLastSeen(readFn)
require.Error(t, err)
require.ErrorContains(t, err, "refusing to rejoin cluster because server has been offline for more than the configured server_rejoin_age_max")
})
// Test that a server with a 6h old last seen timestamp is not treated as an error.
t.Run("TestNoErr", func(t *testing.T) {
agent.config.ServerRejoinAgeMax = 24 * 7 * time.Hour
readFn := func(filename string) (*consul.ServerMetadata, error) {
return &consul.ServerMetadata{
LastSeenUnix: time.Now().Add(-6 * time.Hour).Unix(),
}, nil
}
err := agent.checkServerLastSeen(readFn)
require.NoError(t, err)
})
}
func getExpectedCaPoolByFile(t *testing.T) *x509.CertPool {
pool := x509.NewCertPool()
data, err := os.ReadFile("../test/ca/root.cer")

View File

@ -28,8 +28,6 @@ import (
"github.com/hashicorp/memberlist"
"golang.org/x/time/rate"
hcpconfig "github.com/hashicorp/consul/agent/hcp/config"
"github.com/hashicorp/consul/agent/cache"
"github.com/hashicorp/consul/agent/checks"
"github.com/hashicorp/consul/agent/connect/ca"
@ -37,6 +35,7 @@ import (
"github.com/hashicorp/consul/agent/consul/authmethod/ssoauth"
consulrate "github.com/hashicorp/consul/agent/consul/rate"
"github.com/hashicorp/consul/agent/dns"
hcpconfig "github.com/hashicorp/consul/agent/hcp/config"
"github.com/hashicorp/consul/agent/rpc/middleware"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/agent/token"
@ -1090,6 +1089,7 @@ func (b *builder) build() (rt RuntimeConfig, err error) {
ServerMode: serverMode,
ServerName: stringVal(c.ServerName),
ServerPort: serverPort,
ServerRejoinAgeMax: b.durationValWithDefaultMin("server_rejoin_age_max", c.ServerRejoinAgeMax, 24*7*time.Hour, 6*time.Hour),
Services: services,
SessionTTLMin: b.durationVal("session_ttl_min", c.SessionTTLMin),
SkipLeaveOnInt: skipLeaveOnInt,
@ -1952,6 +1952,16 @@ func (b *builder) durationValWithDefault(name string, v *string, defaultVal time
return d
}
// durationValWithDefaultMin is equivalent to durationValWithDefault, but enforces a minimum duration.
func (b *builder) durationValWithDefaultMin(name string, v *string, defaultVal, minVal time.Duration) (d time.Duration) {
d = b.durationValWithDefault(name, v, defaultVal)
if d < minVal {
b.err = multierror.Append(b.err, fmt.Errorf("%s: duration '%s' cannot be less than: %s", name, *v, minVal))
}
return d
}
func (b *builder) durationVal(name string, v *string) (d time.Duration) {
return b.durationValWithDefault(name, v, 0)
}

View File

@ -311,6 +311,21 @@ func TestBuilder_DurationVal_InvalidDuration(t *testing.T) {
require.Contains(t, b.err.Error(), badDuration2)
}
func TestBuilder_DurationValWithDefaultMin(t *testing.T) {
b := builder{}
// Attempt to validate that a duration of 10 hours will not error when the min val is 1 hour.
dur := "10h0m0s"
b.durationValWithDefaultMin("field2", &dur, 24*7*time.Hour, time.Hour)
require.NoError(t, b.err)
// Attempt to validate that a duration of 1 min will error when the min val is 1 hour.
dur = "0h1m0s"
b.durationValWithDefaultMin("field1", &dur, 24*7*time.Hour, time.Hour)
require.Error(t, b.err)
require.Contains(t, b.err.Error(), "1 error")
}
func TestBuilder_ServiceVal_MultiError(t *testing.T) {
b := builder{}
b.serviceVal(&ServiceDefinition{

View File

@ -228,6 +228,7 @@ type Config struct {
SerfBindAddrWAN *string `mapstructure:"serf_wan" json:"serf_wan,omitempty"`
ServerMode *bool `mapstructure:"server" json:"server,omitempty"`
ServerName *string `mapstructure:"server_name" json:"server_name,omitempty"`
ServerRejoinAgeMax *string `mapstructure:"server_rejoin_age_max" json:"server_rejoin_age_max,omitempty"`
Service *ServiceDefinition `mapstructure:"service" json:"-"`
Services []ServiceDefinition `mapstructure:"services" json:"-"`
SessionTTLMin *string `mapstructure:"session_ttl_min" json:"session_ttl_min,omitempty"`

View File

@ -58,6 +58,7 @@ func DefaultSource() Source {
segment_limit = 64
server = false
server_rejoin_age_max = "168h"
syslog_facility = "LOCAL0"
tls = {

View File

@ -1358,6 +1358,18 @@ type RuntimeConfig struct {
// hcl: ports { server = int }
ServerPort int
// ServerRejoinAgeMax is used to specify the duration of time a server
// is allowed to be down/offline before a startup operation is refused.
//
// For example: if a server has been offline for 5 days, and this option
// is configured to 3 days, then any subsequent startup operation will fail
// and require an operator to manually intervene.
//
// The default is: 7 days
//
// hcl: server_rejoin_age_max = "duration"
ServerRejoinAgeMax time.Duration
// Services contains the provided service definitions:
//
// hcl: services = [

View File

@ -25,13 +25,12 @@ import (
"github.com/stretchr/testify/require"
"golang.org/x/time/rate"
hcpconfig "github.com/hashicorp/consul/agent/hcp/config"
"github.com/hashicorp/consul/acl"
"github.com/hashicorp/consul/agent/cache"
"github.com/hashicorp/consul/agent/checks"
"github.com/hashicorp/consul/agent/consul"
consulrate "github.com/hashicorp/consul/agent/consul/rate"
hcpconfig "github.com/hashicorp/consul/agent/hcp/config"
"github.com/hashicorp/consul/agent/structs"
"github.com/hashicorp/consul/agent/token"
"github.com/hashicorp/consul/lib"
@ -6419,6 +6418,7 @@ func TestLoad_FullConfig(t *testing.T) {
SerfPortWAN: 8302,
ServerMode: true,
ServerName: "Oerr9n1G",
ServerRejoinAgeMax: 604800 * time.Second,
ServerPort: 3757,
Services: []*structs.ServiceDefinition{
{
@ -7163,7 +7163,8 @@ func TestRuntimeConfig_Sanitize(t *testing.T) {
},
},
},
Locality: &Locality{Region: strPtr("us-west-1"), Zone: strPtr("us-west-1a")},
Locality: &Locality{Region: strPtr("us-west-1"), Zone: strPtr("us-west-1a")},
ServerRejoinAgeMax: 24 * 7 * time.Hour,
}
b, err := json.MarshalIndent(rt.Sanitized(), "", " ")

View File

@ -332,6 +332,7 @@
"ServerMode": false,
"ServerName": "",
"ServerPort": 0,
"ServerRejoinAgeMax": "168h0m0s",
"Services": [
{
"Address": "",

View File

@ -394,6 +394,7 @@ serf_lan = "99.43.63.15"
serf_wan = "67.88.33.19"
server = true
server_name = "Oerr9n1G"
server_rejoin_age_max = "604800s"
service = {
id = "dLOXpSCI"
name = "o1ynPkp0"

View File

@ -453,6 +453,7 @@
"serf_wan": "67.88.33.19",
"server": true,
"server_name": "Oerr9n1G",
"server_rejoin_age_max": "604800s",
"service": {
"id": "dLOXpSCI",
"name": "o1ynPkp0",

View File

@ -447,6 +447,10 @@ type Config struct {
// Embedded Consul Enterprise specific configuration
*EnterpriseConfig
// ServerRejoinAgeMax is used to specify the duration of time a server
// is allowed to be down/offline before a startup operation is refused.
ServerRejoinAgeMax time.Duration
}
func (c *Config) InPrimaryDatacenter() bool {
@ -574,6 +578,8 @@ func DefaultConfig() *Config {
PeeringTestAllowPeerRegistrations: false,
EnterpriseConfig: DefaultEnterpriseConfig(),
ServerRejoinAgeMax: 24 * 7 * time.Hour,
}
// Increase our reap interval to 3 days instead of 24h.

View File

@ -0,0 +1,71 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package consul
import (
"encoding/json"
"io"
"os"
"time"
)
// ServerMetadataFile is the name of the file on disk that server metadata
// should be written to.
const ServerMetadataFile = "server_metadata.json"
// ServerMetadata represents specific metadata about a running server.
type ServerMetadata struct {
// LastSeenUnix is the timestamp a server was last seen, in Unix format.
LastSeenUnix int64 `json:"last_seen_unix"`
}
// IsLastSeenStale checks whether the last seen timestamp is older than a given duration.
func (md *ServerMetadata) IsLastSeenStale(d time.Duration) bool {
lastSeen := time.Unix(md.LastSeenUnix, 0)
maxAge := time.Now().Add(-d)
return lastSeen.Before(maxAge)
}
// OpenServerMetadata is a helper function for opening the server metadata file
// with the correct permissions.
func OpenServerMetadata(filename string) (io.WriteCloser, error) {
return os.OpenFile(filename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0600)
}
type ServerMetadataReadFunc func(filename string) (*ServerMetadata, error)
// ReadServerMetadata is a helper function for reading the contents of a server
// metadata file and unmarshaling the data from JSON.
func ReadServerMetadata(filename string) (*ServerMetadata, error) {
b, err := os.ReadFile(filename)
if err != nil {
return nil, err
}
var md ServerMetadata
if err := json.Unmarshal(b, &md); err != nil {
return nil, err
}
return &md, nil
}
// WriteServerMetadata writes server metadata to a file in JSON format.
func WriteServerMetadata(w io.Writer) error {
md := &ServerMetadata{
LastSeenUnix: time.Now().Unix(),
}
b, err := json.Marshal(md)
if err != nil {
return err
}
if _, err := w.Write(b); err != nil {
return err
}
return nil
}

View File

@ -0,0 +1,68 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package consul
import (
"bytes"
"errors"
"testing"
"time"
"github.com/stretchr/testify/assert"
)
type mockServerMetadataWriter struct {
writeErr error
}
func (m *mockServerMetadataWriter) Write(p []byte) (n int, err error) {
if m.writeErr != nil {
return 0, m.writeErr
}
return 1, nil
}
func TestServerMetadata(t *testing.T) {
now := time.Now()
t.Run("TestIsLastSeenStaleTrue", func(t *testing.T) {
// Create a server that is 48 hours old.
md := &ServerMetadata{
LastSeenUnix: now.Add(-48 * time.Hour).Unix(),
}
stale := md.IsLastSeenStale(24 * time.Hour)
assert.True(t, stale)
})
t.Run("TestIsLastSeenStaleFalse", func(t *testing.T) {
// Create a server that is 1 hour old.
md := &ServerMetadata{
LastSeenUnix: now.Add(-1 * time.Hour).Unix(),
}
stale := md.IsLastSeenStale(24 * time.Hour)
assert.False(t, stale)
})
}
func TestWriteServerMetadata(t *testing.T) {
t.Run("TestWriteError", func(t *testing.T) {
m := &mockServerMetadataWriter{
writeErr: errors.New("write error"),
}
err := WriteServerMetadata(m)
assert.Error(t, err)
})
t.Run("TestOK", func(t *testing.T) {
b := new(bytes.Buffer)
err := WriteServerMetadata(b)
assert.NoError(t, err)
assert.True(t, b.Len() > 0)
})
}

View File

@ -736,6 +736,11 @@ Refer to the [formatting specification](https://golang.org/pkg/time/#ParseDurati
- `server` Equivalent to the [`-server` command-line flag](/consul/docs/agent/config/cli-flags#_server).
- `server_rejoin_age_max` - controls the allowed maximum age of a stale server attempting to rejoin a cluster.
If a server is not running for this period, then it will refuse to start up again until an operator intervenes. This is to protect
clusters from instability caused by decommissioned servers accidentally being started again.
Note: the default value is 7d and the minimum value is 6h.
- `non_voting_server` - **This field is deprecated in Consul 1.9.1. See the [`read_replica`](#read_replica) field instead.**
- `read_replica` - Equivalent to the [`-read-replica` command-line flag](/consul/docs/agent/config/cli-flags#_read_replica).