open-consul/agent/consul/leader_federation_state_ae.go
Daniel Nephin 8654adfc53 Handle FSM.Apply errors in raftApply
Previously we were inconsistently checking the response for errors. This
PR moves the response-is-error check into raftApply, so that all callers
can inspect only the returned error, instead of having to know that
errors could come from two places.

This should expose a few more errors that were previously hidden because
in some calls to raftApply we were ignoring the response return value.

Also handle errors more consistently. In some cases we would log the
error before returning it, which can result in the same error being
logged multiple times. Instead, return a wrapped error.
2021-04-20 13:29:29 -04:00
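
To make the change concrete, here is a minimal, self-contained sketch of the pattern the commit describes. It is not Consul's actual implementation: fakeFSMApply and the string request type are invented stand-ins for the raft/FSM round trip.

package main

import (
	"errors"
	"fmt"
)

// fakeFSMApply stands in for the raft apply + FSM.Apply round trip. The FSM
// may hand back an error as its response value rather than as a Go error.
func fakeFSMApply(req string) (interface{}, error) {
	if req == "bad-request" {
		return errors.New("fsm rejected request"), nil
	}
	return "ok", nil
}

// raftApply folds the response-is-error check into one place so that callers
// only ever have to inspect the returned error.
func raftApply(req string) (interface{}, error) {
	resp, err := fakeFSMApply(req)
	if err != nil {
		return nil, err
	}
	if respErr, ok := resp.(error); ok {
		// The FSM reported failure through its response value; surface it
		// via the error return instead of making every caller check for it.
		return nil, respErr
	}
	return resp, nil
}

func main() {
	if _, err := raftApply("bad-request"); err != nil {
		// Callers wrap and return rather than logging and returning, so the
		// error is logged once, by whoever sits at the top of the call chain.
		fmt.Println(fmt.Errorf("failed to apply request: %w", err))
	}
}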


package consul

import (
	"context"
	"fmt"
	"time"

	memdb "github.com/hashicorp/go-memdb"

	"github.com/hashicorp/consul/agent/consul/state"
	"github.com/hashicorp/consul/agent/structs"
)

const (
	// federationStatePruneInterval is how often we check for stale federation
	// states to remove should a datacenter be removed from the WAN.
	federationStatePruneInterval = time.Hour
)
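
// startFederationStateAntiEntropy kicks off the leader routine that keeps this
// datacenter's federation state in sync and, in the primary datacenter, the
// routine that prunes federation states for datacenters that left the WAN.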
func (s *Server) startFederationStateAntiEntropy() {
	// Check to see if we can skip waiting for serf feature detection below.
	if !s.DatacenterSupportsFederationStates() {
		_, fedStates, err := s.fsm.State().FederationStateList(nil)
		if err != nil {
			s.logger.Warn("Failed to check for existing federation states and activate the feature flag quicker; skipping this optimization", "error", err)
		} else if len(fedStates) > 0 {
			s.setDatacenterSupportsFederationStates()
		}
	}

	if s.config.DisableFederationStateAntiEntropy {
		return
	}
	s.leaderRoutineManager.Start(federationStateAntiEntropyRoutineName, s.federationStateAntiEntropySync)

	// If this is the primary, then also prune any stale datacenters from the
	// list of federation states.
	if s.config.PrimaryDatacenter == s.config.Datacenter {
		s.leaderRoutineManager.Start(federationStatePruningRoutineName, s.federationStatePruning)
	}
}
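
// stopFederationStateAntiEntropy stops the anti-entropy routine and, in the
// primary datacenter, the pruning routine.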
func (s *Server) stopFederationStateAntiEntropy() {
	if s.config.DisableFederationStateAntiEntropy {
		return
	}
	s.leaderRoutineManager.Stop(federationStateAntiEntropyRoutineName)
	if s.config.PrimaryDatacenter == s.config.Datacenter {
		s.leaderRoutineManager.Stop(federationStatePruningRoutineName)
	}
}
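
// federationStateAntiEntropySync runs in a retry loop for as long as this
// server is the leader, re-syncing this datacenter's federation state
// whenever the underlying blocking query notices a change.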
func (s *Server) federationStateAntiEntropySync(ctx context.Context) error {
	var lastFetchIndex uint64

	retryLoopBackoff(ctx, func() error {
		if !s.DatacenterSupportsFederationStates() {
			return nil
		}

		idx, err := s.federationStateAntiEntropyMaybeSync(ctx, lastFetchIndex)
		if err != nil {
			return err
		}

		lastFetchIndex = idx
		return nil
	}, func(err error) {
		s.logger.Error("error performing anti-entropy sync of federation state", "error", err)
	})

	return nil
}
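
// federationStateAntiEntropyMaybeSync blocks until the local federation state
// details change past lastFetchIndex, then upserts this datacenter's
// federation state if the stored copy is out of date. It returns the index to
// use as the floor for the next blocking query.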
func (s *Server) federationStateAntiEntropyMaybeSync(ctx context.Context, lastFetchIndex uint64) (uint64, error) {
	queryOpts := &structs.QueryOptions{
		MinQueryIndex:     lastFetchIndex,
		RequireConsistent: true,
		// This is just for a local blocking query so no token is needed.
	}
	idx, prev, curr, err := s.fetchFederationStateAntiEntropyDetails(queryOpts)
	if err != nil {
		return 0, err
	}

	// Check to see if our context was cancelled while we were blocked.
	select {
	case <-ctx.Done():
		return 0, ctx.Err()
	default:
	}

	if prev != nil && prev.IsSame(curr) {
		s.logger.Trace("federation state anti-entropy sync skipped; already up to date")
		return idx, nil
	}

	if err := s.updateOurFederationState(curr); err != nil {
		return 0, fmt.Errorf("error performing federation state anti-entropy sync: %w", err)
	}

	s.logger.Info("federation state anti-entropy synced")

	return idx, nil
}
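
// updateOurFederationState persists the given federation state for this
// datacenter: directly through raft if this is the primary datacenter,
// otherwise via an RPC to the primary using the replication token.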
func (s *Server) updateOurFederationState(curr *structs.FederationState) error {
	if curr.Datacenter != s.config.Datacenter { // sanity check
		return fmt.Errorf("cannot use this mechanism to update federation states for other datacenters")
	}
	curr.UpdatedAt = time.Now().UTC()

	args := structs.FederationStateRequest{
		Op:    structs.FederationStateUpsert,
		State: curr,
	}

	if s.config.Datacenter == s.config.PrimaryDatacenter {
		// We are the primary, so we can't do an RPC as we don't have a replication token.
		_, err := s.raftApply(structs.FederationStateRequestType, args)
		if err != nil {
			return err
		}
	} else {
		args.WriteRequest = structs.WriteRequest{
			Token: s.tokens.ReplicationToken(),
		}
		ignored := false
		if err := s.forwardDC("FederationState.Apply", s.config.PrimaryDatacenter, &args, &ignored); err != nil {
			return err
		}
	}

	return nil
}
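
// fetchFederationStateAntiEntropyDetails runs a blocking query that returns
// the stored federation state for this datacenter alongside a fresh one built
// from the current set of mesh gateways, together with the maximum raft index
// seen across the two reads.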
func (s *Server) fetchFederationStateAntiEntropyDetails(
	queryOpts *structs.QueryOptions,
) (uint64, *structs.FederationState, *structs.FederationState, error) {
	var (
		prevFedState, currFedState *structs.FederationState
		queryMeta                  structs.QueryMeta
	)
	err := s.blockingQuery(
		queryOpts,
		&queryMeta,
		func(ws memdb.WatchSet, state *state.Store) error {
			// Get the existing stored version of this FedState that has replicated down.
			// We could phone home to get this but that would incur extra WAN traffic
			// when we already have enough information locally to figure it out
			// (assuming that our replicator is still functioning).
			idx1, prev, err := state.FederationStateGet(ws, s.config.Datacenter)
			if err != nil {
				return err
			}

			// Fetch our current list of all mesh gateways.
			entMeta := structs.WildcardEnterpriseMeta()
			idx2, raw, err := state.ServiceDump(ws, structs.ServiceKindMeshGateway, true, entMeta)
			if err != nil {
				return err
			}

			curr := &structs.FederationState{
				Datacenter:   s.config.Datacenter,
				MeshGateways: raw,
			}

			// Compute the maximum index seen.
			if idx2 > idx1 {
				queryMeta.Index = idx2
			} else {
				queryMeta.Index = idx1
			}

			prevFedState = prev
			currFedState = curr
			return nil
		})
	if err != nil {
		return 0, nil, nil, err
	}

	return queryMeta.Index, prevFedState, currFedState, nil
}
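
// federationStatePruning runs in the primary datacenter and periodically
// removes federation states for datacenters that are no longer in the WAN.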
func (s *Server) federationStatePruning(ctx context.Context) error {
	ticker := time.NewTicker(federationStatePruneInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return nil
		case <-ticker.C:
			if err := s.pruneStaleFederationStates(); err != nil {
				s.logger.Error("error pruning stale federation states", "error", err)
			}
		}
	}
}
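
// pruneStaleFederationStates deletes the federation state of any datacenter
// that the router no longer knows about.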
func (s *Server) pruneStaleFederationStates() error {
	state := s.fsm.State()
	_, fedStates, err := state.FederationStateList(nil)
	if err != nil {
		return err
	}

	for _, fedState := range fedStates {
		dc := fedState.Datacenter
		if s.router.HasDatacenter(dc) {
			continue
		}

		s.logger.Info("pruning stale federation state", "datacenter", dc)

		req := structs.FederationStateRequest{
			Op: structs.FederationStateDelete,
			State: &structs.FederationState{
				Datacenter: dc,
			},
		}
		_, err := s.raftApply(structs.FederationStateRequestType, &req)
		if err != nil {
			return fmt.Errorf("failed to delete federation state %s: %w", dc, err)
		}
	}

	return nil
}