open-nomad/api/operator_autopilot.go

226 lines
6.7 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package api
import (
"encoding/json"
"strconv"
"time"
)
// AutopilotConfiguration is used for querying/setting the Autopilot configuration.
// Autopilot helps manage operator tasks related to Nomad servers like removing
// failed servers from the Raft quorum.
type AutopilotConfiguration struct {
// CleanupDeadServers controls whether to remove dead servers from the Raft
// peer list when a new server joins
CleanupDeadServers bool
// LastContactThreshold is the limit on the amount of time a server can go
// without leader contact before being considered unhealthy.
LastContactThreshold time.Duration
// MaxTrailingLogs is the amount of entries in the Raft Log that a server can
// be behind before being considered unhealthy.
MaxTrailingLogs uint64
// MinQuorum sets the minimum number of servers allowed in a cluster before
// autopilot can prune dead servers.
MinQuorum uint
// ServerStabilizationTime is the minimum amount of time a server must be
// in a stable, healthy state before it can be added to the cluster. Only
// applicable with Raft protocol version 3 or higher.
ServerStabilizationTime time.Duration
// (Enterprise-only) EnableRedundancyZones specifies whether to enable redundancy zones.
EnableRedundancyZones bool
// (Enterprise-only) DisableUpgradeMigration will disable Autopilot's upgrade migration
// strategy of waiting until enough newer-versioned servers have been added to the
// cluster before promoting them to voters.
DisableUpgradeMigration bool
// (Enterprise-only) EnableCustomUpgrades specifies whether to enable using custom
// upgrade versions when performing migrations.
EnableCustomUpgrades bool
// CreateIndex holds the index corresponding the creation of this configuration.
// This is a read-only field.
CreateIndex uint64
// ModifyIndex will be set to the index of the last update when retrieving the
// Autopilot configuration. Resubmitting a configuration with
// AutopilotCASConfiguration will perform a check-and-set operation which ensures
// there hasn't been a subsequent update since the configuration was retrieved.
ModifyIndex uint64
}
func (u *AutopilotConfiguration) MarshalJSON() ([]byte, error) {
type Alias AutopilotConfiguration
return json.Marshal(&struct {
LastContactThreshold string
ServerStabilizationTime string
*Alias
}{
LastContactThreshold: u.LastContactThreshold.String(),
ServerStabilizationTime: u.ServerStabilizationTime.String(),
Alias: (*Alias)(u),
})
}
func (u *AutopilotConfiguration) UnmarshalJSON(data []byte) error {
type Alias AutopilotConfiguration
aux := &struct {
LastContactThreshold string
ServerStabilizationTime string
*Alias
}{
Alias: (*Alias)(u),
}
if err := json.Unmarshal(data, &aux); err != nil {
return err
}
var err error
if aux.LastContactThreshold != "" {
if u.LastContactThreshold, err = time.ParseDuration(aux.LastContactThreshold); err != nil {
return err
}
}
if aux.ServerStabilizationTime != "" {
if u.ServerStabilizationTime, err = time.ParseDuration(aux.ServerStabilizationTime); err != nil {
return err
}
}
return nil
}
// ServerHealth is the health (from the leader's point of view) of a server.
type ServerHealth struct {
// ID is the raft ID of the server.
ID string
// Name is the node name of the server.
Name string
// Address is the address of the server.
Address string
// The status of the SerfHealth check for the server.
SerfStatus string
// Version is the Nomad version of the server.
Version string
// Leader is whether this server is currently the leader.
Leader bool
// LastContact is the time since this node's last contact with the leader.
LastContact time.Duration
// LastTerm is the highest leader term this server has a record of in its Raft log.
LastTerm uint64
// LastIndex is the last log index this server has a record of in its Raft log.
LastIndex uint64
// Healthy is whether or not the server is healthy according to the current
// Autopilot config.
Healthy bool
// Voter is whether this is a voting server.
Voter bool
// StableSince is the last time this server's Healthy value changed.
StableSince time.Time
}
func (u *ServerHealth) MarshalJSON() ([]byte, error) {
type Alias ServerHealth
return json.Marshal(&struct {
LastContact string
*Alias
}{
LastContact: u.LastContact.String(),
Alias: (*Alias)(u),
})
}
func (u *ServerHealth) UnmarshalJSON(data []byte) error {
type Alias ServerHealth
aux := &struct {
LastContact string
*Alias
}{
Alias: (*Alias)(u),
}
if err := json.Unmarshal(data, &aux); err != nil {
return err
}
var err error
if aux.LastContact != "" {
if u.LastContact, err = time.ParseDuration(aux.LastContact); err != nil {
return err
}
}
return nil
}
// OperatorHealthReply is a representation of the overall health of the cluster
type OperatorHealthReply struct {
// Healthy is true if all the servers in the cluster are healthy.
Healthy bool
// FailureTolerance is the number of healthy servers that could be lost without
// an outage occurring.
FailureTolerance int
// Servers holds the health of each server.
Servers []ServerHealth
}
// AutopilotGetConfiguration is used to query the current Autopilot configuration.
func (op *Operator) AutopilotGetConfiguration(q *QueryOptions) (*AutopilotConfiguration, *QueryMeta, error) {
var resp AutopilotConfiguration
qm, err := op.c.query("/v1/operator/autopilot/configuration", &resp, q)
if err != nil {
return nil, nil, err
}
return &resp, qm, nil
}
// AutopilotSetConfiguration is used to set the current Autopilot configuration.
func (op *Operator) AutopilotSetConfiguration(conf *AutopilotConfiguration, q *WriteOptions) (*WriteMeta, error) {
var out bool
wm, err := op.c.put("/v1/operator/autopilot/configuration", conf, &out, q)
if err != nil {
return nil, err
}
return wm, nil
}
// AutopilotCASConfiguration is used to perform a Check-And-Set update on the
// Autopilot configuration. The ModifyIndex value will be respected. Returns
// true on success or false on failures.
func (op *Operator) AutopilotCASConfiguration(conf *AutopilotConfiguration, q *WriteOptions) (bool, *WriteMeta, error) {
var out bool
wm, err := op.c.put("/v1/operator/autopilot/configuration?cas="+strconv.FormatUint(conf.ModifyIndex, 10), conf, &out, q)
if err != nil {
return false, nil, err
}
return out, wm, nil
}
// AutopilotServerHealth is used to query Autopilot's top-level view of the health
// of each Nomad server.
func (op *Operator) AutopilotServerHealth(q *QueryOptions) (*OperatorHealthReply, *QueryMeta, error) {
var out OperatorHealthReply
qm, err := op.c.query("/v1/operator/autopilot/health", &out, q)
if err != nil {
return nil, nil, err
}
return &out, qm, nil
}