595 lines
16 KiB
Go
595 lines
16 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: MPL-2.0
|
|
|
|
package api
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"net/url"
|
|
"strconv"
|
|
)
|
|
|
|
// Agent encapsulates an API client which talks to Nomad's
|
|
// agent endpoints for a specific node.
|
|
type Agent struct {
|
|
client *Client
|
|
|
|
// Cache static agent info
|
|
nodeName string
|
|
datacenter string
|
|
region string
|
|
}
|
|
|
|
// KeyringResponse is a unified key response and can be used for install,
|
|
// remove, use, as well as listing key queries.
|
|
type KeyringResponse struct {
|
|
Messages map[string]string
|
|
Keys map[string]int
|
|
NumNodes int
|
|
}
|
|
|
|
// KeyringRequest is request objects for serf key operations.
|
|
type KeyringRequest struct {
|
|
Key string
|
|
}
|
|
|
|
// ForceLeaveOpts are used to configure the ForceLeave method.
|
|
type ForceLeaveOpts struct {
|
|
// Prune indicates whether to remove a node from the list of members
|
|
Prune bool
|
|
}
|
|
|
|
// Agent returns a new agent which can be used to query
|
|
// the agent-specific endpoints.
|
|
func (c *Client) Agent() *Agent {
|
|
return &Agent{client: c}
|
|
}
|
|
|
|
// Self is used to query the /v1/agent/self endpoint and
|
|
// returns information specific to the running agent.
|
|
func (a *Agent) Self() (*AgentSelf, error) {
|
|
var out *AgentSelf
|
|
|
|
// Query the self endpoint on the agent
|
|
_, err := a.client.query("/v1/agent/self", &out, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed querying self endpoint: %s", err)
|
|
}
|
|
|
|
// Populate the cache for faster queries
|
|
a.populateCache(out)
|
|
|
|
return out, nil
|
|
}
|
|
|
|
// populateCache is used to insert various pieces of static
|
|
// data into the agent handle. This is used during subsequent
|
|
// lookups for the same data later on to save the round trip.
|
|
func (a *Agent) populateCache(self *AgentSelf) {
|
|
if a.nodeName == "" {
|
|
a.nodeName = self.Member.Name
|
|
}
|
|
if a.datacenter == "" {
|
|
if val, ok := self.Config["Datacenter"]; ok {
|
|
a.datacenter, _ = val.(string)
|
|
}
|
|
}
|
|
if a.region == "" {
|
|
if val, ok := self.Config["Region"]; ok {
|
|
a.region, _ = val.(string)
|
|
}
|
|
}
|
|
}
|
|
|
|
// NodeName is used to query the Nomad agent for its node name.
|
|
func (a *Agent) NodeName() (string, error) {
|
|
// Return from cache if we have it
|
|
if a.nodeName != "" {
|
|
return a.nodeName, nil
|
|
}
|
|
|
|
// Query the node name
|
|
_, err := a.Self()
|
|
return a.nodeName, err
|
|
}
|
|
|
|
// Datacenter is used to return the name of the datacenter which
|
|
// the agent is a member of.
|
|
func (a *Agent) Datacenter() (string, error) {
|
|
// Return from cache if we have it
|
|
if a.datacenter != "" {
|
|
return a.datacenter, nil
|
|
}
|
|
|
|
// Query the agent for the DC
|
|
_, err := a.Self()
|
|
return a.datacenter, err
|
|
}
|
|
|
|
// Region is used to look up the region the agent is in.
|
|
func (a *Agent) Region() (string, error) {
|
|
// Return from cache if we have it
|
|
if a.region != "" {
|
|
return a.region, nil
|
|
}
|
|
|
|
// Query the agent for the region
|
|
_, err := a.Self()
|
|
return a.region, err
|
|
}
|
|
|
|
// Join is used to instruct a server node to join another server
|
|
// via the gossip protocol. Multiple addresses may be specified.
|
|
// We attempt to join all the hosts in the list. Returns the
|
|
// number of nodes successfully joined and any error. If one or
|
|
// more nodes have a successful result, no error is returned.
|
|
func (a *Agent) Join(addrs ...string) (int, error) {
|
|
// Accumulate the addresses
|
|
v := url.Values{}
|
|
for _, addr := range addrs {
|
|
v.Add("address", addr)
|
|
}
|
|
|
|
// Send the join request
|
|
var resp joinResponse
|
|
_, err := a.client.put("/v1/agent/join?"+v.Encode(), nil, &resp, nil)
|
|
if err != nil {
|
|
return 0, fmt.Errorf("failed joining: %s", err)
|
|
}
|
|
if resp.Error != "" {
|
|
return 0, fmt.Errorf("failed joining: %s", resp.Error)
|
|
}
|
|
return resp.NumJoined, nil
|
|
}
|
|
|
|
// Members is used to query all of the known server members
|
|
func (a *Agent) Members() (*ServerMembers, error) {
|
|
var resp *ServerMembers
|
|
|
|
// Query the known members
|
|
_, err := a.client.query("/v1/agent/members", &resp, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return resp, nil
|
|
}
|
|
|
|
// Members is used to query all of the known server members
|
|
// with the ability to set QueryOptions
|
|
func (a *Agent) MembersOpts(opts *QueryOptions) (*ServerMembers, error) {
|
|
var resp *ServerMembers
|
|
_, err := a.client.query("/v1/agent/members", &resp, opts)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return resp, nil
|
|
}
|
|
|
|
// ForceLeave is used to eject an existing node from the cluster.
|
|
func (a *Agent) ForceLeave(node string) error {
|
|
v := url.Values{}
|
|
v.Add("node", node)
|
|
_, err := a.client.put("/v1/agent/force-leave?"+v.Encode(), nil, nil, nil)
|
|
return err
|
|
}
|
|
|
|
// ForceLeaveWithOptions is used to eject an existing node from the cluster
|
|
// with additional options such as prune.
|
|
func (a *Agent) ForceLeaveWithOptions(node string, opts ForceLeaveOpts) error {
|
|
v := url.Values{}
|
|
v.Add("node", node)
|
|
if opts.Prune {
|
|
v.Add("prune", "1")
|
|
}
|
|
_, err := a.client.put("/v1/agent/force-leave?"+v.Encode(), nil, nil, nil)
|
|
return err
|
|
}
|
|
|
|
// Servers is used to query the list of servers on a client node.
|
|
func (a *Agent) Servers() ([]string, error) {
|
|
var resp []string
|
|
_, err := a.client.query("/v1/agent/servers", &resp, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return resp, nil
|
|
}
|
|
|
|
// SetServers is used to update the list of servers on a client node.
|
|
func (a *Agent) SetServers(addrs []string) error {
|
|
// Accumulate the addresses
|
|
v := url.Values{}
|
|
for _, addr := range addrs {
|
|
v.Add("address", addr)
|
|
}
|
|
|
|
_, err := a.client.put("/v1/agent/servers?"+v.Encode(), nil, nil, nil)
|
|
return err
|
|
}
|
|
|
|
// ListKeys returns the list of installed keys
|
|
func (a *Agent) ListKeys() (*KeyringResponse, error) {
|
|
var resp KeyringResponse
|
|
_, err := a.client.query("/v1/agent/keyring/list", &resp, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return &resp, nil
|
|
}
|
|
|
|
// InstallKey installs a key in the keyrings of all the serf members
|
|
func (a *Agent) InstallKey(key string) (*KeyringResponse, error) {
|
|
args := KeyringRequest{
|
|
Key: key,
|
|
}
|
|
var resp KeyringResponse
|
|
_, err := a.client.put("/v1/agent/keyring/install", &args, &resp, nil)
|
|
return &resp, err
|
|
}
|
|
|
|
// UseKey uses a key from the keyring of serf members
|
|
func (a *Agent) UseKey(key string) (*KeyringResponse, error) {
|
|
args := KeyringRequest{
|
|
Key: key,
|
|
}
|
|
var resp KeyringResponse
|
|
_, err := a.client.put("/v1/agent/keyring/use", &args, &resp, nil)
|
|
return &resp, err
|
|
}
|
|
|
|
// RemoveKey removes a particular key from keyrings of serf members
|
|
func (a *Agent) RemoveKey(key string) (*KeyringResponse, error) {
|
|
args := KeyringRequest{
|
|
Key: key,
|
|
}
|
|
var resp KeyringResponse
|
|
_, err := a.client.put("/v1/agent/keyring/remove", &args, &resp, nil)
|
|
return &resp, err
|
|
}
|
|
|
|
// Health queries the agent's health
|
|
func (a *Agent) Health() (*AgentHealthResponse, error) {
|
|
req, err := a.client.newRequest("GET", "/v1/agent/health")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var health AgentHealthResponse
|
|
_, resp, err := a.client.doRequest(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
// Always try to decode the response as JSON
|
|
err = json.NewDecoder(resp.Body).Decode(&health)
|
|
if err == nil {
|
|
return &health, nil
|
|
}
|
|
|
|
// Return custom error when response is not expected JSON format
|
|
return nil, fmt.Errorf("unable to unmarshal response with status %d: %v", resp.StatusCode, err)
|
|
}
|
|
|
|
// Host returns debugging context about the agent's host operating system
|
|
func (a *Agent) Host(serverID, nodeID string, q *QueryOptions) (*HostDataResponse, error) {
|
|
if q == nil {
|
|
q = &QueryOptions{}
|
|
}
|
|
if q.Params == nil {
|
|
q.Params = make(map[string]string)
|
|
}
|
|
|
|
if serverID != "" {
|
|
q.Params["server_id"] = serverID
|
|
}
|
|
|
|
if nodeID != "" {
|
|
q.Params["node_id"] = nodeID
|
|
}
|
|
|
|
var resp HostDataResponse
|
|
_, err := a.client.query("/v1/agent/host", &resp, q)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &resp, nil
|
|
}
|
|
|
|
// Monitor returns a channel which will receive streaming logs from the agent
|
|
// Providing a non-nil stopCh can be used to close the connection and stop log streaming
|
|
func (a *Agent) Monitor(stopCh <-chan struct{}, q *QueryOptions) (<-chan *StreamFrame, <-chan error) {
|
|
errCh := make(chan error, 1)
|
|
r, err := a.client.newRequest("GET", "/v1/agent/monitor")
|
|
if err != nil {
|
|
errCh <- err
|
|
return nil, errCh
|
|
}
|
|
|
|
r.setQueryOptions(q)
|
|
_, resp, err := requireOK(a.client.doRequest(r))
|
|
if err != nil {
|
|
errCh <- err
|
|
return nil, errCh
|
|
}
|
|
|
|
frames := make(chan *StreamFrame, 10)
|
|
go func() {
|
|
defer resp.Body.Close()
|
|
|
|
dec := json.NewDecoder(resp.Body)
|
|
|
|
for {
|
|
select {
|
|
case <-stopCh:
|
|
close(frames)
|
|
return
|
|
default:
|
|
}
|
|
|
|
// Decode the next frame
|
|
var frame StreamFrame
|
|
if err := dec.Decode(&frame); err != nil {
|
|
close(frames)
|
|
errCh <- err
|
|
return
|
|
}
|
|
|
|
// Discard heartbeat frame
|
|
if frame.IsHeartbeat() {
|
|
continue
|
|
}
|
|
|
|
frames <- &frame
|
|
}
|
|
}()
|
|
|
|
return frames, errCh
|
|
}
|
|
|
|
// PprofOptions contain a set of parameters for profiling a node or server.
|
|
type PprofOptions struct {
|
|
// ServerID is the server ID, name, or special value "leader" to
|
|
// specify the server that a given profile should be run on.
|
|
ServerID string
|
|
|
|
// NodeID is the node ID that a given profile should be run on.
|
|
NodeID string
|
|
|
|
// Seconds specifies the amount of time a profile should be run for.
|
|
// Seconds only applies for certain runtime profiles like CPU and Trace.
|
|
Seconds int
|
|
|
|
// GC determines if a runtime.GC() should be called before a heap
|
|
// profile.
|
|
GC int
|
|
|
|
// Debug specifies if the output of a lookup profile should be returned
|
|
// in human readable format instead of binary.
|
|
Debug int
|
|
}
|
|
|
|
// CPUProfile returns a runtime/pprof cpu profile for a given server or node.
|
|
// The profile will run for the amount of seconds passed in or default to 1.
|
|
// If no serverID or nodeID are provided the current Agents server will be
|
|
// used.
|
|
//
|
|
// The call blocks until the profile finishes, and returns the raw bytes of the
|
|
// profile.
|
|
func (a *Agent) CPUProfile(opts PprofOptions, q *QueryOptions) ([]byte, error) {
|
|
return a.pprofRequest("profile", opts, q)
|
|
}
|
|
|
|
// Trace returns a runtime/pprof trace for a given server or node.
|
|
// The trace will run for the amount of seconds passed in or default to 1.
|
|
// If no serverID or nodeID are provided the current Agents server will be
|
|
// used.
|
|
//
|
|
// The call blocks until the profile finishes, and returns the raw bytes of the
|
|
// profile.
|
|
func (a *Agent) Trace(opts PprofOptions, q *QueryOptions) ([]byte, error) {
|
|
return a.pprofRequest("trace", opts, q)
|
|
}
|
|
|
|
// Lookup returns a runtime/pprof profile using pprof.Lookup to determine
|
|
// which profile to run. Accepts a client or server ID but not both simultaneously.
|
|
//
|
|
// The call blocks until the profile finishes, and returns the raw bytes of the
|
|
// profile unless debug is set.
|
|
func (a *Agent) Lookup(profile string, opts PprofOptions, q *QueryOptions) ([]byte, error) {
|
|
return a.pprofRequest(profile, opts, q)
|
|
}
|
|
|
|
func (a *Agent) pprofRequest(req string, opts PprofOptions, q *QueryOptions) ([]byte, error) {
|
|
if q == nil {
|
|
q = &QueryOptions{}
|
|
}
|
|
if q.Params == nil {
|
|
q.Params = make(map[string]string)
|
|
}
|
|
|
|
q.Params["seconds"] = strconv.Itoa(opts.Seconds)
|
|
q.Params["debug"] = strconv.Itoa(opts.Debug)
|
|
q.Params["gc"] = strconv.Itoa(opts.GC)
|
|
q.Params["node_id"] = opts.NodeID
|
|
q.Params["server_id"] = opts.ServerID
|
|
|
|
body, err := a.client.rawQuery(fmt.Sprintf("/v1/agent/pprof/%s", req), q)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
resp, err := io.ReadAll(body)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return resp, nil
|
|
}
|
|
|
|
// joinResponse is used to decode the response we get while
|
|
// sending a member join request.
|
|
type joinResponse struct {
|
|
NumJoined int `json:"num_joined"`
|
|
Error string `json:"error"`
|
|
}
|
|
|
|
type ServerMembers struct {
|
|
ServerName string
|
|
ServerRegion string
|
|
ServerDC string
|
|
Members []*AgentMember
|
|
}
|
|
|
|
type AgentSelf struct {
|
|
Config map[string]interface{} `json:"config"`
|
|
Member AgentMember `json:"member"`
|
|
Stats map[string]map[string]string `json:"stats"`
|
|
}
|
|
|
|
// AgentMember represents a cluster member known to the agent
|
|
type AgentMember struct {
|
|
Name string
|
|
Addr string
|
|
Port uint16
|
|
Tags map[string]string
|
|
Status string
|
|
ProtocolMin uint8
|
|
ProtocolMax uint8
|
|
ProtocolCur uint8
|
|
DelegateMin uint8
|
|
DelegateMax uint8
|
|
DelegateCur uint8
|
|
}
|
|
|
|
// AgentMembersNameSort implements sort.Interface for []*AgentMembersNameSort
|
|
// based on the Name, DC and Region
|
|
type AgentMembersNameSort []*AgentMember
|
|
|
|
func (a AgentMembersNameSort) Len() int { return len(a) }
|
|
func (a AgentMembersNameSort) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
|
|
func (a AgentMembersNameSort) Less(i, j int) bool {
|
|
if a[i].Tags["region"] != a[j].Tags["region"] {
|
|
return a[i].Tags["region"] < a[j].Tags["region"]
|
|
}
|
|
|
|
if a[i].Tags["dc"] != a[j].Tags["dc"] {
|
|
return a[i].Tags["dc"] < a[j].Tags["dc"]
|
|
}
|
|
|
|
return a[i].Name < a[j].Name
|
|
|
|
}
|
|
|
|
// AgentHealthResponse is the response from the Health endpoint describing an
|
|
// agent's health.
|
|
type AgentHealthResponse struct {
|
|
Client *AgentHealth `json:"client,omitempty"`
|
|
Server *AgentHealth `json:"server,omitempty"`
|
|
}
|
|
|
|
// AgentHealth describes the Client or Server's health in a Health request.
|
|
type AgentHealth struct {
|
|
// Ok is false if the agent is unhealthy
|
|
Ok bool `json:"ok"`
|
|
|
|
// Message describes why the agent is unhealthy
|
|
Message string `json:"message"`
|
|
}
|
|
|
|
type HostData struct {
|
|
OS string
|
|
Network []map[string]string
|
|
ResolvConf string
|
|
Hosts string
|
|
Environment map[string]string
|
|
Disk map[string]DiskUsage
|
|
}
|
|
|
|
type DiskUsage struct {
|
|
DiskMB int64
|
|
UsedMB int64
|
|
}
|
|
|
|
type HostDataResponse struct {
|
|
AgentID string
|
|
HostData *HostData `json:",omitempty"`
|
|
}
|
|
|
|
// GetSchedulerWorkerConfig returns the targeted agent's worker pool configuration
|
|
func (a *Agent) GetSchedulerWorkerConfig(q *QueryOptions) (*SchedulerWorkerPoolArgs, error) {
|
|
var resp AgentSchedulerWorkerConfigResponse
|
|
_, err := a.client.query("/v1/agent/schedulers/config", &resp, q)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &SchedulerWorkerPoolArgs{NumSchedulers: resp.NumSchedulers, EnabledSchedulers: resp.EnabledSchedulers}, nil
|
|
}
|
|
|
|
// SetSchedulerWorkerConfig attempts to update the targeted agent's worker pool configuration
|
|
func (a *Agent) SetSchedulerWorkerConfig(args SchedulerWorkerPoolArgs, q *WriteOptions) (*SchedulerWorkerPoolArgs, error) {
|
|
req := AgentSchedulerWorkerConfigRequest(args)
|
|
var resp AgentSchedulerWorkerConfigResponse
|
|
|
|
_, err := a.client.put("/v1/agent/schedulers/config", &req, &resp, q)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return &SchedulerWorkerPoolArgs{NumSchedulers: resp.NumSchedulers, EnabledSchedulers: resp.EnabledSchedulers}, nil
|
|
}
|
|
|
|
type SchedulerWorkerPoolArgs struct {
|
|
NumSchedulers int
|
|
EnabledSchedulers []string
|
|
}
|
|
|
|
// AgentSchedulerWorkerConfigRequest is used to provide new scheduler worker configuration
|
|
// to a specific Nomad server. EnabledSchedulers must contain at least the `_core` scheduler
|
|
// to be valid.
|
|
type AgentSchedulerWorkerConfigRequest struct {
|
|
NumSchedulers int `json:"num_schedulers"`
|
|
EnabledSchedulers []string `json:"enabled_schedulers"`
|
|
}
|
|
|
|
// AgentSchedulerWorkerConfigResponse contains the Nomad server's current running configuration
|
|
// as well as the server's id as a convenience. This can be used to provide starting values for
|
|
// creating an AgentSchedulerWorkerConfigRequest to make changes to the running configuration.
|
|
type AgentSchedulerWorkerConfigResponse struct {
|
|
ServerID string `json:"server_id"`
|
|
NumSchedulers int `json:"num_schedulers"`
|
|
EnabledSchedulers []string `json:"enabled_schedulers"`
|
|
}
|
|
|
|
// GetSchedulerWorkersInfo returns the current status of all of the scheduler workers on
|
|
// a Nomad server.
|
|
func (a *Agent) GetSchedulerWorkersInfo(q *QueryOptions) (*AgentSchedulerWorkersInfo, error) {
|
|
var out *AgentSchedulerWorkersInfo
|
|
|
|
_, err := a.client.query("/v1/agent/schedulers", &out, q)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return out, nil
|
|
}
|
|
|
|
// AgentSchedulerWorkersInfo is the response from the scheduler information endpoint containing
|
|
// a detailed status of each scheduler worker running on the server.
|
|
type AgentSchedulerWorkersInfo struct {
|
|
ServerID string `json:"server_id"`
|
|
Schedulers []AgentSchedulerWorkerInfo `json:"schedulers"`
|
|
}
|
|
|
|
// AgentSchedulerWorkerInfo holds the detailed status information for a single scheduler worker.
|
|
type AgentSchedulerWorkerInfo struct {
|
|
ID string `json:"id"`
|
|
EnabledSchedulers []string `json:"enabled_schedulers"`
|
|
Started string `json:"started"`
|
|
Status string `json:"status"`
|
|
WorkloadStatus string `json:"workload_status"`
|
|
}
|