Non-locked accessors to common Node fields
This PR removes locking around commonly accessed node attributes that do not need to be locked. The locking could cause nodes to TTL as the heartbeat code path was acquiring a lock that could be held for an excessively long time. An example of this is when Vault is inaccessible, since the fingerprint is run with a lock held but the Vault fingerprinter makes the API calls with a large timeout. Fixes https://github.com/hashicorp/nomad/issues/2689
This commit is contained in:
parent
2102bc3968
commit
07ed83fdd5
|
@ -296,7 +296,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic
|
|||
// Start collecting stats
|
||||
go c.emitStats()
|
||||
|
||||
c.logger.Printf("[INFO] client: Node ID %q", c.Node().ID)
|
||||
c.logger.Printf("[INFO] client: Node ID %q", c.NodeID())
|
||||
return c, nil
|
||||
}
|
||||
|
||||
|
@ -369,9 +369,7 @@ func (c *Client) Leave() error {
|
|||
|
||||
// Datacenter returns the datacenter for the given client
|
||||
func (c *Client) Datacenter() string {
|
||||
c.configLock.RLock()
|
||||
dc := c.configCopy.Node.Datacenter
|
||||
c.configLock.RUnlock()
|
||||
dc := c.config.Node.Datacenter
|
||||
return dc
|
||||
}
|
||||
|
||||
|
@ -380,6 +378,16 @@ func (c *Client) Region() string {
|
|||
return c.config.Region
|
||||
}
|
||||
|
||||
// NodeID returns the node ID for the given client
|
||||
func (c *Client) NodeID() string {
|
||||
return c.config.Node.ID
|
||||
}
|
||||
|
||||
// secretNodeID returns the secret node ID for the given client
|
||||
func (c *Client) secretNodeID() string {
|
||||
return c.config.Node.SecretID
|
||||
}
|
||||
|
||||
// RPCMajorVersion returns the structs.ApiMajorVersion supported by the
|
||||
// client.
|
||||
func (c *Client) RPCMajorVersion() int {
|
||||
|
@ -467,7 +475,7 @@ func (c *Client) Stats() map[string]map[string]string {
|
|||
defer c.heartbeatLock.Unlock()
|
||||
stats := map[string]map[string]string{
|
||||
"client": map[string]string{
|
||||
"node_id": c.Node().ID,
|
||||
"node_id": c.NodeID(),
|
||||
"known_servers": c.servers.all().String(),
|
||||
"num_allocations": strconv.Itoa(c.NumAllocs()),
|
||||
"last_heartbeat": fmt.Sprintf("%v", time.Since(c.lastHeartbeat)),
|
||||
|
@ -1159,9 +1167,8 @@ func (c *Client) updateNodeStatus() error {
|
|||
c.heartbeatLock.Lock()
|
||||
defer c.heartbeatLock.Unlock()
|
||||
|
||||
node := c.Node()
|
||||
req := structs.NodeUpdateStatusRequest{
|
||||
NodeID: node.ID,
|
||||
NodeID: c.NodeID(),
|
||||
Status: structs.NodeStatusReady,
|
||||
WriteRequest: structs.WriteRequest{Region: c.Region()},
|
||||
}
|
||||
|
@ -1226,7 +1233,7 @@ func (c *Client) updateAllocStatus(alloc *structs.Allocation) {
|
|||
// send the fields that are updatable by the client.
|
||||
stripped := new(structs.Allocation)
|
||||
stripped.ID = alloc.ID
|
||||
stripped.NodeID = c.Node().ID
|
||||
stripped.NodeID = c.NodeID()
|
||||
stripped.TaskStates = alloc.TaskStates
|
||||
stripped.ClientStatus = alloc.ClientStatus
|
||||
stripped.ClientDescription = alloc.ClientDescription
|
||||
|
@ -1303,10 +1310,9 @@ func (c *Client) watchAllocations(updates chan *allocUpdates) {
|
|||
// The request and response for getting the map of allocations that should
|
||||
// be running on the Node to their AllocModifyIndex which is incremented
|
||||
// when the allocation is updated by the servers.
|
||||
n := c.Node()
|
||||
req := structs.NodeSpecificRequest{
|
||||
NodeID: n.ID,
|
||||
SecretID: n.SecretID,
|
||||
NodeID: c.NodeID(),
|
||||
SecretID: c.secretNodeID(),
|
||||
QueryOptions: structs.QueryOptions{
|
||||
Region: c.Region(),
|
||||
AllowStale: true,
|
||||
|
@ -1664,8 +1670,8 @@ func (c *Client) deriveToken(alloc *structs.Allocation, taskNames []string, vcli
|
|||
// DeriveVaultToken of nomad server can take in a set of tasks and
|
||||
// creates tokens for all the tasks.
|
||||
req := &structs.DeriveVaultTokenRequest{
|
||||
NodeID: c.Node().ID,
|
||||
SecretID: c.Node().SecretID,
|
||||
NodeID: c.NodeID(),
|
||||
SecretID: c.secretNodeID(),
|
||||
AllocID: alloc.ID,
|
||||
Tasks: verifiedTasks,
|
||||
QueryOptions: structs.QueryOptions{
|
||||
|
@ -1849,7 +1855,7 @@ DISCOLOOP:
|
|||
func (c *Client) emitStats() {
|
||||
// Assign labels directly before emitting stats so the information expected
|
||||
// is ready
|
||||
c.baseLabels = []metrics.Label{{Name: "node_id", Value: c.Node().ID}, {Name: "datacenter", Value: c.Node().Datacenter}}
|
||||
c.baseLabels = []metrics.Label{{Name: "node_id", Value: c.NodeID()}, {Name: "datacenter", Value: c.Datacenter()}}
|
||||
|
||||
// Start collecting host stats right away and then keep collecting every
|
||||
// collection interval
|
||||
|
@ -2027,7 +2033,7 @@ func (c *Client) setGaugeForUptime(hStats *stats.HostStats) {
|
|||
|
||||
// emitHostStats pushes host resource usage stats to remote metrics collection sinks
|
||||
func (c *Client) emitHostStats() {
|
||||
nodeID := c.Node().ID
|
||||
nodeID := c.NodeID()
|
||||
hStats := c.hostStatsCollector.Stats()
|
||||
|
||||
c.setGaugeForMemoryStats(nodeID, hStats)
|
||||
|
@ -2041,7 +2047,7 @@ func (c *Client) emitHostStats() {
|
|||
|
||||
// emitClientMetrics emits lower volume client metrics
|
||||
func (c *Client) emitClientMetrics() {
|
||||
nodeID := c.Node().ID
|
||||
nodeID := c.NodeID()
|
||||
|
||||
// Emit allocation metrics
|
||||
blocked, migrating, pending, running, terminal := 0, 0, 0, 0, 0
|
||||
|
|
|
@ -92,6 +92,9 @@ func NewHostStatsCollector(logger *log.Logger, allocDir string) *HostStatsCollec
|
|||
|
||||
// Collect collects stats related to resource usage of a host
|
||||
func (h *HostStatsCollector) Collect() error {
|
||||
h.hostStatsLock.Lock()
|
||||
defer h.hostStatsLock.Unlock()
|
||||
|
||||
hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()}
|
||||
|
||||
// Determine up-time
|
||||
|
@ -131,9 +134,7 @@ func (h *HostStatsCollector) Collect() error {
|
|||
hs.AllocDirStats = h.toDiskStats(usage, nil)
|
||||
|
||||
// Update the collected status object.
|
||||
h.hostStatsLock.Lock()
|
||||
h.hostStats = hs
|
||||
h.hostStatsLock.Unlock()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue