diff --git a/CHANGELOG.md b/CHANGELOG.md index eade1e4a8..21e6947ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ BUG FIXES: * core: *Fix restoration of stopped periodic jobs [GH-3201] * api: Fix search handling of jobs with more than four hyphens and case were length could cause lookup error [GH-3203] + * client: Fix lock contention that could cause a node to miss a heartbeat and + be marked as down [GH-3195] ## 0.6.3 (September 11, 2017) diff --git a/client/client.go b/client/client.go index cec4709e1..b77bf6107 100644 --- a/client/client.go +++ b/client/client.go @@ -296,7 +296,7 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulServic // Start collecting stats go c.emitStats() - c.logger.Printf("[INFO] client: Node ID %q", c.Node().ID) + c.logger.Printf("[INFO] client: Node ID %q", c.NodeID()) return c, nil } @@ -369,10 +369,7 @@ func (c *Client) Leave() error { // Datacenter returns the datacenter for the given client func (c *Client) Datacenter() string { - c.configLock.RLock() - dc := c.configCopy.Node.Datacenter - c.configLock.RUnlock() - return dc + return c.config.Node.Datacenter } // Region returns the region for the given client @@ -380,6 +377,16 @@ func (c *Client) Region() string { return c.config.Region } +// NodeID returns the node ID for the given client +func (c *Client) NodeID() string { + return c.config.Node.ID +} + +// secretNodeID returns the secret node ID for the given client +func (c *Client) secretNodeID() string { + return c.config.Node.SecretID +} + // RPCMajorVersion returns the structs.ApiMajorVersion supported by the // client. func (c *Client) RPCMajorVersion() int { @@ -467,7 +474,7 @@ func (c *Client) Stats() map[string]map[string]string { defer c.heartbeatLock.Unlock() stats := map[string]map[string]string{ "client": map[string]string{ - "node_id": c.Node().ID, + "node_id": c.NodeID(), "known_servers": c.servers.all().String(), "num_allocations": strconv.Itoa(c.NumAllocs()), "last_heartbeat": fmt.Sprintf("%v", time.Since(c.lastHeartbeat)), @@ -1159,9 +1166,8 @@ func (c *Client) updateNodeStatus() error { c.heartbeatLock.Lock() defer c.heartbeatLock.Unlock() - node := c.Node() req := structs.NodeUpdateStatusRequest{ - NodeID: node.ID, + NodeID: c.NodeID(), Status: structs.NodeStatusReady, WriteRequest: structs.WriteRequest{Region: c.Region()}, } @@ -1226,7 +1232,7 @@ func (c *Client) updateAllocStatus(alloc *structs.Allocation) { // send the fields that are updatable by the client. stripped := new(structs.Allocation) stripped.ID = alloc.ID - stripped.NodeID = c.Node().ID + stripped.NodeID = c.NodeID() stripped.TaskStates = alloc.TaskStates stripped.ClientStatus = alloc.ClientStatus stripped.ClientDescription = alloc.ClientDescription @@ -1303,10 +1309,9 @@ func (c *Client) watchAllocations(updates chan *allocUpdates) { // The request and response for getting the map of allocations that should // be running on the Node to their AllocModifyIndex which is incremented // when the allocation is updated by the servers. - n := c.Node() req := structs.NodeSpecificRequest{ - NodeID: n.ID, - SecretID: n.SecretID, + NodeID: c.NodeID(), + SecretID: c.secretNodeID(), QueryOptions: structs.QueryOptions{ Region: c.Region(), AllowStale: true, @@ -1664,8 +1669,8 @@ func (c *Client) deriveToken(alloc *structs.Allocation, taskNames []string, vcli // DeriveVaultToken of nomad server can take in a set of tasks and // creates tokens for all the tasks. req := &structs.DeriveVaultTokenRequest{ - NodeID: c.Node().ID, - SecretID: c.Node().SecretID, + NodeID: c.NodeID(), + SecretID: c.secretNodeID(), AllocID: alloc.ID, Tasks: verifiedTasks, QueryOptions: structs.QueryOptions{ @@ -1849,7 +1854,7 @@ DISCOLOOP: func (c *Client) emitStats() { // Assign labels directly before emitting stats so the information expected // is ready - c.baseLabels = []metrics.Label{{Name: "node_id", Value: c.Node().ID}, {Name: "datacenter", Value: c.Node().Datacenter}} + c.baseLabels = []metrics.Label{{Name: "node_id", Value: c.NodeID()}, {Name: "datacenter", Value: c.Datacenter()}} // Start collecting host stats right away and then keep collecting every // collection interval @@ -2027,7 +2032,7 @@ func (c *Client) setGaugeForUptime(hStats *stats.HostStats) { // emitHostStats pushes host resource usage stats to remote metrics collection sinks func (c *Client) emitHostStats() { - nodeID := c.Node().ID + nodeID := c.NodeID() hStats := c.hostStatsCollector.Stats() c.setGaugeForMemoryStats(nodeID, hStats) @@ -2041,7 +2046,7 @@ func (c *Client) emitHostStats() { // emitClientMetrics emits lower volume client metrics func (c *Client) emitClientMetrics() { - nodeID := c.Node().ID + nodeID := c.NodeID() // Emit allocation metrics blocked, migrating, pending, running, terminal := 0, 0, 0, 0, 0 diff --git a/client/stats/host.go b/client/stats/host.go index 98fdc7c19..d528ca784 100644 --- a/client/stats/host.go +++ b/client/stats/host.go @@ -92,6 +92,9 @@ func NewHostStatsCollector(logger *log.Logger, allocDir string) *HostStatsCollec // Collect collects stats related to resource usage of a host func (h *HostStatsCollector) Collect() error { + h.hostStatsLock.Lock() + defer h.hostStatsLock.Unlock() + hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()} // Determine up-time @@ -131,9 +134,7 @@ func (h *HostStatsCollector) Collect() error { hs.AllocDirStats = h.toDiskStats(usage, nil) // Update the collected status object. - h.hostStatsLock.Lock() h.hostStats = hs - h.hostStatsLock.Unlock() return nil }