open-nomad/client/driver/executor/pid_collector.go
Nick Ethier 5dee1141d1 executor v2 (#4656)
* client/executor: refactor client to remove interpolation

* executor: POC libcontainer based executor

* vendor: use hashicorp libcontainer fork

* vendor: add libcontainer/nsenter dep

* executor: updated executor interface to simplify operations

* executor: implement logging pipe

* logmon: new logmon plugin to manage task logs

* driver/executor: use logmon for log management

* executor: fix tests and windows build

* executor: fix logging key names

* executor: fix test failures

* executor: add config field to toggle between using libcontainer and standard executors

* logmon: use discover utility to discover nomad executable

* executor: only call libcontainer-shim on main in linux

* logmon: use seperate path configs for stdout/stderr fifos

* executor: windows fixes

* executor: created reusable pid stats collection utility that can be used in an executor

* executor: update fifo.Open calls

* executor: fix build

* remove executor from docker driver

* executor: Shutdown func to kill and cleanup executor and its children

* executor: move linux specific universal executor funcs to seperate file

* move logmon initialization to a task runner hook

* client: doc fixes and renaming from code review


* taskrunner: use shared config struct for logmon fifo fields

* taskrunner: logmon only needs to be started once per task
2018-10-16 16:53:31 -07:00

220 lines
5.7 KiB
Go

package executor
import (
"os"
"strconv"
"sync"
"time"
hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/client/stats"
cstructs "github.com/hashicorp/nomad/client/structs"
ps "github.com/mitchellh/go-ps"
"github.com/shirou/gopsutil/process"
)
var (
// pidScanInterval is the interval at which the executor scans the process
// tree for finding out the pids that the executor and it's child processes
// have forked
pidScanInterval = 5 * time.Second
)
// pidCollector is a utility that can be embedded in an executor to collect pid
// stats
type pidCollector struct {
pids map[int]*nomadPid
pidLock sync.RWMutex
logger hclog.Logger
}
// nomadPid holds a pid and it's cpu percentage calculator
type nomadPid struct {
pid int
cpuStatsTotal *stats.CpuStats
cpuStatsUser *stats.CpuStats
cpuStatsSys *stats.CpuStats
}
// allPidGetter is a func which is used by the pid collector to gather
// stats on
type allPidGetter func() (map[int]*nomadPid, error)
func newPidCollector(logger hclog.Logger) *pidCollector {
return &pidCollector{
pids: make(map[int]*nomadPid),
logger: logger.Named("pid_collector"),
}
}
// collectPids collects the pids of the child processes that the executor is
// running every 5 seconds
func (c *pidCollector) collectPids(stopCh chan interface{}, pidGetter allPidGetter) {
// Fire the timer right away when the executor starts from there on the pids
// are collected every scan interval
timer := time.NewTimer(0)
defer timer.Stop()
for {
select {
case <-timer.C:
pids, err := pidGetter()
if err != nil {
c.logger.Debug("error collecting pids", "error", err)
}
c.pidLock.Lock()
// Adding pids which are not being tracked
for pid, np := range pids {
if _, ok := c.pids[pid]; !ok {
c.pids[pid] = np
}
}
// Removing pids which are no longer present
for pid := range c.pids {
if _, ok := pids[pid]; !ok {
delete(c.pids, pid)
}
}
c.pidLock.Unlock()
timer.Reset(pidScanInterval)
case <-stopCh:
return
}
}
}
// scanPids scans all the pids on the machine running the current executor and
// returns the child processes of the executor.
func scanPids(parentPid int, allPids []ps.Process) (map[int]*nomadPid, error) {
processFamily := make(map[int]struct{})
processFamily[parentPid] = struct{}{}
// A mapping of pids to their parent pids. It is used to build the process
// tree of the executing task
pidsRemaining := make(map[int]int, len(allPids))
for _, pid := range allPids {
pidsRemaining[pid.Pid()] = pid.PPid()
}
for {
// flag to indicate if we have found a match
foundNewPid := false
for pid, ppid := range pidsRemaining {
_, childPid := processFamily[ppid]
// checking if the pid is a child of any of the parents
if childPid {
processFamily[pid] = struct{}{}
delete(pidsRemaining, pid)
foundNewPid = true
}
}
// not scanning anymore if we couldn't find a single match
if !foundNewPid {
break
}
}
res := make(map[int]*nomadPid)
for pid := range processFamily {
np := nomadPid{
pid: pid,
cpuStatsTotal: stats.NewCpuStats(),
cpuStatsUser: stats.NewCpuStats(),
cpuStatsSys: stats.NewCpuStats(),
}
res[pid] = &np
}
return res, nil
}
// pidStats returns the resource usage stats per pid
func (c *pidCollector) pidStats() (map[string]*cstructs.ResourceUsage, error) {
stats := make(map[string]*cstructs.ResourceUsage)
c.pidLock.RLock()
pids := make(map[int]*nomadPid, len(c.pids))
for k, v := range c.pids {
pids[k] = v
}
c.pidLock.RUnlock()
for pid, np := range pids {
p, err := process.NewProcess(int32(pid))
if err != nil {
c.logger.Trace("unable to create new process", "pid", pid, "error", err)
continue
}
ms := &cstructs.MemoryStats{}
if memInfo, err := p.MemoryInfo(); err == nil {
ms.RSS = memInfo.RSS
ms.Swap = memInfo.Swap
ms.Measured = ExecutorBasicMeasuredMemStats
}
cs := &cstructs.CpuStats{}
if cpuStats, err := p.Times(); err == nil {
cs.SystemMode = np.cpuStatsSys.Percent(cpuStats.System * float64(time.Second))
cs.UserMode = np.cpuStatsUser.Percent(cpuStats.User * float64(time.Second))
cs.Measured = ExecutorBasicMeasuredCpuStats
// calculate cpu usage percent
cs.Percent = np.cpuStatsTotal.Percent(cpuStats.Total() * float64(time.Second))
}
stats[strconv.Itoa(pid)] = &cstructs.ResourceUsage{MemoryStats: ms, CpuStats: cs}
}
return stats, nil
}
// aggregatedResourceUsage aggregates the resource usage of all the pids and
// returns a TaskResourceUsage data point
func aggregatedResourceUsage(systemCpuStats *stats.CpuStats, pidStats map[string]*cstructs.ResourceUsage) *cstructs.TaskResourceUsage {
ts := time.Now().UTC().UnixNano()
var (
systemModeCPU, userModeCPU, percent float64
totalRSS, totalSwap uint64
)
for _, pidStat := range pidStats {
systemModeCPU += pidStat.CpuStats.SystemMode
userModeCPU += pidStat.CpuStats.UserMode
percent += pidStat.CpuStats.Percent
totalRSS += pidStat.MemoryStats.RSS
totalSwap += pidStat.MemoryStats.Swap
}
totalCPU := &cstructs.CpuStats{
SystemMode: systemModeCPU,
UserMode: userModeCPU,
Percent: percent,
Measured: ExecutorBasicMeasuredCpuStats,
TotalTicks: systemCpuStats.TicksConsumed(percent),
}
totalMemory := &cstructs.MemoryStats{
RSS: totalRSS,
Swap: totalSwap,
Measured: ExecutorBasicMeasuredMemStats,
}
resourceUsage := cstructs.ResourceUsage{
MemoryStats: totalMemory,
CpuStats: totalCPU,
}
return &cstructs.TaskResourceUsage{
ResourceUsage: &resourceUsage,
Timestamp: ts,
Pids: pidStats,
}
}
func getAllPids() (map[int]*nomadPid, error) {
allProcesses, err := ps.Processes()
if err != nil {
return nil, err
}
return scanPids(os.Getpid(), allProcesses)
}