open-vault/helper/metricsutil/gauge_process.go

package metricsutil

import (
	"context"
	"math/rand"
	"sort"
	"time"

	log "github.com/hashicorp/go-hclog"
)

// This interface allows unit tests to substitute in a simulated clock.
type clock interface {
	Now() time.Time
	NewTicker(time.Duration) *time.Ticker
}

type defaultClock struct{}

func (_ defaultClock) Now() time.Time {
	return time.Now()
}

func (_ defaultClock) NewTicker(d time.Duration) *time.Ticker {
	return time.NewTicker(d)
}

// GaugeLabelValues is one gauge in a set sharing a single key, that
// are measured in a batch.
type GaugeLabelValues struct {
	Labels []Label
	Value  float32
}

// GaugeCollector is a callback function that returns an unfiltered
// set of label-value pairs. It may be cancelled if it takes too long.
type GaugeCollector = func(context.Context) ([]GaugeLabelValues, error)

// collectionBound is a hard limit on how long a collection process
// may take, as a fraction of the current interval.
const collectionBound = 0.02

// collectionTarget is a soft limit; if exceeded, the collection interval
// will be doubled.
const collectionTarget = 0.01

// A GaugeCollectionProcess is responsible for one particular gauge metric.
// It handles a delay on initial startup; limiting the cardinality; and
// exponential backoff on the requested interval.
type GaugeCollectionProcess struct {
	stop    chan struct{}
	stopped chan struct{}

	// gauge name
	key []string
	// labels to use when reporting
	labels []Label

	// callback function
	collector GaugeCollector

	// destination for metrics
	sink   *ClusterMetricSink
	logger log.Logger

	// time between collections
	originalInterval time.Duration
	currentInterval  time.Duration
	ticker           *time.Ticker

	// time source
	clock clock
}

// NewGaugeCollectionProcess creates a new collection process for the callback
// function given as an argument, and starts it running.
// A label should be provided for metrics *about* this collection process.
//
// The Run() method must be called to start the process.
func (m *ClusterMetricSink) NewGaugeCollectionProcess(
	key []string,
	id []Label,
	collector GaugeCollector,
	logger log.Logger,
) (*GaugeCollectionProcess, error) {
	return m.newGaugeCollectionProcessWithClock(
		key,
		id,
		collector,
		logger,
		defaultClock{},
	)
}

// test version allows an alternative clock implementation
func (m *ClusterMetricSink) newGaugeCollectionProcessWithClock(
	key []string,
	id []Label,
	collector GaugeCollector,
	logger log.Logger,
	clock clock,
) (*GaugeCollectionProcess, error) {
	process := &GaugeCollectionProcess{
		stop:             make(chan struct{}, 1),
		stopped:          make(chan struct{}, 1),
		key:              key,
		labels:           id,
		collector:        collector,
		sink:             m,
		originalInterval: m.GaugeInterval,
		currentInterval:  m.GaugeInterval,
		logger:           logger,
		clock:            clock,
	}
	return process, nil
}

// delayStart randomly delays by up to one extra interval
// so that collection processes do not all run at the time time.
// If we knew all the procsses in advance, we could just schedule them
// evenly, but a new one could be added per secret engine.
func (p *GaugeCollectionProcess) delayStart() bool {
	randomDelay := time.Duration(rand.Int63n(int64(p.currentInterval)))
	// A Timer might be better, but then we'd have to simulate
	// one of those too?
	delayTick := p.clock.NewTicker(randomDelay)
	defer delayTick.Stop()

	select {
	case <-p.stop:
		return true
	case <-delayTick.C:
		break
	}
	return false
}

// resetTicker stops the old ticker and starts a new one at the current
// interval setting.
func (p *GaugeCollectionProcess) resetTicker() {
	if p.ticker != nil {
		p.ticker.Stop()
	}
	p.ticker = p.clock.NewTicker(p.currentInterval)
}

// collectAndFilterGauges executes the callback function,
// limits the cardinality, and streams the results to the metrics sink.
func (p *GaugeCollectionProcess) collectAndFilterGauges() {
	// Run for only an allotted amount of time.
	timeout := time.Duration(collectionBound * float64(p.currentInterval))
	ctx, cancel := context.WithTimeout(context.Background(),
		timeout)
	defer cancel()

	p.sink.AddDurationWithLabels([]string{"metrics", "collection", "interval"},
		p.currentInterval,
		p.labels)

	start := p.clock.Now()
	values, err := p.collector(ctx)
	end := p.clock.Now()
	duration := end.Sub(start)

	// Report how long it took to perform the operation.
	p.sink.AddDurationWithLabels([]string{"metrics", "collection"},
		duration,
		p.labels)

	// If over threshold, back off by doubling the measurement interval.
	// Currently a restart is the only way to bring it back down.
	threshold := time.Duration(collectionTarget * float64(p.currentInterval))
	if duration > threshold {
		p.logger.Warn("gauge collection time exceeded target", "target", threshold, "actual", duration, "id", p.labels)
		p.currentInterval *= 2
		p.resetTicker()
	}

	if err != nil {
		p.logger.Error("error collecting gauge", "id", p.labels, "error", err)
		p.sink.IncrCounterWithLabels([]string{"metrics", "collection", "error"},
			1,
			p.labels)
		return
	}

	// Filter to top N.
	// This does not guarantee total cardinality is <= N, but it does slow things down
	// a little if the cardinality *is* too high and the gauge needs to be disabled.
	if len(values) > p.sink.MaxGaugeCardinality {
		sort.Slice(values, func(a, b int) bool {
			return values[a].Value > values[b].Value
		})
		values = values[:p.sink.MaxGaugeCardinality]
	}

	p.streamGaugesToSink(values)
}

// batchSize is the number of metrics to be sent per tick duration.
const batchSize = 25

func (p *GaugeCollectionProcess) streamGaugesToSink(values []GaugeLabelValues) {
	// Dumping 500 metrics in one big chunk is somewhat unfriendly to UDP-based
	// transport, and to the rest of the metrics trying to get through.
	// Let's smooth things out over the course of a second.
	// 1 second / 500 = 2 ms each, so we can send 25 (batchSize) per 50 milliseconds.
	// That should be one or two packets.
	sendTick := p.clock.NewTicker(50 * time.Millisecond)
	defer sendTick.Stop()

	for i, lv := range values {
		if i > 0 && i%batchSize == 0 {
			select {
			case <-p.stop:
				// because the channel is closed,
				// the main loop will successfully
				// read from p.stop too, and exit.
				return
			case <-sendTick.C:
				break
			}
		}
		p.sink.SetGaugeWithLabels(p.key, lv.Value, lv.Labels)
	}
}

// Run should be called as a goroutine.
func (p *GaugeCollectionProcess) Run() {
	defer close(p.stopped)

	// Wait a random amount of time
	stopReceived := p.delayStart()
	if stopReceived {
		return
	}

	// Create a ticker to start each cycle
	p.resetTicker()

	// Loop until we get a signal to stop
	for {
		select {
		case <-p.ticker.C:
			p.collectAndFilterGauges()
		case <-p.stop:
			// Can't use defer because this might
			// not be the original ticker.
			p.ticker.Stop()
			return
		}
	}
}

// Stop the collection process
func (p *GaugeCollectionProcess) Stop() {
	close(p.stop)
}
Utility for collecting new gauge metrics (#9017) 2020-05-19 22:54:43 +00:00			`package metricsutil`

			`import (`
			`"context"`
			`"math/rand"`
			`"sort"`
			`"time"`

			`log "github.com/hashicorp/go-hclog"`
			`)`

			`// This interface allows unit tests to substitute in a simulated clock.`
			`type clock interface {`
			`Now() time.Time`
			`NewTicker(time.Duration) *time.Ticker`
			`}`

Run a more strict formatter over the code (#11312) * Update tooling * Run gofumpt * go mod vendor 2021-04-08 16:43:39 +00:00			`type defaultClock struct{}`
Utility for collecting new gauge metrics (#9017) 2020-05-19 22:54:43 +00:00
			`func (_ defaultClock) Now() time.Time {`
			`return time.Now()`
			`}`

			`func (_ defaultClock) NewTicker(d time.Duration) *time.Ticker {`
			`return time.NewTicker(d)`
			`}`

			`// GaugeLabelValues is one gauge in a set sharing a single key, that`
			`// are measured in a batch.`
			`type GaugeLabelValues struct {`
			`Labels []Label`
			`Value float32`
			`}`

			`// GaugeCollector is a callback function that returns an unfiltered`
			`// set of label-value pairs. It may be cancelled if it takes too long.`
			`type GaugeCollector = func(context.Context) ([]GaugeLabelValues, error)`

			`// collectionBound is a hard limit on how long a collection process`
			`// may take, as a fraction of the current interval.`
			`const collectionBound = 0.02`

			`// collectionTarget is a soft limit; if exceeded, the collection interval`
			`// will be doubled.`
			`const collectionTarget = 0.01`

			`// A GaugeCollectionProcess is responsible for one particular gauge metric.`
			`// It handles a delay on initial startup; limiting the cardinality; and`
			`// exponential backoff on the requested interval.`
			`type GaugeCollectionProcess struct {`
			`stop chan struct{}`
			`stopped chan struct{}`

			`// gauge name`
			`key []string`
			`// labels to use when reporting`
			`labels []Label`

			`// callback function`
			`collector GaugeCollector`

			`// destination for metrics`
			`sink *ClusterMetricSink`
			`logger log.Logger`

			`// time between collections`
			`originalInterval time.Duration`
			`currentInterval time.Duration`
			`ticker *time.Ticker`

			`// time source`
			`clock clock`
			`}`

			`// NewGaugeCollectionProcess creates a new collection process for the callback`
			`// function given as an argument, and starts it running.`
			`// A label should be provided for metrics about this collection process.`
			`//`
			`// The Run() method must be called to start the process.`
			`func (m *ClusterMetricSink) NewGaugeCollectionProcess(`
			`key []string,`
			`id []Label,`
			`collector GaugeCollector,`
			`logger log.Logger,`
			`) (*GaugeCollectionProcess, error) {`
			`return m.newGaugeCollectionProcessWithClock(`
			`key,`
			`id,`
			`collector,`
			`logger,`
			`defaultClock{},`
			`)`
			`}`

			`// test version allows an alternative clock implementation`
			`func (m *ClusterMetricSink) newGaugeCollectionProcessWithClock(`
			`key []string,`
			`id []Label,`
			`collector GaugeCollector,`
			`logger log.Logger,`
			`clock clock,`
			`) (*GaugeCollectionProcess, error) {`
			`process := &GaugeCollectionProcess{`
			`stop: make(chan struct{}, 1),`
			`stopped: make(chan struct{}, 1),`
			`key: key,`
			`labels: id,`
			`collector: collector,`
			`sink: m,`
			`originalInterval: m.GaugeInterval,`
			`currentInterval: m.GaugeInterval,`
			`logger: logger,`
			`clock: clock,`
			`}`
			`return process, nil`
			`}`

			`// delayStart randomly delays by up to one extra interval`
			`// so that collection processes do not all run at the time time.`
			`// If we knew all the procsses in advance, we could just schedule them`
			`// evenly, but a new one could be added per secret engine.`
			`func (p *GaugeCollectionProcess) delayStart() bool {`
Don't assume that time.Duration can be represented by an int. (#9554) 2020-07-22 16:04:52 +00:00			`randomDelay := time.Duration(rand.Int63n(int64(p.currentInterval)))`
Utility for collecting new gauge metrics (#9017) 2020-05-19 22:54:43 +00:00			`// A Timer might be better, but then we'd have to simulate`
			`// one of those too?`
			`delayTick := p.clock.NewTicker(randomDelay)`
			`defer delayTick.Stop()`

			`select {`
			`case <-p.stop:`
			`return true`
			`case <-delayTick.C:`
			`break`
			`}`
			`return false`
			`}`

			`// resetTicker stops the old ticker and starts a new one at the current`
			`// interval setting.`
			`func (p *GaugeCollectionProcess) resetTicker() {`
			`if p.ticker != nil {`
			`p.ticker.Stop()`
			`}`
			`p.ticker = p.clock.NewTicker(p.currentInterval)`
			`}`

			`// collectAndFilterGauges executes the callback function,`
			`// limits the cardinality, and streams the results to the metrics sink.`
			`func (p *GaugeCollectionProcess) collectAndFilterGauges() {`
			`// Run for only an allotted amount of time.`
			`timeout := time.Duration(collectionBound * float64(p.currentInterval))`
			`ctx, cancel := context.WithTimeout(context.Background(),`
			`timeout)`
			`defer cancel()`

			`p.sink.AddDurationWithLabels([]string{"metrics", "collection", "interval"},`
			`p.currentInterval,`
			`p.labels)`

			`start := p.clock.Now()`
			`values, err := p.collector(ctx)`
			`end := p.clock.Now()`
			`duration := end.Sub(start)`

			`// Report how long it took to perform the operation.`
			`p.sink.AddDurationWithLabels([]string{"metrics", "collection"},`
			`duration,`
			`p.labels)`

			`// If over threshold, back off by doubling the measurement interval.`
			`// Currently a restart is the only way to bring it back down.`
			`threshold := time.Duration(collectionTarget * float64(p.currentInterval))`
			`if duration > threshold {`
			`p.logger.Warn("gauge collection time exceeded target", "target", threshold, "actual", duration, "id", p.labels)`
			`p.currentInterval *= 2`
			`p.resetTicker()`
			`}`

			`if err != nil {`
			`p.logger.Error("error collecting gauge", "id", p.labels, "error", err)`
			`p.sink.IncrCounterWithLabels([]string{"metrics", "collection", "error"},`
			`1,`
			`p.labels)`
			`return`
			`}`

			`// Filter to top N.`
			`// This does not guarantee total cardinality is <= N, but it does slow things down`
			`// a little if the cardinality is too high and the gauge needs to be disabled.`
			`if len(values) > p.sink.MaxGaugeCardinality {`
			`sort.Slice(values, func(a, b int) bool {`
			`return values[a].Value > values[b].Value`
			`})`
			`values = values[:p.sink.MaxGaugeCardinality]`
			`}`

			`p.streamGaugesToSink(values)`
			`}`

Fix max measurements gauge test (#14024) * make streamGaugesToSink batch size a const * attempt to fix for timeout failures for TestGauge_MaximumMeasurements 2022-02-23 18:36:25 +00:00			`// batchSize is the number of metrics to be sent per tick duration.`
			`const batchSize = 25`

Utility for collecting new gauge metrics (#9017) 2020-05-19 22:54:43 +00:00			`func (p *GaugeCollectionProcess) streamGaugesToSink(values []GaugeLabelValues) {`
			`// Dumping 500 metrics in one big chunk is somewhat unfriendly to UDP-based`
			`// transport, and to the rest of the metrics trying to get through.`
			`// Let's smooth things out over the course of a second.`
Fix max measurements gauge test (#14024) * make streamGaugesToSink batch size a const * attempt to fix for timeout failures for TestGauge_MaximumMeasurements 2022-02-23 18:36:25 +00:00			`// 1 second / 500 = 2 ms each, so we can send 25 (batchSize) per 50 milliseconds.`
Utility for collecting new gauge metrics (#9017) 2020-05-19 22:54:43 +00:00			`// That should be one or two packets.`
			`sendTick := p.clock.NewTicker(50 * time.Millisecond)`
helper/metricsutil: Prevent potential Ticker leak (#10913) 2021-02-19 04:31:53 +00:00			`defer sendTick.Stop()`

Utility for collecting new gauge metrics (#9017) 2020-05-19 22:54:43 +00:00			`for i, lv := range values {`
			`if i > 0 && i%batchSize == 0 {`
			`select {`
			`case <-p.stop:`
			`// because the channel is closed,`
			`// the main loop will successfully`
			`// read from p.stop too, and exit.`
			`return`
			`case <-sendTick.C:`
			`break`
			`}`
			`}`
			`p.sink.SetGaugeWithLabels(p.key, lv.Value, lv.Labels)`
			`}`
			`}`

			`// Run should be called as a goroutine.`
			`func (p *GaugeCollectionProcess) Run() {`
			`defer close(p.stopped)`

			`// Wait a random amount of time`
			`stopReceived := p.delayStart()`
			`if stopReceived {`
			`return`
			`}`

			`// Create a ticker to start each cycle`
			`p.resetTicker()`

			`// Loop until we get a signal to stop`
			`for {`
			`select {`
			`case <-p.ticker.C:`
			`p.collectAndFilterGauges()`
			`case <-p.stop:`
			`// Can't use defer because this might`
			`// not be the original ticker.`
			`p.ticker.Stop()`
			`return`
			`}`
			`}`
			`}`

			`// Stop the collection process`
			`func (p *GaugeCollectionProcess) Stop() {`
			`close(p.stop)`
			`}`