open-vault/vault/diagnose/helpers.go

264 lines
7.8 KiB
Go

package diagnose
import (
"context"
"fmt"
"io"
"strings"
"time"
"github.com/shirou/gopsutil/disk"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/codes"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
"go.opentelemetry.io/otel/trace"
)
// Names of the events that the helpers in this file attach to spans.
const (
	// warningEventName is the event added by Warn.
	warningEventName = "warning"
	// skippedEventName is the event added by Skipped.
	skippedEventName = "skipped"
	// The spot-check events are added by SpotOk, SpotWarn and SpotError respectively.
	spotCheckOkEventName    = "spot-check-ok"
	spotCheckWarnEventName  = "spot-check-warn"
	spotCheckErrorEventName = "spot-check-error"
)

// actionKey is not referenced by the helpers in this part of the file.
const actionKey = "actionKey"

// Attribute keys attached to span events.
const (
	// errorMessageKey is not referenced by the helpers in this part of the file.
	errorMessageKey = attribute.Key("error.message")
	// nameKey carries the check name of a spot-check event.
	nameKey = attribute.Key("name")
	// messageKey carries the human readable message of a warning or spot-check event.
	messageKey = attribute.Key("message")
)
// MainSection is a span option that tags a span with the attribute diagnose="main-section".
var MainSection = trace.WithAttributes(attribute.Key("diagnose").String("main-section"))
// diagnoseSessionKey is the unexported type of the context key used to store the diagnose session.
// A dedicated named type guarantees the key can never compare equal to a key created by another
// package (a bare struct{}{} value would collide with any other struct{}{} key).
type diagnoseSessionKey struct{}

// diagnoseSession is the context key under which Context stores the *Session and from which
// CurrentSession reads it back.
var diagnoseSession = diagnoseSessionKey{}
// noopTracer is the tracer StartSpan falls back to when the context carries no diagnose session.
var noopTracer = trace.NewNoopTracerProvider().Tracer("vault-diagnose")
// testFunction is the signature of a diagnose check: it receives the context of the span it runs in
// and reports failure through the returned error. Test, WithTimeout and Skippable operate on it.
type testFunction func(ctx context.Context) error
// Session holds the state of one Diagnose tracing session. Instances are built by New.
type Session struct {
	// tc is the collector registered as span processor in New; Finalize returns its RootResult.
	tc *TelemetryCollector
	// tracer is the tracer StartSpan uses to open spans for this session.
	tracer trace.Tracer
	// tp is the tracer provider created in New; Finalize flushes it.
	tp *sdktrace.TracerProvider
	// skip is the set of names filled by SetSkipList and queried by IsSkipped.
	skip map[string]bool
}
// New creates a Diagnose tracing session whose results are collected by a TelemetryCollector built
// from w. The collector is installed as the span processor of an always-sampling tracer provider, and
// the session starts with an empty skip list.
func New(w io.Writer) *Session {
	collector := NewTelemetryCollector(w)
	provider := sdktrace.NewTracerProvider(
		sdktrace.WithSampler(sdktrace.AlwaysSample()),
		sdktrace.WithSpanProcessor(collector),
	)
	return &Session{
		tc:     collector,
		tracer: provider.Tracer("vault-diagnose"),
		tp:     provider,
		skip:   make(map[string]bool),
	}
}
// SetSkipList marks every name in ls as skipped. Names added by earlier calls stay in the list; the
// list is only extended, never reset.
func (s *Session) SetSkipList(ls []string) {
	for i := range ls {
		s.skip[ls[i]] = true
	}
}
// IsSkipped reports whether skipName is in the session's skip list. It can be combined with Skipped
// to mark a span as skipped and conditionally bypass some logic.
func (s *Session) IsSkipped(skipName string) bool {
	skipped, found := s.skip[skipName]
	return found && skipped
}
// Context returns a context derived from ctx that carries sess as its diagnose session, so that
// CurrentSession can retrieve it later.
func Context(ctx context.Context, sess *Session) context.Context {
	withSession := context.WithValue(ctx, diagnoseSession, sess)
	return withSession
}
// CurrentSession retrieves the active diagnose session from ctx. It returns nil when ctx carries no
// session, and also when the value stored under the session key is not a *Session.
func CurrentSession(ctx context.Context) *Session {
	// Use the comma-ok form so that an unexpected value under the key yields nil instead of a panic.
	sess, ok := ctx.Value(diagnoseSession).(*Session)
	if !ok {
		return nil
	}
	return sess
}
// Finalize flushes the session's tracer provider and returns the root of the result tree held by the
// telemetry collector. The tree stays empty until the outermost span has ended.
func (s *Session) Finalize(ctx context.Context) *Result {
	s.tp.ForceFlush(ctx)
	root := s.tc.RootResult
	return root
}
// StartSpan opens a "diagnose" span, which is an ordinary OpenTelemetry tracing span. The span is
// created by the tracer of the session found in ctx, or by a no-op tracer when ctx has no session.
func StartSpan(ctx context.Context, spanName string, options ...trace.SpanOption) (context.Context, trace.Span) {
	tracer := noopTracer
	if sess := CurrentSession(ctx); sess != nil {
		tracer = sess.tracer
	}
	return tracer.Start(ctx, spanName, options...)
}
// Success marks the span found in ctx as successful, replacing any previous status, and stores
// message as the status description.
func Success(ctx context.Context, message string) {
	trace.SpanFromContext(ctx).SetStatus(codes.Ok, message)
}
// Fail marks the span found in ctx as failed, with message as the status description.
func Fail(ctx context.Context, message string) {
	trace.SpanFromContext(ctx).SetStatus(codes.Error, message)
}
// Error records err on the span found in ctx and hands err back to the caller. Unlike Fail it does
// not set the overall span status to Error.
func Error(ctx context.Context, err error, options ...trace.EventOption) error {
	trace.SpanFromContext(ctx).RecordError(err, options...)
	return err
}
// Skipped marks the span found in ctx as skipped by adding a "skipped" event to it.
func Skipped(ctx context.Context) {
	trace.SpanFromContext(ctx).AddEvent(skippedEventName)
}
// Warn records a warning on the span found in ctx: a "warning" event carrying msg as its message
// attribute.
func Warn(ctx context.Context, msg string) {
	withMessage := trace.WithAttributes(messageKey.String(msg))
	trace.SpanFromContext(ctx).AddEvent(warningEventName, withMessage)
}
// SpotOk reports a passing check as an event on the current span instead of opening a new span.
// Intended for instantaneous checks that cannot have sub-spans. An empty message is omitted from
// the event.
func SpotOk(ctx context.Context, checkName, message string, options ...trace.EventOption) {
	addSpotCheckResult(ctx, spotCheckOkEventName, checkName, message, options...)
}
// SpotWarn reports a check that produced a warning as an event on the current span instead of
// opening a new span. Intended for instantaneous checks that cannot have sub-spans. An empty
// message is omitted from the event.
func SpotWarn(ctx context.Context, checkName, message string, options ...trace.EventOption) {
	addSpotCheckResult(ctx, spotCheckWarnEventName, checkName, message, options...)
}
// SpotError reports a failed check as an event on the current span instead of opening a new span,
// and returns err unchanged. Intended for instantaneous checks that cannot have sub-spans. The
// event message is err's text; a nil err produces an event without a message.
func SpotError(ctx context.Context, checkName string, err error, options ...trace.EventOption) error {
	message := ""
	if err != nil {
		message = err.Error()
	}
	addSpotCheckResult(ctx, spotCheckErrorEventName, checkName, message, options...)
	return err
}
// addSpotCheckResult adds an event named eventName to the span found in ctx. The event carries the
// caller supplied options, followed by the check name and, when message is non-empty, the message.
func addSpotCheckResult(ctx context.Context, eventName, checkName, message string, options ...trace.EventOption) {
	// Build a fresh slice rather than appending to options: the variadic slice may alias a
	// caller-owned backing array with spare capacity, which an in-place append would overwrite.
	opts := make([]trace.EventOption, 0, len(options)+2)
	opts = append(opts, options...)
	opts = append(opts, trace.WithAttributes(nameKey.String(checkName)))
	if message != "" {
		opts = append(opts, trace.WithAttributes(messageKey.String(message)))
	}
	trace.SpanFromContext(ctx).AddEvent(eventName, opts...)
}
// SpotCheck runs f and reports the outcome as a spot-check event named checkName on the current
// span: an error event when f fails, an ok event (without message) otherwise. The error returned
// by f is passed back to the caller.
func SpotCheck(ctx context.Context, checkName string, f func() error) error {
	if err := f(); err != nil {
		SpotError(ctx, checkName, err)
		return err
	}
	SpotOk(ctx, checkName, "")
	return nil
}
// Test opens a span named spanName, runs function with the span's context and ends the span when
// function returns. A non-nil error marks the span as failed, using the error text as the status
// description, and is returned to the caller.
func Test(ctx context.Context, spanName string, function testFunction, options ...trace.SpanOption) error {
	spanCtx, span := StartSpan(ctx, spanName, options...)
	defer span.End()

	if err := function(spanCtx); err != nil {
		span.SetStatus(codes.Error, err.Error())
		return err
	}
	return nil
}
// WithTimeout wraps a context consuming function, and when called, returns an error if the sub-function does not
// complete within the timeout, e.g.
//
//	diagnose.Test(ctx, "my-span", diagnose.WithTimeout(5 * time.Second, myTestFunc))
//
// On timeout the sub-function is not interrupted: it keeps running in its own goroutine and its
// eventual result is discarded.
func WithTimeout(d time.Duration, f testFunction) testFunction {
	return func(ctx context.Context) error {
		// Buffer of one so the worker can always deliver its result and exit, even when nobody is
		// receiving any more because the timeout fired first. An unbuffered channel would leave the
		// goroutine blocked on the send forever.
		rch := make(chan error, 1)
		t := time.NewTimer(d)
		defer t.Stop()
		go func() { rch <- f(ctx) }()
		select {
		case <-t.C:
			return fmt.Errorf("timed out after %s", d.String())
		case err := <-rch:
			return err
		}
	}
}
// Skippable wraps a Test function so that it is not run when skipName is in the session's skip
// list; in that case the current span is marked skipped and nil is returned.
//
// NOTE(review): when ctx carries no diagnose session the wrapped function is not run either and nil
// is returned. Confirm this is intended, since StartSpan does fall back to a no-op tracer there.
func Skippable(skipName string, f testFunction) testFunction {
	return func(ctx context.Context) error {
		session := CurrentSession(ctx)
		switch {
		case session == nil:
			return nil
		case session.IsSkipped(skipName):
			Skipped(ctx)
			return nil
		default:
			return f(ctx)
		}
	}
}
// DiskUsageCheck inspects the partitions reported by disk.Partitions(false) and records one
// spot-check result per partition on the current span: a warning when the partition is more than
// 95% full or has less than 1 GiB free, ok otherwise. Partitions whose mount point starts with
// "/boot" are ignored. A partition whose usage cannot be read produces a warning on the span and
// does not abort the check; only a failure to list the partitions is returned as an error.
func DiskUsageCheck(ctx context.Context) error {
	const (
		// maxUsedPercent is the fill level above which a partition is reported as nearly full.
		maxUsedPercent = 95
		// minFreeBytes is 1 GiB, matching the "less than 1GB free" warning below.
		minFreeBytes = 1 << 30
	)

	partitions, err := disk.Partitions(false)
	if err != nil {
		return err
	}

	partitionExcludes := []string{"/boot"}
partLoop:
	for _, partition := range partitions {
		for _, exc := range partitionExcludes {
			if strings.HasPrefix(partition.Mountpoint, exc) {
				continue partLoop
			}
		}

		usage, err := disk.Usage(partition.Mountpoint)
		if err != nil {
			Warn(ctx, fmt.Sprintf("could not obtain partition usage for %s: %v", partition.Mountpoint, err))
			continue
		}

		testName := "disk-usage: " + partition.Mountpoint
		switch {
		case usage.UsedPercent > maxUsedPercent:
			SpotWarn(ctx, testName, "more than 95% full")
		case usage.Free < minFreeBytes:
			SpotWarn(ctx, testName, "less than 1GB free")
		default:
			SpotOk(ctx, testName, "ok")
		}
	}
	return nil
}