// open-vault/vault/diagnose/helpers.go

package diagnose

import (
	"context"
	"fmt"
	"io"
	"strings"
	"time"

	"github.com/shirou/gopsutil/disk"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/codes"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	"go.opentelemetry.io/otel/trace"
)

// Names of the events that the helpers below attach to spans.
const (
	warningEventName        = "warning"
	skippedEventName        = "skipped"
	spotCheckOkEventName    = "spot-check-ok"
	spotCheckWarnEventName  = "spot-check-warn"
	spotCheckErrorEventName = "spot-check-error"
)

// actionKey is not referenced by the helpers visible here; presumably it is
// consumed elsewhere in the package.
const actionKey = "actionKey"

// Attribute keys attached to events. nameKey and messageKey carry a spot
// check's name and message; errorMessageKey is not referenced by the helpers
// visible here and is presumably consumed elsewhere in the package.
const (
	errorMessageKey = attribute.Key("error.message")
	nameKey         = attribute.Key("name")
	messageKey      = attribute.Key("message")
)

// MainSection is an option built with trace.WithAttributes that carries the
// attribute diagnose=main-section.
var MainSection = trace.WithAttributes(attribute.String("diagnose", "main-section"))

// diagnoseSessionKey is the private type of the context key under which the
// active *Session is stored.  Context keys are compared by dynamic type and
// value, so a bare struct{}{} key would be equal to any other package's
// struct{}{} key; a dedicated unexported type makes the key collision-free.
type diagnoseSessionKey struct{}

// diagnoseSession is the context key used by Context and CurrentSession.
var diagnoseSession = diagnoseSessionKey{}
var (
	// noopTracer comes from the no-op tracer provider; StartSpan falls back
	// to it when the context carries no diagnose session.
	noopTracer = trace.NewNoopTracerProvider().Tracer("vault-diagnose")
)

// testFunction is the shape of a diagnose check: it receives a context and
// reports failure through its error result.
type testFunction func(ctx context.Context) error

// Session holds the state of one diagnose run: the span collector, the tracer
// used to start spans, the tracer provider, and the set of names to skip.
type Session struct {
	// tc is registered as the provider's span processor in New; Finalize
	// returns its RootResult.
	tc     *TelemetryCollector
	// tracer is what StartSpan uses when this session is found in the context.
	tracer trace.Tracer
	// tp is the provider that produced tracer; Finalize flushes it.
	tp     *sdktrace.TracerProvider
	// skip is the set of names registered through SetSkipList and queried by
	// IsSkipped.
	skip   map[string]bool
}

// New creates a Diagnose tracing session.  A TelemetryCollector built from w
// is installed as the span processor of an always-sampling tracer provider;
// the collector synchronously receives and tracks OpenTelemetry spans so that
// a tree of results is available once the outermost span ends.  The returned
// session starts with an empty skip list.
func New(w io.Writer) *Session {
	collector := NewTelemetryCollector(w)
	provider := sdktrace.NewTracerProvider(
		sdktrace.WithSampler(sdktrace.AlwaysSample()),
		sdktrace.WithSpanProcessor(collector),
	)
	return &Session{
		tc:     collector,
		tracer: provider.Tracer("vault-diagnose"),
		tp:     provider,
		skip:   map[string]bool{},
	}
}

// SetSkipList adds every name in ls to the session's skip list.  Names that
// were added by earlier calls stay in the list.
func (s *Session) SetSkipList(ls []string) {
	for i := range ls {
		s.skip[ls[i]] = true
	}
}

// IsSkipped reports whether skipName is marked in the session's skip list.
// It can be paired with Skipped to mark a span as skipped while conditionally
// bypassing some logic.
func (s *Session) IsSkipped(skipName string) bool {
	skipped, found := s.skip[skipName]
	return found && skipped
}

// Context derives a child of ctx that carries sess as the diagnose session,
// retrievable later with CurrentSession.
func Context(ctx context.Context, sess *Session) context.Context {
	withSession := context.WithValue(ctx, diagnoseSession, sess)
	return withSession
}

// CurrentSession retrieves the active diagnose session from the context, or nil if none.
//
// The lookup uses a checked type assertion, so a value of any other type found
// under the session key yields nil instead of a panic.
func CurrentSession(ctx context.Context) *Session {
	sess, _ := ctx.Value(diagnoseSession).(*Session)
	return sess
}

// Finalize closes out the Diagnose session: it force-flushes the tracer
// provider (discarding whatever ForceFlush returns) and then hands back the
// collector's root result.  The root is empty until the outermost span ends.
func (s *Session) Finalize(ctx context.Context) *Result {
	s.tp.ForceFlush(ctx)
	root := s.tc.RootResult
	return root
}

// StartSpan opens a "diagnose" span, i.e. an ordinary OpenTelemetry tracing
// span.  The session found in ctx supplies the tracer; when ctx carries no
// session the package's no-op tracer is used instead.
func StartSpan(ctx context.Context, spanName string, options ...trace.SpanOption) (context.Context, trace.Span) {
	tracer := noopTracer
	if sess := CurrentSession(ctx); sess != nil {
		tracer = sess.tracer
	}
	return tracer.Start(ctx, spanName, options...)
}

// Success marks the span found in ctx as Ok (overriding any previous status)
// with message as the status description.
func Success(ctx context.Context, message string) {
	trace.SpanFromContext(ctx).SetStatus(codes.Ok, message)
}

// Fail marks the span found in ctx as failed by setting its status to Error
// with message as the status description.
func Fail(ctx context.Context, message string) {
	trace.SpanFromContext(ctx).SetStatus(codes.Error, message)
}

// Error records err on the span found in ctx via RecordError and returns err
// unchanged.  Unlike Fail, it does not set the overall span status to Error.
func Error(ctx context.Context, err error, options ...trace.EventOption) error {
	trace.SpanFromContext(ctx).RecordError(err, options...)
	return err
}

// Skipped flags the span found in ctx as skipped by adding a "skipped" event
// to it.
func Skipped(ctx context.Context) {
	trace.SpanFromContext(ctx).AddEvent(skippedEventName)
}

// Warn attaches a "warning" event to the span found in ctx, with msg stored
// under the message attribute.
func Warn(ctx context.Context, msg string) {
	withMessage := trace.WithAttributes(messageKey.String(msg))
	trace.SpanFromContext(ctx).AddEvent(warningEventName, withMessage)
}

// SpotOk reports an Ok result as an event on the current span rather than as
// a new span.  Intended for instantaneous checks that cannot have sub-spans.
func SpotOk(ctx context.Context, checkName, message string, options ...trace.EventOption) {
	const event = spotCheckOkEventName
	addSpotCheckResult(ctx, event, checkName, message, options...)
}

// SpotWarn reports a Warning result as an event on the current span rather
// than as a new span.  Intended for instantaneous checks that cannot have
// sub-spans.
func SpotWarn(ctx context.Context, checkName, message string, options ...trace.EventOption) {
	const event = spotCheckWarnEventName
	addSpotCheckResult(ctx, event, checkName, message, options...)
}

// SpotError reports an Error result as an event on the current span rather
// than as a new span, and returns err unchanged.  Intended for instantaneous
// checks that cannot have sub-spans.  The event's message is err.Error(); a
// nil err still produces the event, just without a message attribute.
func SpotError(ctx context.Context, checkName string, err error, options ...trace.EventOption) error {
	message := ""
	if err != nil {
		message = err.Error()
	}
	const event = spotCheckErrorEventName
	addSpotCheckResult(ctx, event, checkName, message, options...)
	return err
}

// addSpotCheckResult adds an event named eventName to the span found in ctx.
// The event receives the caller's options first, then the check name under
// nameKey and, when message is non-empty, the message under messageKey.
func addSpotCheckResult(ctx context.Context, eventName, checkName, message string, options ...trace.EventOption) {
	// Build the option list in a fresh slice.  Appending straight onto the
	// variadic slice could write into the caller's backing array whenever
	// that slice was passed with spare capacity.
	eventOpts := make([]trace.EventOption, 0, len(options)+2)
	eventOpts = append(eventOpts, options...)
	eventOpts = append(eventOpts, trace.WithAttributes(nameKey.String(checkName)))
	if message != "" {
		eventOpts = append(eventOpts, trace.WithAttributes(messageKey.String(message)))
	}
	trace.SpanFromContext(ctx).AddEvent(eventName, eventOpts...)
}

// SpotCheck runs f and records its outcome as a spot-check event on the
// current span: SpotError with the returned error when f fails, SpotOk with
// an empty message otherwise.  The result of f is returned to the caller.
func SpotCheck(ctx context.Context, checkName string, f func() error) error {
	err := f()
	switch {
	case err != nil:
		SpotError(ctx, checkName, err)
	default:
		SpotOk(ctx, checkName, "")
	}
	return err
}

// Test opens a span named spanName, runs function with the span's context,
// and ends the span when function returns.  A non-nil error from function
// marks the span as failed (status Error, described by the error text) and is
// handed back to the caller.
func Test(ctx context.Context, spanName string, function testFunction, options ...trace.SpanOption) error {
	spanCtx, span := StartSpan(ctx, spanName, options...)
	defer span.End()

	if err := function(spanCtx); err != nil {
		span.SetStatus(codes.Error, err.Error())
		return err
	}
	return nil
}

// WithTimeout wraps a context consuming function, and when called, returns an error if the sub-function does not
// complete within the timeout, e.g.
//
// diagnose.Test(ctx, "my-span", diagnose.WithTimeout(5 * time.Second, myTestFunc))
//
// f runs in its own goroutine.  On timeout the wrapper returns
// "timed out after <d>" without cancelling f; f keeps running with the same
// ctx and its eventual result is discarded.
func WithTimeout(d time.Duration, f testFunction) testFunction {
	return func(ctx context.Context) error {
		// Capacity 1 lets the worker goroutine deliver its result and exit
		// even when nobody is receiving any more because the timeout already
		// fired; an unbuffered channel would leave it blocked forever.
		rch := make(chan error, 1)
		t := time.NewTimer(d)
		defer t.Stop()
		go func() { rch <- f(ctx) }()
		select {
		case <-t.C:
			return fmt.Errorf("timed out after %s", d.String())
		case err := <-rch:
			return err
		}
	}
}

// Skippable wraps a Test function with logic that will not run the test if the skipName
// was in the session's skip list.  In that case the current span is marked
// skipped and nil is returned.
//
// When ctx carries no diagnose session there is no skip list to consult, so f
// runs normally; this matches StartSpan, which also keeps working without a
// session.
func Skippable(skipName string, f testFunction) testFunction {
	return func(ctx context.Context) error {
		if session := CurrentSession(ctx); session != nil && session.IsSkipped(skipName) {
			Skipped(ctx)
			return nil
		}
		return f(ctx)
	}
}

// DiskUsageCheck inspects every partition returned by disk.Partitions(false),
// except those whose mount point starts with "/boot", and records one result
// per partition on the current span:
//
//   - a warning event when usage for the partition cannot be read,
//   - a spot warning when the partition is more than 95% full,
//   - a spot warning when less than 1 GiB is free,
//   - a spot ok otherwise.
//
// An error is returned only when the partition list itself cannot be obtained.
func DiskUsageCheck(ctx context.Context) error {
	// minFreeBytes is the free-space floor named in the warning text (1 GiB).
	// The check previously compared against 2<<30, i.e. 2 GiB, which did not
	// match the message it emitted.
	const minFreeBytes = 1 << 30

	partitions, err := disk.Partitions(false)
	if err != nil {
		return err
	}

	partitionExcludes := []string{"/boot"}
partLoop:
	for _, partition := range partitions {
		for _, exc := range partitionExcludes {
			if strings.HasPrefix(partition.Mountpoint, exc) {
				continue partLoop
			}
		}

		testName := "disk-usage: " + partition.Mountpoint
		usage, err := disk.Usage(partition.Mountpoint)
		if err != nil {
			// One unreadable partition must not abort the whole check.
			Warn(ctx, fmt.Sprintf("could not obtain partition usage for %s: %v", partition.Mountpoint, err))
			continue
		}

		switch {
		case usage.UsedPercent > 95:
			SpotWarn(ctx, testName, "more than 95% full")
		case usage.Free < minFreeBytes:
			SpotWarn(ctx, testName, "less than 1GB free")
		default:
			SpotOk(ctx, testName, "ok")
		}
	}
	return nil
}