open-vault/vault/request_forwarding.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package vault

import (
	"bytes"
	"context"
	"crypto/ecdsa"
	"crypto/tls"
	"crypto/x509"
	"errors"
	"fmt"
	"math"
	"net/http"
	"net/url"
	"sync"
	"time"

	"github.com/armon/go-metrics"
	log "github.com/hashicorp/go-hclog"
	"github.com/hashicorp/vault/helper/forwarding"
	"github.com/hashicorp/vault/sdk/helper/consts"
	"github.com/hashicorp/vault/vault/cluster"
	"github.com/hashicorp/vault/vault/replication"
	"golang.org/x/net/http2"
	"google.golang.org/grpc"
	"google.golang.org/grpc/backoff"
	"google.golang.org/grpc/keepalive"
)

type requestForwardingHandler struct {
	fws         *http2.Server
	fwRPCServer *grpc.Server
	logger      log.Logger
	ha          bool
	core        *Core
	stopCh      chan struct{}
}

type requestForwardingClusterClient struct {
	core *Core
}

// NewRequestForwardingHandler creates a cluster handler for use with request
// forwarding.
func NewRequestForwardingHandler(c *Core, fws *http2.Server, perfStandbySlots chan struct{}, perfStandbyRepCluster *replication.Cluster) (*requestForwardingHandler, error) {
	// Resolve locally to avoid races
	ha := c.ha != nil

	fwRPCServer := grpc.NewServer(
		grpc.KeepaliveParams(keepalive.ServerParameters{
			Time: 2 * c.clusterHeartbeatInterval,
		}),
		grpc.MaxRecvMsgSize(math.MaxInt32),
		grpc.MaxSendMsgSize(math.MaxInt32),
	)

	if ha && c.clusterHandler != nil {
		RegisterRequestForwardingServer(fwRPCServer, &forwardedRequestRPCServer{
			core:                  c,
			handler:               c.clusterHandler,
			perfStandbySlots:      perfStandbySlots,
			perfStandbyRepCluster: perfStandbyRepCluster,
			raftFollowerStates:    c.raftFollowerStates,
		})
	}

	return &requestForwardingHandler{
		fws:         fws,
		fwRPCServer: fwRPCServer,
		ha:          ha,
		logger:      c.logger.Named("request-forward"),
		core:        c,
		stopCh:      make(chan struct{}),
	}, nil
}

// ClientLookup satisfies the ClusterClient interface and returns the ha tls
// client certs.
func (c *requestForwardingClusterClient) ClientLookup(ctx context.Context, requestInfo *tls.CertificateRequestInfo) (*tls.Certificate, error) {
	parsedCert := c.core.localClusterParsedCert.Load().(*x509.Certificate)
	if parsedCert == nil {
		return nil, nil
	}
	currCert := c.core.localClusterCert.Load().([]byte)
	if len(currCert) == 0 {
		return nil, nil
	}
	localCert := make([]byte, len(currCert))
	copy(localCert, currCert)

	for _, subj := range requestInfo.AcceptableCAs {
		if bytes.Equal(subj, parsedCert.RawIssuer) {
			return &tls.Certificate{
				Certificate: [][]byte{localCert},
				PrivateKey:  c.core.localClusterPrivateKey.Load().(*ecdsa.PrivateKey),
				Leaf:        c.core.localClusterParsedCert.Load().(*x509.Certificate),
			}, nil
		}
	}

	return nil, nil
}

func (c *requestForwardingClusterClient) ServerName() string {
	parsedCert := c.core.localClusterParsedCert.Load().(*x509.Certificate)
	if parsedCert == nil {
		return ""
	}

	return parsedCert.Subject.CommonName
}

func (c *requestForwardingClusterClient) CACert(ctx context.Context) *x509.Certificate {
	return c.core.localClusterParsedCert.Load().(*x509.Certificate)
}

// ServerLookup satisfies the ClusterHandler interface and returns the server's
// tls certs.
func (rf *requestForwardingHandler) ServerLookup(ctx context.Context, clientHello *tls.ClientHelloInfo) (*tls.Certificate, error) {
	currCert := rf.core.localClusterCert.Load().([]byte)
	if len(currCert) == 0 {
		return nil, fmt.Errorf("got forwarding connection but no local cert")
	}

	localCert := make([]byte, len(currCert))
	copy(localCert, currCert)

	return &tls.Certificate{
		Certificate: [][]byte{localCert},
		PrivateKey:  rf.core.localClusterPrivateKey.Load().(*ecdsa.PrivateKey),
		Leaf:        rf.core.localClusterParsedCert.Load().(*x509.Certificate),
	}, nil
}

// CALookup satisfies the ClusterHandler interface and returns the ha ca cert.
func (rf *requestForwardingHandler) CALookup(ctx context.Context) ([]*x509.Certificate, error) {
	parsedCert := rf.core.localClusterParsedCert.Load().(*x509.Certificate)

	if parsedCert == nil {
		return nil, fmt.Errorf("forwarding connection client but no local cert")
	}

	return []*x509.Certificate{parsedCert}, nil
}

// Handoff serves a request forwarding connection.
func (rf *requestForwardingHandler) Handoff(ctx context.Context, shutdownWg *sync.WaitGroup, closeCh chan struct{}, tlsConn *tls.Conn) error {
	if !rf.ha {
		tlsConn.Close()
		return nil
	}

	rf.logger.Debug("got request forwarding connection")

	shutdownWg.Add(2)
	// quitCh is used to close the connection and the second
	// goroutine if the server closes before closeCh.
	quitCh := make(chan struct{})
	go func() {
		select {
		case <-quitCh:
		case <-closeCh:
		case <-rf.stopCh:
		}
		tlsConn.Close()
		shutdownWg.Done()
	}()

	go func() {
		rf.fws.ServeConn(tlsConn, &http2.ServeConnOpts{
			Handler: rf.fwRPCServer,
			BaseConfig: &http.Server{
				ErrorLog: rf.logger.StandardLogger(nil),
			},
		})

		// close the quitCh which will close the connection and
		// the other goroutine.
		close(quitCh)
		shutdownWg.Done()
	}()

	return nil
}

// Stop stops the request forwarding server and closes connections.
func (rf *requestForwardingHandler) Stop() error {
	// Give some time for existing RPCs to drain.
	time.Sleep(cluster.ListenerAcceptDeadline)
	close(rf.stopCh)
	rf.fwRPCServer.Stop()
	return nil
}

// Starts the listeners and servers necessary to handle forwarded requests
func (c *Core) startForwarding(ctx context.Context) error {
	c.logger.Debug("request forwarding setup function")
	defer c.logger.Debug("leaving request forwarding setup function")

	// Clean up in case we have transitioned from a client to a server
	c.requestForwardingConnectionLock.Lock()
	c.clearForwardingClients()
	c.requestForwardingConnectionLock.Unlock()

	clusterListener := c.getClusterListener()
	if c.ha == nil || clusterListener == nil {
		c.logger.Debug("request forwarding not setup")
		return nil
	}

	perfStandbyRepCluster, perfStandbySlots, err := c.perfStandbyClusterHandler()
	if err != nil {
		return err
	}

	handler, err := NewRequestForwardingHandler(c, clusterListener.Server(), perfStandbySlots, perfStandbyRepCluster)
	if err != nil {
		return err
	}

	clusterListener.AddHandler(consts.RequestForwardingALPN, handler)

	return nil
}

func (c *Core) stopForwarding() {
	clusterListener := c.getClusterListener()
	if clusterListener != nil {
		clusterListener.StopHandler(consts.RequestForwardingALPN)
		clusterListener.StopHandler(consts.PerfStandbyALPN)
	}
	c.removeAllPerfStandbySecondaries()
}

// refreshRequestForwardingConnection ensures that the client/transport are
// alive and that the current active address value matches the most
// recently-known address.
func (c *Core) refreshRequestForwardingConnection(ctx context.Context, clusterAddr string) error {
	c.logger.Debug("refreshing forwarding connection", "clusterAddr", clusterAddr)
	defer c.logger.Debug("done refreshing forwarding connection", "clusterAddr", clusterAddr)

	c.requestForwardingConnectionLock.Lock()
	defer c.requestForwardingConnectionLock.Unlock()

	// Clean things up first
	c.clearForwardingClients()

	// If we don't have anything to connect to, just return
	if clusterAddr == "" {
		return nil
	}

	clusterURL, err := url.Parse(clusterAddr)
	if err != nil {
		c.logger.Error("error parsing cluster address attempting to refresh forwarding connection", "error", err)
		return err
	}

	parsedCert := c.localClusterParsedCert.Load().(*x509.Certificate)
	if parsedCert == nil {
		c.logger.Error("no request forwarding cluster certificate found")
		return errors.New("no request forwarding cluster certificate found")
	}

	clusterListener := c.getClusterListener()
	if clusterListener == nil {
		c.logger.Error("no cluster listener configured")
		return nil
	}

	clusterListener.AddClient(consts.RequestForwardingALPN, &requestForwardingClusterClient{
		core: c,
	})

	// Set up grpc forwarding handling
	// It's not really insecure, but we have to dial manually to get the
	// ALPN header right. It's just "insecure" because GRPC isn't managing
	// the TLS state.
	dctx, cancelFunc := context.WithCancel(ctx)

	opts := []grpc.DialOption{
		grpc.WithDialer(clusterListener.GetDialerFunc(ctx, consts.RequestForwardingALPN)),
		grpc.WithInsecure(), // it's not, we handle it in the dialer
		grpc.WithKeepaliveParams(keepalive.ClientParameters{
			Time: 2 * c.clusterHeartbeatInterval,
		}),
		grpc.WithDefaultCallOptions(
			grpc.MaxCallRecvMsgSize(math.MaxInt32),
			grpc.MaxCallSendMsgSize(math.MaxInt32),
		),
	}
	if c.grpcMinConnectTimeout != 0 {
		opts = append(opts, grpc.WithConnectParams(grpc.ConnectParams{
			MinConnectTimeout: c.grpcMinConnectTimeout,
			Backoff:           backoff.DefaultConfig,
		}))
	}
	c.rpcClientConn, err = grpc.DialContext(dctx, clusterURL.Host, opts...)
	if err != nil {
		cancelFunc()
		c.logger.Error("err setting up forwarding rpc client", "error", err)
		return err
	}
	c.rpcClientConnContext = dctx
	c.rpcClientConnCancelFunc = cancelFunc
	c.rpcForwardingClient = &forwardingClient{
		RequestForwardingClient: NewRequestForwardingClient(c.rpcClientConn),
		core:                    c,
		echoTicker:              time.NewTicker(c.clusterHeartbeatInterval),
		echoContext:             dctx,
	}
	c.rpcForwardingClient.startHeartbeat()

	return nil
}

func (c *Core) clearForwardingClients() {
	c.logger.Debug("clearing forwarding clients")
	defer c.logger.Debug("done clearing forwarding clients")

	if c.rpcClientConnCancelFunc != nil {
		c.rpcClientConnCancelFunc()
		c.rpcClientConnCancelFunc = nil
	}
	if c.rpcClientConn != nil {
		c.rpcClientConn.Close()
		c.rpcClientConn = nil
	}

	c.rpcClientConnContext = nil
	c.rpcForwardingClient = nil

	clusterListener := c.getClusterListener()
	if clusterListener != nil {
		clusterListener.RemoveClient(consts.RequestForwardingALPN)
	}
	c.clusterLeaderParams.Store((*ClusterLeaderParams)(nil))
}

// ForwardRequest forwards a given request to the active node and returns the
// response.
func (c *Core) ForwardRequest(req *http.Request) (int, http.Header, []byte, error) {
	// checking if the node is perfStandby here to avoid a deadlock between
	// Core.stateLock and Core.requestForwardingConnectionLock
	isPerfStandby := c.PerfStandby()
	c.requestForwardingConnectionLock.RLock()
	defer c.requestForwardingConnectionLock.RUnlock()

	if c.rpcForwardingClient == nil {
		return 0, nil, nil, ErrCannotForward
	}

	defer metrics.MeasureSince([]string{"ha", "rpc", "client", "forward"}, time.Now())

	origPath := req.URL.Path
	defer func() {
		req.URL.Path = origPath
	}()

	req.URL.Path = req.Context().Value("original_request_path").(string)

	freq, err := forwarding.GenerateForwardedRequest(req)
	if err != nil {
		c.logger.Error("error creating forwarding RPC request", "error", err)
		return 0, nil, nil, fmt.Errorf("error creating forwarding RPC request")
	}
	if freq == nil {
		c.logger.Error("got nil forwarding RPC request")
		return 0, nil, nil, fmt.Errorf("got nil forwarding RPC request")
	}
	resp, err := c.rpcForwardingClient.ForwardRequest(req.Context(), freq)
	if err != nil {
		metrics.IncrCounter([]string{"ha", "rpc", "client", "forward", "errors"}, 1)
		c.logger.Error("error during forwarded RPC request", "error", err)
		return 0, nil, nil, fmt.Errorf("error during forwarding RPC request")
	}

	var header http.Header
	if resp.HeaderEntries != nil {
		header = make(http.Header)
		for k, v := range resp.HeaderEntries {
			header[k] = v.Values
		}
	}

	// If we are a perf standby and the request was forwarded to the active node
	// we should attempt to wait for the WAL to ship to offer best effort read after
	// write guarantees
	if isPerfStandby && resp.LastRemoteWal > 0 {
		WaitUntilWALShipped(req.Context(), c, resp.LastRemoteWal)
	}

	return int(resp.StatusCode), header, resp.Body, nil
}