// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
|
|
|
|
package raft
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"net"
|
|
"strconv"
|
|
|
|
"golang.org/x/sync/errgroup"
|
|
"google.golang.org/grpc"
|
|
|
|
"github.com/hashicorp/go-hclog"
|
|
|
|
"github.com/hashicorp/consul/internal/storage"
|
|
"github.com/hashicorp/consul/internal/storage/inmem"
|
|
"github.com/hashicorp/consul/proto-public/pbresource"
|
|
|
|
pbstorage "github.com/hashicorp/consul/proto/private/pbstorage"
|
|
)
|
|
|
|
// NewBackend returns a storage backend that uses Raft for durable persistence
|
|
// and serves reads from an in-memory database. It's suitable for production use.
|
|
//
|
|
// It's not an entirely clean abstraction because rather than owning the Raft
|
|
// subsystem directly, it has to integrate with the existing FSM and related
|
|
// machinery from before generic resources.
|
|
//
|
|
// The given Handle will be used to apply logs and interrogate leadership state.
|
|
// In certain restricted circumstances, Handle may be nil, such as during tests
|
|
// that only exercise snapshot restoration, or when initializing a throwaway FSM
|
|
// during peers.json recovery - but calling any of the data access methods (read
|
|
// or write) will result in a panic.
|
|
//
|
|
// With Raft, writes and strongly consistent reads must be done on the leader.
|
|
// Backend implements a gRPC server, which followers will use to transparently
|
|
// forward operations to the leader. To do so, they will obtain a connection
|
|
// using Handle.DialLeader. Connections are cached for re-use, so when there's
|
|
// a new leader, you must call LeaderChanged to refresh the connection. Leaders
|
|
// must accept connections and hand them off by calling Backend.HandleConnection.
|
|
// Backend's gRPC client and server *DO NOT* handle TLS themselves, as they are
|
|
// intended to communicate over Consul's multiplexed server port (which handles
|
|
// TLS).
|
|
//
|
|
// For more information, see here:
|
|
// https://github.com/hashicorp/consul/tree/main/docs/resources#raft-storage-backend
|
|
//
|
|
// You must call Run before using the backend.
|
|
func NewBackend(h Handle, l hclog.Logger) (*Backend, error) {
|
|
s, err := inmem.NewStore()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
b := &Backend{handle: h, store: s}
|
|
b.forwardingServer = newForwardingServer(b)
|
|
b.forwardingClient = newForwardingClient(h, l)
|
|
return b, nil
|
|
}
|
|
|
|
// Handle provides glue for interacting with the Raft subsystem via existing
|
|
// machinery on consul.Server.
|
|
type Handle interface {
|
|
// Apply the given log message.
|
|
Apply(msg []byte) (any, error)
|
|
|
|
// IsLeader determines if this server is the Raft leader (so can handle writes).
|
|
IsLeader() bool
|
|
|
|
// EnsureStrongConsistency checks the server is able to handle consistent reads by
|
|
// verifying its leadership and checking the FSM has applied all queued writes.
|
|
EnsureStrongConsistency(ctx context.Context) error
|
|
|
|
// DialLeader dials a gRPC connection to the leader for forwarding.
|
|
DialLeader() (*grpc.ClientConn, error)
|
|
}
|
|
|
|
// Backend is the storage backend implementation that is backed by Raft.
type Backend struct {
	// handle is used to apply logs and to check leadership state.
	handle Handle
	// store is the local in-memory store that reads are served from.
	store *inmem.Store

	// forwardingServer and forwardingClient are the two ends of the path used
	// to forward operations to the leader.
	forwardingServer *forwardingServer
	forwardingClient *forwardingClient
}
|
|
|
|
// Run the backend until the given context is canceled. This method blocks, so
// it should be called in a goroutine.
func (b *Backend) Run(ctx context.Context) {
	eg, egCtx := errgroup.WithContext(ctx)

	// The store's Run does not return an error, so report success for it.
	eg.Go(func() error {
		b.store.Run(egCtx)
		return nil
	})

	eg.Go(func() error { return b.forwardingServer.run(egCtx) })

	// Run has no return value through which to surface a failure, so the
	// group's error is deliberately discarded.
	_ = eg.Wait()
}
|
|
|
|
// Read implements the storage.Backend interface.
//
// Eventually consistent reads are answered from the local store. Strongly
// consistent reads are answered locally when this server is the leader, and
// forwarded to the leader otherwise.
func (b *Backend) Read(ctx context.Context, consistency storage.ReadConsistency, id *pbresource.ID) (*pbresource.Resource, error) {
	switch consistency {
	case storage.EventualConsistency:
		// Easy case: leaders and followers alike can read the local store.
		return b.store.Read(id)
	case storage.StrongConsistency:
		// Handled below.
	default:
		return nil, fmt.Errorf("%w: unknown consistency: %s", storage.ErrInconsistent, consistency)
	}

	// We are not the leader, so forward the request to it.
	if !b.handle.IsLeader() {
		fwd, err := b.forwardingClient.read(ctx, &pbstorage.ReadRequest{Id: id})
		if err != nil {
			return nil, err
		}
		return fwd.GetResource(), nil
	}

	// We are the leader, so handle the request ourselves.
	return b.leaderRead(ctx, id)
}
|
|
|
|
// leaderRead reads the resource from the local store, after first checking
// that strong consistency can be guaranteed.
func (b *Backend) leaderRead(ctx context.Context, id *pbresource.ID) (*pbresource.Resource, error) {
	err := b.ensureStrongConsistency(ctx)
	if err != nil {
		return nil, err
	}

	return b.store.Read(id)
}
|
|
|
|
// WriteCAS implements the storage.Backend interface.
//
// On the leader the write is round-tripped through the Raft log; on any other
// server it is forwarded to the leader.
func (b *Backend) WriteCAS(ctx context.Context, res *pbresource.Resource) (*pbresource.Resource, error) {
	writeReq := &pbstorage.WriteRequest{Resource: res}

	// Not the leader: forward the write.
	if !b.handle.IsLeader() {
		fwd, err := b.forwardingClient.write(ctx, writeReq)
		if err != nil {
			return nil, err
		}
		return fwd.GetResource(), nil
	}

	// Leader: wrap the request in a log entry and apply it.
	entry := &pbstorage.Log{
		Type:    pbstorage.LogType_LOG_TYPE_WRITE,
		Request: &pbstorage.Log_Write{Write: writeReq},
	}
	applied, err := b.raftApply(entry)
	if err != nil {
		return nil, err
	}
	return applied.GetWrite().GetResource(), nil
}
|
|
|
|
// DeleteCAS implements the storage.Backend interface.
//
// On the leader the delete is round-tripped through the Raft log; on any other
// server it is forwarded to the leader.
func (b *Backend) DeleteCAS(ctx context.Context, id *pbresource.ID, version string) error {
	deleteReq := &pbstorage.DeleteRequest{Id: id, Version: version}

	// Not the leader: forward the delete.
	if !b.handle.IsLeader() {
		return b.forwardingClient.delete(ctx, deleteReq)
	}

	// Leader: wrap the request in a log entry and apply it. Only the error is
	// of interest; a delete has no payload to hand back.
	entry := &pbstorage.Log{
		Type:    pbstorage.LogType_LOG_TYPE_DELETE,
		Request: &pbstorage.Log_Delete{Delete: deleteReq},
	}
	_, err := b.raftApply(entry)
	return err
}
|
|
|
|
// List implements the storage.Backend interface.
//
// Eventually consistent lists are answered from the local store. Strongly
// consistent lists are answered locally when this server is the leader, and
// forwarded to the leader otherwise.
func (b *Backend) List(ctx context.Context, consistency storage.ReadConsistency, resType storage.UnversionedType, tenancy *pbresource.Tenancy, namePrefix string) ([]*pbresource.Resource, error) {
	switch consistency {
	case storage.EventualConsistency:
		// Easy case: leaders and followers alike can read the local store.
		return b.store.List(resType, tenancy, namePrefix)
	case storage.StrongConsistency:
		// Handled below.
	default:
		return nil, fmt.Errorf("%w: unknown consistency: %s", storage.ErrInconsistent, consistency)
	}

	// We are not the leader, so forward the request to it.
	if !b.handle.IsLeader() {
		listReq := &pbstorage.ListRequest{
			Type: &pbresource.Type{
				Group: resType.Group,
				Kind:  resType.Kind,
			},
			Tenancy:    tenancy,
			NamePrefix: namePrefix,
		}
		fwd, err := b.forwardingClient.list(ctx, listReq)
		if err != nil {
			return nil, err
		}
		return fwd.GetResources(), nil
	}

	// We are the leader, so handle the request ourselves.
	return b.leaderList(ctx, resType, tenancy, namePrefix)
}
|
|
|
|
// leaderList lists resources from the local store, after first checking that
// strong consistency can be guaranteed.
func (b *Backend) leaderList(ctx context.Context, resType storage.UnversionedType, tenancy *pbresource.Tenancy, namePrefix string) ([]*pbresource.Resource, error) {
	err := b.ensureStrongConsistency(ctx)
	if err != nil {
		return nil, err
	}

	return b.store.List(resType, tenancy, namePrefix)
}
|
|
|
|
// WatchList implements the storage.Backend interface.
func (b *Backend) WatchList(_ context.Context, resType storage.UnversionedType, tenancy *pbresource.Tenancy, namePrefix string) (storage.Watch, error) {
	// The watch is created directly on the local store; the context argument
	// is not used.
	return b.store.WatchList(resType, tenancy, namePrefix)
}
|
|
|
|
// ListByOwner implements the storage.Backend interface.
func (b *Backend) ListByOwner(_ context.Context, id *pbresource.ID) ([]*pbresource.Resource, error) {
	// The lookup is answered directly by the local store; the context
	// argument is not used.
	return b.store.ListByOwner(id)
}
|
|
|
|
// Apply is called by the FSM with the bytes of a Raft log entry, with Consul's
// envelope (i.e. type prefix and msgpack wrapper) stripped off. idx is used,
// formatted as a decimal string, as the version of a written resource.
//
// The returned value is either a *pbstorage.LogResponse on success, or an
// error describing why the entry could not be applied. Entries whose payload
// is missing (or does not match their type) are rejected with an error rather
// than dereferencing a nil message.
func (b *Backend) Apply(buf []byte, idx uint64) any {
	var req pbstorage.Log
	if err := req.UnmarshalBinary(buf); err != nil {
		return fmt.Errorf("failed to decode request: %w", err)
	}

	switch req.Type {
	case pbstorage.LogType_LOG_TYPE_WRITE:
		// The getters are nil-safe, so a missing payload surfaces here as a
		// nil resource.
		res := req.GetWrite().GetResource()
		if res == nil {
			return fmt.Errorf("%s log entry has no resource", req.Type)
		}

		oldVsn := res.Version
		// Format the index as an unsigned value: narrowing it through int
		// would truncate on platforms where int is 32 bits wide.
		res.Version = strconv.FormatUint(idx, 10)

		if err := b.store.WriteCAS(res, oldVsn); err != nil {
			return err
		}

		return &pbstorage.LogResponse{
			Response: &pbstorage.LogResponse_Write{
				Write: &pbstorage.WriteResponse{Resource: res},
			},
		}

	case pbstorage.LogType_LOG_TYPE_DELETE:
		del := req.GetDelete()
		if del == nil {
			return fmt.Errorf("%s log entry has no delete request", req.Type)
		}

		if err := b.store.DeleteCAS(del.Id, del.Version); err != nil {
			return err
		}

		return &pbstorage.LogResponse{
			Response: &pbstorage.LogResponse_Delete{},
		}
	}

	return fmt.Errorf("unexpected request type: %s", req.Type)
}
|
|
|
|
// LeaderChanged should be called whenever the current Raft leader changes, so
// that the gRPC connection used for forwarding is dropped and re-created.
func (b *Backend) LeaderChanged() {
	b.forwardingClient.leaderChanged()
}
|
|
|
|
// HandleConnection should be called whenever a forwarding connection is
// opened. The connection is handed to the forwarding server's listener.
func (b *Backend) HandleConnection(conn net.Conn) {
	b.forwardingServer.listener.Handle(conn)
}
|
|
|
|
// raftApply round trips the given request through the Raft log and FSM.
//
// It returns the FSM's *pbstorage.LogResponse on success. If the value handed
// back by Handle.Apply is itself an error, that error is returned unchanged so
// that callers can still inspect it with errors.Is / errors.As. Any other
// response type is reported as unexpected.
func (b *Backend) raftApply(req *pbstorage.Log) (*pbstorage.LogResponse, error) {
	msg, err := req.MarshalBinary()
	if err != nil {
		return nil, err
	}

	rsp, err := b.handle.Apply(msg)
	if err != nil {
		return nil, err
	}

	switch t := rsp.(type) {
	case *pbstorage.LogResponse:
		return t, nil
	case error:
		// Backend.Apply reports failures by returning an error as its
		// response value. If the Handle passes that value through rather
		// than converting it, preserve it instead of masking it below.
		return nil, t
	default:
		return nil, fmt.Errorf("unexpected response from Raft apply: %T", rsp)
	}
}
|
|
|
|
// ensureStrongConsistency asks the Handle whether consistent reads can be
// served, and reports any failure as storage.ErrInconsistent with the
// underlying error's text appended.
func (b *Backend) ensureStrongConsistency(ctx context.Context) error {
	err := b.handle.EnsureStrongConsistency(ctx)
	if err == nil {
		return nil
	}
	return fmt.Errorf("%w: %v", storage.ErrInconsistent, err)
}
|
|
|
|
// Snapshot obtains a point-in-time snapshot of the backend's state, so that it
// can be written to disk as a backup, or sent to bootstrap a follower.
func (b *Backend) Snapshot() (*Snapshot, error) {
	inner, err := b.store.Snapshot()
	if err != nil {
		return nil, err
	}

	return &Snapshot{s: inner}, nil
}
|
|
|
|
// Snapshot is a point-in-time snapshot of a backend's state.
type Snapshot struct {
	// s is the underlying in-memory store snapshot.
	s *inmem.Snapshot
}
|
|
|
|
// Next returns the next resource in the snapshot, protobuf encoded. Once the
// end of the snapshot has been reached, nil bytes are returned.
func (s *Snapshot) Next() ([]byte, error) {
	if res := s.s.Next(); res != nil {
		return res.MarshalBinary()
	}

	// Nothing left in the snapshot.
	return nil, nil
}
|
|
|
|
// Restore starts the process of restoring a snapshot (i.e. from an on-disk
// backup, or to bootstrap from a leader).
//
// Callers *must* call Abort or Commit when done, to free resources.
func (b *Backend) Restore() (*Restoration, error) {
	inner, err := b.store.Restore()
	if err != nil {
		return nil, err
	}

	return &Restoration{r: inner}, nil
}
|
|
|
|
// Restoration is a handle that can be used to restore a snapshot.
type Restoration struct {
	// r is the underlying in-memory store restoration.
	r *inmem.Restoration
}
|
|
|
|
// Apply decodes the given protobuf-encoded resource and applies it to the
// backend.
func (r *Restoration) Apply(msg []byte) error {
	res := new(pbresource.Resource)
	if err := res.UnmarshalBinary(msg); err != nil {
		return err
	}

	return r.r.Apply(res)
}
|
|
|
|
// Commit the restoration.
func (r *Restoration) Commit() {
	r.r.Commit()
}
|
|
|
|
// Abort the restoration. Aborting a committed restoration is a no-op, so it is
// safe to always call this in a defer statement.
func (r *Restoration) Abort() {
	r.r.Abort()
}
|