config: add field for enabling streaming in the client

agent: register the new streaming cache-type
2020-10-05 17:31:35 -04:00 · 2020-10-05 17:31:35 -04:00 · 195511140f
parent 05df7b18a9
commit 195511140f
8 changed files with 83 additions and 33 deletions
--- a/agent/agent.go
+++ b/agent/agent.go
@ -359,7 +359,11 @@ func New(bd BaseDeps) (*Agent, error) {
 		cache:           bd.Cache,
 	}

-	a.rpcClientHealth = &health.Client{Cache: bd.Cache, NetRPC: &a}
+	cacheName := cachetype.HealthServicesName
+	if bd.RuntimeConfig.CacheUseStreamingBackend {
+		cacheName = cachetype.StreamingHealthServicesName
+	}
+	a.rpcClientHealth = &health.Client{Cache: bd.Cache, NetRPC: &a, CacheName: cacheName}

 	a.serviceManager = NewServiceManager(&a)

@ -3675,10 +3679,11 @@ func (a *Agent) LocalBlockingQuery(alwaysBlock bool, hash string, wait time.Dura
 	}
 }

-// registerCache configures the cache and registers all the supported
-// types onto the cache. This is NOT safe to call multiple times so
-// care should be taken to call this exactly once after the cache
-// field has been initialized.
+// registerCache registers cache types on a.cache.
+// This function may only be called once, from New.
+//
+// Note: this function no longer registers all cache-types. Newer cache-types
+// that do not depend on Agent are registered from registerCacheTypes.
 func (a *Agent) registerCache() {
 	// Note that you should register the _agent_ as the RPC implementation and not
 	// the a.delegate directly, otherwise tests that rely on overriding RPC
--- a/agent/config/builder.go
+++ b/agent/config/builder.go
@ -1091,6 +1091,8 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
 		Watches:                     c.Watches,
 	}

+	rt.CacheUseStreamingBackend = b.boolVal(c.Cache.UseStreamingBackend)
+
 	if rt.Cache.EntryFetchMaxBurst <= 0 {
 		return RuntimeConfig{}, fmt.Errorf("cache.entry_fetch_max_burst must be strictly positive, was: %v", rt.Cache.EntryFetchMaxBurst)
 	}
--- a/agent/config/config.go
+++ b/agent/config/config.go
@ -97,12 +97,15 @@ func (l LiteralSource) Parse() (Config, mapstructure.Metadata, error) {
 	return l.Config, mapstructure.Metadata{}, nil
 }

-// Cache is the tunning configuration for cache, values are optional
+// Cache configuration for the agent/cache.
 type Cache struct {
 	// EntryFetchMaxBurst max burst size of RateLimit for a single cache entry
 	EntryFetchMaxBurst *int `json:"entry_fetch_max_burst,omitempty" hcl:"entry_fetch_max_burst" mapstructure:"entry_fetch_max_burst"`
 	// EntryFetchRate represents the max calls/sec for a single cache entry
 	EntryFetchRate *float64 `json:"entry_fetch_rate,omitempty" hcl:"entry_fetch_rate" mapstructure:"entry_fetch_rate"`
+	// UseStreamingBackend enables streaming, instead of blocking queries, to
+	// populate the cache. Only supported by some cache types.
+	UseStreamingBackend *bool `json:"use_streaming_backend" hcl:"use_streaming_backend" mapstructure:"use_streaming_backend"`
 }

 // Config defines the format of a configuration file in either JSON or
--- a/agent/config/runtime.go
+++ b/agent/config/runtime.go
@ -937,6 +937,8 @@ type RuntimeConfig struct {

 	RPCConfig consul.RPCConfig

+	CacheUseStreamingBackend bool
+
 	// RaftProtocol sets the Raft protocol version to use on this server.
 	// Defaults to 3.
 	//
--- a/agent/config/runtime_test.go
+++ b/agent/config/runtime_test.go
@ -4878,7 +4878,8 @@ func TestFullConfig(t *testing.T) {
 			"bootstrap_expect": 53,
 			"cache": {
 				"entry_fetch_max_burst": 42,
-				"entry_fetch_rate": 0.334
+				"entry_fetch_rate": 0.334,
+				"use_streaming_backend": true
 			},
 			"ca_file": "erA7T0PM",
 			"ca_path": "mQEN1Mfp",
@ -5561,6 +5562,7 @@ func TestFullConfig(t *testing.T) {
 			cache = {
 				entry_fetch_max_burst = 42
 				entry_fetch_rate = 0.334
+				use_streaming_backend = true
 			},
 			ca_file = "erA7T0PM"
 			ca_path = "mQEN1Mfp"
@ -6854,16 +6856,17 @@ func TestFullConfig(t *testing.T) {
 				},
 			},
 		},
-		SerfAdvertiseAddrLAN: tcpAddr("17.99.29.16:8301"),
-		SerfAdvertiseAddrWAN: tcpAddr("78.63.37.19:8302"),
-		SerfBindAddrLAN:      tcpAddr("99.43.63.15:8301"),
-		SerfBindAddrWAN:      tcpAddr("67.88.33.19:8302"),
-		SerfAllowedCIDRsLAN:  []net.IPNet{},
-		SerfAllowedCIDRsWAN:  []net.IPNet{},
-		SessionTTLMin:        26627 * time.Second,
-		SkipLeaveOnInt:       true,
-		StartJoinAddrsLAN:    []string{"LR3hGDoG", "MwVpZ4Up"},
-		StartJoinAddrsWAN:    []string{"EbFSc3nA", "kwXTh623"},
+		CacheUseStreamingBackend: true,
+		SerfAdvertiseAddrLAN:     tcpAddr("17.99.29.16:8301"),
+		SerfAdvertiseAddrWAN:     tcpAddr("78.63.37.19:8302"),
+		SerfBindAddrLAN:          tcpAddr("99.43.63.15:8301"),
+		SerfBindAddrWAN:          tcpAddr("67.88.33.19:8302"),
+		SerfAllowedCIDRsLAN:      []net.IPNet{},
+		SerfAllowedCIDRsWAN:      []net.IPNet{},
+		SessionTTLMin:            26627 * time.Second,
+		SkipLeaveOnInt:           true,
+		StartJoinAddrsLAN:        []string{"LR3hGDoG", "MwVpZ4Up"},
+		StartJoinAddrsWAN:        []string{"EbFSc3nA", "kwXTh623"},
 		Telemetry: lib.TelemetryConfig{
 			CirconusAPIApp:                     "p4QOTe9j",
 			CirconusAPIToken:                   "E3j35V23",
@ -7501,6 +7504,7 @@ func TestSanitize(t *testing.T) {
 		"SerfBindAddrWAN": "",
 		"SerfPortLAN": 0,
 		"SerfPortWAN": 0,
+		"CacheUseStreamingBackend": false,
 		"ServerMode": false,
 		"ServerName": "",
 		"ServerPort": 0,
--- a/agent/rpcclient/health/health.go
+++ b/agent/rpcclient/health/health.go
@ -4,13 +4,13 @@ import (
 	"context"

 	"github.com/hashicorp/consul/agent/cache"
-	cachetype "github.com/hashicorp/consul/agent/cache-types"
 	"github.com/hashicorp/consul/agent/structs"
 )

 type Client struct {
-	NetRPC NetRPC
-	Cache  CacheGetter
+	NetRPC    NetRPC
+	Cache     CacheGetter
+	CacheName string
 }

 type NetRPC interface {
@ -51,7 +51,7 @@ func (c *Client) getServiceNodes(
 		return out, cache.ResultMeta{}, err
 	}

-	raw, md, err := c.Cache.Get(ctx, cachetype.HealthServicesName, &req)
+	raw, md, err := c.Cache.Get(ctx, c.CacheName, &req)
 	if err != nil {
 		return out, md, err
 	}
--- a/agent/setup.go
+++ b/agent/setup.go
@ -7,8 +7,12 @@ import (
 	"net/http"
 	"time"

+	"github.com/hashicorp/go-hclog"
+	"google.golang.org/grpc/grpclog"
+
 	autoconf "github.com/hashicorp/consul/agent/auto-config"
 	"github.com/hashicorp/consul/agent/cache"
+	cachetype "github.com/hashicorp/consul/agent/cache-types"
 	"github.com/hashicorp/consul/agent/config"
 	"github.com/hashicorp/consul/agent/consul"
 	"github.com/hashicorp/consul/agent/grpc"
@ -19,9 +23,8 @@ import (
 	"github.com/hashicorp/consul/ipaddr"
 	"github.com/hashicorp/consul/lib"
 	"github.com/hashicorp/consul/logging"
+	"github.com/hashicorp/consul/proto/pbsubscribe"
 	"github.com/hashicorp/consul/tlsutil"
-	"github.com/hashicorp/go-hclog"
-	"google.golang.org/grpc/grpclog"
 )

 // TODO: BaseDeps should be renamed in the future once more of Agent.Start
@ -84,7 +87,6 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer) (BaseDeps, error)
 	d.Cache = cache.New(cfg.Cache)
 	d.ConnPool = newConnPool(cfg, d.Logger, d.TLSConfigurator)

-	// TODO(streaming): setConfig.Scheme name for tests
 	builder := resolver.NewServerResolverBuilder(resolver.Config{})
 	resolver.RegisterWithGRPC(builder)
 	d.GRPCConnPool = grpc.NewClientConnPool(builder, grpc.TLSWrapper(d.TLSConfigurator.OutgoingRPCWrapper()))
@ -105,9 +107,33 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer) (BaseDeps, error)
 		return d, err
 	}

+	if err := registerCacheTypes(d); err != nil {
+		return d, err
+	}
+
 	return d, nil
 }

+// registerCacheTypes registers cache types on bd.Cache.
+//
+// Note: most cache types are still registered in Agent.registerCache. This
+// function is for registering newer cache-types which no longer have a dependency
+// on Agent.
+func registerCacheTypes(bd BaseDeps) error {
+	if bd.RuntimeConfig.CacheUseStreamingBackend {
+		conn, err := bd.GRPCConnPool.ClientConn(bd.RuntimeConfig.Datacenter)
+		if err != nil {
+			return err
+		}
+		matDeps := cachetype.MaterializerDeps{
+			Client: pbsubscribe.NewStateChangeSubscriptionClient(conn),
+			Logger: bd.Logger,
+		}
+		bd.Cache.RegisterType(cachetype.StreamingHealthServicesName, cachetype.NewStreamingHealthServices(matDeps))
+	}
+	return nil
+}
+
 func newConnPool(config *config.RuntimeConfig, logger hclog.Logger, tls *tlsutil.Configurator) *pool.ConnPool {
 	var rpcSrcAddr *net.TCPAddr
 	if !ipaddr.IsAny(config.RPCBindAddr) {
--- a/website/pages/docs/agent/options.mdx
+++ b/website/pages/docs/agent/options.mdx
@ -1128,14 +1128,14 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'."
  </Tab>
  </Tabs>

- `cache` Cache configuration of agent. The configurable values are the following:
+- `cache` configuration for client agents. The configurable values are the following:

-  - `entry_fetch_max_burst`: The size of the token bucket used to recharge the rate-limit per
+  - `entry_fetch_max_burst` The size of the token bucket used to recharge the rate-limit per
    cache entry. The default value is 2 and means that when cache has not been updated
    for a long time, 2 successive queries can be made as long as the rate-limit is not
    reached.

-  - `entry_fetch_rate`: configures the rate-limit at which the cache may refresh a single
+  - `entry_fetch_rate` configures the rate-limit at which the cache may refresh a single
    entry. On a cluster with many changes/s, watching changes in the cache might put high
    pressure on the servers. This ensures the number of requests for a single cache entry
    will never go beyond this limit, even when a given service changes every 1/100s.
@ -1146,6 +1146,13 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'."
    The default value is "No limit" and should be tuned on large
    clusters to avoid performing too many RPCs on entries changing a lot.

+  - `use_streaming_backend` when enabled, Consul client agents will use streaming RPC to
+    populate the cache, instead of the traditional blocking queries. All servers must
+    have [`rpc.enable_streaming`](#rpc_enable_streaming) enabled before any client can enable `use_streaming_backend`.
+    At least one of [`dns.use_cache`](#dns_use_cache) or
+    [`http_config.use_cache`](#http_config_use_cache) must be enabled; otherwise,
+    this setting has no effect.
+
 - `ca_file` This provides a file path to a PEM-encoded certificate
  authority. The certificate authority is used to check the authenticity of client
  and server connections with the appropriate [`verify_incoming`](#verify_incoming)
@ -1618,7 +1625,7 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'."

  - `allow_write_http_from` This object is a list of networks in CIDR notation (eg "127.0.0.0/8") that are allowed to call the agent write endpoints. It defaults to an empty list, which means all networks are allowed. This is used to make the agent read-only, except for select ip ranges. - To block write calls from anywhere, use `[ "255.255.255.255/32" ]`. - To only allow write calls from localhost, use `[ "127.0.0.0/8" ]` - To only allow specific IPs, use `[ "10.0.0.1/32", "10.0.0.2/32" ]`

-  - `use_cache` Defaults to true. If disabled, the agent won't be using [agent caching](/api/features/caching) to answer the request. Even when the url parameter is provided.
+  - `use_cache` ((#http_config_use_cache)) Defaults to true. If disabled, the agent won't be using [agent caching](/api/features/caching) to answer the request. Even when the url parameter is provided.

 - `leave_on_terminate` If enabled, when the agent receives a TERM signal, it will send a `Leave` message to the rest of the cluster and gracefully leave. The default behavior for this feature varies based on whether or not the agent is running as a client or a server (prior to Consul 0.7 the default value was unconditionally set to `false`). On agents in client-mode, this defaults to `true` and for agents in server-mode, this defaults to `false`.

@ -1633,11 +1640,6 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'."
  - `rpc_max_conns_per_client` - Configures a limit of how many concurrent TCP connections a single source IP address is allowed to open to a single server. It affects both clients connections and other server connections. In general Consul clients multiplex many RPC calls over a single TCP connection so this can typically be kept low. It needs to be more than one though since servers open at least one additional connection for raft RPC, possibly more for WAN federation when using network areas, and snapshot requests from clients run over a separate TCP conn. A reasonably low limit significantly reduces the ability of an unauthenticated attacker to consume unbounded resources by holding open many connections. You may need to increase this if WAN federated servers connect via proxies or NAT gateways or similar causing many legitimate connections from a single source IP. Default value is `100` which is designed to be extremely conservative to limit issues with certain deployment patterns. Most deployments can probably reduce this safely. 100 connections on modern server hardware should not cause a significant impact on resource usage from an unauthenticated attacker though.
  - `rpc_rate` - Configures the RPC rate limiter on Consul _clients_ by setting the maximum request rate that this agent is allowed to make for RPC requests to Consul servers, in requests per second. Defaults to infinite, which disables rate limiting.
  - `rpc_max_burst` - The size of the token bucket used to recharge the RPC rate limiter on Consul _clients_. Defaults to 1000 tokens, and each token is good for a single RPC call to a Consul server. See https://en.wikipedia.org/wiki/Token_bucket for more details about how token bucket rate limiters operate.
-
-  - `rpc.enable_streaming` - Enable the gRPC subscribe endpoint on a Consul Server. All
-    Servers in all connected datacenters must have this enabled before any client can use
-    streaming.
-
  - `kv_max_value_size` - **(Advanced)** Configures the maximum number of bytes for a kv request body to the [`/v1/kv`](/api/kv) endpoint. This limit defaults to [raft's](https://github.com/hashicorp/raft) suggested max size (512KB). **Note that tuning these improperly can cause Consul to fail in unexpected ways**, it may potentially affect leadership stability and prevent timely heartbeat signals by increasing RPC IO duration. This option affects the txn endpoint too, but Consul 1.7.2 introduced `txn_max_req_len` which is the preferred way to set the limit for the txn endpoint. If both limits are set, the higher one takes precedence.
  - `txn_max_req_len` - **(Advanced)** Configures the maximum number of bytes for a transaction request body to the [`/v1/txn`](/api/txn) endpoint. This limit defaults to [raft's](https://github.com/hashicorp/raft) suggested max size (512KB). **Note that tuning these improperly can cause Consul to fail in unexpected ways**, it may potentially affect leadership stability and prevent timely heartbeat signals by increasing RPC IO duration.

@ -1820,6 +1822,12 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'."

 - `retry_interval_wan` Equivalent to the [`-retry-interval-wan` command-line flag](#_retry_interval_wan).

+- `rpc` configuration for Consul servers.
+
+  - `enable_streaming` ((#rpc_enable_streaming)) enables the gRPC subscribe endpoint on a Consul Server. All
+    servers in all federated datacenters must have this enabled before any client can use
+    [`cache.use_streaming_backend`](#use_streaming_backend). This setting will default to true in a future release of Consul.
+
 - `segment` <EnterpriseAlert inline /> - Equivalent to the [`-segment` command-line flag](#_segment).

 - `segments` <EnterpriseAlert inline /> - This is a list of nested objects