From 195511140f9a56f3c861efb559a6cff834eb93a3 Mon Sep 17 00:00:00 2001
From: Daniel Nephin <dnephin@hashicorp.com>
Date: Mon, 5 Oct 2020 17:31:35 -0400
Subject: [PATCH] config: add field for enabling streaming in the client

agent: register the new streaming cache-type
---
 agent/agent.go                       | 15 ++++++++-----
 agent/config/builder.go              |  2 ++
 agent/config/config.go               |  5 ++++-
 agent/config/runtime.go              |  2 ++
 agent/config/runtime_test.go         | 26 ++++++++++++----------
 agent/rpcclient/health/health.go     |  8 +++----
 agent/setup.go                       | 32 +++++++++++++++++++++++++---
 website/pages/docs/agent/options.mdx | 26 ++++++++++++++--------
 8 files changed, 83 insertions(+), 33 deletions(-)

diff --git a/agent/agent.go b/agent/agent.go
index 564d76711..351fde8ab 100644
--- a/agent/agent.go
+++ b/agent/agent.go
@@ -359,7 +359,11 @@ func New(bd BaseDeps) (*Agent, error) {
 		cache:           bd.Cache,
 	}
 
-	a.rpcClientHealth = &health.Client{Cache: bd.Cache, NetRPC: &a}
+	cacheName := cachetype.HealthServicesName
+	if bd.RuntimeConfig.CacheUseStreamingBackend {
+		cacheName = cachetype.StreamingHealthServicesName
+	}
+	a.rpcClientHealth = &health.Client{Cache: bd.Cache, NetRPC: &a, CacheName: cacheName}
 
 	a.serviceManager = NewServiceManager(&a)
 
@@ -3675,10 +3679,11 @@ func (a *Agent) LocalBlockingQuery(alwaysBlock bool, hash string, wait time.Dura
 	}
 }
 
-// registerCache configures the cache and registers all the supported
-// types onto the cache. This is NOT safe to call multiple times so
-// care should be taken to call this exactly once after the cache
-// field has been initialized.
+// registerCache types on a.cache.
+// This function may only be called once from New.
+//
+// Note: this function no longer registers all cache-types. Newer cache-types
+// that do not depend on Agent are registered from registerCacheTypes.
 func (a *Agent) registerCache() {
 	// Note that you should register the _agent_ as the RPC implementation and not
 	// the a.delegate directly, otherwise tests that rely on overriding RPC
diff --git a/agent/config/builder.go b/agent/config/builder.go
index 1d2cdddfe..4d5a42dd4 100644
--- a/agent/config/builder.go
+++ b/agent/config/builder.go
@@ -1091,6 +1091,8 @@ func (b *Builder) Build() (rt RuntimeConfig, err error) {
 		Watches:                     c.Watches,
 	}
 
+	rt.CacheUseStreamingBackend = b.boolVal(c.Cache.UseStreamingBackend)
+
 	if rt.Cache.EntryFetchMaxBurst <= 0 {
 		return RuntimeConfig{}, fmt.Errorf("cache.entry_fetch_max_burst must be strictly positive, was: %v", rt.Cache.EntryFetchMaxBurst)
 	}
diff --git a/agent/config/config.go b/agent/config/config.go
index 97f3cec2d..890e5c746 100644
--- a/agent/config/config.go
+++ b/agent/config/config.go
@@ -97,12 +97,15 @@ func (l LiteralSource) Parse() (Config, mapstructure.Metadata, error) {
 	return l.Config, mapstructure.Metadata{}, nil
 }
 
-// Cache is the tunning configuration for cache, values are optional
+// Cache configuration for the agent/cache.
 type Cache struct {
 	// EntryFetchMaxBurst max burst size of RateLimit for a single cache entry
 	EntryFetchMaxBurst *int `json:"entry_fetch_max_burst,omitempty" hcl:"entry_fetch_max_burst" mapstructure:"entry_fetch_max_burst"`
 	// EntryFetchRate represents the max calls/sec for a single cache entry
 	EntryFetchRate *float64 `json:"entry_fetch_rate,omitempty" hcl:"entry_fetch_rate" mapstructure:"entry_fetch_rate"`
+	// UseStreamingBackend instead of blocking queries to populate the cache.
+	// Only supported by some cache types.
+	UseStreamingBackend *bool `json:"use_streaming_backend" hcl:"use_streaming_backend" mapstructure:"use_streaming_backend"`
 }
 
 // Config defines the format of a configuration file in either JSON or
diff --git a/agent/config/runtime.go b/agent/config/runtime.go
index 8dc4b9866..ad3d042fd 100644
--- a/agent/config/runtime.go
+++ b/agent/config/runtime.go
@@ -937,6 +937,8 @@ type RuntimeConfig struct {
 
 	RPCConfig consul.RPCConfig
 
+	CacheUseStreamingBackend bool
+
 	// RaftProtocol sets the Raft protocol version to use on this server.
 	// Defaults to 3.
 	//
diff --git a/agent/config/runtime_test.go b/agent/config/runtime_test.go
index b4cf06b7c..990dc5fd7 100644
--- a/agent/config/runtime_test.go
+++ b/agent/config/runtime_test.go
@@ -4878,7 +4878,8 @@ func TestFullConfig(t *testing.T) {
 			"bootstrap_expect": 53,
 			"cache": {
 				"entry_fetch_max_burst": 42,
-				"entry_fetch_rate": 0.334
+				"entry_fetch_rate": 0.334,
+				"use_streaming_backend": true
 			},
 			"ca_file": "erA7T0PM",
 			"ca_path": "mQEN1Mfp",
@@ -5561,6 +5562,7 @@ func TestFullConfig(t *testing.T) {
 			cache = {
 				entry_fetch_max_burst = 42
 				entry_fetch_rate = 0.334
+				use_streaming_backend = true
 			},
 			ca_file = "erA7T0PM"
 			ca_path = "mQEN1Mfp"
@@ -6854,16 +6856,17 @@ func TestFullConfig(t *testing.T) {
 				},
 			},
 		},
-		SerfAdvertiseAddrLAN: tcpAddr("17.99.29.16:8301"),
-		SerfAdvertiseAddrWAN: tcpAddr("78.63.37.19:8302"),
-		SerfBindAddrLAN:      tcpAddr("99.43.63.15:8301"),
-		SerfBindAddrWAN:      tcpAddr("67.88.33.19:8302"),
-		SerfAllowedCIDRsLAN:  []net.IPNet{},
-		SerfAllowedCIDRsWAN:  []net.IPNet{},
-		SessionTTLMin:        26627 * time.Second,
-		SkipLeaveOnInt:       true,
-		StartJoinAddrsLAN:    []string{"LR3hGDoG", "MwVpZ4Up"},
-		StartJoinAddrsWAN:    []string{"EbFSc3nA", "kwXTh623"},
+		CacheUseStreamingBackend: true,
+		SerfAdvertiseAddrLAN:     tcpAddr("17.99.29.16:8301"),
+		SerfAdvertiseAddrWAN:     tcpAddr("78.63.37.19:8302"),
+		SerfBindAddrLAN:          tcpAddr("99.43.63.15:8301"),
+		SerfBindAddrWAN:          tcpAddr("67.88.33.19:8302"),
+		SerfAllowedCIDRsLAN:      []net.IPNet{},
+		SerfAllowedCIDRsWAN:      []net.IPNet{},
+		SessionTTLMin:            26627 * time.Second,
+		SkipLeaveOnInt:           true,
+		StartJoinAddrsLAN:        []string{"LR3hGDoG", "MwVpZ4Up"},
+		StartJoinAddrsWAN:        []string{"EbFSc3nA", "kwXTh623"},
 		Telemetry: lib.TelemetryConfig{
 			CirconusAPIApp:                     "p4QOTe9j",
 			CirconusAPIToken:                   "E3j35V23",
@@ -7501,6 +7504,7 @@ func TestSanitize(t *testing.T) {
 		"SerfBindAddrWAN": "",
 		"SerfPortLAN": 0,
 		"SerfPortWAN": 0,
+		"CacheUseStreamingBackend": false,
 		"ServerMode": false,
 		"ServerName": "",
 		"ServerPort": 0,
diff --git a/agent/rpcclient/health/health.go b/agent/rpcclient/health/health.go
index 4c8d5f4d8..09fe452ab 100644
--- a/agent/rpcclient/health/health.go
+++ b/agent/rpcclient/health/health.go
@@ -4,13 +4,13 @@ import (
 	"context"
 
 	"github.com/hashicorp/consul/agent/cache"
-	cachetype "github.com/hashicorp/consul/agent/cache-types"
 	"github.com/hashicorp/consul/agent/structs"
 )
 
 type Client struct {
-	NetRPC NetRPC
-	Cache  CacheGetter
+	NetRPC    NetRPC
+	Cache     CacheGetter
+	CacheName string
 }
 
 type NetRPC interface {
@@ -51,7 +51,7 @@ func (c *Client) getServiceNodes(
 		return out, cache.ResultMeta{}, err
 	}
 
-	raw, md, err := c.Cache.Get(ctx, cachetype.HealthServicesName, &req)
+	raw, md, err := c.Cache.Get(ctx, c.CacheName, &req)
 	if err != nil {
 		return out, md, err
 	}
diff --git a/agent/setup.go b/agent/setup.go
index 213ef304e..7c65777c9 100644
--- a/agent/setup.go
+++ b/agent/setup.go
@@ -7,8 +7,12 @@ import (
 	"net/http"
 	"time"
 
+	"github.com/hashicorp/go-hclog"
+	"google.golang.org/grpc/grpclog"
+
 	autoconf "github.com/hashicorp/consul/agent/auto-config"
 	"github.com/hashicorp/consul/agent/cache"
+	cachetype "github.com/hashicorp/consul/agent/cache-types"
 	"github.com/hashicorp/consul/agent/config"
 	"github.com/hashicorp/consul/agent/consul"
 	"github.com/hashicorp/consul/agent/grpc"
@@ -19,9 +23,8 @@ import (
 	"github.com/hashicorp/consul/ipaddr"
 	"github.com/hashicorp/consul/lib"
 	"github.com/hashicorp/consul/logging"
+	"github.com/hashicorp/consul/proto/pbsubscribe"
 	"github.com/hashicorp/consul/tlsutil"
-	"github.com/hashicorp/go-hclog"
-	"google.golang.org/grpc/grpclog"
 )
 
 // TODO: BaseDeps should be renamed in the future once more of Agent.Start
@@ -84,7 +87,6 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer) (BaseDeps, error)
 	d.Cache = cache.New(cfg.Cache)
 	d.ConnPool = newConnPool(cfg, d.Logger, d.TLSConfigurator)
 
-	// TODO(streaming): setConfig.Scheme name for tests
 	builder := resolver.NewServerResolverBuilder(resolver.Config{})
 	resolver.RegisterWithGRPC(builder)
 	d.GRPCConnPool = grpc.NewClientConnPool(builder, grpc.TLSWrapper(d.TLSConfigurator.OutgoingRPCWrapper()))
@@ -105,9 +107,33 @@ func NewBaseDeps(configLoader ConfigLoader, logOut io.Writer) (BaseDeps, error)
 		return d, err
 	}
 
+	if err := registerCacheTypes(d); err != nil {
+		return d, err
+	}
+
 	return d, nil
 }
 
+// registerCacheTypes on bd.Cache.
+//
+// Note: most cache types are still registered in Agent.registerCache. This
+// function is for registering newer cache-types which no longer have a dependency
+// on Agent.
+func registerCacheTypes(bd BaseDeps) error {
+	if bd.RuntimeConfig.CacheUseStreamingBackend {
+		conn, err := bd.GRPCConnPool.ClientConn(bd.RuntimeConfig.Datacenter)
+		if err != nil {
+			return err
+		}
+		matDeps := cachetype.MaterializerDeps{
+			Client: pbsubscribe.NewStateChangeSubscriptionClient(conn),
+			Logger: bd.Logger,
+		}
+		bd.Cache.RegisterType(cachetype.StreamingHealthServicesName, cachetype.NewStreamingHealthServices(matDeps))
+	}
+	return nil
+}
+
 func newConnPool(config *config.RuntimeConfig, logger hclog.Logger, tls *tlsutil.Configurator) *pool.ConnPool {
 	var rpcSrcAddr *net.TCPAddr
 	if !ipaddr.IsAny(config.RPCBindAddr) {
diff --git a/website/pages/docs/agent/options.mdx b/website/pages/docs/agent/options.mdx
index 64dad15f0..d178a0ee4 100644
--- a/website/pages/docs/agent/options.mdx
+++ b/website/pages/docs/agent/options.mdx
@@ -1128,14 +1128,14 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'."
   </Tab>
   </Tabs>
 
-- `cache` Cache configuration of agent. The configurable values are the following:
+- `cache` configuration for client agents. The configurable values are the following:
 
-  - `entry_fetch_max_burst`: The size of the token bucket used to recharge the rate-limit per
+  - `entry_fetch_max_burst` The size of the token bucket used to recharge the rate-limit per
     cache entry. The default value is 2 and means that when cache has not been updated
     for a long time, 2 successive queries can be made as long as the rate-limit is not
     reached.
 
-  - `entry_fetch_rate`: configures the rate-limit at which the cache may refresh a single
+  - `entry_fetch_rate` configures the rate-limit at which the cache may refresh a single
     entry. On a cluster with many changes/s, watching changes in the cache might put high
     pressure on the servers. This ensures the number of requests for a single cache entry
     will never go beyond this limit, even when a given service changes every 1/100s.
@@ -1146,6 +1146,13 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'."
     The default value is "No limit" and should be tuned on large
     clusters to avoid performing too many RPCs on entries changing a lot.
 
+  - `use_streaming_backend` When enabled, Consul client agents will use streaming RPC to
+    populate the cache, instead of the traditional blocking queries. All servers must
+    have [`rpc.enable_streaming`](#rpc_enable_streaming) enabled before any client can enable `use_streaming_backend`.
+    At least one of [`dns.use_cache`](#dns_use_cache) or
+    [`http_config.use_cache`](#http_config_use_cache) must be enabled; otherwise,
+    this setting has no effect.
+
 - `ca_file` This provides a file path to a PEM-encoded certificate
   authority. The certificate authority is used to check the authenticity of client
   and server connections with the appropriate [`verify_incoming`](#verify_incoming)
@@ -1618,7 +1625,7 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'."
 
   - `allow_write_http_from` This object is a list of networks in CIDR notation (eg "127.0.0.0/8") that are allowed to call the agent write endpoints. It defaults to an empty list, which means all networks are allowed. This is used to make the agent read-only, except for select ip ranges. - To block write calls from anywhere, use `[ "255.255.255.255/32" ]`. - To only allow write calls from localhost, use `[ "127.0.0.0/8" ]` - To only allow specific IPs, use `[ "10.0.0.1/32", "10.0.0.2/32" ]`
 
-  - `use_cache` Defaults to true. If disabled, the agent won't be using [agent caching](/api/features/caching) to answer the request. Even when the url parameter is provided.
+  - `use_cache` ((#http_config_use_cache)) Defaults to true. If disabled, the agent won't be using [agent caching](/api/features/caching) to answer the request. Even when the url parameter is provided.
 
 - `leave_on_terminate` If enabled, when the agent receives a TERM signal, it will send a `Leave` message to the rest of the cluster and gracefully leave. The default behavior for this feature varies based on whether or not the agent is running as a client or a server (prior to Consul 0.7 the default value was unconditionally set to `false`). On agents in client-mode, this defaults to `true` and for agents in server-mode, this defaults to `false`.
 
@@ -1633,11 +1640,6 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'."
   - `rpc_max_conns_per_client` - Configures a limit of how many concurrent TCP connections a single source IP address is allowed to open to a single server. It affects both clients connections and other server connections. In general Consul clients multiplex many RPC calls over a single TCP connection so this can typically be kept low. It needs to be more than one though since servers open at least one additional connection for raft RPC, possibly more for WAN federation when using network areas, and snapshot requests from clients run over a separate TCP conn. A reasonably low limit significantly reduces the ability of an unauthenticated attacker to consume unbounded resources by holding open many connections. You may need to increase this if WAN federated servers connect via proxies or NAT gateways or similar causing many legitimate connections from a single source IP. Default value is `100` which is designed to be extremely conservative to limit issues with certain deployment patterns. Most deployments can probably reduce this safely. 100 connections on modern server hardware should not cause a significant impact on resource usage from an unauthenticated attacker though.
   - `rpc_rate` - Configures the RPC rate limiter on Consul _clients_ by setting the maximum request rate that this agent is allowed to make for RPC requests to Consul servers, in requests per second. Defaults to infinite, which disables rate limiting.
   - `rpc_max_burst` - The size of the token bucket used to recharge the RPC rate limiter on Consul _clients_. Defaults to 1000 tokens, and each token is good for a single RPC call to a Consul server. See https://en.wikipedia.org/wiki/Token_bucket for more details about how token bucket rate limiters operate.
-
-  - `rpc.enable_streaming` - Enable the gRPC subscribe endpoint on a Consul Server. All
-    Servers in all connected datacenters must have this enabled before any client can use
-    streaming.
-
   - `kv_max_value_size` - **(Advanced)** Configures the maximum number of bytes for a kv request body to the [`/v1/kv`](/api/kv) endpoint. This limit defaults to [raft's](https://github.com/hashicorp/raft) suggested max size (512KB). **Note that tuning these improperly can cause Consul to fail in unexpected ways**, it may potentially affect leadership stability and prevent timely heartbeat signals by increasing RPC IO duration. This option affects the txn endpoint too, but Consul 1.7.2 introduced `txn_max_req_len` which is the preferred way to set the limit for the txn endpoint. If both limits are set, the higher one takes precedence.
   - `txn_max_req_len` - **(Advanced)** Configures the maximum number of bytes for a transaction request body to the [`/v1/txn`](/api/txn) endpoint. This limit defaults to [raft's](https://github.com/hashicorp/raft) suggested max size (512KB). **Note that tuning these improperly can cause Consul to fail in unexpected ways**, it may potentially affect leadership stability and prevent timely heartbeat signals by increasing RPC IO duration.
 
@@ -1820,6 +1822,12 @@ Valid time units are 'ns', 'us' (or 'µs'), 'ms', 's', 'm', 'h'."
 
 - `retry_interval_wan` Equivalent to the [`-retry-interval-wan` command-line flag](#_retry_interval_wan).
 
+- `rpc` configuration for Consul servers.
+
+  - `enable_streaming` ((#rpc_enable_streaming)) enables the gRPC subscribe endpoint on a Consul server. All
+    servers in all federated datacenters must have this enabled before any client can use
+    [`cache.use_streaming_backend`](#use_streaming_backend). This setting will default to true in a future release of Consul.
+
 - `segment` <EnterpriseAlert inline /> - Equivalent to the [`-segment` command-line flag](#_segment).
 
 - `segments` <EnterpriseAlert inline /> - This is a list of nested objects