Return 429 response on HTTP max connection limit (#13621)

Return 429 response on HTTP max connection limit. Instead of silently closing
the connection, return a `429 Too Many Requests` HTTP response with a helpful
error message to aid debugging when the connection limit is unintentionally
reached.

Set a 10-millisecond write timeout and rate limiter for connection-limit 429
response to prevent writing the HTTP response from consuming too many server
resources.

Add `nomad.agent.http.exceeded metric` counting the number of HTTP connections
exceeding concurrency limit.
This commit is contained in:
Will Jordan 2022-07-20 11:12:21 -07:00 committed by GitHub
parent 301c6e57f5
commit 5354409b1a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 55 additions and 43 deletions

3
.changelog/13621.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
api: HTTP server now returns a 429 error code when hitting the connection limit
```

View File

@ -14,6 +14,7 @@ import (
"strings" "strings"
"time" "time"
"github.com/armon/go-metrics"
assetfs "github.com/elazarl/go-bindata-assetfs" assetfs "github.com/elazarl/go-bindata-assetfs"
"github.com/gorilla/handlers" "github.com/gorilla/handlers"
"github.com/gorilla/websocket" "github.com/gorilla/websocket"
@ -22,6 +23,7 @@ import (
"github.com/hashicorp/go-msgpack/codec" "github.com/hashicorp/go-msgpack/codec"
multierror "github.com/hashicorp/go-multierror" multierror "github.com/hashicorp/go-multierror"
"github.com/rs/cors" "github.com/rs/cors"
"golang.org/x/time/rate"
"github.com/hashicorp/nomad/acl" "github.com/hashicorp/nomad/acl"
"github.com/hashicorp/nomad/helper/noxssrw" "github.com/hashicorp/nomad/helper/noxssrw"
@ -156,7 +158,7 @@ func NewHTTPServers(agent *Agent, config *Config) ([]*HTTPServer, error) {
httpServer := http.Server{ httpServer := http.Server{
Addr: srv.Addr, Addr: srv.Addr,
Handler: handlers.CompressHandler(srv.mux), Handler: handlers.CompressHandler(srv.mux),
ConnState: makeConnState(config.TLSConfig.EnableHTTP, handshakeTimeout, maxConns), ConnState: makeConnState(config.TLSConfig.EnableHTTP, handshakeTimeout, maxConns, srv.logger),
ErrorLog: newHTTPServerLogger(srv.logger), ErrorLog: newHTTPServerLogger(srv.logger),
} }
@ -213,13 +215,12 @@ func NewHTTPServers(agent *Agent, config *Config) ([]*HTTPServer, error) {
// //
// If limit > 0, a per-address connection limit will be enabled regardless of // If limit > 0, a per-address connection limit will be enabled regardless of
// TLS. If connLimit == 0 there is no connection limit. // TLS. If connLimit == 0 there is no connection limit.
func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int) func(conn net.Conn, state http.ConnState) { func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int, logger log.Logger) func(conn net.Conn, state http.ConnState) {
connLimiter := connLimiter(connLimit, logger)
if !isTLS || handshakeTimeout == 0 { if !isTLS || handshakeTimeout == 0 {
if connLimit > 0 { if connLimit > 0 {
// Still return the connection limiter // Still return the connection limiter
return connlimit.NewLimiter(connlimit.Config{ return connLimiter
MaxConnsPerClientIP: connLimit,
}).HTTPConnStateFunc()
} }
return nil return nil
} }
@ -227,9 +228,6 @@ func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int) fu
if connLimit > 0 { if connLimit > 0 {
// Return conn state callback with connection limiting and a // Return conn state callback with connection limiting and a
// handshake timeout. // handshake timeout.
connLimiter := connlimit.NewLimiter(connlimit.Config{
MaxConnsPerClientIP: connLimit,
}).HTTPConnStateFunc()
return func(conn net.Conn, state http.ConnState) { return func(conn net.Conn, state http.ConnState) {
switch state { switch state {
@ -268,6 +266,35 @@ func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int) fu
} }
} }
// connLimiter returns a connection-limiter function with a rate-limited 429-response error handler.
// The rate-limit prevents the TLS handshake necessary to write the HTTP response
// from consuming too many server resources.
func connLimiter(connLimit int, logger log.Logger) func(conn net.Conn, state http.ConnState) {
// Global rate-limit of 10 responses per second with a 100-response burst.
limiter := rate.NewLimiter(10, 100)
tooManyConnsMsg := "Your IP is issuing too many concurrent connections, please rate limit your calls\n"
tooManyRequestsResponse := []byte(fmt.Sprintf("HTTP/1.1 429 Too Many Requests\r\n"+
"Content-Type: text/plain\r\n"+
"Content-Length: %d\r\n"+
"Connection: close\r\n\r\n%s", len(tooManyConnsMsg), tooManyConnsMsg))
return connlimit.NewLimiter(connlimit.Config{
MaxConnsPerClientIP: connLimit,
}).HTTPConnStateFuncWithErrorHandler(func(err error, conn net.Conn) {
if err == connlimit.ErrPerClientIPLimitReached {
metrics.IncrCounter([]string{"nomad", "agent", "http", "exceeded"}, 1)
if n := limiter.Reserve(); n.Delay() == 0 {
logger.Warn("Too many concurrent connections", "address", conn.RemoteAddr().String(), "limit", connLimit)
conn.SetDeadline(time.Now().Add(10 * time.Millisecond))
conn.Write(tooManyRequestsResponse)
} else {
n.Cancel()
}
}
conn.Close()
})
}
// tcpKeepAliveListener sets TCP keep-alive timeouts on accepted // tcpKeepAliveListener sets TCP keep-alive timeouts on accepted
// connections. It's used by NewHttpServer so // connections. It's used by NewHttpServer so
// dead TCP connections eventually go away. // dead TCP connections eventually go away.

View File

@ -1254,41 +1254,15 @@ func TestHTTPServer_Limits_OK(t *testing.T) {
// Create a new connection that will go over the connection limit. // Create a new connection that will go over the connection limit.
limitConn, err := dial(t, addr, useTLS) limitConn, err := dial(t, addr, useTLS)
// At this point, the state of the connection + handshake are up in the response := "HTTP/1.1 429"
// air. buf := make([]byte, len(response))
// deadline := time.Now().Add(10 * time.Second)
// 1) dial failed, handshake never started require.NoError(t, limitConn.SetReadDeadline(deadline))
// => Conn is nil + io.EOF n, err := limitConn.Read(buf)
// 2) dial completed, handshake failed require.Equal(t, response, string(buf))
// => Conn is malformed + (net.OpError OR io.EOF) require.Nil(t, err)
// 3) dial completed, handshake succeeded require.Equal(t, len(response), n)
// => Conn is not nil + no error, however using the Conn should require.NoError(t, limitConn.Close())
// result in io.EOF
//
// At no point should Nomad actually write through the limited Conn.
if limitConn == nil || err != nil {
// Case 1 or Case 2 - returned Conn is useless and the error should
// be one of:
// "EOF"
// "closed network connection"
// "connection reset by peer"
msg := err.Error()
acceptable := strings.Contains(msg, "EOF") ||
strings.Contains(msg, "closed network connection") ||
strings.Contains(msg, "connection reset by peer")
require.True(t, acceptable)
} else {
// Case 3 - returned Conn is usable, but Nomad should not write
// anything before closing it.
buf := make([]byte, bufSize)
deadline := time.Now().Add(10 * time.Second)
require.NoError(t, limitConn.SetReadDeadline(deadline))
n, err := limitConn.Read(buf)
require.Equal(t, io.EOF, err)
require.Zero(t, n)
require.NoError(t, limitConn.Close())
}
// Assert existing connections are ok // Assert existing connections are ok
require.Len(t, errCh, 0) require.Len(t, errCh, 0)

View File

@ -480,6 +480,14 @@ Raft database metrics are emitted by the `raft-boltdb` library.
| `nomad.raft.boltdb.txstats.write` | Count of total write operations | Integer | Counter | | `nomad.raft.boltdb.txstats.write` | Count of total write operations | Integer | Counter |
| `nomad.raft.boltdb.txstats.writeTime` | Sample of write operation times | Nanoseconds | Summary | | `nomad.raft.boltdb.txstats.writeTime` | Sample of write operation times | Nanoseconds | Summary |
## Agent Metrics
Agent metrics are emitted by all Nomad agents running in either client or server mode.
| Metric | Description | Unit | Type |
| ----------------------------------------- | ----------------------------------------------------- | ----------- | ------- |
| `nomad.agent.http.exceeded` | Count of HTTP connections exceeding concurrency limit | Integer | Counter |
[tagged-metrics]: /docs/telemetry/metrics#tagged-metrics [tagged-metrics]: /docs/telemetry/metrics#tagged-metrics
[sticky]: /docs/job-specification/ephemeral_disk#sticky [sticky]: /docs/job-specification/ephemeral_disk#sticky
[s_port_plan_failure]: /s/port-plan-failure [s_port_plan_failure]: /s/port-plan-failure