Return 429 response on HTTP max connection limit (#13621)
Return 429 response on HTTP max connection limit. Instead of silently closing the connection, return a `429 Too Many Requests` HTTP response with a helpful error message to aid debugging when the connection limit is unintentionally reached. Set a 10-millisecond write timeout and rate limiter for connection-limit 429 response to prevent writing the HTTP response from consuming too many server resources. Add `nomad.agent.http.exceeded metric` counting the number of HTTP connections exceeding concurrency limit.
This commit is contained in:
parent
301c6e57f5
commit
5354409b1a
|
@ -0,0 +1,3 @@
|
|||
```release-note:improvement
|
||||
api: HTTP server now returns a 429 error code when hitting the connection limit
|
||||
```
|
|
@ -14,6 +14,7 @@ import (
|
|||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/armon/go-metrics"
|
||||
assetfs "github.com/elazarl/go-bindata-assetfs"
|
||||
"github.com/gorilla/handlers"
|
||||
"github.com/gorilla/websocket"
|
||||
|
@ -22,6 +23,7 @@ import (
|
|||
"github.com/hashicorp/go-msgpack/codec"
|
||||
multierror "github.com/hashicorp/go-multierror"
|
||||
"github.com/rs/cors"
|
||||
"golang.org/x/time/rate"
|
||||
|
||||
"github.com/hashicorp/nomad/acl"
|
||||
"github.com/hashicorp/nomad/helper/noxssrw"
|
||||
|
@ -156,7 +158,7 @@ func NewHTTPServers(agent *Agent, config *Config) ([]*HTTPServer, error) {
|
|||
httpServer := http.Server{
|
||||
Addr: srv.Addr,
|
||||
Handler: handlers.CompressHandler(srv.mux),
|
||||
ConnState: makeConnState(config.TLSConfig.EnableHTTP, handshakeTimeout, maxConns),
|
||||
ConnState: makeConnState(config.TLSConfig.EnableHTTP, handshakeTimeout, maxConns, srv.logger),
|
||||
ErrorLog: newHTTPServerLogger(srv.logger),
|
||||
}
|
||||
|
||||
|
@ -213,13 +215,12 @@ func NewHTTPServers(agent *Agent, config *Config) ([]*HTTPServer, error) {
|
|||
//
|
||||
// If limit > 0, a per-address connection limit will be enabled regardless of
|
||||
// TLS. If connLimit == 0 there is no connection limit.
|
||||
func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int) func(conn net.Conn, state http.ConnState) {
|
||||
func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int, logger log.Logger) func(conn net.Conn, state http.ConnState) {
|
||||
connLimiter := connLimiter(connLimit, logger)
|
||||
if !isTLS || handshakeTimeout == 0 {
|
||||
if connLimit > 0 {
|
||||
// Still return the connection limiter
|
||||
return connlimit.NewLimiter(connlimit.Config{
|
||||
MaxConnsPerClientIP: connLimit,
|
||||
}).HTTPConnStateFunc()
|
||||
return connLimiter
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
@ -227,9 +228,6 @@ func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int) fu
|
|||
if connLimit > 0 {
|
||||
// Return conn state callback with connection limiting and a
|
||||
// handshake timeout.
|
||||
connLimiter := connlimit.NewLimiter(connlimit.Config{
|
||||
MaxConnsPerClientIP: connLimit,
|
||||
}).HTTPConnStateFunc()
|
||||
|
||||
return func(conn net.Conn, state http.ConnState) {
|
||||
switch state {
|
||||
|
@ -268,6 +266,35 @@ func makeConnState(isTLS bool, handshakeTimeout time.Duration, connLimit int) fu
|
|||
}
|
||||
}
|
||||
|
||||
// connLimiter returns a connection-limiter function with a rate-limited 429-response error handler.
|
||||
// The rate-limit prevents the TLS handshake necessary to write the HTTP response
|
||||
// from consuming too many server resources.
|
||||
func connLimiter(connLimit int, logger log.Logger) func(conn net.Conn, state http.ConnState) {
|
||||
// Global rate-limit of 10 responses per second with a 100-response burst.
|
||||
limiter := rate.NewLimiter(10, 100)
|
||||
|
||||
tooManyConnsMsg := "Your IP is issuing too many concurrent connections, please rate limit your calls\n"
|
||||
tooManyRequestsResponse := []byte(fmt.Sprintf("HTTP/1.1 429 Too Many Requests\r\n"+
|
||||
"Content-Type: text/plain\r\n"+
|
||||
"Content-Length: %d\r\n"+
|
||||
"Connection: close\r\n\r\n%s", len(tooManyConnsMsg), tooManyConnsMsg))
|
||||
return connlimit.NewLimiter(connlimit.Config{
|
||||
MaxConnsPerClientIP: connLimit,
|
||||
}).HTTPConnStateFuncWithErrorHandler(func(err error, conn net.Conn) {
|
||||
if err == connlimit.ErrPerClientIPLimitReached {
|
||||
metrics.IncrCounter([]string{"nomad", "agent", "http", "exceeded"}, 1)
|
||||
if n := limiter.Reserve(); n.Delay() == 0 {
|
||||
logger.Warn("Too many concurrent connections", "address", conn.RemoteAddr().String(), "limit", connLimit)
|
||||
conn.SetDeadline(time.Now().Add(10 * time.Millisecond))
|
||||
conn.Write(tooManyRequestsResponse)
|
||||
} else {
|
||||
n.Cancel()
|
||||
}
|
||||
}
|
||||
conn.Close()
|
||||
})
|
||||
}
|
||||
|
||||
// tcpKeepAliveListener sets TCP keep-alive timeouts on accepted
|
||||
// connections. It's used by NewHttpServer so
|
||||
// dead TCP connections eventually go away.
|
||||
|
|
|
@ -1254,41 +1254,15 @@ func TestHTTPServer_Limits_OK(t *testing.T) {
|
|||
// Create a new connection that will go over the connection limit.
|
||||
limitConn, err := dial(t, addr, useTLS)
|
||||
|
||||
// At this point, the state of the connection + handshake are up in the
|
||||
// air.
|
||||
//
|
||||
// 1) dial failed, handshake never started
|
||||
// => Conn is nil + io.EOF
|
||||
// 2) dial completed, handshake failed
|
||||
// => Conn is malformed + (net.OpError OR io.EOF)
|
||||
// 3) dial completed, handshake succeeded
|
||||
// => Conn is not nil + no error, however using the Conn should
|
||||
// result in io.EOF
|
||||
//
|
||||
// At no point should Nomad actually write through the limited Conn.
|
||||
|
||||
if limitConn == nil || err != nil {
|
||||
// Case 1 or Case 2 - returned Conn is useless and the error should
|
||||
// be one of:
|
||||
// "EOF"
|
||||
// "closed network connection"
|
||||
// "connection reset by peer"
|
||||
msg := err.Error()
|
||||
acceptable := strings.Contains(msg, "EOF") ||
|
||||
strings.Contains(msg, "closed network connection") ||
|
||||
strings.Contains(msg, "connection reset by peer")
|
||||
require.True(t, acceptable)
|
||||
} else {
|
||||
// Case 3 - returned Conn is usable, but Nomad should not write
|
||||
// anything before closing it.
|
||||
buf := make([]byte, bufSize)
|
||||
deadline := time.Now().Add(10 * time.Second)
|
||||
require.NoError(t, limitConn.SetReadDeadline(deadline))
|
||||
n, err := limitConn.Read(buf)
|
||||
require.Equal(t, io.EOF, err)
|
||||
require.Zero(t, n)
|
||||
require.NoError(t, limitConn.Close())
|
||||
}
|
||||
response := "HTTP/1.1 429"
|
||||
buf := make([]byte, len(response))
|
||||
deadline := time.Now().Add(10 * time.Second)
|
||||
require.NoError(t, limitConn.SetReadDeadline(deadline))
|
||||
n, err := limitConn.Read(buf)
|
||||
require.Equal(t, response, string(buf))
|
||||
require.Nil(t, err)
|
||||
require.Equal(t, len(response), n)
|
||||
require.NoError(t, limitConn.Close())
|
||||
|
||||
// Assert existing connections are ok
|
||||
require.Len(t, errCh, 0)
|
||||
|
|
|
@ -480,6 +480,14 @@ Raft database metrics are emitted by the `raft-boltdb` library.
|
|||
| `nomad.raft.boltdb.txstats.write` | Count of total write operations | Integer | Counter |
|
||||
| `nomad.raft.boltdb.txstats.writeTime` | Sample of write operation times | Nanoseconds | Summary |
|
||||
|
||||
## Agent Metrics
|
||||
|
||||
Agent metrics are emitted by all Nomad agents running in either client or server mode.
|
||||
|
||||
| Metric | Description | Unit | Type |
|
||||
| ----------------------------------------- | ----------------------------------------------------- | ----------- | ------- |
|
||||
| `nomad.agent.http.exceeded` | Count of HTTP connections exceeding concurrency limit | Integer | Counter |
|
||||
|
||||
[tagged-metrics]: /docs/telemetry/metrics#tagged-metrics
|
||||
[sticky]: /docs/job-specification/ephemeral_disk#sticky
|
||||
[s_port_plan_failure]: /s/port-plan-failure
|
||||
|
|
Loading…
Reference in New Issue