// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package structs

import (
	"fmt"
	"math/rand"
	"net"
	"sync"

	"golang.org/x/exp/maps"
	"golang.org/x/exp/slices"
)

const (
	// DefaultMinDynamicPort is the smallest dynamic port generated by
	// default
	DefaultMinDynamicPort = 20000

	// DefaultMaxDynamicPort is the largest dynamic port generated by
	// default
	DefaultMaxDynamicPort = 32000

	// maxRandPortAttempts is the maximum number of attempts
	// to assign a random port
	maxRandPortAttempts = 20

	// MaxValidPort is the max valid port number
	MaxValidPort = 65536
)

var (
	// bitmapPool is used to pool the bitmaps used for port collision
	// checking. They are fairly large (8K) so we can re-use them to
	// avoid GC pressure. Care should be taken to call Clear() on any
	// bitmap coming from the pool.
	bitmapPool = new(sync.Pool)
)
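
// The pool discipline described above is what getUsedPortsFor and Release
// implement below. A minimal sketch of the same pattern, assuming a Bitmap
// sized to MaxValidPort bits:
//
//	var b Bitmap
//	if raw := bitmapPool.Get(); raw != nil {
//		b = raw.(Bitmap)
//		b.Clear() // pooled bitmaps may still carry stale ports
//	} else {
//		b, _ = NewBitmap(MaxValidPort)
//	}
//	// ... use b ...
//	bitmapPool.Put(b) // return it for re-use once no longer referenced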

// NetworkIndex is used to index the available network resources
// and the used network resources on a machine given allocations
//
// Fields are exported so they may be JSON serialized for debugging.
// Fields are *not* intended to be used directly.
type NetworkIndex struct {
	// TaskNetworks are the node networks available for
	// task.resources.network asks.
	TaskNetworks []*NetworkResource

	// GroupNetworks are the node networks available for group.network
	// asks.
	GroupNetworks []*NodeNetworkResource

	// HostNetworks indexes addresses by host network alias
	HostNetworks map[string][]NodeNetworkAddress

	// UsedPorts tracks which ports are used on a per-IP address basis. For
	// example if a node has `network_interface=lo` and port 22 reserved,
	// then on a dual stack loopback interface UsedPorts would contain:
	//
	//	{
	//		"127.0.0.1": Bitmap{22},
	//		"::1":       Bitmap{22},
	//	}
	UsedPorts map[string]Bitmap

	// Deprecated bandwidth fields
	AvailBandwidth map[string]int // Bandwidth by device
	UsedBandwidth  map[string]int // Bandwidth by device

	MinDynamicPort int // The smallest dynamic port generated
	MaxDynamicPort int // The largest dynamic port generated
}

// NewNetworkIndex is used to construct a new network index
func NewNetworkIndex() *NetworkIndex {
	return &NetworkIndex{
		HostNetworks:   make(map[string][]NodeNetworkAddress),
		UsedPorts:      make(map[string]Bitmap),
		AvailBandwidth: make(map[string]int),
		UsedBandwidth:  make(map[string]int),
		MinDynamicPort: DefaultMinDynamicPort,
		MaxDynamicPort: DefaultMaxDynamicPort,
	}
}
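
// A NetworkIndex is typically short-lived inside the scheduler and plan
// applier. A rough usage sketch, assuming a fingerprinted *Node and a slice
// of existing or proposed allocations (the names here are illustrative only):
//
//	idx := NewNetworkIndex()
//	defer idx.Release() // return pooled bitmaps
//
//	if err := idx.SetNode(node); err != nil {
//		// node data is invalid; this indicates a bug upstream
//	}
//	if collide, reason := idx.AddAllocs(existingAllocs); collide {
//		// the node is already in an impossible state; reason says why
//	}
//	ports, err := idx.AssignPorts(ask) // offer for a group.network ask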

func (idx *NetworkIndex) getUsedPortsFor(ip string) Bitmap {
	used := idx.UsedPorts[ip]
	if used == nil {
		// Try to get a bitmap from the pool, else create
		raw := bitmapPool.Get()
		if raw != nil {
			used = raw.(Bitmap)
			used.Clear()
		} else {
			used, _ = NewBitmap(MaxValidPort)
		}
		idx.UsedPorts[ip] = used
	}
	return used
}

func (idx *NetworkIndex) Copy() *NetworkIndex {
	if idx == nil {
		return nil
	}

	c := new(NetworkIndex)
	*c = *idx

	c.TaskNetworks = copyNetworkResources(idx.TaskNetworks)
	c.GroupNetworks = copyNodeNetworks(idx.GroupNetworks)
	c.HostNetworks = copyAvailAddresses(idx.HostNetworks)
	if idx.AvailBandwidth != nil && len(idx.AvailBandwidth) == 0 {
		c.AvailBandwidth = make(map[string]int)
	} else {
		c.AvailBandwidth = maps.Clone(idx.AvailBandwidth)
	}
	if len(idx.UsedPorts) > 0 {
		c.UsedPorts = make(map[string]Bitmap, len(idx.UsedPorts))
		for k, v := range idx.UsedPorts {
			c.UsedPorts[k], _ = v.Copy()
		}
	}
	if idx.UsedBandwidth != nil && len(idx.UsedBandwidth) == 0 {
		c.UsedBandwidth = make(map[string]int)
	} else {
		c.UsedBandwidth = maps.Clone(idx.UsedBandwidth)
	}

	return c
}

func copyNetworkResources(resources []*NetworkResource) []*NetworkResource {
	l := len(resources)
	if l == 0 {
		return nil
	}

	c := make([]*NetworkResource, l)
	for i, resource := range resources {
		c[i] = resource.Copy()
	}
	return c
}

func copyNodeNetworks(resources []*NodeNetworkResource) []*NodeNetworkResource {
	l := len(resources)
	if l == 0 {
		return nil
	}

	c := make([]*NodeNetworkResource, l)
	for i, resource := range resources {
		c[i] = resource.Copy()
	}
	return c
}

func copyAvailAddresses(a map[string][]NodeNetworkAddress) map[string][]NodeNetworkAddress {
	l := len(a)
	if l == 0 {
		return nil
	}

	c := make(map[string][]NodeNetworkAddress, l)
	for k, v := range a {
		if len(v) == 0 {
			continue
		}
		c[k] = make([]NodeNetworkAddress, len(v))
		copy(c[k], v)
	}

	return c
}

// Release is called when the network index is no longer needed
// to attempt to re-use some of the memory it has allocated
func (idx *NetworkIndex) Release() {
	for _, b := range idx.UsedPorts {
		bitmapPool.Put(b)
	}
}

// Overcommitted checks if the network is overcommitted
func (idx *NetworkIndex) Overcommitted() bool {
	// TODO remove since bandwidth is deprecated
	/*for device, used := range idx.UsedBandwidth {
		avail := idx.AvailBandwidth[device]
		if used > avail {
			return true
		}
	}*/
	return false
}

// SetNode is used to initialize a node's network index with available IPs,
// reserved ports, and other details from a node's configuration and
// fingerprinting.
//
// SetNode must be idempotent as preemption causes SetNode to be called
// multiple times on the same NetworkIndex, only clearing UsedPorts between
// calls.
//
// An error is returned if the Node cannot produce a consistent NetworkIndex
// such as if reserved_ports are unparseable.
//
// Any errors returned by SetNode indicate a bug! The bug may lie in client
// code not properly validating its configuration or it may lie in improper
// Node object handling by servers. Users should not be able to cause SetNode
// to error. Data that cause SetNode to error should be caught upstream such as
// a client agent refusing to start with an invalid configuration.
func (idx *NetworkIndex) SetNode(node *Node) error {

	// COMPAT(0.11): Deprecated. taskNetworks are only used for
	// task.resources.network asks which have been deprecated since before
	// 0.11.
	// Grab the network resources, handling both new and old Node layouts
	// from clients.
	var taskNetworks []*NetworkResource
	if node.NodeResources != nil && len(node.NodeResources.Networks) != 0 {
		taskNetworks = node.NodeResources.Networks
	} else if node.Resources != nil {
		taskNetworks = node.Resources.Networks
	}

	// Reserved ports get merged downward. For example given an agent
	// config:
	//
	// client.reserved.reserved_ports = "22"
	// client.host_network["eth0"] = {reserved_ports = "80,443"}
	// client.host_network["eth1"] = {reserved_ports = "1-1000"}
	//
	// Addresses on taskNetworks reserve port 22
	// Addresses on eth0 reserve 22,80,443 (note 22 is also reserved!)
	// Addresses on eth1 reserve 1-1000
	globalResPorts := []uint{}

	if node.ReservedResources != nil && node.ReservedResources.Networks.ReservedHostPorts != "" {
		resPorts, err := ParsePortRanges(node.ReservedResources.Networks.ReservedHostPorts)
		if err != nil {
			// This is a fatal error that should have been
			// prevented by client validation.
			return fmt.Errorf("error parsing reserved_ports: %w", err)
		}

		globalResPorts = make([]uint, len(resPorts))
		for i, p := range resPorts {
			globalResPorts[i] = uint(p)
		}
	} else if node.Reserved != nil {
		// COMPAT(0.11): Remove after 0.11. Nodes stopped reporting
		// reserved ports under Node.Reserved.Resources in #4750 / v0.9
		for _, n := range node.Reserved.Networks {
			used := idx.getUsedPortsFor(n.IP)
			for _, ports := range [][]Port{n.ReservedPorts, n.DynamicPorts} {
				for _, p := range ports {
					if p.Value > MaxValidPort || p.Value < 0 {
						// This is a fatal error that
						// should have been prevented
						// by validation upstream.
						return fmt.Errorf("invalid port %d for reserved_ports", p.Value)
					}

					globalResPorts = append(globalResPorts, uint(p.Value))
					used.Set(uint(p.Value))
				}
			}

			// Reserve mbits
			if n.Device != "" {
				idx.UsedBandwidth[n.Device] += n.MBits
			}
		}
	}

	// Filter task networks down to those with a device. For example
	// taskNetworks may contain a "bridge" interface which has no device
	// set and cannot be used to fulfill asks.
	for _, n := range taskNetworks {
		if n.Device != "" {
			idx.TaskNetworks = append(idx.TaskNetworks, n)
			idx.AvailBandwidth[n.Device] = n.MBits

			// Reserve ports
			used := idx.getUsedPortsFor(n.IP)
			for _, p := range globalResPorts {
				used.Set(p)
			}
		}
	}

	// nodeNetworks are used for group.network asks.
	var nodeNetworks []*NodeNetworkResource
	if node.NodeResources != nil && len(node.NodeResources.NodeNetworks) != 0 {
		nodeNetworks = node.NodeResources.NodeNetworks
	}

	for _, n := range nodeNetworks {
		for _, a := range n.Addresses {
			// Index host networks by their unique alias for asks
			// with group.network.port.host_network set.
			idx.HostNetworks[a.Alias] = append(idx.HostNetworks[a.Alias], a)

			// Mark reserved ports as used without worrying about
			// collisions. This effectively merges
			// client.reserved.reserved_ports into each
			// host_network.
			used := idx.getUsedPortsFor(a.Address)
			for _, p := range globalResPorts {
				used.Set(p)
			}

			// If ReservedPorts is set on the NodeNetwork, use it
			// and the global reserved ports.
			if a.ReservedPorts != "" {
				rp, err := ParsePortRanges(a.ReservedPorts)
				if err != nil {
					// This is a fatal error that should
					// have been prevented by validation
					// upstream.
					return fmt.Errorf("error parsing reserved_ports for network %q: %w", a.Alias, err)
				}
				for _, p := range rp {
					used.Set(uint(p))
				}
			}
		}
	}

	// Set dynamic port range (applies to all addresses)
	if node.NodeResources != nil && node.NodeResources.MinDynamicPort > 0 {
		idx.MinDynamicPort = node.NodeResources.MinDynamicPort
	}

	if node.NodeResources != nil && node.NodeResources.MaxDynamicPort > 0 {
		idx.MaxDynamicPort = node.NodeResources.MaxDynamicPort
	}

	return nil
}
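
// Continuing the example from the comment inside SetNode: after SetNode runs
// against an agent with reserved_ports = "22" globally and "80,443" on the
// eth0 host_network, the per-IP bitmaps would conceptually look like the
// sketch below (assuming, purely for illustration, that eth0's address is
// 192.168.1.10 and the loopback address is 127.0.0.1):
//
//	idx.UsedPorts["127.0.0.1"]    // {22}
//	idx.UsedPorts["192.168.1.10"] // {22, 80, 443}
//
// Dynamic port selection later treats every set bit as unavailable.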

// AddAllocs is used to add the used network resources. Returns
// true if there is a collision
//
// AddAllocs may be called multiple times for the same NetworkIndex with
// UsedPorts cleared between calls (by Release). Therefore AddAllocs must be
// deterministic and must not manipulate state outside of UsedPorts as that
// state would persist between Release calls.
func (idx *NetworkIndex) AddAllocs(allocs []*Allocation) (collide bool, reason string) {
	for _, alloc := range allocs {
		// Do not consider the resource impact of terminal allocations
		if alloc.ClientTerminalStatus() {
			continue
		}

		if alloc.AllocatedResources != nil {
			// Only look at AllocatedPorts if populated, otherwise use pre 0.12 logic
			// COMPAT(1.0): Remove when network resources struct is removed.
			if len(alloc.AllocatedResources.Shared.Ports) > 0 {
				if c, r := idx.AddReservedPorts(alloc.AllocatedResources.Shared.Ports); c {
					collide = true
					reason = fmt.Sprintf("collision when reserving port for alloc %s: %v", alloc.ID, r)
				}
			} else {
				// Add network resources that are at the task group level
				if len(alloc.AllocatedResources.Shared.Networks) > 0 {
					for _, network := range alloc.AllocatedResources.Shared.Networks {
						if c, r := idx.AddReserved(network); c {
							collide = true
							reason = fmt.Sprintf("collision when reserving port for network %s in alloc %s: %v", network.IP, alloc.ID, r)
						}
					}
				}

				for task, resources := range alloc.AllocatedResources.Tasks {
					if len(resources.Networks) == 0 {
						continue
					}
					n := resources.Networks[0]
					if c, r := idx.AddReserved(n); c {
						collide = true
						reason = fmt.Sprintf("collision when reserving port for network %s in task %s of alloc %s: %v", n.IP, task, alloc.ID, r)
					}
				}
			}
		} else {
			// COMPAT(0.11): Remove in 0.11
			for task, resources := range alloc.TaskResources {
				if len(resources.Networks) == 0 {
					continue
				}
				n := resources.Networks[0]
				if c, r := idx.AddReserved(n); c {
					collide = true
					reason = fmt.Sprintf("(deprecated) collision when reserving port for network %s in task %s of alloc %s: %v", n.IP, task, alloc.ID, r)
				}
			}
		}
	}
	return
}
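
// AddAllocs is how existing and in-flight placements claim their ports before
// a new ask is made against the index. A hedged sketch of the collision
// check, with proposedAllocs standing in for whatever allocation set the
// caller is evaluating:
//
//	if collide, reason := idx.AddAllocs(proposedAllocs); collide {
//		// e.g. "collision when reserving port for alloc <id>: [port 8080 already in use]"
//		return fmt.Errorf("port collision on node: %s", reason)
//	}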

// AddReserved is used to add a reserved network usage, returns true
// if there is a port collision
func (idx *NetworkIndex) AddReserved(n *NetworkResource) (collide bool, reasons []string) {
	// Add the port usage
	used := idx.getUsedPortsFor(n.IP)

	for _, ports := range [][]Port{n.ReservedPorts, n.DynamicPorts} {
		for _, port := range ports {
			// Guard against invalid port
			if port.Value < 0 || port.Value >= MaxValidPort {
				return true, []string{fmt.Sprintf("invalid port %d", port.Value)}
			}
			if used.Check(uint(port.Value)) {
				collide = true
				reason := fmt.Sprintf("port %d already in use", port.Value)
				reasons = append(reasons, reason)
			} else {
				used.Set(uint(port.Value))
			}
		}
	}

	// Add the bandwidth
	idx.UsedBandwidth[n.Device] += n.MBits
	return
}
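
// A small sketch of how AddReserved reports duplicates; the NetworkResource
// literal below is illustrative only:
//
//	n := &NetworkResource{
//		IP:            "10.0.0.1",
//		ReservedPorts: []Port{{Label: "http", Value: 8080}},
//	}
//	idx.AddReserved(n) // first claim: collide == false
//	collide, reasons := idx.AddReserved(n)
//	// second claim: collide == true, reasons == ["port 8080 already in use"]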

func (idx *NetworkIndex) AddReservedPorts(ports AllocatedPorts) (collide bool, reasons []string) {
	for _, port := range ports {
		used := idx.getUsedPortsFor(port.HostIP)
		if port.Value < 0 || port.Value >= MaxValidPort {
			return true, []string{fmt.Sprintf("invalid port %d", port.Value)}
		}
		if used.Check(uint(port.Value)) {
			collide = true
			reason := fmt.Sprintf("port %d already in use", port.Value)
			reasons = append(reasons, reason)
		} else {
			used.Set(uint(port.Value))
		}
	}

	return
}

// AddReservedPortsForIP checks whether any reserved ports collide with those
// in use for the IP address.
func (idx *NetworkIndex) AddReservedPortsForIP(ports []uint64, ip string) (collide bool, reasons []string) {
	used := idx.getUsedPortsFor(ip)
	for _, port := range ports {
		// Guard against invalid port
		if port >= MaxValidPort {
			return true, []string{fmt.Sprintf("invalid port %d", port)}
		}
		if used.Check(uint(port)) {
			collide = true
			reason := fmt.Sprintf("port %d already in use", port)
			reasons = append(reasons, reason)
		} else {
			used.Set(uint(port))
		}
	}

	return
}

// yieldIP is used to iteratively invoke the callback with
// an available IP
func (idx *NetworkIndex) yieldIP(cb func(net *NetworkResource, offerIP net.IP) bool) {
	for _, n := range idx.TaskNetworks {
		ip, ipnet, err := net.ParseCIDR(n.CIDR)
		if err != nil {
			continue
		}
		for ip := ip.Mask(ipnet.Mask); ipnet.Contains(ip); incIP(ip) {
			if cb(n, ip) {
				return
			}
		}
	}
}

func incIP(ip net.IP) {
	// Iterate over IP octets from right to left
	for j := len(ip) - 1; j >= 0; j-- {

		// Increment octet
		ip[j]++

		// If this octet did not wrap around to 0, it's the next IP to
		// try. If it did wrap (ip[j]==0), then the next octet is
		// incremented.
		if ip[j] > 0 {
			break
		}
	}
}
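
// For example, incrementing the last octet carries into the next one when it
// wraps, so yieldIP walks a CIDR block in address order (sketch):
//
//	ip := net.ParseIP("10.0.0.255").To4()
//	incIP(ip) // ip is now 10.0.1.0
//	incIP(ip) // ip is now 10.0.1.1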

// AssignPorts creates an offer based on an ask from the scheduler processing
// a group.network block. It supports multiple interfaces through
// node-configured host_networks.
//
// AssignTaskNetwork supports the deprecated task.resources.network block.
func (idx *NetworkIndex) AssignPorts(ask *NetworkResource) (AllocatedPorts, error) {
	var offer AllocatedPorts

	// portsInOffer tracks the ports already placed in this offer so that a
	// later dynamic pick cannot collide with an earlier one (see #16401).
	var portsInOffer []int

	// index of host network name to slice of reserved ports, used during dynamic port assignment
	reservedIdx := map[string][]Port{}

	for _, port := range ask.ReservedPorts {
		reservedIdx[port.HostNetwork] = append(reservedIdx[port.HostNetwork], port)

		// allocPort is set in the inner for loop if a port mapping can be
		// created. If allocPort is still nil after the loop, the port wasn't
		// available for reservation.
		var allocPort *AllocatedPortMapping
		var addrErr error
		for _, addr := range idx.HostNetworks[port.HostNetwork] {
			used := idx.getUsedPortsFor(addr.Address)
			// Guard against invalid port
			if port.Value < 0 || port.Value >= MaxValidPort {
				return nil, fmt.Errorf("invalid port %d (out of range)", port.Value)
			}

			// Check if in use
			if used != nil && used.Check(uint(port.Value)) {
				return nil, fmt.Errorf("reserved port collision %s=%d", port.Label, port.Value)
			}

			allocPort = &AllocatedPortMapping{
				Label:  port.Label,
				Value:  port.Value,
				To:     port.To,
				HostIP: addr.Address,
			}
			break
		}

		if allocPort == nil {
			if addrErr != nil {
				return nil, addrErr
			}

			return nil, fmt.Errorf("no addresses available for %s network", port.HostNetwork)
		}

		offer = append(offer, *allocPort)
		portsInOffer = append(portsInOffer, allocPort.Value)
	}

	for _, port := range ask.DynamicPorts {
		var allocPort *AllocatedPortMapping
		var addrErr error
		for _, addr := range idx.HostNetworks[port.HostNetwork] {
			used := idx.getUsedPortsFor(addr.Address)

			// Try to stochastically pick the dynamic ports as it is faster and
			// lower memory usage.
			var dynPorts []int
			// TODO: it's more efficient to find multiple dynamic ports at once
			dynPorts, addrErr = getDynamicPortsStochastic(
				used, portsInOffer, idx.MinDynamicPort, idx.MaxDynamicPort,
				reservedIdx[port.HostNetwork], 1)
			if addrErr != nil {
				// Fall back to the precise method if the random sampling failed.
				dynPorts, addrErr = getDynamicPortsPrecise(used, portsInOffer,
					idx.MinDynamicPort, idx.MaxDynamicPort,
					reservedIdx[port.HostNetwork], 1)
				if addrErr != nil {
					continue
				}
			}

			allocPort = &AllocatedPortMapping{
				Label:  port.Label,
				Value:  dynPorts[0],
				To:     port.To,
				HostIP: addr.Address,
			}
			if allocPort.To == -1 {
				allocPort.To = allocPort.Value
			}
			break
		}

		if allocPort == nil {
			if addrErr != nil {
				return nil, addrErr
			}

			return nil, fmt.Errorf("no addresses available for %s network", port.HostNetwork)
		}
		offer = append(offer, *allocPort)
		portsInOffer = append(portsInOffer, allocPort.Value)
	}

	return offer, nil
}
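
// A sketch of a group.network style ask and the resulting offer; the labels,
// host network names, and values below are illustrative only:
//
//	ask := &NetworkResource{
//		ReservedPorts: []Port{{Label: "http", Value: 80, To: 8080, HostNetwork: "default"}},
//		DynamicPorts:  []Port{{Label: "rpc", To: -1, HostNetwork: "default"}},
//	}
//	offer, err := idx.AssignPorts(ask)
//	// offer[0]: {Label: "http", Value: 80, To: 8080, HostIP: <an address of "default">}
//	// offer[1]: {Label: "rpc", Value: <free port in [MinDynamicPort, MaxDynamicPort]>, To: same as Value}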

// AssignTaskNetwork is used to offer network resources given a
// task.resources.network ask. If the ask cannot be satisfied, returns nil.
//
// AssignTaskNetwork and task.resources.network are deprecated in favor of
// AssignPorts and group.network. AssignTaskNetwork does not support multiple
// interfaces and only uses the node's default interface. AssignPorts is the
// method that is used for group.network asks.
func (idx *NetworkIndex) AssignTaskNetwork(ask *NetworkResource) (out *NetworkResource, err error) {
	err = fmt.Errorf("no networks available")
	idx.yieldIP(func(n *NetworkResource, offerIP net.IP) (stop bool) {
		// Convert the IP to a string
		offerIPStr := offerIP.String()

		// Check if we would exceed the bandwidth cap
		availBandwidth := idx.AvailBandwidth[n.Device]
		usedBandwidth := idx.UsedBandwidth[n.Device]
		if usedBandwidth+ask.MBits > availBandwidth {
			err = fmt.Errorf("bandwidth exceeded")
			return
		}

		used := idx.UsedPorts[offerIPStr]

		// Check if any of the reserved ports are in use
		for _, port := range ask.ReservedPorts {
			// Guard against invalid port
			if port.Value < 0 || port.Value >= MaxValidPort {
				err = fmt.Errorf("invalid port %d (out of range)", port.Value)
				return
			}

			// Check if in use
			if used != nil && used.Check(uint(port.Value)) {
				err = fmt.Errorf("reserved port collision %s=%d", port.Label, port.Value)
				return
			}
		}

		// Create the offer
		offer := &NetworkResource{
			Mode:          ask.Mode,
			Device:        n.Device,
			IP:            offerIPStr,
			MBits:         ask.MBits,
			DNS:           ask.DNS,
			ReservedPorts: ask.ReservedPorts,
			DynamicPorts:  ask.DynamicPorts,
		}

		// Try to stochastically pick the dynamic ports as it is faster and
		// lower memory usage.
		var dynPorts []int
		var dynErr error
		dynPorts, dynErr = getDynamicPortsStochastic(used, nil,
			idx.MinDynamicPort, idx.MaxDynamicPort, ask.ReservedPorts, len(ask.DynamicPorts))
		if dynErr == nil {
			goto BUILD_OFFER
		}

		// Fall back to the precise method if the random sampling failed.
		dynPorts, dynErr = getDynamicPortsPrecise(used, nil,
			idx.MinDynamicPort, idx.MaxDynamicPort, ask.ReservedPorts, len(ask.DynamicPorts))
		if dynErr != nil {
			err = dynErr
			return
		}

	BUILD_OFFER:
		for i, port := range dynPorts {
			offer.DynamicPorts[i].Value = port

			// This syntax allows you to set the mapped-to port to the same
			// port allocated by the scheduler on the host.
			if offer.DynamicPorts[i].To == -1 {
				offer.DynamicPorts[i].To = port
			}
		}

		// Stop, we have an offer!
		out = offer
		err = nil
		return true
	})
	return
}
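
// The deprecated task-level path looks similar but yields a *NetworkResource
// instead of AllocatedPorts (illustrative sketch only):
//
//	ask := &NetworkResource{
//		MBits:        10,
//		DynamicPorts: []Port{{Label: "db"}},
//	}
//	offer, err := idx.AssignTaskNetwork(ask)
//	// offer.IP is an address from one of idx.TaskNetworks and
//	// offer.DynamicPorts[0].Value is a free port in the dynamic range.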
|
|
|
|
|
2016-08-10 18:47:20 +00:00
|
|
|
// getDynamicPortsPrecise takes the nodes used port bitmap which may be nil if
|
// no ports have been allocated yet, any ports already offered in the caller,
// and the network ask. It returns a set of unused ports to fulfil the ask's
// DynamicPorts or an error if it failed. An error means the ask cannot be
// satisfied as the method does a precise search.
func getDynamicPortsPrecise(nodeUsed Bitmap, portsInOffer []int, minDynamicPort, maxDynamicPort int, reserved []Port, numDyn int) ([]int, error) {
    // Create a copy of the used ports and apply the new reserves
    var usedSet Bitmap
    var err error
    if nodeUsed != nil {
        usedSet, err = nodeUsed.Copy()
        if err != nil {
            return nil, err
        }
    } else {
        usedSet, err = NewBitmap(MaxValidPort)
        if err != nil {
            return nil, err
        }
    }

    for _, port := range reserved {
        usedSet.Set(uint(port.Value))
    }

    // Get the indexes of the unset ports, less those which have already been
    // picked as part of this offer
    availablePorts := usedSet.IndexesInRangeFiltered(
        false, uint(minDynamicPort), uint(maxDynamicPort), portsInOffer)

    // Make sure there are enough free ports to satisfy the ask, then shuffle
    // the head of the slice so the ports handed out are spread across the
    // range rather than always being the lowest free values.
    if len(availablePorts) < numDyn {
        return nil, fmt.Errorf("dynamic port selection failed")
    }

    numAvailable := len(availablePorts)
    for i := 0; i < numDyn; i++ {
        j := rand.Intn(numAvailable)
        availablePorts[i], availablePorts[j] = availablePorts[j], availablePorts[i]
    }

    return availablePorts[:numDyn], nil
}
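
// exampleGetDynamicPortsPrecise is a minimal usage sketch with made-up port
// numbers: ask for two free dynamic ports in the 20000-20100 range on a node
// where 20010 is already in use, 20020 is reserved by the ask, and 20005 was
// already handed out earlier in the same offer.
func exampleGetDynamicPortsPrecise() ([]int, error) {
    used, err := NewBitmap(MaxValidPort)
    if err != nil {
        return nil, err
    }
    used.Set(20010) // claimed by an existing allocation on the node

    reserved := []Port{{Label: "admin", Value: 20020}}
    portsInOffer := []int{20005} // picked earlier for this same allocation

    return getDynamicPortsPrecise(used, portsInOffer, 20000, 20100, reserved, 2)
}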

// getDynamicPortsStochastic takes the node's used port bitmap, which may be nil
// if no ports have been allocated yet, any ports already offered in the caller,
// and the network ask. It returns a set of unused ports to fulfil the ask's
// DynamicPorts or an error if it failed. An error does not mean the ask cannot
// be satisfied, as the method makes a fixed number of random probes and aborts
// the search if they all fail.
func getDynamicPortsStochastic(nodeUsed Bitmap, portsInOffer []int, minDynamicPort, maxDynamicPort int, reservedPorts []Port, count int) ([]int, error) {
    var reserved, dynamic []int
    for _, port := range reservedPorts {
        reserved = append(reserved, port.Value)
    }

    for i := 0; i < count; i++ {
        attempts := 0
    PICK:
        attempts++
        if attempts > maxRandPortAttempts {
            return nil, fmt.Errorf("stochastic dynamic port selection failed")
        }

        randPort := minDynamicPort + rand.Intn(maxDynamicPort-minDynamicPort)
        if nodeUsed != nil && nodeUsed.Check(uint(randPort)) {
            goto PICK
        }

        for _, ports := range [][]int{reserved, dynamic} {
            if isPortReserved(ports, randPort) {
                goto PICK
            }
        }

        // the pick conflicted with a previous pick that hasn't been saved to
        // the index yet
        if slices.Contains(portsInOffer, randPort) {
            goto PICK
        }

        dynamic = append(dynamic, randPort)
    }

    return dynamic, nil
}
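
// examplePickDynamicPorts is a hedged sketch of how the two selectors above
// are intended to be combined (mirroring the callers in this file): try the
// cheap stochastic probing first and only fall back to the exhaustive precise
// bitmap scan when the random probes give up.
func examplePickDynamicPorts(used Bitmap, portsInOffer []int, minPort, maxPort int, reserved []Port, n int) ([]int, error) {
    ports, err := getDynamicPortsStochastic(used, portsInOffer, minPort, maxPort, reserved, n)
    if err == nil {
        return ports, nil
    }
    // The stochastic probe aborting does not mean no ports are free, so do
    // the precise search before reporting failure.
    return getDynamicPortsPrecise(used, portsInOffer, minPort, maxPort, reserved, n)
}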

// isPortReserved scans an integer slice for a value, reporting whether the
// given port is already present.
func isPortReserved(haystack []int, needle int) bool {
    for _, item := range haystack {
        if item == needle {
            return true
        }
    }
    return false
}

// AllocatedPortsToNetworkResouce is a COMPAT(1.0) helper; remove it when
// NetworkResource is no longer used for the materialized client view of ports.
func AllocatedPortsToNetworkResouce(ask *NetworkResource, ports AllocatedPorts, node *NodeResources) *NetworkResource {
    out := ask.Copy()

    for i, port := range ask.DynamicPorts {
        if p, ok := ports.Get(port.Label); ok {
            out.DynamicPorts[i].Value = p.Value
            out.DynamicPorts[i].To = p.To
        }
    }

    if len(node.NodeNetworks) > 0 {
        for _, nw := range node.NodeNetworks {
            if nw.Mode == "host" {
                out.IP = nw.Addresses[0].Address
                break
            }
        }
    } else {
        for _, nw := range node.Networks {
            if nw.Mode == "host" {
                out.IP = nw.IP
            }
        }
    }
    return out
}
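
// exampleMaterializeNetwork is a hypothetical caller showing how a client
// view of the network could be built from scheduler-allocated ports. The
// AllocatedPorts value and the node resources are assumed to come from the
// allocation and the node fingerprint respectively.
func exampleMaterializeNetwork(allocated AllocatedPorts, node *NodeResources) *NetworkResource {
    // Ask for one dynamic port labelled "http", mapped to the same port on
    // the host (the To == -1 convention handled earlier in this file).
    ask := &NetworkResource{
        DynamicPorts: []Port{{Label: "http", To: -1}},
    }
    return AllocatedPortsToNetworkResouce(ask, allocated, node)
}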

type ClientHostNetworkConfig struct {
    Name          string `hcl:",key"`
    CIDR          string `hcl:"cidr"`
    Interface     string `hcl:"interface"`
    ReservedPorts string `hcl:"reserved_ports"`
}

func (p *ClientHostNetworkConfig) Copy() *ClientHostNetworkConfig {
    if p == nil {
        return nil
    }

    c := new(ClientHostNetworkConfig)
    *c = *p
    return c
}
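
// exampleClientHostNetwork is a sketch with made-up values of the client host
// network configuration this struct is decoded from, and of taking a
// defensive copy before mutating it.
func exampleClientHostNetwork() *ClientHostNetworkConfig {
    cfg := &ClientHostNetworkConfig{
        Name:          "public",
        CIDR:          "203.0.113.0/24",
        Interface:     "eth0",
        ReservedPorts: "22,80",
    }
    return cfg.Copy()
}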