open-nomad/nomad/structs/network_test.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package structs
import (
"fmt"
"net"
"reflect"
"testing"
"github.com/hashicorp/nomad/ci"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
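// TestNetworkIndex_Copy verifies that NetworkIndex.Copy returns a deep copy:
// the copy initially equals the original, and mutating the copy afterwards
// does not change the original index.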
func TestNetworkIndex_Copy(t *testing.T) {
ci.Parallel(t)
n := &Node{
NodeResources: &NodeResources{
Networks: []*NetworkResource{
{
Device: "eth0",
CIDR: "192.168.0.100/32",
IP: "192.168.0.100",
MBits: 1000,
},
},
NodeNetworks: []*NodeNetworkResource{
{
Mode: "host",
Device: "eth0",
Speed: 1000,
Addresses: []NodeNetworkAddress{
{
Alias: "default",
Address: "192.168.0.100",
Family: NodeNetworkAF_IPv4,
},
},
},
},
},
Reserved: &Resources{
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
ReservedPorts: []Port{{Label: "ssh", Value: 22}},
MBits: 1,
},
},
},
ReservedResources: &NodeReservedResources{
Networks: NodeReservedNetworkResources{
ReservedHostPorts: "22",
},
},
}
allocs := []*Allocation{
{
AllocatedResources: &AllocatedResources{
Tasks: map[string]*AllocatedTaskResources{
"web": {
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
MBits: 20,
ReservedPorts: []Port{{"one", 8000, 0, ""}, {"two", 9000, 0, ""}},
},
},
},
},
},
},
{
AllocatedResources: &AllocatedResources{
Tasks: map[string]*AllocatedTaskResources{
"api": {
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
MBits: 50,
ReservedPorts: []Port{{"one", 10000, 0, ""}},
},
},
},
},
},
},
}
netIdx := NewNetworkIndex()
netIdx.SetNode(n)
netIdx.AddAllocs(allocs)
// Copy must be equal.
netIdxCopy := netIdx.Copy()
require.Equal(t, netIdx, netIdxCopy)
// Modifying copy should not affect original value.
n.NodeResources.Networks[0].Device = "eth1"
n.ReservedResources.Networks.ReservedHostPorts = "22,80"
allocs = append(allocs, &Allocation{
AllocatedResources: &AllocatedResources{
Tasks: map[string]*AllocatedTaskResources{
"db": {
Networks: []*NetworkResource{
{
Device: "eth1",
IP: "192.168.0.104",
MBits: 50,
ReservedPorts: []Port{{"one", 4567, 0, ""}},
},
},
},
},
},
})
netIdxCopy.SetNode(n)
netIdxCopy.AddAllocs(allocs)
netIdxCopy.MinDynamicPort = 1000
netIdxCopy.MaxDynamicPort = 2000
require.NotEqual(t, netIdx, netIdxCopy)
}
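// TestNetworkIndex_Overcommitted checks that Overcommitted reports when the
// reserved network bandwidth exceeds what the node provides. The test is
// currently skipped.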
func TestNetworkIndex_Overcommitted(t *testing.T) {
t.Skip()
ci.Parallel(t)
idx := NewNetworkIndex()
// Consume some network
reserved := &NetworkResource{
Device: "eth0",
IP: "192.168.0.100",
MBits: 505,
ReservedPorts: []Port{{"one", 8000, 0, ""}, {"two", 9000, 0, ""}},
}
collide, reasons := idx.AddReserved(reserved)
if collide || len(reasons) != 0 {
t.Fatalf("bad")
}
if !idx.Overcommitted() {
t.Fatalf("have no resources")
}
// Add resources
n := &Node{
NodeResources: &NodeResources{
Networks: []*NetworkResource{
{
Device: "eth0",
CIDR: "192.168.0.100/32",
MBits: 1000,
},
},
},
}
idx.SetNode(n)
if idx.Overcommitted() {
t.Fatalf("have resources")
}
// Double up our usage
idx.AddReserved(reserved)
if !idx.Overcommitted() {
t.Fatalf("should be overcommitted")
}
}
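// TestNetworkIndex_SetNode verifies that SetNode indexes the node's networks
// and marks the node's reserved host ports as used.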
func TestNetworkIndex_SetNode(t *testing.T) {
ci.Parallel(t)
idx := NewNetworkIndex()
n := &Node{
NodeResources: &NodeResources{
Networks: []*NetworkResource{
{
Device: "eth0",
CIDR: "192.168.0.100/32",
IP: "192.168.0.100",
MBits: 1000,
},
},
},
ReservedResources: &NodeReservedResources{
Networks: NodeReservedNetworkResources{
ReservedHostPorts: "22",
},
},
}
require.NoError(t, idx.SetNode(n))
require.Len(t, idx.TaskNetworks, 1)
require.Equal(t, 1000, idx.AvailBandwidth["eth0"])
require.True(t, idx.UsedPorts["192.168.0.100"].Check(22))
}
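// TestNetworkIndex_AddAllocs verifies that AddAllocs marks the ports of
// running allocations as used, including allocations still running on the
// client with DesiredStatus=stop, while ignoring allocations that are no
// longer running.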
func TestNetworkIndex_AddAllocs(t *testing.T) {
ci.Parallel(t)
idx := NewNetworkIndex()
allocs := []*Allocation{
{
ClientStatus: AllocClientStatusRunning,
DesiredStatus: AllocDesiredStatusRun,
AllocatedResources: &AllocatedResources{
Tasks: map[string]*AllocatedTaskResources{
"web": {
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
MBits: 20,
ReservedPorts: []Port{{"one", 8000, 0, ""}, {"two", 9000, 0, ""}},
},
},
},
},
},
},
{
ClientStatus: AllocClientStatusRunning,
DesiredStatus: AllocDesiredStatusRun,
AllocatedResources: &AllocatedResources{
Tasks: map[string]*AllocatedTaskResources{
"api": {
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
MBits: 50,
ReservedPorts: []Port{{"one", 10000, 0, ""}},
},
},
},
},
},
},
{
// Allocations running on clients should have their
// ports counted even if their DesiredStatus=stop
ClientStatus: AllocClientStatusRunning,
DesiredStatus: AllocDesiredStatusStop,
AllocatedResources: &AllocatedResources{
Tasks: map[string]*AllocatedTaskResources{
"api": {
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
MBits: 50,
ReservedPorts: []Port{{"one", 10001, 0, ""}},
},
},
},
},
},
},
{
// Allocations *not* running on clients should *not*
// have their ports counted even if their
// DesiredStatus=run
ClientStatus: AllocClientStatusFailed,
DesiredStatus: AllocDesiredStatusRun,
AllocatedResources: &AllocatedResources{
Tasks: map[string]*AllocatedTaskResources{
"api": {
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
MBits: 50,
ReservedPorts: []Port{{"one", 10001, 0, ""}},
},
},
},
},
},
},
}
collide, reason := idx.AddAllocs(allocs)
assert.False(t, collide)
assert.Empty(t, reason)
assert.True(t, idx.UsedPorts["192.168.0.100"].Check(8000))
assert.True(t, idx.UsedPorts["192.168.0.100"].Check(9000))
assert.True(t, idx.UsedPorts["192.168.0.100"].Check(10000))
assert.True(t, idx.UsedPorts["192.168.0.100"].Check(10001))
}
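// TestNetworkIndex_AddReserved verifies that AddReserved marks bandwidth and
// ports as used and reports a collision when the same network is reserved a
// second time.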
func TestNetworkIndex_AddReserved(t *testing.T) {
ci.Parallel(t)
idx := NewNetworkIndex()
reserved := &NetworkResource{
Device: "eth0",
IP: "192.168.0.100",
MBits: 20,
ReservedPorts: []Port{{"one", 8000, 0, ""}, {"two", 9000, 0, ""}},
}
collide, reasons := idx.AddReserved(reserved)
if collide || len(reasons) > 0 {
t.Fatalf("bad")
}
if idx.UsedBandwidth["eth0"] != 20 {
t.Fatalf("Bad")
}
if !idx.UsedPorts["192.168.0.100"].Check(8000) {
t.Fatalf("Bad")
}
if !idx.UsedPorts["192.168.0.100"].Check(9000) {
t.Fatalf("Bad")
}
// Try to reserve the same network
collide, reasons = idx.AddReserved(reserved)
if !collide || len(reasons) == 0 {
t.Fatalf("bad")
}
}
// XXX Reserving ports doesn't work when yielding from a CIDR block. This is
// okay for now since we do not actually fingerprint CIDR blocks.
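// TestNetworkIndex_yieldIP checks that yieldIP visits every address in the
// node's CIDR block.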
func TestNetworkIndex_yieldIP(t *testing.T) {
ci.Parallel(t)
idx := NewNetworkIndex()
n := &Node{
NodeResources: &NodeResources{
Networks: []*NetworkResource{
{
Device: "eth0",
CIDR: "192.168.0.100/30",
MBits: 1000,
},
},
},
}
idx.SetNode(n)
var out []string
idx.yieldIP(func(n *NetworkResource, ip net.IP) (stop bool) {
out = append(out, ip.String())
return
})
expect := []string{"192.168.0.100", "192.168.0.101",
"192.168.0.102", "192.168.0.103"}
if !reflect.DeepEqual(out, expect) {
t.Fatalf("bad: %v", out)
}
}
// TestNetworkIndex_AssignPorts exercises assigning ports on group networks.
func TestNetworkIndex_AssignPorts(t *testing.T) {
ci.Parallel(t)
// Create a node that has only two free dynamic ports
idx := NewNetworkIndex()
n := &Node{
NodeResources: &NodeResources{
Networks: []*NetworkResource{
{
Device: "eth0",
CIDR: "192.168.0.100/32",
IP: "192.168.0.100",
MBits: 1000,
},
},
NodeNetworks: []*NodeNetworkResource{
{
Mode: "host",
Device: "eth0",
Speed: 1000,
Addresses: []NodeNetworkAddress{
{
Alias: "default",
Address: "192.168.0.100",
Family: NodeNetworkAF_IPv4,
},
},
},
},
},
ReservedResources: &NodeReservedResources{
Networks: NodeReservedNetworkResources{
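// Reserving all but the last two dynamic ports forces the two dynamic asks
// below to land on MaxDynamicPort-1 and MaxDynamicPort.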
ReservedHostPorts: fmt.Sprintf("%d-%d", idx.MinDynamicPort, idx.MaxDynamicPort-2),
},
},
}
idx.SetNode(n)
// Ask for 2 dynamic ports
ask := &NetworkResource{
ReservedPorts: []Port{{"static", 443, 443, "default"}},
DynamicPorts: []Port{{"http", 0, 80, "default"}, {"admin", 0, 8080, "default"}},
}
offer, err := idx.AssignPorts(ask)
must.NoError(t, err)
must.NotNil(t, offer, must.Sprint("did not get an offer"))
staticPortMapping, ok := offer.Get("static")
must.True(t, ok)
httpPortMapping, ok := offer.Get("http")
must.True(t, ok)
adminPortMapping, ok := offer.Get("admin")
must.True(t, ok)
must.NotEq(t, httpPortMapping.Value, adminPortMapping.Value,
must.Sprint("assigned dynamic ports must not conflict"))
must.Eq(t, 443, staticPortMapping.Value)
must.Between(t, idx.MaxDynamicPort-1, httpPortMapping.Value, idx.MaxDynamicPort)
must.Between(t, idx.MaxDynamicPort-1, adminPortMapping.Value, idx.MaxDynamicPort)
}
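// TestNetworkIndex_AssignTaskNetwork exercises the deprecated
// AssignTaskNetwork path: reserved ports, dynamic ports, and the
// bandwidth-exceeded error.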
func TestNetworkIndex_AssignTaskNetwork(t *testing.T) {
ci.Parallel(t)
idx := NewNetworkIndex()
n := &Node{
NodeResources: &NodeResources{
Networks: []*NetworkResource{
{
Device: "eth0",
CIDR: "192.168.0.100/30",
MBits: 1000,
},
},
},
}
idx.SetNode(n)
allocs := []*Allocation{
{
TaskResources: map[string]*Resources{
"web": {
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
MBits: 20,
ReservedPorts: []Port{{"one", 8000, 0, ""}, {"two", 9000, 0, ""}},
},
},
},
},
},
{
TaskResources: map[string]*Resources{
"api": {
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
MBits: 50,
ReservedPorts: []Port{{"main", 10000, 0, ""}},
},
},
},
},
},
}
idx.AddAllocs(allocs)
// Ask for a reserved port
ask := &NetworkResource{
ReservedPorts: []Port{{"main", 8000, 0, ""}},
}
offer, err := idx.AssignTaskNetwork(ask)
require.NoError(t, err)
require.NotNil(t, offer)
require.Equal(t, "192.168.0.101", offer.IP)
rp := Port{"main", 8000, 0, ""}
require.Len(t, offer.ReservedPorts, 1)
require.Exactly(t, rp, offer.ReservedPorts[0])
// Ask for dynamic ports
ask = &NetworkResource{
DynamicPorts: []Port{{"http", 0, 80, ""}, {"https", 0, 443, ""}, {"admin", 0, -1, ""}},
}
offer, err = idx.AssignTaskNetwork(ask)
require.NoError(t, err)
require.NotNil(t, offer)
require.Equal(t, "192.168.0.100", offer.IP)
require.Len(t, offer.DynamicPorts, 3)
var adminPort Port
for _, port := range offer.DynamicPorts {
require.NotZero(t, port.Value)
if port.Label == "admin" {
adminPort = port
}
}
require.Equal(t, adminPort.Value, adminPort.To)
// Ask for reserved + dynamic ports
ask = &NetworkResource{
ReservedPorts: []Port{{"main", 2345, 0, ""}},
DynamicPorts: []Port{{"http", 0, 80, ""}, {"https", 0, 443, ""}, {"admin", 0, 8080, ""}},
}
offer, err = idx.AssignTaskNetwork(ask)
require.NoError(t, err)
require.NotNil(t, offer)
require.Equal(t, "192.168.0.100", offer.IP)
rp = Port{"main", 2345, 0, ""}
require.Len(t, offer.ReservedPorts, 1)
require.Exactly(t, rp, offer.ReservedPorts[0])
// Ask for too much bandwidth
ask = &NetworkResource{
MBits: 1000,
}
offer, err = idx.AssignTaskNetwork(ask)
require.Error(t, err)
require.Equal(t, "bandwidth exceeded", err.Error())
require.Nil(t, offer)
}
// This test ensures that even with a small domain of available ports we are
// able to make a dynamic port allocation.
func TestNetworkIndex_AssignTaskNetwork_Dynamic_Contention(t *testing.T) {
ci.Parallel(t)
// Create a node that has only two free dynamic ports
idx := NewNetworkIndex()
n := &Node{
NodeResources: &NodeResources{
Networks: []*NetworkResource{
{
Device: "eth0",
CIDR: "192.168.0.100/32",
IP: "192.168.0.100",
MBits: 1000,
},
},
},
ReservedResources: &NodeReservedResources{
Networks: NodeReservedNetworkResources{
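// Reserving all but the last two dynamic ports forces both dynamic asks to
// land on MaxDynamicPort-1 and MaxDynamicPort.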
ReservedHostPorts: fmt.Sprintf("%d-%d", idx.MinDynamicPort, idx.MaxDynamicPort-2),
},
},
}
idx.SetNode(n)
// Ask for 2 dynamic ports
ask := &NetworkResource{
DynamicPorts: []Port{{"http", 0, 80, ""}, {"admin", 0, 443, ""}},
}
offer, err := idx.AssignTaskNetwork(ask)
must.NoError(t, err)
must.NotNil(t, offer, must.Sprint("did not get an offer"))
must.Eq(t, "192.168.0.100", offer.IP)
must.Len(t, 2, offer.DynamicPorts, must.Sprint("There should be two dynamic ports"))
must.NotEq(t, offer.DynamicPorts[0].Value, offer.DynamicPorts[1].Value,
must.Sprint("assigned dynamic ports must not conflict"))
must.Between(t, idx.MaxDynamicPort-1, offer.DynamicPorts[0].Value, idx.MaxDynamicPort)
must.Between(t, idx.MaxDynamicPort-1, offer.DynamicPorts[1].Value, idx.MaxDynamicPort)
}
// COMPAT(0.11): Remove in 0.11
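// TestNetworkIndex_SetNode_Old exercises SetNode with the legacy
// Node.Resources and Node.Reserved fields.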
func TestNetworkIndex_SetNode_Old(t *testing.T) {
ci.Parallel(t)
idx := NewNetworkIndex()
n := &Node{
Resources: &Resources{
Networks: []*NetworkResource{
{
Device: "eth0",
CIDR: "192.168.0.100/32",
MBits: 1000,
},
},
},
Reserved: &Resources{
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
ReservedPorts: []Port{{"ssh", 22, 0, ""}},
MBits: 1,
},
},
},
}
require.NoError(t, idx.SetNode(n))
require.Len(t, idx.TaskNetworks, 1)
require.Equal(t, 1000, idx.AvailBandwidth["eth0"])
require.Equal(t, 1, idx.UsedBandwidth["eth0"])
require.True(t, idx.UsedPorts["192.168.0.100"].Check(22))
}
// COMPAT(0.11): Remove in 0.11
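// TestNetworkIndex_AddAllocs_Old exercises AddAllocs with the legacy
// Allocation.TaskResources fields.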
func TestNetworkIndex_AddAllocs_Old(t *testing.T) {
ci.Parallel(t)
idx := NewNetworkIndex()
allocs := []*Allocation{
{
TaskResources: map[string]*Resources{
"web": {
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
MBits: 20,
ReservedPorts: []Port{{"one", 8000, 0, ""}, {"two", 9000, 0, ""}},
},
},
},
},
},
{
TaskResources: map[string]*Resources{
"api": {
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
MBits: 50,
ReservedPorts: []Port{{"one", 10000, 0, ""}},
},
},
},
},
},
}
collide, reason := idx.AddAllocs(allocs)
if collide || reason != "" {
t.Fatalf("bad")
}
if idx.UsedBandwidth["eth0"] != 70 {
t.Fatalf("Bad")
}
if !idx.UsedPorts["192.168.0.100"].Check(8000) {
t.Fatalf("Bad")
}
if !idx.UsedPorts["192.168.0.100"].Check(9000) {
t.Fatalf("Bad")
}
if !idx.UsedPorts["192.168.0.100"].Check(10000) {
t.Fatalf("Bad")
}
}
// COMPAT(0.11): Remove in 0.11
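// TestNetworkIndex_yieldIP_Old exercises yieldIP with the legacy
// Node.Resources fields.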
func TestNetworkIndex_yieldIP_Old(t *testing.T) {
ci.Parallel(t)
idx := NewNetworkIndex()
n := &Node{
Resources: &Resources{
Networks: []*NetworkResource{
{
Device: "eth0",
CIDR: "192.168.0.100/30",
MBits: 1000,
},
},
},
Reserved: &Resources{
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
ReservedPorts: []Port{{"ssh", 22, 0, ""}},
MBits: 1,
},
},
},
}
idx.SetNode(n)
var out []string
idx.yieldIP(func(n *NetworkResource, ip net.IP) (stop bool) {
out = append(out, ip.String())
return
})
expect := []string{"192.168.0.100", "192.168.0.101",
"192.168.0.102", "192.168.0.103"}
if !reflect.DeepEqual(out, expect) {
t.Fatalf("bad: %v", out)
}
}
// COMPAT(0.11): Remove in 0.11
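// TestNetworkIndex_AssignTaskNetwork_Old exercises AssignTaskNetwork with the
// legacy Node.Resources and Allocation.TaskResources fields.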
func TestNetworkIndex_AssignTaskNetwork_Old(t *testing.T) {
ci.Parallel(t)
idx := NewNetworkIndex()
n := &Node{
Resources: &Resources{
Networks: []*NetworkResource{
{
Device: "eth0",
CIDR: "192.168.0.100/30",
MBits: 1000,
},
},
},
Reserved: &Resources{
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
ReservedPorts: []Port{{"ssh", 22, 0, ""}},
MBits: 1,
},
},
},
}
idx.SetNode(n)
allocs := []*Allocation{
{
TaskResources: map[string]*Resources{
"web": {
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
MBits: 20,
ReservedPorts: []Port{{"one", 8000, 0, ""}, {"two", 9000, 0, ""}},
},
},
},
},
},
{
TaskResources: map[string]*Resources{
"api": {
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
MBits: 50,
ReservedPorts: []Port{{"main", 10000, 0, ""}},
},
},
},
},
},
}
idx.AddAllocs(allocs)
// Ask for a reserved port
ask := &NetworkResource{
ReservedPorts: []Port{{"main", 8000, 0, ""}},
}
offer, err := idx.AssignTaskNetwork(ask)
if err != nil {
t.Fatalf("err: %v", err)
}
if offer == nil {
t.Fatalf("bad")
}
if offer.IP != "192.168.0.101" {
t.Fatalf("bad: %#v", offer)
}
rp := Port{"main", 8000, 0, ""}
if len(offer.ReservedPorts) != 1 || offer.ReservedPorts[0] != rp {
t.Fatalf("bad: %#v", offer)
}
// Ask for dynamic ports
ask = &NetworkResource{
DynamicPorts: []Port{{"http", 0, 80, ""}, {"https", 0, 443, ""}, {"admin", 0, 8080, ""}},
}
offer, err = idx.AssignTaskNetwork(ask)
if err != nil {
t.Fatalf("err: %v", err)
}
if offer == nil {
t.Fatalf("bad")
}
if offer.IP != "192.168.0.100" {
t.Fatalf("bad: %#v", offer)
}
if len(offer.DynamicPorts) != 3 {
t.Fatalf("There should be three dynamic ports")
}
for _, port := range offer.DynamicPorts {
if port.Value == 0 {
t.Fatalf("Dynamic Port: %v should have been assigned a host port", port.Label)
}
}
// Ask for reserved + dynamic ports
ask = &NetworkResource{
ReservedPorts: []Port{{"main", 2345, 0, ""}},
DynamicPorts: []Port{{"http", 0, 80, ""}, {"https", 0, 443, ""}, {"admin", 0, 8080, ""}},
}
offer, err = idx.AssignTaskNetwork(ask)
if err != nil {
t.Fatalf("err: %v", err)
}
if offer == nil {
t.Fatalf("bad")
}
if offer.IP != "192.168.0.100" {
t.Fatalf("bad: %#v", offer)
}
rp = Port{"main", 2345, 0, ""}
if len(offer.ReservedPorts) != 1 || offer.ReservedPorts[0] != rp {
t.Fatalf("bad: %#v", offer)
}
// Ask for too much bandwidth
ask = &NetworkResource{
MBits: 1000,
}
offer, err = idx.AssignTaskNetwork(ask)
if err.Error() != "bandwidth exceeded" {
t.Fatalf("err: %v", err)
}
if offer != nil {
t.Fatalf("bad")
}
}

// COMPAT(0.11): Remove in 0.11
// This test ensures that even with a small domain of available ports we are
// able to make a dynamic port allocation.
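// It exercises the deprecated AssignTaskNetwork path using the legacy
// Node.Resources and Node.Reserved fields.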
func TestNetworkIndex_AssignTaskNetwork_Dynamic_Contention_Old(t *testing.T) {
ci.Parallel(t)
// Create a node that only has one free port
idx := NewNetworkIndex()
n := &Node{
Resources: &Resources{
Networks: []*NetworkResource{
{
Device: "eth0",
CIDR: "192.168.0.100/32",
MBits: 1000,
},
},
},
Reserved: &Resources{
Networks: []*NetworkResource{
{
Device: "eth0",
IP: "192.168.0.100",
MBits: 1,
},
},
},
}
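// Reserve every dynamic port except MaxDynamicPort so exactly one remains free.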
for i := idx.MinDynamicPort; i < idx.MaxDynamicPort; i++ {
n.Reserved.Networks[0].ReservedPorts = append(n.Reserved.Networks[0].ReservedPorts, Port{Value: i})
}
idx.SetNode(n)
// Ask for dynamic ports
ask := &NetworkResource{
DynamicPorts: []Port{{"http", 0, 80, ""}},
}
offer, err := idx.AssignTaskNetwork(ask)
if err != nil {
t.Fatalf("err: %v", err)
}
if offer == nil {
t.Fatalf("bad")
}
if offer.IP != "192.168.0.100" {
t.Fatalf("bad: %#v", offer)
}
if len(offer.DynamicPorts) != 1 {
t.Fatalf("There should be three dynamic ports")
}
if p := offer.DynamicPorts[0].Value; p != idx.MaxDynamicPort {
t.Fatalf("Dynamic Port: should have been assigned %d; got %d", p, idx.MaxDynamicPort)
}
}

func TestIntContains(t *testing.T) {
ci.Parallel(t)
l := []int{1, 2, 10, 20}
if isPortReserved(l, 50) {
t.Fatalf("bad")
}
if !isPortReserved(l, 20) {
t.Fatalf("bad")
}
if !isPortReserved(l, 1) {
t.Fatalf("bad")
}
}

func TestNetworkIndex_SetNode_HostNets(t *testing.T) {
ci.Parallel(t)
idx := NewNetworkIndex()
n := &Node{
NodeResources: &NodeResources{
Networks: []*NetworkResource{
// As of Nomad v1.3 bridge networks get
// registered with only their mode set.
{
Mode: "bridge",
},
// Localhost (agent interface)
{
CIDR: "127.0.0.1/32",
Device: "lo",
IP: "127.0.0.1",
MBits: 1000,
Mode: "host",
},
{
CIDR: "::1/128",
Device: "lo",
IP: "::1",
MBits: 1000,
Mode: "host",
},
// Node.NodeResources.Networks does *not*
// contain host_networks.
},
NodeNetworks: []*NodeNetworkResource{
// As of Nomad v1.3 bridge networks get
// registered with only their mode set.
{
Mode: "bridge",
},
{
Addresses: []NodeNetworkAddress{
{
Address: "127.0.0.1",
Alias: "default",
Family: "ipv4",
},
{
Address: "::1",
Alias: "default",
Family: "ipv6",
},
},
Device: "lo",
Mode: "host",
Speed: 1000,
},
{
Addresses: []NodeNetworkAddress{
{
Address: "192.168.0.1",
Alias: "eth0",
Family: "ipv4",
ReservedPorts: "22",
},
},
Device: "enxaaaaaaaaaaaa",
MacAddress: "aa:aa:aa:aa:aa:aa",
Mode: "host",
Speed: 1000,
},
{
Addresses: []NodeNetworkAddress{
{
Address: "192.168.1.1",
Alias: "eth1",
Family: "ipv4",
ReservedPorts: "80",
},
},
Device: "enxbbbbbbbbbbbb",
MacAddress: "bb:bb:bb:bb:bb:bb",
Mode: "host",
Speed: 1000,
},
},
},
ReservedResources: &NodeReservedResources{
Networks: NodeReservedNetworkResources{
ReservedHostPorts: "22",
},
},
}
require.NoError(t, idx.SetNode(n))
// TaskNetworks should only contain the bridge and agent network
require.Len(t, idx.TaskNetworks, 2)
// Ports should be used across all 4 IPs
require.Equal(t, 4, len(idx.UsedPorts))
// 22 should be reserved on all IPs
require.True(t, idx.UsedPorts["127.0.0.1"].Check(22))
require.True(t, idx.UsedPorts["::1"].Check(22))
require.True(t, idx.UsedPorts["192.168.0.1"].Check(22))
require.True(t, idx.UsedPorts["192.168.1.1"].Check(22))
// 80 should only be reserved on eth1's address
require.False(t, idx.UsedPorts["127.0.0.1"].Check(80))
require.False(t, idx.UsedPorts["::1"].Check(80))
require.False(t, idx.UsedPorts["192.168.0.1"].Check(80))
require.True(t, idx.UsedPorts["192.168.1.1"].Check(80))
}
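
// The test below is an illustrative sketch rather than part of the original
// suite. It assumes the AssignPorts(ask *NetworkResource) (AllocatedPorts, error)
// signature and the AllocatedPortMapping{Label, Value, To, HostIP} shape, and
// shows how a reserved port pinned to a host_network alias is offered on that
// interface's address.
func TestNetworkIndex_AssignPorts_HostNetworkAlias_Sketch(t *testing.T) {
	ci.Parallel(t)

	// A minimal node with a single named host network ("eth1").
	idx := NewNetworkIndex()
	n := &Node{
		NodeResources: &NodeResources{
			NodeNetworks: []*NodeNetworkResource{
				{
					Addresses: []NodeNetworkAddress{
						{
							Address: "192.168.1.1",
							Alias:   "eth1",
							Family:  "ipv4",
						},
					},
					Device: "eth1",
					Mode:   "host",
					Speed:  1000,
				},
			},
		},
	}
	require.NoError(t, idx.SetNode(n))

	// Pin the ask to the "eth1" host_network via Port.HostNetwork.
	ask := &NetworkResource{
		ReservedPorts: []Port{{Label: "http", Value: 8080, To: 8080, HostNetwork: "eth1"}},
	}
	offer, err := idx.AssignPorts(ask)
	require.NoError(t, err)
	require.Len(t, offer, 1)

	// The offered mapping should land on the eth1 address.
	require.Equal(t, "192.168.1.1", offer[0].HostIP)
}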