a65fbeb3b3
This PR adds a secondary path for cleaning up iptables created for an allocation when the normal CNI library fails to do so. This typically happens when the state of the pause container is unexpected - e.g. deleted out of band from Nomad. Before, the iptables rules would be leaked which could lead to unexpected nat routing behavior later on (in addition to leaked resources). With this change, we scan for the rules created on behalf of the allocation being GC'd and delete them. Fixes #6385
376 lines
11 KiB
Go
376 lines
11 KiB
Go
// For now CNI is supported only on Linux.
|
|
//
|
|
//go:build linux
|
|
// +build linux
|
|
|
|
package allocrunner
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"math/rand"
|
|
"os"
|
|
"path/filepath"
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
"time"
|
|
|
|
cni "github.com/containerd/go-cni"
|
|
cnilibrary "github.com/containernetworking/cni/libcni"
|
|
"github.com/coreos/go-iptables/iptables"
|
|
log "github.com/hashicorp/go-hclog"
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
"github.com/hashicorp/nomad/plugins/drivers"
|
|
)
|
|
|
|
// Defaults applied by the CNI network configurator when the client
// configuration leaves a value unset.
const (
	// envCNIPath names the environment variable consulted for the CNI plugin
	// path when the client does not set one explicitly.
	envCNIPath = "CNI_PATH"

	// defaultCNIPath is the CNI plugin path used when neither the client nor
	// the environment variable provides one.
	defaultCNIPath = "/opt/cni/bin"

	// defaultCNIInterfacePrefix is the network interface prefix used when the
	// client config does not set one.
	defaultCNIInterfacePrefix = "eth"
)
|
|
|
|
type cniNetworkConfigurator struct {
|
|
cni cni.CNI
|
|
cniConf []byte
|
|
ignorePortMappingHostIP bool
|
|
|
|
rand *rand.Rand
|
|
logger log.Logger
|
|
}
|
|
|
|
func newCNINetworkConfigurator(logger log.Logger, cniPath, cniInterfacePrefix, cniConfDir, networkName string, ignorePortMappingHostIP bool) (*cniNetworkConfigurator, error) {
|
|
cniConf, err := loadCNIConf(cniConfDir, networkName)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to load CNI config: %v", err)
|
|
}
|
|
|
|
return newCNINetworkConfiguratorWithConf(logger, cniPath, cniInterfacePrefix, ignorePortMappingHostIP, cniConf)
|
|
}
|
|
|
|
func newCNINetworkConfiguratorWithConf(logger log.Logger, cniPath, cniInterfacePrefix string, ignorePortMappingHostIP bool, cniConf []byte) (*cniNetworkConfigurator, error) {
|
|
conf := &cniNetworkConfigurator{
|
|
cniConf: cniConf,
|
|
rand: rand.New(rand.NewSource(time.Now().Unix())),
|
|
logger: logger,
|
|
ignorePortMappingHostIP: ignorePortMappingHostIP,
|
|
}
|
|
if cniPath == "" {
|
|
if cniPath = os.Getenv(envCNIPath); cniPath == "" {
|
|
cniPath = defaultCNIPath
|
|
}
|
|
}
|
|
|
|
if cniInterfacePrefix == "" {
|
|
cniInterfacePrefix = defaultCNIInterfacePrefix
|
|
}
|
|
|
|
c, err := cni.New(cni.WithPluginDir(filepath.SplitList(cniPath)),
|
|
cni.WithInterfacePrefix(cniInterfacePrefix))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
conf.cni = c
|
|
|
|
return conf, nil
|
|
}
|
|
|
|
// Setup calls the CNI plugins with the add action
|
|
func (c *cniNetworkConfigurator) Setup(ctx context.Context, alloc *structs.Allocation, spec *drivers.NetworkIsolationSpec) (*structs.AllocNetworkStatus, error) {
|
|
if err := c.ensureCNIInitialized(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Depending on the version of bridge cni plugin used, a known race could occure
|
|
// where two alloc attempt to create the nomad bridge at the same time, resulting
|
|
// in one of them to fail. This rety attempts to overcome those erroneous failures.
|
|
const retry = 3
|
|
var firstError error
|
|
var res *cni.CNIResult
|
|
for attempt := 1; ; attempt++ {
|
|
var err error
|
|
if res, err = c.cni.Setup(ctx, alloc.ID, spec.Path, cni.WithCapabilityPortMap(getPortMapping(alloc, c.ignorePortMappingHostIP))); err != nil {
|
|
c.logger.Warn("failed to configure network", "error", err, "attempt", attempt)
|
|
switch attempt {
|
|
case 1:
|
|
firstError = err
|
|
case retry:
|
|
return nil, fmt.Errorf("failed to configure network: %v", firstError)
|
|
}
|
|
|
|
// Sleep for 1 second + jitter
|
|
time.Sleep(time.Second + (time.Duration(c.rand.Int63n(1000)) * time.Millisecond))
|
|
continue
|
|
}
|
|
break
|
|
}
|
|
|
|
if c.logger.IsDebug() {
|
|
resultJSON, _ := json.Marshal(res)
|
|
c.logger.Debug("received result from CNI", "result", string(resultJSON))
|
|
}
|
|
|
|
return c.cniToAllocNet(res)
|
|
|
|
}
|
|
|
|
// cniToAllocNet converts a CNIResult to an AllocNetworkStatus or returns an
|
|
// error. The first interface and IP with a sandbox and address set are
|
|
// preferred. Failing that the first interface with an IP is selected.
|
|
//
|
|
// Unfortunately the go-cni library returns interfaces in an unordered map so
|
|
// the results may be nondeterministic depending on CNI plugin output.
|
|
func (c *cniNetworkConfigurator) cniToAllocNet(res *cni.CNIResult) (*structs.AllocNetworkStatus, error) {
|
|
netStatus := new(structs.AllocNetworkStatus)
|
|
|
|
// Use the first sandbox interface with an IP address
|
|
if len(res.Interfaces) > 0 {
|
|
for name, iface := range res.Interfaces {
|
|
if iface == nil {
|
|
// this should never happen but this value is coming from external
|
|
// plugins so we should guard against it
|
|
delete(res.Interfaces, name)
|
|
}
|
|
|
|
if iface.Sandbox != "" && len(iface.IPConfigs) > 0 {
|
|
netStatus.Address = iface.IPConfigs[0].IP.String()
|
|
netStatus.InterfaceName = name
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// If no IP address was found, use the first interface with an address
|
|
// found as a fallback
|
|
if netStatus.Address == "" {
|
|
var found bool
|
|
for name, iface := range res.Interfaces {
|
|
if len(iface.IPConfigs) > 0 {
|
|
ip := iface.IPConfigs[0].IP.String()
|
|
c.logger.Debug("no sandbox interface with an address found CNI result, using first available", "interface", name, "ip", ip)
|
|
netStatus.Address = ip
|
|
netStatus.InterfaceName = name
|
|
found = true
|
|
break
|
|
}
|
|
}
|
|
if !found {
|
|
c.logger.Warn("no address could be found from CNI result")
|
|
}
|
|
}
|
|
|
|
// If no IP address could be found, return an error
|
|
if netStatus.Address == "" {
|
|
return nil, fmt.Errorf("failed to configure network: no interface with an address")
|
|
|
|
}
|
|
|
|
// Use the first DNS results.
|
|
if len(res.DNS) > 0 {
|
|
netStatus.DNS = &structs.DNSConfig{
|
|
Servers: res.DNS[0].Nameservers,
|
|
Searches: res.DNS[0].Search,
|
|
Options: res.DNS[0].Options,
|
|
}
|
|
}
|
|
|
|
return netStatus, nil
|
|
}
|
|
|
|
func loadCNIConf(confDir, name string) ([]byte, error) {
|
|
files, err := cnilibrary.ConfFiles(confDir, []string{".conf", ".conflist", ".json"})
|
|
switch {
|
|
case err != nil:
|
|
return nil, fmt.Errorf("failed to detect CNI config file: %v", err)
|
|
case len(files) == 0:
|
|
return nil, fmt.Errorf("no CNI network config found in %s", confDir)
|
|
}
|
|
|
|
// files contains the network config files associated with cni network.
|
|
// Use lexicographical way as a defined order for network config files.
|
|
sort.Strings(files)
|
|
for _, confFile := range files {
|
|
if strings.HasSuffix(confFile, ".conflist") {
|
|
confList, err := cnilibrary.ConfListFromFile(confFile)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to load CNI config list file %s: %v", confFile, err)
|
|
}
|
|
if confList.Name == name {
|
|
return confList.Bytes, nil
|
|
}
|
|
} else {
|
|
conf, err := cnilibrary.ConfFromFile(confFile)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to load CNI config file %s: %v", confFile, err)
|
|
}
|
|
if conf.Network.Name == name {
|
|
return conf.Bytes, nil
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil, fmt.Errorf("CNI network config not found for name %q", name)
|
|
}
|
|
|
|
// Teardown calls the CNI plugins with the delete action
|
|
func (c *cniNetworkConfigurator) Teardown(ctx context.Context, alloc *structs.Allocation, spec *drivers.NetworkIsolationSpec) error {
|
|
if err := c.ensureCNIInitialized(); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := c.cni.Remove(ctx, alloc.ID, spec.Path, cni.WithCapabilityPortMap(getPortMapping(alloc, c.ignorePortMappingHostIP))); err != nil {
|
|
// create a real handle to iptables
|
|
ipt, iptErr := iptables.New()
|
|
if iptErr != nil {
|
|
return fmt.Errorf("failed to detect iptables: %w", iptErr)
|
|
}
|
|
// most likely the pause container was removed from underneath nomad
|
|
return c.forceCleanup(ipt, alloc.ID)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// IPTables is a subset of iptables.IPTables, limited to the operations the
// manual cleanup path needs.
type IPTables interface {
	// List returns the rules of the given chain in the given table.
	List(table, chain string) ([]string, error)

	// Delete removes the rule described by the rule tokens from the given
	// chain in the given table.
	Delete(table, chain string, rule ...string) error

	// ClearAndDeleteChain empties and then removes the given chain from the
	// given table.
	ClearAndDeleteChain(table, chain string) error
}
|
|
|
|
// ipRuleRe parses a nat POSTROUTING rule created by nomad, capturing the
// source CIDR, the allocation ID from the rule comment, and the name of the
// CNI chain that the rule jumps to. Example of a matching rule:
//
//	-A POSTROUTING -s 172.26.64.191/32 -m comment --comment "name: \"nomad\" id: \"6b235529-8111-4bbe-520b-d639b1d2a94e\"" -j CNI-50e58ea77dc52e0c731e3799
var ipRuleRe = regexp.MustCompile(`-A POSTROUTING -s (\S+) -m comment --comment "name: \\"nomad\\" id: \\"([[:xdigit:]-]+)\\"" -j (CNI-[[:xdigit:]]+)`)
|
|
|
|
// forceCleanup is the backup plan for removing the iptables rule and chain associated with
|
|
// an allocation that was using bridge networking. The cni library refuses to handle a
|
|
// dirty state - e.g. the pause container is removed out of band, and so we must cleanup
|
|
// iptables ourselves to avoid leaking rules.
|
|
func (c *cniNetworkConfigurator) forceCleanup(ipt IPTables, allocID string) error {
|
|
const (
|
|
natTable = "nat"
|
|
postRoutingChain = "POSTROUTING"
|
|
commentFmt = `--comment "name: \"nomad\" id: \"%s\""`
|
|
)
|
|
|
|
// list the rules on the POSTROUTING chain of the nat table
|
|
rules, err := ipt.List(natTable, postRoutingChain)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to list iptables rules: %w", err)
|
|
}
|
|
|
|
// find the POSTROUTING rule associated with our allocation
|
|
matcher := fmt.Sprintf(commentFmt, allocID)
|
|
var ruleToPurge string
|
|
for _, rule := range rules {
|
|
if strings.Contains(rule, matcher) {
|
|
ruleToPurge = rule
|
|
break
|
|
}
|
|
}
|
|
|
|
// no rule found for our allocation, just give up
|
|
if ruleToPurge == "" {
|
|
return fmt.Errorf("failed to find postrouting rule for alloc %s", allocID)
|
|
}
|
|
|
|
// re-create the rule we need to delete, as tokens
|
|
subs := ipRuleRe.FindStringSubmatch(ruleToPurge)
|
|
if len(subs) != 4 {
|
|
return fmt.Errorf("failed to parse postrouting rule for alloc %s", allocID)
|
|
}
|
|
cidr := subs[1]
|
|
id := subs[2]
|
|
chainID := subs[3]
|
|
toDel := []string{
|
|
`-s`,
|
|
cidr,
|
|
`-m`,
|
|
`comment`,
|
|
`--comment`,
|
|
`name: "nomad" id: "` + id + `"`,
|
|
`-j`,
|
|
chainID,
|
|
}
|
|
|
|
// remove the jump rule
|
|
ok := true
|
|
if err = ipt.Delete(natTable, postRoutingChain, toDel...); err != nil {
|
|
c.logger.Warn("failed to remove iptables nat.POSTROUTING rule", "alloc_id", allocID, "chain", chainID, "error", err)
|
|
ok = false
|
|
}
|
|
|
|
// remote the associated chain
|
|
if err = ipt.ClearAndDeleteChain(natTable, chainID); err != nil {
|
|
c.logger.Warn("failed to remove iptables nat chain", "chain", chainID, "error", err)
|
|
ok = false
|
|
}
|
|
|
|
if !ok {
|
|
return fmt.Errorf("failed to cleanup iptables rules for alloc %s", allocID)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (c *cniNetworkConfigurator) ensureCNIInitialized() error {
|
|
if err := c.cni.Status(); cni.IsCNINotInitialized(err) {
|
|
return c.cni.Load(cni.WithConfListBytes(c.cniConf))
|
|
} else {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// getPortMapping builds a list of portMapping structs that are used as the
|
|
// portmapping capability arguments for the portmap CNI plugin
|
|
func getPortMapping(alloc *structs.Allocation, ignoreHostIP bool) []cni.PortMapping {
|
|
var ports []cni.PortMapping
|
|
|
|
if len(alloc.AllocatedResources.Shared.Ports) == 0 && len(alloc.AllocatedResources.Shared.Networks) > 0 {
|
|
for _, network := range alloc.AllocatedResources.Shared.Networks {
|
|
for _, port := range append(network.DynamicPorts, network.ReservedPorts...) {
|
|
if port.To < 1 {
|
|
port.To = port.Value
|
|
}
|
|
for _, proto := range []string{"tcp", "udp"} {
|
|
ports = append(ports, cni.PortMapping{
|
|
HostPort: int32(port.Value),
|
|
ContainerPort: int32(port.To),
|
|
Protocol: proto,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
for _, port := range alloc.AllocatedResources.Shared.Ports {
|
|
if port.To < 1 {
|
|
port.To = port.Value
|
|
}
|
|
for _, proto := range []string{"tcp", "udp"} {
|
|
portMapping := cni.PortMapping{
|
|
HostPort: int32(port.Value),
|
|
ContainerPort: int32(port.To),
|
|
Protocol: proto,
|
|
}
|
|
if !ignoreHostIP {
|
|
portMapping.HostIP = port.HostIP
|
|
}
|
|
ports = append(ports, portMapping)
|
|
}
|
|
}
|
|
}
|
|
return ports
|
|
}
|