package agent

import (
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"math/rand"
	"net/http/httptest"
	"os"
	"path/filepath"
	"strconv"
	"strings"
	"testing"
	"time"

	metrics "github.com/armon/go-metrics"
	uuid "github.com/hashicorp/go-uuid"

	"github.com/hashicorp/consul/agent/config"
	"github.com/hashicorp/consul/agent/connect"
	"github.com/hashicorp/consul/agent/consul"
	"github.com/hashicorp/consul/agent/structs"
	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/consul/logger"
	"github.com/hashicorp/consul/sdk/freeport"
	"github.com/hashicorp/consul/sdk/testutil"
	"github.com/hashicorp/consul/sdk/testutil/retry"
)

func init() {
	rand.Seed(time.Now().UnixNano()) // seed random number generator
}

// TempDir defines the base dir for temporary directories.
var TempDir = os.TempDir()

// TestAgent encapsulates an Agent with a default configuration and
// startup procedure suitable for testing. Start reports most failures as
// errors; it panics when ExpectConfigError is set and the agent fails to
// start, and TestConfig panics if the configuration cannot be built.
//
// When DataDir is empty, Start creates a temporary data directory. That
// directory is removed only if Start fails; Shutdown leaves it in place.
type TestAgent struct {
	// Name is an optional name of the agent.
	Name string

	// HCL is an HCL configuration fragment that Start passes to TestConfig
	// as an additional source.
	HCL string

	// ExpectConfigError can be set to prevent the agent retrying Start on errors
	// and eventually blowing up with runtime.Goexit. This enables tests to assert
	// that some specific bit of config actually does prevent startup entirely in
	// a reasonable way without reproducing a lot of the boilerplate here.
	ExpectConfigError bool

	// Config is the agent configuration. Start always rebuilds it from
	// TestConfig(), the random ports source, HCL and, when DataDir is
	// empty, a generated data_dir; a value assigned before Start is
	// overwritten.
	Config *config.RuntimeConfig

	// LogOutput is the sink for the logs. If nil, logs are written
	// to os.Stderr.
	LogOutput io.Writer

	// LogWriter is used for streaming logs.
	LogWriter *logger.LogWriter

	// DataDir, when non-empty, stops Start from creating a temporary data
	// directory. Start does not add it to the configuration itself, so
	// data_dir presumably has to be supplied through HCL as well (TODO
	// confirm against callers). It is removed if Start fails; Shutdown
	// never removes it.
	DataDir string

	// Key is the optional encryption key for the LAN and WAN keyring.
	Key string

	// UseTLS, if true, will disable the HTTP port and enable the HTTPS
	// one.
	UseTLS bool

	// dns is a reference to the first started DNS endpoint.
	// It is valid after Start() and nil if no DNS server was started.
	dns *DNSServer

	// srv is a reference to the first started HTTP endpoint.
	// It is valid after Start().
	srv *HTTPServer

	// Agent is the embedded consul agent.
	// It is valid after Start().
	*Agent
}

// NewTestAgent returns a started agent with the given name and
// configuration. Start is attempted up to three times and the test fails
// if the Agent could not be started. The caller should call Shutdown() to
// stop the agent; Shutdown does not remove the data directory.
func NewTestAgent(t *testing.T, name string, hcl string) *TestAgent {
	a := &TestAgent{Name: name, HCL: hcl, LogOutput: testutil.TestWriter(t)}

	retry.RunWith(retry.ThreeTimes(), t, func(r *retry.R) {
		if err := a.Start(); err != nil {
			r.Fatal(err)
		}
	})

	return a
}

// NewUnstartedAgent builds an Agent from TestConfig plus the given HCL
// source without starting it. It returns a nil Agent together with the
// error when the agent cannot be created.
//
// TODO: testing.T should be removed as a parameter, as it is not being used.
func NewUnstartedAgent(t *testing.T, name string, hcl string) (*Agent, error) {
	c := TestConfig(config.Source{Name: name, Format: "hcl", Data: hcl})
	a, err := New(c, nil)
	if err != nil {
		return nil, err
	}
	return a, nil
}

// Start starts a test agent. It returns an error if the agent could not be started.
// If no error is returned, the caller must call Shutdown() when finished.
//
// On failure the agent is shut down, the data directory (DataDir, or the
// temporary one created by this call) is removed, and the embedded Agent is
// left nil so that Start can be called again.
func (a *TestAgent) Start() (err error) {
	if a.Agent != nil {
		return fmt.Errorf("TestAgent already started")
	}

	// tmpDataDir is the temporary data directory created by this call, if
	// any. It is tracked separately because a.DataDir is left untouched.
	var tmpDataDir string

	cleanupTmpDir := func() {
		// Clean out the data dir if we are responsible for it before we
		// try again, since the old ports may have gotten written to
		// the data dir, such as in the Raft configuration.
		dir := a.DataDir
		if dir == "" {
			dir = tmpDataDir
		}
		if dir == "" {
			return
		}
		if rmErr := os.RemoveAll(dir); rmErr != nil {
			fmt.Printf("%s Error resetting data dir: %s\n", a.Name, rmErr)
		}
	}

	var hclDataDir string
	if a.DataDir == "" {
		name := "agent"
		if a.Name != "" {
			name = a.Name + "-agent"
		}
		name = strings.Replace(name, "/", "_", -1)
		d, err := ioutil.TempDir(TempDir, name)
		if err != nil {
			return fmt.Errorf("Error creating data dir %s: %s", filepath.Join(TempDir, name), err)
		}
		tmpDataDir = d
		// Use forward slashes so that backslashes in the path are not
		// read as escape sequences inside the quoted HCL string.
		hclDataDir = `data_dir = "` + filepath.ToSlash(d) + `"`
	}

	a.Config = TestConfig(
		randomPortsSource(a.UseTLS),
		config.Source{Name: a.Name, Format: "hcl", Data: a.HCL},
		config.Source{Name: a.Name + ".data_dir", Format: "hcl", Data: hclDataDir},
	)

	// write the keyring
	if a.Key != "" {
		for _, filename := range []string{SerfLANKeyring, SerfWANKeyring} {
			path := filepath.Join(a.Config.DataDir, filename)
			if err := initKeyring(path, a.Key); err != nil {
				cleanupTmpDir()
				return fmt.Errorf("Error creating keyring %s: %s", path, err)
			}
		}
	}

	logOutput := a.LogOutput
	if logOutput == nil {
		// TODO: move this out of Start() and back into NewTestAgent,
		// and make `logOutput = testutil.TestWriter(t)`
		logOutput = os.Stderr
	}
	agentLogger := log.New(logOutput, a.Name+" - ", log.LstdFlags|log.Lmicroseconds)

	agent, err := New(a.Config, agentLogger)
	if err != nil {
		cleanupTmpDir()
		return fmt.Errorf("Error creating agent: %s", err)
	}

	agent.LogOutput = logOutput
	agent.LogWriter = a.LogWriter
	agent.MemSink = metrics.NewInmemSink(1*time.Second, time.Minute)

	id := string(a.Config.NodeID)

	if err := agent.Start(); err != nil {
		// Stop the agent before removing its data directory; the shutdown
		// results are deliberately ignored because the start error is the
		// one being reported.
		agent.ShutdownAgent()
		agent.ShutdownEndpoints()
		cleanupTmpDir()

		if a.ExpectConfigError {
			// Panic the error since this can be caught if needed. Pretty gross way to
			// detect errors but enough for now and this is a tiny edge case that I'd
			// otherwise not have a way to test at all...
			//
			// TODO(sadams): This can be refactored away by returning an
			// error here instead of panicing, removing the `ExpectConfigError`
			// field from `TestAgent`, and having the test that uses this
			// (TestAgent_ConnectClusterIDConfig) check for an error instead of
			// catching a panic.
			panic(err)
		}

		return fmt.Errorf("%s %s Error starting agent: %s", id, a.Name, err)
	}

	a.Agent = agent

	// Start the anti-entropy syncer
	a.Agent.StartSync()

	if err := a.waitForUp(); err != nil {
		a.Shutdown()
		// Drop the failed agent so that a retry is not rejected with
		// "TestAgent already started".
		a.Agent = nil
		cleanupTmpDir()
		return err
	}

	// There may be no DNS server; DNSAddr handles a nil a.dns.
	if len(a.dnsServers) > 0 {
		a.dns = a.dnsServers[0]
	}
	// waitForUp only succeeds once at least one HTTP server exists.
	a.srv = a.httpServers[0]
	return nil
}

// waitForUp waits for leader election, or waits for the agent HTTP
// endpoint to start responding, depending on the agent config. It polls
// until the retry.TwoSeconds timeout expires and then returns an error
// that includes the last failure seen.
func (a *TestAgent) waitForUp() error {
	timer := retry.TwoSeconds()
	deadline := time.Now().Add(timer.Timeout)

	var retErr error
	var out structs.IndexedNodes
	for ; !time.Now().After(deadline); time.Sleep(timer.Wait) {
		if len(a.httpServers) == 0 {
			retErr = fmt.Errorf("%s: waiting for server", a.Name)
			continue // fail, try again
		}

		if a.Config.Bootstrap && a.Config.ServerMode {
			// Ensure we have a leader and a node registration.
			args := &structs.DCSpecificRequest{
				Datacenter: a.Config.Datacenter,
				QueryOptions: structs.QueryOptions{
					MinQueryIndex: out.Index,
					MaxQueryTime:  25 * time.Millisecond,
				},
			}
			if err := a.RPC("Catalog.ListNodes", args, &out); err != nil {
				retErr = fmt.Errorf("Catalog.ListNodes failed: %v", err)
				continue // fail, try again
			}
			if !out.QueryMeta.KnownLeader {
				retErr = fmt.Errorf("%s: No leader", a.Name)
				continue // fail, try again
			}
			if out.Index == 0 {
				retErr = fmt.Errorf("%s: Consul index is 0", a.Name)
				continue // fail, try again
			}
			return nil // success
		}

		// Not a bootstrapping server: probe the agent's own HTTP handler.
		req := httptest.NewRequest("GET", "/v1/agent/self", nil)
		resp := httptest.NewRecorder()
		_, err := a.httpServers[0].AgentSelf(resp, req)
		if err != nil || resp.Code != 200 {
			retErr = fmt.Errorf("%s: failed OK response: %v", a.Name, err)
			continue
		}
		return nil // success
	}

	return fmt.Errorf("unavailable. last error: %v", retErr)
}

// Shutdown stops the agent and then its endpoints. It does not remove the
// data directory (see the note below) and is a no-op when the agent was
// never started.
func (a *TestAgent) Shutdown() error {
	/* Removed this because it was breaking persistence tests where we would
	persist a service and load it through a new agent with the same data-dir.
	Not sure if we still need this for other things, everywhere we manually make
	a data dir we already do 'defer os.RemoveAll()'
	defer func() {
		if a.DataDir != "" {
			os.RemoveAll(a.DataDir)
		}
	}()*/

	if a.Agent == nil {
		return nil
	}

	// shutdown agent before endpoints
	defer a.Agent.ShutdownEndpoints()
	return a.Agent.ShutdownAgent()
}

// DNSAddr returns the address of the first started DNS endpoint, or ""
// when there is none.
func (a *TestAgent) DNSAddr() string {
	if a.dns == nil {
		return ""
	}
	return a.dns.Addr
}

// HTTPAddr returns the address of the first started HTTP endpoint, or ""
// when there is none.
func (a *TestAgent) HTTPAddr() string {
	if a.srv == nil {
		return ""
	}
	return a.srv.Addr
}

// SegmentAddr returns the LAN segment address for name when the agent's
// delegate is a *consul.Server. It returns "" otherwise, including when
// the agent has not been started.
func (a *TestAgent) SegmentAddr(name string) string {
	if a.Agent == nil {
		return ""
	}
	if server, ok := a.Agent.delegate.(*consul.Server); ok {
		return server.LANSegmentAddr(name)
	}
	return ""
}

// Client returns an API client whose address is HTTPAddr(). It panics if
// the client cannot be created.
func (a *TestAgent) Client() *api.Client {
	conf := api.DefaultConfig()
	conf.Address = a.HTTPAddr()
	c, err := api.NewClient(conf)
	if err != nil {
		panic(fmt.Sprintf("Error creating consul API client: %s", err))
	}
	return c
}

// DNSDisableCompression sets DisableCompression to b on the currently
// loaded config of every started DNS server.
func (a *TestAgent) DNSDisableCompression(b bool) {
	for _, srv := range a.dnsServers {
		cfg := srv.config.Load().(*dnsConfig)
		cfg.DisableCompression = b
	}
}

// consulConfig returns the agent's consul.Config and panics if it cannot
// be built.
func (a *TestAgent) consulConfig() *consul.Config {
	c, err := a.Agent.consulConfig()
	if err != nil {
		panic(err)
	}
	return c
}

// randomPortsSource selects random ports from fixed size random blocks of
// ports. This does not eliminate the chance for port conflict but
// reduces it significantly with little overhead. Furthermore, asking
// the kernel for a random port by binding to port 0 prolongs the test
// execution (in our case +20sec) while also not fully eliminating the
// chance of port conflicts for concurrently executed test binaries.
// Instead of relying on one set of ports to be sufficient we retry
// starting the agent with different ports on port conflict.
func randomPortsSource(tls bool) config.Source { ports := freeport.Get(6) if tls { ports[1] = -1 } else { ports[2] = -1 } return config.Source{ Name: "ports", Format: "hcl", Data: ` ports = { dns = ` + strconv.Itoa(ports[0]) + ` http = ` + strconv.Itoa(ports[1]) + ` https = ` + strconv.Itoa(ports[2]) + ` serf_lan = ` + strconv.Itoa(ports[3]) + ` serf_wan = ` + strconv.Itoa(ports[4]) + ` server = ` + strconv.Itoa(ports[5]) + ` } `, } } func NodeID() string { id, err := uuid.GenerateUUID() if err != nil { panic(err) } return id } // TestConfig returns a unique default configuration for testing an // agent. func TestConfig(sources ...config.Source) *config.RuntimeConfig { nodeID := NodeID() testsrc := config.Source{ Name: "test", Format: "hcl", Data: ` bind_addr = "127.0.0.1" advertise_addr = "127.0.0.1" datacenter = "dc1" bootstrap = true server = true node_id = "` + nodeID + `" node_name = "Node-` + nodeID + `" connect { enabled = true ca_config { cluster_id = "` + connect.TestClusterID + `" } } performance { raft_multiplier = 1 } `, } b, err := config.NewBuilder(config.Flags{}) if err != nil { panic("NewBuilder failed: " + err.Error()) } b.Head = append(b.Head, testsrc) b.Tail = append(b.Tail, config.DefaultConsulSource(), config.DevConsulSource()) b.Tail = append(b.Tail, sources...) cfg, err := b.BuildAndValidate() if err != nil { panic("Error building config: " + err.Error()) } for _, w := range b.Warnings { fmt.Println("WARNING:", w) } // Effectively disables the delay after root rotation before requesting CSRs // to make test deterministic. 0 results in default jitter being applied but a // tiny delay is effectively thre same. cfg.ConnectTestCALeafRootChangeSpread = 1 * time.Nanosecond return &cfg } // TestACLConfig returns a default configuration for testing an agent // with ACLs. 
// This variant spells the settings with the flat, top-level acl_* keys.
func TestACLConfig() string {
	const hcl = ` acl_datacenter = "dc1"` +
		` acl_default_policy = "deny"` +
		` acl_master_token = "root"` +
		` acl_agent_token = "root"` +
		` acl_agent_master_token = "towel"` +
		` acl_enforce_version_8 = true `
	return hcl
}

// TestACLConfigNew returns an HCL fragment for testing an agent with ACLs
// that sets primary_datacenter and a nested acl block (enabled,
// default_policy and tokens).
func TestACLConfigNew() string {
	const hcl = ` primary_datacenter = "dc1"` +
		` acl {` +
		` enabled = true` +
		` default_policy = "deny"` +
		` tokens {` +
		` master = "root"` +
		` agent = "root"` +
		` agent_master = "towel"` +
		` }` +
		` } `
	return hcl
}