sdk: mitigate api test timeout
Occasionally we are seeing the go-test-api job time out at 10 minutes. Looking at the stack trace, I saw the following:

1. Lots of tests blocked on server.Stop in NewTestServerConfigT. This suggests that SIGINT is being sent to the server, but the server is not properly shutting down.
2. Over 20k goroutines that look like this:

goroutine 16355 [select, 8 minutes]:
net/http.(*persistConn).readLoop(0xc004270240)
    /usr/local/go/src/net/http/transport.go:2099 +0x99e
created by net/http.(*Transport).dialConn
    /usr/local/go/src/net/http/transport.go:1647 +0xc56

Issue 1 seems to be the main problem, but debugging that directly is not possible because our buffered logs do not get sent when the tests time out. To mitigate this problem, I've added a timeout to the cmd.Wait() call to force-kill the process and return an error. Unfortunately, because we retry this operation, we still may not see the cause, because the next attempt will likely pass. I'm tempted to remove the retry around NewTestServerConfigT.

Issue 2 seems to be caused by not closing the response body. Since the request is performed many times in a loop, many goroutines are created, and they do not exit until the response body is closed.
This commit is contained in:
parent
74ef405728
commit
71e51263be
|
@ -328,9 +328,22 @@ func (s *TestServer) Stop() error {
|
|||
}
|
||||
}
|
||||
|
||||
waitDone := make(chan error)
|
||||
go func() {
|
||||
waitDone <- s.cmd.Wait()
|
||||
close(waitDone)
|
||||
}()
|
||||
|
||||
// wait for the process to exit to be sure that the data dir can be
|
||||
// deleted on all platforms.
|
||||
return s.cmd.Wait()
|
||||
select {
|
||||
case err := <-waitDone:
|
||||
return err
|
||||
case <-time.After(10 * time.Second):
|
||||
s.cmd.Process.Kill()
|
||||
s.cmd.Wait()
|
||||
return fmt.Errorf("timeout waiting for server to stop gracefully")
|
||||
}
|
||||
}
|
||||
|
||||
// waitForAPI waits for the /status/leader HTTP endpoint to start
|
||||
|
@ -351,11 +364,12 @@ func (s *TestServer) waitForAPI() error {
|
|||
time.Sleep(timer.Wait)
|
||||
|
||||
url := s.url("/v1/status/leader")
|
||||
_, err := s.masterGet(url)
|
||||
resp, err := s.masterGet(url)
|
||||
if err != nil {
|
||||
failed = true
|
||||
continue
|
||||
}
|
||||
resp.Body.Close()
|
||||
|
||||
failed = false
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue