Backport of "Tuning job versions retention" (#17635) into release/1.6.x (#18169)

This pull request was automerged via backport-assistant
This commit is contained in:
hc-github-team-nomad-core 2023-08-07 13:48:09 -05:00 committed by GitHub
parent ebcdd4d82d
commit f812bccb4e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 67 additions and 31 deletions

3
.changelog/17939.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
config: Added an option to configure how many historic versions of jobs are retained in the state store
```

View File

@ -343,6 +343,13 @@ func convertServerConfig(agentConfig *Config) (*nomad.Config, error) {
conf.JobMaxPriority = jobMaxPriority conf.JobMaxPriority = jobMaxPriority
conf.JobDefaultPriority = jobDefaultPriority conf.JobDefaultPriority = jobDefaultPriority
if agentConfig.Server.JobTrackedVersions != nil {
if *agentConfig.Server.JobTrackedVersions <= 0 {
return nil, fmt.Errorf("job_tracked_versions must be greater than 0")
}
conf.JobTrackedVersions = *agentConfig.Server.JobTrackedVersions
}
// Set up the bind addresses // Set up the bind addresses
rpcAddr, err := net.ResolveTCPAddr("tcp", agentConfig.normalizedAddrs.RPC) rpcAddr, err := net.ResolveTCPAddr("tcp", agentConfig.normalizedAddrs.RPC)
if err != nil { if err != nil {

View File

@ -674,6 +674,9 @@ type ServerConfig struct {
// before being discarded automatically. If unset, the maximum size defaults // before being discarded automatically. If unset, the maximum size defaults
// to 1 MB. If the value is zero, no job sources will be stored. // to 1 MB. If the value is zero, no job sources will be stored.
JobMaxSourceSize *string `hcl:"job_max_source_size"` JobMaxSourceSize *string `hcl:"job_max_source_size"`
// JobTrackedVersions is the number of historic job versions that are kept.
JobTrackedVersions *int `hcl:"job_tracked_versions"`
} }
func (s *ServerConfig) Copy() *ServerConfig { func (s *ServerConfig) Copy() *ServerConfig {
@ -702,6 +705,7 @@ func (s *ServerConfig) Copy() *ServerConfig {
ns.RaftTrailingLogs = pointer.Copy(s.RaftTrailingLogs) ns.RaftTrailingLogs = pointer.Copy(s.RaftTrailingLogs)
ns.JobDefaultPriority = pointer.Copy(s.JobDefaultPriority) ns.JobDefaultPriority = pointer.Copy(s.JobDefaultPriority)
ns.JobMaxPriority = pointer.Copy(s.JobMaxPriority) ns.JobMaxPriority = pointer.Copy(s.JobMaxPriority)
ns.JobTrackedVersions = pointer.Copy(s.JobTrackedVersions)
return &ns return &ns
} }
@ -1329,7 +1333,8 @@ func DefaultConfig() *Config {
LimitResults: 100, LimitResults: 100,
MinTermLength: 2, MinTermLength: 2,
}, },
JobMaxSourceSize: pointer.Of("1M"), JobMaxSourceSize: pointer.Of("1M"),
JobTrackedVersions: pointer.Of(structs.JobDefaultTrackedVersions),
}, },
ACL: &ACLConfig{ ACL: &ACLConfig{
Enabled: false, Enabled: false,
@ -2033,6 +2038,10 @@ func (s *ServerConfig) Merge(b *ServerConfig) *ServerConfig {
} }
} }
if b.JobTrackedVersions != nil {
result.JobTrackedVersions = b.JobTrackedVersions
}
// Add the schedulers // Add the schedulers
result.EnabledSchedulers = append(result.EnabledSchedulers, b.EnabledSchedulers...) result.EnabledSchedulers = append(result.EnabledSchedulers, b.EnabledSchedulers...)

View File

@ -416,6 +416,9 @@ type Config struct {
// JobMaxPriority is an upper bound on the Job priority. // JobMaxPriority is an upper bound on the Job priority.
JobMaxPriority int JobMaxPriority int
// JobTrackedVersions is the number of historic Job versions that are kept.
JobTrackedVersions int
} }
func (c *Config) Copy() *Config { func (c *Config) Copy() *Config {
@ -535,6 +538,7 @@ func DefaultConfig() *Config {
DeploymentQueryRateLimit: deploymentwatcher.LimitStateQueriesPerSecond, DeploymentQueryRateLimit: deploymentwatcher.LimitStateQueriesPerSecond,
JobDefaultPriority: structs.JobDefaultPriority, JobDefaultPriority: structs.JobDefaultPriority,
JobMaxPriority: structs.JobDefaultMaxPriority, JobMaxPriority: structs.JobDefaultMaxPriority,
JobTrackedVersions: structs.JobDefaultTrackedVersions,
} }
// Enable all known schedulers by default // Enable all known schedulers by default

View File

@ -148,16 +148,20 @@ type FSMConfig struct {
// EventBufferSize is the amount of messages to hold in memory // EventBufferSize is the amount of messages to hold in memory
EventBufferSize int64 EventBufferSize int64
// JobTrackedVersions is the number of historic job versions that are kept.
JobTrackedVersions int
} }
// NewFSM is used to construct a new FSM with a blank state. // NewFSM is used to construct a new FSM with a blank state.
func NewFSM(config *FSMConfig) (*nomadFSM, error) { func NewFSM(config *FSMConfig) (*nomadFSM, error) {
// Create a state store // Create a state store
sconfig := &state.StateStoreConfig{ sconfig := &state.StateStoreConfig{
Logger: config.Logger, Logger: config.Logger,
Region: config.Region, Region: config.Region,
EnablePublisher: config.EnableEventBroker, EnablePublisher: config.EnableEventBroker,
EventBufferSize: config.EventBufferSize, EventBufferSize: config.EventBufferSize,
JobTrackedVersions: config.JobTrackedVersions,
} }
state, err := state.NewStateStore(sconfig) state, err := state.NewStateStore(sconfig)
if err != nil { if err != nil {

View File

@ -59,13 +59,14 @@ func testFSM(t *testing.T) *nomadFSM {
dispatcher, _ := testPeriodicDispatcher(t) dispatcher, _ := testPeriodicDispatcher(t)
logger := testlog.HCLogger(t) logger := testlog.HCLogger(t)
fsmConfig := &FSMConfig{ fsmConfig := &FSMConfig{
EvalBroker: broker, EvalBroker: broker,
Periodic: dispatcher, Periodic: dispatcher,
Blocked: NewBlockedEvals(broker, logger), Blocked: NewBlockedEvals(broker, logger),
Logger: logger, Logger: logger,
Region: "global", Region: "global",
EnableEventBroker: true, EnableEventBroker: true,
EventBufferSize: 100, EventBufferSize: 100,
JobTrackedVersions: structs.JobDefaultTrackedVersions,
} }
fsm, err := NewFSM(fsmConfig) fsm, err := NewFSM(fsmConfig)
if err != nil { if err != nil {

View File

@ -1295,13 +1295,14 @@ func (s *Server) setupRaft() error {
// Create the FSM // Create the FSM
fsmConfig := &FSMConfig{ fsmConfig := &FSMConfig{
EvalBroker: s.evalBroker, EvalBroker: s.evalBroker,
Periodic: s.periodicDispatcher, Periodic: s.periodicDispatcher,
Blocked: s.blockedEvals, Blocked: s.blockedEvals,
Logger: s.logger, Logger: s.logger,
Region: s.Region(), Region: s.Region(),
EnableEventBroker: s.config.EnableEventBroker, EnableEventBroker: s.config.EnableEventBroker,
EventBufferSize: s.config.EventBufferSize, EventBufferSize: s.config.EventBufferSize,
JobTrackedVersions: s.config.JobTrackedVersions,
} }
var err error var err error
s.fsm, err = NewFSM(fsmConfig) s.fsm, err = NewFSM(fsmConfig)

View File

@ -99,6 +99,9 @@ type StateStoreConfig struct {
// EventBufferSize configures the amount of events to hold in memory // EventBufferSize configures the amount of events to hold in memory
EventBufferSize int64 EventBufferSize int64
// JobTrackedVersions is the number of historic job versions that are kept.
JobTrackedVersions int
} }
// The StateStore is responsible for maintaining all the Nomad // The StateStore is responsible for maintaining all the Nomad
@ -1956,7 +1959,7 @@ func (s *StateStore) deleteJobScalingPolicies(index uint64, job *structs.Job, tx
func (s *StateStore) deleteJobSubmission(job *structs.Job, txn *txn) error { func (s *StateStore) deleteJobSubmission(job *structs.Job, txn *txn) error {
// find submissions associated with job // find submissions associated with job
remove := *set.NewHashSet[*structs.JobSubmission, string](structs.JobTrackedVersions) remove := *set.NewHashSet[*structs.JobSubmission, string](s.config.JobTrackedVersions)
iter, err := txn.Get("job_submission", "id_prefix", job.Namespace, job.ID) iter, err := txn.Get("job_submission", "id_prefix", job.Namespace, job.ID)
if err != nil { if err != nil {
@ -2045,7 +2048,7 @@ func (s *StateStore) upsertJobVersion(index uint64, job *structs.Job, txn *txn)
} }
// If we are below the limit there is no GCing to be done // If we are below the limit there is no GCing to be done
if len(all) <= structs.JobTrackedVersions { if len(all) <= s.config.JobTrackedVersions {
return nil return nil
} }
@ -2061,7 +2064,7 @@ func (s *StateStore) upsertJobVersion(index uint64, job *structs.Job, txn *txn)
// If the stable job is the oldest version, do a swap to bring it into the // If the stable job is the oldest version, do a swap to bring it into the
// keep set. // keep set.
max := structs.JobTrackedVersions max := s.config.JobTrackedVersions
if stableIdx == max { if stableIdx == max {
all[max-1], all[max] = all[max], all[max-1] all[max-1], all[max] = all[max], all[max-1]
} }
@ -5498,7 +5501,7 @@ func (s *StateStore) pruneJobSubmissions(namespace, jobID string, txn *txn) erro
// although the number of tracked submissions is the same as the number of // although the number of tracked submissions is the same as the number of
// tracked job versions, do not assume a 1:1 correlation, as there could be // tracked job versions, do not assume a 1:1 correlation, as there could be
// holes in the submissions (or none at all) // holes in the submissions (or none at all)
limit := structs.JobTrackedVersions limit := s.config.JobTrackedVersions
// iterate through all stored submissions // iterate through all stored submissions
iter, err := txn.Get("job_submission", "id_prefix", namespace, jobID) iter, err := txn.Get("job_submission", "id_prefix", namespace, jobID)

View File

@ -2523,7 +2523,7 @@ func TestStateStore_UpsertJob_submission(t *testing.T) {
must.Eq(t, index, sub.JobModifyIndex) must.Eq(t, index, sub.JobModifyIndex)
// insert 6 more, going over the limit // insert 6 more, going over the limit
for i := 1; i <= structs.JobTrackedVersions; i++ { for i := 1; i <= structs.JobDefaultTrackedVersions; i++ {
index++ index++
job2 := job.Copy() job2 := job.Copy()
job2.Meta["version"] = strconv.Itoa(i) job2.Meta["version"] = strconv.Itoa(i)
@ -2624,8 +2624,8 @@ func TestStateStore_UpdateUpsertJob_JobVersion(t *testing.T) {
if err != nil { if err != nil {
t.Fatalf("err: %v", err) t.Fatalf("err: %v", err)
} }
if len(allVersions) != structs.JobTrackedVersions { if len(allVersions) != structs.JobDefaultTrackedVersions {
t.Fatalf("got %d; want %d", len(allVersions), structs.JobTrackedVersions) t.Fatalf("got %d; want %d", len(allVersions), structs.JobDefaultTrackedVersions)
} }
if a := allVersions[0]; a.ID != job.ID || a.Version != 299 || a.Name != "299" { if a := allVersions[0]; a.ID != job.ID || a.Version != 299 || a.Name != "299" {
@ -2636,7 +2636,7 @@ func TestStateStore_UpdateUpsertJob_JobVersion(t *testing.T) {
} }
// Ensure we didn't delete the stable job // Ensure we didn't delete the stable job
if a := allVersions[structs.JobTrackedVersions-1]; a.ID != job.ID || if a := allVersions[structs.JobDefaultTrackedVersions-1]; a.ID != job.ID ||
a.Version != 0 || a.Name != "0" || !a.Stable { a.Version != 0 || a.Name != "0" || !a.Stable {
t.Fatalf("bad: %+v", a) t.Fatalf("bad: %+v", a)
} }

View File

@ -16,8 +16,9 @@ import (
func TestStateStore(t testing.TB) *StateStore { func TestStateStore(t testing.TB) *StateStore {
config := &StateStoreConfig{ config := &StateStoreConfig{
Logger: testlog.HCLogger(t), Logger: testlog.HCLogger(t),
Region: "global", Region: "global",
JobTrackedVersions: structs.JobDefaultTrackedVersions,
} }
state, err := NewStateStore(config) state, err := NewStateStore(config)
if err != nil { if err != nil {

View File

@ -4304,9 +4304,9 @@ const (
// for the system to remain healthy. // for the system to remain healthy.
CoreJobPriority = math.MaxInt16 CoreJobPriority = math.MaxInt16
// JobTrackedVersions is the number of historic job versions that are // JobDefaultTrackedVersions is the number of historic job versions that are
// kept. // kept.
JobTrackedVersions = 6 JobDefaultTrackedVersions = 6
// JobTrackedScalingEvents is the number of scaling events that are // JobTrackedScalingEvents is the number of scaling events that are
// kept for a single task group. // kept for a single task group.

View File

@ -264,6 +264,9 @@ server {
size of a job. If the limit is exceeded, the original source is simply discarded size of a job. If the limit is exceeded, the original source is simply discarded
and no error is returned from the job API. and no error is returned from the job API.
- `job_tracked_versions` `(int: 6)` - Specifies the number of historic job versions that
are kept.
### Deprecated Parameters ### Deprecated Parameters
- `retry_join` `(array<string>: [])` - Specifies a list of server addresses to - `retry_join` `(array<string>: [])` - Specifies a list of server addresses to