2015-10-12 07:42:09 +00:00
|
|
|
package state
|
2014-12-11 01:17:29 +00:00
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
)
|
|
|
|
|
2017-04-27 23:41:28 +00:00
|
|
|
// TombstoneGC is used to track creation of tombstones so that they can be
|
|
|
|
// garbage collected after their TTL expires. The tombstones allow queries to
|
|
|
|
// provide monotonic index values within the TTL window. The GC is used to
|
|
|
|
// prevent monotonic growth in storage usage. This is a trade off between the
|
|
|
|
// length of the TTL and the storage overhead.
|
2014-12-11 01:17:29 +00:00
|
|
|
//
|
2017-04-27 23:41:28 +00:00
|
|
|
// In practice, this is required to fix the issue of delete visibility. When
|
|
|
|
// data is deleted from the KV store, the "latest" row can go backwards if the
|
|
|
|
// newest row is removed. The tombstones provide a way to ensure time doesn't
|
|
|
|
// move backwards within some interval.
|
2014-12-11 01:17:29 +00:00
|
|
|
type TombstoneGC struct {
|
2017-04-27 23:41:28 +00:00
|
|
|
// ttl sets the TTL for tombstones.
|
|
|
|
ttl time.Duration
|
|
|
|
|
|
|
|
// granularity determines how we bin TTLs into timers.
|
2014-12-11 01:17:29 +00:00
|
|
|
granularity time.Duration
|
|
|
|
|
2015-01-05 22:58:59 +00:00
|
|
|
// enabled controls if we actually setup any timers.
|
|
|
|
enabled bool
|
|
|
|
|
2017-04-27 23:41:28 +00:00
|
|
|
// expires maps the time of expiration to the highest tombstone value
|
|
|
|
// that should be expired.
|
2015-01-05 22:58:59 +00:00
|
|
|
expires map[time.Time]*expireInterval
|
2014-12-11 01:17:29 +00:00
|
|
|
|
2017-04-27 23:41:28 +00:00
|
|
|
// expireCh is used to stream expiration to the leader for processing.
|
2014-12-11 01:17:29 +00:00
|
|
|
expireCh chan uint64
|
2015-01-05 22:58:59 +00:00
|
|
|
|
2017-04-28 00:04:49 +00:00
|
|
|
sync.Mutex
|
2014-12-11 01:17:29 +00:00
|
|
|
}
|
|
|
|
|
2017-04-27 23:41:28 +00:00
|
|
|
// expireInterval is used to track the maximum index to expire in a given
// interval with a timer.
type expireInterval struct {
	// maxIndex has the highest tombstone index that should be GC-d.
	maxIndex uint64

	// timer is the timer tracking this bin.
	timer *time.Timer
}
|
|
|
|
|
2017-04-27 23:41:28 +00:00
|
|
|
// NewTombstoneGC is used to construct a new TombstoneGC given a TTL for
|
|
|
|
// tombstones and a tracking granularity. Longer TTLs ensure correct behavior
|
|
|
|
// for more time, but use more storage. A shorter granularity increases the
|
|
|
|
// number of Raft transactions and reduce how far past the TTL we perform GC.
|
2014-12-11 01:17:29 +00:00
|
|
|
func NewTombstoneGC(ttl, granularity time.Duration) (*TombstoneGC, error) {
|
|
|
|
// Sanity check the inputs
|
|
|
|
if ttl <= 0 || granularity <= 0 {
|
|
|
|
return nil, fmt.Errorf("Tombstone TTL and granularity must be positive")
|
|
|
|
}
|
|
|
|
|
|
|
|
t := &TombstoneGC{
|
|
|
|
ttl: ttl,
|
|
|
|
granularity: granularity,
|
2014-12-11 06:33:26 +00:00
|
|
|
expires: make(map[time.Time]*expireInterval),
|
2014-12-11 01:17:29 +00:00
|
|
|
expireCh: make(chan uint64, 1),
|
|
|
|
}
|
|
|
|
return t, nil
|
|
|
|
}
|
|
|
|
|
2017-04-27 23:41:28 +00:00
|
|
|
// ExpireCh is used to return a channel that streams the next index that should
|
|
|
|
// be expired.
|
2014-12-11 01:17:29 +00:00
|
|
|
func (t *TombstoneGC) ExpireCh() <-chan uint64 {
|
|
|
|
return t.expireCh
|
|
|
|
}
|
|
|
|
|
2015-01-05 22:58:59 +00:00
|
|
|
// SetEnabled is used to control if the tombstone GC is
|
|
|
|
// enabled. Should only be enabled by the leader node.
|
|
|
|
func (t *TombstoneGC) SetEnabled(enabled bool) {
|
2017-04-28 00:04:49 +00:00
|
|
|
t.Lock()
|
|
|
|
defer t.Unlock()
|
2015-01-05 22:58:59 +00:00
|
|
|
if enabled == t.enabled {
|
|
|
|
return
|
2014-12-11 06:33:26 +00:00
|
|
|
}
|
2015-01-05 22:58:59 +00:00
|
|
|
|
|
|
|
// Stop all the timers and clear
|
|
|
|
if !enabled {
|
|
|
|
for _, exp := range t.expires {
|
|
|
|
exp.timer.Stop()
|
|
|
|
}
|
|
|
|
t.expires = make(map[time.Time]*expireInterval)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Update the status
|
|
|
|
t.enabled = enabled
|
2014-12-11 06:33:26 +00:00
|
|
|
}
|
|
|
|
|
2014-12-11 01:17:29 +00:00
|
|
|
// Hint is used to indicate that keys at the given index have been
|
|
|
|
// deleted, and that their GC should be scheduled.
|
|
|
|
func (t *TombstoneGC) Hint(index uint64) {
|
|
|
|
expires := t.nextExpires()
|
|
|
|
|
2017-04-28 00:04:49 +00:00
|
|
|
t.Lock()
|
|
|
|
defer t.Unlock()
|
2015-01-05 22:58:59 +00:00
|
|
|
if !t.enabled {
|
|
|
|
return
|
|
|
|
}
|
2014-12-11 01:17:29 +00:00
|
|
|
|
2017-04-27 23:41:28 +00:00
|
|
|
// Check for an existing expiration timer and bump its index if we
|
|
|
|
// find one.
|
2014-12-11 06:33:26 +00:00
|
|
|
exp, ok := t.expires[expires]
|
2014-12-11 01:17:29 +00:00
|
|
|
if ok {
|
2014-12-11 06:33:26 +00:00
|
|
|
if index > exp.maxIndex {
|
|
|
|
exp.maxIndex = index
|
2014-12-11 01:17:29 +00:00
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2017-04-27 23:41:28 +00:00
|
|
|
// Create a new expiration timer.
|
2014-12-11 06:33:26 +00:00
|
|
|
t.expires[expires] = &expireInterval{
|
|
|
|
maxIndex: index,
|
|
|
|
timer: time.AfterFunc(expires.Sub(time.Now()), func() {
|
|
|
|
t.expireTime(expires)
|
|
|
|
}),
|
|
|
|
}
|
2014-12-11 01:17:29 +00:00
|
|
|
}
|
|
|
|
|
2017-04-27 23:41:28 +00:00
|
|
|
// PendingExpiration is used to check if any expirations are pending.
|
2014-12-15 22:22:32 +00:00
|
|
|
func (t *TombstoneGC) PendingExpiration() bool {
|
2017-04-28 00:04:49 +00:00
|
|
|
t.Lock()
|
|
|
|
defer t.Unlock()
|
2017-04-27 23:41:28 +00:00
|
|
|
|
2014-12-15 22:22:32 +00:00
|
|
|
return len(t.expires) > 0
|
|
|
|
}
|
|
|
|
|
2017-04-27 23:41:28 +00:00
|
|
|
// nextExpires is used to calculate the next expiration time, based on the
|
|
|
|
// granularity that is set. This allows us to bin expirations and avoid a ton
|
|
|
|
// of timers.
|
2014-12-11 01:17:29 +00:00
|
|
|
func (t *TombstoneGC) nextExpires() time.Time {
|
2017-11-29 18:34:24 +00:00
|
|
|
// The Round(0) call here is to shed the monotonic time so that we
|
|
|
|
// can safely use these as map keys. See #3670 for more details.
|
|
|
|
expires := time.Now().Add(t.ttl).Round(0)
|
2014-12-11 01:17:29 +00:00
|
|
|
remain := expires.UnixNano() % int64(t.granularity)
|
|
|
|
adj := expires.Add(t.granularity - time.Duration(remain))
|
|
|
|
return adj
|
|
|
|
}
|
|
|
|
|
2017-11-20 00:13:40 +00:00
|
|
|
// purgeBin gets the index for the given bin and then deletes the bin. If there
|
|
|
|
// is no bin then this will return 0 for the index, which is ok.
|
|
|
|
func (t *TombstoneGC) purgeBin(expires time.Time) uint64 {
|
2017-04-28 00:04:49 +00:00
|
|
|
t.Lock()
|
|
|
|
defer t.Unlock()
|
2014-12-11 01:17:29 +00:00
|
|
|
|
2017-04-27 23:43:07 +00:00
|
|
|
// Get the maximum index and clear the entry. It's possible that the GC
|
|
|
|
// has been shut down while this timer fired and got blocked on the lock,
|
|
|
|
// so if there's nothing in the map for us we just exit out since there
|
|
|
|
// is no work to do.
|
|
|
|
exp, ok := t.expires[expires]
|
|
|
|
if !ok {
|
2017-11-20 00:13:40 +00:00
|
|
|
return 0
|
2017-04-27 23:43:07 +00:00
|
|
|
}
|
|
|
|
delete(t.expires, expires)
|
2017-11-20 00:13:40 +00:00
|
|
|
return exp.maxIndex
|
|
|
|
}
|
|
|
|
|
|
|
|
// expireTime is used to expire the entries at the given time.
|
|
|
|
func (t *TombstoneGC) expireTime(expires time.Time) {
|
|
|
|
// This is careful to take the lock only while we are fetching the index
|
|
|
|
// since the channel write might get blocked for reasons that could also
|
|
|
|
// need to hint GC (see #3700).
|
|
|
|
if index := t.purgeBin(expires); index > 0 {
|
|
|
|
t.expireCh <- index
|
|
|
|
}
|
2014-12-11 01:17:29 +00:00
|
|
|
}
|