Port: Use Stored Hll to Compute New Clients For Current Month (#16184)

* port hll storage changes

* changelog
This commit is contained in:
Hridoy Roy 2022-06-29 10:51:23 -07:00 committed by GitHub
parent 7288bb3c57
commit 2f14d60a4b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 283 additions and 62 deletions

3
changelog/16184.txt Normal file
View File

@ -0,0 +1,3 @@
```release-note:improvement
core/activity: use monthly hyperloglogs to calculate new clients approximation for current month
```

View File

@ -367,53 +367,6 @@ func (a *ActivityLog) saveCurrentSegmentToStorageLocked(ctx context.Context, for
return nil return nil
} }
// CreateOrFetchHyperlogLog creates a new hyperlogLog for each startTime (month) if it does not exist in storage.
// hyperlogLog is used here to solve count-distinct problem i.e, to count the number of distinct clients
// In activity log, hyperloglog is a sketch containing clientID's in a given month
func (a *ActivityLog) CreateOrFetchHyperlogLog(ctx context.Context, startTime time.Time) *hyperloglog.Sketch {
monthlyHLLPath := fmt.Sprintf("%s%d", distinctClientsBasePath, startTime.Unix())
hll := hyperloglog.New()
a.logger.Trace("fetching hyperloglog ", "path", monthlyHLLPath)
data, err := a.view.Get(ctx, monthlyHLLPath)
if err != nil {
a.logger.Error("error fetching hyperloglog", "path", monthlyHLLPath, "error", err)
return hll
}
if data == nil {
a.logger.Trace("creating hyperloglog ", "path", monthlyHLLPath)
err = a.StoreHyperlogLog(ctx, startTime, hll)
if err != nil {
a.logger.Error("error storing hyperloglog", "path", monthlyHLLPath, "error", err)
return hll
}
} else {
err = hll.UnmarshalBinary(data.Value)
if err != nil {
a.logger.Error("error unmarshaling hyperloglog", "path", monthlyHLLPath, "error", err)
return hll
}
}
return hll
}
// StoreHyperlogLog stores the hyperloglog (a sketch containing client IDs) for startTime (month) in storage
func (a *ActivityLog) StoreHyperlogLog(ctx context.Context, startTime time.Time, newHll *hyperloglog.Sketch) error {
monthlyHLLPath := fmt.Sprintf("%s%d", distinctClientsBasePath, startTime.Unix())
a.logger.Trace("storing hyperloglog ", "path", monthlyHLLPath)
marshalledHll, err := newHll.MarshalBinary()
if err != nil {
return err
}
err = a.view.Put(ctx, &logical.StorageEntry{
Key: monthlyHLLPath,
Value: marshalledHll,
})
if err != nil {
return err
}
return nil
}
// :force: forces a save of tokens/entities even if the in-memory log is empty // :force: forces a save of tokens/entities even if the in-memory log is empty
func (a *ActivityLog) saveCurrentSegmentInternal(ctx context.Context, force bool) error { func (a *ActivityLog) saveCurrentSegmentInternal(ctx context.Context, force bool) error {
entityPath := fmt.Sprintf("%s%d/%d", activityEntityBasePath, a.currentSegment.startTimestamp, a.currentSegment.clientSequenceNumber) entityPath := fmt.Sprintf("%s%d/%d", activityEntityBasePath, a.currentSegment.startTimestamp, a.currentSegment.clientSequenceNumber)
@ -1613,7 +1566,7 @@ func (a *ActivityLog) handleQuery(ctx context.Context, startTime, endTime time.T
distinctEntitiesResponse := totalEntities distinctEntitiesResponse := totalEntities
if computePartial { if computePartial {
currentMonth, err := a.computeCurrentMonthForBillingPeriod(partialByMonth, startTime, endTime) currentMonth, err := a.computeCurrentMonthForBillingPeriod(ctx, partialByMonth, startTime, endTime)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -2067,7 +2020,12 @@ func (a *ActivityLog) precomputedQueryWorker(ctx context.Context) error {
break break
} }
hyperloglog := a.CreateOrFetchHyperlogLog(ctx, startTime) hyperloglog, err := a.CreateOrFetchHyperlogLog(ctx, startTime)
if err != nil {
// We were unable to create or fetch the hll, but we should still
// continue with our precomputation
a.logger.Warn("unable to create or fetch hyperloglog", "start time", startTime, "error", err)
}
err = a.WalkEntitySegments(ctx, startTime, hyperloglog, walkEntities) err = a.WalkEntitySegments(ctx, startTime, hyperloglog, walkEntities)
if err != nil { if err != nil {
a.logger.Warn("failed to load previous segments", "error", err) a.logger.Warn("failed to load previous segments", "error", err)

View File

@ -494,7 +494,7 @@ func TestActivityLog_StoreAndReadHyperloglog(t *testing.T) {
if err != nil { if err != nil {
t.Fatalf("error storing hyperloglog in storage: %v", err) t.Fatalf("error storing hyperloglog in storage: %v", err)
} }
fetchedHll := a.CreateOrFetchHyperlogLog(ctx, currentMonth) fetchedHll, err := a.CreateOrFetchHyperlogLog(ctx, currentMonth)
// check the distinct count stored from hll // check the distinct count stored from hll
if fetchedHll.Estimate() != 4 { if fetchedHll.Estimate() != 4 {
t.Fatalf("wrong number of distinct elements: expected: 5 actual: %v", fetchedHll.Estimate()) t.Fatalf("wrong number of distinct elements: expected: 5 actual: %v", fetchedHll.Estimate())

View File

@ -1,12 +1,156 @@
package vault package vault
import ( import (
"context"
"errors"
"fmt"
"sort" "sort"
"time" "time"
"github.com/axiomhq/hyperloglog"
"github.com/hashicorp/vault/helper/timeutil"
"github.com/hashicorp/vault/sdk/logical"
"github.com/hashicorp/vault/vault/activity" "github.com/hashicorp/vault/vault/activity"
) )
type HLLGetter func(ctx context.Context, startTime time.Time) (*hyperloglog.Sketch, error)
// computeCurrentMonthForBillingPeriod computes the current month's data with respect
// to a billing period.
func (a *ActivityLog) computeCurrentMonthForBillingPeriod(ctx context.Context, byMonth map[int64]*processMonth, startTime time.Time, endTime time.Time) (*activity.MonthRecord, error) {
return a.computeCurrentMonthForBillingPeriodInternal(ctx, byMonth, a.CreateOrFetchHyperlogLog, startTime, endTime)
}
// CreateOrFetchHyperlogLog creates a new hyperlogLog for each startTime (month) if it does not exist in storage.
// hyperlogLog is used here to solve count-distinct problem i.e, to count the number of distinct clients
// In activity log, hyperloglog is a sketch containing clientID's in a given month
func (a *ActivityLog) CreateOrFetchHyperlogLog(ctx context.Context, startTime time.Time) (*hyperloglog.Sketch, error) {
monthlyHLLPath := fmt.Sprintf("%s%d", distinctClientsBasePath, startTime.Unix())
hll := hyperloglog.New()
data, err := a.view.Get(ctx, monthlyHLLPath)
if err != nil {
// If there is no hll, we should log the error, as having this fire multiple times
// is a sign that something is wrong with hll store/get. However, this is not a
// critical failure (in fact it is expected during the first month rotation after
// this code is deployed), so we will not throw an error.
a.logger.Warn("fetch of hyperloglog threw an error at path", monthlyHLLPath, "error", err)
}
if data == nil {
a.logger.Trace("creating hyperloglog ", "path", monthlyHLLPath)
err = a.StoreHyperlogLog(ctx, startTime, hll)
if err != nil {
return hll, fmt.Errorf("error storing hyperloglog at path %s: error %w", monthlyHLLPath, err)
}
} else {
err = hll.UnmarshalBinary(data.Value)
if err != nil {
return hll, fmt.Errorf("error unmarshaling hyperloglog at path %s: error %w", monthlyHLLPath, err)
}
}
return hll, nil
}
// StoreHyperlogLog stores the hyperloglog (a sketch containing client IDs) for startTime (month) in storage
func (a *ActivityLog) StoreHyperlogLog(ctx context.Context, startTime time.Time, newHll *hyperloglog.Sketch) error {
monthlyHLLPath := fmt.Sprintf("%s%d", distinctClientsBasePath, startTime.Unix())
a.logger.Trace("storing hyperloglog ", "path", monthlyHLLPath)
marshalledHll, err := newHll.MarshalBinary()
if err != nil {
return err
}
err = a.view.Put(ctx, &logical.StorageEntry{
Key: monthlyHLLPath,
Value: marshalledHll,
})
if err != nil {
return err
}
return nil
}
func (a *ActivityLog) computeCurrentMonthForBillingPeriodInternal(ctx context.Context, byMonth map[int64]*processMonth, hllGetFunc HLLGetter, startTime time.Time, endTime time.Time) (*activity.MonthRecord, error) {
// Fetch all hyperloglogs for months from startMonth to endMonth. If a month doesn't have an associated
// hll, warn and continue.
// hllMonthlyTimestamp is the start time of the month corresponding to which a hyperloglog of that month's
// client data is stored. The path at which the hyperloglog for a month is stored containes this timestamp.
hllMonthlyTimestamp := timeutil.StartOfMonth(startTime)
billingPeriodHLL := hyperloglog.New()
for hllMonthlyTimestamp.Before(timeutil.StartOfMonth(endTime)) || hllMonthlyTimestamp.Equal(timeutil.StartOfMonth(endTime)) {
monthSketch, err := hllGetFunc(ctx, hllMonthlyTimestamp)
// If there's an error with the hyperloglog fetch, we should still deduplicate on
// the hlls that we have so we will warn that we couldn't find a hll for the month
// and continue.
if err != nil {
a.logger.Warn("no hyperloglog associated with timestamp", "timestamp", hllMonthlyTimestamp)
hllMonthlyTimestamp = timeutil.StartOfNextMonth(hllMonthlyTimestamp)
continue
}
// Union the monthly hll into the billing period's hll
err = billingPeriodHLL.Merge(monthSketch)
if err != nil {
// In this case we can't afford to fail silently. Since this error indicates
// data corruption, we should not try to do any further deduplication
return nil, err
}
hllMonthlyTimestamp = timeutil.StartOfNextMonth(hllMonthlyTimestamp)
}
// Now we will add the clients for the current month to a copy of the billing period's hll to
// see how the cardinality grows.
billingPeriodHLLWithCurrentMonthEntityClients := billingPeriodHLL.Clone()
billingPeriodHLLWithCurrentMonthNonEntityClients := billingPeriodHLL.Clone()
// There's at most one month of data here. We should validate this assumption explicitly
if len(byMonth) > 1 {
return nil, errors.New(fmt.Sprintf("multiple months of data found in partial month's client count breakdowns: %+v\n", byMonth))
}
totalEntities := 0
totalNonEntities := 0
for _, month := range byMonth {
if month.NewClients == nil || month.NewClients.Counts == nil || month.Counts == nil {
return nil, errors.New("malformed current month used to calculate current month's activity")
}
// Note that the following calculations assume that all clients seen are currently in
// the NewClients section of byMonth. It is best to explicitly check this, just verify
// our assumptions about the passed in byMonth argument.
if len(month.Counts.Entities) != len(month.NewClients.Counts.Entities) ||
len(month.Counts.NonEntities) != len(month.NewClients.Counts.NonEntities) {
return nil, errors.New("current month clients cache assumes billing period")
}
// All the clients for the current month are in the newClients section, initially.
// We need to deduplicate these clients across the billing period by adding them
// into the billing period hyperloglogs.
entities := month.NewClients.Counts.Entities
nonEntities := month.NewClients.Counts.NonEntities
if entities != nil {
for entityID := range entities {
billingPeriodHLLWithCurrentMonthEntityClients.Insert([]byte(entityID))
totalEntities += 1
}
}
if nonEntities != nil {
for nonEntityID := range nonEntities {
billingPeriodHLLWithCurrentMonthNonEntityClients.Insert([]byte(nonEntityID))
totalNonEntities += 1
}
}
}
// The number of new entities for the current month is approximately the size of the hll with
// the current month's entities minus the size of the initial billing period hll.
currentMonthNewEntities := billingPeriodHLLWithCurrentMonthEntityClients.Estimate() - billingPeriodHLL.Estimate()
currentMonthNewNonEntities := billingPeriodHLLWithCurrentMonthNonEntityClients.Estimate() - billingPeriodHLL.Estimate()
return &activity.MonthRecord{
NewClients: &activity.NewClientRecord{Counts: &activity.CountsRecord{EntityClients: int(currentMonthNewEntities), NonEntityClients: int(currentMonthNewNonEntities)}},
Counts: &activity.CountsRecord{EntityClients: totalEntities, NonEntityClients: totalNonEntities},
}, nil
}
// sortALResponseNamespaces sorts the namespaces for activity log responses. // sortALResponseNamespaces sorts the namespaces for activity log responses.
func (a *ActivityLog) sortALResponseNamespaces(byNamespaceResponse []*ResponseNamespace) { func (a *ActivityLog) sortALResponseNamespaces(byNamespaceResponse []*ResponseNamespace) {
sort.Slice(byNamespaceResponse, func(i, j int) bool { sort.Slice(byNamespaceResponse, func(i, j int) bool {
@ -106,15 +250,3 @@ func (a *ActivityLog) sortActivityLogMonthsResponse(months []*ResponseMonth) {
} }
} }
} }
// TODO
// computeCurrentMonthForBillingPeriod computes the current month's data with respect
// to a billing period. This function is currently a stub with the bare minimum amount
// of data to get the pre-existing tests to pass. It will be filled out in a separate PR
// and this comment will be removed.
func (a *ActivityLog) computeCurrentMonthForBillingPeriod(byMonth map[int64]*processMonth, startTime time.Time, endTime time.Time) (*activity.MonthRecord, error) {
return &activity.MonthRecord{
NewClients: &activity.NewClientRecord{Counts: &activity.CountsRecord{EntityClients: 0, NonEntityClients: 0}},
Counts: &activity.CountsRecord{EntityClients: 0, NonEntityClients: 0},
}, nil
}

View File

@ -0,0 +1,128 @@
package vault
import (
"context"
"fmt"
"testing"
"time"
"github.com/axiomhq/hyperloglog"
"github.com/hashicorp/vault/helper/timeutil"
)
func Test_ActivityLog_ComputeCurrentMonthForBillingPeriodInternal(t *testing.T) {
// populate the first month with clients 1-10
monthOneHLL := hyperloglog.New()
// populate the second month with clients 5-15
monthTwoHLL := hyperloglog.New()
// populate the third month with clients 10-20
monthThreeHLL := hyperloglog.New()
for i := 0; i < 20; i++ {
clientID := []byte(fmt.Sprintf("client_%d", i))
if i < 10 {
monthOneHLL.Insert(clientID)
}
if 5 <= i && i < 15 {
monthTwoHLL.Insert(clientID)
}
if 10 <= i && i < 20 {
monthThreeHLL.Insert(clientID)
}
}
mockHLLGetFunc := func(ctx context.Context, startTime time.Time) (*hyperloglog.Sketch, error) {
currMonthStart := timeutil.StartOfMonth(time.Now())
if startTime.Equal(timeutil.MonthsPreviousTo(3, currMonthStart)) {
return monthThreeHLL, nil
}
if startTime.Equal(timeutil.MonthsPreviousTo(2, currMonthStart)) {
return monthTwoHLL, nil
}
if startTime.Equal(timeutil.MonthsPreviousTo(1, currMonthStart)) {
return monthOneHLL, nil
}
return nil, fmt.Errorf("bad start time")
}
// Let's add 2 entities exclusive to month 1 (clients 0,1),
// 2 entities shared by month 1 and 2 (clients 5,6),
// 2 entities shared by month 2 and 3 (clients 10,11), and
// 2 entities exclusive to month 3 (15,16). Furthermore, we can add
// 3 new entities (clients 20,21, and 22).
entitiesStruct := make(map[string]struct{}, 0)
entitiesStruct["client_0"] = struct{}{}
entitiesStruct["client_1"] = struct{}{}
entitiesStruct["client_5"] = struct{}{}
entitiesStruct["client_6"] = struct{}{}
entitiesStruct["client_10"] = struct{}{}
entitiesStruct["client_11"] = struct{}{}
entitiesStruct["client_15"] = struct{}{}
entitiesStruct["client_16"] = struct{}{}
entitiesStruct["client_20"] = struct{}{}
entitiesStruct["client_21"] = struct{}{}
entitiesStruct["client_22"] = struct{}{}
// We will add 3 nonentity clients from month 1 (clients 2,3,4),
// 3 shared by months 1 and 2 (7,8,9),
// 3 shared by months 2 and 3 (12,13,14), and
// 3 exclusive to month 3 (17,18,19). We will also
// add 4 new nonentity clients.
nonEntitiesStruct := make(map[string]struct{}, 0)
nonEntitiesStruct["client_2"] = struct{}{}
nonEntitiesStruct["client_3"] = struct{}{}
nonEntitiesStruct["client_4"] = struct{}{}
nonEntitiesStruct["client_7"] = struct{}{}
nonEntitiesStruct["client_8"] = struct{}{}
nonEntitiesStruct["client_9"] = struct{}{}
nonEntitiesStruct["client_12"] = struct{}{}
nonEntitiesStruct["client_13"] = struct{}{}
nonEntitiesStruct["client_14"] = struct{}{}
nonEntitiesStruct["client_17"] = struct{}{}
nonEntitiesStruct["client_18"] = struct{}{}
nonEntitiesStruct["client_19"] = struct{}{}
nonEntitiesStruct["client_23"] = struct{}{}
nonEntitiesStruct["client_24"] = struct{}{}
nonEntitiesStruct["client_25"] = struct{}{}
nonEntitiesStruct["client_26"] = struct{}{}
counts := &processCounts{
Entities: entitiesStruct,
NonEntities: nonEntitiesStruct,
}
currentMonthClientsMap := make(map[int64]*processMonth, 1)
currentMonthClients := &processMonth{
Counts: counts,
NewClients: &processNewClients{Counts: counts},
}
// Technially I think currentMonthClientsMap should have the keys as
// unix timestamps, but for the purposes of the unit test it doesn't
// matter what the values actually are.
currentMonthClientsMap[0] = currentMonthClients
core, _, _ := TestCoreUnsealed(t)
a := core.activityLog
endTime := timeutil.StartOfMonth(time.Now())
startTime := timeutil.MonthsPreviousTo(3, endTime)
monthRecord, err := a.computeCurrentMonthForBillingPeriodInternal(context.Background(), currentMonthClientsMap, mockHLLGetFunc, startTime, endTime)
if err != nil {
t.Fatal(err)
}
// We should have 11 entity clients and 16 nonentity clients, and 3 new entity clients
// and 4 new nonentity clients
if monthRecord.Counts.EntityClients != 11 {
t.Fatalf("wrong number of entity clients. Expected 11, got %d", monthRecord.Counts.EntityClients)
}
if monthRecord.Counts.NonEntityClients != 16 {
t.Fatalf("wrong number of non entity clients. Expected 16, got %d", monthRecord.Counts.NonEntityClients)
}
if monthRecord.NewClients.Counts.EntityClients != 3 {
t.Fatalf("wrong number of new entity clients. Expected 3, got %d", monthRecord.NewClients.Counts.EntityClients)
}
if monthRecord.NewClients.Counts.NonEntityClients != 4 {
t.Fatalf("wrong number of new non entity clients. Expected 4, got %d", monthRecord.NewClients.Counts.NonEntityClients)
}
}