Skip to content

Commit

Permalink
hot/candidate cache: remove damping on key qps
Browse files Browse the repository at this point in the history
The damping was a bit problematic in that it prevented any hits less
than a second after the initial candidate-cache insertion from having a
non-zero qps calculated.

In this new windowed-QPS type, we specifically avoid including a
time.Time so there are no pointers for the Garbage Collector to scan. As
such, we now measure time relative to a new `baseTime` in the Galaxy. To
facilitate testing, time is now managed by an implementation of the
go-clocks package's Clock interface.

Additionally, this has a few other properties:
 - After some time a value is considered "stale" and reset. (so as not
   to penalize a key that's hung in the candidate cache for hours
   relative to one that's been evicted and brought back)
 - QPS is calculated relative to the current "epoch" for that key (which
   is either the creation time or the last time it was reset due to
   staleness.

To facilitate a simpler Promoter implementation, we add a Hits field
that's guaranteed to be 0 on the first fetch.
  • Loading branch information
dfinkel committed Jul 29, 2022
1 parent 9d0cb74 commit f1529d7
Show file tree
Hide file tree
Showing 6 changed files with 296 additions and 144 deletions.
84 changes: 66 additions & 18 deletions galaxycache.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ import (
"github.com/vimeo/galaxycache/promoter"
"github.com/vimeo/galaxycache/singleflight"

"github.com/vimeo/go-clocks"

"go.opencensus.io/stats"
"go.opencensus.io/tag"
"go.opencensus.io/trace"
Expand Down Expand Up @@ -142,21 +144,26 @@ func (universe *Universe) NewGalaxy(name string, cacheBytes int64, getter Backen
}

gOpts := galaxyOpts{
promoter: &promoter.DefaultPromoter{},
hcRatio: 8, // default hotcache size is 1/8th of cacheBytes
maxCandidates: 100,
promoter: &promoter.DefaultPromoter{},
hcRatio: 8, // default hotcache size is 1/8th of cacheBytes
maxCandidates: 100,
clock: clocks.DefaultClock(),
resetIdleStatsAge: time.Minute,
}
for _, opt := range opts {
opt.apply(&gOpts)
}
g := &Galaxy{
name: name,
getter: getter,
peerPicker: universe.peerPicker,
cacheBytes: cacheBytes,
mainCache: newCache(MainCache),
hotCache: newCache(HotCache),
candidateCache: newCandidateCache(gOpts.maxCandidates),
name: name,
getter: getter,
peerPicker: universe.peerPicker,
cacheBytes: cacheBytes,
mainCache: newCache(MainCache),
hotCache: newCache(HotCache),
candidateCache: newCandidateCache(gOpts.maxCandidates),
baseTime: gOpts.clock.Now(),
resetIdleStatsAge: gOpts.resetIdleStatsAge,
clock: gOpts.clock,
hcStatsWithTime: HCStatsWithTime{
hcs: &promoter.HCStats{
HCCapacity: cacheBytes / gOpts.hcRatio,
Expand Down Expand Up @@ -241,6 +248,16 @@ type Galaxy struct {

opts galaxyOpts

baseTime time.Time

// Time that must elapse without any touches to a key before we clear
// its stats with the next touch.
// This protects intermittently hot keys from having very low qps
// calculations during a traffic burst.
resetIdleStatsAge time.Duration

clock clocks.Clock

_ int32 // force Stats to be 8-byte aligned on 32-bit platforms

// Stats are statistics on the galaxy.
Expand All @@ -250,6 +267,11 @@ type Galaxy struct {
parent *Universe
}

// now returns the current time relative to the baseTime
func (g *Galaxy) now() time.Duration {
return g.clock.Now().Sub(g.baseTime)
}

// GalaxyOption is an interface for implementing functional galaxy options
type GalaxyOption interface {
apply(*galaxyOpts)
Expand All @@ -258,9 +280,11 @@ type GalaxyOption interface {
// galaxyOpts contains optional fields for the galaxy (each with a default
// value if not set)
type galaxyOpts struct {
promoter promoter.Interface
hcRatio int64
maxCandidates int
promoter promoter.Interface
hcRatio int64
maxCandidates int
clock clocks.Clock
resetIdleStatsAge time.Duration
}

type funcGalaxyOption func(*galaxyOpts)
Expand Down Expand Up @@ -298,6 +322,23 @@ func WithMaxCandidates(n int) GalaxyOption {
})
}

// WithClock lets one override the clock used internally for key-stats
// accounting (among other things).
func WithClock(clk clocks.Clock) GalaxyOption {
return newFuncGalaxyOption(func(g *galaxyOpts) {
g.clock = clk
})
}

// WithIdleStatsAgeResetWindow overrides the default interval after which a key
// that's been idle for a while gets its stats reset (such that that hit is
// recorded as if it were the first).
func WithIdleStatsAgeResetWindow(age time.Duration) GalaxyOption {
return newFuncGalaxyOption(func(g *galaxyOpts) {
g.resetIdleStatsAge = age
})
}

// flightGroup is defined as an interface which flightgroup.Group
// satisfies. We define this so that we may test with an alternate
// implementation.
Expand Down Expand Up @@ -418,7 +459,7 @@ func (g *Galaxy) Get(ctx context.Context, key string, dest Codec) error {

if hlvl.isHit() {
span.Annotatef([]trace.Attribute{trace.BoolAttribute("cache_hit", true)}, "Cache hit in %s", hlvl)
value.stats.touch()
value.stats.touch(g.resetIdleStatsAge, g.now())
g.recordRequest(ctx, hlvl, false)
g.recordStats(ctx, nil, MValueLength.M(int64(len(value.data))))
return dest.UnmarshalBinary(value.data)
Expand All @@ -436,7 +477,7 @@ func (g *Galaxy) Get(ctx context.Context, key string, dest Codec) error {
g.recordStats(ctx, nil, MLoadErrors.M(1))
return err
}
value.stats.touch()
value.stats.touch(g.resetIdleStatsAge, g.now())
g.recordStats(ctx, nil, MValueLength.M(int64(len(value.data))))
if destPopulated {
return nil
Expand Down Expand Up @@ -520,7 +561,7 @@ func (g *Galaxy) load(ctx context.Context, key string, dest Codec) (value valWit
g.Stats.CoalescedBackendLoads.Add(1)
g.recordStats(ctx, nil, MCoalescedBackendLoads.M(1))
destPopulated = true // only one caller of load gets this return value
value = newValWithStat(data, nil)
value = g.newValWithStat(data, nil)
g.populateCache(ctx, key, value, &g.mainCache)
return &valWithLevel{value, hitBackend, authoritative, peerErr, err}, nil
})
Expand Down Expand Up @@ -549,15 +590,22 @@ func (g *Galaxy) getFromPeer(ctx context.Context, peer RemoteFetcher, key string
kStats, ok := g.candidateCache.get(key)
if !ok {
kStats = g.addNewToCandidateCache(key)
// NB: we do not touch() kStats here because that's reserved
// for code outside the singleflight block.
// This has the advantageous effect of guaranteeing that
// hitCount is 0 if it's a new key, thus making it easy for a
// promoter to distinguish a new key.
}

g.maybeUpdateHotCacheStats() // will update if at least a second has passed since the last update

hitCount, keyQPS := kStats.val(g.now())
stats := promoter.Stats{
KeyQPS: kStats.val(),
KeyQPS: keyQPS,
Hits: hitCount,
HCStats: g.hcStatsWithTime.hcs,
}
value := newValWithStat(data, kStats)
value := g.newValWithStat(data, kStats)
if g.opts.promoter.ShouldPromote(key, value.data, stats) {
g.populateCache(ctx, key, value, &g.hotCache)
}
Expand Down
Loading

0 comments on commit f1529d7

Please sign in to comment.