From 4fd1618fa21782c88c5c3ca5293f2b282502512d Mon Sep 17 00:00:00 2001 From: amitsingh21 Date: Thu, 2 May 2024 17:36:52 +0530 Subject: [PATCH] fix(hscontrol): fixes high cpu usage issue during node conn/disconn (#4) --- CHANGELOG.md | 5 +++ hscontrol/app.go | 54 +++++++++++++++++-------------- hscontrol/machine.go | 16 +++++---- hscontrol/protocol_common_poll.go | 20 ++++++------ hscontrol/routes.go | 13 +++----- 5 files changed, 59 insertions(+), 49 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f8acb5e2bd..c9b52976ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,16 @@ # CHANGELOG + ## 0.23.0 (2023-XX-XX) ### BREAKING - Code reorganisation, a lot of code has moved, please review the following PRs accordingly [#1444](https://github.com/juanfont/headscale/pull/1444) +## 0.22.5-rr (2024-05-02) +### Fixed +- High CPU consumption during multiple machines/nodes connect/disconnect in tailnet + ## 0.22.4-rr (2024-01-11) ### Fixed - Subnet failover handler checks for user ID when finding new primary route diff --git a/hscontrol/app.go b/hscontrol/app.go index b8dceba8ae..27ca534811 100644 --- a/hscontrol/app.go +++ b/hscontrol/app.go @@ -8,6 +8,7 @@ import ( "io" "net" "net/http" + _ "net/http/pprof" //nolint "os" "os/signal" "sort" @@ -245,7 +246,7 @@ func (h *Headscale) expireEphemeralNodesWorker() { return } - + usersChanged := make(map[string]User, 0) for _, user := range users { machines, err := h.ListMachinesByUser(user.Name) if err != nil { @@ -257,12 +258,11 @@ func (h *Headscale) expireEphemeralNodesWorker() { return } - expiredFound := false for _, machine := range machines { if machine.isEphemeral() && machine.LastSeen != nil && time.Now(). After(machine.LastSeen.Add(h.cfg.EphemeralNodeInactivityTimeout)) { - expiredFound = true + usersChanged[user.Name] = user log.Info(). Str("machine", machine.Hostname). Msg("Ephemeral client removed from database") @@ -276,10 +276,9 @@ func (h *Headscale) expireEphemeralNodesWorker() { } } } - - if expiredFound { - h.setLastStateChangeToNow() - } + } + for _, user := range usersChanged { + h.setLastStateChangeToNow(user) } } @@ -290,7 +289,7 @@ func (h *Headscale) expireExpiredMachinesWorker() { return } - + usersChanged := make(map[string]User, 0) for _, user := range users { machines, err := h.ListMachinesByUser(user.Name) if err != nil { @@ -302,12 +301,10 @@ func (h *Headscale) expireExpiredMachinesWorker() { return } - expiredFound := false for index, machine := range machines { if machine.isExpired() && machine.Expiry.After(h.getLastStateChange(user)) { - expiredFound = true - + usersChanged[user.Name] = user err := h.ExpireMachine(&machines[index]) if err != nil { log.Error(). @@ -323,10 +320,9 @@ func (h *Headscale) expireExpiredMachinesWorker() { } } } - - if expiredFound { - h.setLastStateChangeToNow() - } + } + for _, user := range usersChanged { + h.setLastStateChangeToNow(user) } } @@ -717,6 +713,7 @@ func (h *Headscale) Serve() error { Msgf("listening and serving HTTP on: %s", h.cfg.Addr) promMux := http.NewServeMux() + promMux.Handle("/debug/pprof/", http.DefaultServeMux) promMux.Handle("/metrics", promhttp.Handler()) promHTTPServer := &http.Server{ @@ -905,24 +902,31 @@ func (h *Headscale) getTLSSettings() (*tls.Config, error) { } } -func (h *Headscale) setLastStateChangeToNow() { - var err error +func (h *Headscale) setLastStateChangeToNow(filteredUsers ...User) { + var ( + err error + users []User + ) + if len(filteredUsers) > 0 { + users = filteredUsers + } else { + users, err = h.ListUsers() + if err != nil { + log.Error(). + Caller(). + Err(err). + Msg("failed to fetch all users, failing to update last changed state.") + } + } now := time.Now().UTC() - users, err := h.ListUsers() - if err != nil { - log.Error(). - Caller(). - Err(err). - Msg("failed to fetch all users, failing to update last changed state.") - } - for _, user := range users { lastStateUpdate.WithLabelValues(user.Name, "headscale").Set(float64(now.Unix())) if h.lastStateChange == nil { h.lastStateChange = xsync.NewMapOf[time.Time]() } + h.lastStateChange.Store(user.Name, now) } } diff --git a/hscontrol/machine.go b/hscontrol/machine.go index 9f04d8ce30..a773fb9605 100644 --- a/hscontrol/machine.go +++ b/hscontrol/machine.go @@ -204,6 +204,10 @@ func filterMachinesByACL( if peer.ID == machine.ID { continue } + //Skip machines not belonging to this user + if peer.User.Name != machine.User.Name { + continue + } if machine.canAccess(filter, &machines[index]) || peer.canAccess(filter, machine) { result = append(result, peer) @@ -414,7 +418,7 @@ func (h *Headscale) SetTags(machine *Machine, tags []string) error { if err := h.UpdateACLRules(); err != nil && !errors.Is(err, errEmptyPolicy) { return err } - h.setLastStateChangeToNow() + h.setLastStateChangeToNow(machine.User) if err := h.db.Save(machine).Error; err != nil { return fmt.Errorf("failed to update tags for machine in the database: %w", err) @@ -428,7 +432,7 @@ func (h *Headscale) ExpireMachine(machine *Machine) error { now := time.Now() machine.Expiry = &now - h.setLastStateChangeToNow() + h.setLastStateChangeToNow(machine.User) if err := h.db.Save(machine).Error; err != nil { return fmt.Errorf("failed to expire machine in the database: %w", err) @@ -455,7 +459,7 @@ func (h *Headscale) RenameMachine(machine *Machine, newName string) error { } machine.GivenName = newName - h.setLastStateChangeToNow() + h.setLastStateChangeToNow(machine.User) if err := h.db.Save(machine).Error; err != nil { return fmt.Errorf("failed to rename machine in the database: %w", err) @@ -471,7 +475,7 @@ func (h *Headscale) RefreshMachine(machine *Machine, expiry time.Time) error { machine.LastSuccessfulUpdate = &now machine.Expiry = &expiry - h.setLastStateChangeToNow() + h.setLastStateChangeToNow(machine.User) if err := h.db.Save(machine).Error; err != nil { return fmt.Errorf( @@ -536,7 +540,7 @@ func (h *Headscale) isOutdated(machine *Machine) bool { // TODO(kradalby): Only request updates from users where we can talk to nodes // This would mostly be for a bit of performance, and can be calculated based on // ACLs. - lastChange := h.getLastStateChange() + lastChange := h.getLastStateChange(machine.User) lastUpdate := machine.CreatedAt if machine.LastSuccessfulUpdate != nil { lastUpdate = *machine.LastSuccessfulUpdate @@ -1068,7 +1072,7 @@ func (h *Headscale) enableRoutes(machine *Machine, routeStrs ...string) error { } } - h.setLastStateChangeToNow() + h.setLastStateChangeToNow(machine.User) return nil } diff --git a/hscontrol/protocol_common_poll.go b/hscontrol/protocol_common_poll.go index f267c9999a..25458c7be0 100644 --- a/hscontrol/protocol_common_poll.go +++ b/hscontrol/protocol_common_poll.go @@ -146,7 +146,7 @@ func (h *Headscale) handlePollCommon( // There has been an update to _any_ of the nodes that the other nodes would // need to know about - h.setLastStateChangeToNow() + h.setLastStateChangeToNow(machine.User) // The request is not ReadOnly, so we need to set up channels for updating // peers via longpoll @@ -322,9 +322,9 @@ func (h *Headscale) pollNetMapStream( Str("channel", "pollData"). Int("bytes", len(data)). Msg("Data from pollData channel written successfully") - // TODO(kradalby): Abstract away all the database calls, this can cause race conditions - // when an outdated machine object is kept alive, e.g. db is update from - // command line, but then overwritten. + // TODO(kradalby): Abstract away all the database calls, this can cause race conditions + // when an outdated machine object is kept alive, e.g. db is update from + // command line, but then overwritten. err = h.UpdateMachineFromDatabase(machine) if err != nil { log.Error(). @@ -406,9 +406,9 @@ func (h *Headscale) pollNetMapStream( Str("channel", "keepAlive"). Int("bytes", len(data)). Msg("Keep alive sent successfully") - // TODO(kradalby): Abstract away all the database calls, this can cause race conditions - // when an outdated machine object is kept alive, e.g. db is update from - // command line, but then overwritten. + // TODO(kradalby): Abstract away all the database calls, this can cause race conditions + // when an outdated machine object is kept alive, e.g. db is update from + // command line, but then overwritten. err = h.UpdateMachineFromDatabase(machine) if err != nil { log.Error(). @@ -575,9 +575,9 @@ func (h *Headscale) pollNetMapStream( Str("handler", "PollNetMapStream"). Str("machine", machine.Hostname). Msg("The client has closed the connection") - // TODO: Abstract away all the database calls, this can cause race conditions - // when an outdated machine object is kept alive, e.g. db is update from - // command line, but then overwritten. + // TODO: Abstract away all the database calls, this can cause race conditions + // when an outdated machine object is kept alive, e.g. db is update from + // command line, but then overwritten. err := h.UpdateMachineFromDatabase(machine) if err != nil { log.Error(). diff --git a/hscontrol/routes.go b/hscontrol/routes.go index 2b38140784..3e1f2874a4 100644 --- a/hscontrol/routes.go +++ b/hscontrol/routes.go @@ -313,7 +313,7 @@ func (h *Headscale) handlePrimarySubnetFailover() error { log.Error().Err(err).Msg("error getting routes") } - routesChanged := false + usersChanged := make(map[string]User, 0) for pos, route := range routes { if route.isExitRoute() { continue @@ -333,9 +333,7 @@ func (h *Headscale) handlePrimarySubnetFailover() error { return err } - - routesChanged = true - + usersChanged[route.Machine.User.Name] = route.Machine.User continue } } @@ -408,12 +406,11 @@ func (h *Headscale) handlePrimarySubnetFailover() error { return err } - routesChanged = true + usersChanged[route.Machine.User.Name] = route.Machine.User } } - - if routesChanged { - h.setLastStateChangeToNow() + for _, user := range usersChanged { + h.setLastStateChangeToNow(user) } return nil