Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Periodically capture resource metrics for every sandbox #216

Open
wants to merge 27 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
749fbf2
make metrics struct more readable and bump env version
0div Dec 5, 2024
882f847
update orchestrator checks to use ticker and check metrics at differe…
0div Dec 5, 2024
c1c4994
add logger methods to log cpu percent and memory MB
0div Dec 5, 2024
203e525
log metrics on start and on end; reduce httpClient timeout to one second
0div Dec 5, 2024
290bbd3
Merge branch 'main' of https://github.com/e2b-dev/infra into periodic…
0div Dec 6, 2024
419b581
check if envd version is correct on every metrics log attempt
0div Dec 6, 2024
ff936ff
make CPU and MEM logger send event
0div Dec 6, 2024
f49461a
address PR comments
0div Dec 6, 2024
5ccbb3d
Update packages/envd/internal/host/metrics.go
0div Dec 6, 2024
2baee66
Update packages/envd/internal/host/metrics.go
0div Dec 6, 2024
805b77d
fix old method name
0div Dec 6, 2024
3c0343f
Merge branch 'main' of https://github.com/e2b-dev/infra into periodic…
0div Dec 6, 2024
c705e1c
remove legacy stats logic
0div Dec 6, 2024
9c94a61
specify platform/arch in docker image tf resource to avoid docker pu…
0div Dec 6, 2024
5f859c6
report more memory data for debugging purposes; attempt to calculate f…
0div Dec 7, 2024
5df1d59
Update packages/shared/pkg/logs/logger.go
0div Dec 9, 2024
d1df2ac
Update packages/shared/pkg/logs/logger.go
0div Dec 9, 2024
1eb4d77
Update packages/shared/pkg/logs/logger.go
0div Dec 9, 2024
d7589b2
Merge branch 'main' of https://github.com/e2b-dev/infra into periodic…
0div Dec 9, 2024
27e9859
address PR review comments
0div Dec 9, 2024
c1294a1
Merge branch 'main' of https://github.com/e2b-dev/infra into periodic…
0div Dec 16, 2024
bbd42e6
add metrics endpoint to api spec and generate
0div Dec 17, 2024
9f44dc9
unmarshall log line into metric
0div Dec 18, 2024
4736cdf
fix merge conflicts
0div Jan 17, 2025
562e22c
Merge branch 'main' of https://github.com/e2b-dev/infra into periodic…
0div Jan 17, 2025
11880e6
Merge branch 'main' of https://github.com/e2b-dev/infra into periodic…
0div Jan 18, 2025
a3034c1
* add cpuTotal to spec
0div Jan 18, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions packages/api/internal/api/api.gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions packages/api/internal/api/types.gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

101 changes: 101 additions & 0 deletions packages/api/internal/handlers/sanbox_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
package handlers

import (
"encoding/json"
"fmt"
"net/http"
"slices"
"strings"
"time"

"github.com/gin-gonic/gin"
"github.com/grafana/loki/pkg/loghttp"
"github.com/grafana/loki/pkg/logproto"
"go.opentelemetry.io/otel/attribute"

"github.com/e2b-dev/infra/packages/api/internal/api"
"github.com/e2b-dev/infra/packages/api/internal/auth"
authcache "github.com/e2b-dev/infra/packages/api/internal/cache/auth"
"github.com/e2b-dev/infra/packages/api/internal/utils"
"github.com/e2b-dev/infra/packages/shared/pkg/telemetry"
)

// GetSandboxesSandboxIDMetrics responds with the resource metrics that were
// logged for the given sandbox, read back from Loki.
//
// It queries Loki for the sandbox's envd "metrics" log lines scoped to the
// team found in the request context, decodes each line as JSON, and writes the
// resulting samples, sorted by timestamp, as an api.SandboxMetrics JSON body
// with status 200. Lines that fail to decode are reported and skipped.
//
// A failed Loki query is answered with 404 and an unexpected Loki result type
// with 500; in both cases the error is also reported through telemetry.
func (a *APIStore) GetSandboxesSandboxIDMetrics(
	c *gin.Context,
	sandboxID string,
) {
	ctx := c.Request.Context()
	sandboxID = utils.ShortID(sandboxID)

	// Unchecked assertion: panics if the context carries no team info.
	// NOTE(review): presumably guaranteed by the auth middleware — confirm.
	teamID := c.Value(auth.TeamContextKey).(authcache.AuthTeamInfo).Team.ID

	telemetry.SetAttributes(ctx,
		attribute.String("instance.id", sandboxID),
		attribute.String("team.id", teamID.String()),
	)

	// start is intentionally left at its zero value, so the query range is
	// [zero time, now].
	// NOTE(review): with FORWARD direction and a limit of 100 this looks like it
	// returns the oldest 100 entries rather than the most recent — confirm.
	var start time.Time

	end := time.Now()

	// Sanitize ID
	// https://grafana.com/blog/2021/01/05/how-to-escape-special-characters-with-lokis-logql/
	// The ID is embedded in a backtick-quoted LogQL string, so backticks are stripped.
	id := strings.ReplaceAll(sandboxID, "`", "")
	query := fmt.Sprintf(
		"{source=\"logs-collector\", category=\"metrics\", service=\"envd\", teamID=`%s`, sandboxID=`%s`}", teamID.String(), id)

	res, err := a.lokiClient.QueryRange(query, 100, start, end, logproto.FORWARD, time.Duration(0), time.Duration(0), true)
	if err != nil {
		errMsg := fmt.Errorf("error when returning metrics for sandbox: %w", err)
		telemetry.ReportCriticalError(ctx, errMsg)
		// NOTE(review): a backend query failure is reported to the client as 404;
		// kept as-is for compatibility — confirm this is the intended status.
		a.sendAPIStoreError(c, http.StatusNotFound, fmt.Sprintf("Error returning metrics for sandbox '%s'", sandboxID))

		return
	}

	switch res.Data.Result.Type() {
	case loghttp.ResultTypeStream:
		value := res.Data.Result.(loghttp.Streams)

		// Non-nil empty slice so that an empty result encodes as [] rather than null.
		metrics := make([]api.SandboxMetric, 0)

		for _, stream := range value {
			for _, entry := range stream.Entries {
				// Declared per iteration so every line decodes into a fresh zero
				// value and fields missing from one line never inherit the
				// previous line's data.
				var metric struct {
					CPUPct      *float32 `json:"cpuPct"`
					MemTotalMiB int64    `json:"memTotalMiB"`
					MemUsedMiB  int64    `json:"memUsedMiB"`
				}

				err := json.Unmarshal([]byte(entry.Line), &metric)
				if err != nil {
					// A single malformed line must not fail the whole response.
					telemetry.ReportCriticalError(ctx, fmt.Errorf("failed to unmarshal metric: %w", err))
					continue
				}

				metrics = append(metrics, api.SandboxMetric{
					Timestamp:   entry.Timestamp,
					CpuPct:      metric.CPUPct,
					MemMiBTotal: metric.MemTotalMiB,
					MemMiBUsed:  metric.MemUsedMiB,
				})
			}
		}

		// Sort metrics by timestamp (they are returned by the time they arrived in Loki)
		slices.SortFunc(metrics, func(x, y api.SandboxMetric) int {
			return x.Timestamp.Compare(y.Timestamp)
		})

		c.JSON(http.StatusOK, &api.SandboxMetrics{
			Metrics: metrics,
		})

	default:
		// Report the actual result type value; %T would only ever print the
		// static Go type returned by Type(), hiding which type was unexpected.
		errMsg := fmt.Errorf("unexpected value type %s", res.Data.Result.Type())
		telemetry.ReportCriticalError(ctx, errMsg)
		a.sendAPIStoreError(c, http.StatusInternalServerError, fmt.Sprintf("Error returning metrics for sandbox '%s'", sandboxID))

		return
	}
}
17 changes: 11 additions & 6 deletions packages/envd/internal/host/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@ import (
)

type Metrics struct {
CPU float64 `json:"cpu_pct"` // Percent rounded to 2 decimal places
Mem uint64 `json:"mem_bytes"` // Total virtual memory in bytes
Timestamp int64 `json:"ts"` // Unix Timestamp in UTC
Timestamp int64 `json:"ts"` // Unix Timestamp in UTC
CPUPercent float64 `json:"cpu_pct"` // Percent rounded to 2 decimal places
MemTotalMiB uint64 `json:"mem_total_mib"` // Total virtual memory in MiB
MemUsedMiB uint64 `json:"mem_used_mib"` // Used virtual memory in MiB
}

func GetMetrics() (*Metrics, error) {
Expand All @@ -20,6 +21,9 @@ func GetMetrics() (*Metrics, error) {
return nil, err
}

memUsedMiB := v.Used / 1024 / 1024
memTotalMiB := v.Total / 1024 / 1024

cpuPcts, err := cpu.Percent(0, false)
if err != nil {
return nil, err
Expand All @@ -32,8 +36,9 @@ func GetMetrics() (*Metrics, error) {
}

return &Metrics{
CPU: cpuPctRounded,
Mem: v.Total,
Timestamp: time.Now().UTC().Unix(),
Timestamp: time.Now().UTC().Unix(),
CPUPercent: cpuPctRounded,
MemUsedMiB: memUsedMiB,
MemTotalMiB: memTotalMiB,
}, nil
}
82 changes: 73 additions & 9 deletions packages/orchestrator/internal/sandbox/checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,49 @@ package sandbox

import (
"context"
"encoding/json"
"fmt"
"io"
"net/http"
"time"

"github.com/e2b-dev/infra/packages/shared/pkg/consts"
"github.com/e2b-dev/infra/packages/shared/pkg/utils"
"golang.org/x/mod/semver"
)

const (
// healthCheckInterval is the period of the health-check ticker created in logHeathAndUsage.
healthCheckInterval = 10 * time.Second
// metricsCheckInterval is the period of the metrics ticker created in logHeathAndUsage.
metricsCheckInterval = 5 * time.Second
// minEnvdVersionForMetrcis is the envd version that LogMetrics compares the
// sandbox's EnvdVersion against before it requests metrics.
// NOTE(review): this value has no leading "v", which golang.org/x/mod/semver
// requires for a version to be valid — confirm the comparison normalizes it.
minEnvdVersionForMetrcis = "0.1.3"
)

func (s *Sandbox) logHeathAndUsage(ctx *utils.LockableCancelableContext) {
healthTicker := time.NewTicker(healthCheckInterval)
metricsTicker := time.NewTicker(metricsCheckInterval)
defer func() {
healthTicker.Stop()
metricsTicker.Stop()
}()

// Get metrics on sandbox startup
go s.LogMetrics(ctx)
0div marked this conversation as resolved.
Show resolved Hide resolved

ticker := time.NewTicker(15 * time.Second)
defer ticker.Stop()

for {
select {
case <-ticker.C:
case <-healthTicker.C:
childCtx, cancel := context.WithTimeout(ctx, time.Second)

ctx.Lock()
s.Healthcheck(childCtx, false)
ctx.Unlock()

cancel()

stats, err := s.stats.Stats()
if err != nil {
s.Logger.Warnf("failed to get stats: %s", err)
} else {
s.Logger.CPUUsage(stats.CPUCount)
s.Logger.MemoryUsage(stats.MemoryMB)
}
case <-metricsTicker.C:
s.LogMetrics(ctx)
case <-ctx.Done():
return
}
Expand Down Expand Up @@ -68,3 +80,55 @@ func (s *Sandbox) Healthcheck(ctx context.Context, alwaysReport bool) {
return
}
}

// GetMetrics performs a GET request against the /metrics endpoint on the
// sandbox slot's host IP (envd server port) and decodes the JSON response body
// into a SandboxMetrics value.
//
// The request is bound to ctx. On any failure (building the request, sending
// it, a status other than 200, or a body that does not decode) the zero
// SandboxMetrics is returned together with the error.
func (s *Sandbox) GetMetrics(ctx context.Context) (SandboxMetrics, error) {
	var empty SandboxMetrics

	endpoint := fmt.Sprintf("http://%s:%d/metrics", s.slot.HostIP(), consts.DefaultEnvdServerPort)

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return empty, err
	}

	resp, err := httpClient.Do(req)
	if err != nil {
		return empty, err
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code != http.StatusOK {
		return empty, fmt.Errorf("unexpected status code: %d", code)
	}

	// Decode into a separate value so a partially filled struct is never
	// returned alongside an error.
	var decoded SandboxMetrics
	if err := json.NewDecoder(resp.Body).Decode(&decoded); err != nil {
		return empty, err
	}

	return decoded, nil
}

// LogMetrics fetches the sandbox's current metrics and forwards them to the
// sandbox logger (CPU percentage, then total and used memory in MiB).
//
// Nothing is fetched or logged when the sandbox's envd version does not pass
// the isGTEVersion check against minEnvdVersionForMetrcis. A failed fetch is
// logged as a warning and otherwise ignored.
func (s *Sandbox) LogMetrics(ctx context.Context) {
	if !isGTEVersion(s.Sandbox.EnvdVersion, minEnvdVersionForMetrcis) {
		return
	}

	m, err := s.GetMetrics(ctx)
	if err != nil {
		s.Logger.Warnf("failed to get metrics: %s", err)
		return
	}

	s.Logger.CPUPct(m.CPUPercent)
	s.Logger.MemMiB(m.MemTotalMiB, m.MemUsedMiB)
}

// isGTEVersion reports whether curVersion is greater than or equal to
// minVersion under semantic-version ordering.
//
// Either argument may be written with or without the leading "v" that
// golang.org/x/mod/semver requires; a missing prefix is added before the
// comparison. Normalizing both sides matters: semver.Compare orders an invalid
// version below every valid one, so an unprefixed minVersion such as "0.1.3"
// would otherwise make every valid curVersion compare as newer.
//
// It returns false when either version is empty or is still not a valid
// semantic version after normalization.
func isGTEVersion(curVersion, minVersion string) bool {
	// withV adds the "v" prefix to non-empty strings that lack it.
	withV := func(v string) string {
		if len(v) > 0 && v[0] != 'v' {
			return "v" + v
		}

		return v
	}

	curVersion = withV(curVersion)
	minVersion = withV(minVersion)

	if !semver.IsValid(curVersion) || !semver.IsValid(minVersion) {
		return false
	}

	return semver.Compare(curVersion, minVersion) >= 0
}
8 changes: 8 additions & 0 deletions packages/orchestrator/internal/sandbox/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package sandbox

// SandboxMetrics is a single resource-usage sample for a sandbox. Its JSON
// tags define the wire field names used when a sample is decoded from JSON.
type SandboxMetrics struct {
Timestamp int64 `json:"ts"` // Unix Timestamp in UTC
CPUPercent float64 `json:"cpu_pct"` // Percent rounded to 2 decimal places
MemTotalMiB uint64 `json:"mem_total_mib"` // Total virtual memory in MiB
MemUsedMiB uint64 `json:"mem_used_mib"` // Used virtual memory in MiB
}
11 changes: 11 additions & 0 deletions packages/orchestrator/internal/server/sandboxes.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,18 @@ func (s *server) Delete(ctx context.Context, in *orchestrator.SandboxDeleteReque
// Ideally we would rely only on the goroutine defer.
s.sandboxes.Remove(in.SandboxId)

// Don't allow connecting to the sandbox anymore.
s.dns.Remove(in.SandboxId, sbx.Slot.HostIP())

// Remove the sandbox from the cache to prevent loading it again in API during the time the instance is stopping.
// Old comment:
// Ensure the sandbox is removed from cache.
// Ideally we would rely only on the goroutine defer.
s.sandboxes.Remove(in.SandboxId)

0div marked this conversation as resolved.
Show resolved Hide resolved
// Check health metrics before stopping the sandbox
sbx.Healthcheck(ctx, true)
sbx.LogMetrics(ctx)
0div marked this conversation as resolved.
Show resolved Hide resolved

err := sbx.Stop()
if err != nil {
Expand Down
Loading