Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

monitoring PR handling failures. #42

Merged
merged 16 commits into from
Dec 13, 2024
8 changes: 8 additions & 0 deletions cmd/telefonistka/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ func serve() {
mainGhClientCache, _ := lru.New[string, githubapi.GhClientPair](128)
prApproverGhClientCache, _ := lru.New[string, githubapi.GhClientPair](128)

go func() {
for {
log.Debugf("Stale check runner")
githubapi.GetPrMetrics(mainGhClientCache)
time.Sleep(60 * time.Second) // TODO: make this configurable? GH API rate limits are a facor here
}
}()

mux := http.NewServeMux()
mux.HandleFunc("/webhook", handleWebhook(githubWebhookSecret, mainGhClientCache, prApproverGhClientCache))
mux.Handle("/metrics", promhttp.Handler())
Expand Down
1 change: 1 addition & 0 deletions internal/pkg/githubapi/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ func HandlePREvent(eventPayload *github.PullRequestEvent, ghPrClientDetails GhPr
defer func() {
if err != nil {
SetCommitStatus(ghPrClientDetails, "error")
prom.IncPrHandleFailuresCounter(ghPrClientDetails.Owner + "/" + ghPrClientDetails.Repo)
return
}
SetCommitStatus(ghPrClientDetails, "success")
Expand Down
84 changes: 84 additions & 0 deletions internal/pkg/githubapi/pr_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package githubapi

import (
"context"
"time"

"github.com/google/go-github/v62/github"
lru "github.com/hashicorp/golang-lru/v2"
log "github.com/sirupsen/logrus"
prom "github.com/wayfair-incubator/telefonistka/internal/pkg/prometheus"
)

const (
minutesToDfineStale = 20
Oded-B marked this conversation as resolved.
Show resolved Hide resolved
)

func getRepoPrMetrics(ctx context.Context, ghClient GhClientPair, repo *github.Repository) (prWithStakeChecks int, openPRs int, openPromotionPrs int, err error) {
log.Debugf("Checking repo %s", repo.GetName())
ghOwner := repo.GetOwner().GetLogin()
prListOpts := &github.PullRequestListOptions{
State: "open",
}
prs := []*github.PullRequest{}

// paginate through PRs, there might be lot of them.
for {
perPagePrs, resp, err := ghClient.v3Client.PullRequests.List(ctx, ghOwner, repo.GetName(), prListOpts)
_ = prom.InstrumentGhCall(resp)
if err != nil {
log.Errorf("error getting PRs for %s/%s: %v", ghOwner, repo.GetName(), err)
}
Comment on lines +37 to +39
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not checking the error first, and then calling the prom.InstrumentGhCall(resp)?
Another question if the output for prom.InstrumentGhCall(resp) is not important why do bother to assign it something?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The InstrumentGhCall function returns labels just for testing.
During runtime these labels are just used internally in the function.
I chose to instrument before error handling to ensure we still instrument even if we change the error handling to break out of that code path in the future, be I can change the order if you feel it makes it less readable

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I chose to instrument before error handling to ensure we still instrument even if we change the error handling to break out of that code path in the future, be I can change the order if you feel it makes it less readable

I would add that as a comment

prs = append(prs, perPagePrs...)
if resp.NextPage == 0 {
break
}
prListOpts.Page = resp.NextPage
}

for _, pr := range prs {
if DoesPrHasLabel(pr.Labels, "promotion") {
openPromotionPrs++
}
log.Debugf("Checking PR %d", pr.GetNumber())
commitStatuses, resp, err := ghClient.v3Client.Repositories.GetCombinedStatus(ctx, ghOwner, repo.GetName(), pr.GetHead().GetSHA(), nil)
_ = prom.InstrumentGhCall(resp)
if err != nil {
log.Errorf("error getting statuses for %s/%s/%d: %v", ghOwner, repo.GetName(), pr.GetNumber(), err)
continue
}
for _, status := range commitStatuses.Statuses {
if *status.Context == "telefonistka" &&
*status.State == "pending" &&
status.UpdatedAt.GetTime().Before(time.Now().Add(-time.Minute*minutesToDfineStale)) {
log.Debugf("Adding status %s-%v-%s !!!", *status.Context, status.UpdatedAt.GetTime(), *status.State)
prWithStakeChecks++
} else {
log.Debugf("Ignoring status %s-%v-%s", *status.Context, status.UpdatedAt.GetTime(), *status.State)
}
}
}
openPRs = len(prs)

return
}

// GetPrMetrics counts the number of pending checks that are older than 20 minutes
func GetPrMetrics(mainGhClientCache *lru.Cache[string, GhClientPair]) {
ctx := context.Background() // TODO!!!!

for _, ghOwner := range mainGhClientCache.Keys() {
log.Debugf("Checking gh Owner %s", ghOwner)
ghClient, _ := mainGhClientCache.Get(ghOwner)
Oded-B marked this conversation as resolved.
Show resolved Hide resolved
repos, resp, err := ghClient.v3Client.Apps.ListRepos(ctx, nil)
_ = prom.InstrumentGhCall(resp)
Oded-B marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
log.Errorf("error getting repos for %s: %v", ghOwner, err)
continue
}
for _, repo := range repos.Repositories {
stalePendingChecks, openPrs, promotionPrs, _ := getRepoPrMetrics(ctx, ghClient, repo)
prom.PublishPrMetrics(openPrs, promotionPrs, stalePendingChecks, repo.GetFullName())
}
}
}
41 changes: 41 additions & 0 deletions internal/pkg/prometheus/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,34 @@ var (
Subsystem: "github",
}, []string{"api_group", "api_path", "repo_slug", "status", "method"})

ghOpenPrsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "open_prs",
Help: "The total number of open PRs",
Namespace: "telefonistka",
Subsystem: "github",
}, []string{"repo_slug"})

ghOpenPromotionPrsGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "open_promotion_prs",
Help: "The total number of open PRs with promotion label",
Namespace: "telefonistka",
Subsystem: "github",
}, []string{"repo_slug"})

ghOpenPrsWithPendingCheckGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "open_prs_with_pending_telefonistka_checks",
Help: "The total number of open PRs with pending Telefonistka checks(excluding PRs with very recent commits)",
Namespace: "telefonistka",
Subsystem: "github",
}, []string{"repo_slug"})

prHandleFailuresCounter = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "pr_handle_failures_total",
Help: "The total number of PR handling failures",
Namespace: "telefonistka",
Subsystem: "core",
}, []string{"repo_slug"})

whUpstreamRequestsCountVec = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "upstream_requests_total",
Help: "The total number of requests forwarded upstream servers",
Expand All @@ -47,6 +75,19 @@ var (
}, []string{"status", "method", "url"})
)

func IncPrHandleFailuresCounter(repoSlug string) {
prHandleFailuresCounter.With(prometheus.Labels{"repo_slug": repoSlug}).Inc()
}

func PublishPrMetrics(openPrs int, openPromPRs int, openPrsWithPendingChecks int, repoSlug string) {
metricLables := prometheus.Labels{
"repo_slug": repoSlug,
}
ghOpenPrsGauge.With(metricLables).Set(float64(openPrs))
ghOpenPromotionPrsGauge.With(metricLables).Set(float64(openPromPRs))
ghOpenPrsWithPendingCheckGauge.With(metricLables).Set(float64(openPrsWithPendingChecks))
}

// This function instrument Webhook hits and parsing of their content
func InstrumentWebhookHit(parsing_status string) {
webhookHitsVec.With(prometheus.Labels{"parsing": parsing_status}).Inc()
Expand Down
Loading