From 95851eaa9ea5bca5266ea46000af2dbc5654ca0c Mon Sep 17 00:00:00 2001 From: Adam Gordon Bell Date: Fri, 7 Jun 2024 15:48:00 -0400 Subject: [PATCH] Better error message for WITH DOCKER (unlazy force execution) (#4180) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a run fails in WITH DOCKER you get an error message like this: ``` ================================== ❌ FAILURE =================================== ./t/with-docker+fail-test1 *failed* | Repeating the failure error... ./t/with-docker+fail-test1 *failed* | --> WITH DOCKER RUN --privileged echo "dummy" && false ./t/with-docker+fail-test1 *failed* | Starting dockerd with data root /var/earthly/dind/7b5da64c458d405fff1f21d3a985f6812f9eaad6af9c6eb0d2b8c1ca7422a2e2/tmp.lbamkM ./t/with-docker+fail-test1 *failed* | dummy ./t/with-docker+fail-test1 *failed* | ERROR tests/with-docker/Earthfile:186:8 ./t/with-docker+fail-test1 *failed* | The command ./t/with-docker+fail-test1 *failed* | WITH DOCKER RUN --privileged echo "dummy" && false ./t/with-docker+fail-test1 *failed* | did not complete successfully. Exit code 1 ./t/with-docker+fail-test1 | unlazy force execution: process "/bin/sh -c EARTHLY_DOCKERD_DATA_ROOT=\"/var/earthly/dind/7b5da64c458d405fff1f21d3a985f6812f9eaad6af9c6eb0d2b8c1 ca7422a2e2\" EARTHLY_DOCKERD_CACHE_DATA=\"false\" EARTHLY_DOCKER_LOAD_FILES=\"\" EARTHLY_IMAGES_WITH_DIGESTS=\"\" EARTHLY_START_COMPOSE=\"false\" EARTHLY_COMPOSE_FILES=\"\" EARTHLY_COMPOSE_SERVICES=\"\" DIND_IMAGE=earthly/dind:alpine-3.19-docker-25.0.5-r0 OTEL_TRACES_EXPORTER=none PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin /var/earthly/dockerd- wrapper.sh execute /usr/bin/earth_debugger /bin/sh -c 'echo \"dummy\" && false'" did not complete successfully: exit code: 1 ``` The last part is confusing: ``` unlazy force execution: process "/bin/sh -c EARTHLY_DOCKERD_DATA_ROOT=\"/var/earthly/dind/7b5da64c458d405fff1f21d3a985f6812f9eaad6af9c6eb0d2b8c1 ca7422a2e2\" EARTHLY_DOCKERD_CACHE_DATA=\"false\" EARTHLY_DOCKER_LOAD_FILES=\"\" EARTHLY_IMAGES_WITH_DIGESTS=\"\" EARTHLY_START_COMPOSE=\"false\" EARTHLY_COMPOSE_FILES=\"\" EARTHLY_COMPOSE_SERVICES=\"\" DIND_IMAGE=earthly/dind:alpine-3.19-docker-25.0.5-r0 OTEL_TRACES_EXPORTER=none PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin /var/earthly/dockerd- wrapper.sh execute /usr/bin/earth_debugger /bin/sh -c 'echo \"dummy\" && false'" did not complete successfully: exit code: 1 ``` In some conditions, only `unlazy force execution` generates an error, so we need it there, even if duplicated in this case. With this change the last error is properly formatted: ``` /U/a/s/gha_earthly_test+fail1 | The command /U/a/s/gha_earthly_test+fail1 | WITH DOCKER RUN /U/a/s/gha_earthly_test+fail1 | did not complete successfully. Exit code 1 ``` I also changed vertormon.parseError to use templates and added some tests around the error type and exit code parsing. --- earthfile2llb/with_docker_run_local_reg.go | 8 +- earthfile2llb/with_docker_run_reg.go | 8 +- logbus/solvermon/vertexmon.go | 205 +++++++++++++-------- logbus/solvermon/vertexmon_test.go | 131 +++++++++++++ 4 files changed, 273 insertions(+), 79 deletions(-) create mode 100644 logbus/solvermon/vertexmon_test.go diff --git a/earthfile2llb/with_docker_run_local_reg.go b/earthfile2llb/with_docker_run_local_reg.go index 715ecbc2..976c5429 100644 --- a/earthfile2llb/with_docker_run_local_reg.go +++ b/earthfile2llb/with_docker_run_local_reg.go @@ -3,8 +3,11 @@ package earthfile2llb import ( "context" "fmt" + "time" + "github.com/earthly/cloud-api/logstream" "github.com/earthly/earthly/domain" + "github.com/earthly/earthly/logbus/solvermon" "github.com/earthly/earthly/states" "github.com/earthly/earthly/util/containerutil" "github.com/earthly/earthly/util/syncutil/semutil" @@ -42,7 +45,10 @@ func (w *withDockerRunLocalReg) Run(ctx context.Context, args []string, opt With } defer func() { - cmd.SetEndError(retErr) + if retErr != nil { + message := solvermon.FormatError("WITH DOCKER RUN", retErr.Error()) + cmd.SetEnd(time.Now(), logstream.RunStatus_RUN_STATUS_FAILURE, message) + } }() var imagesToBuild []*states.ImageDef diff --git a/earthfile2llb/with_docker_run_reg.go b/earthfile2llb/with_docker_run_reg.go index 9f5b1b3e..0d4562fd 100644 --- a/earthfile2llb/with_docker_run_reg.go +++ b/earthfile2llb/with_docker_run_reg.go @@ -5,8 +5,11 @@ import ( "fmt" "sort" "strings" + "time" + "github.com/earthly/cloud-api/logstream" "github.com/earthly/earthly/domain" + "github.com/earthly/earthly/logbus/solvermon" "github.com/earthly/earthly/states" "github.com/earthly/earthly/util/llbutil/pllb" "github.com/earthly/earthly/util/platutil" @@ -126,7 +129,10 @@ func (w *withDockerRunRegistry) Run(ctx context.Context, args []string, opt With } defer func() { - cmd.SetEndError(retErr) + if retErr != nil { + message := solvermon.FormatError("WITH DOCKER RUN", retErr.Error()) + cmd.SetEnd(time.Now(), logstream.RunStatus_RUN_STATUS_FAILURE, message) + } }() err = w.installDeps(ctx, opt) diff --git a/logbus/solvermon/vertexmon.go b/logbus/solvermon/vertexmon.go index b48b99a9..4e0296fb 100644 --- a/logbus/solvermon/vertexmon.go +++ b/logbus/solvermon/vertexmon.go @@ -37,101 +37,123 @@ type vertexMonitor struct { isCanceled bool } -var reErrExitCode = regexp.MustCompile(`^(?:process ".*" did not complete successfully|error calling LocalhostExec): exit code: (?P[0-9]+)$`) -var reErrNotFound = regexp.MustCompile(`^failed to calculate checksum of ref ([^ ]*): (.*)$`) -var reHint = regexp.MustCompile(`^(?P.+?):Hint: .+`) +var reErrExitCode = regexp.MustCompile(`(?:process ".*" did not complete successfully|error calling LocalhostExec): exit code: (?P[0-9]+)$`) -func (vm *vertexMonitor) Write(dt []byte, ts time.Time, stream int) (int, error) { - if stream == BuildkitStatsStream { - stats, err := vm.ssp.Parse(dt) +var errNoExitCodeOMM = errors.New("no exit code, process was killed due to OOM") +var errNoExitCode = errors.New("no exit code in error message") + +// getExitCode returns the exit code (int), whether one was found (bool), and an error if the exit code was invalid +func getExitCode(errString string) (int, error) { + if matches, _ := stringutil.NamedGroupMatches(errString, reErrExitCode); len(matches["exit_code"]) == 1 { + exitCodeMatch := matches["exit_code"][0] + exitCode, err := strconv.ParseUint(exitCodeMatch, 10, 32) if err != nil { - return 0, errors.Wrap(err, "failed decoding stats stream") + return 0, err } - for _, statsSample := range stats { - statsJSON, err := json.Marshal(statsSample) - if err != nil { - return 0, errors.Wrap(err, "stats json encode failed") - } - _, err = vm.cp.Write(statsJSON, ts, int32(stream)) - if err != nil { - return 0, errors.Wrap(err, "write stats") - } + // See https://github.com/earthly/buildkit/commit/9b0bdb600641f3dd1d96f54ac2d86581ab6433b2 + if exitCode == math.MaxUint32 { + return 0, errNoExitCodeOMM } - return len(dt), nil - } - _, err := vm.cp.Write(dt, ts, int32(stream)) - if err != nil { - return 0, errors.Wrap(err, "write log line") + if exitCode > 255 { + return 0, fmt.Errorf("exit code %d out of expected range (0-255)", exitCode) + } + exitCodeByte := exitCode & 0xFF + return int(exitCodeByte), nil } - return len(dt), nil + return 0, errNoExitCode } -func (vm *vertexMonitor) parseError() { - errString := vm.vertex.Error - indentOp := strings.Join(strings.Split(vm.operation, "\n"), "\n ") - internalStr := "" - if vm.meta.Internal { - internalStr = " internal" +var reErrNotFound = regexp.MustCompile(`^failed to calculate checksum of ref ([^ ]*): (.*)$`) +var reHint = regexp.MustCompile(`^(?P.+?):Hint: .+`) + +// determineFatalErrorType returns logstream.FailureType +// and whether or not its a Fatal Error +func determineFatalErrorType(errString string, exitCode int, exitParseErr error) (logstream.FailureType, bool) { + if strings.Contains(errString, "context canceled") || errString == "no active sessions" { + return logstream.FailureType_FAILURE_TYPE_UNKNOWN, false + } + if exitParseErr == errNoExitCodeOMM { + return logstream.FailureType_FAILURE_TYPE_OOM_KILLED, true + } else if exitParseErr != nil && exitParseErr != errNoExitCode { + // We have an exit code, and can't parse it + return logstream.FailureType_FAILURE_TYPE_UNKNOWN, true } + if exitCode > 0 { + return logstream.FailureType_FAILURE_TYPE_NONZERO_EXIT, true + } + if reErrNotFound.MatchString(errString) { + return logstream.FailureType_FAILURE_TYPE_FILE_NOT_FOUND, true + } + if strings.Contains(errString, errutil.EarthlyGitStdErrMagicString) { + return logstream.FailureType_FAILURE_TYPE_GIT, true + } + return logstream.FailureType_FAILURE_TYPE_UNKNOWN, false +} + +func formatErrorMessage(errString, operation string, internal bool, fatalErrorType logstream.FailureType, exitCode int) string { if matches, _ := stringutil.NamedGroupMatches(errString, reHint); len(matches["msg"]) == 1 { errString = matches["msg"][0] } - switch { - case strings.Contains(errString, "context canceled"): - vm.isCanceled = true - vm.errorStr = "WARN: Canceled" - return - case reErrExitCode.MatchString(errString): - matches, _ := stringutil.NamedGroupMatches(errString, reErrExitCode) - exitCodeMatch := matches["exit_code"][0] - // Ignore the parse error as default case will print it as a string using - // the source, so we won't miss any data. - exitCode, _ := strconv.ParseUint(exitCodeMatch, 10, 32) - switch exitCode { - case math.MaxUint32: - errString = fmt.Sprintf(""+ - " The%s command\n"+ + internalStr := "" + if internal { + internalStr = " internal" + } + errString = fmt.Sprintf("%s%s", internalStr, errString) + + switch fatalErrorType { + case logstream.FailureType_FAILURE_TYPE_OOM_KILLED: + return fmt.Sprintf( + " The%s command\n"+ " %s\n"+ - " was terminated because the build system ran out of memory.\n"+ - " If you are using a satellite or other remote buildkit, it is the remote system that ran out of memory.", - internalStr, indentOp) - vm.fatalErrorType = logstream.FailureType_FAILURE_TYPE_OOM_KILLED - default: - errString = fmt.Sprintf(""+ - " The%s command\n"+ + " was terminated because the build system ran out of memory. If you are using a satellite or other remote buildkit, it is the remote system that ran out of memory.", internalStr, operation) + case logstream.FailureType_FAILURE_TYPE_NONZERO_EXIT: + return fmt.Sprintf( + " The%s command\n"+ " %s\n"+ - " did not complete successfully. Exit code %s", - internalStr, indentOp, exitCodeMatch) - vm.fatalErrorType = logstream.FailureType_FAILURE_TYPE_NONZERO_EXIT - } - vm.isFatalError = true - case reErrNotFound.MatchString(errString): + " did not complete successfully. Exit code %d", internalStr, operation, exitCode) + case logstream.FailureType_FAILURE_TYPE_FILE_NOT_FOUND: m := reErrNotFound.FindStringSubmatch(errString) - errString = fmt.Sprintf(""+ + return fmt.Sprintf( " The%s command\n"+ - " %s\n"+ - " failed: %s", - internalStr, indentOp, m[2]) - vm.isFatalError = true - vm.fatalErrorType = logstream.FailureType_FAILURE_TYPE_FILE_NOT_FOUND - case errString == "no active sessions": - vm.isCanceled = true - errString = "WARN: Canceled" - case strings.Contains(errString, errutil.EarthlyGitStdErrMagicString): + " %s\n"+ + " failed: %s", internalStr, operation, m[2]) + case logstream.FailureType_FAILURE_TYPE_GIT: gitStdErr, shorterErr, ok := errutil.ExtractEarthlyGitStdErr(errString) if ok { - errString = fmt.Sprintf( - "The%s command '%s' failed: %s\n\n%s", internalStr, vm.operation, shorterErr, gitStdErr) - } else { - errString = fmt.Sprintf( - "The%s command '%s' failed: %s", internalStr, vm.operation, errString) + return fmt.Sprintf( + "The%s command\n"+ + " %s\n"+ + "failed: %s\n\n%s", internalStr, operation, shorterErr, gitStdErr) } - vm.isFatalError = true + return fmt.Sprintf( + "The%s command\n"+ + " %s\n"+ + "failed: %s", internalStr, operation, errString) default: - errString = fmt.Sprintf( - "The%s command '%s' failed: %s", internalStr, vm.operation, errString) + return fmt.Sprintf( + "The%s command\n"+ + " %s\n"+ + "failed: %s", internalStr, operation, errString) } +} + +func FormatError(operation string, errString string) string { + exitCode, err := getExitCode(errString) + fatalErrorType, _ := determineFatalErrorType(errString, exitCode, err) + return formatErrorMessage(errString, operation, false, fatalErrorType, exitCode) +} + +func (vm *vertexMonitor) parseError() { + errString := vm.vertex.Error + + indentOp := strings.Join(strings.Split(vm.operation, "\n"), "\n ") + + exitCode, err := getExitCode(errString) + fatalErrorType, isFatalError := determineFatalErrorType(errString, exitCode, err) + formattedError := formatErrorMessage(errString, indentOp, vm.meta.Internal, fatalErrorType, exitCode) + + // Add Error location slString := "" if vm.meta.SourceLocation != nil { slString = fmt.Sprintf( @@ -139,9 +161,38 @@ func (vm *vertexMonitor) parseError() { vm.meta.SourceLocation.File, vm.meta.SourceLocation.StartLine, vm.meta.SourceLocation.StartColumn) } - if vm.isFatalError { - vm.errorStr = fmt.Sprintf("ERROR%s\n%s", slString, errString) + + // Set the error string and flags on the vertexMonitor + if isFatalError { + vm.errorStr = fmt.Sprintf("ERROR%s\n%s", slString, formattedError) } else { - vm.errorStr = fmt.Sprintf("WARN%s: %s", slString, errString) + vm.errorStr = fmt.Sprintf("WARN%s: %s", slString, formattedError) + } + vm.isFatalError = isFatalError + vm.fatalErrorType = fatalErrorType +} + +func (vm *vertexMonitor) Write(dt []byte, ts time.Time, stream int) (int, error) { + if stream == BuildkitStatsStream { + stats, err := vm.ssp.Parse(dt) + if err != nil { + return 0, errors.Wrap(err, "failed decoding stats stream") + } + for _, statsSample := range stats { + statsJSON, err := json.Marshal(statsSample) + if err != nil { + return 0, errors.Wrap(err, "stats json encode failed") + } + _, err = vm.cp.Write(statsJSON, ts, int32(stream)) + if err != nil { + return 0, errors.Wrap(err, "write stats") + } + } + return len(dt), nil } + _, err := vm.cp.Write(dt, ts, int32(stream)) + if err != nil { + return 0, errors.Wrap(err, "write log line") + } + return len(dt), nil } diff --git a/logbus/solvermon/vertexmon_test.go b/logbus/solvermon/vertexmon_test.go new file mode 100644 index 00000000..ac435305 --- /dev/null +++ b/logbus/solvermon/vertexmon_test.go @@ -0,0 +1,131 @@ +package solvermon + +import ( + "fmt" + "testing" + + "github.com/earthly/cloud-api/logstream" +) + +func TestGetExitCode(t *testing.T) { + tests := []struct { + name string + errString string + expectedCode int + expectedError error + }{ + { + name: "no match", + errString: "random error message", + expectedCode: 0, + expectedError: errNoExitCode, + }, + { + name: "match with exit code", + errString: "process \"foo\" did not complete successfully: exit code: 123", + expectedCode: 123, + expectedError: nil, + }, + { + name: "match with max uint32", + errString: "process \"foo\" did not complete successfully: exit code: 4294967295", + expectedCode: 0, + expectedError: errNoExitCodeOMM, + }, + { + name: "match with max uint32", + errString: "some wrap message: process \"foo\" did not complete successfully: exit code: 8", + expectedCode: 8, + expectedError: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + code, err := getExitCode(tt.errString) + if code != tt.expectedCode { + t.Errorf("getExitCode(%q) = %d, want %d", tt.errString, code, tt.expectedCode) + } + if err != tt.expectedError { + t.Errorf("getExitCode(%q) = %d, want %d", tt.errString, err, tt.expectedError) + } + }) + } +} + +func TestDetermineFatalErrorType(t *testing.T) { + tests := []struct { + name string + errString string + exitCode int + parseErr error + expectedType logstream.FailureType + expectedFatal bool + }{ + { + name: "context canceled", + errString: "context canceled", + exitCode: 0, + parseErr: nil, + expectedType: logstream.FailureType_FAILURE_TYPE_UNKNOWN, + expectedFatal: false, + }, + { + name: "exit code 123", + errString: "process \"foo\" did not complete successfully: exit code: 123", + exitCode: 123, + parseErr: nil, + expectedType: logstream.FailureType_FAILURE_TYPE_NONZERO_EXIT, + expectedFatal: true, + }, + { + name: "exit code max uint32", + errString: "process \"foo\" did not complete successfully: exit code: 4294967295", + exitCode: 0, + parseErr: errNoExitCodeOMM, + expectedType: logstream.FailureType_FAILURE_TYPE_OOM_KILLED, + expectedFatal: true, + }, + { + name: "file not found", + errString: "failed to calculate checksum of ref foo: bar", + exitCode: 0, + parseErr: nil, + expectedType: logstream.FailureType_FAILURE_TYPE_FILE_NOT_FOUND, + expectedFatal: true, + }, + { + name: "git error", + errString: "EARTHLY_GIT_STDERR: Z2l0IC1jI...", + parseErr: nil, + expectedType: logstream.FailureType_FAILURE_TYPE_GIT, + expectedFatal: true, + }, + { + name: "unknown error", + errString: "unknown error", + parseErr: nil, + expectedType: logstream.FailureType_FAILURE_TYPE_UNKNOWN, + expectedFatal: false, + }, + { + name: "invalid exit code", + errString: "exit code: 9999", + parseErr: fmt.Errorf("exit code 9999 out of expected range (0-255)"), + expectedType: logstream.FailureType_FAILURE_TYPE_UNKNOWN, + expectedFatal: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + fatalType, fatal := determineFatalErrorType(tt.errString, tt.exitCode, tt.parseErr) + if fatalType != tt.expectedType { + t.Errorf("determineFatalErrorType(%q, %d) = %v, want %v", tt.errString, tt.exitCode, fatalType, tt.expectedType) + } + if fatal != tt.expectedFatal { + t.Errorf("determineFatalErrorType(%q, %d) = %v, want %v", tt.errString, tt.exitCode, fatal, tt.expectedFatal) + } + }) + } +}