diff --git a/cmd/deprovision.go b/cmd/deprovision.go
new file mode 100644
index 0000000..4291bee
--- /dev/null
+++ b/cmd/deprovision.go
@@ -0,0 +1,174 @@
+package cmd
+
+import (
+	"bytes"
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+	"text/template"
+
+	"github.com/konstructio/colony/internal/constants"
+	"github.com/konstructio/colony/internal/k8s"
+	"github.com/konstructio/colony/internal/logger"
+	"github.com/konstructio/colony/internal/utils"
+	"github.com/konstructio/colony/manifests"
+	"github.com/spf13/cobra"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+type DeprovisionWorkflowRequest struct {
+	Mac          string
+	RandomSuffix string
+}
+
+func getDeprovisionCommand() *cobra.Command {
+	var hardwareID, bootDevice string
+	var efiBoot, destroy bool
+	deprovisionCmd := &cobra.Command{
+		Use:   "deprovision",
+		Short: "remove hardware from your colony data center - very destructive",
+		RunE: func(cmd *cobra.Command, _ []string) error {
+			ctx := cmd.Context()
+			log := logger.New(logger.Debug)
+
+			randomSuffix := utils.RandomString(6)
+
+			homeDir, err := os.UserHomeDir()
+			if err != nil {
+				return fmt.Errorf("error getting user home directory: %w", err)
+			}
+
+			k8sClient, err := k8s.New(log, filepath.Join(homeDir, constants.ColonyDir, constants.KubeconfigHostPath))
+			if err != nil {
+				return fmt.Errorf("failed to create k8s client: %w", err)
+			}
+
+			if err = k8sClient.LoadMappingsFromKubernetes(); err != nil {
+				return fmt.Errorf("error loading dynamic mappings from kubernetes: %w", err)
+			}
+
+			log.Infof("deprovisioning hardware with id %q", hardwareID)
+			log.Infof("boot device %q", bootDevice)
+			log.Infof("efi boot %t", efiBoot)
+			log.Infof("destroy %t", destroy)
+
+			// TODO if the machine state is powered on, restart it so the workflow will run
+
+			// TODO: POST to the API to mark the hardware removed
+
+			// get the hardware and remove its iPXE script
+			hw, err := k8sClient.HardwareRemoveIPXE(ctx, k8s.UpdateHardwareRequest{
+				HardwareID: hardwareID,
+				Namespace:  constants.ColonyNamespace,
+				RemoveIpXE: true,
+			})
+			if err != nil {
+				return fmt.Errorf("error removing ipxe from hardware: %w", err)
+			}
+			log.Infof("hardware: %v", hw)
+
+			//! detokenize and apply the workflow
+
+			file, err := manifests.Workflow.ReadFile("workflow/wipe-disks.yaml.tmpl")
+			if err != nil {
+				return fmt.Errorf("error reading templates file: %w", err)
+			}
+
+			tmpl, err := template.New("workflow").Funcs(template.FuncMap{
+				"replaceColonsWithHyphens": func(s string) string {
+					return strings.ReplaceAll(s, ":", "-")
+				},
+			}).Parse(string(file))
+			if err != nil {
+				return fmt.Errorf("error parsing template: %w", err)
+			}
+
+			var outputBuffer bytes.Buffer
+
+			err = tmpl.Execute(&outputBuffer, DeprovisionWorkflowRequest{
+				Mac:          hw.Spec.Interfaces[0].DHCP.MAC,
+				RandomSuffix: randomSuffix,
+			})
+			if err != nil {
+				return fmt.Errorf("error executing template: %w", err)
+			}
+
+			log.Info(outputBuffer.String())
+
+			ip, err := k8sClient.GetHardwareMachineRefFromSecretLabel(ctx, constants.ColonyNamespace, metav1.ListOptions{
+				LabelSelector: fmt.Sprintf("colony.konstruct.io/hardware-id=%s", hardwareID),
+			})
+			if err != nil {
+				return fmt.Errorf("error getting machine ref secret: %w", err)
+			}
+
+			//! destructive: applying this workflow wipes the disks on the target hardware
+			if err := k8sClient.ApplyManifests(ctx, []string{outputBuffer.String()}); err != nil {
+				return fmt.Errorf("error applying workflow: %w", err)
+			}
+
+			err = k8sClient.FetchAndWaitForWorkflow(ctx, k8s.WorkflowWaitRequest{
+				LabelValue:   strings.ReplaceAll(ip, ".", "-"),
+				Namespace:    constants.ColonyNamespace,
+				WaitTimeout:  300,
+				RandomSuffix: randomSuffix,
+			})
+			if err != nil {
+				return fmt.Errorf("error waiting for workflow: %w", err)
+			}
+
+			// reboot
+			file2, err := manifests.IPMI.ReadFile("ipmi/ipmi-off-pxe-on.yaml.tmpl")
+			if err != nil {
+				return fmt.Errorf("error reading templates file: %w", err)
+			}
+
+			tmpl2, err := template.New("ipmi").Funcs(template.FuncMap{
+				"replaceDotsWithDash": func(s string) string {
+					return strings.ReplaceAll(s, ".", "-")
+				},
+			}).Parse(string(file2))
+			if err != nil {
+				return fmt.Errorf("error parsing template: %w", err)
+			}
+
+			var outputBuffer2 bytes.Buffer
+
+			err = tmpl2.Execute(&outputBuffer2, RufioPowerCycleRequest{
+				IP:           ip,
+				EFIBoot:      efiBoot,
+				BootDevice:   bootDevice,
+				RandomSuffix: randomSuffix,
+			})
+			if err != nil {
+				return fmt.Errorf("error executing template: %w", err)
+			}
+
+			log.Info(outputBuffer2.String())
+
+			if err := k8sClient.ApplyManifests(ctx, []string{outputBuffer2.String()}); err != nil {
+				return fmt.Errorf("error applying rufio job: %w", err)
+			}
+
+			err = k8sClient.FetchAndWaitForRufioJobs(ctx, k8s.RufioJobWaitRequest{
+				LabelValue:   strings.ReplaceAll(ip, ".", "-"),
+				Namespace:    constants.ColonyNamespace,
+				WaitTimeout:  300,
+				RandomSuffix: randomSuffix,
+			})
+
+			if err != nil {
+				return fmt.Errorf("error waiting for rufio job: %w", err)
+			}
+
+			return nil
+		},
+	}
+	deprovisionCmd.Flags().StringVar(&hardwareID, "hardware-id", "", "hardware id of the server to deprovision - WARNING: you cannot recover this server")
+	deprovisionCmd.Flags().StringVar(&bootDevice, "boot-device", "pxe", "the boot device to set (pxe, bios); defaults to pxe")
+	deprovisionCmd.Flags().BoolVar(&efiBoot, "efiBoot", true, "whether to boot with EFI (true) or legacy BIOS (false); defaults to true")
+	deprovisionCmd.Flags().BoolVar(&destroy, "destroy", false, "whether to destroy the machine and its associated resources")
+	deprovisionCmd.MarkFlagRequired("hardware-id")
+	return deprovisionCmd
+}
diff --git a/cmd/reboot.go b/cmd/reboot.go
index 172111b..e4cdc14 100644
--- a/cmd/reboot.go
+++ b/cmd/reboot.go
@@ -8,7 +8,6 @@ import (
 	"path/filepath"
 	"strings"
 
-	"github.com/konstructio/colony/configs"
 	"github.com/konstructio/colony/internal/constants"
 	"github.com/konstructio/colony/internal/k8s"
 	"github.com/konstructio/colony/internal/logger"
@@ -28,7 +27,7 @@ func getRebootCommand() *cobra.Command {
 			ctx := cmd.Context()
 			log := logger.New(logger.Debug)
 
-			log.Info("colony cli version: ", configs.Version)
+
 			homeDir, err := os.UserHomeDir()
 			if err != nil {
 				return fmt.Errorf("error getting user home directory: %w", err)
@@ -84,7 +83,7 @@ func getRebootCommand() *cobra.Command {
 			log.Info(outputBuffer.String())
 
 			if err := k8sClient.ApplyManifests(ctx, []string{outputBuffer.String()}); err != nil {
-				return fmt.Errorf("error applying rufiojob: %w", err)
+				return fmt.Errorf("error applying rufio job: %w", err)
 			}
 
 			err = k8sClient.FetchAndWaitForRufioJobs(ctx, k8s.RufioJobWaitRequest{
@@ -95,7 +94,7 @@ func getRebootCommand() *cobra.Command {
 			})
 
 			if err != nil {
-				return fmt.Errorf("error get machine: %w", err)
+				return fmt.Errorf("error getting rufio job: %w", err)
 			}
 
 			return nil
diff --git a/cmd/root.go b/cmd/root.go
index 2dfc634..01b81c2 100644
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -14,6 +14,13 @@ func GetRootCommand() *cobra.Command {
 		SilenceErrors: true, // we print the errors ourselves on main
 	}
 
-	cmd.AddCommand(getDestroyCommand(), getInitCommand(), getAddIPMICommand(), getRebootCommand(), getVersionCommand(), getAssetsCommand())
+	cmd.AddCommand(
+		getDestroyCommand(),
+		getInitCommand(),
+		getAddIPMICommand(),
+		getRebootCommand(),
+		getVersionCommand(),
+		getAssetsCommand(),
+		getDeprovisionCommand())
 	return cmd
 }
diff --git a/internal/k8s/k8s.go b/internal/k8s/k8s.go
index 19b4601..1ca5012 100644
--- a/internal/k8s/k8s.go
+++ b/internal/k8s/k8s.go
@@ -469,7 +469,7 @@ func (c *Client) waitForMachineReady(ctx context.Context, gvr schema.GroupVersio
 	err := wait.PollUntilContextTimeout(ctx, 5*time.Second, time.Duration(timeoutSeconds)*time.Second, true, func(ctx context.Context) (bool, error) {
 		// Get the latest Machine object
-		m, err := c.dynamic.Resource(gvr).Namespace(namespace).Get(context.Background(), machineName, metav1.GetOptions{})
+		m, err := c.dynamic.Resource(gvr).Namespace(namespace).Get(ctx, machineName, metav1.GetOptions{})
 		if err != nil {
 			// If we couldn't connect, retry
 			if isNetworkingError(err) {
@@ -542,6 +542,13 @@ type RufioJobWaitRequest struct {
 	WaitTimeout int
 }
 
+type WorkflowWaitRequest struct {
+	LabelValue   string
+	Namespace    string
+	RandomSuffix string
+	WaitTimeout  int
+}
+
 // ! refactor... this is so dupe
 func (c *Client) FetchAndWaitForRufioJobs(ctx context.Context, job RufioJobWaitRequest) error {
 	c.logger.Infof("waiting for job %q in namespace %q", job.RandomSuffix, job.Namespace)
@@ -571,6 +578,85 @@ func (c *Client) FetchAndWaitForRufioJobs(ctx context.Context, job RufioJobWaitR
 	return nil
 }
 
+func (c *Client) FetchAndWaitForWorkflow(ctx context.Context, workflow WorkflowWaitRequest) error {
+	c.logger.Infof("waiting for workflow %q in namespace %q", workflow.RandomSuffix, workflow.Namespace)
+
+	gvr := schema.GroupVersionResource{
+		Group:    v1alpha1.GroupVersion.Group,
+		Version:  v1alpha1.GroupVersion.Version,
+		Resource: v1alpha1.GroupVersion.WithResource("workflows").Resource,
+	}
+
+	w, err := c.returnWorkflowObject(ctx, gvr, workflow.Namespace, workflow.WaitTimeout, metav1.ListOptions{
+		LabelSelector: fmt.Sprintf("colony.konstruct.io/job-id=%s", workflow.RandomSuffix),
+	})
+	if err != nil {
+		return fmt.Errorf("error finding workflow %q: %w", workflow.LabelValue, err)
+	}
+
+	c.logger.Infof("workflow %q found in namespace %q", workflow.LabelValue, workflow.Namespace)
+
+	_, err = c.waitWorkflowComplete(ctx, gvr, w, workflow.WaitTimeout)
+	if err != nil {
+		return fmt.Errorf("error waiting for workflow %q: %w", workflow.LabelValue, err)
+	}
+
+	c.logger.Infof("workflow %q in namespace %q is ready", workflow.LabelValue, workflow.Namespace)
+
+	return nil
+}
+
+type UpdateHardwareRequest struct {
+	HardwareID string
+	Namespace  string
+	RemoveIpXE bool
+}
+
+func (c *Client) HardwareRemoveIPXE(ctx context.Context, hardware UpdateHardwareRequest) (*v1alpha1.Hardware, error) {
+	c.logger.Infof("getting hardware %q in namespace %q", hardware.HardwareID, hardware.Namespace)
+
+	gvr := schema.GroupVersionResource{
+		Group:    v1alpha1.GroupVersion.Group,
+		Version:  v1alpha1.GroupVersion.Version,
+		Resource: v1alpha1.GroupVersion.WithResource("hardware").Resource,
+	}
+
+	hw, err := c.dynamic.Resource(gvr).Namespace(hardware.Namespace).Get(ctx, hardware.HardwareID, metav1.GetOptions{})
+	if err != nil {
+		return nil, fmt.Errorf("error getting hardware %q: %w", hardware.HardwareID, err)
+	}
+
+	h := &v1alpha1.Hardware{}
+
+	err = runtime.DefaultUnstructuredConverter.FromUnstructured(hw.UnstructuredContent(), h)
+	if err != nil {
+		return nil, fmt.Errorf("error converting unstructured to hardware: %w", err)
+	}
+
+	c.logger.Infof("hardware %q found, removing ipxe script", hw.GetName())
+
+	h.Spec.Interfaces[0].Netboot.IPXE = &v1alpha1.IPXE{}
+
+	unstructuredObj, err := runtime.DefaultUnstructuredConverter.ToUnstructured(h)
+	if err != nil {
+		return nil, fmt.Errorf("error converting hardware to unstructured: %w", err)
+	}
+
+	obj, err := c.dynamic.Resource(gvr).Namespace(hardware.Namespace).Update(ctx, &unstructured.Unstructured{Object: unstructuredObj}, metav1.UpdateOptions{})
+	if err != nil {
+		return nil, fmt.Errorf("error updating hardware %q: %w", hardware.HardwareID, err)
+	}
+
+	c.logger.Infof("removed ipxe script from hardware %q", obj.GetName())
+
+	err = runtime.DefaultUnstructuredConverter.FromUnstructured(obj.UnstructuredContent(), h)
+	if err != nil {
+		return nil, fmt.Errorf("error converting updated unstructured to hardware: %w", err)
+	}
+
+	return h, nil
+}
+
 func (c *Client) ListAssets(ctx context.Context) error {
 	// Set up columns for hardware table
 	columns := []table.Column{
diff --git a/internal/k8s/rufiojobs.go b/internal/k8s/rufiojobs.go
index 80cbff8..60e3153 100644
--- a/internal/k8s/rufiojobs.go
+++ b/internal/k8s/rufiojobs.go
@@ -61,7 +61,7 @@ func (c *Client) waitForJobComplete(ctx context.Context, gvr schema.GroupVersion
 	err := wait.PollUntilContextTimeout(ctx, 5*time.Second, time.Duration(timeoutSeconds)*time.Second, true, func(ctx context.Context) (bool, error) {
 		// Get the latest Machine object
-		j, err := c.dynamic.Resource(gvr).Namespace(namespace).Get(context.Background(), jobName, metav1.GetOptions{})
+		j, err := c.dynamic.Resource(gvr).Namespace(namespace).Get(ctx, jobName, metav1.GetOptions{})
 		if err != nil {
 			// If we couldn't connect, retry
 			if isNetworkingError(err) {
diff --git a/internal/k8s/workflow.go b/internal/k8s/workflow.go
new file mode 100644
index 0000000..c127756
--- /dev/null
+++ b/internal/k8s/workflow.go
@@ -0,0 +1,108 @@
+package k8s
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"time"
+
+	v1alpha1 "github.com/kubefirst/tink/api/v1alpha1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/runtime/schema"
+	"k8s.io/apimachinery/pkg/util/wait"
+)
+
+func (c *Client) returnWorkflowObject(ctx context.Context, gvr schema.GroupVersionResource, namespace string, timeoutSeconds int, opts metav1.ListOptions) (*v1alpha1.Workflow, error) {
+
+	wf := &v1alpha1.Workflow{}
+
+	err := wait.PollUntilContextTimeout(ctx, 15*time.Second, time.Duration(timeoutSeconds)*time.Second, true, func(ctx context.Context) (bool, error) {
+		c.logger.Infof("getting workflow object with label %q", opts.LabelSelector)
+		wfs, err := c.dynamic.Resource(gvr).Namespace(namespace).List(ctx, opts)
+		if err != nil {
+			// if we couldn't connect, ask to try again
+			if isNetworkingError(err) {
+				return false, nil
+			}
+
+			// if we got an error, return it
+			return false, fmt.Errorf("error getting workflow object with label %q in namespace %q: %w", opts.LabelSelector, namespace, err)
+		}
+
+		// if we couldn't find any workflow, ask to try again
+		if len(wfs.Items) == 0 {
+			return false, nil
+		}
+
+		err = runtime.DefaultUnstructuredConverter.FromUnstructured(wfs.Items[0].UnstructuredContent(), wf)
+		if err != nil {
+			return false, fmt.Errorf("error converting unstructured to workflow: %w", err)
+		}
+
+		// if we found a workflow, return it
+		return true, nil
+	})
+	if err != nil {
+		return nil, fmt.Errorf("error waiting for workflow: %w", err)
+	}
+
+	return wf, nil
+}
+
+func (c *Client) waitWorkflowComplete(ctx context.Context, gvr schema.GroupVersionResource, wfObj *v1alpha1.Workflow, timeoutSeconds int) (bool, error) {
+	workflowName := wfObj.Name
+	namespace := wfObj.Namespace
+
+	wf := &v1alpha1.Workflow{}
+
+	c.logger.Infof("waiting for workflow %q in namespace %q to be ready - this could take up to %d seconds", workflowName, namespace, timeoutSeconds)
+
+	err := wait.PollUntilContextTimeout(ctx, 5*time.Second, time.Duration(timeoutSeconds)*time.Second, true, func(ctx context.Context) (bool, error) {
+		// Get the latest workflow object
+		w, err := c.dynamic.Resource(gvr).Namespace(namespace).Get(ctx, workflowName, metav1.GetOptions{})
+		if err != nil {
+			// If we couldn't connect, retry
+			if isNetworkingError(err) {
+				c.logger.Warn("connection error, retrying: %s", err)
+				return false, nil
+			}
+
+			return false, fmt.Errorf("error getting workflow: %w", err)
+		}
+		err = runtime.DefaultUnstructuredConverter.FromUnstructured(w.UnstructuredContent(), wf)
+		if err != nil {
+			return false, fmt.Errorf("error converting unstructured to workflow: %w", err)
+		}
+
+		jsonData, err := json.Marshal(wf.Status)
+		if err != nil {
+			c.logger.Infof("error marshaling workflow status: %v", err)
+		}
+
+		c.logger.Infof("workflow status: %s", string(jsonData))
+
+		if len(wf.Status.Tasks) == 0 {
+			return false, nil
+		}
+		if wf.Status.State == v1alpha1.WorkflowStatePending || wf.Status.State == v1alpha1.WorkflowStateRunning {
+			return false, nil
+		}
+
+		if wf.Status.State == v1alpha1.WorkflowStateFailed {
+			return true, fmt.Errorf("workflow %q in namespace %q failed: %+v", workflowName, namespace, wf.Status)
+		}
+
+		if wf.Status.State == v1alpha1.WorkflowStateSuccess {
+			return true, nil
+		}
+
+		// workflow is not yet ready, continue polling
+		return false, nil
+	})
+	if err != nil {
+		return false, fmt.Errorf("the workflow %q in namespace %q was not ready within the timeout period: %w", workflowName, namespace, err)
+	}
+
+	return true, nil
+}
diff --git a/manifests/ipmi/ipmi-off-pxe-on.yaml.tmpl b/manifests/ipmi/ipmi-off-pxe-on.yaml.tmpl
index 8d100d4..33dacd6 100644
--- a/manifests/ipmi/ipmi-off-pxe-on.yaml.tmpl
+++ b/manifests/ipmi/ipmi-off-pxe-on.yaml.tmpl
@@ -1,10 +1,11 @@
 apiVersion: bmc.tinkerbell.org/v1alpha1
 kind: Job
 metadata:
-  name: "{{ .IP | replaceDotsWithDash }}-off-pxe-on"
+  name: "{{ .IP | replaceDotsWithDash }}-off-pxe-on-{{ .RandomSuffix }}"
   namespace: tink-system
   labels:
     colony.konstruct.io/name: "{{ .IP | replaceDotsWithDash }}"
+    colony.konstruct.io/job-id: "{{ .RandomSuffix }}"
 spec:
   machineRef:
     name: "{{ .IP | replaceDotsWithDash }}"
@@ -13,6 +14,6 @@ spec:
   - powerAction: "off"
   - oneTimeBootDeviceAction:
       device:
-      - "pxe"
-      efiBoot: true
+      - "{{ .BootDevice }}"
+      efiBoot: {{ .EFIBoot }}
   - powerAction: "on"
diff --git a/manifests/templates.go b/manifests/templates.go
index a5d62a1..d438bb8 100644
--- a/manifests/templates.go
+++ b/manifests/templates.go
@@ -7,6 +7,9 @@ import (
 //go:embed colony/*.yaml.tmpl
 var Colony embed.FS
 
+//go:embed workflow/*.yaml.tmpl
+var Workflow embed.FS
+
 //go:embed downloads/*.yaml
 var Downloads embed.FS
 
diff --git a/manifests/templates/hello-world.yaml b/manifests/templates/hello-world.yaml
new file mode 100644
index 0000000..77ab5c1
--- /dev/null
+++ b/manifests/templates/hello-world.yaml
@@ -0,0 +1,19 @@
+apiVersion: "tinkerbell.org/v1alpha1"
+kind: Template
+metadata:
+  name: hello-world
+  namespace: tink-system
+spec:
+  data: |
+    version: "0.1"
+    name: hello_world
+    global_timeout: 1800
+    tasks:
+      - name: "hello-world"
+        worker: "{{.device_1}}"
+        actions:
+          - name: "hello-world"
+            image: ghcr.io/konstructio/tinkerbell-actions:hello-world1
+            timeout: 90
+            volumes:
+              - /var/run/docker.sock:/var/run/docker.sock
diff --git a/manifests/templates/wipe-disks.yaml b/manifests/templates/wipe-disks.yaml
new file mode 100644
index 0000000..c5f601e
--- /dev/null
+++ b/manifests/templates/wipe-disks.yaml
@@ -0,0 +1,19 @@
+apiVersion: "tinkerbell.org/v1alpha1"
+kind: Template
+metadata:
+  name: wipe-disks
+  namespace: tink-system
+spec:
+  data: |
+    version: "0.1"
+    name: wipe_disks
+    global_timeout: 1800
+    tasks:
+      - name: "wipe-disks"
+        worker: "{{.device_1}}"
+        actions:
+          - name: "wipe-disks"
+            image: ghcr.io/konstructio/tinkerbell-actions:wipe-disk7
+            timeout: 90
+            volumes:
+              - /var/run/docker.sock:/var/run/docker.sock
diff --git a/manifests/workflow/hello-world.yaml.tmpl b/manifests/workflow/hello-world.yaml.tmpl
new file mode 100644
index 0000000..50d8a7a
--- /dev/null
+++ b/manifests/workflow/hello-world.yaml.tmpl
@@ -0,0 +1,12 @@
+apiVersion: tinkerbell.org/v1alpha1
+kind: Workflow
+metadata:
+  name: "{{ .Mac | replaceColonsWithHyphens }}-hello-world-{{ .RandomSuffix }}"
+  namespace: tink-system
+  labels:
+    colony.konstruct.io/job-id: "{{ .RandomSuffix }}"
+spec:
+  hardwareMap:
+    device_1: "{{ .Mac }}"
+  hardwareRef: "{{ .Mac | replaceColonsWithHyphens }}"
+  templateRef: hello-world
diff --git a/manifests/workflow/wipe-disks.yaml.tmpl b/manifests/workflow/wipe-disks.yaml.tmpl
new file mode 100644
index 0000000..25a4285
--- /dev/null
+++ b/manifests/workflow/wipe-disks.yaml.tmpl
@@ -0,0 +1,12 @@
+apiVersion: tinkerbell.org/v1alpha1
+kind: Workflow
+metadata:
+  name: "{{ .Mac | replaceColonsWithHyphens }}-wipe-disks-{{ .RandomSuffix }}"
+  namespace: tink-system
+  labels:
+    colony.konstruct.io/job-id: "{{ .RandomSuffix }}"
+spec:
+  hardwareMap:
+    device_1: "{{ .Mac }}"
+  hardwareRef: "{{ .Mac | replaceColonsWithHyphens }}"
+  templateRef: wipe-disks
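
Example usage and rendered output (illustrative only; the binary name "colony", the hardware id placeholder, the MAC "de:ad:be:ef:00:01", and the suffix "abc123" are assumptions, not part of this change):

    colony deprovision --hardware-id <hardware-id> --boot-device pxe

With Mac "de:ad:be:ef:00:01" and RandomSuffix "abc123", workflow/wipe-disks.yaml.tmpl would render roughly as:

    apiVersion: tinkerbell.org/v1alpha1
    kind: Workflow
    metadata:
      name: "de-ad-be-ef-00-01-wipe-disks-abc123"
      namespace: tink-system
      labels:
        colony.konstruct.io/job-id: "abc123"
    spec:
      hardwareMap:
        device_1: "de:ad:be:ef:00:01"
      hardwareRef: "de-ad-be-ef-00-01"
      templateRef: wipe-disks

The colony.konstruct.io/job-id label is what FetchAndWaitForWorkflow and FetchAndWaitForRufioJobs use to find the objects created for this run, which is why the same random suffix is threaded through both templates.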