--wip-- [skipci]

RareCompute · Nov 18, 2024 · 32b0b85 · 32b0b85
1 parent f659fad
commit 32b0b85
Show file tree

Hide file tree

Showing 24 changed files with 391 additions and 25 deletions.
diff --git a/.github/gitleaks.yaml → .github/workflows/gitleaks.yaml b/.github/gitleaks.yaml → .github/workflows/gitleaks.yaml
diff --git a/.github/kubeconform.yaml → .github/workflows/kubeconform.yaml b/.github/kubeconform.yaml → .github/workflows/kubeconform.yaml
diff --git a/kubernetes/arc1/apps/kube-system/nvidia-device-plugin/app/helmrelease.yaml b/kubernetes/arc1/apps/kube-system/nvidia-device-plugin/app/helmrelease.yaml
@@ -63,4 +63,4 @@ spec:
               resources:
                 - name: nvidia.com/gpu
                   replicas: 7
-      default: "single"
+      default: "default"
diff --git a/kubernetes/arc1/apps/machine-learning/colabfold/app/helmrelease.yaml b/kubernetes/arc1/apps/machine-learning/colabfold/app/helmrelease.yaml
@@ -26,7 +26,7 @@ spec:
       retries: 3
   values:
     controllers:
-      ollama:
+      colabfold:
         type: deployment
         annotations:
           reloader.stakater.com/auto: "true"
@@ -56,11 +56,11 @@ spec:
               requests:
                 cpu: 200m
                 memory: 4Gi
-                gpu.intel.com/i915: "4"
+                nvidia.com/gpu: 4  # extended resources cannot be overcommitted: request must equal limit
               limits:
                 cpu: 32000m
                 memory: 64Gi
-                gpu.intel.com/i915: "4"
+                nvidia.com/gpu: 4
     service:
       app:
         controller: *app
@@ -88,6 +88,7 @@ spec:
           - secretName: colabfold-tls
             hosts: [*host]
     persistence:
+      # TODO: Replace with existing PVC
       data:
         storageClass: local-nvme
         accessMode: ReadWriteMany

diff --git a/kubernetes/arc1/apps/machine-learning/kustomization.yaml b/kubernetes/arc1/apps/machine-learning/kustomization.yaml
@@ -4,5 +4,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
   - ./namespace.yaml
+  - ./qlora/ks.yaml
   #- ./ollama/ks.yaml
   #- ./jupyterhub/ks.yaml
diff --git a/kubernetes/arc1/apps/machine-learning/mmseqs2/app/helmrelease.yaml b/kubernetes/arc1/apps/machine-learning/mmseqs2/app/helmrelease.yaml
@@ -5,7 +5,7 @@
 apiVersion: helm.toolkit.fluxcd.io/v2
 kind: HelmRelease
 metadata:
-  name: &app ollama
+  name: &app mmseqs2
 spec:
   interval: 30m
   chart:
@@ -26,7 +26,7 @@ spec:
       retries: 3
   values:
     controllers:
-      ollama:
+      mmseqs2:
         type: deployment
         annotations:
           reloader.stakater.com/auto: "true"
@@ -41,11 +41,11 @@ spec:
               requests:
                 cpu: 200m
                 memory: 4Gi
-                # gpu.intel.com/i915: "1"
+                nvidia.com/gpu: 4  # extended resources cannot be overcommitted: request must equal limit
               limits:
-                cpu: 8000m
-                memory: 8Gi
-                # gpu.intel.com/i915: "1"
+                cpu: 32000m
+                memory: 64Gi
+                nvidia.com/gpu: 4
     service:
       app:
         controller: *app
@@ -74,6 +74,7 @@ spec:
           - secretName: mmseqs2-tls
             hosts: [*host]
     persistence:
+      # TODO: Replace with existing PVC
       data:
         storageClass: local-nvme
         accessMode: ReadWriteMany

diff --git a/kubernetes/arc1/apps/machine-learning/qlora/app/helmrelease.yaml b/kubernetes/arc1/apps/machine-learning/qlora/app/helmrelease.yaml
@@ -0,0 +1,105 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/bjw-s/helm-charts/main/charts/other/app-template/schemas/helmrelease-helm-v2.schema.json
+# TODO: Finish this -- the container probes and the service ports below are still commented out.
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: &app qlora  # anchored as *app; reused by the service section below
+spec:
+  interval: 30m
+  chart:
+    spec:
+      chart: app-template
+      version: 3.5.1
+      sourceRef:
+        kind: HelmRepository
+        name: bjw-s
+        namespace: flux-system
+  install:
+    remediation:
+      retries: 3
+  upgrade:
+    cleanupOnFail: true
+    remediation:
+      strategy: rollback
+      retries: 3
+  values:
+    controllers:
+      qlora:
+        type: deployment
+        annotations:
+          reloader.stakater.com/auto: "true"
+        pod:
+          runtimeClassName: nvidia
+          terminationGracePeriodSeconds: 1
+          securityContext:
+            fsGroup: 1000
+          affinity:
+            nodeAffinity:
+              requiredDuringSchedulingIgnoredDuringExecution:  # hard rule: only nodes labelled nvidia.com/gpu.present=true
+                nodeSelectorTerms:
+                  - matchExpressions:
+                      - key: nvidia.com/gpu.present
+                        operator: In
+                        values:
+                          - "true"
+        containers:
+          app:
+            image:
+              repository: ghcr.io/rarecompute/qlora-docker
+              tag: main@sha256:65b3f17822d21d8158b70c7762968b2e0e9db1814428e0d425cfa217527526eb  # pinned by digest
+            env:
+              TZ: ${TIMEZONE}  # placeholder; presumably filled by Flux postBuild substitution -- confirm TIMEZONE is provided
+              NVIDIA_VISIBLE_DEVICES: all  # NOTE(review): "all" may expose every GPU regardless of the nvidia.com/gpu limit -- confirm
+              NVIDIA_DRIVER_CAPABILITIES: all
+            resources:
+              requests:
+                cpu: 200m
+                memory: 8Gi
+              limits:
+                cpu: 32000m
+                memory: 32Gi
+                nvidia.com/gpu: 4  # GPU set under limits only; no explicit request is given
+            #            probes:
+            #              liveness:
+            #                enabled: true
+            #              readiness:
+            #                enabled: true
+            #              startup:
+            #                enabled: false
+            #                spec:
+            #                  failureThreshold: 30
+            #                  periodSeconds: 5
+    service:
+      app:  # NOTE(review): no ports are defined yet (commented out below) -- confirm the chart renders a Service without ports
+        controller: *app
+        annotations:
+          teleport.dev/name: *app
+        labels:
+          teleport: enabled
+        #        ports:
+        #          http:
+        #            port: &port 80
+    persistence:
+      app:
+        storageClass: local-nvme
+        accessMode: ReadWriteOnce
+        size: 2Gi
+        globalMounts:
+          - path: /app
+      workspace:
+        storageClass: local-nvme
+        accessMode: ReadWriteOnce
+        size: 2048Gi
+        retain: true  # presumably keeps the PVC when the release is removed -- verify against the chart
+        globalMounts:
+          - path: /workspace
+      tmp:
+        type: emptyDir
+        globalMounts:
+          - path: /tmp
+      # Alternative (disabled): mount a pre-existing claim at /workspace instead of the `workspace` volume above.
+      #      workspace:
+      #        enabled: true
+      #        existingClaim: machine-learning-workspace-pvc
+      #        globalMounts:
+      #          - path: /workspace
diff --git a/kubernetes/arc1/apps/machine-learning/qlora/app/kustomization.yaml b/kubernetes/arc1/apps/machine-learning/qlora/app/kustomization.yaml
@@ -0,0 +1,6 @@
+---
+# yaml-language-server: $schema=https://json.schemastore.org/kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:  # the qlora app consists of a single HelmRelease
+  - ./helmrelease.yaml
diff --git a/kubernetes/arc1/apps/machine-learning/qlora/ks.yaml b/kubernetes/arc1/apps/machine-learning/qlora/ks.yaml
@@ -0,0 +1,24 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: &app qlora  # anchored as *app; reused for the common label and the APP substitution
+  namespace: flux-system
+spec:
+  targetNamespace: machine-learning  # namespace applied to the manifests built from `path`
+  commonMetadata:
+    labels:
+      app.kubernetes.io/name: *app
+  path: ./kubernetes/arc1/apps/machine-learning/qlora/app
+  prune: true
+  sourceRef:
+    kind: GitRepository
+    name: k8s-gitops
+  wait: false
+  interval: 30m
+  retryInterval: 1m
+  timeout: 5m
+  postBuild:
+    substitute:
+      APP: *app  # only APP is substituted here; other ${...} placeholders must come from elsewhere
diff --git a/kubernetes/arc1/apps/security/kustomization.yaml b/kubernetes/arc1/apps/security/kustomization.yaml
@@ -4,4 +4,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
   - ./namespace.yaml
-  #- ./teleport/ks.yaml
+  - ./teleport/ks.yaml
diff --git a/kubernetes/arc1/apps/security/teleport/app/helmrelease.yaml b/kubernetes/arc1/apps/security/teleport/app/helmrelease.yaml
@@ -8,7 +8,7 @@ spec:
   chart:
     spec:
       chart: teleport-cluster
-      version: 16.4.6
+      version: 17.0.1
       sourceRef:
         kind: HelmRepository
         name: teleport
@@ -19,12 +19,11 @@ spec:
   upgrade:
     cleanupOnFail: true
     remediation:
-      strategy: rollback
       retries: 3
   values:
-    clusterName: teleport.${SECRET_EXTERNAL_DOMAIN}
+    clusterName: teleport.${SECRET_INTERNAL_DOMAIN}
     chartMode: standalone
-    kubeClusterName: ARC1
+    kubeClusterName: arc1
     validateConfigOnDeploy: true
     enterprise: false
     auth:
@@ -58,18 +57,22 @@ spec:
     annotations:
       ingress:
         cert-manager.io/cluster-issuer: "letsencrypt-production"
-        gethomepage.dev/enabled: "true"
-        gethomepage.dev/group: Services
-        gethomepage.dev/name: *app
-        gethomepage.dev/icon: teleport.png
-    tls:
-      existingSecretName: "teleport-cluster-tls"
+        external-dns.alpha.kubernetes.io/target: "teleport.${SECRET_EXTERNAL_DOMAIN}"
+        external-dns.alpha.kubernetes.io/exclude-unifi: "true"
+        external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
+        traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
+        traefik.ingress.kubernetes.io/router.middlewares: "networking-traefik-middleware-chain-no-auth@kubernetescrd"
+      service:
+        traefik.ingress.kubernetes.io/service.serversscheme: https
+    # `tls` is a top-level teleport-cluster value; nested under `highAvailability` it is not read.
+    tls:
+      existingSecretName: "teleport-tls"
     authentication:
       type: local
     proxyListenerMode: multiplex
     persistence:
       enabled: true
-      storageClassName: local-nvme
+      existingClaimName: teleport
     serviceAccount:
       create: true
     rbac:

diff --git a/kubernetes/arc1/apps/security/teleport/app/kustomization.yaml b/kubernetes/arc1/apps/security/teleport/app/kustomization.yaml
@@ -3,4 +3,5 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
+  - ./pvc.yaml
   - ./helmrelease.yaml
diff --git a/kubernetes/arc1/apps/security/teleport/app/pvc.yaml b/kubernetes/arc1/apps/security/teleport/app/pvc.yaml
@@ -0,0 +1,12 @@
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: teleport  # same name as persistence.existingClaimName in the teleport HelmRelease
+  namespace: security
+spec:
+  storageClassName: local-nvme
+  accessModes: ["ReadWriteOnce"]
+  resources:
+    requests:
+      storage: 8Gi
diff --git a/kubernetes/arc1/apps/security/teleport/app/resources/token.yaml b/kubernetes/arc1/apps/security/teleport/app/resources/token.yaml
@@ -0,0 +1,18 @@
+kind: token
+version: v2
+metadata:
+  name: kubernetes-token
+  # set a long expiry time, the default for tokens is only 30 minutes
+  expires: "2050-01-01T00:00:00Z"
+spec:
+  # Use the minimal set of system roles required. NOTE(review): five roles are granted below -- confirm each is needed.
+  roles: [kube, app, discovery, node, windowsdesktop]
+
+  # set the join method allowed for this token
+  join_method: kubernetes
+
+  kubernetes:
+    type: in_cluster
+    allow:
+      # Service account names follow the format "namespace:serviceaccountname".
+      - service_account: "security:teleport-kube-agent"
diff --git a/kubernetes/arc1/apps/storage/kustomization.yaml b/kubernetes/arc1/apps/storage/kustomization.yaml
@@ -0,0 +1,8 @@
+---
+# yaml-language-server: $schema=https://json.schemastore.org/kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:  # namespace first, then one Flux Kustomization (ks.yaml) per storage app
+  - ./namespace.yaml
+  - ./snapshot-controller/ks.yaml
+  - ./volsync/ks.yaml
diff --git a/kubernetes/arc1/apps/storage/namespace.yaml b/kubernetes/arc1/apps/storage/namespace.yaml
@@ -0,0 +1,8 @@
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: storage
+  annotations:  # VolSync reads privileged-movers from namespace annotations, not labels
+    kustomize.toolkit.fluxcd.io/prune: disabled  # Flux accepts this opt-out as a label or an annotation
+    volsync.backube/privileged-movers: "true"
diff --git a/kubernetes/arc1/apps/storage/snapshot-controller/app/helmrelease.yaml b/kubernetes/arc1/apps/storage/snapshot-controller/app/helmrelease.yaml
@@ -0,0 +1,33 @@
+---
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: snapshot-controller
+spec:
+  interval: 30m
+  chart:
+    spec:
+      chart: snapshot-controller
+      version: 3.0.6
+      sourceRef:
+        kind: HelmRepository
+        name: piraeus
+        namespace: flux-system
+  maxHistory: 2
+  install:
+    crds: CreateReplace  # same CRD policy on install and upgrade
+    remediation:
+      retries: 3
+  upgrade:
+    cleanupOnFail: true
+    crds: CreateReplace
+    remediation:
+      retries: 3
+  uninstall:
+    keepHistory: false
+  values:
+    controller:
+      serviceMonitor:
+        create: true  # presumably needs the Prometheus Operator ServiceMonitor CRD in the cluster -- confirm
+    webhook:
+      enabled: false
diff --git a/kubernetes/arc1/apps/storage/snapshot-controller/app/kustomization.yaml b/kubernetes/arc1/apps/storage/snapshot-controller/app/kustomization.yaml
@@ -0,0 +1,6 @@
+---
+# yaml-language-server: $schema=https://json.schemastore.org/kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:  # the snapshot-controller app consists of a single HelmRelease
+  - ./helmrelease.yaml