--wip-- [skipci]

RareCompute · Nov 19, 2024 · 6ec9f67 · 6ec9f67
1 parent f659fad
commit 6ec9f67
Show file tree

Hide file tree

Showing 26 changed files with 416 additions and 21 deletions.
diff --git a/.github/gitleaks.yaml → .github/workflows/gitleaks.yaml b/.github/gitleaks.yaml → .github/workflows/gitleaks.yaml
diff --git a/.github/kubeconform.yaml → .github/workflows/kubeconform.yaml b/.github/kubeconform.yaml → .github/workflows/kubeconform.yaml
diff --git a/kubernetes/arc1/apps/kube-system/nvidia-device-plugin/app/helmrelease.yaml b/kubernetes/arc1/apps/kube-system/nvidia-device-plugin/app/helmrelease.yaml
@@ -63,4 +63,4 @@ spec:
               resources:
                 - name: nvidia.com/gpu
                   replicas: 7
-      default: "single"
+      default: "default"
diff --git a/kubernetes/arc1/apps/machine-learning/colabfold/app/helmrelease.yaml b/kubernetes/arc1/apps/machine-learning/colabfold/app/helmrelease.yaml
@@ -26,7 +26,7 @@ spec:
       retries: 3
   values:
     controllers:
-      ollama:
+      colabfold:
         type: deployment
         annotations:
           reloader.stakater.com/auto: "true"
@@ -56,11 +56,11 @@ spec:
               requests:
                 cpu: 200m
                 memory: 4Gi
-                gpu.intel.com/i915: "4"
+                nvidia.com/gpu: 4  # extended resource: request must equal the limit below
               limits:
                 cpu: 32000m
                 memory: 64Gi
-                gpu.intel.com/i915: "4"
+                nvidia.com/gpu: 4
     service:
       app:
         controller: *app
@@ -88,6 +88,7 @@ spec:
           - secretName: colabfold-tls
             hosts: [*host]
     persistence:
+      # TODO: Replace with existing PVC
       data:
         storageClass: local-nvme
         accessMode: ReadWriteMany

diff --git a/kubernetes/arc1/apps/machine-learning/kustomization.yaml b/kubernetes/arc1/apps/machine-learning/kustomization.yaml
@@ -4,5 +4,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
   - ./namespace.yaml
+  - ./qlora/ks.yaml
   #- ./ollama/ks.yaml
   #- ./jupyterhub/ks.yaml
diff --git a/kubernetes/arc1/apps/machine-learning/mmseqs2/app/helmrelease.yaml b/kubernetes/arc1/apps/machine-learning/mmseqs2/app/helmrelease.yaml
@@ -5,7 +5,7 @@
 apiVersion: helm.toolkit.fluxcd.io/v2
 kind: HelmRelease
 metadata:
-  name: &app ollama
+  name: &app mmseqs2
 spec:
   interval: 30m
   chart:
@@ -26,7 +26,7 @@ spec:
       retries: 3
   values:
     controllers:
-      ollama:
+      mmseqs2:
         type: deployment
         annotations:
           reloader.stakater.com/auto: "true"
@@ -41,11 +41,11 @@ spec:
               requests:
                 cpu: 200m
                 memory: 4Gi
-                # gpu.intel.com/i915: "1"
+                nvidia.com/gpu: 4  # extended resource: request must equal the limit below
               limits:
-                cpu: 8000m
-                memory: 8Gi
-                # gpu.intel.com/i915: "1"
+                cpu: 32000m
+                memory: 64Gi
+                nvidia.com/gpu: 4
     service:
       app:
         controller: *app
@@ -74,6 +74,7 @@ spec:
           - secretName: mmseqs2-tls
             hosts: [*host]
     persistence:
+      # TODO: Replace with existing PVC
       data:
         storageClass: local-nvme
         accessMode: ReadWriteMany

diff --git a/kubernetes/arc1/apps/machine-learning/qlora/app/helmrelease.yaml b/kubernetes/arc1/apps/machine-learning/qlora/app/helmrelease.yaml
@@ -0,0 +1,106 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/bjw-s/helm-charts/main/charts/other/app-template/schemas/helmrelease-helm-v2.schema.json
+# TODO: Finish this
+# HelmRelease for the qlora container (bjw-s app-template 3.5.1), pinned to NVIDIA GPU nodes.
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: &app qlora
+spec:
+  interval: 30m
+  chart:
+    spec:
+      chart: app-template
+      version: 3.5.1
+      sourceRef:
+        kind: HelmRepository
+        name: bjw-s
+        namespace: flux-system
+  install:
+    remediation:
+      retries: 3
+  upgrade:
+    cleanupOnFail: true
+    remediation:
+      strategy: rollback
+      retries: 3
+  values:
+    controllers:
+      qlora:
+        type: deployment
+        annotations:
+          reloader.stakater.com/auto: "true"
+        pod:
+          runtimeClassName: nvidia
+          terminationGracePeriodSeconds: 1
+          # Hard requirement: only schedule onto nodes labelled nvidia.com/gpu.present=true.
+          affinity:
+            nodeAffinity:
+              requiredDuringSchedulingIgnoredDuringExecution:
+                nodeSelectorTerms:
+                  - matchExpressions:
+                      - key: nvidia.com/gpu.present
+                        operator: In
+                        values:
+                          - "true"
+        containers:
+          app:
+            image:
+              repository: ghcr.io/rarecompute/qlora-docker
+              tag: main@sha256:e56350596e17af5198bfc848b0de3a5a11cb98d97e1e02dbb322467269342541
+            env:
+              TZ: ${TIMEZONE}
+              # GITHUB_REPO: https://github.com/RareCompute/example-models
+              NVIDIA_VISIBLE_DEVICES: all
+              NVIDIA_DRIVER_CAPABILITIES: all
+            securityContext:
+              # Must be a nested mapping; a single dotted key "capabilities.drop" is not a securityContext field.
+              capabilities:
+                drop: ["ALL"]
+            resources:
+              requests:
+                cpu: 200m
+                memory: 8Gi
+              limits:
+                cpu: 16
+                memory: 32Gi
+                nvidia.com/gpu: 4
+            #            probes:
+            #              liveness:
+            #                enabled: true
+            #              readiness:
+            #                enabled: true
+            #              startup:
+            #                enabled: false
+            #                spec:
+            #                  failureThreshold: 30
+            #                  periodSeconds: 5
+    service:
+      app:
+        controller: *app
+        annotations:
+          teleport.dev/name: *app
+        labels:
+          teleport: enabled
+        ports:
+          http:
+            port: &port 80
+    persistence:
+      app:
+        storageClass: local-nvme
+        accessMode: ReadWriteOnce
+        size: 2Gi
+        globalMounts:
+          - path: /app
+      workspace:
+        storageClass: local-nvme
+        # TODO: OpenEBS only supports ReadWriteOnce
+        accessMode: ReadWriteOnce
+        size: 2048Gi
+        retain: true
+        globalMounts:
+          - path: /workspace
+      tmp:
+        type: emptyDir
+        globalMounts:
+          - path: /tmp
diff --git a/kubernetes/arc1/apps/machine-learning/qlora/app/kustomization.yaml b/kubernetes/arc1/apps/machine-learning/qlora/app/kustomization.yaml
@@ -0,0 +1,6 @@
+---
+# yaml-language-server: $schema=https://json.schemastore.org/kustomization
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - ./helmrelease.yaml  # the qlora HelmRelease manifest in this same directory
diff --git a/kubernetes/arc1/apps/machine-learning/qlora/ks.yaml b/kubernetes/arc1/apps/machine-learning/qlora/ks.yaml
@@ -0,0 +1,24 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json
+apiVersion: kustomize.toolkit.fluxcd.io/v1
+kind: Kustomization
+metadata:
+  name: &app qlora  # anchor reused below for the name label and the APP substitution
+  namespace: flux-system
+spec:
+  targetNamespace: machine-learning
+  commonMetadata:
+    labels:
+      app.kubernetes.io/name: *app
+  path: ./kubernetes/arc1/apps/machine-learning/qlora/app  # directory holding kustomization.yaml and helmrelease.yaml
+  prune: true
+  sourceRef:
+    kind: GitRepository
+    name: k8s-gitops
+  wait: false
+  interval: 30m
+  retryInterval: 1m
+  timeout: 5m
+  postBuild:
+    substitute:  # NOTE(review): only APP is set here; the HelmRelease also uses ${TIMEZONE} - confirm it is supplied elsewhere
+      APP: *app  # alias of &app, i.e. "qlora"
diff --git a/kubernetes/arc1/apps/security/kustomization.yaml b/kubernetes/arc1/apps/security/kustomization.yaml
@@ -4,4 +4,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
   - ./namespace.yaml
-  #- ./teleport/ks.yaml
+  - ./teleport/ks.yaml
diff --git a/kubernetes/arc1/apps/security/teleport/app/crt.yaml b/kubernetes/arc1/apps/security/teleport/app/crt.yaml
@@ -0,0 +1,14 @@
+---
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  name: teleport-tls
+  namespace: security
+spec:
+  secretName: teleport-tls  # same name the Teleport HelmRelease sets in tls.existingSecretName
+  issuerRef:
+    name: letsencrypt-production
+    kind: ClusterIssuer
+  dnsNames:
+    - "teleport.${SECRET_EXTERNAL_DOMAIN}"  # same host as the HelmRelease clusterName
+    - "*.teleport.${SECRET_EXTERNAL_DOMAIN}"  # wildcard for names one level below the Teleport host
diff --git a/kubernetes/arc1/apps/security/teleport/app/helmrelease.yaml b/kubernetes/arc1/apps/security/teleport/app/helmrelease.yaml
@@ -8,7 +8,7 @@ spec:
   chart:
     spec:
       chart: teleport-cluster
-      version: 16.4.6
+      version: 17.0.1
       sourceRef:
         kind: HelmRepository
         name: teleport
@@ -19,12 +19,11 @@ spec:
   upgrade:
     cleanupOnFail: true
     remediation:
-      strategy: rollback
       retries: 3
   values:
     clusterName: teleport.${SECRET_EXTERNAL_DOMAIN}
     chartMode: standalone
-    kubeClusterName: ARC1
+    kubeClusterName: arc1
     validateConfigOnDeploy: true
     enterprise: false
     auth:
@@ -43,6 +42,8 @@ spec:
       teleportConfig:
         proxy_service:
           trust_x_forwarded_for: true
+    operator:
+      enabled: true
     podSecurityPolicy:
       enabled: true
     log:
@@ -60,16 +61,24 @@ spec:
         cert-manager.io/cluster-issuer: "letsencrypt-production"
         gethomepage.dev/enabled: "true"
         gethomepage.dev/group: Services
-        gethomepage.dev/name: *app
-        gethomepage.dev/icon: teleport.png
+        gethomepage.dev/name: Teleport
+        gethomepage.dev/description: Teleport dashboard
+        gethomepage.dev/icon: teleport
+        # external-dns.alpha.kubernetes.io/target: "teleport.${SECRET_EXTERNAL_DOMAIN}"
+        # external-dns.alpha.kubernetes.io/exclude-unifi: "true"
+        # external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
+        traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
+        traefik.ingress.kubernetes.io/router.middlewares: "networking-traefik-middleware-chain-no-auth@kubernetescrd"
+      service:
+        traefik.ingress.kubernetes.io/service.serversscheme: https
     tls:
-      existingSecretName: "teleport-cluster-tls"
+      existingSecretName: teleport-tls
     authentication:
       type: local
     proxyListenerMode: multiplex
     persistence:
       enabled: true
-      storageClassName: local-nvme
+      existingClaimName: teleport
     serviceAccount:
       create: true
     rbac:

diff --git a/kubernetes/arc1/apps/security/teleport/app/kustomization.yaml b/kubernetes/arc1/apps/security/teleport/app/kustomization.yaml
@@ -3,4 +3,6 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
+  - ./pvc.yaml
+  - ./crt.yaml
   - ./helmrelease.yaml
diff --git a/kubernetes/arc1/apps/security/teleport/app/pvc.yaml b/kubernetes/arc1/apps/security/teleport/app/pvc.yaml
@@ -0,0 +1,12 @@
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: teleport  # same name the Teleport HelmRelease sets in persistence.existingClaimName
+  namespace: security
+spec:
+  storageClassName: local-nvme
+  accessModes: ["ReadWriteOnce"]
+  resources:
+    requests:
+      storage: 8Gi
diff --git a/kubernetes/arc1/apps/security/teleport/app/resources/token.yaml b/kubernetes/arc1/apps/security/teleport/app/resources/token.yaml
@@ -0,0 +1,18 @@
+kind: token
+version: v2
+metadata:
+  name: kubernetes-token
+  # set a long expiry time, the default for tokens is only 30 minutes
+  expires: "2050-01-01T00:00:00Z"
+spec:
+  # NOTE(review): meant to be the minimal set of system roles - confirm node and windowsdesktop are actually required.
+  roles: [kube, app, discovery, node, windowsdesktop]
+
+  # set the join method allowed for this token
+  join_method: kubernetes
+
+  kubernetes:
+    type: in_cluster
+    allow:
+      # Service account names follow the format "namespace:serviceaccountname".
+      - service_account: "security:teleport-kube-agent"