From ec159b93ab820f823409313d6de7bbc2799ad99d Mon Sep 17 00:00:00 2001 From: Liana Date: Mon, 25 Nov 2024 12:56:50 -0600 Subject: [PATCH] --wip-- [skipci] --- LICENSE | 7 ++ .../app/helmrelease.yaml | 25 +++++ .../app/kustomization.yaml | 6 ++ .../app/secret.sops.yaml | 30 ++++++ .../gha-runner-scale-set-controller/ks.yaml | 26 ++++++ .../gha-runner-scale-set/app/helmrelease.yaml | 53 +++++++++++ .../app/kustomization.yaml | 5 + .../apps/dev/gha-runner-scale-set/ks.yaml | 26 ++++++ kubernetes/arc1/apps/dev/kustomization.yaml | 8 ++ kubernetes/arc1/apps/dev/namespace.yaml | 7 ++ .../apps/machine-learning/boltz/README.md | 5 + .../boltz/app/helmrelease.yaml | 92 +++++++++++++++++++ .../boltz/app/kustomization.yaml | 6 ++ .../arc1/apps/machine-learning/boltz/ks.yaml | 24 +++++ .../colabfold/app/helmrelease.yaml | 7 +- .../apps/machine-learning/kustomization.yaml | 1 + .../apps/machine-learning/mmseqs2/README.md | 3 + .../mmseqs2/app/helmrelease.yaml | 2 +- .../apps/machine-learning/qlora/README.md | 7 ++ .../helm/actions-runner-controller.yaml | 11 +++ .../flux/repositories/helm/kustomization.yaml | 1 + 21 files changed, 348 insertions(+), 4 deletions(-) create mode 100644 LICENSE create mode 100644 kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/app/helmrelease.yaml create mode 100644 kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/app/kustomization.yaml create mode 100644 kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/app/secret.sops.yaml create mode 100644 kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/ks.yaml create mode 100644 kubernetes/arc1/apps/dev/gha-runner-scale-set/app/helmrelease.yaml create mode 100644 kubernetes/arc1/apps/dev/gha-runner-scale-set/app/kustomization.yaml create mode 100644 kubernetes/arc1/apps/dev/gha-runner-scale-set/ks.yaml create mode 100644 kubernetes/arc1/apps/dev/kustomization.yaml create mode 100644 kubernetes/arc1/apps/dev/namespace.yaml create mode 100644 
kubernetes/arc1/apps/machine-learning/boltz/README.md create mode 100644 kubernetes/arc1/apps/machine-learning/boltz/app/helmrelease.yaml create mode 100644 kubernetes/arc1/apps/machine-learning/boltz/app/kustomization.yaml create mode 100644 kubernetes/arc1/apps/machine-learning/boltz/ks.yaml create mode 100644 kubernetes/arc1/apps/machine-learning/mmseqs2/README.md create mode 100644 kubernetes/arc1/apps/machine-learning/qlora/README.md create mode 100644 kubernetes/arc1/flux/repositories/helm/actions-runner-controller.yaml diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..04d75c2 --- /dev/null +++ b/LICENSE @@ -0,0 +1,7 @@ +Copyright (c) 2024 Rare Compute Foundation, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/app/helmrelease.yaml b/kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/app/helmrelease.yaml new file mode 100644 index 0000000..c8ac501 --- /dev/null +++ b/kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/app/helmrelease.yaml @@ -0,0 +1,25 @@ +--- +# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/helm.toolkit.fluxcd.io/helmrelease_v2.json +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: &app gha-runner-scale-set-controller +spec: + interval: 15m + chart: + spec: + chart: gha-runner-scale-set-controller + version: 0.9.3 + sourceRef: + kind: HelmRepository + name: actions-runner-controller + namespace: flux-system + install: + remediation: + retries: 3 + upgrade: + cleanupOnFail: true + remediation: + retries: 3 + values: + fullnameOverride: *app diff --git a/kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/app/kustomization.yaml b/kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/app/kustomization.yaml new file mode 100644 index 0000000..95bf474 --- /dev/null +++ b/kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/app/kustomization.yaml @@ -0,0 +1,6 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ./secret.sops.yaml + - ./helmrelease.yaml diff --git a/kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/app/secret.sops.yaml b/kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/app/secret.sops.yaml new file mode 100644 index 0000000..92decc2 --- /dev/null +++ b/kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/app/secret.sops.yaml @@ -0,0 +1,30 @@ +# yamllint disable +kind: Secret +apiVersion: v1 +type: Opaque +metadata: + name: actions-runner-controller-secret +stringData: + github_app_id: ENC[AES256_GCM,data:T7VFX3Irtw==,iv:PvWa4dJYJFHC8H1nDg6DalmS6vxObgyTFfhpT03OjkI=,tag:06K/NqcsOBLmCdyxToFuuA==,type:str] + github_app_installation_id: 
ENC[AES256_GCM,data:LCD24COnPaA=,iv:pfuQUnXJ68vq0ZHlgGz27HBRqqoZK4hFz4JE7CfN3GQ=,tag:YmwcHGS7uzW0yfDGMF/qqA==,type:str] + github_app_private_key: ENC[AES256_GCM,data:Fc39PxjiusRkCR7Hadompl5nVsTBJOK1SYZGH2aUWiZaOgjBHHm+7Avf21J4fcEhbc5qY+yWe8JjH2QiCK15cGS5fSZf/kdIjmj5XIt0dhHfsJmmv20TjcuZsHIp51aokTryOcx4c48ORfJFAvrhlSqRbVkAvIySS5dPlVTqPxh7K8ywD1wBWmP1z/m8zmstVXKO0fkle9rv5bUesnqtub2d87NxNfYXvzSntZY8jCgZkmo2Rrt7Dz13ISeUGNMlEKchrH+4DEFtVhPCn6SbB7AC/LID7tZMyAvN5coWePQzFxCFWG76Xe8osGlxECFZQkjOGwbCAPI57CPADVm98SctDKuHx8NqY2R7PH+1dCEEiiYJSH1ZMDB8icm1IKZ8xnK9Q14jrmyiJnqYe/kFVEFyc92OSQjM+dBPay6Haw0/wxOi3ABnUDD/9/RqTTqjnFci2/xHPoVvVIt6iiGwRL4FlAKwyziNZI4w2cYcFaGPfY+19Aqq4VR7xONz8xQ4R00YCMWbsUDB+boVaRP2ZUmox9+GSG8BScoeMlT8PN86XlGiixSfOa0g87mwPzH0Qg7jpq/feJ8OpBhJXMraKXWVm+mHMX8novcmJtNpJGAty02xXlaG1sdgHwwM7Ua2slXwbVhwtuUyTW8IDMHXe/OyjJWUluoUkjWCR98DlO4DSxdwxibAaPNVxUZeYm4PrZEbiSFWzhn7OAYpfD7N22uswsymapObWpGaQbC+ydJwy5ZsH3MoJ/7TiNEmmaCdD6YTW+G6IZ7/Tzn85N/baxbecu0Q3jlYZY8aWQm7S23YT/yCNbbdwHCx1lQ0GdiI9Leil4dcADLlpt7VOJ9FWdlr6xnJMQ25F9mExcJSJj0Awt2Xihai+iu38bsqAEoHGuGfAp9dGIVM2QFJU6iFr677rVtrpoNhd7NYPfrQCCgFyjBALs0f0+iR6MP+P+B5A3dy2ZsFILR/GlVRxuE6+98bliG+4agUf+Yc9KJZQYoLToiXA9oIoS/OTDDucK/tbZuiZ6Qxx8X87SvFLAk4Q+fCns6ORIaRejjNJJWsNDUWTQYyCufGeCqeay1ccYQsbz+rUxjhTSqdM3YFCT3rrX07lHHalU7tSxPKKA90idzgmYiAItAWz9Htv/TFfPhntd9i3g2zx5MInVOGb3btba1ccqEJHK0zNo4oF4v52lH4EnNw3M2pmLqWpJEFUakofw/vNGETXriG/O+An3Z0KjHjJmlkw27JVb2bSBpnWQk6agZLhfO4ajjRpAR4tT7teQ4R2jw7oNHch3I5lqMdV+jPn6vrW3t/QOYAK8zE6LYpmF+Mjx+5BsLM6X+L2vGwBcEt8zbztyICOLPO+VOgeJTYbW+vRVDAcRixqxh9ah3tzQJFEOffiIIQrS2q9Ga+j4gCfy97PI4++u+hqpWjCMF28g4WjlZY3AHvWB280GEt+DlSi+U+LabiytNvHvPPPNhYyjw194e4QOsAmUMp2oCRrn3bdL1WUQkqAcnq5q2lWELmKQaIJYssWXje/NeuliyOvxmvWf4jBOV12MDtBHIYZADaH5FFDBLz1otRCw2NnUD0omO7DKDZEg9bOxe/sPbwKPuuUM8dAPL7nIWAuYmpgtp8FbcRPMhD8BjfPrScTXusk8jmjwHtct52AyBgkE3H04k1limAGv2IapJP680Ne/KXx6TqUooh82Q5DoJqlD7jbqi68jVCMxSTVdYwuGYfnj+IhTJFzDVuoHW17PKZ97ZB5lNVLVntAdtYgiLkiCaEkkq9xEcCTjgGLv11rxpi1G8mEypeU/O1A7VmRkiCGVPIzbisMEUbIBc0f/
VKmL4jKG7yF+nzXMmcs/ertaFfX0lXZ27eeMre5wfDRXzyk/z9lk54gI2eJCxOjCB1tB9YKlIji7wghs3ARQBKqCK3vgO/t0mdWwhd6SIDqYHmdLBC2CxqDxxCfeIZ/oNuYAHuEf6N9SyWcGEE2tb+mhO/qy/tLtmXdEhIPfeb92+Cmc2/DBg49SAhkCst98+JZW98XkyCNykHPdjPpqYwHLWOfElJQaIsCzr5RyaF80PiyqOM7iug2Oaf/wQXxY9/Y9bsd881b02OsO9gcFW0+XLoIusdweYfBDsV4yQuilqDTWAdYsB4+dQIgEDQqA5MTtBylBAAvuctOlvrw2pKyhu93FjWmnnCpf4lRcr/eL3S8zw28u1rcGtmJYiAo8fNpw11Fmr+Pzd3VekOOg==,iv:ny3e62bjDHwEOU73dDMmEhuSsVbxqYtXE+QKBKl9yxo=,tag:PGfdbaNBrqADErwOUJByUg==,type:str] +sops: + kms: [] + gcp_kms: [] + azure_kv: [] + hc_vault: [] + age: + - recipient: age1ey3reuxyffqynll464r4q3tlhq5v73nxesyktr44lfez8jzxm94s0644n7 + enc: | + -----BEGIN AGE ENCRYPTED FILE----- + YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBRb3A2SEQwMEpFK3h3bkFz + NkIrN25oSmJwZTNTSEF1aXNrcUtDaUFNaTI0CjZ1RlRoWW5MSXF3T0ZvT2huWEMr + Y3J3Nmd1M2ZzTE9zMGRPMjJlMWdDcW8KLS0tIGxXc2xheU1kQUh2akV1M1cwQk13 + Z3NjN1dpenJiTjRBcktRQlFtN0UzWnMKrlWCcVgsbPaQj6cvY/lbTQKbx0TzrWpl + gEZtYG51tl0Lt5SwjlLTnKdmZF/Zhi0mfq7Pq72xccuOqumJlJwl2A== + -----END AGE ENCRYPTED FILE----- + lastmodified: "2024-11-25T18:56:50Z" + mac: ENC[AES256_GCM,data:fAhGUOpfGktZGWFKKK1bRA2w0MjqoaHSAUdRH4mX5ZT37Dce583mlSS0KCIlnZmxgUj+Y40x183JAwzIp/WvMRYkN1t5+NEukrNzGaKO98Wf4defunIvWFEQOTHt5hCaF6DMILM8jV9L2xc0CQulh8R355/0S9/YF8FxasJuA/c=,iv:MBXUfO4boPI9aqyUPKWF+Mt/t1x6HSMuYGgqBxf3qUc=,tag:DZOF7Ycy/lhtmsFMKCp2gw==,type:str] + pgp: [] + encrypted_regex: ^(data|stringData)$ + version: 3.9.1 diff --git a/kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/ks.yaml b/kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/ks.yaml new file mode 100644 index 0000000..34f33e7 --- /dev/null +++ b/kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/ks.yaml @@ -0,0 +1,26 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: &app gha-runner-scale-set-controller + 
namespace: flux-system +spec: + targetNamespace: dev + commonMetadata: + labels: + app.kubernetes.io/name: *app + dependsOn: + - name: traefik + path: ./kubernetes/arc1/apps/dev/gha-runner-scale-set-controller/app + prune: true + sourceRef: + kind: GitRepository + name: k8s-gitops + wait: false + interval: 30m + retryInterval: 1m + timeout: 5m + postBuild: + substitute: + APP: *app diff --git a/kubernetes/arc1/apps/dev/gha-runner-scale-set/app/helmrelease.yaml b/kubernetes/arc1/apps/dev/gha-runner-scale-set/app/helmrelease.yaml new file mode 100644 index 0000000..0faf51b --- /dev/null +++ b/kubernetes/arc1/apps/dev/gha-runner-scale-set/app/helmrelease.yaml @@ -0,0 +1,53 @@ +--- +# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/helm.toolkit.fluxcd.io/helmrelease_v2.json +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: &app gha-runner-scale-set +spec: + interval: 30m + chart: + spec: + chart: gha-runner-scale-set + version: 0.9.3 + sourceRef: + kind: HelmRepository + name: actions-runner-controller + namespace: flux-system + install: + remediation: + retries: 3 + upgrade: + cleanupOnFail: true + remediation: + retries: 3 + valuesFrom: + - targetPath: githubConfigSecret.github_app_id + kind: Secret + name: actions-runner-controller-secret + valuesKey: github_app_id + - targetPath: githubConfigSecret.github_app_installation_id + kind: Secret + name: actions-runner-controller-secret + valuesKey: github_app_installation_id + - targetPath: githubConfigSecret.github_app_private_key + kind: Secret + name: actions-runner-controller-secret + valuesKey: github_app_private_key + values: + nameOverride: *app + runnerScaleSetName: *app + githubConfigUrl: https://github.com/ergho/homelab-ops # NOTE(review): repo is outside the rarecompute org that hosts the runner image - confirm this URL + minRunners: 1 + maxRunners: 3 + containerMode: + type: dind + template: + spec: + containers: + - name: runner + image: ghcr.io/rarecompute/actions-runner:2.320.0@sha256:a357b0db3cb504da87cbbd46b304617fbf4a579e2d0f850ea4f177042f0a4786 + 
command: ["/home/runner/run.sh"] + controllerServiceAccount: + name: gha-runner-scale-set-controller + namespace: dev # controller Kustomization installs into targetNamespace dev diff --git a/kubernetes/arc1/apps/dev/gha-runner-scale-set/app/kustomization.yaml b/kubernetes/arc1/apps/dev/gha-runner-scale-set/app/kustomization.yaml new file mode 100644 index 0000000..5dd7bac --- /dev/null +++ b/kubernetes/arc1/apps/dev/gha-runner-scale-set/app/kustomization.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ./helmrelease.yaml diff --git a/kubernetes/arc1/apps/dev/gha-runner-scale-set/ks.yaml b/kubernetes/arc1/apps/dev/gha-runner-scale-set/ks.yaml new file mode 100644 index 0000000..a324591 --- /dev/null +++ b/kubernetes/arc1/apps/dev/gha-runner-scale-set/ks.yaml @@ -0,0 +1,26 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: &app gha-runner-scale-set + namespace: flux-system +spec: + targetNamespace: dev + commonMetadata: + labels: + app.kubernetes.io/name: *app + dependsOn: + - name: gha-runner-scale-set-controller # ships the Secret this release reads via valuesFrom + path: ./kubernetes/arc1/apps/dev/gha-runner-scale-set/app + prune: true + sourceRef: + kind: GitRepository + name: k8s-gitops + wait: false + interval: 30m + retryInterval: 1m + timeout: 5m + postBuild: + substitute: + APP: *app diff --git a/kubernetes/arc1/apps/dev/kustomization.yaml b/kubernetes/arc1/apps/dev/kustomization.yaml new file mode 100644 index 0000000..7510f34 --- /dev/null +++ b/kubernetes/arc1/apps/dev/kustomization.yaml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://json.schemastore.org/kustomization +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ./namespace.yaml + - ./gha-runner-scale-set-controller/ks.yaml + - ./gha-runner-scale-set/ks.yaml diff --git a/kubernetes/arc1/apps/dev/namespace.yaml 
b/kubernetes/arc1/apps/dev/namespace.yaml new file mode 100644 index 0000000..b237971 --- /dev/null +++ b/kubernetes/arc1/apps/dev/namespace.yaml @@ -0,0 +1,7 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: dev + labels: + kustomize.toolkit.fluxcd.io/prune: disabled diff --git a/kubernetes/arc1/apps/machine-learning/boltz/README.md b/kubernetes/arc1/apps/machine-learning/boltz/README.md new file mode 100644 index 0000000..5f9f473 --- /dev/null +++ b/kubernetes/arc1/apps/machine-learning/boltz/README.md @@ -0,0 +1,5 @@ + + +We have containerized [Boltz-1](https://github.com/jwohlwend/boltz) for testing and development purposes. + +Boltz-1 is an open-source model that predicts the 3D structure of proteins, RNA, DNA and small molecules; it handles modified residues, covalent ligands and glycans, and can also condition the generation on pocket residues. diff --git a/kubernetes/arc1/apps/machine-learning/boltz/app/helmrelease.yaml b/kubernetes/arc1/apps/machine-learning/boltz/app/helmrelease.yaml new file mode 100644 index 0000000..0e7a276 --- /dev/null +++ b/kubernetes/arc1/apps/machine-learning/boltz/app/helmrelease.yaml @@ -0,0 +1,92 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/bjw-s/helm-charts/main/charts/other/app-template/schemas/helmrelease-helm-v2.schema.json +# TODO: Finish this +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: &app boltz +spec: + interval: 30m + chart: + spec: + chart: app-template + version: 3.5.1 + sourceRef: + kind: HelmRepository + name: bjw-s + namespace: flux-system + install: + remediation: + retries: 3 + upgrade: + cleanupOnFail: true + remediation: + strategy: rollback + retries: 3 + values: + controllers: + boltz: + type: deployment + annotations: + reloader.stakater.com/auto: "true" + pod: + runtimeClassName: nvidia + terminationGracePeriodSeconds: 1 + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - 
matchExpressions: + - key: nvidia.com/gpu.present + operator: In + values: + - "true" + containers: + app: + image: + repository: ghcr.io/rarecompute/boltz-docker-ci + # Base image + #tag: main@sha256:e2258b1a2ade6c5e92ee0b5e314fc8a073ca15d06bc4b6b6f32b0699ff958b96 + + # CUDA 12.1 + #tag: cuda12-1@sha256:c4da98989ade648a5158b01acc0d86648410ba9e73bfc281564aa8e000bd67ed + + # CUDA 12.4 + tag: cuda12-4@sha256:342069ec496df0027c355a5f7418732b8b0e86931e7ed2dfc3bc679b73b43e0b + command: ["tail", "-f", "/dev/null"] + env: + TZ: ${TIMEZONE} + NVIDIA_VISIBLE_DEVICES: all + NVIDIA_DRIVER_CAPABILITIES: all + securityContext: + capabilities: {drop: ["ALL"]} # nested map; a dotted "capabilities.drop" key is not a securityContext field + resources: + requests: + cpu: 200m + memory: 8Gi + limits: + cpu: 16 + memory: 48Gi + nvidia.com/gpu: 4 + service: + app: + controller: *app + annotations: + ports: + http: + port: &port 80 + persistence: + # app: + # storageClass: local-nvme + # accessMode: ReadWriteOnce + # size: 1Gi + # globalMounts: + # - path: /app + workspace: + existingClaim: qlora-workspace + globalMounts: + - path: /workspace + tmp: + type: emptyDir + globalMounts: + - path: /tmp diff --git a/kubernetes/arc1/apps/machine-learning/boltz/app/kustomization.yaml b/kubernetes/arc1/apps/machine-learning/boltz/app/kustomization.yaml new file mode 100644 index 0000000..17cbc72 --- /dev/null +++ b/kubernetes/arc1/apps/machine-learning/boltz/app/kustomization.yaml @@ -0,0 +1,6 @@ +--- +# yaml-language-server: $schema=https://json.schemastore.org/kustomization +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ./helmrelease.yaml diff --git a/kubernetes/arc1/apps/machine-learning/boltz/ks.yaml b/kubernetes/arc1/apps/machine-learning/boltz/ks.yaml new file mode 100644 index 0000000..901d624 --- /dev/null +++ b/kubernetes/arc1/apps/machine-learning/boltz/ks.yaml @@ -0,0 +1,24 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json +apiVersion: 
kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: &app boltz + namespace: flux-system +spec: + targetNamespace: machine-learning + commonMetadata: + labels: + app.kubernetes.io/name: *app + path: ./kubernetes/arc1/apps/machine-learning/boltz/app + prune: true + sourceRef: + kind: GitRepository + name: k8s-gitops + wait: false + interval: 30m + retryInterval: 1m + timeout: 5m + postBuild: + substitute: + APP: *app diff --git a/kubernetes/arc1/apps/machine-learning/colabfold/app/helmrelease.yaml b/kubernetes/arc1/apps/machine-learning/colabfold/app/helmrelease.yaml index b198781..4b7f857 100644 --- a/kubernetes/arc1/apps/machine-learning/colabfold/app/helmrelease.yaml +++ b/kubernetes/arc1/apps/machine-learning/colabfold/app/helmrelease.yaml @@ -26,7 +26,7 @@ spec: retries: 3 values: controllers: - ollama: + colabfold: type: deployment annotations: reloader.stakater.com/auto: "true" @@ -56,11 +56,11 @@ spec: requests: cpu: 200m memory: 4Gi - gpu.intel.com/i915: "4" + nvidia.com/gpu: 4 # extended resources cannot be overcommitted: request must equal the limit limits: cpu: 32000m memory: 64Gi - gpu.intel.com/i915: "4" + nvidia.com/gpu: 4 service: app: controller: *app @@ -88,6 +88,7 @@ spec: - secretName: colabfold-tls hosts: [*host] persistence: + # TODO: Replace with existing PVC data: storageClass: local-nvme accessMode: ReadWriteMany diff --git a/kubernetes/arc1/apps/machine-learning/kustomization.yaml b/kubernetes/arc1/apps/machine-learning/kustomization.yaml index b4cfb5f..52d428e 100644 --- a/kubernetes/arc1/apps/machine-learning/kustomization.yaml +++ b/kubernetes/arc1/apps/machine-learning/kustomization.yaml @@ -6,4 +6,5 @@ resources: - ./namespace.yaml - ./qlora/ks.yaml - ./mmseqs2/ks.yaml + - ./boltz/ks.yaml #- ./jupyterhub/ks.yaml diff --git a/kubernetes/arc1/apps/machine-learning/mmseqs2/README.md b/kubernetes/arc1/apps/machine-learning/mmseqs2/README.md new file mode 100644 index 0000000..f0d56fa --- /dev/null +++ b/kubernetes/arc1/apps/machine-learning/mmseqs2/README.md @@ -0,0 +1,3 @@ + + +MMseqs2 
(Many-against-Many sequence searching) is a software suite to search and cluster huge protein and nucleotide sequence sets. MMseqs2 is free and open source software implemented in C++ for Linux, MacOS, and (as beta version, via cygwin) Windows. The software is designed to run on multiple cores and servers and exhibits very good scalability. MMseqs2 can run 10000 times faster than BLAST. At 100 times its speed it achieves almost the same sensitivity. It can perform profile searches with the same sensitivity as PSI-BLAST at over 400 times its speed. diff --git a/kubernetes/arc1/apps/machine-learning/mmseqs2/app/helmrelease.yaml b/kubernetes/arc1/apps/machine-learning/mmseqs2/app/helmrelease.yaml index a480dbc..c0a4cfc 100644 --- a/kubernetes/arc1/apps/machine-learning/mmseqs2/app/helmrelease.yaml +++ b/kubernetes/arc1/apps/machine-learning/mmseqs2/app/helmrelease.yaml @@ -60,7 +60,7 @@ spec: resources: requests: cpu: 200m - memory: 8Gi + memory: 4Gi limits: cpu: 16 memory: 32Gi diff --git a/kubernetes/arc1/apps/machine-learning/qlora/README.md b/kubernetes/arc1/apps/machine-learning/qlora/README.md new file mode 100644 index 0000000..bd3ba0c --- /dev/null +++ b/kubernetes/arc1/apps/machine-learning/qlora/README.md @@ -0,0 +1,7 @@ +We have containerized [QLoRA](https://arxiv.org/abs/2305.14314) for testing and development purposes + +From [artidoro/qlora](https://github.com/artidoro/qlora) + +> QLoRA uses bitsandbytes for quantization and is integrated with Hugging Face's PEFT and transformers libraries. QLoRA was developed by members of the University of Washington's UW NLP group. +> +> We present QLoRA, an efficient finetuning approach that reduces memory usage enough to finetune a 65B parameter model on a single 48GB GPU while preserving full 16-bit finetuning task performance. QLoRA backpropagates gradients through a frozen, 4-bit quantized pretrained language model into Low Rank Adapters (LoRA). 
Our best model family, which we name Guanaco, outperforms all previous openly released models on the Vicuna benchmark, reaching 99.3% of the performance level of ChatGPT while only requiring 24 hours of finetuning on a single GPU. QLoRA introduces a number of innovations to save memory without sacrificing performance: (a) 4-bit NormalFloat (NF4), a new data type that is information theoretically optimal for normally distributed weights (b) Double Quantization to reduce the average memory footprint by quantizing the quantization constants, and (c) Paged Optimizers to manage memory spikes. We use QLoRA to finetune more than 1,000 models, providing a detailed analysis of instruction following and chatbot performance across 8 instruction datasets, multiple model types (LLaMA, T5), and model scales that would be infeasible to run with regular finetuning (e.g. 33B and 65B parameter models). Our results show that QLoRA finetuning on a small high-quality dataset leads to state-of-the-art results, even when using smaller models than the previous SoTA. We provide a detailed analysis of chatbot performance based on both human and GPT-4 evaluations showing that GPT-4 evaluations are a cheap and reasonable alternative to human evaluation. Furthermore, we find that current chatbot benchmarks are not trustworthy to accurately evaluate the performance levels of chatbots. We release all of our models and code, including CUDA kernels for 4-bit training. 
diff --git a/kubernetes/arc1/flux/repositories/helm/actions-runner-controller.yaml b/kubernetes/arc1/flux/repositories/helm/actions-runner-controller.yaml new file mode 100644 index 0000000..54fa67b --- /dev/null +++ b/kubernetes/arc1/flux/repositories/helm/actions-runner-controller.yaml @@ -0,0 +1,11 @@ +--- +# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/source.toolkit.fluxcd.io/helmrepository_v1.json +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: actions-runner-controller # sourceRef of the gha-runner-scale-set and gha-runner-scale-set-controller HelmReleases + namespace: flux-system +spec: + type: oci # charts are pulled as OCI artifacts from the registry URL below + interval: 5m + url: oci://ghcr.io/actions/actions-runner-controller-charts diff --git a/kubernetes/arc1/flux/repositories/helm/kustomization.yaml b/kubernetes/arc1/flux/repositories/helm/kustomization.yaml index d0cc8a2..bfa795e 100644 --- a/kubernetes/arc1/flux/repositories/helm/kustomization.yaml +++ b/kubernetes/arc1/flux/repositories/helm/kustomization.yaml @@ -24,3 +24,4 @@ resources: - ./stakater.yaml - ./traefik.yaml - ./teleport.yaml + - ./actions-runner-controller.yaml