Skip to content

Commit

Permalink
--wip-- [skipci]
Browse files Browse the repository at this point in the history
  • Loading branch information
Liana64 committed Nov 19, 2024
1 parent f659fad commit 92a563d
Show file tree
Hide file tree
Showing 37 changed files with 790 additions and 22 deletions.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,4 @@ spec:
resources:
- name: nvidia.com/gpu
replicas: 7
default: "single"
default: "default"
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ spec:
retries: 3
values:
controllers:
ollama:
colabfold:
type: deployment
annotations:
reloader.stakater.com/auto: "true"
Expand Down Expand Up @@ -56,11 +56,11 @@ spec:
requests:
cpu: 200m
memory: 4Gi
gpu.intel.com/i915: "4"
nvidia.com/gpu: 2
limits:
cpu: 32000m
memory: 64Gi
gpu.intel.com/i915: "4"
nvidia.com/gpu: 4
service:
app:
controller: *app
Expand Down Expand Up @@ -88,6 +88,7 @@ spec:
- secretName: colabfold-tls
hosts: [*host]
persistence:
# TODO: Replace with existing PVC
data:
storageClass: local-nvme
accessMode: ReadWriteMany
Expand Down
1 change: 1 addition & 0 deletions kubernetes/arc1/apps/machine-learning/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ./namespace.yaml
- ./qlora/ks.yaml
#- ./ollama/ks.yaml
#- ./jupyterhub/ks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: &app ollama
name: &app mmseqs2
spec:
interval: 30m
chart:
Expand All @@ -26,7 +26,7 @@ spec:
retries: 3
values:
controllers:
ollama:
mmseqs2:
type: deployment
annotations:
reloader.stakater.com/auto: "true"
Expand All @@ -41,11 +41,11 @@ spec:
requests:
cpu: 200m
memory: 4Gi
# gpu.intel.com/i915: "1"
nvidia.com/gpu: 2
limits:
cpu: 8000m
memory: 8Gi
# gpu.intel.com/i915: "1"
cpu: 32000m
memory: 64Gi
nvidia.com/gpu: 4
service:
app:
controller: *app
Expand Down Expand Up @@ -74,6 +74,7 @@ spec:
- secretName: mmseqs2-tls
hosts: [*host]
persistence:
# TODO: Replace with existing PVC
data:
storageClass: local-nvme
accessMode: ReadWriteMany
Expand Down
102 changes: 102 additions & 0 deletions kubernetes/arc1/apps/machine-learning/qlora/app/helmrelease.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/bjw-s/helm-charts/main/charts/other/app-template/schemas/helmrelease-helm-v2.schema.json
# TODO: Finish this
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: &app qlora
spec:
interval: 30m
chart:
spec:
chart: app-template
version: 3.5.1
sourceRef:
kind: HelmRepository
name: bjw-s
namespace: flux-system
install:
remediation:
retries: 3
upgrade:
cleanupOnFail: true
remediation:
strategy: rollback
retries: 3
values:
controllers:
qlora:
type: deployment
annotations:
reloader.stakater.com/auto: "true"
pod:
runtimeClassName: nvidia
terminationGracePeriodSeconds: 1
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: nvidia.com/gpu.present
operator: In
values:
- "true"
containers:
app:
image:
repository: ghcr.io/rarecompute/qlora-docker
tag: main@sha256:e56350596e17af5198bfc848b0de3a5a11cb98d97e1e02dbb322467269342541
env:
TZ: ${TIMEZONE}
# GITHUB_REPO: https://github.com/RareCompute/example-models
NVIDIA_VISIBLE_DEVICES: all
NVIDIA_DRIVER_CAPABILITIES: all
securityContext:
capabilities.drop: ["ALL"]
resources:
requests:
cpu: 200m
memory: 8Gi
limits:
cpu: 16
memory: 32Gi
nvidia.com/gpu: 4
# probes:
# liveness:
# enabled: true
# readiness:
# enabled: true
# startup:
# enabled: false
# spec:
# failureThreshold: 30
# periodSeconds: 5
service:
app:
controller: *app
annotations:
teleport.dev/name: *app
labels:
teleport: enabled
ports:
http:
port: &port 80
persistence:
app:
storageClass: local-nvme
accessMode: ReadWriteOnce
size: 2Gi
globalMounts:
- path: /app
workspace:
storageClass: local-nvme
# TODO: OpenEBS only support ReadWriteOnce
accessMode: ReadWriteOnce
size: 2048Gi
retain: true
globalMounts:
- path: /workspace
tmp:
type: emptyDir
globalMounts:
- path: /tmp
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ./helmrelease.yaml
24 changes: 24 additions & 0 deletions kubernetes/arc1/apps/machine-learning/qlora/ks.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: &app qlora
namespace: flux-system
spec:
targetNamespace: machine-learning
commonMetadata:
labels:
app.kubernetes.io/name: *app
path: ./kubernetes/arc1/apps/machine-learning/qlora/app
prune: true
sourceRef:
kind: GitRepository
name: k8s-gitops
wait: false
interval: 30m
retryInterval: 1m
timeout: 5m
postBuild:
substitute:
APP: *app
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: &app external-dns
spec:
interval: 30m
chart:
spec:
chart: external-dns
version: 1.15.0
sourceRef:
kind: HelmRepository
name: external-dns
namespace: flux-system
install:
crds: CreateReplace
remediation:
retries: 3
upgrade:
cleanupOnFail: true
crds: CreateReplace
remediation:
strategy: rollback
retries: 3
values:
fullnameOverride: *app
provider: cloudflare
env:
- name: CF_API_TOKEN
valueFrom:
secretKeyRef:
name: external-dns-secret
key: api-token
extraArgs:
- --ingress-class=external
- --cloudflare-proxied
- --crd-source-apiversion=externaldns.k8s.io/v1alpha1
- --crd-source-kind=DNSEndpoint
policy: sync
sources: ["crd", "ingress"]
txtPrefix: k8s.
txtOwnerId: default
domainFilters: ["${SECRET_EXTERNAL_DOMAIN}"]
serviceMonitor:
enabled: true
podAnnotations:
secret.reloader.stakater.com/reload: external-dns-secret
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ./secret.sops.yaml
- ./helmrelease.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
apiVersion: v1
kind: Secret
metadata:
name: external-dns-secret
stringData:
api-token: ENC[AES256_GCM,data:JTphCAIBIt4+al2z37lUck1Zq7hlW2W5V9/KASVQae5jwkxI03OVmQ==,iv:ZxU2FjPgVtVpP8/xmxOun3jjoxeJ3gp2vIF3ml+A04c=,tag:GjT20j6rAp0sDhaq2WAkfw==,type:str]
sops:
kms: []
gcp_kms: []
azure_kv: []
hc_vault: []
age:
- recipient: age1ey3reuxyffqynll464r4q3tlhq5v73nxesyktr44lfez8jzxm94s0644n7
enc: |
-----BEGIN AGE ENCRYPTED FILE-----
YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBrNVdPRFFxekJvQlprM0ps
QXBNenFZR2k3bUt3dFloRkNUbFFDWldYUlc0CnpOMVg3cytnblBIOUptdHRsTGFa
Qk4vay9lWjAyR2gwTmUxQ0U4QVVZc1kKLS0tIHRUWGYxYSttaTMrZGFUeloxNDh5
T2pIZXdHNEt5azJPU2lpcVpydjlIMVUKQm2uTdBw+T1RMWS1XTZ1TACEujS7nr8e
VWTpDUY+LRU62sJx30evTAUitn+qTxF2jhMudLC7YZBwrQD+pPTISA==
-----END AGE ENCRYPTED FILE-----
lastmodified: "2024-11-19T02:05:15Z"
mac: ENC[AES256_GCM,data:xTYv3RcaPMbV9gwhrtlYXVQ++y5ozSlzorHnloOc+40kI7dc5ue9cmQufthIhsoL6ebcbDz6/cN1/rjMcBrlPxBop0nGkvyyjL/PybEEwrHwa82/fEuGkdJcJKZF76FYdOSXeJY41m7QOJlh31xOIkRmZUAZg+2N94VCeyPrJ5U=,iv:wLBRew0er+yA2EWqnnipPdUa297Ph2O4WBchuizuOVM=,tag:8Ni9BfQvcoSQ1PwJrlOlBw==,type:str]
pgp: []
encrypted_regex: ^(data|stringData)$
version: 3.9.1
20 changes: 20 additions & 0 deletions kubernetes/arc1/apps/network-system/external-dns/ks.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
---
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: &app external-dns
namespace: flux-system
spec:
targetNamespace: network-system
commonMetadata:
labels:
app.kubernetes.io/name: *app
path: ./kubernetes/arc1/apps/network-system/external-dns/app
prune: true
sourceRef:
kind: GitRepository
name: k8s-gitops
wait: true
interval: 30m
retryInterval: 1m
timeout: 5m
1 change: 1 addition & 0 deletions kubernetes/arc1/apps/network-system/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ kind: Kustomization
resources:
- ./namespace.yaml
- ./echo-server/ks.yaml
- ./external-dns/ks.yaml
- ./k8tz/ks.yaml
Loading

0 comments on commit 92a563d

Please sign in to comment.