Skip to content

Commit

Permalink
--wip-- [skipci]
Browse files Browse the repository at this point in the history
  • Loading branch information
Liana64 committed Nov 19, 2024
1 parent f659fad commit 41e0cd3
Show file tree
Hide file tree
Showing 26 changed files with 419 additions and 22 deletions.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,4 @@ spec:
resources:
- name: nvidia.com/gpu
replicas: 7
default: "single"
default: "default"
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ spec:
retries: 3
values:
controllers:
ollama:
colabfold:
type: deployment
annotations:
reloader.stakater.com/auto: "true"
Expand Down Expand Up @@ -56,11 +56,11 @@ spec:
requests:
cpu: 200m
memory: 4Gi
gpu.intel.com/i915: "4"
nvidia.com/gpu: 2
limits:
cpu: 32000m
memory: 64Gi
gpu.intel.com/i915: "4"
nvidia.com/gpu: 4
service:
app:
controller: *app
Expand Down Expand Up @@ -88,6 +88,7 @@ spec:
- secretName: colabfold-tls
hosts: [*host]
persistence:
# TODO: Replace with existing PVC
data:
storageClass: local-nvme
accessMode: ReadWriteMany
Expand Down
1 change: 1 addition & 0 deletions kubernetes/arc1/apps/machine-learning/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ./namespace.yaml
- ./qlora/ks.yaml
#- ./ollama/ks.yaml
#- ./jupyterhub/ks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: &app ollama
name: &app mmseqs2
spec:
interval: 30m
chart:
Expand All @@ -26,7 +26,7 @@ spec:
retries: 3
values:
controllers:
ollama:
mmseqs2:
type: deployment
annotations:
reloader.stakater.com/auto: "true"
Expand All @@ -41,11 +41,11 @@ spec:
requests:
cpu: 200m
memory: 4Gi
# gpu.intel.com/i915: "1"
nvidia.com/gpu: 2
limits:
cpu: 8000m
memory: 8Gi
# gpu.intel.com/i915: "1"
cpu: 32000m
memory: 64Gi
nvidia.com/gpu: 4
service:
app:
controller: *app
Expand Down Expand Up @@ -74,6 +74,7 @@ spec:
- secretName: mmseqs2-tls
hosts: [*host]
persistence:
# TODO: Replace with existing PVC
data:
storageClass: local-nvme
accessMode: ReadWriteMany
Expand Down
102 changes: 102 additions & 0 deletions kubernetes/arc1/apps/machine-learning/qlora/app/helmrelease.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/bjw-s/helm-charts/main/charts/other/app-template/schemas/helmrelease-helm-v2.schema.json
# TODO: Finish this
# Flux HelmRelease that deploys the qlora container through the bjw-s
# app-template chart as a single GPU-backed Deployment.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: &app qlora
spec:
  interval: 30m
  chart:
    spec:
      chart: app-template
      version: 3.5.1
      sourceRef:
        kind: HelmRepository
        name: bjw-s
        namespace: flux-system
  install:
    remediation:
      retries: 3
  upgrade:
    cleanupOnFail: true
    remediation:
      strategy: rollback
      retries: 3
  values:
    controllers:
      qlora:
        type: deployment
        annotations:
          reloader.stakater.com/auto: "true"
        pod:
          runtimeClassName: nvidia
          terminationGracePeriodSeconds: 1
          # Only schedule onto nodes labelled nvidia.com/gpu.present=true.
          affinity:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                  - matchExpressions:
                      - key: nvidia.com/gpu.present
                        operator: In
                        values:
                          - "true"
        containers:
          app:
            image:
              repository: ghcr.io/rarecompute/qlora-docker
              tag: main@sha256:e56350596e17af5198bfc848b0de3a5a11cb98d97e1e02dbb322467269342541
            env:
              # NOTE(review): ${TIMEZONE} must be supplied by a Flux postBuild
              # substitution defined outside this file - confirm it is set.
              TZ: ${TIMEZONE}
              # GITHUB_REPO: https://github.com/RareCompute/example-models
              NVIDIA_VISIBLE_DEVICES: all
              NVIDIA_DRIVER_CAPABILITIES: all
            securityContext:
              # `capabilities` is an object with a `drop` list; a literal
              # "capabilities.drop" key is not a SecurityContext field.
              capabilities:
                drop: ["ALL"]
            resources:
              requests:
                cpu: 200m
                memory: 8Gi
              limits:
                cpu: 16
                memory: 32Gi
                nvidia.com/gpu: 4
            # probes:
            #   liveness:
            #     enabled: true
            #   readiness:
            #     enabled: true
            #   startup:
            #     enabled: false
            #     spec:
            #       failureThreshold: 30
            #       periodSeconds: 5
    service:
      app:
        controller: *app
        annotations:
          teleport.dev/name: *app
        labels:
          teleport: enabled
        ports:
          http:
            port: &port 80
    persistence:
      app:
        storageClass: local-nvme
        accessMode: ReadWriteOnce
        size: 2Gi
        globalMounts:
          - path: /app
      workspace:
        storageClass: local-nvme
        # TODO: OpenEBS only supports ReadWriteOnce
        accessMode: ReadWriteOnce
        size: 2048Gi
        retain: true
        globalMounts:
          - path: /workspace
      tmp:
        type: emptyDir
        globalMounts:
          - path: /tmp
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Kustomize manifest whose only resource is the HelmRelease in this directory.
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1
resources:
  - ./helmrelease.yaml
24 changes: 24 additions & 0 deletions kubernetes/arc1/apps/machine-learning/qlora/ks.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json
# Flux Kustomization for qlora: applies the manifests under the qlora/app path
# of the k8s-gitops GitRepository into the machine-learning namespace.
kind: Kustomization
apiVersion: kustomize.toolkit.fluxcd.io/v1
metadata:
  # The anchor is reused below for the name label and the APP substitution.
  name: &app qlora
  namespace: flux-system
spec:
  sourceRef:
    kind: GitRepository
    name: k8s-gitops
  path: ./kubernetes/arc1/apps/machine-learning/qlora/app
  targetNamespace: machine-learning
  commonMetadata:
    labels:
      app.kubernetes.io/name: *app
  prune: true
  wait: false
  interval: 30m
  retryInterval: 1m
  timeout: 5m
  postBuild:
    substitute:
      APP: *app
2 changes: 1 addition & 1 deletion kubernetes/arc1/apps/security/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ./namespace.yaml
#- ./teleport/ks.yaml
- ./teleport/ks.yaml
14 changes: 14 additions & 0 deletions kubernetes/arc1/apps/security/teleport/app/crt.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
---
# cert-manager Certificate for Teleport: requests a certificate for the
# Teleport hostname and its wildcard subdomains from the
# letsencrypt-production ClusterIssuer and stores it in the teleport-tls Secret.
kind: Certificate
apiVersion: cert-manager.io/v1
metadata:
  name: teleport-tls
  namespace: security
spec:
  secretName: teleport-tls
  dnsNames:
    - "teleport.${SECRET_EXTERNAL_DOMAIN}"
    - "*.teleport.${SECRET_EXTERNAL_DOMAIN}"
  issuerRef:
    kind: ClusterIssuer
    name: letsencrypt-production
27 changes: 19 additions & 8 deletions kubernetes/arc1/apps/security/teleport/app/helmrelease.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ spec:
chart:
spec:
chart: teleport-cluster
version: 16.4.6
version: 17.0.1
sourceRef:
kind: HelmRepository
name: teleport
Expand All @@ -19,12 +19,11 @@ spec:
upgrade:
cleanupOnFail: true
remediation:
strategy: rollback
retries: 3
values:
clusterName: teleport.${SECRET_EXTERNAL_DOMAIN}
chartMode: standalone
kubeClusterName: ARC1
kubeClusterName: arc1
validateConfigOnDeploy: true
enterprise: false
auth:
Expand Down Expand Up @@ -60,16 +59,28 @@ spec:
cert-manager.io/cluster-issuer: "letsencrypt-production"
gethomepage.dev/enabled: "true"
gethomepage.dev/group: Services
gethomepage.dev/name: *app
gethomepage.dev/icon: teleport.png
tls:
existingSecretName: "teleport-cluster-tls"
gethomepage.dev/name: Teleport
gethomepage.dev/description: Teleport dashboard
gethomepage.dev/icon: teleport
# external-dns.alpha.kubernetes.io/target: "teleport.${SECRET_EXTERNAL_DOMAIN}"
# external-dns.alpha.kubernetes.io/exclude-unifi: "true"
# external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
traefik.ingress.kubernetes.io/router.middlewares: "networking-traefik-middleware-chain-no-auth@kubernetescrd"
service:
traefik.ingress.kubernetes.io/service.serversscheme: https
highAvailability:
certManager:
enabled: true
issuerName: "letsencrypt-production"
# tls:
# existingSecretName: teleport-tls
authentication:
type: local
proxyListenerMode: multiplex
persistence:
enabled: true
storageClassName: local-nvme
existingClaimName: teleport
serviceAccount:
create: true
rbac:
Expand Down
2 changes: 2 additions & 0 deletions kubernetes/arc1/apps/security/teleport/app/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ./pvc.yaml
- ./crt.yaml
- ./helmrelease.yaml
12 changes: 12 additions & 0 deletions kubernetes/arc1/apps/security/teleport/app/pvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
# PersistentVolumeClaim named teleport in the security namespace: 8Gi on the
# local-nvme storage class, ReadWriteOnce.
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: teleport
  namespace: security
spec:
  storageClassName: local-nvme
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 8Gi
18 changes: 18 additions & 0 deletions kubernetes/arc1/apps/security/teleport/app/resources/token.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Teleport join token that uses the kubernetes join method and admits the
# teleport-kube-agent service account from the security namespace.
kind: token
version: v2
metadata:
  name: kubernetes-token
  # Tokens expire after 30 minutes by default, so pin a far-future expiry.
  expires: "2050-01-01T00:00:00Z"
spec:
  # Keep this to the smallest set of system roles that is required.
  roles:
    - kube
    - app
    - discovery
    - node
    - windowsdesktop

  # Join method permitted for this token.
  join_method: kubernetes

  kubernetes:
    type: in_cluster
    allow:
      # Service account names use the form "namespace:serviceaccountname".
      - service_account: "security:teleport-kube-agent"
Loading

0 comments on commit 41e0cd3

Please sign in to comment.