Skip to content

Commit

Permalink
--wip-- [skipci]
Browse files Browse the repository at this point in the history
  • Loading branch information
Liana64 committed Nov 18, 2024
1 parent f659fad commit 32b0b85
Show file tree
Hide file tree
Showing 24 changed files with 391 additions and 25 deletions.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,4 @@ spec:
resources:
- name: nvidia.com/gpu
replicas: 7
default: "single"
default: "default"
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ spec:
retries: 3
values:
controllers:
ollama:
colabfold:
type: deployment
annotations:
reloader.stakater.com/auto: "true"
Expand Down Expand Up @@ -56,11 +56,11 @@ spec:
requests:
cpu: 200m
memory: 4Gi
gpu.intel.com/i915: "4"
nvidia.com/gpu: 2
limits:
cpu: 32000m
memory: 64Gi
gpu.intel.com/i915: "4"
nvidia.com/gpu: 4
service:
app:
controller: *app
Expand Down Expand Up @@ -88,6 +88,7 @@ spec:
- secretName: colabfold-tls
hosts: [*host]
persistence:
# TODO: Replace with existing PVC
data:
storageClass: local-nvme
accessMode: ReadWriteMany
Expand Down
1 change: 1 addition & 0 deletions kubernetes/arc1/apps/machine-learning/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ./namespace.yaml
- ./qlora/ks.yaml
#- ./ollama/ks.yaml
#- ./jupyterhub/ks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: &app ollama
name: &app mmseqs2
spec:
interval: 30m
chart:
Expand All @@ -26,7 +26,7 @@ spec:
retries: 3
values:
controllers:
ollama:
mmseqs2:
type: deployment
annotations:
reloader.stakater.com/auto: "true"
Expand All @@ -41,11 +41,11 @@ spec:
requests:
cpu: 200m
memory: 4Gi
# gpu.intel.com/i915: "1"
nvidia.com/gpu: 2
limits:
cpu: 8000m
memory: 8Gi
# gpu.intel.com/i915: "1"
cpu: 32000m
memory: 64Gi
nvidia.com/gpu: 4
service:
app:
controller: *app
Expand Down Expand Up @@ -74,6 +74,7 @@ spec:
- secretName: mmseqs2-tls
hosts: [*host]
persistence:
# TODO: Replace with existing PVC
data:
storageClass: local-nvme
accessMode: ReadWriteMany
Expand Down
105 changes: 105 additions & 0 deletions kubernetes/arc1/apps/machine-learning/qlora/app/helmrelease.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/bjw-s/helm-charts/main/charts/other/app-template/schemas/helmrelease-helm-v2.schema.json
# TODO: Finish this
#
# Flux HelmRelease for the qlora workload, rendered from the bjw-s
# app-template chart. One controller ("qlora") runs a single "app" container
# that is limited to four NVIDIA GPUs.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  # The anchor is reused below for the service's controller reference and
  # its Teleport name annotation.
  name: &app qlora
spec:
  interval: 30m
  chart:
    spec:
      chart: app-template
      version: 3.5.1
      sourceRef:
        kind: HelmRepository
        name: bjw-s
        namespace: flux-system
  install:
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
      strategy: rollback
    cleanupOnFail: true
  values:
    controllers:
      qlora:
        type: deployment
        annotations:
          reloader.stakater.com/auto: "true"
        pod:
          runtimeClassName: nvidia
          terminationGracePeriodSeconds: 1
          securityContext:
            fsGroup: 1000
          # Hard requirement: schedule only onto nodes labelled
          # nvidia.com/gpu.present=true.
          affinity:
            nodeAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                nodeSelectorTerms:
                  - matchExpressions:
                      - key: nvidia.com/gpu.present
                        operator: In
                        values: ["true"]
        containers:
          app:
            image:
              # Tag is pinned by digest.
              repository: ghcr.io/rarecompute/qlora-docker
              tag: main@sha256:65b3f17822d21d8158b70c7762968b2e0e9db1814428e0d425cfa217527526eb
            env:
              # ${TIMEZONE} is a placeholder; presumably filled in by Flux
              # variable substitution — confirm where it is defined.
              TZ: ${TIMEZONE}
              NVIDIA_VISIBLE_DEVICES: all
              NVIDIA_DRIVER_CAPABILITIES: all
            resources:
              requests:
                cpu: 200m
                memory: 8Gi
              # The GPU count is declared under limits only.
              limits:
                cpu: 32000m
                memory: 32Gi
                nvidia.com/gpu: 4
            # Probe configuration is not active yet.
            # probes:
            #   liveness:
            #     enabled: true
            #   readiness:
            #     enabled: true
            #   startup:
            #     enabled: false
            #     spec:
            #       failureThreshold: 30
            #       periodSeconds: 5
    service:
      app:
        controller: *app
        annotations:
          teleport.dev/name: *app
        labels:
          teleport: enabled
        # NOTE(review): no ports are defined while the block below stays
        # commented out — confirm the chart accepts a port-less service.
        # ports:
        #   http:
        #     port: &port 80
    persistence:
      # Small volume mounted at /app in every container.
      app:
        storageClass: local-nvme
        accessMode: ReadWriteOnce
        size: 2Gi
        globalMounts:
          - path: /app
      # Large working volume mounted at /workspace.
      workspace:
        storageClass: local-nvme
        accessMode: ReadWriteOnce
        size: 2048Gi
        # presumably keeps the claim when the release is removed — verify
        # against the chart documentation.
        retain: true
        globalMounts:
          - path: /workspace
      # Scratch space at /tmp backed by an emptyDir.
      tmp:
        type: emptyDir
        globalMounts:
          - path: /tmp
      # Alternative /workspace definition that reuses an existing claim
      # (inactive).
      # workspace:
      #   enabled: true
      #   existingClaim: machine-learning-workspace-pvc
      #   globalMounts:
      #     - path: /workspace
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Kustomize entry point for this directory; its only resource is the
# HelmRelease manifest that sits next to it.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ./helmrelease.yaml
24 changes: 24 additions & 0 deletions kubernetes/arc1/apps/machine-learning/qlora/ks.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json
# Flux Kustomization that applies the qlora app directory from the
# k8s-gitops GitRepository into the machine-learning namespace.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  # The anchor is reused for the common label and the APP substitution below.
  name: &app qlora
  namespace: flux-system
spec:
  # What to apply and where it comes from.
  sourceRef:
    kind: GitRepository
    name: k8s-gitops
  path: ./kubernetes/arc1/apps/machine-learning/qlora/app
  targetNamespace: machine-learning
  commonMetadata:
    labels:
      app.kubernetes.io/name: *app
  postBuild:
    substitute:
      APP: *app
  # Reconciliation behaviour.
  prune: true
  wait: false
  interval: 30m
  retryInterval: 1m
  timeout: 5m
2 changes: 1 addition & 1 deletion kubernetes/arc1/apps/security/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ./namespace.yaml
#- ./teleport/ks.yaml
- ./teleport/ks.yaml
25 changes: 14 additions & 11 deletions kubernetes/arc1/apps/security/teleport/app/helmrelease.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ spec:
chart:
spec:
chart: teleport-cluster
version: 16.4.6
version: 17.0.1
sourceRef:
kind: HelmRepository
name: teleport
Expand All @@ -19,12 +19,11 @@ spec:
upgrade:
cleanupOnFail: true
remediation:
strategy: rollback
retries: 3
values:
clusterName: teleport.${SECRET_EXTERNAL_DOMAIN}
clusterName: teleport.${SECRET_INTERNAL_DOMAIN}
chartMode: standalone
kubeClusterName: ARC1
kubeClusterName: arc1
validateConfigOnDeploy: true
enterprise: false
auth:
Expand Down Expand Up @@ -58,18 +57,22 @@ spec:
annotations:
ingress:
cert-manager.io/cluster-issuer: "letsencrypt-production"
gethomepage.dev/enabled: "true"
gethomepage.dev/group: Services
gethomepage.dev/name: *app
gethomepage.dev/icon: teleport.png
tls:
existingSecretName: "teleport-cluster-tls"
external-dns.alpha.kubernetes.io/target: "teleport.${SECRET_EXTERNAL_DOMAIN}"
external-dns.alpha.kubernetes.io/exclude-unifi: "true"
external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
traefik.ingress.kubernetes.io/router.middlewares: "networking-traefik-middleware-chain-no-auth@kubernetescrd"
service:
traefik.ingress.kubernetes.io/service.serversscheme: https
highAvailability:
tls:
existingSecretName: "teleport-tls"
authentication:
type: local
proxyListenerMode: multiplex
persistence:
enabled: true
storageClassName: local-nvme
existingClaimName: teleport
serviceAccount:
create: true
rbac:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ./pvc.yaml
- ./helmrelease.yaml
12 changes: 12 additions & 0 deletions kubernetes/arc1/apps/security/teleport/app/pvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
# PersistentVolumeClaim named "teleport" in the security namespace:
# 8Gi from the local-nvme storage class, ReadWriteOnce access.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: teleport
  namespace: security
spec:
  storageClassName: local-nvme
  accessModes:
    - "ReadWriteOnce"
  resources:
    requests:
      storage: 8Gi
18 changes: 18 additions & 0 deletions kubernetes/arc1/apps/security/teleport/app/resources/token.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Teleport join token resource ("kubernetes-token") using the kubernetes
# join method in in_cluster mode.
kind: token
version: v2
metadata:
  name: kubernetes-token
  # Tokens default to a 30-minute lifetime, so a far-future expiry is set.
  expires: "2050-01-01T00:00:00Z"
spec:
  # System roles granted to joiners; keep this list as small as possible.
  roles:
    - kube
    - app
    - discovery
    - node
    - windowsdesktop

  # Join method permitted for this token.
  join_method: kubernetes

  kubernetes:
    type: in_cluster
    allow:
      # Entries use the "namespace:serviceaccountname" format.
      - service_account: "security:teleport-kube-agent"
8 changes: 8 additions & 0 deletions kubernetes/arc1/apps/storage/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Kustomize entry point for the storage apps: the namespace manifest plus
# the ks.yaml of each app directory listed below.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ./namespace.yaml
  - ./snapshot-controller/ks.yaml
  - ./volsync/ks.yaml
8 changes: 8 additions & 0 deletions kubernetes/arc1/apps/storage/namespace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
# Namespace "storage" carrying two labels.
apiVersion: v1
kind: Namespace
metadata:
  name: storage
  labels:
    # presumably exempts the namespace from Flux garbage collection — confirm.
    kustomize.toolkit.fluxcd.io/prune: disabled
    # presumably lets VolSync run privileged movers here — confirm.
    volsync.backube/privileged-movers: "true"
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
---
# Flux HelmRelease for the snapshot-controller chart from the piraeus
# HelmRepository. CRDs are created or replaced on both install and upgrade.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: snapshot-controller
spec:
  interval: 30m
  maxHistory: 2
  chart:
    spec:
      chart: snapshot-controller
      version: 3.0.6
      sourceRef:
        kind: HelmRepository
        name: piraeus
        namespace: flux-system
  install:
    crds: CreateReplace
    remediation:
      retries: 3
  upgrade:
    crds: CreateReplace
    cleanupOnFail: true
    remediation:
      retries: 3
  uninstall:
    keepHistory: false
  values:
    # A ServiceMonitor is requested for the controller.
    controller:
      serviceMonitor:
        create: true
    # The webhook component is switched off.
    webhook:
      enabled: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Kustomize entry point for this directory; its only resource is the
# HelmRelease manifest that sits next to it.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ./helmrelease.yaml
Loading

0 comments on commit 32b0b85

Please sign in to comment.