Skip to content

Commit

Permalink
--wip-- [skipci]
Browse files Browse the repository at this point in the history
  • Loading branch information
Liana64 committed Nov 18, 2024
1 parent f659fad commit dc7d0d1
Show file tree
Hide file tree
Showing 25 changed files with 400 additions and 24 deletions.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,4 @@ spec:
resources:
- name: nvidia.com/gpu
replicas: 7
default: "single"
default: "default"
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ spec:
retries: 3
values:
controllers:
ollama:
colabfold:
type: deployment
annotations:
reloader.stakater.com/auto: "true"
Expand Down Expand Up @@ -56,11 +56,11 @@ spec:
requests:
cpu: 200m
memory: 4Gi
gpu.intel.com/i915: "4"
nvidia.com/gpu: 2
limits:
cpu: 32000m
memory: 64Gi
gpu.intel.com/i915: "4"
nvidia.com/gpu: 4
service:
app:
controller: *app
Expand Down Expand Up @@ -88,6 +88,7 @@ spec:
- secretName: colabfold-tls
hosts: [*host]
persistence:
# TODO: Replace with existing PVC
data:
storageClass: local-nvme
accessMode: ReadWriteMany
Expand Down
1 change: 1 addition & 0 deletions kubernetes/arc1/apps/machine-learning/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ./namespace.yaml
- ./qlora/ks.yaml
#- ./ollama/ks.yaml
#- ./jupyterhub/ks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: &app ollama
name: &app mmseqs2
spec:
interval: 30m
chart:
Expand All @@ -26,7 +26,7 @@ spec:
retries: 3
values:
controllers:
ollama:
mmseqs2:
type: deployment
annotations:
reloader.stakater.com/auto: "true"
Expand All @@ -41,11 +41,11 @@ spec:
requests:
cpu: 200m
memory: 4Gi
# gpu.intel.com/i915: "1"
nvidia.com/gpu: 2
limits:
cpu: 8000m
memory: 8Gi
# gpu.intel.com/i915: "1"
cpu: 32000m
memory: 64Gi
nvidia.com/gpu: 4
service:
app:
controller: *app
Expand Down Expand Up @@ -74,6 +74,7 @@ spec:
- secretName: mmseqs2-tls
hosts: [*host]
persistence:
# TODO: Replace with existing PVC
data:
storageClass: local-nvme
accessMode: ReadWriteMany
Expand Down
105 changes: 105 additions & 0 deletions kubernetes/arc1/apps/machine-learning/qlora/app/helmrelease.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/bjw-s/helm-charts/main/charts/other/app-template/schemas/helmrelease-helm-v2.schema.json
# Flux HelmRelease that deploys the `qlora` workload using the bjw-s
# `app-template` chart, pinned to chart version 3.5.1.
# TODO: Finish this
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
# `&app` anchors the release name so it can be reused below via `*app`.
name: &app qlora
spec:
interval: 30m
chart:
spec:
chart: app-template
version: 3.5.1
sourceRef:
kind: HelmRepository
name: bjw-s
namespace: flux-system
# A failed install is retried up to 3 times.
install:
remediation:
retries: 3
# A failed upgrade is cleaned up and rolled back, with up to 3 retries.
upgrade:
cleanupOnFail: true
remediation:
strategy: rollback
retries: 3
values:
controllers:
qlora:
type: deployment
annotations:
# NOTE(review): presumably lets Stakater Reloader restart the workload when
# referenced ConfigMaps/Secrets change — confirm Reloader runs in this cluster.
reloader.stakater.com/auto: "true"
pod:
# The pod runs under the `nvidia` RuntimeClass.
runtimeClassName: nvidia
# NOTE(review): a 1 second grace period leaves the container almost no time to
# shut down cleanly — confirm this is intended for this workload.
terminationGracePeriodSeconds: 1
securityContext:
fsGroup: 1000
# Hard scheduling requirement: only nodes labelled
# nvidia.com/gpu.present=true are eligible.
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: nvidia.com/gpu.present
operator: In
values:
- "true"
containers:
app:
image:
repository: ghcr.io/rarecompute/qlora-docker
# The `main` tag is additionally pinned by sha256 digest.
tag: main@sha256:8840e3501436bfd43135da4e9ad59a04f71da8265241c712de3af7ac62460d72
env:
# NOTE(review): ${TIMEZONE} is a substitution placeholder — confirm the
# variable is supplied to the Flux Kustomization that applies this file.
TZ: ${TIMEZONE}
NVIDIA_VISIBLE_DEVICES: all
NVIDIA_DRIVER_CAPABILITIES: all
resources:
requests:
cpu: 200m
memory: 8Gi
# GPUs are declared only under limits (4); requests has no nvidia.com/gpu entry.
limits:
cpu: 32000m
memory: 32Gi
nvidia.com/gpu: 4
# Probe configuration is currently commented out.
# probes:
# liveness:
# enabled: true
# readiness:
# enabled: true
# startup:
# enabled: false
# spec:
# failureThreshold: 30
# periodSeconds: 5
# Service `app` for the controller named by `*app`, exposing port 80 as `http`.
service:
app:
controller: *app
# NOTE(review): the teleport annotation and label presumably opt this service
# into Teleport application discovery — confirm against the Teleport agent config.
annotations:
teleport.dev/name: *app
labels:
teleport: enabled
ports:
http:
# NOTE(review): the `&port` anchor is not referenced anywhere else in this document.
port: &port 80
# Volumes: 2Gi `app` at /app, 2048Gi `workspace` at /workspace (retain: true),
# and an emptyDir at /tmp. The first two use the local-nvme storage class.
persistence:
app:
storageClass: local-nvme
accessMode: ReadWriteOnce
size: 2Gi
globalMounts:
- path: /app
workspace:
storageClass: local-nvme
accessMode: ReadWriteOnce
size: 2048Gi
retain: true
globalMounts:
- path: /workspace
tmp:
type: emptyDir
globalMounts:
- path: /tmp
# Disabled alternative for `workspace`: mount the existing claim
# machine-learning-workspace-pvc at /workspace instead of the volume defined above.
# workspace:
# enabled: true
# existingClaim: machine-learning-workspace-pvc
# globalMounts:
# - path: /workspace
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization
# Kustomize entry point for this directory; the only resource it includes is
# the HelmRelease manifest located next to it.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ./helmrelease.yaml
24 changes: 24 additions & 0 deletions kubernetes/arc1/apps/machine-learning/qlora/ks.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json
# Flux Kustomization `qlora`: applies the manifests under the qlora app path
# from the `k8s-gitops` GitRepository into the `machine-learning` namespace.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
# `&app` anchors the name so the label and substitution below can reuse it.
name: &app qlora
namespace: flux-system
spec:
targetNamespace: machine-learning
# Every resource applied by this Kustomization gets the
# app.kubernetes.io/name label set to the anchored name.
commonMetadata:
labels:
app.kubernetes.io/name: *app
path: ./kubernetes/arc1/apps/machine-learning/qlora/app
# Pruning is enabled for resources managed by this Kustomization.
prune: true
sourceRef:
kind: GitRepository
name: k8s-gitops
# Health-waiting is disabled (wait: false).
wait: false
# Reconcile every 30m; retry after 1m; each run times out after 5m.
interval: 30m
retryInterval: 1m
timeout: 5m
# Post-build substitution defines a single variable, APP, set to the anchored name.
postBuild:
substitute:
APP: *app
2 changes: 1 addition & 1 deletion kubernetes/arc1/apps/security/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ./namespace.yaml
#- ./teleport/ks.yaml
- ./teleport/ks.yaml
22 changes: 12 additions & 10 deletions kubernetes/arc1/apps/security/teleport/app/helmrelease.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ spec:
chart:
spec:
chart: teleport-cluster
version: 16.4.6
version: 17.0.1
sourceRef:
kind: HelmRepository
name: teleport
Expand All @@ -19,12 +19,11 @@ spec:
upgrade:
cleanupOnFail: true
remediation:
strategy: rollback
retries: 3
values:
clusterName: teleport.${SECRET_EXTERNAL_DOMAIN}
clusterName: teleport.${SECRET_INTERNAL_DOMAIN}
chartMode: standalone
kubeClusterName: ARC1
kubeClusterName: arc1
validateConfigOnDeploy: true
enterprise: false
auth:
Expand Down Expand Up @@ -58,18 +57,21 @@ spec:
annotations:
ingress:
cert-manager.io/cluster-issuer: "letsencrypt-production"
gethomepage.dev/enabled: "true"
gethomepage.dev/group: Services
gethomepage.dev/name: *app
gethomepage.dev/icon: teleport.png
external-dns.alpha.kubernetes.io/target: "teleport.${SECRET_EXTERNAL_DOMAIN}"
external-dns.alpha.kubernetes.io/exclude-unifi: "true"
external-dns.alpha.kubernetes.io/cloudflare-proxied: "false"
traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
traefik.ingress.kubernetes.io/router.middlewares: "networking-traefik-middleware-chain-no-auth@kubernetescrd"
service:
traefik.ingress.kubernetes.io/service.serversscheme: https
tls:
existingSecretName: "teleport-cluster-tls"
existingSecretName: "teleport-tls"
authentication:
type: local
proxyListenerMode: multiplex
persistence:
enabled: true
storageClassName: local-nvme
existingClaimName: teleport
serviceAccount:
create: true
rbac:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ./pvc.yaml
- ./helmrelease.yaml
12 changes: 12 additions & 0 deletions kubernetes/arc1/apps/security/teleport/app/pvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
# PersistentVolumeClaim `teleport` in the `security` namespace: requests 8Gi
# from the `local-nvme` storage class with ReadWriteOnce access.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: teleport
namespace: security
spec:
storageClassName: local-nvme
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: 8Gi
18 changes: 18 additions & 0 deletions kubernetes/arc1/apps/security/teleport/app/resources/token.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Teleport join token `kubernetes-token` (kind: token, version v2) that uses
# the `kubernetes` join method and allows a single service account to join.
kind: token
version: v2
metadata:
name: kubernetes-token
# Set a long expiry time; the default for tokens is only 30 minutes.
expires: "2050-01-01T00:00:00Z"
spec:
# Use the minimal set of system roles required.
# NOTE(review): five roles are granted below — confirm each one is actually
# needed by the joining agent, otherwise trim the list.
roles: [kube, app, discovery, node, windowsdesktop]

# Set the join method allowed for this token.
join_method: kubernetes

kubernetes:
type: in_cluster
allow:
# Service account names follow the format "namespace:serviceaccountname".
- service_account: "security:teleport-kube-agent"
56 changes: 53 additions & 3 deletions kubernetes/arc1/apps/traefik-ingress/traefik/app/helmrelease.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,69 @@ spec:
service:
annotations:
io.cilium/lb-ipam-ips: ${LB_TRAEFIK}
# spec:
#externalTrafficPolicy: Local
env:
- name: TZ
value: "${TIMEZONE}"
ingressClass:
enabled: true
isDefaultClass: true
ingressRoute:
dashboard:
enabled: false
globalArguments:
- "--serversTransport.insecureSkipVerify=true"
- "--global.sendanonymoususage=false"
additionalArguments:
- "--entrypoints.web.transport.respondingTimeouts.readTimeout=0"
- "--entrypoints.websecure.transport.respondingTimeouts.readTimeout=0"
ports:
traefik:
expose:
default: false
web:
redirectTo:
port: websecure
websecure:
tls:
enabled: true
options: default
forwardedHeaders:
trustedIPs:
- 10.0.0.0/8
- 172.16.0.0/12
- 192.168.0.0/16
proxyProtocol:
trustedIPs:
- 10.0.0.0/8
- 172.16.0.0/12
- 192.168.0.0/16
http3:
enabled: true
metrics:
expose:
default: false
metrics:
serviceMonitor:
enabled: true
namespaceSelector:
any: true
pilot:
enabled: false
providers:
kubernetesCRD:
enabled: true
ingressClass: traefik
allowCrossNamespace: true
allowExternalNameServices: true
kubernetesIngress:
enabled: true
ingressClass: traefik
allowExternalNameServices: true
publishedService:
enabled: true
resources:
requests:
memory: 128Mi
cpu: 100m
memory: 512Mi
limits:
memory: 1536Mi
Loading

0 comments on commit dc7d0d1

Please sign in to comment.