diff --git a/.github/gitleaks.yaml b/.github/workflows/gitleaks.yaml similarity index 100% rename from .github/gitleaks.yaml rename to .github/workflows/gitleaks.yaml diff --git a/.github/kubeconform.yaml b/.github/workflows/kubeconform.yaml similarity index 100% rename from .github/kubeconform.yaml rename to .github/workflows/kubeconform.yaml diff --git a/kubernetes/arc1/apps/kube-system/nvidia-device-plugin/app/helmrelease.yaml b/kubernetes/arc1/apps/kube-system/nvidia-device-plugin/app/helmrelease.yaml index 799055f..e0001e7 100644 --- a/kubernetes/arc1/apps/kube-system/nvidia-device-plugin/app/helmrelease.yaml +++ b/kubernetes/arc1/apps/kube-system/nvidia-device-plugin/app/helmrelease.yaml @@ -63,4 +63,4 @@ spec: resources: - name: nvidia.com/gpu replicas: 7 - default: "single" + default: "default" diff --git a/kubernetes/arc1/apps/machine-learning/colabfold/app/helmrelease.yaml b/kubernetes/arc1/apps/machine-learning/colabfold/app/helmrelease.yaml index b198781..4b7f857 100644 --- a/kubernetes/arc1/apps/machine-learning/colabfold/app/helmrelease.yaml +++ b/kubernetes/arc1/apps/machine-learning/colabfold/app/helmrelease.yaml @@ -26,7 +26,7 @@ spec: retries: 3 values: controllers: - ollama: + colabfold: type: deployment annotations: reloader.stakater.com/auto: "true" @@ -56,11 +56,11 @@ spec: requests: cpu: 200m memory: 4Gi - gpu.intel.com/i915: "4" + nvidia.com/gpu: 2 limits: cpu: 32000m memory: 64Gi - gpu.intel.com/i915: "4" + nvidia.com/gpu: 4 service: app: controller: *app @@ -88,6 +88,7 @@ spec: - secretName: colabfold-tls hosts: [*host] persistence: + # TODO: Replace with existing PVC data: storageClass: local-nvme accessMode: ReadWriteMany diff --git a/kubernetes/arc1/apps/machine-learning/kustomization.yaml b/kubernetes/arc1/apps/machine-learning/kustomization.yaml index 20d4825..f7f14e5 100644 --- a/kubernetes/arc1/apps/machine-learning/kustomization.yaml +++ b/kubernetes/arc1/apps/machine-learning/kustomization.yaml @@ -4,5 +4,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - ./namespace.yaml + - ./qlora/ks.yaml #- ./ollama/ks.yaml #- ./jupyterhub/ks.yaml diff --git a/kubernetes/arc1/apps/machine-learning/mmseqs2/app/helmrelease.yaml b/kubernetes/arc1/apps/machine-learning/mmseqs2/app/helmrelease.yaml index 52381ca..455f7e1 100644 --- a/kubernetes/arc1/apps/machine-learning/mmseqs2/app/helmrelease.yaml +++ b/kubernetes/arc1/apps/machine-learning/mmseqs2/app/helmrelease.yaml @@ -5,7 +5,7 @@ apiVersion: helm.toolkit.fluxcd.io/v2 kind: HelmRelease metadata: - name: &app ollama + name: &app mmseqs2 spec: interval: 30m chart: @@ -26,7 +26,7 @@ spec: retries: 3 values: controllers: - ollama: + mmseqs2: type: deployment annotations: reloader.stakater.com/auto: "true" @@ -41,11 +41,11 @@ spec: requests: cpu: 200m memory: 4Gi - # gpu.intel.com/i915: "1" + nvidia.com/gpu: 2 limits: - cpu: 8000m - memory: 8Gi - # gpu.intel.com/i915: "1" + cpu: 32000m + memory: 64Gi + nvidia.com/gpu: 4 service: app: controller: *app @@ -74,6 +74,7 @@ spec: - secretName: mmseqs2-tls hosts: [*host] persistence: + # TODO: Replace with existing PVC data: storageClass: local-nvme accessMode: ReadWriteMany diff --git a/kubernetes/arc1/apps/machine-learning/qlora/app/helmrelease.yaml b/kubernetes/arc1/apps/machine-learning/qlora/app/helmrelease.yaml new file mode 100644 index 0000000..45ea32d --- /dev/null +++ b/kubernetes/arc1/apps/machine-learning/qlora/app/helmrelease.yaml @@ -0,0 +1,105 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/bjw-s/helm-charts/main/charts/other/app-template/schemas/helmrelease-helm-v2.schema.json +# TODO: Finish this +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: &app qlora +spec: + interval: 30m + chart: + spec: + chart: app-template + version: 3.5.1 + sourceRef: + kind: HelmRepository + name: bjw-s + namespace: flux-system + install: + remediation: + retries: 3 + upgrade: + cleanupOnFail: true + remediation: + strategy: rollback + retries: 3 + values: + controllers: + qlora: + type: deployment + annotations: + reloader.stakater.com/auto: "true" + pod: + runtimeClassName: nvidia + terminationGracePeriodSeconds: 1 + securityContext: + fsGroup: 1000 + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: nvidia.com/gpu.present + operator: In + values: + - "true" + containers: + app: + image: + repository: ghcr.io/rarecompute/qlora-docker + tag: main@sha256:65b3f17822d21d8158b70c7762968b2e0e9db1814428e0d425cfa217527526eb + env: + TZ: ${TIMEZONE} + NVIDIA_VISIBLE_DEVICES: all + NVIDIA_DRIVER_CAPABILITIES: all + resources: + requests: + cpu: 200m + memory: 8Gi + limits: + cpu: 32000m + memory: 32Gi + nvidia.com/gpu: 4 + # probes: + # liveness: + # enabled: true + # readiness: + # enabled: true + # startup: + # enabled: false + # spec: + # failureThreshold: 30 + # periodSeconds: 5 + service: + app: + controller: *app + annotations: + teleport.dev/name: *app + labels: + teleport: enabled + # ports: + # http: + # port: &port 80 + persistence: + app: + storageClass: local-nvme + accessMode: ReadWriteOnce + size: 2Gi + globalMounts: + - path: /app + workspace: + storageClass: local-nvme + accessMode: ReadWriteOnce + size: 2048Gi + retain: true + globalMounts: + - path: /workspace + tmp: + type: emptyDir + globalMounts: + - path: /tmp + # workspace: + # enabled: true + # existingClaim: machine-learning-workspace-pvc + # globalMounts: + # - path: /workspace diff --git a/kubernetes/arc1/apps/machine-learning/qlora/app/kustomization.yaml b/kubernetes/arc1/apps/machine-learning/qlora/app/kustomization.yaml new file mode 100644 index 0000000..17cbc72 --- /dev/null +++ b/kubernetes/arc1/apps/machine-learning/qlora/app/kustomization.yaml @@ -0,0 +1,6 @@ +--- +# yaml-language-server: $schema=https://json.schemastore.org/kustomization +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ./helmrelease.yaml diff --git a/kubernetes/arc1/apps/machine-learning/qlora/ks.yaml b/kubernetes/arc1/apps/machine-learning/qlora/ks.yaml new file mode 100644 index 0000000..b89a998 --- /dev/null +++ b/kubernetes/arc1/apps/machine-learning/qlora/ks.yaml @@ -0,0 +1,24 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: &app qlora + namespace: flux-system +spec: + targetNamespace: machine-learning + commonMetadata: + labels: + app.kubernetes.io/name: *app + path: ./kubernetes/arc1/apps/machine-learning/qlora/app + prune: true + sourceRef: + kind: GitRepository + name: k8s-gitops + wait: false + interval: 30m + retryInterval: 1m + timeout: 5m + postBuild: + substitute: + APP: *app diff --git a/kubernetes/arc1/apps/security/kustomization.yaml b/kubernetes/arc1/apps/security/kustomization.yaml index e430c5f..b75ad0e 100644 --- a/kubernetes/arc1/apps/security/kustomization.yaml +++ b/kubernetes/arc1/apps/security/kustomization.yaml @@ -4,4 +4,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - ./namespace.yaml - #- ./teleport/ks.yaml + - ./teleport/ks.yaml diff --git a/kubernetes/arc1/apps/security/teleport/app/helmrelease.yaml b/kubernetes/arc1/apps/security/teleport/app/helmrelease.yaml index 6b8a278..c7d3879 100644 --- a/kubernetes/arc1/apps/security/teleport/app/helmrelease.yaml +++ b/kubernetes/arc1/apps/security/teleport/app/helmrelease.yaml @@ -8,7 +8,7 @@ spec: chart: spec: chart: teleport-cluster - version: 16.4.6 + version: 17.0.1 sourceRef: kind: HelmRepository name: teleport @@ -19,12 +19,11 @@ spec: upgrade: cleanupOnFail: true remediation: - strategy: rollback retries: 3 values: - clusterName: teleport.${SECRET_EXTERNAL_DOMAIN} + clusterName: teleport.${SECRET_INTERNAL_DOMAIN} chartMode: standalone - kubeClusterName: ARC1 + kubeClusterName: arc1 validateConfigOnDeploy: true enterprise: false auth: @@ -58,18 +57,22 @@ spec: annotations: ingress: cert-manager.io/cluster-issuer: "letsencrypt-production" - gethomepage.dev/enabled: "true" - gethomepage.dev/group: Services - gethomepage.dev/name: *app - gethomepage.dev/icon: teleport.png - tls: - existingSecretName: "teleport-cluster-tls" + external-dns.alpha.kubernetes.io/target: "teleport.${SECRET_EXTERNAL_DOMAIN}" + external-dns.alpha.kubernetes.io/exclude-unifi: "true" + external-dns.alpha.kubernetes.io/cloudflare-proxied: "false" + traefik.ingress.kubernetes.io/router.entrypoints: "websecure" + traefik.ingress.kubernetes.io/router.middlewares: "networking-traefik-middleware-chain-no-auth@kubernetescrd" + service: + traefik.ingress.kubernetes.io/service.serversscheme: https + highAvailability: + tls: + existingSecretName: "teleport-tls" authentication: type: local proxyListenerMode: multiplex persistence: enabled: true - storageClassName: local-nvme + existingClaimName: teleport serviceAccount: create: true rbac: diff --git a/kubernetes/arc1/apps/security/teleport/app/kustomization.yaml b/kubernetes/arc1/apps/security/teleport/app/kustomization.yaml index 17cbc72..f5b3d83 100644 --- a/kubernetes/arc1/apps/security/teleport/app/kustomization.yaml +++ b/kubernetes/arc1/apps/security/teleport/app/kustomization.yaml @@ -3,4 +3,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: + - ./pvc.yaml - ./helmrelease.yaml diff --git a/kubernetes/arc1/apps/security/teleport/app/pvc.yaml b/kubernetes/arc1/apps/security/teleport/app/pvc.yaml new file mode 100644 index 0000000..6ce438b --- /dev/null +++ b/kubernetes/arc1/apps/security/teleport/app/pvc.yaml @@ -0,0 +1,12 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: teleport + namespace: security +spec: + storageClassName: local-nvme + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 8Gi diff --git a/kubernetes/arc1/apps/security/teleport/app/resources/token.yaml b/kubernetes/arc1/apps/security/teleport/app/resources/token.yaml new file mode 100644 index 0000000..05d4595 --- /dev/null +++ b/kubernetes/arc1/apps/security/teleport/app/resources/token.yaml @@ -0,0 +1,18 @@ +kind: token +version: v2 +metadata: + name: kubernetes-token + # set a long expiry time, the default for tokens is only 30 minutes + expires: "2050-01-01T00:00:00Z" +spec: + # Use the minimal set of system roles required. + roles: [kube, app, discovery, node, windowsdesktop] + + # set the join method allowed for this token + join_method: kubernetes + + kubernetes: + type: in_cluster + allow: + # Service account names follow the format "namespace:serviceaccountname". + - service_account: "security:teleport-kube-agent" diff --git a/kubernetes/arc1/apps/storage/kustomization.yaml b/kubernetes/arc1/apps/storage/kustomization.yaml new file mode 100644 index 0000000..775f204 --- /dev/null +++ b/kubernetes/arc1/apps/storage/kustomization.yaml @@ -0,0 +1,8 @@ +--- +# yaml-language-server: $schema=https://json.schemastore.org/kustomization +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ./namespace.yaml + - ./snapshot-controller/ks.yaml + - ./volsync/ks.yaml diff --git a/kubernetes/arc1/apps/storage/namespace.yaml b/kubernetes/arc1/apps/storage/namespace.yaml new file mode 100644 index 0000000..5036c55 --- /dev/null +++ b/kubernetes/arc1/apps/storage/namespace.yaml @@ -0,0 +1,8 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: storage + labels: + kustomize.toolkit.fluxcd.io/prune: disabled + volsync.backube/privileged-movers: "true" diff --git a/kubernetes/arc1/apps/storage/snapshot-controller/app/helmrelease.yaml b/kubernetes/arc1/apps/storage/snapshot-controller/app/helmrelease.yaml new file mode 100644 index 0000000..6ec363d --- /dev/null +++ b/kubernetes/arc1/apps/storage/snapshot-controller/app/helmrelease.yaml @@ -0,0 +1,33 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: snapshot-controller +spec: + interval: 30m + chart: + spec: + chart: snapshot-controller + version: 3.0.6 + sourceRef: + kind: HelmRepository + name: piraeus + namespace: flux-system + maxHistory: 2 + install: + crds: CreateReplace + remediation: + retries: 3 + upgrade: + cleanupOnFail: true + crds: CreateReplace + remediation: + retries: 3 + uninstall: + keepHistory: false + values: + controller: + serviceMonitor: + create: true + webhook: + enabled: false diff --git a/kubernetes/arc1/apps/storage/snapshot-controller/app/kustomization.yaml b/kubernetes/arc1/apps/storage/snapshot-controller/app/kustomization.yaml new file mode 100644 index 0000000..17cbc72 --- /dev/null +++ b/kubernetes/arc1/apps/storage/snapshot-controller/app/kustomization.yaml @@ -0,0 +1,6 @@ +--- +# yaml-language-server: $schema=https://json.schemastore.org/kustomization +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ./helmrelease.yaml diff --git a/kubernetes/arc1/apps/storage/snapshot-controller/ks.yaml b/kubernetes/arc1/apps/storage/snapshot-controller/ks.yaml new file mode 100644 index 0000000..27bb92a --- /dev/null +++ b/kubernetes/arc1/apps/storage/snapshot-controller/ks.yaml @@ -0,0 +1,24 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: &app snapshot-controller + namespace: flux-system +spec: + targetNamespace: storage + commonMetadata: + labels: + app.kubernetes.io/name: *app + path: ./kubernetes/arc1/apps/storage/snapshot-controller/app + prune: true + sourceRef: + kind: GitRepository + name: k8s-gitops + wait: false + interval: 30m + retryInterval: 1m + timeout: 5m + postBuild: + substitute: + APP: *app diff --git a/kubernetes/arc1/apps/storage/volsync/app/helmrelease.yaml b/kubernetes/arc1/apps/storage/volsync/app/helmrelease.yaml new file mode 100644 index 0000000..a4a18ed --- /dev/null +++ b/kubernetes/arc1/apps/storage/volsync/app/helmrelease.yaml @@ -0,0 +1,31 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: &app volsync +spec: + interval: 30m + chart: + spec: + chart: volsync + version: 0.11.0 + sourceRef: + kind: HelmRepository + name: backube + namespace: flux-system + install: + remediation: + retries: 3 + upgrade: + cleanupOnFail: true + remediation: + retries: 3 + uninstall: + keepHistory: false + dependsOn: + - name: snapshot-controller + namespace: storage + values: + manageCRDs: true + metrics: + disableAuth: true diff --git a/kubernetes/arc1/apps/storage/volsync/ks.yaml b/kubernetes/arc1/apps/storage/volsync/ks.yaml new file mode 100644 index 0000000..feb721e --- /dev/null +++ b/kubernetes/arc1/apps/storage/volsync/ks.yaml @@ -0,0 +1,24 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/fluxcd-community/flux2-schemas/main/kustomization-kustomize-v1.json +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: &app volsync + namespace: flux-system +spec: + targetNamespace: storage + commonMetadata: + labels: + app.kubernetes.io/name: *app + path: ./kubernetes/arc1/apps/storage/volsync/app + prune: true + sourceRef: + kind: GitRepository + name: k8s-gitops + wait: false + interval: 30m + retryInterval: 1m + timeout: 5m + postBuild: + substitute: + APP: *app diff --git a/kubernetes/arc1/apps/traefik-ingress/traefik/app/helmrelease.yaml b/kubernetes/arc1/apps/traefik-ingress/traefik/app/helmrelease.yaml index 2200b17..22524d3 100644 --- a/kubernetes/arc1/apps/traefik-ingress/traefik/app/helmrelease.yaml +++ b/kubernetes/arc1/apps/traefik-ingress/traefik/app/helmrelease.yaml @@ -31,19 +31,69 @@ spec: service: annotations: io.cilium/lb-ipam-ips: ${LB_TRAEFIK} - # spec: - #externalTrafficPolicy: Local + env: + - name: TZ + value: "${TIMEZONE}" ingressClass: enabled: true isDefaultClass: true + ingressRoute: + dashboard: + enabled: false + globalArguments: + - "--serversTransport.insecureSkipVerify=true" + - "--global.sendanonymoususage=false" + additionalArguments: + - "--entrypoints.web.transport.respondingTimeouts.readTimeout=0" + - "--entrypoints.websecure.transport.respondingTimeouts.readTimeout=0" + ports: + traefik: + expose: + default: false + web: + redirectTo: + port: websecure + websecure: + tls: + enabled: true + options: default + forwardedHeaders: + trustedIPs: + - 10.0.0.0/8 + - 172.16.0.0/12 + - 192.168.0.0/16 + proxyProtocol: + trustedIPs: + - 10.0.0.0/8 + - 172.16.0.0/12 + - 192.168.0.0/16 + http3: + enabled: true + metrics: + expose: + default: false metrics: serviceMonitor: enabled: true namespaceSelector: any: true + pilot: + enabled: false + providers: + kubernetesCRD: + enabled: true + ingressClass: traefik + allowCrossNamespace: true + allowExternalNameServices: true + kubernetesIngress: + enabled: true + ingressClass: traefik + allowExternalNameServices: true + publishedService: + enabled: true resources: requests: - memory: 128Mi cpu: 100m + memory: 512Mi limits: memory: 1536Mi diff --git a/kubernetes/arc1/flux/repositories/helm/kustomization.yaml b/kubernetes/arc1/flux/repositories/helm/kustomization.yaml index fb4f48d..aa32f06 100644 --- a/kubernetes/arc1/flux/repositories/helm/kustomization.yaml +++ b/kubernetes/arc1/flux/repositories/helm/kustomization.yaml @@ -15,6 +15,7 @@ resources: - ./node-feature-discovery.yaml - ./nvidia-device-plugin.yaml - ./openebs.yaml + - ./piraeus.yaml - ./postfinance.yaml - ./prometheus-community.yaml - ./spegel.yaml diff --git a/kubernetes/arc1/flux/repositories/helm/piraeus.yaml b/kubernetes/arc1/flux/repositories/helm/piraeus.yaml new file mode 100644 index 0000000..84f361a --- /dev/null +++ b/kubernetes/arc1/flux/repositories/helm/piraeus.yaml @@ -0,0 +1,9 @@ +--- +apiVersion: source.toolkit.fluxcd.io/v1 +kind: HelmRepository +metadata: + name: piraeus + namespace: flux-system +spec: + interval: 2h + url: https://piraeus.io/helm-charts/