Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: ui and scripts updated, pushing to allow for further testing #135

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions bicep/ccw.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ param resourceGroup string
param sharedFilesystem types.sharedFilesystem_t
param additionalFilesystem types.additionalFilesystem_t
param network types.vnet_t
param clusterInitSpecs types.cluster_init_param_t
param ood object
param slurmSettings types.slurmSettings_t
param schedulerNode types.scheduler_t
param loginNodes types.login_t
Expand All @@ -30,6 +32,7 @@ param databaseAdminPassword string
param databaseConfig types.databaseConfig_t
param clusterName string


var anfDefaultMountOptions = 'rw,hard,rsize=262144,wsize=262144,vers=3,tcp,_netdev'

func getTags(resource_type string, tags types.resource_tags_t) types.tags_t => tags[?resource_type] ?? {}
Expand Down Expand Up @@ -265,6 +268,8 @@ output filerInfoFinal types.filerInfo_t = {

output cyclecloudPrincipalId string = infrastructureOnly ? '' : ccwVM.outputs.principalId

output clusterInitSpecs types.cluster_init_param_t = clusterInitSpecs

output slurmSettings types.slurmSettings_t = slurmSettings

output schedulerNode types.scheduler_t = schedulerNode
Expand Down Expand Up @@ -313,3 +318,7 @@ output nodeArrayTags types.tags_t = tags[?'Node Array'] ?? {}
output branch string = branch
output projectVersion string = projectVersion
output insidersBuild bool = insidersBuild

output ood object = union(ood, {
version: '2024-11-14'
})
152 changes: 138 additions & 14 deletions bicep/files-to-load/create_cc_param.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
#!/usr/bin/env python

import argparse
import hashlib
import json
import os
import shutil
from subprocess import check_output
import sys
import argparse
import typing


def get_json_dict(file_name):
file_path = os.path.join(os.getcwd(),file_name)
with open(file_path, 'r') as file:
content = file.read()
data = json.loads(content)
return data
with open(file_name) as fr:
return json.load(fr)


def set_params(params, dbPassword, outputs):
def set_slurm_params(params, dbPassword, outputs):
params['Region'] = outputs['location']['value']
#params['Credentials']
if outputs['vnet']['value']['type'] == 'new':
subnetID = outputs['vnet']['value']['computeSubnetId']
subnet_toks = subnetID.split("/")
Expand Down Expand Up @@ -89,15 +89,139 @@ def set_params(params, dbPassword, outputs):
params['AdditionalNFSMountOptions'] = outputs['filerInfoFinal']['value']['additional']['mountOptions']
params['AdditionalNFSAddress'] = outputs['filerInfoFinal']['value']['additional']['ipAddress']


def set_ood_params(params, outputs):
    """Fill ``params`` with the Open OnDemand cluster settings.

    Shared-filesystem, subnet, region and credential values are inherited
    from a freshly computed set of Slurm parameters; everything else is
    copied from ``outputs['ood']['value']`` (missing keys become ``None``).

    Args:
        params: Dict updated in place.
        outputs: Parsed deployment outputs; must contain an ``ood`` entry in
            addition to whatever ``set_slurm_params`` reads.

    Raises:
        KeyError: If the Slurm parameters lack ``SubnetId``, ``Region`` or
            ``Credentials``, or ``outputs`` has no ``ood`` value.
    """
    # Compute the Slurm parameters first so OOD can inherit from them.
    inherited = get_json_dict('initial_params.json')
    set_slurm_params(inherited, "", outputs)

    params['NFSAddress'] = inherited.get('NFSAddress') or 'ccw-scheduler'
    params['NFSSharedExportPath'] = inherited.get('NFSSharedExportPath') or '/shared'
    params['NFSSharedMountOptions'] = inherited.get('NFSSharedMountOptions')
    for mandatory in ('SubnetId', 'Region', 'Credentials'):
        params[mandatory] = inherited[mandatory]

    ood_settings = outputs['ood']['value']

    # Parameters whose name differs from the key used in the ood output.
    for param_name, ood_key in (('MachineType', 'sku'),
                                ('ManagedIdentity', 'managed_identity')):
        params[param_name] = ood_settings.get(ood_key)

    # Parameters copied across under the same name.
    for name in (
        'BootDiskSize',
        'ImageName',
        'ood_server_name',
        'ood_auth_method',
        'ood_ldap_host',
        'ood_ldap_bind_dn',
        'ood_ldap_bind_pwd',
        'ood_ldap_user_base_dn',
        'ood_ldap_group_base_dn',
        'ood_entra_client_id',
        'ood_entra_client_secret',
        'ood_entra_tenant_id',
    ):
        params[name] = ood_settings.get(name)


class ClusterInitSpec:
    """A cluster-init project/spec/version and the targets it applies to."""

    def __init__(self, project: str, version: str, spec: str, targets: typing.List[str]):
        # Key format is "<project>:<spec>:<version>".
        self.cluster_init_key = f"{project}:{spec}:{version}"
        self.targets = targets
        self.spec = spec
        self.version = version
        self.project = project


def download_cluster_init(outputs, cluster_name, root_folder, locker) -> typing.List[ClusterInitSpec]:
    """Fetch and upload the additional cluster-init projects for one cluster.

    For every record under
    ``outputs['clusterInitSpecs']['value'][cluster_name]`` the project is
    fetched with ``cyclecloud project fetch`` into a folder named after the
    SHA-256 of its URL, uploaded to ``locker``, and then described with
    ``cyclecloud project info`` to learn its name and version. A folder that
    already exists is not fetched or uploaded again.

    Args:
        outputs: Parsed deployment outputs containing ``clusterInitSpecs``.
        cluster_name: Key selecting the list of records.
        root_folder: Directory under which one folder per URL is kept.
        locker: Locker name passed to ``cyclecloud project upload``.

    Returns:
        One ``ClusterInitSpec`` per record, in record order.

    Raises:
        KeyError: If a record has no ``gitHubReleaseURL`` or ``target``, or
            the project info output has no name or version line.
        subprocess.CalledProcessError: If a ``cyclecloud`` command fails.
    """
    ret = []
    # Treat an explicitly null list the same as a missing one.
    records = outputs['clusterInitSpecs']["value"].get(cluster_name) or []
    for record in records:
        # NOTE(review): a record without "gitHubReleaseURL" (e.g. one of
        # type 'PreStaged') raises KeyError here - confirm whether such
        # records are expected to reach this function.
        url = _strip_tags_from_github_url(record)
        url_hash = hashlib.sha256(url.encode())

        folder = os.path.join(root_folder, url_hash.hexdigest())
        if not os.path.exists(folder):
            # Download into a staging folder and move it into place only
            # after the upload succeeded, so a partial download is never
            # mistaken for a finished one.
            staging = folder + ".tmp"
            # Discard leftovers of an earlier failed attempt before retrying.
            shutil.rmtree(staging, ignore_errors=True)
            check_output(["cyclecloud", "project", "fetch", url, staging])
            check_output(["cyclecloud", "project", "upload", locker], cwd=staging)
            shutil.move(staging, folder)
            with open(os.path.join(folder, "download-url"), "w") as fw:
                fw.write(url)

        proj_info_raw = check_output(["cyclecloud", "project", "info"], cwd=folder).decode()
        proj_info = {}
        for line in proj_info_raw.splitlines():
            key, sep, rest = line.partition(":")
            if not sep:
                # Blank or free-text line: nothing to record.
                continue
            proj_info[key.lower()] = rest.strip()
        ret.append(ClusterInitSpec(proj_info["name"],
                                   proj_info["version"],
                                   record.get("spec") or "default",
                                   record["target"]))
    return ret


def _strip_tags_from_github_url(record):
url = record["gitHubReleaseURL"]
if "/tag/" in url:
return url.replace("/tag", "")
return url


def _version_from_url(record):
if record.get("version"):
return record["version"]
return record["gitHubReleaseURL"].split("/")[-1]


def set_cluster_init_params(params: dict, specs: "typing.List[ClusterInitSpec]", cluster_name: str, target_params: dict) -> None:
    """Register each cluster-init spec under the parameter of every target it names.

    For each spec and each of its targets, an entry keyed by the spec's
    ``cluster_init_key`` is added to ``params[target_params[target.lower()]]``.
    Specs receive increasing ``Order`` values (10000, 10100, ...) in list
    order, so their relative order is preserved.

    Args:
        params: Cluster parameters, updated in place.
        specs: Specs to register, in the order they should be applied.
        cluster_name: Not used by this function.
        target_params: Maps a lower-case target name to the parameter name
            that holds that target's cluster-init specs.

    Raises:
        KeyError: If a spec names a target that is missing from
            ``target_params``.
    """
    for index, spec in enumerate(specs):
        # Derive the order from the spec's position; resetting it for every
        # spec would give all of them the same Order.
        order = 10000 + 100 * index
        for target in spec.targets:
            target_key = f"{target_params[target.lower()]}"
            # Also replaces a falsy placeholder (None, "") with a fresh dict.
            if not params.get(target_key):
                params[target_key] = {}

            params[target_key][spec.cluster_init_key] = {
                "Order": order,
                "Spec": spec.spec,
                "Name": spec.cluster_init_key,
                "Project": spec.project,
                "Locker": "azure-storage",
                "Version": spec.version
            }


def main():
parser = argparse.ArgumentParser(description="Accept database password")
parser.add_argument("--dbPassword", dest="dbPassword", default="", help="MySQL database password")
args = parser.parse_args()
parser = argparse.ArgumentParser(description="TODO RDH")
parser.add_argument("--locker", default="azure-storage")
parser.add_argument("--cluster-init-working-dir", default="cluster-init")
subparsers = parser.add_subparsers()
ccw_parser = subparsers.add_parser("ccw")
ccw_parser.set_defaults(cluster_name="ccw", target_params={
"login": "LoginClusterInitSpecs",
"gpu": "GPUClusterInitSpecs",
"hpc": "HPCClusterInitSpecs",
"htc": "HTCClusterInitSpecs",
"scheduler": "SchedulerClusterInitSpecs",
"dynamic": "DynamicClusterInitSpecs"
})
ccw_parser.add_argument("--dbPassword", dest="dbPassword", default="", help="MySQL database password")

slurm_params = get_json_dict('initial_params.json')
ood_parser = subparsers.add_parser("ood")
ood_parser.set_defaults(cluster_name="ood", target_params={
"ood": "ClusterInitSpecs"
})

args = parser.parse_args()

if args.cluster_name == "ccw":
output_params = get_json_dict('initial_params.json')
else:
output_params = {}
ccw_outputs = get_json_dict('ccwOutputs.json')
set_params(slurm_params,args.dbPassword,ccw_outputs)
print(json.dumps(slurm_params,indent=4))

specs = download_cluster_init(ccw_outputs, args.cluster_name, os.path.join(os.getcwd(), args.cluster_init_working_dir), args.locker)
set_cluster_init_params(output_params, specs, args.cluster_name, args.target_params)
if args.cluster_name == "ccw":
set_slurm_params(output_params, args.dbPassword, ccw_outputs)
else:
set_ood_params(output_params, ccw_outputs)
print(json.dumps(output_params, indent=4))

if __name__ == '__main__':
main()
47 changes: 40 additions & 7 deletions bicep/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ while [ ! -f "$SECRETS_FILE_PATH" ]; do
sleep 1
done
DATABASE_ADMIN_PASSWORD=$(jq -r .databaseAdminPassword $SECRETS_FILE_PATH)
(python3 create_cc_param.py --dbPassword="${DATABASE_ADMIN_PASSWORD}") > slurm_params.json
echo "Filework successful"

CYCLECLOUD_USERNAME=$(jq -r .adminUsername.value ccwOutputs.json)
Expand All @@ -124,6 +123,11 @@ CYCLECLOUD_USER_PUBKEY=$(jq -r .publicKey.value ccwOutputs.json)
CYCLECLOUD_STORAGE="$(jq -r .storageAccountName.value ccwOutputs.json)"
SLURM_CLUSTER_NAME=$(jq -r .clusterName.value ccwOutputs.json)
USE_INSIDERS_BUILD=$(jq -r .insidersBuild.value ccwOutputs.json)
INCLUDE_OOD=true
if [ $(jq -r .ood.value.ood_auth_method ccwOutputs.json) == 'disabled' ]; then
INCLUDE_OOD=false
fi

INSIDERS_BUILD_ARG=
if [ "$USE_INSIDERS_BUILD" == "true" ]; then
echo Using insiders build - we first need to uninstall cyclecloud8 and remove all files.
Expand Down Expand Up @@ -159,8 +163,8 @@ chown cycle_server:cycle_server /tmp/ccw_site_id.txt
chmod 664 /tmp/ccw_site_id.txt
mv /tmp/ccw_site_id.txt /opt/cycle_server/config/data/ccw_site_id.txt

# Create the project file
cat > /opt/cycle_server/config/data/ccw_project.txt <<EOF
# Create the project file, with a create and mv so it is atomic
cat > /tmp/ccw_project.txt <<EOF
AdType = "Cloud.Project"
Version = "$PROJECT_VERSION"
ProjectType = "scheduler"
Expand All @@ -169,8 +173,10 @@ AutoUpgrade = false
Name = "ccw"
EOF

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might as well chown it to cycle_server:cycle_server to be safe.

mv /tmp/ccw_project.txt /opt/cycle_server/config/data/

echo Waiting for records to be imported
timeout 360s bash -c 'until (! ls /opt/cycle_server/config/data/*.txt); do sleep 10; done'
timeout 360s bash -c 'until (! ls /opt/cycle_server/config/data/*.txt 2> /dev/null); do sleep 10; done'

echo Restarting cyclecloud so that new records take effect
cycle_server stop
Expand All @@ -181,13 +187,32 @@ curl -k https://localhost
cyclecloud initialize --batch --url=https://localhost --username=${CYCLECLOUD_USERNAME} --password=${CYCLECLOUD_PASSWORD} --verify-ssl=false --name=$SLURM_CLUSTER_NAME
echo "CC initialize successful"
sleep 5
# ensure machine types are loaded ASAP
cycle_server run_action 'Run:Application.Timer' -eq 'Name' 'plugin.azure.monitor_reference'

# get some useful work done while we are waiting.
echo Creating parameters files and downloading additional cluster-inits
(python3 create_cc_param.py ccw --dbPassword="${DATABASE_ADMIN_PASSWORD}") > slurm_params.json

cyclecloud import_template Slurm-Workspace -f slurm-workspace.txt
echo "CC import template successful"
cyclecloud create_cluster Slurm-Workspace $SLURM_CLUSTER_NAME -p slurm_params.json
echo "CC create_cluster successful"

# ensure machine types are loaded ASAP
cycle_server run_action 'Run:Application.Timer' -eq 'Name' 'plugin.azure.monitor_reference'
if [ $INCLUDE_OOD == true ]; then
(python3 create_cc_param.py ood) > ood_params.json

OOD_PROJECT_VERSION=$(jq -r .ood.value.version ccwOutputs.json)
ood_url="https://github.com/xpillons/ood4cc/releases/${OOD_PROJECT_VERSION}"

cyclecloud project fetch $ood_url ood
cd ood
cyclecloud project upload azure-storage
ood_template_name=OpenOnDemand_${OOD_PROJECT_VERSION}
cyclecloud import_template -c OpenOnDemand -f templates/OpenOnDemand.txt $ood_template_name
cd ..
cyclecloud create_cluster $ood_template_name OpenOnDemand -p ood_params.json
fi

# Wait for Azure.MachineType to be populated
while [ $(/opt/cycle_server/./cycle_server execute --format json "
Expand Down Expand Up @@ -219,7 +244,15 @@ timeout 360s bash -c 'until (! ls /opt/cycle_server/config/data/*.txt); do sleep
cyclecloud start_cluster "$SLURM_CLUSTER_NAME"
echo "CC start_cluster successful"
rm -f slurm_params.json
echo "Deleted input parameters file"
echo "Deleted Slurm input parameters file"

if [ $INCLUDE_OOD == true ]; then
cyclecloud start_cluster OpenOnDemand
echo "CC start_cluster for OpenOnDemand successful"
rm -f ood_params.json
echo "Deleted OOD input parameters file"
fi

#TODO next step: wait for scheduler node to be running, get IP address of scheduler + login nodes (if enabled)
popd
rm -f "$SECRETS_FILE_PATH"
Expand Down
5 changes: 5 additions & 0 deletions bicep/mainTemplate.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ param resourceGroup string
param sharedFilesystem types.sharedFilesystem_t
param additionalFilesystem types.additionalFilesystem_t = { type: 'disabled' }
param network types.vnet_t
param clusterInitSpecs types.cluster_init_param_t = {}
param slurmSettings types.slurmSettings_t = { version: '23.11.7-1', healthCheckEnabled: false }
param schedulerNode types.scheduler_t
param loginNodes types.login_t
Expand All @@ -26,6 +27,8 @@ param databaseConfig types.databaseConfig_t = { type: 'disabled' }
@description('The user-defined name of the cluster. Regex: ^[a-zA-Z0-9@_-]{3,}$')
param clusterName string = 'ccw'

param ood types.ood_t = { ood_auth_method: 'disabled' }

param infrastructureOnly bool = false
param insidersBuild bool = false

Expand Down Expand Up @@ -53,6 +56,8 @@ module makeCCWresources 'ccw.bicep' = {
sharedFilesystem: sharedFilesystem
additionalFilesystem: additionalFilesystem
network: network
clusterInitSpecs: clusterInitSpecs
ood: ood
slurmSettings: slurmSettings
schedulerNode: schedulerNode
loginNodes: loginNodes
Expand Down
58 changes: 58 additions & 0 deletions bicep/types.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -241,3 +241,61 @@ type databaseOutput_t = {
databaseUser: string?
url: string?
}

// Allowed values for the `target` list of a cluster-init entry.
type cluster_init_target_t = 'login' | 'scheduler' | 'htc' | 'hpc' | 'gpu' | 'dynamic' | 'ood'


// Cluster-init entry identified by a GitHub release URL.
// `spec` is optional; `target` lists the targets the entry applies to.
@export()
type github_cluster_init_t = {
type: 'gitHubReleaseURL'
gitHubReleaseURL: string
spec: string?
target: cluster_init_target_t[]
}

// Cluster-init entry identified by an explicit version instead of a URL.
// NOTE(review): presumably refers to a project that is already uploaded;
// the type carries no project name - confirm how consumers resolve it.
@export()
type prestaged_cluster_init_t = {
type: 'PreStaged'
spec: string?
version: string
target: cluster_init_target_t[]
}

// Either kind of cluster-init entry; the `type` property selects the variant.
@discriminator('type')
@export()
type cluster_init_t = github_cluster_init_t | prestaged_cluster_init_t

// Optional lists of cluster-init entries, one list per cluster key
// (`ccw` and `ood`).
@export()
type cluster_init_param_t = {
ccw: cluster_init_t[]?
ood: cluster_init_t[]?
}


// OOD settings variant for `ood_auth_method: 'Basic'`; no further properties.
type ood_basic_t = {
ood_auth_method: 'Basic'
}

// OOD settings variant for `ood_auth_method: 'LDAP'`; all LDAP properties
// are required strings.
type ood_ldap_t = {
ood_auth_method: 'LDAP'
ood_ldap_host: string
ood_ldap_bind_dn: string
ood_ldap_bind_pwd: string
ood_ldap_user_base_dn: string
ood_ldap_group_base_dn: string
}

// OOD settings variant for `ood_auth_method: 'Entra'`.
// NOTE(review): this variant declares `ood_ldap_bind_dn` and has no
// `ood_entra_client_id`, although create_cc_param.py reads
// `ood_entra_client_id` from the ood output - looks like a copy/paste
// slip; confirm the intended property name.
type ood_entra_t = {
ood_auth_method: 'Entra'
ood_entra_client_secret: string
ood_ldap_bind_dn: string
ood_entra_tenant_id: string
}

// OOD settings variant for `ood_auth_method: 'disabled'`; no further properties.
type ood_disabled_t = {
ood_auth_method: 'disabled'
}

// Open OnDemand settings; the `ood_auth_method` property selects the variant.
@discriminator('ood_auth_method')
@export()
type ood_t = ood_basic_t | ood_ldap_t | ood_entra_t | ood_disabled_t
Loading