
Commit a69b771

Authored Mar 21, 2025
Ume datamodule - allow downloads of HF datasets (#50)
* config
* train script
* train script
* train.sh
* remove train.sh
* download
* download=true
* env vars
* slurm
* data dir
* data dir
* data dir
1 parent 969afe2 commit a69b771

File tree: 6 files changed (+74, -8 lines)

 

.gitignore

+2 lines

@@ -36,3 +36,5 @@ dev
 outputs
 wandb
 lightning_logs
+
+.env

last.ckpt

+3 lines

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3305e5c9d1885db6c97e86372aeaeb02d7fec4d465bef5779a0d74bd0a7417a9
+size 137159028

slurm/README.md

+25 lines

@@ -0,0 +1,25 @@
+# Running LBSTER Jobs with SLURM
+
+This guide explains how to run a `lobster` training job using SLURM on a GPU-enabled system. It also describes which environment variables need to be exported for the job to run properly.
+
+# SLURM Job Script
+The provided example job script `scripts/train_ume.sh` is configured for training the `Ume` model on a GPU-enabled SLURM cluster.
+
+You will need to set specific environment variables to run the job. These will be read by the `Ume` hydra configuration file, which is located at `src/lobster/hydra_config/experiment/train_ume.yaml`.
+
+Variables:
+
+* `LOBSTER_DATA_DIR`: Path to the directory containing your training data. Datasets will be downloaded and cached to this directory (if `data.download` is set to `True` in the hydra configuration file).
+* `LOBSTER_RUNS_DIR`: Path to the directory where training results (model checkpoints, logs, etc.) will be stored.
+* `LOBSTER_USER`: The user entity for the logger (usually your wandb username).
+* `WANDB_BASE_URL`: The base URL for the Weights & Biases service. Optional - only needed if your wandb account is not on the default wandb server.
+
+Example:
+```bash
+export LOBSTER_DATA_DIR="/data/lobster/ume/data"
+export LOBSTER_RUNS_DIR="/data/lobster/ume/runs"
+export LOBSTER_USER=$(whoami)
+export WANDB_BASE_URL=https://your_org.wandb.io/
+```
+
+
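With these variables exported, the job can be submitted with `sbatch` from the repository root. The commit itself does not show the submission step, so the following is a minimal sketch: the `mkdir` is an assumption based on the `#SBATCH -o slurm/logs/%J.out` directive in the script below, and the job ID is illustrative.

```bash
# Assumed workflow, run from the repository root.
# The job script writes its output to slurm/logs/<job id>.out, so make sure
# that directory exists before submitting.
mkdir -p slurm/logs

# Submit the Ume training job; sbatch prints the assigned job ID.
sbatch slurm/scripts/train_ume.sh

# Follow the job output once it starts (replace 123456 with the printed job ID).
tail -f slurm/logs/123456.out
```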

slurm/scripts/train_ume.sh

+26 lines

@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+#SBATCH --partition gpu2
+#SBATCH --nodes 1
+#SBATCH --ntasks-per-node 2
+#SBATCH --gpus-per-node 2
+#SBATCH --cpus-per-task 8
+#SBATCH -o slurm/logs/%J.out
+# srun hostname
+
+nvidia-smi
+
+source .venv/bin/activate
+echo "SLURM_JOB_ID = ${SLURM_JOB_ID}"
+
+export WANDB_INSECURE_DISABLE_SSL=true
+export HYDRA_FULL_ERROR=1
+export PYTHONUNBUFFERED=1
+
+export TOKENIZERS_PARALLELISM=true
+
+srun -u --cpus-per-task 8 --cpu-bind=cores,verbose \
+    lobster_train experiment=train_ume \
+    logger.entity="$(whoami)"
+
+

src/lobster/data/_ume_datamodule.py

+8, -1 lines

@@ -50,7 +50,7 @@ def __post_init__(self):
             supported_splits={Split.TRAIN, Split.TEST},
             train_size=19_400_000,
             test_size=1_000_000,
-            kwargs={"download": False, "keys": ["smiles"]},
+            kwargs={"keys": ["smiles"]},
         ),
         DatasetInfo(
             name="Calm",
@@ -85,6 +85,7 @@ def __init__(
         tokenizer_max_length: int,
         *,
         datasets: None | Sequence[str] = None,
+        download: bool = False,
         root: Path | str | None = None,
         seed: int = 0,
         batch_size: int = 1,
@@ -104,6 +105,10 @@ def __init__(
         datasets : None | Sequence[str], optional
             List of dataset names to use. If None, all supported datasets will be used.
             Example: ["M320M", "Calm", "AMPLIFY", "Pinder"]
+        download: bool, optional
+            If True, will download the datasets first and stream locally.
+            Otherwise, streams directly from Hugging Face.
+            Downloaded datasets are cached in the `root` directory.
         root : Path | str | None, optional
             Root directory where the datasets are stored. If None, the default directory will be used.
         seed : int, optional
@@ -156,6 +161,7 @@ def __init__(
         self._stopping_condition = stopping_condition
         self._sample = sample
         self._weights = weights
+        self._download = download
 
         # Initialize tokenizer transforms for each modality
         tokenizer_instances = {
@@ -195,6 +201,7 @@ def _get_dataset(self, dataset_info: DatasetInfo, split: Split) -> Dataset:
 
         return dataset_class(
             root=self._root,
+            download=self._download,
             transform=transform,
             split=split.value,
             shuffle=(split == Split.TRAIN),
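Because the new `download` flag is also exposed in the hydra experiment config as `data.download` (see the config diff below), it can be toggled per run with a standard hydra command-line override instead of editing the YAML. A rough sketch, reusing the `lobster_train` entry point from the SLURM script above; the override values and the use of `$LOBSTER_DATA_DIR` are illustrative:

```bash
# Stream datasets directly from Hugging Face (no local copy is kept).
lobster_train experiment=train_ume data.download=false

# Download the datasets into $LOBSTER_DATA_DIR first, then stream from the local cache.
lobster_train experiment=train_ume data.download=true data.root="$LOBSTER_DATA_DIR"
```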

src/lobster/hydra_config/experiment/train_ume.yaml

+10, -7 lines

@@ -13,20 +13,20 @@ compile: false
 
 data:
   _target_: lobster.data.UmeLightningDataModule
-  root: ${paths.root_dir}/data
+  root: ${oc.env:LOBSTER_DATA_DIR}
   datasets: ["M320M", "Calm", "AMPLIFY", "Pinder"]
   batch_size: 128
   tokenizer_max_length: ${model.max_length}
   pin_memory: true
   shuffle_buffer_size: 1000
   num_workers: 4
   seed: 0
-  sample: false # if false, uses RoundRobinConcatIterableDataset, else MultiplexedSamplingDataset
-  stopping_condition: min # min or max, used only if sample is false
-  weights: null # used only if sample is true, if null and sample is true, samples with weights based on dataset sizes
+  download: true
+  sample: true
+  weights: null
 
 paths:
-  root_dir: ./runs
+  root_dir: ${oc.env:LOBSTER_RUNS_DIR}
 
 trainer:
   max_steps: 50_000
@@ -39,7 +39,7 @@ trainer:
   devices: auto
 
 model:
-  model_name: UME_mini
+  model_name: UME_small
   vocab_size: 1472
   pad_token_id: 1
   cls_token_id: 0
@@ -59,10 +59,13 @@ model:
 callbacks:
   moleculeace_linear_probe:
     max_length: ${model.max_length}
+    run_every_n_epochs: 1
   calm_linear_probe:
     max_length: ${model.max_length}
+    run_every_n_epochs: 1
 
 logger:
   name: ume_${model.model_name}_${now:%Y-%m-%d_%H-%M-%S}
   project: lobster
-  group: ume-dev-${now:%Y-%m-%d-%H-%M-%S}
+  group: ume-dev-${now:%Y-%m-%d-%H-%M-%S}
+  entity: ${oc.env:LOBSTER_USER}
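Since `data.root`, `paths.root_dir`, and `logger.entity` are now resolved through OmegaConf's `oc.env:` resolver, the corresponding environment variables have to be set before launch even for local, non-SLURM runs (with no default supplied, an unset variable should raise an interpolation error when the config is resolved). A minimal local launch might look like the sketch below; the paths are illustrative, not part of this commit:

```bash
# Environment variables consumed by ${oc.env:...} in train_ume.yaml (illustrative values).
export LOBSTER_DATA_DIR="$HOME/lobster/data"
export LOBSTER_RUNS_DIR="$HOME/lobster/runs"
export LOBSTER_USER=$(whoami)

# Launch the same experiment outside SLURM.
lobster_train experiment=train_ume
```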
