diff --git a/DATA.md b/DATA.md index 643b538..92a22dd 100644 --- a/DATA.md +++ b/DATA.md @@ -10,6 +10,8 @@ We list the available data used in the current version of CrossOver in the table | ------------ | ----------------------------- | ----------------------------------- | -------------------------- | -------------------------- | | ScanNet | `[point, rgb, cad, referral]` | `[point, rgb, floorplan, referral]` | ❌ | ✅ | | 3RScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ✅ | ✅ | +| ARKitScenes | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | +| MultiScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | We detail data download and release instructions for preprocessing with scripts for ScanNet + 3RScan. @@ -110,4 +112,69 @@ Scan3R/ | │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) | │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) | └── ... -``` \ No newline at end of file +``` +### MultiScan + +#### Running preprocessing scripts +Adjust the path parameters of `MultiScan` in the config files under `configs/preprocess`. Run the following (after changing the `--config-path` in the bash file): + +```bash +$ bash scripts/preprocess/process_multiscan.sh +``` + +Our script for MultiScan dataset performs the following additional processing: + +- 3D-to-2D projection for 2D segmentation and stores as `gt-projection-seg.pt` for each scan. + +Post running preprocessing, the data structure should look like the following: + +``` +MultiScan/ +├── objects_chunked/ (object data chunked into hdf5 format for instance baseline training) +| ├── train_objects.h5 +| └── val_objects.h5 +├── scans/ +| ├── scene_00000_00/ +| │ ├── gt-projection-seg.pt -> 3D-to-2D projected data consisting of framewise 2D instance segmentation +| │ ├── data1D.pt -> all 1D data + encoded (object referrals + BLIP features) +| │ ├── data2D.pt -> all 2D data + encoded (RGB + floorplan + DinoV2 features) +| │ ├── data2D_all_images.pt (RGB features of every image of every scan) +| │ ├── data3D.pt -> all 3D data + encoded (Point Cloud + I2PMAE features - object only) +| │ ├── object_id_to_label_id_map.pt -> Instance ID to NYU40 Label mapped +| │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) +| │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) +| └── ... +``` + +### ARKitScenes + +#### Running preprocessing scripts +Adjust the path parameters of `ARKitScenes` in the config files under `configs/preprocess`. Run the following (after changing the `--config-path` in the bash file): + +```bash +$ bash scripts/preprocess/process_arkit.sh +``` + +Our script for ARKitScenes dataset performs the following additional processing: + +- 3D-to-2D projection for 2D segmentation and stores as `gt-projection-seg.pt` for each scan. 
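+
+As a quick sanity check after preprocessing, the per-frame instance-ID maps in this file can be loaded and inspected. The sketch below is illustrative only: it assumes the projection was written with `np.savez_compressed` (keys are frame indices, values are `H x W` arrays of instance IDs, with `0` meaning unlabelled) and uses the example scan ID `40753679`; adjust the path and extension (`.pt` vs `.npz`) to match your local output.
+
+```python
+# Illustrative sketch (assumed paths): inspect the framewise 2D instance
+# segmentation produced by the 3D-to-2D projection step.
+import numpy as np
+
+proj = np.load('ARKitScenes/scans/40753679/gt-projection-seg.npz', allow_pickle=True)
+for frame_idx in list(proj.files)[:3]:
+    obj_id_map = proj[frame_idx]   # per-pixel instance IDs for this frame
+    print(frame_idx, obj_id_map.shape, np.unique(obj_id_map))
+```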
+ +Post running preprocessing, the data structure should look like the following: + +``` +ARKitScenes/ +├── objects_chunked/ (object data chunked into hdf5 format for instance baseline training) +| ├── train_objects.h5 +| └── val_objects.h5 +├── scans/ +| ├── 40753679/ +| │ ├── gt-projection-seg.pt -> 3D-to-2D projected data consisting of framewise 2D instance segmentation +| │ ├── data1D.pt -> all 1D data + encoded (object referrals + BLIP features) +| │ ├── data2D.pt -> all 2D data + encoded (RGB + floorplan + DinoV2 features) +| │ ├── data2D_all_images.pt (RGB features of every image of every scan ) +| │ ├── data3D.pt -> all 3D data + encoded (Point Cloud + I2PMAE features - object only) +| │ ├── object_id_to_label_id_map.pt -> Instance ID to NYU40 Label mapped +| │ ├── objectsDataMultimodal.pt -> object data combined from data1D.pt + data2D.pt + data3D.pt (for easier loading) +| │ └── sel_cams_on_mesh.png (visualisation of the cameras selected for computing RGB features per scan) +| └── ... +``` diff --git a/README.md b/README.md index ab1f152..5f80fa0 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,9 @@ See [DATA.MD](DATA.md) for detailed instructions on data download, preparation a | ------------ | ----------------------------- | ----------------------------------- | -------------------------- | -------------------------- | | Scannet | `[point, rgb, cad, referral]` | `[point, rgb, floorplan, referral]` | ❌ | ✅ | | 3RScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ✅ | ✅ | +| ARKitScenes | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | +| MultiScan | `[point, rgb, referral]` | `[point, rgb, referral]` | ❌ | ✅ | + > To run our demo, you only need to download generated embedding data; no need for any data preprocessing. @@ -136,7 +139,7 @@ Various configurable parameters: - `--database_path`: Path to the precomputed embeddings of the database scenes downloaded before (eg: `./release_data/embed_scannet.pt`). - `--query_modality`: Modality of the query scene, Options: `point`, `rgb`, `floorplan`, `referral` - `--database_modality`: Modality used for retrieval. Same options as above. -- `--ckpt`: Path to the pre-trained scene crossover model checkpoint (details [here](#checkpoints)), example_path: `./checkpoints/scene_crossover_scannet+scan3r.pth/`). +- `--ckpt`: Path to the pre-trained scene crossover model checkpoint (details [here](#checkpoints)), example_path: `./checkpoints/scene_crossover_scannet+scan3r.pth/`. For embedding and pre-trained model download, refer to [generated embedding data](DATA.md#generated-embedding-data) and [checkpoints](#checkpoints) sections. diff --git a/TRAIN.md b/TRAIN.md index ffa4938..337cdef 100644 --- a/TRAIN.md +++ b/TRAIN.md @@ -21,7 +21,7 @@ $ bash scripts/train/train_instance_crossover.sh ``` #### Train Scene Retrieval Pipeline -Adjust path/configuration parameters in `configs/train/train_scene_crossover.yaml`. You can also add your customised dataset or choose to train on Scannet & 3RScan or either. Run the following: +Adjust path/configuration parameters in `configs/train/train_scene_crossover.yaml`. You can also add your customised dataset or choose to train on Scannet, 3RScan, MultiScan, & ARKitScenes or any combination of the same. 
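+
+A customised dataset typically follows the same registration pattern as the MultiScan and ARKitScenes classes under `data/datasets/`. The sketch below is a minimal, hypothetical example (the `MyDataset` name is a placeholder): it subclasses `ScanBase`, registers itself, and reads its scan IDs from `files/{split}_scans.txt`; the dataset is then enabled by listing its name under `train`/`val` in the task section of the YAML config, with a matching entry (paths, processors) under `data`.
+
+```python
+# Hypothetical custom dataset, mirroring data/datasets/multiscan.py.
+import os.path as osp
+import numpy as np
+from omegaconf import DictConfig
+
+from ..build import DATASET_REGISTRY
+from .scanbase import ScanBase
+
+@DATASET_REGISTRY.register()
+class MyDataset(ScanBase):
+    def __init__(self, data_config: DictConfig, split: str) -> None:
+        super().__init__(data_config, split)
+        # scan IDs are listed one per line in files/{split}_scans.txt
+        filepath = osp.join(self.files_dir, '{}_scans.txt'.format(self.split))
+        self.scan_ids = np.genfromtxt(filepath, dtype=str)
+```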
Run the following: ```bash $ bash scripts/train/train_scene_crossover.sh diff --git a/common/load_utils.py b/common/load_utils.py index cd06a54..32ad0c7 100644 --- a/common/load_utils.py +++ b/common/load_utils.py @@ -50,6 +50,20 @@ def write_json(data_dict: Any, filename: str) -> None: with open(filename, "w") as outfile: outfile.write(json_obj) +def load_npz_as_dict(filename: str) -> dict: + with np.load(filename, allow_pickle=True) as npz: + if isinstance(npz, np.lib.npyio.NpzFile): + out = {} + for k in npz.files: + val = npz[k] + if (isinstance(val, np.ndarray) and + val.dtype == object and + val.shape == ()): + out[k] = val.item() + else: + out[k] = val + return out + def get_print_format(value: Any) -> str: """Determines the appropriate format string for a given value.""" if isinstance(value, int): diff --git a/configs/evaluation/eval_instance.yaml b/configs/evaluation/eval_instance.yaml index a14c626..eef264c 100644 --- a/configs/evaluation/eval_instance.yaml +++ b/configs/evaluation/eval_instance.yaml @@ -33,7 +33,7 @@ data : voxel_size : 0.02 Scan3R: - base_dir : /drive/datasets/Scan3R/ + base_dir : /media/sayan/internal/datasets/Scan3R/ process_dir : ${data.process_dir}/Scan3R/ processor3D : Scan3R3DProcessor processor2D : Scan3R2DProcessor @@ -43,14 +43,33 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/internal/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + + MultiScan: + base_dir : /media/sayan/internal/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : InferenceObjectRetrieval InferenceObjectRetrieval: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] - ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r.pth - + ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan.pth inference_module: ObjectRetrieval diff --git a/configs/evaluation/eval_scene.yaml b/configs/evaluation/eval_scene.yaml index 0f1b6f2..fc19a4e 100644 --- a/configs/evaluation/eval_scene.yaml +++ b/configs/evaluation/eval_scene.yaml @@ -33,7 +33,7 @@ data : voxel_size : 0.02 Scan3R: - base_dir : /drive/datasets/Scan3R/ + base_dir : /media/sayan/internal/datasets/Scan3R/ process_dir : ${data.process_dir}/Scan3R/ processor3D : Scan3R3DProcessor processor2D : Scan3R2DProcessor @@ -43,13 +43,32 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/internal/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + max_object_len : 150 + voxel_size : 0.02 + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + MultiScan: + base_dir : /media/sayan/internal/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 
'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : InferenceSceneRetrieval InferenceSceneRetrieval: val : [Scannet] modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'referral', 'floorplan'] #, 'point'] - ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r.pth + ckpt_path : /drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r+multiscan.pth inference_module: SceneRetrieval model: diff --git a/configs/preprocess/process_1d.yaml b/configs/preprocess/process_1d.yaml index c74b6bc..aaecd1c 100644 --- a/configs/preprocess/process_1d.yaml +++ b/configs/preprocess/process_1d.yaml @@ -17,7 +17,7 @@ data: aggre_subfix : _vh_clean.aggregation.json Scan3R: - base_dir : /drive/datasets/Scan3R/ + base_dir : /media/sayan/internal/datasets/Scan3R/ process_dir : ${data.process_dir}/Scan3R/ processor3D : Scan3R3DProcessor processor2D : Scan3R2DProcessor @@ -25,6 +25,21 @@ data: label_filename : labels.instances.align.annotated.v2.ply skip_frames : 1 + ARKitScenes: + base_dir : /media/sayan/internal/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + skip_frames : 1 + MultiScan: + base_dir : /media/sayan/internal/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + skip_frames : 1 + Shapenet: base_dir : /drive/datasets/Shapenet/ShapeNetCore.v2/ diff --git a/configs/preprocess/process_2d.yaml b/configs/preprocess/process_2d.yaml index 74898cd..5a6e8e3 100644 --- a/configs/preprocess/process_2d.yaml +++ b/configs/preprocess/process_2d.yaml @@ -19,7 +19,7 @@ data: skip_frames : 5 Scan3R: - base_dir : /drive/datasets/Scan3R/ + base_dir : /media/sayan/internal/datasets/Scan3R/ process_dir : ${data.process_dir}/Scan3R/ processor3D : Scan3R3DProcessor processor2D : Scan3R2DProcessor @@ -27,6 +27,22 @@ data: label_filename : labels.instances.align.annotated.v2.ply skip_frames : 1 + ARKitScenes: + base_dir : /media/sayan/internal/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + skip_frames : 1 + + MultiScan: + base_dir : /media/sayan/internal/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + skip_frames : 1 + modality_info: 1D : feature_extractor: @@ -60,4 +76,4 @@ task: name : Preprocess Preprocess : modality : '2D' - splits : ['val'] \ No newline at end of file + splits : ['train', 'val'] \ No newline at end of file diff --git a/configs/preprocess/process_3d.yaml b/configs/preprocess/process_3d.yaml index 3d15f23..86d5e68 100644 --- a/configs/preprocess/process_3d.yaml +++ b/configs/preprocess/process_3d.yaml @@ -12,18 +12,32 @@ data: layout_dir : /drive/datasets/SceneCAD/ process_dir : ${data.process_dir}/Scannet/ processor3D : Scannet3DProcessor - mesh_subfix : _vh_clean_2.labels.ply + mesh_subfix : _vh_clean_2.ply seg_subfix : _vh_clean_2.0.010000.segs.json aggre_subfix : _vh_clean.aggregation.json Scan3R: - base_dir : /drive/datasets/Scan3R/ + base_dir : /media/sayan/internal/datasets/Scan3R/ process_dir : ${data.process_dir}/Scan3R/ processor3D : Scan3R3DProcessor 
processor2D : Scan3R2DProcessor processor1D : Scan3R1DProcessor label_filename : labels.instances.align.annotated.v2.ply + ARKitScenes: + base_dir : /media/sayan/internal/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + MultiScan: + base_dir : /media/sayan/internal/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + skip_frames : 1 + modality_info: 1D : feature_extractor: diff --git a/configs/preprocess/process_multimodal.yaml b/configs/preprocess/process_multimodal.yaml index 3eb5ace..fcbbc7a 100644 --- a/configs/preprocess/process_multimodal.yaml +++ b/configs/preprocess/process_multimodal.yaml @@ -18,7 +18,7 @@ data: avail_modalities : ['point', 'cad', 'rgb', 'referral'] Scan3R: - base_dir : /drive/datasets/Scan3R + base_dir : /media/sayan/internal/datasets/Scan3R process_dir : ${data.process_dir}/Scan3R chunked_dir : ${data.process_dir}/Scan3R/objects_chunked/ processor3D : Scan3R3DProcessor @@ -28,6 +28,24 @@ data: skip_frames : 1 avail_modalities : ['point', 'rgb', 'referral'] + ARKitScenes: + base_dir : /media/sayan/internal/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + + MultiScan: + base_dir : /media/sayan/internal/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : Scan3R3DProcessor + processor2D : Scan3R2DProcessor + processor1D : Scan3R1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + modality_info: 1D : feature_extractor: diff --git a/configs/train/train_instance_baseline.yaml b/configs/train/train_instance_baseline.yaml index 8b6bc89..bd630d5 100644 --- a/configs/train/train_instance_baseline.yaml +++ b/configs/train/train_instance_baseline.yaml @@ -44,6 +44,27 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/Expansion/data/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/Expansion/data/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : ObjectLevelGrounding ObjectLevelGrounding : diff --git a/configs/train/train_instance_crossover.yaml b/configs/train/train_instance_crossover.yaml index c54257d..b93ab60 100644 --- a/configs/train/train_instance_crossover.yaml +++ b/configs/train/train_instance_crossover.yaml @@ -33,7 +33,7 @@ data : voxel_size : 0.02 Scan3R: - base_dir : /drive/datasets/Scan3R/ + base_dir : /media/sayan/internal/datasets/Scan3R/ process_dir : ${data.process_dir}/Scan3R/ chunked_dir : 
${data.process_dir}/Scan3R/objects_chunked/ processor3D : Scan3R3DProcessor @@ -44,12 +44,33 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/internal/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/internal/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : SceneLevelGrounding SceneLevelGrounding : modalities : ['rgb', 'point', 'cad', 'referral'] - train : [Scannet, Scan3R] - val : [Scannet, Scan3R] + train : [Scannet, Scan3R, MultiScan, ARKitScenes] + val : [Scannet, Scan3R, MultiScan, ARKitScenes] trainer: GroundingTrainer diff --git a/configs/train/train_scene_crossover.yaml b/configs/train/train_scene_crossover.yaml index f9459da..4f75d80 100644 --- a/configs/train/train_scene_crossover.yaml +++ b/configs/train/train_scene_crossover.yaml @@ -33,7 +33,7 @@ data : voxel_size : 0.02 Scan3R: - base_dir : /drive/datasets/Scan3R/ + base_dir : /media/sayan/internal/datasets/Scan3R/ process_dir : ${data.process_dir}/Scan3R/ chunked_dir : ${data.process_dir}/Scan3R/objects_chunked processor3D : Scan3R3DProcessor @@ -44,14 +44,35 @@ data : max_object_len : 150 voxel_size : 0.02 + ARKitScenes: + base_dir : /media/sayan/internal/datasets/ARKitScenes + process_dir : ${data.process_dir}/ARKitScenes/ + chunked_dir : ${data.process_dir}/ARKitScenes/objects_chunked + processor3D : ARKitScenes3DProcessor + processor2D : ARKitScenes2DProcessor + processor1D : ARKitScenes1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + MultiScan: + base_dir : /media/sayan/internal/datasets/MultiScan + process_dir : ${data.process_dir}/MultiScan/ + chunked_dir : ${data.process_dir}/MultiScan/objects_chunked + processor3D : MultiScan3DProcessor + processor2D : MultiScan2DProcessor + processor1D : MultiScan1DProcessor + avail_modalities : ['point', 'cad', 'rgb', 'referral'] + max_object_len : 150 + voxel_size : 0.02 + task: name : UnifiedTrain UnifiedTrain : modalities : ['rgb', 'point', 'cad', 'referral'] scene_modalities : ['rgb', 'point', 'floorplan', 'referral'] - train : [Scannet, Scan3R, MultiScan] - val : [Scannet, Scan3R, MultiScan] - object_enc_ckpt : /drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r+multiscan.pth + train : [Scannet, Scan3R, MultiScan, ARKitScenes] + val : [Scannet, Scan3R, MultiScan, ARKitScenes] + object_enc_ckpt : /drive/dumps/multimodal-spaces/runs/new_runs/instance_crossover_scannet+scan3r+multiscan+arkitscenes.pth trainer: UnifiedTrainer diff --git a/data/datasets/__init__.py b/data/datasets/__init__.py index 9a1b744..9c7b829 100644 --- a/data/datasets/__init__.py +++ b/data/datasets/__init__.py @@ -1,2 +1,4 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * +from .multiscan import * diff --git a/data/datasets/arkit.py b/data/datasets/arkit.py new file 
mode 100644 index 0000000..4944dae --- /dev/null +++ b/data/datasets/arkit.py @@ -0,0 +1,41 @@ +import os.path as osp +import numpy as np +from typing import List, Any +from omegaconf import DictConfig +import pandas as pd +from ..build import DATASET_REGISTRY +from .scanbase import ScanObjectBase, ScanBase + +@DATASET_REGISTRY.register() +class ARKitScenesObject(ScanObjectBase): + """ARKitScenes dataset class for instance level baseline""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + +@DATASET_REGISTRY.register() +class ARKitScenes(ScanBase): + """ARKitScenes dataset class""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(self.split)) + self.scan_ids = np.genfromtxt(filepath, dtype = str) + + def get_temporal_scan_pairs(self): + """Groups scans into temporal pairs based on shared visit_id.""" + csv_path=osp.join(self.files_dir,'3dod_train_val_splits.csv') + df = pd.read_csv(csv_path) + + df = df[df["visit_id"].notna()] + + grouped_scans = df.groupby("visit_id")["video_id"].apply(list).to_dict() + + scene_pairs = [] + for video_ids in grouped_scans.values(): + if len(video_ids) > 1: + ref_scan_id = video_ids[0] # First video_id as reference + rescan_list = [{"scan_id": rescan_id} for rescan_id in video_ids[1:]] + + scene_pairs.append([ref_scan_id, rescan_list]) + + return scene_pairs \ No newline at end of file diff --git a/data/datasets/multiscan.py b/data/datasets/multiscan.py new file mode 100644 index 0000000..a43d8a1 --- /dev/null +++ b/data/datasets/multiscan.py @@ -0,0 +1,42 @@ +import os.path as osp +import numpy as np +from typing import List, Any +from omegaconf import DictConfig + +from ..build import DATASET_REGISTRY +from .scanbase import ScanObjectBase, ScanBase + +@DATASET_REGISTRY.register() +class MultiScanObject(ScanObjectBase): + """MultiScan dataset class for instance level baseline""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + +@DATASET_REGISTRY.register() +class MultiScan(ScanBase): + """MultiScan dataset class""" + def __init__(self, data_config: DictConfig, split: str) -> None: + super().__init__(data_config, split) + + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(self.split)) + self.scan_ids = np.genfromtxt(filepath, dtype = str) + + def get_temporal_scan_pairs(self) -> List[List[Any]]: + """Gets pairs of temporal scans from the dataset.""" + scene_pairs = [] + + ref_scan_ids = [scan_id for scan_id in self.scan_ids if scan_id.endswith('00')] + + for ref_scan_id in ref_scan_ids: + rescan_list = [] + + for rescan_id in self.scan_ids: + rescan = {} + if rescan_id.startswith(ref_scan_id.split('_')[0]) and rescan_id != ref_scan_id: + rescan['scan_id'] = rescan_id + rescan_list.append(rescan) + if len(rescan_list) == 0: + continue + + scene_pairs.append([ref_scan_id, rescan_list]) + return scene_pairs \ No newline at end of file diff --git a/data/datasets/scanbase.py b/data/datasets/scanbase.py index 7f8d3fe..ab75393 100644 --- a/data/datasets/scanbase.py +++ b/data/datasets/scanbase.py @@ -10,9 +10,11 @@ from omegaconf import DictConfig from typing import List, Dict, Any +from common.load_utils import load_npz_as_dict from ..transforms import get_transform from ..data_utils import pad_tensors + class ScanObjectBase(Dataset): """Base Dataset class for instance level training""" def __init__(self, data_config: 
DictConfig, split: str) -> None: @@ -131,11 +133,10 @@ def __getitem__(self, index: int) -> Dict[str, Any]: scan_process_dir = osp.join(self.process_dir, 'scans', scan_id) - scan_objects_data = torch.load(osp.join(scan_process_dir, 'objectsDataMultimodal.pt')) - - scandata_1d = torch.load(osp.join(scan_process_dir, 'data1D.pt')) - scandata_2d = torch.load(osp.join(scan_process_dir, 'data2D.pt')) - scandata_3d = torch.load(osp.join(scan_process_dir, 'data3D.pt')) + scan_objects_data = load_npz_as_dict(osp.join(scan_process_dir, 'objectsDataMultimodal.npz')) + scandata_1d = load_npz_as_dict(osp.join(scan_process_dir, 'data1D.npz')) + scandata_2d = load_npz_as_dict(osp.join(scan_process_dir, 'data2D.npz')) + scandata_3d = load_npz_as_dict(osp.join(scan_process_dir, 'data3D.npz')) # Point Cloud Data -- Scene points, feats, scene_label = scandata_3d['scene']['pcl_coords'], scandata_3d['scene']['pcl_feats'], scandata_3d['scene']['scene_label'] @@ -152,9 +153,9 @@ def __getitem__(self, index: int) -> Dict[str, Any]: _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True) coords, feats = points[sel], feats[sel] - # Get coords, shift to center + # Get coords coords = np.floor(coords / self.voxel_size) - coords-=coords.min(0) + coords -= coords.min(0) # Object Data scene_dict = {} diff --git a/prepare_data/README.md b/prepare_data/README.md index dba34f5..c369156 100644 --- a/prepare_data/README.md +++ b/prepare_data/README.md @@ -5,6 +5,8 @@ This document provides instructions for pre-processing different datasets, including - ScanNet - 3RScan +- ARKitScenes +- MultiScan ## Prerequisites @@ -16,20 +18,17 @@ Before you begin, simply activate the `crossover` conda environment. #### Original Data - **ScanNet**: Download ScanNet v2 data from the [official website](https://github.com/ScanNet/ScanNet), we use the official training and validation split from [here](https://github.com/ScanNet/ScanNet/tree/master/Tasks/Benchmark). -- **3RScan**: Download 3RScan dataset from the [official website](https://github.com/WaldJohannaU/3RScan), we use the official (full list of scan ids including reference + rescans) training split from [here](https://campar.in.tum.de/public_datasets/3RScan/train_scans.txt) and validation split from [here](https://campar.in.tum.de/public_datasets/3RScan/val_scans.txt). - - Download `3RScan.json` from [here](https://campar.in.tum.de/public_datasets/3RScan/3RScan.json) and `objects.json` from [here](https://campar.in.tum.de/public_datasets/3DSSG/3DSSG/objects.json). - - Download the class mapping file `3RScan.v2 Semantic Classes - Mapping.csv` from [here](https://docs.google.com/spreadsheets/d/1eRTJ2M9OHz7ypXfYD-KTR1AIT-CrVLmhJf8mxgVZWnI/edit?gid=0#gid=0). +- **3RScan**: Download 3RScan dataset from the [official website](https://github.com/WaldJohannaU/3RScan). -- **ShapeNet**: Download ShapenetCore dataset from the [official Huggingface release](https://huggingface.co/datasets/ShapeNet/ShapeNetCore) and unzip. +- **MultiScan**: Download MultiScan dataset from the [official website](https://github.com/smartscenes/multiscan). + +- **ARKitScenes**: Download ARKitScenes dataset from the [official website](https://github.com/apple/ARKitScenes). -#### Referral and CAD annotations -We use [SceneVerse](https://scene-verse.github.io/) for instance referrals (ScanNet & 3RScan) and [Scan2CAD](https://github.com/skanti/Scan2CAD) for CAD annotations (ScanNet). 
+- **ShapeNet**: Download ShapenetCore dataset from the [official Huggingface release](https://huggingface.co/datasets/ShapeNet/ShapeNetCore) and unzip. -- **SceneVerse** - Download the Scannet and 3RScan data under `annotations/refer` from the [official website](https://scene-verse.github.io/). -- **Scan2CAD** - Download `full_annotations.json` from the [official website](https://github.com/skanti/Scan2CAD?tab=readme-ov-file#download-dataset). +### Download Referral and CAD annotations +We use [SceneVerse](https://scene-verse.github.io/) for instance referrals (ScanNet, 3RScan, MultiScan, & ARKitScenes) and [Scan2CAD](https://github.com/skanti/Scan2CAD) for CAD annotations (ScanNet). Exact instructions for data setup below. -### Prepare The Data -Exact instructions for data setup + preparation below: #### ScanNet 1. Run the following to extract ScanNet data @@ -107,3 +106,81 @@ Scan3R/ └── sceneverse └── ssg_ref_rel2_template.json ``` + +#### ARKitScenes +1. Download ARKitScenes 3dod data using the following command: + +```bash +python ARKitScenes/download_data.py 3dod --video_id_csv PATH_TO_3dod_train_val_splits.csv --download_dir PATH_TO_ARKITSCENES +``` +The files mentioned in the above command - ```download_data.py``` and ```3dod_train_val_splits.csv``` can be found in the official repository [here](https://github.com/apple/ARKitScenes), along with more detailed instructions and descriptions of the data. + +2. Once the data is downloaded, run the following to organize it as per our requirements. + + ```bash +cd ARKitScenes +mv 3dod/Training/* scans +mv 3dod/Validation/* scans +``` + +3. Move the relevant files from `Sceneverse` and `ARKitScenes` under `files/`. + +Once completed, the data structure would look like the following: +``` +ARKitScenes/ +├── scans/ +│ ├── 40753679/ +│ │ ├── 40753679_frames/ +│ │ │ ├── lowres_depth/ (folder containing depth images) +│ │ │ ├── lowres_wide/ (folder containing rgb images) +│ │ │ ├── lowres_wide_intrinsics/ (folder containing frame wise camera intrinsics) +│ │ │ ├── lowres_wide.traj (camera trajectory) +│ │ ├── 40753679_3dod_annotation.json +│ │ ├── 40753679_3dod_mesh.ply +| └── +└── files + ├── scannetv2-labels.combined.tsv + ├── train_scans.txt + ├── val_scans.txt + ├── metadata.csv + ├── 3dod_train_val_splits.csv + └── sceneverse + └── ssg_ref_rel2_template.json +``` + +#### MultiScan +1. Download MultiScan data into MultiScan/scenes and run the following to extract MultiScan data + + ```bash +cd MultiScan/scenes +unzip '*.zip' +rm -rf '*.zip' +``` +3. 
To generate sequence of RGB images and corresponding camera poses from the ```.mp4``` file, run the follwing +```bash +cd prepare_data/multiscan +python preprocess_2d_multiscan.py --base_dir PATH_TO_MULTISCAN --frame_interval {frame_interval} +``` +Once completed, the data structure would look like the following: +``` +MultiScan/ +├── scenes/ +│ ├── scene_00000_00/ +│ │ ├── sequence/ (folder containing rgb images at specified frame interval) +| | ├── frame_ids.txt +│ │ ├── scene_00000_00.annotations.json +│ │ ├── scene_00000_00.jsonl +│ │ ├── scene_00000_00.confidence.zlib +│ │ ├── scene_00000_00.mp4 +│ │ ├── poses.jsonl +│ │ ├── scene_00000_00.ply +│ │ ├── scene_00000_00.align.json +│ │ ├── scene_00000_00.json +| └── +└── files + ├── scannetv2-labels.combined.tsv + ├── train_scans.txt + ├── test_scans.txt + └── sceneverse + └── ssg_ref_rel2_template.json +``` \ No newline at end of file diff --git a/prepare_data/multiscan/preprocess_2d_multiscan.py b/prepare_data/multiscan/preprocess_2d_multiscan.py new file mode 100644 index 0000000..da89da1 --- /dev/null +++ b/prepare_data/multiscan/preprocess_2d_multiscan.py @@ -0,0 +1,94 @@ +import os +import cv2 +import json +import jsonlines +import argparse +import os.path as osp +import shutil + +def process_scene_folders(base_dir, frame_interval=10): + base_dir=osp.join(base_dir, 'scenes') + scene_folders = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))] + + for scene_folder in scene_folders: + scene_path = os.path.join(base_dir, scene_folder) + video_path = os.path.join(scene_path, f"{scene_folder}.mp4") + jsonl_path = os.path.join(scene_path, f"{scene_folder}.jsonl") + frame_output_dir = os.path.join(scene_path, "sequence") + frame_ids_txt_path = os.path.join(scene_path, "frame_ids.txt") + metadata_output_path = os.path.join(scene_path, "poses.jsonl") + + if os.path.exists(frame_output_dir): + shutil.rmtree(frame_output_dir) + os.makedirs(frame_output_dir) + + if not os.path.exists(video_path): + print(f"Video file not found: {video_path}") + continue + if not os.path.exists(jsonl_path): + print(f"Metadata file not found: {jsonl_path}") + continue + + print(f"Processing scene: {scene_folder}") + + frame_ids = extract_frames_from_video(video_path, frame_output_dir, frame_interval) + + with open(frame_ids_txt_path, "w") as f: + for frame_id in frame_ids: + f.write(f"{frame_id}\n") + + selected_metadata = extract_metadata_by_line_number(jsonl_path, frame_ids) + + with jsonlines.open(metadata_output_path, mode="w") as writer: + for entry in selected_metadata: + writer.write(entry) + + print(f"Finished processing scene: {scene_folder}") + + +def extract_frames_from_video(video_path, output_dir, frame_interval): + + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + raise ValueError(f"Could not open video file: {video_path}") + + frame_ids = [] + frame_count = 0 + + while True: + ret, frame = cap.read() + if not ret: + break # End of video + + if frame_count % frame_interval == 0: + frame_id = frame_count + frame_ids.append(frame_id) + output_path = os.path.join(output_dir, f"frame-{frame_id}.color.jpg") + cv2.imwrite(output_path, frame) # Save frame as an image + + frame_count += 1 + + cap.release() + return frame_ids + + +def extract_metadata_by_line_number(jsonl_path, line_numbers): + + selected_metadata = [] + + with jsonlines.open(jsonl_path) as reader: + for line_idx, entry in enumerate(reader): + if line_idx in line_numbers: + entry["frame_id"] = line_idx + selected_metadata.append(entry) + + return 
selected_metadata + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process scene folders.") + parser.add_argument("--base_dir", type=str, required=True, help="Base dataset directory.") + parser.add_argument("--frame_interval", type=int, default=10, help="Interval for saving frames.") + args = parser.parse_args() + + process_scene_folders(args.base_dir, args.frame_interval) \ No newline at end of file diff --git a/preprocess/build.py b/preprocess/build.py index 551d97f..fb3445e 100644 --- a/preprocess/build.py +++ b/preprocess/build.py @@ -3,5 +3,6 @@ PROCESSOR_REGISTRY = Registry("Processor") def build_processor(processor_name, data_config, modality_config, split): + print(f"Building processor: {processor_name}") processor = PROCESSOR_REGISTRY.get(processor_name)(data_config, modality_config, split) return processor \ No newline at end of file diff --git a/preprocess/feat1D/__init__.py b/preprocess/feat1D/__init__.py index 9a1b744..9c7b829 100644 --- a/preprocess/feat1D/__init__.py +++ b/preprocess/feat1D/__init__.py @@ -1,2 +1,4 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * +from .multiscan import * diff --git a/preprocess/feat1D/arkit.py b/preprocess/feat1D/arkit.py new file mode 100644 index 0000000..d02c23b --- /dev/null +++ b/preprocess/feat1D/arkit.py @@ -0,0 +1,102 @@ +import os.path as osp +import numpy as np +from common import load_utils +from util import arkit +from util.arkit import ARKITSCENE_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat1D.base import Base1DProcessor + +@PROCESSOR_REGISTRY.register() +class ARKitScenes1DProcessor(Base1DProcessor): + def __init__(self, config_data, config_1D, split) -> None: + super(ARKitScenes1DProcessor, self).__init__(config_data, config_1D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = arkit.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + # Object Referrals + self.object_referrals = load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + # label map + self.label_map = arkit.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + for _i, label_info in enumerate(annotations["data"]): + obj_label = label_info["label"] + object_id = _i + 1 + scannet_class=ARKITSCENE_SCANNET[obj_label] + nyu40id=self.label_map[scannet_class] + objects.append({ + "objectId": object_id, + "global_id": nyu40id + }) + + return objects + + def compute1DFeaturesEachScan(self, scan_id): + data1D = {} + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + npz_data = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz')) + objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'] + + scan_objects = self.load_objects_for_scan(scan_id) + + object_referral_embeddings, scene_referral_embeddings = {}, None + if 
len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) + + def computeObjectWise1DFeaturesEachScan(self, scan_id, scan_objects, objectID_to_labelID_map): + object_referral_embeddings = {} + + scan_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + + if instance_id not in objectID_to_labelID_map.keys(): + continue + + # Object Referral + object_referral = [referral['utterance'] for referral in scan_referrals if int(referral['target_id']) == instance_id] + if len(object_referral) != 0: + object_referral_feats = self.extractTextFeats(object_referral) + if object_referral_feats is not None: + object_referral_feats = np.mean(object_referral_feats, axis = 0).reshape(1, -1) + assert object_referral_feats.shape == (1, self.embed_dim) + + object_referral_embeddings[instance_id] = {'referral' : object_referral, 'feats' : object_referral_feats} + + return object_referral_embeddings \ No newline at end of file diff --git a/preprocess/feat1D/multiscan.py b/preprocess/feat1D/multiscan.py new file mode 100644 index 0000000..eb64243 --- /dev/null +++ b/preprocess/feat1D/multiscan.py @@ -0,0 +1,98 @@ +import os.path as osp +import numpy as np +from common import load_utils +from util import multiscan + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat1D.base import Base1DProcessor + +@PROCESSOR_REGISTRY.register() +class MultiScan1DProcessor(Base1DProcessor): + def __init__(self, config_data, config_1D, split) -> None: + super(MultiScan1DProcessor, self).__init__(config_data, config_1D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + # Object Referrals + self.object_referrals = load_utils.load_json(osp.join(files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + # label map + self.undefined = 0 + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scenes', scan_id, f"{scan_id}.annotations.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + objects = [] + + for obj in annotations["objects"]: + objects.append({ + "objectId": obj["objectId"], + "global_id": obj.get("label") + }) + + return objects + + + def compute1DFeaturesEachScan(self, scan_id): + data1D = {} + + scene_out_dir 
= osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + npz_data = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz')) + objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'] + + scan_objects = self.load_objects_for_scan(scan_id) + + object_referral_embeddings, scene_referral_embeddings = {}, None + if len(scan_objects) != 0: + object_referral_embeddings = self.computeObjectWise1DFeaturesEachScan(scan_id, scan_objects, objectID_to_labelID_map) + + scene_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + if len(scene_referrals) != 0: + if len(scene_referrals) > 10: + scene_referrals = np.random.choice(scene_referrals, size=10, replace=False) + + scene_referrals = [scene_referral['utterance'] for scene_referral in scene_referrals] + scene_referrals = ' '.join(scene_referrals) + scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) + assert scene_referral_embeddings is not None + + data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} + data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) + + def computeObjectWise1DFeaturesEachScan(self, scan_id, scan_objects, objectID_to_labelID_map): + object_referral_embeddings = {} + + scan_referrals = [referral for referral in self.object_referrals if referral['scan_id'] == scan_id] + + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + + if instance_id not in objectID_to_labelID_map.keys(): + continue + + # Object Referral + object_referral = [referral['utterance'] for referral in scan_referrals if int(referral['target_id']) == instance_id] + if len(object_referral) != 0: + object_referral_feats = self.extractTextFeats(object_referral) + if object_referral_feats is not None: + object_referral_feats = np.mean(object_referral_feats, axis = 0).reshape(1, -1) + assert object_referral_feats.shape == (1, self.embed_dim) + + object_referral_embeddings[instance_id] = {'referral' : object_referral, 'feats' : object_referral_feats} + + return object_referral_embeddings \ No newline at end of file diff --git a/preprocess/feat1D/scan3r.py b/preprocess/feat1D/scan3r.py index 65fb6e9..0c84043 100644 --- a/preprocess/feat1D/scan3r.py +++ b/preprocess/feat1D/scan3r.py @@ -1,5 +1,4 @@ import os.path as osp -import torch import numpy as np from common import load_utils from util import scan3r @@ -32,10 +31,12 @@ def __init__(self, config_data, config_1D, split) -> None: self.undefined = 0 def compute1DFeaturesEachScan(self, scan_id: str) -> None: + data1D = {} scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + npz_data = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz')) + objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'] scan_objects = [obj_data for obj_data in self.objects if obj_data['scan'] == scan_id][0]['objects'] object_referral_embeddings, scene_referral_embeddings = {}, None @@ -53,11 +54,12 @@ def compute1DFeaturesEachScan(self, scan_id: str) -> None: scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) assert scene_referral_embeddings is not None - data1D = {} data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} 
data1D['scene'] = {'referral_embedding': scene_referral_embeddings} + + # torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) - torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) def computeObjectWise1DFeaturesEachScan(self, scan_id: str, scan_objects: Dict, objectID_to_labelID_map: Dict[int, int]) -> Dict[int, Dict[str, Union[List[str], np.ndarray]]]: diff --git a/preprocess/feat1D/scannet.py b/preprocess/feat1D/scannet.py index e49b8e0..11a94cb 100644 --- a/preprocess/feat1D/scannet.py +++ b/preprocess/feat1D/scannet.py @@ -1,7 +1,5 @@ import os.path as osp -import torch import numpy as np - from common import load_utils from util import scannet from typing import Dict, List, Union @@ -34,10 +32,12 @@ def __init__(self, config_data, config_1D, split) -> None: self.undefined = 0 def compute1DFeaturesEachScan(self, scan_id: str) -> None: + data1D = {} scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - objectID_to_labelID_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + npz_data = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz')) + objectID_to_labelID_map = npz_data['obj_id_to_label_id_map'] objects = [objects['objects'] for objects in self.objects if objects['scan'] == scan_id] object_referral_embeddings, scene_referral_embeddings = {}, None @@ -55,11 +55,10 @@ def compute1DFeaturesEachScan(self, scan_id: str) -> None: scene_referral_embeddings = self.extractTextFeats([scene_referrals], return_text=True) assert scene_referral_embeddings is not None - data1D = {} data1D['objects'] = {'referral_embeddings' : object_referral_embeddings} data1D['scene'] = {'referral_embedding': scene_referral_embeddings} - - torch.save(data1D, osp.join(scene_out_dir, 'data1D.pt')) + + np.savez_compressed(osp.join(scene_out_dir, 'data1D.npz'), **data1D) def computeObjectWise1DFeaturesEachScan(self, scan_id: str, objects: Dict, objectID_to_labelID_map: Dict[int, int]) -> Dict[int, Dict[str, Union[List[str], np.ndarray]]]: diff --git a/preprocess/feat2D/__init__.py b/preprocess/feat2D/__init__.py index 9a1b744..9c7b829 100644 --- a/preprocess/feat2D/__init__.py +++ b/preprocess/feat2D/__init__.py @@ -1,2 +1,4 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * +from .multiscan import * diff --git a/preprocess/feat2D/arkit.py b/preprocess/feat2D/arkit.py new file mode 100644 index 0000000..a2d02a8 --- /dev/null +++ b/preprocess/feat2D/arkit.py @@ -0,0 +1,233 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +import shutil +from PIL import Image +from scipy.spatial.transform import Rotation as R +from omegaconf import DictConfig +from typing import List, Dict, Tuple +import pandas as pd +from common import load_utils +from util import render, arkit, visualisation +from util import image as image_util +import os + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat2D.base import Base2DProcessor + +@PROCESSOR_REGISTRY.register() +class ARKitScenes2DProcessor(Base2DProcessor): + """ARKitScenes 2D (RGB) feature processor class.""" + def __init__(self, config_data: DictConfig, config_2D: DictConfig, split: str) -> None: + super(ARKitScenes2DProcessor, self).__init__(config_data, config_2D, split) + self.data_dir = config_data.base_dir + files_dir = 
osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.split = split + self.scan_ids = arkit.get_scan_ids(files_dir, self.split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + + self.orig_image_size = config_2D.image.orig_size + self.model_image_size = config_2D.image.model_size + + self.frame_skip = config_data.skip_frames + self.top_k = config_2D.image.top_k + self.num_levels = config_2D.image.num_levels + self.undefined = 0 + self.metadata = pd.read_csv(osp.join(files_dir,'metadata.csv')) + + self.frame_pose_data = {} + for scan_id in self.scan_ids: + pose_data = arkit.load_poses(osp.join(self.data_dir, 'scans', scan_id),scan_id, skip=self.frame_skip) + self.frame_pose_data[scan_id] = pose_data + + def compute2DFeatures(self) -> None: + for scan_id in tqdm(self.scan_ids): + self.compute2DImagesAndSeg(scan_id) + self.compute2DFeaturesEachScan(scan_id) + + def compute2DImagesAndSeg(self, scan_id: str) -> None: + obj_id_imgs = {} + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + ply_data = arkit.load_ply_data(osp.join(self.data_dir,'scans'), scan_id, annotations) + instance_ids = ply_data['objectId'] + + mesh_file = osp.join(self.data_dir, 'scans', scan_id, f'{scan_id}_3dod_mesh.ply') + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + for frame_idx in self.frame_pose_data[scan_id].keys(): + camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + img_pose = self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map + + np.savez_compressed(osp.join(scene_out_dir,'gt-projection-seg.npz'),**obj_id_imgs) + + def compute2DFeaturesEachScan(self, scan_id: str) -> None: + data2D = {} + + scene_folder = osp.join(self.data_dir, 'scans', scan_id) + color_path = osp.join(scene_folder,f'{scan_id}_frames', 'lowres_wide') + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + obj_id_to_label_id_map = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'))['obj_id_to_label_id_map'] + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) + + # Visualise + for frame_idx in 
self.frame_pose_data[scan_id].keys(): + camera_info = arkit.load_intrinsics(osp.join(self.data_dir,'scans'),scan_id,frame_idx) + intrinsic_mat = camera_info['intrinsic_mat'] + break + + scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder, f'{scan_id}_3dod_mesh.ply')) + intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + # dummy floorplan + floorplan_dict = {'img' : None, 'embedding' : None} + data2D['scene']['floorplan'] = floorplan_dict + + np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) + + def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, frame_idxs: List[int]) -> Tuple[np.ndarray, List[torch.tensor], np.ndarray, List[int]]: + # Sample Camera Indexes Based on Rotation Matrix From Grid + pose_data = [] + for frame_idx in frame_idxs: + pose = self.frame_pose_data[scan_id][frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data = np.array(pose_data) + + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) + # sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + + # Extract Scene Image Features + scene_images_pt = [] + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + + image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + scene_images_pt.append(image_pt) + + scene_image_embeddings = self.extractFeatures(scene_images_pt, return_only_cls_mean= False) + + return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs + + def computeImageFeaturesAllObjectsEachScan(self, scene_folder: str, scene_out_dir: str, obj_id_to_label_id_map: dict) -> Tuple[Dict[int, Dict[int, np.ndarray]], Dict[int, List[int]], List[str]]: + object_anno_2D = np.load(osp.join(scene_out_dir, 'gt-projection-seg.npz'),allow_pickle=True) + object_image_votes = {} + scan_id=scene_folder.split('/')[-1] + # iterate over all frames + for frame_idx in object_anno_2D: + obj_2D_anno_frame = object_anno_2D[frame_idx] + # process 2D anno + obj_ids, counts = np.unique(obj_2D_anno_frame, return_counts=True) + for idx in range(len(obj_ids)): + obj_id = obj_ids[idx] + count = counts[idx] + if obj_id == self.undefined: + continue + + if obj_id not in object_image_votes: + object_image_votes[obj_id] = {} + if frame_idx not in object_image_votes[obj_id]: + object_image_votes[obj_id][frame_idx] = 0 + object_image_votes[obj_id][frame_idx] = count + + # select top K frames for each obj + object_image_votes_topK = {} + for obj_id in object_image_votes: + object_image_votes_topK[obj_id] = [] + obj_image_votes_f = object_image_votes[obj_id] 
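+            # Rank this object's candidate frames by how many pixels the object
+            # occupies in each frame, keeping at most self.top_k views below.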
+ sorted_frame_idxs = sorted(obj_image_votes_f, key=obj_image_votes_f.get, reverse=True) + if len(sorted_frame_idxs) > self.top_k: + object_image_votes_topK[obj_id] = sorted_frame_idxs[:self.top_k] + else: + object_image_votes_topK[obj_id] = sorted_frame_idxs + + object_ids_in_image_votes = list(object_image_votes_topK.keys()) + for obj_id in object_ids_in_image_votes: + if obj_id not in list(obj_id_to_label_id_map.keys()): + del object_image_votes_topK[obj_id] + + assert len(list(obj_id_to_label_id_map.keys())) >= len(list(object_image_votes_topK.keys())), 'Mapped < Found' + + object_image_embeddings = {} + for object_id in object_image_votes_topK: + object_image_votes_topK_frames = object_image_votes_topK[object_id] + object_image_embeddings[object_id] = {} + + for frame_idx in object_image_votes_topK_frames: + image_path = osp.join(scene_folder, f'{scan_id}_frames', 'lowres_wide', f'{scan_id}_{frame_idx}.png') + color_img = Image.open(image_path) + object_image_embeddings[object_id][frame_idx] = self.computeImageFeaturesEachObject(scan_id, color_img, object_id, object_anno_2D[frame_idx]) + + return object_image_embeddings, object_image_votes_topK, object_anno_2D.keys() + + def computeImageFeaturesEachObject(self, scan_id, image: Image.Image, object_id: int, object_anno_2d: np.ndarray) -> np.ndarray: + object_anno_2d = object_anno_2d.transpose(1, 0) + object_anno_2d = np.flip(object_anno_2d, 1) + + object_mask = object_anno_2d == object_id + + images_crops = [] + for level in range(self.num_levels): + mask_tensor = torch.from_numpy(object_mask).float() + x1, y1, x2, y2 = image_util.mask2box_multi_level(mask_tensor, level) + cropped_img = image.crop((x1, y1, x2, y2)) + cropped_img = cropped_img.resize((self.model_image_size[1], self.model_image_size[1]), Image.BICUBIC) + img_pt = self.model.base_tf(cropped_img) + images_crops.append(img_pt) + + if(len(images_crops) > 0): + mean_feats = self.extractFeatures(images_crops, return_only_cls_mean = True) + return mean_feats \ No newline at end of file diff --git a/preprocess/feat2D/multiscan.py b/preprocess/feat2D/multiscan.py new file mode 100644 index 0000000..b325a31 --- /dev/null +++ b/preprocess/feat2D/multiscan.py @@ -0,0 +1,231 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import torch +from tqdm import tqdm +from PIL import Image +from scipy.spatial.transform import Rotation as R +import os +from common import load_utils +from util import render, multiscan, visualisation +from util import image as image_util + +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat2D.base import Base2DProcessor + + +@PROCESSOR_REGISTRY.register() +class MultiScan2DProcessor(Base2DProcessor): + def __init__(self, config_data, config_2D, split) -> None: + super(MultiScan2DProcessor, self).__init__(config_data, config_2D, split) + self.data_dir = config_data.base_dir + files_dir = osp.join(config_data.base_dir, 'files') + self.split = split + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + + self.orig_image_size = config_2D.image.orig_size + self.model_image_size = config_2D.image.model_size + + self.frame_skip = config_data.skip_frames + self.top_k = config_2D.image.top_k + self.num_levels = config_2D.image.num_levels + self.undefined = 0 + + + # get frame_indexes + self.frame_pose_data = {} + for scan_id in self.scan_ids: + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + 
frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=self.frame_skip) + while(len(frame_idxs) > 500): + self.frame_skip += 2 + frame_idxs = multiscan.load_frame_idxs(scene_folder, skip=self.frame_skip) + + pose_data = multiscan.load_all_poses(scene_folder, frame_idxs) + self.frame_pose_data[scan_id] = pose_data + + + def compute2DFeatures(self): + for scan_id in tqdm(self.scan_ids): + self.compute2DImagesAndSeg(scan_id) + self.compute2DFeaturesEachScan(scan_id) + + def compute2DImagesAndSeg(self, scan_id): + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + obj_id_imgs = {} + + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + mesh_file = osp.join(scene_folder, '{}.ply'.format(scan_id)) + ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id) + instance_ids = ply_data['objectId'] + + mesh = o3d.io.read_triangle_mesh(mesh_file) + mesh_triangles = np.asarray(mesh.triangles) + colors = np.asarray(mesh.vertex_colors)*255.0 + colors = colors.round() + num_triangles = mesh_triangles.shape[0] + + scene = o3d.t.geometry.RaycastingScene() + scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) + + # project 3D model + for frame_idx in self.frame_pose_data[scan_id]: + camera_info = multiscan.load_intrinsics(scene_folder,scan_id,int(frame_idx)) + intrinsics = camera_info['intrinsic_mat'] + img_width = int(camera_info['width']) + img_height = int(camera_info['height']) + img_pose = self.frame_pose_data[scan_id][frame_idx] + img_pose_inv = np.linalg.inv(img_pose) + + obj_id_map = render.project_mesh3DTo2D_with_objectseg( + scene, intrinsics, img_pose_inv, img_width, img_height, + mesh_triangles, num_triangles, instance_ids + ) + obj_id_imgs[frame_idx] = obj_id_map + + np.savez_compressed(osp.join(scene_out_dir,'gt-projection-seg.npz'),**obj_id_imgs) + + def compute2DFeaturesEachScan(self, scan_id): + data2D = {} + + scene_folder = osp.join(self.data_dir, 'scenes', scan_id) + color_path = osp.join(scene_folder, 'sequence') + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + obj_id_to_label_id_map = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'))['obj_id_to_label_id_map'] + + # Multi-view Image -- Object (Embeddings) + object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) + + # Multi-view Image -- Scene (Images + Embeddings) + frame_idxs = list(self.frame_pose_data[scan_id].keys()) + pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs = self.computeSelectedImageFeaturesEachScan(scan_id, color_path, frame_idxs) + + # Visualise + camera_info = multiscan.load_meta_intrinsics(scene_folder,scan_id) + intrinsic_mat = camera_info['intrinsic_mat'] + + scene_mesh = o3d.io.read_triangle_mesh(osp.join(scene_folder,'{}.ply'.format(scan_id))) + intrinsics = { 'f' : intrinsic_mat[0, 0], 'cx' : intrinsic_mat[0, 2], 'cy' : intrinsic_mat[1, 2], + 'w' : int(camera_info['width']), 'h' : int(camera_info['height'])} + + cams_visualised_on_mesh = visualisation.visualise_camera_on_mesh(scene_mesh, pose_data[sampled_frame_idxs], intrinsics, stride=1) + image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') + Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) + + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} + data2D['scene'] = 
{'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, + 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} + + # dummy floorplan + floorplan_dict = {'img' : None, 'embedding' : None} + data2D['scene']['floorplan'] = floorplan_dict + + np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) + + def computeSelectedImageFeaturesEachScan(self, scan_id, color_path, frame_idxs): + # Sample Camera Indexes Based on Rotation Matrix From Grid + pose_data = [] + for frame_idx in frame_idxs: + pose = self.frame_pose_data[scan_id][frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data = np.array(pose_data) + + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data) + + # Extract Scene Image Features + scene_images_pt = [] + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + + image = Image.open(osp.join(color_path, f'frame-{frame_index}.color.jpg')) + image = image.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) + image_pt = self.model.base_tf(image) + scene_images_pt.append(image_pt) + + scene_image_embeddings = self.extractFeatures(scene_images_pt, return_only_cls_mean= False) + + return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs + + def computeImageFeaturesAllObjectsEachScan(self, scene_folder, scene_out_dir, obj_id_to_label_id_map): + # object_anno_2D = torch.load(osp.join(scene_out_dir, 'gt-projection-seg.pt')) + object_anno_2D = np.load(osp.join(scene_out_dir, 'gt-projection-seg.npz'),allow_pickle=True) + + object_image_votes = {} + + # iterate over all frames + for frame_idx in object_anno_2D: + obj_2D_anno_frame = object_anno_2D[frame_idx] + # process 2D anno + obj_ids, counts = np.unique(obj_2D_anno_frame, return_counts=True) + for idx in range(len(obj_ids)): + obj_id = obj_ids[idx] + count = counts[idx] + if obj_id == self.undefined: + continue + + if obj_id not in object_image_votes: + object_image_votes[obj_id] = {} + if frame_idx not in object_image_votes[obj_id]: + object_image_votes[obj_id][frame_idx] = 0 + object_image_votes[obj_id][frame_idx] = count + + # select top K frames for each obj + object_image_votes_topK = {} + for obj_id in object_image_votes: + object_image_votes_topK[obj_id] = [] + obj_image_votes_f = object_image_votes[obj_id] + sorted_frame_idxs = sorted(obj_image_votes_f, key=obj_image_votes_f.get, reverse=True) + if len(sorted_frame_idxs) > self.top_k: + object_image_votes_topK[obj_id] = sorted_frame_idxs[:self.top_k] + else: + object_image_votes_topK[obj_id] = sorted_frame_idxs + + object_ids_in_image_votes = list(object_image_votes_topK.keys()) + for obj_id in object_ids_in_image_votes: + if obj_id not in list(obj_id_to_label_id_map.keys()): + del object_image_votes_topK[obj_id] + + assert len(list(obj_id_to_label_id_map.keys())) >= len(list(object_image_votes_topK.keys())), 'Mapped < Found' + + object_image_embeddings = {} + for object_id in object_image_votes_topK: + object_image_votes_topK_frames = object_image_votes_topK[object_id] + object_image_embeddings[object_id] = {} + + for frame_idx in object_image_votes_topK_frames: + image_path = osp.join(scene_folder, 'sequence', f'frame-{frame_idx}.color.jpg') + color_img = Image.open(image_path) + object_image_embeddings[object_id][frame_idx] = self.computeImageFeaturesEachObject(color_img, object_id, object_anno_2D[frame_idx]) + + return 
object_image_embeddings, object_image_votes_topK, object_anno_2D.keys() + + def computeImageFeaturesEachObject(self, image, object_id, object_anno_2d): + # load image + object_mask = object_anno_2d == object_id + + images_crops = [] + for level in range(self.num_levels): + mask_tensor = torch.from_numpy(object_mask).float() + x1, y1, x2, y2 = image_util.mask2box_multi_level(mask_tensor, level) + cropped_img = image.crop((x1, y1, x2, y2)) + cropped_img = cropped_img.resize((self.model_image_size[1], self.model_image_size[1]), Image.BICUBIC) + img_pt = self.model.base_tf(cropped_img) + images_crops.append(img_pt) + # images_crops.append(cropped_img) + + + if(len(images_crops) > 0): + mean_feats = self.extractFeatures(images_crops, return_only_cls_mean = True) + return mean_feats \ No newline at end of file diff --git a/preprocess/feat2D/scan3r.py b/preprocess/feat2D/scan3r.py index 4927c97..5b1d307 100644 --- a/preprocess/feat2D/scan3r.py +++ b/preprocess/feat2D/scan3r.py @@ -7,7 +7,7 @@ from scipy.spatial.transform import Rotation as R from omegaconf import DictConfig from typing import List, Dict, Tuple - +import os from common import load_utils from util import render, scan3r, visualisation from util import image as image_util @@ -58,9 +58,13 @@ def compute2DImagesAndSeg(self, scan_id: str) -> None: scene_folder = osp.join(self.data_dir, 'scans', scan_id) mesh_file = osp.join(scene_folder, self.label_filename.replace('.align', '')) - ply_data = scan3r.load_ply_data(self.data_dir, scene_folder, self.label_filename) + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + obj_id_imgs = {} + ply_data = scan3r.load_ply_data(self.data_dir, scan_id, self.label_filename) instance_ids = ply_data['objectId'] - + camera_info = scan3r.load_intrinsics(scene_folder) intrinsics = camera_info['intrinsic_mat'] img_width = int(camera_info['width']) @@ -74,9 +78,8 @@ def compute2DImagesAndSeg(self, scan_id: str) -> None: scene = o3d.t.geometry.RaycastingScene() scene.add_triangles(o3d.t.geometry.TriangleMesh.from_legacy(mesh)) - + # project 3D model - obj_id_imgs = {} for frame_idx in self.frame_pose_data[scan_id]: img_pose = self.frame_pose_data[scan_id][frame_idx] img_pose_inv = np.linalg.inv(img_pose) @@ -87,21 +90,17 @@ def compute2DImagesAndSeg(self, scan_id: str) -> None: ) obj_id_imgs[frame_idx] = obj_id_map - - # save scene-level file for efficient loading - scene_out_dir = osp.join(self.out_dir, scan_id) - load_utils.ensure_dir(scene_out_dir) - - torch.save(obj_id_imgs, osp.join(scene_out_dir, 'gt-projection-seg.pt')) + np.savez_compressed(osp.join(scene_out_dir,'gt-projection-seg.npz'),**obj_id_imgs) def compute2DFeaturesEachScan(self, scan_id: str) -> None: + data2D = {} scene_folder = osp.join(self.data_dir, 'scans', scan_id) color_path = osp.join(scene_folder, 'sequence') scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - obj_id_to_label_id_map = torch.load(osp.join(scene_out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + obj_id_to_label_id_map = load_utils.load_npz_as_dict(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'))['obj_id_to_label_id_map'] # Multi-view Image -- Object (Embeddings) object_image_embeddings, object_image_votes_topK, frame_idxs = self.computeImageFeaturesAllObjectsEachScan(scene_folder, scene_out_dir, obj_id_to_label_id_map) @@ -122,7 +121,7 @@ def compute2DFeaturesEachScan(self, scan_id: str) -> None: image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') 
Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) - data2D = {} + data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} @@ -131,7 +130,7 @@ def compute2DFeaturesEachScan(self, scan_id: str) -> None: floorplan_dict = {'img' : None, 'embedding' : None} data2D['scene']['floorplan'] = floorplan_dict - torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, frame_idxs: List[int]) -> Tuple[np.ndarray, List[torch.tensor], np.ndarray, List[int]]: # Sample Camera Indexes Based on Rotation Matrix From Grid @@ -162,7 +161,7 @@ def computeSelectedImageFeaturesEachScan(self, scan_id: str, color_path: str, fr return pose_data, scene_images_pt, scene_image_embeddings, sampled_frame_idxs def computeImageFeaturesAllObjectsEachScan(self, scene_folder: str, scene_out_dir: str, obj_id_to_label_id_map: dict) -> Tuple[Dict[int, Dict[int, np.ndarray]], Dict[int, List[int]], List[str]]: - object_anno_2D = torch.load(osp.join(scene_out_dir, 'gt-projection-seg.pt')) + object_anno_2D = np.load(osp.join(scene_out_dir, 'gt-projection-seg.npz'), allow_pickle=True) object_image_votes = {} # iterate over all frames diff --git a/preprocess/feat2D/scannet.py b/preprocess/feat2D/scannet.py index 8c59354..ec3d29d 100644 --- a/preprocess/feat2D/scannet.py +++ b/preprocess/feat2D/scannet.py @@ -3,7 +3,7 @@ import numpy as np import torch from tqdm import tqdm - +import os import imageio import skimage.transform as sktf from PIL import Image @@ -81,12 +81,14 @@ def renderShapeAndFloorplan(self, scene_folder: str, scene_out_folder: str, scan return render_img def compute2DFeaturesEachScan(self, scan_id: str) -> None: + data2D = {} frame_idxs = list(self.frame_pose_data[scan_id].keys()) scene_folder = osp.join(self.data_dir, 'scans', scan_id) scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) + # Floor-plan rendering render_img = self.renderShapeAndFloorplan(scene_folder, scene_out_dir, scan_id) floorplan_embeddings = None @@ -95,7 +97,6 @@ def compute2DFeaturesEachScan(self, scan_id: str) -> None: render_img = render_img.resize((self.model_image_size[1], self.model_image_size[0]), Image.BICUBIC) render_img_pt = self.model.base_tf(render_img) floorplan_embeddings = self.extractFeatures([render_img_pt], return_only_cls_mean = False) - floorplan_dict = {'img' : render_img, 'embedding' : floorplan_embeddings} # Multi-view Image -- Object (Embeddings) @@ -117,13 +118,12 @@ def compute2DFeaturesEachScan(self, scan_id: str) -> None: image_path = osp.join(scene_out_dir, 'sel_cams_on_mesh.png') Image.fromarray((cams_visualised_on_mesh * 255).astype(np.uint8)).save(image_path) - data2D = {} data2D['objects'] = {'image_embeddings': object_image_embeddings, 'topK_images_votes' : object_image_votes_topK} data2D['scene'] = {'scene_embeddings': scene_image_embeddings, 'images' : scene_images_pt, 'frame_idxs' : frame_idxs, 'sampled_cam_idxs' : sampled_frame_idxs} data2D['scene']['floorplan'] = floorplan_dict - torch.save(data2D, osp.join(scene_out_dir, 'data2D.pt')) + np.savez_compressed(osp.join(scene_out_dir, 'data2D.npz'), **data2D) def computeImageFeaturesEachScan(self, scan_id: str, color_path: str, frame_idxs: 
List[int]) -> Tuple[np.ndarray, List[torch.tensor], np.ndarray, List[int]]: # Sample Camera Indexes Based on Rotation Matrix From Grid diff --git a/preprocess/feat3D/__init__.py b/preprocess/feat3D/__init__.py index 9a1b744..9c7b829 100644 --- a/preprocess/feat3D/__init__.py +++ b/preprocess/feat3D/__init__.py @@ -1,2 +1,4 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * +from .multiscan import * diff --git a/preprocess/feat3D/arkit.py b/preprocess/feat3D/arkit.py new file mode 100644 index 0000000..74f66c2 --- /dev/null +++ b/preprocess/feat3D/arkit.py @@ -0,0 +1,93 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import os +from common import load_utils +from util import arkit +from util.arkit import ARKITSCENE_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat3D.base import Base3DProcessor + +@PROCESSOR_REGISTRY.register() +class ARKitScenes3DProcessor(Base3DProcessor): + def __init__(self, config_data, config_3D, split) -> None: + super(ARKitScenes3DProcessor, self).__init__(config_data, config_3D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = arkit.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + self.label_map = arkit.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + for _i, label_info in enumerate(annotations["data"]): + obj_label = label_info["label"] + object_id = _i + 1 + scannet_class=ARKITSCENE_SCANNET[obj_label] + nyu40id=self.label_map[scannet_class] + objects.append({ + "objectId": object_id, + "global_id": nyu40id + }) + + return objects + + def compute3DFeaturesEachScan(self, scan_id): + objects_path = osp.join(self.data_dir, 'scans', scan_id, f"{scan_id}_3dod_annotation.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + ply_data = arkit.load_ply_data(osp.join(self.data_dir, 'scans'), scan_id, annotations) + mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) + + mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scans', scan_id,'{}_3dod_mesh.ply'.format(scan_id))) + mesh_colors = np.asarray(mesh.vertex_colors)*255.0 + mesh_colors = mesh_colors.round() + + scan_objects=self.load_objects_for_scan(scan_id) + + object_pcl_embeddings, object_cad_embeddings = {}, {} + object_id_to_label_id = {} + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + global_object_id = scan_object['global_id'] + + object_pcl = mesh_points[np.where(ply_data['objectId'] == instance_id)] + + if object_pcl.shape[0] <= self.config_3D.min_points_per_object: + continue + + assert instance_id not in object_id_to_label_id + object_id_to_label_id[instance_id] = global_object_id + + if object_pcl.shape[0] >= self.config_3D.min_points_per_object: + 
object_pcl_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_pcl) + + data3D = {} + data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} + data3D['scene'] = {'pcl_coords': mesh_points[ply_data['objectId'] != self.undefined], 'pcl_feats': mesh_colors[ply_data['objectId'] != self.undefined], 'scene_label' : None} + + object_id_to_label_id_map = { 'obj_id_to_label_id_map' : object_id_to_label_id} + + assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) \ No newline at end of file diff --git a/preprocess/feat3D/multiscan.py b/preprocess/feat3D/multiscan.py new file mode 100644 index 0000000..e2c047e --- /dev/null +++ b/preprocess/feat3D/multiscan.py @@ -0,0 +1,89 @@ +import os.path as osp +import open3d as o3d +import numpy as np +import os +from common import load_utils +from util import multiscan +from util.multiscan import MULTISCAN_SCANNET +from preprocess.build import PROCESSOR_REGISTRY +from preprocess.feat3D.base import Base3DProcessor + +@PROCESSOR_REGISTRY.register() +class MultiScan3DProcessor(Base3DProcessor): + def __init__(self, config_data, config_3D, split) -> None: + super(MultiScan3DProcessor, self).__init__(config_data, config_3D, split) + self.data_dir = config_data.base_dir + + files_dir = osp.join(config_data.base_dir, 'files') + + self.scan_ids = [] + self.scan_ids = multiscan.get_scan_ids(files_dir, split) + + self.out_dir = osp.join(config_data.process_dir, 'scans') + load_utils.ensure_dir(self.out_dir) + self.label_map = multiscan.read_label_map(files_dir, label_from = 'raw_category', label_to = 'nyu40id') + + self.undefined = 0 + + + def load_objects_for_scan(self, scan_id): + """Load and parse the annotations JSON for the given scan ID.""" + objects_path = osp.join(self.data_dir, 'scenes', scan_id, f"{scan_id}.annotations.json") + if not osp.exists(objects_path): + raise FileNotFoundError(f"Annotations file not found for scan ID: {scan_id}") + + annotations = load_utils.load_json(objects_path) + + objects = [] + + for obj in annotations["objects"]: + object_id=obj["objectId"] + objectName=obj["label"].split('.')[0] + scannet_class=MULTISCAN_SCANNET[objectName] + nyu40id=self.label_map[scannet_class] + objects.append({ + "objectId": object_id, + "global_id": nyu40id + }) + + return objects + + def compute3DFeaturesEachScan(self, scan_id): + ply_data = multiscan.load_ply_data(osp.join(self.data_dir, 'scenes'), scan_id) + mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) + + mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scenes', scan_id,'{}.ply'.format(scan_id))) + mesh_colors = np.asarray(mesh.vertex_colors)*255.0 + mesh_colors = mesh_colors.round() + + scan_objects=self.load_objects_for_scan(scan_id) + + object_pcl_embeddings, object_cad_embeddings = {}, {} + object_id_to_label_id = {} + for idx, scan_object in enumerate(scan_objects): + instance_id = int(scan_object['objectId']) + global_object_id = scan_object['global_id'] + + object_pcl = mesh_points[np.where(ply_data['objectId'] == instance_id)] + + if object_pcl.shape[0] <= self.config_3D.min_points_per_object: + continue + + assert instance_id 
not in object_id_to_label_id + object_id_to_label_id[instance_id] = global_object_id + + if object_pcl.shape[0] >= self.config_3D.min_points_per_object: + object_pcl_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_pcl) + + data3D = {} + data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} + data3D['scene'] = {'pcl_coords': mesh_points[ply_data['objectId'] != self.undefined], 'pcl_feats': mesh_colors[ply_data['objectId'] != self.undefined], 'scene_label' : None} + + object_id_to_label_id_map = { 'obj_id_to_label_id_map' : object_id_to_label_id} + + assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) + scene_out_dir = osp.join(self.out_dir, scan_id) + load_utils.ensure_dir(scene_out_dir) + + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) diff --git a/preprocess/feat3D/scan3r.py b/preprocess/feat3D/scan3r.py index 7b949ca..b015609 100644 --- a/preprocess/feat3D/scan3r.py +++ b/preprocess/feat3D/scan3r.py @@ -1,13 +1,12 @@ +import os import os.path as osp import open3d as o3d import numpy as np import torch -from tqdm import tqdm from omegaconf import DictConfig -from typing import Any, Dict from common import load_utils -from util import point_cloud, scan3r +from util import scan3r from preprocess.build import PROCESSOR_REGISTRY from preprocess.feat3D.base import Base3DProcessor @@ -43,12 +42,17 @@ def __init__(self, config_data: DictConfig, config_3D: DictConfig, split: str) - self.feature_extractor = self.loadFeatureExtractor(config_3D, "3D") def compute3DFeaturesEachScan(self, scan_id: str) -> None: + """ + Computes 3D features for a single scan. 
+ """ ply_data = scan3r.load_ply_data(osp.join(self.data_dir, 'scans'), scan_id, self.label_filename) mesh_points = np.stack([ply_data['x'], ply_data['y'], ply_data['z']]).transpose((1, 0)) - mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scans', scan_id, self.label_filename)) - mesh_colors = np.asarray(mesh.vertex_colors)*255.0 - mesh_colors = mesh_colors.round() + # mesh = o3d.io.read_triangle_mesh(osp.join(self.data_dir, 'scans', scan_id, self.label_filename)) + # mesh_colors = np.asarray(mesh.vertex_colors)*255.0 + # mesh_colors = mesh_colors.round() + mesh_colors = np.stack([ply_data['red'], ply_data['green'], ply_data['blue']]).transpose((1, 0)) + scan_objects = [obj_data for obj_data in self.objects if obj_data['scan'] == scan_id][0]['objects'] @@ -79,5 +83,5 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: scene_out_dir = osp.join(self.out_dir, scan_id) load_utils.ensure_dir(scene_out_dir) - torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) - torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) \ No newline at end of file + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) \ No newline at end of file diff --git a/preprocess/feat3D/scannet.py b/preprocess/feat3D/scannet.py index e530195..e4380b9 100644 --- a/preprocess/feat3D/scannet.py +++ b/preprocess/feat3D/scannet.py @@ -1,5 +1,5 @@ import os.path as osp - +import os import numpy as np import torch from omegaconf import DictConfig @@ -64,7 +64,7 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: mesh_points = mesh_vertices[:, 0:3] mesh_colors = mesh_vertices[:, 3:] - text_file = mesh_file.replace('_vh_clean_2.labels.ply' , '.txt') + text_file = mesh_file.replace('_vh_clean_2.ply' , '.txt') with open(text_file, 'r') as file: for line in file: if line.startswith('sceneType'): @@ -79,10 +79,7 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: if len(shape_annot) > 0: shape_annot = shape_annot[0] shape_annot_to_instance_map = scannet.get_cad_model_to_instance_mapping(instance_bboxes, shape_annot, meta_file, self.shape_dir) - - render_out_dir = osp.join(scene_out_dir, 'render') - load_utils.ensure_dir(render_out_dir) - + for instance_id in unique_instance_ids: if instance_id == self.undefined: continue @@ -98,11 +95,7 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: shape_annot_instance = shape_annot_to_instance_map[instance_id] object_cad_pcl = shape_annot_instance['points'] object_cad_embeddings[instance_id] = self.normalizeObjectPCLAndExtractFeats(object_cad_pcl) - - obj_verts, obj_faces, transform_shape = shape_annot_instance['verts'], shape_annot_instance['faces'], shape_annot_instance['transform_shape'] - # load_utils.ensure_dir(osp.join(render_out_dir, f'{instance_id}')) - # render.render_multiview_images(obj_verts, obj_faces, transform_shape, osp.join(render_out_dir, f'{instance_id}')) - + data3D = {} data3D['objects'] = {'pcl_embeddings' : object_pcl_embeddings, 'cad_embeddings': object_cad_embeddings} data3D['scene'] = {'pcl_coords': mesh_points[instance_ids != self.undefined], 'pcl_feats': mesh_colors[instance_ids != self.undefined], 'scene_label' : scene_label} @@ -112,7 +105,5 @@ def compute3DFeaturesEachScan(self, scan_id: str) -> None: assert len(list(object_id_to_label_id.keys())) >= len(list(object_pcl_embeddings.keys())), 'PC does not match for {}'.format(scan_id) assert 
len(list(object_id_to_label_id.keys())) >= len(list(object_cad_embeddings.keys())), 'CAD does not match for {}'.format(scan_id) - - - torch.save(data3D, osp.join(scene_out_dir, 'data3D.pt')) - torch.save(object_id_to_label_id_map, osp.join(scene_out_dir, 'object_id_to_label_id_map.pt')) \ No newline at end of file + np.savez_compressed(osp.join(scene_out_dir, 'data3D.npz'), **data3D) + np.savez_compressed(osp.join(scene_out_dir, 'object_id_to_label_id_map.npz'), **object_id_to_label_id_map) \ No newline at end of file diff --git a/preprocess/multimodal_preprocess.py b/preprocess/multimodal_preprocess.py index 822135d..17772f0 100644 --- a/preprocess/multimodal_preprocess.py +++ b/preprocess/multimodal_preprocess.py @@ -2,13 +2,12 @@ import numpy as np from functools import reduce from operator import add -import torch from tqdm import tqdm from omegaconf import DictConfig import h5py from common import load_utils from common.constants import ModalityType -from util import scan3r, scannet +from util import scan3r, scannet, arkit, multiscan from typing import Dict, Optional from preprocess.build import PROCESSOR_REGISTRY @@ -33,6 +32,10 @@ def __init__(self, config_data: DictConfig, modality_config: DictConfig, split: self.scan_ids = scannet.get_scan_ids(self.files_dir, self.split) elif self.dataset_name == 'Scan3R': self.scan_ids = scan3r.get_scan_ids(self.files_dir, self.split) + elif self.dataset_name == 'ARKitScenes': + self.scan_ids = arkit.get_scan_ids(self.files_dir, self.split) + elif self.dataset_name == 'MultiScan': + self.scan_ids = multiscan.get_scan_ids(self.files_dir, self.split) else: raise NotImplementedError @@ -71,7 +74,8 @@ def prepareObjectWiseDataEachScan(self, data2D: Optional[Dict] = None, data3D: Optional[Dict] = None) -> Dict: """Process object-wise data for a single scan combining features from all modalities.""" - object_id_to_label_id_map = torch.load(osp.join(out_dir, 'object_id_to_label_id_map.pt'))['obj_id_to_label_id_map'] + object_id_to_label_id_map = load_utils.load_npz_as_dict(osp.join(out_dir, 'object_id_to_label_id_map.npz'))['obj_id_to_label_id_map'] + map_object_ids = list(object_id_to_label_id_map.keys()) precomputed_feats, inputs = {}, {} @@ -139,17 +143,16 @@ def prepareObjectWiseDataEachScan(self, 'object_ids' : object_ids, 'topK_images_votes' : data2D['objects']['topK_images_votes'] } - - torch.save(objects_data_pt, osp.join(out_dir, 'objectsDataMultimodal.pt')) + np.savez_compressed(osp.join(out_dir, 'objectsDataMultimodal.npz'), **objects_data_pt) return objects_data_pt def prepareDataEachScan(self, scan_id: str, hf_handler: h5py.File) -> None: """Process data for a single scan and store it in the HDF5 file.""" out_dir = osp.join(self.out_dir, scan_id) - data1D = torch.load(osp.join(out_dir, 'data1D.pt')) - data2D = torch.load(osp.join(out_dir, 'data2D.pt')) - data3D = torch.load(osp.join(out_dir, 'data3D.pt')) + data1D = load_utils.load_npz_as_dict(osp.join(out_dir, 'data1D.npz')) + data2D = load_utils.load_npz_as_dict(osp.join(out_dir, 'data2D.npz')) + data3D = load_utils.load_npz_as_dict(osp.join(out_dir, 'data3D.npz')) objects_data_pt = self.prepareObjectWiseDataEachScan(out_dir, data1D, data2D, data3D) self.dumpEachObjectDataPerScan(scan_id, objects_data_pt, hf_handler) diff --git a/retrieval/object_retrieval.py b/retrieval/object_retrieval.py index 54c144f..526e5a2 100644 --- a/retrieval/object_retrieval.py +++ b/retrieval/object_retrieval.py @@ -293,6 +293,6 @@ def run(self) -> None: # Object Retrieval Evaluation self.eval(output_dict) - 
self.logger.info('Scene Retrieval Evaluation (Instance Baseline)...') + self.logger.info('Scene Retrieval Evaluation (Instance CrossOver)...') # Scene Retrieval Evaluation self.scene_eval(output_dict) \ No newline at end of file diff --git a/scripts/evaluation/eval_object_retrieval.sh b/scripts/evaluation/eval_object_retrieval.sh index 23f84f1..b6c37ea 100644 --- a/scripts/evaluation/eval_object_retrieval.sh +++ b/scripts/evaluation/eval_object_retrieval.sh @@ -3,17 +3,17 @@ export PYTHONWARNINGS="ignore" # Change val according to the dataset you want to evaluate on # Instance Baseline -python run_evaluation.py --config-path "$(pwd)/configs/evaluation" \ ---config-name eval_instance.yaml \ -task.InferenceObjectRetrieval.val=['Scannet'] \ -task.InferenceObjectRetrieval.ckpt_path=/drive/dumps/multimodal-spaces/runs/release_runs/instance_baseline_scannet+scan3r.pth \ -model.name=ObjectLevelEncoder \ -hydra.run.dir=. hydra.output_subdir=null +# python run_evaluation.py --config-path "$(pwd)/configs/evaluation" \ +# --config-name eval_instance.yaml \ +# task.InferenceObjectRetrieval.val=['Scannet'] \ +# task.InferenceObjectRetrieval.ckpt_path=/drive/dumps/multimodal-spaces/runs/release_runs/instance_baseline_scannet+scan3r.pth \ +# model.name=ObjectLevelEncoder \ +# hydra.run.dir=. hydra.output_subdir=null # Instance CrossOver python run_evaluation.py --config-path "$(pwd)/configs/evaluation" \ --config-name eval_instance.yaml \ -task.InferenceObjectRetrieval.val=['Scan3R'] \ -task.InferenceObjectRetrieval.ckpt_path=/drive/dumps/multimodal-spaces/runs/release_runs/instance_crossover_scannet+scan3r.pth \ +task.InferenceObjectRetrieval.val=['ARKitScenes'] \ +task.InferenceObjectRetrieval.ckpt_path=/drive/dumps/multimodal-spaces/runs/new_runs/instance_crossover_scannet+scan3r+multiscan+arkitscenes.pth \ model.name=SceneLevelEncoder \ hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file diff --git a/scripts/evaluation/eval_scene_retrieval.sh b/scripts/evaluation/eval_scene_retrieval.sh index 1752e84..1a1f397 100644 --- a/scripts/evaluation/eval_scene_retrieval.sh +++ b/scripts/evaluation/eval_scene_retrieval.sh @@ -2,6 +2,6 @@ export PYTHONWARNINGS="ignore" # Scene Retrieval Inference python run_evaluation.py --config-path "$(pwd)/configs/evaluation" --config-name eval_scene.yaml \ -task.InferenceSceneRetrieval.val=['Scan3R'] \ -task.InferenceSceneRetrieval.ckpt_path=/drive/dumps/multimodal-spaces/runs/release_runs/scene_crossover_scannet+scan3r.pth \ +task.InferenceSceneRetrieval.val=['ARKitScenes'] \ +task.InferenceSceneRetrieval.ckpt_path=/drive/dumps/multimodal-spaces/runs/UnifiedTrain_Scannet+Scan3R+MultiScan+ARKitScenes/2025-07-03-07:39:02.553100/ckpt/best.pth \ hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file diff --git a/scripts/preprocess/process_arkit.sh b/scripts/preprocess/process_arkit.sh new file mode 100644 index 0000000..466f751 --- /dev/null +++ b/scripts/preprocess/process_arkit.sh @@ -0,0 +1,9 @@ +export PYTHONWARNINGS="ignore" + +# Preprocessing Object Level + Scene Level + Unified Data +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null +# python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. 
hydra.output_subdir=null +# python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null + +# Multi-modal dumping +python3 preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_multimodal.yaml data.sources=['ARKitScenes'] hydra.run.dir=. hydra.output_subdir=null diff --git a/scripts/preprocess/process_multiscan.sh b/scripts/preprocess/process_multiscan.sh new file mode 100644 index 0000000..a13a93c --- /dev/null +++ b/scripts/preprocess/process_multiscan.sh @@ -0,0 +1,9 @@ +export PYTHONWARNINGS="ignore" + +# Preprocessing Object Level + Scene Level + Unified Data +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_3d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_2d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_1d.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null + +# Multi-modal dumping +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_multimodal.yaml data.sources=['MultiScan'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file diff --git a/scripts/preprocess/process_scan3r.sh b/scripts/preprocess/process_scan3r.sh index 6d8a981..0adfbae 100644 --- a/scripts/preprocess/process_scan3r.sh +++ b/scripts/preprocess/process_scan3r.sh @@ -1,9 +1,8 @@ export PYTHONWARNINGS="ignore" # Preprocessing Object Level + Scene Level + Unified Data -# python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null +python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_3d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null # python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_2d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null # python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_1d.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null - # Multi-modal dumping python preprocessor.py --config-path "$(pwd)/configs/preprocess/" --config-name process_multimodal.yaml data.sources=['Scan3R'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file diff --git a/scripts/preprocess/process_scannet.sh b/scripts/preprocess/process_scannet.sh index 68a2366..ec86441 100644 --- a/scripts/preprocess/process_scannet.sh +++ b/scripts/preprocess/process_scannet.sh @@ -1,9 +1,8 @@ export PYTHONWARNINGS="ignore" # Preprocessing Object Level + Scene Level + Unified Data -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_3d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_2d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_1d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null - +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_3d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null +# python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_2d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null +# python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_1d.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null # Multi-modal dumping -python preprocessor.py --config-path /home/sayan/Documents/code/multimodal-reality/CrossOver/configs/preprocess/ --config-name process_multimodal.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file +python preprocessor.py --config-path "$(pwd)/configs/preprocess" --config-name process_multimodal.yaml data.sources=['Scannet'] hydra.run.dir=. hydra.output_subdir=null \ No newline at end of file
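The preprocessing stages these scripts invoke now dump each scan as compressed `.npz` archives (`data1D.npz`, `data2D.npz`, `data3D.npz`, `gt-projection-seg.npz`, `object_id_to_label_id_map.npz`), which downstream code reads back through `load_utils.load_npz_as_dict`. That helper is not part of this patch; the following is only a minimal sketch, assuming the archives are written with `np.savez_compressed(path, **data_dict)` as in the processors above, of how such a loader can recover the nested Python dicts that numpy stores as 0-d object arrays:

```python
# Hypothetical sketch of a load_npz_as_dict helper (not the repo's actual code).
# np.savez_compressed wraps non-array values (e.g. nested dicts) in 0-d object
# arrays, so each entry is unwrapped with .item() before being returned.
import numpy as np

def load_npz_as_dict(path):
    out = {}
    with np.load(path, allow_pickle=True) as npz:
        for key in npz.files:
            value = npz[key]
            if value.dtype == object and value.ndim == 0:
                value = value.item()  # recover the original Python object
            out[key] = value
    return out
```

The `gt-projection-seg.npz` dumps, by contrast, hold one plain array per frame index, so the processors above open them directly with `np.load(..., allow_pickle=True)`.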
diff --git a/single_inference/datasets/__init__.py b/single_inference/datasets/__init__.py index 9a1b744..d7126ea 100644 --- a/single_inference/datasets/__init__.py +++ b/single_inference/datasets/__init__.py @@ -1,2 +1,4 @@ from .scannet import * -from .scan3r import * \ No newline at end of file +from .scan3r import * +from .arkit import * +from .multiscan import * diff --git a/single_inference/datasets/arkit.py b/single_inference/datasets/arkit.py new file mode 100644 index 0000000..6434bde --- /dev/null +++ b/single_inference/datasets/arkit.py @@ -0,0 +1,126 @@ +import os.path as osp +import numpy as np +from torch.utils.data import Dataset +import MinkowskiEngine as ME +from PIL import Image +from scipy.spatial.transform import Rotation as R +from torchvision import transforms as tvf +import torch +import open3d as o3d +import pandas as pd +from common import load_utils +from util import arkit +from util import image as image_util + +class ARKitScenesInferDataset(Dataset): + def __init__(self, data_dir,voxel_size=0.02, frame_skip=5, image_size=[224, 224]) -> None: + self.voxel_size = voxel_size + self.frame_skip = frame_skip + self.image_size = image_size + + self.scans_dir = osp.join(data_dir, 'scans') + self.files_dir = osp.join(data_dir, 'files') + self.referrals = load_utils.load_json(osp.join(self.files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + self.scan_ids = [] + for split in ['train', 'val']: + filepath = osp.join(self.files_dir, '{}_scans.txt'.format(split)) + self.scan_ids.extend(np.genfromtxt(filepath, dtype = str)) + + self.base_tf = tvf.Compose([ + tvf.ToTensor(), + tvf.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + self.metadata = pd.read_csv(osp.join(self.files_dir,'metadata.csv')) + + + def extract_images(self, scan_id, color_path): + pose_data = arkit.load_poses(self.scans_dir, scan_id, skip=self.frame_skip) + frame_idxs = list(pose_data.keys()) + + pose_data_arr = [] + for frame_idx in frame_idxs: + pose = pose_data[frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data_arr.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data_arr = np.array(pose_data_arr) + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data_arr) + sky_direction=self.metadata[self.metadata['video_id']==int(scan_id)]['sky_direction'].values[0] + + image_data = None + for idx in sampled_frame_idxs: + frame_index = 
frame_idxs[idx] + image = Image.open(osp.join(color_path, f'{scan_id}_{frame_index}.png')) + if sky_direction=='Left': + image = image.transpose(Image.ROTATE_270) + elif sky_direction=='Right': + image = image.transpose(Image.ROTATE_90) + image = image.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + image_pt = self.base_tf(image).unsqueeze(0) + image_data = image_pt if image_data is None else torch.cat((image_data, image_pt), dim=0) + + return image_data.unsqueeze(0) + + def __getitem__(self, index): + if isinstance(index, int): + scan_id = self.scan_ids[index] + + if isinstance(index, str): + scan_id = index + + scan_folder = osp.join(self.scans_dir, scan_id) + data_dict = {} + data_dict['masks'] = {} + + # Point Cloud + mesh = o3d.io.read_triangle_mesh(osp.join(scan_folder, '{}_3dod_mesh.ply'.format(scan_id))) + points = np.asarray(mesh.vertices) + feats = np.asarray(mesh.vertex_colors)*255.0 + feats = feats.round() + + feats /= 255. + feats -= 0.5 + + _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True) + coords, feats = points[sel], feats[sel] + coords = np.floor(coords / self.voxel_size) + coords-= coords.min(0) + + coords, feats = ME.utils.sparse_collate([coords], [feats]) + data_dict['masks']['point'] = True + + # RGB + color_path = osp.join(scan_folder, f'{scan_id}_frames','lowres_wide') + image_data = self.extract_images(scan_id, color_path) + data_dict['masks']['rgb'] = True + + # Floorplan (dummy) + floorplan_img = np.zeros((self.image_size[0], self.image_size[1], 3), dtype=np.uint8) + floorplan_img = Image.fromarray(floorplan_img) + data_dict['masks']['floorplan'] = False + + floorplan_img = floorplan_img.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + floorplan_data = self.base_tf(floorplan_img).unsqueeze(0) + + # Referral + referrals = [referral for referral in self.referrals if referral['scan_id'] == scan_id] + if len(referrals) != 0: + if len(referrals) > 10: + referrals = np.random.choice(referrals, size=10, replace=False) + referrals = [referral['utterance'] for referral in referrals] + referrals = [' '.join(referrals)] + data_dict['masks']['referral'] = True + else: + referrals = [''] + data_dict['masks']['referral'] = False + + data_dict['coordinates'] = coords + data_dict['features'] = feats + data_dict['rgb'] = image_data + data_dict['floorplan'] = floorplan_data + data_dict['referral'] = referrals + + return data_dict \ No newline at end of file diff --git a/single_inference/datasets/multiscan.py b/single_inference/datasets/multiscan.py new file mode 100644 index 0000000..06538e6 --- /dev/null +++ b/single_inference/datasets/multiscan.py @@ -0,0 +1,120 @@ +import os.path as osp +import numpy as np +from torch.utils.data import Dataset +import MinkowskiEngine as ME +from PIL import Image +from scipy.spatial.transform import Rotation as R +from torchvision import transforms as tvf +import torch +import open3d as o3d + +from common import load_utils +from util import multiscan +from util import image as image_util + +class MultiScanInferDataset(Dataset): + def __init__(self, data_dir, voxel_size=0.02, frame_skip=1, image_size=[224, 224]) -> None: + self.voxel_size = voxel_size + self.frame_skip = frame_skip + self.image_size = image_size + + self.scans_dir = osp.join(data_dir, 'scenes') + self.files_dir = osp.join(data_dir, 'files') + self.referrals = load_utils.load_json(osp.join(self.files_dir, 'sceneverse/ssg_ref_rel2_template.json')) + + self.scan_ids = [] + for split in ['train', 'val']: + filepath = 
osp.join(self.files_dir, '{}_scans.txt'.format(split)) + self.scan_ids.extend(np.genfromtxt(filepath, dtype = str)) + + self.base_tf = tvf.Compose([ + tvf.ToTensor(), + tvf.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + + def extract_images(self, scan_id, color_path): + frame_idxs = multiscan.load_frame_idxs(osp.join(self.scans_dir, scan_id)) + pose_data = multiscan.load_all_poses(osp.join(self.scans_dir, scan_id), frame_idxs) + frame_idxs = list(pose_data.keys()) + + pose_data_arr = [] + for frame_idx in frame_idxs: + pose = pose_data[frame_idx] + rot_quat = R.from_matrix(pose[:3, :3]).as_quat() + trans = pose[:3, 3] + pose_data_arr.append([trans[0], trans[1], trans[2], rot_quat[0], rot_quat[1], rot_quat[2], rot_quat[3]]) + + pose_data_arr = np.array(pose_data_arr) + sampled_frame_idxs = image_util.sample_camera_pos_on_grid(pose_data_arr) + + image_data = None + for idx in sampled_frame_idxs: + frame_index = frame_idxs[idx] + image = Image.open(osp.join(color_path, f'frame-{frame_index}.color.jpg')) + image = image.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + image_pt = self.base_tf(image).unsqueeze(0) + image_data = image_pt if image_data is None else torch.cat((image_data, image_pt), dim=0) + + return image_data.unsqueeze(0) + + def __getitem__(self, index): + if isinstance(index, int): + scan_id = self.scan_ids[index] + + if isinstance(index, str): + scan_id = index + + scan_folder = osp.join(self.scans_dir, scan_id) + data_dict = {} + data_dict['masks'] = {} + + # Point Cloud + mesh = o3d.io.read_triangle_mesh(osp.join(scan_folder, f'{scan_id}.ply')) + points = np.asarray(mesh.vertices) + feats = np.asarray(mesh.vertex_colors)*255.0 + feats = feats.round() + + feats /= 255. + feats -= 0.5 + + _, sel = ME.utils.sparse_quantize(points / self.voxel_size, return_index=True) + coords, feats = points[sel], feats[sel] + coords = np.floor(coords / self.voxel_size) + coords-= coords.min(0) + + coords, feats = ME.utils.sparse_collate([coords], [feats]) + data_dict['masks']['point'] = True + + # RGB + color_path = osp.join(scan_folder, 'sequence') + image_data = self.extract_images(scan_id, color_path) + data_dict['masks']['rgb'] = True + + # Floorplan (dummy) + floorplan_img = np.zeros((self.image_size[0], self.image_size[1], 3), dtype=np.uint8) + floorplan_img = Image.fromarray(floorplan_img) + data_dict['masks']['floorplan'] = False + + floorplan_img = floorplan_img.resize((self.image_size[1], self.image_size[0]), Image.BICUBIC) + floorplan_data = self.base_tf(floorplan_img).unsqueeze(0) + + # Referral + referrals = [referral for referral in self.referrals if referral['scan_id'] == scan_id] + if len(referrals) != 0: + if len(referrals) > 10: + referrals = np.random.choice(referrals, size=10, replace=False) + referrals = [referral['utterance'] for referral in referrals] + referrals = [' '.join(referrals)] + data_dict['masks']['referral'] = True + else: + referrals = [''] + data_dict['masks']['referral'] = False + + data_dict['coordinates'] = coords + data_dict['features'] = feats + data_dict['rgb'] = image_data + data_dict['floorplan'] = floorplan_data + data_dict['referral'] = referrals + + return data_dict \ No newline at end of file diff --git a/single_inference/scene_inference.py b/single_inference/scene_inference.py index 9846dd5..1d13b5e 100644 --- a/single_inference/scene_inference.py +++ b/single_inference/scene_inference.py @@ -26,6 +26,10 @@ def run_inference(args, scan_id=None): dataset = datasets.ScannetInferDataset(args.data_dir, 
args.floorplan_dir) elif args.dataset == 'Scan3R': dataset = datasets.Scan3RInferDataset(args.data_dir) + elif args.dataset == 'ARKitScenes': + dataset = datasets.ARKitScenesInferDataset(args.data_dir) + elif args.dataset == 'MultiScan': + dataset = datasets.MultiScanInferDataset(args.data_dir) else: raise NotImplementedError('Dataset not implemented') diff --git a/trainer/unified_trainer.py b/trainer/unified_trainer.py index 9b48516..d43f795 100644 --- a/trainer/unified_trainer.py +++ b/trainer/unified_trainer.py @@ -16,14 +16,23 @@ def __init__(self, cfg: DictConfig) -> None: super().__init__(cfg) self.task_config = rgetattr(cfg.task, cfg.task.name) - object_enc_ckpt = self.task_config.object_enc_ckpt + # ckpt = '/drive/dumps/multimodal-spaces/runs/new_runs/scene_crossover_scannet+scan3r_scratch.pth' + # self.logger.info(f"Loading Initial Weights from {ckpt}") + + # # Load model weights from safetensors files + # ckpt = osp.join(ckpt, 'model.safetensors') + # weights = load_file(ckpt, device = str(self.accelerator.device)) + # self.model.load_state_dict(weights) + # self.logger.info(f"Successfully loaded initial weights from {ckpt}") + + object_enc_ckpt = self.task_config.object_enc_ckpt self.logger.info(f"Loading Object Wise Modality Encoder from {str(object_enc_ckpt)}") # Load model weights from safetensors files object_enc_ckpt = osp.join(object_enc_ckpt, 'model.safetensors') object_enc_ckpt = load_file(object_enc_ckpt, device = str(self.accelerator.device)) self.model.objectwise_modality_encoder.load_state_dict(object_enc_ckpt) - self.logger.info(f"Successfully loaded from {self.task_config.object_enc_ckpt}") + self.logger.info(f"Successfully loaded Object Wise Modality Encoder from {self.task_config.object_enc_ckpt}") def train_step(self, epoch: int) -> None: self.model.train() diff --git a/util/arkit.py b/util/arkit.py new file mode 100644 index 0000000..3eb332a --- /dev/null +++ b/util/arkit.py @@ -0,0 +1,331 @@ +import os.path as osp +import numpy as np +from plyfile import PlyData +from glob import glob +import csv +import jsonlines +import json +import os +import trimesh +import pandas as pd +import cv2 + +ARKITSCENE_SCANNET= { +'bed': 'bed', +'cabinet': 'cabinet', +'refrigerator': 'refrigerator', +'table': 'table', +'chair': 'chair', +'sink': 'sink', +'stove': 'stove', +'oven': 'oven', +'washer': 'washing machine', +'shelf': 'shelf', +'tv_monitor': 'tv', +'bathtub': 'bathtub', +'toilet': 'toilet', +'sofa': 'sofa', +'stool': 'stool', +'fireplace': 'fireplace', +'build_in_cabinet': 'cabinet', +'dishwasher': 'dishwasher', +'stairs': 'stairs' +} + +def get_scan_ids(dirname, split): + filepath = osp.join(dirname, '{}_scans.txt'.format(split)) + scan_ids = np.genfromtxt(filepath, dtype = str) + return scan_ids + +def load_frame_idxs(scan_dir, skip=None): + frames_paths = glob(osp.join(scan_dir, f"{scan_dir.split('/')[-1]}_frames", 'lowres_wide', '*.png')) + frame_names = [osp.basename(frame_path) for frame_path in frames_paths] + frame_idxs = [frame_name.split('.png')[0].split("_")[1] for frame_name in frame_names] + frame_idxs.sort() + + if skip is not None: + frame_idxs = frame_idxs[::skip] + + return frame_idxs + +def TrajStringToMatrix(traj_str): + """ convert traj_str into translation and rotation matrices + Args: + traj_str: A space-delimited file where each line represents a camera position at a particular timestamp. 
+ The file has seven columns: + * Column 1: timestamp + * Columns 2-4: rotation (axis-angle representation in radians) + * Columns 5-7: translation (usually in meters) + + Returns: + ts: translation matrix + Rt: rotation matrix + """ + # line=[float(x) for x in traj_str.split()] + # ts = line[0]; + # R = cv2.Rodrigues(np.array(line[1:4]))[0]; + # t = np.array(line[4:7]); + # Rt = np.concatenate((np.concatenate((R, t[:,np.newaxis]), axis=1), [[0.0,0.0,0.0,1.0]]), axis=0) + tokens = traj_str.split() + assert len(tokens) == 7 + ts = tokens[0] + # Rotation in angle axis + angle_axis = [float(tokens[1]), float(tokens[2]), float(tokens[3])] + r_w_to_p = convert_angle_axis_to_matrix3(np.asarray(angle_axis)) + # Translation + t_w_to_p = np.asarray([float(tokens[4]), float(tokens[5]), float(tokens[6])]) + extrinsics = np.eye(4, 4) + extrinsics[:3, :3] = r_w_to_p + extrinsics[:3, -1] = t_w_to_p + Rt = np.linalg.inv(extrinsics) + return Rt + +def convert_angle_axis_to_matrix3(angle_axis): + """Return a Matrix3 for the angle axis. + Arguments: + angle_axis {Point3} -- a rotation in angle axis form. + """ + matrix, jacobian = cv2.Rodrigues(angle_axis) + return matrix + +def load_poses(scan_dir, scan_id, skip=None): + frame_poses = {} + frame_idxs = load_frame_idxs(scan_dir, skip=skip) + traj_file = osp.join(scan_dir, f'{scan_id}_frames', 'lowres_wide.traj') + with open(traj_file) as f: + traj = f.readlines() + for i,line in enumerate(traj): + ts=line.split(" ")[0] + rounded_ts = round(float(ts), 3) + formatted_ts = f"{rounded_ts:.3f}" + if formatted_ts not in frame_idxs: + if f"{rounded_ts - 0.001:.3f}" in frame_idxs: + frame_poses[f"{rounded_ts - 0.001:.3f}"] = TrajStringToMatrix(line) + elif f"{rounded_ts + 0.001:.3f}" in frame_idxs: + frame_poses[f"{rounded_ts + 0.001:.3f}"] = TrajStringToMatrix(line) + else: + print("no matching pose for frame", formatted_ts) + continue + # if f"{round(float(ts), 3):.3f}" not in frame_idxs: + # if f"{round(float(ts), 3)-0.001 :.3f}" in frame_idxs: + # frame_poses[f"{round(float(ts), 3)-0.001:.3f}"] = TrajStringToMatrix(line) + # elif f"{round(float(ts), 3)+0.001 :.3f}" in frame_idxs: + # frame_poses[f"{round(float(ts), 3)+0.001:.3f}"] = TrajStringToMatrix(line) + # else: + # continue + else: + frame_poses[f"{round(float(ts), 3):.3f}"] = TrajStringToMatrix(line) + # data = pd.read_csv(osp.join(scan_dir,f'{scan_id}_frames','lowres_wide.traj'), delim_whitespace=True, header=None) + # for frame_idx,(index, row) in zip(frame_idxs,data.iterrows()): + # if skip is not None and index % skip != 0: + # continue + # rotation_axis = row[1:4].values + # rotation_angle = np.linalg.norm(rotation_axis) + # if rotation_angle != 0: + # rotation_axis = rotation_axis / rotation_angle + # translation = row[4:7].values + # # Convert axis-angle to rotation matrix + # # rotation_matrix = axis_angle_to_rotation_matrix(rotation_axis, rotation_angle) + # rotation_matrix= + # # Construct the 4x4 homogeneous transformation matrix + # homogenous_matrix = np.eye(4) + # homogenous_matrix[:3, :3] = rotation_matrix + # homogenous_matrix[:3, 3] = translation + # frame_poses[frame_idx] = homogenous_matrix + + return frame_poses + +def axis_angle_to_rotation_matrix(axis, angle): + # Normalize the rotation axis + axis = axis / np.linalg.norm(axis) + x, y, z = axis + c = np.cos(angle) + s = np.sin(angle) + t = 1 - c + + # Compute the rotation matrix using the axis-angle formula + rotation_matrix = np.array([ + [t*x*x + c, t*x*y - s*z, t*x*z + s*y], + [t*x*y + s*z, t*y*y + c, t*y*z - s*x], + [t*x*z - 
s*y, t*y*z + s*x, t*z*z + c] + ]) + + return rotation_matrix + +def load_intrinsics(data_dir, scan_id, frame_id): + ''' + Load ARKit intrinsic information + ''' + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{frame_id}.pincam') + if not os.path.exists(pincam_path): + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{float(frame_id)-0.001:.3f}.pincam') + if not os.path.exists(pincam_path): + pincam_path = osp.join(data_dir, scan_id, f'{scan_id}_frames', 'lowres_wide_intrinsics', f'{scan_id}_{float(frame_id)+0.001:.3f}.pincam') + + + intrinsics = {} + + # Read the .pincam file + with open(pincam_path, "r") as f: + line = f.readline().strip() + + # Parse the intrinsic parameters + width, height, focal_length_x, focal_length_y, principal_point_x, principal_point_y = map(float, line.split()) + + # Store the width and height + intrinsics['width'] = width + intrinsics['height'] = height + + # Construct the intrinsic matrix + intrinsic_mat = np.array([ + [focal_length_x, 0, principal_point_x], + [0, focal_length_y, principal_point_y], + [0, 0, 1] + ]) + intrinsics['intrinsic_mat'] = intrinsic_mat + + return intrinsics + +def read_label_map(metadata_dir, label_from='raw_category', label_to='nyu40id'): + LABEL_MAP_FILE = osp.join(metadata_dir, 'scannetv2-labels.combined.tsv') + assert osp.exists(LABEL_MAP_FILE) + + raw_label_map = read_label_mapping(LABEL_MAP_FILE, label_from=label_from, label_to=label_to) + return raw_label_map + +def read_label_mapping(filename, label_from='raw_category', label_to='nyu40id'): + assert osp.isfile(filename) + mapping = dict() + with open(filename) as csvfile: + reader = csv.DictReader(csvfile, delimiter='\t') + for row in reader: + mapping[row[label_from]] = row[label_to] + + if represents_int(list(mapping.keys())[0]): + mapping = {int(k):v for k,v in mapping.items()} + + return mapping + +def represents_int(s): + ''' if string s represents an int. 
''' + try: + int(s) + return True + except ValueError: + return False + +def load_ply_data(data_dir, scan_id, annotations): + filename_in = osp.join(data_dir, scan_id, f'{scan_id}_3dod_mesh.ply') + file = open(filename_in, 'rb') + plydata = PlyData.read(file) + file.close() + vertices = plydata['vertex']['x'], plydata['vertex']['y'], plydata['vertex']['z'] + vertices = np.vstack(vertices).T + + vertex_colors = plydata['vertex']['red'], plydata['vertex']['green'], plydata['vertex']['blue'] + vertex_colors = np.vstack(vertex_colors).T + vertex_dtype = [('x', 'f4'), ('y', 'f4'), ('z', 'f4'), + ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'), + ('objectId', 'h')] + vertices_structured = np.empty(vertices.shape[0], dtype=vertex_dtype) + + # Assign x, y, z, and color values to the structured array + vertices_structured['red'] = vertex_colors[:, 0] + vertices_structured['green'] = vertex_colors[:, 1] + vertices_structured['blue'] = vertex_colors[:, 2] + + vertex_instance = np.zeros(vertices.shape[0], dtype='h') # Use 'h' for signed 16-bit integer + bbox_list=[] + for _i, label_info in enumerate(annotations["data"]): + object_id = _i + 1 + rotation = np.array(label_info["segments"]["obbAligned"]["normalizedAxes"]).reshape(3, 3) + + transform = np.array(label_info["segments"]["obbAligned"]["centroid"]).reshape(-1, 3) + scale = np.array(label_info["segments"]["obbAligned"]["axesLengths"]).reshape(-1, 3) + + trns = np.eye(4) + trns[0:3, 3] = transform + trns[0:3, 0:3] = rotation.T + + box_trimesh_fmt = trimesh.creation.box(scale.reshape(3,), trns) + obj_containment = np.argwhere(box_trimesh_fmt.contains(vertices)) + + vertex_instance[obj_containment] = object_id + box3d = compute_box_3d(scale.reshape(3).tolist(), transform, rotation) + bbox_list.append(box3d) + + vertices_structured['objectId'] = vertex_instance + if np.max(vertex_colors) <= 1: + vertex_colors = vertex_colors * 255.0 + + + vertices_structured['x'] = plydata['vertex']['x'] + vertices_structured['y'] = plydata['vertex']['y'] + vertices_structured['z'] = plydata['vertex']['z'] + + return vertices_structured + +def compute_box_3d(size, center, rotmat): + """Compute corners of a single box from rotation matrix + Args: + size: list of float [dx, dy, dz] + center: np.array [x, y, z] + rotmat: np.array (3, 3) + Returns: + corners: (8, 3) + """ + l, h, w = [i / 2 for i in size] + center = np.reshape(center, (-1, 3)) + center = center.reshape(3) + x_corners = [l, l, -l, -l, l, l, -l, -l] + y_corners = [h, -h, -h, h, h, -h, -h, h] + z_corners = [w, w, w, w, -w, -w, -w, -w] + corners_3d = np.dot( + np.transpose(rotmat), np.vstack([x_corners, y_corners, z_corners]) + ) + corners_3d[0, :] += center[0] + corners_3d[1, :] += center[1] + corners_3d[2, :] += center[2] + return np.transpose(corners_3d) + +def rotate_z_axis_by_degrees(pointcloud, theta, clockwise=True): + theta = np.deg2rad(theta) + cos_t = np.cos(theta) + sin_t = np.sin(theta) + rot_matrix = np.array([[cos_t, -sin_t, 0], + [sin_t, cos_t, 0], + [0, 0, 1]], pointcloud.dtype) + if not clockwise: + rot_matrix = rot_matrix.T + return pointcloud.dot(rot_matrix) + +def calc_align_matrix(bbox_list): + RANGE = [-45, 45] + NUM_BIN = 90 + angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN) + angle_counts = {} + for _a in angles: + bucket = round(_a, 3) + for box in bbox_list: + box_r = rotate_z_axis_by_degrees(box, bucket) + bottom = box_r[4:] + if is_axis_aligned(bottom): + angle_counts[bucket] = angle_counts.get(bucket, 0) + 1 + if len(angle_counts) == 0: + RANGE = [-90, 90] + NUM_BIN = 180 + 
+        angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN)
+        for _a in angles:
+            bucket = round(_a, 3)
+            for box in bbox_list:
+                box_r = rotate_z_axis_by_degrees(box, bucket)
+                bottom = box_r[4:]
+                if is_axis_aligned(bottom, thres=0.15):
+                    angle_counts[bucket] = angle_counts.get(bucket, 0) + 1
+    most_common_angle = max(angle_counts, key=angle_counts.get)
+    return most_common_angle
+
+def is_axis_aligned(rotated_box, thres=0.05):
+    x_diff = abs(rotated_box[0][0] - rotated_box[1][0])
+    y_diff = abs(rotated_box[0][1] - rotated_box[3][1])
+    return x_diff < thres and y_diff < thres
diff --git a/util/multiscan.py b/util/multiscan.py
new file mode 100644
index 0000000..5ce872e
--- /dev/null
+++ b/util/multiscan.py
@@ -0,0 +1,698 @@
+import os.path as osp
+import numpy as np
+from plyfile import PlyData
+from glob import glob
+import csv
+import jsonlines
+import json
+import os
+import pandas as pd
+
+MULTISCAN_SCANNET = {
+    "wall": "wall",
+    "door": "door",
+    "slippers": "shoe",
+    "mop": "broom",
+    "rug": "rug",
+    "floor": "floor",
+    "basin": "sink",
+    "basin_stand": "sink",
+    "bucket": "bucket",
+    "shower": "shower",
+    "water_tank": "container",
+    "beam": "wood beam",
+    "pillar": "pillar",
+    "ceiling": "ceiling",
+    "sink": "sink",
+    "toilet": "toilet",
+    "cabinet": "cabinet",
+    "remove": "object",
+    "towel": "towel",
+    "pillow": "pillow",
+    "sofa": "sofa",
+    "footstool": "footstool",
+    "picture": "picture",
+    "window": "window",
+    "heater": "heater",
+    "mirror": "mirror",
+    "pipe": "pipe",
+    "scarf": "cloth",
+    "ceiling_light": "ceiling light",
+    "chair": "chair",
+    "table": "table",
+    "vent": "vent",
+    "bag": "bag",
+    "wall_cabinet": "cabinet",
+    "range": "stove",
+    "ricemaker": "rice cooker",
+    "pan": "cooking pan",
+    "coffee_machine": "coffee maker",
+    "rice_bag": "bag",
+    "light": "light",
+    "trashbin": "trash bin",
+    "kettle": "kettle",
+    "refrigerator": "refrigerator",
+    "microwave": "microwave",
+    "light_switch": "light switch",
+    "rice_cooker": "rice cooker",
+    "box": "box",
+    "shoe": "shoe",
+    "range_hood": "range hood",
+    "wok": "cooking pan",
+    "router": "object",
+    "paper_towel": "paper towel roll",
+    "stock_pot": "pot",
+    "cutting_board": "cutting board",
+    "wall_calendar": "calendar",
+    "baseboard": "object",
+    "coke_box": "box",
+    "printer": "printer",
+    "bowl": "bowl",
+    "backpack": "backpack",
+    "baseboard_heater": "heater",
+    "broom": "broom",
+    "dust_pan": "dustpan",
+    "trash_bin": "trash bin",
+    "rigid_duct": "vent",
+    "electric_range": "stove",
+    "spatula": "object",
+    "faucet": "faucet",
+    "bottle": "bottle",
+    "countertop": "counter",
+    "railing": "railing",
+    "suitcase": "suitcase",
+    "trash": "trash can",
+    "pot": "pot",
+    "kitchen_tool": "object",
+    "vegetable": "object",
+    "board": "board",
+    "washing_machine": "washing machine",
+    "jar": "jar",
+    "object": "object",
+    "notebook": "book",
+    "induction_cooker": "stove",
+    "instant_pot_lid": "cooking pot",
+    "oven": "oven",
+    "air_fryer": "object",
+    "lid": "pot",
+    "sponge": "sponge",
+    "blender": "object",
+    "spoon": "object",
+    "dishwasher": "dishwasher",
+    "detergent": "laundry detergent",
+    "watermelon": "bananas",
+    "yard_waste_bag": "garbage bag",
+    "container": "container",
+    "newspapers": "paper",
+    "rag": "cloth",
+    "ladder": "ladder",
+    "gate": "door",
+    "napkin_box": "tissue box",
+    "jacket": "jacket",
+    "windowsill": "windowsill",
+    "water_faucet": "faucet",
+    "steel_ball": "ball",
+    "rice_maker": "rice cooker",
+    "watter_bottle": "water bottle",
+    "plastic_bag": "bag",
+    "paper_bag": "paper bag",
"cuttting_board": "cutting board", + "trash_bin_lid": "trash bin", + "hair_dryer": "hair dryer", + "electric_socket": "power outlet", + "electric_panel": "electric panel", + "wash_stand": "sink", + "soap": "soap", + "curtain": "curtain", + "bathtub": "bathtub", + "smoke_detector": "smoke detector", + "roll_paper": "paper towel roll", + "chandelier": "chandelier", + "hand_sanitizer": "hand sanitzer dispenser", + "plate": "plate", + "sticker": "sticker", + "power_socket": "power outlet", + "stacked_cups": "stack of cups", + "stacked_chairs": "stack of chairs", + "air_vent": "vent", + "cornice": "cabinet", + "wine_cabinet": "kitchen cabinet", + "crock": "bowl", + "liquor_box": "cabinet", + "shampoo": "shampoo", + "shower_curtain": "shower curtain", + "wall_light": "wall lamp", + "sink_cabinet": "sink", + "toilet_roll": "toilet paper", + "shelf": "shelf", + "paper_bin": "recycling bin", + "toilet_brush": "toilet brush", + "shower_head": "shower head", + "tv": "tv", + "remote_control": "remote", + "tv_box": "tv stand", + "nightstand": "nightstand", + "bed": "bed", + "quilt": "blanket", + "telephone": "telephone", + "monitor": "monitor", + "desk": "desk", + "radiator_shell": "radiator", + "calendar": "calendar", + "clock": "clock", + "keyboard": "keyboard", + "speaker": "speaker", + "clothes": "clothes", + "door_frame": "doorframe", + "sliding_door": "sliding door", + "ceiling_lamp": "ceiling lamp", + "scale": "scale", + "power_strip": "power strip", + "switch": "light switch", + "basket": "basket", + "stool": "stool", + "shoes": "shoe", + "slipper": "slippers", + "bifold_door": "door", + "rangehood": "range hood", + "books": "books", + "toilet_paper": "toilet paper", + "mouse_pad": "mouse", + "ipad": "ipad", + "scissor": "knife block", + "radiator": "radiator", + "pc": "computer tower", + "bicycle": "bicycle", + "wardrobe": "wardrobe", + "mouse": "mouse", + "advertising_board": "poster", + "banner": "banner", + "ceiling_decoration": "ceiling light", + "whiteboard": "whiteboard", + "wall_storage_set": "shelf", + "traffic_cone": "traffic cone", + "wall_decoration": "decoration", + "papers": "papers", + "hat": "hat", + "velvet_hangers": "clothes hanger", + "circular_plate": "plate", + "cellphone": "telephone", + "pen": "keyboard piano", + "paper": "paper", + "lamp": "lamp", + "curtain_box": "curtains", + "woodcarving": "wood", + "scissors": "knife block", + "hand_dryer": "hand dryer", + "machine": "machine", + "vase": "vase", + "plant": "plant", + "power_socket_case": "power outlet", + "gloves": "clothes", + "dishcloth": "cloth", + "painting": "painting", + "shower_wall": "shower wall", + "showerhead": "shower head", + "tooth_mug": "cup", + "map": "map", + "knot_artwork": "decoration", + "fan": "fan", + "sphygmomanometer": "scale", + "electric_kettle": "kettle", + "bread_maker": "oven", + "knife_set": "knife block", + "soup_pot": "cooking pot", + "flatware_set": "cutting board", + "candle": "candle", + "lid_rack": "dish rack", + "flower": "flowerpot", + "can": "can", + "scoop": "bowl", + "laptop": "laptop", + "glass": "glass doors", + "wet_floor_sign": "wet floor sign", + "shower_enclosure": "shower doors", + "jewelry_box": "jewelry box", + "bath_brush": "hair brush", + "sofa_cushion": "couch cushions", + "tv_cabinet": "tv stand", + "wood_fence": "wood beam", + "floor_lamp": "lamp", + "computer_case": "computer tower", + "waste_container": "trash bin", + "roadblock": "barricade", + "trash_can_lids": "trash can", + "hand_sanitizer_stand": "soap dispenser", + "air_conditioner": "conditioner bottle", 
+ "pattern": "rug", + "remote_controller": "remote", + "phone": "telephone", + "speakers": "speaker", + "table_divider": "divider", + "table_card": "card", + "paper_trimmer": "paper cutter", + "stapler": "stapler", + "cup": "cup", + "bathroom_heater": "heater", + "wall_shelf": "shelf", + "towel_rack": "towel", + "sink_drain": "sink", + "floor_drain": "floor", + "broom_head": "broom", + "door_curtain": "curtain", + "refill_pouch": "plastic container", + "bin": "bin", + "stall_wall": "bathroom stall door", + "wall_speaker": "speaker", + "laundry_basket": "laundry basket", + "tissue_box": "tissue box", + "document_holder": "file cabinet", + "yoga_mat": "yoga mat", + "gas_range": "stove", + "chopping_board": "cutting board", + "book_scanner": "scanner", + "payment_terminal": "vending machine", + "napkin_roll": "paper towel roll", + "faucet_switch": "faucet", + "glass_door": "glass doors", + "carpet": "carpet", + "shower_floor": "shower floor", + "toilet_plunger": "plunger", + "plug_panel": "power outlet", + "stand": "stand", + "potted_plant": "potted plant", + "poster": "poster", + "isolation_board": "divider", + "soap_holder": "soap dish", + "plug": "power outlet", + "brush": "hair brush", + "threshold": "doorframe", + "air_conditioner_controller": "remote", + "iron": "iron", + "ironing_board": "ironing board", + "safe": "suitcase", + "gas_cooker": "stove", + "pressure_cooker": "cooking pot", + "steamer_pot": "pot", + "soy_sauce_bottle": "bottle", + "dishwashing_liquid": "dishwashing soap bottle", + "water_ladle": "bowl", + "power_socket_set": "power strip", + "kitchen_tool_holder": "kitchen cabinet", + "case": "case", + "wall_paper": "wall", + "comb": "hair brush", + "paper_cutter": "paper cutter", + "pencil_sharpener": "pen holder", + "sealing_machine": "machine", + "poster_board": "poster", + "shredder": "shredder", + "footstep": "stair", + "planter": "plant", + "floor_light": "lamp", + "paper_cup": "cup", + "divider": "divider", + "hanger": "clothes hanger", + "glove": "clothing", + "blanket": "blanket", + "remote": "remote", + "cloth": "cloth", + "clutter": "object", + "extinguisher": "fire extinguisher", + "dryer": "clothes dryer", + "soap_bottle": "soap bottle", + "fabric_softener_box": "box", + "dryer_sheet_box": "box", + "detergent_bottle": "laundry detergent", + "toaster": "toaster", + "stacked_bowls": "bowl", + "pot_lid": "pot", + "electric_pressure_cooker": "rice cooker", + "bread": "food display", + "bagels": "object", + "oranges": "bananas", + "card_reader": "card", + "whiteboard_detergent": "soap dispenser", + "power_outlet": "power outlet", + "bouquet": "vase", + "water_bottle": "water bottle", + "wall_mounted_telephone": "telephone", + "fridge": "refrigerator", + "toy": "toy dinosaur", + "shoe_box": "box", + "hole_puncher": "paper cutter", + "landline_telephone": "telephone", + "base": "stand", + "handkerchief": "cloth", + "cornice_molding": "frame", + "bathtub_base": "bathtub", + "bidet": "toilet", + "pedestal_urinal": "urinal", + "pedestal_urinal_covered": "urinal", + "pit_toilet": "toilet", + "low_wall": "wall", + "rail": "rail", + "bottles": "bottles", + "floor_otherroom": "floor", + "wall_otherroom": "wall", + "canopy": "canopy", + "cable_manager": "cable", + "sneakers": "shoes", + "purse": "purse", + "cushion": "cushion", + "napkin": "towel", + "plush_toy": "stuffed animal", + "adjustable_desk": "desk", + "tableware": "plates", + "computer_desk": "desk", + "cat_kennel": "cat litter box", + "back_cushion": "pillow", + "ukulele_bag": "guitar case", + "litter_box": "trash 
can", + "storage_box": "storage bin", + "toy_doll": "doll", + "drawer_unit": "drawer", + "doll": "stuffed animal", + "laptop_bag": "messenger bag", + "clothing_rack": "clothing rack", + "bookshelf": "bookshelves", + "mask": "cloth", + "watch": "clock", + "book": "books", + "ashtray": "tray", + "car_key": "car", + "wallet": "purse", + "tea_pot": "tea kettle", + "wire": "cable", + "rake": "broom", + "dispenser": "soap dispenser", + "toilet_tank": "toilet", + "door_sill": "doorframe", + "cleanser": "soap", + "armrest": "armchair", + "short_wall": "wall", + "suspended_ceiling": "ceiling", + "fire_extinguisher_cabinet": "fire extinguisher", + "plastic_box": "plastic container", + "sanitation_station": "soap dispenser", + "plant_pot": "flowerpot", + "fireplace": "fireplace", + "computer_table": "desk", + "tissue_bag": "tissue box", + "wall_frame": "frame", + "map_board": "map", + "automated_teller_machine": "vending machine", + "ticket": "card", + "tablet": "ipad", + "blankets": "blanket", + "bags": "bag", + "flag": "flag", + "blackboard": "blackboard", + "bar_table": "bar", + "cardboard_holder": "cardboard", + "potted_planet": "potted plant", + "tray": "tray", + "utensil_holder": "kitchen counter", + "bird_ceramics": "statue", + "shirt": "shirt", + "clothes_rail": "clothes hanger", + "power_strips": "power strip", + "card_board": "board", + "pile_of_blankets": "blanket", + "bed_net": "bed", + "umbrella": "umbrella", + "dragon_fruit": "bananas", + "tissue": "tissue box", + "electrical_panel": "electric panel", + "panel": "door", + "tube": "tube", + "pile_of_cloth": "cloth", + "surface": "table", + "chair_cushion": "cushion", + "guide": "book", + "parapet": "railing", + "camera": "camera", + "light_base": "lamp base", + "first_aid": "object", + "bench": "bench", + "potted_plants": "potted plant", + "pot_cover": "pot", + "yoga_mat_roll": "yoga mat", + "panda_doll": "stuffed animal", + "window_trim": "window", + "shoe_cabinet": "shoe rack", + "toilet_paper_holder": "toilet paper dispenser", + "shower_faucet": "shower faucet handle", + "bath_sponge": "sponge", + "ornament": "decoration", + "planter_box": "plant", + "cooktop": "stove", + "knife_block": "knife block", + "step_stool": "step stool", + "touchpad": "keyboard", + "light_box": "light", + "sound": "speaker", + "exhaust_fan_vent": "vent", + "paperbin": "recycling bin", + "mop_bucket": "bucket", + "sneaker": "shoes", + "objects": "object", + "cd_tray": "cd case", + "wall_board": "board", + "room_divider": "divider", + "paiting": "painting", + "cabinet_otherroom": "cabinet", + "electric_switch": "light switch", + "sign": "exit sign", + "hand_soap": "soap bottle", + "window_blinds": "blinds" +} + +def read_label_map(metadata_dir, label_from='raw_category', label_to='nyu40id'): + LABEL_MAP_FILE = osp.join(metadata_dir, 'scannetv2-labels.combined.tsv') + assert osp.exists(LABEL_MAP_FILE) + + raw_label_map = read_label_mapping(LABEL_MAP_FILE, label_from=label_from, label_to=label_to) + return raw_label_map + +def read_label_mapping(filename, label_from='raw_category', label_to='nyu40id'): + assert osp.isfile(filename) + mapping = dict() + with open(filename) as csvfile: + reader = csv.DictReader(csvfile, delimiter='\t') + for row in reader: + mapping[row[label_from]] = row[label_to] + + if represents_int(list(mapping.keys())[0]): + mapping = {int(k):v for k,v in mapping.items()} + + return mapping + +def get_scan_ids(dirname, split): + filepath = osp.join(dirname, '{}_scans.txt'.format(split)) + scan_ids = np.genfromtxt(filepath, dtype = str) + 
+    return scan_ids
+
+def annotations_to_dataframe_obj(annotations):
+    objects = annotations['objects']
+    df_list = []
+    for obj in objects:
+        object_id = obj['objectId']
+        object_label = obj['label']
+        df_row = pd.DataFrame(
+            [[object_id, object_label]],
+            columns=['objectId', 'objectLabel']
+        )
+        df_list.append(df_row)
+    df = pd.concat(df_list)
+    return df
+
+def load_ply_data(data_dir, scan_id):
+    """
+    Load PLY data and propagate object IDs from faces to vertices.
+
+    Args:
+        data_dir (str): Directory containing the PLY file.
+        scan_id (str): Identifier for the scan.
+
+    Returns:
+        np.ndarray: Vertex data with propagated object IDs.
+    """
+    # with open(osp.join(data_dir, scan_id, f'{scan_id}.annotations.json'), "r", encoding='utf-8') as f:
+    #     annotations = json.load(f)
+
+    filename_in = osp.join(data_dir, scan_id, '{}.ply'.format(scan_id))
+
+    if not osp.exists(filename_in):
+        raise FileNotFoundError(f"PLY file not found: {filename_in}")
+
+    with open(filename_in, 'rb') as file:
+        ply_data = PlyData.read(file)
+
+    # Extract vertex properties
+    x = np.array(ply_data['vertex']['x'])
+    y = np.array(ply_data['vertex']['y'])
+    z = np.array(ply_data['vertex']['z'])
+    red = np.array(ply_data['vertex']['red'])
+    green = np.array(ply_data['vertex']['green'])
+    blue = np.array(ply_data['vertex']['blue'])
+
+    # Extract normals if available
+    if 'nx' in ply_data['vertex'] and 'ny' in ply_data['vertex'] and 'nz' in ply_data['vertex']:
+        nx = np.array(ply_data['vertex']['nx'])
+        ny = np.array(ply_data['vertex']['ny'])
+        nz = np.array(ply_data['vertex']['nz'])
+        normals = np.stack([nx, ny, nz], axis=-1)
+    else:
+        normals = None
+
+    vertex_object_ids = np.full(len(x), -1, dtype='int32')
+
+    # Extract face data
+    faces = ply_data['face'].data
+    face_vertex_indices = [face['vertex_indices'] for face in faces]
+    face_object_ids = [face['objectId'] for face in faces]
+
+    # Propagate object IDs to vertices
+    for face_indices, obj_id in zip(face_vertex_indices, face_object_ids):
+        vertex_object_ids[face_indices] = obj_id  # Assign object ID to all vertices in the face
+
+    vertex_dtype = [
+        ('x', 'f4'), ('y', 'f4'), ('z', 'f4'),           # Coordinates
+        ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'),  # Colors
+        ('objectId', 'i4')                               # Propagated Object ID
+    ]
+
+    if normals is not None:
+        vertex_dtype.extend([('nx', 'f4'), ('ny', 'f4'), ('nz', 'f4')])  # Normals
+
+    vertices = np.empty(len(x), dtype=vertex_dtype)
+    vertices['x'] = x.astype('f4')
+    vertices['y'] = y.astype('f4')
+    vertices['z'] = z.astype('f4')
+    vertices['red'] = red.astype('u1')
+    vertices['green'] = green.astype('u1')
+    vertices['blue'] = blue.astype('u1')
+    vertices['objectId'] = vertex_object_ids.astype('i4')
+
+    if normals is not None:
+        vertices['nx'] = normals[:, 0].astype('f4')
+        vertices['ny'] = normals[:, 1].astype('f4')
+        vertices['nz'] = normals[:, 2].astype('f4')
+
+    return vertices
+
+def load_meta_intrinsics(scan_dir, scene_id, stream_type="color_camera"):
+    '''
+    Load MultiScan intrinsic information
+    '''
+    meta_intrinsics_path = osp.join(scan_dir, f'{scene_id}.json')
+    intrinsics = {}
+
+    with open(meta_intrinsics_path, "r") as f:
+        json_data = json.load(f)
+
+    for stream in json_data.get("streams", []):
+        if stream.get("type") == stream_type:
+            intrinsic_mat = np.array(stream.get("intrinsics"))
+            intrinsic_mat = np.reshape(intrinsic_mat, (3, 3), order='F')
+            intrinsics['intrinsic_mat'] = intrinsic_mat
+            resolution = stream.get("resolution")
+            width, height = resolution[1], resolution[0]  # resolution[0] = height, resolution[1] = width
+            intrinsics['width'] = float(width)
+            intrinsics['height'] = float(height)
+
+    return intrinsics
+
+def load_intrinsics(scan_dir, scene_id, frame_id, stream_type="color_camera"):
+    '''
+    Load MultiScan intrinsic information
+    '''
+    intrinsics_path = osp.join(scan_dir, 'poses.jsonl')
+    resolution_path = osp.join(scan_dir, f'{scene_id}.json')
+    intrinsics = {}
+
+    with open(resolution_path, "r") as f:
+        json_data = json.load(f)
+
+    for stream in json_data.get("streams", []):
+        if stream.get("type") == stream_type:
+            resolution = stream.get("resolution", None)
+            if resolution:
+                width, height = resolution[1], resolution[0]  # resolution[0] = height, resolution[1] = width
+                intrinsics['width'] = float(width)
+                intrinsics['height'] = float(height)
+
+    with jsonlines.open(intrinsics_path) as reader:
+        for entry in reader:
+            if entry.get("frame_id") == frame_id:
+                intrinsic_mat = np.asarray(entry.get('intrinsics'))
+                intrinsic_mat = np.reshape(intrinsic_mat, (3, 3), order='F')
+                intrinsics['intrinsic_mat'] = intrinsic_mat
+                break
+
+    return intrinsics
+
+def load_pose(scan_dir, frame_id):
+    # Find alignment file
+    alignment_path = None
+    for file_name in os.listdir(scan_dir):
+        if file_name.endswith('.align.json'):
+            alignment_path = osp.join(scan_dir, file_name)
+            break
+
+    if alignment_path is None:
+        raise FileNotFoundError(f"No alignment file found in {scan_dir}")
+
+    with open(alignment_path, "r") as f:
+        alignment_data = json.load(f)
+    if 'coordinate_transform' not in alignment_data:
+        raise ValueError(f"Alignment file {alignment_path} does not contain 'coordinate_transform'")
+    coordinate_transform = np.reshape(alignment_data['coordinate_transform'], (4, 4), order='F')
+    inv_transform = np.linalg.inv(coordinate_transform)
+
+    pose_path = osp.join(scan_dir, 'poses.jsonl')
+    with jsonlines.open(pose_path) as reader:
+        for entry in reader:
+            if entry.get("frame_id") == frame_id:
+                transform = np.asarray(entry.get('transform'))
+                transform = np.reshape(transform, (4, 4), order='F')
+                transform = np.dot(transform, np.diag([1, -1, -1, 1]))
+                transform = transform / transform[3][3]
+                aligned_pose = inv_transform @ transform  # align camera poses
+                return aligned_pose
+
+    raise ValueError(f"Pose for frame_id {frame_id} not found in {pose_path}")
+
+def load_all_poses(scan_dir, frame_idxs):
+    frame_poses = {}
+    for frame_idx in frame_idxs:
+        frame_pose = load_pose(scan_dir, int(frame_idx))
+        frame_poses[frame_idx] = frame_pose
+    return frame_poses
+
+def load_frame_idxs(scan_dir, skip=None):
+    frames_paths = glob(osp.join(scan_dir, 'sequence', '*.jpg'))
+    frame_names = [osp.basename(frame_path) for frame_path in frames_paths]
+    frame_idxs = [frame_name.split('.')[0].split('-')[-1] for frame_name in frame_names]
+    frame_idxs.sort()
+
+    if skip is None:
+        frame_idxs = frame_idxs
+    else:
+        frame_idxs = [frame_idx for frame_idx in frame_idxs[::skip]]
+    return frame_idxs
+
+def represents_int(s):
+    ''' if string s represents an int.
+    '''
+    try:
+        int(s)
+        return True
+    except ValueError:
+        return False
\ No newline at end of file
diff --git a/util/scan3r.py b/util/scan3r.py
index 2727d5a..31684aa 100644
--- a/util/scan3r.py
+++ b/util/scan3r.py
@@ -3,6 +3,8 @@
 from plyfile import PlyData
 from glob import glob
 import csv
+import json
+import trimesh
 
 def get_scan_ids(dirname: str, split: str) -> np.ndarray:
     """Retrieve scan IDs for the given directory and split."""
@@ -10,34 +12,54 @@
     scan_ids = np.genfromtxt(filepath, dtype = str)
     return scan_ids
 
-def load_ply_data(data_dir: str, scan_id: str, label_file_name: str) -> np.ndarray:
-    """Load PLY data from specified directory, scan ID, and label file."""
+def load_ply_data(data_dir, scan_id, label_file_name):
     filename_in = osp.join(data_dir, scan_id, label_file_name)
     file = open(filename_in, 'rb')
     ply_data = PlyData.read(file)
     file.close()
     x = ply_data['vertex']['x']
-    y = ply_data['vertex']['y']
-    z = ply_data['vertex']['z']
-    red = ply_data['vertex']['red']
-    green = ply_data['vertex']['green']
-    blue = ply_data['vertex']['blue']
+    # y = ply_data['vertex']['y']
+    # z = ply_data['vertex']['z']
+    # red = ply_data['vertex']['red']
+    # green = ply_data['vertex']['green']
+    # blue = ply_data['vertex']['blue']
     object_id = ply_data['vertex']['objectId']
     global_id = ply_data['vertex']['globalId']
     nyu40_id = ply_data['vertex']['NYU40']
     eigen13_id = ply_data['vertex']['Eigen13']
     rio27_id = ply_data['vertex']['RIO27']
 
-    vertices = np.empty(len(x), dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4'), ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'),
+    obj_mesh = trimesh.load(osp.join(data_dir, scan_id, 'mesh.refined.v2.obj'))
+
+    obj_mesh_points = np.asarray(obj_mesh.vertices)
+    obj_mesh_colors = obj_mesh.visual.to_color().vertex_colors[:,:3]
+
+    min_vertices = min(len(object_id), len(x), obj_mesh_points.shape[0])
+
+    obj_mesh_points = obj_mesh_points[:min_vertices]
+    object_ids = object_id[:min_vertices]
+    obj_mesh_colors = obj_mesh_colors[:min_vertices]
+    global_id = global_id[:min_vertices]
+    nyu40_id = nyu40_id[:min_vertices]
+    eigen13_id = eigen13_id[:min_vertices]
+    rio27_id = rio27_id[:min_vertices]
+
+    vertices = np.empty(min_vertices, dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4'), ('red', 'u1'), ('green', 'u1'), ('blue', 'u1'),
                                        ('objectId', 'h'), ('globalId', 'h'), ('NYU40', 'u1'), ('Eigen13', 'u1'), ('RIO27', 'u1')])
-    vertices['x'] = x.astype('f4')
-    vertices['y'] = y.astype('f4')
-    vertices['z'] = z.astype('f4')
-    vertices['red'] = red.astype('u1')
-    vertices['green'] = green.astype('u1')
-    vertices['blue'] = blue.astype('u1')
-    vertices['objectId'] = object_id.astype('h')
+    # vertices['x'] = x.astype('f4')
+    # vertices['y'] = y.astype('f4')
+    # vertices['z'] = z.astype('f4')
+    # vertices['red'] = red.astype('u1')
+    # vertices['green'] = green.astype('u1')
+    # vertices['blue'] = blue.astype('u1')
+    vertices['x'] = obj_mesh_points[:, 0].astype('f4')
+    vertices['y'] = obj_mesh_points[:, 1].astype('f4')
+    vertices['z'] = obj_mesh_points[:, 2].astype('f4')
+    vertices['red'] = obj_mesh_colors[:, 0].astype('u1')
+    vertices['green'] = obj_mesh_colors[:, 1].astype('u1')
+    vertices['blue'] = obj_mesh_colors[:, 2].astype('u1')
+    vertices['objectId'] = object_ids.astype('h')
     vertices['globalId'] = global_id.astype('h')
     vertices['NYU40'] = nyu40_id.astype('u1')
     vertices['Eigen13'] = eigen13_id.astype('u1')
@@ -136,4 +158,69 @@ def represents_int(s: str) -> bool:
         int(s)
         return True
     except ValueError:
-        return False
\ No newline at end of file
+        return False
+
+def calc_align_matrix(bbox_list):
+    RANGE = [-45, 45]
+    NUM_BIN = 90
+    angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN)
+    angle_counts = {}
+    for _a in angles:
+        bucket = round(_a, 3)
+        for box in bbox_list:
+            box_r = rotate_z_axis_by_degrees(box, bucket)
+            bottom = box_r[4:]
+            if is_axis_aligned(bottom):
+                angle_counts[bucket] = angle_counts.get(bucket, 0) + 1
+    if len(angle_counts) == 0:
+        RANGE = [-90, 90]
+        NUM_BIN = 180
+        angles = np.linspace(RANGE[0], RANGE[1], NUM_BIN)
+        for _a in angles:
+            bucket = round(_a, 3)
+            for box in bbox_list:
+                box_r = rotate_z_axis_by_degrees(box, bucket)
+                bottom = box_r[4:]
+                if is_axis_aligned(bottom, thres=0.15):
+                    angle_counts[bucket] = angle_counts.get(bucket, 0) + 1
+    most_common_angle = max(angle_counts, key=angle_counts.get)
+    return most_common_angle
+
+def is_axis_aligned(rotated_box, thres=0.05):
+    x_diff = abs(rotated_box[0][0] - rotated_box[1][0])
+    y_diff = abs(rotated_box[0][1] - rotated_box[3][1])
+    return x_diff < thres and y_diff < thres
+
+def rotate_z_axis_by_degrees(pointcloud, theta, clockwise=True):
+    theta = np.deg2rad(theta)
+    cos_t = np.cos(theta)
+    sin_t = np.sin(theta)
+    rot_matrix = np.array([[cos_t, -sin_t, 0],
+                           [sin_t, cos_t, 0],
+                           [0, 0, 1]], pointcloud.dtype)
+    if not clockwise:
+        rot_matrix = rot_matrix.T
+    return pointcloud.dot(rot_matrix)
+
+def compute_box_3d(size, center, rotmat):
+    """Compute corners of a single box from rotation matrix
+    Args:
+        size: list of float [dx, dy, dz]
+        center: np.array [x, y, z]
+        rotmat: np.array (3, 3)
+    Returns:
+        corners: (8, 3)
+    """
+    l, h, w = [i / 2 for i in size]
+    center = np.reshape(center, (-1, 3))
+    center = center.reshape(3)
+    x_corners = [l, l, -l, -l, l, l, -l, -l]
+    y_corners = [h, -h, -h, h, h, -h, -h, h]
+    z_corners = [w, w, w, w, -w, -w, -w, -w]
+    corners_3d = np.dot(
+        np.transpose(rotmat), np.vstack([x_corners, y_corners, z_corners])
+    )
+    corners_3d[0, :] += center[0]
+    corners_3d[1, :] += center[1]
+    corners_3d[2, :] += center[2]
+    return np.transpose(corners_3d)
\ No newline at end of file