added ymir dataset preparation

botastic 2022-07-17 21:42:01 +02:00
parent a87940664d
commit 04358926d0
4 changed files with 195 additions and 1 deletion

.gitignore vendored

@@ -80,3 +80,8 @@ dataset/s3dis/Stanford3dDataset_v1.2
dataset/stpls3d/train
dataset/stpls3d/val
dataset/stpls3d/Synthetic_v3_InstanceSegmentation
# ymir dataset files
dataset/ymir/overfit_no_filter
**/*.h5
**/*.pth


@@ -0,0 +1,81 @@
model:
  channels: 16
  num_blocks: 7
  semantic_classes: 2
  instance_classes: 1
  sem2ins_classes: []
  semantic_only: True
  semantic_weight: [1.0, 1.0, 44.0, 21.9, 1.8, 25.1, 31.5, 21.8, 24.0, 54.4, 114.4,
                    81.2, 43.6, 9.7, 22.4] # TODO: What is this?!
  with_coords: False
  ignore_label: -100
  grouping_cfg:
    score_thr: 0.2
    radius: 0.9 # TODO: depending on the scale
    mean_active: 3
    # TODO: Insert the computed values
    class_numpoint_mean: [-1., 10408., 58., 124., 1351., 162., 430., 1090., 451., 26., 43.,
                          61., 39., 109., 1239.]
    npoint_thr: 0.05 # absolute if class_numpoint == -1, relative if class_numpoint != -1
    ignore_classes: [0] # TODO: Should we add the walls here?
  instance_voxel_cfg:
    scale: 3 # TODO: Adjust (?)
    spatial_shape: 20
  train_cfg:
    max_proposal_num: 200 # TODO: probably needs to be adjusted to the number of pores in one sample
    pos_iou_thr: 0.5
  test_cfg:
    x4_split: False
    cls_score_thr: 0.001
    mask_score_thr: -0.5
    min_npoint: 100
  fixed_modules: []

data:
  train:
    type: 'ymir'
    data_root: 'dataset/ymir/overfit_no_filter/prepared'
    prefix: 'train'
    suffix: '.pth'
    training: True
    repeat: 4
    voxel_cfg:
      scale: 3 # TODO: Adjust
      spatial_shape: [128, 512]
      max_npoint: 250000
      min_npoint: 5000
  test:
    type: 'ymir'
    data_root: 'dataset/ymir/overfit_no_filter/prepared'
    prefix: 'val'
    suffix: '.pth'
    training: False
    voxel_cfg:
      scale: 3 # TODO: Adjust
      spatial_shape: [128, 512]
      max_npoint: 250000
      min_npoint: 5000

dataloader:
  train:
    batch_size: 4
    num_workers: 4
  test:
    batch_size: 1
    num_workers: 1

optimizer:
  type: 'Adam'
  lr: 0.004

save_cfg:
  semantic: True
  offset: True
  instance: True

fp16: False
epochs: 20
step_epoch: 20
save_freq: 4
pretrain: ''
work_dir: ''
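
The `npoint_thr` comment above is easy to misread, so here is a minimal sketch of the proposal filtering it describes; the function name and structure are assumptions for illustration, not code from the SoftGroup source:

``` python
def proposal_large_enough(npoint: int, numpoint_mean: float, npoint_thr: float) -> bool:
    """Sketch of the npoint_thr semantics from the config comment."""
    if numpoint_mean == -1:
        # no per-class mean recorded: treat npoint_thr as an absolute point count
        return npoint > npoint_thr
    # per-class mean available: treat npoint_thr as a fraction of that mean
    return npoint > npoint_thr * numpoint_mean
```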


@@ -0,0 +1,108 @@
from typing import List

import glob
import os

import h5py
import numpy as np
import torch


def convertToPointCloud(
    files: List[str],
    outPutFolder: str,
    split: str = 'train',
    samplePoints: int = 0,  # 0 = no sampling
):
    train_instance_numpoints = 0
    train_instances = 0
    for file in files:
        # splitext instead of str.strip('.h5'), which strips the *characters*
        # '.', 'h' and '5' from both ends rather than removing the suffix
        name = os.path.splitext(os.path.basename(file))[0]
        outFilePath = os.path.join(outPutFolder, name + '.pth')
        # read in file
        with h5py.File(file, "r") as data:
            raw = np.array(data['raw'])
            colors = raw.flatten()  # column first
            colors = np.repeat(colors[:, np.newaxis], 3, axis=1)
            colors = colors.astype(np.float32)
            # normalize from the int16 range to [-1, 1]
            colors = colors / 32767.5 - 1
            # regular grid of coordinates in [0, 1]^3, one point per voxel
            coords = np.mgrid[
                0:1:raw.shape[0] * 1j,
                0:1:raw.shape[1] * 1j,
                0:1:raw.shape[2] * 1j,
            ].reshape(3, -1).T
            coords = coords.astype(np.float32)
            # sampling of points
            samples = np.arange(0, coords.shape[0])
            if samplePoints > 0:
                # NOTE: np.random.choice samples with replacement by default,
                # so the same point may be picked more than once
                samples = np.random.choice(coords.shape[0], samplePoints)
            colors = colors[samples]
            coords = coords[samples]
            if split != 'test':
                # seems a bit weird, but they used float64 for the labels,
                # so let's use it as well
                sem_labels = np.array(data['foreground']).flatten().astype(np.float64)
                # map the background value (= 0, i.e. sugar walls) to -100
                sem_labels[sem_labels == 0] = -100
                instance_labels = np.array(data['label']).flatten().astype(np.float64)
                # sampling
                sem_labels = sem_labels[samples]
                instance_labels = instance_labels[samples]
                # keep track of the mean number of points per instance for the
                # training dataset
                # NOTE: This only works as long as we have one type of class
                if split == 'train':
                    values, counts = np.unique(instance_labels, return_counts=True)
                    assert values[0] == 0
                    print(values, counts)
                    train_instance_numpoints += np.sum(counts[1:])
                    train_instances += len(counts[1:])
                torch.save((coords, colors, sem_labels, instance_labels), outFilePath)
            else:
                torch.save((coords, colors), outFilePath)
    if split == 'train':
        assert train_instances > 0
        print('class_numpoints_mean: ', train_instance_numpoints / train_instances)


def getFiles(files, fileSplit):
    res = []
    for filePath in files:
        name = os.path.basename(filePath)
        num = name[:2] if name[:2].isdigit() else name[:1]
        if int(num) in fileSplit:
            res.append(filePath)
    return res


if __name__ == '__main__':
    data_folder = 'overfit_no_filter'

    split = 'train'
    trainFiles = sorted(glob.glob(data_folder + "/" + split + '/*.h5'))
    print(trainFiles)
    assert len(trainFiles) > 0
    trainOutDir = split
    os.makedirs(trainOutDir, exist_ok=True)
    convertToPointCloud(trainFiles, trainOutDir, split, samplePoints=35145)

    split = 'val'
    valFiles = sorted(glob.glob(data_folder + "/" + split + '/*.h5'))
    print(valFiles)
    assert len(valFiles) > 0
    valOutDir = split
    os.makedirs(valOutDir, exist_ok=True)
    convertToPointCloud(valFiles, valOutDir, split, samplePoints=35145)
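
As a quick sanity check, a prepared sample can be loaded back with `torch.load`; the file name below is hypothetical, any `.pth` written by `convertToPointCloud` works:

``` python
import torch

# hypothetical file name, for illustration only
coords, colors, sem_labels, instance_labels = torch.load('train/example.pth')
print(coords.shape, colors.shape, sem_labels.shape, instance_labels.shape)
```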


@@ -1,4 +1,4 @@
```
``` yaml
model:
  channels: 32 # number of base channels for the backbone network
  num_blocks: 7 # number of backbone blocks