refactor bfs_cluster

This commit is contained in:
Thang Vu
2022-04-08 09:06:37 +00:00
parent 190415587a
commit 0a17ca2cd1
12 changed files with 59 additions and 572 deletions

View File

@@ -12,10 +12,13 @@ model:
mean_active: 300
class_numpoint_mean: [1823, 7457, 6189, 7424, 34229, 1724, 5439,
6016, 39796, 5279, 5092, 12210, 10225]
npoint_thr: 0.05 # absolute if class_numpoint == -1, relative if class_numpoint != -1
ignore_classes: [0, 1]
instance_voxel_cfg:
scale: 50
spatial_shape: 20
train_cfg:
max_proposal_num: 200
pos_iou_thr: 0.5
test_cfg:
x4_split: True

View File

@@ -19,7 +19,6 @@ class HierarchicalAggregation(Function):
'''
N = start_len.size(0)
assert cluster_numpoint_mean.is_contiguous()
assert semantic_label.is_contiguous()
assert coord_shift.is_contiguous()
assert ball_query_idxs.is_contiguous()
@@ -329,10 +328,9 @@ ballquery_batch_p = BallQueryBatchP.apply
class BFSCluster(Function):
@staticmethod
def forward(ctx, semantic_label, ball_query_idxs, start_len, threshold):
def forward(ctx, cluster_numpoint_mean, ball_query_idxs, start_len, threshold, class_id):
'''
:param ctx:
:param semantic_label: (N), int
:param ball_query_idxs: (nActive), int
:param start_len: (N, 2), int
:return: cluster_idxs: int (sumNPoint, 2), dim 0 for cluster_id, dim 1 for corresponding point idxs in N
@@ -340,15 +338,14 @@ class BFSCluster(Function):
'''
N = start_len.size(0)
assert semantic_label.is_contiguous()
assert cluster_numpoint_mean.is_contiguous()
assert ball_query_idxs.is_contiguous()
assert start_len.is_contiguous()
cluster_idxs = semantic_label.new()
cluster_offsets = semantic_label.new()
cluster_idxs = ball_query_idxs.new()
cluster_offsets = ball_query_idxs.new()
SOFTGROUP_OP.bfs_cluster(semantic_label, ball_query_idxs, start_len, cluster_idxs, cluster_offsets, N, threshold)
SOFTGROUP_OP.bfs_cluster(cluster_numpoint_mean, ball_query_idxs, start_len, cluster_idxs, cluster_offsets, N, threshold, class_id)
return cluster_idxs, cluster_offsets

View File

@@ -2,12 +2,13 @@
Ball Query with BatchIdx & Clustering Algorithm
Written by Li Jiang
All Rights Reserved 2020.
Modified by Thang Vu - Remove semantic label in clustering
*/
#include "bfs_cluster.h"
/* ================================== ballquery_batch_p
* ================================== */
/* =================== ballquery_batch_p================================= */
// input xyz: (n, 3) float
// input batch_idxs: (n) int
// input batch_offsets: (B+1) int, batch_offsets[-1]
@@ -29,10 +30,8 @@ int ballquery_batch_p(at::Tensor xyz_tensor, at::Tensor batch_idxs_tensor,
return cumsum;
}
/* ================================== bfs_cluster
* ================================== */
ConnectedComponent find_cc(Int idx, int *semantic_label, Int *ball_query_idxs,
int *start_len, int *visited) {
ConnectedComponent find_cc(Int idx, Int *ball_query_idxs, int *start_len,
int *visited) {
ConnectedComponent cc;
cc.addPoint(idx);
visited[idx] = 1;
@@ -46,47 +45,46 @@ ConnectedComponent find_cc(Int idx, int *semantic_label, Int *ball_query_idxs,
Q.pop();
int start = start_len[cur * 2];
int len = start_len[cur * 2 + 1];
int label_cur = semantic_label[cur];
for (Int i = start; i < start + len; i++) {
Int idx_i = ball_query_idxs[i];
if (semantic_label[idx_i] != label_cur)
continue;
if (visited[idx_i] == 1)
continue;
cc.addPoint(idx_i);
visited[idx_i] = 1;
Q.push(idx_i);
}
}
return cc;
}
// input: semantic_label, int, N
// input: ball_query_idxs, Int, (nActive)
// input: start_len, int, (N, 2)
// output: clusters, CCs
int get_clusters(int *semantic_label, Int *ball_query_idxs, int *start_len,
const Int nPoint, int threshold,
ConnectedComponents &clusters) {
int visited[nPoint] = {0};
int get_clusters(float *class_numpoint_mean, int *ball_query_idxs,
int *start_len, const int nPoint, float threshold,
ConnectedComponents &clusters, const int class_id) {
int *visited = new int[nPoint]{0};
float _class_numpoint_mean, thr;
int sumNPoint = 0;
for (Int i = 0; i < nPoint; i++) {
for (int i = 0; i < nPoint; i++) {
if (visited[i] == 0) {
ConnectedComponent CC =
find_cc(i, semantic_label, ball_query_idxs, start_len, visited);
if ((int)CC.pt_idxs.size() >= threshold) {
ConnectedComponent CC = find_cc(i, ball_query_idxs, start_len, visited);
_class_numpoint_mean = class_numpoint_mean[class_id];
// if _class_num_point_mean is not defined (-1) directly use threshold
if (_class_numpoint_mean == -1) {
thr = threshold;
} else {
thr = threshold * _class_numpoint_mean;
}
if ((int)CC.pt_idxs.size() >= thr) {
clusters.push_back(CC);
sumNPoint += (int)CC.pt_idxs.size();
}
}
}
return sumNPoint;
}
// convert from ConnectedComponents to (idxs, offsets) representation
void fill_cluster_idxs_(ConnectedComponents &CCs, int *cluster_idxs,
int *cluster_offsets) {
for (int i = 0; i < (int)CCs.size(); i++) {
@@ -99,33 +97,29 @@ void fill_cluster_idxs_(ConnectedComponents &CCs, int *cluster_idxs,
}
}
// input: semantic_label, int, N
// input: class_numpoint_mean_tensor
// input: ball_query_idxs, int, (nActive)
// input: start_len, int, (N, 2)
// output: cluster_idxs, int (sumNPoint, 2), dim 0 for cluster_id, dim 1 for
// corresponding point idxs in N
// output: cluster_offsets, int (nCluster + 1)
void bfs_cluster(at::Tensor semantic_label_tensor,
void bfs_cluster(at::Tensor class_numpoint_mean_tensor,
at::Tensor ball_query_idxs_tensor, at::Tensor start_len_tensor,
at::Tensor cluster_idxs_tensor,
at::Tensor cluster_offsets_tensor, const int N,
int threshold) {
int *semantic_label = semantic_label_tensor.data<int>();
float threshold, const int class_id) {
float *class_numpoint_mean = class_numpoint_mean_tensor.data<float>();
Int *ball_query_idxs = ball_query_idxs_tensor.data<Int>();
int *start_len = start_len_tensor.data<int>();
ConnectedComponents CCs;
int sumNPoint = get_clusters(semantic_label, ball_query_idxs, start_len, N,
threshold, CCs);
int sumNPoint = get_clusters(class_numpoint_mean, ball_query_idxs, start_len,
N, threshold, CCs, class_id);
int nCluster = (int)CCs.size();
cluster_idxs_tensor.resize_({sumNPoint, 2});
cluster_offsets_tensor.resize_({nCluster + 1});
cluster_idxs_tensor.zero_();
cluster_offsets_tensor.zero_();
int *cluster_idxs = cluster_idxs_tensor.data<int>();
int *cluster_offsets = cluster_offsets_tensor.data<int>();
fill_cluster_idxs_(CCs, cluster_idxs, cluster_offsets);
}
}

View File

@@ -21,9 +21,10 @@ int ballquery_batch_p_cuda(int n, int meanActive, float radius,
const int *batch_offsets, int *idx, int *start_len,
cudaStream_t stream);
void bfs_cluster(at::Tensor semantic_label_tensor,
void bfs_cluster(at::Tensor class_numpoint_mean_tensor,
at::Tensor ball_query_idxs_tensor, at::Tensor start_len_tensor,
at::Tensor cluster_idxs_tensor,
at::Tensor cluster_offsets_tensor, const int N, int threshold);
at::Tensor cluster_offsets_tensor, const int N,
float threshold, const int class_id);
#endif // BFS_CLUSTER_H
#endif // BFS_CLUSTER_H

View File

@@ -1,10 +1,8 @@
#include "datatype/datatype.h"
#include <ATen/ATen.h>
#include "cal_iou_and_masklabel/cal_iou_and_masklabel.cu"
#include "hierarchical_aggregation/hierarchical_aggregation.cu"
#include "bfs_cluster/bfs_cluster.cu"
#include "cal_iou_and_masklabel/cal_iou_and_masklabel.cu"
#include "get_iou/get_iou.cu"
#include "roipool/roipool.cu"
#include "sec_mean/sec_mean.cu"
@@ -17,4 +15,4 @@ template void voxelize_fp_cuda<float>(Int nOutputRows, Int maxActive,
template void voxelize_bp_cuda<float>(Int nOutputRows, Int maxActive,
Int nPlanes, float *d_output_feats,
float *d_feats, Int *rules, bool average);
float *d_feats, Int *rules, bool average);

View File

@@ -1,210 +0,0 @@
#include "hierarchical_aggregation.h"
#include "time.h"
/* ================================== hierarchical_aggregation
* ================================== */
// Instance point count for each class, statistical data from the training set.
// The table has 20 slots; the first two hold -1 (presumably "no statistic for
// this class" -- confirm against the dataset's class list).
// NOTE(review): nothing in the code visible in this file reads this table;
// split_clusters takes its per-class mean from its class_numpoint_mean
// argument instead. Confirm whether this global is still needed.
float class_numpoint_mean_dict[20] = {
-1., -1., 3917., 12056., 2303., 8331., 3948., 3166., 5629., 11719.,
1003., 3317., 4912., 10221., 3889., 4136., 2120., 945., 3967., 2589.};
// Breadth-first search that collects the connected component containing `idx`.
//
// idx             : seed point; must not have been visited yet
// semantic_label  : unused here (label filtering is disabled)
// coord_shift     : 3 floats per point, summed into the component's accum_x/y/z
// batch_idxs      : per-point batch index; the seed's value is stored on the
//                   component
// ball_query_idxs : flattened neighbour lists
// start_len       : 2 ints per point: (first neighbour slot, neighbour count)
// visited         : per-point flags; every point added to the component is
//                   set to 1
//
// Returns the component holding every point reachable from the seed through
// neighbour links that was not already flagged as visited.
ConnectedComponent find_cc(int idx, int *semantic_label, float *coord_shift,
                           int *batch_idxs, int *ball_query_idxs,
                           int *start_len, int *visited) {
  ConnectedComponent component;
  std::queue<int> frontier;

  // Add one point to the component: record it, accumulate its coordinates,
  // flag it as visited and schedule its neighbours for expansion.
  auto absorb = [&](int point) {
    component.addPoint(point);
    component.accum_x += coord_shift[point * 3 + 0];
    component.accum_y += coord_shift[point * 3 + 1];
    component.accum_z += coord_shift[point * 3 + 2];
    visited[point] = 1;
    frontier.push(point);
  };

  absorb(idx);
  component.batch_idx = batch_idxs[idx]; // record batch info of the seed

  while (!frontier.empty()) {
    const int current = frontier.front();
    frontier.pop();

    const int first = start_len[current * 2];
    const int count = start_len[current * 2 + 1];
    for (int k = 0; k < count; ++k) {
      const int neighbour = ball_query_idxs[first + k];
      if (visited[neighbour] != 1) {
        absorb(neighbour);
      }
    }
  }
  return component;
}
// Split the connected components of the point set into fragment / kept /
// primary groups based on their point count.
//
// For the class selected by `class_id`, with mean = class_numpoint_mean[class_id]:
//   size <  0.3  * mean            -> fragment
//   0.05 * mean <= size < 0.3*mean -> fragment AND kept (large enough to be an
//                                     independent instance)
//   size >= 0.3  * mean            -> primary
//
// class_numpoint_mean : per-class mean point count, indexed by class_id
// semantic_label, coord_shift, batch_idxs, ball_query_idxs, start_len
//                     : forwarded unchanged to find_cc
// nPoint              : number of points; nothing is done when nPoint <= 0
// CCs_fragment / CCs_kept / CCs_primary
//                     : output containers, appended to (not cleared)
// sumNPoint_fragment / sumNPoint_kept / sumNPoint_primary
//                     : running point totals, incremented (not reset)
// class_id            : class whose mean point count sets the thresholds
void split_clusters(float *class_numpoint_mean, int *semantic_label,
                    float *coord_shift, int *batch_idxs, int *ball_query_idxs,
                    int *start_len, const int nPoint,
                    ConnectedComponents &CCs_fragment,
                    ConnectedComponents &CCs_kept,
                    ConnectedComponents &CCs_primary, int *sumNPoint_fragment,
                    int *sumNPoint_kept, int *sumNPoint_primary,
                    const int class_id) {
  if (nPoint <= 0) {
    return; // nothing to cluster; also avoids a zero/negative-length new[]
  }

  // Value-initialised (all zeros) visit flags. `new int[n]()` is used rather
  // than `new int[n]{0}` because the braced form is not valid for n == 0.
  int *visited = new int[nPoint]();

  // The thresholds depend only on class_id, so compute them once.
  const float _class_numpoint_mean = class_numpoint_mean[class_id];
  const float low_thre = 0.05 * _class_numpoint_mean;
  const float high_thre = 0.3 * _class_numpoint_mean;

  for (int i = 0; i < nPoint; i++) {
    if (visited[i] != 0) {
      continue; // already absorbed into an earlier component
    }
    ConnectedComponent CC =
        find_cc(i, semantic_label, coord_shift, batch_idxs, ball_query_idxs,
                start_len, visited);
    const int cc_size = (int)CC.pt_idxs.size();

    if (cc_size < high_thre) {
      CCs_fragment.push_back(CC);
      *sumNPoint_fragment += cc_size;
      // keep fragments which are large enough to be independent instances
      if (cc_size >= low_thre) {
        CCs_kept.push_back(CC);
        *sumNPoint_kept += cc_size;
      }
    } else {
      CCs_primary.push_back(CC);
      *sumNPoint_primary += cc_size;
    }
  }

  // The original never released this buffer, leaking nPoint ints per call.
  delete[] visited;
}
// Flatten a list of connected components into the (idxs, offsets, centers)
// array representation.
//
// cluster_offsets : (nCluster + 1); entry 0 is read as the starting offset and
//                   entries 1..nCluster are written as running point totals
// cluster_idxs    : 2 ints per point: (cluster id, point index)
// cluster_centers : 5 floats per cluster: mean x, mean y, mean z, cls_label,
//                   batch_idx
void fill_cluster_idxs_(ConnectedComponents &CCs, int *cluster_idxs,
                        int *cluster_offsets, float *cluster_centers) {
  const int n_clusters = (int)CCs.size();
  for (int c = 0; c < n_clusters; ++c) {
    const auto &cc = CCs[c];
    const int n_points = (int)cc.pt_idxs.size();
    const int base = cluster_offsets[c];
    cluster_offsets[c + 1] = base + n_points;

    // Centre = accumulated coordinates divided by the point count.
    const float denom = (float)cc.pt_idxs.size();
    float *center = cluster_centers + c * 5;
    center[0] = cc.accum_x / denom;
    center[1] = cc.accum_y / denom;
    center[2] = cc.accum_z / denom;
    center[3] = (float)cc.cls_label;
    center[4] = (float)cc.batch_idx;

    // One (cluster id, point index) row per member point.
    int *row = cluster_idxs + base * 2;
    for (int p = 0; p < n_points; ++p, row += 2) {
      const int point = cc.pt_idxs[p];
      row[0] = c;
      row[1] = point;
    }
  }
}
// Host entry point of hierarchical aggregation.
//
// Runs split_clusters over the N points to obtain fragment / kept / primary
// connected components, writes the kept and primary groups into their output
// tensors, and -- only when using_set_aggr_ != 0 -- also writes the fragment
// groups and calls hierarchical_aggregation_cuda to merge fragments into
// primaries.
//
// input: class_numpoint_mean_tensor, float, indexed by class_id
// input: semantic_label, int, (N)
// input: coord_shift, float, (N, 3)
// input: batch_idxs, int, (N)
// input: ball_query_idxs, int, (nActive)
// input: start_len, int, (N, 2)
// output: (fragment_idxs, fragment_offsets, fragment_centers) for fragment
//         clusters (only filled when using_set_aggr_ != 0)
// output: (cluster_idxs_kept_tensor, cluster_offsets_kept_tensor,
//         cluster_centers_kept_tensor) for keeping some fragments
// output: (primary_idxs_tensor, primary_offsets, primary_centers) for primary
//         clusters
// output: (primary_idxs_post_tensor, primary_offsets_post_tensor) for
//         aggregated clusters (only filled when using_set_aggr_ != 0)
//
// Each *_idxs tensor is resized to (nPoints, 2), each *_offsets tensor to
// (nClusters + 1) and each *_centers tensor to (nClusters, 5); all are
// zero-filled before being populated.
// training_mode_ is accepted but not read anywhere in this function.
void hierarchical_aggregation(
at::Tensor class_numpoint_mean_tensor, at::Tensor semantic_label_tensor,
at::Tensor coord_shift_tensor, at::Tensor batch_idxs_tensor,
at::Tensor ball_query_idxs_tensor, at::Tensor start_len_tensor,
at::Tensor fragment_idxs_tensor, at::Tensor fragment_offsets_tensor,
at::Tensor fragment_centers_tensor, at::Tensor cluster_idxs_kept_tensor,
at::Tensor cluster_offsets_kept_tensor,
at::Tensor cluster_centers_kept_tensor, at::Tensor primary_idxs_tensor,
at::Tensor primary_offsets_tensor, at::Tensor primary_centers_tensor,
at::Tensor primary_idxs_post_tensor, at::Tensor primary_offsets_post_tensor,
const int N, const int training_mode_, const int using_set_aggr_,
const int class_id) {
// Raw pointers into the input tensors; they are dereferenced on the host by
// split_clusters / find_cc.
float *class_numpoint_mean = class_numpoint_mean_tensor.data<float>();
int *semantic_label = semantic_label_tensor.data<int>();
float *coord_shift = coord_shift_tensor.data<float>();
int *batch_idxs = batch_idxs_tensor.data<int>();
int *ball_query_idxs = ball_query_idxs_tensor.data<int>();
int *start_len = start_len_tensor.data<int>();
ConnectedComponents CCs_fragment;
ConnectedComponents CCs_kept;
ConnectedComponents CCs_primary;
// Point totals per group; they size the (nPoints, 2) index tensors below.
int sumNPoint_fragment = 0, sumNPoint_kept = 0, sumNPoint_primary = 0;
split_clusters(class_numpoint_mean, semantic_label, coord_shift, batch_idxs,
ball_query_idxs, start_len, N, CCs_fragment, CCs_kept,
CCs_primary, &sumNPoint_fragment, &sumNPoint_kept,
&sumNPoint_primary, class_id);
// ---- kept fragments ----
// Pointers are taken only after resize_/zero_; zero_ also provides the
// offsets[0] == 0 start value that fill_cluster_idxs_ reads.
cluster_idxs_kept_tensor.resize_({sumNPoint_kept, 2});
cluster_offsets_kept_tensor.resize_({(int)CCs_kept.size() + 1});
cluster_centers_kept_tensor.resize_({(int)CCs_kept.size(), 5});
cluster_idxs_kept_tensor.zero_();
cluster_offsets_kept_tensor.zero_();
cluster_centers_kept_tensor.zero_();
int *cluster_idxs_kept = cluster_idxs_kept_tensor.data<int>();
int *cluster_offsets_kept = cluster_offsets_kept_tensor.data<int>();
float *cluster_centers_kept = cluster_centers_kept_tensor.data<float>();
fill_cluster_idxs_(CCs_kept, cluster_idxs_kept, cluster_offsets_kept,
cluster_centers_kept);
// ---- primary clusters ----
primary_idxs_tensor.resize_({sumNPoint_primary, 2});
primary_offsets_tensor.resize_({(int)CCs_primary.size() + 1});
primary_centers_tensor.resize_({(int)CCs_primary.size(), 5});
primary_idxs_tensor.zero_();
primary_offsets_tensor.zero_();
primary_centers_tensor.zero_();
int *primary_idxs = primary_idxs_tensor.data<int>();
int *primary_offsets = primary_offsets_tensor.data<int>();
float *primary_centers = primary_centers_tensor.data<float>();
fill_cluster_idxs_(CCs_primary, primary_idxs, primary_offsets,
primary_centers);
// Without set aggregation the fragment_* and primary_*_post tensors are left
// exactly as the caller passed them in.
if (using_set_aggr_ == 0) { // only point aggr
return;
}
// ---- fragments (inputs to set aggregation) ----
fragment_idxs_tensor.resize_({sumNPoint_fragment, 2});
fragment_offsets_tensor.resize_({(int)CCs_fragment.size() + 1});
fragment_centers_tensor.resize_(
{(int)CCs_fragment.size(),
5}); //[:, -2] for cls_label, [:, -1] for batch_idx
fragment_idxs_tensor.zero_();
fragment_offsets_tensor.zero_();
fragment_centers_tensor.zero_();
int *fragment_idxs = fragment_idxs_tensor.data<int>();
int *fragment_offsets = fragment_offsets_tensor.data<int>();
float *fragment_centers = fragment_centers_tensor.data<float>();
fill_cluster_idxs_(CCs_fragment, fragment_idxs, fragment_offsets,
fragment_centers);
// prepare tensor for storing post-primary
primary_idxs_post_tensor.resize_(
{sumNPoint_fragment + sumNPoint_primary,
2}); // never overflow, but need to cut off tails
primary_offsets_post_tensor.resize_({(int)CCs_primary.size() + 1});
primary_idxs_post_tensor.zero_();
primary_offsets_post_tensor.zero_();
int *primary_idxs_post = primary_idxs_post_tensor.data<int>();
int *primary_offsets_post = primary_offsets_post_tensor.data<int>();
// set aggr
// All pointers handed over here are host pointers; the callee copies them to
// the device itself and writes the merged result back into the *_post arrays.
hierarchical_aggregation_cuda(
sumNPoint_fragment, (int)CCs_fragment.size(), fragment_idxs,
fragment_offsets, fragment_centers, sumNPoint_primary,
(int)CCs_primary.size(), primary_idxs, primary_offsets, primary_centers,
primary_idxs_post, primary_offsets_post);
}

View File

@@ -1,254 +0,0 @@
#include "hierarchical_aggregation.h"
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// Upper bound on the number of primary clusters; asserted by
// hierarchical_aggregation_cuda before launching concat_fragments_.
#define MAX_PRIMARY_NUM 1024
// Per-primary capacity (in fragments) of the absorb-index table; also the
// row stride of that table.
#define MAX_PER_PRIMARY_ABSORB_FRAGMENT_NUM 1024
// Initial "nearest squared distance" in fragment_find_primary_; a primary
// whose squared distance is not below this value is never selected.
#define INFINITY_DIS_SQUARE 10000
// Per-primary capacity (in points) of the concatenated fragment-point buffer.
#define MAX_PER_PRIMARY_ABSORB_POINT_NUM 8192
// Block size used when launching fragment_find_primary_.
#define MAX_THREADS_PER_BLOCK 512
// Integer division rounded up (for positive m and n).
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
// Kernel: for every fragment, find the nearest primary with the same class
// label and batch index and, if the fragment lies within that primary's
// absorption radius, register the fragment with the primary.
// One thread handles one fragment (thread index = fragment index).
//
// input: cuda_fragment_centers (fragment_num * 5,), 5 for (x, y, z, cls_label,
// batch_idx)
// input: cuda_primary_centers (primary_num * 5,), 5 for (x, y, z, cls_label,
// batch_idx)
// input: cuda_primary_offsets (primary_num + 1,), used to get each primary's
// point count
// input: cuda_fragment_offsets, not read by this kernel
// output: cuda_primary_absorb_fragment_idx, row i (stride
// MAX_PER_PRIMARY_ABSORB_FRAGMENT_NUM) lists the fragments absorbed by primary i
// output: cuda_primary_absorb_fragment_cnt, per-primary counter incremented
// with atomicAdd; it must be zero on entry -- the launcher has to guarantee
// this
__global__ void
fragment_find_primary_(int primary_num, int *cuda_primary_offsets,
float *cuda_primary_centers, int fragment_num,
int *cuda_fragment_offsets, float *cuda_fragment_centers,
int *cuda_primary_absorb_fragment_idx,
int *cuda_primary_absorb_fragment_cnt) {
int fragment_idx = blockIdx.x * blockDim.x + threadIdx.x;
// Threads past the last fragment (last block is padded) have nothing to do.
if (fragment_idx >= fragment_num)
return;
// find the nearest primary for each fragment
float nearest_dis_square = INFINITY_DIS_SQUARE;
int nearest_idx = -1; // primary_idx
for (int i = 0; i < primary_num; i++) {
// Labels and batch indices are stored as floats, hence the tolerance
// comparison instead of ==.
if (abs(cuda_primary_centers[i * 5 + 3] -
cuda_fragment_centers[fragment_idx * 5 + 3]) >
0.1) { // judge same cls_label or not
continue;
}
if (abs(cuda_primary_centers[i * 5 + 4] -
cuda_fragment_centers[fragment_idx * 5 + 4]) >
0.1) { // judge same batch_idx or not
continue;
}
// Squared Euclidean distance between the two centers.
float temp_dis_square = pow((cuda_primary_centers[i * 5 + 0] -
cuda_fragment_centers[fragment_idx * 5 + 0]),
2) +
pow((cuda_primary_centers[i * 5 + 1] -
cuda_fragment_centers[fragment_idx * 5 + 1]),
2) +
pow((cuda_primary_centers[i * 5 + 2] -
cuda_fragment_centers[fragment_idx * 5 + 2]),
2);
if (temp_dis_square < nearest_dis_square) {
nearest_dis_square = temp_dis_square;
nearest_idx = i;
}
}
if (nearest_idx == -1)
return; // fragment not belong to any primary
// r_size: radius that grows with the square root of the primary's point count
int primary_point_num =
cuda_primary_offsets[nearest_idx + 1] - cuda_primary_offsets[nearest_idx];
float r_size = 0.01 * sqrt(float(primary_point_num));
// r_cls
// instance radius for each class, statistical data from the training set
float class_radius_mean[20] = {-1.,
-1.,
0.7047687683952325,
1.1732690381942337,
0.39644035821116036,
1.011516629020215,
0.7260155292902369,
0.8674973999335017,
0.8374931435447094,
1.0454153869133096,
0.32879464797430913,
1.1954566226966346,
0.8628817944400078,
1.0416287916782507,
0.6602697958671507,
0.8541363897836871,
0.38055290598206537,
0.3011878752684007,
0.7420871812436316,
0.4474268644407741};
// NOTE(review): _class_idx is used as an index into the 20-entry table
// without a range check -- confirm cls_label is always within [0, 20).
int _class_idx = (int)cuda_fragment_centers[fragment_idx * 5 + 3];
float r_cls = class_radius_mean[_class_idx] * 1.;
// r_set: the larger of the size-based and class-based radii
float r_set = max(r_size, r_cls);
// judge
if (nearest_dis_square < r_set * r_set) {
// Reserve a slot in the primary's list; the counter keeps growing even when
// the list is full, in which case the fragment is silently dropped.
int _offect = atomicAdd(cuda_primary_absorb_fragment_cnt + nearest_idx, 1);
if (_offect < MAX_PER_PRIMARY_ABSORB_FRAGMENT_NUM)
cuda_primary_absorb_fragment_idx[nearest_idx *
MAX_PER_PRIMARY_ABSORB_FRAGMENT_NUM +
_offect] = fragment_idx;
else {
;
}
}
}
// Kernel: gather the points of every fragment absorbed by a primary into that
// primary's slice of the concatenation buffer. One block handles one primary.
//
// input: cuda_fragment_idxs, 2 ints per fragment point (cluster id, point idx)
// input: cuda_fragment_offsets, (fragment_num + 1,)
// input: cuda_primary_idxs / cuda_primary_offsets, not read by this kernel
// input: cuda_primary_absorb_fragment_idx / _cnt, fragments absorbed by each
//        primary and how many (at most MAX_PER_PRIMARY_ABSORB_FRAGMENT_NUM of
//        them are used)
// output: cuda_concat_idxs, per primary a slice of
//         MAX_PER_PRIMARY_ABSORB_POINT_NUM rows of (primary idx, point idx);
//         points beyond that capacity are dropped
// output: cuda_concat_point_num, number of rows written per primary
__global__ void
concat_fragments_(int *cuda_fragment_idxs, int *cuda_fragment_offsets,
                  int *cuda_primary_idxs, int *cuda_primary_offsets,
                  int *cuda_primary_absorb_fragment_idx,
                  int *cuda_primary_absorb_fragment_cnt, int *cuda_concat_idxs,
                  int *cuda_concat_point_num, int primary_num) {
  const int primary_idx = blockIdx.x;
  if (primary_idx >= primary_num)
    return;

  // This primary's row of absorbed fragment ids and its output slice.
  const int *absorbed = cuda_primary_absorb_fragment_idx +
                        primary_idx * MAX_PER_PRIMARY_ABSORB_FRAGMENT_NUM;
  int *out = cuda_concat_idxs +
             primary_idx * MAX_PER_PRIMARY_ABSORB_POINT_NUM * 2;

  // The counter may exceed the table capacity; only the stored entries count.
  const int recorded = cuda_primary_absorb_fragment_cnt[primary_idx];
  const int n_absorbed = recorded < MAX_PER_PRIMARY_ABSORB_FRAGMENT_NUM
                             ? recorded
                             : MAX_PER_PRIMARY_ABSORB_FRAGMENT_NUM;

  int written = 0; // unit is point
  for (int f = 0; f < n_absorbed; ++f) {
    const int fragment = absorbed[f];
    const int first = cuda_fragment_offsets[fragment];
    const int last = cuda_fragment_offsets[fragment + 1];
    for (int p = first; p < last; ++p) {
      if (written < MAX_PER_PRIMARY_ABSORB_POINT_NUM) {
        out[written * 2 + 0] = primary_idx;
        out[written * 2 + 1] = cuda_fragment_idxs[p * 2 + 1];
        ++written;
      }
    }
  }
  cuda_concat_point_num[primary_idx] = written;
}
// Host driver for set aggregation: merges fragment clusters into their nearest
// primary cluster on the GPU and writes the merged clusters back to the host.
//
// All pointer arguments are HOST pointers.
//   fragment_total_point_num : number of rows in fragment_idxs
//   fragment_num             : number of fragment clusters
//   fragment_idxs            : (fragment_total_point_num, 2)
//   fragment_offsets         : (fragment_num + 1)
//   fragment_centers         : (fragment_num, 5) = x, y, z, cls_label, batch_idx
//   primary_total_point_num  : number of rows in primary_idxs
//   primary_num              : number of primary clusters (<= MAX_PRIMARY_NUM)
//   primary_idxs             : (primary_total_point_num, 2)
//   primary_offsets          : (primary_num + 1)
//   primary_centers          : (primary_num, 5)
//   primary_idxs_post        : output; for each primary its own rows followed
//                              by the rows of the fragments it absorbed
//   primary_offsets_post     : output, (primary_num + 1) running row totals
//
// Returns immediately, leaving the outputs untouched, when primary_num == 0.
// On a CUDA error the message is printed to stderr and the process exits with
// status -1.
void hierarchical_aggregation_cuda(
    int fragment_total_point_num, int fragment_num, int *fragment_idxs,
    int *fragment_offsets, float *fragment_centers, int primary_total_point_num,
    int primary_num, int *primary_idxs, int *primary_offsets,
    float *primary_centers, int *primary_idxs_post, int *primary_offsets_post) {
  if (primary_num == 0) {
    return;
  }
  assert(primary_num <= MAX_PRIMARY_NUM);

  // on devices, allocate and copy memory
  int *cuda_fragment_idxs;
  int *cuda_fragment_offsets;
  float *cuda_fragment_centers;
  cudaMalloc((void **)&cuda_fragment_idxs,
             fragment_total_point_num * 2 * sizeof(int) +
                 sizeof(int)); // prevent alloc 0 space
  cudaMalloc((void **)&cuda_fragment_offsets, (fragment_num + 1) * sizeof(int));
  cudaMalloc((void **)&cuda_fragment_centers,
             fragment_num * 5 * sizeof(float) +
                 sizeof(float)); // prevent alloc 0 space
  cudaMemcpy(cuda_fragment_idxs, fragment_idxs,
             fragment_total_point_num * 2 * sizeof(int),
             cudaMemcpyHostToDevice);
  cudaMemcpy(cuda_fragment_offsets, fragment_offsets,
             (fragment_num + 1) * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(cuda_fragment_centers, fragment_centers,
             fragment_num * 5 * sizeof(float), cudaMemcpyHostToDevice);

  int *cuda_primary_idxs;
  int *cuda_primary_offsets;
  float *cuda_primary_centers;
  cudaMalloc((void **)&cuda_primary_idxs,
             primary_total_point_num * 2 * sizeof(int) +
                 sizeof(int)); // prevent alloc 0 space
  cudaMalloc((void **)&cuda_primary_offsets, (primary_num + 1) * sizeof(int));
  cudaMalloc((void **)&cuda_primary_centers,
             primary_num * 5 * sizeof(float) +
                 sizeof(float)); // prevent alloc 0 space
  cudaMemcpy(cuda_primary_idxs, primary_idxs,
             primary_total_point_num * 2 * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(cuda_primary_offsets, primary_offsets,
             (primary_num + 1) * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(cuda_primary_centers, primary_centers,
             primary_num * 5 * sizeof(float), cudaMemcpyHostToDevice);
  cudaDeviceSynchronize();

  // for each fragment, find its primary
  int *cuda_primary_absorb_fragment_idx; // array for saving the fragment idxs
  int *cuda_primary_absorb_fragment_cnt; // array for saving the fragment nums
  const size_t absorb_cnt_bytes = primary_num * sizeof(int) + sizeof(int);
  cudaMalloc((void **)&cuda_primary_absorb_fragment_idx,
             primary_num * MAX_PER_PRIMARY_ABSORB_FRAGMENT_NUM * sizeof(int) +
                 sizeof(int));
  cudaMalloc((void **)&cuda_primary_absorb_fragment_cnt, absorb_cnt_bytes);
  // cudaMalloc does not clear memory. The counters are incremented with
  // atomicAdd and read back as loop bounds, so they must start at zero.
  cudaMemset(cuda_primary_absorb_fragment_cnt, 0, absorb_cnt_bytes);
  if (fragment_num != 0)
    fragment_find_primary_<<<int(DIVUP(fragment_num, MAX_THREADS_PER_BLOCK)),
                             (int)MAX_THREADS_PER_BLOCK>>>(
        primary_num, cuda_primary_offsets, cuda_primary_centers, fragment_num,
        cuda_fragment_offsets, cuda_fragment_centers,
        cuda_primary_absorb_fragment_idx, cuda_primary_absorb_fragment_cnt);
  cudaDeviceSynchronize();

  // concatenate fragments belonging to the same primary
  int *cuda_concat_idxs;
  int *cuda_concat_point_num;
  cudaMalloc((void **)&cuda_concat_idxs,
             primary_num * MAX_PER_PRIMARY_ABSORB_POINT_NUM * 2 * sizeof(int) +
                 sizeof(int));
  cudaMalloc((void **)&cuda_concat_point_num,
             primary_num * sizeof(int) + sizeof(int));
  concat_fragments_<<<primary_num, (int)1>>>(
      cuda_fragment_idxs, cuda_fragment_offsets, cuda_primary_idxs,
      cuda_primary_offsets, cuda_primary_absorb_fragment_idx,
      cuda_primary_absorb_fragment_cnt, cuda_concat_idxs, cuda_concat_point_num,
      primary_num);
  cudaDeviceSynchronize();

  // merge primary instances and fragments
  int *concat_point_num = new int[primary_num + 1]; // allocate on host
  cudaMemcpy(concat_point_num, cuda_concat_point_num, primary_num * sizeof(int),
             cudaMemcpyDeviceToHost);
  int _accu_offset = 0;
  primary_offsets_post[0] = 0; // do not rely on the caller having zeroed it
  for (int i = 0; i < primary_num; i++) {
    const int primary_points = primary_offsets[i + 1] - primary_offsets[i];
    // add primary instances
    cudaMemcpy(primary_idxs_post + _accu_offset * 2,
               cuda_primary_idxs + primary_offsets[i] * 2,
               primary_points * 2 * sizeof(int), cudaMemcpyDeviceToHost);
    _accu_offset += primary_points;
    // add absorbed fragments
    cudaMemcpy(primary_idxs_post + _accu_offset * 2,
               cuda_concat_idxs + i * MAX_PER_PRIMARY_ABSORB_POINT_NUM * 2,
               concat_point_num[i] * 2 * sizeof(int), cudaMemcpyDeviceToHost);
    _accu_offset += concat_point_num[i];
    // writing offsets
    primary_offsets_post[i + 1] = _accu_offset;
  }
  cudaDeviceSynchronize();

  // Capture the error state before cleanup so the frees cannot mask it.
  cudaError_t err = cudaGetLastError();

  // Release everything allocated above; the original leaked all of it on
  // every call.
  delete[] concat_point_num;
  cudaFree(cuda_fragment_idxs);
  cudaFree(cuda_fragment_offsets);
  cudaFree(cuda_fragment_centers);
  cudaFree(cuda_primary_idxs);
  cudaFree(cuda_primary_offsets);
  cudaFree(cuda_primary_centers);
  cudaFree(cuda_primary_absorb_fragment_idx);
  cudaFree(cuda_primary_absorb_fragment_cnt);
  cudaFree(cuda_concat_idxs);
  cudaFree(cuda_concat_point_num);

  if (cudaSuccess != err) {
    fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
    exit(-1);
  }
}

View File

@@ -1,31 +0,0 @@
/*
Hierarchical Aggregation Algorithm
*/
#ifndef HIERARCHICAL_AGGREGATION_H
#define HIERARCHICAL_AGGREGATION_H
#include <ATen/cuda/CUDAContext.h>
#include <THC/THC.h>
#include <torch/serialize/tensor.h>
#include "../datatype/datatype.h"
// Host entry point of hierarchical aggregation.
// Inputs: class_numpoint_mean, semantic_label, coord_shift, batch_idxs,
// ball_query_idxs and start_len tensors, plus N (number of points), class_id
// and the two mode flags.
// Outputs (passed in by the caller and filled by the function): the
// fragment_*, cluster_*_kept, primary_* and primary_*_post tensors.
// NOTE(review): tensor shapes and dtypes are not stated in this header --
// see the implementation file for the expected layout.
void hierarchical_aggregation(
at::Tensor class_numpoint_mean_tensor, at::Tensor semantic_label_tensor,
at::Tensor coord_shift_tensor, at::Tensor batch_idxs_tensor,
at::Tensor ball_query_idxs_tensor, at::Tensor start_len_tensor,
at::Tensor fragment_idxs_tensor, at::Tensor fragment_offsets_tensor,
at::Tensor fragment_centers_tensor, at::Tensor cluster_idxs_kept_tensor,
at::Tensor cluster_offsets_kept_tensor,
at::Tensor cluster_centers_kept_tensor, at::Tensor primary_idxs_tensor,
at::Tensor primary_offsets_tensor, at::Tensor primary_centers_tensor,
at::Tensor primary_idxs_post_tensor, at::Tensor primary_offsets_post_tensor,
const int N, const int training_mode_, const int using_set_aggr_,
const int class_id);
// CUDA-side set aggregation: takes the fragment and primary clusters in
// (idxs, offsets, centers) form and writes the merged clusters into
// primary_idxs_post / primary_offsets_post.
// NOTE(review): presumably all pointers are host pointers that the
// implementation copies to the device itself -- confirm against the .cu file.
void hierarchical_aggregation_cuda(
int fragment_total_point_num, int fragment_num, int *fragment_idxs,
int *fragment_offsets, float *fragment_centers, int primary_total_point_num,
int primary_num, int *primary_idxs, int *primary_offsets,
float *primary_centers, int *primary_idxs_post, int *primary_offsets_post);
#endif // HIERARCHICAL_AGGREGATION_H

View File

@@ -5,10 +5,6 @@
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("hierarchical_aggregation", &hierarchical_aggregation,
"hierarchical_aggregation");
// m.def("cal_iou_and_masklabel", &cal_iou_and_masklabel,
// "cal_iou_and_masklabel");
m.def("get_mask_iou_on_cluster", &get_mask_iou_on_cluster,
"get_mask_iou_on_cluster");
m.def("get_mask_iou_on_pred", &get_mask_iou_on_pred, "get_mask_iou_on_pred");

View File

@@ -2,12 +2,9 @@
#include <cuda_runtime.h>
#include <torch/extension.h>
#include "datatype/datatype.cpp"
#include "cal_iou_and_masklabel/cal_iou_and_masklabel.cpp"
#include "hierarchical_aggregation/hierarchical_aggregation.cpp"
#include "bfs_cluster/bfs_cluster.cpp"
#include "cal_iou_and_masklabel/cal_iou_and_masklabel.cpp"
#include "datatype/datatype.cpp"
#include "get_iou/get_iou.cpp"
#include "roipool/roipool.cpp"
#include "sec_mean/sec_mean.cpp"

View File

@@ -1,11 +1,8 @@
#ifndef HAIS_H
#define HAIS_H
#include "datatype/datatype.h"
#include "cal_iou_and_masklabel/cal_iou_and_masklabel.h"
#include "hierarchical_aggregation/hierarchical_aggregation.h"
#include "bfs_cluster/bfs_cluster.h"
#include "cal_iou_and_masklabel/cal_iou_and_masklabel.h"
#include "datatype/datatype.h"
#include "get_iou/get_iou.h"
#include "roipool/roipool.h"
#include "sec_mean/sec_mean.h"
@@ -37,4 +34,4 @@ void point_recover_bp_feat(/* cuda float N*C */ at::Tensor d_output_feats,
/* cuda Int M*(maxActive+1) */ at::Tensor idx_map,
Int nActive, Int maxActive, Int nPlane);
#endif // HAIS_H
#endif // HAIS_H

View File

@@ -134,6 +134,10 @@ class SoftGroup(nn.Module):
proposals_idx, proposals_offset = self.forward_grouping(semantic_scores, pt_offsets,
batch_idxs, coords_float,
self.grouping_cfg)
if proposals_offset.shape[0] > self.train_cfg.max_proposal_num:
proposals_offset = proposals_offset[:self.train_cfg.max_proposal_num + 1]
proposals_idx = proposals_idx[:proposals_offset[-1]]
assert proposals_idx.shape[0] == proposals_offset[-1]
instance_batch_idxs, cls_scores, iou_scores, mask_scores = self.forward_instance(
proposals_idx, proposals_offset, output_feats, coords_float)
instance_loss = self.instance_loss(cls_scores, mask_scores, iou_scores, proposals_idx,
@@ -303,32 +307,27 @@ class SoftGroup(nn.Module):
proposals_offset_list = []
batch_size = batch_idxs.max() + 1
semantic_scores = semantic_scores.softmax(dim=-1)
semantic_preds = semantic_scores.max(1)[1] # TODO remove this
radius = self.grouping_cfg.radius
mean_active = self.grouping_cfg.mean_active
npoint_thr = self.grouping_cfg.npoint_thr
class_numpoint_mean = torch.tensor(
self.grouping_cfg.class_numpoint_mean, dtype=torch.float32)
training_mode = None # TODO remove this
for class_id in range(self.semantic_classes):
# ignore "floor" and "wall"
if class_id < 2:
if class_id in self.grouping_cfg.ignore_classes:
continue
scores = semantic_scores[:, class_id].contiguous()
object_idxs = (scores > self.grouping_cfg.score_thr).nonzero().view(-1)
if object_idxs.size(0) < 100: # TODO
if object_idxs.size(0) < self.test_cfg.min_npoint:
continue
batch_idxs_ = batch_idxs[object_idxs]
batch_offsets_ = utils.get_batch_offsets(batch_idxs_, batch_size)
coords_ = coords_float[object_idxs]
pt_offsets_ = pt_offsets[object_idxs] # (N_fg, 3), float32
semantic_preds_cpu = semantic_preds[object_idxs].int().cpu()
pt_offsets_ = pt_offsets[object_idxs]
idx, start_len = softgroup_ops.ballquery_batch_p(coords_ + pt_offsets_, batch_idxs_,
batch_offsets_, radius, mean_active)
using_set_aggr = False # TODO refactor this
proposals_idx, proposals_offset = softgroup_ops.hierarchical_aggregation(
class_numpoint_mean, semantic_preds_cpu, (coords_ + pt_offsets_).cpu(), idx.cpu(),
start_len.cpu(), batch_idxs_.cpu(), training_mode, using_set_aggr, class_id)
proposals_idx, proposals_offset = softgroup_ops.bfs_cluster(
class_numpoint_mean, idx.cpu(), start_len.cpu(), npoint_thr, class_id)
proposals_idx[:, 1] = object_idxs[proposals_idx[:, 1].long()].int()
# merge proposals