From e97bc79c3ddcac03f7b1158d3ee25ea159cf5f76 Mon Sep 17 00:00:00 2001 From: hehefan Date: Wed, 25 Aug 2021 18:56:46 +0800 Subject: [PATCH] Initial commit --- README.md | 11 +- datasets/msr.py | 79 +++ datasets/synthia.py | 193 +++++++ models/point.py | 117 ++++ models/video.py | 49 ++ .../_ext_src/include/ball_query.h | 10 + .../_ext_src/include/cuda_utils.h | 46 ++ .../_ext_src/include/group_points.h | 10 + .../_ext_src/include/interpolate.h | 15 + .../_ext_src/include/sampling.h | 11 + .../_ext_src/include/utils.h | 30 + .../_ext_src/src/ball_query.cpp | 37 ++ .../_ext_src/src/ball_query_gpu.cu | 59 ++ .../_ext_src/src/bindings.cpp | 24 + .../_ext_src/src/group_points.cpp | 65 +++ .../_ext_src/src/group_points_gpu.cu | 80 +++ .../_ext_src/src/interpolate.cpp | 104 ++++ .../_ext_src/src/interpolate_gpu.cu | 159 ++++++ .../_ext_src/src/sampling.cpp | 91 +++ .../_ext_src/src/sampling_gpu.cu | 234 ++++++++ modules-pytorch-1.4.0/point_4d_convolution.py | 271 +++++++++ modules-pytorch-1.4.0/pointnet2_test.py | 33 ++ modules-pytorch-1.4.0/pointnet2_utils.py | 412 ++++++++++++++ modules-pytorch-1.4.0/setup.py | 31 ++ .../_ext_src/include/ball_query.h | 10 + .../_ext_src/include/cuda_utils.h | 46 ++ .../_ext_src/include/group_points.h | 10 + .../_ext_src/include/interpolate.h | 15 + .../_ext_src/include/sampling.h | 11 + .../_ext_src/include/utils.h | 30 + .../_ext_src/src/ball_query.cpp | 37 ++ .../_ext_src/src/ball_query_gpu.cu | 59 ++ .../_ext_src/src/bindings.cpp | 24 + .../_ext_src/src/group_points.cpp | 65 +++ .../_ext_src/src/group_points_gpu.cu | 80 +++ .../_ext_src/src/interpolate.cpp | 104 ++++ .../_ext_src/src/interpolate_gpu.cu | 159 ++++++ .../_ext_src/src/sampling.cpp | 91 +++ .../_ext_src/src/sampling_gpu.cu | 234 ++++++++ modules-pytorch-1.9.0/point_4d_convolution.py | 271 +++++++++ modules-pytorch-1.9.0/pointnet2_modules.py | 518 ++++++++++++++++++ modules-pytorch-1.9.0/pointnet2_test.py | 33 ++ modules-pytorch-1.9.0/pointnet2_utils.py | 425 ++++++++++++++ modules-pytorch-1.9.0/pytorch_utils.py | 298 ++++++++++ modules-pytorch-1.9.0/setup.py | 34 ++ scheduler.py | 47 ++ train-msr.py | 257 +++++++++ train-syn.py | 267 +++++++++ utils.py | 255 +++++++++ 49 files changed, 5550 insertions(+), 1 deletion(-) create mode 100644 datasets/msr.py create mode 100644 datasets/synthia.py create mode 100644 models/point.py create mode 100644 models/video.py create mode 100644 modules-pytorch-1.4.0/_ext_src/include/ball_query.h create mode 100644 modules-pytorch-1.4.0/_ext_src/include/cuda_utils.h create mode 100644 modules-pytorch-1.4.0/_ext_src/include/group_points.h create mode 100644 modules-pytorch-1.4.0/_ext_src/include/interpolate.h create mode 100644 modules-pytorch-1.4.0/_ext_src/include/sampling.h create mode 100644 modules-pytorch-1.4.0/_ext_src/include/utils.h create mode 100644 modules-pytorch-1.4.0/_ext_src/src/ball_query.cpp create mode 100644 modules-pytorch-1.4.0/_ext_src/src/ball_query_gpu.cu create mode 100644 modules-pytorch-1.4.0/_ext_src/src/bindings.cpp create mode 100644 modules-pytorch-1.4.0/_ext_src/src/group_points.cpp create mode 100644 modules-pytorch-1.4.0/_ext_src/src/group_points_gpu.cu create mode 100644 modules-pytorch-1.4.0/_ext_src/src/interpolate.cpp create mode 100644 modules-pytorch-1.4.0/_ext_src/src/interpolate_gpu.cu create mode 100644
modules-pytorch-1.4.0/_ext_src/src/sampling.cpp create mode 100644 modules-pytorch-1.4.0/_ext_src/src/sampling_gpu.cu create mode 100644 modules-pytorch-1.4.0/point_4d_convolution.py create mode 100644 modules-pytorch-1.4.0/pointnet2_test.py create mode 100644 modules-pytorch-1.4.0/pointnet2_utils.py create mode 100644 modules-pytorch-1.4.0/setup.py create mode 100644 modules-pytorch-1.9.0/_ext_src/include/ball_query.h create mode 100644 modules-pytorch-1.9.0/_ext_src/include/cuda_utils.h create mode 100644 modules-pytorch-1.9.0/_ext_src/include/group_points.h create mode 100644 modules-pytorch-1.9.0/_ext_src/include/interpolate.h create mode 100644 modules-pytorch-1.9.0/_ext_src/include/sampling.h create mode 100644 modules-pytorch-1.9.0/_ext_src/include/utils.h create mode 100644 modules-pytorch-1.9.0/_ext_src/src/ball_query.cpp create mode 100644 modules-pytorch-1.9.0/_ext_src/src/ball_query_gpu.cu create mode 100644 modules-pytorch-1.9.0/_ext_src/src/bindings.cpp create mode 100644 modules-pytorch-1.9.0/_ext_src/src/group_points.cpp create mode 100644 modules-pytorch-1.9.0/_ext_src/src/group_points_gpu.cu create mode 100644 modules-pytorch-1.9.0/_ext_src/src/interpolate.cpp create mode 100644 modules-pytorch-1.9.0/_ext_src/src/interpolate_gpu.cu create mode 100644 modules-pytorch-1.9.0/_ext_src/src/sampling.cpp create mode 100644 modules-pytorch-1.9.0/_ext_src/src/sampling_gpu.cu create mode 100644 modules-pytorch-1.9.0/point_4d_convolution.py create mode 100644 modules-pytorch-1.9.0/pointnet2_modules.py create mode 100644 modules-pytorch-1.9.0/pointnet2_test.py create mode 100644 modules-pytorch-1.9.0/pointnet2_utils.py create mode 100644 modules-pytorch-1.9.0/pytorch_utils.py create mode 100644 modules-pytorch-1.9.0/setup.py create mode 100644 scheduler.py create mode 100644 train-msr.py create mode 100644 train-syn.py create mode 100644 utils.py diff --git a/README.md b/README.md index 7fcde43..50e22e2 100644 --- a/README.md +++ b/README.md @@ -1 +1,10 @@ -# PST-Transformer \ No newline at end of file +# PST-Transformer + +The code is tested with Red Hat Enterprise Linux Workstation release 7.7 (Maipo), g++ (GCC) 8.3.1, PyTorch (both v1.4.0 and v1.9.0 are supported), CUDA 10.2 and cuDNN v7.6. 
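Before building, it may help to confirm the local toolchain matches the tested setup. A minimal check (not part of the original README; it assumes nothing beyond a working PyTorch install):
```
import torch
print(torch.__version__)               # expect 1.4.0 or 1.9.0
print(torch.version.cuda)              # expect 10.2
print(torch.backends.cudnn.version())  # expect 76xx for cuDNN v7.6
print(torch.cuda.is_available())       # a CUDA device is required to build and run the extension
```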
+ +Compile the CUDA layers for [PointNet++](http://arxiv.org/abs/1706.02413), which we used for furthest point sampling (FPS) and radius neighbourhood search: +``` +mv modules-pytorch-1.4.0 modules # or: mv modules-pytorch-1.9.0 modules, to match your PyTorch version +cd modules +python setup.py install +```
diff --git a/datasets/msr.py b/datasets/msr.py new file mode 100644 index 0000000..153af8b --- /dev/null +++ b/datasets/msr.py @@ -0,0 +1,79 @@ +import os +import sys +import numpy as np +from torch.utils.data import Dataset + +class MSRAction3D(Dataset): + def __init__(self, root, frames_per_clip=16, step_between_clips=1, num_points=2048, train=True): + super(MSRAction3D, self).__init__() + + self.videos = [] + self.labels = [] + self.index_map = [] + index = 0 + for video_name in os.listdir(root): + if train and (int(video_name.split('_')[1].split('s')[1]) <= 5): + video = np.load(os.path.join(root, video_name), allow_pickle=True)['point_clouds'] + self.videos.append(video) + label = int(video_name.split('_')[0][1:])-1 + self.labels.append(label) + + nframes = video.shape[0] + for t in range(0, nframes-step_between_clips*(frames_per_clip-1), step_between_clips): + self.index_map.append((index, t)) + index += 1 + + if not train and (int(video_name.split('_')[1].split('s')[1]) > 5): + video = np.load(os.path.join(root, video_name), allow_pickle=True)['point_clouds'] + self.videos.append(video) + label = int(video_name.split('_')[0][1:])-1 + self.labels.append(label) + + nframes = video.shape[0] + for t in range(0, nframes-step_between_clips*(frames_per_clip-1), step_between_clips): + self.index_map.append((index, t)) + index += 1 + + self.frames_per_clip = frames_per_clip + self.step_between_clips = step_between_clips + self.num_points = num_points + self.train = train + self.num_classes = max(self.labels) + 1 + + + def __len__(self): + return len(self.index_map) + + def __getitem__(self, idx): + index, t = self.index_map[idx] + + video = self.videos[index] + label = self.labels[index] + + clip = [video[t+i*self.step_between_clips] for i in range(self.frames_per_clip)] + for i, p in enumerate(clip): + if p.shape[0] > self.num_points: + r = np.random.choice(p.shape[0], size=self.num_points, replace=False) + else: + repeat, residue = self.num_points // p.shape[0], self.num_points % p.shape[0] + r = np.random.choice(p.shape[0], size=residue, replace=False) + r = np.concatenate([np.arange(p.shape[0]) for _ in range(repeat)] + [r], axis=0) + clip[i] = p[r, :] + clip = np.array(clip) + + if self.train: + # scale the points + scales = np.random.uniform(0.9, 1.1, size=3) + clip = clip * scales + + clip = clip / 300 + + return clip.astype(np.float32), label, index + +if __name__ == '__main__': + dataset = MSRAction3D(root='../data/msr_action', frames_per_clip=16) + clip, label, video_idx = dataset[0] + print(clip) + print(label) + print(video_idx) + print(dataset.num_classes)
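For orientation, a minimal sketch of how this dataset is typically consumed; the `root` path and loader settings below are illustrative rather than taken from this commit. Each item is one clip, so batches come out as [B, L, N, 3]:
```
import torch
from torch.utils.data import DataLoader
from datasets.msr import MSRAction3D

# hypothetical data location; preparing the .npz point-cloud videos is outside this patch
dataset = MSRAction3D(root='data/msr_action', frames_per_clip=16, num_points=2048, train=True)
loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)

clip, label, video_idx = next(iter(loader))
print(clip.shape)   # torch.Size([8, 16, 2048, 3]) -> [B, L, N, 3]
print(label.shape)  # torch.Size([8])
```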
diff --git a/datasets/synthia.py b/datasets/synthia.py new file mode 100644 index 0000000..8ba6e9d --- /dev/null +++ b/datasets/synthia.py @@ -0,0 +1,193 @@ +import os +import sys +import numpy as np +from pyquaternion import Quaternion +from torch.utils.data import Dataset + +index_to_label = np.array([12, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, 11], dtype='int32') +label_to_index = np.array([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 0], dtype='int32') +index_to_class = ['Void', 'Sky', 'Building', 'Road', 'Sidewalk', 'Fence', 'Vegetation', 'Pole', 'Car', 'Traffic Sign', 'Pedestrian', 'Bicycle', 'Lanemarking', 'Reserved', 'Reserved', 'Traffic Light'] + +def index_to_label_func(x): + return index_to_label[x] +index_to_label_vec_func = np.vectorize(index_to_label_func) + +class SegDataset(Dataset): + def __init__(self, root='data/pc', meta='data/train_raw.txt', labelweight='data/labelweights.npz', frames_per_clip=3, num_points=16384, train=True): + super(SegDataset, self).__init__() + + self.num_points = num_points + self.train = train + self.root = root + self.frames_per_clip = frames_per_clip + + labelweights = np.load(labelweight)['labelweights'].astype(np.float32) + if train: + labelweights = 1/np.log(1.2 + labelweights) + self.labelweights = labelweights / labelweights.min() + else: + self.labelweights = np.ones_like(labelweights) + + self.meta = [] + self.data = {} + with open(meta, 'r') as f: + for line in f: + line = line.split(' ')[0] + line = line.split('/') + sequence_name = line[0] + frame_id = int(line[-1].split('.')[0]) + + fn = os.path.join(root, sequence_name + '-' + str(frame_id).zfill(6) + '.npz') + data = np.load(fn) + + pc = data['pc'] # (16384, 3) + rgb = data['rgb'] # (16384, 3) + semantic = data['semantic'] # (16384, ) + center = data['center'] # (3, ) + semantic = semantic.astype('uint8') + + self.data[sequence_name + '-' + str(frame_id)] = (pc, rgb, semantic, center) + self.meta.append([sequence_name, frame_id]) + self.meta.sort() + + def __len__(self): + return len(self.meta) + + def read_training_data_point(self, index): + sequence_name, frame_id = self.meta[index] + + pcs = [] + rgbs = [] + semantics = [] + center_0 = None + + most_recent_success = -1 + for diff in range(0, self.frames_per_clip): + key = sequence_name + '-' + str(frame_id-diff) + if key in self.data: + pc, rgb, semantic, center = self.data[key] + most_recent_success = frame_id - diff + else: + pc, rgb, semantic, center = self.data[sequence_name + '-' + str(most_recent_success)] + + if diff == 0: + center_0 = center + + pcs.append(pc) + rgbs.append(rgb) + semantics.append(semantic) + + pc = np.stack(pcs, axis=0) + rgb = np.stack(rgbs, axis=0) + semantic = np.stack(semantics, axis=0) + + return pc, rgb, semantic, center_0 + + + def half_crop_w_context(self, half, context, pc, rgb, semantic, center): + frames_per_clip = pc.shape[0] + all_idx = np.arange(pc.shape[1]) + sample_indicies_half_w_context = [] + if half == 0: + for f in range(frames_per_clip): + sample_idx_half_w_context = all_idx[pc[f, :, 2] > (center[2] - context)] + sample_indicies_half_w_context.append(sample_idx_half_w_context) + else: + for f in range(frames_per_clip): + sample_idx_half_w_context = all_idx[pc[f, :, 2] < (center[2] + context)] + sample_indicies_half_w_context.append(sample_idx_half_w_context) + + pc_half_w_context = [pc[f, s] for f, s in enumerate(sample_indicies_half_w_context)] + rgb_half_w_context = [rgb[f, s] for f, s in enumerate(sample_indicies_half_w_context)] + semantic_half_w_context = [semantic[f, s] for f, s in enumerate(sample_indicies_half_w_context)] + if half == 0: + loss_masks = [p[:, 2] > center[2] for p in pc_half_w_context] + else: + loss_masks = [p[:, 2] < center[2] for p in pc_half_w_context] + valid_pred_idx_in_full = sample_indicies_half_w_context + + return pc_half_w_context, rgb_half_w_context, semantic_half_w_context, loss_masks, valid_pred_idx_in_full + + def augment(self, pc, center): + flip = np.random.uniform(0, 1) > 0.5 + if flip: + pc = (pc - center) + pc[:, :, 0] *= -1 + pc += center + + scale = np.random.uniform(0.8, 1.2) + pc = (pc - center) * scale + center + + rot_axis = np.array([0, 1, 0]) +
rot_angle = np.random.uniform(0, np.pi * 2) + q = Quaternion(axis=rot_axis, angle=rot_angle) + R = q.rotation_matrix + + pc = np.dot(pc - center, R) + center + return pc + + def mask_and_label_conversion(self, semantic, loss_mask): + labels = [] + loss_masks = [] + for i, s in enumerate(semantic): + sem = s.astype('int32') + label = index_to_label_vec_func(sem) + loss_mask_ = (label != 12) * loss_mask[i] + label[label == 12] = 0 + + labels.append(label) + loss_masks.append(loss_mask_) + return labels, loss_masks + + def choice_to_num_points(self, pc, rgb, label, loss_mask, valid_pred_idx_in_full): + + # shuffle idx to change point order (change FPS behavior) + for f in range(self.frames_per_clip): + idx = np.arange(pc[f].shape[0]) + choice_num = self.num_points + if pc[f].shape[0] > choice_num: + shuffle_idx = np.random.choice(idx, choice_num, replace=False) + else: + shuffle_idx = np.concatenate([np.random.choice(idx, choice_num - idx.shape[0]), np.arange(idx.shape[0])]) + pc[f] = pc[f][shuffle_idx] + rgb[f] = rgb[f][shuffle_idx] + label[f] = label[f][shuffle_idx] + loss_mask[f] = loss_mask[f][shuffle_idx] + valid_pred_idx_in_full[f] = valid_pred_idx_in_full[f][shuffle_idx] + + pc = np.stack(pc, axis=0) + rgb = np.stack(rgb, axis=0) + label = np.stack(label, axis=0) + loss_mask = np.stack(loss_mask, axis=0) + valid_pred_idx_in_full = np.stack(valid_pred_idx_in_full, axis=0) + + return pc, rgb, label, loss_mask, valid_pred_idx_in_full + + def __getitem__(self, index): + context = 1. + + pc, rgb, semantic, center = self.read_training_data_point(index) + + half = 0 + pc1, rgb1, semantic1, mask1, valid_pred_idx_in_full1 = self.half_crop_w_context(half, context, pc, rgb, semantic, center) + label1, mask1 = self.mask_and_label_conversion(semantic1, mask1) + pc1, rgb1, label1, mask1, valid_pred_idx_in_full1 = self.choice_to_num_points(pc1, rgb1, label1, mask1, valid_pred_idx_in_full1) + + half = 1 + pc2, rgb2, semantic2, mask2, valid_pred_idx_in_full2 = self.half_crop_w_context(half, context, pc, rgb, semantic, center) + label2, mask2 = self.mask_and_label_conversion(semantic2, mask2) + pc2, rgb2, label2, mask2, valid_pred_idx_in_full2 = self.choice_to_num_points(pc2, rgb2, label2, mask2, valid_pred_idx_in_full2) + + if self.train: + pc1 = self.augment(pc1, center) + pc2 = self.augment(pc2, center) + + rgb1 = np.swapaxes(rgb1, 1, 2) + rgb2 = np.swapaxes(rgb2, 1, 2) + + return pc1.astype(np.float32), rgb1.astype(np.float32), label1.astype(np.int64), mask1.astype(np.float32), pc2.astype(np.float32), rgb2.astype(np.float32), label2.astype(np.int64), mask2.astype(np.float32) +
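The y-axis rotation in `augment` relies on pyquaternion's axis-angle constructor. A small check (values arbitrary, not from the repository) that the resulting `rotation_matrix` is the familiar rotation about the up axis, applied the same way `augment` applies it:
```
import numpy as np
from pyquaternion import Quaternion

theta = 0.3  # arbitrary angle; augment() draws it uniformly from [0, 2*pi)
R = Quaternion(axis=[0, 1, 0], angle=theta).rotation_matrix

# closed form for a right-handed rotation about +y
R_ref = np.array([[ np.cos(theta), 0., np.sin(theta)],
                  [ 0.,            1., 0.           ],
                  [-np.sin(theta), 0., np.cos(theta)]])
assert np.allclose(R, R_ref)

# augment() shifts points to the crop centre, rotates, and shifts back
pc = np.random.rand(3, 16384, 3).astype(np.float32)   # (T, N, 3) clip
center = pc.reshape(-1, 3).mean(axis=0)
pc_rot = np.dot(pc - center, R) + center
```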
zkD2>jESSBs3LevX;D_v?<||^jkS~nP*5g*K){2^Ch*4n2E?VK`&GuuTEk&z=&J}!u zt)wdro89auv>mq*;ed$jdQhWLO3>^c$Wy_lgDIM-n5@`j-)_!B=Q72_&q4iSzPRA( z~%F>9m2!fdVQs1eXm$C)rdW4!O+ttw|& zmg~8(W$DdboxOIol2{46<0qSAxnQhqZ;<6uH+#cu)UC0a9YuKG!9mtXk}vGgK?6Eo z#rl1xQKQHpmC&zif$w^;mFxt*)tYJiIXDVMn#<0bSnF<xc5%O5Tx`x!klWqEwEE zJ7_KJIBw|Kb*kTb%ck3zsT>h^8HGe8dz9%=)h(;y)~IEnQNjn{N8%1W*Tg-<4kk4_ z(!78xO1y6%bpbLC`5h1UF0Q7;i;A*fJe*8;d6kG?6z>oh4 zya6g82Xyf3VnW^mTR{N&z;E#3KLGE8O<)N000SI{FMkZY3Elvszy`|z^Yed!Uq1zo zfs;?h2%L`o{tz@b%d#8fZjHMr?eBIPrs z$~QMY#KNj}K;aPwjkqkbaoVxevVLn`+mP1=@~x(gBxBed$>l^Z|I76TZMbRDL!e~c z2oZHf!x%^62&9fPoz_Qb5@5)+$K6U36_8+swARptrBq>>8|zXi=VzwJ>rB);T@jqTaBvuQ)57OB!$`e|pL-@G^P_uiXqk{NW~ zzO_a#wH67U7YMmRUST_DFQ5G4G9lyfhzTV>;@M32(3gHB&aW+9>U18c{ZVePy%G(x zkc(8dZ~Ez5*`U3-wS9g&VM30gB;-juj65F5_P+1&s1;8hXwSSb3>XHE!@vT$cwsSb z(8kN>=!<7}jXf_#h^PL^@xe+R($L^;== z<GHFz^I2z&t`GPZ9F&)7U)z|F3`l|MPi5 zegS?2z6QPmJ_qgt?*nfDF9E-wA>=pUC*WJ)8{iY*9Pr%&AsfIS&%qY(GtdYAdX|t7 zI0GyIKb$7yd*D-G4_F0W2TlS1JVVIezyNp?xPOX}&wwE?06M_0*!!2j9uNcX0VGn|7C!dPr}nI5j4uuI7pOaO4T3(MGVAeuf5@@JJU6Xfp1Os|Dl+Nu5yj=k4_V?_iZG{8w3i?Gejjdd z8qK+y{Yv=v{rq=Sz@WOL{%Z!EOdtflM)$qNYx3HZ(d*BvF634jeLD+=ny3~ z;mJDgeOl0~ysmWxh=rRizL(+JsLujxw)2DK2i3||Qe_k^!sB%8P}N_kOo;MUbLcd> zfQ#L9T&cWjfc0^Nie>1t^+Ay7CTB z1P@&4-{alGgcT<;O>k3ord7-<3Ng!DrQ1V`b^7)6X~*?X9anW1y@Yvw1_Fg-VH|LY zZqRz=7V6WPqasPQzD1rkmsYMtVjmss#sL#tc5w=*oRqU3G6f$>GGOU&>jSRW6?3U~L!cSed?8RE7(-No_`0=~4p^x{PaW#2p0Ty-V%q^N1 zG-&0BC3ElpQ-vJur=fGQJT2kempZt<8 literal 0 HcmV?d00001 diff --git a/models/point.py b/models/point.py new file mode 100644 index 0000000..0c4fb1a --- /dev/null +++ b/models/point.py @@ -0,0 +1,117 @@ +import torch +import torch.nn.functional as F +from torch import nn +import numpy as np +import sys +import os + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +ROOT_DIR = os.path.dirname(BASE_DIR) +sys.path.append(ROOT_DIR) +sys.path.append(os.path.join(ROOT_DIR, 'modules')) + +import pointnet2_utils +from point_4d_convolution import * +from transformer import * + +class PSTTransformer(nn.Module): + def __init__(self, radius=0.9, nsamples=3*3, num_classes=12): + super(P4Transformer, self).__init__() + + self.conv1 = P4DConv(in_planes=3, + mlp_planes=[32,64,128], + mlp_batch_norm=[True, True, True], + mlp_activation=[True, True, True], + spatial_kernel_size=[radius, nsamples], + temporal_kernel_size=1, + spatial_stride=4, + temporal_stride=1, + temporal_padding=[0,0]) + + self.conv2 = P4DConv(in_planes=128, + mlp_planes=[128, 128, 256], + mlp_batch_norm=[True, True, True], + mlp_activation=[True, True, True], + spatial_kernel_size=[2*radius, nsamples], + temporal_kernel_size=1, + spatial_stride=4, + temporal_stride=1, + temporal_padding=[0,0]) + + self.conv3 = P4DConv(in_planes=256, + mlp_planes=[256,256,512], + mlp_batch_norm=[True, True, True], + mlp_activation=[True, True, True], + spatial_kernel_size=[2*2*radius, nsamples], + temporal_kernel_size=3, + spatial_stride=4, + temporal_stride=1, + temporal_padding=[1,1]) + + self.conv4 = P4DConv(in_planes=512, + mlp_planes=[512,512,1024], + mlp_batch_norm=[True, True, True], + mlp_activation=[True, True, True], + spatial_kernel_size=[2*2*2*radius, nsamples], + temporal_kernel_size=1, + spatial_stride=2, + temporal_stride=1, + temporal_padding=[0,0]) + + self.emb_relu = nn.ReLU() + self.transformer = Transformer(dim=1024, depth=2, heads=4, dim_head=256, mlp_dim=1024, length=3) + + self.deconv4 = P4DTransConv(in_planes=1024, + mlp_planes=[256, 256], + mlp_batch_norm=[True, True, True], + mlp_activation=[True, True, True], + 
original_planes=512) + + self.deconv3 = P4DTransConv(in_planes=256, + mlp_planes=[256, 256], + mlp_batch_norm=[True, True, True], + mlp_activation=[True, True, True], + original_planes=256) + + self.deconv2 = P4DTransConv(in_planes=256, + mlp_planes=[128, 128], + mlp_batch_norm=[True, True, True], + mlp_activation=[True, True, True], + original_planes=128) + + self.deconv1 = P4DTransConv(in_planes=128, + mlp_planes=[128, 128], + mlp_batch_norm=[True, True, True], + mlp_activation=[True, True, True], + original_planes=3) + + self.outconv = nn.Conv2d(in_channels=128, out_channels=num_classes, kernel_size=1, stride=1, padding=0) + + def forward(self, xyzs, rgbs): + + new_xyzs1, new_features1 = self.conv1(xyzs, rgbs) + + new_xyzs2, new_features2 = self.conv2(new_xyzs1, new_features1) + + new_xyzs3, new_features3 = self.conv3(new_xyzs2, new_features2) + + new_xyzs4, new_features4 = self.conv4(new_xyzs3, new_features3) + + features = new_features4.permute(0, 1, 3, 2) # [B, L, n2, C] + embedding = self.emb_relu(features) + features = self.transformer(new_xyzs4, embedding) + features = features.permute(0, 1, 3, 2) + + new_features4 = features + new_xyzsd4, new_featuresd4 = self.deconv4(new_xyzs4, new_xyzs3, new_features4, new_features3) + + new_xyzsd3, new_featuresd3 = self.deconv3(new_xyzsd4, new_xyzs2, new_featuresd4, new_features2) + + new_xyzsd2, new_featuresd2 = self.deconv2(new_xyzsd3, new_xyzs1, new_featuresd3, new_features1) + + new_xyzsd1, new_featuresd1 = self.deconv1(new_xyzsd2, xyzs, new_featuresd2, rgbs) + + out = self.outconv(new_featuresd1.transpose(1,2)).transpose(1,2) + + return out +
diff --git a/models/video.py b/models/video.py new file mode 100644 index 0000000..849f407 --- /dev/null +++ b/models/video.py @@ -0,0 +1,49 @@ +import torch +import torch.nn.functional as F +from torch import nn +import numpy as np +import sys +import os + +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +ROOT_DIR = os.path.dirname(BASE_DIR) +sys.path.append(ROOT_DIR) +sys.path.append(os.path.join(ROOT_DIR, 'modules')) + +from point_4d_convolution import * +from transformer import * + +class PSTTransformer(nn.Module): + def __init__(self, radius, nsamples, spatial_stride, # P4DConv: spatial + temporal_kernel_size, temporal_stride, # P4DConv: temporal + dim, depth, heads, dim_head, length, # transformer + mlp_dim, num_classes): # output + super().__init__() + + self.tube_embedding = P4DConv(in_planes=0, mlp_planes=[dim], mlp_batch_norm=[False], mlp_activation=[False], + spatial_kernel_size=[radius, nsamples], spatial_stride=spatial_stride, + temporal_kernel_size=temporal_kernel_size, temporal_stride=temporal_stride, temporal_padding=[1, 0], + operator='+', spatial_pooling='max', temporal_pooling='max') + + self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, length) + + self.mlp_head = nn.Sequential( + nn.LayerNorm(dim), + nn.Linear(dim, mlp_dim), + nn.GELU(), + nn.Linear(mlp_dim, num_classes), + ) + + def forward(self, input): # [B, L, N, 3] + device = input.get_device() + xyzs, features = self.tube_embedding(input) # [B, L, n, 3], [B, L, C, n] + + features = features.permute(0, 1, 3, 2) + + output = self.transformer(xyzs, features) + output = torch.max(input=output, dim=1, keepdim=False, out=None)[0] + output = torch.max(input=output, dim=1, keepdim=False, out=None)[0] + output = self.mlp_head(output) + + return output +
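A sketch of constructing the classification variant above. The hyper-parameter values are placeholders for illustration (the actual configuration lives in train-msr.py, not shown in this excerpt); the input is a point-cloud video of shape [B, L, N, 3], and the compiled CUDA extension must be installed:
```
import torch
from models.video import PSTTransformer

# illustrative hyper-parameters only; see train-msr.py for the real configuration
model = PSTTransformer(radius=0.7, nsamples=32, spatial_stride=32,
                       temporal_kernel_size=3, temporal_stride=2,
                       dim=1024, depth=5, heads=8, dim_head=128,
                       length=8, mlp_dim=2048, num_classes=20).cuda()

clips = torch.rand(2, 16, 2048, 3).cuda()  # [B, L, N, 3]; (16+1-3) % 2 == 0 satisfies the P4DConv length assert
logits = model(clips)                      # [B, num_classes]
```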
diff --git a/modules-pytorch-1.4.0/_ext_src/include/ball_query.h b/modules-pytorch-1.4.0/_ext_src/include/ball_query.h new file mode 100644 index 0000000..4a65b5a --- /dev/null +++ b/modules-pytorch-1.4.0/_ext_src/include/ball_query.h @@ -0,0 +1,10 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once +#include <torch/extension.h> + +at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius, + const int nsample);
diff --git a/modules-pytorch-1.4.0/_ext_src/include/cuda_utils.h b/modules-pytorch-1.4.0/_ext_src/include/cuda_utils.h new file mode 100644 index 0000000..d4c4bb4 --- /dev/null +++ b/modules-pytorch-1.4.0/_ext_src/include/cuda_utils.h @@ -0,0 +1,46 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#ifndef _CUDA_UTILS_H +#define _CUDA_UTILS_H + +#include <ATen/ATen.h> +#include <ATen/cuda/CUDAContext.h> +#include <cmath> + +#include <cuda.h> +#include <cuda_runtime.h> + +#include <vector> + +#define TOTAL_THREADS 512 + +inline int opt_n_threads(int work_size) { + const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0); + + return max(min(1 << pow_2, TOTAL_THREADS), 1); +} + +inline dim3 opt_block_config(int x, int y) { + const int x_threads = opt_n_threads(x); + const int y_threads = + max(min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1); + dim3 block_config(x_threads, y_threads, 1); + + return block_config; +} + +#define CUDA_CHECK_ERRORS() \ + do { \ + cudaError_t err = cudaGetLastError(); \ + if (cudaSuccess != err) { \ + fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n", \ + cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \ + __FILE__); \ + exit(-1); \ + } \ + } while (0) + +#endif
diff --git a/modules-pytorch-1.4.0/_ext_src/include/group_points.h b/modules-pytorch-1.4.0/_ext_src/include/group_points.h new file mode 100644 index 0000000..24e7cc7 --- /dev/null +++ b/modules-pytorch-1.4.0/_ext_src/include/group_points.h @@ -0,0 +1,10 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once +#include <torch/extension.h> + +at::Tensor group_points(at::Tensor points, at::Tensor idx); +at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n);
diff --git a/modules-pytorch-1.4.0/_ext_src/include/interpolate.h b/modules-pytorch-1.4.0/_ext_src/include/interpolate.h new file mode 100644 index 0000000..2af34c6 --- /dev/null +++ b/modules-pytorch-1.4.0/_ext_src/include/interpolate.h @@ -0,0 +1,15 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once + +#include <torch/extension.h> +#include <vector> + +std::vector<at::Tensor> three_nn(at::Tensor unknowns, at::Tensor knows); +at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, + at::Tensor weight); +at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, + at::Tensor weight, const int m);
diff --git a/modules-pytorch-1.4.0/_ext_src/include/sampling.h b/modules-pytorch-1.4.0/_ext_src/include/sampling.h new file mode 100644 index 0000000..366ef31 --- /dev/null +++ b/modules-pytorch-1.4.0/_ext_src/include/sampling.h @@ -0,0 +1,11 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once +#include <torch/extension.h> + +at::Tensor gather_points(at::Tensor points, at::Tensor idx); +at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx, const int n); +at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples);
diff --git a/modules-pytorch-1.4.0/_ext_src/include/utils.h b/modules-pytorch-1.4.0/_ext_src/include/utils.h new file mode 100644 index 0000000..3e7d8c3 --- /dev/null +++ b/modules-pytorch-1.4.0/_ext_src/include/utils.h @@ -0,0 +1,30 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once +#include <ATen/cuda/CUDAContext.h> +#include <torch/extension.h> + +#define CHECK_CUDA(x) \ + do { \ + AT_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor"); \ + } while (0) + +#define CHECK_CONTIGUOUS(x) \ + do { \ + AT_CHECK(x.is_contiguous(), #x " must be a contiguous tensor"); \ + } while (0) + +#define CHECK_IS_INT(x) \ + do { \ + AT_CHECK(x.scalar_type() == at::ScalarType::Int, \ + #x " must be an int tensor"); \ + } while (0) + +#define CHECK_IS_FLOAT(x) \ + do { \ + AT_CHECK(x.scalar_type() == at::ScalarType::Float, \ + #x " must be a float tensor"); \ + } while (0)
diff --git a/modules-pytorch-1.4.0/_ext_src/src/ball_query.cpp b/modules-pytorch-1.4.0/_ext_src/src/ball_query.cpp new file mode 100644 index 0000000..a79dae0 --- /dev/null +++ b/modules-pytorch-1.4.0/_ext_src/src/ball_query.cpp @@ -0,0 +1,37 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#include "ball_query.h" +#include "utils.h" + +void query_ball_point_kernel_wrapper(int b, int n, int m, float radius, + int nsample, const float *new_xyz, + const float *xyz, int *idx); + +at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius, + const int nsample) { + CHECK_CONTIGUOUS(new_xyz); + CHECK_CONTIGUOUS(xyz); + CHECK_IS_FLOAT(new_xyz); + CHECK_IS_FLOAT(xyz); + + if (new_xyz.type().is_cuda()) { + CHECK_CUDA(xyz); + } + + at::Tensor idx = + torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample}, + at::device(new_xyz.device()).dtype(at::ScalarType::Int)); + + if (new_xyz.type().is_cuda()) { + query_ball_point_kernel_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1), + radius, nsample, new_xyz.data<float>(), + xyz.data<float>(), idx.data<int>()); + } else { + AT_CHECK(false, "CPU not supported"); + } + + return idx; +}
diff --git a/modules-pytorch-1.4.0/_ext_src/src/ball_query_gpu.cu b/modules-pytorch-1.4.0/_ext_src/src/ball_query_gpu.cu new file mode 100644 index 0000000..cfc2eeb --- /dev/null +++ b/modules-pytorch-1.4.0/_ext_src/src/ball_query_gpu.cu @@ -0,0 +1,59 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "cuda_utils.h" + +// input: new_xyz(b, m, 3) xyz(b, n, 3) +// output: idx(b, m, nsample) +__global__ void query_ball_point_kernel(int b, int n, int m, float radius, + int nsample, + const float *__restrict__ new_xyz, + const float *__restrict__ xyz, + int *__restrict__ idx) { + int batch_index = blockIdx.x; + xyz += batch_index * n * 3; + new_xyz += batch_index * m * 3; + idx += m * nsample * batch_index; + + int index = threadIdx.x; + int stride = blockDim.x; + + float radius2 = radius * radius; + for (int j = index; j < m; j += stride) { + float new_x = new_xyz[j * 3 + 0]; + float new_y = new_xyz[j * 3 + 1]; + float new_z = new_xyz[j * 3 + 2]; + for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) { + float x = xyz[k * 3 + 0]; + float y = xyz[k * 3 + 1]; + float z = xyz[k * 3 + 2]; + float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 < radius2) { + if (cnt == 0) { + for (int l = 0; l < nsample; ++l) { + idx[j * nsample + l] = k; + } + } + idx[j * nsample + cnt] = k; + ++cnt; + } + } + } +} + +void query_ball_point_kernel_wrapper(int b, int n, int m, float radius, + int nsample, const float *new_xyz, + const float *xyz, int *idx) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + query_ball_point_kernel<<<b, opt_n_threads(m), 0, stream>>>( + b, n, m, radius, nsample, new_xyz, xyz, idx); + + CUDA_CHECK_ERRORS(); +}
diff --git a/modules-pytorch-1.4.0/_ext_src/src/bindings.cpp b/modules-pytorch-1.4.0/_ext_src/src/bindings.cpp new file mode 100644 index 0000000..9783d87 --- /dev/null +++ b/modules-pytorch-1.4.0/_ext_src/src/bindings.cpp @@ -0,0 +1,24 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#include "ball_query.h" +#include "group_points.h" +#include "interpolate.h" +#include "sampling.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("gather_points", &gather_points); + m.def("gather_points_grad", &gather_points_grad); + m.def("furthest_point_sampling", &furthest_point_sampling); + + m.def("three_nn", &three_nn); + m.def("three_interpolate", &three_interpolate); + m.def("three_interpolate_grad", &three_interpolate_grad); + + m.def("ball_query", &ball_query); + + m.def("group_points", &group_points); + m.def("group_points_grad", &group_points_grad); +}
diff --git a/modules-pytorch-1.4.0/_ext_src/src/group_points.cpp b/modules-pytorch-1.4.0/_ext_src/src/group_points.cpp new file mode 100644 index 0000000..3efb32d --- /dev/null +++ b/modules-pytorch-1.4.0/_ext_src/src/group_points.cpp @@ -0,0 +1,65 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#include "group_points.h" +#include "utils.h" + +void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample, + const float *points, const int *idx, + float *out); + +void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints, + int nsample, const float *grad_out, + const int *idx, float *grad_points); + +at::Tensor group_points(at::Tensor points, at::Tensor idx) { + CHECK_CONTIGUOUS(points); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(points); + CHECK_IS_INT(idx); + + if (points.type().is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({points.size(0), points.size(1), idx.size(1), idx.size(2)}, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.type().is_cuda()) { + group_points_kernel_wrapper(points.size(0), points.size(1), points.size(2), + idx.size(1), idx.size(2), points.data<float>(), + idx.data<int>(), output.data<float>()); + } else { + AT_CHECK(false, "CPU not supported"); + } + + return output; +} + +at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n) { + CHECK_CONTIGUOUS(grad_out); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(grad_out); + CHECK_IS_INT(idx); + + if (grad_out.type().is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({grad_out.size(0), grad_out.size(1), n}, + at::device(grad_out.device()).dtype(at::ScalarType::Float)); + + if (grad_out.type().is_cuda()) { + group_points_grad_kernel_wrapper( + grad_out.size(0), grad_out.size(1), n, idx.size(1), idx.size(2), + grad_out.data<float>(), idx.data<int>(), output.data<float>()); + } else { + AT_CHECK(false, "CPU not supported"); + } + + return output; +}
diff --git a/modules-pytorch-1.4.0/_ext_src/src/group_points_gpu.cu b/modules-pytorch-1.4.0/_ext_src/src/group_points_gpu.cu new file mode 100644 index 0000000..98a3be1 --- /dev/null +++ b/modules-pytorch-1.4.0/_ext_src/src/group_points_gpu.cu @@ -0,0 +1,80 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#include <stdio.h> +#include <stdlib.h> + +#include "cuda_utils.h" + +// input: points(b, c, n) idx(b, npoints, nsample) +// output: out(b, c, npoints, nsample) +__global__ void group_points_kernel(int b, int c, int n, int npoints, + int nsample, + const float *__restrict__ points, + const int *__restrict__ idx, + float *__restrict__ out) { + int batch_index = blockIdx.x; + points += batch_index * n * c; + idx += batch_index * npoints * nsample; + out += batch_index * npoints * nsample * c; + + const int index = threadIdx.y * blockDim.x + threadIdx.x; + const int stride = blockDim.y * blockDim.x; + for (int i = index; i < c * npoints; i += stride) { + const int l = i / npoints; + const int j = i % npoints; + for (int k = 0; k < nsample; ++k) { + int ii = idx[j * nsample + k]; + out[(l * npoints + j) * nsample + k] = points[l * n + ii]; + } + } +} + +void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample, + const float *points, const int *idx, + float *out) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + group_points_kernel<<<b, opt_block_config(npoints, c), 0, stream>>>( + b, c, n, npoints, nsample, points, idx, out); + + CUDA_CHECK_ERRORS(); +} + +// input: grad_out(b, c, npoints, nsample), idx(b, npoints, nsample) +// output: grad_points(b, c, n) +__global__ void group_points_grad_kernel(int b, int c, int n, int npoints, + int nsample, + const float *__restrict__ grad_out, + const int *__restrict__ idx, + float *__restrict__ grad_points) { + int batch_index = blockIdx.x; + grad_out += batch_index * npoints * nsample * c; + idx += batch_index * npoints * nsample; + grad_points += batch_index * n * c; + + const int index = threadIdx.y * blockDim.x + threadIdx.x; + const int stride = blockDim.y * blockDim.x; + for (int i = index; i < c * npoints; i += stride) { + const int l = i / npoints; + const int j = i % npoints; + for (int k = 0; k < nsample; ++k) { + int ii = idx[j * nsample + k]; + atomicAdd(grad_points + l * n + ii, + grad_out[(l * npoints + j) * nsample + k]); + } + } +} + +void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints, + int nsample, const float *grad_out, + const int *idx, float *grad_points) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + group_points_grad_kernel<<<b, opt_block_config(npoints, c), 0, stream>>>( + b, c, n, npoints, nsample, grad_out, idx, grad_points); + + CUDA_CHECK_ERRORS(); +}
diff --git a/modules-pytorch-1.4.0/_ext_src/src/interpolate.cpp b/modules-pytorch-1.4.0/_ext_src/src/interpolate.cpp new file mode 100644 index 0000000..cc908de --- /dev/null +++ b/modules-pytorch-1.4.0/_ext_src/src/interpolate.cpp @@ -0,0 +1,104 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#include "interpolate.h" +#include "utils.h" + +void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown, + const float *known, float *dist2, int *idx); +void three_interpolate_kernel_wrapper(int b, int c, int m, int n, + const float *points, const int *idx, + const float *weight, float *out); +void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m, + const float *grad_out, + const int *idx, const float *weight, + float *grad_points); + +std::vector<at::Tensor> three_nn(at::Tensor unknowns, at::Tensor knows) { + CHECK_CONTIGUOUS(unknowns); + CHECK_CONTIGUOUS(knows); + CHECK_IS_FLOAT(unknowns); + CHECK_IS_FLOAT(knows); + + if (unknowns.type().is_cuda()) { + CHECK_CUDA(knows); + } + + at::Tensor idx = + torch::zeros({unknowns.size(0), unknowns.size(1), 3}, + at::device(unknowns.device()).dtype(at::ScalarType::Int)); + at::Tensor dist2 = + torch::zeros({unknowns.size(0), unknowns.size(1), 3}, + at::device(unknowns.device()).dtype(at::ScalarType::Float)); + + if (unknowns.type().is_cuda()) { + three_nn_kernel_wrapper(unknowns.size(0), unknowns.size(1), knows.size(1), + unknowns.data<float>(), knows.data<float>(), + dist2.data<float>(), idx.data<int>()); + } else { + AT_CHECK(false, "CPU not supported"); + } + + return {dist2, idx}; +} + +at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, + at::Tensor weight) { + CHECK_CONTIGUOUS(points); + CHECK_CONTIGUOUS(idx); + CHECK_CONTIGUOUS(weight); + CHECK_IS_FLOAT(points); + CHECK_IS_INT(idx); + CHECK_IS_FLOAT(weight); + + if (points.type().is_cuda()) { + CHECK_CUDA(idx); + CHECK_CUDA(weight); + } + + at::Tensor output = + torch::zeros({points.size(0), points.size(1), idx.size(1)}, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.type().is_cuda()) { + three_interpolate_kernel_wrapper( + points.size(0), points.size(1), points.size(2), idx.size(1), + points.data<float>(), idx.data<int>(), weight.data<float>(), + output.data<float>()); + } else { + AT_CHECK(false, "CPU not supported"); + } + + return output; +} +at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, + at::Tensor weight, const int m) { + CHECK_CONTIGUOUS(grad_out); + CHECK_CONTIGUOUS(idx); + CHECK_CONTIGUOUS(weight); + CHECK_IS_FLOAT(grad_out); + CHECK_IS_INT(idx); + CHECK_IS_FLOAT(weight); + + if (grad_out.type().is_cuda()) { + CHECK_CUDA(idx); + CHECK_CUDA(weight); + } + + at::Tensor output = + torch::zeros({grad_out.size(0), grad_out.size(1), m}, + at::device(grad_out.device()).dtype(at::ScalarType::Float)); + + if (grad_out.type().is_cuda()) { + three_interpolate_grad_kernel_wrapper( + grad_out.size(0), grad_out.size(1), grad_out.size(2), m, + grad_out.data<float>(), idx.data<int>(), weight.data<float>(), + output.data<float>()); + } else { + AT_CHECK(false, "CPU not supported"); + } + + return output; +}
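Seen from Python, the three ops above implement inverse-distance feature upsampling. A sketch of the calling pattern, mirroring how `P4DTransConv` later uses the `pointnet2_utils` wrappers that ship with this commit (shapes here are assumptions for illustration):
```
import torch
import pointnet2_utils  # wrappers over the compiled _ext kernels

dense_xyz = torch.rand(2, 1024, 3).cuda()   # (B, n, 3): points to interpolate onto
sparse_xyz = torch.rand(2, 256, 3).cuda()   # (B, m, 3): points carrying features
feats = torch.rand(2, 64, 256).cuda()       # (B, C, m), channels first

dist, idx = pointnet2_utils.three_nn(dense_xyz, sparse_xyz)  # both (B, n, 3)
weight = 1.0 / (dist + 1e-8)
weight = weight / weight.sum(dim=2, keepdim=True)            # inverse-distance weights
up = pointnet2_utils.three_interpolate(feats, idx, weight)   # (B, 64, 1024)
```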
diff --git a/modules-pytorch-1.4.0/_ext_src/src/interpolate_gpu.cu b/modules-pytorch-1.4.0/_ext_src/src/interpolate_gpu.cu new file mode 100644 index 0000000..b13dbfa --- /dev/null +++ b/modules-pytorch-1.4.0/_ext_src/src/interpolate_gpu.cu @@ -0,0 +1,159 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +#include "cuda_utils.h" + +// input: unknown(b, n, 3) known(b, m, 3) +// output: dist2(b, n, 3), idx(b, n, 3) +__global__ void three_nn_kernel(int b, int n, int m, + const float *__restrict__ unknown, + const float *__restrict__ known, + float *__restrict__ dist2, + int *__restrict__ idx) { + int batch_index = blockIdx.x; + unknown += batch_index * n * 3; + known += batch_index * m * 3; + dist2 += batch_index * n * 3; + idx += batch_index * n * 3; + + int index = threadIdx.x; + int stride = blockDim.x; + for (int j = index; j < n; j += stride) { + float ux = unknown[j * 3 + 0]; + float uy = unknown[j * 3 + 1]; + float uz = unknown[j * 3 + 2]; + + double best1 = 1e40, best2 = 1e40, best3 = 1e40; + int besti1 = 0, besti2 = 0, besti3 = 0; + for (int k = 0; k < m; ++k) { + float x = known[k * 3 + 0]; + float y = known[k * 3 + 1]; + float z = known[k * 3 + 2]; + float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); + if (d < best1) { + best3 = best2; + besti3 = besti2; + best2 = best1; + besti2 = besti1; + best1 = d; + besti1 = k; + } else if (d < best2) { + best3 = best2; + besti3 = besti2; + best2 = d; + besti2 = k; + } else if (d < best3) { + best3 = d; + besti3 = k; + } + } + dist2[j * 3 + 0] = best1; + dist2[j * 3 + 1] = best2; + dist2[j * 3 + 2] = best3; + + idx[j * 3 + 0] = besti1; + idx[j * 3 + 1] = besti2; + idx[j * 3 + 2] = besti3; + } +} + +void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown, + const float *known, float *dist2, int *idx) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + three_nn_kernel<<<b, opt_n_threads(n), 0, stream>>>(b, n, m, unknown, known, + dist2, idx); + + CUDA_CHECK_ERRORS(); +} + +// input: points(b, c, m), idx(b, n, 3), weight(b, n, 3) +// output: out(b, c, n) +__global__ void three_interpolate_kernel(int b, int c, int m, int n, + const float *__restrict__ points, + const int *__restrict__ idx, + const float *__restrict__ weight, + float *__restrict__ out) { + int batch_index = blockIdx.x; + points += batch_index * m * c; + + idx += batch_index * n * 3; + weight += batch_index * n * 3; + + out += batch_index * n * c; + + const int index = threadIdx.y * blockDim.x + threadIdx.x; + const int stride = blockDim.y * blockDim.x; + for (int i = index; i < c * n; i += stride) { + const int l = i / n; + const int j = i % n; + float w1 = weight[j * 3 + 0]; + float w2 = weight[j * 3 + 1]; + float w3 = weight[j * 3 + 2]; + + int i1 = idx[j * 3 + 0]; + int i2 = idx[j * 3 + 1]; + int i3 = idx[j * 3 + 2]; + + out[i] = points[l * m + i1] * w1 + points[l * m + i2] * w2 + + points[l * m + i3] * w3; + } +} + +void three_interpolate_kernel_wrapper(int b, int c, int m, int n, + const float *points, const int *idx, + const float *weight, float *out) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + three_interpolate_kernel<<<b, opt_block_config(n, c), 0, stream>>>( + b, c, m, n, points, idx, weight, out); + + CUDA_CHECK_ERRORS(); +} + +// input: grad_out(b, c, n), idx(b, n, 3), weight(b, n, 3) +// output: grad_points(b, c, m) + +__global__ void three_interpolate_grad_kernel( + int b, int c, int n, int m, const float *__restrict__ grad_out, + const int *__restrict__ idx, const float *__restrict__ weight, + float *__restrict__ grad_points) { + int batch_index = blockIdx.x; + grad_out += batch_index * n * c; + idx += batch_index * n * 3; + weight += batch_index * n * 3; + grad_points += batch_index * m * c; + + const int index = threadIdx.y * blockDim.x + threadIdx.x; + const int stride = blockDim.y * blockDim.x; + for (int i = index; i < c * n; i += stride) { + const int l = i / n; + const int j = i % n; + float w1 = weight[j * 3 + 0]; + float w2 = weight[j * 3 + 1]; + float w3 = weight[j * 3 + 2]; + + int i1 = idx[j * 3 + 0]; + int i2 = idx[j * 3 + 1]; + int i3 = idx[j * 3 + 2]; + + atomicAdd(grad_points + l * m + i1, grad_out[i] * w1); + atomicAdd(grad_points + l * m + i2, grad_out[i] * w2); + atomicAdd(grad_points + l * m + i3, grad_out[i] * w3); + } +} + +void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m, + const float *grad_out, + const int *idx, const float *weight, + float *grad_points) { + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + three_interpolate_grad_kernel<<<b, opt_block_config(n, c), 0, stream>>>( + b, c, n, m, grad_out, idx, weight, grad_points); + + CUDA_CHECK_ERRORS(); +}
diff --git a/modules-pytorch-1.4.0/_ext_src/src/sampling.cpp b/modules-pytorch-1.4.0/_ext_src/src/sampling.cpp new file mode 100644 index 0000000..1d7b058 --- /dev/null +++ b/modules-pytorch-1.4.0/_ext_src/src/sampling.cpp @@ -0,0 +1,91 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#include "sampling.h" +#include "utils.h" + +void gather_points_kernel_wrapper(int b, int c, int n, int npoints, + const float *points, const int *idx, + float *out); +void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints, + const float *grad_out, const int *idx, + float *grad_points); + +void furthest_point_sampling_kernel_wrapper(int b, int n, int m, + const float *dataset, float *temp, + int *idxs); + +at::Tensor gather_points(at::Tensor points, at::Tensor idx) { + CHECK_CONTIGUOUS(points); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(points); + CHECK_IS_INT(idx); + + if (points.type().is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({points.size(0), points.size(1), idx.size(1)}, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.type().is_cuda()) { + gather_points_kernel_wrapper(points.size(0), points.size(1), points.size(2), + idx.size(1), points.data<float>(), + idx.data<int>(), output.data<float>()); + } else { + AT_CHECK(false, "CPU not supported"); + } + + return output; +} + +at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx, + const int n) { + CHECK_CONTIGUOUS(grad_out); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(grad_out); + CHECK_IS_INT(idx); + + if (grad_out.type().is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({grad_out.size(0), grad_out.size(1), n}, + at::device(grad_out.device()).dtype(at::ScalarType::Float)); + + if (grad_out.type().is_cuda()) { + gather_points_grad_kernel_wrapper(grad_out.size(0), grad_out.size(1), n, + idx.size(1), grad_out.data<float>(), + idx.data<int>(), output.data<float>()); + } else { + AT_CHECK(false, "CPU not supported"); + } + + return output; +} +at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples) { + CHECK_CONTIGUOUS(points); + CHECK_IS_FLOAT(points); + + at::Tensor output = + torch::zeros({points.size(0), nsamples}, + at::device(points.device()).dtype(at::ScalarType::Int)); + + at::Tensor tmp = + torch::full({points.size(0), points.size(1)}, 1e10, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.type().is_cuda()) { + furthest_point_sampling_kernel_wrapper( + points.size(0), points.size(1), nsamples, points.data<float>(), + tmp.data<float>(), output.data<int>()); + } else { + AT_CHECK(false, "CPU not supported"); + } + + return output; +}
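On the Python side these sampling and grouping kernels are reached through the `pointnet2_utils` wrappers (that file is part of this commit but not reproduced in this excerpt). A sketch of the calling pattern used by `P4DConv`, with illustrative sizes:
```
import torch
import pointnet2_utils  # wraps the compiled _ext kernels built by setup.py

xyz = torch.rand(4, 2048, 3).cuda()  # (B, N, 3), contiguous float32

# furthest point sampling: indices of 512 well-spread anchors, (B, 512) int32
idx = pointnet2_utils.furthest_point_sample(xyz, 512)

# gather anchor coordinates; gather_operation expects channels first, (B, 3, N)
anchors = pointnet2_utils.gather_operation(xyz.transpose(1, 2).contiguous(), idx)
anchor_xyz = anchors.transpose(1, 2).contiguous()  # (B, 512, 3)

# radius search: up to 9 neighbours within r=0.9 of each anchor, (B, 512, 9)
neighbour_idx = pointnet2_utils.ball_query(0.9, 9, xyz, anchor_xyz)

# group neighbour coordinates for the kernel MLP, (B, 3, 512, 9)
grouped = pointnet2_utils.grouping_operation(xyz.transpose(1, 2).contiguous(), neighbour_idx)
```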
diff --git a/modules-pytorch-1.4.0/_ext_src/src/sampling_gpu.cu b/modules-pytorch-1.4.0/_ext_src/src/sampling_gpu.cu new file mode 100644 index 0000000..e2f5806 --- /dev/null +++ b/modules-pytorch-1.4.0/_ext_src/src/sampling_gpu.cu @@ -0,0 +1,234 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#include <stdio.h> +#include <stdlib.h> + +#include "cuda_utils.h" + +// input: points(b, c, n) idx(b, m) +// output: out(b, c, m) +__global__ void gather_points_kernel(int b, int c, int n, int m, + const float *__restrict__ points, + const int *__restrict__ idx, + float *__restrict__ out) { + for (int i = blockIdx.x; i < b; i += gridDim.x) { + for (int l = blockIdx.y; l < c; l += gridDim.y) { + for (int j = threadIdx.x; j < m; j += blockDim.x) { + int a = idx[i * m + j]; + out[(i * c + l) * m + j] = points[(i * c + l) * n + a]; + } + } + } +} + +void gather_points_kernel_wrapper(int b, int c, int n, int npoints, + const float *points, const int *idx, + float *out) { + gather_points_kernel<<<dim3(b, c, 1), opt_n_threads(npoints), 0, at::cuda::getCurrentCUDAStream()>>>(b, c, n, npoints, points, idx, out); + + CUDA_CHECK_ERRORS(); +} + +// input: grad_out(b, c, m) idx(b, m) +// output: grad_points(b, c, n) +__global__ void gather_points_grad_kernel(int b, int c, int n, int m, + const float *__restrict__ grad_out, + const int *__restrict__ idx, + float *__restrict__ grad_points) { + for (int i = blockIdx.x; i < b; i += gridDim.x) { + for (int l = blockIdx.y; l < c; l += gridDim.y) { + for (int j = threadIdx.x; j < m; j += blockDim.x) { + int a = idx[i * m + j]; + atomicAdd(grad_points + (i * c + l) * n + a, + grad_out[(i * c + l) * m + j]); + } + } + } +} + +void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints, + const float *grad_out, const int *idx, + float *grad_points) { + gather_points_grad_kernel<<<dim3(b, c, 1), opt_n_threads(npoints), 0, at::cuda::getCurrentCUDAStream()>>>( + b, c, n, npoints, grad_out, idx, grad_points); + + CUDA_CHECK_ERRORS(); +} + +__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i, + int idx1, int idx2) { + const float v1 = dists[idx1], v2 = dists[idx2]; + const int i1 = dists_i[idx1], i2 = dists_i[idx2]; + dists[idx1] = max(v1, v2); + dists_i[idx1] = v2 > v1 ? i2 : i1; +} + +// Input dataset: (b, n, 3), tmp: (b, n) +// Output: idxs (b, m) +template <unsigned int block_size> +__global__ void furthest_point_sampling_kernel( + int b, int n, int m, const float *__restrict__ dataset, + float *__restrict__ temp, int *__restrict__ idxs) { + if (m <= 0) return; + __shared__ float dists[block_size]; + __shared__ int dists_i[block_size]; + + int batch_index = blockIdx.x; + dataset += batch_index * n * 3; + temp += batch_index * n; + idxs += batch_index * m; + + int tid = threadIdx.x; + const int stride = block_size; + + int old = 0; + if (threadIdx.x == 0) idxs[0] = old; + + __syncthreads(); + for (int j = 1; j < m; j++) { + int besti = 0; + float best = -1; + float x1 = dataset[old * 3 + 0]; + float y1 = dataset[old * 3 + 1]; + float z1 = dataset[old * 3 + 2]; + for (int k = tid; k < n; k += stride) { + float x2, y2, z2; + x2 = dataset[k * 3 + 0]; + y2 = dataset[k * 3 + 1]; + z2 = dataset[k * 3 + 2]; + float mag = (x2 * x2) + (y2 * y2) + (z2 * z2); + if (mag <= 1e-3) continue; + + float d = + (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1); + + float d2 = min(d, temp[k]); + temp[k] = d2; + besti = d2 > best ? k : besti; + best = d2 > best ? d2 : best; + } + dists[tid] = best; + dists_i[tid] = besti; + __syncthreads(); + + if (block_size >= 512) { + if (tid < 256) { + __update(dists, dists_i, tid, tid + 256); + } + __syncthreads(); + } + if (block_size >= 256) { + if (tid < 128) { + __update(dists, dists_i, tid, tid + 128); + } + __syncthreads(); + } + if (block_size >= 128) { + if (tid < 64) { + __update(dists, dists_i, tid, tid + 64); + } + __syncthreads(); + } + if (block_size >= 64) { + if (tid < 32) { + __update(dists, dists_i, tid, tid + 32); + } + __syncthreads(); + } + if (block_size >= 32) { + if (tid < 16) { + __update(dists, dists_i, tid, tid + 16); + } + __syncthreads(); + } + if (block_size >= 16) { + if (tid < 8) { + __update(dists, dists_i, tid, tid + 8); + } + __syncthreads(); + } + if (block_size >= 8) { + if (tid < 4) { + __update(dists, dists_i, tid, tid + 4); + } + __syncthreads(); + } + if (block_size >= 4) { + if (tid < 2) { + __update(dists, dists_i, tid, tid + 2); + } + __syncthreads(); + } + if (block_size >= 2) { + if (tid < 1) { + __update(dists, dists_i, tid, tid + 1); + } + __syncthreads(); + } + + old = dists_i[0]; + if (tid == 0) idxs[j] = old; + } +} + +void furthest_point_sampling_kernel_wrapper(int b, int n, int m, + const float *dataset, float *temp, + int *idxs) { + unsigned int n_threads = opt_n_threads(n); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + switch (n_threads) { + case 512: + furthest_point_sampling_kernel<512> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 256: + furthest_point_sampling_kernel<256> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 128: + furthest_point_sampling_kernel<128> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 64: + furthest_point_sampling_kernel<64> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 32: + furthest_point_sampling_kernel<32> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 16: + furthest_point_sampling_kernel<16> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 8: + furthest_point_sampling_kernel<8> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 4: + furthest_point_sampling_kernel<4> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 2: + furthest_point_sampling_kernel<2> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + case 1: + furthest_point_sampling_kernel<1> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + break; + default: + furthest_point_sampling_kernel<512> + <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs); + } + + CUDA_CHECK_ERRORS(); +}
diff --git a/modules-pytorch-1.4.0/point_4d_convolution.py b/modules-pytorch-1.4.0/point_4d_convolution.py new file mode 100644 index 0000000..9a89e08 --- /dev/null +++ b/modules-pytorch-1.4.0/point_4d_convolution.py @@ -0,0 +1,271 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +import math +import os +import sys +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(BASE_DIR) + +import pointnet2_utils +from typing import List + + +class P4DConv(nn.Module): + def __init__(self, + in_planes: int, + mlp_planes: List[int], + mlp_batch_norm: List[bool], + mlp_activation: List[bool], + spatial_kernel_size: [float, int], + spatial_stride: int, + temporal_kernel_size: int, + temporal_stride: int = 1, + temporal_padding: [int, int] = [0, 0], + temporal_padding_mode: str = 'replicate', + operator: str = 'addition', + spatial_pooling: str = 'max', + temporal_pooling: str = 'sum', + bias: bool = False): + + super().__init__() + + self.in_planes = in_planes + self.mlp_planes = mlp_planes +
self.mlp_batch_norm = mlp_batch_norm + self.mlp_activation = mlp_activation + + self.r, self.k = spatial_kernel_size + self.spatial_stride = spatial_stride + + self.temporal_kernel_size = temporal_kernel_size + self.temporal_stride = temporal_stride + self.temporal_padding = temporal_padding + self.temporal_padding_mode = temporal_padding_mode + + self.operator = operator + self.spatial_pooling = spatial_pooling + self.temporal_pooling = temporal_pooling + + conv_d = [nn.Conv2d(in_channels=4, out_channels=mlp_planes[0], kernel_size=1, stride=1, padding=0, bias=bias)] + if mlp_batch_norm[0]: + conv_d.append(nn.BatchNorm2d(num_features=mlp_planes[0])) + if mlp_activation[0]: + conv_d.append(nn.ReLU(inplace=True)) + self.conv_d = nn.Sequential(*conv_d) + + if in_planes != 0: + conv_f = [nn.Conv2d(in_channels=in_planes, out_channels=mlp_planes[0], kernel_size=1, stride=1, padding=0, bias=bias)] + if mlp_batch_norm[0]: + conv_f.append(nn.BatchNorm2d(num_features=mlp_planes[0])) + if mlp_activation[0]: + conv_f.append(nn.ReLU(inplace=True)) + self.conv_f = nn.Sequential(*conv_f) + + mlp = [] + for i in range(1, len(mlp_planes)): + if mlp_planes[i] != 0: + mlp.append(nn.Conv2d(in_channels=mlp_planes[i-1], out_channels=mlp_planes[i], kernel_size=1, stride=1, padding=0, bias=bias)) + if mlp_batch_norm[i]: + mlp.append(nn.BatchNorm2d(num_features=mlp_planes[i])) + if mlp_activation[i]: + mlp.append(nn.ReLU(inplace=True)) + self.mlp = nn.Sequential(*mlp) + + + def forward(self, xyzs: torch.Tensor, features: torch.Tensor = None) -> (torch.Tensor, torch.Tensor): + """ + Args: + xyzs: torch.Tensor + (B, T, N, 3) tensor of sequence of the xyz coordinates + features: torch.Tensor + (B, T, C, N) tensor of sequence of the features + """ + device = xyzs.get_device() + + nframes = xyzs.size(1) + npoints = xyzs.size(2) + + assert (self.temporal_kernel_size % 2 == 1), "P4DConv: Temporal kernel size should be odd!" + assert ((nframes + sum(self.temporal_padding) - self.temporal_kernel_size) % self.temporal_stride == 0), "P4DConv: Temporal length error!" 
+ + xyzs = torch.split(tensor=xyzs, split_size_or_sections=1, dim=1) + xyzs = [torch.squeeze(input=xyz, dim=1).contiguous() for xyz in xyzs] + + if self.temporal_padding_mode == 'zeros': + xyz_padding = torch.zeros(xyzs[0].size(), dtype=torch.float32, device=device) + for i in range(self.temporal_padding[0]): + xyzs = [xyz_padding] + xyzs + for i in range(self.temporal_padding[1]): + xyzs = xyzs + [xyz_padding] + else: + for i in range(self.temporal_padding[0]): + xyzs = [xyzs[0]] + xyzs + for i in range(self.temporal_padding[1]): + xyzs = xyzs + [xyzs[-1]] + + if self.in_planes != 0: + features = torch.split(tensor=features, split_size_or_sections=1, dim=1) + features = [torch.squeeze(input=feature, dim=1).contiguous() for feature in features] + + if self.temporal_padding_mode == 'zeros': + feature_padding = torch.zeros(features[0].size(), dtype=torch.float32, device=device) + for i in range(self.temporal_padding[0]): + features = [feature_padding] + features + for i in range(self.temporal_padding[1]): + features = features + [feature_padding] + else: + for i in range(self.temporal_padding[0]): + features = [features[0]] + features + for i in range(self.temporal_padding[1]): + features = features + [features[-1]] + + new_xyzs = [] + new_features = [] + for t in range(self.temporal_kernel_size//2, len(xyzs)-self.temporal_kernel_size//2, self.temporal_stride): # temporal anchor frames + # spatial anchor point subsampling by FPS + anchor_idx = pointnet2_utils.furthest_point_sample(xyzs[t], npoints//self.spatial_stride) # (B, N//self.spatial_stride) + anchor_xyz_flipped = pointnet2_utils.gather_operation(xyzs[t].transpose(1, 2).contiguous(), anchor_idx) # (B, 3, N//self.spatial_stride) + anchor_xyz_expanded = torch.unsqueeze(anchor_xyz_flipped, 3) # (B, 3, N//spatial_stride, 1) + anchor_xyz = anchor_xyz_flipped.transpose(1, 2).contiguous() # (B, N//spatial_stride, 3) + + new_feature = [] + for i in range(t-self.temporal_kernel_size//2, t+self.temporal_kernel_size//2+1): + neighbor_xyz = xyzs[i] + + idx = pointnet2_utils.ball_query(self.r, self.k, neighbor_xyz, anchor_xyz) + + neighbor_xyz_flipped = neighbor_xyz.transpose(1, 2).contiguous() # (B, 3, N) + neighbor_xyz_grouped = pointnet2_utils.grouping_operation(neighbor_xyz_flipped, idx) # (B, 3, N//spatial_stride, k) + + xyz_displacement = neighbor_xyz_grouped - anchor_xyz_expanded # (B, 3, N//spatial_stride, k) + t_displacement = torch.ones((xyz_displacement.size()[0], 1, xyz_displacement.size()[2], xyz_displacement.size()[3]), dtype=torch.float32, device=device) * (i-t) + displacement = torch.cat(tensors=(xyz_displacement, t_displacement), dim=1, out=None) # (B, 4, N//spatial_stride, k) + displacement = self.conv_d(displacement) + + if self.in_planes != 0: + neighbor_feature_grouped = pointnet2_utils.grouping_operation(features[i], idx) # (B, in_planes, N//spatial_stride, k) + feature = self.conv_f(neighbor_feature_grouped) + if self.operator == '+': + feature = feature + displacement + else: + feature = feature * displacement + else: + feature = displacement + + feature = self.mlp(feature) + if self.spatial_pooling == 'max': + feature = torch.max(input=feature, dim=-1, keepdim=False)[0] # (B, out_planes, n) + elif self.spatial_pooling == 'sum': + feature = torch.sum(input=feature, dim=-1, keepdim=False) + else: + feature = torch.mean(input=feature, dim=-1, keepdim=False) + + new_feature.append(feature) + new_feature = torch.stack(tensors=new_feature, dim=1) + if self.temporal_pooling == 'max': + new_feature = torch.max(input=new_feature, 
dim=1, keepdim=False)[0] + elif self.temporal_pooling == 'sum': + new_feature = torch.sum(input=new_feature, dim=1, keepdim=False) + else: + new_feature = torch.mean(input=new_feature, dim=1, keepdim=False) + new_xyzs.append(anchor_xyz) + new_features.append(new_feature) + + new_xyzs = torch.stack(tensors=new_xyzs, dim=1) + new_features = torch.stack(tensors=new_features, dim=1) + + return new_xyzs, new_features + +class P4DTransConv(nn.Module): + def __init__(self, + in_planes: int, + mlp_planes: List[int], + mlp_batch_norm: List[bool], + mlp_activation: List[bool], + original_planes: int = 0, + bias: bool = False): + """ + Args: + in_planes: C'. when point features are not available, in_planes is 0. + out_planes: C" + original_planes: skip connection from original points. when original point features are not available, original_in_planes is 0. + bias: whether to use bias + batch_norm: whether to use batch norm + activation: + """ + super().__init__() + + self.in_planes = in_planes + self.mlp_planes = mlp_planes + self.mlp_batch_norm = mlp_batch_norm + + conv = [] + for i in range(len(mlp_planes)): + if i == 0: + conv.append(nn.Conv1d(in_channels=in_planes+original_planes, out_channels=mlp_planes[i], kernel_size=1, stride=1, padding=0, bias=bias)) + else: + conv.append(nn.Conv1d(in_channels=mlp_planes[i-1], out_channels=mlp_planes[i], kernel_size=1, stride=1, padding=0, bias=bias)) + if mlp_batch_norm[i]: + conv.append(nn.BatchNorm1d(num_features=mlp_planes[i])) + if mlp_activation[i]: + conv.append(nn.ReLU(inplace=True)) + self.conv = nn.Sequential(*conv) + + def forward(self, xyzs: torch.Tensor, original_xyzs: torch.Tensor, features: torch.Tensor, original_features: torch.Tensor = None) -> torch.Tensor: + r""" + Parameters + ---------- + xyzs : torch.Tensor + (B, T, N', 3) tensor of the xyz positions of the convolved features + original_xyzs : torch.Tensor + (B, T, N, 3) tensor of the xyz positions of the original points + features : torch.Tensor + (B, T, C', N') tensor of the features to be propigated to + original_features : torch.Tensor + (B, T, C, N) tensor of original point features for skip connection + + Returns + ------- + new_features : torch.Tensor + (B, T, C", N) tensor of the features of the unknown features + """ + + T = xyzs.size(1) + + xyzs = torch.split(tensor=xyzs, split_size_or_sections=1, dim=1) + xyzs = [torch.squeeze(input=xyz, dim=1).contiguous() for xyz in xyzs] + + features = torch.split(tensor=features, split_size_or_sections=1, dim=1) + features = [torch.squeeze(input=feature, dim=1).contiguous() for feature in features] + + new_xyzs = original_xyzs + + original_xyzs = torch.split(tensor=original_xyzs, split_size_or_sections=1, dim=1) + original_xyzs = [torch.squeeze(input=original_xyz, dim=1).contiguous() for original_xyz in original_xyzs] + + if original_features is not None: + original_features = torch.split(tensor=original_features, split_size_or_sections=1, dim=1) + original_features = [torch.squeeze(input=feature, dim=1).contiguous() for feature in original_features] + + new_features = [] + + for t in range(T): + dist, idx = pointnet2_utils.three_nn(original_xyzs[t], xyzs[t]) + + dist_recip = 1.0 / (dist + 1e-8) + norm = torch.sum(dist_recip, dim=2, keepdim=True) + weight = dist_recip / norm + + interpolated_feat = pointnet2_utils.three_interpolate(features[t], idx, weight) + + if original_features is not None: + new_feature = torch.cat([interpolated_feat, original_features[t]], dim=1) + new_feature = self.conv(new_feature) + 
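            # The weights computed above implement inverse-distance-weighted
+            # interpolation over the three nearest known neighbours:
+            #   f(x) = sum_i w_i * f_i,  with  w_i = (1/d_i) / sum_j (1/d_j),
+            # so the weights are non-negative and sum to 1; the 1e-8 offset
+            # guards against division by zero when a query coincides with a
+            # known point.
+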
new_features.append(new_feature) + + new_features = torch.stack(tensors=new_features, dim=1) + + return new_xyzs, new_features diff --git a/modules-pytorch-1.4.0/pointnet2_test.py b/modules-pytorch-1.4.0/pointnet2_test.py new file mode 100644 index 0000000..be60b28 --- /dev/null +++ b/modules-pytorch-1.4.0/pointnet2_test.py @@ -0,0 +1,33 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +''' Testing customized ops. ''' + +import torch +from torch.autograd import gradcheck +import numpy as np + +import os +import sys +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(BASE_DIR) +import pointnet2_utils + +def test_interpolation_grad(): + batch_size = 1 + feat_dim = 2 + m = 4 + feats = torch.randn(batch_size, feat_dim, m, requires_grad=True).float().cuda() + + def interpolate_func(inputs): + idx = torch.from_numpy(np.array([[[0,1,2],[1,2,3]]])).int().cuda() + weight = torch.from_numpy(np.array([[[1,1,1],[2,2,2]]])).float().cuda() + interpolated_feats = pointnet2_utils.three_interpolate(inputs, idx, weight) + return interpolated_feats + + assert (gradcheck(interpolate_func, feats, atol=1e-1, rtol=1e-1)) + +if __name__=='__main__': + test_interpolation_grad() diff --git a/modules-pytorch-1.4.0/pointnet2_utils.py b/modules-pytorch-1.4.0/pointnet2_utils.py new file mode 100644 index 0000000..56279b2 --- /dev/null +++ b/modules-pytorch-1.4.0/pointnet2_utils.py @@ -0,0 +1,412 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +''' Modified based on: https://github.com/erikwijmans/Pointnet2_PyTorch ''' +from __future__ import ( + division, + absolute_import, + with_statement, + print_function, + unicode_literals, +) +import torch +from torch.autograd import Function +import torch.nn as nn +import sys + +try: + import builtins +except: + import __builtin__ as builtins + +try: + import pointnet2._ext as _ext +except ImportError: + if not getattr(builtins, "__POINTNET2_SETUP__", False): + raise ImportError( + "Could not import _ext module.\n" + "Please see the setup instructions in the README: " + "https://github.com/erikwijmans/Pointnet2_PyTorch/blob/master/README.rst" + ) + +if False: + # Workaround for type hints without depending on the `typing` module + from typing import * + +class FurthestPointSampling(Function): + @staticmethod + def forward(ctx, xyz, npoint): + # type: (Any, torch.Tensor, int) -> torch.Tensor + r""" + Uses iterative furthest point sampling to select a set of npoint features that have the largest + minimum distance + + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor where N > npoint + npoint : int32 + number of features in the sampled set + + Returns + ------- + torch.Tensor + (B, npoint) tensor containing the set + """ + fps_inds = _ext.furthest_point_sampling(xyz, npoint) + ctx.mark_non_differentiable(fps_inds) + return fps_inds + + @staticmethod + def backward(xyz, a=None): + return None, None + + +furthest_point_sample = FurthestPointSampling.apply + + +class GatherOperation(Function): + @staticmethod + def forward(ctx, features, idx): + # type: (Any, torch.Tensor, torch.Tensor) -> torch.Tensor + r""" + + Parameters + ---------- + features : torch.Tensor + (B, C, N) tensor + + idx : torch.Tensor + (B, npoint) tensor of the features to gather + + Returns + ------- + 
torch.Tensor
+            (B, C, npoint) tensor
+        """
+
+        _, C, N = features.size()
+
+        ctx.for_backwards = (idx, C, N)
+
+        return _ext.gather_points(features, idx)
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        idx, C, N = ctx.for_backwards
+
+        grad_features = _ext.gather_points_grad(grad_out.contiguous(), idx, N)
+        return grad_features, None
+
+
+gather_operation = GatherOperation.apply
+
+
+class ThreeNN(Function):
+    @staticmethod
+    def forward(ctx, unknown, known):
+        # type: (Any, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]
+        r"""
+        Find the three nearest neighbors of unknown in known
+        Parameters
+        ----------
+        unknown : torch.Tensor
+            (B, n, 3) tensor of unknown features
+        known : torch.Tensor
+            (B, m, 3) tensor of known features
+
+        Returns
+        -------
+        dist : torch.Tensor
+            (B, n, 3) l2 distance to the three nearest neighbors
+        idx : torch.Tensor
+            (B, n, 3) index of 3 nearest neighbors
+        """
+        dist2, idx = _ext.three_nn(unknown, known)
+
+        return torch.sqrt(dist2), idx
+
+    @staticmethod
+    def backward(ctx, a=None, b=None):
+        return None, None
+
+
+three_nn = ThreeNN.apply
+
+
+class ThreeInterpolate(Function):
+    @staticmethod
+    def forward(ctx, features, idx, weight):
+        # type: (Any, torch.Tensor, torch.Tensor, torch.Tensor) -> torch.Tensor
+        r"""
+        Performs weighted linear interpolation on 3 features
+        Parameters
+        ----------
+        features : torch.Tensor
+            (B, c, m) Features descriptors to be interpolated from
+        idx : torch.Tensor
+            (B, n, 3) three nearest neighbors of the target features in features
+        weight : torch.Tensor
+            (B, n, 3) weights
+
+        Returns
+        -------
+        torch.Tensor
+            (B, c, n) tensor of the interpolated features
+        """
+        B, c, m = features.size()
+        n = idx.size(1)
+
+        ctx.three_interpolate_for_backward = (idx, weight, m)
+
+        return _ext.three_interpolate(features, idx, weight)
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        # type: (Any, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
+        r"""
+        Parameters
+        ----------
+        grad_out : torch.Tensor
+            (B, c, n) tensor with gradients of outputs
+
+        Returns
+        -------
+        grad_features : torch.Tensor
+            (B, c, m) tensor with gradients of features
+
+        None
+
+        None
+        """
+        idx, weight, m = ctx.three_interpolate_for_backward
+
+        grad_features = _ext.three_interpolate_grad(
+            grad_out.contiguous(), idx, weight, m
+        )
+
+        return grad_features, None, None
+
+
+three_interpolate = ThreeInterpolate.apply
+
+
+class GroupingOperation(Function):
+    @staticmethod
+    def forward(ctx, features, idx):
+        # type: (Any, torch.Tensor, torch.Tensor) -> torch.Tensor
+        r"""
+
+        Parameters
+        ----------
+        features : torch.Tensor
+            (B, C, N) tensor of features to group
+        idx : torch.Tensor
+            (B, npoint, nsample) tensor containing the indices of features to group with
+
+        Returns
+        -------
+        torch.Tensor
+            (B, C, npoint, nsample) tensor
+        """
+        B, nfeatures, nsample = idx.size()
+        _, C, N = features.size()
+
+        ctx.for_backwards = (idx, N)
+
+        return _ext.group_points(features, idx)
+
+    @staticmethod
+    def backward(ctx, grad_out):
+        # type: (Any, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]
+        r"""
+
+        Parameters
+        ----------
+        grad_out : torch.Tensor
+            (B, C, npoint, nsample) tensor of the gradients of the output from forward
+
+        Returns
+        -------
+        torch.Tensor
+            (B, C, N) gradient of the features
+        None
+        """
+        idx, N = ctx.for_backwards
+
+        grad_features = _ext.group_points_grad(grad_out.contiguous(), idx, N)
+
+        return grad_features, None
+
+
+grouping_operation = GroupingOperation.apply
+
+
+class BallQuery(Function):
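+    # Thin autograd wrapper around the CUDA ball-query kernel. Note the
+    # kernel's padding rule (see ball_query_gpu.cu): if fewer than nsample
+    # points fall within the radius, the first hit's index is replicated to
+    # fill the remaining slots, so grouping always receives a dense
+    # (B, npoint, nsample) index tensor rather than a ragged one.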
+    @staticmethod
+    def forward(ctx, radius, nsample, xyz, new_xyz):
+        # type: (Any, float, int, torch.Tensor, torch.Tensor) -> torch.Tensor
+        r"""
+
+        Parameters
+        ----------
+        radius : float
+            radius of the balls
+        nsample : int
+            maximum number of features in the balls
+        xyz : torch.Tensor
+            (B, N, 3) xyz coordinates of the features
+        new_xyz : torch.Tensor
+            (B, npoint, 3) centers of the ball query
+
+        Returns
+        -------
+        torch.Tensor
+            (B, npoint, nsample) tensor with the indices of the features that form the query balls
+        """
+        inds = _ext.ball_query(new_xyz, xyz, radius, nsample)
+        ctx.mark_non_differentiable(inds)
+        return inds
+
+    @staticmethod
+    def backward(ctx, a=None):
+        return None, None, None, None
+
+
+ball_query = BallQuery.apply
+
+
+class QueryAndGroup(nn.Module):
+    r"""
+    Groups with a ball query of radius
+
+    Parameters
+    ---------
+    radius : float32
+        Radius of ball
+    nsample : int32
+        Maximum number of features to gather in the ball
+    """
+
+    def __init__(self, radius, nsample, use_xyz=True, ret_grouped_xyz=False, normalize_xyz=False, sample_uniformly=False, ret_unique_cnt=False):
+        # type: (QueryAndGroup, float, int, bool) -> None
+        super(QueryAndGroup, self).__init__()
+        self.radius, self.nsample, self.use_xyz = radius, nsample, use_xyz
+        self.ret_grouped_xyz = ret_grouped_xyz
+        self.normalize_xyz = normalize_xyz
+        self.sample_uniformly = sample_uniformly
+        self.ret_unique_cnt = ret_unique_cnt
+        if self.ret_unique_cnt:
+            assert(self.sample_uniformly)
+
+    def forward(self, xyz, new_xyz, features=None):
+        # type: (QueryAndGroup, torch.Tensor, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor]
+        r"""
+        Parameters
+        ----------
+        xyz : torch.Tensor
+            xyz coordinates of the features (B, N, 3)
+        new_xyz : torch.Tensor
+            centroids (B, npoint, 3)
+        features : torch.Tensor
+            Descriptors of the features (B, C, N)
+
+        Returns
+        -------
+        new_features : torch.Tensor
+            (B, 3 + C, npoint, nsample) tensor
+        """
+        idx = ball_query(self.radius, self.nsample, xyz, new_xyz)
+
+        if self.sample_uniformly:
+            unique_cnt = torch.zeros((idx.shape[0], idx.shape[1]))
+            for i_batch in range(idx.shape[0]):
+                for i_region in range(idx.shape[1]):
+                    unique_ind = torch.unique(idx[i_batch, i_region, :])
+                    num_unique = unique_ind.shape[0]
+                    unique_cnt[i_batch, i_region] = num_unique
+                    sample_ind = torch.randint(0, num_unique, (self.nsample - num_unique,), dtype=torch.long)
+                    all_ind = torch.cat((unique_ind, unique_ind[sample_ind]))
+                    idx[i_batch, i_region, :] = all_ind
+
+
+        xyz_trans = xyz.transpose(1, 2).contiguous()
+        grouped_xyz = grouping_operation(xyz_trans, idx)  # (B, 3, npoint, nsample)
+        grouped_xyz -= new_xyz.transpose(1, 2).unsqueeze(-1)
+        if self.normalize_xyz:
+            grouped_xyz /= self.radius
+
+        if features is not None:
+            grouped_features = grouping_operation(features, idx)
+            if self.use_xyz:
+                new_features = torch.cat(
+                    [grouped_xyz, grouped_features], dim=1
+                )  # (B, C + 3, npoint, nsample)
+            else:
+                new_features = grouped_features
+        else:
+            assert (
+                self.use_xyz
+            ), "Cannot have both features=None and use_xyz=False!"
+ new_features = grouped_xyz + + ret = [new_features] + if self.ret_grouped_xyz: + ret.append(grouped_xyz) + if self.ret_unique_cnt: + ret.append(unique_cnt) + if len(ret) == 1: + return ret[0] + else: + return tuple(ret) + + +class GroupAll(nn.Module): + r""" + Groups all features + + Parameters + --------- + """ + + def __init__(self, use_xyz=True, ret_grouped_xyz=False): + # type: (GroupAll, bool) -> None + super(GroupAll, self).__init__() + self.use_xyz = use_xyz + + def forward(self, xyz, new_xyz, features=None): + # type: (GroupAll, torch.Tensor, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor] + r""" + Parameters + ---------- + xyz : torch.Tensor + xyz coordinates of the features (B, N, 3) + new_xyz : torch.Tensor + Ignored + features : torch.Tensor + Descriptors of the features (B, C, N) + + Returns + ------- + new_features : torch.Tensor + (B, C + 3, 1, N) tensor + """ + + grouped_xyz = xyz.transpose(1, 2).unsqueeze(2) + if features is not None: + grouped_features = features.unsqueeze(2) + if self.use_xyz: + new_features = torch.cat( + [grouped_xyz, grouped_features], dim=1 + ) # (B, 3 + C, 1, N) + else: + new_features = grouped_features + else: + new_features = grouped_xyz + + if self.ret_grouped_xyz: + return new_features, grouped_xyz + else: + return new_features diff --git a/modules-pytorch-1.4.0/setup.py b/modules-pytorch-1.4.0/setup.py new file mode 100644 index 0000000..8e9c793 --- /dev/null +++ b/modules-pytorch-1.4.0/setup.py @@ -0,0 +1,31 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension +import glob + +_ext_src_root = "_ext_src" +_ext_sources = glob.glob("{}/src/*.cpp".format(_ext_src_root)) + glob.glob( + "{}/src/*.cu".format(_ext_src_root) +) +_ext_headers = glob.glob("{}/include/*".format(_ext_src_root)) + +setup( + name='pointnet2', + ext_modules=[ + CUDAExtension( + name='pointnet2._ext', + sources=_ext_sources, + extra_compile_args={ + "cxx": ["-O2", "-I{}".format("{}/include".format(_ext_src_root))], + "nvcc": ["-O2", "-I{}".format("{}/include".format(_ext_src_root))], + }, + ) + ], + cmdclass={ + 'build_ext': BuildExtension + } +) diff --git a/modules-pytorch-1.9.0/_ext_src/include/ball_query.h b/modules-pytorch-1.9.0/_ext_src/include/ball_query.h new file mode 100644 index 0000000..4a65b5a --- /dev/null +++ b/modules-pytorch-1.9.0/_ext_src/include/ball_query.h @@ -0,0 +1,10 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. + +#pragma once +#include + +at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius, + const int nsample); diff --git a/modules-pytorch-1.9.0/_ext_src/include/cuda_utils.h b/modules-pytorch-1.9.0/_ext_src/include/cuda_utils.h new file mode 100644 index 0000000..d4c4bb4 --- /dev/null +++ b/modules-pytorch-1.9.0/_ext_src/include/cuda_utils.h @@ -0,0 +1,46 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. 
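+//
+// Shared helpers for the kernels below: opt_n_threads() rounds the work
+// size down to a power of two capped at TOTAL_THREADS (for example, a work
+// size of 1000 maps to 512 threads and 100 maps to 64), opt_block_config()
+// does the same in two dimensions, and CUDA_CHECK_ERRORS() aborts with a
+// readable message if the preceding kernel launch failed.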
+
+#ifndef _CUDA_UTILS_H
+#define _CUDA_UTILS_H
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cmath>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <vector>
+
+#define TOTAL_THREADS 512
+
+inline int opt_n_threads(int work_size) {
+  const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
+
+  return max(min(1 << pow_2, TOTAL_THREADS), 1);
+}
+
+inline dim3 opt_block_config(int x, int y) {
+  const int x_threads = opt_n_threads(x);
+  const int y_threads =
+      max(min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1);
+  dim3 block_config(x_threads, y_threads, 1);
+
+  return block_config;
+}
+
+#define CUDA_CHECK_ERRORS()                                           \
+  do {                                                                \
+    cudaError_t err = cudaGetLastError();                             \
+    if (cudaSuccess != err) {                                         \
+      fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n",  \
+              cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \
+              __FILE__);                                              \
+      exit(-1);                                                       \
+    }                                                                 \
+  } while (0)
+
+#endif
diff --git a/modules-pytorch-1.9.0/_ext_src/include/group_points.h b/modules-pytorch-1.9.0/_ext_src/include/group_points.h
new file mode 100644
index 0000000..24e7cc7
--- /dev/null
+++ b/modules-pytorch-1.9.0/_ext_src/include/group_points.h
@@ -0,0 +1,10 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+#include <torch/extension.h>
+
+at::Tensor group_points(at::Tensor points, at::Tensor idx);
+at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n);
diff --git a/modules-pytorch-1.9.0/_ext_src/include/interpolate.h b/modules-pytorch-1.9.0/_ext_src/include/interpolate.h
new file mode 100644
index 0000000..2af34c6
--- /dev/null
+++ b/modules-pytorch-1.9.0/_ext_src/include/interpolate.h
@@ -0,0 +1,15 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torch/extension.h>
+#include <vector>
+
+std::vector<at::Tensor> three_nn(at::Tensor unknowns, at::Tensor knows);
+at::Tensor three_interpolate(at::Tensor points, at::Tensor idx,
+                             at::Tensor weight);
+at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx,
+                                  at::Tensor weight, const int m);
diff --git a/modules-pytorch-1.9.0/_ext_src/include/sampling.h b/modules-pytorch-1.9.0/_ext_src/include/sampling.h
new file mode 100644
index 0000000..366ef31
--- /dev/null
+++ b/modules-pytorch-1.9.0/_ext_src/include/sampling.h
@@ -0,0 +1,11 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+#include <torch/extension.h>
+
+at::Tensor gather_points(at::Tensor points, at::Tensor idx);
+at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx, const int n);
+at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples);
diff --git a/modules-pytorch-1.9.0/_ext_src/include/utils.h b/modules-pytorch-1.9.0/_ext_src/include/utils.h
new file mode 100644
index 0000000..925f769
--- /dev/null
+++ b/modules-pytorch-1.9.0/_ext_src/include/utils.h
@@ -0,0 +1,30 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/extension.h>
+
+#define CHECK_CUDA(x)                                               \
+  do {                                                              \
+    TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor");   \
+  } while (0)
+
+#define CHECK_CONTIGUOUS(x)                                            \
+  do {                                                                 \
+    TORCH_CHECK(x.is_contiguous(), #x " must be a contiguous tensor"); \
+  } while (0)
+
+#define CHECK_IS_INT(x)                                  \
+  do {                                                   \
+    TORCH_CHECK(x.scalar_type() == at::ScalarType::Int,  \
+                #x " must be an int tensor");            \
+  } while (0)
+
+#define CHECK_IS_FLOAT(x)                                 \
+  do {                                                    \
+    TORCH_CHECK(x.scalar_type() == at::ScalarType::Float, \
+                #x " must be a float tensor");            \
+  } while (0)
diff --git a/modules-pytorch-1.9.0/_ext_src/src/ball_query.cpp b/modules-pytorch-1.9.0/_ext_src/src/ball_query.cpp
new file mode 100644
index 0000000..b9cf4f9
--- /dev/null
+++ b/modules-pytorch-1.9.0/_ext_src/src/ball_query.cpp
@@ -0,0 +1,37 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include "ball_query.h"
+#include "utils.h"
+
+void query_ball_point_kernel_wrapper(int b, int n, int m, float radius,
+                                     int nsample, const float *new_xyz,
+                                     const float *xyz, int *idx);
+
+at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius,
+                      const int nsample) {
+  CHECK_CONTIGUOUS(new_xyz);
+  CHECK_CONTIGUOUS(xyz);
+  CHECK_IS_FLOAT(new_xyz);
+  CHECK_IS_FLOAT(xyz);
+
+  if (new_xyz.type().is_cuda()) {
+    CHECK_CUDA(xyz);
+  }
+
+  at::Tensor idx =
+      torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample},
+                   at::device(new_xyz.device()).dtype(at::ScalarType::Int));
+
+  if (new_xyz.type().is_cuda()) {
+    query_ball_point_kernel_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1),
+                                    radius, nsample, new_xyz.data<float>(),
+                                    xyz.data<float>(), idx.data<int>());
+  } else {
+    TORCH_CHECK(false, "CPU not supported");
+  }
+
+  return idx;
+}
diff --git a/modules-pytorch-1.9.0/_ext_src/src/ball_query_gpu.cu b/modules-pytorch-1.9.0/_ext_src/src/ball_query_gpu.cu
new file mode 100644
index 0000000..cfc2eeb
--- /dev/null
+++ b/modules-pytorch-1.9.0/_ext_src/src/ball_query_gpu.cu
@@ -0,0 +1,59 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
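+//
+// One CUDA block per batch element; threads stride over the m query balls.
+// The scan over all n points stops early once nsample neighbours are found,
+// and the first neighbour's index pre-fills every slot so that sparse
+// neighbourhoods return repeated indices instead of uninitialised values.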
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "cuda_utils.h"
+
+// input: new_xyz(b, m, 3) xyz(b, n, 3)
+// output: idx(b, m, nsample)
+__global__ void query_ball_point_kernel(int b, int n, int m, float radius,
+                                        int nsample,
+                                        const float *__restrict__ new_xyz,
+                                        const float *__restrict__ xyz,
+                                        int *__restrict__ idx) {
+  int batch_index = blockIdx.x;
+  xyz += batch_index * n * 3;
+  new_xyz += batch_index * m * 3;
+  idx += m * nsample * batch_index;
+
+  int index = threadIdx.x;
+  int stride = blockDim.x;
+
+  float radius2 = radius * radius;
+  for (int j = index; j < m; j += stride) {
+    float new_x = new_xyz[j * 3 + 0];
+    float new_y = new_xyz[j * 3 + 1];
+    float new_z = new_xyz[j * 3 + 2];
+    for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) {
+      float x = xyz[k * 3 + 0];
+      float y = xyz[k * 3 + 1];
+      float z = xyz[k * 3 + 2];
+      float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
+                 (new_z - z) * (new_z - z);
+      if (d2 < radius2) {
+        if (cnt == 0) {
+          for (int l = 0; l < nsample; ++l) {
+            idx[j * nsample + l] = k;
+          }
+        }
+        idx[j * nsample + cnt] = k;
+        ++cnt;
+      }
+    }
+  }
+}
+
+void query_ball_point_kernel_wrapper(int b, int n, int m, float radius,
+                                     int nsample, const float *new_xyz,
+                                     const float *xyz, int *idx) {
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  query_ball_point_kernel<<<b, opt_n_threads(m), 0, stream>>>(
+      b, n, m, radius, nsample, new_xyz, xyz, idx);
+
+  CUDA_CHECK_ERRORS();
+}
diff --git a/modules-pytorch-1.9.0/_ext_src/src/bindings.cpp b/modules-pytorch-1.9.0/_ext_src/src/bindings.cpp
new file mode 100644
index 0000000..9783d87
--- /dev/null
+++ b/modules-pytorch-1.9.0/_ext_src/src/bindings.cpp
@@ -0,0 +1,24 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include "ball_query.h"
+#include "group_points.h"
+#include "interpolate.h"
+#include "sampling.h"
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("gather_points", &gather_points);
+  m.def("gather_points_grad", &gather_points_grad);
+  m.def("furthest_point_sampling", &furthest_point_sampling);
+
+  m.def("three_nn", &three_nn);
+  m.def("three_interpolate", &three_interpolate);
+  m.def("three_interpolate_grad", &three_interpolate_grad);
+
+  m.def("ball_query", &ball_query);
+
+  m.def("group_points", &group_points);
+  m.def("group_points_grad", &group_points_grad);
+}
diff --git a/modules-pytorch-1.9.0/_ext_src/src/group_points.cpp b/modules-pytorch-1.9.0/_ext_src/src/group_points.cpp
new file mode 100644
index 0000000..ab2fe1f
--- /dev/null
+++ b/modules-pytorch-1.9.0/_ext_src/src/group_points.cpp
@@ -0,0 +1,65 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+ +#include "group_points.h" +#include "utils.h" + +void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample, + const float *points, const int *idx, + float *out); + +void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints, + int nsample, const float *grad_out, + const int *idx, float *grad_points); + +at::Tensor group_points(at::Tensor points, at::Tensor idx) { + CHECK_CONTIGUOUS(points); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(points); + CHECK_IS_INT(idx); + + if (points.type().is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({points.size(0), points.size(1), idx.size(1), idx.size(2)}, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.type().is_cuda()) { + group_points_kernel_wrapper(points.size(0), points.size(1), points.size(2), + idx.size(1), idx.size(2), points.data(), + idx.data(), output.data()); + } else { + TORCH_CHECK(false, "CPU not supported"); + } + + return output; +} + +at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n) { + CHECK_CONTIGUOUS(grad_out); + CHECK_CONTIGUOUS(idx); + CHECK_IS_FLOAT(grad_out); + CHECK_IS_INT(idx); + + if (grad_out.type().is_cuda()) { + CHECK_CUDA(idx); + } + + at::Tensor output = + torch::zeros({grad_out.size(0), grad_out.size(1), n}, + at::device(grad_out.device()).dtype(at::ScalarType::Float)); + + if (grad_out.type().is_cuda()) { + group_points_grad_kernel_wrapper( + grad_out.size(0), grad_out.size(1), n, idx.size(1), idx.size(2), + grad_out.data(), idx.data(), output.data()); + } else { + TORCH_CHECK(false, "CPU not supported"); + } + + return output; +} diff --git a/modules-pytorch-1.9.0/_ext_src/src/group_points_gpu.cu b/modules-pytorch-1.9.0/_ext_src/src/group_points_gpu.cu new file mode 100644 index 0000000..98a3be1 --- /dev/null +++ b/modules-pytorch-1.9.0/_ext_src/src/group_points_gpu.cu @@ -0,0 +1,80 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. 
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "cuda_utils.h"
+
+// input: points(b, c, n) idx(b, npoints, nsample)
+// output: out(b, c, npoints, nsample)
+__global__ void group_points_kernel(int b, int c, int n, int npoints,
+                                    int nsample,
+                                    const float *__restrict__ points,
+                                    const int *__restrict__ idx,
+                                    float *__restrict__ out) {
+  int batch_index = blockIdx.x;
+  points += batch_index * n * c;
+  idx += batch_index * npoints * nsample;
+  out += batch_index * npoints * nsample * c;
+
+  const int index = threadIdx.y * blockDim.x + threadIdx.x;
+  const int stride = blockDim.y * blockDim.x;
+  for (int i = index; i < c * npoints; i += stride) {
+    const int l = i / npoints;
+    const int j = i % npoints;
+    for (int k = 0; k < nsample; ++k) {
+      int ii = idx[j * nsample + k];
+      out[(l * npoints + j) * nsample + k] = points[l * n + ii];
+    }
+  }
+}
+
+void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample,
+                                 const float *points, const int *idx,
+                                 float *out) {
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  group_points_kernel<<<b, opt_block_config(npoints, c), 0, stream>>>(
+      b, c, n, npoints, nsample, points, idx, out);
+
+  CUDA_CHECK_ERRORS();
+}
+
+// input: grad_out(b, c, npoints, nsample), idx(b, npoints, nsample)
+// output: grad_points(b, c, n)
+__global__ void group_points_grad_kernel(int b, int c, int n, int npoints,
+                                         int nsample,
+                                         const float *__restrict__ grad_out,
+                                         const int *__restrict__ idx,
+                                         float *__restrict__ grad_points) {
+  int batch_index = blockIdx.x;
+  grad_out += batch_index * npoints * nsample * c;
+  idx += batch_index * npoints * nsample;
+  grad_points += batch_index * n * c;
+
+  const int index = threadIdx.y * blockDim.x + threadIdx.x;
+  const int stride = blockDim.y * blockDim.x;
+  for (int i = index; i < c * npoints; i += stride) {
+    const int l = i / npoints;
+    const int j = i % npoints;
+    for (int k = 0; k < nsample; ++k) {
+      int ii = idx[j * nsample + k];
+      atomicAdd(grad_points + l * n + ii,
+                grad_out[(l * npoints + j) * nsample + k]);
+    }
+  }
+}
+
+void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints,
+                                      int nsample, const float *grad_out,
+                                      const int *idx, float *grad_points) {
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  group_points_grad_kernel<<<b, opt_block_config(npoints, c), 0, stream>>>(
+      b, c, n, npoints, nsample, grad_out, idx, grad_points);
+
+  CUDA_CHECK_ERRORS();
+}
diff --git a/modules-pytorch-1.9.0/_ext_src/src/interpolate.cpp b/modules-pytorch-1.9.0/_ext_src/src/interpolate.cpp
new file mode 100644
index 0000000..065ac31
--- /dev/null
+++ b/modules-pytorch-1.9.0/_ext_src/src/interpolate.cpp
@@ -0,0 +1,104 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
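+//
+// Note for callers: three_nn returns *squared* L2 distances; the Python
+// wrapper (pointnet2_utils.ThreeNN) applies torch.sqrt before the
+// inverse-distance weights are built for three_interpolate.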
+ +#include "interpolate.h" +#include "utils.h" + +void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown, + const float *known, float *dist2, int *idx); +void three_interpolate_kernel_wrapper(int b, int c, int m, int n, + const float *points, const int *idx, + const float *weight, float *out); +void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m, + const float *grad_out, + const int *idx, const float *weight, + float *grad_points); + +std::vector three_nn(at::Tensor unknowns, at::Tensor knows) { + CHECK_CONTIGUOUS(unknowns); + CHECK_CONTIGUOUS(knows); + CHECK_IS_FLOAT(unknowns); + CHECK_IS_FLOAT(knows); + + if (unknowns.type().is_cuda()) { + CHECK_CUDA(knows); + } + + at::Tensor idx = + torch::zeros({unknowns.size(0), unknowns.size(1), 3}, + at::device(unknowns.device()).dtype(at::ScalarType::Int)); + at::Tensor dist2 = + torch::zeros({unknowns.size(0), unknowns.size(1), 3}, + at::device(unknowns.device()).dtype(at::ScalarType::Float)); + + if (unknowns.type().is_cuda()) { + three_nn_kernel_wrapper(unknowns.size(0), unknowns.size(1), knows.size(1), + unknowns.data(), knows.data(), + dist2.data(), idx.data()); + } else { + TORCH_CHECK(false, "CPU not supported"); + } + + return {dist2, idx}; +} + +at::Tensor three_interpolate(at::Tensor points, at::Tensor idx, + at::Tensor weight) { + CHECK_CONTIGUOUS(points); + CHECK_CONTIGUOUS(idx); + CHECK_CONTIGUOUS(weight); + CHECK_IS_FLOAT(points); + CHECK_IS_INT(idx); + CHECK_IS_FLOAT(weight); + + if (points.type().is_cuda()) { + CHECK_CUDA(idx); + CHECK_CUDA(weight); + } + + at::Tensor output = + torch::zeros({points.size(0), points.size(1), idx.size(1)}, + at::device(points.device()).dtype(at::ScalarType::Float)); + + if (points.type().is_cuda()) { + three_interpolate_kernel_wrapper( + points.size(0), points.size(1), points.size(2), idx.size(1), + points.data(), idx.data(), weight.data(), + output.data()); + } else { + TORCH_CHECK(false, "CPU not supported"); + } + + return output; +} +at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx, + at::Tensor weight, const int m) { + CHECK_CONTIGUOUS(grad_out); + CHECK_CONTIGUOUS(idx); + CHECK_CONTIGUOUS(weight); + CHECK_IS_FLOAT(grad_out); + CHECK_IS_INT(idx); + CHECK_IS_FLOAT(weight); + + if (grad_out.type().is_cuda()) { + CHECK_CUDA(idx); + CHECK_CUDA(weight); + } + + at::Tensor output = + torch::zeros({grad_out.size(0), grad_out.size(1), m}, + at::device(grad_out.device()).dtype(at::ScalarType::Float)); + + if (grad_out.type().is_cuda()) { + three_interpolate_grad_kernel_wrapper( + grad_out.size(0), grad_out.size(1), grad_out.size(2), m, + grad_out.data(), idx.data(), weight.data(), + output.data()); + } else { + TORCH_CHECK(false, "CPU not supported"); + } + + return output; +} diff --git a/modules-pytorch-1.9.0/_ext_src/src/interpolate_gpu.cu b/modules-pytorch-1.9.0/_ext_src/src/interpolate_gpu.cu new file mode 100644 index 0000000..b13dbfa --- /dev/null +++ b/modules-pytorch-1.9.0/_ext_src/src/interpolate_gpu.cu @@ -0,0 +1,159 @@ +// Copyright (c) Facebook, Inc. and its affiliates. +// +// This source code is licensed under the MIT license found in the +// LICENSE file in the root directory of this source tree. 
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "cuda_utils.h"
+
+// input: unknown(b, n, 3) known(b, m, 3)
+// output: dist2(b, n, 3), idx(b, n, 3)
+__global__ void three_nn_kernel(int b, int n, int m,
+                                const float *__restrict__ unknown,
+                                const float *__restrict__ known,
+                                float *__restrict__ dist2,
+                                int *__restrict__ idx) {
+  int batch_index = blockIdx.x;
+  unknown += batch_index * n * 3;
+  known += batch_index * m * 3;
+  dist2 += batch_index * n * 3;
+  idx += batch_index * n * 3;
+
+  int index = threadIdx.x;
+  int stride = blockDim.x;
+  for (int j = index; j < n; j += stride) {
+    float ux = unknown[j * 3 + 0];
+    float uy = unknown[j * 3 + 1];
+    float uz = unknown[j * 3 + 2];
+
+    double best1 = 1e40, best2 = 1e40, best3 = 1e40;
+    int besti1 = 0, besti2 = 0, besti3 = 0;
+    for (int k = 0; k < m; ++k) {
+      float x = known[k * 3 + 0];
+      float y = known[k * 3 + 1];
+      float z = known[k * 3 + 2];
+      float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
+      if (d < best1) {
+        best3 = best2;
+        besti3 = besti2;
+        best2 = best1;
+        besti2 = besti1;
+        best1 = d;
+        besti1 = k;
+      } else if (d < best2) {
+        best3 = best2;
+        besti3 = besti2;
+        best2 = d;
+        besti2 = k;
+      } else if (d < best3) {
+        best3 = d;
+        besti3 = k;
+      }
+    }
+    dist2[j * 3 + 0] = best1;
+    dist2[j * 3 + 1] = best2;
+    dist2[j * 3 + 2] = best3;
+
+    idx[j * 3 + 0] = besti1;
+    idx[j * 3 + 1] = besti2;
+    idx[j * 3 + 2] = besti3;
+  }
+}
+
+void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown,
+                             const float *known, float *dist2, int *idx) {
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  three_nn_kernel<<<b, opt_n_threads(n), 0, stream>>>(b, n, m, unknown, known,
+                                                      dist2, idx);
+
+  CUDA_CHECK_ERRORS();
+}
+
+// input: points(b, c, m), idx(b, n, 3), weight(b, n, 3)
+// output: out(b, c, n)
+__global__ void three_interpolate_kernel(int b, int c, int m, int n,
+                                         const float *__restrict__ points,
+                                         const int *__restrict__ idx,
+                                         const float *__restrict__ weight,
+                                         float *__restrict__ out) {
+  int batch_index = blockIdx.x;
+  points += batch_index * m * c;
+
+  idx += batch_index * n * 3;
+  weight += batch_index * n * 3;
+
+  out += batch_index * n * c;
+
+  const int index = threadIdx.y * blockDim.x + threadIdx.x;
+  const int stride = blockDim.y * blockDim.x;
+  for (int i = index; i < c * n; i += stride) {
+    const int l = i / n;
+    const int j = i % n;
+    float w1 = weight[j * 3 + 0];
+    float w2 = weight[j * 3 + 1];
+    float w3 = weight[j * 3 + 2];
+
+    int i1 = idx[j * 3 + 0];
+    int i2 = idx[j * 3 + 1];
+    int i3 = idx[j * 3 + 2];
+
+    out[i] = points[l * m + i1] * w1 + points[l * m + i2] * w2 +
+             points[l * m + i3] * w3;
+  }
+}
+
+void three_interpolate_kernel_wrapper(int b, int c, int m, int n,
+                                      const float *points, const int *idx,
+                                      const float *weight, float *out) {
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  three_interpolate_kernel<<<b, opt_block_config(n, c), 0, stream>>>(
+      b, c, m, n, points, idx, weight, out);
+
+  CUDA_CHECK_ERRORS();
+}
+
+// input: grad_out(b, c, n), idx(b, n, 3), weight(b, n, 3)
+// output: grad_points(b, c, m)
+
+__global__ void three_interpolate_grad_kernel(
+    int b, int c, int n, int m, const float *__restrict__ grad_out,
+    const int *__restrict__ idx, const float *__restrict__ weight,
+    float *__restrict__ grad_points) {
+  int batch_index = blockIdx.x;
+  grad_out += batch_index * n * c;
+  idx += batch_index * n * 3;
+  weight += batch_index * n * 3;
+  grad_points += batch_index * m * c;
+
+  const int index = threadIdx.y * blockDim.x + threadIdx.x;
+  const int stride = blockDim.y * blockDim.x;
+  for (int i = index; i < c * n; i += stride) {
+    const int l = i / n;
+    const int j = i % n;
+    float w1 = weight[j * 3 + 0];
+    float w2 = weight[j * 3 + 1];
+    float w3 = weight[j * 3 + 2];
+
+    int i1 = idx[j * 3 + 0];
+    int i2 = idx[j * 3 + 1];
+    int i3 = idx[j * 3 + 2];
+
+    atomicAdd(grad_points + l * m + i1, grad_out[i] * w1);
+    atomicAdd(grad_points + l * m + i2, grad_out[i] * w2);
+    atomicAdd(grad_points + l * m + i3, grad_out[i] * w3);
+  }
+}
+
+void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m,
+                                           const float *grad_out,
+                                           const int *idx, const float *weight,
+                                           float *grad_points) {
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  three_interpolate_grad_kernel<<<b, opt_block_config(n, c), 0, stream>>>(
+      b, c, n, m, grad_out, idx, weight, grad_points);
+
+  CUDA_CHECK_ERRORS();
+}
diff --git a/modules-pytorch-1.9.0/_ext_src/src/sampling.cpp b/modules-pytorch-1.9.0/_ext_src/src/sampling.cpp
new file mode 100644
index 0000000..0a76abf
--- /dev/null
+++ b/modules-pytorch-1.9.0/_ext_src/src/sampling.cpp
@@ -0,0 +1,91 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include "sampling.h"
+#include "utils.h"
+
+void gather_points_kernel_wrapper(int b, int c, int n, int npoints,
+                                  const float *points, const int *idx,
+                                  float *out);
+void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints,
+                                       const float *grad_out, const int *idx,
+                                       float *grad_points);
+
+void furthest_point_sampling_kernel_wrapper(int b, int n, int m,
+                                            const float *dataset, float *temp,
+                                            int *idxs);
+
+at::Tensor gather_points(at::Tensor points, at::Tensor idx) {
+  CHECK_CONTIGUOUS(points);
+  CHECK_CONTIGUOUS(idx);
+  CHECK_IS_FLOAT(points);
+  CHECK_IS_INT(idx);
+
+  if (points.type().is_cuda()) {
+    CHECK_CUDA(idx);
+  }
+
+  at::Tensor output =
+      torch::zeros({points.size(0), points.size(1), idx.size(1)},
+                   at::device(points.device()).dtype(at::ScalarType::Float));
+
+  if (points.type().is_cuda()) {
+    gather_points_kernel_wrapper(points.size(0), points.size(1),
+                                 points.size(2), idx.size(1),
+                                 points.data<float>(), idx.data<int>(),
+                                 output.data<float>());
+  } else {
+    TORCH_CHECK(false, "CPU not supported");
+  }
+
+  return output;
+}
+
+at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx,
+                              const int n) {
+  CHECK_CONTIGUOUS(grad_out);
+  CHECK_CONTIGUOUS(idx);
+  CHECK_IS_FLOAT(grad_out);
+  CHECK_IS_INT(idx);
+
+  if (grad_out.type().is_cuda()) {
+    CHECK_CUDA(idx);
+  }
+
+  at::Tensor output =
+      torch::zeros({grad_out.size(0), grad_out.size(1), n},
+                   at::device(grad_out.device()).dtype(at::ScalarType::Float));
+
+  if (grad_out.type().is_cuda()) {
+    gather_points_grad_kernel_wrapper(grad_out.size(0), grad_out.size(1), n,
+                                      idx.size(1), grad_out.data<float>(),
+                                      idx.data<int>(), output.data<float>());
+  } else {
+    TORCH_CHECK(false, "CPU not supported");
+  }
+
+  return output;
+}
+at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples) {
+  CHECK_CONTIGUOUS(points);
+  CHECK_IS_FLOAT(points);
+
+  at::Tensor output =
+      torch::zeros({points.size(0), nsamples},
+                   at::device(points.device()).dtype(at::ScalarType::Int));
+
+  at::Tensor tmp =
+      torch::full({points.size(0), points.size(1)}, 1e10,
+                  at::device(points.device()).dtype(at::ScalarType::Float));
+
+  if (points.type().is_cuda()) {
+    furthest_point_sampling_kernel_wrapper(
+        points.size(0), points.size(1), nsamples, points.data<float>(),
+        tmp.data<float>(), output.data<int>());
+  } else {
+    TORCH_CHECK(false, "CPU not supported");
+  }
+
+  return output;
+}
diff --git a/modules-pytorch-1.9.0/_ext_src/src/sampling_gpu.cu b/modules-pytorch-1.9.0/_ext_src/src/sampling_gpu.cu
new file mode 100644
index 0000000..e2f5806
--- /dev/null
+++ b/modules-pytorch-1.9.0/_ext_src/src/sampling_gpu.cu
@@ -0,0 +1,234 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "cuda_utils.h"
+
+// input: points(b, c, n) idx(b, m)
+// output: out(b, c, m)
+__global__ void gather_points_kernel(int b, int c, int n, int m,
+                                     const float *__restrict__ points,
+                                     const int *__restrict__ idx,
+                                     float *__restrict__ out) {
+  for (int i = blockIdx.x; i < b; i += gridDim.x) {
+    for (int l = blockIdx.y; l < c; l += gridDim.y) {
+      for (int j = threadIdx.x; j < m; j += blockDim.x) {
+        int a = idx[i * m + j];
+        out[(i * c + l) * m + j] = points[(i * c + l) * n + a];
+      }
+    }
+  }
+}
+
+void gather_points_kernel_wrapper(int b, int c, int n, int npoints,
+                                  const float *points, const int *idx,
+                                  float *out) {
+  gather_points_kernel<<<dim3(b, c, 1), opt_n_threads(npoints), 0,
+                         at::cuda::getCurrentCUDAStream()>>>(b, c, n, npoints,
+                                                             points, idx, out);
+
+  CUDA_CHECK_ERRORS();
+}
+
+// input: grad_out(b, c, m) idx(b, m)
+// output: grad_points(b, c, n)
+__global__ void gather_points_grad_kernel(int b, int c, int n, int m,
+                                          const float *__restrict__ grad_out,
+                                          const int *__restrict__ idx,
+                                          float *__restrict__ grad_points) {
+  for (int i = blockIdx.x; i < b; i += gridDim.x) {
+    for (int l = blockIdx.y; l < c; l += gridDim.y) {
+      for (int j = threadIdx.x; j < m; j += blockDim.x) {
+        int a = idx[i * m + j];
+        atomicAdd(grad_points + (i * c + l) * n + a,
+                  grad_out[(i * c + l) * m + j]);
+      }
+    }
+  }
+}
+
+void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints,
+                                       const float *grad_out, const int *idx,
+                                       float *grad_points) {
+  gather_points_grad_kernel<<<dim3(b, c, 1), opt_n_threads(npoints), 0,
+                              at::cuda::getCurrentCUDAStream()>>>(
+      b, c, n, npoints, grad_out, idx, grad_points);
+
+  CUDA_CHECK_ERRORS();
+}
+
+__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i,
+                         int idx1, int idx2) {
+  const float v1 = dists[idx1], v2 = dists[idx2];
+  const int i1 = dists_i[idx1], i2 = dists_i[idx2];
+  dists[idx1] = max(v1, v2);
+  dists_i[idx1] = v2 > v1 ? i2 : i1;
+}
+
+// Input dataset: (b, n, 3), tmp: (b, n)
+// Output idxs (b, m)
+template <unsigned int block_size>
+__global__ void furthest_point_sampling_kernel(
+    int b, int n, int m, const float *__restrict__ dataset,
+    float *__restrict__ temp, int *__restrict__ idxs) {
+  if (m <= 0) return;
+  __shared__ float dists[block_size];
+  __shared__ int dists_i[block_size];
+
+  int batch_index = blockIdx.x;
+  dataset += batch_index * n * 3;
+  temp += batch_index * n;
+  idxs += batch_index * m;
+
+  int tid = threadIdx.x;
+  const int stride = block_size;
+
+  int old = 0;
+  if (threadIdx.x == 0) idxs[0] = old;
+
+  __syncthreads();
+  for (int j = 1; j < m; j++) {
+    int besti = 0;
+    float best = -1;
+    float x1 = dataset[old * 3 + 0];
+    float y1 = dataset[old * 3 + 1];
+    float z1 = dataset[old * 3 + 2];
+    for (int k = tid; k < n; k += stride) {
+      float x2, y2, z2;
+      x2 = dataset[k * 3 + 0];
+      y2 = dataset[k * 3 + 1];
+      z2 = dataset[k * 3 + 2];
+      float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
+      if (mag <= 1e-3) continue;
+
+      float d =
+          (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
+
+      float d2 = min(d, temp[k]);
+      temp[k] = d2;
+      besti = d2 > best ? k : besti;
+      best = d2 > best ? d2 : best;
+    }
+    dists[tid] = best;
+    dists_i[tid] = besti;
+    __syncthreads();
+
+    if (block_size >= 512) {
+      if (tid < 256) {
+        __update(dists, dists_i, tid, tid + 256);
+      }
+      __syncthreads();
+    }
+    if (block_size >= 256) {
+      if (tid < 128) {
+        __update(dists, dists_i, tid, tid + 128);
+      }
+      __syncthreads();
+    }
+    if (block_size >= 128) {
+      if (tid < 64) {
+        __update(dists, dists_i, tid, tid + 64);
+      }
+      __syncthreads();
+    }
+    if (block_size >= 64) {
+      if (tid < 32) {
+        __update(dists, dists_i, tid, tid + 32);
+      }
+      __syncthreads();
+    }
+    if (block_size >= 32) {
+      if (tid < 16) {
+        __update(dists, dists_i, tid, tid + 16);
+      }
+      __syncthreads();
+    }
+    if (block_size >= 16) {
+      if (tid < 8) {
+        __update(dists, dists_i, tid, tid + 8);
+      }
+      __syncthreads();
+    }
+    if (block_size >= 8) {
+      if (tid < 4) {
+        __update(dists, dists_i, tid, tid + 4);
+      }
+      __syncthreads();
+    }
+    if (block_size >= 4) {
+      if (tid < 2) {
+        __update(dists, dists_i, tid, tid + 2);
+      }
+      __syncthreads();
+    }
+    if (block_size >= 2) {
+      if (tid < 1) {
+        __update(dists, dists_i, tid, tid + 1);
+      }
+      __syncthreads();
+    }
+
+    old = dists_i[0];
+    if (tid == 0) idxs[j] = old;
+  }
+}
+
+void furthest_point_sampling_kernel_wrapper(int b, int n, int m,
+                                            const float *dataset, float *temp,
+                                            int *idxs) {
+  unsigned int n_threads = opt_n_threads(n);
+
+  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  switch (n_threads) {
+    case 512:
+      furthest_point_sampling_kernel<512>
+          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
+      break;
+    case 256:
+      furthest_point_sampling_kernel<256>
+          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
+      break;
+    case 128:
+      furthest_point_sampling_kernel<128>
+          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
+      break;
+    case 64:
+      furthest_point_sampling_kernel<64>
+          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
+      break;
+    case 32:
+      furthest_point_sampling_kernel<32>
+          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
+      break;
+    case 16:
+      furthest_point_sampling_kernel<16>
+          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
+      break;
+    case 8:
+      furthest_point_sampling_kernel<8>
+          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
+      break;
+    case 4:
+      furthest_point_sampling_kernel<4>
+          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
+      break;
+    case 2:
+      furthest_point_sampling_kernel<2>
+          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
+      break;
+    case 1:
+      furthest_point_sampling_kernel<1>
+          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
+      break;
+    default:
+      furthest_point_sampling_kernel<512>
+          <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
+  }
+
+  CUDA_CHECK_ERRORS();
+}
diff --git a/modules-pytorch-1.9.0/point_4d_convolution.py b/modules-pytorch-1.9.0/point_4d_convolution.py
new file mode 100644
index 0000000..9a89e08
--- /dev/null
+++ b/modules-pytorch-1.9.0/point_4d_convolution.py
@@ -0,0 +1,271 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+import math
+import os
+import sys
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(BASE_DIR)
+
+import pointnet2_utils
+from typing import List
+
+
+class P4DConv(nn.Module):
+    def __init__(self,
+                 in_planes: int,
+                 mlp_planes: List[int],
+                 mlp_batch_norm: List[bool],
+                 mlp_activation: List[bool],
+                 spatial_kernel_size: [float, int],
+                 spatial_stride: int,
+                 temporal_kernel_size: int,
+                 temporal_stride: int = 1,
+                 temporal_padding: [int, int] = [0, 0],
+                 temporal_padding_mode: str = 'replicate',
+                 operator: str = 'addition',
+                 spatial_pooling: str = 'max',
+                 temporal_pooling: str = 'sum',
+                 bias: bool = False):
+
+        super().__init__()
+
+        self.in_planes = in_planes
+        self.mlp_planes = mlp_planes
+
self.mlp_batch_norm = mlp_batch_norm + self.mlp_activation = mlp_activation + + self.r, self.k = spatial_kernel_size + self.spatial_stride = spatial_stride + + self.temporal_kernel_size = temporal_kernel_size + self.temporal_stride = temporal_stride + self.temporal_padding = temporal_padding + self.temporal_padding_mode = temporal_padding_mode + + self.operator = operator + self.spatial_pooling = spatial_pooling + self.temporal_pooling = temporal_pooling + + conv_d = [nn.Conv2d(in_channels=4, out_channels=mlp_planes[0], kernel_size=1, stride=1, padding=0, bias=bias)] + if mlp_batch_norm[0]: + conv_d.append(nn.BatchNorm2d(num_features=mlp_planes[0])) + if mlp_activation[0]: + conv_d.append(nn.ReLU(inplace=True)) + self.conv_d = nn.Sequential(*conv_d) + + if in_planes != 0: + conv_f = [nn.Conv2d(in_channels=in_planes, out_channels=mlp_planes[0], kernel_size=1, stride=1, padding=0, bias=bias)] + if mlp_batch_norm[0]: + conv_f.append(nn.BatchNorm2d(num_features=mlp_planes[0])) + if mlp_activation[0]: + conv_f.append(nn.ReLU(inplace=True)) + self.conv_f = nn.Sequential(*conv_f) + + mlp = [] + for i in range(1, len(mlp_planes)): + if mlp_planes[i] != 0: + mlp.append(nn.Conv2d(in_channels=mlp_planes[i-1], out_channels=mlp_planes[i], kernel_size=1, stride=1, padding=0, bias=bias)) + if mlp_batch_norm[i]: + mlp.append(nn.BatchNorm2d(num_features=mlp_planes[i])) + if mlp_activation[i]: + mlp.append(nn.ReLU(inplace=True)) + self.mlp = nn.Sequential(*mlp) + + + def forward(self, xyzs: torch.Tensor, features: torch.Tensor = None) -> (torch.Tensor, torch.Tensor): + """ + Args: + xyzs: torch.Tensor + (B, T, N, 3) tensor of sequence of the xyz coordinates + features: torch.Tensor + (B, T, C, N) tensor of sequence of the features + """ + device = xyzs.get_device() + + nframes = xyzs.size(1) + npoints = xyzs.size(2) + + assert (self.temporal_kernel_size % 2 == 1), "P4DConv: Temporal kernel size should be odd!" + assert ((nframes + sum(self.temporal_padding) - self.temporal_kernel_size) % self.temporal_stride == 0), "P4DConv: Temporal length error!" 
+ + xyzs = torch.split(tensor=xyzs, split_size_or_sections=1, dim=1) + xyzs = [torch.squeeze(input=xyz, dim=1).contiguous() for xyz in xyzs] + + if self.temporal_padding_mode == 'zeros': + xyz_padding = torch.zeros(xyzs[0].size(), dtype=torch.float32, device=device) + for i in range(self.temporal_padding[0]): + xyzs = [xyz_padding] + xyzs + for i in range(self.temporal_padding[1]): + xyzs = xyzs + [xyz_padding] + else: + for i in range(self.temporal_padding[0]): + xyzs = [xyzs[0]] + xyzs + for i in range(self.temporal_padding[1]): + xyzs = xyzs + [xyzs[-1]] + + if self.in_planes != 0: + features = torch.split(tensor=features, split_size_or_sections=1, dim=1) + features = [torch.squeeze(input=feature, dim=1).contiguous() for feature in features] + + if self.temporal_padding_mode == 'zeros': + feature_padding = torch.zeros(features[0].size(), dtype=torch.float32, device=device) + for i in range(self.temporal_padding[0]): + features = [feature_padding] + features + for i in range(self.temporal_padding[1]): + features = features + [feature_padding] + else: + for i in range(self.temporal_padding[0]): + features = [features[0]] + features + for i in range(self.temporal_padding[1]): + features = features + [features[-1]] + + new_xyzs = [] + new_features = [] + for t in range(self.temporal_kernel_size//2, len(xyzs)-self.temporal_kernel_size//2, self.temporal_stride): # temporal anchor frames + # spatial anchor point subsampling by FPS + anchor_idx = pointnet2_utils.furthest_point_sample(xyzs[t], npoints//self.spatial_stride) # (B, N//self.spatial_stride) + anchor_xyz_flipped = pointnet2_utils.gather_operation(xyzs[t].transpose(1, 2).contiguous(), anchor_idx) # (B, 3, N//self.spatial_stride) + anchor_xyz_expanded = torch.unsqueeze(anchor_xyz_flipped, 3) # (B, 3, N//spatial_stride, 1) + anchor_xyz = anchor_xyz_flipped.transpose(1, 2).contiguous() # (B, N//spatial_stride, 3) + + new_feature = [] + for i in range(t-self.temporal_kernel_size//2, t+self.temporal_kernel_size//2+1): + neighbor_xyz = xyzs[i] + + idx = pointnet2_utils.ball_query(self.r, self.k, neighbor_xyz, anchor_xyz) + + neighbor_xyz_flipped = neighbor_xyz.transpose(1, 2).contiguous() # (B, 3, N) + neighbor_xyz_grouped = pointnet2_utils.grouping_operation(neighbor_xyz_flipped, idx) # (B, 3, N//spatial_stride, k) + + xyz_displacement = neighbor_xyz_grouped - anchor_xyz_expanded # (B, 3, N//spatial_stride, k) + t_displacement = torch.ones((xyz_displacement.size()[0], 1, xyz_displacement.size()[2], xyz_displacement.size()[3]), dtype=torch.float32, device=device) * (i-t) + displacement = torch.cat(tensors=(xyz_displacement, t_displacement), dim=1, out=None) # (B, 4, N//spatial_stride, k) + displacement = self.conv_d(displacement) + + if self.in_planes != 0: + neighbor_feature_grouped = pointnet2_utils.grouping_operation(features[i], idx) # (B, in_planes, N//spatial_stride, k) + feature = self.conv_f(neighbor_feature_grouped) + if self.operator == '+': + feature = feature + displacement + else: + feature = feature * displacement + else: + feature = displacement + + feature = self.mlp(feature) + if self.spatial_pooling == 'max': + feature = torch.max(input=feature, dim=-1, keepdim=False)[0] # (B, out_planes, n) + elif self.spatial_pooling == 'sum': + feature = torch.sum(input=feature, dim=-1, keepdim=False) + else: + feature = torch.mean(input=feature, dim=-1, keepdim=False) + + new_feature.append(feature) + new_feature = torch.stack(tensors=new_feature, dim=1) + if self.temporal_pooling == 'max': + new_feature = torch.max(input=new_feature, 
dim=1, keepdim=False)[0] + elif self.temporal_pooling == 'sum': + new_feature = torch.sum(input=new_feature, dim=1, keepdim=False) + else: + new_feature = torch.mean(input=new_feature, dim=1, keepdim=False) + new_xyzs.append(anchor_xyz) + new_features.append(new_feature) + + new_xyzs = torch.stack(tensors=new_xyzs, dim=1) + new_features = torch.stack(tensors=new_features, dim=1) + + return new_xyzs, new_features + +class P4DTransConv(nn.Module): + def __init__(self, + in_planes: int, + mlp_planes: List[int], + mlp_batch_norm: List[bool], + mlp_activation: List[bool], + original_planes: int = 0, + bias: bool = False): + """ + Args: + in_planes: C'. when point features are not available, in_planes is 0. + out_planes: C" + original_planes: skip connection from original points. when original point features are not available, original_in_planes is 0. + bias: whether to use bias + batch_norm: whether to use batch norm + activation: + """ + super().__init__() + + self.in_planes = in_planes + self.mlp_planes = mlp_planes + self.mlp_batch_norm = mlp_batch_norm + + conv = [] + for i in range(len(mlp_planes)): + if i == 0: + conv.append(nn.Conv1d(in_channels=in_planes+original_planes, out_channels=mlp_planes[i], kernel_size=1, stride=1, padding=0, bias=bias)) + else: + conv.append(nn.Conv1d(in_channels=mlp_planes[i-1], out_channels=mlp_planes[i], kernel_size=1, stride=1, padding=0, bias=bias)) + if mlp_batch_norm[i]: + conv.append(nn.BatchNorm1d(num_features=mlp_planes[i])) + if mlp_activation[i]: + conv.append(nn.ReLU(inplace=True)) + self.conv = nn.Sequential(*conv) + + def forward(self, xyzs: torch.Tensor, original_xyzs: torch.Tensor, features: torch.Tensor, original_features: torch.Tensor = None) -> torch.Tensor: + r""" + Parameters + ---------- + xyzs : torch.Tensor + (B, T, N', 3) tensor of the xyz positions of the convolved features + original_xyzs : torch.Tensor + (B, T, N, 3) tensor of the xyz positions of the original points + features : torch.Tensor + (B, T, C', N') tensor of the features to be propigated to + original_features : torch.Tensor + (B, T, C, N) tensor of original point features for skip connection + + Returns + ------- + new_features : torch.Tensor + (B, T, C", N) tensor of the features of the unknown features + """ + + T = xyzs.size(1) + + xyzs = torch.split(tensor=xyzs, split_size_or_sections=1, dim=1) + xyzs = [torch.squeeze(input=xyz, dim=1).contiguous() for xyz in xyzs] + + features = torch.split(tensor=features, split_size_or_sections=1, dim=1) + features = [torch.squeeze(input=feature, dim=1).contiguous() for feature in features] + + new_xyzs = original_xyzs + + original_xyzs = torch.split(tensor=original_xyzs, split_size_or_sections=1, dim=1) + original_xyzs = [torch.squeeze(input=original_xyz, dim=1).contiguous() for original_xyz in original_xyzs] + + if original_features is not None: + original_features = torch.split(tensor=original_features, split_size_or_sections=1, dim=1) + original_features = [torch.squeeze(input=feature, dim=1).contiguous() for feature in original_features] + + new_features = [] + + for t in range(T): + dist, idx = pointnet2_utils.three_nn(original_xyzs[t], xyzs[t]) + + dist_recip = 1.0 / (dist + 1e-8) + norm = torch.sum(dist_recip, dim=2, keepdim=True) + weight = dist_recip / norm + + interpolated_feat = pointnet2_utils.three_interpolate(features[t], idx, weight) + + if original_features is not None: + new_feature = torch.cat([interpolated_feat, original_features[t]], dim=1) + new_feature = self.conv(new_feature) + 
new_features.append(new_feature) + + new_features = torch.stack(tensors=new_features, dim=1) + + return new_xyzs, new_features diff --git a/modules-pytorch-1.9.0/pointnet2_modules.py b/modules-pytorch-1.9.0/pointnet2_modules.py new file mode 100644 index 0000000..bfb4c3e --- /dev/null +++ b/modules-pytorch-1.9.0/pointnet2_modules.py @@ -0,0 +1,518 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +''' Pointnet2 layers. +Modified based on: https://github.com/erikwijmans/Pointnet2_PyTorch +Extended with the following: +1. Uniform sampling in each local region (sample_uniformly) +2. Return sampled points indices to support votenet. +''' +import torch +import torch.nn as nn +import torch.nn.functional as F + +import os +import sys +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(BASE_DIR) + +import pointnet2_utils +import pytorch_utils as pt_utils +from typing import List + + +class _PointnetSAModuleBase(nn.Module): + + def __init__(self): + super().__init__() + self.npoint = None + self.groupers = None + self.mlps = None + + def forward(self, xyz: torch.Tensor, + features: torch.Tensor = None) -> (torch.Tensor, torch.Tensor): + r""" + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor of the xyz coordinates of the features + features : torch.Tensor + (B, N, C) tensor of the descriptors of the the features + + Returns + ------- + new_xyz : torch.Tensor + (B, npoint, 3) tensor of the new features' xyz + new_features : torch.Tensor + (B, npoint, \sum_k(mlps[k][-1])) tensor of the new_features descriptors + """ + + new_features_list = [] + + xyz_flipped = xyz.transpose(1, 2).contiguous() + new_xyz = pointnet2_utils.gather_operation( + xyz_flipped, + pointnet2_utils.furthest_point_sample(xyz, self.npoint) + ).transpose(1, 2).contiguous() if self.npoint is not None else None + + for i in range(len(self.groupers)): + new_features = self.groupers[i]( + xyz, new_xyz, features + ) # (B, C, npoint, nsample) + + new_features = self.mlps[i]( + new_features + ) # (B, mlp[-1], npoint, nsample) + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint) + + new_features_list.append(new_features) + + return new_xyz, torch.cat(new_features_list, dim=1) + + +class PointnetSAModuleMSG(_PointnetSAModuleBase): + r"""Pointnet set abstrction layer with multiscale grouping + + Parameters + ---------- + npoint : int + Number of features + radii : list of float32 + list of radii to group with + nsamples : list of int32 + Number of samples in each ball query + mlps : list of list of int32 + Spec of the pointnet before the global max_pool for each scale + bn : bool + Use batchnorm + """ + + def __init__( + self, + *, + npoint: int, + radii: List[float], + nsamples: List[int], + mlps: List[List[int]], + bn: bool = True, + use_xyz: bool = True, + sample_uniformly: bool = False + ): + super().__init__() + + assert len(radii) == len(nsamples) == len(mlps) + + self.npoint = npoint + self.groupers = nn.ModuleList() + self.mlps = nn.ModuleList() + for i in range(len(radii)): + radius = radii[i] + nsample = nsamples[i] + self.groupers.append( + pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz, sample_uniformly=sample_uniformly) + if npoint is not None else pointnet2_utils.GroupAll(use_xyz) + ) + mlp_spec = mlps[i] + if use_xyz: 
+ mlp_spec[0] += 3 + + self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn)) + + +class PointnetSAModule(PointnetSAModuleMSG): + r"""Pointnet set abstrction layer + + Parameters + ---------- + npoint : int + Number of features + radius : float + Radius of ball + nsample : int + Number of samples in the ball query + mlp : list + Spec of the pointnet before the global max_pool + bn : bool + Use batchnorm + """ + + def __init__( + self, + *, + mlp: List[int], + npoint: int = None, + radius: float = None, + nsample: int = None, + bn: bool = True, + use_xyz: bool = True + ): + super().__init__( + mlps=[mlp], + npoint=npoint, + radii=[radius], + nsamples=[nsample], + bn=bn, + use_xyz=use_xyz + ) + + +class PointnetSAModuleVotes(nn.Module): + ''' Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG + with extra support for returning point indices for getting their GT votes ''' + + def __init__( + self, + *, + mlp: List[int], + npoint: int = None, + radius: float = None, + nsample: int = None, + bn: bool = True, + use_xyz: bool = True, + pooling: str = 'max', + sigma: float = None, # for RBF pooling + normalize_xyz: bool = False, # noramlize local XYZ with radius + sample_uniformly: bool = False, + ret_unique_cnt: bool = False + ): + super().__init__() + + self.npoint = npoint + self.radius = radius + self.nsample = nsample + self.pooling = pooling + self.mlp_module = None + self.use_xyz = use_xyz + self.sigma = sigma + if self.sigma is None: + self.sigma = self.radius/2 + self.normalize_xyz = normalize_xyz + self.ret_unique_cnt = ret_unique_cnt + + if npoint is not None: + self.grouper = pointnet2_utils.QueryAndGroup(radius, nsample, + use_xyz=use_xyz, ret_grouped_xyz=True, normalize_xyz=normalize_xyz, + sample_uniformly=sample_uniformly, ret_unique_cnt=ret_unique_cnt) + else: + self.grouper = pointnet2_utils.GroupAll(use_xyz, ret_grouped_xyz=True) + + mlp_spec = mlp + if use_xyz and len(mlp_spec)>0: + mlp_spec[0] += 3 + self.mlp_module = pt_utils.SharedMLP(mlp_spec, bn=bn) + + + def forward(self, xyz: torch.Tensor, + features: torch.Tensor = None, + inds: torch.Tensor = None) -> (torch.Tensor, torch.Tensor): + r""" + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor of the xyz coordinates of the features + features : torch.Tensor + (B, C, N) tensor of the descriptors of the the features + inds : torch.Tensor + (B, npoint) tensor that stores index to the xyz points (values in 0-N-1) + + Returns + ------- + new_xyz : torch.Tensor + (B, npoint, 3) tensor of the new features' xyz + new_features : torch.Tensor + (B, \sum_k(mlps[k][-1]), npoint) tensor of the new_features descriptors + inds: torch.Tensor + (B, npoint) tensor of the inds + """ + + xyz_flipped = xyz.transpose(1, 2).contiguous() + if inds is None: + inds = pointnet2_utils.furthest_point_sample(xyz, self.npoint) + else: + assert(inds.shape[1] == self.npoint) + new_xyz = pointnet2_utils.gather_operation( + xyz_flipped, inds + ).transpose(1, 2).contiguous() if self.npoint is not None else None + + if not self.ret_unique_cnt: + grouped_features, grouped_xyz = self.grouper( + xyz, new_xyz, features + ) # (B, C, npoint, nsample) + else: + grouped_features, grouped_xyz, unique_cnt = self.grouper( + xyz, new_xyz, features + ) # (B, C, npoint, nsample), (B,3,npoint,nsample), (B,npoint) + + new_features = self.mlp_module( + grouped_features + ) # (B, mlp[-1], npoint, nsample) + if self.pooling == 'max': + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + elif 
self.pooling == 'avg': + new_features = F.avg_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + elif self.pooling == 'rbf': + # Use radial basis function kernel for weighted sum of features (normalized by nsample and sigma) + # Ref: https://en.wikipedia.org/wiki/Radial_basis_function_kernel + rbf = torch.exp(-1 * grouped_xyz.pow(2).sum(1,keepdim=False) / (self.sigma**2) / 2) # (B, npoint, nsample) + new_features = torch.sum(new_features * rbf.unsqueeze(1), -1, keepdim=True) / float(self.nsample) # (B, mlp[-1], npoint, 1) + new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint) + + if not self.ret_unique_cnt: + return new_xyz, new_features, inds + else: + return new_xyz, new_features, inds, unique_cnt + +class PointnetSAModuleMSGVotes(nn.Module): + ''' Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG + with extra support for returning point indices for getting their GT votes ''' + + def __init__( + self, + *, + mlps: List[List[int]], + npoint: int, + radii: List[float], + nsamples: List[int], + bn: bool = True, + use_xyz: bool = True, + sample_uniformly: bool = False + ): + super().__init__() + + assert(len(mlps) == len(nsamples) == len(radii)) + + self.npoint = npoint + self.groupers = nn.ModuleList() + self.mlps = nn.ModuleList() + for i in range(len(radii)): + radius = radii[i] + nsample = nsamples[i] + self.groupers.append( + pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz, sample_uniformly=sample_uniformly) + if npoint is not None else pointnet2_utils.GroupAll(use_xyz) + ) + mlp_spec = mlps[i] + if use_xyz: + mlp_spec[0] += 3 + + self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn)) + + def forward(self, xyz: torch.Tensor, + features: torch.Tensor = None, inds: torch.Tensor = None) -> (torch.Tensor, torch.Tensor): + r""" + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor of the xyz coordinates of the features + features : torch.Tensor + (B, C, C) tensor of the descriptors of the the features + inds : torch.Tensor + (B, npoint) tensor that stores index to the xyz points (values in 0-N-1) + + Returns + ------- + new_xyz : torch.Tensor + (B, npoint, 3) tensor of the new features' xyz + new_features : torch.Tensor + (B, \sum_k(mlps[k][-1]), npoint) tensor of the new_features descriptors + inds: torch.Tensor + (B, npoint) tensor of the inds + """ + new_features_list = [] + + xyz_flipped = xyz.transpose(1, 2).contiguous() + if inds is None: + inds = pointnet2_utils.furthest_point_sample(xyz, self.npoint) + new_xyz = pointnet2_utils.gather_operation( + xyz_flipped, inds + ).transpose(1, 2).contiguous() if self.npoint is not None else None + + for i in range(len(self.groupers)): + new_features = self.groupers[i]( + xyz, new_xyz, features + ) # (B, C, npoint, nsample) + new_features = self.mlps[i]( + new_features + ) # (B, mlp[-1], npoint, nsample) + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], npoint, 1) + new_features = new_features.squeeze(-1) # (B, mlp[-1], npoint) + + new_features_list.append(new_features) + + return new_xyz, torch.cat(new_features_list, dim=1), inds + + +class PointnetFPModule(nn.Module): + r"""Propigates the features of one set to another + + Parameters + ---------- + mlp : list + Pointnet module parameters + bn : bool + Use batchnorm + """ + + def __init__(self, *, mlp: List[int], bn: bool = True): + super().__init__() + self.mlp = pt_utils.SharedMLP(mlp, bn=bn) + + def forward( + self, unknown: torch.Tensor, known: 
torch.Tensor, + unknow_feats: torch.Tensor, known_feats: torch.Tensor + ) -> torch.Tensor: + r""" + Parameters + ---------- + unknown : torch.Tensor + (B, n, 3) tensor of the xyz positions of the unknown features + known : torch.Tensor + (B, m, 3) tensor of the xyz positions of the known features + unknow_feats : torch.Tensor + (B, C1, n) tensor of the features to be propigated to + known_feats : torch.Tensor + (B, C2, m) tensor of features to be propigated + + Returns + ------- + new_features : torch.Tensor + (B, mlp[-1], n) tensor of the features of the unknown features + """ + + if known is not None: + dist, idx = pointnet2_utils.three_nn(unknown, known) + dist_recip = 1.0 / (dist + 1e-8) + norm = torch.sum(dist_recip, dim=2, keepdim=True) + weight = dist_recip / norm + + interpolated_feats = pointnet2_utils.three_interpolate( + known_feats, idx, weight + ) + else: + interpolated_feats = known_feats.expand( + *known_feats.size()[0:2], unknown.size(1) + ) + + if unknow_feats is not None: + new_features = torch.cat([interpolated_feats, unknow_feats], + dim=1) #(B, C2 + C1, n) + else: + new_features = interpolated_feats + + new_features = new_features.unsqueeze(-1) + new_features = self.mlp(new_features) + + return new_features.squeeze(-1) + +class PointnetLFPModuleMSG(nn.Module): + ''' Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG + learnable feature propagation layer.''' + + def __init__( + self, + *, + mlps: List[List[int]], + radii: List[float], + nsamples: List[int], + post_mlp: List[int], + bn: bool = True, + use_xyz: bool = True, + sample_uniformly: bool = False + ): + super().__init__() + + assert(len(mlps) == len(nsamples) == len(radii)) + + self.post_mlp = pt_utils.SharedMLP(post_mlp, bn=bn) + + self.groupers = nn.ModuleList() + self.mlps = nn.ModuleList() + for i in range(len(radii)): + radius = radii[i] + nsample = nsamples[i] + self.groupers.append( + pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz, + sample_uniformly=sample_uniformly) + ) + mlp_spec = mlps[i] + if use_xyz: + mlp_spec[0] += 3 + + self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn)) + + def forward(self, xyz2: torch.Tensor, xyz1: torch.Tensor, + features2: torch.Tensor, features1: torch.Tensor) -> torch.Tensor: + r""" Propagate features from xyz1 to xyz2. 
+ Parameters + ---------- + xyz2 : torch.Tensor + (B, N2, 3) tensor of the xyz coordinates of the features + xyz1 : torch.Tensor + (B, N1, 3) tensor of the xyz coordinates of the features + features2 : torch.Tensor + (B, C2, N2) tensor of the descriptors of the the features + features1 : torch.Tensor + (B, C1, N1) tensor of the descriptors of the the features + + Returns + ------- + new_features1 : torch.Tensor + (B, \sum_k(mlps[k][-1]), N1) tensor of the new_features descriptors + """ + new_features_list = [] + + for i in range(len(self.groupers)): + new_features = self.groupers[i]( + xyz1, xyz2, features1 + ) # (B, C1, N2, nsample) + new_features = self.mlps[i]( + new_features + ) # (B, mlp[-1], N2, nsample) + new_features = F.max_pool2d( + new_features, kernel_size=[1, new_features.size(3)] + ) # (B, mlp[-1], N2, 1) + new_features = new_features.squeeze(-1) # (B, mlp[-1], N2) + + if features2 is not None: + new_features = torch.cat([new_features, features2], + dim=1) #(B, mlp[-1] + C2, N2) + + new_features = new_features.unsqueeze(-1) + new_features = self.post_mlp(new_features) + + new_features_list.append(new_features) + + return torch.cat(new_features_list, dim=1).squeeze(-1) + + +if __name__ == "__main__": + from torch.autograd import Variable + torch.manual_seed(1) + torch.cuda.manual_seed_all(1) + xyz = Variable(torch.randn(2, 9, 3).cuda(), requires_grad=True) + xyz_feats = Variable(torch.randn(2, 9, 6).cuda(), requires_grad=True) + + test_module = PointnetSAModuleMSG( + npoint=2, radii=[5.0, 10.0], nsamples=[6, 3], mlps=[[9, 3], [9, 6]] + ) + test_module.cuda() + print(test_module(xyz, xyz_feats)) + + for _ in range(1): + _, new_features = test_module(xyz, xyz_feats) + new_features.backward( + torch.cuda.FloatTensor(*new_features.size()).fill_(1) + ) + print(new_features) + print(xyz.grad) diff --git a/modules-pytorch-1.9.0/pointnet2_test.py b/modules-pytorch-1.9.0/pointnet2_test.py new file mode 100644 index 0000000..be60b28 --- /dev/null +++ b/modules-pytorch-1.9.0/pointnet2_test.py @@ -0,0 +1,33 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +''' Testing customized ops. ''' + +import torch +from torch.autograd import gradcheck +import numpy as np + +import os +import sys +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(BASE_DIR) +import pointnet2_utils + +def test_interpolation_grad(): + batch_size = 1 + feat_dim = 2 + m = 4 + feats = torch.randn(batch_size, feat_dim, m, requires_grad=True).float().cuda() + + def interpolate_func(inputs): + idx = torch.from_numpy(np.array([[[0,1,2],[1,2,3]]])).int().cuda() + weight = torch.from_numpy(np.array([[[1,1,1],[2,2,2]]])).float().cuda() + interpolated_feats = pointnet2_utils.three_interpolate(inputs, idx, weight) + return interpolated_feats + + assert (gradcheck(interpolate_func, feats, atol=1e-1, rtol=1e-1)) + +if __name__=='__main__': + test_interpolation_grad() diff --git a/modules-pytorch-1.9.0/pointnet2_utils.py b/modules-pytorch-1.9.0/pointnet2_utils.py new file mode 100644 index 0000000..7ade909 --- /dev/null +++ b/modules-pytorch-1.9.0/pointnet2_utils.py @@ -0,0 +1,425 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +''' Modified based on: https://github.com/erikwijmans/Pointnet2_PyTorch ''' +from __future__ import ( + division, + absolute_import, + with_statement, + print_function, + unicode_literals, +) +import torch +from torch.autograd import Function +import torch.nn as nn +import pytorch_utils as pt_utils +import sys + +try: + import builtins +except: + import __builtin__ as builtins + +try: + import pointnet2._ext as _ext +except ImportError: + if not getattr(builtins, "__POINTNET2_SETUP__", False): + raise ImportError( + "Could not import _ext module.\n" + "Please see the setup instructions in the README: " + "https://github.com/erikwijmans/Pointnet2_PyTorch/blob/master/README.rst" + ) + +if False: + # Workaround for type hints without depending on the `typing` module + from typing import * + + +class RandomDropout(nn.Module): + def __init__(self, p=0.5, inplace=False): + super(RandomDropout, self).__init__() + self.p = p + self.inplace = inplace + + def forward(self, X): + theta = torch.Tensor(1).uniform_(0, self.p)[0] + return pt_utils.feature_dropout_no_scaling(X, theta, self.train, self.inplace) + + +class FurthestPointSampling(Function): + @staticmethod + def forward(ctx, xyz, npoint): + # type: (Any, torch.Tensor, int) -> torch.Tensor + r""" + Uses iterative furthest point sampling to select a set of npoint features that have the largest + minimum distance + + Parameters + ---------- + xyz : torch.Tensor + (B, N, 3) tensor where N > npoint + npoint : int32 + number of features in the sampled set + + Returns + ------- + torch.Tensor + (B, npoint) tensor containing the set + """ + fps_inds = _ext.furthest_point_sampling(xyz, npoint) + ctx.mark_non_differentiable(fps_inds) + return fps_inds + + @staticmethod + def backward(xyz, a=None): + return None, None + + +furthest_point_sample = FurthestPointSampling.apply + + +class GatherOperation(Function): + @staticmethod + def forward(ctx, features, idx): + # type: (Any, torch.Tensor, torch.Tensor) -> torch.Tensor + r""" + + Parameters + ---------- + features : torch.Tensor + (B, C, N) tensor + + idx : torch.Tensor + (B, npoint) tensor of the features to gather + + Returns + ------- + torch.Tensor + (B, C, npoint) tensor + """ + + _, C, N = features.size() + + ctx.for_backwards = (idx, C, N) + + return _ext.gather_points(features, idx) + + @staticmethod + def backward(ctx, grad_out): + idx, C, N = ctx.for_backwards + + grad_features = _ext.gather_points_grad(grad_out.contiguous(), idx, N) + return grad_features, None + + +gather_operation = GatherOperation.apply + + +class ThreeNN(Function): + @staticmethod + def forward(ctx, unknown, known): + # type: (Any, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor] + r""" + Find the three nearest neighbors of unknown in known + Parameters + ---------- + unknown : torch.Tensor + (B, n, 3) tensor of known features + known : torch.Tensor + (B, m, 3) tensor of unknown features + + Returns + ------- + dist : torch.Tensor + (B, n, 3) l2 distance to the three nearest neighbors + idx : torch.Tensor + (B, n, 3) index of 3 nearest neighbors + """ + dist2, idx = _ext.three_nn(unknown, known) + + return torch.sqrt(dist2), idx + + @staticmethod + def backward(ctx, a=None, b=None): + return None, None + + +three_nn = ThreeNN.apply + + +class ThreeInterpolate(Function): + @staticmethod + def forward(ctx, features, idx, weight): + # type(Any, torch.Tensor, torch.Tensor, torch.Tensor) -> Torch.Tensor + r""" + Performs weight linear interpolation on 3 features + Parameters + ---------- + features : 
torch.Tensor + (B, c, m) Features descriptors to be interpolated from + idx : torch.Tensor + (B, n, 3) three nearest neighbors of the target features in features + weight : torch.Tensor + (B, n, 3) weights + + Returns + ------- + torch.Tensor + (B, c, n) tensor of the interpolated features + """ + B, c, m = features.size() + n = idx.size(1) + + ctx.three_interpolate_for_backward = (idx, weight, m) + + return _ext.three_interpolate(features, idx, weight) + + @staticmethod + def backward(ctx, grad_out): + # type: (Any, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor] + r""" + Parameters + ---------- + grad_out : torch.Tensor + (B, c, n) tensor with gradients of ouputs + + Returns + ------- + grad_features : torch.Tensor + (B, c, m) tensor with gradients of features + + None + + None + """ + idx, weight, m = ctx.three_interpolate_for_backward + + grad_features = _ext.three_interpolate_grad( + grad_out.contiguous(), idx, weight, m + ) + + return grad_features, None, None + + +three_interpolate = ThreeInterpolate.apply + + +class GroupingOperation(Function): + @staticmethod + def forward(ctx, features, idx): + # type: (Any, torch.Tensor, torch.Tensor) -> torch.Tensor + r""" + + Parameters + ---------- + features : torch.Tensor + (B, C, N) tensor of features to group + idx : torch.Tensor + (B, npoint, nsample) tensor containing the indicies of features to group with + + Returns + ------- + torch.Tensor + (B, C, npoint, nsample) tensor + """ + B, nfeatures, nsample = idx.size() + _, C, N = features.size() + + ctx.for_backwards = (idx, N) + + return _ext.group_points(features, idx) + + @staticmethod + def backward(ctx, grad_out): + # type: (Any, torch.tensor) -> Tuple[torch.Tensor, torch.Tensor] + r""" + + Parameters + ---------- + grad_out : torch.Tensor + (B, C, npoint, nsample) tensor of the gradients of the output from forward + + Returns + ------- + torch.Tensor + (B, C, N) gradient of the features + None + """ + idx, N = ctx.for_backwards + + grad_features = _ext.group_points_grad(grad_out.contiguous(), idx, N) + + return grad_features, None + + +grouping_operation = GroupingOperation.apply + + +class BallQuery(Function): + @staticmethod + def forward(ctx, radius, nsample, xyz, new_xyz): + # type: (Any, float, int, torch.Tensor, torch.Tensor) -> torch.Tensor + r""" + + Parameters + ---------- + radius : float + radius of the balls + nsample : int + maximum number of features in the balls + xyz : torch.Tensor + (B, N, 3) xyz coordinates of the features + new_xyz : torch.Tensor + (B, npoint, 3) centers of the ball query + + Returns + ------- + torch.Tensor + (B, npoint, nsample) tensor with the indicies of the features that form the query balls + """ + inds = _ext.ball_query(new_xyz, xyz, radius, nsample) + ctx.mark_non_differentiable(inds) + return inds + + @staticmethod + def backward(ctx, a=None): + return None, None, None, None + + +ball_query = BallQuery.apply + + +class QueryAndGroup(nn.Module): + r""" + Groups with a ball query of radius + + Parameters + --------- + radius : float32 + Radius of ball + nsample : int32 + Maximum number of features to gather in the ball + """ + + def __init__(self, radius, nsample, use_xyz=True, ret_grouped_xyz=False, normalize_xyz=False, sample_uniformly=False, ret_unique_cnt=False): + # type: (QueryAndGroup, float, int, bool) -> None + super(QueryAndGroup, self).__init__() + self.radius, self.nsample, self.use_xyz = radius, nsample, use_xyz + self.ret_grouped_xyz = ret_grouped_xyz + self.normalize_xyz = normalize_xyz + 
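        # normalize_xyz divides the grouped offsets by the ball radius, so the
+        # relative coordinates lie in [-1, 1]; sample_uniformly re-draws the padded
+        # entries of each ball query uniformly from the region's unique neighbors
+        # instead of keeping ball_query's repeated fill.
+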
self.sample_uniformly = sample_uniformly + self.ret_unique_cnt = ret_unique_cnt + if self.ret_unique_cnt: + assert(self.sample_uniformly) + + def forward(self, xyz, new_xyz, features=None): + # type: (QueryAndGroup, torch.Tensor. torch.Tensor, torch.Tensor) -> Tuple[Torch.Tensor] + r""" + Parameters + ---------- + xyz : torch.Tensor + xyz coordinates of the features (B, N, 3) + new_xyz : torch.Tensor + centriods (B, npoint, 3) + features : torch.Tensor + Descriptors of the features (B, C, N) + + Returns + ------- + new_features : torch.Tensor + (B, 3 + C, npoint, nsample) tensor + """ + idx = ball_query(self.radius, self.nsample, xyz, new_xyz) + + if self.sample_uniformly: + unique_cnt = torch.zeros((idx.shape[0], idx.shape[1])) + for i_batch in range(idx.shape[0]): + for i_region in range(idx.shape[1]): + unique_ind = torch.unique(idx[i_batch, i_region, :]) + num_unique = unique_ind.shape[0] + unique_cnt[i_batch, i_region] = num_unique + sample_ind = torch.randint(0, num_unique, (self.nsample - num_unique,), dtype=torch.long) + all_ind = torch.cat((unique_ind, unique_ind[sample_ind])) + idx[i_batch, i_region, :] = all_ind + + + xyz_trans = xyz.transpose(1, 2).contiguous() + grouped_xyz = grouping_operation(xyz_trans, idx) # (B, 3, npoint, nsample) + grouped_xyz -= new_xyz.transpose(1, 2).unsqueeze(-1) + if self.normalize_xyz: + grouped_xyz /= self.radius + + if features is not None: + grouped_features = grouping_operation(features, idx) + if self.use_xyz: + new_features = torch.cat( + [grouped_xyz, grouped_features], dim=1 + ) # (B, C + 3, npoint, nsample) + else: + new_features = grouped_features + else: + assert ( + self.use_xyz + ), "Cannot have not features and not use xyz as a feature!" + new_features = grouped_xyz + + ret = [new_features] + if self.ret_grouped_xyz: + ret.append(grouped_xyz) + if self.ret_unique_cnt: + ret.append(unique_cnt) + if len(ret) == 1: + return ret[0] + else: + return tuple(ret) + + +class GroupAll(nn.Module): + r""" + Groups all features + + Parameters + --------- + """ + + def __init__(self, use_xyz=True, ret_grouped_xyz=False): + # type: (GroupAll, bool) -> None + super(GroupAll, self).__init__() + self.use_xyz = use_xyz + + def forward(self, xyz, new_xyz, features=None): + # type: (GroupAll, torch.Tensor, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor] + r""" + Parameters + ---------- + xyz : torch.Tensor + xyz coordinates of the features (B, N, 3) + new_xyz : torch.Tensor + Ignored + features : torch.Tensor + Descriptors of the features (B, C, N) + + Returns + ------- + new_features : torch.Tensor + (B, C + 3, 1, N) tensor + """ + + grouped_xyz = xyz.transpose(1, 2).unsqueeze(2) + if features is not None: + grouped_features = features.unsqueeze(2) + if self.use_xyz: + new_features = torch.cat( + [grouped_xyz, grouped_features], dim=1 + ) # (B, 3 + C, 1, N) + else: + new_features = grouped_features + else: + new_features = grouped_xyz + + if self.ret_grouped_xyz: + return new_features, grouped_xyz + else: + return new_features diff --git a/modules-pytorch-1.9.0/pytorch_utils.py b/modules-pytorch-1.9.0/pytorch_utils.py new file mode 100644 index 0000000..b9c9263 --- /dev/null +++ b/modules-pytorch-1.9.0/pytorch_utils.py @@ -0,0 +1,298 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +''' Modified based on Ref: https://github.com/erikwijmans/Pointnet2_PyTorch ''' +import torch +import torch.nn as nn +from typing import List, Tuple + +class SharedMLP(nn.Sequential): + + def __init__( + self, + args: List[int], + *, + bn: bool = False, + activation=nn.ReLU(inplace=True), + preact: bool = False, + first: bool = False, + name: str = "" + ): + super().__init__() + + for i in range(len(args) - 1): + self.add_module( + name + 'layer{}'.format(i), + Conv2d( + args[i], + args[i + 1], + bn=(not first or not preact or (i != 0)) and bn, + activation=activation + if (not first or not preact or (i != 0)) else None, + preact=preact + ) + ) + + +class _BNBase(nn.Sequential): + + def __init__(self, in_size, batch_norm=None, name=""): + super().__init__() + self.add_module(name + "bn", batch_norm(in_size)) + + nn.init.constant_(self[0].weight, 1.0) + nn.init.constant_(self[0].bias, 0) + + +class BatchNorm1d(_BNBase): + + def __init__(self, in_size: int, *, name: str = ""): + super().__init__(in_size, batch_norm=nn.BatchNorm1d, name=name) + + +class BatchNorm2d(_BNBase): + + def __init__(self, in_size: int, name: str = ""): + super().__init__(in_size, batch_norm=nn.BatchNorm2d, name=name) + + +class BatchNorm3d(_BNBase): + + def __init__(self, in_size: int, name: str = ""): + super().__init__(in_size, batch_norm=nn.BatchNorm3d, name=name) + + +class _ConvBase(nn.Sequential): + + def __init__( + self, + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=None, + batch_norm=None, + bias=True, + preact=False, + name="" + ): + super().__init__() + + bias = bias and (not bn) + conv_unit = conv( + in_size, + out_size, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=bias + ) + init(conv_unit.weight) + if bias: + nn.init.constant_(conv_unit.bias, 0) + + if bn: + if not preact: + bn_unit = batch_norm(out_size) + else: + bn_unit = batch_norm(in_size) + + if preact: + if bn: + self.add_module(name + 'bn', bn_unit) + + if activation is not None: + self.add_module(name + 'activation', activation) + + self.add_module(name + 'conv', conv_unit) + + if not preact: + if bn: + self.add_module(name + 'bn', bn_unit) + + if activation is not None: + self.add_module(name + 'activation', activation) + + +class Conv1d(_ConvBase): + + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: int = 1, + stride: int = 1, + padding: int = 0, + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "" + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv1d, + batch_norm=BatchNorm1d, + bias=bias, + preact=preact, + name=name + ) + + +class Conv2d(_ConvBase): + + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: Tuple[int, int] = (1, 1), + stride: Tuple[int, int] = (1, 1), + padding: Tuple[int, int] = (0, 0), + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "" + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv2d, + batch_norm=BatchNorm2d, + bias=bias, + preact=preact, + name=name + ) + + +class Conv3d(_ConvBase): + + def __init__( + self, + in_size: int, + out_size: int, + *, + kernel_size: Tuple[int, int, int] = (1, 1, 1), + stride: Tuple[int, int, int] = (1, 1, 1), + padding: 
Tuple[int, int, int] = (0, 0, 0), + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=nn.init.kaiming_normal_, + bias: bool = True, + preact: bool = False, + name: str = "" + ): + super().__init__( + in_size, + out_size, + kernel_size, + stride, + padding, + activation, + bn, + init, + conv=nn.Conv3d, + batch_norm=BatchNorm3d, + bias=bias, + preact=preact, + name=name + ) + + +class FC(nn.Sequential): + + def __init__( + self, + in_size: int, + out_size: int, + *, + activation=nn.ReLU(inplace=True), + bn: bool = False, + init=None, + preact: bool = False, + name: str = "" + ): + super().__init__() + + fc = nn.Linear(in_size, out_size, bias=not bn) + if init is not None: + init(fc.weight) + if not bn: + nn.init.constant_(fc.bias, 0) + + if preact: + if bn: + self.add_module(name + 'bn', BatchNorm1d(in_size)) + + if activation is not None: + self.add_module(name + 'activation', activation) + + self.add_module(name + 'fc', fc) + + if not preact: + if bn: + self.add_module(name + 'bn', BatchNorm1d(out_size)) + + if activation is not None: + self.add_module(name + 'activation', activation) + +def set_bn_momentum_default(bn_momentum): + + def fn(m): + if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)): + m.momentum = bn_momentum + + return fn + + +class BNMomentumScheduler(object): + + def __init__( + self, model, bn_lambda, last_epoch=-1, + setter=set_bn_momentum_default + ): + if not isinstance(model, nn.Module): + raise RuntimeError( + "Class '{}' is not a PyTorch nn Module".format( + type(model).__name__ + ) + ) + + self.model = model + self.setter = setter + self.lmbd = bn_lambda + + self.step(last_epoch + 1) + self.last_epoch = last_epoch + + def step(self, epoch=None): + if epoch is None: + epoch = self.last_epoch + 1 + + self.last_epoch = epoch + self.model.apply(self.setter(self.lmbd(epoch))) + + diff --git a/modules-pytorch-1.9.0/setup.py b/modules-pytorch-1.9.0/setup.py new file mode 100644 index 0000000..d7ebc7b --- /dev/null +++ b/modules-pytorch-1.9.0/setup.py @@ -0,0 +1,34 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension +import glob +import os + +_ext_src_root = "_ext_src" +_ext_sources = glob.glob("{}/src/*.cpp".format(_ext_src_root)) + glob.glob( + "{}/src/*.cu".format(_ext_src_root) +) +_ext_headers = glob.glob("{}/include/*".format(_ext_src_root)) + +headers = "-I" + os.path.join(os.path.dirname(os.path.abspath(__file__)), '_ext_src', 'include') + +setup( + name='pointnet2', + ext_modules=[ + CUDAExtension( + name='pointnet2._ext', + sources=_ext_sources, + extra_compile_args={ + "cxx": ["-O2", headers], + "nvcc": ["-O2", headers], + }, + ) + ], + cmdclass={ + 'build_ext': BuildExtension + } +) diff --git a/scheduler.py b/scheduler.py new file mode 100644 index 0000000..f0f862d --- /dev/null +++ b/scheduler.py @@ -0,0 +1,47 @@ +import torch +from bisect import bisect_right + + +class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): + def __init__( + self, + optimizer, + milestones, + gamma=0.1, + warmup_factor=1.0 / 3, + warmup_iters=5, + warmup_method="linear", + last_epoch=-1, + ): + if not milestones == sorted(milestones): + raise ValueError( + "Milestones should be a list of" " increasing integers. 
Got {}", + milestones, + ) + + if warmup_method not in ("constant", "linear"): + raise ValueError( + "Only 'constant' or 'linear' warmup_method accepted" + "got {}".format(warmup_method) + ) + self.milestones = milestones + self.gamma = gamma + self.warmup_factor = warmup_factor + self.warmup_iters = warmup_iters + self.warmup_method = warmup_method + super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) + + def get_lr(self): + warmup_factor = 1 + if self.last_epoch < self.warmup_iters: + if self.warmup_method == "constant": + warmup_factor = self.warmup_factor + elif self.warmup_method == "linear": + alpha = float(self.last_epoch) / self.warmup_iters + warmup_factor = self.warmup_factor * (1 - alpha) + alpha + return [ + base_lr * + warmup_factor * + self.gamma ** bisect_right(self.milestones, self.last_epoch) + for base_lr in self.base_lrs + ] diff --git a/train-msr.py b/train-msr.py new file mode 100644 index 0000000..b806e10 --- /dev/null +++ b/train-msr.py @@ -0,0 +1,257 @@ +from __future__ import print_function +import datetime +import os +import time +import sys +import numpy as np +import torch +import torch.utils.data +from torch.utils.data.dataloader import default_collate +from torch import nn +import torch.nn.functional as F +import torchvision +from torchvision import transforms + +import utils + +from scheduler import WarmupMultiStepLR + +from datasets.msr import MSRAction3D +import models.video as Models + +def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, print_freq): + model.train() + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value}')) + metric_logger.add_meter('clips/s', utils.SmoothedValue(window_size=10, fmt='{value:.3f}')) + + header = 'Epoch: [{}]'.format(epoch) + for clip, target, _ in metric_logger.log_every(data_loader, print_freq, header): + start_time = time.time() + clip, target = clip.to(device), target.to(device) + output = model(clip) + loss = criterion(output, target) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + + acc1, acc5 = utils.accuracy(output, target, topk=(1, 5)) + batch_size = clip.shape[0] + metric_logger.update(loss=loss.item(), lr=optimizer.param_groups[0]["lr"]) + metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) + metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) + metric_logger.meters['clips/s'].update(batch_size / (time.time() - start_time)) + lr_scheduler.step() + sys.stdout.flush() + +def evaluate(model, criterion, data_loader, device): + model.eval() + metric_logger = utils.MetricLogger(delimiter=" ") + header = 'Test:' + video_prob = {} + video_label = {} + with torch.no_grad(): + for clip, target, video_idx in metric_logger.log_every(data_loader, 100, header): + clip = clip.to(device, non_blocking=True) + target = target.to(device, non_blocking=True) + output = model(clip) + loss = criterion(output, target) + + acc1, acc5 = utils.accuracy(output, target, topk=(1, 5)) + prob = F.softmax(input=output, dim=1) + + # FIXME need to take into account that the datasets + # could have been padded in distributed setup + batch_size = clip.shape[0] + target = target.cpu().numpy() + video_idx = video_idx.cpu().numpy() + prob = prob.cpu().numpy() + for i in range(0, batch_size): + idx = video_idx[i] + if idx in video_prob: + video_prob[idx] += prob[i] + else: + video_prob[idx] = prob[i] + video_label[idx] = target[i] + metric_logger.update(loss=loss.item()) + 
metric_logger.meters['acc1'].update(acc1.item(), n=batch_size) + metric_logger.meters['acc5'].update(acc5.item(), n=batch_size) + # gather the stats from all processes + metric_logger.synchronize_between_processes() + + print(' * Clip Acc@1 {top1.global_avg:.3f} Clip Acc@5 {top5.global_avg:.3f}'.format(top1=metric_logger.acc1, top5=metric_logger.acc5)) + + # video level prediction + video_pred = {k: np.argmax(v) for k, v in video_prob.items()} + pred_correct = [video_pred[k]==video_label[k] for k in video_pred] + total_acc = np.mean(pred_correct) + + class_count = [0] * data_loader.dataset.num_classes + class_correct = [0] * data_loader.dataset.num_classes + + for k, v in video_pred.items(): + label = video_label[k] + class_count[label] += 1 + class_correct[label] += (v==label) + class_acc = [c/float(s) for c, s in zip(class_correct, class_count)] + + print(' * Video Acc@1 %f'%total_acc) + print(' * Class Acc@1 %s'%str(class_acc)) + + return total_acc + + +def main(args): + + if args.output_dir: + utils.mkdir(args.output_dir) + + print(args) + print("torch version: ", torch.__version__) + print("torchvision version: ", torchvision.__version__) + + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed(args.seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + device = torch.device('cuda') + + # Data loading code + print("Loading data") + + st = time.time() + + dataset = MSRAction3D( + root=args.data_path, + frames_per_clip=args.clip_len, + step_between_clips=1, + num_points=args.num_points, + train=True + ) + + dataset_test = MSRAction3D( + root=args.data_path, + frames_per_clip=args.clip_len, + step_between_clips=1, + num_points=args.num_points, + train=False + ) + + print("Creating data loaders") + + data_loader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) + + data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=args.batch_size, num_workers=args.workers, pin_memory=True) + + print("Creating model") + Model = getattr(Models, args.model) + model = Model(radius=args.radius, nsamples=args.nsamples, spatial_stride=args.spatial_stride, + temporal_kernel_size=args.temporal_kernel_size, temporal_stride=args.temporal_stride, + emb_relu=args.emb_relu, + dim=args.dim, depth=args.depth, heads=args.heads, dim_head=args.dim_head, + mlp_dim=args.mlp_dim, num_classes=dataset.num_classes) + + if torch.cuda.device_count() > 1: + model = nn.DataParallel(model) + model.to(device) + + criterion = nn.CrossEntropyLoss() + + lr = args.lr + optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=args.momentum, weight_decay=args.weight_decay) + + # convert scheduler to be per iteration, not per epoch, for warmup that lasts + # between different epochs + warmup_iters = args.lr_warmup_epochs * len(data_loader) + lr_milestones = [len(data_loader) * m for m in args.lr_milestones] + lr_scheduler = WarmupMultiStepLR(optimizer, milestones=lr_milestones, gamma=args.lr_gamma, warmup_iters=warmup_iters, warmup_factor=1e-5) + + model_without_ddp = model + + if args.resume: + checkpoint = torch.load(args.resume, map_location='cpu') + model_without_ddp.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + args.start_epoch = checkpoint['epoch'] + 1 + + + print("Start training") + start_time = time.time() + acc = 0 + for epoch in range(args.start_epoch, 
args.epochs):
+        train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, args.print_freq)
+
+        acc = max(acc, evaluate(model, criterion, data_loader_test, device=device))
+
+        if args.output_dir:
+            checkpoint = {
+                'model': model_without_ddp.state_dict(),
+                'optimizer': optimizer.state_dict(),
+                'lr_scheduler': lr_scheduler.state_dict(),
+                'epoch': epoch,
+                'args': args}
+            utils.save_on_master(
+                checkpoint,
+                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
+            utils.save_on_master(
+                checkpoint,
+                os.path.join(args.output_dir, 'checkpoint.pth'))
+
+    total_time = time.time() - start_time
+    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+    print('Training time {}'.format(total_time_str))
+    print('Accuracy {}'.format(acc))
+
+
+def parse_args():
+    import argparse
+    parser = argparse.ArgumentParser(description='PST-Transformer Model Training')
+
+    parser.add_argument('--data-path', default='/scratch/HeheFan-data/MSR-Action3D', type=str, help='dataset')
+    parser.add_argument('--seed', default=0, type=int, help='random seed')
+    parser.add_argument('--model', default='PSTTransformer', type=str, help='model')
+    # input
+    parser.add_argument('--clip-len', default=24, type=int, metavar='N', help='number of frames per clip')
+    parser.add_argument('--num-points', default=2048, type=int, metavar='N', help='number of points per frame')
+    # P4D
+    parser.add_argument('--radius', default=0.7, type=float, help='radius for the ball query')
+    parser.add_argument('--nsamples', default=32, type=int, help='number of neighbors for the ball query')
+    parser.add_argument('--spatial-stride', default=32, type=int, help='spatial subsampling rate')
+    parser.add_argument('--temporal-kernel-size', default=3, type=int, help='temporal kernel size')
+    parser.add_argument('--temporal-stride', default=2, type=int, help='temporal stride')
+    # embedding
+    parser.add_argument('--emb-relu', default=False, action='store_true')
+    # transformer
+    parser.add_argument('--dim', default=1024, type=int, help='transformer dim')
+    parser.add_argument('--depth', default=5, type=int, help='transformer depth')
+    parser.add_argument('--heads', default=8, type=int, help='transformer head')
+    parser.add_argument('--dim-head', default=128, type=int, help='transformer dim for each head')
+    parser.add_argument('--mlp-dim', default=2048, type=int, help='transformer mlp dim')
+    # training
+    parser.add_argument('-b', '--batch-size', default=14, type=int)
+    parser.add_argument('--epochs', default=50, type=int, metavar='N', help='number of total epochs to run')
+    parser.add_argument('-j', '--workers', default=10, type=int, metavar='N', help='number of data loading workers (default: 10)')
+    parser.add_argument('--lr', default=0.01, type=float, help='initial learning rate')
+    parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum')
+    parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, metavar='W', help='weight decay (default: 1e-4)', dest='weight_decay')
+    parser.add_argument('--lr-milestones', nargs='+', default=[20, 30], type=int, help='decrease lr on milestones')
+    parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma')
+    parser.add_argument('--lr-warmup-epochs', default=10, type=int, help='number of warmup epochs')
+    # output
+    parser.add_argument('--print-freq', default=10, type=int, help='print frequency')
+    parser.add_argument('--output-dir', default='', type=str, help='path where to save')
+    # resume
+    parser.add_argument('--resume', default='', help='resume from checkpoint')
+    parser.add_argument('--start-epoch', default=0, type=int, metavar='N', help='start epoch')
+
+    args = parser.parse_args()
+
+    return args
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/train-syn.py b/train-syn.py
new file mode 100644
index 0000000..d8deb03
--- /dev/null
+++ b/train-syn.py
@@ -0,0 +1,267 @@
+from __future__ import print_function
+import datetime
+import os
+import time
+import sys
+import numpy as np
+import torch
+import torch.utils.data
+from torch.utils.data.dataloader import default_collate
+from torch import nn
+import torch.nn.functional as F
+import torchvision
+from torchvision import transforms
+
+import utils
+
+from scheduler import WarmupMultiStepLR
+
+from datasets.synthia import *
+import models.point as Models
+
+
+def train_one_epoch(model, criterion, optimizer, lr_scheduler, data_loader, device, epoch, print_freq):
+    model.train()
+    metric_logger = utils.MetricLogger(delimiter="  ")
+    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value}'))
+
+    header = 'Epoch: [{}]'.format(epoch)
+    for pc1, rgb1, label1, mask1, pc2, rgb2, label2, mask2 in metric_logger.log_every(data_loader, print_freq, header):
+        start_time = time.time()
+
+        pc1, rgb1, label1, mask1 = pc1.to(device), rgb1.to(device), label1.to(device), mask1.to(device)
+        output1 = model(pc1, rgb1).transpose(1, 2)
+        loss1 = criterion(output1, label1)*mask1
+        loss1 = torch.sum(loss1) / (torch.sum(mask1) + 1)
+        optimizer.zero_grad()
+        loss1.backward()
+        optimizer.step()
+
+        pc2, rgb2, label2, mask2 = pc2.to(device), rgb2.to(device), label2.to(device), mask2.to(device)
+        output2 = model(pc2, rgb2).transpose(1, 2)
+        loss2 = criterion(output2, label2)*mask2
+        loss2 = torch.sum(loss2) / (torch.sum(mask2) + 1)
+        optimizer.zero_grad()
+        loss2.backward()
+        optimizer.step()
+
+        metric_logger.update(loss=(loss1.item()+loss2.item())/2.0, lr=optimizer.param_groups[0]["lr"])
+        lr_scheduler.step()
+        sys.stdout.flush()
+
+def evaluate(model, criterion, data_loader, device, print_freq):
+    model.eval()
+    metric_logger = utils.MetricLogger(delimiter="  ")
+    header = 'Test:'
+    total_loss = 0
+    total_correct = 0
+    total_seen = 0
+    total_pred_class = [0] * 12
+    total_correct_class = [0] * 12
+    total_class = [0] * 12
+
+    with torch.no_grad():
+        for pc1, rgb1, label1, mask1, pc2, rgb2, label2, mask2 in metric_logger.log_every(data_loader, print_freq, header):
+            pc1, rgb1 = pc1.to(device), rgb1.to(device)
+            output1 = model(pc1, rgb1).transpose(1, 2)
+            loss1 = criterion(output1, label1.to(device))*mask1.to(device)
+            loss1 = torch.sum(loss1) / (torch.sum(mask1.to(device)) + 1)
+            label1, mask1 = label1.numpy().astype(np.int32), mask1.numpy().astype(np.int32)
+            output1 = output1.cpu().numpy()
+            pred1 = np.argmax(output1, 1)  # BxTxN
+            correct1 = np.sum((pred1 == label1) * mask1)
+            total_correct += correct1
+            total_seen += np.sum(mask1)
+            for c in range(12):
+                total_pred_class[c] += np.sum(((pred1==c) | (label1==c)) & mask1)
+                total_correct_class[c] += np.sum((pred1==c) & (label1==c) & mask1)
+                total_class[c] += np.sum((label1==c) & mask1)
+
+            pc2, rgb2 = pc2.to(device), rgb2.to(device)
+            output2 = model(pc2, rgb2).transpose(1, 2)
+            loss2 = criterion(output2, label2.to(device))*mask2.to(device)
+            loss2 = torch.sum(loss2) / (torch.sum(mask2.to(device)) + 1)
+            label2, mask2 = label2.numpy().astype(np.int32), mask2.numpy().astype(np.int32)
+            output2 = output2.cpu().numpy()
+            pred2 = np.argmax(output2, 1)  # BxTxN
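+            # Accumulate masked per-class counts for the second half-clip:
+            # total_pred_class collects the union (pred | label) used as the IoU
+            # denominator, total_correct_class the intersection.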
+            correct2 = np.sum((pred2 == label2) * mask2)
+            total_correct += correct2
+            total_seen += np.sum(mask2)
+            for c in range(12):
+                total_pred_class[c] += np.sum(((pred2==c) | (label2==c)) & mask2)
+                total_correct_class[c] += np.sum((pred2==c) & (label2==c) & mask2)
+                total_class[c] += np.sum((label2==c) & mask2)
+
+            metric_logger.update(loss=(loss1.item()+loss2.item())/2.0)
+
+    ACCs = []
+    for c in range(12):
+        if total_class[c] == 0:
+            acc = 0
+        else:
+            acc = total_correct_class[c] / float(total_class[c])
+        print('eval acc of %s:\t %f'%(index_to_class[label_to_index[c]], acc))
+        ACCs.append(acc)
+    print(' * Eval accuracy: %f'% (np.mean(np.array(ACCs))))
+
+    IoUs = []
+    for c in range(12):
+        if total_pred_class[c] == 0:
+            iou = 0
+        else:
+            iou = total_correct_class[c] / float(total_pred_class[c])
+        print('eval mIoU of %s:\t %f'%(index_to_class[label_to_index[c]], iou))
+        IoUs.append(iou)
+    print(' * Eval mIoU:\t %f'%(np.mean(np.array(IoUs))))
+
+def main(args):
+
+    if args.output_dir:
+        utils.mkdir(args.output_dir)
+
+    print(args)
+    print("torch version: ", torch.__version__)
+    print("torchvision version: ", torchvision.__version__)
+
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    torch.cuda.manual_seed(args.seed)
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+    device = torch.device('cuda')
+
+    # Data loading code
+    print("Loading data")
+
+    st = time.time()
+
+    dataset = SegDataset(
+        root=args.data_path,
+        meta=args.data_train,
+        labelweight=args.label_weight,
+        frames_per_clip=args.clip_len,
+        num_points=args.num_points,
+        train=True
+    )
+
+    dataset_test = SegDataset(
+        root=args.data_path,
+        meta=args.data_eval,
+        labelweight=args.label_weight,
+        frames_per_clip=args.clip_len,
+        num_points=args.num_points,
+        train=False
+    )
+
+    print("Creating data loaders")
+
+    data_loader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True)
+
+    data_loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=args.batch_size, num_workers=args.workers, pin_memory=True)
+
+    print("Creating model")
+
+    Model = getattr(Models, args.model)
+    model = Model(radius=args.radius, nsamples=args.nsamples, num_classes=12)
+    if torch.cuda.device_count() > 1:
+        model = nn.DataParallel(model)
+    model.to(device)
+
+    criterion_train = nn.CrossEntropyLoss(weight=torch.from_numpy(dataset.labelweights).to(device), reduction='none')
+    criterion_test = nn.CrossEntropyLoss(weight=torch.from_numpy(dataset_test.labelweights).to(device), reduction='none')
+
+    lr = args.lr
+    optimizer = torch.optim.SGD(
+        model.parameters(), lr=lr, momentum=args.momentum, weight_decay=args.weight_decay)
+
+    # convert scheduler to be per iteration, not per epoch, for warmup that lasts
+    # between different epochs
+    warmup_iters = args.lr_warmup_epochs * len(data_loader)
+    lr_milestones = [len(data_loader) * m for m in args.lr_milestones]
+    lr_scheduler = WarmupMultiStepLR(
+        optimizer, milestones=lr_milestones, gamma=args.lr_gamma,
+        warmup_iters=warmup_iters, warmup_factor=1e-5)
+
+    model_without_ddp = model
+
+    if args.resume:
+        checkpoint = torch.load(args.resume, map_location='cpu')
+        model_without_ddp.load_state_dict(checkpoint['model'])
+        optimizer.load_state_dict(checkpoint['optimizer'])
+        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+        args.start_epoch = checkpoint['epoch'] + 1
+
+    print("Start training")
+    start_time = time.time()
+    for epoch in range(args.start_epoch, args.epochs):
+        train_one_epoch(model,
criterion_train, optimizer, lr_scheduler, data_loader, device, epoch, args.print_freq)
+
+        evaluate(model, criterion_test, data_loader_test, device=device, print_freq=args.print_freq)
+
+        if args.output_dir:
+            checkpoint = {
+                'model': model_without_ddp.state_dict(),
+                'optimizer': optimizer.state_dict(),
+                'lr_scheduler': lr_scheduler.state_dict(),
+                'epoch': epoch,
+                'args': args}
+            utils.save_on_master(
+                checkpoint,
+                os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))
+            utils.save_on_master(
+                checkpoint,
+                os.path.join(args.output_dir, 'checkpoint.pth'))
+
+    total_time = time.time() - start_time
+    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+    print('Training time {}'.format(total_time_str))
+
+def parse_args():
+    import argparse
+    parser = argparse.ArgumentParser(description='PST-Transformer Model Training')
+
+    parser.add_argument('--data-path', default='/scratch/HeheFan-data/Synthia4D/sequences', help='data path')
+    parser.add_argument('--data-train', default='/scratch/HeheFan-data/Synthia4D/trainval_raw.txt', help='meta list for training')
+    parser.add_argument('--data-eval', default='/scratch/HeheFan-data/Synthia4D/test_raw.txt', help='meta list for test')
+    parser.add_argument('--label-weight', default='/scratch/HeheFan-data/Synthia4D/labelweights.npz', help='training label weights')
+
+    parser.add_argument('--seed', default=0, type=int, help='random seed')
+    parser.add_argument('--model', default='PSTTransformer', type=str, help='model')
+    # input
+    parser.add_argument('--clip-len', default=3, type=int, metavar='N', help='number of frames per clip')
+    parser.add_argument('--num-points', default=16384, type=int, metavar='N', help='number of points per frame')
+    # P4D
+    parser.add_argument('--radius', default=0.9, type=float, help='radius for the ball query')
+    parser.add_argument('--nsamples', default=32, type=int, help='number of neighbors for the ball query')
+    parser.add_argument('--spatial-stride', default=16, type=int, help='spatial subsampling rate')
+    parser.add_argument('--temporal-kernel-size', default=1, type=int, help='temporal kernel size')
+    # embedding
+    parser.add_argument('--emb-relu', default=False, action='store_true')
+    # transformer
+    parser.add_argument('--dim', default=1024, type=int, help='transformer dim')
+    parser.add_argument('--depth', default=2, type=int, help='transformer depth')
+    parser.add_argument('--head', default=4, type=int, help='transformer head')
+    parser.add_argument('--mlp-dim', default=2048, type=int, help='transformer mlp dim')
+    # training
+    parser.add_argument('-b', '--batch-size', default=8, type=int)
+    parser.add_argument('--epochs', default=150, type=int, metavar='N', help='number of total epochs to run')
+    parser.add_argument('-j', '--workers', default=10, type=int, metavar='N', help='number of data loading workers (default: 10)')
+    parser.add_argument('--lr', default=0.01, type=float, help='initial learning rate')
+    parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum')
+    parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, metavar='W', help='weight decay (default: 1e-4)', dest='weight_decay')
+    parser.add_argument('--lr-milestones', nargs='+', default=[30, 40, 50], type=int, help='decrease lr on milestones')
+    parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma')
+    parser.add_argument('--lr-warmup-epochs', default=10, type=int, help='number of warmup epochs')
+    # output
+    parser.add_argument('--print-freq',
default=10, type=int, help='print frequency') + parser.add_argument('--output-dir', default='', type=str, help='path where to save') + # resume + parser.add_argument('--resume', default='', help='resume from checkpoint') + parser.add_argument('--start-epoch', default=0, type=int, metavar='N', help='start epoch') + + args = parser.parse_args() + + return args + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..5ea6dfe --- /dev/null +++ b/utils.py @@ -0,0 +1,255 @@ +from __future__ import print_function +from collections import defaultdict, deque +import datetime +import time +import torch +import torch.distributed as dist + +import errno +import os + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! + """ + if not is_dist_avail_and_initialized(): + return + t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') + dist.barrier() + dist.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value) + + +class MetricLogger(object): + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append( + "{}: {}".format(name, str(meter)) + ) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None): + i = 0 + if not header: + header = '' + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt='{avg:.4f}') + data_time = SmoothedValue(fmt='{avg:.4f}') + space_fmt = ':' + str(len(str(len(iterable)))) + 'd' + if torch.cuda.is_available(): + log_msg = self.delimiter.join([ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}', + 'max mem: {memory:.0f}' + ]) + else: + log_msg = self.delimiter.join([ + header, 
+ '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}' + ]) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB)) + else: + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time))) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('{} Total time: {}'.format(header, total_time_str)) + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target[None]) + + res = [] + for k in topk: + correct_k = correct[:k].flatten().sum(dtype=torch.float32) + res.append(correct_k * (100.0 / batch_size)) + return res + + +def mkdir(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + import builtins as __builtin__ + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop('force', False) + if is_master or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_mode(args): + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.gpu = int(os.environ['LOCAL_RANK']) + elif 'SLURM_PROCID' in os.environ: + args.rank = int(os.environ['SLURM_PROCID']) + args.gpu = args.rank % torch.cuda.device_count() + elif hasattr(args, "rank"): + pass + else: + print('Not using distributed mode') + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + args.dist_backend = 'nccl' + print('| distributed init (rank {}): {}'.format( + args.rank, args.dist_url), flush=True) + torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + setup_for_distributed(args.rank == 0)
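+
+# Example usage (a sketch, assuming a single CUDA GPU and locally prepared
+# datasets; the paths below are placeholders, not defaults from this patch):
+#   python train-msr.py --data-path /path/to/MSR-Action3D --output-dir ckpts/msr
+#   python train-syn.py --data-path /path/to/Synthia4D/sequences \
+#       --data-train /path/to/trainval_raw.txt --data-eval /path/to/test_raw.txt \
+#       --label-weight /path/to/labelweights.npz --output-dir ckpts/syn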