Showing 51 changed files with 5,550 additions and 1 deletion.
@@ -1 +1,10 @@
# PST-Transformer

The code is tested with Red Hat Enterprise Linux Workstation release 7.7 (Maipo), g++ (GCC) 8.3.1, PyTorch (both v1.4.0 and v1.9.0 are supported), CUDA 10.2, and cuDNN v7.6.
Compile the CUDA layers for [PointNet++](http://arxiv.org/abs/1706.02413), which we use for furthest point sampling (FPS) and radius-based neighborhood search. Rename the directory that matches your PyTorch version and build it:
```
# for PyTorch 1.4.0:
mv modules-pytorch-1.4.0 modules
# for PyTorch 1.9.0:
# mv modules-pytorch-1.9.0 modules
cd modules
python setup.py install
```
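If the build fails, the cause is usually an environment mismatch. A minimal sanity check that PyTorch sees the expected CUDA toolchain (nothing here is specific to this repository):
```
# Quick environment check before compiling the CUDA extension.
import torch

print(torch.__version__)          # expected: 1.4.0 or 1.9.0
print(torch.version.cuda)         # expected: 10.2
print(torch.cuda.is_available())  # True when a GPU and driver are visible
```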
@@ -0,0 +1,79 @@
import os
import numpy as np
from torch.utils.data import Dataset


class MSRAction3D(Dataset):
    def __init__(self, root, frames_per_clip=16, step_between_clips=1, num_points=2048, train=True):
        super(MSRAction3D, self).__init__()

        self.videos = []
        self.labels = []
        self.index_map = []  # flat clip index -> (video index, start frame)
        index = 0
        for video_name in os.listdir(root):
            # File names encode the action and subject, e.g. 'a01_s05_...':
            # the 'a' field gives the label, the 's' field the subject id.
            # Subjects 1-5 form the training split; the rest form the test split.
            subject_id = int(video_name.split('_')[1].split('s')[1])
            if (train and subject_id <= 5) or (not train and subject_id > 5):
                video = np.load(os.path.join(root, video_name), allow_pickle=True)['point_clouds']
                self.videos.append(video)
                label = int(video_name.split('_')[0][1:]) - 1
                self.labels.append(label)

                # Register every valid clip start so __getitem__ can index clips directly.
                nframes = video.shape[0]
                for t in range(0, nframes - step_between_clips * (frames_per_clip - 1), step_between_clips):
                    self.index_map.append((index, t))
                index += 1

        self.frames_per_clip = frames_per_clip
        self.step_between_clips = step_between_clips
        self.num_points = num_points
        self.train = train
        self.num_classes = max(self.labels) + 1

    def __len__(self):
        return len(self.index_map)

    def __getitem__(self, idx):
        index, t = self.index_map[idx]

        video = self.videos[index]
        label = self.labels[index]

        # Gather frames_per_clip frames and resample each one to exactly num_points.
        clip = [video[t + i * self.step_between_clips] for i in range(self.frames_per_clip)]
        for i, p in enumerate(clip):
            if p.shape[0] > self.num_points:
                # Downsample without replacement.
                r = np.random.choice(p.shape[0], size=self.num_points, replace=False)
            else:
                # Repeat all points as often as they fit, then top up with a random remainder.
                repeat, residue = self.num_points // p.shape[0], self.num_points % p.shape[0]
                r = np.random.choice(p.shape[0], size=residue, replace=False)
                r = np.concatenate([np.arange(p.shape[0]) for _ in range(repeat)] + [r], axis=0)
            clip[i] = p[r, :]
        clip = np.array(clip)

        if self.train:
            # Augmentation: random per-axis scaling of the whole clip.
            scales = np.random.uniform(0.9, 1.1, size=3)
            clip = clip * scales

        clip = clip / 300  # normalize the raw depth-camera coordinates

        return clip.astype(np.float32), label, index


if __name__ == '__main__':
    dataset = MSRAction3D(root='../data/msr_action', frames_per_clip=16)
    clip, label, video_idx = dataset[0]
    print(clip)
    print(label)
    print(video_idx)
    print(dataset.num_classes)
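For orientation, here is a minimal sketch of wiring this dataset into a standard PyTorch DataLoader; the root path, batch size, and worker count are illustrative assumptions, not values fixed by the repository:
```
# Hypothetical usage of MSRAction3D with a DataLoader (paths are placeholders).
from torch.utils.data import DataLoader

dataset = MSRAction3D(root='../data/msr_action', frames_per_clip=16,
                      step_between_clips=1, num_points=2048, train=True)
loader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)

for clip, label, video_idx in loader:
    # clip: (8, 16, 2048, 3) float32; label, video_idx: (8,) int64
    print(clip.shape, label.shape)
    break
```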
@@ -0,0 +1,193 @@
import os
import numpy as np
from pyquaternion import Quaternion
from torch.utils.data import Dataset

# Mapping between raw semantic class indices and contiguous training labels.
# Label 12 maps back to raw class 'Void'; -1 marks raw classes ('Sky', 'Reserved')
# that never receive a training label.
index_to_label = np.array([12, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, 11], dtype='int32')
label_to_index = np.array([2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 0], dtype='int32')
index_to_class = ['Void', 'Sky', 'Building', 'Road', 'Sidewalk', 'Fence', 'Vegetation', 'Pole', 'Car', 'Traffic Sign', 'Pedestrian', 'Bicycle', 'Lanemarking', 'Reserved', 'Reserved', 'Traffic Light']


def index_to_label_func(x):
    return index_to_label[x]
index_to_label_vec_func = np.vectorize(index_to_label_func)


class SegDataset(Dataset):
    def __init__(self, root='data/pc', meta='data/train_raw.txt', labelweight='data/labelweights.npz', frames_per_clip=3, num_points=16384, train=True):
        super(SegDataset, self).__init__()

        self.num_points = num_points
        self.train = train
        self.root = root
        self.frames_per_clip = frames_per_clip

        # Inverse-log frequency weighting to counter class imbalance during
        # training; at test time all classes are weighted equally.
        labelweights = np.load(labelweight)['labelweights'].astype(np.float32)
        if train:
            labelweights = 1 / np.log(1.2 + labelweights)
            self.labelweights = labelweights / labelweights.min()
        else:
            self.labelweights = np.ones_like(labelweights)

        # Preload every frame listed in the meta file into memory,
        # keyed by '<sequence>-<frame id>' for fast clip assembly.
        self.meta = []
        self.data = {}
        with open(meta, 'r') as f:
            for line in f:
                line = line.split(' ')[0]
                line = line.split('/')
                sequence_name = line[0]
                frame_id = int(line[-1].split('.')[0])

                fn = os.path.join(root, sequence_name + '-' + str(frame_id).zfill(6) + '.npz')
                data = np.load(fn)

                pc = data['pc']              # (16384, 3)
                rgb = data['rgb']            # (16384, 3)
                semantic = data['semantic']  # (16384,)
                center = data['center']      # (3,)
                semantic = semantic.astype('uint8')

                self.data[sequence_name + '-' + str(frame_id)] = (pc, rgb, semantic, center)
                self.meta.append([sequence_name, frame_id])
        self.meta.sort()

    def __len__(self):
        return len(self.meta)

    def read_training_data_point(self, index):
        sequence_name, frame_id = self.meta[index]

        pcs = []
        rgbs = []
        semantics = []
        center_0 = None

        # Walk backwards from frame_id; if a frame is missing, reuse the most
        # recent frame that was found (frame_id itself is always present).
        most_recent_success = -1
        for diff in range(0, self.frames_per_clip):
            key = sequence_name + '-' + str(frame_id - diff)
            if key in self.data:
                pc, rgb, semantic, center = self.data[key]
                most_recent_success = frame_id - diff
            else:
                pc, rgb, semantic, center = self.data[sequence_name + '-' + str(most_recent_success)]

            if diff == 0:
                center_0 = center

            pcs.append(pc)
            rgbs.append(rgb)
            semantics.append(semantic)

        pc = np.stack(pcs, axis=0)
        rgb = np.stack(rgbs, axis=0)
        semantic = np.stack(semantics, axis=0)

        return pc, rgb, semantic, center_0

    def half_crop_w_context(self, half, context, pc, rgb, semantic, center):
        # Split each frame at the plane z = center[2], keeping a margin of
        # `context` past the split so both halves see some shared geometry.
        frames_per_clip = pc.shape[0]
        all_idx = np.arange(pc.shape[1])
        sample_indices_half_w_context = []
        if half == 0:
            for f in range(frames_per_clip):
                sample_idx_half_w_context = all_idx[pc[f, :, 2] > (center[2] - context)]
                sample_indices_half_w_context.append(sample_idx_half_w_context)
        else:
            for f in range(frames_per_clip):
                sample_idx_half_w_context = all_idx[pc[f, :, 2] < (center[2] + context)]
                sample_indices_half_w_context.append(sample_idx_half_w_context)

        pc_half_w_context = [pc[f, s] for f, s in enumerate(sample_indices_half_w_context)]
        rgb_half_w_context = [rgb[f, s] for f, s in enumerate(sample_indices_half_w_context)]
        semantic_half_w_context = [semantic[f, s] for f, s in enumerate(sample_indices_half_w_context)]
        # The loss is only computed on each half's own side of the split,
        # not on the context points.
        if half == 0:
            loss_masks = [p[:, 2] > center[2] for p in pc_half_w_context]
        else:
            loss_masks = [p[:, 2] < center[2] for p in pc_half_w_context]
        valid_pred_idx_in_full = sample_indices_half_w_context

        return pc_half_w_context, rgb_half_w_context, semantic_half_w_context, loss_masks, valid_pred_idx_in_full

    def augment(self, pc, center):
        # Random mirror about the x axis.
        flip = np.random.uniform(0, 1) > 0.5
        if flip:
            pc = (pc - center)
            pc[..., 0] *= -1
            pc += center

        # Random global scaling.
        scale = np.random.uniform(0.8, 1.2)
        pc = (pc - center) * scale + center

        # Random rotation about the vertical (y) axis.
        rot_axis = np.array([0, 1, 0])
        rot_angle = np.random.uniform(0, np.pi * 2)
        q = Quaternion(axis=rot_axis, angle=rot_angle)
        R = q.rotation_matrix

        pc = np.dot(pc - center, R) + center
        return pc

    def mask_and_label_conversion(self, semantic, loss_mask):
        # Convert raw semantic indices to training labels; points whose label
        # is 12 ('Void') are zeroed and excluded from the loss.
        labels = []
        loss_masks = []
        for i, s in enumerate(semantic):
            sem = s.astype('int32')
            label = index_to_label_vec_func(sem)
            loss_mask_ = (label != 12) * loss_mask[i]
            label[label == 12] = 0

            labels.append(label)
            loss_masks.append(loss_mask_)
        return labels, loss_masks

    def choice_to_num_points(self, pc, rgb, label, loss_mask, valid_pred_idx_in_full):
        # Resample every frame to exactly num_points; shuffling the point order
        # also randomizes downstream furthest-point-sampling behavior.
        for f in range(self.frames_per_clip):
            idx = np.arange(pc[f].shape[0])
            choice_num = self.num_points
            if pc[f].shape[0] > choice_num:
                shuffle_idx = np.random.choice(idx, choice_num, replace=False)
            else:
                # Pad with points drawn with replacement, then keep every original point.
                shuffle_idx = np.concatenate([np.random.choice(idx, choice_num - idx.shape[0]), np.arange(idx.shape[0])])
            pc[f] = pc[f][shuffle_idx]
            rgb[f] = rgb[f][shuffle_idx]
            label[f] = label[f][shuffle_idx]
            loss_mask[f] = loss_mask[f][shuffle_idx]
            valid_pred_idx_in_full[f] = valid_pred_idx_in_full[f][shuffle_idx]

        pc = np.stack(pc, axis=0)
        rgb = np.stack(rgb, axis=0)
        label = np.stack(label, axis=0)
        loss_mask = np.stack(loss_mask, axis=0)
        valid_pred_idx_in_full = np.stack(valid_pred_idx_in_full, axis=0)

        return pc, rgb, label, loss_mask, valid_pred_idx_in_full

    def __getitem__(self, index):
        context = 1.

        pc, rgb, semantic, center = self.read_training_data_point(index)

        # Process the two halves of the frame independently; each half carries
        # its own labels and loss mask.
        half = 0
        pc1, rgb1, semantic1, mask1, valid_pred_idx_in_full1 = self.half_crop_w_context(half, context, pc, rgb, semantic, center)
        label1, mask1 = self.mask_and_label_conversion(semantic1, mask1)
        pc1, rgb1, label1, mask1, valid_pred_idx_in_full1 = self.choice_to_num_points(pc1, rgb1, label1, mask1, valid_pred_idx_in_full1)

        half = 1
        pc2, rgb2, semantic2, mask2, valid_pred_idx_in_full2 = self.half_crop_w_context(half, context, pc, rgb, semantic, center)
        label2, mask2 = self.mask_and_label_conversion(semantic2, mask2)
        pc2, rgb2, label2, mask2, valid_pred_idx_in_full2 = self.choice_to_num_points(pc2, rgb2, label2, mask2, valid_pred_idx_in_full2)

        if self.train:
            pc1 = self.augment(pc1, center)
            pc2 = self.augment(pc2, center)

        # Channels-first color layout: (frames, 3, num_points).
        rgb1 = np.swapaxes(rgb1, 1, 2)
        rgb2 = np.swapaxes(rgb2, 1, 2)

        return pc1.astype(np.float32), rgb1.astype(np.float32), label1.astype(np.int64), mask1.astype(np.float32), pc2.astype(np.float32), rgb2.astype(np.float32), label2.astype(np.int64), mask2.astype(np.float32)

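As a rough usage sketch (the batch size below is an assumption; the paths are the class defaults and must match your own preprocessing output), each item yields both halves of a frame, which a training loop can treat as two independent samples:
```
# Hypothetical wiring of SegDataset into a DataLoader.
import torch
from torch.utils.data import DataLoader

dataset = SegDataset(root='data/pc', meta='data/train_raw.txt',
                     labelweight='data/labelweights.npz',
                     frames_per_clip=3, num_points=16384, train=True)
loader = DataLoader(dataset, batch_size=2, shuffle=True)

pc1, rgb1, label1, mask1, pc2, rgb2, label2, mask2 = next(iter(loader))
# Per half: pc (B, F, N, 3), rgb (B, F, 3, N), label (B, F, N), mask (B, F, N)
pc = torch.cat([pc1, pc2], dim=0)  # stack the halves as extra batch entries
print(pc.shape)
```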
Binary file not shown.
Binary file not shown.