visual_3dbbox.py
from pope_model_api import *
from utils.draw_utils import draw_bbox_3d, draw_axis
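# NOTE: the wildcard import above is assumed to re-export everything this
# script relies on (os, time, numpy as np, cv2, torch, torch.nn.functional
# as F, SAM's sam_model_registry / SamAutomaticMaskGenerator, a `logger`, a
# dense feature `matcher`, CropImage, and the get_* / estimate_pose /
# project_points helpers used below).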
if __name__ == "__main__":
    ckpt, model_type = get_model_info("h")
    sam = sam_model_registry[model_type](checkpoint=ckpt)
    DEVICE = "cuda"
    sam.to(device=DEVICE)
    MASK_GEN = SamAutomaticMaskGenerator(sam)
    logger.info(f"load SAM model from {ckpt}")
    crop_tool = CropImage()
    dinov2_model = load_dinov2_model()
    dinov2_model.to("cuda:0")
    prompt_filename = os.path.join("data/demos/inputs", "prompt.png")
    target_filename = os.path.join("data/demos/inputs", "target.png")
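    # Camera intrinsics: K0 presumably belongs to the camera that captured the
    # prompt (reference) image and K1 to the target image; the values ship with
    # the demo data.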
    K0 = np.array(
        [[2.442288639999999759e+03, 0.000000000000000000e+00, 4.491140266666666321e+02],
         [-2.776560722850263257e-13, 2.447233834666666553e+03, -1.107243093333333093e+02],
         [0.000000000000000000e+00, 0.000000000000000000e+00, 1.000000000000000000e+00]]
    )
    K1 = np.array(
        [[5.724113999999999578e+02, 0.000000000000000000e+00, 3.252610999999999990e+02],
         [0.000000000000000000e+00, 5.735704299999999876e+02, 2.420489900000000034e+02],
         [0.000000000000000000e+00, 0.000000000000000000e+00, 1.000000000000000000e+00]]
    )
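    # Half-extents (x, y, z) of the object's 3D bounding box and its eight
    # corners, centered at the object origin.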
    x, y, z = 3.793429999999999719e-02, 3.879959999999999659e-02, 4.588450000000000167e-02
    _3d_bbox = np.array([
        [-x, -y, -z],
        [-x, -y, z],
        [-x, y, z],
        [-x, y, -z],
        [x, -y, -z],
        [x, -y, z],
        [x, y, z],
        [x, y, -z],
    ])
    prompt_image = cv2.imread(prompt_filename)
    prompt_image_copy = prompt_image.copy()
    ref_torch_image = set_torch_image(prompt_image, center_crop=True)
    ref_fea = get_cls_token_torch(dinov2_model, ref_torch_image)
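    # The DINOv2 CLS token of the center-cropped prompt image serves as the
    # reference embedding used to rank SAM proposals.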
    target_image = cv2.imread(target_filename)
    image_h, image_w, _ = target_image.shape
    t1 = time.time()
    masks = MASK_GEN.generate(target_image)
    t2 = time.time()
    similarity_score, top_images = np.array([0, 0, 0], np.float32), [[], [], []]
    t3 = time.time()
    compact_percent = 0.3
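    # Rank every SAM proposal: expand its box by 30% per side, crop and resize
    # to 256x256 (updating the intrinsics accordingly), embed with DINOv2, and
    # keep the three crops most similar to the prompt embedding.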
    for mask in masks:
        object_mask = np.expand_dims(mask["segmentation"], -1)
        x0, y0, w, h = mask["bbox"]
        x1, y1 = x0 + w, y0 + h
        x0 -= int(w * compact_percent)
        y0 -= int(h * compact_percent)
        x1 += int(w * compact_percent)
        y1 += int(h * compact_percent)
        box = np.array([x0, y0, x1, y1])
        resize_shape = np.array([y1 - y0, x1 - x0])
        K_crop, K_crop_homo = get_K_crop_resize(box, K1, resize_shape)
        image_crop, _ = get_image_crop_resize(target_image, box, resize_shape)
        # object_mask, _ = get_image_crop_resize(object_mask, box, resize_shape)
        box_new = np.array([0, 0, x1 - x0, y1 - y0])
        resize_shape = np.array([256, 256])
        K_crop, K_crop_homo = get_K_crop_resize(box_new, K_crop, resize_shape)
        image_crop, _ = get_image_crop_resize(image_crop, box_new, resize_shape)
        crop_tensor = set_torch_image(image_crop, center_crop=True)
        with torch.no_grad():
            fea = get_cls_token_torch(dinov2_model, crop_tensor)
        score = F.cosine_similarity(ref_fea, fea, dim=1, eps=1e-8)
        # Replace the weakest of the current top-3 if this crop scores higher.
        if score.item() > similarity_score.min():
            mask["crop_image"] = image_crop
            mask["K"] = K_crop
            mask["bbox"] = box
            min_idx = np.argmin(similarity_score)
            similarity_score[min_idx] = score.item()
            top_images[min_idx] = mask.copy()
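    # Second stage: re-score the top-3 crops by dense feature matching against
    # the prompt. `matcher` is assumed to be a LoFTR-style matcher provided by
    # pope_model_api that fills the batch dict with matched keypoints
    # ('mkpts0_f', 'mkpts1_f') and per-match confidences ('mconf').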
    prompt_image = cv2.cvtColor(prompt_image, cv2.COLOR_BGR2GRAY)
    prompt_image = torch.from_numpy(prompt_image).float()[None] / 255.
    prompt_image = prompt_image.unsqueeze(0).cuda()
    matching_score = [0 for _ in range(len(top_images))]
    for top_idx in range(len(top_images)):
        img1 = cv2.cvtColor(top_images[top_idx]["crop_image"], cv2.COLOR_BGR2GRAY)
        img1 = torch.from_numpy(img1).float()[None] / 255.
        img1 = img1.unsqueeze(0).cuda()
        batch = {'image0': prompt_image, 'image1': img1}
        with torch.no_grad():
            matcher(batch)
        mkpts0 = batch['mkpts0_f'].cpu().numpy()
        mkpts1 = batch['mkpts1_f'].cpu().numpy()
        confidences = batch["mconf"].cpu().numpy()
        conf_mask = np.where(confidences > 0.9)
        # Score = number of matches with confidence above 0.9.
        matching_score[top_idx] = conf_mask[0].shape[0]
        top_images[top_idx]["mkpts0"] = mkpts0
        top_images[top_idx]["mkpts1"] = mkpts1
        top_images[top_idx]["mconf"] = confidences
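    # Pick the candidate with the most high-confidence matches and save a
    # side-by-side of the (resized) prompt and the winning crop.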
    max_match_idx = np.argmax(matching_score)
    pre_bbox = top_images[max_match_idx]["bbox"]
    mkpts0 = top_images[max_match_idx]["mkpts0"]
    mkpts1 = top_images[max_match_idx]["mkpts1"]
    pre_K = top_images[max_match_idx]["K"]
    crop_image = cv2.resize(top_images[max_match_idx]["crop_image"], (256, 256))
    que_image = cv2.resize(prompt_image_copy, (256, 256))
    segment_mask = (255 * top_images[max_match_idx]["segmentation"]).astype(np.uint8)
    stack_result_image = np.hstack((que_image, crop_image))
    cv2.imwrite("query_result.png", stack_result_image)
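    # estimate_pose presumably recovers the relative rotation/translation from
    # the 2D-2D matches (essential-matrix RANSAC; 0.5 px threshold, 0.99
    # confidence), using the prompt intrinsics K0 and the crop intrinsics pre_K.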
    R, t, inliers = estimate_pose(mkpts0, mkpts1, K0, pre_K, 0.5, 0.99)
    prompt_pose = np.loadtxt(os.path.join("data/demos/inputs", "prompt.txt"))
    target_pose = np.loadtxt(os.path.join("data/demos/inputs", "target.txt"))
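    # Compose the estimated relative rotation with the prompt pose; the
    # translation is taken from the ground-truth target pose, so only the
    # rotation is actually predicted here.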
    predict_pose = np.zeros((3, 4), dtype=np.float32)
    predict_pose[:3, :3] = np.matmul(R, prompt_pose[:3, :3])
    our_predict_pose = predict_pose[:3, :3].copy()
    predict_pose[:3, 3] = target_pose[:3, 3]
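    # Project the eight 3D box corners with the predicted pose and the target
    # intrinsics K1, then draw the box and coordinate axes on the target image.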
    pre_bbox_pts_3d, _ = project_points(_3d_bbox, predict_pose[:3, :4], K1)
    pre_bbox_pts_3d = pre_bbox_pts_3d.astype(np.int32)
    our_bbox_img = draw_bbox_3d(target_image, pre_bbox_pts_3d, (255, 255, 255))
    our_bbox_img = draw_axis(our_bbox_img, predict_pose[:3, :3], predict_pose[:3, 3], K1)
    cv2.imwrite("3D_BBox.png", our_bbox_img)