# Copied from https://github.com/luca-medeiros/lang-segment-anything/blob/main/lang_sam/lang_sam.py
import os

import groundingdino.datasets.transforms as T
import numpy as np
import torch
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.inference import predict
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict
from huggingface_hub import hf_hub_download
from segment_anything import sam_model_registry, SamPredictor

SAM_MODELS = {
    "vit_h": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth",
    "vit_l": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth",
    "vit_b": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth",
}

# Local cache location for downloaded checkpoints (honors TORCH_HOME when set).
CACHE_PATH = os.environ.get("TORCH_HOME", os.path.expanduser("~/.cache/torch/hub/checkpoints"))


def load_model_hf(repo_id, filename, ckpt_config_filename, device='cpu'):
    """Download a GroundingDINO config and checkpoint from the Hugging Face Hub and build the model."""
    cache_config_file = hf_hub_download(repo_id=repo_id, filename=ckpt_config_filename)
    args = SLConfig.fromfile(cache_config_file)
    model = build_model(args)
    args.device = device

    cache_file = hf_hub_download(repo_id=repo_id, filename=filename)
    checkpoint = torch.load(cache_file, map_location='cpu')
    log = model.load_state_dict(clean_state_dict(checkpoint['model']), strict=False)
    print(f"Model loaded from {cache_file}\n => {log}")
    model.eval()
    return model


def transform_image(image) -> torch.Tensor:
    """Convert a PIL image into the normalized tensor GroundingDINO expects."""
    transform = T.Compose([
        # T.RandomResize([800], max_size=1333),  # upstream default resize
        T.RandomResize([1200], max_size=1200),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    image_transformed, _ = transform(image, None)
    return image_transformed


class LanguageSAM:
    """Text-prompted segmentation: GroundingDINO proposes boxes, SAM turns them into masks."""

    def __init__(self, sam_type="vit_h", ckpt_path=None):
        self.sam_type = sam_type
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.build_groundingdino()
        self.build_sam(ckpt_path)

    def build_sam(self, ckpt_path):
        if self.sam_type is None or ckpt_path is None:
            if self.sam_type is None:
                print("No sam type indicated. Using vit_h by default.")
                self.sam_type = "vit_h"
            checkpoint_url = SAM_MODELS[self.sam_type]
            try:
                sam = sam_model_registry[self.sam_type]()
                state_dict = torch.hub.load_state_dict_from_url(checkpoint_url)
                sam.load_state_dict(state_dict, strict=True)
            except Exception:
                raise ValueError(
                    f"Problem loading SAM. Please make sure you have the right model type: "
                    f"{self.sam_type}, and a working checkpoint: {checkpoint_url}. "
                    f"Recommend deleting the checkpoint and re-downloading it.")
            sam.to(device=self.device)
            self.sam = SamPredictor(sam)
        else:
            try:
                sam = sam_model_registry[self.sam_type](ckpt_path)
            except Exception:
                raise ValueError(
                    f"Problem loading SAM. Your model type: {self.sam_type} "
                    f"should match your checkpoint path: {ckpt_path}. Recommend calling "
                    f"LanguageSAM with a matching model type AND checkpoint path.")
            sam.to(device=self.device)
            self.sam = SamPredictor(sam)

    def build_groundingdino(self):
        ckpt_repo_id = "ShilongLiu/GroundingDINO"
        ckpt_filename = "groundingdino_swinb_cogcoor.pth"
        ckpt_config_filename = "GroundingDINO_SwinB.cfg.py"
        self.groundingdino = load_model_hf(ckpt_repo_id, ckpt_filename, ckpt_config_filename)

    def predict_dino(self, image_pil, text_prompt, box_threshold, text_threshold):
        image_trans = transform_image(image_pil)
        boxes, logits, phrases = predict(model=self.groundingdino,
                                         image=image_trans,
                                         caption=text_prompt,
                                         box_threshold=box_threshold,
                                         text_threshold=text_threshold,
                                         device=self.device)
        W, H = image_pil.size
        # GroundingDINO returns normalized (cx, cy, w, h); scale to absolute (x1, y1, x2, y2) pixels.
        boxes = box_ops.box_cxcywh_to_xyxy(boxes) * torch.Tensor([W, H, W, H])
        return boxes, logits, phrases

    def predict_sam(self, image_pil, boxes):
        image_array = np.asarray(image_pil)
        self.sam.set_image(image_array)
        transformed_boxes = self.sam.transform.apply_boxes_torch(boxes, image_array.shape[:2])
        masks, _, _ = self.sam.predict_torch(
            point_coords=None,
            point_labels=None,
            boxes=transformed_boxes.to(self.sam.device),
            multimask_output=False,
        )
        return masks.cpu()

    def predict(self, image_pil, text_prompt, box_threshold=0.2, text_threshold=0.15):
        boxes, logits, phrases = self.predict_dino(image_pil, text_prompt, box_threshold, text_threshold)
        masks = torch.tensor([])
        if len(boxes) > 0:
            masks = self.predict_sam(image_pil, boxes)
            masks = masks.squeeze(1)
        return masks, boxes, phrases, logits
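

# A minimal usage sketch, not part of the original file: the image path
# "assets/car.jpeg" and the prompt "wheel" are hypothetical examples. The
# first run downloads the GroundingDINO and SAM checkpoints, so it needs
# network access; the class falls back to CPU when CUDA is unavailable.
if __name__ == "__main__":
    from PIL import Image

    model = LanguageSAM(sam_type="vit_h")
    image_pil = Image.open("assets/car.jpeg").convert("RGB")  # hypothetical path
    masks, boxes, phrases, logits = model.predict(image_pil, text_prompt="wheel")
    # masks: (N, H, W) boolean tensor, one mask per detected box;
    # boxes: (N, 4) xyxy pixel coordinates; phrases: matched text spans.
    for phrase, logit in zip(phrases, logits):
        print(f"{phrase}: confidence {logit:.2f}")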