# depth_to_pointcloud.py (forked from DepthAnything/Depth-Anything-V2)
"""
Born out of Depth Anything V1 Issue 36
Make sure you have the necessary libraries installed.
Code by @1ssb
This script processes a set of images to generate depth maps and corresponding point clouds.
The resulting point clouds are saved in the specified output directory.
Usage:
python script.py --encoder vitl --load-from path_to_model --max-depth 20 --img-path path_to_images --outdir output_directory --focal-length-x 470.4 --focal-length-y 470.4
Arguments:
--encoder: Model encoder to use. Choices are ['vits', 'vitb', 'vitl', 'vitg'].
--load-from: Path to the pre-trained model weights.
--max-depth: Maximum depth value for the depth map.
--img-path: Path to the input image or directory containing images.
--outdir: Directory to save the output point clouds.
--focal-length-x: Focal length along the x-axis.
--focal-length-y: Focal length along the y-axis.
"""
import argparse
import cv2
import glob
import numpy as np
import open3d as o3d
import os
from PIL import Image
import torch
from depth_anything_v2.dpt import DepthAnythingV2
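
# Note: the import above assumes the script is run from the directory that
# contains the depth_anything_v2 package (the upstream repo layout), or that the
# package is otherwise on PYTHONPATH.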


def main():
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description='Generate depth maps and point clouds from images.')
    parser.add_argument('--encoder', default='vitl', type=str, choices=['vits', 'vitb', 'vitl', 'vitg'],
                        help='Model encoder to use.')
    parser.add_argument('--load-from', default='', type=str, required=True,
                        help='Path to the pre-trained model weights.')
    parser.add_argument('--max-depth', default=20, type=float,
                        help='Maximum depth value for the depth map.')
    parser.add_argument('--img-path', type=str, required=True,
                        help='Path to the input image or directory containing images.')
    parser.add_argument('--outdir', type=str, default='./vis_pointcloud',
                        help='Directory to save the output point clouds.')
    parser.add_argument('--focal-length-x', default=470.4, type=float,
                        help='Focal length along the x-axis.')
    parser.add_argument('--focal-length-y', default=470.4, type=float,
                        help='Focal length along the y-axis.')
    args = parser.parse_args()

    # Determine the device to use (CUDA, MPS, or CPU)
    DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

    # Model configuration based on the chosen encoder
    model_configs = {
        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
        'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
        'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
        'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
    }
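    # ('features' and 'out_channels' size the DPT decoder head attached to each
    # ViT backbone; the values above mirror the upstream configurations.)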
    # Initialize the DepthAnythingV2 model with the specified configuration
    depth_anything = DepthAnythingV2(**{**model_configs[args.encoder], 'max_depth': args.max_depth})
    depth_anything.load_state_dict(torch.load(args.load_from, map_location='cpu'))
    depth_anything = depth_anything.to(DEVICE).eval()

    # Get the list of image files to process: a single image, a .txt file listing
    # image paths (one per line), or a directory searched recursively
    if os.path.isfile(args.img_path):
        if args.img_path.endswith('txt'):
            with open(args.img_path, 'r') as f:
                filenames = f.read().splitlines()
        else:
            filenames = [args.img_path]
    else:
        # Keep only regular files; the recursive glob also matches subdirectories
        filenames = [p for p in glob.glob(os.path.join(args.img_path, '**/*'), recursive=True) if os.path.isfile(p)]

    # Create the output directory if it doesn't exist
    os.makedirs(args.outdir, exist_ok=True)

    # Process each image file
    for k, filename in enumerate(filenames):
        print(f'Processing {k+1}/{len(filenames)}: {filename}')

        # Read the image using OpenCV; skip files OpenCV cannot decode
        image = cv2.imread(filename)
        if image is None:
            print(f'Skipping unreadable file: {filename}')
            continue

        # Load the image again with PIL (used later for the point colors)
        color_image = Image.open(filename).convert('RGB')
        width, height = color_image.size

        # Run inference; infer_image returns a metric depth map (one value per pixel)
        pred = depth_anything.infer_image(image, height)

        # Resize depth prediction to match the original image size
        resized_pred = Image.fromarray(pred).resize((width, height), Image.NEAREST)
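
        # The back-projection below assumes a simple pinhole camera whose principal
        # point sits at the image center, with focal lengths given in pixels. Each
        # pixel (u, v) with depth Z maps to a 3D point:
        #
        #   X = (u - width / 2)  * Z / fx
        #   Y = (v - height / 2) * Z / fy
        #   Z = depth(u, v)
        #
        # which is exactly what the normalized mesh grid times the depth computes.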
        # Generate mesh grid and calculate point cloud coordinates
        x, y = np.meshgrid(np.arange(width), np.arange(height))
        x = (x - width / 2) / args.focal_length_x
        y = (y - height / 2) / args.focal_length_y
        z = np.array(resized_pred)
        points = np.stack((np.multiply(x, z), np.multiply(y, z), z), axis=-1).reshape(-1, 3)
        colors = np.array(color_image).reshape(-1, 3) / 255.0

        # Create the point cloud and save it to the output directory
        pcd = o3d.geometry.PointCloud()
        pcd.points = o3d.utility.Vector3dVector(points)
        pcd.colors = o3d.utility.Vector3dVector(colors)
        o3d.io.write_point_cloud(os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + ".ply"), pcd)
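        # Optional: uncomment to preview each cloud in Open3D's interactive viewer
        # (blocks until the window is closed):
        # o3d.visualization.draw_geometries([pcd])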


if __name__ == '__main__':
    main()
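
# A minimal sketch for loading one of the saved clouds back for inspection (the
# path below is illustrative):
#
#   import open3d as o3d
#   pcd = o3d.io.read_point_cloud('./vis_pointcloud/example.ply')
#   print(pcd)  # prints e.g. "PointCloud with N points."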