@ZachL1 · Created June 13, 2024
Parse Omnidata's camera intrinsics and poses, convert them to OpenCV format, and back-project an RGB-D image to a 3D point cloud.
import json
import math
import numpy as np
import cv2
import torch
from plyfile import PlyData, PlyElement
from pytorch3d.transforms import euler_angles_to_matrix
from pytorch3d.renderer import FoVPerspectiveCameras
# Adapted from Omnidata. The name is misleading: despite "cam_to_world", the
# returned R and T are actually the world-to-camera transform.
def _get_cam_to_world_R_T_K(point_info):
    EULER_X_OFFSET_RADS = math.radians(90.0)
    location = point_info['camera_location']
    rotation = point_info['camera_rotation_final']
    fov = point_info['field_of_view_rads']

    # Recover cam -> world
    ex, ey, ez = rotation
    R = euler_angles_to_matrix(torch.tensor(
        [(ex - EULER_X_OFFSET_RADS, -ey, -ez)],
        dtype=torch.double), 'XZY')
    Tx, Ty, Tz = location
    T = torch.tensor([[-Tx, Tz, Ty]], dtype=torch.double)

    # PyTorch3D expects world -> cam (row-vector convention), so invert the pose
    R_inv = R.transpose(1, 2)
    T_inv = -R.bmm(T.unsqueeze(-1)).squeeze(-1)

    K = FoVPerspectiveCameras(R=R_inv, T=T_inv, fov=fov, degrees=False).compute_projection_matrix(
        znear=0.001, zfar=512.0, fov=fov, aspect_ratio=1.0, degrees=False)
    return dict(
        cam_to_world_R=R_inv.squeeze(0).float(),
        cam_to_world_T=T_inv.squeeze(0).float(),
        proj_K=K.squeeze(0).float(),
        proj_K_inv=K[:, :3, :3].inverse().squeeze(0).float())
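
# A minimal sketch (not part of the original gist) of how the returned dict is
# meant to be consumed. PyTorch3D stores rotations for row-vector multiplication,
# i.e. x_cam = x_world @ R + T, which is why unproject_omni() below transposes
# cam_to_world_R before using it in column-vector form. The helper name and the
# Nx3 input layout are illustrative assumptions.
def _example_world_to_cam(rtk, xyz_world):
    # xyz_world: Nx3 float32 array of world-space points
    R = rtk['cam_to_world_R'].numpy()  # despite the key name: world -> camera
    T = rtk['cam_to_world_T'].numpy()
    return xyz_world @ R + T  # Nx3 points in PyTorch3D camera space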

def project_rgb_to_point_cloud(rgb_image, depth_map, K, R, t):
    """
    Unproject an RGB-D image to a colored point cloud.
    Camera coordinate system: x-right, y-down, z-forward (OpenCV convention).
    A world-space 3D point X projects to homogeneous pixel coordinates via
    x' = K * [R|t] * X (a sanity-check sketch follows this function).

    Parameters:
    - rgb_image: RGB image
    - depth_map: metric depth map
    - K: camera intrinsic matrix, i.e. [[fx, 0, cx], [0, fy, cy], [0, 0, 1]]
    - R: camera rotation matrix (world to camera)
    - t: camera translation vector (world to camera)

    Returns:
    - point_cloud: point cloud with color, Nx6 array; each row is [x, y, z, r, g, b]
    """
    h, w = depth_map.shape
    i, j = np.meshgrid(np.arange(w), np.arange(h), indexing='xy')

    # Construct homogeneous pixel coordinates, sampled at pixel centers
    uv1 = np.stack([i.ravel() + 0.5, j.ravel() + 0.5, np.ones_like(i).ravel()], axis=1)

    # Pixel coordinates -> camera coordinates
    K_inv = np.linalg.inv(K)
    xyz_camera = K_inv @ (uv1.T * depth_map.ravel())

    # Camera coordinates -> world coordinates
    R_inv = R.T          # camera to world
    t_inv = -R_inv @ t   # camera to world
    xyz_world = R_inv @ xyz_camera + t_inv[:, None]
    xyz_world = xyz_world.T

    # Point cloud with color
    colors = rgb_image.reshape(-1, 3)
    point_cloud = np.hstack([xyz_world, colors])

    # Filter out invalid points
    valid_mask = (depth_map.ravel() > 0) & (depth_map.ravel() < 100)
    point_cloud = point_cloud[valid_mask]
    return point_cloud
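
# Hedged sanity check (a sketch, not part of the original gist): re-project the
# recovered world points with x' = K [R|t] X and confirm they land back on the
# pixel centers used above. The helper name and tolerance are assumptions.
def _check_reprojection(point_cloud, depth_map, K, R, t, atol=1e-4):
    xyz_world = point_cloud[:, :3].T       # 3xN world-space points
    xyz_cam = R @ xyz_world + t[:, None]   # world -> camera
    uvw = K @ xyz_cam                      # homogeneous pixel coordinates
    uv = uvw[:2] / uvw[2]                  # perspective divide
    # Expected pixel centers for the valid pixels, in the same raster order
    h, w = depth_map.shape
    i, j = np.meshgrid(np.arange(w), np.arange(h), indexing='xy')
    valid = (depth_map.ravel() > 0) & (depth_map.ravel() < 100)
    expected = np.stack([i.ravel() + 0.5, j.ravel() + 0.5])[:, valid]
    assert np.allclose(uv, expected, atol=atol)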

def save_point_cloud(points_3d, save_path, binary=True):
    """
    Save a colored point cloud to disk as a PLY file.

    Parameters:
    - points_3d: point cloud with color, Nx6 array; each row is [x, y, z, r, g, b]
    - save_path: path to save the point cloud
    - binary: write binary PLY if True, ASCII PLY otherwise
    """
    python_types = (float, float, float, int, int, int)
    npy_types = [('x', 'f4'), ('y', 'f4'), ('z', 'f4'), ('red', 'u1'),
                 ('green', 'u1'), ('blue', 'u1')]
    if binary is True:
        # Format into a NumPy structured array
        vertices = []
        for row_idx in range(points_3d.shape[0]):
            cur_point = points_3d[row_idx]
            vertices.append(
                tuple(
                    dtype(point)
                    for dtype, point in zip(python_types, cur_point)))
        vertices_array = np.array(vertices, dtype=npy_types)
        el = PlyElement.describe(vertices_array, 'vertex')
        # Write
        PlyData([el]).write(save_path)
    else:
        x = np.squeeze(points_3d[:, 0])
        y = np.squeeze(points_3d[:, 1])
        z = np.squeeze(points_3d[:, 2])
        r = np.squeeze(points_3d[:, 3])
        g = np.squeeze(points_3d[:, 4])
        b = np.squeeze(points_3d[:, 5])
        ply_head = 'ply\n' \
                   'format ascii 1.0\n' \
                   'element vertex %d\n' \
                   'property float x\n' \
                   'property float y\n' \
                   'property float z\n' \
                   'property uchar red\n' \
                   'property uchar green\n' \
                   'property uchar blue\n' \
                   'end_header' % r.shape[0]
        # ---- Save ply data to disk
        np.savetxt(save_path, np.column_stack([x, y, z, r, g, b]),
                   fmt='%f %f %f %d %d %d', header=ply_head, comments='')
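
# Optional round-trip check (a sketch, not part of the original gist): read the
# saved PLY back with plyfile and confirm the vertex count survived the trip.
def _check_ply_vertex_count(save_path, expected_n):
    ply = PlyData.read(save_path)
    assert ply['vertex'].count == expected_n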

def unproject_omni(rgb_path, pc_path):
    depth_path = rgb_path.replace('/images/', '/depth/')[:-7] + "depth_zbuffer.png"
    point_info_path = rgb_path.replace('/images/', '/point_info/')[:-7] + "point_info.json"
    rgb_image = cv2.imread(rgb_path)[:, :, ::-1]  # BGR -> RGB
    # Convert the stored z-buffer to metric depth (1/512 scale)
    depth_map = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED).astype(np.float32) / 512.0

    # Camera intrinsics and pose
    with open(point_info_path, 'r') as f:
        omnidata = json.load(f)
    # Load R, T, K; despite its name, the function returns the world-to-camera transform
    rtk = _get_cam_to_world_R_T_K(omnidata)

    # Camera intrinsics: convert the OpenGL projection matrix to OpenCV format
    # (see the derivation sketch after this function)
    P = rtk['proj_K'].numpy()  # OpenGL-style projection matrix
    w = h = 512  # Omnidata renders are all 512x512
    fx, fy, cx, cy = P[0, 0] * w / 2, P[1, 1] * h / 2, (w - P[0, 2] * w) / 2, (P[1, 2] * h + h) / 2
    K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])

    # Camera extrinsics/pose: convert to OpenCV format
    world_to_cam_R_r = rtk['cam_to_world_R'].numpy()  # row-vector form (right multiply)
    world_to_cam_R = world_to_cam_R_r.T               # column-vector form (left multiply)
    world_to_cam_T = rtk['cam_to_world_T'].numpy()

    # Coordinate system change, i.e. pose in PyTorch3D -> pose in OpenCV
    # PyTorch3D: right-handed, x-left,  y-up,   z-forward
    # OpenCV:    right-handed, x-right, y-down, z-forward
    pytorch3d_to_opencv = np.array([[-1, 0, 0], [0, -1, 0], [0, 0, 1]])
    world_to_cam_R = pytorch3d_to_opencv @ world_to_cam_R
    world_to_cam_T = pytorch3d_to_opencv @ world_to_cam_T

    # Now R, T, K are all in OpenCV convention
    pc = project_rgb_to_point_cloud(rgb_image, depth_map, K, world_to_cam_R, world_to_cam_T)
    save_point_cloud(pc, pc_path)
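
# Why the intrinsics conversion in unproject_omni() works: a sketch of the
# derivation, assuming PyTorch3D's NDC convention (+x left, +y up, both in
# [-1, 1]) and its NDC-to-pixel mapping u = (1 - ndc_x) * w / 2.
# The projection gives ndc_x = (P[0,0] * x + P[0,2] * z) / z; substituting and
# flipping to OpenCV's x-right axis (x_cv = -x) yields
#     u = fx * x_cv / z + cx,  with  fx = P[0,0] * w / 2,  cx = (w - P[0,2] * w) / 2,
# which is exactly the OpenCV pinhole model. The y axis follows analogously
# with the y-up -> y-down flip.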

if __name__ == '__main__':
    # Read Omnidata/Taskonomy samples and write point clouds
    rgb_path_1 = '/home/dzc/workspace/stereo/gaussian-splatting/data/taskonomy/onaga/images/point_0_view_0_domain_rgb.png'
    rgb_path_2 = '/home/dzc/workspace/stereo/gaussian-splatting/data/taskonomy/onaga/images/point_0_view_1_domain_rgb.png'
    unproject_omni(rgb_path_1, 'point_cloud_1.ply')
    unproject_omni(rgb_path_2, 'point_cloud_2.ply')