making camera work again

Wlad 2021-02-26 12:50:38 +01:00
parent eb1b5876a8
commit 36cd2c6648
13 changed files with 236 additions and 40 deletions

.gitignore

@@ -101,4 +101,6 @@ vposer_v1_0
 results/
 output/
 tests/
-samples/video*
+samples/
+raw/
+presentation/

@@ -19,8 +19,8 @@ camera:
   patience: 10
   optimizer: Adam
 orientation:
-  lr: 0.03
-  optimizer: LBFGS
+  lr: 0.5
+  optimizer: Adam
   iterations: 5
   joint_names: ["hip-left", "hip-right", "shoulder-left", "shoulder-right"] # joints to be used for optimization
 pose:

example_fit_camera.py (new file)

@@ -0,0 +1,84 @@
import pickle
import time

from train import create_animation
from dataset import SMPLyDataset
from model import *
from utils.general import *
from renderer import *
from utils.general import rename_files, get_new_filename

START_IDX = 1  # starting index of the frame to optimize for
FINISH_IDX = 60  # choose a big number to optimize for all frames in samples directory

# if False, only run already saved animation without optimization
RUN_OPTIMIZATION = True

result_image = []
idx = START_IDX
device = torch.device('cpu')
dtype = torch.float32

config = load_config()
dataset = SMPLyDataset.from_config(config)
model = SMPLyModel.model_from_conf(config)

# Rename files in samples directory to a uniform format
if config['data']['renameFiles']:
    rename_files(config['data']['rootDir'] + "/")

'''
Optimization part without visualization
'''
if RUN_OPTIMIZATION:
    model_outs, filename = create_animation(
        dataset,
        config,
        START_IDX,
        FINISH_IDX,
        verbose=False,
        offscreen=True,
        save_to_file=True,
        interpolate=False
    )


def replay_animation(file, start_frame=0, end_frame=None, with_background=False, fps=30, interpolated=False):
    r = Renderer()
    r.start()

    model_anim = SMPLyModel.model_from_conf(config)

    with open(file, "rb") as fp:
        results = pickle.load(fp)

    if end_frame is None:
        end_frame = len(results)

    for model, camera_transform in results[start_frame:end_frame]:
        if interpolated:
            vertices = model
        else:
            vertices = model.vertices

        r.render_model_geometry(
            faces=model_anim.faces,
            vertices=vertices,
            pose=camera_transform
        )
        time.sleep(1 / fps)


'''
Play the animation.
'''
if RUN_OPTIMIZATION:
    anim_file = filename
else:
    results_dir = config['output']['rootDir']
    result_prefix = config['output']['prefix']
    anim_file = results_dir + result_prefix + "0.pkl"

replay_animation(anim_file, interpolated=True)
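Assuming the SMPL model files and OpenPose keypoints referenced by the config are in place, the script should run from the repository root via: python example_fit_camera.py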

@@ -9,7 +9,7 @@ from renderer import *
 from utils.general import rename_files, get_new_filename

 START_IDX = 150  # starting index of the frame to optimize for
-FINISH_IDX = 300  # choose a big number to optimize for all frames in samples directory
+FINISH_IDX = 400  # choose a big number to optimize for all frames in samples directory

 result_image = []
 idx = START_IDX

examples/__init__.py (new file, empty)

modules/perspective_cam.py (new file)

@@ -0,0 +1,84 @@
from collections import namedtuple

import torch
import torch.nn as nn

from smplx.lbs import transform_mat


class PerspectiveCamera(nn.Module):
    FOCAL_LENGTH = 5000

    def __init__(self, rotation=None, translation=None,
                 focal_length_x=None, focal_length_y=None,
                 batch_size=1,
                 center=None, dtype=torch.float32, **kwargs):
        super(PerspectiveCamera, self).__init__()
        self.batch_size = batch_size
        self.dtype = dtype

        # Make a buffer so that PyTorch does not complain when creating
        # the camera matrix
        self.register_buffer('zero',
                             torch.zeros([batch_size], dtype=dtype))

        if focal_length_x is None or type(focal_length_x) == float:
            focal_length_x = torch.full(
                [batch_size],
                self.FOCAL_LENGTH if focal_length_x is None else
                focal_length_x,
                dtype=dtype)

        if focal_length_y is None or type(focal_length_y) == float:
            focal_length_y = torch.full(
                [batch_size],
                self.FOCAL_LENGTH if focal_length_y is None else
                focal_length_y,
                dtype=dtype)

        self.register_buffer('focal_length_x', focal_length_x)
        self.register_buffer('focal_length_y', focal_length_y)

        if center is None:
            center = torch.zeros([batch_size, 2], dtype=dtype)
        self.register_buffer('center', center)

        if rotation is None:
            rotation = torch.eye(
                3, dtype=dtype).unsqueeze(dim=0).repeat(batch_size, 1, 1)
        rotation = nn.Parameter(rotation, requires_grad=True)
        self.register_parameter('rotation', rotation)

        if translation is None:
            translation = torch.zeros([batch_size, 3], dtype=dtype)
        translation = nn.Parameter(translation,
                                   requires_grad=True)
        self.register_parameter('translation', translation)

    def forward(self, points):
        device = points.device

        with torch.no_grad():
            camera_mat = torch.zeros([self.batch_size, 2, 2],
                                     dtype=self.dtype, device=points.device)
            camera_mat[:, 0, 0] = self.focal_length_x
            camera_mat[:, 1, 1] = self.focal_length_y

        camera_transform = transform_mat(self.rotation,
                                         self.translation.unsqueeze(dim=-1))
        homog_coord = torch.ones(list(points.shape)[:-1] + [1],
                                 dtype=points.dtype,
                                 device=device)
        # Convert the points to homogeneous coordinates
        points_h = torch.cat([points, homog_coord], dim=-1)

        projected_points = torch.einsum('bki,bji->bjk',
                                        [camera_transform, points_h])

        img_points = torch.div(projected_points[:, :, :2],
                               projected_points[:, :, 2].unsqueeze(dim=-1))
        img_points = torch.einsum('bki,bji->bjk', [camera_mat, img_points]) \
            + self.center.unsqueeze(dim=1)
        return img_points
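A minimal usage sketch for this module (not part of the commit; the principal point and focal length mirror the values hard-coded in train_orient below, and the shapes follow from forward):

import torch
from modules.perspective_cam import PerspectiveCamera

cam = PerspectiveCamera(
    center=torch.tensor([[1920 / 2, 1080 / 2]]),  # principal point at the image center
    focal_length_x=850.0,
    focal_length_y=850.0,
)
joints = torch.rand([1, 25, 3])  # dummy batch of 25 3D joints
pixels = cam(joints)             # -> [1, 25, 2] 2D pixel coordinates
print(pixels.shape)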

@@ -60,12 +60,12 @@ def optimize_sample(sample_index, dataset, config, device=torch.device('cpu'), d
     device = torch.device('cpu')

     # get camera estimation
-    pose_camera, cam_trans, cam_int, cam_params = SimpleCamera.from_estimation_cam(
-        cam=camera,
-        use_intrinsics=config['pose']['useCameraIntrinsics'],
-        dtype=dtype,
-        device=device,
-    )
+    # pose_camera, cam_trans, cam_int, cam_params = SimpleCamera.from_estimation_cam(
+    #     cam=camera,
+    #     use_intrinsics=config['pose']['useCameraIntrinsics'],
+    #     dtype=dtype,
+    #     device=device,
+    # )

     params = defaultdict(
         body_pose=initial_pose,
@@ -74,15 +74,17 @@ def optimize_sample(sample_index, dataset, config, device=torch.device('cpu'), d
     with torch.no_grad():
         model(**params)

+    r.start()
+
     # apply transform to scene
-    if r is not None:
-        r.set_group_pose("body", cam_trans.cpu().numpy())
+    # if r is not None:
+    #     r.set_group_pose("body", cam_trans.cpu().numpy())

     global_orient = train_orient_with_conf(
         config=config,
         model=model,
         keypoints=keypoints,
-        camera_layer=pose_camera,
+        camera_layer=None,  # pose_camera,
         renderer=r,
         device=device,
         use_progress_bar=verbose,

train_camera.py (new file, empty)

@@ -1,4 +1,5 @@
-from utils.mapping import get_indices_by_name
+from modules.perspective_cam import PerspectiveCamera
+from utils.mapping import get_indices_by_name, opengl_to_screen_space
 from modules.distance_loss import WeightedMSELoss
 from modules.utils import get_loss_layers
 from camera_estimation import TorchCameraEstimate
@@ -52,10 +53,18 @@ def train_orient(
     )

     # make sure camera module is on the correct device
-    camera = camera.to(device=device, dtype=dtype)
+    # camera = camera.to(device=device, dtype=dtype)
+    pers_cam = PerspectiveCamera(
+        dtype=dtype, device=device,
+        center=torch.tensor([[1920 / 2, 1080 / 2]], dtype=dtype),
+        focal_length_x=850.0,
+        focal_length_y=850.0
+    ).to(device=device)

     # setup keypoint data
     keypoints = torch.tensor(keypoints).to(device=device, dtype=dtype)
+    # keypoints = opengl_to_screen_space(keypoints, (1920, 1080))
+    # do some janky conversion back to pixel :(

     # torso indices
     torso_indices = get_indices_by_name(joint_names)
@@ -67,7 +76,7 @@ def train_orient(
     pose_layer = BodyPose(model, dtype=dtype, device=device,
                           useBodyMeanAngles=False).to(device=device, dtype=dtype)

-    parameters = [model.global_orient]
+    parameters = [model.global_orient, pers_cam.rotation, pers_cam.translation]

     if use_progress_bar:
         pbar = tqdm(total=iterations)
@@ -85,19 +94,23 @@ def train_orient(
     optimizer = optimizer(parameters, learning_rate)

+    print(keypoints[0][0])
+    body_joints, cur_pose = pose_layer()
+    body_joints = opengl_to_screen_space(body_joints.clone(), (1080, 1080))
+    print(body_joints[0][0])
+
     # prediction and loss computation closure
     def predict():
         # return joints based on current model state
         body_joints, cur_pose = pose_layer()
+        # body_joints = opengl_to_screen_space(body_joints.clone(), (1920, 1080))

         # compute homogeneous coordinates and project them to 2D space
-        points = tgm.convert_points_to_homogeneous(body_joints)
-        points = camera(points).squeeze()
+        # points = tgm.convert_points_to_homogeneous(body_joints)
+        points = pers_cam(body_joints).squeeze()
+        print(points[0][0])

         # compute loss between 2D joint projection and OpenPose keypoints
         loss = loss_layer(points[torso_indices],
-                          keypoints[torso_indices])
+                          keypoints[torso_indices][:, :2])
         return loss

     # main optimizer closure
@@ -112,10 +125,10 @@ def train_orient(
         return loss

     # camera translation
-    R = camera.trans.detach().cpu().numpy().squeeze()
+    # R = camera.trans.detach().cpu().numpy().squeeze()

     # main optimization loop
-    for t in range(iterations):
+    for t in range(2000):
         loss = optimizer.step(optim_closure)

         # compute loss
@@ -136,12 +149,14 @@ def train_orient(
             pbar.set_description("Error %f" % cur_loss)
             pbar.update(1)

-        if renderer is not None and render_steps:
-            renderer.render_model(
-                model=model,
-                model_out=pose_layer.cur_out,
-                transform=R
-            )
+        # if renderer is not None and render_steps:
+        #     renderer.render_model(
+        #         model=model,
+        #         model_out=pose_layer.cur_out,
+        #         transform=R
+        #     )
+
+    print("translation", pers_cam.translation)

     if use_progress_bar:
         pbar.close()

@@ -106,7 +106,7 @@ def estimate_scale(joints, keypoints, pairs=[
     smpl_height = np.linalg.norm(smpl_dists, axis=0).mean()
     ops_height = np.linalg.norm(ops_dists, axis=0).mean()

-    return cam_fy / 1080 * smpl_height / ops_height
+    return smpl_height / ops_height


 def estimate_focal_length(run_estimation: bool = False):
@@ -192,7 +192,7 @@ def setup_training(model, dataset, sample_index, renderer=True, offscreen=False)
     est_scale = estimate_scale(joints, keypoints)

     # apply scaling to keypoints
-    keypoints = keypoints * est_scale
+    keypoints = keypoints  # * est_scale

     # integrating Camera Estimation

utils/mapping.py
@@ -278,13 +278,15 @@ def openpose_to_opengl_coords(
         [type]: [description]
     """
-    points = np.array([
-        [
-            x / real_width * 2 - 1,
-            -y / real_height * 2 + 1,
-            0
-        ] for (x, y, z) in input_data])
+    # points = np.array([
+    #     [
+    #         x / real_width * 2 - 1,
+    #         -y / real_height * 2 + 1,
+    #         0
+    #     ] for (x, y, z) in input_data])
+    points = np.array(input_data)[:, :3]
+    points[:, 2] = 0

     conf = np.array([
         z for (_, _, z) in input_data
     ])
@@ -292,6 +294,13 @@ def openpose_to_opengl_coords(
     return (points, conf)


+def opengl_to_screen_space(points, size):
+    points[:, 0] = (points[:, 0] + 1) / 2 * size[0]
+    points[:, 1] = -((points[:, 1] - 1) / 2 * size[1])
+    points[:, 2] = ((points[:, 2] - 1) / 2 * size[1])
+    return points
+
+
 def smpl_to_openpose(print_mapping: True):
     """Utility for remapping smpl mapping indices to openpose mapping indices.

@@ -93,7 +93,7 @@ def save_to_video(
         r.render_model_geometry(
             faces=model_anim.faces,
             vertices=vertices,
-            pose=cam_trans  # cam_transform,
+            pose=cam_transform
         )

         frames.append(r.get_snapshot())