Add pose estimation and refine code

cleardusk · Dec 1, 2018 · e53a741 · e53a741
1 parent 09d138b
commit e53a741
Show file tree

Hide file tree

Showing 16 changed files with 254 additions and 44 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,7 @@
 .idea/
 *.pyc
 __pycache__/
+utils/__pycache__/
 test.data/
 training/snapshot/
 training/logs/
@@ -21,5 +22,3 @@ models/shape_predictor_68_face_landmarks.dat
 
 demo_obama/
 todo.md
-
-utils/__pycacje__/
diff --git a/benchmark_aflw.py b/benchmark_aflw.py
@@ -1,11 +1,8 @@
 #!/usr/bin/env python3
 # coding: utf-8
 
-import os
 import os.path as osp
 import numpy as np
-import sys
-from glob import glob
 from math import sqrt
 from utils.io import _load
 

diff --git a/main.py b/main.py
@@ -1,5 +1,9 @@
 #!/usr/bin/env python3
 # coding: utf-8
+import sys
+
+from utils.cv_plot import plot_pose_box
+
 __author__ = 'cleardusk'
 
 """
@@ -10,8 +14,6 @@
 1. CPU optimization: https://pmchojnacki.wordpress.com/2018/10/07/slow-pytorch-cpu-performance
 """
 
-# import modules
-
 import torch
 import torchvision.transforms as transforms
 import mobilenet_v1
@@ -22,9 +24,12 @@
 import scipy.io as sio
 from utils.inference import get_suffix, calc_roi_box, crop_img, predict_68pts, dump_to_ply, dump_vertex, draw_landmarks, \
     predict_dense
+from utils.estimate_pose import parse_pose
 import argparse
 import torch.backends.cudnn as cudnn
 
+STD_SIZE = 120
+
 
 def main(args):
     # 1. load pre-tained model
@@ -51,6 +56,7 @@ def main(args):
 
     # 3. forward
     tri = sio.loadmat('visualize/tri.mat')['tri']
+    transform = transforms.Compose([ToTensorGjz(), NormalizeGjz(mean=127.5, std=128)])
     for img_fp in args.files:
         img_ori = cv2.imread(img_fp)
         if args.dlib_bbox:
@@ -67,22 +73,21 @@ def main(args):
                 rect = dlib.rectangle(l, r, t, b)
                 rects.append(rect)
 
-        pts_dlib = []
         pts_res = []
+        Ps = []  # Camera matrix collection
+        poses = []  # pose collection, [todo: validate it]
         ind = 0
         suffix = get_suffix(img_fp)
         for rect in rects:
             # landmark & crop
             pts = face_regressor(img_ori, rect).parts()
             pts = np.array([[pt.x, pt.y] for pt in pts]).T
-            pts_dlib.append(pts)
 
             roi_box = calc_roi_box(pts)
             img = crop_img(img_ori, roi_box)
 
             # forward: one step
-            img = cv2.resize(img, dsize=(120, 120), interpolation=cv2.INTER_LINEAR)
-            transform = transforms.Compose([ToTensorGjz(), NormalizeGjz(mean=127.5, std=128)])
+            img = cv2.resize(img, dsize=(STD_SIZE, STD_SIZE), interpolation=cv2.INTER_LINEAR)
             input = transform(img).unsqueeze(0)
             with torch.no_grad():
                 if args.mode == 'gpu':
@@ -97,16 +102,20 @@ def main(args):
             if args.box_init == 'two':
                 roi_box = calc_roi_box(pts68)
                 img_step2 = crop_img(img_ori, roi_box)
-                img_step2 = cv2.resize(img_step2, dsize=(120, 120), interpolation=cv2.INTER_LINEAR)
+                img_step2 = cv2.resize(img_step2, dsize=(STD_SIZE, STD_SIZE), interpolation=cv2.INTER_LINEAR)
                 input = transform(img_step2).unsqueeze(0)
                 with torch.no_grad():
                     if args.mode == 'gpu':
                         input = input.cuda()
                     param = model(input)
                     param = param.squeeze().cpu().numpy().flatten().astype(np.float32)
+
                 pts68 = predict_68pts(param, roi_box)
 
             pts_res.append(pts68)
+            P, pose = parse_pose(param)
+            Ps.append(P)
+            poses.append(pose)
 
             # dense face vertices
             if args.dump_ply or args.dump_vertex:
@@ -123,8 +132,14 @@ def main(args):
                 wfp = '{}_{}.roibox'.format(img_fp.replace(suffix, ''), ind)
                 np.savetxt(wfp, roi_box, fmt='%.3f')
                 print('Save roi box to {}'.format(wfp))
-
             ind += 1
+
+        if args.dump_pose:
+            # P, pose = parse_pose(param)  # Camera matrix (without scale), and pose (yaw, pitch, roll, to verify)
+            img_pose = plot_pose_box(img_ori, Ps, pts_res)
+            wfp = img_fp.replace(suffix, '_pose.jpg')
+            cv2.imwrite(wfp, img_pose)
+            print('Dump to {}'.format(wfp))
         if args.dump_res:
             draw_landmarks(img_ori, pts_res, wfp=img_fp.replace(suffix, '_3DDFA.jpg'), show_flg=args.show_flg)
 
@@ -137,11 +152,12 @@ def main(args):
     parser.add_argument('--show_flg', default='True', type=str2bool, help='whether show the visualization result')
     parser.add_argument('--box_init', default='one', type=str, help='one|two: one-step bbox initialization or two-step')
     parser.add_argument('--dump_res', default='true', type=str2bool, help='whether write out the visualization image')
-    parser.add_argument('--dump_vertex', default='true', type=str2bool,
+    parser.add_argument('--dump_vertex', default='false', type=str2bool,
                         help='whether write out the dense face vertices to mat')
     parser.add_argument('--dump_ply', default='true', type=str2bool)
     parser.add_argument('--dump_pts', default='true', type=str2bool)
     parser.add_argument('--dump_roi_box', default='false', type=str2bool)
+    parser.add_argument('--dump_pose', default='true', type=str2bool)
     parser.add_argument('--dlib_bbox', default='true', type=str2bool, help='whether use dlib to predict bbox')
 
     args = parser.parse_args()

diff --git a/readme.md b/readme.md
@@ -8,6 +8,7 @@
 
 **\[Updates\]**
 
+ - `2018.12.1`: Add pose estimation and refine code, see [utils/estimate_pose.py](./utils/estimate_pose.py) for more details.
  - `2018.11.17`: Refine code and map the 3d vertex to original image space.
  - `2018.11.11`: **Update end-to-end inference pipeline: infer/serialize 3D face shape and 68 landmarks given one arbitrary image, please see readme.md below for more details.**
  - `2018.11.9`: Update trained model with higher performance in [models](./models).
@@ -76,23 +77,26 @@ In addition, I strongly recommend using Python3.6+ instead of older version for
     ```
     If you see the following output log in the terminal, it ran successfully.
     ```
-    Dump to samples/test1_0.ply
-    Dump to samples/test1_0.mat
+    Dump tp samples/test1_0.ply
     Save 68 3d landmarks to samples/test1_0.txt
-    Dump to samples/test1_1.ply
-    Dump to samples/test1_1.mat
+    Dump tp samples/test1_1.ply
     Save 68 3d landmarks to samples/test1_1.txt
+    Dump to samples/test1_pose.jpg
     Save visualization result to samples/test1_3DDFA.jpg
     ```
 
     Because `test1.jpg` has two faces, there are two `mat` (stores dense face vertices, can be rendered by Matlab, see [visualize](./visualize)) and `ply` files (can be rendered by Meshlab or Microsoft 3D Builder) predicted.
 
     Please run `python3 main.py -h` or review the code for more details.
 
-    The result `samples/test1_3DDFA.jpg` is shown below
+    The 68 landmarks visualization result `samples/test1_3DDFA.jpg` and pose estimation result `samples/test1_pose.jpg` are shown below
 
 <p align="center">
-  <img src="samples/test1_3DDFA.jpg" alt="samples" width="700px">
+  <img src="samples/test1_3DDFA.jpg" alt="samples" width="650px">
+</p>
+
+<p align="center">
+  <img src="samples/test1_pose.jpg" alt="samples" width="650px">
 </p>
 
 3. Additional example
@@ -102,7 +106,11 @@ In addition, I strongly recommend using Python3.6+ instead of older version for
     ```
 
 <p align="center">
-  <img src="samples/emma_input_3DDFA.jpg" alt="samples" width="700px">
+  <img src="samples/emma_input_3DDFA.jpg" alt="samples" width="750px">
+</p>
+
+<p align="center">
+  <img src="samples/emma_input_pose.jpg" alt="samples" width="750px">
 </p>
 
 ## Citation
@@ -122,8 +130,6 @@ In addition, I strongly recommend using Python3.6+ instead of older version for
     }
 
     
-
-
 ## Inference speed
 When batch size is 128, the inference time of MobileNet-V1 takes about 34.7ms. The average speed is about **0.27ms/pic**.
 

diff --git a/samples/emma_input_3DDFA.jpg b/samples/emma_input_3DDFA.jpg
diff --git a/samples/emma_input_pose.jpg b/samples/emma_input_pose.jpg
diff --git a/samples/test1_3DDFA.jpg b/samples/test1_3DDFA.jpg
diff --git a/samples/test1_pose.jpg b/samples/test1_pose.jpg
diff --git a/utils/cv_plot.py b/utils/cv_plot.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+
+"""
+Modified from: https://sourcegraph.com/github.com/YadiraF/PRNet@master/-/blob/utils/cv_plot.py
+"""
+
+import numpy as np
+import cv2
+
+from utils.inference import calc_hypotenuse
+
+# Landmark indices after which plot_kpt draws no connecting line to the next point.
+# NOTE(review): the trailing `- 1` presumably turns 1-based landmark numbers into
+# 0-based row indices -- confirm against the landmark layout in use.
+end_list = np.array([17, 22, 27, 42, 48, 31, 36, 68], dtype=np.int32) - 1
+
+
+def plot_kpt(image, kpt):
+    ''' Draw the key points and the poly-lines joining consecutive points.
+    Args:
+        image: the input image; it is copied, the drawing happens on the copy.
+        kpt: (68, 3). One key point per row; only the first two columns are used.
+    Returns:
+        the annotated copy of the image.
+    '''
+    canvas = image.copy()
+    pts = np.round(kpt).astype(np.int32)
+    for idx, row in enumerate(pts):
+        x0, y0 = row[0], row[1]
+        # Red dot on the key point itself.
+        canvas = cv2.circle(canvas, (x0, y0), 1, (0, 0, 255), 2)
+        # Points listed in end_list are not joined to their successor.
+        if idx not in end_list:
+            nxt = pts[idx + 1, :2]
+            canvas = cv2.line(canvas, (x0, y0), (nxt[0], nxt[1]), (255, 255, 255), 1)
+    return canvas
+
+
+def build_camera_box(rear_size=90):
+    """Build the 3D poly-line of the box that is drawn to visualise head pose.
+
+    The box consists of two closed squares: a rear square at depth 0 and a
+    larger front square whose half side length and depth are both
+    ``int(4 / 3 * rear_size)``.
+
+    Args:
+        rear_size: half side length of the rear square.
+
+    Returns:
+        A (10, 3) float64 array. Rows 0-4 trace the rear square and rows 5-9
+        the front square; in each group the first corner is repeated as the
+        last row so that the outline is closed.
+    """
+    rear_depth = 0
+    front_size = int(4 / 3 * rear_size)
+    front_depth = int(4 / 3 * rear_size)
+
+    def closed_square(half, depth):
+        # Four corners followed by the first corner again.
+        return [
+            (-half, -half, depth),
+            (-half, half, depth),
+            (half, half, depth),
+            (half, -half, depth),
+            (-half, -half, depth),
+        ]
+
+    corners = closed_square(rear_size, rear_depth) + closed_square(front_size, front_depth)
+    # `np.float` (an alias of the builtin float, i.e. float64) was removed in
+    # NumPy 1.24, so the explicit dtype is used instead.
+    return np.array(corners, dtype=np.float64).reshape(-1, 3)
+
+
+def plot_pose_box(image, Ps, pts68s, color=(40, 255, 0), line_width=2):
+    ''' Draw one 3D box per face as annotation of pose. Ref:https://github.com/yinguobing/head-pose-estimation/blob/master/pose_estimator.py
+    Args:
+        image: the input image; it is copied, the drawing happens on the copy.
+        Ps: one camera matrix or a list of them, indexed in step with pts68s.
+            Each is used as a (3, 4) affine camera matrix.
+        pts68s: one landmark array or a list of them, each (2, 68) or (3, 68).
+        color: colour handed to the OpenCV drawing calls.
+        line_width: line thickness handed to the OpenCV drawing calls.
+    Returns:
+        the annotated copy of the image.
+    '''
+    canvas = image.copy()
+    faces = pts68s if isinstance(pts68s, list) else [pts68s]
+    cameras = Ps if isinstance(Ps, list) else [Ps]
+
+    for idx, landmarks in enumerate(faces):
+        # The box is sized from the landmarks of the face it annotates.
+        box_3d = build_camera_box(calc_hypotenuse(landmarks))
+        cam = cameras[idx]
+
+        # Project the homogeneous box corners (n x 4) and keep x, y only.
+        ones = np.ones([box_3d.shape[0], 1])
+        box_2d = np.hstack((box_3d, ones)).dot(cam.T)[:, :2]
+
+        # Flip y, then shift the box so that the mean of its first four points
+        # lands on the mean (x, y) of the first 27 landmarks.
+        box_2d[:, 1] = -box_2d[:, 1]
+        box_2d[:, :2] = box_2d[:, :2] - np.mean(box_2d[:4, :2], 0) + np.mean(landmarks[:2, :27], 1)
+        box_2d = np.int32(box_2d.reshape(-1, 2))
+
+        # Outline of both squares, then the edges joining rear and front corners.
+        cv2.polylines(canvas, [box_2d], True, color, line_width, cv2.LINE_AA)
+        for rear, front in ((1, 6), (2, 7), (3, 8)):
+            cv2.line(canvas, tuple(box_2d[rear]), tuple(box_2d[front]), color, line_width, cv2.LINE_AA)
+
+    return canvas
+
+
+def main():
+    """Placeholder entry point: this module has nothing to run as a script."""
+
+
+if __name__ == '__main__':
+    main()
diff --git a/utils/ddfa.py b/utils/ddfa.py
@@ -11,13 +11,14 @@
 import pickle
 import argparse
 from .io import _numpy_to_tensor, _load_cpu, _load_gpu
-from params import *
+from utils.params import *
 
 
-def reconstruct_vertex(param, whitening=True, dense=False):
+def reconstruct_vertex(param, whitening=True, dense=False, transform=True):
     """Whitening param -> 3d vertex, based on the 3dmm param: u_base, w_shp, w_exp
     dense: if True, return dense vertex, else return 68 sparse landmarks. All dense or sparse vertex is transformed to
     image coordinate space, but without alignment caused by face cropping.
+    transform: whether transform to image space
     """
     if len(param) == 12:
         param = np.concatenate((param, [0] * 50))
@@ -36,14 +37,16 @@ def reconstruct_vertex(param, whitening=True, dense=False):
     if dense:
         vertex = p @ (u + w_shp @ alpha_shp + w_exp @ alpha_exp).reshape(3, -1, order='F') + offset
 
-        # transform to image coordinate space
-        vertex[1, :] = std_size + 1 - vertex[1, :]
+        if transform:
+            # transform to image coordinate space
+            vertex[1, :] = std_size + 1 - vertex[1, :]
     else:
         """For 68 pts"""
         vertex = p @ (u_base + w_shp_base @ alpha_shp + w_exp_base @ alpha_exp).reshape(3, -1, order='F') + offset
 
-        # transform to image coordinate space
-        vertex[1, :] = std_size + 1 - vertex[1, :]
+        if transform:
+            # transform to image coordinate space
+            vertex[1, :] = std_size + 1 - vertex[1, :]
 
     return vertex