forked from behretj/3D-Scene-Understanding
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathobject_detection.py
281 lines (219 loc) · 14 KB
/
object_detection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
import numpy as np
import pickle, glob, os
from projectaria_tools.core import data_provider, calibration
from projectaria_tools.core.mps.utils import get_nearest_wrist_and_palm_pose, get_nearest_pose
import projectaria_tools.core.mps as mps
### all functions here are not really needed anymore
def get_all_aria_hand_poses(scan_dir, mode_left_right=False):
""" Returns all Aria MPS poses for left and right hand """
vrs_files = glob.glob(os.path.join(scan_dir, '*.vrs'))
assert vrs_files is not None, "No vrs files found in directory"
vrs_file = vrs_files[0]
filename = os.path.splitext(os.path.basename(vrs_file))[0]
closed_loop_path = scan_dir + "/mps_" + filename + "_vrs/slam/closed_loop_trajectory.csv"
closed_loop_traj = mps.read_closed_loop_trajectory(closed_loop_path)
wrist_and_palm_poses_path = scan_dir + "/mps_" + filename + "_vrs/hand_tracking/wrist_and_palm_poses.csv"
wrist_and_palm_poses = mps.hand_tracking.read_wrist_and_palm_poses(wrist_and_palm_poses_path)
hand_poses_left, hand_poses_right, T_poses = [], [], []
both_count, i = 0, 0
for hand_pose in wrist_and_palm_poses:
query_timestamp = hand_pose.tracking_timestamp.total_seconds()*1e9
device_pose = get_nearest_pose(closed_loop_traj, query_timestamp)
if device_pose:
T_world_device = device_pose.transform_world_device.to_matrix()
if hand_pose.left_hand.confidence > 0.0:
left_palm_position_device = hand_pose.left_hand.palm_position_device
left_palm_position_world = np.dot(T_world_device, np.append(left_palm_position_device, 1))[:3]
hand_poses_left.append(left_palm_position_world)
T_poses.append(T_world_device)
if hand_pose.right_hand.confidence > 0.0:
right_palm_position_device = hand_pose.right_hand.palm_position_device
right_palm_position_world = np.dot(T_world_device, np.append(right_palm_position_device, 1))[:3]
hand_poses_right.append(right_palm_position_world) if mode_left_right else hand_poses_left.append(right_palm_position_world)
T_poses.append(T_world_device)
return (np.array(hand_poses_left), np.array(hand_poses_right), T_poses) if mode_left_right else (np.array(hand_poses_left), T_poses)
def get_hand_object_interactions(scan_dir, mode_left_right=False):
""" Returns nearest Aria MPS poses, separated for left and right hand, for all the interactions between
hand and object. Hand-object interactions are filtered and in the end only the interactions are
returned where the hand is in contact with the object, the object is movable, the hand and object
are detected with high confidence and all poses lie within 100ms of each other """
vrs_files = glob.glob(os.path.join(scan_dir, '*.vrs'))
assert vrs_files is not None, "No vrs files found in directory"
vrs_file = vrs_files[0]
filename = os.path.splitext(os.path.basename(vrs_file))[0]
provider = data_provider.create_vrs_data_provider(vrs_file)
assert provider is not None, "Cannot open file"
camera_label = "camera-rgb"
stream_id = provider.get_stream_id_from_label(camera_label)
detection_files = glob.glob(os.path.join(scan_dir, '*.pickle'))
assert detection_files is not None, "No detection files found in directory"
detection_path = detection_files[0]
with open(detection_path, "rb") as f:
detection_results = pickle.load(f)
wrist_and_palm_poses_path = scan_dir + "/mps_" + filename + "_vrs/hand_tracking/wrist_and_palm_poses.csv"
wrist_and_palm_poses = mps.hand_tracking.read_wrist_and_palm_poses(wrist_and_palm_poses_path)
closed_loop_path = scan_dir + "/mps_" + filename + "_vrs/slam/closed_loop_trajectory.csv"
closed_loop_traj = mps.read_closed_loop_trajectory(closed_loop_path)
hand_poses_left, hand_poses_right, T_world_devices = [], [], []
for index in range(0, provider.get_num_data(stream_id)):
name_curr = f"frame_{index:05}.jpg"
image_info = detection_results[name_curr]
hand_dets = image_info['hand_dets']
obj_dets = image_info['obj_dets']
if (obj_dets is not None) and (hand_dets is not None):
correct_state = False
for i in range(hand_dets.shape[0]):
hand_bbox = list(int(np.round(x)) for x in hand_dets[i, :4])
hand_score = hand_dets[i, 4]
hand_state = hand_dets[i, 5]
if hand_score > 0.8 and hand_state == 3: # hand is in contact
correct_state = True
break
# hand_vec = hand_dets[i, 6:9]
# hand_lr = hand_dets[i, -1]
if not correct_state:
continue
found_object = False
for i in range(obj_dets.shape[0]):
# coordinates are a bit off (very strange)
# obj_bbox = list(int(np.round(x)) for x in obj_dets[i, :4])
obj_score = obj_dets[i, 4]
if obj_score > 0.9:
found_object = True
break
if not found_object:
continue
# get timestamp of current image
query_timestamp = provider.get_image_data_by_index(stream_id, index)[1].capture_timestamp_ns
device_pose = get_nearest_pose(closed_loop_traj, query_timestamp)
if device_pose is None:
continue
if abs(device_pose.tracking_timestamp.total_seconds()*1e9 - query_timestamp) > 1e8:
continue
T_world_device = device_pose.transform_world_device.to_matrix()
wrist_and_palm_pose = get_nearest_wrist_and_palm_pose(wrist_and_palm_poses, query_timestamp)
if wrist_and_palm_pose is None:
continue
found_object = False
if wrist_and_palm_pose.left_hand.confidence > 0.5 and abs(wrist_and_palm_pose.tracking_timestamp.total_seconds()*1e9 - query_timestamp) < 1e8:
left_palm_position_device = wrist_and_palm_pose.left_hand.palm_position_device
left_palm_position_world = np.dot(T_world_device, np.append(left_palm_position_device, 1))[:3]
hand_poses_left.append(left_palm_position_world)
T_world_devices.append(T_world_device)
found_object = True
if wrist_and_palm_pose.right_hand.confidence > 0.5 and abs(wrist_and_palm_pose.tracking_timestamp.total_seconds()*1e9 - query_timestamp) < 1e8:
right_palm_position_device = wrist_and_palm_pose.right_hand.palm_position_device
right_palm_position_world = np.dot(T_world_device, np.append(right_palm_position_device, 1))[:3]
hand_poses_right.append(right_palm_position_world) if mode_left_right else hand_poses_left.append(right_palm_position_world)
if not found_object:
T_world_devices.append(T_world_device)
return (np.array(hand_poses_left), np.array(hand_poses_right), np.array(T_world_devices)) if mode_left_right else (np.array(hand_poses_left), np.array(T_world_devices))
def get_all_object_detections(scan_dir):
""" Returns all object detections with corresponding hand poses """
vrs_files = glob.glob(os.path.join(scan_dir, '*.vrs'))
assert vrs_files is not None, "No vrs files found in directory"
vrs_file = vrs_files[0]
filename = os.path.splitext(os.path.basename(vrs_file))[0]
provider = data_provider.create_vrs_data_provider(vrs_file)
assert provider is not None, "Cannot open file"
camera_label = "camera-rgb"
stream_id = provider.get_stream_id_from_label(camera_label)
calib = provider.get_device_calibration().get_camera_calib(camera_label)
w, h = calib.get_image_size()
pinhole = calibration.get_linear_camera_calibration(w, h, calib.get_focal_lengths()[0])
detection_files = glob.glob(os.path.join(scan_dir, '*.pickle'))
assert detection_files is not None, "No detection files found in directory"
detection_path = detection_files[0]
with open(detection_path, "rb") as f:
detection_results = pickle.load(f)
closed_loop_path = scan_dir + "/mps_" + filename + "_vrs/slam/closed_loop_trajectory.csv"
closed_loop_traj = mps.read_closed_loop_trajectory(closed_loop_path)
hands_left, objects, hands_right = [], [], []
for i in range(0, provider.get_num_data(stream_id)):
name_curr = f"frame_{i:05}.jpg"
if detection_results[name_curr]['obj_dets'] is not None and detection_results[name_curr]['hand_dets'] is not None:
query_timestamp = provider.get_image_data_by_index(stream_id, i)[1].capture_timestamp_ns
img = provider.get_image_data_by_index(stream_id, i)[0].to_numpy_array()
pose = get_nearest_pose(closed_loop_traj, query_timestamp)
if pose is None:
continue
T_world_device = pose.transform_world_device
T_device_camera = calib.get_transform_device_camera()
T_world_camera = T_world_device @ T_device_camera
T_world_camera = T_world_camera.to_matrix()
obj = detection_results[name_curr]['obj_dets'][0]
T_world_device = T_world_device.to_matrix()
# pixel coordinates of object
x, y = (obj[0] + obj[2])/2, (obj[1]+obj[3])/2
# rotate pixel coordinate (camera model is for rotated image)
obj_pos = y, w-x-1
obj_pos = calib.unproject(obj_pos)
# default value for z (depth)
obj_pos[2] = 0.3
objects.append(np.dot(T_world_device, np.append(obj_pos, 1))[:3])
for detection in detection_results[name_curr]['hand_dets']:
x, y = (detection[0]+detection[2])/2, (detection[1]+detection[3])/2
obj_pos = y, w-x-1
obj_pos = calib.unproject_no_checks(obj_pos)
obj_pos[2] = 0.3
if detection[-1] == 0:
hands_left.append(np.dot(T_world_device, np.append(obj_pos, 1))[:3])
else:
hands_right.append(np.dot(T_world_device, np.append(obj_pos, 1))[:3])
return hands_left, objects, hands_right
def get_first_detection(scan_dir):
vrs_files = glob.glob(os.path.join(scan_dir, '*.vrs'))
assert vrs_files is not None, "No vrs files found in directory"
vrs_file = vrs_files[0]
filename = os.path.splitext(os.path.basename(vrs_file))[0]
provider = data_provider.create_vrs_data_provider(vrs_file)
assert provider is not None, "Cannot open file"
camera_label = "camera-rgb"
stream_id = provider.get_stream_id_from_label(camera_label)
detection_files = glob.glob(os.path.join(scan_dir, '*.pickle'))
assert detection_files is not None, "No detection files found in directory"
detection_path = detection_files[0]
with open(detection_path, "rb") as f:
detection_results = pickle.load(f)
wrist_and_palm_poses_path = scan_dir + "/mps_" + filename + "_vrs/hand_tracking/wrist_and_palm_poses.csv"
wrist_and_palm_poses = mps.hand_tracking.read_wrist_and_palm_poses(wrist_and_palm_poses_path)
# valid_poses = [wrist_and_palm_poses[0].tracking_timestamp.total_seconds()*1e9, wrist_and_palm_poses[-1].tracking_timestamp.total_seconds()*1e9]
closed_loop_path = scan_dir + "/mps_" + filename + "_vrs/slam/closed_loop_trajectory.csv"
closed_loop_traj = mps.read_closed_loop_trajectory(closed_loop_path)
for i in range(1, provider.get_num_data(stream_id)-1):
name_prev = f"frame_{i-1:05}.jpg"
name_curr = f"frame_{i:05}.jpg"
name_next = f"frame_{i+1:05}.jpg"
# very simple heuristic: if current, previous and next image have object detections -> found detection
if detection_results[name_prev] and detection_results[name_curr] and detection_results[name_next]:
if detection_results[name_prev]['obj_dets'] is not None and \
detection_results[name_curr]['obj_dets'] is not None and \
detection_results[name_next]['obj_dets'] is not None:
print(f"Found detection at frame {i}")
# get timestamp of current image
image_data = provider.get_image_data_by_index(stream_id, i)
query_timestamp = image_data[1].capture_timestamp_ns
# TODO: check whether hand an device pose are close enough to timestamp
# get hand pose at this timestamp
wrist_and_palm_pose = get_nearest_wrist_and_palm_pose(wrist_and_palm_poses, query_timestamp)
if wrist_and_palm_pose is None:
print("No wrist and palm pose found")
continue
device_pose = get_nearest_pose(closed_loop_traj, query_timestamp)
if device_pose is None:
print("No device pose found")
continue
T_world_device = device_pose.transform_world_device.to_matrix()
# check whether left or right palm was detected with sufficient confidence
left_palm_position_world = None
if wrist_and_palm_pose.left_hand.confidence > 0.8:
left_palm_position_device = wrist_and_palm_pose.left_hand.palm_position_device
left_palm_position_world = np.dot(T_world_device, np.append(left_palm_position_device, 1))[:3]
right_palm_position_world = None
if wrist_and_palm_pose.right_hand.confidence > 0.8:
right_palm_position_device = wrist_and_palm_pose.right_hand.palm_position_device
right_palm_position_world = np.dot(T_world_device, np.append(right_palm_position_device, 1))[:3]
if right_palm_position_world is None and left_palm_position_world is None:
print("No palm position found")
continue
return left_palm_position_world, right_palm_position_world