diff --git a/Dockerfile b/Dockerfile index ecb2680b..bdeebfcc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -83,7 +83,6 @@ ARG CUPY_NVCC_GENERATE_CODE RUN if [[ -z ${CUPY_NVCC_GENERATE_CODE} ]]; then \ echo "CUPY_NVCC_GENERATE_CODE not set, building CuPy for all architectures (slower)"; \ fi && \ - pip install --no-cache-dir cython && \ if [[ ${TRT_IMAGE_VERSION} == 21.05 ]]; then \ CUPY_NUM_BUILD_JOBS=$(nproc) pip install --no-cache-dir -r <(grep -ivE "tensorflow" requirements.txt); \ else \ diff --git a/app.py b/app.py index 551c48f9..edadab58 100755 --- a/app.py +++ b/app.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from pathlib import Path +from types import SimpleNamespace import argparse import logging import json @@ -39,27 +40,27 @@ def main(): # load config file with open(args.config) as cfg_file: - config = json.load(cfg_file, cls=ConfigDecoder) + config = json.load(cfg_file, cls=ConfigDecoder, object_hook=lambda d: SimpleNamespace(**d)) + + stream = fastmot.VideoIO(config.resize_to, args.input_uri, args.output_uri, **vars(config.stream_cfg)) mot = None log = None - stream = fastmot.VideoIO(config['resize_to'], config['video_io'], args.input_uri, args.output_uri) - if args.mot: draw = args.gui or args.output_uri is not None - mot = fastmot.MOT(config['resize_to'], config['mot'], draw=draw, verbose=args.verbose) + mot = fastmot.MOT(config.resize_to, **vars(config.mot_cfg), draw=draw) mot.reset(stream.cap_dt) if args.log is not None: Path(args.log).parent.mkdir(parents=True, exist_ok=True) log = open(args.log, 'w') if args.gui: - cv2.namedWindow("Video", cv2.WINDOW_AUTOSIZE) + cv2.namedWindow('Video', cv2.WINDOW_AUTOSIZE) logger.info('Starting video capture...') stream.start_capture() try: with Profiler('app') as prof: - while not args.gui or cv2.getWindowProperty("Video", 0) >= 0: + while not args.gui or cv2.getWindowProperty('Video', 0) >= 0: frame = stream.read() if frame is None: break @@ -67,9 +68,9 @@ def main(): if args.mot: mot.step(frame) if log is not None: - for track in mot.visible_tracks: - tl = track.tlbr[:2] / config['resize_to'] * stream.resolution - br = track.tlbr[2:] / config['resize_to'] * stream.resolution + for track in mot.visible_tracks(): + tl = track.tlbr[:2] / config.resize_to * stream.resolution + br = track.tlbr[2:] / config.resize_to * stream.resolution w, h = br - tl + 1 log.write(f'{mot.frame_count},{track.trk_id},{tl[0]:.6f},{tl[1]:.6f},' f'{w:.6f},{h:.6f},-1,-1,-1\n') diff --git a/cfg/mot.json b/cfg/mot.json index f2008a63..c6029e40 100644 --- a/cfg/mot.json +++ b/cfg/mot.json @@ -1,78 +1,80 @@ { "resize_to": [1280, 720], - "video_io": { + "stream_cfg": { "resolution": [1920, 1080], "frame_rate": 30, "buffer_size": 10 }, - "mot": { + "mot_cfg": { "detector_type": "YOLO", "detector_frame_skip": 5, - "ssd_detector": { + "ssd_detector_cfg": { "model": "SSDInceptionV2", "class_ids": [1], "tile_overlap": 0.25, "tiling_grid": [4, 2], "conf_thresh": 0.5, - "max_area": 130000, - "merge_thresh": 0.6 + "merge_thresh": 0.6, + "max_area": 120000 }, - "yolo_detector": { + "yolo_detector_cfg": { "model": "YOLOv4", "class_ids": [1], "conf_thresh": 0.25, + "nms_thresh": 0.5, "max_area": 800000, - "nms_thresh": 0.5 + "min_aspect_ratio": 1.2 }, - "public_detector": { - "sequence": "eval/data/MOT20-03", + "public_detector_cfg": { + "sequence_path": "MOT20/train/MOT20-01", "conf_thresh": 0.5, "max_area": 800000 }, - "feature_extractor": { + "feature_extractor_cfg": { "model": "OSNet025", "batch_size": 16 }, - "multi_tracker": { + "tracker_cfg": { "max_age": 6, 
"age_penalty": 2, - "age_weight": 0.1, - "motion_weight": 0.02, - "max_feat_cost": 0.9, + "motion_weight": 0.2, + "max_assoc_cost": 0.8, "max_reid_cost": 0.6, "iou_thresh": 0.4, - "duplicate_iou": 0.8, + "duplicate_thresh": 0.8, + "occlusion_thresh": 0.7, "conf_thresh": 0.5, - "lost_buf_size": 50, + "confirm_hits": 1, + "history_size": 50, - "kalman_filter": { + "kalman_filter_cfg": { "std_factor_acc": 2.25, "std_offset_acc": 78.5, "std_factor_det": [0.08, 0.08], - "std_factor_flow": [0.14, 0.14], + "std_factor_klt": [0.14, 0.14], "min_std_det": [4.0, 4.0], - "min_std_flow": [5.0, 5.0], + "min_std_klt": [5.0, 5.0], "init_pos_weight": 5, - "init_vel_weight": 15, + "init_vel_weight": 12, "vel_coupling": 0.6, "vel_half_life": 2 }, - "flow": { + "flow_cfg": { "bg_feat_scale_factor": [0.1, 0.1], "opt_flow_scale_factor": [0.5, 0.5], - "feature_density": 0.005, + "feat_density": 0.005, "feat_dist_factor": 0.06, "ransac_max_iter": 500, "ransac_conf": 0.99, "max_error": 100, "inlier_thresh": 4, "bg_feat_thresh": 10, - "target_feat_params": { + "obj_feat_params": { "maxCorners": 1000, "qualityLevel": 0.06, "blockSize": 3 @@ -83,6 +85,15 @@ "criteria": [3, 10, 0.03] } } + }, + + "visualizer_cfg": { + "draw_detections": false, + "draw_confidence": false, + "draw_covariance": false, + "draw_klt": false, + "draw_obj_flow": false, + "draw_bg_flow": false } } } diff --git a/fastmot/detector.py b/fastmot/detector.py index 3868e042..a2d98712 100644 --- a/fastmot/detector.py +++ b/fastmot/detector.py @@ -10,8 +10,8 @@ from . import models from .utils import TRTInference -from .utils.rect import as_rect, to_tlbr, get_size, area -from .utils.rect import union, crop, multi_crop, iom, diou_nms +from .utils.rect import as_tlbr, aspect_ratio, to_tlbr, get_size, area +from .utils.rect import enclosing, multi_crop, iom, diou_nms DET_DTYPE = np.dtype( @@ -28,50 +28,83 @@ def __init__(self, size): self.size = size def __call__(self, frame): + """Detect objects synchronously.""" self.detect_async(frame) return self.postprocess() @abc.abstractmethod def detect_async(self, frame): - """ - Asynchronous detection. - """ raise NotImplementedError @abc.abstractmethod def postprocess(self): - """ - Synchronizes, applies postprocessing, and returns a record array - of detections (DET_DTYPE). - This function should be called after `detect_async`. - """ raise NotImplementedError class SSDDetector(Detector): - def __init__(self, size, config): + def __init__(self, size, + model='SSDInceptionV2', + class_ids=None, + tile_overlap=0.25, + tiling_grid=(4, 2), + conf_thresh=0.5, + merge_thresh=0.6, + max_area=120000): + """An object detector for SSD models. + + Parameters + ---------- + size : tuple + Width and height of each frame. + model : str, optional + SSD model to use. + Must be the name of a class that inherits `models.SSD`. + class_ids : tuple, optional + Class IDs to detect. + tile_overlap : float, optional + Ratio of overlap to width and height of each tile. + tiling_grid : tuple, optional + Width and height of tile layout to split each frame for batch inference. + conf_thresh : float, optional + Detection confidence threshold. + merge_thresh : float, optional + Overlap threshold to merge bounding boxes across tiles. + max_area : int, optional + Max area of bounding boxes to detect. 
+ """ super().__init__(size) - self.label_mask = np.zeros(len(models.LABEL_MAP), dtype=bool) - self.label_mask[list(config['class_ids'])] = True - - self.model = getattr(models, config['model']) - self.tile_overlap = config['tile_overlap'] - self.tiling_grid = config['tiling_grid'] - self.conf_thresh = config['conf_thresh'] - self.max_area = config['max_area'] - self.merge_thresh = config['merge_thresh'] + self.model = models.SSD.get_model(model) + assert 0 <= tile_overlap <= 1 + self.tile_overlap = tile_overlap + assert tiling_grid[0] >= 1 and tiling_grid[1] >= 1 + self.tiling_grid = tiling_grid + assert 0 <= conf_thresh <= 1 + self.conf_thresh = conf_thresh + assert 0 <= merge_thresh <= 1 + self.merge_thresh = merge_thresh + assert max_area >= 0 + self.max_area = max_area + + class_ids = [] if class_ids is None else list(class_ids) + self.label_mask = np.zeros(len(models.LABEL_MAP), dtype=np.bool_) + self.label_mask[class_ids] = True self.batch_size = int(np.prod(self.tiling_grid)) - self.tiles, self.tiling_region_size = self._generate_tiles() - self.scale_factor = np.asarray(self.size) / self.tiling_region_size + self.tiles, self.tiling_region_sz = self._generate_tiles() + self.scale_factor = np.array(self.size) / self.tiling_region_sz self.backend = TRTInference(self.model, self.batch_size) self.inp_handle = self.backend.input.host.reshape(self.batch_size, *self.model.INPUT_SHAPE) def detect_async(self, frame): + """Detects objects asynchronously.""" self._preprocess(frame) self.backend.infer_async() def postprocess(self): + """Synchronizes, applies postprocessing, and returns a record array + of detections (DET_DTYPE). + This function should be called after `detect_async`. + """ det_out = self.backend.synchronize()[0] detections, tile_ids = self._filter_dets(det_out, self.tiles, self.model.TOPK, self.label_mask, self.max_area, @@ -80,12 +113,12 @@ def postprocess(self): return detections def _preprocess(self, frame): - frame = cv2.resize(frame, self.tiling_region_size) + frame = cv2.resize(frame, self.tiling_region_sz) self._normalize(frame, self.tiles, self.inp_handle) def _generate_tiles(self): - tile_size = np.asarray(self.model.INPUT_SHAPE[:0:-1]) - tiling_grid = np.asarray(self.tiling_grid) + tile_size = np.array(self.model.INPUT_SHAPE[:0:-1]) + tiling_grid = np.array(self.tiling_grid) step_size = (1 - self.tile_overlap) * tile_size total_size = (tiling_grid - 1) * step_size + tile_size total_size = np.rint(total_size).astype(int) @@ -94,8 +127,8 @@ def _generate_tiles(self): return tiles, tuple(total_size) def _merge_dets(self, detections, tile_ids): - detections = np.asarray(detections, dtype=DET_DTYPE).view(np.recarray) - tile_ids = np.asarray(tile_ids) + detections = np.fromiter(detections, DET_DTYPE, len(detections)).view(np.recarray) + tile_ids = np.fromiter(tile_ids, int, len(tile_ids)) if len(detections) == 0: return detections detections = self._merge(detections, tile_ids, self.batch_size, self.merge_thresh) @@ -121,7 +154,7 @@ def _filter_dets(det_out, tiles, topk, label_mask, max_area, thresh, scale_facto tile_ids = [] for tile_idx in range(len(tiles)): tile = tiles[tile_idx] - size = get_size(tile) + w, h = get_size(tile) tile_offset = tile_idx * topk for det_idx in range(topk): offset = (tile_offset + det_idx) * 7 @@ -130,9 +163,11 @@ def _filter_dets(det_out, tiles, topk, label_mask, max_area, thresh, scale_facto if conf < thresh: break if label_mask[label]: - tl = (det_out[offset + 3:offset + 5] * size + tile[:2]) * scale_factor - br = (det_out[offset + 5:offset + 
7] * size + tile[:2]) * scale_factor - tlbr = as_rect(np.append(tl, br)) + xmin = (det_out[offset + 3] * w + tile[0]) * scale_factor[0] + ymin = (det_out[offset + 4] * h + tile[1]) * scale_factor[1] + xmax = (det_out[offset + 5] * w + tile[0]) * scale_factor[0] + ymax = (det_out[offset + 6] * h + tile[1]) * scale_factor[1] + tlbr = as_tlbr((xmin, ymin, xmax, ymax)) if 0 < area(tlbr) <= max_area: detections.append((tlbr, label, conf)) tile_ids.append(tile_idx) @@ -168,52 +203,92 @@ def _merge(dets, tile_ids, num_tile, thresh): tile_ids[j] = -1 stack.append(j) for k in candidates: - dets[i].tlbr[:] = union(dets[i].tlbr, dets[k].tlbr) + dets[i].tlbr[:] = enclosing(dets[i].tlbr, dets[k].tlbr) dets[i].conf = max(dets[i].conf, dets[k].conf) keep.discard(k) - keep = np.asarray(list(keep)) + keep = np.array(list(keep)) return dets[keep] class YOLODetector(Detector): - def __init__(self, size, config): + def __init__(self, size, + model='YOLOv4', + class_ids=None, + conf_thresh=0.25, + nms_thresh=0.5, + max_area=800000, + min_aspect_ratio=1.2): + """An object detector for YOLO models. + + Parameters + ---------- + size : tuple + Width and height of each frame. + model : str, optional + YOLO model to use. + Must be the name of a class that inherits `models.YOLO`. + class_ids : tuple, optional + Class IDs to detect. + conf_thresh : float, optional + Detection confidence threshold. + nms_thresh : float, optional + Nonmaximum suppression overlap threshold. + Set higher to detect crowded objects. + max_area : int, optional + Max area of bounding boxes to detect. + min_aspect_ratio : float, optional + Min aspect ratio (height over width) of bounding boxes to detect. + Set to 0.1 for square shaped objects. + """ super().__init__(size) - self.model = getattr(models, config['model']) - self.class_ids = config['class_ids'] - self.conf_thresh = config['conf_thresh'] - self.max_area = config['max_area'] - self.nms_thresh = config['nms_thresh'] + self.model = models.YOLO.get_model(model) + self.class_ids = tuple() if class_ids is None else class_ids + assert 0 <= conf_thresh <= 1 + self.conf_thresh = conf_thresh + assert 0 <= nms_thresh <= 1 + self.nms_thresh = nms_thresh + assert max_area >= 0 + self.max_area = max_area + assert min_aspect_ratio >= 0 + self.min_aspect_ratio = min_aspect_ratio self.backend = TRTInference(self.model, 1) self.inp_handle, self.upscaled_sz, self.bbox_offset = self._create_letterbox() def detect_async(self, frame): + """Detects objects asynchronously.""" self._preprocess(frame) self.backend.infer_async(from_device=True) def postprocess(self): + """Synchronizes, applies postprocessing, and returns a record array + of detections (DET_DTYPE). + This function should be called after `detect_async`. 
+ """ det_out = self.backend.synchronize() det_out = np.concatenate(det_out).reshape(-1, 7) detections = self._filter_dets(det_out, self.upscaled_sz, self.class_ids, self.conf_thresh, - self.nms_thresh, self.max_area, self.bbox_offset) - detections = np.asarray(detections, dtype=DET_DTYPE).view(np.recarray) + self.nms_thresh, self.max_area, self.min_aspect_ratio, + self.bbox_offset) + detections = np.fromiter(detections, DET_DTYPE, len(detections)).view(np.recarray) return detections def _preprocess(self, frame): - frame_dev = cp.asarray(frame) - # resize - zoom = np.roll(self.inp_handle.shape, -1) / frame_dev.shape - small_dev = cupyx.scipy.ndimage.zoom(frame_dev, zoom, order=1, mode='opencv', grid_mode=True) - # BGR to RGB - rgb_dev = small_dev[..., ::-1] - # HWC -> CHW - chw_dev = rgb_dev.transpose(2, 0, 1) - # normalize to [0, 1] interval - cp.multiply(chw_dev, 1 / 255., out=self.inp_handle) + zoom = np.roll(self.inp_handle.shape, -1) / frame.shape + with self.backend.stream: + frame_dev = cp.asarray(frame) + # resize + small_dev = cupyx.scipy.ndimage.zoom(frame_dev, zoom, order=1, mode='opencv', grid_mode=True) + # BGR to RGB + rgb_dev = small_dev[..., ::-1] + # HWC -> CHW + chw_dev = rgb_dev.transpose(2, 0, 1) + # normalize to [0, 1] interval + cp.multiply(chw_dev, 1 / 255., out=self.inp_handle) def _create_letterbox(self): - src_size = np.asarray(self.size) - dst_size = np.asarray(self.model.INPUT_SHAPE[:0:-1]) + src_size = np.array(self.size) + dst_size = np.array(self.model.INPUT_SHAPE[:0:-1]) if self.model.LETTERBOX: scale_factor = min(dst_size / src_size) scaled_size = np.rint(src_size * scale_factor).astype(int) @@ -233,7 +308,7 @@ def _create_letterbox(self): @staticmethod @nb.njit(fastmath=True, cache=True) - def _filter_dets(det_out, size, class_ids, conf_thresh, nms_thresh, max_area, offset): + def _filter_dets(det_out, size, class_ids, conf_thresh, nms_thresh, max_area, min_ar, offset): """ det_out: a list of 3 tensors, where each tensor contains a multiple of 7 float32 numbers in @@ -254,30 +329,46 @@ def _filter_dets(det_out, size, class_ids, conf_thresh, nms_thresh, max_area, of class_dets = det_out[class_idx] class_keep = diou_nms(class_dets[:, :4], class_dets[:, 4], nms_thresh) keep.extend(class_idx[class_keep]) - keep = np.asarray(keep) + keep = np.array(keep) nms_dets = det_out[keep] detections = [] for i in range(len(nms_dets)): tlbr = to_tlbr(nms_dets[i, :4]) - # clip inside frame - tlbr = np.maximum(tlbr, 0) - tlbr = np.minimum(tlbr, np.append(size, size)) label = int(nms_dets[i, 5]) conf = nms_dets[i, 4] * nms_dets[i, 6] - if 0 < area(tlbr) <= max_area: + if 0 < area(tlbr) <= max_area and aspect_ratio(tlbr) >= min_ar: detections.append((tlbr, label, conf)) return detections class PublicDetector(Detector): - def __init__(self, size, frame_skip, config): + def __init__(self, size, frame_skip, sequence_path=None, conf_thresh=0.5, max_area=800000): + """Class to use MOT Challenge's public detections. + + Parameters + ---------- + size : tuple + Width and height of each frame. + frame_skip : int + Detector frame skip. + sequence_path : str, optional + Relative path to MOT Challenge's sequence directory. + conf_thresh : float, optional + Detection confidence threshold. + max_area : int, optional + Max area of bounding boxes to detect. 
+ """ super().__init__(size) self.frame_skip = frame_skip - self.seq_root = Path(__file__).parents[1] / config['sequence'] - self.conf_thresh = config['conf_thresh'] - self.max_area = config['max_area'] - + assert sequence_path is not None + self.seq_root = Path(__file__).parents[1] / sequence_path + assert 0 <= conf_thresh <= 1 + self.conf_thresh = conf_thresh + assert max_area >= 0 + self.max_area = max_area + + assert self.seq_root.exists() seqinfo = configparser.ConfigParser() seqinfo.read(self.seq_root / 'seqinfo.ini') self.seq_size = (int(seqinfo['Sequence']['imWidth']), int(seqinfo['Sequence']['imHeight'])) @@ -286,17 +377,17 @@ def __init__(self, size, frame_skip, config): self.frame_id = 0 det_txt = self.seq_root / 'det' / 'det.txt' - for mot_det in np.loadtxt(det_txt, delimiter=','): - frame_id = int(mot_det[0]) - 1 - tlbr = to_tlbr(mot_det[2:6]) - conf = 1.0 # mot_det[6] - label = 1 # mot_det[7] (person) - # scale and clip inside frame + for mot_challenge_det in np.loadtxt(det_txt, delimiter=','): + frame_id = int(mot_challenge_det[0]) - 1 + tlbr = to_tlbr(mot_challenge_det[2:6]) + # mot_challenge_det[6] + conf = 1.0 + # mot_challenge_det[7] + label = 1 # person + # scale inside frame tlbr[:2] = tlbr[:2] / self.seq_size * self.size tlbr[2:] = tlbr[2:] / self.seq_size * self.size - tlbr = np.maximum(tlbr, 0) - tlbr = np.minimum(tlbr, np.append(self.size, self.size)) - tlbr = as_rect(tlbr) + tlbr = np.rint(tlbr) if conf >= self.conf_thresh and area(tlbr) <= self.max_area: self.detections[frame_id].append((tlbr, label, conf)) @@ -304,6 +395,6 @@ def detect_async(self, frame): pass def postprocess(self): - detections = np.asarray(self.detections[self.frame_id], dtype=DET_DTYPE).view(np.recarray) + detections = np.array(self.detections[self.frame_id], DET_DTYPE).view(np.recarray) self.frame_id += self.frame_skip return detections diff --git a/fastmot/feature_extractor.py b/fastmot/feature_extractor.py index 27296200..a0ece637 100644 --- a/fastmot/feature_extractor.py +++ b/fastmot/feature_extractor.py @@ -9,9 +9,20 @@ class FeatureExtractor: - def __init__(self, config): - self.model = getattr(models, config['model']) - self.batch_size = config['batch_size'] + def __init__(self, model='OSNet025', batch_size=16): + """A feature extractor for ReID embeddings. + + Parameters + ---------- + model : str, optional + ReID model to use. + Must be the name of a class that inherits `models.ReID`. + batch_size : int, optional + Batch size for inference. + """ + self.model = models.ReID.get_model(model) + assert batch_size >= 1 + self.batch_size = batch_size self.feature_dim = self.model.OUTPUT_LAYOUT self.backend = TRTInference(self.model, self.batch_size) @@ -25,19 +36,18 @@ def __del__(self): self.pool.close() self.pool.join() - def __call__(self, frame, detections): - self.extract_async(frame, detections) + def __call__(self, frame, tlbrs): + """Extract feature embeddings from bounding boxes synchronously.""" + self.extract_async(frame, tlbrs) return self.postprocess() @property def metric(self): return self.model.METRIC - def extract_async(self, frame, detections): - """ - Extract feature embeddings from detections asynchronously. 
- """ - imgs = multi_crop(frame, detections.tlbr) + def extract_async(self, frame, tlbrs): + """Extract feature embeddings from bounding boxes asynchronously.""" + imgs = multi_crop(frame, tlbrs) self.embeddings, cur_imgs = [], [] # pipeline inference and preprocessing the next batch in parallel for offset in range(0, len(imgs), self.batch_size): @@ -50,8 +60,7 @@ def extract_async(self, frame, detections): self.last_num_features = len(cur_imgs) def postprocess(self): - """ - Synchronizes, applies postprocessing, and returns a NxM matrix of N + """Synchronizes, applies postprocessing, and returns a NxM matrix of N extracted embeddings with dimension M. This API should be called after `extract_async`. """ @@ -65,8 +74,7 @@ def postprocess(self): return embeddings def null_embeddings(self, detections): - """ - Returns returns a NxM matrix of N identical embeddings with dimension M. + """Returns a NxM matrix of N identical embeddings with dimension M. This API effectively disables feature extraction. """ embeddings = np.ones((len(detections), self.feature_dim)) diff --git a/fastmot/flow.py b/fastmot/flow.py index 699ae313..92f77669 100644 --- a/fastmot/flow.py +++ b/fastmot/flow.py @@ -6,39 +6,91 @@ import cv2 from .utils.rect import to_tlbr, get_size, get_center -from .utils.rect import mask_area, intersection, crop, transform +from .utils.rect import intersection, crop +from .utils.numba import mask_area, transform LOGGER = logging.getLogger(__name__) class Flow: - """ - A KLT tracker based on optical flow feature point matching. - Camera motion is simultaneously estimated by tracking feature points - on the background. - Parameters - ---------- - size : (int, int) - Width and height of each frame. - config : Dict - KLT hyperparameters. - """ - - def __init__(self, size, config): + def __init__(self, size, + bg_feat_scale_factor=(0.1, 0.1), + opt_flow_scale_factor=(0.5, 0.5), + feat_density=0.005, + feat_dist_factor=0.06, + ransac_max_iter=500, + ransac_conf=0.99, + max_error=100, + inlier_thresh=4, + bg_feat_thresh=10, + obj_feat_params=None, + opt_flow_params=None): + """A KLT tracker based on optical flow feature point matching. + Camera motion is simultaneously estimated by tracking feature points + on the background. + + Parameters + ---------- + size : tuple + Width and height of each frame. + bg_feat_scale_factor : tuple, optional + Width and height scale factors to resize frame for background feature detection. + opt_flow_scale_factor : tuple, optional + Width and height scale factors to resize frame for optical flow. + feat_density : float, optional + Min feature point density to keep inside the bounding box. + feat_dist_factor : float, optional + Target size scale factor to estimate min feature point distance. + ransac_max_iter : int, optional + Max RANSAC iterations to filter matched outliers. + ransac_conf : float, optional + RANSAC confidence threshold to filter matched outliers. + max_error : int, optional + Max optical flow error. + inlier_thresh : int, optional + Min number of inliers for valid matching. + bg_feat_thresh : int, optional + FAST threshold for background feature detection. + obj_feat_params : SimpleNamespace, optional + GFTT parameters for object feature detection, see `cv2.goodFeaturesToTrack`. + opt_flow_params : SimpleNamespace, optional + Optical flow parameters, see `cv2.calcOpticalFlowPyrLK`. 
+ """ self.size = size - self.bg_feat_scale_factor = config['bg_feat_scale_factor'] - self.opt_flow_scale_factor = config['opt_flow_scale_factor'] - self.feature_density = config['feature_density'] - self.feat_dist_factor = config['feat_dist_factor'] - self.ransac_max_iter = config['ransac_max_iter'] - self.ransac_conf = config['ransac_conf'] - self.max_error = config['max_error'] - self.inlier_thresh = config['inlier_thresh'] - - self.bg_feat_thresh = config['bg_feat_thresh'] - self.target_feat_params = config['target_feat_params'] - self.opt_flow_params = config['opt_flow_params'] + assert 0 < bg_feat_scale_factor[0] <= 1 and 0 < bg_feat_scale_factor[1] <= 1 + self.bg_feat_scale_factor = bg_feat_scale_factor + assert 0 < opt_flow_scale_factor[0] <= 1 and 0 < opt_flow_scale_factor[1] <= 1 + self.opt_flow_scale_factor = opt_flow_scale_factor + assert 0 <= feat_density <= 1 + self.feat_density = feat_density + assert feat_dist_factor >= 0 + self.feat_dist_factor = feat_dist_factor + assert ransac_max_iter >= 0 + self.ransac_max_iter = ransac_max_iter + assert 0 <= ransac_conf <= 1 + self.ransac_conf = ransac_conf + assert 0 <= max_error <= 255 + self.max_error = max_error + assert inlier_thresh >= 1 + self.inlier_thresh = inlier_thresh + assert bg_feat_thresh >= 0 + self.bg_feat_thresh = bg_feat_thresh + + self.obj_feat_params = { + "maxCorners": 1000, + "qualityLevel": 0.06, + "blockSize": 3 + } + self.opt_flow_params = { + "winSize": (5, 5), + "maxLevel": 5, + "criteria": (3, 10, 0.03) + } + if obj_feat_params is not None: + self.obj_feat_params.update(vars(obj_feat_params)) + if opt_flow_params is not None: + self.opt_flow_params.update(vars(opt_flow_params)) self.bg_feat_detector = cv2.FastFeatureDetector_create(threshold=self.bg_feat_thresh) @@ -67,9 +119,8 @@ def __init__(self, size, config): self.frame_rect = to_tlbr((0, 0, *self.size)) def init(self, frame): - """ - Preprocesses the first frame to prepare for subsequent optical - flow computations. + """Preprocesses the first frame to prepare for subsequent `predict`. + Parameters ---------- frame : ndarray @@ -82,8 +133,8 @@ self.prev_bg_keypoints = np.empty((0, 2), np.float32) def predict(self, frame, tracks): - """ - Predicts tracklet positions in the next frame and estimates camera motion. + """Predicts tracklet positions in the next frame and estimates camera motion. + Parameters ---------- frame : ndarray @@ -91,6 +142,7 @@ tracks : List[Track] List of tracks to predict. Feature points of each track are updated in place. + Returns ------- Dict[int, ndarray], ndarray @@ -113,12 +165,12 @@ target_area = mask_area(target_mask) keypoints = self._rect_filter(track.keypoints, inside_tlbr, self.fg_mask) # only detect new keypoints when too few are propagated - if len(keypoints) < self.feature_density * target_area: + if len(keypoints) < self.feat_density * target_area: img = crop(self.prev_frame_gray, inside_tlbr) feature_dist = self._estimate_feature_dist(target_area, self.feat_dist_factor) keypoints = cv2.goodFeaturesToTrack(img, mask=target_mask, minDistance=feature_dist, - **self.target_feat_params) + **self.obj_feat_params) if keypoints is None: keypoints = np.empty((0, 2), np.float32) else: @@ -223,18 +275,17 @@ _estimate_bbox(tlbr, affine_mat): tl = transform(tlbr[:2], affine_mat).ravel() scale = np.linalg.norm(affine_mat[:2, 0]) scale = 1.
if scale < 0.9 or scale > 1.1 else scale - size = scale * get_size(tlbr) - return to_tlbr(np.append(tl, size)) + w, h = get_size(tlbr) + return to_tlbr((tl[0], tl[1], w * scale, h * scale)) @staticmethod @nb.njit(fastmath=True, cache=True) def _rect_filter(pts, tlbr, fg_mask): if len(pts) == 0: return np.empty((0, 2), np.float32) - tl, br = tlbr[:2], tlbr[2:] pts2i = np.rint(pts).astype(np.int32) # filter out points outside the rectangle - ge_le = (pts2i >= tl) & (pts2i <= br) + ge_le = (pts2i >= tlbr[:2]) & (pts2i <= tlbr[2:]) inside = np.where(ge_le[:, 0] & ge_le[:, 1]) pts, pts2i = pts[inside], pts2i[inside] # keep points inside the foreground area @@ -246,20 +297,20 @@ @nb.njit(fastmath=True, cache=True) def _ellipse_filter(pts, tlbr, offset): offset = np.asarray(offset, np.float32) + center = np.array(get_center(tlbr)) + semi_axes = np.array(get_size(tlbr)) * 0.5 pts = pts.reshape(-1, 2) pts = pts + offset - center = get_center(tlbr) - semi_axes = get_size(tlbr) * 0.5 # filter out points outside the ellipse keep = np.sum(((pts - center) / semi_axes)**2, axis=1) <= 1. return pts[keep] @staticmethod @nb.njit(fastmath=True, cache=True) - def _fg_filter(prev_pts, cur_pts, fg_mask, frame_size): + def _fg_filter(prev_pts, cur_pts, fg_mask, frame_sz): if len(cur_pts) == 0: return prev_pts, cur_pts - size = np.asarray(frame_size) + size = np.array(frame_sz) pts2i = np.rint(cur_pts).astype(np.int32) # filter out points outside the frame ge_lt = (pts2i >= 0) & (pts2i < size) @@ -274,7 +325,7 @@ @staticmethod @nb.njit(fastmath=True, cache=True) def _scale_pts(pts, scale_factor): - scale_factor = np.asarray(scale_factor, np.float32) + scale_factor = np.array(scale_factor, np.float32) pts = pts * scale_factor pts = pts.reshape(-1, 1, 2) return pts @@ -282,13 +333,14 @@ @staticmethod @nb.njit(fastmath=True, cache=True) def _unscale_pts(pts, scale_factor, mask=None): - scale_factor = np.asarray(scale_factor, np.float32) + scale_factor = np.array(scale_factor, np.float32) + unscale_factor = 1 / scale_factor pts = pts.reshape(-1, 2) if mask is None: - pts = pts / scale_factor + pts = pts * unscale_factor else: idx = np.where(mask) - pts[idx] = pts[idx] / scale_factor + pts[idx] = pts[idx] * unscale_factor return pts @staticmethod
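A note on the `Flow` constructor above: the GFTT and LK parameters are now built-in defaults that config overrides merge into, and both guards must be `is not None` (the second one is fixed above — `vars(None)` raises a `TypeError`). A minimal sketch of the merge pattern, with a hypothetical override value:

```python
from types import SimpleNamespace

defaults = {"maxCorners": 1000, "qualityLevel": 0.06, "blockSize": 3}
overrides = SimpleNamespace(qualityLevel=0.1)  # e.g. obj_feat_params parsed from mot.json

params = dict(defaults)
if overrides is not None:
    params.update(vars(overrides))  # SimpleNamespace -> dict
assert params == {"maxCorners": 1000, "qualityLevel": 0.1, "blockSize": 3}
# the merged dict is then splatted into cv2.goodFeaturesToTrack(..., **params)
```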
diff --git a/fastmot/kalman_filter.py b/fastmot/kalman_filter.py index 57ea1986..8bbd2459 100644 --- a/fastmot/kalman_filter.py +++ b/fastmot/kalman_filter.py @@ -11,39 +11,81 @@ class MeasType(Enum): class KalmanFilter: - """ - A simple Kalman filter for tracking bounding boxes in image space. - The 8-dimensional state space - x1, y1, x2, y2, v_x1, v_y1, v_x2, v_y2 - contains the bounding box top left corner, bottom right corner, - and their respective velocities. - Object motion follows a modified constant velocity model. - Velocity will decay over time without measurement and bounding box - corners are coupled together to minimize drifting. - Parameters - ---------- - config : Dict - Kalman Filter parameters. - """ - - def __init__(self, config): - self.std_factor_acc = config['std_factor_acc'] - self.std_offset_acc = config['std_offset_acc'] - self.std_factor_det = config['std_factor_det'] - self.std_factor_flow = config['std_factor_flow'] - self.min_std_det = config['min_std_det'] - self.min_std_flow = config['min_std_flow'] - self.init_pos_weight = config['init_pos_weight'] - self.init_vel_weight = config['init_vel_weight'] - self.vel_coupling = config['vel_coupling'] - self.vel_half_life = config['vel_half_life'] + def __init__(self, + std_factor_acc=2.25, + std_offset_acc=78.5, + std_factor_det=(0.08, 0.08), + std_factor_klt=(0.14, 0.14), + min_std_det=(4.0, 4.0), + min_std_klt=(5.0, 5.0), + init_pos_weight=5, + init_vel_weight=12, + vel_coupling=0.6, + vel_half_life=2): + """A simple Kalman filter for tracking bounding boxes in image space. + The 8-dimensional state space + x1, y1, x2, y2, v_x1, v_y1, v_x2, v_y2 + contains the bounding box top left corner, bottom right corner, + and their respective velocities. + Object motion follows a modified constant velocity model. + Velocity will decay over time without measurement and bounding box + corners are coupled together to minimize drifting. + + Parameters + ---------- + std_factor_acc : float, optional + Object size scale factor to calculate acceleration standard deviation + for process noise. + std_offset_acc : float, optional + Object size offset to calculate acceleration standard deviation + for process noise. Set larger for fast moving objects. + std_factor_det : tuple, optional + Object width and height scale factors to calculate detector measurement + noise standard deviation. + std_factor_klt : tuple, optional + Object width and height scale factors to calculate KLT measurement + noise standard deviation. + min_std_det : tuple, optional + Min detector measurement noise standard deviations. + min_std_klt : tuple, optional + Min KLT measurement noise standard deviations. + init_pos_weight : int, optional + Scale factor to initialize position state standard deviation. + init_vel_weight : int, optional + Scale factor to initialize velocity state standard deviation. + Set larger for fast moving objects. + vel_coupling : float, optional + Factor to couple bounding box corners. + Set 0.5 for max coupling and 1.0 to disable coupling. + vel_half_life : int, optional + Half life in seconds to decay velocity state. + """ + assert std_factor_acc >= 0 + self.std_factor_acc = std_factor_acc + self.std_offset_acc = std_offset_acc + assert std_factor_det[0] >= 0 and std_factor_det[1] >= 0 + self.std_factor_det = std_factor_det + assert std_factor_klt[0] >= 0 and std_factor_klt[1] >= 0 + self.std_factor_klt = std_factor_klt + assert min_std_det[0] >= 0 and min_std_det[1] >= 0 + self.min_std_det = min_std_det + assert min_std_klt[0] >= 0 and min_std_klt[1] >= 0 + self.min_std_klt = min_std_klt + assert init_pos_weight >= 0 + self.init_pos_weight = init_pos_weight + assert init_vel_weight >= 0 + self.init_vel_weight = init_vel_weight + assert 0 <= vel_coupling <= 1 + self.vel_coupling = vel_coupling + assert vel_half_life > 0 + self.vel_half_life = vel_half_life dt = 1 / 30. self.acc_cov, self.meas_mat, self.trans_mat = self._init_mat(dt) def reset_dt(self, dt): - """ - Resets process noise, measurement and transition matrices from dt. + """Resets process noise, measurement and transition matrices from dt.
+ Parameters ---------- dt : float @@ -52,15 +94,16 @@ self.acc_cov, self.meas_mat, self.trans_mat = self._init_mat(dt) def create(self, det_meas): - """ - Creates Kalman filter state from unassociated measurement. + """Creates Kalman filter state from unassociated measurement. + Parameters ---------- det_meas : ndarray Detected bounding box of [x1, x2, y1, y2]. + Returns ------- - (ndarray, ndarray) + ndarray, ndarray Returns the mean vector (8 dimensional) and covariance matrix (8x8 dimensional) of the new track. """ @@ -83,8 +126,8 @@ return mean, covariance def predict(self, mean, covariance): - """ - Runs Kalman filter prediction step. + """Runs Kalman filter prediction step. + Parameters ---------- mean : ndarray The 8 dimensional mean vector of the object state at the previous time step. covariance : ndarray The 8x8 dimensional covariance matrix of the object state at the previous time step. + Returns ------- - (ndarray, ndarray) + ndarray, ndarray Returns the mean vector and covariance matrix of the predicted state. """ @@ -103,8 +147,8 @@ self.std_factor_acc, self.std_offset_acc) def project(self, mean, covariance, meas_type, multiplier=1.): - """ - Projects state distribution to measurement space. + """Projects state distribution to measurement space. + Parameters ---------- mean : ndarray The state's mean vector (8 dimensional array). covariance : ndarray The state's covariance matrix (8x8 dimensional). meas_type : MeasType Measurement type indicating where the measurement comes from. multiplier : float Multiplier used to adjust the measurement std. + Returns ------- - (ndarray, ndarray) + ndarray, ndarray Returns the projected mean and covariance matrix of the given state estimate. """ if meas_type == MeasType.FLOW: - std_factor = self.std_factor_flow - min_std = self.min_std_flow + std_factor = self.std_factor_klt + min_std = self.min_std_klt elif meas_type == MeasType.DETECTOR: std_factor = self.std_factor_det min_std = self.min_std_det @@ -133,8 +178,8 @@ return self._project(mean, covariance, self.meas_mat, std_factor, min_std, multiplier) def update(self, mean, covariance, measurement, meas_type, multiplier=1.): - """ - Runs Kalman filter correction step. + """Runs Kalman filter correction step. + Parameters ---------- mean : ndarray @@ -147,9 +192,10 @@ Measurement type indicating where the measurement comes from. multiplier : float Multiplier used to adjust the measurement std. + Returns ------- - (ndarray, ndarray) + ndarray, ndarray Returns the measurement-corrected state distribution. """ projected_mean, projected_cov = self.project(mean, covariance, meas_type, multiplier) @@ -158,8 +204,8 @@ projected_cov, measurement, self.meas_mat) def motion_distance(self, mean, covariance, measurements): - """ - Computes mahalanobis distance between `measurements` and state distribution. + """Computes mahalanobis distance between `measurements` and state distribution. + Parameters ---------- mean : ndarray The state's mean vector (8 dimensional). covariance : ndarray The state's covariance matrix (8x8 dimensional). measurements : array_like An Nx4 matrix of N samples of [x1, x2, y1, y2]. + Returns ------- ndarray @@ -180,9 +227,10 @@ @staticmethod @nb.njit(fastmath=True, cache=True) def warp(mean, covariance, H): - """ - Warps kalman filter state using a homography transformation. + """Warps kalman filter state using a homography transformation. https://scholarsarchive.byu.edu/cgi/viewcontent.cgi?article=1301&context=studentpub + + Parameters ---------- mean : ndarray The predicted state's mean vector (8 dimensional). covariance : ndarray The state's covariance matrix (8x8 dimensional). H : ndarray A 3x3 homography matrix. + Returns ------- - (ndarray, ndarray) + ndarray, ndarray Returns the mean vector and covariance matrix of the transformed state. """
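For context on the modified constant velocity model documented above: each corner moves mostly with its own velocity, partly with its opposite corner's (`vel_coupling`), and velocities decay with half life `vel_half_life`. An illustrative reconstruction of the transition matrix those parameters imply (a sketch derived from the docstring, not necessarily the repo's exact `_init_mat`):

```python
import numpy as np

def transition_mat(dt, vel_coupling=0.6, vel_half_life=2):
    # state: [x1, y1, x2, y2, vx1, vy1, vx2, vy2]
    mat = np.eye(8)
    for i in range(4):
        mat[i, i + 4] = vel_coupling * dt                   # own velocity
        mat[i, (i + 2) % 4 + 4] = (1. - vel_coupling) * dt  # opposite corner's velocity
        mat[i + 4, i + 4] = 0.5**(dt / vel_half_life)       # exponential velocity decay
    return mat
```

With `vel_coupling=1.0` the corners move independently; `0.5` ties each corner equally to both velocities, which is what keeps the box from drifting apart.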
diff --git a/fastmot/models/__init__.py b/fastmot/models/__init__.py index e444b68e..98a6161f 100644 --- a/fastmot/models/__init__.py +++ b/fastmot/models/__init__.py @@ -1,4 +1,4 @@ -from .ssd import * -from .yolo import * -from .reid import * -from .label import * \ No newline at end of file +from .ssd import SSD +from .yolo import YOLO +from .reid import ReID +from .label import LABEL_MAP \ No newline at end of file diff --git a/fastmot/models/reid.py b/fastmot/models/reid.py index 5bf25d3e..342cce61 100644 --- a/fastmot/models/reid.py +++ b/fastmot/models/reid.py @@ -8,10 +8,20 @@ class ReID: + __registry = {} + PLUGIN_PATH = None ENGINE_PATH = None MODEL_PATH = None - INPUT_SHAPE = () + INPUT_SHAPE = None + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + cls.__registry[cls.__name__] = cls + + @classmethod + def get_model(cls, name): + return cls.__registry[name] @classmethod def build_engine(cls, trt_logger, batch_size): @@ -53,3 +63,12 @@ class OSNet025(ReID): INPUT_SHAPE = (3, 256, 128) OUTPUT_LAYOUT = 512 METRIC = 'euclidean' + + +class OSNet10(ReID): + """Multi-source model trained on MSMT17, DukeMTMC, and CUHK03, not provided.""" + ENGINE_PATH = Path(__file__).parent / 'osnet_x1_0_msdc.trt' + MODEL_PATH = Path(__file__).parent / 'osnet_x1_0_msdc.onnx' + INPUT_SHAPE = (3, 256, 128) + OUTPUT_LAYOUT = 512 + METRIC = 'cosine' diff --git a/fastmot/models/ssd.py b/fastmot/models/ssd.py index ca09188a..a53454bd 100644 --- a/fastmot/models/ssd.py +++ b/fastmot/models/ssd.py @@ -7,12 +7,22 @@ class SSD: + __registry = {} + PLUGIN_PATH = None ENGINE_PATH = None MODEL_PATH = None - INPUT_SHAPE = () + INPUT_SHAPE = None OUTPUT_NAME = None + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + cls.__registry[cls.__name__] = cls + + @classmethod + def get_model(cls, name): + return cls.__registry[name] + @classmethod def add_plugin(cls, graph): raise NotImplementedError
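The `__init_subclass__` registry above (the same change is applied to `YOLO` below) replaces the old `getattr(models, config['model'])` lookup, so a model name from the config resolves through an explicit table instead of arbitrary module attributes. The idiom in isolation:

```python
class ReIDBase:
    _registry = {}

    def __init_subclass__(cls, **kwargs):
        # runs once per subclass definition, at import time
        super().__init_subclass__(**kwargs)
        ReIDBase._registry[cls.__name__] = cls

    @classmethod
    def get_model(cls, name):
        return cls._registry[name]


class OSNet025(ReIDBase):
    pass


assert ReIDBase.get_model('OSNet025') is OSNet025
```

A typo in the config now fails with a `KeyError` naming the bad model string rather than an `AttributeError` on the module.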
diff --git a/fastmot/models/yolo.py b/fastmot/models/yolo.py index 14f589d9..a820badb 100644 --- a/fastmot/models/yolo.py +++ b/fastmot/models/yolo.py @@ -9,16 +9,26 @@ class YOLO: + __registry = {} + PLUGIN_PATH = Path(__file__).parents[1] / 'plugins' / 'libyolo_layer.so' ENGINE_PATH = None MODEL_PATH = None NUM_CLASSES = None LETTERBOX = False NEW_COORDS = False - INPUT_SHAPE = () - LAYER_FACTORS = [] - SCALES = [] - ANCHORS = [] + INPUT_SHAPE = None + LAYER_FACTORS = None + SCALES = None + ANCHORS = None + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + cls.__registry[cls.__name__] = cls + + @classmethod + def get_model(cls, name): + return cls.__registry[name] @classmethod def add_plugin(cls, network): @@ -29,6 +39,11 @@ def get_plugin_creator(plugin_name): return plugin_creator return None + assert len(cls.LAYER_FACTORS) == network.num_outputs + assert len(cls.SCALES) == network.num_outputs + assert len(cls.ANCHORS) == network.num_outputs + assert all(s >= 1.0 for s in cls.SCALES) + plugin_creator = get_plugin_creator('YoloLayer_TRT') if not plugin_creator: raise RuntimeError('Failed to get YoloLayer_TRT plugin creator') diff --git a/fastmot/mot.py b/fastmot/mot.py index 9e31eb3e..c8cfb2e4 100644 --- a/fastmot/mot.py +++ b/fastmot/mot.py @@ -1,3 +1,4 @@ +from types import SimpleNamespace from enum import Enum import logging import cv2 @@ -6,8 +7,7 @@ from .feature_extractor import FeatureExtractor from .tracker import MultiTracker from .utils import Profiler -from .utils.visualization import draw_tracks, draw_detections -from .utils.visualization import draw_flow_bboxes, draw_background_flow +from .utils.visualization import Visualizer LOGGER = logging.getLogger(__name__) @@ -20,64 +20,102 @@ class DetectorType(Enum): class MOT: - """ - This is the top level module that integrates detection, feature extraction, - and tracking together. - Parameters - ---------- - size : (int, int) - Width and height of each frame. - cap_dt : float - Time interval in seconds between each captured frame. - config : Dict - Tracker configuration. - draw : bool - Flag to toggle visualization drawing. - verbose : bool - Flag to toggle output verbosity. - """ - - def __init__(self, size, config, draw=False, verbose=False): + def __init__(self, size, + detector_type='YOLO', + detector_frame_skip=5, + ssd_detector_cfg=None, + yolo_detector_cfg=None, + public_detector_cfg=None, + feature_extractor_cfg=None, + tracker_cfg=None, + visualizer_cfg=None, + draw=False): + """Top level module that integrates detection, feature extraction, + and tracking together. + + Parameters + ---------- + size : tuple + Width and height of each frame. + detector_type : {'SSD', 'YOLO', 'public'}, optional + Type of detector to use. + detector_frame_skip : int, optional + Number of frames to skip for the detector. + ssd_detector_cfg : SimpleNamespace, optional + SSD detector configuration. + yolo_detector_cfg : SimpleNamespace, optional + YOLO detector configuration. + public_detector_cfg : SimpleNamespace, optional + Public detector configuration. + feature_extractor_cfg : SimpleNamespace, optional + Feature extractor configuration. + tracker_cfg : SimpleNamespace, optional + Tracker configuration. + visualizer_cfg : SimpleNamespace, optional + Visualization configuration. + draw : bool, optional + Enable visualization.
+ """ self.size = size + self.detector_type = DetectorType[detector_type.upper()] + assert detector_frame_skip >= 1 + self.detector_frame_skip = detector_frame_skip self.draw = draw - self.verbose = verbose - self.detector_type = DetectorType[config['detector_type']] - self.detector_frame_skip = config['detector_frame_skip'] + + if ssd_detector_cfg is None: + ssd_detector_cfg = SimpleNamespace() + if yolo_detector_cfg is None: + yolo_detector_cfg = SimpleNamespace() + if public_detector_cfg is None: + public_detector_cfg = SimpleNamespace() + if feature_extractor_cfg is None: + feature_extractor_cfg = SimpleNamespace() + if tracker_cfg is None: + tracker_cfg = SimpleNamespace() + if visualizer_cfg is None: + visualizer_cfg = SimpleNamespace() LOGGER.info('Loading detector model...') if self.detector_type == DetectorType.SSD: - self.detector = SSDDetector(self.size, config['ssd_detector']) + self.detector = SSDDetector(self.size, **vars(ssd_detector_cfg)) elif self.detector_type == DetectorType.YOLO: - self.detector = YOLODetector(self.size, config['yolo_detector']) + self.detector = YOLODetector(self.size, **vars(yolo_detector_cfg)) elif self.detector_type == DetectorType.PUBLIC: self.detector = PublicDetector(self.size, self.detector_frame_skip, - config['public_detector']) + **vars(public_detector_cfg)) LOGGER.info('Loading feature extractor model...') - self.extractor = FeatureExtractor(config['feature_extractor']) - self.tracker = MultiTracker(self.size, self.extractor.metric, config['multi_tracker']) + self.extractor = FeatureExtractor(**vars(feature_extractor_cfg)) + self.tracker = MultiTracker(self.size, self.extractor.metric, **vars(tracker_cfg)) + + self.visualizer = Visualizer(**vars(visualizer_cfg)) self.frame_count = 0 - @property def visible_tracks(self): - # retrieve confirmed and active tracks from the tracker - return [track for track in self.tracker.tracks.values() - if track.confirmed and track.active] + """Retrieve visible tracks from the tracker - def reset(self, cap_dt): + Returns + ------- + Iterator[Track] + Confirmed and active tracks from the tracker """ - Resets multiple object tracker. Must be called before `step`. + return (track for track in self.tracker.tracks.values() + if track.confirmed and track.active) + + def reset(self, cap_dt): + """Resets multiple object tracker. Must be called before `step`. + Parameters ---------- cap_dt : float Time interval in seconds between each frame. """ self.frame_count = 0 - self.tracker.reset_dt(cap_dt) + self.tracker.reset(cap_dt) def step(self, frame): - """ - Runs multiple object tracker on the next frame. + """Runs multiple object tracker on the next frame. 
+ Parameters ---------- frame : ndarray @@ -98,7 +136,7 @@ detections = self.detector.postprocess() with Profiler('extract'): - self.extractor.extract_async(frame, detections) + self.extractor.extract_async(frame, detections.tlbr) with Profiler('track', aggregate=True): self.tracker.apply_kalman() embeddings = self.extractor.postprocess() @@ -124,10 +162,8 @@ def print_timing_info(): LOGGER.debug(f"{'association time:':<37}{Profiler.get_avg_millis('assoc'):>6.3f} ms") def _draw(self, frame, detections): - draw_tracks(frame, self.visible_tracks, show_flow=self.verbose) - if self.verbose: - draw_detections(frame, detections) - draw_flow_bboxes(frame, self.tracker) - draw_background_flow(frame, self.tracker) - cv2.putText(frame, f'visible: {len(self.visible_tracks)}', (30, 30), + visible_tracks = list(self.visible_tracks()) + self.visualizer.render(frame, visible_tracks, detections, self.tracker.klt_bboxes.values(), + self.tracker.flow.prev_bg_keypoints, self.tracker.flow.bg_keypoints) + cv2.putText(frame, f'visible: {len(visible_tracks)}', (30, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, 0, 2, cv2.LINE_AA) diff --git a/fastmot/plugins/yolo_layer.cu b/fastmot/plugins/yolo_layer.cu index 8590ffa0..9f56f256 100644 --- a/fastmot/plugins/yolo_layer.cu +++ b/fastmot/plugins/yolo_layer.cu @@ -60,25 +60,19 @@ namespace nvinfer1 assert(d == a + length); } - void YoloLayerPlugin::serialize(void* buffer) const + IPluginV2IOExt* YoloLayerPlugin::clone() const NOEXCEPT { - char* d = static_cast<char*>(buffer), *a = d; - write(d, mThreadCount); - write(d, mYoloWidth); - write(d, mYoloHeight); - write(d, mNumAnchors); - memcpy(d, mAnchorsHost, MAX_ANCHORS * 2 * sizeof(float)); - d += MAX_ANCHORS * 2 * sizeof(float); - write(d, mNumClasses); - write(d, mInputWidth); - write(d, mInputHeight); - write(d, mScaleXY); - write(d, mNewCoords); + YoloLayerPlugin *p = new YoloLayerPlugin(mYoloWidth, mYoloHeight, mNumAnchors, (float*) mAnchorsHost, mNumClasses, mInputWidth, mInputHeight, mScaleXY, mNewCoords); + p->setPluginNamespace(mPluginNamespace); + return p; + } - assert(d == a + getSerializationSize()); + void YoloLayerPlugin::terminate() NOEXCEPT + { + CHECK(cudaFree(mAnchors)); } - size_t YoloLayerPlugin::getSerializationSize() const + size_t YoloLayerPlugin::getSerializationSize() const NOEXCEPT { return sizeof(mThreadCount) + \ sizeof(mYoloWidth) + sizeof(mYoloHeight) + \ @@ -88,17 +82,25 @@ namespace nvinfer1 sizeof(mScaleXY) + sizeof(mNewCoords); } - int YoloLayerPlugin::initialize() + void YoloLayerPlugin::serialize(void* buffer) const NOEXCEPT { - return 0; - } + char* d = static_cast<char*>(buffer), *a = d; + write(d, mThreadCount); + write(d, mYoloWidth); + write(d, mYoloHeight); + write(d, mNumAnchors); + memcpy(d, mAnchorsHost, MAX_ANCHORS * 2 * sizeof(float)); + d += MAX_ANCHORS * 2 * sizeof(float); + write(d, mNumClasses); + write(d, mInputWidth); + write(d, mInputHeight); + write(d, mScaleXY); + write(d, mNewCoords); - void YoloLayerPlugin::terminate() - { - CHECK(cudaFree(mAnchors)); + assert(d == a + getSerializationSize()); } - Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) + Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) NOEXCEPT { assert(index == 0); assert(nbInputDims == 1); @@ -110,71 +112,6 @@ return Dims3(totalsize, 1, 1); } - void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) - { - mPluginNamespace = pluginNamespace; - } - - const char*
YoloLayerPlugin::getPluginNamespace() const - { - return mPluginNamespace; - } - - // Return the DataType of the plugin output at the requested index - DataType YoloLayerPlugin::getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const - { - return DataType::kFLOAT; - } - - // Return true if output tensor is broadcast across a batch. - bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const - { - return false; - } - - // Return true if plugin can use input that is broadcast across batch without replication. - bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const - { - return false; - } - - void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) - { - } - - // Attach the plugin object to an execution context and grant the plugin the access to some context resource. - void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) - { - } - - // Detach the plugin object from its execution context. - void YoloLayerPlugin::detachFromContext() - { - } - - const char* YoloLayerPlugin::getPluginType() const - { - return "YoloLayer_TRT"; - } - - const char* YoloLayerPlugin::getPluginVersion() const - { - return "1"; - } - - void YoloLayerPlugin::destroy() - { - delete this; - } - - // Clone the plugin - IPluginV2IOExt* YoloLayerPlugin::clone() const - { - YoloLayerPlugin *p = new YoloLayerPlugin(mYoloWidth, mYoloHeight, mNumAnchors, (float*) mAnchorsHost, mNumClasses, mInputWidth, mInputHeight, mScaleXY, mNewCoords); - p->setPluginNamespace(mPluginNamespace); - return p; - } - inline __device__ float sigmoidGPU(float x) { return 1.0f / (1.0f + __expf(-x)); } inline __device__ float scale_sigmoidGPU(float x, float s) @@ -307,7 +244,11 @@ } } - int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) +#if NV_TENSORRT_MAJOR >= 8 + int32_t YoloLayerPlugin::enqueue(int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) NOEXCEPT +#else + int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) +#endif { forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, batchSize); return 0; } @@ -321,22 +262,22 @@ mFC.fields = mPluginAttributes.data(); } - const char* YoloPluginCreator::getPluginName() const + const char* YoloPluginCreator::getPluginName() const NOEXCEPT { return "YoloLayer_TRT"; } - const char* YoloPluginCreator::getPluginVersion() const + const char* YoloPluginCreator::getPluginVersion() const NOEXCEPT { return "1"; } - const PluginFieldCollection* YoloPluginCreator::getFieldNames() + const PluginFieldCollection* YoloPluginCreator::getFieldNames() NOEXCEPT { return &mFC; } - IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) + IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) NOEXCEPT { assert(!strcmp(name, getPluginName())); const PluginField* fields = fc->fields; @@ -397,7 +338,9 @@ assert(yolo_width > 0 && yolo_height > 0); assert(anchors[0] > 0.0f && anchors[1] > 0.0f); assert(num_classes > 0); - assert(input_multiplier == 8 || input_multiplier == 16 || input_multiplier == 32 || input_multiplier == 64 || input_multiplier
== 128); + assert(input_multiplier == 128 || input_multiplier == 64 || + input_multiplier == 32 || input_multiplier == 16 || + input_multiplier == 8); assert(scale_x_y >= 1.0); YoloLayerPlugin* obj = new YoloLayerPlugin(yolo_width, yolo_height, num_anchors, anchors, num_classes, yolo_width * input_multiplier, yolo_height * input_multiplier, scale_x_y, new_coords); @@ -405,7 +348,7 @@ return obj; } - IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) + IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) NOEXCEPT { YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); @@ -414,5 +357,4 @@ PluginFieldCollection YoloPluginCreator::mFC{}; std::vector<PluginField> YoloPluginCreator::mPluginAttributes; - REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); } // namespace nvinfer1 diff --git a/fastmot/plugins/yolo_layer.h b/fastmot/plugins/yolo_layer.h index 127bef43..4264cb28 100644 --- a/fastmot/plugins/yolo_layer.h +++ b/fastmot/plugins/yolo_layer.h @@ -10,6 +10,12 @@ #define MAX_ANCHORS 6 +#if NV_TENSORRT_MAJOR >= 8 +#define NOEXCEPT noexcept +#else +#define NOEXCEPT +#endif + #define CHECK(status) \ do { \ auto ret = status; \ @@ -43,52 +49,52 @@ namespace nvinfer1 ~YoloLayerPlugin() override = default; - int getNbOutputs() const override - { - return 1; - } + IPluginV2IOExt* clone() const NOEXCEPT override; - Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; + int initialize() NOEXCEPT override { return 0; } - int initialize() override; + void terminate() NOEXCEPT override; - void terminate() override; + void destroy() NOEXCEPT override { delete this; } - virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} + size_t getSerializationSize() const NOEXCEPT override; - virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; + void serialize(void* buffer) const NOEXCEPT override; - virtual size_t getSerializationSize() const override; + int getNbOutputs() const NOEXCEPT override { return 1; } - virtual void serialize(void* buffer) const override; + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) NOEXCEPT override; - bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { - return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; - } + size_t getWorkspaceSize(int maxBatchSize) const NOEXCEPT override { return 0; } - const char* getPluginType() const override; + bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } - const char* getPluginVersion() const override; + const char* getPluginType() const NOEXCEPT override { return "YoloLayer_TRT"; } - void destroy() override; + const char* getPluginVersion() const NOEXCEPT override { return "1"; } - IPluginV2IOExt* clone() const override; + void setPluginNamespace(const char* pluginNamespace) NOEXCEPT override { mPluginNamespace = pluginNamespace; } - void setPluginNamespace(const char* pluginNamespace) override; + const char* getPluginNamespace() const NOEXCEPT override { return mPluginNamespace; } - const char* getPluginNamespace() const override;
+ DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const NOEXCEPT override { return DataType::kFLOAT; } - DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const override; + bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const NOEXCEPT override { return false; } - bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; + bool canBroadcastInputAcrossBatch(int inputIndex) const NOEXCEPT override { return false; } - bool canBroadcastInputAcrossBatch(int inputIndex) const override; + void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) NOEXCEPT override {} - void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; + //using IPluginV2IOExt::configurePlugin; + void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) NOEXCEPT override {} - void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override TRTNOEXCEPT; + void detachFromContext() NOEXCEPT override {} - void detachFromContext() override; +#if NV_TENSORRT_MAJOR >= 8 + int32_t enqueue(int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) NOEXCEPT override; +#else + int enqueue(int batchSize, const void* const * inputs, void** outputs, void* workspace, cudaStream_t stream) NOEXCEPT override; +#endif private: void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int batchSize = 1); @@ -103,9 +109,6 @@ int mNewCoords = 0; const char* mPluginNamespace; - - protected: - using IPluginV2IOExt::configurePlugin; }; class YoloPluginCreator : public IPluginCreator { @@ -115,31 +118,33 @@ ~YoloPluginCreator() override = default; - const char* getPluginName() const override; + const char* getPluginName() const NOEXCEPT override; - const char* getPluginVersion() const override; + const char* getPluginVersion() const NOEXCEPT override; - const PluginFieldCollection* getFieldNames() override; + const PluginFieldCollection* getFieldNames() NOEXCEPT override; - IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; + IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) NOEXCEPT override; - IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; + IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) NOEXCEPT override; - void setPluginNamespace(const char* libNamespace) override + void setPluginNamespace(const char* libNamespace) NOEXCEPT override { mNamespace = libNamespace; } - const char* getPluginNamespace() const override + const char* getPluginNamespace() const NOEXCEPT override { return mNamespace.c_str(); } private: static PluginFieldCollection mFC; - static std::vector<PluginField> mPluginAttributes; + static std::vector<PluginField> mPluginAttributes; std::string mNamespace; }; + + REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); }; #endif
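With `REGISTER_TENSORRT_PLUGIN` moved into the header, the plugin registers itself the moment the shared library is loaded, so the Python side only needs to load `PLUGIN_PATH` before deserializing the engine. A sketch of the usual loading pattern (paths and logger setup here are illustrative, not the repo's exact inference code):

```python
import ctypes
import tensorrt as trt

# loading the library triggers REGISTER_TENSORRT_PLUGIN, making
# YoloLayer_TRT available to the runtime below
ctypes.cdll.LoadLibrary('fastmot/plugins/libyolo_layer.so')

trt_logger = trt.Logger(trt.Logger.INFO)
with open('yolov4.trt', 'rb') as f, trt.Runtime(trt_logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
```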
diff --git a/fastmot/track.py b/fastmot/track.py
index 9d518e64..25a11e63 100644
--- a/fastmot/track.py
+++ b/fastmot/track.py
@@ -1,68 +1,225 @@
+from collections import deque
 import numpy as np
+import numba as nb

 from .models import LABEL_MAP
+from .utils.distance import cdist, cosine
+from .utils.numba import apply_along_axis, normalize_vec
 from .utils.rect import get_center


+class ClusterFeature:
+    def __init__(self, num_clusters, metric):
+        self.num_clusters = num_clusters
+        self.metric = metric
+        self.clusters = None
+        self.cluster_sizes = None
+        self._next_idx = 0
+
+    def __len__(self):
+        return self._next_idx
+
+    def __call__(self):
+        return self.clusters[:self._next_idx]
+
+    def update(self, embedding):
+        if self._next_idx < self.num_clusters:
+            if self.clusters is None:
+                self.clusters = np.empty((self.num_clusters, len(embedding)), embedding.dtype)
+                self.cluster_sizes = np.zeros(self.num_clusters, int)
+            self.clusters[self._next_idx] = embedding
+            self.cluster_sizes[self._next_idx] += 1
+            self._next_idx += 1
+        else:
+            nearest_idx = self._get_nearest_cluster(self.clusters, embedding)
+            self.cluster_sizes[nearest_idx] += 1
+            self._seq_kmeans(self.clusters, self.cluster_sizes, embedding, nearest_idx)
+
+    def distance(self, embeddings):
+        if self.clusters is None:
+            return np.ones(len(embeddings))
+        clusters = normalize_vec(self.clusters[:self._next_idx])
+        return apply_along_axis(np.min, cdist(clusters, embeddings, self.metric), axis=0)
+
+    def merge(self, features, other, other_features):
+        if len(features) > len(other_features):
+            for feature in other_features:
+                if feature is not None:
+                    self.update(feature)
+        else:
+            for feature in features:
+                if feature is not None:
+                    other.update(feature)
+            self.clusters = other.clusters.copy()
+            self.cluster_sizes = other.cluster_sizes.copy()
+            self._next_idx = other._next_idx
+
+    @staticmethod
+    @nb.njit(fastmath=True, cache=True)
+    def _get_nearest_cluster(clusters, embedding):
+        return np.argmin(cosine(np.atleast_2d(embedding), clusters))
+
+    @staticmethod
+    @nb.njit(fastmath=True, cache=True)
+    def _seq_kmeans(clusters, cluster_sizes, embedding, idx):
+        div_size = 1. / cluster_sizes[idx]
+        clusters[idx] += (embedding - clusters[idx]) * div_size
+
+
+class SmoothFeature:
+    def __init__(self, learning_rate):
+        self.lr = learning_rate
+        self.smooth = None
+
+    def __call__(self):
+        return self.smooth
+
+    def update(self, embedding):
+        if self.smooth is None:
+            self.smooth = embedding.copy()
+        else:
+            self._rolling(self.smooth, embedding, self.lr)
+
+    @staticmethod
+    @nb.njit(fastmath=True, cache=True)
+    def _rolling(smooth, embedding, lr):
+        smooth[:] = (1. - lr) * smooth + lr * embedding
+        norm_factor = 1. / np.linalg.norm(smooth)
+        smooth *= norm_factor
+
+
+class AverageFeature:
+    def __init__(self):
+        self.sum = None
+        self.avg = None
+        self.count = 0
+
+    def __call__(self):
+        return self.avg
+
+    def is_valid(self):
+        return self.count > 0
+
+    def update(self, embedding):
+        self.count += 1
+        if self.sum is None:
+            self.sum = embedding.copy()
+            self.avg = embedding.copy()
+        else:
+            self._average(self.sum, self.avg, embedding, self.count)
+
+    def merge(self, other):
+        self.count += other.count
+        if self.sum is None:
+            self.sum = other.sum
+            self.avg = other.avg
+        elif other.sum is not None:
+            self._average(self.sum, self.avg, other.sum, self.count)
+
+    @staticmethod
+    @nb.njit(fastmath=True, cache=True)
+    def _average(sum, avg, vec, count):
+        sum += vec
+        div_cnt = 1. / count
+        avg[:] = sum * div_cnt
+        norm_factor = 1. / np.linalg.norm(avg)
+        avg *= norm_factor
+
+
 class Track:
-    def __init__(self, frame_id, trk_id, tlbr, state, label):
+    _count = 0
+
+    def __init__(self, frame_id, tlbr, state, label, confirm_hits=1, buffer_size=30):
+        self.trk_id = self.next_id()
         self.start_frame = frame_id
-        self.trk_id = trk_id
-        self.tlbr = tlbr
+        self.frame_ids = deque([frame_id], maxlen=buffer_size)
+        self.bboxes = deque([tlbr], maxlen=buffer_size)
+        self.confirm_hits = confirm_hits
         self.state = state
         self.label = label

         self.age = 0
         self.hits = 0
-        self.alpha = 0.9
-        self.smooth_feature = None
+        self.avg_feat = AverageFeature()
+        self.last_feat = None

         self.inlier_ratio = 1.
         self.keypoints = np.empty((0, 2), np.float32)
         self.prev_keypoints = np.empty((0, 2), np.float32)

     def __str__(self):
-        coord = get_center(self.tlbr).astype(int)
-        return f'{LABEL_MAP[self.label]} {self.trk_id:>3} at ({coord[0]:>4}, {coord[1]:>3})'
+        x, y = get_center(self.tlbr)
+        return f'{LABEL_MAP[self.label]} {self.trk_id:>3} at ({int(x):>4}, {int(y):>3})'

     def __repr__(self):
         return self.__str__()

+    def __len__(self):
+        return self.end_frame - self.start_frame
+
     def __lt__(self, other):
         # ordered by approximate distance to the image plane, closer is greater
         return (self.tlbr[-1], -self.age) < (other.tlbr[-1], -other.age)

+    @property
+    def tlbr(self):
+        return self.bboxes[-1]
+
+    @property
+    def end_frame(self):
+        return self.frame_ids[-1]
+
     @property
     def active(self):
         return self.age < 2

     @property
     def confirmed(self):
-        return self.hits > 0
+        return self.hits >= self.confirm_hits

-    def update(self, tlbr, state, embedding=None):
-        self.tlbr = tlbr
+    def update(self, tlbr, state):
+        self.bboxes.append(tlbr)
         self.state = state
-        if embedding is not None:
-            self.age = 0
-            self.hits += 1
-            self.update_feature(embedding)

-    def reactivate(self, frame_id, tlbr, state, embedding):
+    def add_detection(self, frame_id, tlbr, state, embedding, is_valid=True):
+        self.frame_ids.append(frame_id)
+        self.bboxes.append(tlbr)
+        self.state = state
+        if is_valid:
+            self.last_feat = embedding
+            self.avg_feat.update(embedding)
+        self.age = 0
+        self.hits += 1
+
+    def reinstate(self, frame_id, tlbr, state, embedding):
         self.start_frame = frame_id
-        self.tlbr = tlbr
+        self.frame_ids.append(frame_id)
+        self.bboxes.append(tlbr)
         self.state = state
+        self.last_feat = embedding
+        self.avg_feat.update(embedding)
         self.age = 0
-        self.update_feature(embedding)
         self.keypoints = np.empty((0, 2), np.float32)
         self.prev_keypoints = np.empty((0, 2), np.float32)

     def mark_missed(self):
         self.age += 1

-    def update_feature(self, embedding):
-        if self.smooth_feature is None:
-            self.smooth_feature = embedding
-        else:
-            self.smooth_feature = self.alpha * self.smooth_feature + (1. - self.alpha) * embedding
-            self.smooth_feature /= np.linalg.norm(self.smooth_feature)
+    def merge_continuation(self, other):
+        self.frame_ids.extend(other.frame_ids)
+        self.bboxes.extend(other.bboxes)
+        self.state = other.state
+        self.age = other.age
+        self.hits += other.hits
+
+        self.keypoints = other.keypoints
+        self.prev_keypoints = other.prev_keypoints
+
+        if other.last_feat is not None:
+            self.last_feat = other.last_feat
+        self.avg_feat.merge(other.avg_feat)
+
+    @staticmethod
+    def next_id():
+        Track._count += 1
+        return Track._count
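AverageFeature replaces the old exponential smoothing with a running sum and a unit-normalized mean of the embeddings a track has collected. A standalone sketch of the same arithmetic (plain NumPy, without the Numba decoration; names are illustrative):

    import numpy as np

    def running_avg(sum_vec, count, embedding):
        # incremental mean followed by L2 renormalization, as in AverageFeature._average
        sum_vec = sum_vec + embedding
        avg = sum_vec / count
        return sum_vec, avg / np.linalg.norm(avg)

    sum_vec = np.zeros(4)
    for count, e in enumerate([np.array([1., 0., 0., 0.]),
                               np.array([0.6, 0.8, 0., 0.])], start=1):
        sum_vec, avg = running_avg(sum_vec, count, e)
    print(avg)  # unit-norm average of the two embeddings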
diff --git a/fastmot/tracker.py b/fastmot/tracker.py
index 3931b384..59021eb7 100644
--- a/fastmot/tracker.py
+++ b/fastmot/tracker.py
@@ -1,77 +1,127 @@
+from types import SimpleNamespace
 from collections import OrderedDict
 import itertools
 import logging
 import numpy as np
-import numba as nb
-from scipy.optimize import linear_sum_assignment
-from scipy.spatial.distance import cdist
-from cython_bbox import bbox_overlaps

 from .track import Track
 from .flow import Flow
 from .kalman_filter import MeasType, KalmanFilter
-from .utils.rect import as_rect, to_tlbr, iom
+from .utils.distance import Metric, cdist, iou_dist
+from .utils.matching import linear_assignment, greedy_match, fuse_motion, gate_cost
+from .utils.rect import as_tlbr, to_tlbr, ios, bbox_ious, find_occluded
+from .utils import Profiler


 LOGGER = logging.getLogger(__name__)
-CHI_SQ_INV_95 = 9.4877  # 0.95 quantile of chi-square distribution
-INF_COST = 1e5


 class MultiTracker:
-    """
-    Uses optical flow and Kalman filter to track multiple objects and
-    associates detections to tracklets based on motion and appearance.
-    Parameters
-    ----------
-    size : (int, int)
-        Width and height of each frame.
-    dt : float
-        Time interval in seconds between each frame.
-    metric : string
-        Feature distance metric to associate tracklets. Usually
-        `euclidean` or `cosine`.
-    config : Dict
-        Tracker parameters.
-    """
-
-    def __init__(self, size, metric, config):
+    def __init__(self, size, metric,
+                 max_age=6,
+                 age_penalty=2,
+                 motion_weight=0.2,
+                 max_assoc_cost=0.9,
+                 max_reid_cost=0.45,
+                 iou_thresh=0.4,
+                 duplicate_thresh=0.8,
+                 occlusion_thresh=0.7,
+                 conf_thresh=0.5,
+                 confirm_hits=1,
+                 history_size=50,
+                 kalman_filter_cfg=None,
+                 flow_cfg=None):
+        """Class that uses KLT and Kalman filter to track multiple objects and
+        associates detections to tracklets based on motion and appearance.
+
+        Parameters
+        ----------
+        size : tuple
+            Width and height of each frame.
+        metric : {'euclidean', 'cosine'}
+            Feature distance metric to associate tracks.
+        max_age : int, optional
+            Max number of undetected frames allowed before a track is terminated.
+            Note that skipped frames are not included.
+        age_penalty : int, optional
+            Scale factor to penalize KLT measurements for tracks with large age.
+        motion_weight : float, optional
+            Weight for motion term in matching cost function.
+        max_assoc_cost : float, optional
+            Max matching cost for valid primary association.
+        max_reid_cost : float, optional
+            Max ReID feature dissimilarity for valid reidentification.
+        iou_thresh : float, optional
+            IoU threshold for association with unconfirmed and unmatched active tracks.
+        duplicate_thresh : float, optional
+            Track overlap threshold for removing duplicate tracks.
+        occlusion_thresh : float, optional
+            Detection overlap threshold for nullifying the extracted embeddings for association/reID.
+        conf_thresh : float, optional
+            Detection confidence threshold for starting a new track.
+        confirm_hits : int, optional
+            Min number of detections to confirm a track.
+        history_size : int, optional
+            Max size of track history to keep for reID.
+        kalman_filter_cfg : SimpleNamespace, optional
+            Kalman Filter configuration.
+        flow_cfg : SimpleNamespace, optional
+            Flow configuration.
+        """
         self.size = size
-        self.metric = metric
-        self.max_age = config['max_age']
-        self.age_penalty = config['age_penalty']
-        self.age_weight = config['age_weight']
-        self.motion_weight = config['motion_weight']
-        self.max_feat_cost = config['max_feat_cost']
-        self.max_reid_cost = config['max_reid_cost']
-        self.iou_thresh = config['iou_thresh']
-        self.duplicate_iou = config['duplicate_iou']
-        self.conf_thresh = config['conf_thresh']
-        self.lost_buf_size = config['lost_buf_size']
-
-        self.next_id = 1
+        self.metric = Metric[metric.upper()]
+        assert max_age >= 1
+        self.max_age = max_age
+        assert age_penalty >= 1
+        self.age_penalty = age_penalty
+        assert 0 <= motion_weight <= 1
+        self.motion_weight = motion_weight
+        assert 0 <= max_assoc_cost <= 2
+        self.max_assoc_cost = max_assoc_cost
+        assert 0 <= max_reid_cost <= 2
+        self.max_reid_cost = max_reid_cost
+        assert 0 <= iou_thresh <= 1
+        self.iou_thresh = iou_thresh
+        assert 0 <= duplicate_thresh <= 1
+        self.duplicate_thresh = duplicate_thresh
+        assert 0 <= occlusion_thresh <= 1
+        self.occlusion_thresh = occlusion_thresh
+        assert 0 <= conf_thresh <= 1
+        self.conf_thresh = conf_thresh
+        assert confirm_hits >= 1
+        self.confirm_hits = confirm_hits
+        assert history_size >= 0
+        self.history_size = history_size
+
+        if kalman_filter_cfg is None:
+            kalman_filter_cfg = SimpleNamespace()
+        if flow_cfg is None:
+            flow_cfg = SimpleNamespace()
+
         self.tracks = {}
-        self.lost = OrderedDict()
-        self.kf = KalmanFilter(config['kalman_filter'])
-        self.flow = Flow(self.size, config['flow'])
+        self.hist_tracks = OrderedDict()
+        self.kf = KalmanFilter(**vars(kalman_filter_cfg))
+        self.flow = Flow(self.size, **vars(flow_cfg))
         self.frame_rect = to_tlbr((0, 0, *self.size))

-        self.flow_bboxes = {}
+        self.klt_bboxes = {}
         self.homography = None

-    def reset_dt(self, dt):
-        """
-        Set KalmanFilter dt parameter.
+    def reset(self, dt):
+        """Reset the tracker for new input context.
+
         Parameters
         ----------
         dt : float
             Time interval in seconds between each frame.
         """
         self.kf.reset_dt(dt)
+        self.hist_tracks.clear()
+        Track._count = 0

     def init(self, frame, detections):
-        """
-        Initializes the tracker from detections in the first frame.
+        """Initializes the tracker from detections in the first frame.
+
         Parameters
         ----------
         frame : ndarray
@@ -79,20 +129,17 @@ def init(self, frame, detections):
         detections : recarray[DET_DTYPE]
             Record array of N detections.
         """
-        self.next_id = 1
         self.tracks.clear()
         self.flow.init(frame)
         for det in detections:
             state = self.kf.create(det.tlbr)
-            new_trk = Track(0, self.next_id, det.tlbr, state, det.label)
-            self.tracks[self.next_id] = new_trk
+            new_trk = Track(0, det.tlbr, state, det.label, self.confirm_hits)
+            self.tracks[new_trk.trk_id] = new_trk
             LOGGER.debug(f"{'Detected:':<14}{new_trk}")
-            self.next_id += 1

     def track(self, frame):
-        """
-        Convenience function that combines `compute_flow` and `apply_kalman`.
+        """Convenience function that combines `compute_flow` and `apply_kalman`.
+
         Parameters
         ----------
         frame : ndarray
@@ -102,46 +149,43 @@
         self.apply_kalman()

     def compute_flow(self, frame):
-        """
-        Computes optical flow to estimate tracklet positions and camera motion.
+        """Computes optical flow to estimate tracklet positions and camera motion.
+
         Parameters
         ----------
         frame : ndarray
             The next frame.
         """
         active_tracks = [track for track in self.tracks.values() if track.active]
-        self.flow_bboxes, self.homography = self.flow.predict(frame, active_tracks)
+        self.klt_bboxes, self.homography = self.flow.predict(frame, active_tracks)
         if self.homography is None:
             # clear tracks when camera motion cannot be estimated
             self.tracks.clear()

     def apply_kalman(self):
-        """
-        Performs kalman filter prediction and update from flow measurements.
+        """Performs Kalman filter predict and update from KLT measurements.
         The function should be called after `compute_flow`.
         """
         for trk_id, track in list(self.tracks.items()):
             mean, cov = track.state
             mean, cov = self.kf.warp(mean, cov, self.homography)
             mean, cov = self.kf.predict(mean, cov)
-            if trk_id in self.flow_bboxes:
-                flow_tlbr = self.flow_bboxes[trk_id]
-                # give large flow uncertainty for occluded tracks
-                # usually these with high age and low inlier ratio
+            if trk_id in self.klt_bboxes:
+                klt_tlbr = self.klt_bboxes[trk_id]
+                # give large KLT uncertainty for occluded tracks,
+                # usually those with large age and a low inlier ratio
                 std_multiplier = max(self.age_penalty * track.age, 1) / track.inlier_ratio
-                mean, cov = self.kf.update(mean, cov, flow_tlbr, MeasType.FLOW, std_multiplier)
-            next_tlbr = as_rect(mean[:4])
+                mean, cov = self.kf.update(mean, cov, klt_tlbr, MeasType.FLOW, std_multiplier)
+            next_tlbr = as_tlbr(mean[:4])
             track.update(next_tlbr, (mean, cov))
-            if iom(next_tlbr, self.frame_rect) < 0.5:
+            if ios(next_tlbr, self.frame_rect) < 0.5:
                 if track.confirmed:
                     LOGGER.info(f"{'Out:':<14}{track}")
-                    self._mark_lost(trk_id)
-                else:
-                    del self.tracks[trk_id]
+                self._mark_lost(trk_id)

     def update(self, frame_id, detections, embeddings):
-        """
-        Associates detections to tracklets based on motion and feature embeddings.
+        """Associates detections to tracklets based on motion and feature embeddings.
+
         Parameters
         ----------
         frame_id : int
@@ -151,194 +195,229 @@
         embeddings : ndarray
             NxM matrix of N extracted embeddings with dimension M.
         """
-        det_ids = list(range(len(detections)))
-        confirmed = [trk_id for trk_id, track in self.tracks.items() if track.confirmed]
-        unconfirmed = [trk_id for trk_id, track in self.tracks.items() if not track.confirmed]
-
-        # association with motion and embeddings
-        cost = self._matching_cost(confirmed, detections, embeddings)
-        matches1, u_trk_ids1, u_det_ids = self._linear_assignment(cost, confirmed, det_ids)
+        occluded_det_mask = find_occluded(detections.tlbr, self.occlusion_thresh)
+        confirmed_by_depth, unconfirmed = self._group_tracks_by_depth()
+
+        # association with motion and embeddings, tracks with small age are prioritized
+        matches1 = []
+        u_trk_ids1 = []
+        u_det_ids = list(range(len(detections)))
+        for depth, trk_ids in enumerate(confirmed_by_depth):
+            if len(u_det_ids) == 0:
+                u_trk_ids1.extend(itertools.chain.from_iterable(confirmed_by_depth[depth:]))
+                break
+            if len(trk_ids) == 0:
+                continue
+            u_detections, u_embeddings = detections[u_det_ids], embeddings[u_det_ids]
+            u_occluded_dmask = occluded_det_mask[u_det_ids]
+            cost = self._matching_cost(trk_ids, u_detections, u_embeddings, u_occluded_dmask)
+            matches, u_trk_ids, u_det_ids = linear_assignment(cost, trk_ids, u_det_ids)
+            matches1 += matches
+            u_trk_ids1 += u_trk_ids

         # 2nd association with IoU
         active = [trk_id for trk_id in u_trk_ids1 if self.tracks[trk_id].active]
         u_trk_ids1 = [trk_id for trk_id in u_trk_ids1 if not self.tracks[trk_id].active]
         u_detections = detections[u_det_ids]
         cost = self._iou_cost(active, u_detections)
-        matches2, u_trk_ids2, u_det_ids = self._linear_assignment(cost, active, u_det_ids, True)
+        matches2, u_trk_ids2, u_det_ids = linear_assignment(cost, active, u_det_ids)

         # 3rd association with unconfirmed tracks
         u_detections = detections[u_det_ids]
         cost = self._iou_cost(unconfirmed, u_detections)
-        matches3, u_trk_ids3, u_det_ids = self._linear_assignment(cost, unconfirmed,
-                                                                  u_det_ids, True)
+        matches3, u_trk_ids3, u_det_ids = linear_assignment(cost, unconfirmed, u_det_ids)
+
+        # reID with track history
+        hist_ids = [trk_id for trk_id, track in self.hist_tracks.items()
+                    if track.avg_feat.count >= 2]

-        # re-id with lost tracks
-        lost_ids = list(self.lost.keys())
         u_det_ids = [det_id for det_id in u_det_ids if detections[det_id].conf >= self.conf_thresh]
-        u_detections, u_embeddings = detections[u_det_ids], embeddings[u_det_ids]
-        cost = self._reid_cost(u_detections, u_embeddings)
-        reid_matches, _, u_det_ids = self._linear_assignment(cost, lost_ids, u_det_ids)
+        valid_u_det_ids = [det_id for det_id in u_det_ids if not occluded_det_mask[det_id]]
+        invalid_u_det_ids = [det_id for det_id in u_det_ids if occluded_det_mask[det_id]]
+
+        u_detections, u_embeddings = detections[valid_u_det_ids], embeddings[valid_u_det_ids]
+        cost = self._reid_cost(hist_ids, u_detections, u_embeddings)
+
+        reid_matches, _, reid_u_det_ids = greedy_match(cost, hist_ids, valid_u_det_ids,
+                                                       self.max_reid_cost)

         matches = itertools.chain(matches1, matches2, matches3)
         u_trk_ids = itertools.chain(u_trk_ids1, u_trk_ids2, u_trk_ids3)
-        updated, aged = [], []
+
+        # rectify matches that may cause duplicate tracks
+        matches, u_trk_ids = self._rectify_matches(matches, u_trk_ids, detections)
+
+        # reinstate matched tracks
+        for trk_id, det_id in reid_matches:
+            track = self.hist_tracks.pop(trk_id)
+            det = detections[det_id]
+            LOGGER.info(f"{'Reidentified:':<14}{track}")
+            state = self.kf.create(det.tlbr)
+            track.reinstate(frame_id, det.tlbr, state, embeddings[det_id])
+            self.tracks[trk_id] = track

         # update matched tracks
         for trk_id, det_id in matches:
             track = self.tracks[trk_id]
             det = detections[det_id]
             mean, cov = self.kf.update(*track.state, det.tlbr, MeasType.DETECTOR)
-            next_tlbr = as_rect(mean[:4])
-            track.update(next_tlbr, (mean, cov), embeddings[det_id])
-            if track.hits == 1:
+            next_tlbr = as_tlbr(mean[:4])
+            is_valid = not occluded_det_mask[det_id]
+            if track.hits == self.confirm_hits - 1:
                 LOGGER.info(f"{'Found:':<14}{track}")
-            if iom(next_tlbr, self.frame_rect) < 0.5:
-                LOGGER.info(f"{'Out:':<14}{track}")
+            if ios(next_tlbr, self.frame_rect) < 0.5:
+                is_valid = False
+                if track.confirmed:
+                    LOGGER.info(f"{'Out:':<14}{track}")
                 self._mark_lost(trk_id)
-            else:
-                updated.append(trk_id)
-
-        # reactivate matched lost tracks
-        for trk_id, det_id in reid_matches:
-            track = self.lost[trk_id]
-            det = detections[det_id]
-            LOGGER.info(f"{'Reidentified:':<14}{track}")
-            state = self.kf.create(det.tlbr)
-            track.reactivate(frame_id, det.tlbr, state, embeddings[det_id])
-            self.tracks[trk_id] = track
-            del self.lost[trk_id]
-            updated.append(trk_id)
+            track.add_detection(frame_id, next_tlbr, (mean, cov), embeddings[det_id], is_valid)

         # clean up lost tracks
         for trk_id in u_trk_ids:
             track = self.tracks[trk_id]
+            track.mark_missed()
             if not track.confirmed:
                 LOGGER.debug(f"{'Unconfirmed:':<14}{track}")
                 del self.tracks[trk_id]
                 continue
-            track.mark_missed()
             if track.age > self.max_age:
                 LOGGER.info(f"{'Lost:':<14}{track}")
                 self._mark_lost(trk_id)
-            else:
-                aged.append(trk_id)

-        # register new detections
+        u_det_ids = itertools.chain(invalid_u_det_ids, reid_u_det_ids)
+        # start new tracks
         for det_id in u_det_ids:
             det = detections[det_id]
             state = self.kf.create(det.tlbr)
-            new_trk = Track(frame_id, self.next_id, det.tlbr, state, det.label)
-            self.tracks[self.next_id] = new_trk
+            new_trk = Track(frame_id, det.tlbr, state, det.label, self.confirm_hits)
+            self.tracks[new_trk.trk_id] = new_trk
             LOGGER.debug(f"{'Detected:':<14}{new_trk}")
-            updated.append(self.next_id)
-            self.next_id += 1
-
-        # remove duplicate tracks
-        self._remove_duplicate(updated, aged)

     def _mark_lost(self, trk_id):
-        self.lost[trk_id] = self.tracks[trk_id]
-        if len(self.lost) > self.lost_buf_size:
-            self.lost.popitem(last=False)
-        del self.tracks[trk_id]
+        track = self.tracks.pop(trk_id)
+        if track.confirmed:
+            self.hist_tracks[trk_id] = track
+            if len(self.hist_tracks) > self.history_size:
+                self.hist_tracks.popitem(last=False)
+
+    def _group_tracks_by_depth(self, group_size=2):
+        n_depth = (self.max_age + group_size) // group_size
+        confirmed_by_depth = [[] for _ in range(n_depth)]
+        unconfirmed = []
+        for trk_id, track in self.tracks.items():
+            if track.confirmed:
+                depth = track.age // group_size
+                confirmed_by_depth[depth].append(trk_id)
+            else:
+                unconfirmed.append(trk_id)
+        return confirmed_by_depth, unconfirmed

-    def _matching_cost(self, trk_ids, detections, embeddings):
-        if len(trk_ids) == 0 or len(detections) == 0:
-            return np.empty((len(trk_ids), len(detections)))
+    def _matching_cost(self, trk_ids, detections, embeddings, occluded_dmask):
+        n_trk, n_det = len(trk_ids), len(detections)
+        if n_trk == 0 or n_det == 0:
+            return np.empty((n_trk, n_det))

-        features = [self.tracks[trk_id].smooth_feature for trk_id in trk_ids]
-        cost = cdist(features, embeddings, self.metric)
+        features = np.empty((n_trk, embeddings.shape[1]))
+        invalid_fmask = np.zeros(n_trk, np.bool_)
         for i, trk_id in enumerate(trk_ids):
             track = self.tracks[trk_id]
-            motion_dist = self.kf.motion_distance(*track.state, detections.tlbr)
-            normalized_age = track.age / self.max_age
-            cost[i] = self._fuse_motion(cost[i], motion_dist, detections.label, track.label,
-                                        normalized_age, self.max_feat_cost, self.motion_weight,
-                                        self.age_weight)
-        return cost
+            if track.avg_feat.is_valid():
+                features[i, :] = track.avg_feat()
+            else:
+                invalid_fmask[i] = True

-    def _iou_cost(self, trk_ids, detections):
-        if len(trk_ids) == 0 or len(detections) == 0:
-            return np.empty((len(trk_ids), len(detections)))
+        empty_mask = invalid_fmask[:, None] | occluded_dmask
+        fill_val = min(self.max_assoc_cost + 0.1, 1.)
+        cost = cdist(features, embeddings, self.metric, empty_mask, fill_val)
+
+        # fuse motion information
+        for row, trk_id in enumerate(trk_ids):
+            track = self.tracks[trk_id]
+            m_dist = self.kf.motion_distance(*track.state, detections.tlbr)
+            fuse_motion(cost[row], m_dist, self.motion_weight)

         # make sure associated pair has the same class label
-        trk_labels = np.array([self.tracks[trk_id].label for trk_id in trk_ids])
-        trk_bboxes = np.array([self.tracks[trk_id].tlbr for trk_id in trk_ids])
-        det_bboxes = detections.tlbr
-        ious = bbox_overlaps(trk_bboxes, det_bboxes)
-        ious = self._gate_cost(ious, trk_labels, detections.label, self.iou_thresh, True)
-        return ious
-
-    def _reid_cost(self, detections, embeddings):
-        if len(self.lost) == 0 or len(detections) == 0:
-            return np.empty((len(self.lost), len(detections)))
-
-        trk_labels = np.array([track.label for track in self.lost.values()])
-        features = [track.smooth_feature for track in self.lost.values()]
+        t_labels = np.fromiter((self.tracks[trk_id].label for trk_id in trk_ids), int, n_trk)
+        gate_cost(cost, t_labels, detections.label, self.max_assoc_cost)
+        return cost
+
+    def _iou_cost(self, trk_ids, detections):
+        n_trk, n_det = len(trk_ids), len(detections)
+        if n_trk == 0 or n_det == 0:
+            return np.empty((n_trk, n_det))
+
+        t_labels = np.fromiter((self.tracks[trk_id].label for trk_id in trk_ids), int, n_trk)
+        t_bboxes = np.array([self.tracks[trk_id].tlbr for trk_id in trk_ids])
+        d_bboxes = detections.tlbr
+        iou_cost = iou_dist(t_bboxes, d_bboxes)
+        gate_cost(iou_cost, t_labels, detections.label, 1. - self.iou_thresh)
+        return iou_cost
+
+    def _reid_cost(self, hist_ids, detections, embeddings):
+        n_hist, n_det = len(hist_ids), len(detections)
+        if n_hist == 0 or n_det == 0:
+            return np.empty((n_hist, n_det))
+
+        features = np.concatenate([self.hist_tracks[trk_id].avg_feat()
+                                   for trk_id in hist_ids]).reshape(n_hist, -1)
         cost = cdist(features, embeddings, self.metric)
-        cost = self._gate_cost(cost, trk_labels, detections.label, self.max_reid_cost)
+
+        t_labels = np.fromiter((self.hist_tracks[trk_id].label for trk_id in hist_ids), int, n_hist)
+        gate_cost(cost, t_labels, detections.label)
         return cost

-    def _remove_duplicate(self, updated, aged):
-        if len(updated) == 0 or len(aged) == 0:
+    def _rectify_matches(self, matches, u_trk_ids, detections):
+        matches, u_trk_ids = set(matches), set(u_trk_ids)
+        inactive_matches = [match for match in matches if not self.tracks[match[0]].active]
+        u_active = [trk_id for trk_id in u_trk_ids
+                    if self.tracks[trk_id].confirmed and self.tracks[trk_id].active]
+
+        n_inactive_matches = len(inactive_matches)
+        if n_inactive_matches == 0 or len(u_active) == 0:
+            return matches, u_trk_ids
+
+        m_inactive, det_ids = zip(*inactive_matches)
+        t_bboxes = np.array([self.tracks[trk_id].tlbr for trk_id in u_active])
+        d_bboxes = detections[det_ids,].tlbr
+        iou_cost = iou_dist(t_bboxes, d_bboxes)
+
+        col_indices = list(range(n_inactive_matches))
+        dup_matches, _, _ = greedy_match(iou_cost, u_active, col_indices,
+                                         1. - self.duplicate_thresh)
+
+        for u_trk_id, col in dup_matches:
+            m_trk_id, det_id = m_inactive[col], det_ids[col]
+            t_u_active, t_m_inactive = self.tracks[u_trk_id], self.tracks[m_trk_id]
+            if t_m_inactive.end_frame < t_u_active.start_frame:
+                LOGGER.debug(f"{'Merged:':<14}{u_trk_id} -> {m_trk_id}")
+                t_m_inactive.merge_continuation(t_u_active)
+                u_trk_ids.remove(u_trk_id)
+                del self.tracks[u_trk_id]
+            else:
+                LOGGER.debug(f"{'Duplicate:':<14}{m_trk_id} -> {u_trk_id}")
+                u_trk_ids.remove(u_trk_id)
+                u_trk_ids.add(m_trk_id)
+                matches.remove((m_trk_id, det_id))
+                matches.add((u_trk_id, det_id))
+        return matches, u_trk_ids
+
+    def _remove_duplicate(self, trk_ids1, trk_ids2):
+        if len(trk_ids1) == 0 or len(trk_ids2) == 0:
             return

-        updated_bboxes = np.array([self.tracks[trk_id].tlbr for trk_id in updated])
-        aged_bboxes = np.array([self.tracks[trk_id].tlbr for trk_id in aged])
+        bboxes1 = np.array([self.tracks[trk_id].tlbr for trk_id in trk_ids1])
+        bboxes2 = np.array([self.tracks[trk_id].tlbr for trk_id in trk_ids2])

-        ious = bbox_overlaps(updated_bboxes, aged_bboxes)
-        idx = np.where(ious >= self.duplicate_iou)
+        ious = bbox_ious(bboxes1, bboxes2)
+        idx = np.where(ious >= self.duplicate_thresh)
         dup_ids = set()
         for row, col in zip(*idx):
-            updated_id, aged_id = updated[row], aged[col]
-            if self.tracks[updated_id].start_frame <= self.tracks[aged_id].start_frame:
-                dup_ids.add(aged_id)
+            trk_id1, trk_id2 = trk_ids1[row], trk_ids2[col]
+            track1, track2 = self.tracks[trk_id1], self.tracks[trk_id2]
+            if len(track1) > len(track2):
+                dup_ids.add(trk_id2)
             else:
-                dup_ids.add(updated_id)
+                dup_ids.add(trk_id1)
         for trk_id in dup_ids:
             LOGGER.debug(f"{'Duplicate:':<14}{self.tracks[trk_id]}")
             del self.tracks[trk_id]
-
-    @staticmethod
-    def _linear_assignment(cost, trk_ids, det_ids, maximize=False):
-        rows, cols = linear_sum_assignment(cost, maximize)
-        unmatched_rows = list(set(range(cost.shape[0])) - set(rows))
-        unmatched_cols = list(set(range(cost.shape[1])) - set(cols))
-        unmatched_trk_ids = [trk_ids[row] for row in unmatched_rows]
-        unmatched_det_ids = [det_ids[col] for col in unmatched_cols]
-        matches = []
-        if not maximize:
-            for row, col in zip(rows, cols):
-                if cost[row, col] < INF_COST:
-                    matches.append((trk_ids[row], det_ids[col]))
-                else:
-                    unmatched_trk_ids.append(trk_ids[row])
-                    unmatched_det_ids.append(det_ids[col])
-        else:
-            for row, col in zip(rows, cols):
-                if cost[row, col] > 0:
-                    matches.append((trk_ids[row], det_ids[col]))
-                else:
-                    unmatched_trk_ids.append(trk_ids[row])
-                    unmatched_det_ids.append(det_ids[col])
-        return matches, unmatched_trk_ids, unmatched_det_ids
-
-    @staticmethod
-    @nb.njit(fastmath=True, cache=True)
-    def _fuse_motion(cost, motion_dist, det_labels, label, age, max_cost, w1, w2):
-        gate = (cost > max_cost) | (motion_dist > CHI_SQ_INV_95) | (label != det_labels)
-        cost = cost + w1 * motion_dist + w2 * age
-        cost[gate] = INF_COST
-        return cost
-
-    @staticmethod
-    @nb.njit(parallel=True, fastmath=True, cache=True)
-    def _gate_cost(cost, trk_labels, det_labels, thresh, maximize=False):
-        for i in nb.prange(len(cost)):
-            if maximize:
-                gate = (cost[i] < thresh) | (trk_labels[i] != det_labels)
-                cost[i][gate] = 0
-            else:
-                gate = (cost[i] > thresh) | (trk_labels[i] != det_labels)
-                cost[i][gate] = INF_COST
-        return cost
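The matching cascade above walks confirmed tracks in order of age, using the buckets produced by _group_tracks_by_depth. A standalone sketch of the bucketing with toy numbers (group_size=2 as in the default):

    # hypothetical ages of four confirmed tracks, with max_age = 6
    ages = {1: 0, 2: 1, 3: 4, 4: 6}
    group_size = 2
    n_depth = (6 + group_size) // group_size  # 4 buckets
    buckets = [[] for _ in range(n_depth)]
    for trk_id, age in ages.items():
        buckets[age // group_size].append(trk_id)
    print(buckets)  # [[1, 2], [], [3], [4]] -> recently seen tracks get matched first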
diff --git a/fastmot/utils/distance.py b/fastmot/utils/distance.py
new file mode 100644
index 00000000..75cdfbea
--- /dev/null
+++ b/fastmot/utils/distance.py
@@ -0,0 +1,162 @@
+from enum import Enum
+import numpy as np
+import numba as nb
+
+from .rect import area, get_center
+
+
+INF_DIST = 1e5
+
+
+class Metric(Enum):
+    EUCLIDEAN = 0
+    COSINE = 1
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True)
+def cdist(XA, XB, metric, empty_mask=None, fill_val=None):
+    """Numba implementation of Scipy's cdist"""
+    assert XA.ndim == XB.ndim == 2
+    assert XA.shape[1] == XB.shape[1]
+    if empty_mask is not None:
+        assert empty_mask.ndim == 2
+        assert empty_mask.shape[0] == XA.shape[0]
+        assert empty_mask.shape[1] == XB.shape[0]
+    filler = 1. if fill_val is None else fill_val
+
+    if metric == Metric.EUCLIDEAN:
+        return euclidean(XA, XB, empty_mask, filler)
+    elif metric == Metric.COSINE:
+        return cosine(XA, XB, empty_mask, filler)
+    else:
+        raise ValueError('Unsupported distance metric')
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True)
+def pdist(X, metric):
+    """Numba implementation of Scipy's pdist"""
+    assert X.ndim == 2
+
+    if metric == Metric.EUCLIDEAN:
+        return euclidean(X, X, symmetric=True)
+    elif metric == Metric.COSINE:
+        return cosine(X, X, symmetric=True)
+    else:
+        raise ValueError('Unsupported distance metric')
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True, inline='always')
+def euclidean(XA, XB, empty_mask=None, filler=1., symmetric=False):
+    """Numba implementation of Scipy's euclidean"""
+    Y = np.empty((XA.shape[0], XB.shape[0]))
+    for i in nb.prange(XA.shape[0]):
+        for j in range(XB.shape[0]):
+            if symmetric and i >= j:
+                Y[i, j] = INF_DIST
+            elif empty_mask is not None and empty_mask[i, j]:
+                Y[i, j] = filler
+            else:
+                norm = 0.
+                for k in range(XA.shape[1]):
+                    norm += (XA[i, k] - XB[j, k])**2
+                Y[i, j] = np.sqrt(norm)
+    return Y
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True, inline='always')
+def cosine(XA, XB, empty_mask=None, filler=1., symmetric=False):
+    """Numba implementation of Scipy's cosine"""
+    Y = np.empty((XA.shape[0], XB.shape[0]))
+    for i in nb.prange(XA.shape[0]):
+        for j in range(XB.shape[0]):
+            if symmetric and i >= j:
+                Y[i, j] = INF_DIST
+            elif empty_mask is not None and empty_mask[i, j]:
+                Y[i, j] = filler
+            else:
+                dot = 0.
+                a_norm = 0.
+                b_norm = 0.
+                for k in range(XA.shape[1]):
+                    dot += XA[i, k] * XB[j, k]
+                    a_norm += XA[i, k] * XA[i, k]
+                    b_norm += XB[j, k] * XB[j, k]
+                a_norm = np.sqrt(a_norm)
+                b_norm = np.sqrt(b_norm)
+                Y[i, j] = 1. - dot / (a_norm * b_norm)
+    return Y
+
+
+@nb.njit(parallel=False, fastmath=True, cache=True)
+def iou_dist(tlbrs1, tlbrs2):
+    """Computes pairwise IoU distance."""
+    assert tlbrs1.ndim == tlbrs2.ndim == 2
+    assert tlbrs1.shape[1] == tlbrs2.shape[1] == 4
+
+    Y = np.empty((tlbrs1.shape[0], tlbrs2.shape[0]))
+    for i in nb.prange(tlbrs1.shape[0]):
+        area1 = area(tlbrs1[i, :])
+        for j in range(tlbrs2.shape[0]):
+            iw = min(tlbrs1[i, 2], tlbrs2[j, 2]) - max(tlbrs1[i, 0], tlbrs2[j, 0]) + 1
+            ih = min(tlbrs1[i, 3], tlbrs2[j, 3]) - max(tlbrs1[i, 1], tlbrs2[j, 1]) + 1
+            if iw > 0 and ih > 0:
+                area_inter = iw * ih
+                area_union = area1 + area(tlbrs2[j, :]) - area_inter
+                Y[i, j] = 1. - area_inter / area_union
+            else:
+                Y[i, j] = 1.
+    return Y
+
+
+@nb.njit(parallel=False, fastmath=True, cache=True)
+def giou_dist(tlbrs1, tlbrs2):
+    """Computes pairwise GIoU distance."""
+    assert tlbrs1.ndim == tlbrs2.ndim == 2
+    assert tlbrs1.shape[1] == tlbrs2.shape[1] == 4
+
+    Y = np.empty((tlbrs1.shape[0], tlbrs2.shape[0]))
+    for i in nb.prange(tlbrs1.shape[0]):
+        area1 = area(tlbrs1[i, :])
+        for j in range(tlbrs2.shape[0]):
+            iou = 0.
+            area_union = area1 + area(tlbrs2[j, :])
+            iw = min(tlbrs1[i, 2], tlbrs2[j, 2]) - max(tlbrs1[i, 0], tlbrs2[j, 0]) + 1
+            ih = min(tlbrs1[i, 3], tlbrs2[j, 3]) - max(tlbrs1[i, 1], tlbrs2[j, 1]) + 1
+            if iw > 0 and ih > 0:
+                area_inter = iw * ih
+                area_union -= area_inter
+                iou = area_inter / area_union
+            ew = max(tlbrs1[i, 2], tlbrs2[j, 2]) - min(tlbrs1[i, 0], tlbrs2[j, 0]) + 1
+            eh = max(tlbrs1[i, 3], tlbrs2[j, 3]) - min(tlbrs1[i, 1], tlbrs2[j, 1]) + 1
+            area_encls = ew * eh
+            giou = iou - (area_encls - area_union) / area_encls
+            Y[i, j] = (1. - giou) * 0.5
+    return Y
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True)
+def diou_dist(tlbrs1, tlbrs2):
+    """Computes pairwise DIoU distance."""
+    assert tlbrs1.ndim == tlbrs2.ndim == 2
+    assert tlbrs1.shape[1] == tlbrs2.shape[1] == 4
+
+    Y = np.empty((tlbrs1.shape[0], tlbrs2.shape[0]))
+    for i in nb.prange(tlbrs1.shape[0]):
+        area1 = area(tlbrs1[i, :])
+        x1, y1 = get_center(tlbrs1[i, :])
+        for j in range(tlbrs2.shape[0]):
+            iou = 0.
+            iw = min(tlbrs1[i, 2], tlbrs2[j, 2]) - max(tlbrs1[i, 0], tlbrs2[j, 0]) + 1
+            ih = min(tlbrs1[i, 3], tlbrs2[j, 3]) - max(tlbrs1[i, 1], tlbrs2[j, 1]) + 1
+            if iw > 0 and ih > 0:
+                area_inter = iw * ih
+                area_union = area1 + area(tlbrs2[j, :]) - area_inter
+                iou = area_inter / area_union
+            ew = max(tlbrs1[i, 2], tlbrs2[j, 2]) - min(tlbrs1[i, 0], tlbrs2[j, 0]) + 1
+            eh = max(tlbrs1[i, 3], tlbrs2[j, 3]) - min(tlbrs1[i, 1], tlbrs2[j, 1]) + 1
+            c = ew**2 + eh**2
+            x2, y2 = get_center(tlbrs2[j, :])
+            d = (x2 - x1)**2 + (y2 - y1)**2
+            diou = iou - (d / c)**0.6
+            Y[i, j] = (1. - diou) * 0.5
+    return Y
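These kernels mirror SciPy's cdist but add masking: where empty_mask is True the pairwise distance is not computed and fill_val is written instead, which is how featureless tracks and occluded detections are neutralized in the tracker. A quick sketch of the call pattern with toy data (assumes the module compiles as above):

    import numpy as np
    from fastmot.utils.distance import cdist, Metric

    feats = np.eye(3)[:2]           # 2 track features
    embs = np.eye(3)                # 3 detection embeddings
    mask = np.zeros((2, 3), np.bool_)
    mask[0, 2] = True               # pretend detection 2 is occluded for track 0
    cost = cdist(feats, embs, Metric.COSINE, mask, 1.)
    # cost[0, 2] == 1. regardless of the actual cosine distance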
+ """ + row_ids = np.fromiter(row_ids, int, len(row_ids)) + col_ids = np.fromiter(col_ids, int, len(col_ids)) + return _greedy_match(cost, row_ids, col_ids, max_cost) + + +@nb.njit(fastmath=True, cache=True) +def _get_assignment_matches(cost, row_ids, col_ids, m_rows, m_cols): + unmatched_rows = list(set(range(cost.shape[0])) - set(m_rows)) + unmatched_cols = list(set(range(cost.shape[1])) - set(m_cols)) + unmatched_row_ids = [row_ids[row] for row in unmatched_rows] + unmatched_col_ids = [col_ids[col] for col in unmatched_cols] + matches = [] + for row, col in zip(m_rows, m_cols): + if cost[row, col] < INF_COST: + matches.append((row_ids[row], col_ids[col])) + else: + unmatched_row_ids.append(row_ids[row]) + unmatched_col_ids.append(col_ids[col]) + return matches, unmatched_row_ids, unmatched_col_ids + + +@nb.njit(fastmath=True, cache=True) +def _greedy_match(cost, row_ids, col_ids, max_cost): + indices_rows = np.arange(cost.shape[0]) + indices_cols = np.arange(cost.shape[1]) + + matches = [] + while cost.shape[0] > 0 and cost.shape[1] > 0: + idx = np.argmin(cost) + i, j = idx // cost.shape[1], idx % cost.shape[1] + if cost[i, j] <= max_cost: + matches.append((row_ids[indices_rows[i]], col_ids[indices_cols[j]])) + row_mask = np.ones(cost.shape[0], np.bool_) + col_mask = np.ones(cost.shape[1], np.bool_) + row_mask[i] = False + col_mask[j] = False + + indices_rows = indices_rows[row_mask] + indices_cols = indices_cols[col_mask] + cost = cost[row_mask, :][:, col_mask] + else: + break + + unmatched_row_ids = [row_ids[row] for row in indices_rows] + unmatched_col_ids = [col_ids[col] for col in indices_cols] + return matches, unmatched_row_ids, unmatched_col_ids + + +@nb.njit(fastmath=True, cache=True) +def fuse_motion(cost, m_dist, m_weight): + """Fuse each row of cost matrix with motion information.""" + norm_factor = 1. / CHI_SQ_INV_95 + f_weight = 1. - m_weight + cost[:] = f_weight * cost + m_weight * norm_factor * m_dist + cost[m_dist > CHI_SQ_INV_95] = INF_COST + + +@nb.njit(parallel=False, fastmath=True, cache=True) +def gate_cost(cost, row_labels, col_labels, max_cost=None): + """Gate cost matrix if cost exceeds the maximum.""" + for i in nb.prange(cost.shape[0]): + for j in range(cost.shape[1]): + if (row_labels[i] != col_labels[j] or + max_cost is not None and cost[i, j] > max_cost): + cost[i, j] = INF_COST diff --git a/fastmot/utils/numba.py b/fastmot/utils/numba.py new file mode 100644 index 00000000..d7b2e0bb --- /dev/null +++ b/fastmot/utils/numba.py @@ -0,0 +1,64 @@ +import numpy as np +import numba as nb + + +@nb.njit(fastmath=True, cache=True) +def apply_along_axis(func1d, mat, axis): + """Numba utility to apply reduction to a given axis.""" + assert mat.ndim == 2 + assert axis in [0, 1] + if axis == 0: + result = np.empty(mat.shape[1], mat.dtype) + for i in range(len(result)): + result[i, :] = func1d(mat[:, i]) + else: + result = np.empty(mat.shape[0], mat.dtype) + for i in range(len(result)): + result[i, :] = func1d(mat[i, :]) + return result + + +@nb.njit(parallel=True, fastmath=True, cache=True) +def normalize_vec(vectors): + """Numba utility to normalize an array of vectors.""" + assert vectors.ndim == 2 + out = np.empty_like(vectors) + for i in nb.prange(vectors.shape[0]): + norm_factor = 1. 
diff --git a/fastmot/utils/numba.py b/fastmot/utils/numba.py
new file mode 100644
index 00000000..d7b2e0bb
--- /dev/null
+++ b/fastmot/utils/numba.py
@@ -0,0 +1,64 @@
+import numpy as np
+import numba as nb
+
+
+@nb.njit(fastmath=True, cache=True)
+def apply_along_axis(func1d, mat, axis):
+    """Numba utility to apply reduction to a given axis."""
+    assert mat.ndim == 2
+    assert axis in [0, 1]
+    if axis == 0:
+        result = np.empty(mat.shape[1], mat.dtype)
+        for i in range(len(result)):
+            result[i] = func1d(mat[:, i])
+    else:
+        result = np.empty(mat.shape[0], mat.dtype)
+        for i in range(len(result)):
+            result[i] = func1d(mat[i, :])
+    return result
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True)
+def normalize_vec(vectors):
+    """Numba utility to normalize an array of vectors."""
+    assert vectors.ndim == 2
+    out = np.empty_like(vectors)
+    for i in nb.prange(vectors.shape[0]):
+        norm_factor = 1. / np.linalg.norm(vectors[i, :])
+        out[i, :] = norm_factor * vectors[i, :]
+    return out
+
+
+@nb.njit(fastmath=True, cache=True)
+def mask_area(mask):
+    """Utility to calculate the area of a mask."""
+    count = 0
+    m_raveled = mask.ravel()
+    for i in range(mask.size):
+        if m_raveled[i] != 0:
+            count += 1
+    return count
+
+
+@nb.njit(fastmath=True, cache=True, inline='always')
+def transform(pts, m):
+    """Numba implementation of OpenCV's transform."""
+    pts = np.asarray(pts, dtype=np.float64)
+    pts = np.atleast_2d(pts)
+
+    augment = np.ones((len(pts), 1))
+    pts = np.concatenate((pts, augment), axis=1)
+    return pts @ m.T
+
+
+@nb.njit(fastmath=True, cache=True, inline='always')
+def perspective_transform(pts, m):
+    """Numba implementation of OpenCV's perspectiveTransform."""
+    pts = np.asarray(pts, dtype=np.float64)
+    pts = np.atleast_2d(pts)
+
+    augment = np.ones((len(pts), 1))
+    pts = np.concatenate((pts, augment), axis=1).T
+    pts = m @ pts
+    pts = pts / pts[-1]
+    return pts[:2].T
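perspective_transform reproduces cv2.perspectiveTransform for an Nx2 point array and a 3x3 homography. A quick sanity check with an identity homography, where the output should equal the input:

    import numpy as np
    from fastmot.utils.numba import perspective_transform

    pts = np.array([[10., 20.], [30., 40.]])
    h = np.eye(3)
    print(perspective_transform(pts, h))  # [[10. 20.] [30. 40.]]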
diff --git a/fastmot/utils/rect.py b/fastmot/utils/rect.py
index b1981554..67b4c7ba 100644
--- a/fastmot/utils/rect.py
+++ b/fastmot/utils/rect.py
@@ -2,136 +2,170 @@
 import numba as nb


-@nb.njit(cache=True)
-def as_rect(tlbr):
-    tlbr = np.asarray(tlbr, np.float64)
-    tlbr = np.rint(tlbr)
-    return tlbr
-
-
-@nb.njit(cache=True)
+@nb.njit(cache=True, inline='always')
+def as_tlbr(tlbr):
+    """Construct a rectangle from a tuple or np.ndarray."""
+    _tlbr = np.empty(4)
+    _tlbr[0] = round(float(tlbr[0]), 0)
+    _tlbr[1] = round(float(tlbr[1]), 0)
+    _tlbr[2] = round(float(tlbr[2]), 0)
+    _tlbr[3] = round(float(tlbr[3]), 0)
+    return _tlbr
+
+
+@nb.njit(cache=True, inline='always')
 def get_size(tlbr):
-    tl, br = tlbr[:2], tlbr[2:]
-    size = br - tl + 1
-    return size
+    return tlbr[2] - tlbr[0] + 1, tlbr[3] - tlbr[1] + 1


-@nb.njit(cache=True)
-def area(tlbr):
-    size = get_size(tlbr)
-    return int(size[0] * size[1])
+@nb.njit(cache=True, inline='always')
+def aspect_ratio(tlbr):
+    w, h = get_size(tlbr)
+    return h / w if w > 0 else 0.


-@nb.njit(cache=True)
-def mask_area(mask):
-    return np.count_nonzero(mask)
+@nb.njit(cache=True, inline='always')
+def area(tlbr):
+    w, h = get_size(tlbr)
+    if w <= 0 or h <= 0:
+        return 0.
+    return w * h


-@nb.njit(cache=True)
+@nb.njit(cache=True, inline='always')
 def get_center(tlbr):
-    xmin, ymin, xmax, ymax = tlbr
-    return np.array([(xmin + xmax) / 2, (ymin + ymax) / 2])
+    return (tlbr[0] + tlbr[2]) / 2, (tlbr[1] + tlbr[3]) / 2


-@nb.njit(cache=True)
+@nb.njit(cache=True, inline='always')
 def to_tlwh(tlbr):
-    return np.append(tlbr[:2], get_size(tlbr))
+    tlwh = np.empty(4)
+    tlwh[:2] = tlbr[:2]
+    tlwh[2:] = get_size(tlbr)
+    return tlwh


-@nb.njit(cache=True)
+@nb.njit(cache=True, inline='always')
 def to_tlbr(tlwh):
-    tlwh = np.asarray(tlwh, np.float64)
-    tlwh = np.rint(tlwh)
-    tl, size = tlwh[:2], tlwh[2:]
-    br = tl + size - 1
-    return np.append(tl, br)
+    tlbr = np.empty(4)
+    xmin = float(tlwh[0])
+    ymin = float(tlwh[1])
+    tlbr[0] = round(xmin, 0)
+    tlbr[1] = round(ymin, 0)
+    tlbr[2] = round(xmin + float(tlwh[2]) - 1., 0)
+    tlbr[3] = round(ymin + float(tlwh[3]) - 1., 0)
+    return tlbr


-@nb.njit(cache=True)
+@nb.njit(cache=True, inline='always')
 def intersection(tlbr1, tlbr2):
-    tl1, br1 = tlbr1[:2], tlbr1[2:]
-    tl2, br2 = tlbr2[:2], tlbr2[2:]
-    tl = np.maximum(tl1, tl2)
-    br = np.minimum(br1, br2)
-    tlbr = np.append(tl, br)
-    if np.any(get_size(tlbr) <= 0):
+    tlbr = np.empty(4)
+    tlbr[0] = max(tlbr1[0], tlbr2[0])
+    tlbr[1] = max(tlbr1[1], tlbr2[1])
+    tlbr[2] = min(tlbr1[2], tlbr2[2])
+    tlbr[3] = min(tlbr1[3], tlbr2[3])
+    if tlbr[2] < tlbr[0] or tlbr[3] < tlbr[1]:
         return None
     return tlbr


-@nb.njit(cache=True)
-def union(tlbr1, tlbr2):
-    tl1, br1 = tlbr1[:2], tlbr1[2:]
-    tl2, br2 = tlbr2[:2], tlbr2[2:]
-    tl = np.minimum(tl1, tl2)
-    br = np.maximum(br1, br2)
-    tlbr = np.append(tl, br)
+@nb.njit(cache=True, inline='always')
+def enclosing(tlbr1, tlbr2):
+    tlbr = np.empty(4)
+    tlbr[0] = min(tlbr1[0], tlbr2[0])
+    tlbr[1] = min(tlbr1[1], tlbr2[1])
+    tlbr[2] = max(tlbr1[2], tlbr2[2])
+    tlbr[3] = max(tlbr1[3], tlbr2[3])
     return tlbr


-@nb.njit(cache=True)
+@nb.njit(cache=True, inline='always')
 def crop(img, tlbr):
-    xmin, ymin, xmax, ymax = tlbr.astype(np.int_)
+    xmin = max(int(tlbr[0]), 0)
+    ymin = max(int(tlbr[1]), 0)
+    xmax = max(int(tlbr[2]), 0)
+    ymax = max(int(tlbr[3]), 0)
     return img[ymin:ymax + 1, xmin:xmax + 1]


-@nb.njit(cache=True)
+@nb.njit(cache=True, inline='always')
 def multi_crop(img, tlbrs):
-    tlbrs_ = tlbrs.astype(np.int_)
-    return [img[tlbrs_[i][1]:tlbrs_[i][3] + 1, tlbrs_[i][0]:tlbrs_[i][2] + 1]
-            for i in range(len(tlbrs_))]
+    _tlbrs = tlbrs.astype(np.int_)
+    _tlbrs = np.maximum(_tlbrs, 0)
+    return [img[_tlbrs[i, 1]:_tlbrs[i, 3] + 1, _tlbrs[i, 0]:_tlbrs[i, 2] + 1]
+            for i in range(len(_tlbrs))]
+
+
+@nb.njit(fastmath=True, cache=True, inline='always')
+def ios(tlbr1, tlbr2):
+    """Computes intersection over self."""
+    iw = min(tlbr1[2], tlbr2[2]) - max(tlbr1[0], tlbr2[0]) + 1
+    ih = min(tlbr1[3], tlbr2[3]) - max(tlbr1[1], tlbr2[1]) + 1
+    if iw <= 0 or ih <= 0:
+        return 0.
+    area_inter = iw * ih
+    area_self = area(tlbr1)
+    return area_inter / area_self


-@nb.njit(fastmath=True, cache=True)
+@nb.njit(fastmath=True, cache=True, inline='always')
 def iom(tlbr1, tlbr2):
-    """
-    Computes intersection over minimum.
-    """
-    tlbr = intersection(tlbr1, tlbr2)
-    if tlbr is None:
+    """Computes intersection over minimum."""
+    iw = min(tlbr1[2], tlbr2[2]) - max(tlbr1[0], tlbr2[0]) + 1
+    ih = min(tlbr1[3], tlbr2[3]) - max(tlbr1[1], tlbr2[1]) + 1
+    if iw <= 0 or ih <= 0:
         return 0.
-    area_intersection = area(tlbr)
-    area_minimum = min(area(tlbr1), area(tlbr2))
-    return area_intersection / area_minimum
-
-
-@nb.njit(fastmath=True, cache=True)
-def transform(pts, m):
-    """
-    Numba implementation of OpenCV's transform.
-    """
-    pts = np.asarray(pts)
-    pts = np.atleast_2d(pts)
-    augment = np.ones((len(pts), 1))
-    pts = np.concatenate((pts, augment), axis=1)
-    return pts @ m.T
-
-
-@nb.njit(fastmath=True, cache=True)
-def perspective_transform(pts, m):
-    """
-    Numba implementation of OpenCV's perspectiveTransform.
-    """
-    pts = np.asarray(pts)
-    pts = np.atleast_2d(pts)
-    augment = np.ones((len(pts), 1))
-    pts = np.concatenate((pts, augment), axis=1).T
-    pts = m @ pts
-    pts = pts / pts[-1]
-    return pts[:2].T
+    area_inter = iw * ih
+    area_min = min(area(tlbr1), area(tlbr2))
+    return area_inter / area_min
+
+
+@nb.njit(parallel=False, fastmath=True, cache=True)
+def bbox_ious(tlbrs1, tlbrs2):
+    """Computes pairwise bounding box overlaps using IoU."""
+    ious = np.empty((tlbrs1.shape[0], tlbrs2.shape[0]))
+    for i in nb.prange(tlbrs1.shape[0]):
+        area1 = area(tlbrs1[i, :])
+        for j in range(tlbrs2.shape[0]):
+            iw = min(tlbrs1[i, 2], tlbrs2[j, 2]) - max(tlbrs1[i, 0], tlbrs2[j, 0]) + 1
+            ih = min(tlbrs1[i, 3], tlbrs2[j, 3]) - max(tlbrs1[i, 1], tlbrs2[j, 1]) + 1
+            if iw > 0 and ih > 0:
+                area_inter = iw * ih
+                area_union = area1 + area(tlbrs2[j, :]) - area_inter
+                ious[i, j] = area_inter / area_union
+            else:
+                ious[i, j] = 0.
+    return ious
+
+
+@nb.njit(parallel=False, fastmath=True, cache=True)
+def find_occluded(tlbrs, occlusion_thresh):
+    """Computes a mask of occluded bounding boxes."""
+    occluded_mask = np.zeros(tlbrs.shape[0], dtype=np.bool_)
+    for i in nb.prange(tlbrs.shape[0]):
+        area_self = area(tlbrs[i, :])
+        for j in range(tlbrs.shape[0]):
+            if i != j:
+                iw = min(tlbrs[i, 2], tlbrs[j, 2]) - max(tlbrs[i, 0], tlbrs[j, 0]) + 1
+                ih = min(tlbrs[i, 3], tlbrs[j, 3]) - max(tlbrs[i, 1], tlbrs[j, 1]) + 1
+                if iw > 0 and ih > 0:
+                    ios = iw * ih / area_self
+                    if ios >= occlusion_thresh:
+                        occluded_mask[i] = True
+                        break
+    return occluded_mask


 @nb.njit(fastmath=True, cache=True)
 def nms(tlwhs, scores, nms_thresh):
-    """
-    Applies Non-Maximum Suppression on the bounding boxes [x, y, w, h].
+    """Applies Non-Maximum Suppression on the bounding boxes [x, y, w, h].
     Returns an array with the indexes of the bounding boxes we want to keep.
     """
     areas = tlwhs[:, 2] * tlwhs[:, 3]
     ordered = scores.argsort()[::-1]

-    tl = tlwhs[:, :2]
-    br = tlwhs[:, :2] + tlwhs[:, 2:] - 1
+    tls = tlwhs[:, :2]
+    brs = tlwhs[:, :2] + tlwhs[:, 2:] - 1

     keep = []
     while ordered.size > 0:
@@ -139,14 +173,14 @@ def nms(tlwhs, scores, nms_thresh):
         i = ordered[0]
         keep.append(i)

-        other_tl = tl[ordered[1:]]
-        other_br = br[ordered[1:]]
+        other_tls = tls[ordered[1:]]
+        other_brs = brs[ordered[1:]]

         # compute IoU
-        inter_xmin = np.maximum(tl[i, 0], other_tl[:, 0])
-        inter_ymin = np.maximum(tl[i, 1], other_tl[:, 1])
-        inter_xmax = np.minimum(br[i, 0], other_br[:, 0])
-        inter_ymax = np.minimum(br[i, 1], other_br[:, 1])
+        inter_xmin = np.maximum(tls[i, 0], other_tls[:, 0])
+        inter_ymin = np.maximum(tls[i, 1], other_tls[:, 1])
+        inter_xmax = np.minimum(brs[i, 0], other_brs[:, 0])
+        inter_ymax = np.minimum(brs[i, 1], other_brs[:, 1])

         inter_w = np.maximum(0, inter_xmax - inter_xmin + 1)
         inter_h = np.maximum(0, inter_ymax - inter_ymin + 1)
@@ -156,21 +190,19 @@ def nms(tlwhs, scores, nms_thresh):
         idx = np.where(iou <= nms_thresh)[0]
         ordered = ordered[idx + 1]

-    keep = np.asarray(keep)
+    keep = np.array(keep)
     return keep


 @nb.njit(fastmath=True, cache=True)
 def diou_nms(tlwhs, scores, nms_thresh, beta=0.6):
-    """
-    Applies Non-Maximum Suppression using the DIoU metric.
-    """
+    """Applies Non-Maximum Suppression using the DIoU metric."""
     areas = tlwhs[:, 2] * tlwhs[:, 3]
     ordered = scores.argsort()[::-1]

-    tl = tlwhs[:, :2]
-    br = tlwhs[:, :2] + tlwhs[:, 2:] - 1
-    centers = (tl + br) / 2
+    tls = tlwhs[:, :2]
+    brs = tlwhs[:, :2] + tlwhs[:, 2:] - 1
+    centers = (tls + brs) / 2

     keep = []
     while ordered.size > 0:
@@ -178,14 +210,14 @@ def diou_nms(tlwhs, scores, nms_thresh, beta=0.6):
         i = ordered[0]
         keep.append(i)

-        other_tl = tl[ordered[1:]]
-        other_br = br[ordered[1:]]
+        other_tls = tls[ordered[1:]]
+        other_brs = brs[ordered[1:]]

         # compute IoU
-        inter_xmin = np.maximum(tl[i, 0], other_tl[:, 0])
-        inter_ymin = np.maximum(tl[i, 1], other_tl[:, 1])
-        inter_xmax = np.minimum(br[i, 0], other_br[:, 0])
-        inter_ymax = np.minimum(br[i, 1], other_br[:, 1])
+        inter_xmin = np.maximum(tls[i, 0], other_tls[:, 0])
+        inter_ymin = np.maximum(tls[i, 1], other_tls[:, 1])
+        inter_xmax = np.minimum(brs[i, 0], other_brs[:, 0])
+        inter_ymax = np.minimum(brs[i, 1], other_brs[:, 1])

         inter_w = np.maximum(0, inter_xmax - inter_xmin + 1)
         inter_h = np.maximum(0, inter_ymax - inter_ymin + 1)
@@ -194,18 +226,18 @@ def diou_nms(tlwhs, scores, nms_thresh, beta=0.6):
         iou = inter_area / union_area

         # compute DIoU
-        union_xmin = np.minimum(tl[i, 0], other_tl[:, 0])
-        union_ymin = np.minimum(tl[i, 1], other_tl[:, 1])
-        union_xmax = np.maximum(br[i, 0], other_br[:, 0])
-        union_ymax = np.maximum(br[i, 1], other_br[:, 1])
-
-        union_w = union_xmax - union_xmin + 1
-        union_h = union_ymax - union_ymin + 1
-        c = union_w**2 + union_h**2
+        encls_xmin = np.minimum(tls[i, 0], other_tls[:, 0])
+        encls_ymin = np.minimum(tls[i, 1], other_tls[:, 1])
+        encls_xmax = np.maximum(brs[i, 0], other_brs[:, 0])
+        encls_ymax = np.maximum(brs[i, 1], other_brs[:, 1])
+
+        encls_w = encls_xmax - encls_xmin + 1
+        encls_h = encls_ymax - encls_ymin + 1
+        c = encls_w**2 + encls_h**2
         d = np.sum((centers[i] - centers[ordered[1:]])**2, axis=1)
         diou = iou - (d / c)**beta

         idx = np.where(diou <= nms_thresh)[0]
         ordered = ordered[idx + 1]

-    keep = np.asarray(keep)
+    keep = np.array(keep)
     return keep
diff --git a/fastmot/utils/visualization.py b/fastmot/utils/visualization.py
index 76d9feb7..6f290921 100644
--- a/fastmot/utils/visualization.py
+++ b/fastmot/utils/visualization.py
@@ -15,26 +15,25 @@ def draw_tracks(frame, tracks, show_flow=False, show_cov=False):
         draw_covariance(frame, track.tlbr, track.state[1])


-def draw_detections(frame, detections):
+def draw_detections(frame, detections, color=(255, 255, 255), show_conf=False):
     for det in detections:
-        draw_bbox(frame, det.tlbr, (255, 255, 255), 1)
+        text = f'{det.conf:.2f}' if show_conf else None
+        draw_bbox(frame, det.tlbr, color, 1, text)


-def draw_flow_bboxes(frame, tracker):
-    for tlbr in tracker.flow_bboxes.values():
-        draw_bbox(frame, tlbr, 0, 1)
+def draw_klt_bboxes(frame, klt_bboxes, color=(0, 0, 0)):
+    for tlbr in klt_bboxes:
+        draw_bbox(frame, tlbr, color, 1)


-def draw_tiles(frame, detector):
-    assert hasattr(detector, 'tiles')
-    for tile in detector.tiles:
-        tlbr = np.rint(tile * np.tile(detector.scale_factor, 2))
-        draw_bbox(frame, tlbr, 0, 1)
+def draw_tiles(frame, tiles, scale_factor, color=(0, 0, 0)):
+    for tile in tiles:
+        tlbr = np.rint(tile * np.tile(scale_factor, 2))
+        draw_bbox(frame, tlbr, color, 1)


-def draw_background_flow(frame, tracker):
-    draw_feature_match(frame, tracker.flow.prev_bg_keypoints,
-                       tracker.flow.bg_keypoints, (0, 0, 255))
+def draw_background_flow(frame, prev_bg_keypoints, bg_keypoints, color=(0, 0, 255)):
+    draw_feature_match(frame, prev_bg_keypoints, bg_keypoints, color)


 def get_color(idx, s=0.8, vmin=0.7):
@@ -84,3 +83,46 @@ def ellipse(cov):
     cv2.ellipse(frame, tl, axes, angle, 0, 360, (255, 255, 255), 1, cv2.LINE_AA)
     axes, angle = ellipse(covariance[2:4, 2:4])
     cv2.ellipse(frame, br, axes, angle, 0, 360, (255, 255, 255), 1, cv2.LINE_AA)
+
+
+class Visualizer:
+    def __init__(self,
+                 draw_detections=False,
+                 draw_confidence=False,
+                 draw_covariance=False,
+                 draw_klt=False,
+                 draw_obj_flow=False,
+                 draw_bg_flow=False):
+        """Class for visualization.
+
+        Parameters
+        ----------
+        draw_detections : bool, optional
+            Enable drawing detections.
+        draw_confidence : bool, optional
+            Enable drawing detection confidence, ignored if `draw_detections` is disabled.
+        draw_covariance : bool, optional
+            Enable drawing Kalman filter position covariance.
+        draw_klt : bool, optional
+            Enable drawing KLT bounding boxes.
+        draw_obj_flow : bool, optional
+            Enable drawing object flow matches.
+        draw_bg_flow : bool, optional
+            Enable drawing background flow matches.
+        """
+        self.draw_detections = draw_detections
+        self.draw_confidence = draw_confidence
+        self.draw_covariance = draw_covariance
+        self.draw_klt = draw_klt
+        self.draw_obj_flow = draw_obj_flow
+        self.draw_bg_flow = draw_bg_flow
+
+    def render(self, frame, tracks, detections, klt_bboxes, prev_bg_keypoints, bg_keypoints):
+        """Render visualizations onto the frame."""
+        draw_tracks(frame, tracks, show_flow=self.draw_obj_flow, show_cov=self.draw_covariance)
+        if self.draw_detections:
+            draw_detections(frame, detections, show_conf=self.draw_confidence)
+        if self.draw_klt:
+            draw_klt_bboxes(frame, klt_bboxes)
+        if self.draw_bg_flow:
+            draw_background_flow(frame, prev_bg_keypoints, bg_keypoints)
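Visualizer bundles the previously free-standing drawing calls behind boolean toggles. A minimal usage sketch (the keyword names come from this patch; the surrounding loop is illustrative only):

    from fastmot.utils.visualization import Visualizer

    vis = Visualizer(draw_detections=True, draw_confidence=True, draw_klt=True)
    # inside the frame loop, after the tracker has stepped:
    # vis.render(frame, tracks, detections, klt_bboxes, prev_bg_kps, bg_kps)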
diff --git a/fastmot/videoio.py b/fastmot/videoio.py
index 54c81c39..bd48b0e5 100644
--- a/fastmot/videoio.py
+++ b/fastmot/videoio.py
@@ -22,33 +22,45 @@ class Protocol(Enum):


 class VideoIO:
-    """
-    Class for capturing from a video file, an image sequence, or a camera, and saving video output.
-    Encoding, decoding, and scaling can be accelerated using the GStreamer backend.
-    Parameters
-    ----------
-    size : (int, int)
-        Width and height of each frame to output.
-    config : Dict
-        Camera and buffer configuration.
-    input_uri : string
-        URI to an input video file or capturing device.
-    output_uri : string
-        URI to an output video file.
-    proc_fps : int
-        Estimated processing speed. This depends on compute and scene complexity.
-    """
+    def __init__(self, size, input_uri,
+                 output_uri=None,
+                 resolution=(1920, 1080),
+                 frame_rate=30,
+                 buffer_size=10,
+                 proc_fps=30):
+        """Class for capturing from a video/image sequence/camera, and saving video output.
+        Encoding, decoding, and scaling can be accelerated using the GStreamer backend.

-    def __init__(self, size, config, input_uri, output_uri=None, proc_fps=30):
+        Parameters
+        ----------
+        size : tuple
+            Width and height of each frame to output.
+        input_uri : str
+            URI to an input video file or capturing device.
+        output_uri : str, optional
+            URI to an output video file.
+        resolution : tuple, optional
+            Resolution of the input source.
+        frame_rate : int, optional
+            Frame rate of the input source.
+        buffer_size : int, optional
+            Number of frames to buffer.
+            For live sources, a larger buffer drops fewer frames but increases latency.
+        proc_fps : int, optional
+            Estimated processing speed that may limit the capture interval `cap_dt`.
+            This depends on hardware and processing complexity.
+        """
         self.size = size
         self.input_uri = input_uri
         self.output_uri = output_uri
+        self.resolution = resolution
+        assert frame_rate > 0
+        self.frame_rate = frame_rate
+        assert buffer_size >= 1
+        self.buffer_size = buffer_size
+        assert proc_fps > 0
         self.proc_fps = proc_fps

-        self.resolution = config['resolution']
-        self.frame_rate = config['frame_rate']
-        self.buffer_size = config['buffer_size']
-
         self.protocol = self._parse_uri(self.input_uri)
         self.is_live = self.protocol != Protocol.IMAGE and self.protocol != Protocol.VIDEO
         if WITH_GSTREAMER:
@@ -90,18 +102,14 @@ def cap_dt(self):
         return 1 / min(self.cap_fps, self.proc_fps) if self.is_live else 1 / self.cap_fps

     def start_capture(self):
-        """
-        Start capturing from file or device.
-        """
+        """Start capturing from file or device."""
         if not self.source.isOpened():
             self.source.open(self._gst_cap_pipeline(), cv2.CAP_GSTREAMER)
         if not self.cap_thread.is_alive():
             self.cap_thread.start()

     def stop_capture(self):
-        """
-        Stop capturing from file or device.
-        """
+        """Stop capturing from file or device."""
         with self.cond:
             self.exit_event.set()
             self.cond.notify()
@@ -109,9 +117,12 @@ def stop_capture(self):
         self.cap_thread.join()

     def read(self):
-        """
-        Returns the next video frame.
-        Returns None if there are no more frames.
+        """Reads the next video frame.
+
+        Returns
+        -------
+        ndarray
+            Returns None if there are no more frames.
         """
         with self.cond:
             while len(self.frame_queue) == 0 and not self.exit_event.is_set():
@@ -125,16 +136,12 @@ def read(self):
         return frame

     def write(self, frame):
-        """
-        Writes the next video frame.
-        """
+        """Writes the next video frame."""
         assert hasattr(self, 'writer')
         self.writer.write(frame)

     def release(self):
-        """
-        Closes video file or capturing device.
-        """
+        """Cleans up input and output sources."""
         self.stop_capture()
         if hasattr(self, 'writer'):
             self.writer.release()
@@ -198,7 +205,10 @@ def _gst_cap_pipeline(self):
             else:
                 raise RuntimeError('GStreamer V4L2 plugin not found')
         elif self.protocol == Protocol.RTSP:
-            pipeline = 'rtspsrc location=%s latency=0 ! capsfilter caps=application/x-rtp,media=video ! decodebin ! ' % self.input_uri
+            pipeline = (
+                'rtspsrc location=%s latency=0 ! '
+                'capsfilter caps=application/x-rtp,media=video ! decodebin ! ' % self.input_uri
+            )
         elif self.protocol == Protocol.HTTP:
             pipeline = 'souphttpsrc location=%s is-live=true ! decodebin ! ' % self.input_uri
         return pipeline + cvt_pipeline
@@ -209,7 +219,7 @@ def _gst_write_pipeline(self):
         if 'omxh264enc' in gst_elements:
             h264_encoder = 'omxh264enc preset-level=2'
         elif 'x264enc' in gst_elements:
-            h264_encoder = 'x264enc'
+            h264_encoder = 'x264enc pass=4'
         else:
             raise RuntimeError('GStreamer H.264 encoder not found')
         pipeline = (
diff --git a/requirements.txt b/requirements.txt
index a07e65b5..3183a8de 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,5 +2,4 @@ numpy >= 1.17
 scipy >= 1.5
 numba == 0.48
 tensorflow < 2.0
-cupy == 9.2
-cython-bbox
\ No newline at end of file
+cupy == 9.2
\ No newline at end of file
diff --git a/scripts/install_jetson.sh b/scripts/install_jetson.sh
index 8c8ac157..ac554147 100755
--- a/scripts/install_jetson.sh
+++ b/scripts/install_jetson.sh
@@ -27,15 +27,14 @@ if [ ! -x "$(command -v nvcc)" ]; then
     source ~/.bashrc
 fi

-# Numpy, PyCUDA, TensorFlow, cython-bbox
+# NumPy and TensorFlow
 sudo apt-get update
 sudo apt-get install -y python3-pip libhdf5-serial-dev hdf5-tools libcanberra-gtk-module
-sudo -H pip3 install cython
-sudo -H pip3 install numpy cython-bbox
+sudo -H pip3 install numpy
 sudo ln -s /usr/include/locale.h /usr/include/xlocale.h
 sudo -H pip3 install --no-cache-dir --extra-index-url https://developer.download.nvidia.com/compute/redist/jp/v$JP_VERSION tensorflow==$TF_VERSION+nv$NV_VERSION

-# Scipy
+# SciPy
 sudo apt-get install -y libatlas-base-dev gfortran
 sudo -H pip3 install scipy==1.5
diff --git a/scripts/yolo2onnx.py b/scripts/yolo2onnx.py
index 023f46f3..4f89292d 100755
--- a/scripts/yolo2onnx.py
+++ b/scripts/yolo2onnx.py
@@ -661,6 +661,29 @@ def _make_conv_node(self, layer_name, layer_dict):
             inputs = [layer_name_mish]
             layer_name_output = layer_name_mish

+        elif layer_dict['activation'] == 'swish':
+            layer_name_sigmoid = layer_name + '_sigmoid'
+            layer_name_swish = layer_name + '_swish'
+
+            sigmoid_node = helper.make_node(
+                'Sigmoid',
+                inputs=inputs,
+                outputs=[layer_name_sigmoid],
+                name=layer_name_sigmoid
+            )
+            self._nodes.append(sigmoid_node)
+
+            inputs.append(layer_name_sigmoid)
+            swish_node = helper.make_node(
+                'Mul',
+                inputs=inputs,
+                outputs=[layer_name_swish],
+                name=layer_name_swish
+            )
+            self._nodes.append(swish_node)
+
+            inputs = [layer_name_swish]
+            layer_name_output = layer_name_swish
         elif layer_dict['activation'] == 'logistic':
             layer_name_lgx = layer_name + '_lgx'
@@ -888,10 +911,8 @@ def main():
     print('Checking ONNX model...')
     onnx.checker.check_model(yolo_model_def)

-    print('Saving ONNX file...')
     onnx.save(yolo_model_def, output_file_path)
-
-    print('Done.')
+    print(f'ONNX file saved to {output_file_path}')


 if __name__ == '__main__':
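The new 'swish' branch expresses swish(x) = x * sigmoid(x) as a Sigmoid node followed by a Mul that consumes both the original input and the sigmoid output, since ONNX has no dedicated Swish op. A quick numeric check of that decomposition (plain NumPy, independent of the converter):

    import numpy as np

    def swish(x):
        return x * (1. / (1. + np.exp(-x)))  # x * sigmoid(x)

    x = np.array([-1., 0., 1.])
    print(swish(x))  # [-0.26894142  0.          0.73105858]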