diff --git a/Dockerfile b/Dockerfile index ecb2680b..bdeebfcc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -83,7 +83,6 @@ ARG CUPY_NVCC_GENERATE_CODE RUN if [[ -z ${CUPY_NVCC_GENERATE_CODE} ]]; then \ echo "CUPY_NVCC_GENERATE_CODE not set, building CuPy for all architectures (slower)"; \ fi && \ - pip install --no-cache-dir cython && \ if [[ ${TRT_IMAGE_VERSION} == 21.05 ]]; then \ CUPY_NUM_BUILD_JOBS=$(nproc) pip install --no-cache-dir -r <(grep -ivE "tensorflow" requirements.txt); \ else \ diff --git a/app.py b/app.py index 551c48f9..edadab58 100755 --- a/app.py +++ b/app.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from pathlib import Path +from types import SimpleNamespace import argparse import logging import json @@ -39,27 +40,27 @@ def main(): # load config file with open(args.config) as cfg_file: - config = json.load(cfg_file, cls=ConfigDecoder) + config = json.load(cfg_file, cls=ConfigDecoder, object_hook=lambda d: SimpleNamespace(**d)) + + stream = fastmot.VideoIO(config.resize_to, args.input_uri, args.output_uri, **vars(config.stream_cfg)) mot = None log = None - stream = fastmot.VideoIO(config['resize_to'], config['video_io'], args.input_uri, args.output_uri) - if args.mot: draw = args.gui or args.output_uri is not None - mot = fastmot.MOT(config['resize_to'], config['mot'], draw=draw, verbose=args.verbose) + mot = fastmot.MOT(config.resize_to, **vars(config.mot_cfg), draw=draw) mot.reset(stream.cap_dt) if args.log is not None: Path(args.log).parent.mkdir(parents=True, exist_ok=True) log = open(args.log, 'w') if args.gui: - cv2.namedWindow("Video", cv2.WINDOW_AUTOSIZE) + cv2.namedWindow('Video', cv2.WINDOW_AUTOSIZE) logger.info('Starting video capture...') stream.start_capture() try: with Profiler('app') as prof: - while not args.gui or cv2.getWindowProperty("Video", 0) >= 0: + while not args.gui or cv2.getWindowProperty('Video', 0) >= 0: frame = stream.read() if frame is None: break @@ -67,9 +68,9 @@ def main(): if args.mot: mot.step(frame) if log is not None: - for track in mot.visible_tracks: - tl = track.tlbr[:2] / config['resize_to'] * stream.resolution - br = track.tlbr[2:] / config['resize_to'] * stream.resolution + for track in mot.visible_tracks(): + tl = track.tlbr[:2] / config.resize_to * stream.resolution + br = track.tlbr[2:] / config.resize_to * stream.resolution w, h = br - tl + 1 log.write(f'{mot.frame_count},{track.trk_id},{tl[0]:.6f},{tl[1]:.6f},' f'{w:.6f},{h:.6f},-1,-1,-1\n') diff --git a/cfg/mot.json b/cfg/mot.json index f2008a63..c6029e40 100644 --- a/cfg/mot.json +++ b/cfg/mot.json @@ -1,78 +1,80 @@ { "resize_to": [1280, 720], - "video_io": { + "stream_cfg": { "resolution": [1920, 1080], "frame_rate": 30, "buffer_size": 10 }, - "mot": { + "mot_cfg": { "detector_type": "YOLO", "detector_frame_skip": 5, - "ssd_detector": { + "ssd_detector_cfg": { "model": "SSDInceptionV2", "class_ids": [1], "tile_overlap": 0.25, "tiling_grid": [4, 2], "conf_thresh": 0.5, - "max_area": 130000, - "merge_thresh": 0.6 + "merge_thresh": 0.6, + "max_area": 120000 }, - "yolo_detector": { + "yolo_detector_cfg": { "model": "YOLOv4", "class_ids": [1], "conf_thresh": 0.25, + "nms_thresh": 0.5, "max_area": 800000, - "nms_thresh": 0.5 + "min_aspect_ratio": 1.2 }, - "public_detector": { - "sequence": "eval/data/MOT20-03", + "public_detector_cfg": { + "sequence_path": "MOT20/train/MOT20-01", "conf_thresh": 0.5, "max_area": 800000 }, - "feature_extractor": { + "feature_extractor_cfg": { "model": "OSNet025", "batch_size": 16 }, - "multi_tracker": { + "tracker_cfg": { "max_age": 6, 
"age_penalty": 2, - "age_weight": 0.1, - "motion_weight": 0.02, - "max_feat_cost": 0.9, + "motion_weight": 0.2, + "max_assoc_cost": 0.8, "max_reid_cost": 0.6, "iou_thresh": 0.4, - "duplicate_iou": 0.8, + "duplicate_thresh": 0.8, + "occlusion_thresh": 0.7, "conf_thresh": 0.5, - "lost_buf_size": 50, + "confirm_hits": 1, + "history_size": 50, - "kalman_filter": { + "kalman_filter_cfg": { "std_factor_acc": 2.25, "std_offset_acc": 78.5, "std_factor_det": [0.08, 0.08], - "std_factor_flow": [0.14, 0.14], + "std_factor_klt": [0.14, 0.14], "min_std_det": [4.0, 4.0], - "min_std_flow": [5.0, 5.0], + "min_std_klt": [5.0, 5.0], "init_pos_weight": 5, - "init_vel_weight": 15, + "init_vel_weight": 12, "vel_coupling": 0.6, "vel_half_life": 2 }, - "flow": { + "flow_cfg": { "bg_feat_scale_factor": [0.1, 0.1], "opt_flow_scale_factor": [0.5, 0.5], - "feature_density": 0.005, + "feat_density": 0.005, "feat_dist_factor": 0.06, "ransac_max_iter": 500, "ransac_conf": 0.99, "max_error": 100, "inlier_thresh": 4, "bg_feat_thresh": 10, - "target_feat_params": { + "obj_feat_params": { "maxCorners": 1000, "qualityLevel": 0.06, "blockSize": 3 @@ -83,6 +85,15 @@ "criteria": [3, 10, 0.03] } } + }, + + "visualizer_cfg": { + "draw_detections": false, + "draw_confidence": false, + "draw_covariance": false, + "draw_klt": false, + "draw_obj_flow": false, + "draw_bg_flow": false } } } diff --git a/fastmot/detector.py b/fastmot/detector.py index 3868e042..a2d98712 100644 --- a/fastmot/detector.py +++ b/fastmot/detector.py @@ -10,8 +10,8 @@ from . import models from .utils import TRTInference -from .utils.rect import as_rect, to_tlbr, get_size, area -from .utils.rect import union, crop, multi_crop, iom, diou_nms +from .utils.rect import as_tlbr, aspect_ratio, to_tlbr, get_size, area +from .utils.rect import enclosing, multi_crop, iom, diou_nms DET_DTYPE = np.dtype( @@ -28,50 +28,83 @@ def __init__(self, size): self.size = size def __call__(self, frame): + """Detect objects synchronously.""" self.detect_async(frame) return self.postprocess() @abc.abstractmethod def detect_async(self, frame): - """ - Asynchronous detection. - """ raise NotImplementedError @abc.abstractmethod def postprocess(self): - """ - Synchronizes, applies postprocessing, and returns a record array - of detections (DET_DTYPE). - This function should be called after `detect_async`. - """ raise NotImplementedError class SSDDetector(Detector): - def __init__(self, size, config): + def __init__(self, size, + model='SSDInceptionV2', + class_ids=None, + tile_overlap=0.25, + tiling_grid=(4, 2), + conf_thresh=0.5, + merge_thresh=0.6, + max_area=120000): + """An object detector for SSD models. + + Parameters + ---------- + size : tuple + Width and height of each frame. + model : str, optional + SSD model to use. + Must be the name of a class that inherits `models.SSD`. + class_ids : tuple, optional + Class IDs to detect. + tile_overlap : float, optional + Ratio of overlap to width and height of each tile. + tiling_grid : tuple, optional + Width and height of tile layout to split each frame for batch inference. + conf_thresh : float, optional + Detection confidence threshold. + merge_thresh : float, optional + Overlap threshold to merge bounding boxes across tiles. + max_area : int, optional + Max area of bounding boxes to detect. 
+ """ super().__init__(size) - self.label_mask = np.zeros(len(models.LABEL_MAP), dtype=bool) - self.label_mask[list(config['class_ids'])] = True - - self.model = getattr(models, config['model']) - self.tile_overlap = config['tile_overlap'] - self.tiling_grid = config['tiling_grid'] - self.conf_thresh = config['conf_thresh'] - self.max_area = config['max_area'] - self.merge_thresh = config['merge_thresh'] + self.model = models.SSD.get_model(model) + assert 0 <= tile_overlap <= 1 + self.tile_overlap = tile_overlap + assert tiling_grid[0] >= 1 and tiling_grid[1] >= 1 + self.tiling_grid = tiling_grid + assert 0 <= conf_thresh <= 1 + self.conf_thresh = conf_thresh + assert 0 <= merge_thresh <= 1 + self.merge_thresh = merge_thresh + assert max_area >= 0 + self.max_area = max_area + + class_ids = [] if class_ids is None else list(class_ids) + self.label_mask = np.zeros(len(models.LABEL_MAP), dtype=np.bool_) + self.label_mask[class_ids] = True self.batch_size = int(np.prod(self.tiling_grid)) - self.tiles, self.tiling_region_size = self._generate_tiles() - self.scale_factor = np.asarray(self.size) / self.tiling_region_size + self.tiles, self.tiling_region_sz = self._generate_tiles() + self.scale_factor = np.array(self.size) / self.tiling_region_sz self.backend = TRTInference(self.model, self.batch_size) self.inp_handle = self.backend.input.host.reshape(self.batch_size, *self.model.INPUT_SHAPE) def detect_async(self, frame): + """Detects objects asynchronously.""" self._preprocess(frame) self.backend.infer_async() def postprocess(self): + """Synchronizes, applies postprocessing, and returns a record array + of detections (DET_DTYPE). + This function should be called after `detect_async`. + """ det_out = self.backend.synchronize()[0] detections, tile_ids = self._filter_dets(det_out, self.tiles, self.model.TOPK, self.label_mask, self.max_area, @@ -80,12 +113,12 @@ def postprocess(self): return detections def _preprocess(self, frame): - frame = cv2.resize(frame, self.tiling_region_size) + frame = cv2.resize(frame, self.tiling_region_sz) self._normalize(frame, self.tiles, self.inp_handle) def _generate_tiles(self): - tile_size = np.asarray(self.model.INPUT_SHAPE[:0:-1]) - tiling_grid = np.asarray(self.tiling_grid) + tile_size = np.array(self.model.INPUT_SHAPE[:0:-1]) + tiling_grid = np.array(self.tiling_grid) step_size = (1 - self.tile_overlap) * tile_size total_size = (tiling_grid - 1) * step_size + tile_size total_size = np.rint(total_size).astype(int) @@ -94,8 +127,8 @@ def _generate_tiles(self): return tiles, tuple(total_size) def _merge_dets(self, detections, tile_ids): - detections = np.asarray(detections, dtype=DET_DTYPE).view(np.recarray) - tile_ids = np.asarray(tile_ids) + detections = np.fromiter(detections, DET_DTYPE, len(detections)).view(np.recarray) + tile_ids = np.fromiter(tile_ids, int, len(tile_ids)) if len(detections) == 0: return detections detections = self._merge(detections, tile_ids, self.batch_size, self.merge_thresh) @@ -121,7 +154,7 @@ def _filter_dets(det_out, tiles, topk, label_mask, max_area, thresh, scale_facto tile_ids = [] for tile_idx in range(len(tiles)): tile = tiles[tile_idx] - size = get_size(tile) + w, h = get_size(tile) tile_offset = tile_idx * topk for det_idx in range(topk): offset = (tile_offset + det_idx) * 7 @@ -130,9 +163,11 @@ def _filter_dets(det_out, tiles, topk, label_mask, max_area, thresh, scale_facto if conf < thresh: break if label_mask[label]: - tl = (det_out[offset + 3:offset + 5] * size + tile[:2]) * scale_factor - br = (det_out[offset + 5:offset + 
7] * size + tile[:2]) * scale_factor - tlbr = as_rect(np.append(tl, br)) + xmin = (det_out[offset + 3] * w + tile[0]) * scale_factor[0] + ymin = (det_out[offset + 4] * h + tile[1]) * scale_factor[1] + xmax = (det_out[offset + 5] * w + tile[0]) * scale_factor[0] + ymax = (det_out[offset + 6] * h + tile[1]) * scale_factor[1] + tlbr = as_tlbr((xmin, ymin, xmax, ymax)) if 0 < area(tlbr) <= max_area: detections.append((tlbr, label, conf)) tile_ids.append(tile_idx) @@ -168,52 +203,92 @@ def _merge(dets, tile_ids, num_tile, thresh): tile_ids[j] = -1 stack.append(j) for k in candidates: - dets[i].tlbr[:] = union(dets[i].tlbr, dets[k].tlbr) + dets[i].tlbr[:] = enclosing(dets[i].tlbr, dets[k].tlbr) dets[i].conf = max(dets[i].conf, dets[k].conf) keep.discard(k) - keep = np.asarray(list(keep)) + keep = np.array(list(keep)) return dets[keep] class YOLODetector(Detector): - def __init__(self, size, config): + def __init__(self, size, + model='YOLOv4', + class_ids=None, + conf_thresh=0.25, + nms_thresh=0.5, + max_area=800000, + min_aspect_ratio=1.2): + """An object detector for YOLO models. + + Parameters + ---------- + size : tuple + Width and height of each frame. + model : str, optional + YOLO model to use. + Must be the name of a class that inherits `models.YOLO`. + class_ids : tuple, optional + Class IDs to detect. + conf_thresh : float, optional + Detection confidence threshold. + nms_thresh : float, optional + Nonmaximum suppression overlap threshold. + Set higher to detect crowded objects. + max_area : int, optional + Max area of bounding boxes to detect. + min_aspect_ratio : float, optional + Min aspect ratio (height over width) of bounding boxes to detect. + Set to 0.1 for square shaped objects. + """ super().__init__(size) - self.model = getattr(models, config['model']) - self.class_ids = config['class_ids'] - self.conf_thresh = config['conf_thresh'] - self.max_area = config['max_area'] - self.nms_thresh = config['nms_thresh'] + self.model = models.YOLO.get_model(model) + self.class_ids = tuple() if class_ids is None else class_ids + assert 0 <= conf_thresh <= 1 + self.conf_thresh = conf_thresh + assert 0 <= nms_thresh <= 1 + self.nms_thresh = nms_thresh + assert max_area >= 0 + self.max_area = max_area + assert min_aspect_ratio >= 0 + self.min_aspect_ratio = min_aspect_ratio self.backend = TRTInference(self.model, 1) self.inp_handle, self.upscaled_sz, self.bbox_offset = self._create_letterbox() def detect_async(self, frame): + """Detects objects asynchronously.""" self._preprocess(frame) self.backend.infer_async(from_device=True) def postprocess(self): + """Synchronizes, applies postprocessing, and returns a record array + of detections (DET_DTYPE). + This function should be called after `detect_async`. 
+ """ det_out = self.backend.synchronize() det_out = np.concatenate(det_out).reshape(-1, 7) detections = self._filter_dets(det_out, self.upscaled_sz, self.class_ids, self.conf_thresh, - self.nms_thresh, self.max_area, self.bbox_offset) - detections = np.asarray(detections, dtype=DET_DTYPE).view(np.recarray) + self.nms_thresh, self.max_area, self.min_aspect_ratio, + self.bbox_offset) + detections = np.fromiter(detections, DET_DTYPE, len(detections)).view(np.recarray) return detections def _preprocess(self, frame): - frame_dev = cp.asarray(frame) - # resize - zoom = np.roll(self.inp_handle.shape, -1) / frame_dev.shape - small_dev = cupyx.scipy.ndimage.zoom(frame_dev, zoom, order=1, mode='opencv', grid_mode=True) - # BGR to RGB - rgb_dev = small_dev[..., ::-1] - # HWC -> CHW - chw_dev = rgb_dev.transpose(2, 0, 1) - # normalize to [0, 1] interval - cp.multiply(chw_dev, 1 / 255., out=self.inp_handle) + zoom = np.roll(self.inp_handle.shape, -1) / frame.shape + with self.backend.stream: + frame_dev = cp.asarray(frame) + # resize + small_dev = cupyx.scipy.ndimage.zoom(frame_dev, zoom, order=1, mode='opencv', grid_mode=True) + # BGR to RGB + rgb_dev = small_dev[..., ::-1] + # HWC -> CHW + chw_dev = rgb_dev.transpose(2, 0, 1) + # normalize to [0, 1] interval + cp.multiply(chw_dev, 1 / 255., out=self.inp_handle) def _create_letterbox(self): - src_size = np.asarray(self.size) - dst_size = np.asarray(self.model.INPUT_SHAPE[:0:-1]) + src_size = np.array(self.size) + dst_size = np.array(self.model.INPUT_SHAPE[:0:-1]) if self.model.LETTERBOX: scale_factor = min(dst_size / src_size) scaled_size = np.rint(src_size * scale_factor).astype(int) @@ -233,7 +308,7 @@ def _create_letterbox(self): @staticmethod @nb.njit(fastmath=True, cache=True) - def _filter_dets(det_out, size, class_ids, conf_thresh, nms_thresh, max_area, offset): + def _filter_dets(det_out, size, class_ids, conf_thresh, nms_thresh, max_area, min_ar, offset): """ det_out: a list of 3 tensors, where each tensor contains a multiple of 7 float32 numbers in @@ -254,30 +329,46 @@ def _filter_dets(det_out, size, class_ids, conf_thresh, nms_thresh, max_area, of class_dets = det_out[class_idx] class_keep = diou_nms(class_dets[:, :4], class_dets[:, 4], nms_thresh) keep.extend(class_idx[class_keep]) - keep = np.asarray(keep) + keep = np.array(keep) nms_dets = det_out[keep] detections = [] for i in range(len(nms_dets)): tlbr = to_tlbr(nms_dets[i, :4]) - # clip inside frame - tlbr = np.maximum(tlbr, 0) - tlbr = np.minimum(tlbr, np.append(size, size)) label = int(nms_dets[i, 5]) conf = nms_dets[i, 4] * nms_dets[i, 6] - if 0 < area(tlbr) <= max_area: + if 0 < area(tlbr) <= max_area and aspect_ratio(tlbr) >= min_ar: detections.append((tlbr, label, conf)) return detections class PublicDetector(Detector): - def __init__(self, size, frame_skip, config): + def __init__(self, size, frame_skip, sequence_path=None, conf_thresh=0.5, max_area=800000): + """Class to use MOT Challenge's public detections. + + Parameters + ---------- + size : tuple + Width and height of each frame. + frame_skip : int + Detector frame skip. + sequence_path : str, optional + Relative path to MOT Challenge's sequence directory. + conf_thresh : float, optional + Detection confidence threshold. + max_area : int, optional + Max area of bounding boxes to detect. 
+ """ super().__init__(size) self.frame_skip = frame_skip - self.seq_root = Path(__file__).parents[1] / config['sequence'] - self.conf_thresh = config['conf_thresh'] - self.max_area = config['max_area'] - + assert sequence_path is not None + self.seq_root = Path(__file__).parents[1] / sequence_path + assert 0 <= conf_thresh <= 1 + self.conf_thresh = conf_thresh + assert max_area >= 0 + self.max_area = max_area + + assert self.seq_root.exists() seqinfo = configparser.ConfigParser() seqinfo.read(self.seq_root / 'seqinfo.ini') self.seq_size = (int(seqinfo['Sequence']['imWidth']), int(seqinfo['Sequence']['imHeight'])) @@ -286,17 +377,17 @@ def __init__(self, size, frame_skip, config): self.frame_id = 0 det_txt = self.seq_root / 'det' / 'det.txt' - for mot_det in np.loadtxt(det_txt, delimiter=','): - frame_id = int(mot_det[0]) - 1 - tlbr = to_tlbr(mot_det[2:6]) - conf = 1.0 # mot_det[6] - label = 1 # mot_det[7] (person) - # scale and clip inside frame + for mot_challenge_det in np.loadtxt(det_txt, delimiter=','): + frame_id = int(mot_challenge_det[0]) - 1 + tlbr = to_tlbr(mot_challenge_det[2:6]) + # mot_challenge_det[6] + conf = 1.0 + # mot_challenge_det[7] + label = 1 # person + # scale inside frame tlbr[:2] = tlbr[:2] / self.seq_size * self.size tlbr[2:] = tlbr[2:] / self.seq_size * self.size - tlbr = np.maximum(tlbr, 0) - tlbr = np.minimum(tlbr, np.append(self.size, self.size)) - tlbr = as_rect(tlbr) + tlbr = np.rint(tlbr) if conf >= self.conf_thresh and area(tlbr) <= self.max_area: self.detections[frame_id].append((tlbr, label, conf)) @@ -304,6 +395,6 @@ def detect_async(self, frame): pass def postprocess(self): - detections = np.asarray(self.detections[self.frame_id], dtype=DET_DTYPE).view(np.recarray) + detections = np.array(self.detections[self.frame_id], DET_DTYPE).view(np.recarray) self.frame_id += self.frame_skip return detections diff --git a/fastmot/feature_extractor.py b/fastmot/feature_extractor.py index 27296200..a0ece637 100644 --- a/fastmot/feature_extractor.py +++ b/fastmot/feature_extractor.py @@ -9,9 +9,20 @@ class FeatureExtractor: - def __init__(self, config): - self.model = getattr(models, config['model']) - self.batch_size = config['batch_size'] + def __init__(self, model='OSNet025', batch_size=16): + """A feature extractor for ReID embeddings. + + Parameters + ---------- + model : str, optional + ReID model to use. + Must be the name of a class that inherits `models.ReID`. + batch_size : int, optional + Batch size for inference. + """ + self.model = models.ReID.get_model(model) + assert batch_size >= 1 + self.batch_size = batch_size self.feature_dim = self.model.OUTPUT_LAYOUT self.backend = TRTInference(self.model, self.batch_size) @@ -25,19 +36,18 @@ def __del__(self): self.pool.close() self.pool.join() - def __call__(self, frame, detections): - self.extract_async(frame, detections) + def __call__(self, frame, tlbrs): + """Extract feature embeddings from bounding boxes synchronously.""" + self.extract_async(frame, tlbrs) return self.postprocess() @property def metric(self): return self.model.METRIC - def extract_async(self, frame, detections): - """ - Extract feature embeddings from detections asynchronously. 
- """ - imgs = multi_crop(frame, detections.tlbr) + def extract_async(self, frame, tlbrs): + """Extract feature embeddings from bounding boxes asynchronously.""" + imgs = multi_crop(frame, tlbrs) self.embeddings, cur_imgs = [], [] # pipeline inference and preprocessing the next batch in parallel for offset in range(0, len(imgs), self.batch_size): @@ -50,8 +60,7 @@ def extract_async(self, frame, detections): self.last_num_features = len(cur_imgs) def postprocess(self): - """ - Synchronizes, applies postprocessing, and returns a NxM matrix of N + """Synchronizes, applies postprocessing, and returns a NxM matrix of N extracted embeddings with dimension M. This API should be called after `extract_async`. """ @@ -65,8 +74,7 @@ def postprocess(self): return embeddings def null_embeddings(self, detections): - """ - Returns returns a NxM matrix of N identical embeddings with dimension M. + """Returns a NxM matrix of N identical embeddings with dimension M. This API effectively disables feature extraction. """ embeddings = np.ones((len(detections), self.feature_dim)) diff --git a/fastmot/flow.py b/fastmot/flow.py index 699ae313..92f77669 100644 --- a/fastmot/flow.py +++ b/fastmot/flow.py @@ -6,39 +6,91 @@ import cv2 from .utils.rect import to_tlbr, get_size, get_center -from .utils.rect import mask_area, intersection, crop, transform +from .utils.rect import intersection, crop +from .utils.numba import mask_area, transform LOGGER = logging.getLogger(__name__) class Flow: - """ - A KLT tracker based on optical flow feature point matching. - Camera motion is simultaneously estimated by tracking feature points - on the background. - Parameters - ---------- - size : (int, int) - Width and height of each frame. - config : Dict - KLT hyperparameters. - """ - - def __init__(self, size, config): + def __init__(self, size, + bg_feat_scale_factor=(0.1, 0.1), + opt_flow_scale_factor=(0.5, 0.5), + feat_density=0.005, + feat_dist_factor=0.06, + ransac_max_iter=500, + ransac_conf=0.99, + max_error=100, + inlier_thresh=4, + bg_feat_thresh=10, + obj_feat_params=None, + opt_flow_params=None): + """A KLT tracker based on optical flow feature point matching. + Camera motion is simultaneously estimated by tracking feature points + on the background. + + Parameters + ---------- + size : tuple + Width and height of each frame. + bg_feat_scale_factor : tuple, optional + Width and height scale factors to resize frame for background feature detection. + opt_flow_scale_factor : tuple, optional + Width and height scale factors to resize frame for optical flow. + feat_density : float, optional + Min feature point density to keep inside the bounding box. + feat_dist_factor : float, optional + Target size scale factor to estimate min feature point distance. + ransac_max_iter : int, optional + Max RANSAC iterations to filter matched outliers. + ransac_conf : float, optional + RANSAC confidence threshold to filter matched outliers. + max_error : int, optional + Max optical flow error. + inlier_thresh : int, optional + Min number of inliers for valid matching. + bg_feat_thresh : int, optional + FAST threshold for background feature detection. + obj_feat_params : SimpleNamespace, optional + GFTT parameters for object feature detection, see `cv2.goodFeaturesToTrack`. + opt_flow_params : SimpleNamespace, optional + Optical flow parameters, see `cv2.calcOpticalFlowPyrLK`. 
+ """ self.size = size - self.bg_feat_scale_factor = config['bg_feat_scale_factor'] - self.opt_flow_scale_factor = config['opt_flow_scale_factor'] - self.feature_density = config['feature_density'] - self.feat_dist_factor = config['feat_dist_factor'] - self.ransac_max_iter = config['ransac_max_iter'] - self.ransac_conf = config['ransac_conf'] - self.max_error = config['max_error'] - self.inlier_thresh = config['inlier_thresh'] - - self.bg_feat_thresh = config['bg_feat_thresh'] - self.target_feat_params = config['target_feat_params'] - self.opt_flow_params = config['opt_flow_params'] + assert 0 < bg_feat_scale_factor[0] <= 1 and 0 < bg_feat_scale_factor[1] <= 1 + self.bg_feat_scale_factor = bg_feat_scale_factor + assert 0 < opt_flow_scale_factor[0] <= 1 and 0 < opt_flow_scale_factor[1] <= 1 + self.opt_flow_scale_factor = opt_flow_scale_factor + assert 0 <= feat_density <= 1 + self.feat_density = feat_density + assert feat_dist_factor >= 0 + self.feat_dist_factor = feat_dist_factor + assert ransac_max_iter >= 0 + self.ransac_max_iter = ransac_max_iter + assert 0 <= ransac_conf <= 1 + self.ransac_conf = ransac_conf + assert 0 <= max_error <= 255 + self.max_error = max_error + assert inlier_thresh >= 1 + self.inlier_thresh = inlier_thresh + assert bg_feat_thresh >= 0 + self.bg_feat_thresh = bg_feat_thresh + + self.obj_feat_params = { + "maxCorners": 1000, + "qualityLevel": 0.06, + "blockSize": 3 + } + self.opt_flow_params = { + "winSize": (5, 5), + "maxLevel": 5, + "criteria": (3, 10, 0.03) + } + if obj_feat_params is not None: + self.obj_feat_params.update(vars(obj_feat_params)) + if opt_flow_params is not None: + self.opt_flow_params.update(vars(opt_flow_params)) self.bg_feat_detector = cv2.FastFeatureDetector_create(threshold=self.bg_feat_thresh) @@ -67,9 +119,8 @@ def __init__(self, size, config): self.frame_rect = to_tlbr((0, 0, *self.size)) def init(self, frame): - """ - Preprocesses the first frame to prepare for subsequent optical - flow computations. + """Preprocesses the first frame to prepare for subsequent `predict`. + Parameters ---------- frame : ndarray @@ -82,8 +133,8 @@ self.prev_bg_keypoints = np.empty((0, 2), np.float32) def predict(self, frame, tracks): - """ - Predicts tracklet positions in the next frame and estimates camera motion. + """Predicts tracklet positions in the next frame and estimates camera motion. + Parameters ---------- frame : ndarray @@ -91,6 +142,7 @@ tracks : List[Track] List of tracks to predict. Feature points of each track are updated in place. + Returns ------- Dict[int, ndarray], ndarray @@ -113,12 +165,12 @@ target_area = mask_area(target_mask) keypoints = self._rect_filter(track.keypoints, inside_tlbr, self.fg_mask) # only detect new keypoints when too few are propagated - if len(keypoints) < self.feature_density * target_area: + if len(keypoints) < self.feat_density * target_area: img = crop(self.prev_frame_gray, inside_tlbr) feature_dist = self._estimate_feature_dist(target_area, self.feat_dist_factor) keypoints = cv2.goodFeaturesToTrack(img, mask=target_mask, minDistance=feature_dist, - **self.target_feat_params) + **self.obj_feat_params) if keypoints is None: keypoints = np.empty((0, 2), np.float32) else: @@ -223,18 +275,17 @@ _estimate_bbox(tlbr, affine_mat): tl = transform(tlbr[:2], affine_mat).ravel() scale = np.linalg.norm(affine_mat[:2, 0]) scale = 1.
if scale < 0.9 or scale > 1.1 else scale - size = scale * get_size(tlbr) - return to_tlbr(np.append(tl, size)) + w, h = get_size(tlbr) + return to_tlbr((tl[0], tl[1], w * scale, h * scale)) @staticmethod @nb.njit(fastmath=True, cache=True) def _rect_filter(pts, tlbr, fg_mask): if len(pts) == 0: return np.empty((0, 2), np.float32) - tl, br = tlbr[:2], tlbr[2:] pts2i = np.rint(pts).astype(np.int32) # filter out points outside the rectangle - ge_le = (pts2i >= tl) & (pts2i <= br) + ge_le = (pts2i >= tlbr[:2]) & (pts2i <= tlbr[2:]) inside = np.where(ge_le[:, 0] & ge_le[:, 1]) pts, pts2i = pts[inside], pts2i[inside] # keep points inside the foreground area @@ -246,20 +297,20 @@ @nb.njit(fastmath=True, cache=True) def _ellipse_filter(pts, tlbr, offset): offset = np.asarray(offset, np.float32) + center = np.array(get_center(tlbr)) + semi_axes = np.array(get_size(tlbr)) * 0.5 pts = pts.reshape(-1, 2) pts = pts + offset - center = get_center(tlbr) - semi_axes = get_size(tlbr) * 0.5 # filter out points outside the ellipse keep = np.sum(((pts - center) / semi_axes)**2, axis=1) <= 1. return pts[keep] @staticmethod @nb.njit(fastmath=True, cache=True) - def _fg_filter(prev_pts, cur_pts, fg_mask, frame_size): + def _fg_filter(prev_pts, cur_pts, fg_mask, frame_sz): if len(cur_pts) == 0: return prev_pts, cur_pts - size = np.asarray(frame_size) + size = np.array(frame_sz) pts2i = np.rint(cur_pts).astype(np.int32) # filter out points outside the frame ge_lt = (pts2i >= 0) & (pts2i < size) @@ -274,7 +325,7 @@ @staticmethod @nb.njit(fastmath=True, cache=True) def _scale_pts(pts, scale_factor): - scale_factor = np.asarray(scale_factor, np.float32) + scale_factor = np.array(scale_factor, np.float32) pts = pts * scale_factor pts = pts.reshape(-1, 1, 2) return pts @@ -282,13 +333,14 @@ @staticmethod @nb.njit(fastmath=True, cache=True) def _unscale_pts(pts, scale_factor, mask=None): - scale_factor = np.asarray(scale_factor, np.float32) + scale_factor = np.array(scale_factor, np.float32) + unscale_factor = 1 / scale_factor pts = pts.reshape(-1, 2) if mask is None: - pts = pts / scale_factor + pts = pts * unscale_factor else: idx = np.where(mask) - pts[idx] = pts[idx] / scale_factor + pts[idx] = pts[idx] * unscale_factor return pts @staticmethod
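A note on the `Flow` constructor above: the GFTT and LK parameters are now built-in defaults that config overrides merge into, and both guards must be `is not None` (the second one is fixed above — `vars(None)` raises a `TypeError`). A minimal sketch of the merge pattern, with a hypothetical override value:

```python
from types import SimpleNamespace

defaults = {"maxCorners": 1000, "qualityLevel": 0.06, "blockSize": 3}
overrides = SimpleNamespace(qualityLevel=0.1)  # e.g. obj_feat_params parsed from mot.json

params = dict(defaults)
if overrides is not None:
    params.update(vars(overrides))  # SimpleNamespace -> dict
assert params == {"maxCorners": 1000, "qualityLevel": 0.1, "blockSize": 3}
# the merged dict is then splatted into cv2.goodFeaturesToTrack(..., **params)
```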
diff --git a/fastmot/kalman_filter.py b/fastmot/kalman_filter.py index 57ea1986..8bbd2459 100644 --- a/fastmot/kalman_filter.py +++ b/fastmot/kalman_filter.py @@ -11,39 +11,81 @@ class MeasType(Enum): class KalmanFilter: - """ - A simple Kalman filter for tracking bounding boxes in image space. - The 8-dimensional state space - x1, y1, x2, y2, v_x1, v_y1, v_x2, v_y2 - contains the bounding box top left corner, bottom right corner, - and their respective velocities. - Object motion follows a modified constant velocity model. - Velocity will decay over time without measurement and bounding box - corners are coupled together to minimize drifting. - Parameters - ---------- - config : Dict - Kalman Filter parameters. - """ - - def __init__(self, config): - self.std_factor_acc = config['std_factor_acc'] - self.std_offset_acc = config['std_offset_acc'] - self.std_factor_det = config['std_factor_det'] - self.std_factor_flow = config['std_factor_flow'] - self.min_std_det = config['min_std_det'] - self.min_std_flow = config['min_std_flow'] - self.init_pos_weight = config['init_pos_weight'] - self.init_vel_weight = config['init_vel_weight'] - self.vel_coupling = config['vel_coupling'] - self.vel_half_life = config['vel_half_life'] + def __init__(self, + std_factor_acc=2.25, + std_offset_acc=78.5, + std_factor_det=(0.08, 0.08), + std_factor_klt=(0.14, 0.14), + min_std_det=(4.0, 4.0), + min_std_klt=(5.0, 5.0), + init_pos_weight=5, + init_vel_weight=12, + vel_coupling=0.6, + vel_half_life=2): + """A simple Kalman filter for tracking bounding boxes in image space. + The 8-dimensional state space + x1, y1, x2, y2, v_x1, v_y1, v_x2, v_y2 + contains the bounding box top left corner, bottom right corner, + and their respective velocities. + Object motion follows a modified constant velocity model. + Velocity will decay over time without measurement and bounding box + corners are coupled together to minimize drifting. + + Parameters + ---------- + std_factor_acc : float, optional + Object size scale factor to calculate acceleration standard deviation + for process noise. + std_offset_acc : float, optional + Object size offset to calculate acceleration standard deviation + for process noise. Set larger for fast moving objects. + std_factor_det : tuple, optional + Object width and height scale factors to calculate detector measurement + noise standard deviation. + std_factor_klt : tuple, optional + Object width and height scale factors to calculate KLT measurement + noise standard deviation. + min_std_det : tuple, optional + Min detector measurement noise standard deviations. + min_std_klt : tuple, optional + Min KLT measurement noise standard deviations. + init_pos_weight : int, optional + Scale factor to initialize position state standard deviation. + init_vel_weight : int, optional + Scale factor to initialize velocity state standard deviation. + Set larger for fast moving objects. + vel_coupling : float, optional + Factor to couple bounding box corners. + Set 0.5 for max coupling and 1.0 to disable coupling. + vel_half_life : int, optional + Half life in seconds to decay velocity state. + """ + assert std_factor_acc >= 0 + self.std_factor_acc = std_factor_acc + self.std_offset_acc = std_offset_acc + assert std_factor_det[0] >= 0 and std_factor_det[1] >= 0 + self.std_factor_det = std_factor_det + assert std_factor_klt[0] >= 0 and std_factor_klt[1] >= 0 + self.std_factor_klt = std_factor_klt + assert min_std_det[0] >= 0 and min_std_det[1] >= 0 + self.min_std_det = min_std_det + assert min_std_klt[0] >= 0 and min_std_klt[1] >= 0 + self.min_std_klt = min_std_klt + assert init_pos_weight >= 0 + self.init_pos_weight = init_pos_weight + assert init_vel_weight >= 0 + self.init_vel_weight = init_vel_weight + assert 0 <= vel_coupling <= 1 + self.vel_coupling = vel_coupling + assert vel_half_life > 0 + self.vel_half_life = vel_half_life dt = 1 / 30. self.acc_cov, self.meas_mat, self.trans_mat = self._init_mat(dt) def reset_dt(self, dt): - """ - Resets process noise, measurement and transition matrices from dt. + """Resets process noise, measurement and transition matrices from dt.
+ Parameters ---------- dt : float @@ -52,15 +94,16 @@ self.acc_cov, self.meas_mat, self.trans_mat = self._init_mat(dt) def create(self, det_meas): - """ - Creates Kalman filter state from unassociated measurement. + """Creates Kalman filter state from unassociated measurement. + Parameters ---------- det_meas : ndarray Detected bounding box of [x1, x2, y1, y2]. + Returns ------- - (ndarray, ndarray) + ndarray, ndarray Returns the mean vector (8 dimensional) and covariance matrix (8x8 dimensional) of the new track. """ @@ -83,8 +126,8 @@ return mean, covariance def predict(self, mean, covariance): - """ - Runs Kalman filter prediction step. + """Runs Kalman filter prediction step. + Parameters ---------- mean : ndarray The 8 dimensional mean vector of the object state at the previous time step. covariance : ndarray The 8x8 dimensional covariance matrix of the object state at the previous time step. + Returns ------- - (ndarray, ndarray) + ndarray, ndarray Returns the mean vector and covariance matrix of the predicted state. """ @@ -103,8 +147,8 @@ self.std_factor_acc, self.std_offset_acc) def project(self, mean, covariance, meas_type, multiplier=1.): - """ - Projects state distribution to measurement space. + """Projects state distribution to measurement space. + Parameters ---------- mean : ndarray The state's mean vector (8 dimensional array). covariance : ndarray The state's covariance matrix (8x8 dimensional). meas_type : MeasType Measurement type indicating where the measurement comes from. multiplier : float Multiplier used to adjust the measurement std. + Returns ------- - (ndarray, ndarray) + ndarray, ndarray Returns the projected mean and covariance matrix of the given state estimate. """ if meas_type == MeasType.FLOW: - std_factor = self.std_factor_flow - min_std = self.min_std_flow + std_factor = self.std_factor_klt + min_std = self.min_std_klt elif meas_type == MeasType.DETECTOR: std_factor = self.std_factor_det min_std = self.min_std_det @@ -133,8 +178,8 @@ return self._project(mean, covariance, self.meas_mat, std_factor, min_std, multiplier) def update(self, mean, covariance, measurement, meas_type, multiplier=1.): - """ - Runs Kalman filter correction step. + """Runs Kalman filter correction step. + Parameters ---------- mean : ndarray @@ -147,9 +192,10 @@ Measurement type indicating where the measurement comes from. multiplier : float Multiplier used to adjust the measurement std. + Returns ------- - (ndarray, ndarray) + ndarray, ndarray Returns the measurement-corrected state distribution. """ projected_mean, projected_cov = self.project(mean, covariance, meas_type, multiplier) @@ -158,8 +204,8 @@ projected_cov, measurement, self.meas_mat) def motion_distance(self, mean, covariance, measurements): - """ - Computes mahalanobis distance between `measurements` and state distribution. + """Computes mahalanobis distance between `measurements` and state distribution. + Parameters ---------- mean : ndarray The state's mean vector (8 dimensional). covariance : ndarray The state's covariance matrix (8x8 dimensional). measurements : array_like An Nx4 matrix of N samples of [x1, x2, y1, y2]. + Returns ------- ndarray @@ -180,9 +227,10 @@ @staticmethod @nb.njit(fastmath=True, cache=True) def warp(mean, covariance, H): - """ - Warps kalman filter state using a homography transformation. + """Warps kalman filter state using a homography transformation. https://scholarsarchive.byu.edu/cgi/viewcontent.cgi?article=1301&context=studentpub + + Parameters ---------- mean : ndarray The predicted state's mean vector (8 dimensional). covariance : ndarray The state's covariance matrix (8x8 dimensional). H : ndarray A 3x3 homography matrix. + Returns ------- - (ndarray, ndarray) + ndarray, ndarray Returns the mean vector and covariance matrix of the transformed state. """
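For context on the modified constant velocity model documented above: each corner moves mostly with its own velocity, partly with its opposite corner's (`vel_coupling`), and velocities decay with half life `vel_half_life`. An illustrative reconstruction of the transition matrix those parameters imply (a sketch derived from the docstring, not necessarily the repo's exact `_init_mat`):

```python
import numpy as np

def transition_mat(dt, vel_coupling=0.6, vel_half_life=2):
    # state: [x1, y1, x2, y2, vx1, vy1, vx2, vy2]
    mat = np.eye(8)
    for i in range(4):
        mat[i, i + 4] = vel_coupling * dt                   # own velocity
        mat[i, (i + 2) % 4 + 4] = (1. - vel_coupling) * dt  # opposite corner's velocity
        mat[i + 4, i + 4] = 0.5**(dt / vel_half_life)       # exponential velocity decay
    return mat
```

With `vel_coupling=1.0` the corners move independently; `0.5` ties each corner equally to both velocities, which is what keeps the box from drifting apart.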
diff --git a/fastmot/models/__init__.py b/fastmot/models/__init__.py index e444b68e..98a6161f 100644 --- a/fastmot/models/__init__.py +++ b/fastmot/models/__init__.py @@ -1,4 +1,4 @@ -from .ssd import * -from .yolo import * -from .reid import * -from .label import * \ No newline at end of file +from .ssd import SSD +from .yolo import YOLO +from .reid import ReID +from .label import LABEL_MAP \ No newline at end of file diff --git a/fastmot/models/reid.py b/fastmot/models/reid.py index 5bf25d3e..342cce61 100644 --- a/fastmot/models/reid.py +++ b/fastmot/models/reid.py @@ -8,10 +8,20 @@ class ReID: + __registry = {} + PLUGIN_PATH = None ENGINE_PATH = None MODEL_PATH = None - INPUT_SHAPE = () + INPUT_SHAPE = None + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + cls.__registry[cls.__name__] = cls + + @classmethod + def get_model(cls, name): + return cls.__registry[name] @classmethod def build_engine(cls, trt_logger, batch_size): @@ -53,3 +63,12 @@ class OSNet025(ReID): INPUT_SHAPE = (3, 256, 128) OUTPUT_LAYOUT = 512 METRIC = 'euclidean' + + +class OSNet10(ReID): + """Multi-source model trained on MSMT17, DukeMTMC, and CUHK03, not provided.""" + ENGINE_PATH = Path(__file__).parent / 'osnet_x1_0_msdc.trt' + MODEL_PATH = Path(__file__).parent / 'osnet_x1_0_msdc.onnx' + INPUT_SHAPE = (3, 256, 128) + OUTPUT_LAYOUT = 512 + METRIC = 'cosine' diff --git a/fastmot/models/ssd.py b/fastmot/models/ssd.py index ca09188a..a53454bd 100644 --- a/fastmot/models/ssd.py +++ b/fastmot/models/ssd.py @@ -7,12 +7,22 @@ class SSD: + __registry = {} + PLUGIN_PATH = None ENGINE_PATH = None MODEL_PATH = None - INPUT_SHAPE = () + INPUT_SHAPE = None OUTPUT_NAME = None + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + cls.__registry[cls.__name__] = cls + + @classmethod + def get_model(cls, name): + return cls.__registry[name] + @classmethod def add_plugin(cls, graph): raise NotImplementedError
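The `__init_subclass__` registry above (the same change is applied to `YOLO` below) replaces the old `getattr(models, config['model'])` lookup, so a model name from the config resolves through an explicit table instead of arbitrary module attributes. The idiom in isolation:

```python
class ReIDBase:
    _registry = {}

    def __init_subclass__(cls, **kwargs):
        # runs once per subclass definition, at import time
        super().__init_subclass__(**kwargs)
        ReIDBase._registry[cls.__name__] = cls

    @classmethod
    def get_model(cls, name):
        return cls._registry[name]


class OSNet025(ReIDBase):
    pass


assert ReIDBase.get_model('OSNet025') is OSNet025
```

A typo in the config now fails with a `KeyError` naming the bad model string rather than an `AttributeError` on the module.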
diff --git a/fastmot/models/yolo.py b/fastmot/models/yolo.py index 14f589d9..a820badb 100644 --- a/fastmot/models/yolo.py +++ b/fastmot/models/yolo.py @@ -9,16 +9,26 @@ class YOLO: + __registry = {} + PLUGIN_PATH = Path(__file__).parents[1] / 'plugins' / 'libyolo_layer.so' ENGINE_PATH = None MODEL_PATH = None NUM_CLASSES = None LETTERBOX = False NEW_COORDS = False - INPUT_SHAPE = () - LAYER_FACTORS = [] - SCALES = [] - ANCHORS = [] + INPUT_SHAPE = None + LAYER_FACTORS = None + SCALES = None + ANCHORS = None + + def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + cls.__registry[cls.__name__] = cls + + @classmethod + def get_model(cls, name): + return cls.__registry[name] @classmethod def add_plugin(cls, network): @@ -29,6 +39,11 @@ def get_plugin_creator(plugin_name): return plugin_creator return None + assert len(cls.LAYER_FACTORS) == network.num_outputs + assert len(cls.SCALES) == network.num_outputs + assert len(cls.ANCHORS) == network.num_outputs + assert all(s >= 1.0 for s in cls.SCALES) + plugin_creator = get_plugin_creator('YoloLayer_TRT') if not plugin_creator: raise RuntimeError('Failed to get YoloLayer_TRT plugin creator') diff --git a/fastmot/mot.py b/fastmot/mot.py index 9e31eb3e..c8cfb2e4 100644 --- a/fastmot/mot.py +++ b/fastmot/mot.py @@ -1,3 +1,4 @@ +from types import SimpleNamespace from enum import Enum import logging import cv2 @@ -6,8 +7,7 @@ from .feature_extractor import FeatureExtractor from .tracker import MultiTracker from .utils import Profiler -from .utils.visualization import draw_tracks, draw_detections -from .utils.visualization import draw_flow_bboxes, draw_background_flow +from .utils.visualization import Visualizer LOGGER = logging.getLogger(__name__) @@ -20,64 +20,102 @@ class DetectorType(Enum): class MOT: - """ - This is the top level module that integrates detection, feature extraction, - and tracking together. - Parameters - ---------- - size : (int, int) - Width and height of each frame. - cap_dt : float - Time interval in seconds between each captured frame. - config : Dict - Tracker configuration. - draw : bool - Flag to toggle visualization drawing. - verbose : bool - Flag to toggle output verbosity. - """ - - def __init__(self, size, config, draw=False, verbose=False): + def __init__(self, size, + detector_type='YOLO', + detector_frame_skip=5, + ssd_detector_cfg=None, + yolo_detector_cfg=None, + public_detector_cfg=None, + feature_extractor_cfg=None, + tracker_cfg=None, + visualizer_cfg=None, + draw=False): + """Top level module that integrates detection, feature extraction, + and tracking together. + + Parameters + ---------- + size : tuple + Width and height of each frame. + detector_type : {'SSD', 'YOLO', 'public'}, optional + Type of detector to use. + detector_frame_skip : int, optional + Number of frames to skip for the detector. + ssd_detector_cfg : SimpleNamespace, optional + SSD detector configuration. + yolo_detector_cfg : SimpleNamespace, optional + YOLO detector configuration. + public_detector_cfg : SimpleNamespace, optional + Public detector configuration. + feature_extractor_cfg : SimpleNamespace, optional + Feature extractor configuration. + tracker_cfg : SimpleNamespace, optional + Tracker configuration. + visualizer_cfg : SimpleNamespace, optional + Visualization configuration. + draw : bool, optional + Enable visualization.
+ """ self.size = size + self.detector_type = DetectorType[detector_type.upper()] + assert detector_frame_skip >= 1 + self.detector_frame_skip = detector_frame_skip self.draw = draw - self.verbose = verbose - self.detector_type = DetectorType[config['detector_type']] - self.detector_frame_skip = config['detector_frame_skip'] + + if ssd_detector_cfg is None: + ssd_detector_cfg = SimpleNamespace() + if yolo_detector_cfg is None: + yolo_detector_cfg = SimpleNamespace() + if public_detector_cfg is None: + public_detector_cfg = SimpleNamespace() + if feature_extractor_cfg is None: + feature_extractor_cfg = SimpleNamespace() + if tracker_cfg is None: + tracker_cfg = SimpleNamespace() + if visualizer_cfg is None: + visualizer_cfg = SimpleNamespace() LOGGER.info('Loading detector model...') if self.detector_type == DetectorType.SSD: - self.detector = SSDDetector(self.size, config['ssd_detector']) + self.detector = SSDDetector(self.size, **vars(ssd_detector_cfg)) elif self.detector_type == DetectorType.YOLO: - self.detector = YOLODetector(self.size, config['yolo_detector']) + self.detector = YOLODetector(self.size, **vars(yolo_detector_cfg)) elif self.detector_type == DetectorType.PUBLIC: self.detector = PublicDetector(self.size, self.detector_frame_skip, - config['public_detector']) + **vars(public_detector_cfg)) LOGGER.info('Loading feature extractor model...') - self.extractor = FeatureExtractor(config['feature_extractor']) - self.tracker = MultiTracker(self.size, self.extractor.metric, config['multi_tracker']) + self.extractor = FeatureExtractor(**vars(feature_extractor_cfg)) + self.tracker = MultiTracker(self.size, self.extractor.metric, **vars(tracker_cfg)) + + self.visualizer = Visualizer(**vars(visualizer_cfg)) self.frame_count = 0 - @property def visible_tracks(self): - # retrieve confirmed and active tracks from the tracker - return [track for track in self.tracker.tracks.values() - if track.confirmed and track.active] + """Retrieve visible tracks from the tracker - def reset(self, cap_dt): + Returns + ------- + Iterator[Track] + Confirmed and active tracks from the tracker """ - Resets multiple object tracker. Must be called before `step`. + return (track for track in self.tracker.tracks.values() + if track.confirmed and track.active) + + def reset(self, cap_dt): + """Resets multiple object tracker. Must be called before `step`. + Parameters ---------- cap_dt : float Time interval in seconds between each frame. """ self.frame_count = 0 - self.tracker.reset_dt(cap_dt) + self.tracker.reset(cap_dt) def step(self, frame): - """ - Runs multiple object tracker on the next frame. + """Runs multiple object tracker on the next frame. 
+ Parameters ---------- frame : ndarray @@ -98,7 +136,7 @@ detections = self.detector.postprocess() with Profiler('extract'): - self.extractor.extract_async(frame, detections) + self.extractor.extract_async(frame, detections.tlbr) with Profiler('track', aggregate=True): self.tracker.apply_kalman() embeddings = self.extractor.postprocess() @@ -124,10 +162,8 @@ def print_timing_info(): LOGGER.debug(f"{'association time:':<37}{Profiler.get_avg_millis('assoc'):>6.3f} ms") def _draw(self, frame, detections): - draw_tracks(frame, self.visible_tracks, show_flow=self.verbose) - if self.verbose: - draw_detections(frame, detections) - draw_flow_bboxes(frame, self.tracker) - draw_background_flow(frame, self.tracker) - cv2.putText(frame, f'visible: {len(self.visible_tracks)}', (30, 30), + visible_tracks = list(self.visible_tracks()) + self.visualizer.render(frame, visible_tracks, detections, self.tracker.klt_bboxes.values(), + self.tracker.flow.prev_bg_keypoints, self.tracker.flow.bg_keypoints) + cv2.putText(frame, f'visible: {len(visible_tracks)}', (30, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, 0, 2, cv2.LINE_AA) diff --git a/fastmot/plugins/yolo_layer.cu b/fastmot/plugins/yolo_layer.cu index 8590ffa0..9f56f256 100644 --- a/fastmot/plugins/yolo_layer.cu +++ b/fastmot/plugins/yolo_layer.cu @@ -60,25 +60,19 @@ namespace nvinfer1 assert(d == a + length); } - void YoloLayerPlugin::serialize(void* buffer) const + IPluginV2IOExt* YoloLayerPlugin::clone() const NOEXCEPT { - char* d = static_cast<char*>(buffer), *a = d; - write(d, mThreadCount); - write(d, mYoloWidth); - write(d, mYoloHeight); - write(d, mNumAnchors); - memcpy(d, mAnchorsHost, MAX_ANCHORS * 2 * sizeof(float)); - d += MAX_ANCHORS * 2 * sizeof(float); - write(d, mNumClasses); - write(d, mInputWidth); - write(d, mInputHeight); - write(d, mScaleXY); - write(d, mNewCoords); + YoloLayerPlugin *p = new YoloLayerPlugin(mYoloWidth, mYoloHeight, mNumAnchors, (float*) mAnchorsHost, mNumClasses, mInputWidth, mInputHeight, mScaleXY, mNewCoords); + p->setPluginNamespace(mPluginNamespace); + return p; + } - assert(d == a + getSerializationSize()); + void YoloLayerPlugin::terminate() NOEXCEPT + { + CHECK(cudaFree(mAnchors)); } - size_t YoloLayerPlugin::getSerializationSize() const + size_t YoloLayerPlugin::getSerializationSize() const NOEXCEPT { return sizeof(mThreadCount) + \ sizeof(mYoloWidth) + sizeof(mYoloHeight) + \ @@ -88,17 +82,25 @@ namespace nvinfer1 sizeof(mScaleXY) + sizeof(mNewCoords); } - int YoloLayerPlugin::initialize() + void YoloLayerPlugin::serialize(void* buffer) const NOEXCEPT { - return 0; - } + char* d = static_cast<char*>(buffer), *a = d; + write(d, mThreadCount); + write(d, mYoloWidth); + write(d, mYoloHeight); + write(d, mNumAnchors); + memcpy(d, mAnchorsHost, MAX_ANCHORS * 2 * sizeof(float)); + d += MAX_ANCHORS * 2 * sizeof(float); + write(d, mNumClasses); + write(d, mInputWidth); + write(d, mInputHeight); + write(d, mScaleXY); + write(d, mNewCoords); - void YoloLayerPlugin::terminate() - { - CHECK(cudaFree(mAnchors)); + assert(d == a + getSerializationSize()); } - Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) + Dims YoloLayerPlugin::getOutputDimensions(int index, const Dims* inputs, int nbInputDims) NOEXCEPT { assert(index == 0); assert(nbInputDims == 1); @@ -110,71 +112,6 @@ return Dims3(totalsize, 1, 1); } - void YoloLayerPlugin::setPluginNamespace(const char* pluginNamespace) - { - mPluginNamespace = pluginNamespace; - } - - const char*
YoloLayerPlugin::getPluginNamespace() const - { - return mPluginNamespace; - } - - // Return the DataType of the plugin output at the requested index - DataType YoloLayerPlugin::getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const - { - return DataType::kFLOAT; - } - - // Return true if output tensor is broadcast across a batch. - bool YoloLayerPlugin::isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const - { - return false; - } - - // Return true if plugin can use input that is broadcast across batch without replication. - bool YoloLayerPlugin::canBroadcastInputAcrossBatch(int inputIndex) const - { - return false; - } - - void YoloLayerPlugin::configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) - { - } - - // Attach the plugin object to an execution context and grant the plugin the access to some context resource. - void YoloLayerPlugin::attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) - { - } - - // Detach the plugin object from its execution context. - void YoloLayerPlugin::detachFromContext() - { - } - - const char* YoloLayerPlugin::getPluginType() const - { - return "YoloLayer_TRT"; - } - - const char* YoloLayerPlugin::getPluginVersion() const - { - return "1"; - } - - void YoloLayerPlugin::destroy() - { - delete this; - } - - // Clone the plugin - IPluginV2IOExt* YoloLayerPlugin::clone() const - { - YoloLayerPlugin *p = new YoloLayerPlugin(mYoloWidth, mYoloHeight, mNumAnchors, (float*) mAnchorsHost, mNumClasses, mInputWidth, mInputHeight, mScaleXY, mNewCoords); - p->setPluginNamespace(mPluginNamespace); - return p; - } - inline __device__ float sigmoidGPU(float x) { return 1.0f / (1.0f + __expf(-x)); } inline __device__ float scale_sigmoidGPU(float x, float s) @@ -307,7 +244,11 @@ } } - int YoloLayerPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) +#if NV_TENSORRT_MAJOR >= 8 + int32_t YoloLayerPlugin::enqueue(int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) NOEXCEPT +#else + int YoloLayerPlugin::enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) +#endif { forwardGpu((const float* const*)inputs, (float*)outputs[0], stream, batchSize); return 0; } @@ -321,22 +262,22 @@ mFC.fields = mPluginAttributes.data(); } - const char* YoloPluginCreator::getPluginName() const + const char* YoloPluginCreator::getPluginName() const NOEXCEPT { return "YoloLayer_TRT"; } - const char* YoloPluginCreator::getPluginVersion() const + const char* YoloPluginCreator::getPluginVersion() const NOEXCEPT { return "1"; } - const PluginFieldCollection* YoloPluginCreator::getFieldNames() + const PluginFieldCollection* YoloPluginCreator::getFieldNames() NOEXCEPT { return &mFC; } - IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) + IPluginV2IOExt* YoloPluginCreator::createPlugin(const char* name, const PluginFieldCollection* fc) NOEXCEPT { assert(!strcmp(name, getPluginName())); const PluginField* fields = fc->fields; @@ -397,7 +338,9 @@ assert(yolo_width > 0 && yolo_height > 0); assert(anchors[0] > 0.0f && anchors[1] > 0.0f); assert(num_classes > 0); - assert(input_multiplier == 8 || input_multiplier == 16 || input_multiplier == 32 || input_multiplier == 64 || input_multiplier
== 128); + assert(input_multiplier == 128 || input_multiplier == 64 || + input_multiplier == 32 || input_multiplier == 16 || + input_multiplier == 8); assert(scale_x_y >= 1.0); YoloLayerPlugin* obj = new YoloLayerPlugin(yolo_width, yolo_height, num_anchors, anchors, num_classes, yolo_width * input_multiplier, yolo_height * input_multiplier, scale_x_y, new_coords); @@ -405,7 +348,7 @@ return obj; } - IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) + IPluginV2IOExt* YoloPluginCreator::deserializePlugin(const char* name, const void* serialData, size_t serialLength) NOEXCEPT { YoloLayerPlugin* obj = new YoloLayerPlugin(serialData, serialLength); obj->setPluginNamespace(mNamespace.c_str()); @@ -414,5 +357,4 @@ PluginFieldCollection YoloPluginCreator::mFC{}; std::vector<PluginField> YoloPluginCreator::mPluginAttributes; - REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); } // namespace nvinfer1 diff --git a/fastmot/plugins/yolo_layer.h b/fastmot/plugins/yolo_layer.h index 127bef43..4264cb28 100644 --- a/fastmot/plugins/yolo_layer.h +++ b/fastmot/plugins/yolo_layer.h @@ -10,6 +10,12 @@ #define MAX_ANCHORS 6 +#if NV_TENSORRT_MAJOR >= 8 +#define NOEXCEPT noexcept +#else +#define NOEXCEPT +#endif + #define CHECK(status) \ do { \ auto ret = status; \ @@ -43,52 +49,52 @@ namespace nvinfer1 ~YoloLayerPlugin() override = default; - int getNbOutputs() const override - { - return 1; - } + IPluginV2IOExt* clone() const NOEXCEPT override; - Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override; + int initialize() NOEXCEPT override { return 0; } - int initialize() override; + void terminate() NOEXCEPT override; - void terminate() override; + void destroy() NOEXCEPT override { delete this; } - virtual size_t getWorkspaceSize(int maxBatchSize) const override { return 0;} + size_t getSerializationSize() const NOEXCEPT override; - virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override; + void serialize(void* buffer) const NOEXCEPT override; - virtual size_t getSerializationSize() const override; + int getNbOutputs() const NOEXCEPT override { return 1; } - virtual void serialize(void* buffer) const override; + Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) NOEXCEPT override; - bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const override { - return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; - } + size_t getWorkspaceSize(int maxBatchSize) const NOEXCEPT override { return 0; } - const char* getPluginType() const override; + bool supportsFormatCombination(int pos, const PluginTensorDesc* inOut, int nbInputs, int nbOutputs) const NOEXCEPT override { return inOut[pos].format == TensorFormat::kLINEAR && inOut[pos].type == DataType::kFLOAT; } - const char* getPluginVersion() const override; + const char* getPluginType() const NOEXCEPT override { return "YoloLayer_TRT"; } - void destroy() override; + const char* getPluginVersion() const NOEXCEPT override { return "1"; } - IPluginV2IOExt* clone() const override; + void setPluginNamespace(const char* pluginNamespace) NOEXCEPT override { mPluginNamespace = pluginNamespace; } - void setPluginNamespace(const char* pluginNamespace) override; + const char* getPluginNamespace() const NOEXCEPT override { return mPluginNamespace; } - const char* getPluginNamespace() const override;
+ DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const NOEXCEPT override { return DataType::kFLOAT; } - DataType getOutputDataType(int index, const DataType* inputTypes, int nbInputs) const override; + bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const NOEXCEPT override { return false; } - bool isOutputBroadcastAcrossBatch(int outputIndex, const bool* inputIsBroadcasted, int nbInputs) const override; + bool canBroadcastInputAcrossBatch(int inputIndex) const NOEXCEPT override { return false; } - bool canBroadcastInputAcrossBatch(int inputIndex) const override; + void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) NOEXCEPT override {} - void attachToContext(cudnnContext* cudnnContext, cublasContext* cublasContext, IGpuAllocator* gpuAllocator) override; + //using IPluginV2IOExt::configurePlugin; + void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) NOEXCEPT override {} - void configurePlugin(const PluginTensorDesc* in, int nbInput, const PluginTensorDesc* out, int nbOutput) override TRTNOEXCEPT; + void detachFromContext() NOEXCEPT override {} - void detachFromContext() override; +#if NV_TENSORRT_MAJOR >= 8 + int32_t enqueue(int32_t batchSize, void const* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) NOEXCEPT override; +#else + int enqueue(int batchSize, const void* const * inputs, void** outputs, void* workspace, cudaStream_t stream) NOEXCEPT override; +#endif private: void forwardGpu(const float* const* inputs, float* output, cudaStream_t stream, int batchSize = 1); @@ -103,9 +109,6 @@ int mNewCoords = 0; const char* mPluginNamespace; - - protected: - using IPluginV2IOExt::configurePlugin; }; class YoloPluginCreator : public IPluginCreator { @@ -115,31 +118,33 @@ ~YoloPluginCreator() override = default; - const char* getPluginName() const override; + const char* getPluginName() const NOEXCEPT override; - const char* getPluginVersion() const override; + const char* getPluginVersion() const NOEXCEPT override; - const PluginFieldCollection* getFieldNames() override; + const PluginFieldCollection* getFieldNames() NOEXCEPT override; - IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) override; + IPluginV2IOExt* createPlugin(const char* name, const PluginFieldCollection* fc) NOEXCEPT override; - IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override; + IPluginV2IOExt* deserializePlugin(const char* name, const void* serialData, size_t serialLength) NOEXCEPT override; - void setPluginNamespace(const char* libNamespace) override + void setPluginNamespace(const char* libNamespace) NOEXCEPT override { mNamespace = libNamespace; } - const char* getPluginNamespace() const override + const char* getPluginNamespace() const NOEXCEPT override { return mNamespace.c_str(); } private: static PluginFieldCollection mFC; - static std::vector<PluginField> mPluginAttributes; + static std::vector<PluginField> mPluginAttributes; std::string mNamespace; }; + + REGISTER_TENSORRT_PLUGIN(YoloPluginCreator); }; #endif
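With `REGISTER_TENSORRT_PLUGIN` moved into the header, the plugin registers itself the moment the shared library is loaded, so the Python side only needs to load `PLUGIN_PATH` before deserializing the engine. A sketch of the usual loading pattern (paths and logger setup here are illustrative, not the repo's exact inference code):

```python
import ctypes
import tensorrt as trt

# loading the library triggers REGISTER_TENSORRT_PLUGIN, making
# YoloLayer_TRT available to the runtime below
ctypes.cdll.LoadLibrary('fastmot/plugins/libyolo_layer.so')

trt_logger = trt.Logger(trt.Logger.INFO)
with open('yolov4.trt', 'rb') as f, trt.Runtime(trt_logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
```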
diff --git a/fastmot/track.py b/fastmot/track.py
index 9d518e64..25a11e63 100644
--- a/fastmot/track.py
+++ b/fastmot/track.py
@@ -1,68 +1,225 @@
+from collections import deque
 import numpy as np
+import numba as nb

 from .models import LABEL_MAP
+from .utils.distance import cdist, cosine
+from .utils.numba import apply_along_axis, normalize_vec
 from .utils.rect import get_center


+class ClusterFeature:
+    def __init__(self, num_clusters, metric):
+        self.num_clusters = num_clusters
+        self.metric = metric
+        self.clusters = None
+        self.cluster_sizes = None
+        self._next_idx = 0
+
+    def __len__(self):
+        return self._next_idx
+
+    def __call__(self):
+        return self.clusters[:self._next_idx]
+
+    def update(self, embedding):
+        if self._next_idx < self.num_clusters:
+            if self.clusters is None:
+                self.clusters = np.empty((self.num_clusters, len(embedding)), embedding.dtype)
+                self.cluster_sizes = np.zeros(self.num_clusters, int)
+            self.clusters[self._next_idx] = embedding
+            self.cluster_sizes[self._next_idx] += 1
+            self._next_idx += 1
+        else:
+            nearest_idx = self._get_nearest_cluster(self.clusters, embedding)
+            self.cluster_sizes[nearest_idx] += 1
+            self._seq_kmeans(self.clusters, self.cluster_sizes, embedding, nearest_idx)
+
+    def distance(self, embeddings):
+        if self.clusters is None:
+            return np.ones(len(embeddings))
+        clusters = normalize_vec(self.clusters[:self._next_idx])
+        return apply_along_axis(np.min, cdist(clusters, embeddings, self.metric), axis=0)
+
+    def merge(self, features, other, other_features):
+        if len(features) > len(other_features):
+            for feature in other_features:
+                if feature is not None:
+                    self.update(feature)
+        else:
+            for feature in features:
+                if feature is not None:
+                    other.update(feature)
+            self.clusters = other.clusters.copy()
+            self.cluster_sizes = other.cluster_sizes.copy()
+            self._next_idx = other._next_idx
+
+    @staticmethod
+    @nb.njit(fastmath=True, cache=True)
+    def _get_nearest_cluster(clusters, embedding):
+        return np.argmin(cosine(np.atleast_2d(embedding), clusters))
+
+    @staticmethod
+    @nb.njit(fastmath=True, cache=True)
+    def _seq_kmeans(clusters, cluster_sizes, embedding, idx):
+        div_size = 1. / cluster_sizes[idx]
+        clusters[idx] += (embedding - clusters[idx]) * div_size
+
+
+class SmoothFeature:
+    def __init__(self, learning_rate):
+        self.lr = learning_rate
+        self.smooth = None
+
+    def __call__(self):
+        return self.smooth
+
+    def update(self, embedding):
+        if self.smooth is None:
+            self.smooth = embedding.copy()
+        else:
+            self._rolling(self.smooth, embedding, self.lr)
+
+    @staticmethod
+    @nb.njit(fastmath=True, cache=True)
+    def _rolling(smooth, embedding, lr):
+        smooth[:] = (1. - lr) * smooth + lr * embedding
+        norm_factor = 1. / np.linalg.norm(smooth)
+        smooth *= norm_factor
+
+
+class AverageFeature:
+    def __init__(self):
+        self.sum = None
+        self.avg = None
+        self.count = 0
+
+    def __call__(self):
+        return self.avg
+
+    def is_valid(self):
+        return self.count > 0
+
+    def update(self, embedding):
+        self.count += 1
+        if self.sum is None:
+            self.sum = embedding.copy()
+            self.avg = embedding.copy()
+        else:
+            self._average(self.sum, self.avg, embedding, self.count)
+
+    def merge(self, other):
+        self.count += other.count
+        if self.sum is None:
+            self.sum = other.sum
+            self.avg = other.avg
+        elif other.sum is not None:
+            self._average(self.sum, self.avg, other.sum, self.count)
+
+    @staticmethod
+    @nb.njit(fastmath=True, cache=True)
+    def _average(sum, avg, vec, count):
+        sum += vec
+        div_cnt = 1. / count
+        avg[:] = sum * div_cnt
+        norm_factor = 1. / np.linalg.norm(avg)
+        avg *= norm_factor
+
+
 class Track:
-    def __init__(self, frame_id, trk_id, tlbr, state, label):
+    _count = 0
+
+    def __init__(self, frame_id, tlbr, state, label, confirm_hits=1, buffer_size=30):
+        self.trk_id = self.next_id()
         self.start_frame = frame_id
-        self.trk_id = trk_id
-        self.tlbr = tlbr
+        self.frame_ids = deque([frame_id], maxlen=buffer_size)
+        self.bboxes = deque([tlbr], maxlen=buffer_size)
+        self.confirm_hits = confirm_hits
         self.state = state
         self.label = label

         self.age = 0
         self.hits = 0
-        self.alpha = 0.9
-        self.smooth_feature = None
+        self.avg_feat = AverageFeature()
+        self.last_feat = None

         self.inlier_ratio = 1.
         self.keypoints = np.empty((0, 2), np.float32)
         self.prev_keypoints = np.empty((0, 2), np.float32)

     def __str__(self):
-        coord = get_center(self.tlbr).astype(int)
-        return f'{LABEL_MAP[self.label]} {self.trk_id:>3} at ({coord[0]:>4}, {coord[1]:>3})'
+        x, y = get_center(self.tlbr)
+        return f'{LABEL_MAP[self.label]} {self.trk_id:>3} at ({int(x):>4}, {int(y):>3})'

     def __repr__(self):
         return self.__str__()

+    def __len__(self):
+        return self.end_frame - self.start_frame
+
     def __lt__(self, other):
         # ordered by approximate distance to the image plane, closer is greater
         return (self.tlbr[-1], -self.age) < (other.tlbr[-1], -other.age)

+    @property
+    def tlbr(self):
+        return self.bboxes[-1]
+
+    @property
+    def end_frame(self):
+        return self.frame_ids[-1]
+
     @property
     def active(self):
         return self.age < 2

     @property
     def confirmed(self):
-        return self.hits > 0
+        return self.hits >= self.confirm_hits

-    def update(self, tlbr, state, embedding=None):
-        self.tlbr = tlbr
+    def update(self, tlbr, state):
+        self.bboxes.append(tlbr)
         self.state = state
-        if embedding is not None:
-            self.age = 0
-            self.hits += 1
-            self.update_feature(embedding)

-    def reactivate(self, frame_id, tlbr, state, embedding):
+    def add_detection(self, frame_id, tlbr, state, embedding, is_valid=True):
+        self.frame_ids.append(frame_id)
+        self.bboxes.append(tlbr)
+        self.state = state
+        if is_valid:
+            self.last_feat = embedding
+            self.avg_feat.update(embedding)
+        self.age = 0
+        self.hits += 1
+
+    def reinstate(self, frame_id, tlbr, state, embedding):
         self.start_frame = frame_id
-        self.tlbr = tlbr
+        self.frame_ids.append(frame_id)
+        self.bboxes.append(tlbr)
         self.state = state
+        self.last_feat = embedding
+        self.avg_feat.update(embedding)
         self.age = 0
-        self.update_feature(embedding)
         self.keypoints = np.empty((0, 2), np.float32)
         self.prev_keypoints = np.empty((0, 2), np.float32)

     def mark_missed(self):
         self.age += 1

-    def update_feature(self, embedding):
-        if self.smooth_feature is None:
-            self.smooth_feature = embedding
-        else:
-            self.smooth_feature = self.alpha * self.smooth_feature + (1. - self.alpha) * embedding
-            self.smooth_feature /= np.linalg.norm(self.smooth_feature)
+    def merge_continuation(self, other):
+        self.frame_ids.extend(other.frame_ids)
+        self.bboxes.extend(other.bboxes)
+        self.state = other.state
+        self.age = other.age
+        self.hits += other.hits
+
+        self.keypoints = other.keypoints
+        self.prev_keypoints = other.prev_keypoints
+
+        if other.last_feat is not None:
+            self.last_feat = other.last_feat
+        self.avg_feat.merge(other.avg_feat)
+
+    @staticmethod
+    def next_id():
+        Track._count += 1
+        return Track._count
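AverageFeature replaces the old exponential smoothing with a running sum and a unit-normalized mean of the embeddings a track has collected. A standalone sketch of the same arithmetic (plain NumPy, without the Numba decoration; names are illustrative):

    import numpy as np

    def running_avg(sum_vec, count, embedding):
        # incremental mean followed by L2 renormalization, as in AverageFeature._average
        sum_vec = sum_vec + embedding
        avg = sum_vec / count
        return sum_vec, avg / np.linalg.norm(avg)

    sum_vec = np.zeros(4)
    for count, e in enumerate([np.array([1., 0., 0., 0.]),
                               np.array([0.6, 0.8, 0., 0.])], start=1):
        sum_vec, avg = running_avg(sum_vec, count, e)
    print(avg)  # unit-norm average of the two embeddings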
diff --git a/fastmot/tracker.py b/fastmot/tracker.py
index 3931b384..59021eb7 100644
--- a/fastmot/tracker.py
+++ b/fastmot/tracker.py
@@ -1,77 +1,127 @@
+from types import SimpleNamespace
 from collections import OrderedDict
 import itertools
 import logging
 import numpy as np
-import numba as nb
-from scipy.optimize import linear_sum_assignment
-from scipy.spatial.distance import cdist
-from cython_bbox import bbox_overlaps

 from .track import Track
 from .flow import Flow
 from .kalman_filter import MeasType, KalmanFilter
-from .utils.rect import as_rect, to_tlbr, iom
+from .utils.distance import Metric, cdist, iou_dist
+from .utils.matching import linear_assignment, greedy_match, fuse_motion, gate_cost
+from .utils.rect import as_tlbr, to_tlbr, ios, bbox_ious, find_occluded
+from .utils import Profiler


 LOGGER = logging.getLogger(__name__)
-CHI_SQ_INV_95 = 9.4877  # 0.95 quantile of chi-square distribution
-INF_COST = 1e5


 class MultiTracker:
-    """
-    Uses optical flow and Kalman filter to track multiple objects and
-    associates detections to tracklets based on motion and appearance.
-    Parameters
-    ----------
-    size : (int, int)
-        Width and height of each frame.
-    dt : float
-        Time interval in seconds between each frame.
-    metric : string
-        Feature distance metric to associate tracklets. Usually
-        `euclidean` or `cosine`.
-    config : Dict
-        Tracker parameters.
-    """
-
-    def __init__(self, size, metric, config):
+    def __init__(self, size, metric,
+                 max_age=6,
+                 age_penalty=2,
+                 motion_weight=0.2,
+                 max_assoc_cost=0.9,
+                 max_reid_cost=0.45,
+                 iou_thresh=0.4,
+                 duplicate_thresh=0.8,
+                 occlusion_thresh=0.7,
+                 conf_thresh=0.5,
+                 confirm_hits=1,
+                 history_size=50,
+                 kalman_filter_cfg=None,
+                 flow_cfg=None):
+        """Class that uses KLT and Kalman filter to track multiple objects and
+        associates detections to tracklets based on motion and appearance.
+
+        Parameters
+        ----------
+        size : tuple
+            Width and height of each frame.
+        metric : {'euclidean', 'cosine'}
+            Feature distance metric to associate tracks.
+        max_age : int, optional
+            Max number of undetected frames allowed before a track is terminated.
+            Note that skipped frames are not included.
+        age_penalty : int, optional
+            Scale factor to penalize KLT measurements for tracks with large age.
+        motion_weight : float, optional
+            Weight for motion term in matching cost function.
+        max_assoc_cost : float, optional
+            Max matching cost for valid primary association.
+        max_reid_cost : float, optional
+            Max ReID feature dissimilarity for valid reidentification.
+        iou_thresh : float, optional
+            IoU threshold for association with unconfirmed and unmatched active tracks.
+        duplicate_thresh : float, optional
+            Track overlap threshold for removing duplicate tracks.
+        occlusion_thresh : float, optional
+            Detection overlap threshold for nullifying the extracted embeddings for association/reID.
+        conf_thresh : float, optional
+            Detection confidence threshold for starting a new track.
+        confirm_hits : int, optional
+            Min number of detections to confirm a track.
+        history_size : int, optional
+            Max size of track history to keep for reID.
+        kalman_filter_cfg : SimpleNamespace, optional
+            Kalman Filter configuration.
+        flow_cfg : SimpleNamespace, optional
+            Flow configuration.
+        """
         self.size = size
-        self.metric = metric
-        self.max_age = config['max_age']
-        self.age_penalty = config['age_penalty']
-        self.age_weight = config['age_weight']
-        self.motion_weight = config['motion_weight']
-        self.max_feat_cost = config['max_feat_cost']
-        self.max_reid_cost = config['max_reid_cost']
-        self.iou_thresh = config['iou_thresh']
-        self.duplicate_iou = config['duplicate_iou']
-        self.conf_thresh = config['conf_thresh']
-        self.lost_buf_size = config['lost_buf_size']
-
-        self.next_id = 1
+        self.metric = Metric[metric.upper()]
+        assert max_age >= 1
+        self.max_age = max_age
+        assert age_penalty >= 1
+        self.age_penalty = age_penalty
+        assert 0 <= motion_weight <= 1
+        self.motion_weight = motion_weight
+        assert 0 <= max_assoc_cost <= 2
+        self.max_assoc_cost = max_assoc_cost
+        assert 0 <= max_reid_cost <= 2
+        self.max_reid_cost = max_reid_cost
+        assert 0 <= iou_thresh <= 1
+        self.iou_thresh = iou_thresh
+        assert 0 <= duplicate_thresh <= 1
+        self.duplicate_thresh = duplicate_thresh
+        assert 0 <= occlusion_thresh <= 1
+        self.occlusion_thresh = occlusion_thresh
+        assert 0 <= conf_thresh <= 1
+        self.conf_thresh = conf_thresh
+        assert confirm_hits >= 1
+        self.confirm_hits = confirm_hits
+        assert history_size >= 0
+        self.history_size = history_size
+
+        if kalman_filter_cfg is None:
+            kalman_filter_cfg = SimpleNamespace()
+        if flow_cfg is None:
+            flow_cfg = SimpleNamespace()
+
         self.tracks = {}
-        self.lost = OrderedDict()
-        self.kf = KalmanFilter(config['kalman_filter'])
-        self.flow = Flow(self.size, config['flow'])
+        self.hist_tracks = OrderedDict()
+        self.kf = KalmanFilter(**vars(kalman_filter_cfg))
+        self.flow = Flow(self.size, **vars(flow_cfg))
         self.frame_rect = to_tlbr((0, 0, *self.size))

-        self.flow_bboxes = {}
+        self.klt_bboxes = {}
         self.homography = None

-    def reset_dt(self, dt):
-        """
-        Set KalmanFilter dt parameter.
+    def reset(self, dt):
+        """Reset the tracker for new input context.
+
         Parameters
         ----------
         dt : float
             Time interval in seconds between each frame.
         """
         self.kf.reset_dt(dt)
+        self.hist_tracks.clear()
+        Track._count = 0

     def init(self, frame, detections):
-        """
-        Initializes the tracker from detections in the first frame.
+        """Initializes the tracker from detections in the first frame.
+
         Parameters
         ----------
         frame : ndarray
@@ -79,20 +129,17 @@ def init(self, frame, detections):
         detections : recarray[DET_DTYPE]
             Record array of N detections.
         """
-        self.next_id = 1
         self.tracks.clear()
         self.flow.init(frame)
         for det in detections:
             state = self.kf.create(det.tlbr)
-            new_trk = Track(0, self.next_id, det.tlbr, state, det.label)
-            self.tracks[self.next_id] = new_trk
+            new_trk = Track(0, det.tlbr, state, det.label, self.confirm_hits)
+            self.tracks[new_trk.trk_id] = new_trk
             LOGGER.debug(f"{'Detected:':<14}{new_trk}")
-            self.next_id += 1

     def track(self, frame):
-        """
-        Convenience function that combines `compute_flow` and `apply_kalman`.
+        """Convenience function that combines `compute_flow` and `apply_kalman`.
+
         Parameters
         ----------
         frame : ndarray
@@ -102,46 +149,43 @@
         self.apply_kalman()

     def compute_flow(self, frame):
-        """
-        Computes optical flow to estimate tracklet positions and camera motion.
+        """Computes optical flow to estimate tracklet positions and camera motion.
+
         Parameters
         ----------
         frame : ndarray
             The next frame.
         """
         active_tracks = [track for track in self.tracks.values() if track.active]
-        self.flow_bboxes, self.homography = self.flow.predict(frame, active_tracks)
+        self.klt_bboxes, self.homography = self.flow.predict(frame, active_tracks)
         if self.homography is None:
             # clear tracks when camera motion cannot be estimated
             self.tracks.clear()

     def apply_kalman(self):
-        """
-        Performs kalman filter prediction and update from flow measurements.
+        """Performs Kalman filter predict and update from KLT measurements.
         The function should be called after `compute_flow`.
         """
         for trk_id, track in list(self.tracks.items()):
             mean, cov = track.state
             mean, cov = self.kf.warp(mean, cov, self.homography)
             mean, cov = self.kf.predict(mean, cov)
-            if trk_id in self.flow_bboxes:
-                flow_tlbr = self.flow_bboxes[trk_id]
-                # give large flow uncertainty for occluded tracks
-                # usually these with high age and low inlier ratio
+            if trk_id in self.klt_bboxes:
+                klt_tlbr = self.klt_bboxes[trk_id]
+                # give large KLT uncertainty for occluded tracks,
+                # usually those with large age and a low inlier ratio
                 std_multiplier = max(self.age_penalty * track.age, 1) / track.inlier_ratio
-                mean, cov = self.kf.update(mean, cov, flow_tlbr, MeasType.FLOW, std_multiplier)
-            next_tlbr = as_rect(mean[:4])
+                mean, cov = self.kf.update(mean, cov, klt_tlbr, MeasType.FLOW, std_multiplier)
+            next_tlbr = as_tlbr(mean[:4])
             track.update(next_tlbr, (mean, cov))
-            if iom(next_tlbr, self.frame_rect) < 0.5:
+            if ios(next_tlbr, self.frame_rect) < 0.5:
                 if track.confirmed:
                     LOGGER.info(f"{'Out:':<14}{track}")
-                    self._mark_lost(trk_id)
-                else:
-                    del self.tracks[trk_id]
+                self._mark_lost(trk_id)

     def update(self, frame_id, detections, embeddings):
-        """
-        Associates detections to tracklets based on motion and feature embeddings.
+        """Associates detections to tracklets based on motion and feature embeddings.
+
         Parameters
         ----------
         frame_id : int
@@ -151,194 +195,229 @@
         embeddings : ndarray
             NxM matrix of N extracted embeddings with dimension M.
         """
-        det_ids = list(range(len(detections)))
-        confirmed = [trk_id for trk_id, track in self.tracks.items() if track.confirmed]
-        unconfirmed = [trk_id for trk_id, track in self.tracks.items() if not track.confirmed]
-
-        # association with motion and embeddings
-        cost = self._matching_cost(confirmed, detections, embeddings)
-        matches1, u_trk_ids1, u_det_ids = self._linear_assignment(cost, confirmed, det_ids)
+        occluded_det_mask = find_occluded(detections.tlbr, self.occlusion_thresh)
+        confirmed_by_depth, unconfirmed = self._group_tracks_by_depth()
+
+        # association with motion and embeddings, tracks with small age are prioritized
+        matches1 = []
+        u_trk_ids1 = []
+        u_det_ids = list(range(len(detections)))
+        for depth, trk_ids in enumerate(confirmed_by_depth):
+            if len(u_det_ids) == 0:
+                u_trk_ids1.extend(itertools.chain.from_iterable(confirmed_by_depth[depth:]))
+                break
+            if len(trk_ids) == 0:
+                continue
+            u_detections, u_embeddings = detections[u_det_ids], embeddings[u_det_ids]
+            u_occluded_dmask = occluded_det_mask[u_det_ids]
+            cost = self._matching_cost(trk_ids, u_detections, u_embeddings, u_occluded_dmask)
+            matches, u_trk_ids, u_det_ids = linear_assignment(cost, trk_ids, u_det_ids)
+            matches1 += matches
+            u_trk_ids1 += u_trk_ids

         # 2nd association with IoU
         active = [trk_id for trk_id in u_trk_ids1 if self.tracks[trk_id].active]
         u_trk_ids1 = [trk_id for trk_id in u_trk_ids1 if not self.tracks[trk_id].active]
         u_detections = detections[u_det_ids]
         cost = self._iou_cost(active, u_detections)
-        matches2, u_trk_ids2, u_det_ids = self._linear_assignment(cost, active, u_det_ids, True)
+        matches2, u_trk_ids2, u_det_ids = linear_assignment(cost, active, u_det_ids)

         # 3rd association with unconfirmed tracks
         u_detections = detections[u_det_ids]
         cost = self._iou_cost(unconfirmed, u_detections)
-        matches3, u_trk_ids3, u_det_ids = self._linear_assignment(cost, unconfirmed,
-                                                                  u_det_ids, True)
+        matches3, u_trk_ids3, u_det_ids = linear_assignment(cost, unconfirmed, u_det_ids)
+
+        # reID with track history
+        hist_ids = [trk_id for trk_id, track in self.hist_tracks.items()
+                    if track.avg_feat.count >= 2]

-        # re-id with lost tracks
-        lost_ids = list(self.lost.keys())
         u_det_ids = [det_id for det_id in u_det_ids if detections[det_id].conf >= self.conf_thresh]
-        u_detections, u_embeddings = detections[u_det_ids], embeddings[u_det_ids]
-        cost = self._reid_cost(u_detections, u_embeddings)
-        reid_matches, _, u_det_ids = self._linear_assignment(cost, lost_ids, u_det_ids)
+        valid_u_det_ids = [det_id for det_id in u_det_ids if not occluded_det_mask[det_id]]
+        invalid_u_det_ids = [det_id for det_id in u_det_ids if occluded_det_mask[det_id]]
+
+        u_detections, u_embeddings = detections[valid_u_det_ids], embeddings[valid_u_det_ids]
+        cost = self._reid_cost(hist_ids, u_detections, u_embeddings)
+
+        reid_matches, _, reid_u_det_ids = greedy_match(cost, hist_ids, valid_u_det_ids,
+                                                       self.max_reid_cost)

         matches = itertools.chain(matches1, matches2, matches3)
         u_trk_ids = itertools.chain(u_trk_ids1, u_trk_ids2, u_trk_ids3)
-        updated, aged = [], []
+
+        # rectify matches that may cause duplicate tracks
+        matches, u_trk_ids = self._rectify_matches(matches, u_trk_ids, detections)
+
+        # reinstate matched tracks
+        for trk_id, det_id in reid_matches:
+            track = self.hist_tracks.pop(trk_id)
+            det = detections[det_id]
+            LOGGER.info(f"{'Reidentified:':<14}{track}")
+            state = self.kf.create(det.tlbr)
+            track.reinstate(frame_id, det.tlbr, state, embeddings[det_id])
+            self.tracks[trk_id] = track

         # update matched tracks
         for trk_id, det_id in matches:
             track = self.tracks[trk_id]
             det = detections[det_id]
             mean, cov = self.kf.update(*track.state, det.tlbr, MeasType.DETECTOR)
-            next_tlbr = as_rect(mean[:4])
-            track.update(next_tlbr, (mean, cov), embeddings[det_id])
-            if track.hits == 1:
+            next_tlbr = as_tlbr(mean[:4])
+            is_valid = not occluded_det_mask[det_id]
+            if track.hits == self.confirm_hits - 1:
                 LOGGER.info(f"{'Found:':<14}{track}")
-            if iom(next_tlbr, self.frame_rect) < 0.5:
-                LOGGER.info(f"{'Out:':<14}{track}")
+            if ios(next_tlbr, self.frame_rect) < 0.5:
+                is_valid = False
+                if track.confirmed:
+                    LOGGER.info(f"{'Out:':<14}{track}")
                 self._mark_lost(trk_id)
-            else:
-                updated.append(trk_id)
-
-        # reactivate matched lost tracks
-        for trk_id, det_id in reid_matches:
-            track = self.lost[trk_id]
-            det = detections[det_id]
-            LOGGER.info(f"{'Reidentified:':<14}{track}")
-            state = self.kf.create(det.tlbr)
-            track.reactivate(frame_id, det.tlbr, state, embeddings[det_id])
-            self.tracks[trk_id] = track
-            del self.lost[trk_id]
-            updated.append(trk_id)
+            track.add_detection(frame_id, next_tlbr, (mean, cov), embeddings[det_id], is_valid)

         # clean up lost tracks
         for trk_id in u_trk_ids:
             track = self.tracks[trk_id]
+            track.mark_missed()
             if not track.confirmed:
                 LOGGER.debug(f"{'Unconfirmed:':<14}{track}")
                 del self.tracks[trk_id]
                 continue
-            track.mark_missed()
             if track.age > self.max_age:
                 LOGGER.info(f"{'Lost:':<14}{track}")
                 self._mark_lost(trk_id)
-            else:
-                aged.append(trk_id)

-        # register new detections
+        u_det_ids = itertools.chain(invalid_u_det_ids, reid_u_det_ids)
+        # start new tracks
         for det_id in u_det_ids:
             det = detections[det_id]
             state = self.kf.create(det.tlbr)
-            new_trk = Track(frame_id, self.next_id, det.tlbr, state, det.label)
-            self.tracks[self.next_id] = new_trk
+            new_trk = Track(frame_id, det.tlbr, state, det.label, self.confirm_hits)
+            self.tracks[new_trk.trk_id] = new_trk
             LOGGER.debug(f"{'Detected:':<14}{new_trk}")
-            updated.append(self.next_id)
-            self.next_id += 1
-
-        # remove duplicate tracks
-        self._remove_duplicate(updated, aged)

     def _mark_lost(self, trk_id):
-        self.lost[trk_id] = self.tracks[trk_id]
-        if len(self.lost) > self.lost_buf_size:
-            self.lost.popitem(last=False)
-        del self.tracks[trk_id]
+        track = self.tracks.pop(trk_id)
+        if track.confirmed:
+            self.hist_tracks[trk_id] = track
+            if len(self.hist_tracks) > self.history_size:
+                self.hist_tracks.popitem(last=False)
+
+    def _group_tracks_by_depth(self, group_size=2):
+        n_depth = (self.max_age + group_size) // group_size
+        confirmed_by_depth = [[] for _ in range(n_depth)]
+        unconfirmed = []
+        for trk_id, track in self.tracks.items():
+            if track.confirmed:
+                depth = track.age // group_size
+                confirmed_by_depth[depth].append(trk_id)
+            else:
+                unconfirmed.append(trk_id)
+        return confirmed_by_depth, unconfirmed

-    def _matching_cost(self, trk_ids, detections, embeddings):
-        if len(trk_ids) == 0 or len(detections) == 0:
-            return np.empty((len(trk_ids), len(detections)))
+    def _matching_cost(self, trk_ids, detections, embeddings, occluded_dmask):
+        n_trk, n_det = len(trk_ids), len(detections)
+        if n_trk == 0 or n_det == 0:
+            return np.empty((n_trk, n_det))

-        features = [self.tracks[trk_id].smooth_feature for trk_id in trk_ids]
-        cost = cdist(features, embeddings, self.metric)
+        features = np.empty((n_trk, embeddings.shape[1]))
+        invalid_fmask = np.zeros(n_trk, np.bool_)
         for i, trk_id in enumerate(trk_ids):
             track = self.tracks[trk_id]
-            motion_dist = self.kf.motion_distance(*track.state, detections.tlbr)
-            normalized_age = track.age / self.max_age
-            cost[i] = self._fuse_motion(cost[i], motion_dist, detections.label, track.label,
-                                        normalized_age, self.max_feat_cost, self.motion_weight,
-                                        self.age_weight)
-        return cost
+            if track.avg_feat.is_valid():
+                features[i, :] = track.avg_feat()
+            else:
+                invalid_fmask[i] = True

-    def _iou_cost(self, trk_ids, detections):
-        if len(trk_ids) == 0 or len(detections) == 0:
-            return np.empty((len(trk_ids), len(detections)))
+        empty_mask = invalid_fmask[:, None] | occluded_dmask
+        fill_val = min(self.max_assoc_cost + 0.1, 1.)
+        cost = cdist(features, embeddings, self.metric, empty_mask, fill_val)
+
+        # fuse motion information
+        for row, trk_id in enumerate(trk_ids):
+            track = self.tracks[trk_id]
+            m_dist = self.kf.motion_distance(*track.state, detections.tlbr)
+            fuse_motion(cost[row], m_dist, self.motion_weight)

         # make sure associated pair has the same class label
-        trk_labels = np.array([self.tracks[trk_id].label for trk_id in trk_ids])
-        trk_bboxes = np.array([self.tracks[trk_id].tlbr for trk_id in trk_ids])
-        det_bboxes = detections.tlbr
-        ious = bbox_overlaps(trk_bboxes, det_bboxes)
-        ious = self._gate_cost(ious, trk_labels, detections.label, self.iou_thresh, True)
-        return ious
-
-    def _reid_cost(self, detections, embeddings):
-        if len(self.lost) == 0 or len(detections) == 0:
-            return np.empty((len(self.lost), len(detections)))
-
-        trk_labels = np.array([track.label for track in self.lost.values()])
-        features = [track.smooth_feature for track in self.lost.values()]
+        t_labels = np.fromiter((self.tracks[trk_id].label for trk_id in trk_ids), int, n_trk)
+        gate_cost(cost, t_labels, detections.label, self.max_assoc_cost)
+        return cost
+
+    def _iou_cost(self, trk_ids, detections):
+        n_trk, n_det = len(trk_ids), len(detections)
+        if n_trk == 0 or n_det == 0:
+            return np.empty((n_trk, n_det))
+
+        t_labels = np.fromiter((self.tracks[trk_id].label for trk_id in trk_ids), int, n_trk)
+        t_bboxes = np.array([self.tracks[trk_id].tlbr for trk_id in trk_ids])
+        d_bboxes = detections.tlbr
+        iou_cost = iou_dist(t_bboxes, d_bboxes)
+        gate_cost(iou_cost, t_labels, detections.label, 1. - self.iou_thresh)
+        return iou_cost
+
+    def _reid_cost(self, hist_ids, detections, embeddings):
+        n_hist, n_det = len(hist_ids), len(detections)
+        if n_hist == 0 or n_det == 0:
+            return np.empty((n_hist, n_det))
+
+        features = np.concatenate([self.hist_tracks[trk_id].avg_feat()
+                                   for trk_id in hist_ids]).reshape(n_hist, -1)
         cost = cdist(features, embeddings, self.metric)
-        cost = self._gate_cost(cost, trk_labels, detections.label, self.max_reid_cost)
+
+        t_labels = np.fromiter((self.hist_tracks[trk_id].label for trk_id in hist_ids), int, n_hist)
+        gate_cost(cost, t_labels, detections.label)
         return cost

-    def _remove_duplicate(self, updated, aged):
-        if len(updated) == 0 or len(aged) == 0:
+    def _rectify_matches(self, matches, u_trk_ids, detections):
+        matches, u_trk_ids = set(matches), set(u_trk_ids)
+        inactive_matches = [match for match in matches if not self.tracks[match[0]].active]
+        u_active = [trk_id for trk_id in u_trk_ids
+                    if self.tracks[trk_id].confirmed and self.tracks[trk_id].active]
+
+        n_inactive_matches = len(inactive_matches)
+        if n_inactive_matches == 0 or len(u_active) == 0:
+            return matches, u_trk_ids
+
+        m_inactive, det_ids = zip(*inactive_matches)
+        t_bboxes = np.array([self.tracks[trk_id].tlbr for trk_id in u_active])
+        d_bboxes = detections[det_ids,].tlbr
+        iou_cost = iou_dist(t_bboxes, d_bboxes)
+
+        col_indices = list(range(n_inactive_matches))
+        dup_matches, _, _ = greedy_match(iou_cost, u_active, col_indices,
+                                         1. - self.duplicate_thresh)
+
+        for u_trk_id, col in dup_matches:
+            m_trk_id, det_id = m_inactive[col], det_ids[col]
+            t_u_active, t_m_inactive = self.tracks[u_trk_id], self.tracks[m_trk_id]
+            if t_m_inactive.end_frame < t_u_active.start_frame:
+                LOGGER.debug(f"{'Merged:':<14}{u_trk_id} -> {m_trk_id}")
+                t_m_inactive.merge_continuation(t_u_active)
+                u_trk_ids.remove(u_trk_id)
+                del self.tracks[u_trk_id]
+            else:
+                LOGGER.debug(f"{'Duplicate:':<14}{m_trk_id} -> {u_trk_id}")
+                u_trk_ids.remove(u_trk_id)
+                u_trk_ids.add(m_trk_id)
+                matches.remove((m_trk_id, det_id))
+                matches.add((u_trk_id, det_id))
+        return matches, u_trk_ids
+
+    def _remove_duplicate(self, trk_ids1, trk_ids2):
+        if len(trk_ids1) == 0 or len(trk_ids2) == 0:
             return

-        updated_bboxes = np.array([self.tracks[trk_id].tlbr for trk_id in updated])
-        aged_bboxes = np.array([self.tracks[trk_id].tlbr for trk_id in aged])
+        bboxes1 = np.array([self.tracks[trk_id].tlbr for trk_id in trk_ids1])
+        bboxes2 = np.array([self.tracks[trk_id].tlbr for trk_id in trk_ids2])

-        ious = bbox_overlaps(updated_bboxes, aged_bboxes)
-        idx = np.where(ious >= self.duplicate_iou)
+        ious = bbox_ious(bboxes1, bboxes2)
+        idx = np.where(ious >= self.duplicate_thresh)
         dup_ids = set()
         for row, col in zip(*idx):
-            updated_id, aged_id = updated[row], aged[col]
-            if self.tracks[updated_id].start_frame <= self.tracks[aged_id].start_frame:
-                dup_ids.add(aged_id)
+            trk_id1, trk_id2 = trk_ids1[row], trk_ids2[col]
+            track1, track2 = self.tracks[trk_id1], self.tracks[trk_id2]
+            if len(track1) > len(track2):
+                dup_ids.add(trk_id2)
             else:
-                dup_ids.add(updated_id)
+                dup_ids.add(trk_id1)
         for trk_id in dup_ids:
             LOGGER.debug(f"{'Duplicate:':<14}{self.tracks[trk_id]}")
             del self.tracks[trk_id]
-
-    @staticmethod
-    def _linear_assignment(cost, trk_ids, det_ids, maximize=False):
-        rows, cols = linear_sum_assignment(cost, maximize)
-        unmatched_rows = list(set(range(cost.shape[0])) - set(rows))
-        unmatched_cols = list(set(range(cost.shape[1])) - set(cols))
-        unmatched_trk_ids = [trk_ids[row] for row in unmatched_rows]
-        unmatched_det_ids = [det_ids[col] for col in unmatched_cols]
-        matches = []
-        if not maximize:
-            for row, col in zip(rows, cols):
-                if cost[row, col] < INF_COST:
-                    matches.append((trk_ids[row], det_ids[col]))
-                else:
-                    unmatched_trk_ids.append(trk_ids[row])
-                    unmatched_det_ids.append(det_ids[col])
-        else:
-            for row, col in zip(rows, cols):
-                if cost[row, col] > 0:
-                    matches.append((trk_ids[row], det_ids[col]))
-                else:
-                    unmatched_trk_ids.append(trk_ids[row])
-                    unmatched_det_ids.append(det_ids[col])
-        return matches, unmatched_trk_ids, unmatched_det_ids
-
-    @staticmethod
-    @nb.njit(fastmath=True, cache=True)
-    def _fuse_motion(cost, motion_dist, det_labels, label, age, max_cost, w1, w2):
-        gate = (cost > max_cost) | (motion_dist > CHI_SQ_INV_95) | (label != det_labels)
-        cost = cost + w1 * motion_dist + w2 * age
-        cost[gate] = INF_COST
-        return cost
-
-    @staticmethod
-    @nb.njit(parallel=True, fastmath=True, cache=True)
-    def _gate_cost(cost, trk_labels, det_labels, thresh, maximize=False):
-        for i in nb.prange(len(cost)):
-            if maximize:
-                gate = (cost[i] < thresh) | (trk_labels[i] != det_labels)
-                cost[i][gate] = 0
-            else:
-                gate = (cost[i] > thresh) | (trk_labels[i] != det_labels)
-                cost[i][gate] = INF_COST
-        return cost
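The matching cascade above walks confirmed tracks in order of age, using the buckets produced by _group_tracks_by_depth. A standalone sketch of the bucketing with toy numbers (group_size=2 as in the default):

    # hypothetical ages of four confirmed tracks, with max_age = 6
    ages = {1: 0, 2: 1, 3: 4, 4: 6}
    group_size = 2
    n_depth = (6 + group_size) // group_size  # 4 buckets
    buckets = [[] for _ in range(n_depth)]
    for trk_id, age in ages.items():
        buckets[age // group_size].append(trk_id)
    print(buckets)  # [[1, 2], [], [3], [4]] -> recently seen tracks get matched first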
diff --git a/fastmot/utils/distance.py b/fastmot/utils/distance.py
new file mode 100644
index 00000000..75cdfbea
--- /dev/null
+++ b/fastmot/utils/distance.py
@@ -0,0 +1,162 @@
+from enum import Enum
+import numpy as np
+import numba as nb
+
+from .rect import area, get_center
+
+
+INF_DIST = 1e5
+
+
+class Metric(Enum):
+    EUCLIDEAN = 0
+    COSINE = 1
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True)
+def cdist(XA, XB, metric, empty_mask=None, fill_val=None):
+    """Numba implementation of Scipy's cdist"""
+    assert XA.ndim == XB.ndim == 2
+    assert XA.shape[1] == XB.shape[1]
+    if empty_mask is not None:
+        assert empty_mask.ndim == 2
+        assert empty_mask.shape[0] == XA.shape[0]
+        assert empty_mask.shape[1] == XB.shape[0]
+    filler = 1. if fill_val is None else fill_val
+
+    if metric == Metric.EUCLIDEAN:
+        return euclidean(XA, XB, empty_mask, filler)
+    elif metric == Metric.COSINE:
+        return cosine(XA, XB, empty_mask, filler)
+    else:
+        raise ValueError('Unsupported distance metric')
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True)
+def pdist(X, metric):
+    """Numba implementation of Scipy's pdist"""
+    assert X.ndim == 2
+
+    if metric == Metric.EUCLIDEAN:
+        return euclidean(X, X, symmetric=True)
+    elif metric == Metric.COSINE:
+        return cosine(X, X, symmetric=True)
+    else:
+        raise ValueError('Unsupported distance metric')
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True, inline='always')
+def euclidean(XA, XB, empty_mask=None, filler=1., symmetric=False):
+    """Numba implementation of Scipy's euclidean"""
+    Y = np.empty((XA.shape[0], XB.shape[0]))
+    for i in nb.prange(XA.shape[0]):
+        for j in range(XB.shape[0]):
+            if symmetric and i >= j:
+                Y[i, j] = INF_DIST
+            elif empty_mask is not None and empty_mask[i, j]:
+                Y[i, j] = filler
+            else:
+                norm = 0.
+                for k in range(XA.shape[1]):
+                    norm += (XA[i, k] - XB[j, k])**2
+                Y[i, j] = np.sqrt(norm)
+    return Y
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True, inline='always')
+def cosine(XA, XB, empty_mask=None, filler=1., symmetric=False):
+    """Numba implementation of Scipy's cosine"""
+    Y = np.empty((XA.shape[0], XB.shape[0]))
+    for i in nb.prange(XA.shape[0]):
+        for j in range(XB.shape[0]):
+            if symmetric and i >= j:
+                Y[i, j] = INF_DIST
+            elif empty_mask is not None and empty_mask[i, j]:
+                Y[i, j] = filler
+            else:
+                dot = 0.
+                a_norm = 0.
+                b_norm = 0.
+                for k in range(XA.shape[1]):
+                    dot += XA[i, k] * XB[j, k]
+                    a_norm += XA[i, k] * XA[i, k]
+                    b_norm += XB[j, k] * XB[j, k]
+                a_norm = np.sqrt(a_norm)
+                b_norm = np.sqrt(b_norm)
+                Y[i, j] = 1. - dot / (a_norm * b_norm)
+    return Y
+
+
+@nb.njit(parallel=False, fastmath=True, cache=True)
+def iou_dist(tlbrs1, tlbrs2):
+    """Computes pairwise IoU distance."""
+    assert tlbrs1.ndim == tlbrs2.ndim == 2
+    assert tlbrs1.shape[1] == tlbrs2.shape[1] == 4
+
+    Y = np.empty((tlbrs1.shape[0], tlbrs2.shape[0]))
+    for i in nb.prange(tlbrs1.shape[0]):
+        area1 = area(tlbrs1[i, :])
+        for j in range(tlbrs2.shape[0]):
+            iw = min(tlbrs1[i, 2], tlbrs2[j, 2]) - max(tlbrs1[i, 0], tlbrs2[j, 0]) + 1
+            ih = min(tlbrs1[i, 3], tlbrs2[j, 3]) - max(tlbrs1[i, 1], tlbrs2[j, 1]) + 1
+            if iw > 0 and ih > 0:
+                area_inter = iw * ih
+                area_union = area1 + area(tlbrs2[j, :]) - area_inter
+                Y[i, j] = 1. - area_inter / area_union
+            else:
+                Y[i, j] = 1.
+    return Y
+
+
+@nb.njit(parallel=False, fastmath=True, cache=True)
+def giou_dist(tlbrs1, tlbrs2):
+    """Computes pairwise GIoU distance."""
+    assert tlbrs1.ndim == tlbrs2.ndim == 2
+    assert tlbrs1.shape[1] == tlbrs2.shape[1] == 4
+
+    Y = np.empty((tlbrs1.shape[0], tlbrs2.shape[0]))
+    for i in nb.prange(tlbrs1.shape[0]):
+        area1 = area(tlbrs1[i, :])
+        for j in range(tlbrs2.shape[0]):
+            iou = 0.
+            area_union = area1 + area(tlbrs2[j, :])
+            iw = min(tlbrs1[i, 2], tlbrs2[j, 2]) - max(tlbrs1[i, 0], tlbrs2[j, 0]) + 1
+            ih = min(tlbrs1[i, 3], tlbrs2[j, 3]) - max(tlbrs1[i, 1], tlbrs2[j, 1]) + 1
+            if iw > 0 and ih > 0:
+                area_inter = iw * ih
+                area_union -= area_inter
+                iou = area_inter / area_union
+            ew = max(tlbrs1[i, 2], tlbrs2[j, 2]) - min(tlbrs1[i, 0], tlbrs2[j, 0]) + 1
+            eh = max(tlbrs1[i, 3], tlbrs2[j, 3]) - min(tlbrs1[i, 1], tlbrs2[j, 1]) + 1
+            area_encls = ew * eh
+            giou = iou - (area_encls - area_union) / area_encls
+            Y[i, j] = (1. - giou) * 0.5
+    return Y
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True)
+def diou_dist(tlbrs1, tlbrs2):
+    """Computes pairwise DIoU distance."""
+    assert tlbrs1.ndim == tlbrs2.ndim == 2
+    assert tlbrs1.shape[1] == tlbrs2.shape[1] == 4
+
+    Y = np.empty((tlbrs1.shape[0], tlbrs2.shape[0]))
+    for i in nb.prange(tlbrs1.shape[0]):
+        area1 = area(tlbrs1[i, :])
+        x1, y1 = get_center(tlbrs1[i, :])
+        for j in range(tlbrs2.shape[0]):
+            iou = 0.
+            iw = min(tlbrs1[i, 2], tlbrs2[j, 2]) - max(tlbrs1[i, 0], tlbrs2[j, 0]) + 1
+            ih = min(tlbrs1[i, 3], tlbrs2[j, 3]) - max(tlbrs1[i, 1], tlbrs2[j, 1]) + 1
+            if iw > 0 and ih > 0:
+                area_inter = iw * ih
+                area_union = area1 + area(tlbrs2[j, :]) - area_inter
+                iou = area_inter / area_union
+            ew = max(tlbrs1[i, 2], tlbrs2[j, 2]) - min(tlbrs1[i, 0], tlbrs2[j, 0]) + 1
+            eh = max(tlbrs1[i, 3], tlbrs2[j, 3]) - min(tlbrs1[i, 1], tlbrs2[j, 1]) + 1
+            c = ew**2 + eh**2
+            x2, y2 = get_center(tlbrs2[j, :])
+            d = (x2 - x1)**2 + (y2 - y1)**2
+            diou = iou - (d / c)**0.6
+            Y[i, j] = (1. - diou) * 0.5
+    return Y
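These kernels mirror SciPy's cdist but add masking: where empty_mask is True the pairwise distance is not computed and fill_val is written instead, which is how featureless tracks and occluded detections are neutralized in the tracker. A quick sketch of the call pattern with toy data (assumes the module compiles as above):

    import numpy as np
    from fastmot.utils.distance import cdist, Metric

    feats = np.eye(3)[:2]           # 2 track features
    embs = np.eye(3)                # 3 detection embeddings
    mask = np.zeros((2, 3), np.bool_)
    mask[0, 2] = True               # pretend detection 2 is occluded for track 0
    cost = cdist(feats, embs, Metric.COSINE, mask, 1.)
    # cost[0, 2] == 1. regardless of the actual cosine distance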
+ """ + row_ids = np.fromiter(row_ids, int, len(row_ids)) + col_ids = np.fromiter(col_ids, int, len(col_ids)) + return _greedy_match(cost, row_ids, col_ids, max_cost) + + +@nb.njit(fastmath=True, cache=True) +def _get_assignment_matches(cost, row_ids, col_ids, m_rows, m_cols): + unmatched_rows = list(set(range(cost.shape[0])) - set(m_rows)) + unmatched_cols = list(set(range(cost.shape[1])) - set(m_cols)) + unmatched_row_ids = [row_ids[row] for row in unmatched_rows] + unmatched_col_ids = [col_ids[col] for col in unmatched_cols] + matches = [] + for row, col in zip(m_rows, m_cols): + if cost[row, col] < INF_COST: + matches.append((row_ids[row], col_ids[col])) + else: + unmatched_row_ids.append(row_ids[row]) + unmatched_col_ids.append(col_ids[col]) + return matches, unmatched_row_ids, unmatched_col_ids + + +@nb.njit(fastmath=True, cache=True) +def _greedy_match(cost, row_ids, col_ids, max_cost): + indices_rows = np.arange(cost.shape[0]) + indices_cols = np.arange(cost.shape[1]) + + matches = [] + while cost.shape[0] > 0 and cost.shape[1] > 0: + idx = np.argmin(cost) + i, j = idx // cost.shape[1], idx % cost.shape[1] + if cost[i, j] <= max_cost: + matches.append((row_ids[indices_rows[i]], col_ids[indices_cols[j]])) + row_mask = np.ones(cost.shape[0], np.bool_) + col_mask = np.ones(cost.shape[1], np.bool_) + row_mask[i] = False + col_mask[j] = False + + indices_rows = indices_rows[row_mask] + indices_cols = indices_cols[col_mask] + cost = cost[row_mask, :][:, col_mask] + else: + break + + unmatched_row_ids = [row_ids[row] for row in indices_rows] + unmatched_col_ids = [col_ids[col] for col in indices_cols] + return matches, unmatched_row_ids, unmatched_col_ids + + +@nb.njit(fastmath=True, cache=True) +def fuse_motion(cost, m_dist, m_weight): + """Fuse each row of cost matrix with motion information.""" + norm_factor = 1. / CHI_SQ_INV_95 + f_weight = 1. - m_weight + cost[:] = f_weight * cost + m_weight * norm_factor * m_dist + cost[m_dist > CHI_SQ_INV_95] = INF_COST + + +@nb.njit(parallel=False, fastmath=True, cache=True) +def gate_cost(cost, row_labels, col_labels, max_cost=None): + """Gate cost matrix if cost exceeds the maximum.""" + for i in nb.prange(cost.shape[0]): + for j in range(cost.shape[1]): + if (row_labels[i] != col_labels[j] or + max_cost is not None and cost[i, j] > max_cost): + cost[i, j] = INF_COST diff --git a/fastmot/utils/numba.py b/fastmot/utils/numba.py new file mode 100644 index 00000000..d7b2e0bb --- /dev/null +++ b/fastmot/utils/numba.py @@ -0,0 +1,64 @@ +import numpy as np +import numba as nb + + +@nb.njit(fastmath=True, cache=True) +def apply_along_axis(func1d, mat, axis): + """Numba utility to apply reduction to a given axis.""" + assert mat.ndim == 2 + assert axis in [0, 1] + if axis == 0: + result = np.empty(mat.shape[1], mat.dtype) + for i in range(len(result)): + result[i, :] = func1d(mat[:, i]) + else: + result = np.empty(mat.shape[0], mat.dtype) + for i in range(len(result)): + result[i, :] = func1d(mat[i, :]) + return result + + +@nb.njit(parallel=True, fastmath=True, cache=True) +def normalize_vec(vectors): + """Numba utility to normalize an array of vectors.""" + assert vectors.ndim == 2 + out = np.empty_like(vectors) + for i in nb.prange(vectors.shape[0]): + norm_factor = 1. 
diff --git a/fastmot/utils/numba.py b/fastmot/utils/numba.py
new file mode 100644
index 00000000..d7b2e0bb
--- /dev/null
+++ b/fastmot/utils/numba.py
@@ -0,0 +1,64 @@
+import numpy as np
+import numba as nb
+
+
+@nb.njit(fastmath=True, cache=True)
+def apply_along_axis(func1d, mat, axis):
+    """Numba utility to apply reduction to a given axis."""
+    assert mat.ndim == 2
+    assert axis in [0, 1]
+    if axis == 0:
+        result = np.empty(mat.shape[1], mat.dtype)
+        for i in range(len(result)):
+            result[i] = func1d(mat[:, i])
+    else:
+        result = np.empty(mat.shape[0], mat.dtype)
+        for i in range(len(result)):
+            result[i] = func1d(mat[i, :])
+    return result
+
+
+@nb.njit(parallel=True, fastmath=True, cache=True)
+def normalize_vec(vectors):
+    """Numba utility to normalize an array of vectors."""
+    assert vectors.ndim == 2
+    out = np.empty_like(vectors)
+    for i in nb.prange(vectors.shape[0]):
+        norm_factor = 1. / np.linalg.norm(vectors[i, :])
+        out[i, :] = norm_factor * vectors[i, :]
+    return out
+
+
+@nb.njit(fastmath=True, cache=True)
+def mask_area(mask):
+    """Utility to calculate the area of a mask."""
+    count = 0
+    m_raveled = mask.ravel()
+    for i in range(mask.size):
+        if m_raveled[i] != 0:
+            count += 1
+    return count
+
+
+@nb.njit(fastmath=True, cache=True, inline='always')
+def transform(pts, m):
+    """Numba implementation of OpenCV's transform."""
+    pts = np.asarray(pts, dtype=np.float64)
+    pts = np.atleast_2d(pts)
+
+    augment = np.ones((len(pts), 1))
+    pts = np.concatenate((pts, augment), axis=1)
+    return pts @ m.T
+
+
+@nb.njit(fastmath=True, cache=True, inline='always')
+def perspective_transform(pts, m):
+    """Numba implementation of OpenCV's perspectiveTransform."""
+    pts = np.asarray(pts, dtype=np.float64)
+    pts = np.atleast_2d(pts)
+
+    augment = np.ones((len(pts), 1))
+    pts = np.concatenate((pts, augment), axis=1).T
+    pts = m @ pts
+    pts = pts / pts[-1]
+    return pts[:2].T
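perspective_transform reproduces cv2.perspectiveTransform for an Nx2 point array and a 3x3 homography. A quick sanity check with an identity homography, where the output should equal the input:

    import numpy as np
    from fastmot.utils.numba import perspective_transform

    pts = np.array([[10., 20.], [30., 40.]])
    h = np.eye(3)
    print(perspective_transform(pts, h))  # [[10. 20.] [30. 40.]]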
diff --git a/fastmot/utils/rect.py b/fastmot/utils/rect.py
index b1981554..67b4c7ba 100644
--- a/fastmot/utils/rect.py
+++ b/fastmot/utils/rect.py
@@ -2,136 +2,170 @@
 import numba as nb


-@nb.njit(cache=True)
-def as_rect(tlbr):
-    tlbr = np.asarray(tlbr, np.float64)
-    tlbr = np.rint(tlbr)
-    return tlbr
-
-
-@nb.njit(cache=True)
+@nb.njit(cache=True, inline='always')
+def as_tlbr(tlbr):
+    """Construct a rectangle from a tuple or np.ndarray."""
+    _tlbr = np.empty(4)
+    _tlbr[0] = round(float(tlbr[0]), 0)
+    _tlbr[1] = round(float(tlbr[1]), 0)
+    _tlbr[2] = round(float(tlbr[2]), 0)
+    _tlbr[3] = round(float(tlbr[3]), 0)
+    return _tlbr
+
+
+@nb.njit(cache=True, inline='always')
 def get_size(tlbr):
-    tl, br = tlbr[:2], tlbr[2:]
-    size = br - tl + 1
-    return size
+    return tlbr[2] - tlbr[0] + 1, tlbr[3] - tlbr[1] + 1


-@nb.njit(cache=True)
-def area(tlbr):
-    size = get_size(tlbr)
-    return int(size[0] * size[1])
+@nb.njit(cache=True, inline='always')
+def aspect_ratio(tlbr):
+    w, h = get_size(tlbr)
+    return h / w if w > 0 else 0.


-@nb.njit(cache=True)
-def mask_area(mask):
-    return np.count_nonzero(mask)
+@nb.njit(cache=True, inline='always')
+def area(tlbr):
+    w, h = get_size(tlbr)
+    if w <= 0 or h <= 0:
+        return 0.
+    return w * h


-@nb.njit(cache=True)
+@nb.njit(cache=True, inline='always')
 def get_center(tlbr):
-    xmin, ymin, xmax, ymax = tlbr
-    return np.array([(xmin + xmax) / 2, (ymin + ymax) / 2])
+    return (tlbr[0] + tlbr[2]) / 2, (tlbr[1] + tlbr[3]) / 2


-@nb.njit(cache=True)
+@nb.njit(cache=True, inline='always')
 def to_tlwh(tlbr):
-    return np.append(tlbr[:2], get_size(tlbr))
+    tlwh = np.empty(4)
+    tlwh[:2] = tlbr[:2]
+    tlwh[2:] = get_size(tlbr)
+    return tlwh


-@nb.njit(cache=True)
+@nb.njit(cache=True, inline='always')
 def to_tlbr(tlwh):
-    tlwh = np.asarray(tlwh, np.float64)
-    tlwh = np.rint(tlwh)
-    tl, size = tlwh[:2], tlwh[2:]
-    br = tl + size - 1
-    return np.append(tl, br)
+    tlbr = np.empty(4)
+    xmin = float(tlwh[0])
+    ymin = float(tlwh[1])
+    tlbr[0] = round(xmin, 0)
+    tlbr[1] = round(ymin, 0)
+    tlbr[2] = round(xmin + float(tlwh[2]) - 1., 0)
+    tlbr[3] = round(ymin + float(tlwh[3]) - 1., 0)
+    return tlbr


-@nb.njit(cache=True)
+@nb.njit(cache=True, inline='always')
 def intersection(tlbr1, tlbr2):
-    tl1, br1 = tlbr1[:2], tlbr1[2:]
-    tl2, br2 = tlbr2[:2], tlbr2[2:]
-    tl = np.maximum(tl1, tl2)
-    br = np.minimum(br1, br2)
-    tlbr = np.append(tl, br)
-    if np.any(get_size(tlbr) <= 0):
+    tlbr = np.empty(4)
+    tlbr[0] = max(tlbr1[0], tlbr2[0])
+    tlbr[1] = max(tlbr1[1], tlbr2[1])
+    tlbr[2] = min(tlbr1[2], tlbr2[2])
+    tlbr[3] = min(tlbr1[3], tlbr2[3])
+    if tlbr[2] < tlbr[0] or tlbr[3] < tlbr[1]:
         return None
     return tlbr


-@nb.njit(cache=True)
-def union(tlbr1, tlbr2):
-    tl1, br1 = tlbr1[:2], tlbr1[2:]
-    tl2, br2 = tlbr2[:2], tlbr2[2:]
-    tl = np.minimum(tl1, tl2)
-    br = np.maximum(br1, br2)
-    tlbr = np.append(tl, br)
+@nb.njit(cache=True, inline='always')
+def enclosing(tlbr1, tlbr2):
+    tlbr = np.empty(4)
+    tlbr[0] = min(tlbr1[0], tlbr2[0])
+    tlbr[1] = min(tlbr1[1], tlbr2[1])
+    tlbr[2] = max(tlbr1[2], tlbr2[2])
+    tlbr[3] = max(tlbr1[3], tlbr2[3])
     return tlbr


-@nb.njit(cache=True)
+@nb.njit(cache=True, inline='always')
 def crop(img, tlbr):
-    xmin, ymin, xmax, ymax = tlbr.astype(np.int_)
+    xmin = max(int(tlbr[0]), 0)
+    ymin = max(int(tlbr[1]), 0)
+    xmax = max(int(tlbr[2]), 0)
+    ymax = max(int(tlbr[3]), 0)
     return img[ymin:ymax + 1, xmin:xmax + 1]


-@nb.njit(cache=True)
+@nb.njit(cache=True, inline='always')
 def multi_crop(img, tlbrs):
-    tlbrs_ = tlbrs.astype(np.int_)
-    return [img[tlbrs_[i][1]:tlbrs_[i][3] + 1, tlbrs_[i][0]:tlbrs_[i][2] + 1]
-            for i in range(len(tlbrs_))]
+    _tlbrs = tlbrs.astype(np.int_)
+    _tlbrs = np.maximum(_tlbrs, 0)
+    return [img[_tlbrs[i, 1]:_tlbrs[i, 3] + 1, _tlbrs[i, 0]:_tlbrs[i, 2] + 1]
+            for i in range(len(_tlbrs))]
+
+
+@nb.njit(fastmath=True, cache=True, inline='always')
+def ios(tlbr1, tlbr2):
+    """Computes intersection over self."""
+    iw = min(tlbr1[2], tlbr2[2]) - max(tlbr1[0], tlbr2[0]) + 1
+    ih = min(tlbr1[3], tlbr2[3]) - max(tlbr1[1], tlbr2[1]) + 1
+    if iw <= 0 or ih <= 0:
+        return 0.
+    area_inter = iw * ih
+    area_self = area(tlbr1)
+    return area_inter / area_self


-@nb.njit(fastmath=True, cache=True)
+@nb.njit(fastmath=True, cache=True, inline='always')
 def iom(tlbr1, tlbr2):
-    """
-    Computes intersection over minimum.
-    """
-    tlbr = intersection(tlbr1, tlbr2)
-    if tlbr is None:
+    """Computes intersection over minimum."""
+    iw = min(tlbr1[2], tlbr2[2]) - max(tlbr1[0], tlbr2[0]) + 1
+    ih = min(tlbr1[3], tlbr2[3]) - max(tlbr1[1], tlbr2[1]) + 1
+    if iw <= 0 or ih <= 0:
         return 0.
-    area_intersection = area(tlbr)
-    area_minimum = min(area(tlbr1), area(tlbr2))
-    return area_intersection / area_minimum
-
-
-@nb.njit(fastmath=True, cache=True)
-def transform(pts, m):
-    """
-    Numba implementation of OpenCV's transform.
-    """
-    pts = np.asarray(pts)
-    pts = np.atleast_2d(pts)
-    augment = np.ones((len(pts), 1))
-    pts = np.concatenate((pts, augment), axis=1)
-    return pts @ m.T
-
-
-@nb.njit(fastmath=True, cache=True)
-def perspective_transform(pts, m):
-    """
-    Numba implementation of OpenCV's perspectiveTransform.
-    """
-    pts = np.asarray(pts)
-    pts = np.atleast_2d(pts)
-    augment = np.ones((len(pts), 1))
-    pts = np.concatenate((pts, augment), axis=1).T
-    pts = m @ pts
-    pts = pts / pts[-1]
-    return pts[:2].T
+    area_inter = iw * ih
+    area_min = min(area(tlbr1), area(tlbr2))
+    return area_inter / area_min
+
+
+@nb.njit(parallel=False, fastmath=True, cache=True)
+def bbox_ious(tlbrs1, tlbrs2):
+    """Computes pairwise bounding box overlaps using IoU."""
+    ious = np.empty((tlbrs1.shape[0], tlbrs2.shape[0]))
+    for i in nb.prange(tlbrs1.shape[0]):
+        area1 = area(tlbrs1[i, :])
+        for j in range(tlbrs2.shape[0]):
+            iw = min(tlbrs1[i, 2], tlbrs2[j, 2]) - max(tlbrs1[i, 0], tlbrs2[j, 0]) + 1
+            ih = min(tlbrs1[i, 3], tlbrs2[j, 3]) - max(tlbrs1[i, 1], tlbrs2[j, 1]) + 1
+            if iw > 0 and ih > 0:
+                area_inter = iw * ih
+                area_union = area1 + area(tlbrs2[j, :]) - area_inter
+                ious[i, j] = area_inter / area_union
+            else:
+                ious[i, j] = 0.
+    return ious
+
+
+@nb.njit(parallel=False, fastmath=True, cache=True)
+def find_occluded(tlbrs, occlusion_thresh):
+    """Computes a mask of occluded bounding boxes."""
+    occluded_mask = np.zeros(tlbrs.shape[0], dtype=np.bool_)
+    for i in nb.prange(tlbrs.shape[0]):
+        area_self = area(tlbrs[i, :])
+        for j in range(tlbrs.shape[0]):
+            if i != j:
+                iw = min(tlbrs[i, 2], tlbrs[j, 2]) - max(tlbrs[i, 0], tlbrs[j, 0]) + 1
+                ih = min(tlbrs[i, 3], tlbrs[j, 3]) - max(tlbrs[i, 1], tlbrs[j, 1]) + 1
+                if iw > 0 and ih > 0:
+                    ios = iw * ih / area_self
+                    if ios >= occlusion_thresh:
+                        occluded_mask[i] = True
+                        break
+    return occluded_mask


 @nb.njit(fastmath=True, cache=True)
 def nms(tlwhs, scores, nms_thresh):
-    """
-    Applies Non-Maximum Suppression on the bounding boxes [x, y, w, h].
+    """Applies Non-Maximum Suppression on the bounding boxes [x, y, w, h].
     Returns an array with the indexes of the bounding boxes we want to keep.
     """
     areas = tlwhs[:, 2] * tlwhs[:, 3]
     ordered = scores.argsort()[::-1]

-    tl = tlwhs[:, :2]
-    br = tlwhs[:, :2] + tlwhs[:, 2:] - 1
+    tls = tlwhs[:, :2]
+    brs = tlwhs[:, :2] + tlwhs[:, 2:] - 1

     keep = []
     while ordered.size > 0:
@@ -139,14 +173,14 @@ def nms(tlwhs, scores, nms_thresh):
         i = ordered[0]
         keep.append(i)

-        other_tl = tl[ordered[1:]]
-        other_br = br[ordered[1:]]
+        other_tls = tls[ordered[1:]]
+        other_brs = brs[ordered[1:]]

         # compute IoU
-        inter_xmin = np.maximum(tl[i, 0], other_tl[:, 0])
-        inter_ymin = np.maximum(tl[i, 1], other_tl[:, 1])
-        inter_xmax = np.minimum(br[i, 0], other_br[:, 0])
-        inter_ymax = np.minimum(br[i, 1], other_br[:, 1])
+        inter_xmin = np.maximum(tls[i, 0], other_tls[:, 0])
+        inter_ymin = np.maximum(tls[i, 1], other_tls[:, 1])
+        inter_xmax = np.minimum(brs[i, 0], other_brs[:, 0])
+        inter_ymax = np.minimum(brs[i, 1], other_brs[:, 1])

         inter_w = np.maximum(0, inter_xmax - inter_xmin + 1)
         inter_h = np.maximum(0, inter_ymax - inter_ymin + 1)
@@ -156,21 +190,19 @@ def nms(tlwhs, scores, nms_thresh):
         idx = np.where(iou <= nms_thresh)[0]
         ordered = ordered[idx + 1]

-    keep = np.asarray(keep)
+    keep = np.array(keep)
     return keep


 @nb.njit(fastmath=True, cache=True)
 def diou_nms(tlwhs, scores, nms_thresh, beta=0.6):
-    """
-    Applies Non-Maximum Suppression using the DIoU metric.
-    """
+    """Applies Non-Maximum Suppression using the DIoU metric."""
     areas = tlwhs[:, 2] * tlwhs[:, 3]
     ordered = scores.argsort()[::-1]

-    tl = tlwhs[:, :2]
-    br = tlwhs[:, :2] + tlwhs[:, 2:] - 1
-    centers = (tl + br) / 2
+    tls = tlwhs[:, :2]
+    brs = tlwhs[:, :2] + tlwhs[:, 2:] - 1
+    centers = (tls + brs) / 2

     keep = []
     while ordered.size > 0:
@@ -178,14 +210,14 @@ def diou_nms(tlwhs, scores, nms_thresh, beta=0.6):
         i = ordered[0]
         keep.append(i)

-        other_tl = tl[ordered[1:]]
-        other_br = br[ordered[1:]]
+        other_tls = tls[ordered[1:]]
+        other_brs = brs[ordered[1:]]

         # compute IoU
-        inter_xmin = np.maximum(tl[i, 0], other_tl[:, 0])
-        inter_ymin = np.maximum(tl[i, 1], other_tl[:, 1])
-        inter_xmax = np.minimum(br[i, 0], other_br[:, 0])
-        inter_ymax = np.minimum(br[i, 1], other_br[:, 1])
+        inter_xmin = np.maximum(tls[i, 0], other_tls[:, 0])
+        inter_ymin = np.maximum(tls[i, 1], other_tls[:, 1])
+        inter_xmax = np.minimum(brs[i, 0], other_brs[:, 0])
+        inter_ymax = np.minimum(brs[i, 1], other_brs[:, 1])

         inter_w = np.maximum(0, inter_xmax - inter_xmin + 1)
         inter_h = np.maximum(0, inter_ymax - inter_ymin + 1)
@@ -194,18 +226,18 @@ def diou_nms(tlwhs, scores, nms_thresh, beta=0.6):
         iou = inter_area / union_area

         # compute DIoU
-        union_xmin = np.minimum(tl[i, 0], other_tl[:, 0])
-        union_ymin = np.minimum(tl[i, 1], other_tl[:, 1])
-        union_xmax = np.maximum(br[i, 0], other_br[:, 0])
-        union_ymax = np.maximum(br[i, 1], other_br[:, 1])
-
-        union_w = union_xmax - union_xmin + 1
-        union_h = union_ymax - union_ymin + 1
-        c = union_w**2 + union_h**2
+        encls_xmin = np.minimum(tls[i, 0], other_tls[:, 0])
+        encls_ymin = np.minimum(tls[i, 1], other_tls[:, 1])
+        encls_xmax = np.maximum(brs[i, 0], other_brs[:, 0])
+        encls_ymax = np.maximum(brs[i, 1], other_brs[:, 1])
+
+        encls_w = encls_xmax - encls_xmin + 1
+        encls_h = encls_ymax - encls_ymin + 1
+        c = encls_w**2 + encls_h**2
         d = np.sum((centers[i] - centers[ordered[1:]])**2, axis=1)
         diou = iou - (d / c)**beta

         idx = np.where(diou <= nms_thresh)[0]
         ordered = ordered[idx + 1]

-    keep = np.asarray(keep)
+    keep = np.array(keep)
     return keep
diff --git a/fastmot/utils/visualization.py b/fastmot/utils/visualization.py
index 76d9feb7..6f290921 100644
--- a/fastmot/utils/visualization.py
+++ b/fastmot/utils/visualization.py
@@ -15,26 +15,25 @@ def draw_tracks(frame, tracks, show_flow=False, show_cov=False):
         draw_covariance(frame, track.tlbr, track.state[1])


-def draw_detections(frame, detections):
+def draw_detections(frame, detections, color=(255, 255, 255), show_conf=False):
     for det in detections:
-        draw_bbox(frame, det.tlbr, (255, 255, 255), 1)
+        text = f'{det.conf:.2f}' if show_conf else None
+        draw_bbox(frame, det.tlbr, color, 1, text)


-def draw_flow_bboxes(frame, tracker):
-    for tlbr in tracker.flow_bboxes.values():
-        draw_bbox(frame, tlbr, 0, 1)
+def draw_klt_bboxes(frame, klt_bboxes, color=(0, 0, 0)):
+    for tlbr in klt_bboxes:
+        draw_bbox(frame, tlbr, color, 1)


-def draw_tiles(frame, detector):
-    assert hasattr(detector, 'tiles')
-    for tile in detector.tiles:
-        tlbr = np.rint(tile * np.tile(detector.scale_factor, 2))
-        draw_bbox(frame, tlbr, 0, 1)
+def draw_tiles(frame, tiles, scale_factor, color=(0, 0, 0)):
+    for tile in tiles:
+        tlbr = np.rint(tile * np.tile(scale_factor, 2))
+        draw_bbox(frame, tlbr, color, 1)


-def draw_background_flow(frame, tracker):
-    draw_feature_match(frame, tracker.flow.prev_bg_keypoints,
-                       tracker.flow.bg_keypoints, (0, 0, 255))
+def draw_background_flow(frame, prev_bg_keypoints, bg_keypoints, color=(0, 0, 255)):
+    draw_feature_match(frame, prev_bg_keypoints, bg_keypoints, color)


 def get_color(idx, s=0.8, vmin=0.7):
@@ -84,3 +83,46 @@ def ellipse(cov):
     cv2.ellipse(frame, tl, axes, angle, 0, 360, (255, 255, 255), 1, cv2.LINE_AA)
     axes, angle = ellipse(covariance[2:4, 2:4])
     cv2.ellipse(frame, br, axes, angle, 0, 360, (255, 255, 255), 1, cv2.LINE_AA)
+
+
+class Visualizer:
+    def __init__(self,
+                 draw_detections=False,
+                 draw_confidence=False,
+                 draw_covariance=False,
+                 draw_klt=False,
+                 draw_obj_flow=False,
+                 draw_bg_flow=False):
+        """Class for visualization.
+
+        Parameters
+        ----------
+        draw_detections : bool, optional
+            Enable drawing detections.
+        draw_confidence : bool, optional
+            Enable drawing detection confidence, ignored if `draw_detections` is disabled.
+        draw_covariance : bool, optional
+            Enable drawing Kalman filter position covariance.
+        draw_klt : bool, optional
+            Enable drawing KLT bounding boxes.
+        draw_obj_flow : bool, optional
+            Enable drawing object flow matches.
+        draw_bg_flow : bool, optional
+            Enable drawing background flow matches.
+        """
+        self.draw_detections = draw_detections
+        self.draw_confidence = draw_confidence
+        self.draw_covariance = draw_covariance
+        self.draw_klt = draw_klt
+        self.draw_obj_flow = draw_obj_flow
+        self.draw_bg_flow = draw_bg_flow
+
+    def render(self, frame, tracks, detections, klt_bboxes, prev_bg_keypoints, bg_keypoints):
+        """Render visualizations onto the frame."""
+        draw_tracks(frame, tracks, show_flow=self.draw_obj_flow, show_cov=self.draw_covariance)
+        if self.draw_detections:
+            draw_detections(frame, detections, show_conf=self.draw_confidence)
+        if self.draw_klt:
+            draw_klt_bboxes(frame, klt_bboxes)
+        if self.draw_bg_flow:
+            draw_background_flow(frame, prev_bg_keypoints, bg_keypoints)
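Visualizer bundles the previously free-standing drawing calls behind boolean toggles. A minimal usage sketch (the keyword names come from this patch; the surrounding loop is illustrative only):

    from fastmot.utils.visualization import Visualizer

    vis = Visualizer(draw_detections=True, draw_confidence=True, draw_klt=True)
    # inside the frame loop, after the tracker has stepped:
    # vis.render(frame, tracks, detections, klt_bboxes, prev_bg_kps, bg_kps)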
diff --git a/fastmot/videoio.py b/fastmot/videoio.py
index 54c81c39..bd48b0e5 100644
--- a/fastmot/videoio.py
+++ b/fastmot/videoio.py
@@ -22,33 +22,45 @@ class Protocol(Enum):


 class VideoIO:
-    """
-    Class for capturing from a video file, an image sequence, or a camera, and saving video output.
-    Encoding, decoding, and scaling can be accelerated using the GStreamer backend.
-    Parameters
-    ----------
-    size : (int, int)
-        Width and height of each frame to output.
-    config : Dict
-        Camera and buffer configuration.
-    input_uri : string
-        URI to an input video file or capturing device.
-    output_uri : string
-        URI to an output video file.
-    proc_fps : int
-        Estimated processing speed. This depends on compute and scene complexity.
-    """
+    def __init__(self, size, input_uri,
+                 output_uri=None,
+                 resolution=(1920, 1080),
+                 frame_rate=30,
+                 buffer_size=10,
+                 proc_fps=30):
+        """Class for capturing from a video/image sequence/camera, and saving video output.
+        Encoding, decoding, and scaling can be accelerated using the GStreamer backend.

-    def __init__(self, size, config, input_uri, output_uri=None, proc_fps=30):
+        Parameters
+        ----------
+        size : tuple
+            Width and height of each frame to output.
+        input_uri : str
+            URI to an input video file or capturing device.
+        output_uri : str, optional
+            URI to an output video file.
+        resolution : tuple, optional
+            Resolution of the input source.
+        frame_rate : int, optional
+            Frame rate of the input source.
+        buffer_size : int, optional
+            Number of frames to buffer.
+            For live sources, a larger buffer drops fewer frames but increases latency.
+        proc_fps : int, optional
+            Estimated processing speed that may limit the capture interval `cap_dt`.
+            This depends on hardware and processing complexity.
+        """
         self.size = size
         self.input_uri = input_uri
         self.output_uri = output_uri
+        self.resolution = resolution
+        assert frame_rate > 0
+        self.frame_rate = frame_rate
+        assert buffer_size >= 1
+        self.buffer_size = buffer_size
+        assert proc_fps > 0
         self.proc_fps = proc_fps

-        self.resolution = config['resolution']
-        self.frame_rate = config['frame_rate']
-        self.buffer_size = config['buffer_size']
-
         self.protocol = self._parse_uri(self.input_uri)
         self.is_live = self.protocol != Protocol.IMAGE and self.protocol != Protocol.VIDEO
         if WITH_GSTREAMER:
@@ -90,18 +102,14 @@ def cap_dt(self):
         return 1 / min(self.cap_fps, self.proc_fps) if self.is_live else 1 / self.cap_fps

     def start_capture(self):
-        """
-        Start capturing from file or device.
-        """
+        """Start capturing from file or device."""
         if not self.source.isOpened():
             self.source.open(self._gst_cap_pipeline(), cv2.CAP_GSTREAMER)
         if not self.cap_thread.is_alive():
             self.cap_thread.start()

     def stop_capture(self):
-        """
-        Stop capturing from file or device.
-        """
+        """Stop capturing from file or device."""
         with self.cond:
             self.exit_event.set()
             self.cond.notify()
@@ -109,9 +117,12 @@ def stop_capture(self):
         self.cap_thread.join()

     def read(self):
-        """
-        Returns the next video frame.
-        Returns None if there are no more frames.
+        """Reads the next video frame.
+
+        Returns
+        -------
+        ndarray
+            Returns None if there are no more frames.
         """
         with self.cond:
             while len(self.frame_queue) == 0 and not self.exit_event.is_set():
@@ -125,16 +136,12 @@ def read(self):
         return frame

     def write(self, frame):
-        """
-        Writes the next video frame.
-        """
+        """Writes the next video frame."""
         assert hasattr(self, 'writer')
         self.writer.write(frame)

     def release(self):
-        """
-        Closes video file or capturing device.
-        """
+        """Cleans up input and output sources."""
         self.stop_capture()
         if hasattr(self, 'writer'):
             self.writer.release()
@@ -198,7 +205,10 @@ def _gst_cap_pipeline(self):
             else:
                 raise RuntimeError('GStreamer V4L2 plugin not found')
         elif self.protocol == Protocol.RTSP:
-            pipeline = 'rtspsrc location=%s latency=0 ! capsfilter caps=application/x-rtp,media=video ! decodebin ! ' % self.input_uri
+            pipeline = (
+                'rtspsrc location=%s latency=0 ! '
+                'capsfilter caps=application/x-rtp,media=video ! decodebin ! ' % self.input_uri
+            )
         elif self.protocol == Protocol.HTTP:
             pipeline = 'souphttpsrc location=%s is-live=true ! decodebin ! ' % self.input_uri
         return pipeline + cvt_pipeline
@@ -209,7 +219,7 @@ def _gst_write_pipeline(self):
         if 'omxh264enc' in gst_elements:
             h264_encoder = 'omxh264enc preset-level=2'
         elif 'x264enc' in gst_elements:
-            h264_encoder = 'x264enc'
+            h264_encoder = 'x264enc pass=4'
         else:
             raise RuntimeError('GStreamer H.264 encoder not found')
         pipeline = (
diff --git a/requirements.txt b/requirements.txt
index a07e65b5..3183a8de 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,5 +2,4 @@ numpy >= 1.17
 scipy >= 1.5
 numba == 0.48
 tensorflow < 2.0
-cupy == 9.2
-cython-bbox
\ No newline at end of file
+cupy == 9.2
\ No newline at end of file
diff --git a/scripts/install_jetson.sh b/scripts/install_jetson.sh
index 8c8ac157..ac554147 100755
--- a/scripts/install_jetson.sh
+++ b/scripts/install_jetson.sh
@@ -27,15 +27,14 @@ if [ ! -x "$(command -v nvcc)" ]; then
     source ~/.bashrc
 fi

-# Numpy, PyCUDA, TensorFlow, cython-bbox
+# NumPy and TensorFlow
 sudo apt-get update
 sudo apt-get install -y python3-pip libhdf5-serial-dev hdf5-tools libcanberra-gtk-module
-sudo -H pip3 install cython
-sudo -H pip3 install numpy cython-bbox
+sudo -H pip3 install numpy
 sudo ln -s /usr/include/locale.h /usr/include/xlocale.h
 sudo -H pip3 install --no-cache-dir --extra-index-url https://developer.download.nvidia.com/compute/redist/jp/v$JP_VERSION tensorflow==$TF_VERSION+nv$NV_VERSION

-# Scipy
+# SciPy
 sudo apt-get install -y libatlas-base-dev gfortran
 sudo -H pip3 install scipy==1.5
diff --git a/scripts/yolo2onnx.py b/scripts/yolo2onnx.py
index 023f46f3..4f89292d 100755
--- a/scripts/yolo2onnx.py
+++ b/scripts/yolo2onnx.py
@@ -661,6 +661,29 @@ def _make_conv_node(self, layer_name, layer_dict):
             inputs = [layer_name_mish]
             layer_name_output = layer_name_mish

+        elif layer_dict['activation'] == 'swish':
+            layer_name_sigmoid = layer_name + '_sigmoid'
+            layer_name_swish = layer_name + '_swish'
+
+            sigmoid_node = helper.make_node(
+                'Sigmoid',
+                inputs=inputs,
+                outputs=[layer_name_sigmoid],
+                name=layer_name_sigmoid
+            )
+            self._nodes.append(sigmoid_node)
+
+            inputs.append(layer_name_sigmoid)
+            swish_node = helper.make_node(
+                'Mul',
+                inputs=inputs,
+                outputs=[layer_name_swish],
+                name=layer_name_swish
+            )
+            self._nodes.append(swish_node)
+
+            inputs = [layer_name_swish]
+            layer_name_output = layer_name_swish
         elif layer_dict['activation'] == 'logistic':
             layer_name_lgx = layer_name + '_lgx'
@@ -888,10 +911,8 @@ def main():
     print('Checking ONNX model...')
     onnx.checker.check_model(yolo_model_def)

-    print('Saving ONNX file...')
     onnx.save(yolo_model_def, output_file_path)
-
-    print('Done.')
+    print(f'ONNX file saved to {output_file_path}')


 if __name__ == '__main__':
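The new 'swish' branch expresses swish(x) = x * sigmoid(x) as a Sigmoid node followed by a Mul that consumes both the original input and the sigmoid output, since ONNX has no dedicated Swish op. A quick numeric check of that decomposition (plain NumPy, independent of the converter):

    import numpy as np

    def swish(x):
        return x * (1. / (1. + np.exp(-x)))  # x * sigmoid(x)

    x = np.array([-1., 0., 1.])
    print(swish(x))  # [-0.26894142  0.          0.73105858]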