Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Openvino #228

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Bonito

[![PyPI version](https://badge.fury.io/py/ont-bonito.svg)](https://badge.fury.io/py/ont-bonito)
[![PyPI version](https://badge.fury.io/py/ont-bonito.svg)](https://badge.fury.io/py/ont-bonito)
[![py36](https://img.shields.io/badge/python-3.6-brightgreen.svg)](https://img.shields.io/badge/python-3.6-brightgreen.svg)
[![py37](https://img.shields.io/badge/python-3.7-brightgreen.svg)](https://img.shields.io/badge/python-3.7-brightgreen.svg)
[![py38](https://img.shields.io/badge/python-3.8-brightgreen.svg)](https://img.shields.io/badge/python-3.8-brightgreen.svg)
Expand Down Expand Up @@ -36,6 +36,12 @@ The default `ont-bonito` package is built against CUDA 10.2 however CUDA 11.1 an
$ pip install -f https://download.pytorch.org/whl/torch_stable.html ont-bonito-cuda111
```

To optimize inference on CPU with Intel OpenVINO, use the `--use_openvino` flag:

```bash
$ bonito basecaller dna_r9.4.1 --reference reference.mmi --use_openvino --device=cpu /data/reads > basecalls.sam
```

## Modified Bases

Modified base calling is handled by [Remora](https://github.com/nanoporetech/remora).
Expand All @@ -55,7 +61,7 @@ $ bonito basecaller dna_r9.4.1 --save-ctc --reference reference.mmi /data/reads
$ bonito train --directory /data/training/ctc-data /data/training/model-dir
```

In addition to training a new model from scratch you can also easily fine tune one of the pretrained models.
In addition to training a new model from scratch you can also easily fine tune one of the pretrained models.

```bash
bonito train --epochs 1 --lr 5e-4 --pretrained [email protected] --directory /data/training/ctc-data /data/training/fine-tuned-model
Expand All @@ -68,7 +74,7 @@ $ bonito download --training
$ bonito train /data/training/model-dir
```

All training calls use Automatic Mixed Precision to speed up training. To disable this, set the `--no-amp` flag to True.
All training calls use Automatic Mixed Precision to speed up training. To disable this, set the `--no-amp` flag to True.

## Developer Quickstart

Expand All @@ -82,6 +88,11 @@ $ source venv3/bin/activate
(venv3) $ python setup.py develop
```

To build with OpenVINO backend:
```bash
(venv3) $ pip install -e .[openvino]
```

## Interface

- `bonito view` - view a model architecture for a given `.toml` file and the number of parameters in the network.
Expand Down
8 changes: 7 additions & 1 deletion bonito/cli/basecaller.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from itertools import islice as take
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

import bonito.openvino.basecall
from bonito.aligner import align_map, Aligner
from bonito.io import CTCWriter, Writer, biofmt
from bonito.mod_util import call_mods, load_mods_model
Expand Down Expand Up @@ -40,6 +41,7 @@ def main(args):
batchsize=args.batchsize,
quantize=args.quantize,
use_koi=True,
use_openvino=args.use_openvino,
)
except FileNotFoundError:
sys.stderr.write(f"> error: failed to load {args.model_directory}\n")
Expand All @@ -50,7 +52,10 @@ def main(args):
if args.verbose:
sys.stderr.write(f"> model basecaller params: {model.config['basecaller']}\n")

basecall = load_symbol(args.model_directory, "basecall")
if args.use_openvino:
basecall = bonito.openvino.basecall.basecall
else:
basecall = load_symbol(args.model_directory, "basecall")

mods_model = None
if args.modified_base_model is not None or args.modified_bases is not None:
Expand Down Expand Up @@ -173,4 +178,5 @@ def argparser():
parser.add_argument("--max-reads", default=0, type=int)
parser.add_argument("--alignment-threads", default=8, type=int)
parser.add_argument('-v', '--verbose', action='count', default=0)
parser.add_argument("--use_openvino", action="store_true", default=False)
return parser
3 changes: 2 additions & 1 deletion bonito/cli/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def main(args):
seqs = []

print("* loading model", w)
model = load_model(args.model_directory, args.device, weights=w)
model = load_model(args.model_directory, args.device, weights=w, use_openvino=args.use_openvino)

print("* calling")
t0 = time.perf_counter()
Expand Down Expand Up @@ -109,4 +109,5 @@ def argparser():
parser.add_argument("--beamsize", default=5, type=int)
parser.add_argument("--poa", action="store_true", default=False)
parser.add_argument("--min-coverage", default=0.5, type=float)
parser.add_argument("--use_openvino", action="store_true", default=False)
return parser
5 changes: 3 additions & 2 deletions bonito/crf/basecall.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,13 @@ def compute_scores(model, batch, beam_width=32, beam_cut=100.0, scale=1.0, offse
"""
with torch.inference_mode():
device = next(model.parameters()).device
dtype = torch.float16 if half_supported() else torch.float32
dtype = torch.float16 if device != torch.device('cpu') and half_supported() else torch.float32
scores = model(batch.to(dtype).to(device))
if reverse:
scores = model.seqdist.reverse_complement(scores)
# beam_search expects scores in FP16 precision
sequence, qstring, moves = beam_search(
scores, beam_width=beam_width, beam_cut=beam_cut,
scores.to(torch.float16), beam_width=beam_width, beam_cut=beam_cut,
scale=scale, offset=offset, blank_score=blank_score
)
return {
Expand Down
1 change: 1 addition & 0 deletions bonito/crf/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ def decode(self, x):
def loss(self, scores, targets, target_lengths, **kwargs):
return self.seqdist.ctc_loss(scores.to(torch.float32), targets, target_lengths, **kwargs)


class Model(SeqdistModel):
iiSeymour marked this conversation as resolved.
Show resolved Hide resolved

def __init__(self, config):
Expand Down
3 changes: 2 additions & 1 deletion bonito/ctc/basecall.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ def compute_scores(model, batch):
"""
with torch.no_grad():
device = next(model.parameters()).device
chunks = batch.to(torch.half).to(device)
chunks = batch.to(torch.half) if device != torch.device('cpu') and half_supported() else batch
chunks = chunks.to(device)
probs = permute(model(chunks), 'TNC', 'NTC')
return probs.cpu().to(torch.float32)

Expand Down
2 changes: 1 addition & 1 deletion bonito/nn.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def forward(self, x):
if self.blank_score is not None and self.expand_blanks:
T, N, C = scores.shape
scores = torch.nn.functional.pad(
scores.view(T, N, C // self.n_base, self.n_base),
scores.view(T, N, -1, self.n_base),
(1, 0, 0, 0, 0, 0, 0, 0),
value=self.blank_score
).view(T, N, -1)
Expand Down
47 changes: 47 additions & 0 deletions bonito/openvino/basecall.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import torch
from crf_beam import beam_search
from bonito.crf.basecall import stitch_results
from bonito.multiprocessing import thread_iter, thread_map
from bonito.util import chunk, stitch, batchify, unbatchify


def compute_scores(model, batch):
    """Run the network on a batch and derive decoding inputs.

    Computes the raw network scores plus the forward/backward scores from
    the model's sequence distribution, and the posterior probabilities as
    a softmax over their sum.

    Returns a dict with keys 'scores', 'bwd' and 'posts', each transposed
    so the batch dimension comes first.
    """
    seqdist = model.seqdist
    raw = model(batch)
    forward = seqdist.forward_scores(raw)
    backward = seqdist.backward_scores(raw)
    posteriors = torch.softmax(forward + backward, dim=-1)
    return dict(
        scores=raw.transpose(0, 1),
        bwd=backward.transpose(0, 1),
        posts=posteriors.transpose(0, 1),
    )


def decode(x, beam_width=32, beam_cut=100.0, scale=1.0, offset=0.0, blank_score=2.0):
    """Beam-search decode one read's score tensors into a called sequence.

    NOTE(review): beam_width / beam_cut / scale / offset / blank_score are
    accepted for signature parity with the CRF decoder but are not forwarded
    to beam_search here -- confirm this is intentional.
    """
    called_seq, called_qual, move_table = beam_search(x['scores'], x['bwd'], x['posts'])
    return dict(sequence=called_seq, qstring=called_qual, moves=move_table)


def basecall(model, reads, chunksize=4000, overlap=100, batchsize=32, reverse=False):
    """Basecall raw reads with the OpenVINO-backed model.

    Builds a lazy, threaded pipeline: reads are split into overlapping
    signal chunks, batched, scored by the network, stitched back into
    per-read score tensors, and finally beam-search decoded.

    NOTE(review): `reverse` is accepted for interface parity with the CRF
    basecaller but is not used here -- confirm this is intentional.
    """
    chunks = thread_iter(
        ((read, 0, len(read.signal)), chunk(torch.from_numpy(read.signal), chunksize, overlap))
        for read in reads
    )

    batches = thread_iter(batchify(chunks, batchsize=batchsize))

    scores = thread_iter(
        (read, compute_scores(model, batch)) for read, batch in batches
    )

    # The inner loop variable is named `read_scores` (not `scores`) so it does
    # not shadow the `scores` pipeline above; the original relied on the
    # outermost iterable of a genexpr being evaluated eagerly, which worked
    # but was easy to break on refactor.
    results = thread_iter(
        (read, stitch_results(read_scores, end - start, chunksize, overlap, model.stride))
        for ((read, start, end), read_scores) in unbatchify(scores)
    )

    # n_thread=48 is a hard-coded decoder thread count inherited from the
    # CUDA basecaller; decoding is CPU-bound here.
    return thread_map(decode, results, n_thread=48)
Loading