diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/CODEOWNERS b/CODEOWNERS
new file mode 100644
index 0000000..446aa21
--- /dev/null
+++ b/CODEOWNERS
@@ -0,0 +1,32 @@
+# This is a comment.
+# Each line is a file pattern followed by one or more owners.
+
+# These owners will be the default owners for everything in
+# the repo. Unless a later match takes precedence,
+# @wandera/datascience will be requested for
+# review when someone opens a pull request.
+*       @wandera/datascience
+
+# Order is important; the last matching pattern takes the most
+# precedence. When someone opens a pull request that only
+# modifies JS files, only @js-owner and not the global
+# owner(s) will be requested for a review.
+# *.js    @js-owner
+
+# You can also use email addresses if you prefer. They'll be
+# used to look up users just like we do for commit author
+# emails.
+#*.go docs@example.com
+
+# The `docs/*` pattern will match files like
+# `docs/getting-started.md` but not further nested files like
+# `docs/build-app/troubleshooting.md`.
+# docs/*  docs@example.com
+
+# In this example, @octocat owns any file in an apps directory
+# anywhere in your repository.
+# apps/ @octocat
+
+# In this example, @doctocat owns any file in the `/docs`
+# directory in the root of your repository.
+# /docs/ @doctocat
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..cb6efa5
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,13 @@
+FROM python:3.10.11-slim
+# COPY instead of ADD (plain local files only); WORKDIR is absolute so it never depends on an earlier WORKDIR.
+COPY . /asm2vec-pytorch
+WORKDIR /asm2vec-pytorch
+# OS libraries and Python deps share one layer; apt lists and the pip cache are dropped in that same layer.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libpq-dev \
+    unixodbc \
+    unixodbc-dev && \
+    rm -rf /var/lib/apt/lists/* && pip install --no-cache-dir -r requirements.txt && \
+    python setup.py install
+
+CMD ["/bin/sh"]
diff --git a/README.md b/README.md
index 7a2043b..637d5db 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # asm2vec-pytorch
 
-<a><img alt="release 1.0.0" src="https://img.shields.io/badge/release-v1.0.0-yellow?style=for-the-badge"></a>
+<a><img alt="release 1.0.3" src="https://img.shields.io/badge/release-v1.0.3-yellow?style=for-the-badge"></a>
 <a><img alt="mit" src="https://img.shields.io/badge/license-MIT-brightgreen?style=for-the-badge"></a>
 <a><img alt="python" src="https://img.shields.io/badge/-python-9cf?style=for-the-badge&logo=python"></a>
 
@@ -9,30 +9,17 @@ The details of the model can be found in the original paper: [(sp'19) Asm2Vec: B
 
 ## Requirements
 
-python >= 3.6
-
-| packages | for |
-| --- | --- |
-| r2pipe | `scripts/bin2asm.py` |
-| click | `scripts/*` |
-| torch | almost all code need it |
-
-You also need to install `radare2` to run `scripts/bin2asm.py`. `r2pipe` is just the python interface to `radare2`
-
-If you only want to use the library code, you just need to install `torch`
+* python >= 3.10
+* radare2
+* Packages listed in `requirements.txt`
 
 ## Install
 
 ```
+pip install -r requirements.txt && 
 python setup.py install
 ```
 
-or
-
-```
-pip install git+https://github.com/oalieno/asm2vec-pytorch.git
-```
-
 ## Benchmark
 
 An implementation already exists here: [Lancern/asm2vec](https://github.com/Lancern/asm2vec)  
@@ -46,141 +33,20 @@ Following is the benchmark of training 1000 functions in 1 epoch.
 
 ## Get Started
 
-```bash
-python scripts/bin2asm.py -i /bin/ -o asm/
-```
-
-First generate asm files from binarys under `/bin/`.  
-You can hit `Ctrl+C` anytime when there is enough data.
-
-```bash
-python scripts/train.py -i asm/ -l 100 -o model.pt --epochs 100
-```
-
-Try to train the model using only 100 functions and 100 epochs for a taste.  
-Then you can use more data if you want.
-
-```bash
-python scripts/test.py -i asm/123456 -m model.pt
-```
-
-After you train your model, try to grab an assembly function and see the result.  
-This script will show you how the model perform.  
-Once you satisfied, you can take out the embedding vector of the function and do whatever you want with it.
+### TODO - update this with a description of how to use the package
 
-## Usage
+## Tests
 
-### bin2asm.py
+### Run test suite
 
-```
-Usage: bin2asm.py [OPTIONS]
+* Run all tests: ``python -m unittest discover -v``
+* Run a certain module's tests: ``python -m unittest -v test.test_binary_to_asm``
+* Run a certain test class: ``python -m unittest -v test.test_binary_to_asm.TestBinaryToAsm``
+* Run a certain test method: 
 
-  Extract assembly functions from binary executable
+  ``python -m unittest -v test.test_binary_to_asm.TestBinaryToAsm.test_sha3``
 
-Options:
-  -i, --input TEXT   input directory / file  [required]
-  -o, --output TEXT  output directory
-  -l, --len INTEGER  ignore assembly code with instructions amount smaller
-                     than minlen
+### Coverage
 
-  --help             Show this message and exit.
-```
-
-```bash
-# Example
-python bin2asm.py -i /bin/ -o asm/
-```
-
-### train.py
-
-```
-Usage: train.py [OPTIONS]
-
-Options:
-  -i, --input TEXT                training data folder  [required]
-  -o, --output TEXT               output model path  [default: model.pt]
-  -m, --model TEXT                load previous trained model path
-  -l, --limit INTEGER             limit the number of functions to be loaded
-  -d, --ebedding-dimension INTEGER
-                                  embedding dimension  [default: 100]
-  -b, --batch-size INTEGER        batch size  [default: 1024]
-  -e, --epochs INTEGER            training epochs  [default: 10]
-  -n, --neg-sample-num INTEGER    negative sampling amount  [default: 25]
-  -a, --calculate-accuracy        whether calculate accuracy ( will be
-                                  significantly slower )
-
-  -c, --device TEXT               hardware device to be used: cpu / cuda /
-                                  auto  [default: auto]
-
-  -lr, --learning-rate FLOAT      learning rate  [default: 0.02]
-  --help                          Show this message and exit.
-```
-
-```bash
-# Example
-python train.py -i asm/ -o model.pt --epochs 100
-```
-
-### test.py
-
-```
-Usage: test.py [OPTIONS]
-
-Options:
-  -i, --input TEXT              target function  [required]
-  -m, --model TEXT              model path  [required]
-  -e, --epochs INTEGER          training epochs  [default: 10]
-  -n, --neg-sample-num INTEGER  negative sampling amount  [default: 25]
-  -l, --limit INTEGER           limit the amount of output probability result
-  -c, --device TEXT             hardware device to be used: cpu / cuda / auto
-                                [default: auto]
-
-  -lr, --learning-rate FLOAT    learning rate  [default: 0.02]
-  -p, --pretty                  pretty print table  [default: False]
-  --help                        Show this message and exit.
-```
-
-```bash
-# Example
-python test.py -i asm/123456 -m model.pt
-```
-
-```
-┌──────────────────────────────────────────┐
-│    endbr64                               │
-│  ➔ push r15                              │
-│    push r14                              │
-├────────┬─────────────────────────────────┤
-│ 34.68% │ [rdx + rsi*CONST + CONST]       │
-│ 20.29% │ push                            │
-│ 16.22% │ r15                             │
-│ 04.36% │ r14                             │
-│ 03.55% │ r11d                            │
-└────────┴─────────────────────────────────┘
-```
-
-### compare.py
-
-```
-Usage: compare.py [OPTIONS]
-
-Options:
-  -i1, --input1 TEXT          target function 1  [required]
-  -i2, --input2 TEXT          target function 2  [required]
-  -m, --model TEXT            model path  [required]
-  -e, --epochs INTEGER        training epochs  [default: 10]
-  -c, --device TEXT           hardware device to be used: cpu / cuda / auto
-                              [default: auto]
-
-  -lr, --learning-rate FLOAT  learning rate  [default: 0.02]
-  --help                      Show this message and exit.
-```
-
-```bash
-# Example
-python compare.py -i1 asm/123456 -i2 asm/654321 -m model.pt -e 30
-```
-
-```
-cosine similarity : 0.873684
-```
+* Create report: ``coverage run -m unittest discover -v``
+* Read report: ``coverage report -m``
\ No newline at end of file
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 0000000..c478391
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,26 @@
+Thanks for helping make GitHub safe for everyone.
+
+# Security
+
+Jamf takes the security of our software products and services seriously, including all of the open source code repositories managed through our GitHub organizations, such as asm2vec-pytorch.
+
+We will ensure that your finding gets passed along to the appropriate maintainers for remediation.
+
+# Reporting Security Issues
+
+If you believe you have found a security vulnerability in any Jamf-owned repository, please report it to us through coordinated disclosure.
+
+Please do not report security vulnerabilities through public GitHub issues, discussions, or pull requests.
+
+Instead, please send an email to info[@]jamf.com.
+
+Please include as much of the information listed below as you can to help us better understand and resolve the issue:
+- The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting)
+- Full paths of source file(s) related to the manifestation of the issue 
+- The location of the affected source code (tag/branch/commit or direct URL)
+- Any special configuration required to reproduce the issue 
+- Step-by-step instructions to reproduce the issue 
+- Proof-of-concept or exploit code (if possible)
+- Impact of the issue, including how an attacker might exploit the issue
+
+This information will help us triage your report more quickly.
diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py
index 0962ef8..6e9d963 100644
--- a/asm2vec/__init__.py
+++ b/asm2vec/__init__.py
@@ -1,6 +1,9 @@
-import importlib
+import os
 
-__all__ = ['model', 'datatype', 'utils']
+__home__ = os.path.dirname(os.path.abspath(__path__[0]))
+__data__ = os.path.join(__home__, "data")
 
-for module in __all__:
-    importlib.import_module(f'.{module}', 'asm2vec')
+__all__ = [
+    "__data__", "__home__", "binary_to_asm", "data", "datatype", "model", "similarity", "tensors", "test", "train",
+    "utilities", "version"
+]
diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py
new file mode 100644
index 0000000..1da1389
--- /dev/null
+++ b/asm2vec/binary_to_asm.py
@@ -0,0 +1,168 @@
+import re
+import os
+import hashlib
+import r2pipe
+import logging
+from pathlib import Path
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+def _sha3(asm: str) -> str:
+    """
+    Computes the SHA3-256 digest of an assembly function's text
+    :param asm: Assembly function text; encoded with str.encode() defaults before hashing
+    :return: Hexadecimal digest string
+    """
+    return hashlib.new('sha3_256', asm.encode()).hexdigest()
+
+
+def _valid_exe(filename: str, magic_bytes: list[str]) -> bool:
+    """
+    Checks whether a file starts with one of the accepted magic numbers
+    :param filename: Path of the file to inspect (opened in binary mode)
+    :param magic_bytes: Accepted magic numbers as hex strings, e.g. 'cafebabe'
+    :return: True if the first 4 bytes of the file equal one of the decoded magic numbers
+    """
+    accepted = list(map(bytes.fromhex, magic_bytes))
+    with open(filename, 'rb') as binary:
+        leading = binary.read(4)
+    return leading in accepted
+
+
+def _normalize(opcode: str) -> str:
+    """
+    Normalizes an opcode: ' - ' becomes ' + '; hex literals and a digit following '*' or a space become CONST
+    :param opcode: Opcode text of a single instruction
+    :return: Opcode text after the replacements have been applied in order
+    """
+    rewritten = opcode.replace(' - ', ' + ')
+    # Order matters: hex literals first, then a digit after '*', then a digit after a space.
+    for pattern, token in ((r'0x[0-9a-f]+', 'CONST'), (r'\*[0-9]', '*CONST'), (r' [0-9]', ' CONST')):
+        rewritten = re.sub(pattern, token, rewritten)
+    return rewritten
+
+
+def _fn_to_asm(pdf: dict | None, asm_minlen: int) -> str:
+    """
+    Renders one disassembled function as assembly text with symbolic jump labels
+    :param pdf: Disassembly dict holding an 'ops' list, or None
+    :param asm_minlen: Functions with fewer than this many ops are rejected
+    :return: ASM string; '' if pdf is None, the function is too short, or any op has type 'invalid'
+    """
+    if pdf is None:
+        return ''
+    ops = pdf['ops']
+    if len(ops) < asm_minlen or any(op['type'] == 'invalid' for op in ops):
+        return ''
+
+    # A set makes the "is this jump target inside the function?" test O(1) per op instead of O(len(ops)).
+    scope = {op['offset'] for op in ops}
+    assert (None not in scope)
+    # Each in-function jump target is labelled with the index of the first op that jumps to it.
+    labels = {}
+    for i, op in enumerate(ops):
+        if op.get('jump') in scope:
+            labels.setdefault(op.get('jump'), i)
+
+    lines = []
+    for op in ops:
+        if op['offset'] in labels:
+            lines.append(f'LABEL{labels[op["offset"]]}:\n')
+        if op.get('jump') in labels:
+            lines.append(f' {op["type"]} LABEL{labels[op["jump"]]}\n')
+        else:
+            lines.append(f' {_normalize(op["opcode"])}\n')
+    # Joining once avoids repeated string reallocation from += in the loop.
+    return ''.join(lines)
+
+
+def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int, magic_bytes: list[str]) -> int:
+    """
+    Fragments the input binary into assembly functions via r2pipe; each function is written to output_path
+    in a file named after the SHA3 of its assembly body, preceded by .name/.offset/.file metadata lines
+    :param filename: path of the binary to disassemble
+    :param output_path: path to the folder to store the assembly functions of this binary
+    :param asm_minlen: the minimum length of assembly functions to be extracted
+    :param magic_bytes: accepted magic numbers (hex strings) for the specific OS/type of binary
+    :return: the number of assembly functions written
+    """
+    if not _valid_exe(filename, magic_bytes):
+        logging.info('The input file is invalid.')
+        return 0
+
+    count = 0
+    r = r2pipe.open(str(filename))
+    try:
+        r.cmd('aaaa')
+        for fn in r.cmdj('aflj') or []:  # tolerate an empty/None function list
+            r.cmd(f's {fn["offset"]}')
+            asm = _fn_to_asm(r.cmdj('pdfj'), asm_minlen)
+            if not asm:
+                continue
+            # One metadata entry per line, each newline-terminated, so the first instruction stays on its own line.
+            header = f' .name {fn["name"]}\n .offset {fn["offset"]:016x}\n .file {filename.name}\n'
+            with open(os.path.join(output_path, _sha3(asm)), 'w') as file:
+                file.write(header + asm)
+            count += 1
+    finally:
+        r.quit()  # always terminate the radare2 session, even if disassembly raised
+    return count
+
+
+def convert_to_asm(
+        input_path: str, output_path: str, minlen_upper: int, minlen_lower: int, magic_bytes: list[str] = None
+) -> list:
+    """
+    Extracts assembly functions from malware files and saves them into separate folder per binary
+    :param input_path: Path to the folder of malware binaries
+    :param output_path: Path for the assembly functions to be extracted
+    :param minlen_upper: Minimum number of instructions a function needs in order to be extracted (first attempt)
+    :param minlen_lower: Fallback minimum used for a second attempt when the first attempt with minlen_upper
+        yields no function for a binary
+    :param magic_bytes: Valid magic numbers for the binary type; when None or empty, exactly these are used:
+        - 'cffaedfe': for Mach-O Little Endian (64-bit)
+        - 'feedfacf': for Mach-O Big Endian (64-bit)
+        - 'cefaedfe': for Mach-O Little Endian (32-bit)
+        - 'feedface': Mach-O Big Endian (32-bit)
+        - 'cafebabe':  Universal Binary Big Endian
+        - 'bebafeca'
+    :return: List of the names of the binaries that were disassembled in this call
+    """
+    if not magic_bytes:
+        magic_bytes = ['cffaedfe', 'feedfacf', 'cafebabe', 'cefaedfe', 'feedface', 'bebafeca']
+
+    binary_dir = Path(input_path)
+    asm_dir = Path(output_path)
+    os.makedirs(asm_dir, exist_ok=True)  # also creates missing parent folders
+
+    binary_count, not_found = 0, 0
+    disassembled_bins = []
+
+    if os.path.isdir(binary_dir):
+        with os.scandir(binary_dir) as entries:
+            for entry in entries:
+                out_dir = os.path.join(asm_dir, entry.name)
+                # Entries whose output folder already exists are skipped, as are entries that are not
+                # regular files (opening a sub-folder as a binary would raise).
+                if os.path.exists(out_dir) or not entry.is_file():
+                    continue
+                os.mkdir(out_dir)
+                # Try the strict minimum first; retry with the relaxed one only if nothing was extracted.
+                function_count = (bin_to_asm(Path(entry), Path(out_dir), minlen_upper, magic_bytes)
+                                  or bin_to_asm(Path(entry), Path(out_dir), minlen_lower, magic_bytes))
+                if function_count:
+                    binary_count += 1
+                    disassembled_bins.append(entry.name)
+                else:
+                    os.rmdir(out_dir)
+                    not_found += 1  # previously never counted, so the summary always reported 0
+                    logging.info('The binary {} was not disassembled'.format(entry.name))
+    else:
+        not_found += 1
+        logging.info("[Error] No such file or directory: {}".format(binary_dir))
+
+    logging.info("Total scanned binaries: {}".format(binary_count))
+    logging.info("Not converted binaries: {}".format(not_found))
+
+    return disassembled_bins
diff --git a/asm2vec/data.py b/asm2vec/data.py
new file mode 100644
index 0000000..6713c38
--- /dev/null
+++ b/asm2vec/data.py
@@ -0,0 +1,43 @@
+import os
+from pathlib import Path
+from torch.utils.data import Dataset
+
+from asm2vec.datatype import Tokens, Function
+
+
+class AsmDataset(Dataset):
+    """Dataset over two parallel sequences: item `index` is the pair (x[index], y[index])."""
+    def __init__(self, x, y):
+        self.x = x
+        self.y = y
+
+    def __len__(self):
+        return len(self.x)  # length is taken from x alone; y is assumed to be equally long
+
+    def __getitem__(self, index):
+        return self.x[index], self.y[index]
+
+
+def load_data(paths, limit=None):
+    """Loads assembly files from `paths` (one path or a list; a directory contributes its regular files in
+    sorted name order) and returns (functions, tokens); a truthy `limit` caps how many files are read."""
+    candidates = paths if type(paths) is list else [paths]
+
+    collected = []
+    for candidate in candidates:
+        if os.path.isdir(candidate):
+            root = Path(candidate)
+            collected.extend(root / name for name in sorted(os.listdir(candidate)) if os.path.isfile(root / name))
+        else:
+            collected.append(Path(candidate))
+
+    functions, tokens = [], Tokens()
+    for position, source in enumerate(collected):
+        if limit and position >= limit:
+            break
+        with open(source) as handle:
+            parsed = Function.load(handle.read())
+        functions.append(parsed)
+        tokens.add(parsed.tokens())
+
+    return functions, tokens
diff --git a/asm2vec/datatype.py b/asm2vec/datatype.py
index a3cd39b..f618800 100644
--- a/asm2vec/datatype.py
+++ b/asm2vec/datatype.py
@@ -2,19 +2,25 @@
 import random
 import warnings
 
+# TODO - doc strings
+
+
 class Token:
     def __init__(self, name, index):
         self.name = name
         self.index = index
         self.count = 1
+
     def __str__(self):
         return self.name
 
+
 class Tokens:
     def __init__(self, name_to_index=None, tokens=None):
         self.name_to_index = name_to_index or {}
         self.tokens = tokens or []
         self._weights = None
+
     def __getitem__(self, key):
         if type(key) is str:
             if self.name_to_index.get(key) is None:
@@ -28,13 +34,17 @@ def __getitem__(self, key):
                 return [self[k] for k in key]
             except:
                 raise ValueError
+
     def load_state_dict(self, sd):
         self.name_to_index = sd['name_to_index']
         self.tokens = sd['tokens']
+
     def state_dict(self):
         return {'name_to_index': self.name_to_index, 'tokens': self.tokens}
+
     def size(self):
         return len(self.tokens)
+
     def add(self, names):
         self._weights = None
         if type(names) is not list:
@@ -46,6 +56,7 @@ def add(self, names):
                 self.tokens.append(token)
             else:
                 self.tokens[self.name_to_index[name]].count += 1
+
     def update(self, tokens_new):
         for token in tokens_new:
             if token.name not in self.name_to_index:
@@ -54,6 +65,7 @@ def update(self, tokens_new):
                 self.tokens.append(token)
             else:
                 self.tokens[self.name_to_index[token.name]].count += token.count
+
     def weights(self):
         # if no cache, calculate
         if self._weights is None:
@@ -62,19 +74,22 @@ def weights(self):
             for token in self.tokens:
                 self._weights[token.index] = (token.count / total) ** 0.75
         return self._weights
+
     def sample(self, batch_size, num=5):
         return torch.multinomial(self.weights(), num * batch_size, replacement=True).view(batch_size, num)
 
+
 class Function:
     def __init__(self, insts, blocks, meta):
         self.insts = insts
         self.blocks = blocks
         self.meta = meta
+
     @classmethod
     def load(cls, text):
-        '''
-        gcc -S format compatiable
-        '''
+        """gcc -S format compatible
+        """
+
         label, labels, insts, blocks, meta = None, {}, [], [], {}
         for line in text.strip('\n').split('\n'):
             if line[0] in [' ', '\t']:
@@ -109,10 +124,13 @@ def load(cls, text):
                 if labels.get(arg):
                     inst.args[i] = 'CONST'
         return cls(insts, blocks, meta)
+
     def tokens(self):
         return [token for inst in self.insts for token in inst.tokens()]
+
     def random_walk(self, num=3):
         return [self._random_walk() for _ in range(num)]
+
     def _random_walk(self):
         current, visited, seq = self.blocks[0], [], []
         while current not in visited:
@@ -124,25 +142,31 @@ def _random_walk(self):
             current = random.choice(list(current.successors))
         return seq
 
+
 class BasicBlock:
     def __init__(self):
         self.insts = []
         self.successors = set()
+
     def add(self, inst):
         self.insts.append(inst)
+
     def end(self):
         inst = self.insts[-1]
         return inst.is_jmp() or inst.op == 'ret'
 
+
 class Instruction:
     def __init__(self, op, args):
         self.op = op
         self.args = args
+
     def __str__(self):
         return f'{self.op} {", ".join([str(arg) for arg in self.args if str(arg)])}'
+
     @classmethod
     def load(cls, text):
-        text = text.strip().strip('bnd').strip() # get rid of BND prefix
+        text = text.strip().strip('bnd').strip()
         op, _, args = text.strip().partition(' ')
         if args:
             args = [arg.strip() for arg in args.split(',')]
@@ -150,9 +174,12 @@ def load(cls, text):
             args = []
         args = (args + ['', ''])[:2]
         return cls(op, args)
+
     def tokens(self):
         return [self.op] + self.args
+
     def is_jmp(self):
         return 'jmp' in self.op or self.op[0] == 'j'
+
     def is_call(self):
         return self.op == 'call'
diff --git a/asm2vec/model.py b/asm2vec/model.py
index 301f3be..51dc433 100644
--- a/asm2vec/model.py
+++ b/asm2vec/model.py
@@ -1,43 +1,82 @@
 import torch
 import torch.nn as nn
 
+from asm2vec.datatype import Tokens
+
 bce, sigmoid, softmax = nn.BCELoss(), nn.Sigmoid(), nn.Softmax(dim=1)
 
+
+# TODO - doc strings
+
+
 class ASM2VEC(nn.Module):
     def __init__(self, vocab_size, function_size, embedding_size):
         super(ASM2VEC, self).__init__()
-        self.embeddings   = nn.Embedding(vocab_size, embedding_size, _weight=torch.zeros(vocab_size, embedding_size))
-        self.embeddings_f = nn.Embedding(function_size, 2 * embedding_size, _weight=(torch.rand(function_size, 2 * embedding_size)-0.5)/embedding_size/2)
-        self.embeddings_r = nn.Embedding(vocab_size, 2 * embedding_size, _weight=(torch.rand(vocab_size, 2 * embedding_size)-0.5)/embedding_size/2)
+        self.embeddings = nn.Embedding(vocab_size, embedding_size, _weight=torch.zeros(vocab_size, embedding_size))
+        self.embeddings_f = nn.Embedding(function_size, 2 * embedding_size,
+                                         _weight=(torch.rand(function_size, 2 * embedding_size)-0.5)/embedding_size/2)
+        self.embeddings_r = nn.Embedding(vocab_size, 2 * embedding_size,
+                                         _weight=(torch.rand(vocab_size, 2 * embedding_size)-0.5)/embedding_size/2)
 
     def update(self, function_size_new, vocab_size_new):
         device = self.embeddings.weight.device
-        vocab_size, function_size, embedding_size = self.embeddings.num_embeddings, self.embeddings_f.num_embeddings, self.embeddings.embedding_dim
+        vocab_size, function_size, embedding_size = (self.embeddings.num_embeddings,
+                                                     self.embeddings_f.num_embeddings, self.embeddings.embedding_dim)
         if vocab_size_new != vocab_size:
-            weight = torch.cat([self.embeddings.weight, torch.zeros(vocab_size_new - vocab_size, embedding_size).to(device)])
+            weight = torch.cat([self.embeddings.weight, torch.zeros(vocab_size_new - vocab_size, embedding_size).
+                               to(device)])
             self.embeddings = nn.Embedding(vocab_size_new, embedding_size, _weight=weight)
-            weight_r = torch.cat([self.embeddings_r.weight, ((torch.rand(vocab_size_new - vocab_size, 2 * embedding_size)-0.5)/embedding_size/2).to(device)])
+            weight_r = torch.cat([self.embeddings_r.weight,
+                                  ((torch.rand(vocab_size_new - vocab_size, 2 * embedding_size)-0.5)/embedding_size/2)
+                                 .to(device)])
             self.embeddings_r = nn.Embedding(vocab_size_new, 2 * embedding_size, _weight=weight_r)
-        self.embeddings_f = nn.Embedding(function_size_new, 2 * embedding_size, _weight=((torch.rand(function_size_new, 2 * embedding_size)-0.5)/embedding_size/2).to(device))
+        self.embeddings_f = nn.Embedding(function_size_new, 2 * embedding_size,
+                                         _weight=((torch.rand(function_size_new, 2 * embedding_size)-0.5) /
+                                                  embedding_size/2).to(device))
 
     def v(self, inp):
-        e  = self.embeddings(inp[:,1:])
-        v_f = self.embeddings_f(inp[:,0])
-        v_prev = torch.cat([e[:,0], (e[:,1] + e[:,2]) / 2], dim=1)
-        v_next = torch.cat([e[:,3], (e[:,4] + e[:,5]) / 2], dim=1)
+        e = self.embeddings(inp[:, 1:])
+        v_f = self.embeddings_f(inp[:, 0])
+        v_prev = torch.cat([e[:, 0], (e[:, 1] + e[:, 2]) / 2], dim=1)
+        v_next = torch.cat([e[:, 3], (e[:, 4] + e[:, 5]) / 2], dim=1)
         v = ((v_f + v_prev + v_next) / 3).unsqueeze(2)
         return v
 
     def forward(self, inp, pos, neg):
         device, batch_size = inp.device, inp.shape[0]
         v = self.v(inp)
-        # negative sampling loss
         pred = torch.bmm(self.embeddings_r(torch.cat([pos, neg], dim=1)), v).squeeze()
         label = torch.cat([torch.ones(batch_size, 3), torch.zeros(batch_size, neg.shape[1])], dim=1).to(device)
         return bce(sigmoid(pred), label)
 
-    def predict(self, inp, pos):
+    def predict(self, inp, pos):  # Why is pos not used? Why does Predict differ so much from Forward?
         device, batch_size = inp.device, inp.shape[0]
         v = self.v(inp)
-        probs = torch.bmm(self.embeddings_r(torch.arange(self.embeddings_r.num_embeddings).repeat(batch_size, 1).to(device)), v).squeeze(dim=2)
+        probs = torch.bmm(self.embeddings_r(torch.arange(self.embeddings_r.num_embeddings).repeat(batch_size, 1).
+                                            to(device)), v).squeeze(dim=2)
         return softmax(probs)
+
+
+def save_model(path: str, model: ASM2VEC, tokens: Tokens) -> None:
+    """
+    Writes a checkpoint to `path` holding the ASM2VEC constructor arguments, model weights and token state
+    """
+    embeddings = model.embeddings
+    # (vocab_size, function_size, embedding_size), i.e. the positional arguments of ASM2VEC.__init__
+    model_params = (embeddings.num_embeddings, model.embeddings_f.num_embeddings, embeddings.embedding_dim)
+    checkpoint = {
+        'model_params': model_params,
+        'model': model.state_dict(),
+        'tokens': tokens.state_dict(),
+    }
+    torch.save(checkpoint, path)
+
+
+def load_model(path: str, device: str = 'cpu') -> tuple[ASM2VEC, Tokens]:
+    """Restores (model, tokens) from the checkpoint at `path`; tensors are mapped to `device`."""
+    checkpoint = torch.load(path, map_location=device)
+    model = ASM2VEC(*checkpoint['model_params'])
+    model.load_state_dict(checkpoint['model'])
+    tokens = Tokens()
+    tokens.load_state_dict(checkpoint['tokens'])
+    return model.to(device), tokens
diff --git a/asm2vec/similarity.py b/asm2vec/similarity.py
new file mode 100644
index 0000000..ea52327
--- /dev/null
+++ b/asm2vec/similarity.py
@@ -0,0 +1,48 @@
+import torch
+
+from asm2vec.data import load_data
+from asm2vec.model import load_model
+from asm2vec.train import train
+
+
+def cosine_similarity(v1, v2) -> float:
+    return torch.matmul(v1, v2).div(v1.norm() * v2.norm()).item()  # v1·v2 over the product of the norms
+
+
+def compare_two(
+        data_path_1: str, data_path_2: str, model_path: str, epochs: int = 10, device: str = "cpu",
+        learning_rate: float = 0.02
+) -> float:
+    """Fits function embeddings for two assembly functions and reports their cosine similarity
+    :param data_path_1: path of the first assembly function
+    :param data_path_2: path of the second assembly function
+    :param model_path: path of the trained asm2vec model checkpoint
+    :param epochs: epochs passed to train() for fitting the two function embeddings (default 10)
+    :param device: 'auto' | 'cuda' | 'cpu'; 'auto' picks cuda when torch reports it available (default 'cpu')
+    :param learning_rate: learning rate passed to train() (default 0.02)
+    :return: the cosine similarity value (also printed to stdout)
+    """
+    if device == "auto":
+        device = "cpu" if not torch.cuda.is_available() else "cuda"
+
+    model, vocabulary = load_model(model_path, device=device)
+    pair, pair_tokens = load_data([data_path_1, data_path_2])
+    vocabulary.update(pair_tokens)
+    # reserve two function embeddings (assumes each path is a single function file - TODO confirm)
+    model.update(2, vocabulary.size())
+    model = model.to(device)
+
+    fit_options = {
+        "model": model,
+        "epochs": epochs,
+        "device": device,
+        "mode": "update",
+        "learning_rate": learning_rate,
+    }
+    fitted = train(pair, vocabulary, **fit_options)
+
+    first, second = fitted.to("cpu").embeddings_f(torch.tensor([0, 1]))
+    similarity = cosine_similarity(first, second)
+    print(f"Cosine similarity : {similarity:.6f}")
+
+    return similarity
diff --git a/asm2vec/tensors.py b/asm2vec/tensors.py
new file mode 100644
index 0000000..78a356e
--- /dev/null
+++ b/asm2vec/tensors.py
@@ -0,0 +1,73 @@
+import os
+import torch
+import logging
+from pathlib import Path
+
+from asm2vec.train import train, load_model, load_data
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+def calc_tensors(
+        asm_path: str, tensor_path: str, model_path: str, epochs: int, device: str = 'cpu', learning_rate: float = 0.02
+) -> list:
+    """
+    Calculates vector representation of a binary as the mean per column of the vector representations of its assembly
+    functions. Binaries whose tensor file already exists in tensor_path are skipped.
+    :param asm_path: Path to folder with assembly function in a sub-folder per binary
+    :param tensor_path: Path to folder to store the tensors (one file per binary, named after its sub-folder)
+    :param model_path: Path to the trained model
+    :param epochs: Number of epochs
+    :param device: 'auto' | 'cuda' | 'cpu'
+    :param learning_rate: Learning rate
+    :return: List of the names of the binaries for which a tensor file was written in this call
+    """
+    tensors_list = []
+    if device == 'auto':
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+    if not os.path.isfile(model_path):
+        print("No valid model")
+        return []
+    model, tokens = load_model(model_path, device=device)
+
+    dir0 = Path(tensor_path)
+    os.makedirs(dir0, exist_ok=True)
+
+    if not os.path.isdir(asm_path):
+        logging.info("No valid directory")
+        return tensors_list
+
+    with os.scandir(asm_path) as entries:
+        for entry in entries:
+            tensor_file = os.path.join(dir0, entry.name)
+            # Skip non-folders, empty folders, and binaries whose tensor already exists.
+            if not entry.is_dir() or not os.listdir(entry) or os.path.exists(tensor_file):
+                continue
+            functions, tokens_new = load_data([entry])
+            # The number of function embeddings must equal the number of functions actually loaded.
+            # A recursive file count (os.walk) would also count nested files that were never loaded,
+            # adding never-trained random rows to the mean below.
+            file_count = len(functions)
+            if file_count == 0:
+                continue
+            tokens.update(tokens_new)
+            logging.info(f"Binary {entry.name}: {file_count} assembly functions")
+            model.update(file_count, tokens.size())
+            model = model.to(device)
+            model = train(
+                functions,
+                tokens,
+                model=model,
+                epochs=epochs,
+                device=device,
+                mode='update',
+                learning_rate=learning_rate
+            )
+
+            tensor = model.to('cpu').embeddings_f(torch.tensor([list(range(0, file_count))]))
+            # A single function is saved un-squeezed (as before); otherwise save the column-wise mean.
+            torch.save(tensor if file_count == 1 else torch.squeeze(tensor).mean(0), tensor_file)
+            tensors_list.append(entry.name)
+
+    return tensors_list
diff --git a/asm2vec/test.py b/asm2vec/test.py
new file mode 100644
index 0000000..b80cc14
--- /dev/null
+++ b/asm2vec/test.py
@@ -0,0 +1,39 @@
+import torch
+
+from asm2vec.data import load_data
+from asm2vec.model import load_model
+from asm2vec.train import train, preprocess
+from asm2vec.utilities import show_probs
+
+
+def test_model(
+        data_path: str, model_path: str, epochs: int = 10, neg_sample_num: int = 25, limit: int | None = None,
+        device: str = "cpu", learning_rate: float = 0.02, pretty: bool = False
+) -> None:
+    """Load a saved model, train it in 'update' mode on data_path and print the predicted token probabilities."""
+    if device == "auto":  # "auto" selects CUDA when torch reports it as available, otherwise the CPU
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # load the saved model and its tokens, then update them with the tokens loaded from data_path
+    model, tokens = load_model(model_path, device=device)
+    functions, tokens_new = load_data(data_path)
+    tokens.update(tokens_new)
+    model.update(1, tokens.size())
+    model = model.to(device)
+
+    # train in 'update' mode, in which train() only optimises the function embeddings (embeddings_f)
+    model = train(
+        functions,
+        tokens,
+        model=model,
+        epochs=epochs,
+        neg_sample_num=neg_sample_num,
+        device=device,
+        mode="update",
+        learning_rate=learning_rate
+    )
+
+    # show predicted probability results; limit and pretty are passed straight through to show_probs()
+    x, y = preprocess(functions, tokens)
+    probs = model.predict(x.to(device), y.to(device))
+    show_probs(x, y, probs, tokens, limit=limit, pretty=pretty)
diff --git a/asm2vec/train.py b/asm2vec/train.py
new file mode 100644
index 0000000..4de7a81
--- /dev/null
+++ b/asm2vec/train.py
@@ -0,0 +1,134 @@
+import time
+import torch
+from pathlib import Path
+from torch.utils.data import DataLoader
+from asm2vec.data import AsmDataset, load_data
+from asm2vec.datatype import Function, Tokens
+from asm2vec.model import ASM2VEC, load_model, save_model
+from asm2vec.utilities import accuracy, callback
+
+
+def preprocess(functions, tokens):  # -> (inputs, targets) index tensors, one row per inner walk position
+    inputs, targets = [], []
+    for fn_index, function in enumerate(functions):
+        for walk in function.random_walk():
+            for pos in range(len(walk) - 2):  # pos / pos + 1 / pos + 2 = previous / current / next instruction
+                inputs.append([fn_index] + [tokens[t].index for t in walk[pos].tokens() + walk[pos + 2].tokens()])
+                targets.append([tokens[t].index for t in walk[pos + 1].tokens()])
+    return torch.tensor(inputs), torch.tensor(targets)
+
+
+def train(
+        functions: list[Function], tokens: Tokens, model: ASM2VEC | None = None, embedding_size: int = 100,
+        batch_size: int = 1024, epochs: int = 10, neg_sample_num: int = 25, calc_acc: bool = False, device: str = 'cpu',
+        mode: str = 'train', verbose: bool = False, learning_rate: float = 0.02
+):
+    """Trains a model on the given assembly functions and tokens and returns the trained model
+    :param functions: list of assembly functions
+    :param tokens: tokens (operations, operands) of the assembly function
+    :param model: model to train; if None in 'train' mode a new ASM2VEC is created; (Optional, default None)
+    :param embedding_size: embedding size passed to a newly created ASM2VEC; (Optional, default value = 100)
+    :param batch_size: batch size used by the training DataLoader; (Optional, default value = 1024)
+    :param epochs: number of epochs for training the model; (Optional, default value = 10)
+    :param neg_sample_num: size of the negative sample; (Optional, default value = 25)
+    :param calc_acc: if True, accuracy is computed on the first batch of every epoch; (Optional, default False)
+    :param device: the device used for processing; (Optional, default 'cpu')
+    :param mode: 'train' (optimise all model parameters) | 'update' (optimise only the function embeddings of an
+        existing model); any other value raises ValueError; (Optional, default 'train')
+    :param verbose: if True, per-epoch progress is reported through callback(); (Optional, default False)
+    :param learning_rate: learning rate of the Adam optimiser; (Optional, default value = 0.02)
+    """
+    if mode == 'train':
+        if model is None:
+            model = ASM2VEC(tokens.size(), function_size=len(functions), embedding_size=embedding_size).to(device)
+        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+    elif mode == 'update':
+        if model is None:
+            raise ValueError("Update mode requires a pretrained model")
+        optimizer = torch.optim.Adam(model.embeddings_f.parameters(), lr=learning_rate)  # function embeddings only
+    else:
+        raise ValueError("Unknown mode")
+
+    loader = DataLoader(AsmDataset(*preprocess(functions, tokens)), batch_size=batch_size, shuffle=True)
+    for epoch in range(epochs):
+        start = time.time()
+        loss_sum, loss_count, accs = 0.0, 0, []
+
+        model.train()
+        for i, (inp, pos) in enumerate(loader):
+            neg = tokens.sample(inp.shape[0], neg_sample_num)
+            loss = model(inp.to(device), pos.to(device), neg.to(device))
+            loss_sum, loss_count = loss_sum + loss.item(), loss_count + 1  # .item(): do not keep autograd history
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+
+            if i == 0 and calc_acc:  # accuracy is sampled on the first batch only
+                probs = model.predict(inp.to(device), pos.to(device))
+                accs.append(accuracy(pos, probs))
+
+        if verbose:
+            callback({
+                'model': model,
+                'tokens': tokens,
+                'epoch': epoch,
+                'time': time.time() - start,
+                'loss': loss_sum / loss_count,
+                'accuracy': torch.tensor(accs).mean() if calc_acc else None
+            })
+
+    return model
+
+
+def train_asm2vec_model(
+        train_set: str, new_model: str, model_path: str | None, epochs: int, limit: int | None = None,
+        calc_acc: bool = False, embedding_size: int = 100, batch_size: int = 1024, neg_sample: int = 25,
+        learning_rate: float = 0.02, device: str = 'cpu'
+) -> ASM2VEC:
+    # TODO - this is just a wrapper - can we do this smarter?
+    """Trains an ASM2VEC model and saves it to new_model
+    :param train_set: path to the training dataset
+    :param new_model: path the trained model is saved to
+    :param model_path: path to an already trained model to continue from; if empty or None a new model is trained
+    :param limit: number of the assembly functions that the model will be trained on; if not defined, all the assembly
+        functions in train_set
+    :param epochs: number of epochs
+    :param calc_acc: displays the accuracy per training epoch; setting it to True will slow down the training
+    :param embedding_size: size of the vector representation for a token; an assembly function will be represented
+        with a vector twice that size
+    :param batch_size: the size of batches for training
+    :param neg_sample: negative sampling amount
+    :param device: 'auto' | 'cuda' | 'cpu'
+    :param learning_rate: learning rate
+    :return: an ASM2VEC model
+    """
+    if device == 'auto':
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+    if model_path:
+        model, tokens = load_model(model_path, device=device)
+        functions, tokens_new = load_data(train_set, limit=limit)
+        tokens.update(tokens_new)
+        model.update(len(functions), tokens.size())
+        model = model.to(device)  # update() may have added parameters; make sure all of them are on `device`
+    else:
+        model = None
+        functions, tokens = load_data(Path(train_set), limit=limit)
+
+    model = train(
+        functions,
+        tokens,
+        model=model,
+        embedding_size=embedding_size,
+        batch_size=batch_size,
+        epochs=epochs,
+        neg_sample_num=neg_sample,
+        calc_acc=calc_acc,
+        device=device,
+        verbose=True,
+        learning_rate=learning_rate
+    )
+    save_model(new_model, model, tokens)
+
+    return model
diff --git a/asm2vec/utilities.py b/asm2vec/utilities.py
new file mode 100644
index 0000000..dd39aac
--- /dev/null
+++ b/asm2vec/utilities.py
@@ -0,0 +1,55 @@
+import logging
+import torch
+
+from asm2vec.datatype import Instruction
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+# TODO - Why do we have both logging and print?
+# TODO - Doc strings
+
+def accuracy(y, probs):  # mean over rows of the probability mass found at each row's target indices
+    return torch.tensor([probs[row][target].sum() for row, target in enumerate(y)]).mean()
+
+
+def callback(context) -> None:
+    """Logs one progress line for a training epoch
+    :param context: dict with the keys "epoch", "time", "loss" and "accuracy" (None when it was not computed)
+    """
+    progress = (f'{context["epoch"]} | time = {context["time"]:.2f}, '
+                f'loss = {context["loss"]:.4f}')
+    if context["accuracy"] is not None:  # an accuracy of exactly 0 must still be reported
+        progress += f', accuracy = {context["accuracy"]:.4f}'
+    logging.info(progress)
+
+
+def show_probs(x, y, probs, tokens, limit=None, pretty=False):  # prints one table per (x, y) sample
+    if pretty:  # box-drawing characters when pretty is set, plain ASCII otherwise
+        tl, tr, bl, br = '┌', '┐', '└', '┘'
+        lm, rm, tm, bm = '├', '┤', '┬', '┴'
+        h, v = '─', '│'
+        arrow = ' ➔'
+    else:
+        tl, tr, bl, br = '+', '+', '+', '+'
+        lm, rm, tm, bm = '+', '+', '+', '+'
+        h, v = '-', '|'
+        arrow = '->'
+    top = probs.topk(5)  # the five largest entries along the last dimension of probs, with their indices
+    for i, (xi, yi) in enumerate(zip(x, y)):
+        if limit and i >= limit:  # a limit of None or 0 never stops the loop
+            break
+        xi, yi = xi.tolist(), yi.tolist()
+        print(tl + h * 42 + tr)
+        print(f'{v}    {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {v}')  # xi[0] is not displayed
+        print(f'{v} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {v}')  # the target instruction
+        print(f'{v}    {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {v}')
+        print(lm + h * 8 + tm + h * 33 + rm)
+        for value, index in zip(top.values[i], top.indices[i]):
+            if index in yi:  # highlight (ANSI colour code 92) predictions that are among the target tokens
+                colorbegin, colorclear = '\033[92m', '\033[0m'
+            else:
+                colorbegin, colorclear = '', ''
+            print(f'{v} {colorbegin}{value * 100:05.2f}%{colorclear} {v} {colorbegin}'
+                  f'{tokens[index.item()].name:31}{colorclear} {v}')
+        print(bl + h * 8 + bm + h * 33 + br)
diff --git a/asm2vec/utils.py b/asm2vec/utils.py
deleted file mode 100644
index 4f9aa25..0000000
--- a/asm2vec/utils.py
+++ /dev/null
@@ -1,156 +0,0 @@
-import os
-import time
-import torch
-from torch.utils.data import DataLoader, Dataset
-from pathlib import Path
-from .datatype import Tokens, Function, Instruction
-from .model import ASM2VEC
-
-class AsmDataset(Dataset):
-    def __init__(self, x, y):
-        self.x = x
-        self.y = y
-    def __len__(self):
-        return len(self.x)
-    def __getitem__(self, index):
-        return self.x[index], self.y[index]
-
-def load_data(paths, limit=None):
-    if type(paths) is not list:
-        paths = [paths]
-   
-    filenames = []
-    for path in paths:
-        if os.path.isdir(path):
-            filenames += [Path(path) / filename for filename in sorted(os.listdir(path)) if os.path.isfile(Path(path) / filename)]
-        else:
-            filenames += [Path(path)]
-    
-    functions, tokens = [], Tokens()
-    for i, filename in enumerate(filenames):
-        if limit and i >= limit:
-            break
-        with open(filename) as f:
-            fn = Function.load(f.read())
-            functions.append(fn)
-            tokens.add(fn.tokens())
-    
-    return functions, tokens
-
-def preprocess(functions, tokens):
-    x, y = [], []
-    for i, fn in enumerate(functions):
-        for seq in fn.random_walk():
-            for j in range(1, len(seq) - 1):
-                x.append([i] + [tokens[token].index for token in seq[j-1].tokens() + seq[j+1].tokens()])
-                y.append([tokens[token].index for token in seq[j].tokens()])
-    return torch.tensor(x), torch.tensor(y)
-
-def train(
-    functions,
-    tokens,
-    model=None,
-    embedding_size=100,
-    batch_size=1024,
-    epochs=10,
-    neg_sample_num=25,
-    calc_acc=False,
-    device='cpu',
-    mode='train',
-    callback=None,
-    learning_rate=0.02
-):
-    if mode == 'train':
-        if model is None:
-            model = ASM2VEC(tokens.size(), function_size=len(functions), embedding_size=embedding_size).to(device)
-        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
-    elif mode == 'test':
-        if model is None:
-            raise ValueError("test mode required pretrained model")
-        optimizer = torch.optim.Adam(model.embeddings_f.parameters(), lr=learning_rate)
-    else:
-        raise ValueError("Unknown mode")
-
-    loader = DataLoader(AsmDataset(*preprocess(functions, tokens)), batch_size=batch_size, shuffle=True)
-    for epoch in range(epochs):
-        start = time.time()
-        loss_sum, loss_count, accs = 0.0, 0, []
-
-        model.train()
-        for i, (inp, pos) in enumerate(loader):
-            neg = tokens.sample(inp.shape[0], neg_sample_num)
-            loss = model(inp.to(device), pos.to(device), neg.to(device))
-            loss_sum, loss_count = loss_sum + loss, loss_count + 1
-            
-            optimizer.zero_grad()
-            loss.backward()
-            optimizer.step()
-
-            if i == 0 and calc_acc:
-                probs = model.predict(inp.to(device), pos.to(device))
-                accs.append(accuracy(pos, probs))
-
-        if callback:
-            callback({
-                'model': model,
-                'tokens': tokens,
-                'epoch': epoch,
-                'time': time.time() - start,
-                'loss': loss_sum / loss_count,
-                'accuracy': torch.tensor(accs).mean() if calc_acc else None
-            })
-
-    return model
-
-def save_model(path, model, tokens):
-    torch.save({
-        'model_params': (
-            model.embeddings.num_embeddings,
-            model.embeddings_f.num_embeddings,
-            model.embeddings.embedding_dim
-        ),
-        'model': model.state_dict(),
-        'tokens': tokens.state_dict(),
-    }, path)
-
-def load_model(path, device='cpu'):
-    checkpoint = torch.load(path, map_location=device)
-    tokens = Tokens()
-    tokens.load_state_dict(checkpoint['tokens'])
-    model = ASM2VEC(*checkpoint['model_params'])
-    model.load_state_dict(checkpoint['model'])
-    model = model.to(device)
-    return model, tokens
-
-def show_probs(x, y, probs, tokens, limit=None, pretty=False):
-    if pretty:
-        TL, TR, BL, BR = '┌', '┐', '└', '┘'
-        LM, RM, TM, BM = '├', '┤', '┬', '┴'
-        H, V = '─', '│'
-        arrow = ' ➔'
-    else:
-        TL = TR = BL = BR = '+'
-        LM = RM = TM = BM = '+'
-        H, V = '-', '|'
-        arrow = '->'
-    top = probs.topk(5)
-    for i, (xi, yi) in enumerate(zip(x, y)):
-        if limit and i >= limit:
-            break
-        xi, yi = xi.tolist(), yi.tolist()
-        print(TL + H * 42 + TR)
-        print(f'{V}    {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {V}')
-        print(f'{V} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {V}')
-        print(f'{V}    {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {V}')
-        print(LM + H * 8 + TM + H * 33 + RM)
-        for value, index in zip(top.values[i], top.indices[i]):
-            if index in yi:
-                colorbegin, colorclear = '\033[92m', '\033[0m'
-            else:
-                colorbegin, colorclear = '', ''
-            print(f'{V} {colorbegin}{value*100:05.2f}%{colorclear} {V} {colorbegin}{tokens[index.item()].name:31}{colorclear} {V}')
-        print(BL + H * 8 + BM + H * 33 + BR)
-
-def accuracy(y, probs):
-    return torch.mean(torch.tensor([torch.sum(probs[i][yi]) for i, yi in enumerate(y)]))
-
diff --git a/asm2vec/version.py b/asm2vec/version.py
new file mode 100644
index 0000000..c85dc7e
--- /dev/null
+++ b/asm2vec/version.py
@@ -0,0 +1,4 @@
+VERSION = '1.0.3'
+DEV_VERSION = '0'
+
+radare2_version = "5.8.8"
diff --git a/catalog-info.yaml b/catalog-info.yaml
new file mode 100644
index 0000000..378ab88
--- /dev/null
+++ b/catalog-info.yaml
@@ -0,0 +1,15 @@
+apiVersion: backstage.io/v1alpha1
+kind: Component
+metadata:
+  name: asm2vec-pytorch
+  description: All code running ASM2VEC using PyTorch
+  labels:  # Backstage expects a key/value map here, not a list
+    jira-key: DATASCI
+    language: Python
+  annotations:
+    backstage.io/source-location: url:https://github.com/wandera/asm2vec-pytorch
+spec:
+  type: service
+  lifecycle: production
+  owner: datascience
+  system: datascience
diff --git a/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 b/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80
new file mode 100644
index 0000000..208607f
Binary files /dev/null and b/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 differ
diff --git a/requirements.txt b/requirements.txt
index d92495b..3163633 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,2 @@
 torch>=1.7,<2
-click>=7.1,<8
 r2pipe>=1.5,<2
diff --git a/scripts/bin2asm.py b/scripts/bin2asm.py
deleted file mode 100644
index 2134e8c..0000000
--- a/scripts/bin2asm.py
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/usr/bin/env python3
-import re
-import os
-import click
-import r2pipe
-import hashlib
-from pathlib import Path
-
-def sha3(data):
-    return hashlib.sha3_256(data.encode()).hexdigest()
-
-def validEXE(filename):
-    magics = [bytes.fromhex('7f454c46')]
-    with open(filename, 'rb') as f:
-        header = f.read(4)
-        return header in magics
-
-def normalize(opcode):
-    opcode = opcode.replace(' - ', ' + ')
-    opcode = re.sub(r'0x[0-9a-f]+', 'CONST', opcode)
-    opcode = re.sub(r'\*[0-9]', '*CONST', opcode)
-    opcode = re.sub(r' [0-9]', ' CONST', opcode)
-    return opcode
-
-def fn2asm(pdf, minlen):
-    # check
-    if pdf is None:
-        return
-    if len(pdf['ops']) < minlen:
-        return
-    if 'invalid' in [op['type'] for op in pdf['ops']]:
-        return
-
-    ops = pdf['ops']
-
-    # set label
-    labels, scope = {}, [op['offset'] for op in ops]
-    assert(None not in scope)
-    for i, op in enumerate(ops):
-        if op.get('jump') in scope:
-            labels.setdefault(op.get('jump'), i)
-    
-    # dump output
-    output = ''
-    for op in ops:
-        # add label
-        if labels.get(op.get('offset')) is not None:
-            output += f'LABEL{labels[op["offset"]]}:\n'
-        # add instruction
-        if labels.get(op.get('jump')) is not None:
-            output += f' {op["type"]} LABEL{labels[op["jump"]]}\n'
-        else:
-            output += f' {normalize(op["opcode"])}\n'
-
-    return output
-
-def bin2asm(filename, opath, minlen):
-    # check
-    if not validEXE(filename):
-        return 0
-    
-    r = r2pipe.open(str(filename))
-    r.cmd('aaaa')
-
-    count = 0
-
-    for fn in r.cmdj('aflj'):
-        r.cmd(f's {fn["offset"]}')
-        asm = fn2asm(r.cmdj('pdfj'), minlen)
-        if asm:
-            uid = sha3(asm)
-            asm = f''' .name {fn["name"]}
- .offset {fn["offset"]:016x}
- .file {filename.name}
-''' + asm
-            with open(opath / uid, 'w') as f:
-                f.write(asm)
-                count += 1
-
-    print(f'[+] {filename}')
-
-    return count
-
-@click.command()
-@click.option('-i', '--input', 'ipath', help='input directory / file', required=True)
-@click.option('-o', '--output', 'opath', default='asm', help='output directory')
-@click.option('-l', '--len', 'minlen', default=10, help='ignore assembly code with instructions amount smaller than minlen')
-def cli(ipath, opath, minlen):
-    '''
-    Extract assembly functions from binary executable
-    '''
-    ipath = Path(ipath)
-    opath = Path(opath)
-
-    # create output directory
-    if not os.path.exists(opath):
-        os.mkdir(opath)
-
-    fcount, bcount = 0, 0
-
-    # directory
-    if os.path.isdir(ipath):
-        for f in os.listdir(ipath):
-            if not os.path.islink(ipath / f) and not os.path.isdir(ipath / f):
-                fcount += bin2asm(ipath / f, opath, minlen)
-                bcount += 1
-    # file
-    elif os.path.exists(ipath):
-        fcount += bin2asm(ipath, opath, minlen)
-        bcount += 1
-    else:
-        print(f'[Error] No such file or directory: {ipath}')
-
-    print(f'[+] Total scan binary: {bcount} => Total generated assembly functions: {fcount}')
-
-if __name__ == '__main__':
-    cli()
diff --git a/scripts/compare.py b/scripts/compare.py
deleted file mode 100644
index 3860b83..0000000
--- a/scripts/compare.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import torch
-import torch.nn as nn
-import click
-import asm2vec
-
-def cosine_similarity(v1, v2):
-    return (v1 @ v2 / (v1.norm() * v2.norm())).item()
-
-@click.command()
-@click.option('-i1', '--input1', 'ipath1', help='target function 1', required=True)
-@click.option('-i2', '--input2', 'ipath2', help='target function 2', required=True)
-@click.option('-m', '--model', 'mpath', help='model path', required=True)
-@click.option('-e', '--epochs', default=10, help='training epochs', show_default=True)
-@click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True)
-@click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True)
-def cli(ipath1, ipath2, mpath, epochs, device, lr):
-    if device == 'auto':
-        device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
-    # load model, tokens
-    model, tokens = asm2vec.utils.load_model(mpath, device=device)
-    functions, tokens_new = asm2vec.utils.load_data([ipath1, ipath2])
-    tokens.update(tokens_new)
-    model.update(2, tokens.size())
-    model = model.to(device)
-    
-    # train function embedding
-    model = asm2vec.utils.train(
-        functions,
-        tokens,
-        model=model,
-        epochs=epochs,
-        device=device,
-        mode='test',
-        learning_rate=lr
-    )
-
-    # compare 2 function vectors
-    v1, v2 = model.to('cpu').embeddings_f(torch.tensor([0, 1]))
-
-    print(f'cosine similarity : {cosine_similarity(v1, v2):.6f}')
-
-if __name__ == '__main__':
-    cli()
diff --git a/scripts/test.py b/scripts/test.py
deleted file mode 100644
index 31372aa..0000000
--- a/scripts/test.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import torch
-import torch.nn as nn
-import click
-import asm2vec
-
-@click.command()
-@click.option('-i', '--input', 'ipath', help='target function', required=True)
-@click.option('-m', '--model', 'mpath', help='model path', required=True)
-@click.option('-e', '--epochs', default=10, help='training epochs', show_default=True)
-@click.option('-n', '--neg-sample-num', 'neg_sample_num', default=25, help='negative sampling amount', show_default=True)
-@click.option('-l', '--limit', help='limit the amount of output probability result', type=int)
-@click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True)
-@click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True)
-@click.option('-p', '--pretty', default=False, help='pretty print table', show_default=True, is_flag=True)
-def cli(ipath, mpath, epochs, neg_sample_num, limit, device, lr, pretty):
-    if device == 'auto':
-        device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
-    # load model, tokens
-    model, tokens = asm2vec.utils.load_model(mpath, device=device)
-    functions, tokens_new = asm2vec.utils.load_data(ipath)
-    tokens.update(tokens_new)
-    model.update(1, tokens.size())
-    model = model.to(device)
-
-    # train function embedding
-    model = asm2vec.utils.train(
-        functions,
-        tokens,
-        model=model,
-        epochs=epochs,
-        neg_sample_num=neg_sample_num,
-        device=device,
-        mode='test',
-        learning_rate=lr
-    )
-
-    # show predicted probability results
-    x, y = asm2vec.utils.preprocess(functions, tokens)
-    probs = model.predict(x.to(device), y.to(device))
-    asm2vec.utils.show_probs(x, y, probs, tokens, limit=limit, pretty=pretty)
-
-if __name__ == '__main__':
-    cli()
diff --git a/scripts/train.py b/scripts/train.py
deleted file mode 100644
index 98391f4..0000000
--- a/scripts/train.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import torch
-import click
-import asm2vec
-
-@click.command()
-@click.option('-i', '--input', 'ipath', help='training data folder', required=True)
-@click.option('-o', '--output', 'opath', default='model.pt', help='output model path', show_default=True)
-@click.option('-m', '--model', 'mpath', help='load previous trained model path', type=str)
-@click.option('-l', '--limit', help='limit the number of functions to be loaded', show_default=True, type=int)
-@click.option('-d', '--ebedding-dimension', 'embedding_size', default=100, help='embedding dimension', show_default=True)
-@click.option('-b', '--batch-size', 'batch_size', default=1024, help='batch size', show_default=True)
-@click.option('-e', '--epochs', default=10, help='training epochs', show_default=True)
-@click.option('-n', '--neg-sample-num', 'neg_sample_num', default=25, help='negative sampling amount', show_default=True)
-@click.option('-a', '--calculate-accuracy', 'calc_acc', help='whether calculate accuracy ( will be significantly slower )', is_flag=True)
-@click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True)
-@click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True)
-def cli(ipath, opath, mpath, limit, embedding_size, batch_size, epochs, neg_sample_num, calc_acc, device, lr):
-    if device == 'auto':
-        device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    
-    if mpath:
-        model, tokens = asm2vec.utils.load_model(mpath, device=device)
-        functions, tokens_new = asm2vec.utils.load_data(ipath, limit=limit)
-        tokens.update(tokens_new)
-        model.update(len(functions), tokens.size())
-    else:
-        model = None
-        functions, tokens = asm2vec.utils.load_data(ipath, limit=limit)
-
-    def callback(context):
-        progress = f'{context["epoch"]} | time = {context["time"]:.2f}, loss = {context["loss"]:.4f}'
-        if context["accuracy"]:
-            progress += f', accuracy = {context["accuracy"]:.4f}'
-        print(progress)
-        asm2vec.utils.save_model(opath, context["model"], context["tokens"])
-
-    model = asm2vec.utils.train(
-        functions,
-        tokens,
-        model=model,
-        embedding_size=embedding_size,
-        batch_size=batch_size,
-        epochs=epochs,
-        neg_sample_num=neg_sample_num,
-        calc_acc=calc_acc,
-        device=device,
-        callback=callback,
-        learning_rate=lr
-    )
-
-if __name__ == '__main__':
-    cli()
diff --git a/setup.py b/setup.py
index 62ff843..19a3051 100644
--- a/setup.py
+++ b/setup.py
@@ -1,14 +1,69 @@
+import os
+import sys
+import platform
 from setuptools import setup, find_packages
+from setuptools.command.install import install as _install
+
+from asm2vec.version import VERSION, radare2_version
+
+
+class install(_install):
+    """setuptools ``install`` command that sets up radare2 before running the standard install."""
+    @staticmethod
+    def _setup_radare2() -> None:
+        """On Linux, download and install the pinned radare2 .deb; on other platforms only print a reminder.
+        :raises RuntimeError: if the machine architecture is not supported or one of the install commands fails
+        """
+        if sys.platform.startswith("linux"):  # Install required in Docker images
+            machine = platform.machine()
+            deb_arch = {"aarch64": "arm64", "arm": "arm64", "x86_64": "amd64", "i386": "i386", "i686": "i386"}
+            if machine not in deb_arch:
+                raise RuntimeError(f"No architecture for Linux Machine: '{machine}'")
+            deb = f"radare2_{radare2_version}_{deb_arch[machine]}.deb"
+            commands = [
+                "apt-get update",
+                "apt-get install -y --no-install-recommends wget",
+                f"wget -O /tmp/{deb} https://github.com/radareorg/radare2/releases/download/{radare2_version}/{deb}",
+                f"dpkg -i /tmp/{deb}",
+                "r2pm init",
+                "r2pm update",
+                f"rm /tmp/{deb}"
+            ]
+            for command in commands:
+                if os.system(command) != 0:  # stop at the first command that reports a non-zero status
+                    raise RuntimeError(f"Install radare2 failed: '{command}'")
+        else:
+            print("Ensure 'radare2' is installed...")
+
+    def run(self):
+        """Set up radare2 first, then run the regular install."""
+        self._setup_radare2()
+        _install.run(self)
+
+
+def readme():  # returns the text of README.md from the current working directory
+    with open('README.md', encoding='utf-8') as f:  # explicit encoding: do not depend on the build locale
+        return f.read()
+
+
+def read_requirements():  # requirement lines of requirements.txt, without blanks and '--index-url' options
+    with open('requirements.txt') as f:
+        return [s.strip() for s in f.read().splitlines() if s.strip() and '--index-url' not in s]
+
 
 setup(
     name='asm2vec',
-    version='1.0.0',
-    description='Unofficial implementation of asm2vec using pytorch',
-    install_requires=['torch>=1.7,<2'
-                      'click>=7.1,<8'
-                      'r2pipe>=1.5,<2'],
-    author='oalieno',
-    author_email='jeffrey6910@gmail.com',
+    version=VERSION,
+    description="Jamf's implementation of asm2vec using pytorch",
+    long_description=readme(),
+    author='oalieno/jamf',
+    author_email='jamie.nutter@jamf.com',
     license='MIT License',
-    packages = find_packages(),
+    install_requires=read_requirements(),
+    packages=find_packages(),
+    zip_safe=False,
+    include_package_data=True,
+    test_suite='nose.collector',
+    tests_require=['nose'],
+    cmdclass={'install': install}
 )
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000..472793c
--- /dev/null
+++ b/test/__init__.py
@@ -0,0 +1 @@
+__all__ = ["test_binary_to_asm"]
diff --git a/test/test_binary_to_asm.py b/test/test_binary_to_asm.py
new file mode 100644
index 0000000..ce53411
--- /dev/null
+++ b/test/test_binary_to_asm.py
@@ -0,0 +1,229 @@
+from os import path, mkdir
+from pathlib import Path
+from shutil import rmtree
+from unittest import TestCase
+
+from asm2vec import __data__
+from asm2vec.binary_to_asm import (bin_to_asm, convert_to_asm, _fn_to_asm, _normalize, _sha3, _valid_exe)
+
+
+class TestBinaryToAsm(TestCase):
+    """Tests for the binary-to-assembly helpers imported from asm2vec.binary_to_asm."""
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        """Create the shared fixtures: output directory, sample binary path and a canned pdf dict."""
+        print("\n--- TestBinaryToAsm ---")
+        cls.output_path = "malware_asm/"
+        cls.data_path = path.join(__data__, "5cca32eb8f9c2a024a57ce12e3fb66070662de80")
+        cls.pdf_dict = {
+            'name': 'main',
+            'size': 18,
+            'addr': 4294974144,
+            'ops': [
+                {
+                    'offset': 4294974144,
+                    'esil': 'rbp,8,rsp,-,=[8],8,rsp,-=',
+                    'refptr': 0,
+                    'fcn_addr': 4294974144,
+                    'fcn_last': 4294974161,
+                    'size': 1,
+                    'opcode': 'push rbp',
+                    'disasm': 'push rbp',
+                    'bytes': '55',
+                    'family': 'cpu',
+                    'type': 'rpush',
+                    'reloc': 'False',
+                    'type_num': 268435468,
+                    'type2_num': 0,
+                    'flags': ['main', 'entry0', 'section.0.__TEXT.__text', 'sym.func.100001ac0', 'rip'],
+                    'comment': 'WzAwXSAtci14IHNlY3Rpb24gc2l6ZSA3Mzc2IG5hbWVkIDAuX19URVhULl9fdGV4dA=='
+                },
+                {
+                    'offset': 4294974145,
+                    'esil': 'rsp,rbp,=',
+                    'refptr': 0,
+                    'fcn_addr': 4294974144,
+                    'fcn_last': 4294974159,
+                    'size': 3,
+                    'opcode': 'mov rbp, rsp',
+                    'disasm': 'mov rbp, rsp',
+                    'bytes': '4889e5',
+                    'family': 'cpu',
+                    'type': 'mov',
+                    'reloc': 'False',
+                    'type_num': 9,
+                    'type2_num': 0
+                },
+                {
+                    'offset': 4294974148,
+                    'esil': 'rbx,8,rsp,-,=[8],8,rsp,-=',
+                    'refptr': 0,
+                    'fcn_addr': 4294974144,
+                    'fcn_last': 4294974161,
+                    'size': 1,
+                    'opcode': 'push rbx',
+                    'disasm': 'push rbx',
+                    'bytes': '53',
+                    'family': 'cpu',
+                    'type': 'rpush',
+                    'reloc': 'False',
+                    'type_num': 268435468,
+                    'type2_num': 0
+                },
+                {
+                    'offset': 4294974149,
+                    'esil': 'rax,8,rsp,-,=[8],8,rsp,-=',
+                    'refptr': 0,
+                    'fcn_addr': 4294974144,
+                    'fcn_last': 4294974161,
+                    'size': 1,
+                    'opcode': 'push rax',
+                    'disasm': 'push rax',
+                    'bytes': '50',
+                    'family': 'cpu',
+                    'type': 'rpush',
+                    'reloc': 'False',
+                    'type_num': 268435468,
+                    'type2_num': 0
+                },
+                {
+                    'offset': 4294974150,
+                    'esil': 'rsi,rbx,=',
+                    'refptr': 0,
+                    'fcn_addr': 4294974144,
+                    'fcn_last': 4294974159,
+                    'size': 3,
+                    'opcode': 'mov rbx, rsi',
+                    'disasm': 'mov rbx, rsi',
+                    'bytes': '4889f3',
+                    'family': 'cpu',
+                    'type': 'mov',
+                    'reloc': 'False',
+                    'type_num': 9,
+                    'type2_num': 0
+                },
+                {
+                    'offset': 4294974153,
+                    'ptr': 4294985864,
+                    'esil': '0x2db8,rip,+,[8],rax,=',
+                    'refptr': 8,
+                    'fcn_addr': 4294974144,
+                    'fcn_last': 4294974155,
+                    'size': 7,
+                    'opcode': 'mov rax, qword [rip + 0x2db8]',
+                    'disasm': 'mov rax, qword [0x100004888]',
+                    'bytes': '488b05b82d0000',
+                    'family': 'cpu',
+                    'type': 'mov',
+                    'reloc': 'False',
+                    'type_num': 9,
+                    'type2_num': 0,
+                    'refs': [{'addr': 4294985864, 'type': 'DATA', 'perm': 'r--'}]
+                },
+                {
+                    'offset': 4294974160,
+                    'esil': 'rax,rip,=',
+                    'refptr': 0,
+                    'fcn_addr': 4294974144,
+                    'fcn_last': 4294974160,
+                    'size': 2,
+                    'opcode': 'jmp rax',
+                    'disasm': 'jmp rax',
+                    'bytes': 'ffe0',
+                    'family': 'cpu',
+                    'type': 'rjmp',
+                    'reloc': 'False',
+                    'type_num': 268435458,
+                    'type2_num': 0
+                }
+            ]
+        }
+        # NOTE(review): the path is relative to the current working directory, and mkdir raises if it already exists.
+        mkdir(cls.output_path)
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        """Remove the output directory together with everything the tests wrote into it."""
+        rmtree(cls.output_path)
+
+    def test_sha3(self):
+        """Should return a string of exactly 64 lowercase hex characters"""
+        asm = ("push rbp\n"
+               "mov rbp, rsp\n"
+               "push rbx\n"
+               "push rax\n"
+               "mov rbx, rsi\n"
+               "mov rax, qword [rip + CONST]\n"
+               "jmp rax")
+        self.assertRegex(_sha3(asm), '^[a-f0-9]{64}$')
+
+    def test_valid_exe_when_valid_magic_bytes(self):
+        """Should return True for the sample binary when 'cffaedfe' is an accepted magic number"""
+        magic_bytes = ["cffaedfe"]
+        self.assertEqual(_valid_exe(self.data_path, magic_bytes), True)
+
+    def test_valid_exe_when_not_valid_magic_bytes(self):
+        """Should return False for the sample binary when only 'cafebabe' is accepted"""
+        magic_bytes = ["cafebabe"]
+        self.assertEqual(_valid_exe(self.data_path, magic_bytes), False)
+
+    def test_normalize_when_offset(self):
+        """Should replace the hexadecimal offset in the opcode with CONST"""
+        opcode = "mov rax, qword [rip + 0x2db8]"
+        expected_norm_opcode = "mov rax, qword [rip + CONST]"
+        self.assertEqual(_normalize(opcode), expected_norm_opcode)
+
+    def test_normalize_when_no_offset(self):
+        """Should return an opcode without an offset unchanged"""
+        opcode = "mov rbx, rsi"
+        expected_norm_opcode = "mov rbx, rsi"
+        self.assertEqual(_normalize(opcode), expected_norm_opcode)
+
+    def test_fn_to_asm_returns_empty_string_when_pdf_none(self):
+        """Should return an empty string when the pdf argument is None"""
+        pdf = None
+        asm_min = 5
+        expected_asm = ""
+        self.assertEqual(_fn_to_asm(pdf, asm_min), expected_asm)
+
+    def test_fn_to_asm_returns_empty_string_when_pdfops_shorter_than_minlen(self):
+        """Should return an empty string when the 7 fixture ops are fewer than the minimum of 10"""
+        asm_minlen = 10
+        expected_asm = ""
+        self.assertEqual(_fn_to_asm(self.pdf_dict, asm_minlen), expected_asm)
+
+    def test_fn_to_asm_returns_expected_asm(self):
+        """Should return one normalized opcode per line, each prefixed with a space"""
+        asm_min = 5
+        expected_asm = (" push rbp\n"
+                        " mov rbp, rsp\n"
+                        " push rbx\n"
+                        " push rax\n"
+                        " mov rbx, rsi\n"
+                        " mov rax, qword [rip + CONST]\n"
+                        " jmp rax\n")
+        self.assertEqual(_fn_to_asm(self.pdf_dict, asm_min), expected_asm)
+
+    def test_bin_to_asm_returns_expected_number_of_disassembled_files(self):
+        """Should return 1 as the number of disassembled files for the sample binary"""
+        asm_minlen = 5
+        magic_bytes = ["cffaedfe"]
+        self.assertEqual(bin_to_asm(Path(self.data_path), Path(self.output_path), asm_minlen, magic_bytes), 1)
+
+    def test_bin_to_asm_returns_expected_number_of_disassembled_files_when_pdfops_shorter_than_minlen(self):
+        """Should return 0 disassembled files when asm_minlen is raised to 10"""
+        asm_minlen = 10
+        magic_bytes = ['cffaedfe']
+        self.assertEqual(bin_to_asm(Path(self.data_path), Path(self.output_path), asm_minlen, magic_bytes), 0)
+
+    def test_convert_to_asm_returns_expected_sha1(self):
+        """Should return a list holding only the sample binary's SHA-1 identifier"""
+        input_path = __data__
+        asm_minlen_upper = 10
+        asm_minlen_lower = 5
+        expected_sha1 = ["5cca32eb8f9c2a024a57ce12e3fb66070662de80"]
+        self.assertEqual(
+            convert_to_asm(input_path, self.output_path, asm_minlen_upper, asm_minlen_lower),
+            expected_sha1
+        )