diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/CODEOWNERS b/CODEOWNERS
new file mode 100644
index 0000000..446aa21
--- /dev/null
+++ b/CODEOWNERS
@@ -0,0 +1,32 @@
+# This is a comment.
+# Each line is a file pattern followed by one or more owners.
+
+# These owners will be the default owners for everything in
+# the repo. Unless a later match takes precedence,
+# @global-owner1 and @global-owner2 will be requested for
+# review when someone opens a pull request.
+* @wandera/datascience
+
+# Order is important; the last matching pattern takes the most
+# precedence. When someone opens a pull request that only
+# modifies JS files, only @js-owner and not the global
+# owner(s) will be requested for a review.
+# *.js @js-owner
+
+# You can also use email addresses if you prefer. They'll be
+# used to look up users just like we do for commit author
+# emails.
+#*.go docs@example.com
+
+# The `docs/*` pattern will match files like
+# `docs/getting-started.md` but not further nested files like
+# `docs/build-app/troubleshooting.md`.
+# docs/* docs@example.com
+
+# In this example, @octocat owns any file in an apps directory
+# anywhere in your repository.
+# apps/ @octocat
+
+# In this example, @doctocat owns any file in the `/docs`
+# directory in the root of your repository.
+# /docs/ @doctocat
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..cb6efa5
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,13 @@
+FROM python:3.10.11-slim
+
+ADD . /asm2vec-pytorch
+WORKDIR /asm2vec-pytorch
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ unixodbc-dev \
+ unixodbc \
+ libpq-dev && \
+ pip install -r requirements.txt && \
+ python setup.py install
+
+CMD ["/bin/sh"]
diff --git a/README.md b/README.md
index 7a2043b..637d5db 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# asm2vec-pytorch
-
+
@@ -9,30 +9,17 @@ The details of the model can be found in the original paper: [(sp'19) Asm2Vec: B
## Requirements
-python >= 3.6
-
-| packages | for |
-| --- | --- |
-| r2pipe | `scripts/bin2asm.py` |
-| click | `scripts/*` |
-| torch | almost all code need it |
-
-You also need to install `radare2` to run `scripts/bin2asm.py`. `r2pipe` is just the python interface to `radare2`
-
-If you only want to use the library code, you just need to install `torch`
+* python >= 3.10
+* radare2
+* Packages listed in `requirements.txt`
## Install
```
+pip install -r requirements.txt &&
python setup.py install
```
-or
-
-```
-pip install git+https://github.com/oalieno/asm2vec-pytorch.git
-```
-
## Benchmark
An implementation already exists here: [Lancern/asm2vec](https://github.com/Lancern/asm2vec)
@@ -46,141 +33,20 @@ Following is the benchmark of training 1000 functions in 1 epoch.
## Get Started
-```bash
-python scripts/bin2asm.py -i /bin/ -o asm/
-```
-
-First generate asm files from binarys under `/bin/`.
-You can hit `Ctrl+C` anytime when there is enough data.
-
-```bash
-python scripts/train.py -i asm/ -l 100 -o model.pt --epochs 100
-```
-
-Try to train the model using only 100 functions and 100 epochs for a taste.
-Then you can use more data if you want.
-
-```bash
-python scripts/test.py -i asm/123456 -m model.pt
-```
-
-After you train your model, try to grab an assembly function and see the result.
-This script will show you how the model perform.
-Once you satisfied, you can take out the embedding vector of the function and do whatever you want with it.
+### TODO - update this section with a full description of how to use the library
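+
+In the meantime, here is a minimal sketch (the paths and `<sha1>` placeholders are hypothetical;
+the function names come from this repository):
+
+```python
+from asm2vec.binary_to_asm import convert_to_asm
+from asm2vec.similarity import compare_two
+from asm2vec.train import train_asm2vec_model
+
+# 1. Disassemble binaries into one folder of assembly functions per binary
+convert_to_asm('binaries/', 'asm/', minlen_upper=10, minlen_lower=5)
+
+# 2. Train a model on the extracted functions and save it
+train_asm2vec_model('asm/<sha1>/', 'model.pt', model_path=None, epochs=100)
+
+# 3. Compare two assembly functions with the trained model
+compare_two('asm/<sha1>/<fn1>', 'asm/<sha1>/<fn2>', 'model.pt', epochs=30)
+```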
-## Usage
+## Tests
-### bin2asm.py
+### Run test suite
-```
-Usage: bin2asm.py [OPTIONS]
+* Run all tests: ``python -m unittest discover -v``
+* Run a certain module's tests: ``python -m unittest -v test.test_binary_to_asm``
+* Run a certain test class: ``python -m unittest -v test.test_binary_to_asm.TestBinaryToAsm``
+* Run a certain test method:
- Extract assembly functions from binary executable
+ ``python -m unittest -v test.test_binary_to_asm.TestBinaryToAsm.test_sha3``
-Options:
- -i, --input TEXT input directory / file [required]
- -o, --output TEXT output directory
- -l, --len INTEGER ignore assembly code with instructions amount smaller
- than minlen
+### Coverage
- --help Show this message and exit.
-```
-
-```bash
-# Example
-python bin2asm.py -i /bin/ -o asm/
-```
-
-### train.py
-
-```
-Usage: train.py [OPTIONS]
-
-Options:
- -i, --input TEXT training data folder [required]
- -o, --output TEXT output model path [default: model.pt]
- -m, --model TEXT load previous trained model path
- -l, --limit INTEGER limit the number of functions to be loaded
- -d, --ebedding-dimension INTEGER
- embedding dimension [default: 100]
- -b, --batch-size INTEGER batch size [default: 1024]
- -e, --epochs INTEGER training epochs [default: 10]
- -n, --neg-sample-num INTEGER negative sampling amount [default: 25]
- -a, --calculate-accuracy whether calculate accuracy ( will be
- significantly slower )
-
- -c, --device TEXT hardware device to be used: cpu / cuda /
- auto [default: auto]
-
- -lr, --learning-rate FLOAT learning rate [default: 0.02]
- --help Show this message and exit.
-```
-
-```bash
-# Example
-python train.py -i asm/ -o model.pt --epochs 100
-```
-
-### test.py
-
-```
-Usage: test.py [OPTIONS]
-
-Options:
- -i, --input TEXT target function [required]
- -m, --model TEXT model path [required]
- -e, --epochs INTEGER training epochs [default: 10]
- -n, --neg-sample-num INTEGER negative sampling amount [default: 25]
- -l, --limit INTEGER limit the amount of output probability result
- -c, --device TEXT hardware device to be used: cpu / cuda / auto
- [default: auto]
-
- -lr, --learning-rate FLOAT learning rate [default: 0.02]
- -p, --pretty pretty print table [default: False]
- --help Show this message and exit.
-```
-
-```bash
-# Example
-python test.py -i asm/123456 -m model.pt
-```
-
-```
-┌──────────────────────────────────────────┐
-│ endbr64 │
-│ ➔ push r15 │
-│ push r14 │
-├────────┬─────────────────────────────────┤
-│ 34.68% │ [rdx + rsi*CONST + CONST] │
-│ 20.29% │ push │
-│ 16.22% │ r15 │
-│ 04.36% │ r14 │
-│ 03.55% │ r11d │
-└────────┴─────────────────────────────────┘
-```
-
-### compare.py
-
-```
-Usage: compare.py [OPTIONS]
-
-Options:
- -i1, --input1 TEXT target function 1 [required]
- -i2, --input2 TEXT target function 2 [required]
- -m, --model TEXT model path [required]
- -e, --epochs INTEGER training epochs [default: 10]
- -c, --device TEXT hardware device to be used: cpu / cuda / auto
- [default: auto]
-
- -lr, --learning-rate FLOAT learning rate [default: 0.02]
- --help Show this message and exit.
-```
-
-```bash
-# Example
-python compare.py -i1 asm/123456 -i2 asm/654321 -m model.pt -e 30
-```
-
-```
-cosine similarity : 0.873684
-```
+* Create report: ``coverage run -m unittest discover -v``
+* Read report: ``coverage report -m``
\ No newline at end of file
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 0000000..c478391
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,26 @@
+# Security
+
+Thanks for helping make GitHub safe for everyone.
+
+Jamf takes the security of our software products and services seriously, including all of the open source code repositories managed through our GitHub organizations, such as asm2vec-pytorch.
+
+We will ensure that your finding gets passed along to the appropriate maintainers for remediation.
+
+# Reporting Security Issues
+
+If you believe you have found a security vulnerability in any Jamf-owned repository, please report it to us through coordinated disclosure.
+
+Please do not report security vulnerabilities through public GitHub issues, discussions, or pull requests.
+
+Instead, please send an email to info[@]jamf.com.
+
+Please include as much of the information listed below as you can to help us better understand and resolve the issue:
+- The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting)
+- Full paths of source file(s) related to the manifestation of the issue
+- The location of the affected source code (tag/branch/commit or direct URL)
+- Any special configuration required to reproduce the issue
+- Step-by-step instructions to reproduce the issue
+- Proof-of-concept or exploit code (if possible)
+- Impact of the issue, including how an attacker might exploit the issue
+
+This information will help us triage your report more quickly.
diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py
index 0962ef8..6e9d963 100644
--- a/asm2vec/__init__.py
+++ b/asm2vec/__init__.py
@@ -1,6 +1,9 @@
-import importlib
+import os
-__all__ = ['model', 'datatype', 'utils']
+__home__ = os.path.dirname(os.path.abspath(__path__[0]))
+__data__ = os.path.join(__home__, "data")
-for module in __all__:
- importlib.import_module(f'.{module}', 'asm2vec')
+__all__ = [
+ "__data__", "__home__", "binary_to_asm", "data", "datatype", "model", "similarity", "tensors", "test", "train",
+ "utilities", "version"
+]
diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py
new file mode 100644
index 0000000..1da1389
--- /dev/null
+++ b/asm2vec/binary_to_asm.py
@@ -0,0 +1,168 @@
+import re
+import os
+import hashlib
+import r2pipe
+import logging
+from pathlib import Path
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+def _sha3(asm: str) -> str:
+ """
+ Produces SHA3 for each assembly function
+ :param asm: Input assembly function
+ :return: Hashed string
+ """
+ return hashlib.sha3_256(asm.encode()).hexdigest()
+
+
+def _valid_exe(filename: str, magic_bytes: list[str]) -> bool:
+ """
+ Extracts magic bytes and returns the header
+ :param filename: Name of the malware file (SHA1)
+ :param magic_bytes: For the specific OS/type of binary
+ :return: Boolean of the header existing in magic bytes
+ """
+ magics = [bytes.fromhex(i) for i in magic_bytes]
+ with open(filename, 'rb') as f:
+ header = f.read(4)
+ return header in magics
+
+
+def _normalize(opcode: str) -> str:
+ """
+ Normalizes the input opcode string
+ :param opcode: Opcode of the binary
+ :return Normalized opcode string
+ """
+ opcode = opcode.replace(' - ', ' + ')
+ opcode = re.sub(r'0x[0-9a-f]+', 'CONST', opcode)
+ opcode = re.sub(r'\*[0-9]', '*CONST', opcode)
+ opcode = re.sub(r' [0-9]', ' CONST', opcode)
+ return opcode
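+
+# Example (mirrors the unit tests): hexadecimal offsets collapse to CONST:
+#   _normalize('mov rax, qword [rip + 0x2db8]')  ->  'mov rax, qword [rip + CONST]'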
+
+
+def _fn_to_asm(pdf: dict | None, asm_minlen: int) -> str:
+ """
+ Converts functions to assembly code
+ :param pdf: disassembly
+ :param asm_minlen: minimum length of assembly functions to be extracted
+ :return: ASM string
+ """
+ if pdf is None:
+ return ''
+ if len(pdf['ops']) < asm_minlen:
+ return ''
+ if 'invalid' in [op['type'] for op in pdf['ops']]:
+ return ''
+
+ ops = pdf['ops']
+
+ labels, scope = {}, [op['offset'] for op in ops]
+ assert (None not in scope)
+ for i, op in enumerate(ops):
+ if op.get('jump') in scope:
+ labels.setdefault(op.get('jump'), i)
+
+ output = ''
+ for op in ops:
+ if labels.get(op.get('offset')) is not None:
+ output += f'LABEL{labels[op["offset"]]}:\n'
+ if labels.get(op.get('jump')) is not None:
+ output += f' {op["type"]} LABEL{labels[op["jump"]]}\n'
+ else:
+ output += f' {_normalize(op["opcode"])}\n'
+
+ return output
+
+
+def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int, magic_bytes: list[str]) -> int:
+ """
+ Fragments the input binary into assembly functions via r2pipe
+ :param filename: name of the malware file (SHA1)
+ :param output_path: path to the folder to store the assembly functions for each malware
+ :param asm_minlen: the minimum length of assembly functions to be extracted
+    :param magic_bytes: Valid magic bytes for the specific OS/type of binary
+ :return: the number of assembly functions
+ """
+ if not _valid_exe(filename, magic_bytes):
+ logging.info('The input file is invalid.')
+ return 0
+
+ r = r2pipe.open(str(filename))
+ r.cmd('aaaa')
+
+ count = 0
+
+ for fn in r.cmdj('aflj'):
+ r.cmd(f's {fn["offset"]}')
+ asm = _fn_to_asm(r.cmdj('pdfj'), asm_minlen)
+ if asm:
+ uid = _sha3(asm)
+            asm = (f' .name {fn["name"]}\n'
+                   f' .offset {fn["offset"]:016x}\n'
+                   f' .file {filename.name}\n') + asm
+ output_asm = os.path.join(output_path, uid)
+ with open(output_asm, 'w') as file:
+ file.write(asm)
+ count += 1
+ return count
+
+
+def convert_to_asm(
+        input_path: str, output_path: str, minlen_upper: int, minlen_lower: int, magic_bytes: list[str] | None = None
+) -> list:
+ """
+ Extracts assembly functions from malware files and saves them into separate folder per binary
+ :param input_path: Path to the malware binaries
+ :param output_path: Path for the assembly functions to be extracted
+    :param minlen_upper: Minimum number of instructions per assembly function on the first pass
+    :param minlen_lower: Fallback minimum; if disassembling with minlen_upper yields no functions, the binary is
+        disassembled again with this lower threshold
+    :param magic_bytes: List of valid magic bytes for the specific OS/type of binary, e.g.
+ - 'cffaedfe': for Mach-O Little Endian (64-bit)
+ - 'feedfacf': for Mach-O Big Endian (64-bit)
+ - 'cefaedfe': for Mach-O Little Endian (32-bit)
+ - 'feedface': Mach-O Big Endian (32-bit)
+ - 'cafebabe': Universal Binary Big Endian
+        - 'bebafeca': Universal Binary Little Endian
+ :return: List of sha1 of disassembled malware files
+ """
+ if not magic_bytes:
+ magic_bytes = ['cffaedfe', 'feedfacf', 'cafebabe', 'cefaedfe', 'feedface', 'bebafeca']
+
+ binary_dir = Path(input_path)
+ asm_dir = Path(output_path)
+
+ if not os.path.exists(asm_dir):
+ os.mkdir(asm_dir)
+
+ function_count, binary_count, not_found = 0, 0, 0
+ disassembled_bins = []
+
+ if os.path.isdir(binary_dir):
+ for entry in os.scandir(binary_dir):
+ out_dir = os.path.join(asm_dir, entry.name)
+ if not (os.path.exists(out_dir)):
+ os.mkdir(out_dir)
+ function_count = bin_to_asm(Path(entry), Path(out_dir), minlen_upper, magic_bytes)
+ if function_count == 0:
+ function_count = bin_to_asm(Path(entry), Path(out_dir), minlen_lower, magic_bytes)
+ if function_count == 0:
+ os.rmdir(out_dir)
+                    logging.info(f'The binary {entry.name} was not disassembled')
+ else:
+ binary_count += 1
+ disassembled_bins.append(entry.name)
+ else:
+ binary_count += 1
+ disassembled_bins.append(entry.name)
+ else:
+ not_found += 1
+ logging.info("[Error] No such file or directory: {}".format(binary_dir))
+
+ logging.info("Total scanned binaries: {}".format(binary_count))
+ logging.info("Not converted binaries: {}".format(not_found))
+
+ return disassembled_bins
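+
+
+# Usage sketch (hypothetical paths): disassemble every binary under 'binaries/' into
+# one folder of assembly-function files per binary, named by the SHA3-256 of their code:
+#
+#   disassembled = convert_to_asm('binaries/', 'asm/', minlen_upper=10, minlen_lower=5)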
diff --git a/asm2vec/data.py b/asm2vec/data.py
new file mode 100644
index 0000000..6713c38
--- /dev/null
+++ b/asm2vec/data.py
@@ -0,0 +1,43 @@
+import os
+from pathlib import Path
+from torch.utils.data import Dataset
+
+from asm2vec.datatype import Tokens, Function
+
+
+class AsmDataset(Dataset):
+    """Thin `torch.utils.data.Dataset` wrapper over paired context/target tensors,
+    so a `DataLoader` can batch and shuffle them; it extends `Dataset` by
+    implementing `__len__` and index-based `__getitem__`."""
+ def __init__(self, x, y):
+ self.x = x
+ self.y = y
+
+ def __len__(self):
+ return len(self.x)
+
+ def __getitem__(self, index):
+ return self.x[index], self.y[index]
+
+
+def load_data(paths, limit=None):
+    """Loads assembly functions from one or more files/directories and builds the token vocabulary
+    :param paths: A path or list of paths; directories are scanned for assembly-function files
+    :param limit: Maximum number of functions to load; (Optional, default: no limit)
+    :return: Tuple of the loaded `Function` objects and the `Tokens` vocabulary
+    """
+ if type(paths) is not list:
+ paths = [paths]
+
+ filenames = []
+ for path in paths:
+ if os.path.isdir(path):
+ filenames += [Path(path) / filename for filename in sorted(os.listdir(path))
+ if os.path.isfile(Path(path) / filename)]
+ else:
+ filenames += [Path(path)]
+
+ functions, tokens = [], Tokens()
+ for i, filename in enumerate(filenames):
+ if limit and i >= limit:
+ break
+ with open(filename) as f:
+ fn = Function.load(f.read())
+ functions.append(fn)
+ tokens.add(fn.tokens())
+
+ return functions, tokens
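+
+
+# Usage sketch (hypothetical path): load every assembly-function file under 'asm/'
+# and build the token vocabulary in one pass:
+#
+#   functions, tokens = load_data('asm/', limit=1000)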
diff --git a/asm2vec/datatype.py b/asm2vec/datatype.py
index a3cd39b..f618800 100644
--- a/asm2vec/datatype.py
+++ b/asm2vec/datatype.py
@@ -2,19 +2,25 @@
import random
import warnings
+# Data types used across asm2vec: the Token/Tokens vocabulary, Function, BasicBlock and Instruction.
+
+
class Token:
def __init__(self, name, index):
self.name = name
self.index = index
self.count = 1
+
def __str__(self):
return self.name
+
class Tokens:
def __init__(self, name_to_index=None, tokens=None):
self.name_to_index = name_to_index or {}
self.tokens = tokens or []
self._weights = None
+
def __getitem__(self, key):
if type(key) is str:
if self.name_to_index.get(key) is None:
@@ -28,13 +34,17 @@ def __getitem__(self, key):
return [self[k] for k in key]
except:
raise ValueError
+
def load_state_dict(self, sd):
self.name_to_index = sd['name_to_index']
self.tokens = sd['tokens']
+
def state_dict(self):
return {'name_to_index': self.name_to_index, 'tokens': self.tokens}
+
def size(self):
return len(self.tokens)
+
def add(self, names):
self._weights = None
if type(names) is not list:
@@ -46,6 +56,7 @@ def add(self, names):
self.tokens.append(token)
else:
self.tokens[self.name_to_index[name]].count += 1
+
def update(self, tokens_new):
for token in tokens_new:
if token.name not in self.name_to_index:
@@ -54,6 +65,7 @@ def update(self, tokens_new):
self.tokens.append(token)
else:
self.tokens[self.name_to_index[token.name]].count += token.count
+
def weights(self):
# if no cache, calculate
if self._weights is None:
@@ -62,19 +74,22 @@ def weights(self):
for token in self.tokens:
self._weights[token.index] = (token.count / total) ** 0.75
return self._weights
+
def sample(self, batch_size, num=5):
return torch.multinomial(self.weights(), num * batch_size, replacement=True).view(batch_size, num)
+
class Function:
def __init__(self, insts, blocks, meta):
self.insts = insts
self.blocks = blocks
self.meta = meta
+
@classmethod
def load(cls, text):
- '''
- gcc -S format compatiable
- '''
+ """gcc -S format compatible
+ """
+
label, labels, insts, blocks, meta = None, {}, [], [], {}
for line in text.strip('\n').split('\n'):
if line[0] in [' ', '\t']:
@@ -109,10 +124,13 @@ def load(cls, text):
if labels.get(arg):
inst.args[i] = 'CONST'
return cls(insts, blocks, meta)
+
def tokens(self):
return [token for inst in self.insts for token in inst.tokens()]
+
def random_walk(self, num=3):
return [self._random_walk() for _ in range(num)]
+
def _random_walk(self):
current, visited, seq = self.blocks[0], [], []
while current not in visited:
@@ -124,25 +142,31 @@ def _random_walk(self):
current = random.choice(list(current.successors))
return seq
+
class BasicBlock:
def __init__(self):
self.insts = []
self.successors = set()
+
def add(self, inst):
self.insts.append(inst)
+
def end(self):
inst = self.insts[-1]
return inst.is_jmp() or inst.op == 'ret'
+
class Instruction:
def __init__(self, op, args):
self.op = op
self.args = args
+
def __str__(self):
return f'{self.op} {", ".join([str(arg) for arg in self.args if str(arg)])}'
+
@classmethod
def load(cls, text):
- text = text.strip().strip('bnd').strip() # get rid of BND prefix
+        text = text.strip().removeprefix('bnd ').strip()  # drop the BND prefix if present
op, _, args = text.strip().partition(' ')
if args:
args = [arg.strip() for arg in args.split(',')]
@@ -150,9 +174,12 @@ def load(cls, text):
args = []
args = (args + ['', ''])[:2]
return cls(op, args)
+
def tokens(self):
return [self.op] + self.args
+
def is_jmp(self):
return 'jmp' in self.op or self.op[0] == 'j'
+
def is_call(self):
return self.op == 'call'
diff --git a/asm2vec/model.py b/asm2vec/model.py
index 301f3be..51dc433 100644
--- a/asm2vec/model.py
+++ b/asm2vec/model.py
@@ -1,43 +1,82 @@
import torch
import torch.nn as nn
+from asm2vec.datatype import Tokens
+
bce, sigmoid, softmax = nn.BCELoss(), nn.Sigmoid(), nn.Softmax(dim=1)
+
+# ASM2VEC learns joint embeddings of assembly functions and their tokens (operations and operands).
+
+
class ASM2VEC(nn.Module):
def __init__(self, vocab_size, function_size, embedding_size):
super(ASM2VEC, self).__init__()
- self.embeddings = nn.Embedding(vocab_size, embedding_size, _weight=torch.zeros(vocab_size, embedding_size))
- self.embeddings_f = nn.Embedding(function_size, 2 * embedding_size, _weight=(torch.rand(function_size, 2 * embedding_size)-0.5)/embedding_size/2)
- self.embeddings_r = nn.Embedding(vocab_size, 2 * embedding_size, _weight=(torch.rand(vocab_size, 2 * embedding_size)-0.5)/embedding_size/2)
+ self.embeddings = nn.Embedding(vocab_size, embedding_size, _weight=torch.zeros(vocab_size, embedding_size))
+ self.embeddings_f = nn.Embedding(function_size, 2 * embedding_size,
+ _weight=(torch.rand(function_size, 2 * embedding_size)-0.5)/embedding_size/2)
+ self.embeddings_r = nn.Embedding(vocab_size, 2 * embedding_size,
+ _weight=(torch.rand(vocab_size, 2 * embedding_size)-0.5)/embedding_size/2)
def update(self, function_size_new, vocab_size_new):
device = self.embeddings.weight.device
- vocab_size, function_size, embedding_size = self.embeddings.num_embeddings, self.embeddings_f.num_embeddings, self.embeddings.embedding_dim
+ vocab_size, function_size, embedding_size = (self.embeddings.num_embeddings,
+ self.embeddings_f.num_embeddings, self.embeddings.embedding_dim)
if vocab_size_new != vocab_size:
- weight = torch.cat([self.embeddings.weight, torch.zeros(vocab_size_new - vocab_size, embedding_size).to(device)])
+            weight = torch.cat([self.embeddings.weight,
+                                torch.zeros(vocab_size_new - vocab_size, embedding_size).to(device)])
self.embeddings = nn.Embedding(vocab_size_new, embedding_size, _weight=weight)
- weight_r = torch.cat([self.embeddings_r.weight, ((torch.rand(vocab_size_new - vocab_size, 2 * embedding_size)-0.5)/embedding_size/2).to(device)])
+ weight_r = torch.cat([self.embeddings_r.weight,
+ ((torch.rand(vocab_size_new - vocab_size, 2 * embedding_size)-0.5)/embedding_size/2)
+ .to(device)])
self.embeddings_r = nn.Embedding(vocab_size_new, 2 * embedding_size, _weight=weight_r)
- self.embeddings_f = nn.Embedding(function_size_new, 2 * embedding_size, _weight=((torch.rand(function_size_new, 2 * embedding_size)-0.5)/embedding_size/2).to(device))
+ self.embeddings_f = nn.Embedding(function_size_new, 2 * embedding_size,
+ _weight=((torch.rand(function_size_new, 2 * embedding_size)-0.5) /
+ embedding_size/2).to(device))
def v(self, inp):
- e = self.embeddings(inp[:,1:])
- v_f = self.embeddings_f(inp[:,0])
- v_prev = torch.cat([e[:,0], (e[:,1] + e[:,2]) / 2], dim=1)
- v_next = torch.cat([e[:,3], (e[:,4] + e[:,5]) / 2], dim=1)
+ e = self.embeddings(inp[:, 1:])
+ v_f = self.embeddings_f(inp[:, 0])
+ v_prev = torch.cat([e[:, 0], (e[:, 1] + e[:, 2]) / 2], dim=1)
+ v_next = torch.cat([e[:, 3], (e[:, 4] + e[:, 5]) / 2], dim=1)
v = ((v_f + v_prev + v_next) / 3).unsqueeze(2)
return v
def forward(self, inp, pos, neg):
device, batch_size = inp.device, inp.shape[0]
v = self.v(inp)
- # negative sampling loss
pred = torch.bmm(self.embeddings_r(torch.cat([pos, neg], dim=1)), v).squeeze()
label = torch.cat([torch.ones(batch_size, 3), torch.zeros(batch_size, neg.shape[1])], dim=1).to(device)
return bce(sigmoid(pred), label)
- def predict(self, inp, pos):
+    def predict(self, inp, pos):  # unlike forward, predict scores every vocabulary token, so `pos` is unused
device, batch_size = inp.device, inp.shape[0]
v = self.v(inp)
- probs = torch.bmm(self.embeddings_r(torch.arange(self.embeddings_r.num_embeddings).repeat(batch_size, 1).to(device)), v).squeeze(dim=2)
+        token_ids = torch.arange(self.embeddings_r.num_embeddings).repeat(batch_size, 1).to(device)
+        probs = torch.bmm(self.embeddings_r(token_ids), v).squeeze(dim=2)
return softmax(probs)
+
+
+def save_model(path: str, model: ASM2VEC, tokens: Tokens) -> None:
+    """Saves the model hyper-parameters, weights and token vocabulary to `path`."""
+ torch.save(
+ {
+ 'model_params': (
+ model.embeddings.num_embeddings,
+ model.embeddings_f.num_embeddings,
+ model.embeddings.embedding_dim
+ ),
+ 'model': model.state_dict(),
+ 'tokens': tokens.state_dict(),
+ },
+ path
+ )
+
+
+def load_model(path: str, device: str = 'cpu') -> tuple[ASM2VEC, Tokens]:
+    """Restores a model and its token vocabulary saved by `save_model`."""
+ checkpoint = torch.load(path, map_location=device)
+ tokens = Tokens()
+ tokens.load_state_dict(checkpoint['tokens'])
+ model = ASM2VEC(*checkpoint['model_params'])
+ model.load_state_dict(checkpoint['model'])
+ model = model.to(device)
+ return model, tokens
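+
+
+# Round-trip sketch (hypothetical path): persist a model together with its token
+# vocabulary, then restore both:
+#
+#   save_model('model.pt', model, tokens)
+#   model, tokens = load_model('model.pt', device='cpu')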
diff --git a/asm2vec/similarity.py b/asm2vec/similarity.py
new file mode 100644
index 0000000..ea52327
--- /dev/null
+++ b/asm2vec/similarity.py
@@ -0,0 +1,48 @@
+import torch
+
+from asm2vec.data import load_data
+from asm2vec.model import load_model
+from asm2vec.train import train
+
+
+def cosine_similarity(v1, v2) -> float:
+ return (v1 @ v2 / (v1.norm() * v2.norm())).item()
+
+
+def compare_two(
+ data_path_1: str, data_path_2: str, model_path: str, epochs: int = 10, device: str = "cpu",
+ learning_rate: float = 0.02
+) -> float:
+ """This function produces the cosine similarity of a pair of assembly functions
+ :param data_path_1: the path to the assembly function no. 1
+ :param data_path_2: the path to the assembly function no. 2
+ :param model_path: the path to the trained asm2vec model
+ :param epochs: the number of epochs for calculating the tensor representations; (Optional, default = 10)
+ :param device: 'auto' | 'cuda' | 'cpu' (Optional, default 'cpu')
+ :param learning_rate: learning rate; (Optional; default = 0.02)
+    :return: the cosine similarity value
+ """
+ if device == "auto":
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ model, tokens = load_model(model_path, device=device)
+ functions, tokens_new = load_data([data_path_1, data_path_2])
+ tokens.update(tokens_new)
+ model.update(2, tokens.size())
+ model = model.to(device)
+
+ model = train(
+ functions,
+ tokens,
+ model=model,
+ epochs=epochs,
+ device=device,
+ mode="update",
+ learning_rate=learning_rate
+ )
+
+ v1, v2 = model.to("cpu").embeddings_f(torch.tensor([0, 1]))
+ similarity = cosine_similarity(v1, v2)
+ print(f"Cosine similarity : {similarity:.6f}")
+
+ return similarity
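+
+
+# Usage sketch (hypothetical paths): score two disassembled functions against a
+# previously trained model; values close to 1.0 indicate near-identical embeddings:
+#
+#   sim = compare_two('asm/func_a', 'asm/func_b', 'model.pt', epochs=30)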
diff --git a/asm2vec/tensors.py b/asm2vec/tensors.py
new file mode 100644
index 0000000..78a356e
--- /dev/null
+++ b/asm2vec/tensors.py
@@ -0,0 +1,73 @@
+import os
+import torch
+import logging
+from pathlib import Path
+
+from asm2vec.train import train, load_model, load_data
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+def calc_tensors(
+ asm_path: str, tensor_path: str, model_path: str, epochs: int, device: str = 'cpu', learning_rate: float = 0.02
+) -> list:
+ """
+ Calculates vector representation of a binary as the mean per column of the vector representations of its assembly
+ functions.
+ :param asm_path: Path to folder with assembly function in a sub-folder per binary
+ :param tensor_path: Path to folder to store the tensors
+ :param model_path: Path to the trained model
+ :param epochs: Number of epochs
+ :param device: 'auto' | 'cuda' | 'cpu'
+ :param learning_rate: Learning rate
+ :return: List of tensors
+ """
+ tensors_list = []
+ if device == 'auto':
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ if os.path.isfile(model_path):
+ model, tokens = load_model(model_path, device=device)
+ else:
+ print("No valid model")
+ return []
+
+ dir0 = Path(tensor_path)
+ if not (os.path.exists(dir0)):
+ os.mkdir(dir0)
+
+ if os.path.isdir(asm_path):
+ obj = os.scandir(asm_path)
+ for entry in obj:
+ if entry.is_dir() and os.listdir(entry) and entry.name:
+ tensor_file = os.path.join(dir0, entry.name)
+ if not (os.path.exists(tensor_file)):
+ functions, tokens_new = load_data([entry])
+ file_count = sum(len(files) for _, _, files in os.walk(entry))
+ tokens.update(tokens_new)
+ logging.info(f"Binary {entry.name}: {file_count} assembly functions")
+ model.update(file_count, tokens.size())
+ model = model.to(device)
+
+ model = train(
+ functions,
+ tokens,
+ model=model,
+ epochs=epochs,
+ device=device,
+ mode='update',
+ learning_rate=learning_rate
+ )
+
+ tensor = model.to('cpu').embeddings_f(torch.tensor([list(range(0, file_count))]))
+ tens = torch.squeeze(tensor)
+ if file_count == 1:
+ torch.save(tensor, tensor_file)
+ else:
+ torch.save(tens.mean(0), tensor_file)
+ tensors_list.append(entry.name)
+
+ else:
+ logging.info("No valid directory")
+
+ return tensors_list
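+
+
+# Usage sketch (hypothetical paths): compute one mean embedding per binary from its
+# per-function embeddings and store the tensors under 'tensors/':
+#
+#   done = calc_tensors('asm/', 'tensors/', 'model.pt', epochs=10)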
diff --git a/asm2vec/test.py b/asm2vec/test.py
new file mode 100644
index 0000000..b80cc14
--- /dev/null
+++ b/asm2vec/test.py
@@ -0,0 +1,39 @@
+import torch
+
+from asm2vec.data import load_data
+from asm2vec.model import load_model
+from asm2vec.train import train, preprocess
+from asm2vec.utilities import show_probs
+
+
+def test_model(
+ data_path: str, model_path: str, epochs: int = 10, neg_sample_num: int = 25, limit: int | None = None,
+ device: str = "cpu", learning_rate: float = 0.02, pretty: bool = False
+) -> None:
+    """Evaluates a trained model on a target assembly function: retrains the function
+    embedding, then prints the predicted token probabilities
+    :param data_path: the path to the target assembly function
+    :param model_path: the path to the trained asm2vec model
+    :param epochs: the number of training epochs; (Optional, default = 10)
+    :param neg_sample_num: size of the negative sample; (Optional, default = 25)
+    :param limit: limits the amount of output probability results; (Optional, default = None)
+    :param device: 'auto' | 'cuda' | 'cpu'; (Optional, default 'cpu')
+    :param learning_rate: learning rate; (Optional, default = 0.02)
+    :param pretty: pretty-print the probability table; (Optional, default = False)
+    """
+ if device == "auto":
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # load model, tokens
+ model, tokens = load_model(model_path, device=device)
+ functions, tokens_new = load_data(data_path)
+ tokens.update(tokens_new)
+ model.update(1, tokens.size())
+ model = model.to(device)
+
+ # train function embedding
+ model = train(
+ functions,
+ tokens,
+ model=model,
+ epochs=epochs,
+ neg_sample_num=neg_sample_num,
+ device=device,
+ mode="update",
+ learning_rate=learning_rate
+ )
+
+ # show predicted probability results
+ x, y = preprocess(functions, tokens)
+ probs = model.predict(x.to(device), y.to(device))
+ show_probs(x, y, probs, tokens, limit=limit, pretty=pretty)
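+
+
+# Usage sketch (hypothetical paths): evaluate a trained model on one assembly
+# function and pretty-print the predicted token probabilities:
+#
+#   test_model('asm/<sha1>/<fn>', 'model.pt', epochs=10, pretty=True)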
diff --git a/asm2vec/train.py b/asm2vec/train.py
new file mode 100644
index 0000000..4de7a81
--- /dev/null
+++ b/asm2vec/train.py
@@ -0,0 +1,134 @@
+import time
+import torch
+from pathlib import Path
+from torch.utils.data import DataLoader
+from asm2vec.data import AsmDataset, load_data
+from asm2vec.datatype import Function, Tokens
+from asm2vec.model import ASM2VEC, load_model, save_model
+from asm2vec.utilities import accuracy, callback
+
+
+def preprocess(functions, tokens):
+ x, y = [], []
+ for i, fn in enumerate(functions):
+ for seq in fn.random_walk():
+ for j in range(1, len(seq) - 1):
+ x.append([i] + [tokens[token].index for token in seq[j - 1].tokens() + seq[j + 1].tokens()])
+ y.append([tokens[token].index for token in seq[j].tokens()])
+ return torch.tensor(x), torch.tensor(y)
+
+
+def train(
+ functions: list[Function], tokens: Tokens, model: ASM2VEC | None = None, embedding_size: int = 100,
+ batch_size: int = 1024, epochs: int = 10, neg_sample_num: int = 25, calc_acc: bool = False, device: str = 'cpu',
+ mode: str = 'train', verbose: bool = False, learning_rate: float = 0.02
+):
+ """This function trains a model on the given assembly functions and tokens
+ :param functions: list of assembly functions
+ :param tokens: tokens (operations, operands) of the assembly function
+    :param model: an existing ASM2VEC model to continue training; if None, a new one is created; (Optional)
+ :param embedding_size: size of the tensor representation of an assembly function; (Optional, default value = 100)
+ :param batch_size: size of the batch for each epoch of training; (Optional, default value = 1024)
+ :param epochs: number of epochs for training the model; (Optional, default value = 10)
+ :param neg_sample_num: size of the negative sample; (Optional, default value = 25)
+ :param calc_acc: if set to True, the accuracy per training epoch is displayed; (Optional, default False)
+ :param device: the device used for processing; (Optional, default 'cpu')
+ :param mode: 'train' (to train a new model) | 'update' (to add to an already trained model's dictionary);
+ (Optional, default 'train')
+ :param verbose: if True performs training in verbose mode; (Optional, default False)
+    :param learning_rate: learning rate; (Optional, default value = 0.02)
+    :return: the trained ASM2VEC model
+    """
+ if mode == 'train':
+ if model is None:
+ model = ASM2VEC(tokens.size(), function_size=len(functions), embedding_size=embedding_size).to(device)
+ optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
+ elif mode == 'update':
+ if model is None:
+ raise ValueError("Update mode requires a pretrained model")
+ optimizer = torch.optim.Adam(model.embeddings_f.parameters(), lr=learning_rate)
+ else:
+ raise ValueError("Unknown mode")
+
+ loader = DataLoader(AsmDataset(*preprocess(functions, tokens)), batch_size=batch_size, shuffle=True)
+ for epoch in range(epochs):
+ start = time.time()
+ loss_sum, loss_count, accs = 0.0, 0, []
+
+ model.train()
+ for i, (inp, pos) in enumerate(loader):
+ neg = tokens.sample(inp.shape[0], neg_sample_num)
+ loss = model(inp.to(device), pos.to(device), neg.to(device))
+ loss_sum, loss_count = loss_sum + loss, loss_count + 1
+
+ optimizer.zero_grad()
+ loss.backward()
+ optimizer.step()
+
+ if i == 0 and calc_acc:
+ probs = model.predict(inp.to(device), pos.to(device))
+ accs.append(accuracy(pos, probs))
+
+ if verbose:
+ callback({
+ 'model': model,
+ 'tokens': tokens,
+ 'epoch': epoch,
+ 'time': time.time() - start,
+ 'loss': loss_sum / loss_count,
+ 'accuracy': torch.tensor(accs).mean() if calc_acc else None
+ })
+
+ return model
+
+
+def train_asm2vec_model(
+ train_set: str, new_model: str, model_path: str | None, epochs: int, limit: int | None = None,
+ calc_acc: bool = False, embedding_size: int = 100, batch_size: int = 1024, neg_sample: int = 25,
+ learning_rate: float = 0.02, device: str = 'cpu'
+) -> ASM2VEC:
+ # TODO - this is just a wrapper - can we do this smarter?
+ """Trains an ASM2VEC model
+ :param train_set: path to the training dataset
+    :param new_model: path where the trained model will be saved
+    :param model_path: path to an already trained model to continue training from; (Optional)
+    :param epochs: number of epochs
+    :param limit: number of assembly functions the model will be trained on; if not defined, all the assembly
+        functions in train_set are used
+ :param calc_acc: displays the accuracy per training epoch; setting it to True will slow down the training
+ :param embedding_size: size of the vector representation for a token; an assembly function will be represented
+ with a vector twice that size
+ :param batch_size: the size of batches for training
+ :param neg_sample: negative sampling amount
+ :param device: 'auto' | 'cuda' | 'cpu'
+ :param learning_rate: learning rate
+ :return an ASM2VEC model
+ """
+
+ if device == 'auto':
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ if model_path:
+ model, tokens = load_model(model_path, device=device)
+ functions, tokens_new = load_data(train_set, limit=limit)
+ tokens.update(tokens_new)
+ model.update(len(functions), tokens.size())
+ else:
+ model = None
+ functions, tokens = load_data(Path(train_set), limit=limit)
+
+ model = train(
+ functions,
+ tokens,
+ model=model,
+ embedding_size=embedding_size,
+ batch_size=batch_size,
+ epochs=epochs,
+ neg_sample_num=neg_sample,
+ calc_acc=calc_acc,
+ device=device,
+ verbose=True,
+ learning_rate=learning_rate
+ )
+ save_model(new_model, model, tokens)
+
+ return model
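+
+
+# Usage sketch (hypothetical paths): train a fresh model on a folder of assembly
+# functions and save it to 'model.pt':
+#
+#   train_asm2vec_model('asm/', 'model.pt', model_path=None, epochs=10)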
diff --git a/asm2vec/utilities.py b/asm2vec/utilities.py
new file mode 100644
index 0000000..dd39aac
--- /dev/null
+++ b/asm2vec/utilities.py
@@ -0,0 +1,55 @@
+import logging
+import torch
+
+from asm2vec.datatype import Instruction
+
+logging.basicConfig(level=logging.INFO, format='%(message)s')
+
+
+# Progress messages go through logging; show_probs prints its table directly to stdout.
+
+def accuracy(y, probs):
+    """Returns the mean predicted probability mass assigned to the true tokens."""
+    return torch.mean(torch.tensor([torch.sum(probs[i][yi]) for i, yi in enumerate(y)]))
+
+
+def callback(context) -> None:
+    """Logs per-epoch progress: epoch number, wall time, loss and (optionally) accuracy"""
+    progress = f'{context["epoch"]} | time = {context["time"]:.2f}, loss = {context["loss"]:.4f}'
+
+    if context["accuracy"]:
+        progress += f', accuracy = {context["accuracy"]:.4f}'
+    logging.info(progress)
+
+
+def show_probs(x, y, probs, tokens, limit=None, pretty=False):
+    """Prints, per sample, the context instructions and the model's top-5 predicted tokens."""
+ if pretty:
+ tl, tr, bl, br = '┌', '┐', '└', '┘'
+ lm, rm, tm, bm = '├', '┤', '┬', '┴'
+ h, v = '─', '│'
+ arrow = ' ➔'
+ else:
+ tl, tr, bl, br = '+', '+', '+', '+'
+ lm, rm, tm, bm = '+', '+', '+', '+'
+ h, v = '-', '|'
+ arrow = '->'
+ top = probs.topk(5)
+ for i, (xi, yi) in enumerate(zip(x, y)):
+ if limit and i >= limit:
+ break
+ xi, yi = xi.tolist(), yi.tolist()
+ print(tl + h * 42 + tr)
+ print(f'{v} {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {v}')
+ print(f'{v} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {v}')
+ print(f'{v} {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {v}')
+ print(lm + h * 8 + tm + h * 33 + rm)
+ for value, index in zip(top.values[i], top.indices[i]):
+ if index in yi:
+ colorbegin, colorclear = '\033[92m', '\033[0m'
+ else:
+ colorbegin, colorclear = '', ''
+ print(f'{v} {colorbegin}{value * 100:05.2f}%{colorclear} {v} {colorbegin}'
+ f'{tokens[index.item()].name:31}{colorclear} {v}')
+ print(bl + h * 8 + bm + h * 33 + br)
diff --git a/asm2vec/utils.py b/asm2vec/utils.py
deleted file mode 100644
index 4f9aa25..0000000
--- a/asm2vec/utils.py
+++ /dev/null
@@ -1,156 +0,0 @@
-import os
-import time
-import torch
-from torch.utils.data import DataLoader, Dataset
-from pathlib import Path
-from .datatype import Tokens, Function, Instruction
-from .model import ASM2VEC
-
-class AsmDataset(Dataset):
- def __init__(self, x, y):
- self.x = x
- self.y = y
- def __len__(self):
- return len(self.x)
- def __getitem__(self, index):
- return self.x[index], self.y[index]
-
-def load_data(paths, limit=None):
- if type(paths) is not list:
- paths = [paths]
-
- filenames = []
- for path in paths:
- if os.path.isdir(path):
- filenames += [Path(path) / filename for filename in sorted(os.listdir(path)) if os.path.isfile(Path(path) / filename)]
- else:
- filenames += [Path(path)]
-
- functions, tokens = [], Tokens()
- for i, filename in enumerate(filenames):
- if limit and i >= limit:
- break
- with open(filename) as f:
- fn = Function.load(f.read())
- functions.append(fn)
- tokens.add(fn.tokens())
-
- return functions, tokens
-
-def preprocess(functions, tokens):
- x, y = [], []
- for i, fn in enumerate(functions):
- for seq in fn.random_walk():
- for j in range(1, len(seq) - 1):
- x.append([i] + [tokens[token].index for token in seq[j-1].tokens() + seq[j+1].tokens()])
- y.append([tokens[token].index for token in seq[j].tokens()])
- return torch.tensor(x), torch.tensor(y)
-
-def train(
- functions,
- tokens,
- model=None,
- embedding_size=100,
- batch_size=1024,
- epochs=10,
- neg_sample_num=25,
- calc_acc=False,
- device='cpu',
- mode='train',
- callback=None,
- learning_rate=0.02
-):
- if mode == 'train':
- if model is None:
- model = ASM2VEC(tokens.size(), function_size=len(functions), embedding_size=embedding_size).to(device)
- optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
- elif mode == 'test':
- if model is None:
- raise ValueError("test mode required pretrained model")
- optimizer = torch.optim.Adam(model.embeddings_f.parameters(), lr=learning_rate)
- else:
- raise ValueError("Unknown mode")
-
- loader = DataLoader(AsmDataset(*preprocess(functions, tokens)), batch_size=batch_size, shuffle=True)
- for epoch in range(epochs):
- start = time.time()
- loss_sum, loss_count, accs = 0.0, 0, []
-
- model.train()
- for i, (inp, pos) in enumerate(loader):
- neg = tokens.sample(inp.shape[0], neg_sample_num)
- loss = model(inp.to(device), pos.to(device), neg.to(device))
- loss_sum, loss_count = loss_sum + loss, loss_count + 1
-
- optimizer.zero_grad()
- loss.backward()
- optimizer.step()
-
- if i == 0 and calc_acc:
- probs = model.predict(inp.to(device), pos.to(device))
- accs.append(accuracy(pos, probs))
-
- if callback:
- callback({
- 'model': model,
- 'tokens': tokens,
- 'epoch': epoch,
- 'time': time.time() - start,
- 'loss': loss_sum / loss_count,
- 'accuracy': torch.tensor(accs).mean() if calc_acc else None
- })
-
- return model
-
-def save_model(path, model, tokens):
- torch.save({
- 'model_params': (
- model.embeddings.num_embeddings,
- model.embeddings_f.num_embeddings,
- model.embeddings.embedding_dim
- ),
- 'model': model.state_dict(),
- 'tokens': tokens.state_dict(),
- }, path)
-
-def load_model(path, device='cpu'):
- checkpoint = torch.load(path, map_location=device)
- tokens = Tokens()
- tokens.load_state_dict(checkpoint['tokens'])
- model = ASM2VEC(*checkpoint['model_params'])
- model.load_state_dict(checkpoint['model'])
- model = model.to(device)
- return model, tokens
-
-def show_probs(x, y, probs, tokens, limit=None, pretty=False):
- if pretty:
- TL, TR, BL, BR = '┌', '┐', '└', '┘'
- LM, RM, TM, BM = '├', '┤', '┬', '┴'
- H, V = '─', '│'
- arrow = ' ➔'
- else:
- TL = TR = BL = BR = '+'
- LM = RM = TM = BM = '+'
- H, V = '-', '|'
- arrow = '->'
- top = probs.topk(5)
- for i, (xi, yi) in enumerate(zip(x, y)):
- if limit and i >= limit:
- break
- xi, yi = xi.tolist(), yi.tolist()
- print(TL + H * 42 + TR)
- print(f'{V} {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {V}')
- print(f'{V} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {V}')
- print(f'{V} {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {V}')
- print(LM + H * 8 + TM + H * 33 + RM)
- for value, index in zip(top.values[i], top.indices[i]):
- if index in yi:
- colorbegin, colorclear = '\033[92m', '\033[0m'
- else:
- colorbegin, colorclear = '', ''
- print(f'{V} {colorbegin}{value*100:05.2f}%{colorclear} {V} {colorbegin}{tokens[index.item()].name:31}{colorclear} {V}')
- print(BL + H * 8 + BM + H * 33 + BR)
-
-def accuracy(y, probs):
- return torch.mean(torch.tensor([torch.sum(probs[i][yi]) for i, yi in enumerate(y)]))
-
diff --git a/asm2vec/version.py b/asm2vec/version.py
new file mode 100644
index 0000000..c85dc7e
--- /dev/null
+++ b/asm2vec/version.py
@@ -0,0 +1,4 @@
+VERSION = '1.0.3'
+DEV_VERSION = '0'
+
+radare2_version = "5.8.8"
diff --git a/catalog-info.yaml b/catalog-info.yaml
new file mode 100644
index 0000000..378ab88
--- /dev/null
+++ b/catalog-info.yaml
@@ -0,0 +1,15 @@
+apiVersion: backstage.io/v1alpha1
+kind: Component
+metadata:
+ name: asm2vec-pytorch
+ description: All code running ASM2VEC using PyTorch
+  labels:
+    jira-key: DATASCI
+    language: Python
+ annotations:
+ backstage.io/source-location: url:https://github.com/wandera/asm2vec-pytorch
+spec:
+ type: service
+ lifecycle: production
+ owner: datascience
+ system: datascience
diff --git a/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 b/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80
new file mode 100644
index 0000000..208607f
Binary files /dev/null and b/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 differ
diff --git a/requirements.txt b/requirements.txt
index d92495b..3163633 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,2 @@
torch>=1.7,<2
-click>=7.1,<8
r2pipe>=1.5,<2
diff --git a/scripts/bin2asm.py b/scripts/bin2asm.py
deleted file mode 100644
index 2134e8c..0000000
--- a/scripts/bin2asm.py
+++ /dev/null
@@ -1,117 +0,0 @@
-#!/usr/bin/env python3
-import re
-import os
-import click
-import r2pipe
-import hashlib
-from pathlib import Path
-
-def sha3(data):
- return hashlib.sha3_256(data.encode()).hexdigest()
-
-def validEXE(filename):
- magics = [bytes.fromhex('7f454c46')]
- with open(filename, 'rb') as f:
- header = f.read(4)
- return header in magics
-
-def normalize(opcode):
- opcode = opcode.replace(' - ', ' + ')
- opcode = re.sub(r'0x[0-9a-f]+', 'CONST', opcode)
- opcode = re.sub(r'\*[0-9]', '*CONST', opcode)
- opcode = re.sub(r' [0-9]', ' CONST', opcode)
- return opcode
-
-def fn2asm(pdf, minlen):
- # check
- if pdf is None:
- return
- if len(pdf['ops']) < minlen:
- return
- if 'invalid' in [op['type'] for op in pdf['ops']]:
- return
-
- ops = pdf['ops']
-
- # set label
- labels, scope = {}, [op['offset'] for op in ops]
- assert(None not in scope)
- for i, op in enumerate(ops):
- if op.get('jump') in scope:
- labels.setdefault(op.get('jump'), i)
-
- # dump output
- output = ''
- for op in ops:
- # add label
- if labels.get(op.get('offset')) is not None:
- output += f'LABEL{labels[op["offset"]]}:\n'
- # add instruction
- if labels.get(op.get('jump')) is not None:
- output += f' {op["type"]} LABEL{labels[op["jump"]]}\n'
- else:
- output += f' {normalize(op["opcode"])}\n'
-
- return output
-
-def bin2asm(filename, opath, minlen):
- # check
- if not validEXE(filename):
- return 0
-
- r = r2pipe.open(str(filename))
- r.cmd('aaaa')
-
- count = 0
-
- for fn in r.cmdj('aflj'):
- r.cmd(f's {fn["offset"]}')
- asm = fn2asm(r.cmdj('pdfj'), minlen)
- if asm:
- uid = sha3(asm)
- asm = f''' .name {fn["name"]}
- .offset {fn["offset"]:016x}
- .file {filename.name}
-''' + asm
- with open(opath / uid, 'w') as f:
- f.write(asm)
- count += 1
-
- print(f'[+] {filename}')
-
- return count
-
-@click.command()
-@click.option('-i', '--input', 'ipath', help='input directory / file', required=True)
-@click.option('-o', '--output', 'opath', default='asm', help='output directory')
-@click.option('-l', '--len', 'minlen', default=10, help='ignore assembly code with instructions amount smaller than minlen')
-def cli(ipath, opath, minlen):
- '''
- Extract assembly functions from binary executable
- '''
- ipath = Path(ipath)
- opath = Path(opath)
-
- # create output directory
- if not os.path.exists(opath):
- os.mkdir(opath)
-
- fcount, bcount = 0, 0
-
- # directory
- if os.path.isdir(ipath):
- for f in os.listdir(ipath):
- if not os.path.islink(ipath / f) and not os.path.isdir(ipath / f):
- fcount += bin2asm(ipath / f, opath, minlen)
- bcount += 1
- # file
- elif os.path.exists(ipath):
- fcount += bin2asm(ipath, opath, minlen)
- bcount += 1
- else:
- print(f'[Error] No such file or directory: {ipath}')
-
- print(f'[+] Total scan binary: {bcount} => Total generated assembly functions: {fcount}')
-
-if __name__ == '__main__':
- cli()
diff --git a/scripts/compare.py b/scripts/compare.py
deleted file mode 100644
index 3860b83..0000000
--- a/scripts/compare.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import torch
-import torch.nn as nn
-import click
-import asm2vec
-
-def cosine_similarity(v1, v2):
- return (v1 @ v2 / (v1.norm() * v2.norm())).item()
-
-@click.command()
-@click.option('-i1', '--input1', 'ipath1', help='target function 1', required=True)
-@click.option('-i2', '--input2', 'ipath2', help='target function 2', required=True)
-@click.option('-m', '--model', 'mpath', help='model path', required=True)
-@click.option('-e', '--epochs', default=10, help='training epochs', show_default=True)
-@click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True)
-@click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True)
-def cli(ipath1, ipath2, mpath, epochs, device, lr):
- if device == 'auto':
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
- # load model, tokens
- model, tokens = asm2vec.utils.load_model(mpath, device=device)
- functions, tokens_new = asm2vec.utils.load_data([ipath1, ipath2])
- tokens.update(tokens_new)
- model.update(2, tokens.size())
- model = model.to(device)
-
- # train function embedding
- model = asm2vec.utils.train(
- functions,
- tokens,
- model=model,
- epochs=epochs,
- device=device,
- mode='test',
- learning_rate=lr
- )
-
- # compare 2 function vectors
- v1, v2 = model.to('cpu').embeddings_f(torch.tensor([0, 1]))
-
- print(f'cosine similarity : {cosine_similarity(v1, v2):.6f}')
-
-if __name__ == '__main__':
- cli()
diff --git a/scripts/test.py b/scripts/test.py
deleted file mode 100644
index 31372aa..0000000
--- a/scripts/test.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import torch
-import torch.nn as nn
-import click
-import asm2vec
-
-@click.command()
-@click.option('-i', '--input', 'ipath', help='target function', required=True)
-@click.option('-m', '--model', 'mpath', help='model path', required=True)
-@click.option('-e', '--epochs', default=10, help='training epochs', show_default=True)
-@click.option('-n', '--neg-sample-num', 'neg_sample_num', default=25, help='negative sampling amount', show_default=True)
-@click.option('-l', '--limit', help='limit the amount of output probability result', type=int)
-@click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True)
-@click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True)
-@click.option('-p', '--pretty', default=False, help='pretty print table', show_default=True, is_flag=True)
-def cli(ipath, mpath, epochs, neg_sample_num, limit, device, lr, pretty):
- if device == 'auto':
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
- # load model, tokens
- model, tokens = asm2vec.utils.load_model(mpath, device=device)
- functions, tokens_new = asm2vec.utils.load_data(ipath)
- tokens.update(tokens_new)
- model.update(1, tokens.size())
- model = model.to(device)
-
- # train function embedding
- model = asm2vec.utils.train(
- functions,
- tokens,
- model=model,
- epochs=epochs,
- neg_sample_num=neg_sample_num,
- device=device,
- mode='test',
- learning_rate=lr
- )
-
- # show predicted probability results
- x, y = asm2vec.utils.preprocess(functions, tokens)
- probs = model.predict(x.to(device), y.to(device))
- asm2vec.utils.show_probs(x, y, probs, tokens, limit=limit, pretty=pretty)
-
-if __name__ == '__main__':
- cli()
diff --git a/scripts/train.py b/scripts/train.py
deleted file mode 100644
index 98391f4..0000000
--- a/scripts/train.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import torch
-import click
-import asm2vec
-
-@click.command()
-@click.option('-i', '--input', 'ipath', help='training data folder', required=True)
-@click.option('-o', '--output', 'opath', default='model.pt', help='output model path', show_default=True)
-@click.option('-m', '--model', 'mpath', help='load previous trained model path', type=str)
-@click.option('-l', '--limit', help='limit the number of functions to be loaded', show_default=True, type=int)
-@click.option('-d', '--ebedding-dimension', 'embedding_size', default=100, help='embedding dimension', show_default=True)
-@click.option('-b', '--batch-size', 'batch_size', default=1024, help='batch size', show_default=True)
-@click.option('-e', '--epochs', default=10, help='training epochs', show_default=True)
-@click.option('-n', '--neg-sample-num', 'neg_sample_num', default=25, help='negative sampling amount', show_default=True)
-@click.option('-a', '--calculate-accuracy', 'calc_acc', help='whether calculate accuracy ( will be significantly slower )', is_flag=True)
-@click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True)
-@click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True)
-def cli(ipath, opath, mpath, limit, embedding_size, batch_size, epochs, neg_sample_num, calc_acc, device, lr):
- if device == 'auto':
- device = 'cuda' if torch.cuda.is_available() else 'cpu'
-
- if mpath:
- model, tokens = asm2vec.utils.load_model(mpath, device=device)
- functions, tokens_new = asm2vec.utils.load_data(ipath, limit=limit)
- tokens.update(tokens_new)
- model.update(len(functions), tokens.size())
- else:
- model = None
- functions, tokens = asm2vec.utils.load_data(ipath, limit=limit)
-
- def callback(context):
- progress = f'{context["epoch"]} | time = {context["time"]:.2f}, loss = {context["loss"]:.4f}'
- if context["accuracy"]:
- progress += f', accuracy = {context["accuracy"]:.4f}'
- print(progress)
- asm2vec.utils.save_model(opath, context["model"], context["tokens"])
-
- model = asm2vec.utils.train(
- functions,
- tokens,
- model=model,
- embedding_size=embedding_size,
- batch_size=batch_size,
- epochs=epochs,
- neg_sample_num=neg_sample_num,
- calc_acc=calc_acc,
- device=device,
- callback=callback,
- learning_rate=lr
- )
-
-if __name__ == '__main__':
- cli()
diff --git a/setup.py b/setup.py
index 62ff843..19a3051 100644
--- a/setup.py
+++ b/setup.py
@@ -1,14 +1,69 @@
+import os
+import sys
+import platform
from setuptools import setup, find_packages
+from setuptools.command.install import install as _install
+
+from asm2vec.version import VERSION, radare2_version
+
+
+class install(_install):
+ @staticmethod
+ def _setup_radare2() -> None:
+ if sys.platform.startswith("linux"): # Install required in Docker images
+ machine = platform.machine()
+ if machine in ["aarch64", "arm"]:
+ architecture = "arm64"
+ elif machine in ["x86_64"]:
+ architecture = "amd64"
+ elif machine in ["i386", "i686"]:
+ architecture = "i386"
+ else:
+ raise Exception(f"No architecture for Linux Machine: '{machine}'")
+
+ commands = [
+ "apt-get update",
+ "apt-get install -y --no-install-recommends wget",
+ f"wget -O /tmp/radare2_{radare2_version}_{architecture}.deb https://github.com/radareorg/radare2/releases/download/{radare2_version}/radare2_{radare2_version}_{architecture}.deb",
+ f"dpkg -i /tmp/radare2_{radare2_version}_{architecture}.deb",
+ "r2pm init",
+ "r2pm update",
+ f"rm /tmp/radare2_{radare2_version}_{architecture}.deb"
+ ]
+ for command in commands:
+ if os.system(command) != 0:
+ raise Exception(f"Install radare2 failed: '{command}'")
+ else:
+ print("Ensure 'radar2' is installed...")
+
+ def run(self):
+ self._setup_radare2()
+ _install.run(self)
+
+
+def readme():
+ with open('README.md') as f:
+ return f.read()
+
+
+def read_requirements():
+ with open('requirements.txt') as f:
+ return [s for s in f.read().split('\n') if not ('--index-url' in s)]
+
setup(
name='asm2vec',
- version='1.0.0',
- description='Unofficial implementation of asm2vec using pytorch',
- install_requires=['torch>=1.7,<2'
- 'click>=7.1,<8'
- 'r2pipe>=1.5,<2'],
- author='oalieno',
- author_email='jeffrey6910@gmail.com',
+ version=VERSION,
+ description="Jamf's implementation of asm2vec using pytorch",
+ long_description=readme(),
+ author='oalieno/jamf',
+ author_email='jamie.nutter@jamf.com',
license='MIT License',
- packages = find_packages(),
+ install_requires=read_requirements(),
+ packages=find_packages(),
+ zip_safe=False,
+ include_package_data=True,
+ test_suite='nose.collector',
+ tests_require=['nose'],
+ cmdclass={'install': install}
)
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000..472793c
--- /dev/null
+++ b/test/__init__.py
@@ -0,0 +1 @@
+__all__ = ["test_binary_to_asm"]
diff --git a/test/test_binary_to_asm.py b/test/test_binary_to_asm.py
new file mode 100644
index 0000000..ce53411
--- /dev/null
+++ b/test/test_binary_to_asm.py
@@ -0,0 +1,229 @@
+from os import path, mkdir
+from pathlib import Path
+from shutil import rmtree
+from unittest import TestCase
+
+from asm2vec import __data__
+from asm2vec.binary_to_asm import (bin_to_asm, convert_to_asm, _fn_to_asm, _normalize, _sha3, _valid_exe)
+
+
+class TestBinaryToAsm(TestCase):
+
+ @classmethod
+ def setUpClass(cls) -> None:
+ print("\n--- TestBinaryToAsm ---")
+ cls.output_path = "malware_asm/"
+ cls.data_path = path.join(__data__, "5cca32eb8f9c2a024a57ce12e3fb66070662de80")
+ cls.pdf_dict = {
+ 'name': 'main',
+ 'size': 18,
+ 'addr': 4294974144,
+ 'ops': [
+ {
+ 'offset': 4294974144,
+ 'esil': 'rbp,8,rsp,-,=[8],8,rsp,-=',
+ 'refptr': 0,
+ 'fcn_addr': 4294974144,
+ 'fcn_last': 4294974161,
+ 'size': 1,
+ 'opcode': 'push rbp',
+ 'disasm': 'push rbp',
+ 'bytes': '55',
+ 'family': 'cpu',
+ 'type': 'rpush',
+ 'reloc': 'False',
+ 'type_num': 268435468,
+ 'type2_num': 0,
+ 'flags': ['main', 'entry0', 'section.0.__TEXT.__text', 'sym.func.100001ac0', 'rip'],
+ 'comment': 'WzAwXSAtci14IHNlY3Rpb24gc2l6ZSA3Mzc2IG5hbWVkIDAuX19URVhULl9fdGV4dA=='
+ },
+ {
+ 'offset': 4294974145,
+ 'esil': 'rsp,rbp,=',
+ 'refptr': 0,
+ 'fcn_addr': 4294974144,
+ 'fcn_last': 4294974159,
+ 'size': 3,
+ 'opcode': 'mov rbp, rsp',
+ 'disasm': 'mov rbp, rsp',
+ 'bytes': '4889e5',
+ 'family': 'cpu',
+ 'type': 'mov',
+ 'reloc': 'False',
+ 'type_num': 9,
+ 'type2_num': 0
+ },
+ {
+ 'offset': 4294974148,
+ 'esil': 'rbx,8,rsp,-,=[8],8,rsp,-=',
+ 'refptr': 0,
+ 'fcn_addr': 4294974144,
+ 'fcn_last': 4294974161,
+ 'size': 1,
+ 'opcode': 'push rbx',
+ 'disasm': 'push rbx',
+ 'bytes': '53',
+ 'family': 'cpu',
+ 'type': 'rpush',
+ 'reloc': 'False',
+ 'type_num': 268435468,
+ 'type2_num': 0
+ },
+ {
+ 'offset': 4294974149,
+ 'esil': 'rax,8,rsp,-,=[8],8,rsp,-=',
+ 'refptr': 0,
+ 'fcn_addr': 4294974144,
+ 'fcn_last': 4294974161,
+ 'size': 1,
+ 'opcode': 'push rax',
+ 'disasm': 'push rax',
+ 'bytes': '50',
+ 'family': 'cpu',
+ 'type': 'rpush',
+ 'reloc': 'False',
+ 'type_num': 268435468,
+ 'type2_num': 0
+ },
+ {
+ 'offset': 4294974150,
+ 'esil': 'rsi,rbx,=',
+ 'refptr': 0,
+ 'fcn_addr': 4294974144,
+ 'fcn_last': 4294974159,
+ 'size': 3,
+ 'opcode': 'mov rbx, rsi',
+ 'disasm': 'mov rbx, rsi',
+ 'bytes': '4889f3',
+ 'family': 'cpu',
+ 'type': 'mov',
+ 'reloc': 'False',
+ 'type_num': 9,
+ 'type2_num': 0
+ },
+ {
+ 'offset': 4294974153,
+ 'ptr': 4294985864,
+ 'esil': '0x2db8,rip,+,[8],rax,=',
+ 'refptr': 8,
+ 'fcn_addr': 4294974144,
+ 'fcn_last': 4294974155,
+ 'size': 7,
+ 'opcode': 'mov rax, qword [rip + 0x2db8]',
+ 'disasm': 'mov rax, qword [0x100004888]',
+ 'bytes': '488b05b82d0000',
+ 'family': 'cpu',
+ 'type': 'mov',
+ 'reloc': 'False',
+ 'type_num': 9,
+ 'type2_num': 0,
+ 'refs': [
+ {
+ 'addr': 4294985864,
+ 'type': 'DATA',
+ 'perm': 'r--'
+ }
+ ]
+ },
+ {
+ 'offset': 4294974160,
+ 'esil': 'rax,rip,=',
+ 'refptr': 0,
+ 'fcn_addr': 4294974144,
+ 'fcn_last': 4294974160,
+ 'size': 2,
+ 'opcode': 'jmp rax',
+ 'disasm': 'jmp rax',
+ 'bytes': 'ffe0',
+ 'family': 'cpu',
+ 'type': 'rjmp',
+ 'reloc': 'False',
+ 'type_num': 268435458,
+ 'type2_num': 0
+ }
+ ]
+ }
+ mkdir(cls.output_path)
+
+ @classmethod
+ def tearDownClass(cls) -> None:
+ rmtree(cls.output_path)
+
+ def test_sha3(self):
+ """Should return 64-character long string"""
+ asm = ("push rbp\n"
+ "mov rbp, rsp\n"
+ "push rbx\n"
+ "push rax\n"
+ "mov rbx, rsi\n"
+ "mov rax, qword [rip + CONST]\n"
+ "jmp rax")
+        self.assertRegex(_sha3(asm), '^[a-f0-9]{64}$')
+
+ def test_valid_exe_when_valid_magic_bytes(self):
+ """Should return boolean"""
+ magic_bytes = ["cffaedfe"]
+ self.assertEqual(_valid_exe(self.data_path, magic_bytes), True)
+
+ def test_valid_exe_when_not_valid_magic_bytes(self):
+ """Should return boolean"""
+ magic_bytes = ["cafebabe"]
+ self.assertEqual(_valid_exe(self.data_path, magic_bytes), False)
+
+ def test_normalize_when_offset(self):
+ """Should return normalized opcode"""
+ opcode = "mov rax, qword [rip + 0x2db8]"
+ expected_norm_opcode = "mov rax, qword [rip + CONST]"
+ self.assertEqual(_normalize(opcode), expected_norm_opcode)
+
+ def test_normalize_when_no_offset(self):
+ """Should return normalized opcode"""
+ opcode = "mov rbx, rsi"
+ expected_norm_opcode = "mov rbx, rsi"
+ self.assertEqual(_normalize(opcode), expected_norm_opcode)
+
+ def test_fn_to_asm_returns_empty_string_when_pdf_none(self):
+ """Should return assembly functions with normalized opcode"""
+ pdf = None
+ asm_min = 5
+ expected_asm = ""
+ self.assertEqual(_fn_to_asm(pdf, asm_min), expected_asm)
+
+ def test_fn_to_asm_returns_empty_string_when_pdfops_shorter_than_minlen(self):
+ """Should return assembly functions with normalized opcode"""
+ asm_minlen = 10
+ expected_asm = ""
+ self.assertEqual(_fn_to_asm(self.pdf_dict, asm_minlen), expected_asm)
+
+ def test_fn_to_asm_returns_expected_asm(self):
+ """Should return assembly functions with normalized opcode"""
+ asm_min = 5
+ expected_asm = (" push rbp\n"
+ " mov rbp, rsp\n"
+ " push rbx\n"
+ " push rax\n"
+ " mov rbx, rsi\n"
+ " mov rax, qword [rip + CONST]\n"
+ " jmp rax\n")
+ self.assertEqual(_fn_to_asm(self.pdf_dict, asm_min), expected_asm)
+
+ def test_bin_to_asm_returns_expected_number_of_disassembled_files(self):
+ asm_minlen = 5
+ magic_bytes = ["cffaedfe"]
+ self.assertEqual(bin_to_asm(Path(self.data_path), Path(self.output_path), asm_minlen, magic_bytes), 1)
+
+ def test_bin_to_asm_returns_expected_number_of_disassembled_files_when_pdfops_shorter_than_minlen(self):
+ asm_minlen = 10
+ magic_bytes = ['cffaedfe']
+ self.assertEqual(bin_to_asm(Path(self.data_path), Path(self.output_path), asm_minlen, magic_bytes), 0)
+
+ def test_convert_to_asm_returns_expected_sha1(self):
+ input_path = __data__
+ asm_minlen_upper = 10
+ asm_minlen_lower = 5
+ expected_sha1 = ["5cca32eb8f9c2a024a57ce12e3fb66070662de80"]
+ self.assertEqual(
+ convert_to_asm(input_path, self.output_path, asm_minlen_upper, asm_minlen_lower),
+ expected_sha1
+ )