diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000..446aa21 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,32 @@ +# This is a comment. +# Each line is a file pattern followed by one or more owners. + +# These owners will be the default owners for everything in +# the repo. Unless a later match takes precedence, +# @global-owner1 and @global-owner2 will be requested for +# review when someone opens a pull request. +* @wandera/datascience + +# Order is important; the last matching pattern takes the most +# precedence. When someone opens a pull request that only +# modifies JS files, only @js-owner and not the global +# owner(s) will be requested for a review. +# *.js @js-owner + +# You can also use email addresses if you prefer. They'll be +# used to look up users just like we do for commit author +# emails. +#*.go docs@example.com + +# The `docs/*` pattern will match files like +# `docs/getting-started.md` but not further nested files like +# `docs/build-app/troubleshooting.md`. +# docs/* docs@example.com + +# In this example, @octocat owns any file in an apps directory +# anywhere in your repository. +# apps/ @octocat + +# In this example, @doctocat owns any file in the `/docs` +# directory in the root of your repository. +# /docs/ @doctocat \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..cb6efa5 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.10.11-slim + +ADD . 
/asm2vec-pytorch +WORKDIR asm2vec-pytorch + +RUN apt-get update && apt-get install -y --no-install-recommends \ + unixodbc-dev \ + unixodbc \ + libpq-dev && \ + pip install -r requirements.txt && \ + python setup.py install + +CMD ["/bin/sh"] diff --git a/README.md b/README.md index 7a2043b..637d5db 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # asm2vec-pytorch -release 1.0.0 +release 1.0.3 mit python @@ -9,30 +9,17 @@ The details of the model can be found in the original paper: [(sp'19) Asm2Vec: B ## Requirements -python >= 3.6 - -| packages | for | -| --- | --- | -| r2pipe | `scripts/bin2asm.py` | -| click | `scripts/*` | -| torch | almost all code need it | - -You also need to install `radare2` to run `scripts/bin2asm.py`. `r2pipe` is just the python interface to `radare2` - -If you only want to use the library code, you just need to install `torch` +* python >= 3.10 +* radare2 +* Packages listed in `requirements.txt` ## Install ``` +pip install -r requirements.txt && python setup.py install ``` -or - -``` -pip install git+https://github.com/oalieno/asm2vec-pytorch.git -``` - ## Benchmark An implementation already exists here: [Lancern/asm2vec](https://github.com/Lancern/asm2vec) @@ -46,141 +33,20 @@ Following is the benchmark of training 1000 functions in 1 epoch. ## Get Started -```bash -python scripts/bin2asm.py -i /bin/ -o asm/ -``` - -First generate asm files from binarys under `/bin/`. -You can hit `Ctrl+C` anytime when there is enough data. - -```bash -python scripts/train.py -i asm/ -l 100 -o model.pt --epochs 100 -``` - -Try to train the model using only 100 functions and 100 epochs for a taste. -Then you can use more data if you want. - -```bash -python scripts/test.py -i asm/123456 -m model.pt -``` - -After you train your model, try to grab an assembly function and see the result. -This script will show you how the model perform. -Once you satisfied, you can take out the embedding vector of the function and do whatever you want with it. 
+### TODO - update this with description about to how use etc -## Usage +## Tests -### bin2asm.py +### Run test suite -``` -Usage: bin2asm.py [OPTIONS] +* Run all tests: ``python -m unittest discover -v`` +* Run a certain module's tests: ``python -m unittest -v test.test_binary_to_asm`` +* Run a certain test class: ``python -m unittest -v test.test_binary_to_asm.TestBinaryToAsm`` +* Run a certain test method: - Extract assembly functions from binary executable + ``python -m unittest -v test.test_binary_to_asm.TestBinaryToAsm.test_sha3`` -Options: - -i, --input TEXT input directory / file [required] - -o, --output TEXT output directory - -l, --len INTEGER ignore assembly code with instructions amount smaller - than minlen +### Coverage - --help Show this message and exit. -``` - -```bash -# Example -python bin2asm.py -i /bin/ -o asm/ -``` - -### train.py - -``` -Usage: train.py [OPTIONS] - -Options: - -i, --input TEXT training data folder [required] - -o, --output TEXT output model path [default: model.pt] - -m, --model TEXT load previous trained model path - -l, --limit INTEGER limit the number of functions to be loaded - -d, --ebedding-dimension INTEGER - embedding dimension [default: 100] - -b, --batch-size INTEGER batch size [default: 1024] - -e, --epochs INTEGER training epochs [default: 10] - -n, --neg-sample-num INTEGER negative sampling amount [default: 25] - -a, --calculate-accuracy whether calculate accuracy ( will be - significantly slower ) - - -c, --device TEXT hardware device to be used: cpu / cuda / - auto [default: auto] - - -lr, --learning-rate FLOAT learning rate [default: 0.02] - --help Show this message and exit. 
-``` - -```bash -# Example -python train.py -i asm/ -o model.pt --epochs 100 -``` - -### test.py - -``` -Usage: test.py [OPTIONS] - -Options: - -i, --input TEXT target function [required] - -m, --model TEXT model path [required] - -e, --epochs INTEGER training epochs [default: 10] - -n, --neg-sample-num INTEGER negative sampling amount [default: 25] - -l, --limit INTEGER limit the amount of output probability result - -c, --device TEXT hardware device to be used: cpu / cuda / auto - [default: auto] - - -lr, --learning-rate FLOAT learning rate [default: 0.02] - -p, --pretty pretty print table [default: False] - --help Show this message and exit. -``` - -```bash -# Example -python test.py -i asm/123456 -m model.pt -``` - -``` -┌──────────────────────────────────────────┐ -│ endbr64 │ -│ ➔ push r15 │ -│ push r14 │ -├────────┬─────────────────────────────────┤ -│ 34.68% │ [rdx + rsi*CONST + CONST] │ -│ 20.29% │ push │ -│ 16.22% │ r15 │ -│ 04.36% │ r14 │ -│ 03.55% │ r11d │ -└────────┴─────────────────────────────────┘ -``` - -### compare.py - -``` -Usage: compare.py [OPTIONS] - -Options: - -i1, --input1 TEXT target function 1 [required] - -i2, --input2 TEXT target function 2 [required] - -m, --model TEXT model path [required] - -e, --epochs INTEGER training epochs [default: 10] - -c, --device TEXT hardware device to be used: cpu / cuda / auto - [default: auto] - - -lr, --learning-rate FLOAT learning rate [default: 0.02] - --help Show this message and exit. -``` - -```bash -# Example -python compare.py -i1 asm/123456 -i2 asm/654321 -m model.pt -e 30 -``` - -``` -cosine similarity : 0.873684 -``` +* Create report: ``coverage run -m unittest discover -v`` +* Read report: ``coverage report -m`` \ No newline at end of file diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..c478391 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,26 @@ +Thanks for helping make GitHub safe for everyone. 
+ +# Security + +Jamf takes the security of our software products and services seriously, including all of the open source code repositories managed through our GitHub organizations, such as asm2vec-pytorch. + +We will ensure that your finding gets passed along to the appropriate maintainers for remediation. + +# Reporting Security Issues + +If you believe you have found a security vulnerability in any Jamf-owned repository, please report it to us through coordinated disclosure. + +Please do not report security vulnerabilities through public GitHub issues, discussions, or pull requests. + +Instead, please send an email to info[@]jamf.com. + +Please include as much of the information listed below as you can to help us better understand and resolve the issue: +- The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting) +- Full paths of source file(s) related to the manifestation of the issue +- The location of the affected source code (tag/branch/commit or direct URL) +- Any special configuration required to reproduce the issue +- Step-by-step instructions to reproduce the issue +- Proof-of-concept or exploit code (if possible) +- Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. 
diff --git a/asm2vec/__init__.py b/asm2vec/__init__.py index 0962ef8..6e9d963 100644 --- a/asm2vec/__init__.py +++ b/asm2vec/__init__.py @@ -1,6 +1,9 @@ -import importlib +import os -__all__ = ['model', 'datatype', 'utils'] +__home__ = os.path.dirname(os.path.abspath(__path__[0])) +__data__ = os.path.join(__home__, "data") -for module in __all__: - importlib.import_module(f'.{module}', 'asm2vec') +__all__ = [ + "__data__", "__home__", "binary_to_asm", "data", "datatype", "model", "similarity", "tensors", "test", "train", + "utilities", "version" +] diff --git a/asm2vec/binary_to_asm.py b/asm2vec/binary_to_asm.py new file mode 100644 index 0000000..1da1389 --- /dev/null +++ b/asm2vec/binary_to_asm.py @@ -0,0 +1,168 @@ +import re +import os +import hashlib +import r2pipe +import logging +from pathlib import Path + +logging.basicConfig(level=logging.INFO, format='%(message)s') + + +def _sha3(asm: str) -> str: + """ + Produces SHA3 for each assembly function + :param asm: Input assembly function + :return: Hashed string + """ + return hashlib.sha3_256(asm.encode()).hexdigest() + + +def _valid_exe(filename: str, magic_bytes: list[str]) -> bool: + """ + Extracts magic bytes and returns the header + :param filename: Name of the malware file (SHA1) + :param magic_bytes: For the specific OS/type of binary + :return: Boolean of the header existing in magic bytes + """ + magics = [bytes.fromhex(i) for i in magic_bytes] + with open(filename, 'rb') as f: + header = f.read(4) + return header in magics + + +def _normalize(opcode: str) -> str: + """ + Normalizes the input opcode string + :param opcode: Opcode of the binary + :return Normalized opcode string + """ + opcode = opcode.replace(' - ', ' + ') + opcode = re.sub(r'0x[0-9a-f]+', 'CONST', opcode) + opcode = re.sub(r'\*[0-9]', '*CONST', opcode) + opcode = re.sub(r' [0-9]', ' CONST', opcode) + return opcode + + +def _fn_to_asm(pdf: dict | None, asm_minlen: int) -> str: + """ + Converts functions to assembly code + :param pdf: 
disassembly + :param asm_minlen: minimum length of assembly functions to be extracted + :return: ASM string + """ + if pdf is None: + return '' + if len(pdf['ops']) < asm_minlen: + return '' + if 'invalid' in [op['type'] for op in pdf['ops']]: + return '' + + ops = pdf['ops'] + + labels, scope = {}, [op['offset'] for op in ops] + assert (None not in scope) + for i, op in enumerate(ops): + if op.get('jump') in scope: + labels.setdefault(op.get('jump'), i) + + output = '' + for op in ops: + if labels.get(op.get('offset')) is not None: + output += f'LABEL{labels[op["offset"]]}:\n' + if labels.get(op.get('jump')) is not None: + output += f' {op["type"]} LABEL{labels[op["jump"]]}\n' + else: + output += f' {_normalize(op["opcode"])}\n' + + return output + + +def bin_to_asm(filename: Path, output_path: Path, asm_minlen: int, magic_bytes: list[str]) -> int: + """ + Fragments the input binary into assembly functions via r2pipe + :param filename: name of the malware file (SHA1) + :param output_path: path to the folder to store the assembly functions for each malware + :param asm_minlen: the minimum length of assembly functions to be extracted + :param magic_bytes for the specific OS/type of binary + :return: the number of assembly functions + """ + if not _valid_exe(filename, magic_bytes): + logging.info('The input file is invalid.') + return 0 + + r = r2pipe.open(str(filename)) + r.cmd('aaaa') + + count = 0 + + for fn in r.cmdj('aflj'): + r.cmd(f's {fn["offset"]}') + asm = _fn_to_asm(r.cmdj('pdfj'), asm_minlen) + if asm: + uid = _sha3(asm) + asm = f''' .name {fn["name"]}\ + .offset {fn["offset"]:016x}\ + .file {filename.name}''' + asm + output_asm = os.path.join(output_path, uid) + with open(output_asm, 'w') as file: + file.write(asm) + count += 1 + return count + + +def convert_to_asm( + input_path: str, output_path: str, minlen_upper: int, minlen_lower: int, magic_bytes: list[str] = None +) -> list: + """ + Extracts assembly functions from malware files and saves them 
into separate folder per binary + :param input_path: Path to the malware binaries + :param output_path: Path for the assembly functions to be extracted + :param minlen_upper: Minimum number of assembly functions needed for disassembling + :param minlen_lower: If disassembling is not possible with minlen_upper, lower the minimum number of assembly + functions to minlen_lower (WHAT?) + :param magic_bytes: List of valid for the specific OS/type of binary, e.g. + - 'cffaedfe': for Mach-O Little Endian (64-bit) + - 'feedfacf': for Mach-O Big Endian (64-bit) + - 'cefaedfe': for Mach-O Little Endian (32-bit) + - 'feedface': Mach-O Big Endian (32-bit) + - 'cafebabe': Universal Binary Big Endian + - 'bebafeca' + :return: List of sha1 of disassembled malware files + """ + if not magic_bytes: + magic_bytes = ['cffaedfe', 'feedfacf', 'cafebabe', 'cefaedfe', 'feedface', 'bebafeca'] + + binary_dir = Path(input_path) + asm_dir = Path(output_path) + + if not os.path.exists(asm_dir): + os.mkdir(asm_dir) + + function_count, binary_count, not_found = 0, 0, 0 + disassembled_bins = [] + + if os.path.isdir(binary_dir): + for entry in os.scandir(binary_dir): + out_dir = os.path.join(asm_dir, entry.name) + if not (os.path.exists(out_dir)): + os.mkdir(out_dir) + function_count = bin_to_asm(Path(entry), Path(out_dir), minlen_upper, magic_bytes) + if function_count == 0: + function_count = bin_to_asm(Path(entry), Path(out_dir), minlen_lower, magic_bytes) + if function_count == 0: + os.rmdir(out_dir) + logging.info('The binary {} was not disassembled'.format(entry.name)) + else: + binary_count += 1 + disassembled_bins.append(entry.name) + else: + binary_count += 1 + disassembled_bins.append(entry.name) + else: + not_found += 1 + logging.info("[Error] No such file or directory: {}".format(binary_dir)) + + logging.info("Total scanned binaries: {}".format(binary_count)) + logging.info("Not converted binaries: {}".format(not_found)) + + return disassembled_bins diff --git a/asm2vec/data.py 
class AsmDataset(Dataset):
    """Pair two equally indexed sequences as a map-style ``Dataset``.

    Item ``i`` is the tuple ``(x[i], y[i])`` and the dataset length is ``len(x)``.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        return self.x[index], self.y[index]


def load_data(paths, limit=None):
    """Load assembly function files and collect their tokens.

    :param paths: A single path or a list/tuple of paths. A directory contributes its
        regular files in sorted name order (not recursive); any other path is used as-is.
    :param limit: Maximum number of files to load; a falsy value means no limit
    :return: Tuple ``(functions, tokens)`` with the loaded ``Function`` objects and the
        ``Tokens`` vocabulary built from them
    """
    # A tuple of paths must be treated like a list, not wrapped as one bogus path.
    if not isinstance(paths, (list, tuple)):
        paths = [paths]

    filenames = []
    for path in paths:
        if os.path.isdir(path):
            directory = Path(path)
            filenames.extend(
                directory / name for name in sorted(os.listdir(path)) if os.path.isfile(directory / name)
            )
        else:
            filenames.append(Path(path))

    if limit:
        # A non-positive limit loads nothing, as the per-file check it replaces did.
        filenames = filenames[:max(limit, 0)]

    functions, tokens = [], Tokens()
    for filename in filenames:
        with open(filename) as f:
            fn = Function.load(f.read())
        functions.append(fn)
        tokens.add(fn.tokens())

    return functions, tokens
self.tokens} + def size(self): return len(self.tokens) + def add(self, names): self._weights = None if type(names) is not list: @@ -46,6 +56,7 @@ def add(self, names): self.tokens.append(token) else: self.tokens[self.name_to_index[name]].count += 1 + def update(self, tokens_new): for token in tokens_new: if token.name not in self.name_to_index: @@ -54,6 +65,7 @@ def update(self, tokens_new): self.tokens.append(token) else: self.tokens[self.name_to_index[token.name]].count += token.count + def weights(self): # if no cache, calculate if self._weights is None: @@ -62,19 +74,22 @@ def weights(self): for token in self.tokens: self._weights[token.index] = (token.count / total) ** 0.75 return self._weights + def sample(self, batch_size, num=5): return torch.multinomial(self.weights(), num * batch_size, replacement=True).view(batch_size, num) + class Function: def __init__(self, insts, blocks, meta): self.insts = insts self.blocks = blocks self.meta = meta + @classmethod def load(cls, text): - ''' - gcc -S format compatiable - ''' + """gcc -S format compatible + """ + label, labels, insts, blocks, meta = None, {}, [], [], {} for line in text.strip('\n').split('\n'): if line[0] in [' ', '\t']: @@ -109,10 +124,13 @@ def load(cls, text): if labels.get(arg): inst.args[i] = 'CONST' return cls(insts, blocks, meta) + def tokens(self): return [token for inst in self.insts for token in inst.tokens()] + def random_walk(self, num=3): return [self._random_walk() for _ in range(num)] + def _random_walk(self): current, visited, seq = self.blocks[0], [], [] while current not in visited: @@ -124,25 +142,31 @@ def _random_walk(self): current = random.choice(list(current.successors)) return seq + class BasicBlock: def __init__(self): self.insts = [] self.successors = set() + def add(self, inst): self.insts.append(inst) + def end(self): inst = self.insts[-1] return inst.is_jmp() or inst.op == 'ret' + class Instruction: def __init__(self, op, args): self.op = op self.args = args + def 
bce, sigmoid, softmax = nn.BCELoss(), nn.Sigmoid(), nn.Softmax(dim=1)


class ASM2VEC(nn.Module):
    """Embedding model holding token, function and output ("r") embedding tables.

    Token embeddings have ``embedding_size`` columns; function and output embeddings have
    ``2 * embedding_size`` columns.
    """

    def __init__(self, vocab_size, function_size, embedding_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size, _weight=torch.zeros(vocab_size, embedding_size))
        self.embeddings_f = nn.Embedding(function_size, 2 * embedding_size,
                                         _weight=self._random_weight(function_size, embedding_size))
        self.embeddings_r = nn.Embedding(vocab_size, 2 * embedding_size,
                                         _weight=self._random_weight(vocab_size, embedding_size))

    @staticmethod
    def _random_weight(rows, embedding_size):
        """Return a (rows, 2 * embedding_size) tensor of small uniform values centred on zero."""
        return (torch.rand(rows, 2 * embedding_size) - 0.5) / embedding_size / 2

    def update(self, function_size_new, vocab_size_new):
        """Grow the vocabulary tables and re-initialise the function table.

        Existing token/output rows are kept and new rows are appended when the vocabulary
        size changes. The function embedding table is always replaced by a freshly
        initialised one with ``function_size_new`` rows.
        """
        device = self.embeddings.weight.device
        vocab_size, embedding_size = self.embeddings.num_embeddings, self.embeddings.embedding_dim
        if vocab_size_new != vocab_size:
            added = vocab_size_new - vocab_size
            weight = torch.cat([self.embeddings.weight, torch.zeros(added, embedding_size).to(device)])
            self.embeddings = nn.Embedding(vocab_size_new, embedding_size, _weight=weight)
            weight_r = torch.cat([self.embeddings_r.weight, self._random_weight(added, embedding_size).to(device)])
            self.embeddings_r = nn.Embedding(vocab_size_new, 2 * embedding_size, _weight=weight_r)
        self.embeddings_f = nn.Embedding(function_size_new, 2 * embedding_size,
                                         _weight=self._random_weight(function_size_new, embedding_size).to(device))

    def v(self, inp):
        """Build the context vector for each row of ``inp``.

        Column 0 of ``inp`` indexes the function table; columns 1-3 and 4-6 index the token
        table for the two neighbouring instructions (first token concatenated with the mean
        of the other two).

        :return: Tensor of shape (batch, 2 * embedding_size, 1)
        """
        e = self.embeddings(inp[:, 1:])
        v_f = self.embeddings_f(inp[:, 0])
        v_prev = torch.cat([e[:, 0], (e[:, 1] + e[:, 2]) / 2], dim=1)
        v_next = torch.cat([e[:, 3], (e[:, 4] + e[:, 5]) / 2], dim=1)
        return ((v_f + v_prev + v_next) / 3).unsqueeze(2)

    def forward(self, inp, pos, neg):
        """Return the negative-sampling loss (binary cross entropy) for one batch.

        :param inp: Context indices, see :meth:`v`
        :param pos: Token indices of the target instruction, labelled 1
        :param neg: Sampled negative token indices, labelled 0
        """
        device, batch_size = inp.device, inp.shape[0]
        v = self.v(inp)
        # Squeeze only the trailing axis: a bare squeeze() also drops the batch axis when
        # the batch holds a single row, which makes BCELoss reject the shape mismatch.
        pred = torch.bmm(self.embeddings_r(torch.cat([pos, neg], dim=1)), v).squeeze(dim=2)
        label = torch.cat(
            [torch.ones(batch_size, pos.shape[1], device=device),
             torch.zeros(batch_size, neg.shape[1], device=device)],
            dim=1
        )
        return bce(sigmoid(pred), label)

    def predict(self, inp, pos):
        """Return, per row of ``inp``, a softmax distribution over the whole vocabulary.

        ``pos`` is accepted for call compatibility but is not used.
        """
        device, batch_size = inp.device, inp.shape[0]
        v = self.v(inp)
        all_tokens = torch.arange(self.embeddings_r.num_embeddings).repeat(batch_size, 1).to(device)
        probs = torch.bmm(self.embeddings_r(all_tokens), v).squeeze(dim=2)
        return softmax(probs)


def save_model(path: str, model: ASM2VEC, tokens: "Tokens") -> None:
    """Write the model weights, its constructor sizes and the token vocabulary to ``path``."""
    torch.save(
        {
            'model_params': (
                model.embeddings.num_embeddings,
                model.embeddings_f.num_embeddings,
                model.embeddings.embedding_dim
            ),
            'model': model.state_dict(),
            'tokens': tokens.state_dict(),
        },
        path
    )


def load_model(path: str, device: str = 'cpu') -> "tuple[ASM2VEC, Tokens]":
    """Load a checkpoint written by :func:`save_model`.

    :param path: Checkpoint file
    :param device: Device the tensors are mapped to and the model is moved to
    :return: Tuple ``(model, tokens)``
    """
    # NOTE(review): torch.load unpickles the checkpoint, so only load files from a trusted
    # source. The 'tokens' entry is whatever Tokens.state_dict() stored - confirm it loads
    # under the torch version in use before restricting to weights-only loading.
    checkpoint = torch.load(path, map_location=device)
    tokens = Tokens()
    tokens.load_state_dict(checkpoint['tokens'])
    model = ASM2VEC(*checkpoint['model_params'])
    model.load_state_dict(checkpoint['model'])
    model = model.to(device)
    return model, tokens
1 + :param data_path_2: the path to the assembly function no. 2 + :param model_path: the path to the trained asm2vec model + :param epochs: the number of epochs for calculating the tensor representations; (Optional, default = 10) + :param device: 'auto' | 'cuda' | 'cpu' (Optional, default 'cpu') + :param learning_rate: learning rate; (Optional; default = 0.02) + :return the cosine similarity value + """ + if device == "auto": + device = "cuda" if torch.cuda.is_available() else "cpu" + + model, tokens = load_model(model_path, device=device) + functions, tokens_new = load_data([data_path_1, data_path_2]) + tokens.update(tokens_new) + model.update(2, tokens.size()) + model = model.to(device) + + model = train( + functions, + tokens, + model=model, + epochs=epochs, + device=device, + mode="update", + learning_rate=learning_rate + ) + + v1, v2 = model.to("cpu").embeddings_f(torch.tensor([0, 1])) + similarity = cosine_similarity(v1, v2) + print(f"Cosine similarity : {similarity:.6f}") + + return similarity diff --git a/asm2vec/tensors.py b/asm2vec/tensors.py new file mode 100644 index 0000000..78a356e --- /dev/null +++ b/asm2vec/tensors.py @@ -0,0 +1,73 @@ +import os +import torch +import logging +from pathlib import Path + +from asm2vec.train import train, load_model, load_data + +logging.basicConfig(level=logging.INFO, format='%(message)s') + + +def calc_tensors( + asm_path: str, tensor_path: str, model_path: str, epochs: int, device: str = 'cpu', learning_rate: float = 0.02 +) -> list: + """ + Calculates vector representation of a binary as the mean per column of the vector representations of its assembly + functions. 
+ :param asm_path: Path to folder with assembly function in a sub-folder per binary + :param tensor_path: Path to folder to store the tensors + :param model_path: Path to the trained model + :param epochs: Number of epochs + :param device: 'auto' | 'cuda' | 'cpu' + :param learning_rate: Learning rate + :return: List of tensors + """ + tensors_list = [] + if device == 'auto': + device = 'cuda' if torch.cuda.is_available() else 'cpu' + + if os.path.isfile(model_path): + model, tokens = load_model(model_path, device=device) + else: + print("No valid model") + return [] + + dir0 = Path(tensor_path) + if not (os.path.exists(dir0)): + os.mkdir(dir0) + + if os.path.isdir(asm_path): + obj = os.scandir(asm_path) + for entry in obj: + if entry.is_dir() and os.listdir(entry) and entry.name: + tensor_file = os.path.join(dir0, entry.name) + if not (os.path.exists(tensor_file)): + functions, tokens_new = load_data([entry]) + file_count = sum(len(files) for _, _, files in os.walk(entry)) + tokens.update(tokens_new) + logging.info(f"Binary {entry.name}: {file_count} assembly functions") + model.update(file_count, tokens.size()) + model = model.to(device) + + model = train( + functions, + tokens, + model=model, + epochs=epochs, + device=device, + mode='update', + learning_rate=learning_rate + ) + + tensor = model.to('cpu').embeddings_f(torch.tensor([list(range(0, file_count))])) + tens = torch.squeeze(tensor) + if file_count == 1: + torch.save(tensor, tensor_file) + else: + torch.save(tens.mean(0), tensor_file) + tensors_list.append(entry.name) + + else: + logging.info("No valid directory") + + return tensors_list diff --git a/asm2vec/test.py b/asm2vec/test.py new file mode 100644 index 0000000..b80cc14 --- /dev/null +++ b/asm2vec/test.py @@ -0,0 +1,39 @@ +import torch + +from asm2vec.data import load_data +from asm2vec.model import load_model +from asm2vec.train import train, preprocess +from asm2vec.utilities import show_probs + + +def test_model( + data_path: str, model_path: 
str, epochs: int = 10, neg_sample_num: int = 25, limit: int | None = None, + device: str = "cpu", learning_rate: float = 0.02, pretty: bool = False +) -> None: + # TODO - doc string + if device == "auto": + device = "cuda" if torch.cuda.is_available() else "cpu" + + # load model, tokens + model, tokens = load_model(model_path, device=device) + functions, tokens_new = load_data(data_path) + tokens.update(tokens_new) + model.update(1, tokens.size()) + model = model.to(device) + + # train function embedding + model = train( + functions, + tokens, + model=model, + epochs=epochs, + neg_sample_num=neg_sample_num, + device=device, + mode="update", + learning_rate=learning_rate + ) + + # show predicted probability results + x, y = preprocess(functions, tokens) + probs = model.predict(x.to(device), y.to(device)) + show_probs(x, y, probs, tokens, limit=limit, pretty=pretty) diff --git a/asm2vec/train.py b/asm2vec/train.py new file mode 100644 index 0000000..4de7a81 --- /dev/null +++ b/asm2vec/train.py @@ -0,0 +1,134 @@ +import time +import torch +from pathlib import Path +from torch.utils.data import DataLoader +from asm2vec.data import AsmDataset, load_data +from asm2vec.datatype import Function, Tokens +from asm2vec.model import ASM2VEC, load_model, save_model +from asm2vec.utilities import accuracy, callback + + +def preprocess(functions, tokens): + x, y = [], [] + for i, fn in enumerate(functions): + for seq in fn.random_walk(): + for j in range(1, len(seq) - 1): + x.append([i] + [tokens[token].index for token in seq[j - 1].tokens() + seq[j + 1].tokens()]) + y.append([tokens[token].index for token in seq[j].tokens()]) + return torch.tensor(x), torch.tensor(y) + + +def train( + functions: list[Function], tokens: Tokens, model: ASM2VEC | None = None, embedding_size: int = 100, + batch_size: int = 1024, epochs: int = 10, neg_sample_num: int = 25, calc_acc: bool = False, device: str = 'cpu', + mode: str = 'train', verbose: bool = False, learning_rate: float = 0.02 +): + 
"""This function trains a model on the given assembly functions and tokens + :param functions: list of assembly functions + :param tokens: tokens (operations, operands) of the assembly function + :param model: type of the model; ; (Optional, default ASM2VEC) + :param embedding_size: size of the tensor representation of an assembly function; (Optional, default value = 100) + :param batch_size: size of the batch for each epoch of training; (Optional, default value = 1024) + :param epochs: number of epochs for training the model; (Optional, default value = 10) + :param neg_sample_num: size of the negative sample; (Optional, default value = 25) + :param calc_acc: if set to True, the accuracy per training epoch is displayed; (Optional, default False) + :param device: the device used for processing; (Optional, default 'cpu') + :param mode: 'train' (to train a new model) | 'update' (to add to an already trained model's dictionary); + (Optional, default 'train') + :param verbose: if True performs training in verbose mode; (Optional, default False) + :param learning_rate: learning rate + """ + if mode == 'train': + if model is None: + model = ASM2VEC(tokens.size(), function_size=len(functions), embedding_size=embedding_size).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) + elif mode == 'update': + if model is None: + raise ValueError("Update mode requires a pretrained model") + optimizer = torch.optim.Adam(model.embeddings_f.parameters(), lr=learning_rate) + else: + raise ValueError("Unknown mode") + + loader = DataLoader(AsmDataset(*preprocess(functions, tokens)), batch_size=batch_size, shuffle=True) + for epoch in range(epochs): + start = time.time() + loss_sum, loss_count, accs = 0.0, 0, [] + + model.train() + for i, (inp, pos) in enumerate(loader): + neg = tokens.sample(inp.shape[0], neg_sample_num) + loss = model(inp.to(device), pos.to(device), neg.to(device)) + loss_sum, loss_count = loss_sum + loss, loss_count + 1 + + 
optimizer.zero_grad() + loss.backward() + optimizer.step() + + if i == 0 and calc_acc: + probs = model.predict(inp.to(device), pos.to(device)) + accs.append(accuracy(pos, probs)) + + if verbose: + callback({ + 'model': model, + 'tokens': tokens, + 'epoch': epoch, + 'time': time.time() - start, + 'loss': loss_sum / loss_count, + 'accuracy': torch.tensor(accs).mean() if calc_acc else None + }) + + return model + + +def train_asm2vec_model( + train_set: str, new_model: str, model_path: str | None, epochs: int, limit: int | None = None, + calc_acc: bool = False, embedding_size: int = 100, batch_size: int = 1024, neg_sample: int = 25, + learning_rate: float = 0.02, device: str = 'cpu' +) -> ASM2VEC: + # TODO - this is just a wrapper - can we do this smarter? + """Trains an ASM2VEC model + :param train_set: path to the training dataset + :param new_model: path to the model to be trained + :param model_path: path to already trained model + :param limit: number of the assembly functions that the model will be trained on; if not defined, all the assembly + functions in train_set_path + :param epochs: number of epochs + :param calc_acc: displays the accuracy per training epoch; setting it to True will slow down the training + :param embedding_size: size of the vector representation for a token; an assembly function will be represented + with a vector twice that size + :param batch_size: the size of batches for training + :param neg_sample: negative sampling amount + :param device: 'auto' | 'cuda' | 'cpu' + :param learning_rate: learning rate + :return an ASM2VEC model + """ + + if device == 'auto': + device = 'cuda' if torch.cuda.is_available() else 'cpu' + + if model_path: + model, tokens = load_model(model_path, device=device) + functions, tokens_new = load_data(train_set, limit=limit) + tokens.update(tokens_new) + model.update(len(functions), tokens.size()) + else: + model = None + functions, tokens = load_data(Path(train_set), limit=limit) + + model = train( + functions, 
+ tokens, + model=model, + embedding_size=embedding_size, + batch_size=batch_size, + epochs=epochs, + neg_sample_num=neg_sample, + calc_acc=calc_acc, + device=device, + verbose=True, + learning_rate=learning_rate + ) + save_model(new_model, model, tokens) + + return model diff --git a/asm2vec/utilities.py b/asm2vec/utilities.py new file mode 100644 index 0000000..dd39aac --- /dev/null +++ b/asm2vec/utilities.py @@ -0,0 +1,55 @@ +import logging +import torch + +from asm2vec.datatype import Instruction + +logging.basicConfig(level=logging.INFO, format='%(message)s') + + +# TODO - Why do we have both logging and print? +# TODO - Doc strings + +def accuracy(y, probs): + return torch.mean(torch.tensor([torch.sum(probs[i][yi]) for i, yi in enumerate(y)])) + + +def callback(context) -> None: + """Prettifies the display of accuracy, if chosen + """ + progress = f'{context["epoch"]} | time = {context["time"]:.2f},\ + loss = {context["loss"]:.4f}' + + if context["accuracy"]: + progress += f', accuracy = {context["accuracy"]:.4f}' + logging.info(f"{progress}") + + +def show_probs(x, y, probs, tokens, limit=None, pretty=False): + if pretty: + tl, tr, bl, br = '┌', '┐', '└', '┘' + lm, rm, tm, bm = '├', '┤', '┬', '┴' + h, v = '─', '│' + arrow = ' ➔' + else: + tl, tr, bl, br = '+', '+', '+', '+' + lm, rm, tm, bm = '+', '+', '+', '+' + h, v = '-', '|' + arrow = '->' + top = probs.topk(5) + for i, (xi, yi) in enumerate(zip(x, y)): + if limit and i >= limit: + break + xi, yi = xi.tolist(), yi.tolist() + print(tl + h * 42 + tr) + print(f'{v} {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {v}') + print(f'{v} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {v}') + print(f'{v} {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {v}') + print(lm + h * 8 + tm + h * 33 + rm) + for value, index in zip(top.values[i], top.indices[i]): + if index in yi: + colorbegin, colorclear = '\033[92m', '\033[0m' + else: + colorbegin, colorclear = '', '' + print(f'{v} 
{colorbegin}{value * 100:05.2f}%{colorclear} {v} {colorbegin}' + f'{tokens[index.item()].name:31}{colorclear} {v}') + print(bl + h * 8 + bm + h * 33 + br) diff --git a/asm2vec/utils.py b/asm2vec/utils.py deleted file mode 100644 index 4f9aa25..0000000 --- a/asm2vec/utils.py +++ /dev/null @@ -1,156 +0,0 @@ -import os -import time -import torch -from torch.utils.data import DataLoader, Dataset -from pathlib import Path -from .datatype import Tokens, Function, Instruction -from .model import ASM2VEC - -class AsmDataset(Dataset): - def __init__(self, x, y): - self.x = x - self.y = y - def __len__(self): - return len(self.x) - def __getitem__(self, index): - return self.x[index], self.y[index] - -def load_data(paths, limit=None): - if type(paths) is not list: - paths = [paths] - - filenames = [] - for path in paths: - if os.path.isdir(path): - filenames += [Path(path) / filename for filename in sorted(os.listdir(path)) if os.path.isfile(Path(path) / filename)] - else: - filenames += [Path(path)] - - functions, tokens = [], Tokens() - for i, filename in enumerate(filenames): - if limit and i >= limit: - break - with open(filename) as f: - fn = Function.load(f.read()) - functions.append(fn) - tokens.add(fn.tokens()) - - return functions, tokens - -def preprocess(functions, tokens): - x, y = [], [] - for i, fn in enumerate(functions): - for seq in fn.random_walk(): - for j in range(1, len(seq) - 1): - x.append([i] + [tokens[token].index for token in seq[j-1].tokens() + seq[j+1].tokens()]) - y.append([tokens[token].index for token in seq[j].tokens()]) - return torch.tensor(x), torch.tensor(y) - -def train( - functions, - tokens, - model=None, - embedding_size=100, - batch_size=1024, - epochs=10, - neg_sample_num=25, - calc_acc=False, - device='cpu', - mode='train', - callback=None, - learning_rate=0.02 -): - if mode == 'train': - if model is None: - model = ASM2VEC(tokens.size(), function_size=len(functions), embedding_size=embedding_size).to(device) - optimizer = 
torch.optim.Adam(model.parameters(), lr=learning_rate) - elif mode == 'test': - if model is None: - raise ValueError("test mode required pretrained model") - optimizer = torch.optim.Adam(model.embeddings_f.parameters(), lr=learning_rate) - else: - raise ValueError("Unknown mode") - - loader = DataLoader(AsmDataset(*preprocess(functions, tokens)), batch_size=batch_size, shuffle=True) - for epoch in range(epochs): - start = time.time() - loss_sum, loss_count, accs = 0.0, 0, [] - - model.train() - for i, (inp, pos) in enumerate(loader): - neg = tokens.sample(inp.shape[0], neg_sample_num) - loss = model(inp.to(device), pos.to(device), neg.to(device)) - loss_sum, loss_count = loss_sum + loss, loss_count + 1 - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - if i == 0 and calc_acc: - probs = model.predict(inp.to(device), pos.to(device)) - accs.append(accuracy(pos, probs)) - - if callback: - callback({ - 'model': model, - 'tokens': tokens, - 'epoch': epoch, - 'time': time.time() - start, - 'loss': loss_sum / loss_count, - 'accuracy': torch.tensor(accs).mean() if calc_acc else None - }) - - return model - -def save_model(path, model, tokens): - torch.save({ - 'model_params': ( - model.embeddings.num_embeddings, - model.embeddings_f.num_embeddings, - model.embeddings.embedding_dim - ), - 'model': model.state_dict(), - 'tokens': tokens.state_dict(), - }, path) - -def load_model(path, device='cpu'): - checkpoint = torch.load(path, map_location=device) - tokens = Tokens() - tokens.load_state_dict(checkpoint['tokens']) - model = ASM2VEC(*checkpoint['model_params']) - model.load_state_dict(checkpoint['model']) - model = model.to(device) - return model, tokens - -def show_probs(x, y, probs, tokens, limit=None, pretty=False): - if pretty: - TL, TR, BL, BR = '┌', '┐', '└', '┘' - LM, RM, TM, BM = '├', '┤', '┬', '┴' - H, V = '─', '│' - arrow = ' ➔' - else: - TL = TR = BL = BR = '+' - LM = RM = TM = BM = '+' - H, V = '-', '|' - arrow = '->' - top = probs.topk(5) - for 
i, (xi, yi) in enumerate(zip(x, y)): - if limit and i >= limit: - break - xi, yi = xi.tolist(), yi.tolist() - print(TL + H * 42 + TR) - print(f'{V} {str(Instruction(tokens[xi[1]], tokens[xi[2:4]])):37} {V}') - print(f'{V} {arrow} {str(Instruction(tokens[yi[0]], tokens[yi[1:3]])):37} {V}') - print(f'{V} {str(Instruction(tokens[xi[4]], tokens[xi[5:7]])):37} {V}') - print(LM + H * 8 + TM + H * 33 + RM) - for value, index in zip(top.values[i], top.indices[i]): - if index in yi: - colorbegin, colorclear = '\033[92m', '\033[0m' - else: - colorbegin, colorclear = '', '' - print(f'{V} {colorbegin}{value*100:05.2f}%{colorclear} {V} {colorbegin}{tokens[index.item()].name:31}{colorclear} {V}') - print(BL + H * 8 + BM + H * 33 + BR) - -def accuracy(y, probs): - return torch.mean(torch.tensor([torch.sum(probs[i][yi]) for i, yi in enumerate(y)])) - diff --git a/asm2vec/version.py b/asm2vec/version.py new file mode 100644 index 0000000..c85dc7e --- /dev/null +++ b/asm2vec/version.py @@ -0,0 +1,4 @@ +VERSION = '1.0.3' +DEV_VERSION = '0' + +radare2_version = "5.8.8" diff --git a/catalog-info.yaml b/catalog-info.yaml new file mode 100644 index 0000000..378ab88 --- /dev/null +++ b/catalog-info.yaml @@ -0,0 +1,15 @@ +apiVersion: backstage.io/v1alpha1 +kind: Component +metadata: + name: asm2vec-pytorch + description: All code running ASM2VEC using PyTorch + labels: + - jira-key: DATASCI + - language: Python + annotations: + backstage.io/source-location: url:https://github.com/wandera/asm2vec-pytorch +spec: + type: service + lifecycle: production + owner: datascience + system: datascience diff --git a/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 b/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 new file mode 100644 index 0000000..208607f Binary files /dev/null and b/data/5cca32eb8f9c2a024a57ce12e3fb66070662de80 differ diff --git a/requirements.txt b/requirements.txt index d92495b..3163633 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,2 @@ torch>=1.7,<2 -click>=7.1,<8 
r2pipe>=1.5,<2 diff --git a/scripts/bin2asm.py b/scripts/bin2asm.py deleted file mode 100644 index 2134e8c..0000000 --- a/scripts/bin2asm.py +++ /dev/null @@ -1,117 +0,0 @@ -#!/usr/bin/env python3 -import re -import os -import click -import r2pipe -import hashlib -from pathlib import Path - -def sha3(data): - return hashlib.sha3_256(data.encode()).hexdigest() - -def validEXE(filename): - magics = [bytes.fromhex('7f454c46')] - with open(filename, 'rb') as f: - header = f.read(4) - return header in magics - -def normalize(opcode): - opcode = opcode.replace(' - ', ' + ') - opcode = re.sub(r'0x[0-9a-f]+', 'CONST', opcode) - opcode = re.sub(r'\*[0-9]', '*CONST', opcode) - opcode = re.sub(r' [0-9]', ' CONST', opcode) - return opcode - -def fn2asm(pdf, minlen): - # check - if pdf is None: - return - if len(pdf['ops']) < minlen: - return - if 'invalid' in [op['type'] for op in pdf['ops']]: - return - - ops = pdf['ops'] - - # set label - labels, scope = {}, [op['offset'] for op in ops] - assert(None not in scope) - for i, op in enumerate(ops): - if op.get('jump') in scope: - labels.setdefault(op.get('jump'), i) - - # dump output - output = '' - for op in ops: - # add label - if labels.get(op.get('offset')) is not None: - output += f'LABEL{labels[op["offset"]]}:\n' - # add instruction - if labels.get(op.get('jump')) is not None: - output += f' {op["type"]} LABEL{labels[op["jump"]]}\n' - else: - output += f' {normalize(op["opcode"])}\n' - - return output - -def bin2asm(filename, opath, minlen): - # check - if not validEXE(filename): - return 0 - - r = r2pipe.open(str(filename)) - r.cmd('aaaa') - - count = 0 - - for fn in r.cmdj('aflj'): - r.cmd(f's {fn["offset"]}') - asm = fn2asm(r.cmdj('pdfj'), minlen) - if asm: - uid = sha3(asm) - asm = f''' .name {fn["name"]} - .offset {fn["offset"]:016x} - .file {filename.name} -''' + asm - with open(opath / uid, 'w') as f: - f.write(asm) - count += 1 - - print(f'[+] {filename}') - - return count - -@click.command() -@click.option('-i', 
'--input', 'ipath', help='input directory / file', required=True) -@click.option('-o', '--output', 'opath', default='asm', help='output directory') -@click.option('-l', '--len', 'minlen', default=10, help='ignore assembly code with instructions amount smaller than minlen') -def cli(ipath, opath, minlen): - ''' - Extract assembly functions from binary executable - ''' - ipath = Path(ipath) - opath = Path(opath) - - # create output directory - if not os.path.exists(opath): - os.mkdir(opath) - - fcount, bcount = 0, 0 - - # directory - if os.path.isdir(ipath): - for f in os.listdir(ipath): - if not os.path.islink(ipath / f) and not os.path.isdir(ipath / f): - fcount += bin2asm(ipath / f, opath, minlen) - bcount += 1 - # file - elif os.path.exists(ipath): - fcount += bin2asm(ipath, opath, minlen) - bcount += 1 - else: - print(f'[Error] No such file or directory: {ipath}') - - print(f'[+] Total scan binary: {bcount} => Total generated assembly functions: {fcount}') - -if __name__ == '__main__': - cli() diff --git a/scripts/compare.py b/scripts/compare.py deleted file mode 100644 index 3860b83..0000000 --- a/scripts/compare.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -import torch.nn as nn -import click -import asm2vec - -def cosine_similarity(v1, v2): - return (v1 @ v2 / (v1.norm() * v2.norm())).item() - -@click.command() -@click.option('-i1', '--input1', 'ipath1', help='target function 1', required=True) -@click.option('-i2', '--input2', 'ipath2', help='target function 2', required=True) -@click.option('-m', '--model', 'mpath', help='model path', required=True) -@click.option('-e', '--epochs', default=10, help='training epochs', show_default=True) -@click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True) -@click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True) -def cli(ipath1, ipath2, mpath, epochs, device, lr): - if device == 'auto': - device = 'cuda' if 
torch.cuda.is_available() else 'cpu' - - # load model, tokens - model, tokens = asm2vec.utils.load_model(mpath, device=device) - functions, tokens_new = asm2vec.utils.load_data([ipath1, ipath2]) - tokens.update(tokens_new) - model.update(2, tokens.size()) - model = model.to(device) - - # train function embedding - model = asm2vec.utils.train( - functions, - tokens, - model=model, - epochs=epochs, - device=device, - mode='test', - learning_rate=lr - ) - - # compare 2 function vectors - v1, v2 = model.to('cpu').embeddings_f(torch.tensor([0, 1])) - - print(f'cosine similarity : {cosine_similarity(v1, v2):.6f}') - -if __name__ == '__main__': - cli() diff --git a/scripts/test.py b/scripts/test.py deleted file mode 100644 index 31372aa..0000000 --- a/scripts/test.py +++ /dev/null @@ -1,44 +0,0 @@ -import torch -import torch.nn as nn -import click -import asm2vec - -@click.command() -@click.option('-i', '--input', 'ipath', help='target function', required=True) -@click.option('-m', '--model', 'mpath', help='model path', required=True) -@click.option('-e', '--epochs', default=10, help='training epochs', show_default=True) -@click.option('-n', '--neg-sample-num', 'neg_sample_num', default=25, help='negative sampling amount', show_default=True) -@click.option('-l', '--limit', help='limit the amount of output probability result', type=int) -@click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True) -@click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True) -@click.option('-p', '--pretty', default=False, help='pretty print table', show_default=True, is_flag=True) -def cli(ipath, mpath, epochs, neg_sample_num, limit, device, lr, pretty): - if device == 'auto': - device = 'cuda' if torch.cuda.is_available() else 'cpu' - - # load model, tokens - model, tokens = asm2vec.utils.load_model(mpath, device=device) - functions, tokens_new = asm2vec.utils.load_data(ipath) - 
tokens.update(tokens_new) - model.update(1, tokens.size()) - model = model.to(device) - - # train function embedding - model = asm2vec.utils.train( - functions, - tokens, - model=model, - epochs=epochs, - neg_sample_num=neg_sample_num, - device=device, - mode='test', - learning_rate=lr - ) - - # show predicted probability results - x, y = asm2vec.utils.preprocess(functions, tokens) - probs = model.predict(x.to(device), y.to(device)) - asm2vec.utils.show_probs(x, y, probs, tokens, limit=limit, pretty=pretty) - -if __name__ == '__main__': - cli() diff --git a/scripts/train.py b/scripts/train.py deleted file mode 100644 index 98391f4..0000000 --- a/scripts/train.py +++ /dev/null @@ -1,52 +0,0 @@ -import torch -import click -import asm2vec - -@click.command() -@click.option('-i', '--input', 'ipath', help='training data folder', required=True) -@click.option('-o', '--output', 'opath', default='model.pt', help='output model path', show_default=True) -@click.option('-m', '--model', 'mpath', help='load previous trained model path', type=str) -@click.option('-l', '--limit', help='limit the number of functions to be loaded', show_default=True, type=int) -@click.option('-d', '--ebedding-dimension', 'embedding_size', default=100, help='embedding dimension', show_default=True) -@click.option('-b', '--batch-size', 'batch_size', default=1024, help='batch size', show_default=True) -@click.option('-e', '--epochs', default=10, help='training epochs', show_default=True) -@click.option('-n', '--neg-sample-num', 'neg_sample_num', default=25, help='negative sampling amount', show_default=True) -@click.option('-a', '--calculate-accuracy', 'calc_acc', help='whether calculate accuracy ( will be significantly slower )', is_flag=True) -@click.option('-c', '--device', default='auto', help='hardware device to be used: cpu / cuda / auto', show_default=True) -@click.option('-lr', '--learning-rate', 'lr', default=0.02, help="learning rate", show_default=True) -def cli(ipath, opath, mpath, limit, 
embedding_size, batch_size, epochs, neg_sample_num, calc_acc, device, lr): - if device == 'auto': - device = 'cuda' if torch.cuda.is_available() else 'cpu' - - if mpath: - model, tokens = asm2vec.utils.load_model(mpath, device=device) - functions, tokens_new = asm2vec.utils.load_data(ipath, limit=limit) - tokens.update(tokens_new) - model.update(len(functions), tokens.size()) - else: - model = None - functions, tokens = asm2vec.utils.load_data(ipath, limit=limit) - - def callback(context): - progress = f'{context["epoch"]} | time = {context["time"]:.2f}, loss = {context["loss"]:.4f}' - if context["accuracy"]: - progress += f', accuracy = {context["accuracy"]:.4f}' - print(progress) - asm2vec.utils.save_model(opath, context["model"], context["tokens"]) - - model = asm2vec.utils.train( - functions, - tokens, - model=model, - embedding_size=embedding_size, - batch_size=batch_size, - epochs=epochs, - neg_sample_num=neg_sample_num, - calc_acc=calc_acc, - device=device, - callback=callback, - learning_rate=lr - ) - -if __name__ == '__main__': - cli() diff --git a/setup.py b/setup.py index 62ff843..19a3051 100644 --- a/setup.py +++ b/setup.py @@ -1,14 +1,69 @@ +import os +import sys +import platform from setuptools import setup, find_packages +from setuptools.command.install import install as _install + +from asm2vec.version import VERSION, radare2_version + + +class install(_install): + @staticmethod + def _setup_radare2() -> None: + if sys.platform.startswith("linux"): # Install required in Docker images + machine = platform.machine() + if machine in ["aarch64", "arm"]: + architecture = "arm64" + elif machine in ["x86_64"]: + architecture = "amd64" + elif machine in ["i386", "i686"]: + architecture = "i386" + else: + raise Exception(f"No architecture for Linux Machine: '{machine}'") + + commands = [ + "apt-get update", + "apt-get install -y --no-install-recommends wget", + f"wget -O /tmp/radare2_{radare2_version}_{architecture}.deb 
https://github.com/radareorg/radare2/releases/download/{radare2_version}/radare2_{radare2_version}_{architecture}.deb", + f"dpkg -i /tmp/radare2_{radare2_version}_{architecture}.deb", + "r2pm init", + "r2pm update", + f"rm /tmp/radare2_{radare2_version}_{architecture}.deb" + ] + for command in commands: + if os.system(command) != 0: + raise Exception(f"Install radare2 failed: '{command}'") + else: + print("Ensure 'radar2' is installed...") + + def run(self): + self._setup_radare2() + _install.run(self) + + +def readme(): + with open('README.md') as f: + return f.read() + + +def read_requirements(): + with open('requirements.txt') as f: + return [s for s in f.read().split('\n') if not ('--index-url' in s)] + setup( name='asm2vec', - version='1.0.0', - description='Unofficial implementation of asm2vec using pytorch', - install_requires=['torch>=1.7,<2' - 'click>=7.1,<8' - 'r2pipe>=1.5,<2'], - author='oalieno', - author_email='jeffrey6910@gmail.com', + version=VERSION, + description="Jamf's implementation of asm2vec using pytorch", + long_description=readme(), + author='oalieno/jamf', + author_email='jamie.nutter@jamf.com', license='MIT License', - packages = find_packages(), + install_requires=read_requirements(), + packages=find_packages(), + zip_safe=False, + include_package_data=True, + test_suite='nose.collector', + tests_require=['nose'], + cmdclass={'install': install} ) diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..472793c --- /dev/null +++ b/test/__init__.py @@ -0,0 +1 @@ +__all__ = ["test_binary_to_asm"] diff --git a/test/test_binary_to_asm.py b/test/test_binary_to_asm.py new file mode 100644 index 0000000..ce53411 --- /dev/null +++ b/test/test_binary_to_asm.py @@ -0,0 +1,229 @@ +from os import path, mkdir +from pathlib import Path +from shutil import rmtree +from unittest import TestCase + +from asm2vec import __data__ +from asm2vec.binary_to_asm import (bin_to_asm, convert_to_asm, _fn_to_asm, _normalize, _sha3, 
_valid_exe) + + +class TestBinaryToAsm(TestCase): + + @classmethod + def setUpClass(cls) -> None: + print("\n--- TestBinaryToAsm ---") + cls.output_path = "malware_asm/" + cls.data_path = path.join(__data__, "5cca32eb8f9c2a024a57ce12e3fb66070662de80") + cls.pdf_dict = { + 'name': 'main', + 'size': 18, + 'addr': 4294974144, + 'ops': [ + { + 'offset': 4294974144, + 'esil': 'rbp,8,rsp,-,=[8],8,rsp,-=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974161, + 'size': 1, + 'opcode': 'push rbp', + 'disasm': 'push rbp', + 'bytes': '55', + 'family': 'cpu', + 'type': 'rpush', + 'reloc': 'False', + 'type_num': 268435468, + 'type2_num': 0, + 'flags': ['main', 'entry0', 'section.0.__TEXT.__text', 'sym.func.100001ac0', 'rip'], + 'comment': 'WzAwXSAtci14IHNlY3Rpb24gc2l6ZSA3Mzc2IG5hbWVkIDAuX19URVhULl9fdGV4dA==' + }, + { + 'offset': 4294974145, + 'esil': 'rsp,rbp,=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974159, + 'size': 3, + 'opcode': 'mov rbp, rsp', + 'disasm': 'mov rbp, rsp', + 'bytes': '4889e5', + 'family': 'cpu', + 'type': 'mov', + 'reloc': 'False', + 'type_num': 9, + 'type2_num': 0 + }, + { + 'offset': 4294974148, + 'esil': 'rbx,8,rsp,-,=[8],8,rsp,-=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974161, + 'size': 1, + 'opcode': 'push rbx', + 'disasm': 'push rbx', + 'bytes': '53', + 'family': 'cpu', + 'type': 'rpush', + 'reloc': 'False', + 'type_num': 268435468, + 'type2_num': 0 + }, + { + 'offset': 4294974149, + 'esil': 'rax,8,rsp,-,=[8],8,rsp,-=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974161, + 'size': 1, + 'opcode': 'push rax', + 'disasm': 'push rax', + 'bytes': '50', + 'family': 'cpu', + 'type': 'rpush', + 'reloc': 'False', + 'type_num': 268435468, + 'type2_num': 0 + }, + { + 'offset': 4294974150, + 'esil': 'rsi,rbx,=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974159, + 'size': 3, + 'opcode': 'mov rbx, rsi', + 'disasm': 'mov rbx, rsi', + 'bytes': '4889f3', + 'family': 'cpu', + 'type': 
'mov', + 'reloc': 'False', + 'type_num': 9, + 'type2_num': 0 + }, + { + 'offset': 4294974153, + 'ptr': 4294985864, + 'esil': '0x2db8,rip,+,[8],rax,=', + 'refptr': 8, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974155, + 'size': 7, + 'opcode': 'mov rax, qword [rip + 0x2db8]', + 'disasm': 'mov rax, qword [0x100004888]', + 'bytes': '488b05b82d0000', + 'family': 'cpu', + 'type': 'mov', + 'reloc': 'False', + 'type_num': 9, + 'type2_num': 0, + 'refs': [ + { + 'addr': 4294985864, + 'type': 'DATA', + 'perm': 'r--' + } + ] + }, + { + 'offset': 4294974160, + 'esil': 'rax,rip,=', + 'refptr': 0, + 'fcn_addr': 4294974144, + 'fcn_last': 4294974160, + 'size': 2, + 'opcode': 'jmp rax', + 'disasm': 'jmp rax', + 'bytes': 'ffe0', + 'family': 'cpu', + 'type': 'rjmp', + 'reloc': 'False', + 'type_num': 268435458, + 'type2_num': 0 + } + ] + } + mkdir(cls.output_path) + + + @classmethod + def tearDownClass(cls) -> None: + rmtree(cls.output_path) + + def test_sha3(self): + """Should return 64-character long string""" + asm = ("push rbp\n" + "mov rbp, rsp\n" + "push rbx\n" + "push rax\n" + "mov rbx, rsi\n" + "mov rax, qword [rip + CONST]\n" + "jmp rax") + self.assertRegex(_sha3(asm), '^[a-f0-9]{64}') + + def test_valid_exe_when_valid_magic_bytes(self): + """Should return boolean""" + magic_bytes = ["cffaedfe"] + self.assertEqual(_valid_exe(self.data_path, magic_bytes), True) + + def test_valid_exe_when_not_valid_magic_bytes(self): + """Should return boolean""" + magic_bytes = ["cafebabe"] + self.assertEqual(_valid_exe(self.data_path, magic_bytes), False) + + def test_normalize_when_offset(self): + """Should return normalized opcode""" + opcode = "mov rax, qword [rip + 0x2db8]" + expected_norm_opcode = "mov rax, qword [rip + CONST]" + self.assertEqual(_normalize(opcode), expected_norm_opcode) + + def test_normalize_when_no_offset(self): + """Should return normalized opcode""" + opcode = "mov rbx, rsi" + expected_norm_opcode = "mov rbx, rsi" + self.assertEqual(_normalize(opcode), 
expected_norm_opcode) + + def test_fn_to_asm_returns_empty_string_when_pdf_none(self): + """Should return assembly functions with normalized opcode""" + pdf = None + asm_min = 5 + expected_asm = "" + self.assertEqual(_fn_to_asm(pdf, asm_min), expected_asm) + + def test_fn_to_asm_returns_empty_string_when_pdfops_shorter_than_minlen(self): + """Should return assembly functions with normalized opcode""" + asm_minlen = 10 + expected_asm = "" + self.assertEqual(_fn_to_asm(self.pdf_dict, asm_minlen), expected_asm) + + def test_fn_to_asm_returns_expected_asm(self): + """Should return assembly functions with normalized opcode""" + asm_min = 5 + expected_asm = (" push rbp\n" + " mov rbp, rsp\n" + " push rbx\n" + " push rax\n" + " mov rbx, rsi\n" + " mov rax, qword [rip + CONST]\n" + " jmp rax\n") + self.assertEqual(_fn_to_asm(self.pdf_dict, asm_min), expected_asm) + + def test_bin_to_asm_returns_expected_number_of_disassembled_files(self): + asm_minlen = 5 + magic_bytes = ["cffaedfe"] + self.assertEqual(bin_to_asm(Path(self.data_path), Path(self.output_path), asm_minlen, magic_bytes), 1) + + def test_bin_to_asm_returns_expected_number_of_disassembled_files_when_pdfops_shorter_than_minlen(self): + asm_minlen = 10 + magic_bytes = ['cffaedfe'] + self.assertEqual(bin_to_asm(Path(self.data_path), Path(self.output_path), asm_minlen, magic_bytes), 0) + + def test_convert_to_asm_returns_expected_sha1(self): + input_path = __data__ + asm_minlen_upper = 10 + asm_minlen_lower = 5 + expected_sha1 = ["5cca32eb8f9c2a024a57ce12e3fb66070662de80"] + self.assertEqual( + convert_to_asm(input_path, self.output_path, asm_minlen_upper, asm_minlen_lower), + expected_sha1 + )