added missing files and improved command line interface
g committed Jul 4, 2016
1 parent 7cce795 commit 16e64a3
Showing 4 changed files with 144 additions and 70 deletions.
67 changes: 67 additions & 0 deletions README
@@ -0,0 +1,67 @@
# Neural Fuzzer

Neural-Fuzzer is an experimental fuzzer that uses state-of-the-art machine learning to learn how a set of initial files is structured.
It works in two phases: **training** and **generation**.

* In training mode: it uses a [long short-term memory (LSTM)](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) network to learn how sequences of bytes are structured.

* In generation mode: it automatically generates corrupted or unexpected files and tries to crash a given program.

Neural-Fuzzer is open source (GPL3), powered by [keras](http://keras.io), and similar to [char-rnn](https://github.com/karpathy/char-rnn) and other sequence-prediction techniques.

## Requirements

* Python 2.7
* libhdf5-dev for saving and loading trained generators.
* [keras](http://keras.io) (automatically installed)
* [h5py](http://www.h5py.org/) (automatically installed)
* If you want to execute programs and triage crashes, you need to install gdb.
* If you are going to train your own generator, you need a powerful GPU.

## Installation

First, install the required libraries. For instance, on Debian/Ubuntu:

# apt-get install python-numpy libhdf5-dev gdb

After that, we can continue with neural-fuzzer:

$ git clone https://github.com/CIFASIS/neural-fuzzer/
$ cd neural-fuzzer
$ python setup.py install --user

## Example

### Generation of XML

First, download the pre-trained generators:

$ wget "https://github.com/CIFASIS/neural-fuzzer/releases/download/0.0/0-gen-xml.lstm"
$ wget "https://github.com/CIFASIS/neural-fuzzer/releases/download/0.0/0-gen-xml.lstm.map"

(more generators are available [here](https://github.com/CIFASIS/neural-fuzzer/releases))

Then we need a seed to start the generation. For instance, to use '>':

$ mkdir seeds
$ printf ">" > seeds/input.xml

Finally, we can start producing random XML files using the generators:

$ ./neural-fuzzer.py --max-gen-size 64 0-gen-xml.lstm seeds/
Using Theano backend.
Using ./gen-449983086021 to store the generated files
Generating a batch of 8 file(s) of size 35 (temp: 0.5 )...................................

The resulting files will be stored in a randomly named directory (e.g. gen-449983086021). It is faster to generate files in a batch than one by one; you can experiment with different batch sizes (see the example after the sample output below). In this case, one of the files we obtained is this one:

```xml
></p>
<p><termdef id='dt-encoding'>
```
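
The batch size itself can be tuned from the command line. As a sketch, using the `--batch-size` option this commit adds (default: 8), something like this should produce 32 files per pass:

    $ ./neural-fuzzer.py --batch-size 32 --max-gen-size 64 0-gen-xml.lstm seeds/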

An interesting parameter is the maximum size of the generated files. Another is the temperature, which takes a number in the range (0, 1] (default: 0.5). As [karpathy explains](https://github.com/karpathy/char-rnn/blob/master/Readme.md), the predicted log probabilities are divided by the temperature before the softmax, so lower temperatures make the model produce more likely but also more boring and conservative output, while higher temperatures make it take more chances, increasing the diversity of the results at the cost of more mistakes.
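
For intuition, a temperature-scaled sampler looks roughly like the sketch below. This mirrors the common char-rnn recipe; the project's own `sample()` helper in `neural-fuzzer.py` is the authoritative version and may differ in detail. Temperatures can also be passed on the command line through the `--temp` option added in this commit:

```python
import numpy as np

def sample_with_temperature(preds, temperature=0.5):
    """Draw the next byte index from the network's softmax output."""
    preds = np.asarray(preds).astype("float64")
    # Dividing the log-probabilities by the temperature sharpens the
    # distribution for small temperatures and leaves it unchanged at 1.
    preds = np.log(preds + 1e-12) / temperature
    exp_preds = np.exp(preds)
    probs = exp_preds / np.sum(exp_preds)
    # Sample one index from the rescaled distribution.
    return int(np.argmax(np.random.multinomial(1, probs, 1)))
```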

### Training

TODO
122 changes: 55 additions & 67 deletions neural-fuzzer.py
@@ -3,10 +3,7 @@
''' TODO '''

from __future__ import print_function
-from keras.models import Sequential
-from keras.layers.core import Dense, Activation, Dropout
-from keras.layers.recurrent import LSTM
-from keras.utils.data_utils import get_file

import argparse
import numpy as np
import random
@@ -15,6 +12,10 @@
import shutil
import sys

+from keras.models import Sequential
+from keras.layers.core import Dense, Activation, Dropout
+from keras.layers.recurrent import LSTM
+from keras.utils.data_utils import get_file
from triage import triage, aflcount, test

def sample(a, temperature=1.0, inverse=0.01):
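# sample(): scale the predicted log-probabilities by 1/temperature,
# renormalize, and draw the next byte index (see the README's note on
# temperature; the body is elided in this diff view).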
@@ -58,16 +59,18 @@ def read_seeds(seeds, nsamples):

return seeds_text

-def recall(model, char_indices, indices_char, data, testdirs, filename, maxlen, maxgenlen, batch_size = 128):
+def recall(model, char_indices, indices_char, data, testdirs, filename, maxlen, maxgenlen, batch_size = 8):
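# recall() generates batch_size files in parallel: every file starts from
# the same seed bytes, and each prediction step advances the whole batch
# at once (much faster than generating one file at a time).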


f = []
generated = []
sentence = []

for b in range(batch_size):
-f.append(open(filename+"-"+str(b), "w+"))
+fop = open(filename+"-"+str(b), "w")
+f.append(fop)
f[b].write(data)
+f[b].flush()

if len(data) < maxlen:
x = "".join(map(chr, list(np.random.random_integers(0,255,maxlen-len(data))) )) + data
Expand All @@ -76,41 +79,33 @@ def recall(model, char_indices, indices_char, data, testdirs, filename, maxlen,
sentence.append(x)

gensize = random.randint(maxgenlen / 2, maxgenlen)
print("Generating a batch of",batch_size,"file(s) of size", gensize, "(temp:",diversity,")",end="")

model.reset_states()

print("Generating..")
#print("Generating ",)

for i in range(gensize):

sys.stdout.write('.')
sys.stdout.flush()

x = np.zeros((batch_size, maxlen, len(char_indices)))
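# x is the batched one-hot input: batch_size windows of maxlen characters over the alphabet.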

for b in range(batch_size):
for t, char in enumerate(sentence[b]):
x[b, t, char_indices[char]] = 1.

#print("Predicting..")

preds = model.predict(x, verbose=0)#[0]
#print("End of prediction.")

for b in range(batch_size):
next_index = sample(preds[b], diversity)
next_char = indices_char[next_index]

generated[b] += next_char
sentence[b] = sentence[b][1:] + next_char
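# Each step appends the sampled byte to the output and slides the input window forward one character, so the next prediction is conditioned on it.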

#f.write(next_char)
#f.flush()

#for x in generated.split(data):
# print("->",repr(data+x))

#generated = data + generated.split(data)[0]
#print(repr(generated))
print("Writting..")
print("")

for b in range(batch_size):

#print(b,repr(generated[b]))
f[b].write(generated[b])
f[b].close()

@@ -120,7 +115,7 @@ def define_model(input_dim, output_dim):
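# Three stacked 64-unit LSTM layers with dropout in between, feeding a dense output layer sized to the character alphabet.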
model.add(LSTM(64, return_sequences=True, input_shape=input_dim))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=True, input_shape=input_dim))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(output_dim))
@@ -135,27 +130,31 @@ def define_model(input_dim, output_dim):
parser.add_argument("model", help="", type=str, default=None)
parser.add_argument("seeds", help="", type=str, default=None)
parser.add_argument("--cmd", help="", nargs='+', type=str, default=[])
parser.add_argument("--temp", help="", nargs='+', type=float, default=[0.5])

#parser.add_argument("-d", help="", type=int, default=5)
#parser.add_argument("-p", help="", action="store_true", default=False)

#parser.add_argument("--model", type=str,
# help="",
# action="store", default=None)
parser.add_argument("--valid-seeds", help="", type=str, default=None)


parser.add_argument("--gen",
help="Test a model using infile (recall only)",
parser.add_argument("--train",
help="",
action="store_true", default=False)

parser.add_argument("--batch-size", type=int,
help="",
action="store", default=8)


parser.add_argument("--max-gen-size", type=int,
help="",
action="store", default=100)
action="store", default=300)


parser.add_argument("--n-gen-samples", type=int,
help="",
action="store", default=10)
action="store", default=1)


parser.add_argument("--n-train-samples", type=int,
@@ -175,70 +174,66 @@ def define_model(input_dim, output_dim):
valid_seeds = options.valid_seeds

cmd = options.cmd
test_dir = "./test-"+str(random.random()).replace("0.","")
test_dir = "./gen-"+str(random.random()).replace("0.","")
max_paths = [-1]*len(cmd)
print("Using",test_dir)
#assert(0)
print("Using",test_dir,"to store the generated files")

-gen_mode = options.gen
+gen_mode = not options.train
n_train_samples = options.n_train_samples
n_gen_samples = options.n_gen_samples
batch_size = options.batch_size

maxgenlen = options.max_gen_size
fixed_start_index = options.start_index
temps = options.temp

#depth = options.d
#prune = options.p

#assert(0)

text = read_seeds(seeds, n_train_samples)
if valid_seeds is not None:
valid_text = read_seeds(valid_seeds, sys.maxsize)
else:
valid_text = text

maxlen = 20
-max_rand = len(text) - maxlen - 1
+max_rand = max(0,len(text) - maxlen - 1)
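# Clamped at 0 so a seed corpus shorter than maxlen+1 still yields a valid start index.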


if gen_mode:
(char_indices, indices_char) = pickle.load(open(file_model+".map","r"))
model = define_model((maxlen, len(char_indices)), len(char_indices))
model.load_weights(file_model)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

-for iteration in range(0,n_gen_samples):
if os.path.exists(test_dir):
shutil.rmtree(test_dir)

if not os.path.exists(test_dir):
os.makedirs(test_dir)

-for diversity in [x / 10.0 for x in [5]]:
-sys.stdout.write('.')
-sys.stdout.flush()
+for iteration in range(0,n_gen_samples):
+for diversity in temps:
if fixed_start_index is not None:
start_index = fixed_start_index
else:
start_index = random.randint(0, max_rand)

filename = test_dir+"/gen-"+str(iteration)+"-"+str(diversity)
recall(model, char_indices, indices_char, text[start_index: start_index + maxlen], test_dir, filename, maxlen, maxgenlen)
filename = test_dir+"/file-"+str(iteration)+"-"+str(diversity)
#print(filename)
recall(model, char_indices, indices_char, text[start_index: start_index + maxlen], test_dir, filename, maxlen, maxgenlen, batch_size)

print("Executing..")
for c in cmd:
r = test("env -i ASAN_OPTIONS='abort_on_error=1' "+c+" "+test_dir+"/* > /dev/null 2> /dev/null", None)
print(r)
if (not (r in [0,1])):
print(c," failed?")
sys.exit(0)
#x = (triage(c, test_dir))
#if len(x.keys()) > 1 or (not ('' in x.keys())):
# print(x)
print("Executing",c)
#r = test("env -i ASAN_OPTIONS='abort_on_error=1' "+c+" "+test_dir+"/* > /dev/null 2> /dev/null", None)
#print(r)
#if (not (r in [0,1])):
# print(c," failed?")
# sys.exit(0)
x = triage(c, test_dir)
if len(x.keys()) > 1 or (not ('' in x.keys())):
print(x)
sys.exit(0)
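# triage() buckets the generated files by crash backtrace; any bucket other
# than the empty (no-crash) one is printed before exiting.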

sys.exit(0)


print('corpus length:', len(text))

Expand All @@ -253,16 +248,10 @@ def define_model(input_dim, output_dim):
sentences = []
next_chars = []
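# Cut the training text into overlapping maxlen-character windows, each labeled with the character that follows it.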
for i in range(0, len(text) - maxlen, step):
#if random.random() <= 2.0:
sentences.append(text[i: i + maxlen])
next_chars.append(text[i + maxlen])

#for s in sample(range(len(

-print('nb sequences:', len(sentences))

-#assert(0)

+print('Number of sequences to gen/train:', len(sentences))
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
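# X: one-hot encoded windows (sequences x maxlen x alphabet); y: the one-hot next character for each window.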
@@ -300,7 +289,6 @@ def define_model(input_dim, output_dim):
else:
start_index = random.randint(0, max_rand)

#print()
filename = "test/gen-"+str(rep)+"-"+str(iteration)+"-"+str(diversity)
recall(model, char_indices, indices_char, valid_text[start_index: start_index + maxlen], "test", filename, maxlen, maxgenlen)

19 changes: 19 additions & 0 deletions setup.py
@@ -0,0 +1,19 @@
#!/usr/bin/python2
from setuptools import setup

setup(
name='Neural-Fuzzer',
version='0.1',
license='GPL3',
description='',
long_description="",
url='http://cifasis.github.io/neural-fuzzer/',
author='G.Grieco',
author_email='[email protected]',
scripts=['neural-fuzzer.py'],
install_requires=[
"keras",
"h5py"
],
)

6 changes: 3 additions & 3 deletions triage.py
@@ -92,10 +92,10 @@ def test(cmd, seeds):
" ") + [testcase] + prepared_cmd[1].split(" ")
prepared_cmd = remove_nils(prepared_cmd)
os.system(" ".join(prepared_cmd))

def triage(cmd, seeds, depth=5, prune=False):
gdb_cmd = "env -i ASAN_OPTIONS='abort_on_error=1' gdb -batch -ex 'tty /dev/null' -ex run -ex bt 20 --args @@ 2> /dev/null"
#gdb_cmd = "env -i ASAN_OPTIONS='abort_on_error=1' gdb -batch -ex run -ex 'bt 20' --args @@" #2> /dev/null"
#gdb_cmd = "env -i ASAN_OPTIONS='abort_on_error=1' gdb -batch -ex 'tty /dev/null' -ex run -ex bt 20 --args @@ 2> /dev/null"
gdb_cmd = "env -i ASAN_OPTIONS='abort_on_error=1' gdb -batch -ex run -ex 'bt 20' --args @@ 2> /dev/null"

all_files = []
dedup_files = dict()
