added missing files and improved command line interface
g committed Jul 4, 2016
1 parent 7cce795 commit 16e64a3
Showing 4 changed files with 144 additions and 70 deletions.
67 changes: 67 additions & 0 deletions README
@@ -0,0 +1,67 @@
# Neural Fuzzer

Neural-Fuzzer is an experimental fuzzer that uses state-of-the-art machine learning to learn how a set of initial files is structured.
It works in two phases: **training** and **generation**.

* In training mode: it uses a [long short-term memory (LSTM)](https://colah.github.io/posts/2015-08-Understanding-LSTMs/) network to learn how sequences of bytes are structured.

* In generation mode: it automatically generates corrupted or unexpected files and tries to crash a given program.

Neural-Fuzzer is open source (GPL3), powered by [keras](http://keras.io), and similar to [char-rnn](https://github.com/karpathy/char-rnn) and other sequence-prediction techniques.

## Requirements

* Python 2.7
* libhdf5-dev for saving and loading trained generators.
* [keras](http://keras.io) (automatically installed)
* [h5py](http://www.h5py.org/) (automatically installed)
* If you want to execute programs and triage crashes, you need to install gdb.
* If you are going to train your own generator, you need a powerful GPU.

## Installation

First, install the required libraries. For instance, on Debian/Ubuntu:

# apt-get install python-numpy libhdf5-dev gdb

After that, we can continue with neural-fuzzer:

$ git clone https://github.com/CIFASIS/neural-fuzzer/
$ cd neural-fuzzer
$ python setup.py install --user

## Example

### Generation of XML

First, download the pre-trained generators:

$ wget "https://github.com/CIFASIS/neural-fuzzer/releases/download/0.0/0-gen-xml.lstm"
$ wget "https://github.com/CIFASIS/neural-fuzzer/releases/download/0.0/0-gen-xml.lstm.map"

(more generators are available [here](https://github.com/CIFASIS/neural-fuzzer/releases))

Then we need a seed to start the generation. For instance, to use '>':

$ mkdir seeds
$ printf ">" > seeds/input.xml

Finally, we can start producing random XML files using the generators:

$ ./neural-fuzzer.py --max-gen-size 64 0-gen-xml.lstm seeds/
Using Theano backend.
Using ./gen-449983086021 to store the generated files
Generating a batch of 8 file(s) of size 35 (temp: 0.5 )...................................

The resulting files will be stored in a randomly named directory (e.g. gen-449983086021). It is faster to generate files in a batch than one by one; you can experiment with different batch sizes (see the example after the sample output below). In this case, one of the files we obtained is this one:

```xml
></p>
<p><termdef id='dt-encoding'>
```
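
The batch size itself can be tuned from the command line. As a sketch, using the `--batch-size` option this commit adds (default: 8), something like this should produce 32 files per pass:

    $ ./neural-fuzzer.py --batch-size 32 --max-gen-size 64 0-gen-xml.lstm seeds/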

An interesting parameter is the maximum size of the generated files. Another is the temperature, which takes a number in the range (0, 1] (default: 0.5). As [karpathy explains](https://github.com/karpathy/char-rnn/blob/master/Readme.md), the predicted log probabilities are divided by the temperature before the softmax, so lower temperatures make the model produce more likely but also more boring and conservative output, while higher temperatures make it take more chances, increasing the diversity of the results at the cost of more mistakes.
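
For intuition, a temperature-scaled sampler looks roughly like the sketch below. This mirrors the common char-rnn recipe; the project's own `sample()` helper in `neural-fuzzer.py` is the authoritative version and may differ in detail. Temperatures can also be passed on the command line through the `--temp` option added in this commit:

```python
import numpy as np

def sample_with_temperature(preds, temperature=0.5):
    """Draw the next byte index from the network's softmax output."""
    preds = np.asarray(preds).astype("float64")
    # Dividing the log-probabilities by the temperature sharpens the
    # distribution for small temperatures and leaves it unchanged at 1.
    preds = np.log(preds + 1e-12) / temperature
    exp_preds = np.exp(preds)
    probs = exp_preds / np.sum(exp_preds)
    # Sample one index from the rescaled distribution.
    return int(np.argmax(np.random.multinomial(1, probs, 1)))
```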

### Training

TODO
122 changes: 55 additions & 67 deletions neural-fuzzer.py
@@ -3,10 +3,7 @@
''' TODO '''

from __future__ import print_function
-from keras.models import Sequential
-from keras.layers.core import Dense, Activation, Dropout
-from keras.layers.recurrent import LSTM
-from keras.utils.data_utils import get_file

import argparse
import numpy as np
import random
@@ -15,6 +12,10 @@
import shutil
import sys

+from keras.models import Sequential
+from keras.layers.core import Dense, Activation, Dropout
+from keras.layers.recurrent import LSTM
+from keras.utils.data_utils import get_file
from triage import triage, aflcount, test

def sample(a, temperature=1.0, inverse=0.01):
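# sample(): scale the predicted log-probabilities by 1/temperature,
# renormalize, and draw the next byte index (see the README's note on
# temperature; the body is elided in this diff view).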
@@ -58,16 +59,18 @@ def read_seeds(seeds, nsamples):

return seeds_text

-def recall(model, char_indices, indices_char, data, testdirs, filename, maxlen, maxgenlen, batch_size = 128):
+def recall(model, char_indices, indices_char, data, testdirs, filename, maxlen, maxgenlen, batch_size = 8):
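# recall() generates batch_size files in parallel: every file starts from
# the same seed bytes, and each prediction step advances the whole batch
# at once (much faster than generating one file at a time).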


f = []
generated = []
sentence = []

for b in range(batch_size):
-f.append(open(filename+"-"+str(b), "w+"))
+fop = open(filename+"-"+str(b), "w")
+f.append(fop)
f[b].write(data)
+f[b].flush()

if len(data) < maxlen:
x = "".join(map(chr, list(np.random.random_integers(0,255,maxlen-len(data))) )) + data
Expand All @@ -76,41 +79,33 @@ def recall(model, char_indices, indices_char, data, testdirs, filename, maxlen,
sentence.append(x)

gensize = random.randint(maxgenlen / 2, maxgenlen)
print("Generating a batch of",batch_size,"file(s) of size", gensize, "(temp:",diversity,")",end="")

model.reset_states()

print("Generating..")
#print("Generating ",)

for i in range(gensize):

sys.stdout.write('.')
sys.stdout.flush()

x = np.zeros((batch_size, maxlen, len(char_indices)))
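# x is the batched one-hot input: batch_size windows of maxlen characters over the alphabet.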

for b in range(batch_size):
for t, char in enumerate(sentence[b]):
x[b, t, char_indices[char]] = 1.

#print("Predicting..")

preds = model.predict(x, verbose=0)#[0]
#print("End of prediction.")

for b in range(batch_size):
next_index = sample(preds[b], diversity)
next_char = indices_char[next_index]

generated[b] += next_char
sentence[b] = sentence[b][1:] + next_char
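# Each step appends the sampled byte to the output and slides the input window forward one character, so the next prediction is conditioned on it.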

#f.write(next_char)
#f.flush()

#for x in generated.split(data):
# print("->",repr(data+x))

#generated = data + generated.split(data)[0]
#print(repr(generated))
print("Writting..")
print("")

for b in range(batch_size):

#print(b,repr(generated[b]))
f[b].write(generated[b])
f[b].close()

@@ -120,7 +115,7 @@ def define_model(input_dim, output_dim):
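# Three stacked 64-unit LSTM layers with dropout in between, feeding a dense output layer sized to the character alphabet.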
model.add(LSTM(64, return_sequences=True, input_shape=input_dim))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=True, input_shape=input_dim))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(output_dim))
@@ -135,27 +130,31 @@ def define_model(input_dim, output_dim):
parser.add_argument("model", help="", type=str, default=None)
parser.add_argument("seeds", help="", type=str, default=None)
parser.add_argument("--cmd", help="", nargs='+', type=str, default=[])
parser.add_argument("--temp", help="", nargs='+', type=float, default=[0.5])

#parser.add_argument("-d", help="", type=int, default=5)
#parser.add_argument("-p", help="", action="store_true", default=False)

#parser.add_argument("--model", type=str,
# help="",
# action="store", default=None)
parser.add_argument("--valid-seeds", help="", type=str, default=None)


parser.add_argument("--gen",
help="Test a model using infile (recall only)",
parser.add_argument("--train",
help="",
action="store_true", default=False)

parser.add_argument("--batch-size", type=int,
help="",
action="store", default=8)


parser.add_argument("--max-gen-size", type=int,
help="",
action="store", default=100)
action="store", default=300)


parser.add_argument("--n-gen-samples", type=int,
help="",
action="store", default=10)
action="store", default=1)


parser.add_argument("--n-train-samples", type=int,
@@ -175,70 +174,66 @@ def define_model(input_dim, output_dim):
valid_seeds = options.valid_seeds

cmd = options.cmd
test_dir = "./test-"+str(random.random()).replace("0.","")
test_dir = "./gen-"+str(random.random()).replace("0.","")
max_paths = [-1]*len(cmd)
print("Using",test_dir)
#assert(0)
print("Using",test_dir,"to store the generated files")

-gen_mode = options.gen
+gen_mode = not options.train
n_train_samples = options.n_train_samples
n_gen_samples = options.n_gen_samples
batch_size = options.batch_size

maxgenlen = options.max_gen_size
fixed_start_index = options.start_index
temps = options.temp

#depth = options.d
#prune = options.p

#assert(0)

text = read_seeds(seeds, n_train_samples)
if valid_seeds is not None:
valid_text = read_seeds(valid_seeds, sys.maxsize)
else:
valid_text = text

maxlen = 20
-max_rand = len(text) - maxlen - 1
+max_rand = max(0,len(text) - maxlen - 1)
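# Clamped at 0 so a seed corpus shorter than maxlen+1 still yields a valid start index.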


if gen_mode:
(char_indices, indices_char) = pickle.load(open(file_model+".map","r"))
model = define_model((maxlen, len(char_indices)), len(char_indices))
model.load_weights(file_model)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

-for iteration in range(0,n_gen_samples):
if os.path.exists(test_dir):
shutil.rmtree(test_dir)

if not os.path.exists(test_dir):
os.makedirs(test_dir)

-for diversity in [x / 10.0 for x in [5]]:
-sys.stdout.write('.')
-sys.stdout.flush()
+for iteration in range(0,n_gen_samples):
+for diversity in temps:
if fixed_start_index is not None:
start_index = fixed_start_index
else:
start_index = random.randint(0, max_rand)

filename = test_dir+"/gen-"+str(iteration)+"-"+str(diversity)
recall(model, char_indices, indices_char, text[start_index: start_index + maxlen], test_dir, filename, maxlen, maxgenlen)
filename = test_dir+"/file-"+str(iteration)+"-"+str(diversity)
#print(filename)
recall(model, char_indices, indices_char, text[start_index: start_index + maxlen], test_dir, filename, maxlen, maxgenlen, batch_size)

print("Executing..")
for c in cmd:
r = test("env -i ASAN_OPTIONS='abort_on_error=1' "+c+" "+test_dir+"/* > /dev/null 2> /dev/null", None)
print(r)
if (not (r in [0,1])):
print(c," failed?")
sys.exit(0)
#x = (triage(c, test_dir))
#if len(x.keys()) > 1 or (not ('' in x.keys())):
# print(x)
print("Executing",c)
#r = test("env -i ASAN_OPTIONS='abort_on_error=1' "+c+" "+test_dir+"/* > /dev/null 2> /dev/null", None)
#print(r)
#if (not (r in [0,1])):
# print(c," failed?")
# sys.exit(0)
x = triage(c, test_dir)
if len(x.keys()) > 1 or (not ('' in x.keys())):
print(x)
sys.exit(0)
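# triage() buckets the generated files by crash backtrace; any bucket other
# than the empty (no-crash) one is printed before exiting.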

sys.exit(0)


print('corpus length:', len(text))

Expand All @@ -253,16 +248,10 @@ def define_model(input_dim, output_dim):
sentences = []
next_chars = []
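# Cut the training text into overlapping maxlen-character windows, each labeled with the character that follows it.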
for i in range(0, len(text) - maxlen, step):
#if random.random() <= 2.0:
sentences.append(text[i: i + maxlen])
next_chars.append(text[i + maxlen])

#for s in sample(range(len(

-print('nb sequences:', len(sentences))

-#assert(0)

+print('Number of sequences to gen/train:', len(sentences))
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
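# X: one-hot encoded windows (sequences x maxlen x alphabet); y: the one-hot next character for each window.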
@@ -300,7 +289,6 @@ def define_model(input_dim, output_dim):
else:
start_index = random.randint(0, max_rand)

#print()
filename = "test/gen-"+str(rep)+"-"+str(iteration)+"-"+str(diversity)
recall(model, char_indices, indices_char, valid_text[start_index: start_index + maxlen], "test", filename, maxlen, maxgenlen)

19 changes: 19 additions & 0 deletions setup.py
@@ -0,0 +1,19 @@
#!/usr/bin/python2
from setuptools import setup

setup(
name='Neural-Fuzzer',
version='0.1',
license='GPL3',
description='',
long_description="",
url='http://cifasis.github.io/neural-fuzzer/',
author='G.Grieco',
author_email='[email protected]',
scripts=['neural-fuzzer.py'],
install_requires=[
"keras",
"h5py"
],
)

6 changes: 3 additions & 3 deletions triage.py
@@ -92,10 +92,10 @@ def test(cmd, seeds):
" ") + [testcase] + prepared_cmd[1].split(" ")
prepared_cmd = remove_nils(prepared_cmd)
os.system(" ".join(prepared_cmd))

def triage(cmd, seeds, depth=5, prune=False):
gdb_cmd = "env -i ASAN_OPTIONS='abort_on_error=1' gdb -batch -ex 'tty /dev/null' -ex run -ex bt 20 --args @@ 2> /dev/null"
#gdb_cmd = "env -i ASAN_OPTIONS='abort_on_error=1' gdb -batch -ex run -ex 'bt 20' --args @@" #2> /dev/null"
#gdb_cmd = "env -i ASAN_OPTIONS='abort_on_error=1' gdb -batch -ex 'tty /dev/null' -ex run -ex bt 20 --args @@ 2> /dev/null"
gdb_cmd = "env -i ASAN_OPTIONS='abort_on_error=1' gdb -batch -ex run -ex 'bt 20' --args @@ 2> /dev/null"

all_files = []
dedup_files = dict()
