Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

De-duplicate OnDiskCorpus #2802

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ uuid = { version = "1.10.0", features = ["serde", "v4"] }
which = "6.0.3"
windows = "0.58.0"
z3 = "0.12.1"
fs2 = "0.4.3"
domenukk marked this conversation as resolved.
Show resolved Hide resolved


[workspace.lints.rust]
Expand Down
2 changes: 2 additions & 0 deletions libafl/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ std = [
"libafl_bolts/std",
"typed-builder",
"fastbloom",
"fs2"
]

## Tracks the Feedbacks and the Objectives that were interesting for a Testcase
Expand Down Expand Up @@ -293,6 +294,7 @@ clap = { workspace = true, optional = true }
num_enum = { workspace = true, optional = true }
libipt = { workspace = true, optional = true }
fastbloom = { version = "0.8.0", optional = true }
fs2 = { workspace = true, optional = true }

[lints]
workspace = true
Expand Down
55 changes: 26 additions & 29 deletions libafl/src/corpus/inmemory_ondisk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@ use std::{
fs,
fs::{File, OpenOptions},
io,
io::Write,
io::{Read, Write},
path::{Path, PathBuf},
};

use fs2::FileExt;
#[cfg(feature = "gzip")]
use libafl_bolts::compress::GzipCompressor;
use serde::{Deserialize, Serialize};
Expand Down Expand Up @@ -87,7 +88,7 @@ where
fn add(&mut self, testcase: Testcase<I>) -> Result<CorpusId, Error> {
let id = self.inner.add(testcase)?;
let testcase = &mut self.get(id).unwrap().borrow_mut();
self.save_testcase(testcase, id)?;
self.save_testcase(testcase)?;
*testcase.input_mut() = None;
Ok(id)
}
Expand All @@ -97,7 +98,7 @@ where
fn add_disabled(&mut self, testcase: Testcase<I>) -> Result<CorpusId, Error> {
let id = self.inner.add_disabled(testcase)?;
let testcase = &mut self.get_from_all(id).unwrap().borrow_mut();
self.save_testcase(testcase, id)?;
self.save_testcase(testcase)?;
*testcase.input_mut() = None;
Ok(id)
}
Expand All @@ -108,7 +109,7 @@ where
let entry = self.inner.replace(id, testcase)?;
self.remove_testcase(&entry)?;
let testcase = &mut self.get(id).unwrap().borrow_mut();
self.save_testcase(testcase, id)?;
self.save_testcase(testcase)?;
*testcase.input_mut() = None;
Ok(entry)
}
Expand Down Expand Up @@ -375,42 +376,43 @@ impl<I> InMemoryOnDiskCorpus<I> {
}
}

fn save_testcase(&self, testcase: &mut Testcase<I>, id: CorpusId) -> Result<(), Error>
fn save_testcase(&self, testcase: &mut Testcase<I>) -> Result<(), Error>
where
I: Input,
{
let file_name_orig = testcase.filename_mut().take().unwrap_or_else(|| {
let file_name = testcase.filename_mut().take().unwrap_or_else(|| {
// TODO walk entry metadata to ask for pieces of filename (e.g. :havoc in AFL)
testcase.input().as_ref().unwrap().generate_name(Some(id))
testcase.input().as_ref().unwrap().generate_name()
});

// New testcase, we need to save it.
let mut file_name = file_name_orig.clone();
let lockfile_name = format!(".{file_name}.lock");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally we won't need a lockfile anymore for this dude

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah wait, you're using the lockfile as a counter now? That would allow us to remove the file when no clients use it anymore, right? Maybe an interesting approach..

Copy link
Contributor Author

@BAGUVIX456 BAGUVIX456 Jan 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is the implementation that you originally proposed in the issue, or have I misunderstood it?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah right, looks like I have my bright moments...
In that case great! :)
Also, this feature should maybe be optional - it's not needed if testcases rarely get removed or their content changes seldomly.

let lockfile_path = self.dir_path.join(lockfile_name);

let mut ctr = 2;
let file_name = if self.locking {
loop {
let lockfile_name = format!(".{file_name}.lafl_lock");
let lockfile_path = self.dir_path.join(lockfile_name);
let mut lockfile = try_create_new(&lockfile_path)?.unwrap_or(File::open(lockfile_path)?);

if try_create_new(lockfile_path)?.is_some() {
break file_name;
}
lockfile.lock_exclusive()?;

file_name = format!("{file_name_orig}-{ctr}");
ctr += 1;
}
} else {
file_name
let mut bfr = [0u8; 4];
let ctr = match lockfile.read_exact(&mut bfr) {
Ok(_) => u32::from_le_bytes(bfr) + 1,
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => 1,
Err(e) => return Err(e.into()),
};

lockfile.write_all(&ctr.to_le_bytes())?;
lockfile.unlock()?;

if testcase.file_path().is_none() {
*testcase.file_path_mut() = Some(self.dir_path.join(&file_name));
}
*testcase.filename_mut() = Some(file_name);

if self.meta_format.is_some() {
let metafile_name = format!(".{}.metadata", testcase.filename().as_ref().unwrap());
let metafile_name = format!(
".{}_{}.metadata",
testcase.filename().as_ref().unwrap(),
ctr
);
let metafile_path = self.dir_path.join(&metafile_name);
let mut tmpfile_path = metafile_path.clone();
tmpfile_path.set_file_name(format!(".{metafile_name}.tmp",));
Expand Down Expand Up @@ -442,12 +444,7 @@ impl<I> InMemoryOnDiskCorpus<I> {
*testcase.metadata_path_mut() = Some(metafile_path);
}

if let Err(err) = self.store_input_from(testcase) {
if self.locking {
return Err(err);
}
log::error!("An error occurred when trying to write a testcase without locking: {err}");
}
self.store_input_from(testcase)?;
Ok(())
}

Expand Down
2 changes: 1 addition & 1 deletion libafl/src/events/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ where
fn name_detailed(&self) -> Cow<'static, str> {
match self {
Event::NewTestcase { input, .. } => {
Cow::Owned(format!("Testcase {}", input.generate_name(None)))
Cow::Owned(format!("Testcase {}", input.generate_name()))
}
Event::UpdateExecStats { .. } => Cow::Borrowed("Client Heartbeat"),
Event::UpdateUserStats { .. } => Cow::Borrowed("UserStats"),
Expand Down
2 changes: 1 addition & 1 deletion libafl/src/executors/hooks/unix.rs
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ pub mod unix_signal_handler {
let mut bsod = Vec::new();
{
let mut writer = std::io::BufWriter::new(&mut bsod);
let _ = writeln!(writer, "input: {:?}", input.generate_name(None));
let _ = writeln!(writer, "input: {:?}", input.generate_name());
let bsod = libafl_bolts::minibsod::generate_minibsod(
&mut writer,
signal,
Expand Down
4 changes: 2 additions & 2 deletions libafl/src/inputs/encoded.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ use libafl_bolts::{Error, HasLen};
use regex::Regex;
use serde::{Deserialize, Serialize};

use crate::{corpus::CorpusId, inputs::Input};
use crate::inputs::Input;

/// Trait to encode bytes to an [`EncodedInput`] using the given [`Tokenizer`]
pub trait InputEncoder<T>
Expand Down Expand Up @@ -202,7 +202,7 @@ pub struct EncodedInput {
impl Input for EncodedInput {
/// Generate a name for this input
#[must_use]
fn generate_name(&self, _id: Option<CorpusId>) -> String {
fn generate_name(&self) -> String {
let mut hasher = RandomState::with_seeds(0, 0, 0, 0).build_hasher();
for code in &self.codes {
hasher.write(&code.to_le_bytes());
Expand Down
4 changes: 2 additions & 2 deletions libafl/src/inputs/gramatron.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use ahash::RandomState;
use libafl_bolts::{Error, HasLen};
use serde::{Deserialize, Serialize};

use crate::{corpus::CorpusId, inputs::Input};
use crate::inputs::Input;

/// A terminal for gramatron grammar fuzzing
#[derive(Serialize, Deserialize, Clone, Debug, Default, PartialEq, Eq, Hash)]
Expand Down Expand Up @@ -44,7 +44,7 @@ pub struct GramatronInput {
impl Input for GramatronInput {
/// Generate a name for this input
#[must_use]
fn generate_name(&self, _id: Option<CorpusId>) -> String {
fn generate_name(&self) -> String {
let mut hasher = RandomState::with_seeds(0, 0, 0, 0).build_hasher();
for term in &self.terms {
hasher.write(term.symbol.as_bytes());
Expand Down
26 changes: 13 additions & 13 deletions libafl/src/inputs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,21 +28,23 @@ pub mod nautilus;

use alloc::{
boxed::Box,
string::{String, ToString},
string::String,
vec::{Drain, Splice, Vec},
};
use core::{
clone::Clone,
fmt::Debug,
hash::Hash,
marker::PhantomData,
ops::{DerefMut, RangeBounds},
};
#[cfg(feature = "std")]
use std::{fs::File, hash::Hash, io::Read, path::Path};
use std::{fs::File, io::Read, path::Path};

#[cfg(feature = "std")]
use libafl_bolts::fs::write_file_atomic;
use libafl_bolts::{
generic_hash_std,
ownedref::{OwnedMutSlice, OwnedSlice},
subrange::{SubRangeMutSlice, SubRangeSlice},
Error, HasLen,
Expand All @@ -51,11 +53,9 @@ use libafl_bolts::{
pub use nautilus::*;
use serde::{Deserialize, Serialize};

use crate::corpus::CorpusId;

/// An input for the target
#[cfg(not(feature = "std"))]
pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug {
pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug + Hash {
/// Write this input to the file
fn to_file<P>(&self, _path: P) -> Result<(), Error> {
Err(Error::not_implemented("Not supported in no_std"))
Expand All @@ -67,12 +67,14 @@ pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug {
}

/// Generate a name for this input
fn generate_name(&self, id: Option<CorpusId>) -> String;
fn generate_name(&self) -> String {
format!("{:016x}", generic_hash_std(self))
}
}

/// An input for the target
#[cfg(feature = "std")]
pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug {
pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug + Hash {
/// Write this input to the file
fn to_file<P>(&self, path: P) -> Result<(), Error>
where
Expand All @@ -93,7 +95,9 @@ pub trait Input: Clone + Serialize + serde::de::DeserializeOwned + Debug {
}

/// Generate a name for this input, the user is responsible for making each name of testcase unique.
fn generate_name(&self, id: Option<CorpusId>) -> String;
fn generate_name(&self) -> String {
format!("{:016x}", generic_hash_std(self))
}
}

/// Convert between two input types with a state
Expand All @@ -118,11 +122,7 @@ macro_rules! none_input_converter {
/// An input for tests, mainly. There is no real use much else.
#[derive(Copy, Clone, Serialize, Deserialize, Debug, Hash)]
pub struct NopInput {}
impl Input for NopInput {
fn generate_name(&self, _id: Option<CorpusId>) -> String {
"nop-input".to_string()
}
}
impl Input for NopInput {}
impl HasTargetBytes for NopInput {
fn target_bytes(&self) -> OwnedSlice<u8> {
OwnedSlice::from(vec![0])
Expand Down
16 changes: 4 additions & 12 deletions libafl/src/inputs/value.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
//! Newtype pattern style wrapper for [`super::Input`]s

use alloc::{string::String, vec::Vec};
use alloc::vec::Vec;
use core::{fmt::Debug, hash::Hash};

use libafl_bolts::{generic_hash_std, rands::Rand};
use libafl_bolts::rands::Rand;
use serde::{Deserialize, Serialize};
#[cfg(feature = "std")]
use {
Expand All @@ -12,7 +12,7 @@ use {
};

use super::Input;
use crate::{corpus::CorpusId, mutators::numeric::Numeric};
use crate::mutators::numeric::Numeric;

/// Newtype pattern wrapper around an underlying structure to implement inputs
///
Expand Down Expand Up @@ -56,11 +56,7 @@ impl<I: Copy> Copy for ValueInput<I> {}
macro_rules! impl_input_for_value_input {
($($t:ty => $name:ident),+ $(,)?) => {
$(
impl Input for ValueInput<$t> {
fn generate_name(&self, _id: Option<CorpusId>) -> String {
format!("{:016x}", generic_hash_std(self))
}
}
impl Input for ValueInput<$t> {}

/// Input wrapping a <$t>
pub type $name = ValueInput<$t>;
Expand All @@ -86,10 +82,6 @@ impl_input_for_value_input!(

/// manually implemented because files can be written more efficiently
impl Input for ValueInput<Vec<u8>> {
fn generate_name(&self, _id: Option<CorpusId>) -> String {
format!("{:016x}", generic_hash_std(self))
}

/// Write this input to the file
#[cfg(feature = "std")]
fn to_file<P>(&self, path: P) -> Result<(), Error>
Expand Down
5 changes: 1 addition & 4 deletions libafl/src/stages/dump.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,7 @@ where
[
Some(id.0.to_string()),
testcase.filename().clone(),
testcase
.input()
.as_ref()
.map(|t| t.generate_name(Some(*id))),
testcase.input().as_ref().map(|t| t.generate_name()),
]
.iter()
.flatten()
Expand Down
Loading