Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Version 0.2.1 #55

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions src/fileinfo.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ use std::fs;
use std::hash::Hasher;
use std::{fs::Metadata, path::PathBuf};

/// FileInfo is a struct that is used to represent a file on disk.
/// It contains the following information:
/// path - the absolute path to the file
/// filemeta - file metadata of the file (std::fs::Metadata)
/// hash - Option of hash generated using the fxhash library
/// size - size of file obtained from the Metadata of the file
#[derive(Debug, Clone)]
pub struct FileInfo {
pub path: PathBuf,
Expand All @@ -13,6 +19,8 @@ pub struct FileInfo {
}

impl FileInfo {
/// returns a copy of self with the `hash` field replaced with Some(String) containing the hash
/// of the contents of the file
pub fn hash(&self) -> Result<Self> {
let file = fs::File::open(self.path.clone())?;
let mapper = unsafe { Mmap::map(&file)? };
Expand All @@ -28,6 +36,10 @@ impl FileInfo {
})
}

/// accepts the path of the file & returns a new FileInfo struct instance that contains
/// metadata, size & path of the file.
/// The hash is set to None at this stage as it will only be calculated when it's absolutely
/// required.
pub fn new(path: PathBuf) -> Result<Self> {
let filemeta = std::fs::metadata(path.clone())?;
Ok(Self {
Expand Down
10 changes: 9 additions & 1 deletion src/formatter.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
pub struct Formatter;
use crate::fileinfo::FileInfo;
use crate::params::Params;
use anyhow::Result;
Expand All @@ -15,7 +14,12 @@ use std::borrow::Cow;
use std::path::PathBuf;
use std::time::Duration;

/// Unit struct (it holds no data) that's used to encapsulate common formatting functions as
/// associated functions, e.g. `Formatter::human_filesize` and `Formatter::human_mtime`.
pub struct Formatter;

impl Formatter {
/// generates a string representing the path to the file on disk. The path that is returned is
/// relative to the path from which deduplicator is invoked.
pub fn human_path(
file: &FileInfo,
app_args: &Params,
Expand All @@ -33,15 +37,18 @@ impl Formatter {
Ok(formatted_path)
}

/// Renders the size of `file` as a human readable string, right-aligned within a field that is
/// 12 characters wide.
pub fn human_filesize(file: &FileInfo) -> Result<String> {
    let readable_size = bytesize::ByteSize::b(file.size);
    Ok(format!("{:>12}", readable_size))
}

/// generate human readable modified time for the file
pub fn human_mtime(file: &FileInfo) -> Result<String> {
let modified_time: DateTime<Utc> = file.filemeta.modified()?.into();
Ok(modified_time.format("%Y-%m-%d %H:%M:%S").to_string())
}

/// generates table containing grouped duplicates
pub fn generate_table(raw: Vec<FileInfo>, app_args: &Params) -> Result<Table> {
let basepath_length = app_args.get_directory()?.to_str().unwrap_or_default().len();
let max_filepath_length = raw
Expand Down Expand Up @@ -112,6 +119,7 @@ impl Formatter {
Ok(output_table)
}

/// prints out the table generated by the generate_table function
pub fn print(raw: Vec<FileInfo>, app_args: &Params) -> Result<()> {
if raw.is_empty() {
println!(
Expand Down
32 changes: 18 additions & 14 deletions src/processor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,16 @@ use std::{time::Duration, borrow::Cow};

use crate::fileinfo::FileInfo;

/// different phases through which deduplicator progresses during the process of detecting
/// duplicates.
#[derive(Debug, Clone)]
pub enum State {
/// NOTE(review): presumably the state before any grouping pass has run — confirm against the
/// Processor constructor.
Initial,
/// set by `Processor::sizewise` once files have been grouped by their size.
SizeWise,
/// set by `Processor::hashwise` once files have been grouped by the hash of their contents.
HashWise,
}

/// Processor struct contains the set of files that are being worked on along with the current state of the process.
#[derive(Debug, Clone)]
pub struct Processor {
pub files: Vec<FileInfo>,
Expand All @@ -27,7 +30,7 @@ impl Processor {
}
}

pub fn hashwise(&self) -> Result<Self> {
pub fn sizewise(&self) -> Result<Self> {
if self.files.is_empty() {
return Ok(self.clone());
}
Expand All @@ -36,19 +39,17 @@ impl Processor {
let progress_bar = ProgressBar::new(self.files.len() as u64);
progress_bar.set_style(progress_style);
progress_bar.enable_steady_tick(Duration::from_millis(50));
progress_bar.set_message("indexing file hashes");
progress_bar.set_message("indexing file sizes");

let duplicates_table: DashMap<String, Vec<FileInfo>> = DashMap::new();
let duplicates_table: DashMap<u64, Vec<FileInfo>> = DashMap::new();
self.files
.clone()
.into_par_iter()
.progress_with(progress_bar)
.with_finish(ProgressFinish::WithMessage(Cow::from("indexed files hashes")))
.map(|file| file.hash())
.filter_map(Result::ok)
.with_finish(ProgressFinish::WithMessage(Cow::from("indexed files sizes")))
.for_each(|file| {
duplicates_table
.entry(file.hash.clone().unwrap_or_default())
.entry(file.size)
.and_modify(|fileset| fileset.push(file.clone()))
.or_insert_with(|| vec![file]);
});
Expand All @@ -63,11 +64,12 @@ impl Processor {

Ok(Self {
files,
state: State::HashWise,
state: State::SizeWise,
})
}

pub fn sizewise(&self) -> Result<Self> {
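/// hashwise is the last phase: it hashes the contents of each file and groups together the
/// files that share the same hash.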
pub fn hashwise(&self) -> Result<Self> {
if self.files.is_empty() {
return Ok(self.clone());
}
Expand All @@ -76,17 +78,19 @@ impl Processor {
let progress_bar = ProgressBar::new(self.files.len() as u64);
progress_bar.set_style(progress_style);
progress_bar.enable_steady_tick(Duration::from_millis(50));
progress_bar.set_message("indexing file sizes");
progress_bar.set_message("indexing file hashes");

let duplicates_table: DashMap<u64, Vec<FileInfo>> = DashMap::new();
let duplicates_table: DashMap<String, Vec<FileInfo>> = DashMap::new();
self.files
.clone()
.into_par_iter()
.progress_with(progress_bar)
.with_finish(ProgressFinish::WithMessage(Cow::from("indexed files sizes")))
.with_finish(ProgressFinish::WithMessage(Cow::from("indexed files hashes")))
.map(|file| file.hash())
.filter_map(Result::ok)
.for_each(|file| {
duplicates_table
.entry(file.size)
.entry(file.hash.clone().unwrap_or_default())
.and_modify(|fileset| fileset.push(file.clone()))
.or_insert_with(|| vec![file]);
});
Expand All @@ -101,7 +105,7 @@ impl Processor {

Ok(Self {
files,
state: State::SizeWise,
state: State::HashWise,
})
}
}