diff --git a/Cargo.lock b/Cargo.lock index 3c59fd7..39e467f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "aho-corasick" +version = "0.7.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +dependencies = [ + "memchr", +] + [[package]] name = "android_system_properties" version = "0.1.5" @@ -52,6 +61,16 @@ dependencies = [ "serde", ] +[[package]] +name = "bstr" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b45ea9b00a7b3f2988e9a65ad3917e62123c38dba709b666506207be96d1790b" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "bumpalo" version = "3.11.1" @@ -229,7 +248,7 @@ version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" dependencies = [ - "bstr", + "bstr 0.2.17", "csv-core", "itoa", "ryu", @@ -305,7 +324,7 @@ dependencies = [ [[package]] name = "deduplicator" -version = "0.1.3" +version = "0.1.4" dependencies = [ "anyhow", "bytesize", @@ -314,7 +333,7 @@ dependencies = [ "colored", "dashmap", "fxhash", - "glob", + "globwalk", "indicatif", "itertools", "memmap2", @@ -385,6 +404,12 @@ dependencies = [ "libc", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "fxhash" version = "0.2.1" @@ -406,10 +431,28 @@ dependencies = [ ] [[package]] -name = "glob" -version = "0.3.0" +name = "globset" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" +checksum = "029d74589adefde59de1a0c4f4732695c32805624aec7b68d91503d4dba79afc" +dependencies = [ + 
"aho-corasick", + "bstr 1.1.0", + "fnv", + "log", + "regex", +] + +[[package]] +name = "globwalk" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93e3af942408868f6934a7b85134a3230832b9977cf66125df2f9edcfce4ddcc" +dependencies = [ + "bitflags", + "ignore", + "walkdir", +] [[package]] name = "hashbrown" @@ -465,6 +508,23 @@ dependencies = [ "cxx-build", ] +[[package]] +name = "ignore" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbe7873dab538a9a44ad79ede1faf5f30d49f9a5c883ddbab48bce81b64b7492" +dependencies = [ + "globset", + "lazy_static", + "log", + "memchr", + "regex", + "same-file", + "thread_local", + "walkdir", + "winapi-util", +] + [[package]] name = "indicatif" version = "0.17.2" @@ -787,12 +847,29 @@ dependencies = [ "thiserror", ] +[[package]] +name = "regex" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + [[package]] name = "regex-automata" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +[[package]] +name = "regex-syntax" +version = "0.6.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" + [[package]] name = "rustix" version = "0.36.5" @@ -819,6 +896,15 @@ version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] 
name = "scopeguard" version = "1.1.0" @@ -919,6 +1005,15 @@ dependencies = [ "syn", ] +[[package]] +name = "thread_local" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180" +dependencies = [ + "once_cell", +] + [[package]] name = "time" version = "0.1.45" @@ -985,6 +1080,17 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "walkdir" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +dependencies = [ + "same-file", + "winapi", + "winapi-util", +] + [[package]] name = "wasi" version = "0.10.0+wasi-snapshot-preview1" diff --git a/Cargo.toml b/Cargo.toml index da1018a..8152c50 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,14 @@ [package] name = "deduplicator" -version = "0.1.3" +version = "0.1.4" edition = "2021" description = "find,filter,delete Duplicates" license = "MIT" -authors = ["Sreedev Kodichath ", "Valentin Bersier ", "Dhruva Sagar "] +authors = [ + "Sreedev Kodichath ", + "Valentin Bersier ", + "Dhruva Sagar ", +] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html @@ -16,7 +20,7 @@ clap = { version = "4.0.32", features = ["derive"] } colored = "2.0.0" dashmap = { version = "5.4.0", features = ["rayon"] } fxhash = "0.2.1" -glob = "0.3.0" +globwalk = "0.8.1" indicatif = { version = "0.17.2", features = ["rayon", "tokio"] } itertools = "0.10.5" memmap2 = "0.5.8" diff --git a/src/output.rs b/src/output.rs index 44def2d..8767bfb 100644 --- a/src/output.rs +++ b/src/output.rs @@ -5,6 +5,7 @@ use chrono::offset::Utc; use chrono::DateTime; use colored::Colorize; use dashmap::DashMap; +use indicatif::{ProgressBar, ProgressIterator, ProgressStyle}; use 
itertools::Itertools; use prettytable::{format, row, Table}; use std::io::Write; @@ -12,7 +13,7 @@ use std::{fs, io}; use unicode_segmentation::UnicodeSegmentation; fn format_path(path: &str, opts: &Params) -> Result { - let display_path = path.replace(&opts.get_directory()?, ""); + let display_path = path.replace(opts.get_directory()?.to_string_lossy().as_ref(), ""); let display_range = if display_path.chars().count() > 32 { display_path .graphemes(true) @@ -26,7 +27,7 @@ fn format_path(path: &str, opts: &Params) -> Result { display_path }; - Ok(format!("...{:<32}", display_range)) + Ok(format!("...{display_range:<32}")) } fn file_size(file: &File) -> Result { @@ -153,10 +154,18 @@ pub fn print(duplicates: DashMap>, opts: &Params) { } let mut output_table = Table::new(); + let progress_bar = ProgressBar::new(duplicates.len() as u64); + let progress_style = ProgressStyle::default_bar() + .template("{spinner:.green} [generating output] [{wide_bar:.cyan/blue}] {pos}/{len} files") + .unwrap(); + + progress_bar.set_style(progress_style); output_table.set_titles(row!["hash", "duplicates"]); + duplicates .into_iter() .sorted_unstable_by_key(|(_, f)| f.first().and_then(|ff| ff.size).unwrap_or_default()) + .progress_with(progress_bar) .for_each(|(hash, group)| { let mut inner_table = Table::new(); inner_table.set_format(*format::consts::FORMAT_NO_BORDER_LINE_SEPARATOR); diff --git a/src/params.rs b/src/params.rs index adca974..23817ce 100644 --- a/src/params.rs +++ b/src/params.rs @@ -1,6 +1,8 @@ +use std::{fs, path::PathBuf}; + use anyhow::{anyhow, Result}; use clap::{Parser, ValueHint}; -use std::{fs, path::PathBuf}; +use globwalk::{GlobWalker, GlobWalkerBuilder}; #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] @@ -30,35 +32,21 @@ impl Params { } } - pub fn get_directory(&self) -> Result { - let dir_pathbuf: PathBuf = self - .dir - .as_ref() - .unwrap_or(&std::env::current_dir()?) 
- .as_os_str() - .into(); - - let dir = fs::canonicalize(dir_pathbuf)? - .as_os_str() - .to_str() - .ok_or_else(|| anyhow!("Invalid directory"))? - .to_string(); - + pub fn get_directory(&self) -> Result { + let current_dir = std::env::current_dir()?; + let dir_path = self.dir.as_ref().unwrap_or(&current_dir).as_path(); + let dir = fs::canonicalize(dir_path)?; Ok(dir) } - pub fn get_glob_patterns(&self) -> PathBuf { - match self.types.as_ref() { - Some(filetypes) => vec![ - self.get_directory().unwrap(), - String::from("**"), - format!("{{{}}}", filetypes), - ] - .iter() - .collect::(), - None => vec![self.get_directory().unwrap().as_str(), "**", "*"] - .iter() - .collect::(), - } + pub fn get_glob_walker(&self) -> Result { + let pattern: String = match self.types.as_ref() { + Some(filetypes) => format!("**/*{{{filetypes}}}"), + None => "**/*".to_string(), + }; + // TODO: add params for maximum depth and following symlinks, then pass them to this builder + GlobWalkerBuilder::from_patterns(self.get_directory()?, &[pattern]) + .build() + .map_err(|e| anyhow!(e)) } } diff --git a/src/scanner.rs b/src/scanner.rs index b69534e..27ac8f5 100644 --- a/src/scanner.rs +++ b/src/scanner.rs @@ -2,11 +2,11 @@ use crate::{file_manager::File, filters, params::Params}; use anyhow::Result; use dashmap::DashMap; use fxhash::hash64 as hasher; -use glob::glob; -use indicatif::{ParallelProgressIterator, ProgressStyle}; +use indicatif::{ParallelProgressIterator, ProgressBar, ProgressIterator, ProgressStyle}; use memmap2::Mmap; use rayon::prelude::*; use std::hash::Hasher; +use std::time::Duration; use std::{fs, path::PathBuf}; #[derive(Clone, Copy)] @@ -40,16 +40,22 @@ pub fn duplicates(app_opts: &Params) -> Result>> { } fn scan(app_opts: &Params) -> Result> { - let glob_patterns = app_opts.get_glob_patterns().display().to_string(); - let glob_iter = glob(&glob_patterns)?; - let files = glob_iter - .filter(Result::is_ok) - .map(|file| file.unwrap()) + let walker = app_opts.get_glob_walker()?; 
+ let progress = ProgressBar::new_spinner(); + let progress_style = + ProgressStyle::with_template("{spinner:.green} [mapping paths] {pos} paths")?; + progress.set_style(progress_style); + progress.enable_steady_tick(Duration::from_millis(100)); + + let files = walker + .progress_with(progress) + .filter_map(Result::ok) + .map(|file| file.into_path()) .filter(|fpath| fpath.is_file()) .collect::>() .into_par_iter() .progress_with_style(ProgressStyle::with_template( - "{spinner:.green} [processing scan results] [{wide_bar:.cyan/blue}] {pos}/{len} files", + "{spinner:.green} [processing mapped paths] [{wide_bar:.cyan/blue}] {pos}/{len} files", )?) .map(|fpath| fpath.display().to_string()) .map(|fpath| File {