diff --git a/src/lib.rs b/src/lib.rs index 16bf1bc..0bcc720 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,6 +3,7 @@ use number_prefix::NumberPrefix; use walkdir::WalkDir; use std::{ + collections::BTreeMap, fmt::Display, path::{Path, PathBuf}, }; @@ -31,6 +32,7 @@ impl ResultExt for Result { } } +#[derive(PartialEq, Eq, PartialOrd, Ord)] pub struct Entry { pub path: PathBuf, pub size: u64, @@ -91,3 +93,105 @@ pub fn format_size(size: u64, binary: bool) -> String { NumberPrefix::Prefixed(prefix, number) => format!("{:.2} {}B", number, prefix), } } + +/// Calculate the sum of sizes of all entries +/// +/// Ignore nested files when calculating the total +/// +/// For the nested files: +/// - `folder/ (5 MB)` +/// - `folder/big_file (15 MB)` +/// +/// The is 15 MB instead of 20 MB because the inner file is inside of the +/// folder that was also received as an argument +/// +/// Implemented with the Trie data structure, made of HashMap and PathBufs +/// that represent each path components of the canonicalized file paths +pub fn calculate_unique_total_size(entries: &[Entry]) -> u64 { + // Entries, but with with canonicalized paths + let entries = { + let mut new_entries: Vec<(PathBuf, &Entry)> = vec![]; + + for entry in entries { + // Log errors and ignore them in the total sum + let canonical_path = entry.path.canonicalize().log_err(Some(&entry.path)); + if let Ok(path) = canonical_path { + new_entries.push((path, entry)); + } + } + new_entries + }; + + #[derive(PartialEq, Eq, PartialOrd, Ord)] + struct TriePathNode { + // Children nodes of this current path, accessed by path + children: BTreeMap, + // Size of the file that ends at this node + node_size: u64, + } + + let mut trie_root = TriePathNode { + children: BTreeMap::new(), + node_size: 0, + }; + + // For each entry/path, add it to the Trie if it wasn't already inserted + // + // If the Trie receives a folder that is parent of a previously added file, then just consider + // the parent folder, removing the childs, this way, we do not count them twice towards the + // final total + for (path, entry) in entries { + // Necessary because we need to check when it's the last path piece + let mut path_iter = path.iter().peekable(); + // Pointer to traverse the tree + let mut current_trie_node = &mut trie_root; + // Size to be added at the end if the current entry isn't children of any other + let size_of_current_file = entry.size; + + while let Some(piece) = path_iter.next() { + // Query for the node in the Trie which matches the current path piece + let entry = current_trie_node.children.entry(PathBuf::from(piece)); + + let mut is_current_node_size_zero = true; + // Keeps track if the current entry is child of another previously found + let next_trie_node = entry + .and_modify(|next_node| { + // If we are in this block, it means that this node was already present in the + // trie tree + is_current_node_size_zero = next_node.node_size == 0; + }) + // Add a node with 0 size, which is only changed afterwards if it's the last piece + .or_insert(TriePathNode { + children: BTreeMap::new(), + node_size: 0, + }); + + // Skipping current entry, because it's nested inside an already accounted file, or is + // a repeated file + if !is_current_node_size_zero { + break; + } + + // If we are at the last piece of the current entry path, it means that this is the tip + // that finally represents the file, and which path is the full file path + let is_the_last_piece = path_iter.peek().is_none(); + if is_the_last_piece { + // Update the size of the last trie node for this piece + next_trie_node.node_size = size_of_current_file; + // Drop all the childrens so that their sizes won't be added twice + next_trie_node.children.clear(); + } + + // Update the pointer to keep traversing the trie + current_trie_node = next_trie_node; + } + } + + fn trie_recursive_sum(node: &TriePathNode) -> u64 { + let children_sum: u64 = node.children.values().map(trie_recursive_sum).sum(); + node.node_size + children_sum + } + + // Traverse the trie tree to calculate the sum + trie_recursive_sum(&trie_root) +} diff --git a/src/main.rs b/src/main.rs index b5ebd82..a58c9cb 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,7 +4,7 @@ use wild; mod cli; use cli::Cli; -use durt::{format_size, Entry}; +use durt::{calculate_unique_total_size, format_size, Entry}; fn main() { #[cfg(windows)] @@ -60,7 +60,7 @@ fn main() { Table::new(" {:>} {:<}") }; - let total_size = entries.iter().map(|e| e.size).sum(); + let total_size = calculate_unique_total_size(&entries); let mut omitted_entries = 0; for entry in entries {