From 80b0412441f8f3324f1687f957d91731aaa7786c Mon Sep 17 00:00:00 2001 From: Bill Thiede Date: Wed, 22 Feb 2023 23:11:44 -0800 Subject: [PATCH] Use rayon to parallelize walk, this speeds up access over NFS. --- Cargo.lock | 112 ++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 1 + src/bin/cleanupdupes.rs | 25 +++++++-- 3 files changed, 133 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3f38f25..4774abe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,12 @@ version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800" +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + [[package]] name = "base64" version = "0.13.1" @@ -57,6 +63,49 @@ dependencies = [ "libc", ] +[[package]] +name = "crossbeam-channel" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f" +dependencies = [ + "cfg-if", +] + [[package]] name = "crypto-common" version = "0.1.6" @@ -83,6 +132,12 @@ dependencies = [ "crypto-common", ] +[[package]] +name = "either" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" + [[package]] name = "email" version = "0.1.0" @@ -90,6 +145,7 @@ dependencies = [ "anyhow", "mailparse", "memmap", + "rayon", "regex", "sha1", "thiserror", @@ -115,6 +171,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "hermit-abi" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" +dependencies = [ + "libc", +] + [[package]] name = "libc" version = "0.2.139" @@ -148,6 +213,25 @@ dependencies = [ "winapi", ] +[[package]] +name = "memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg", +] + +[[package]] +name = "num_cpus" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "proc-macro2" version = "1.0.51" @@ -172,6 +256,28 @@ version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a24039f627d8285853cc90dcddf8c1ebfaa91f834566948872b225b9a28ed1b6" +[[package]] +name = "rayon" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db3a213adf02b3bcfd2d3846bb41cb22857d131789e01df434fb7e7bc0759b7" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "356a0625f1954f730c0201cdab48611198dc6ce21f4acff55089b5a78e6e835b" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "num_cpus", +] + [[package]] name = "regex" version = "1.7.0" @@ -198,6 +304,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + [[package]] name = "sha1" version = "0.10.5" diff --git a/Cargo.toml b/Cargo.toml index 3e56dd0..3798d74 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ edition = "2018" anyhow = "1.0.69" mailparse = "0.14.0" memmap = "0.7.0" +rayon = "1.6.1" regex = "1.7.0" sha1 = "0.10.5" thiserror = "1.0.38" diff --git a/src/bin/cleanupdupes.rs b/src/bin/cleanupdupes.rs index bd44b67..c34fc26 100644 --- a/src/bin/cleanupdupes.rs +++ b/src/bin/cleanupdupes.rs @@ -1,6 +1,7 @@ use std::{collections::HashMap, fs::remove_file}; use email::hash_file; +use rayon::{iter::ParallelBridge, prelude::ParallelIterator}; use walkdir::WalkDir; const ENV_VAR_TO_DELETE: &str = "DELETE_DUPES"; @@ -11,6 +12,7 @@ fn main() -> anyhow::Result<()> { .map(|dir| { WalkDir::new(dir) .into_iter() + .par_bridge() .filter_map(|entry| entry.ok()) .filter_map(|entry| { if entry.file_type().is_dir() { @@ -26,12 +28,25 @@ fn main() -> anyhow::Result<()> { } } }) - .fold(HashMap::new(), |mut m, (h, arg)| { - m.entry(h).or_insert(Vec::new()).push(arg); - m - }) + .fold( + || HashMap::new(), + |mut m, (h, arg)| { + m.entry(h).or_insert(Vec::new()).push(arg); + m + }, + ) }) - .unwrap(); + .unwrap() + // Merge maps created by parallel iteration. + .reduce( + || HashMap::new(), + |mut acc, m| { + for (k, v) in m { + acc.entry(k).or_insert(Vec::new()).extend(v); + } + acc + }, + ); for (hash, mut paths) in map { if paths.len() == 1 {