diff --git a/Cargo.lock b/Cargo.lock index ba7f083..3f38f25 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "anyhow" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800" + [[package]] name = "base64" version = "0.13.1" @@ -81,9 +87,13 @@ dependencies = [ name = "email" version = "0.1.0" dependencies = [ + "anyhow", "mailparse", + "memmap", "regex", "sha1", + "thiserror", + "walkdir", ] [[package]] @@ -128,6 +138,34 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +[[package]] +name = "memmap" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "proc-macro2" +version = "1.0.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +dependencies = [ + "proc-macro2", +] + [[package]] name = "quoted_printable" version = "0.4.7" @@ -151,6 +189,15 @@ version = "0.6.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "sha1" version = "0.10.5" @@ -162,14 +209,93 @@ dependencies = [ "digest", ] +[[package]] +name = "syn" +version = "1.0.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d56e159d99e6c2b93995d171050271edb50ecc5288fbc7cc17de8fdce4e58c14" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "1.0.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "typenum" version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" +[[package]] +name = "unicode-ident" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" + [[package]] name = "version_check" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "walkdir" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" +dependencies = [ + "same-file", + "winapi", + "winapi-util", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/Cargo.toml b/Cargo.toml index 1071d42..3e56dd0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,10 @@ edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +anyhow = "1.0.69" mailparse = "0.14.0" +memmap = "0.7.0" regex = "1.7.0" sha1 = "0.10.5" +thiserror = "1.0.38" +walkdir = "2.3.2" diff --git a/src/bin/cleanupdupes.rs b/src/bin/cleanupdupes.rs new file mode 100644 index 0000000..bd44b67 --- /dev/null +++ b/src/bin/cleanupdupes.rs @@ -0,0 +1,69 @@ +use std::{collections::HashMap, fs::remove_file}; + +use email::hash_file; +use walkdir::WalkDir; +const ENV_VAR_TO_DELETE: &str = "DELETE_DUPES"; + +fn main() -> anyhow::Result<()> { + let map = std::env::args() + .skip(1) + .nth(0) + .map(|dir| { + WalkDir::new(dir) + .into_iter() + .filter_map(|entry| entry.ok()) + .filter_map(|entry| { + if entry.file_type().is_dir() { + println!("{}", entry.path().display()); + return None; + } + let arg = entry.path().display().to_string(); + match hash_file(&arg) { + Ok(h) => Some((h, arg)), + Err(e) => { + eprintln!("{}: failed {}", arg, e); + None + } + } + }) + .fold(HashMap::new(), |mut m, (h, arg)| { + m.entry(h).or_insert(Vec::new()).push(arg); + m + }) + }) + .unwrap(); + + for (hash, mut paths) in map { + if paths.len() == 1 { + continue; + } + + // Put files in "Oldmail" at the end of the list. We keep only the first, and we prefer to + // remove Oldmail over all else. + paths.sort_by(|a, b| { + if a.contains("Oldmail") && b.contains("Oldmail") { + a.partial_cmp(b).unwrap() + } else if a.contains("Oldmail") { + std::cmp::Ordering::Greater + } else if b.contains("Oldmail") { + std::cmp::Ordering::Less + } else { + a.partial_cmp(b).unwrap() + } + }); + + let mut it = paths.iter(); + println!("\n{hash}:"); + println!(" keep: {}", it.next().unwrap()); + for p in it { + println!(" rm: {p}",); + if std::env::var(ENV_VAR_TO_DELETE).is_ok() { + println!("DELETING {p}"); + if let Some(e) = remove_file(p).err() { + eprintln!("Failed to remove {p}: {e}"); + } + } + } + } + Ok(()) +} diff --git a/src/bin/mailparse.rs b/src/bin/mailparse.rs index 0d8de5c..7d10fbe 100644 --- a/src/bin/mailparse.rs +++ b/src/bin/mailparse.rs @@ -1,6 +1,6 @@ -use std::{env, error::Error, fs::File, io::prelude::*, process::exit, slice::Iter}; +use std::{env, fs::File, io::prelude::*, process::exit, slice::Iter}; -use mailparse::{dateparse, MailHeaderMap}; +use mailparse::MailHeaderMap; fn newline(b: &u8) -> bool { *b == b'\n' @@ -23,14 +23,14 @@ fn index_of(it: &mut Iter, needle: &[u8]) -> Option { }) } -fn parse_mbox(mbox_bytes: &Vec) -> Result<(), Box> { +fn parse_mbox(mbox_bytes: &Vec) { let mut it = mbox_bytes.iter(); let mut ix = 0; loop { let mail_start = it.position(newline); if mail_start.is_none() { - return Ok(()); + return; } ix += mail_start.unwrap() + 1; let start = ix; @@ -74,10 +74,10 @@ fn main() { } let mut args = env::args(); args.next(); // drop executable name - args.for_each(|mbox_path| { + for mbox_path in args { let mut mbox = File::open(mbox_path).unwrap(); let mut mails = Vec::new(); mbox.read_to_end(&mut mails).unwrap(); parse_mbox(&mails); - }); + } } diff --git a/src/lib.rs b/src/lib.rs index 7b19339..7a67274 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,17 @@ -use mailparse::{parse_headers, MailHeader, MailHeaderMap}; +use std::{fs::File, path::Path}; + +use mailparse::{parse_headers, MailHeader, MailHeaderMap, MailParseError}; +use memmap::MmapOptions; use sha1::{Digest, Sha1}; +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum EmailError { + #[error("file error: {0}")] + FileError(#[from] std::io::Error), + #[error("mail parse error: {0}")] + MailParseError(#[from] MailParseError), +} // Keep these sorted to match Go implementation. const IMPORTANT_HEADERS: &[&str] = &[ @@ -12,13 +24,20 @@ const IMPORTANT_HEADERS: &[&str] = &[ "to", ]; +pub fn hash_file>(path: P) -> Result { + let file = File::open(path)?; + let mmap = unsafe { MmapOptions::new().map(&file)? }; + let (hdrs, _) = parse_headers(&mmap)?; + Ok(hash_headers(&hdrs)) +} + pub fn hash_headers(hdrs: &[MailHeader]) -> String { // create a Sha1 object let mut hasher = Sha1::new(); for h in IMPORTANT_HEADERS { if let Some(v) = hdrs.get_first_value(h) { - eprintln!("V [{}]", v); + //eprintln!("{}: [{}]", h, v); hasher.update(v); } }