Implement cleanupdupes to remove duplicate emails in a Maildir.

This commit is contained in:
Bill Thiede 2023-02-22 22:29:22 -08:00
parent 7bbdaec84b
commit 88d6c6867d
5 changed files with 226 additions and 8 deletions

126
Cargo.lock generated
View File

@ -11,6 +11,12 @@ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "anyhow"
version = "1.0.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800"
[[package]] [[package]]
name = "base64" name = "base64"
version = "0.13.1" version = "0.13.1"
@ -81,9 +87,13 @@ dependencies = [
name = "email" name = "email"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"anyhow",
"mailparse", "mailparse",
"memmap",
"regex", "regex",
"sha1", "sha1",
"thiserror",
"walkdir",
] ]
[[package]] [[package]]
@ -128,6 +138,34 @@ version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]]
name = "memmap"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b"
dependencies = [
"libc",
"winapi",
]
[[package]]
name = "proc-macro2"
version = "1.0.51"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
dependencies = [
"proc-macro2",
]
[[package]] [[package]]
name = "quoted_printable" name = "quoted_printable"
version = "0.4.7" version = "0.4.7"
@ -151,6 +189,15 @@ version = "0.6.28"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
[[package]]
name = "same-file"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
dependencies = [
"winapi-util",
]
[[package]] [[package]]
name = "sha1" name = "sha1"
version = "0.10.5" version = "0.10.5"
@ -162,14 +209,93 @@ dependencies = [
"digest", "digest",
] ]
[[package]]
name = "syn"
version = "1.0.108"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d56e159d99e6c2b93995d171050271edb50ecc5288fbc7cc17de8fdce4e58c14"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "thiserror"
version = "1.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]] [[package]]
name = "typenum" name = "typenum"
version = "1.16.0" version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
[[package]]
name = "unicode-ident"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
[[package]] [[package]]
name = "version_check" name = "version_check"
version = "0.9.4" version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "walkdir"
version = "2.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56"
dependencies = [
"same-file",
"winapi",
"winapi-util",
]
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-util"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
dependencies = [
"winapi",
]
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"

View File

@ -7,6 +7,10 @@ edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
anyhow = "1.0.69"
mailparse = "0.14.0" mailparse = "0.14.0"
memmap = "0.7.0"
regex = "1.7.0" regex = "1.7.0"
sha1 = "0.10.5" sha1 = "0.10.5"
thiserror = "1.0.38"
walkdir = "2.3.2"

69
src/bin/cleanupdupes.rs Normal file
View File

@ -0,0 +1,69 @@
use std::{collections::HashMap, fs::remove_file};
use email::hash_file;
use walkdir::WalkDir;
/// Name of the environment variable that turns on real deletion of duplicates.
/// `main` only checks that the variable can be read (`std::env::var(..).is_ok()`); its value is
/// not inspected. When it is unset, duplicates are only reported.
const ENV_VAR_TO_DELETE: &str = "DELETE_DUPES";
/// Reports (and optionally deletes) duplicate emails below the directory given as the first
/// command-line argument.
///
/// Every non-directory entry found while walking the tree is hashed with `hash_file`; paths that
/// share a hash are treated as duplicates of each other. Within each group the paths are sorted so
/// that anything containing "Oldmail" comes last, the first path is kept, and the rest are listed
/// as removal candidates. Files are only actually removed when the `ENV_VAR_TO_DELETE`
/// environment variable is set.
///
/// # Errors
///
/// Returns an error if no directory argument was supplied. Failures to walk, hash, or remove an
/// individual file are reported on stderr and do not abort the run.
fn main() -> anyhow::Result<()> {
    // A missing argument is a usage error, not a bug: report it instead of panicking.
    let dir = std::env::args()
        .nth(1)
        .ok_or_else(|| anyhow::anyhow!("usage: cleanupdupes <maildir>"))?;

    // The environment does not change while we run, so look the switch up once.
    let delete = std::env::var(ENV_VAR_TO_DELETE).is_ok();

    // hash -> every path whose headers produced that hash.
    let mut by_hash: HashMap<String, Vec<String>> = HashMap::new();
    for entry in WalkDir::new(dir) {
        let entry = match entry {
            Ok(entry) => entry,
            Err(e) => {
                // Surface unreadable entries rather than silently pretending they do not exist.
                eprintln!("skipping unreadable entry: {e}");
                continue;
            }
        };
        if entry.file_type().is_dir() {
            // Directories are only echoed as progress output.
            println!("{}", entry.path().display());
            continue;
        }
        let path = entry.path().display().to_string();
        match hash_file(&path) {
            Ok(hash) => by_hash.entry(hash).or_default().push(path),
            Err(e) => eprintln!("{}: failed {}", path, e),
        }
    }

    for (hash, mut paths) in by_hash {
        if paths.len() == 1 {
            continue;
        }
        // Put files in "Oldmail" at the end of the list. We keep only the first, and we prefer to
        // remove Oldmail over all else. `false < true`, so non-Oldmail paths sort first; ties are
        // broken by plain string order.
        paths.sort_by(|a, b| {
            a.contains("Oldmail")
                .cmp(&b.contains("Oldmail"))
                .then_with(|| a.cmp(b))
        });
        if let Some((keep, dupes)) = paths.split_first() {
            println!("\n{hash}:");
            println!(" keep: {}", keep);
            for p in dupes {
                println!(" rm: {p}");
                if delete {
                    println!("DELETING {p}");
                    if let Err(e) = remove_file(p) {
                        eprintln!("Failed to remove {p}: {e}");
                    }
                }
            }
        }
    }
    Ok(())
}

View File

@ -1,6 +1,6 @@
use std::{env, error::Error, fs::File, io::prelude::*, process::exit, slice::Iter}; use std::{env, fs::File, io::prelude::*, process::exit, slice::Iter};
use mailparse::{dateparse, MailHeaderMap}; use mailparse::MailHeaderMap;
fn newline(b: &u8) -> bool { fn newline(b: &u8) -> bool {
*b == b'\n' *b == b'\n'
@ -23,14 +23,14 @@ fn index_of(it: &mut Iter<u8>, needle: &[u8]) -> Option<usize> {
}) })
} }
fn parse_mbox(mbox_bytes: &Vec<u8>) -> Result<(), Box<dyn Error>> { fn parse_mbox(mbox_bytes: &Vec<u8>) {
let mut it = mbox_bytes.iter(); let mut it = mbox_bytes.iter();
let mut ix = 0; let mut ix = 0;
loop { loop {
let mail_start = it.position(newline); let mail_start = it.position(newline);
if mail_start.is_none() { if mail_start.is_none() {
return Ok(()); return;
} }
ix += mail_start.unwrap() + 1; ix += mail_start.unwrap() + 1;
let start = ix; let start = ix;
@ -74,10 +74,10 @@ fn main() {
} }
let mut args = env::args(); let mut args = env::args();
args.next(); // drop executable name args.next(); // drop executable name
args.for_each(|mbox_path| { for mbox_path in args {
let mut mbox = File::open(mbox_path).unwrap(); let mut mbox = File::open(mbox_path).unwrap();
let mut mails = Vec::new(); let mut mails = Vec::new();
mbox.read_to_end(&mut mails).unwrap(); mbox.read_to_end(&mut mails).unwrap();
parse_mbox(&mails); parse_mbox(&mails);
}); }
} }

View File

@ -1,5 +1,17 @@
use mailparse::{parse_headers, MailHeader, MailHeaderMap}; use std::{fs::File, path::Path};
use mailparse::{parse_headers, MailHeader, MailHeaderMap, MailParseError};
use memmap::MmapOptions;
use sha1::{Digest, Sha1}; use sha1::{Digest, Sha1};
use thiserror::Error;
/// Errors returned by this crate's file-hashing helpers (see `hash_file`).
#[derive(Error, Debug)]
pub enum EmailError {
/// An I/O operation failed; built automatically from `std::io::Error` via `#[from]`.
#[error("file error: {0}")]
FileError(#[from] std::io::Error),
/// `mailparse` reported a parse failure; built automatically from `MailParseError` via `#[from]`.
#[error("mail parse error: {0}")]
MailParseError(#[from] MailParseError),
}
// Keep these sorted to match Go implementation. // Keep these sorted to match Go implementation.
const IMPORTANT_HEADERS: &[&str] = &[ const IMPORTANT_HEADERS: &[&str] = &[
@ -12,13 +24,20 @@ const IMPORTANT_HEADERS: &[&str] = &[
"to", "to",
]; ];
/// Computes the header hash of the email stored at `path`.
///
/// The file is memory-mapped, its headers are parsed with `parse_headers`, and the result of
/// `hash_headers` over those headers is returned.
///
/// # Errors
///
/// Returns `EmailError::FileError` if the file cannot be opened or mapped, and
/// `EmailError::MailParseError` if the headers cannot be parsed.
pub fn hash_file<P: AsRef<Path>>(path: P) -> Result<String, EmailError> {
    let handle = File::open(path)?;
    // SAFETY (NOTE(review)): mapping is only sound while nothing truncates or rewrites the file
    // underneath us — presumably acceptable for a mail store at rest; confirm with callers.
    let mapped = unsafe { MmapOptions::new().map(&handle) }?;
    // Only the headers are needed; the body offset returned alongside them is ignored.
    let (headers, _body_offset) = parse_headers(&mapped)?;
    Ok(hash_headers(&headers))
}
pub fn hash_headers(hdrs: &[MailHeader]) -> String { pub fn hash_headers(hdrs: &[MailHeader]) -> String {
// create a Sha1 object // create a Sha1 object
let mut hasher = Sha1::new(); let mut hasher = Sha1::new();
for h in IMPORTANT_HEADERS { for h in IMPORTANT_HEADERS {
if let Some(v) = hdrs.get_first_value(h) { if let Some(v) = hdrs.get_first_value(h) {
eprintln!("V [{}]", v); //eprintln!("{}: [{}]", h, v);
hasher.update(v); hasher.update(v);
} }
} }