Implement cleanupdupes to remove duplicate emails in a Maildir.

This commit is contained in:
2023-02-22 22:29:22 -08:00
parent 7bbdaec84b
commit 88d6c6867d
5 changed files with 226 additions and 8 deletions

69
src/bin/cleanupdupes.rs Normal file
View File

@@ -0,0 +1,69 @@
use std::{collections::HashMap, fs::remove_file};
use email::hash_file;
use walkdir::WalkDir;
/// Name of the environment variable that turns the duplicate report into real deletions.
/// Only whether the variable can be read is checked; its value is never inspected.
const ENV_VAR_TO_DELETE: &str = "DELETE_DUPES";
/// Finds files with identical `hash_file` digests under the directory given as the first
/// command-line argument and reports them.
///
/// Every directory visited is printed as it is walked. For each digest shared by more than
/// one file, one path is kept and the rest are listed as `rm:`; they are actually removed
/// only when the `ENV_VAR_TO_DELETE` environment variable is set.
///
/// # Errors
///
/// Returns an error when no directory argument is supplied. Per-file problems (unreadable
/// directory entries, hashing failures, failed removals) are reported on stderr and skipped
/// so that a single bad file does not abort the whole run.
fn main() -> anyhow::Result<()> {
    let dir = std::env::args().nth(1).ok_or_else(|| {
        anyhow::anyhow!(
            "usage: cleanupdupes <maildir> (set {} to actually delete duplicates)",
            ENV_VAR_TO_DELETE
        )
    })?;

    // Group every regular file under `dir` by its content hash.
    let mut by_hash = HashMap::new();
    for entry in WalkDir::new(dir) {
        let entry = match entry {
            Ok(entry) => entry,
            Err(e) => {
                // Do not hide unreadable subtrees: files below them are never compared.
                eprintln!("skipping unreadable entry: {}", e);
                continue;
            }
        };
        if entry.file_type().is_dir() {
            // Progress output: show which directory is being scanned.
            println!("{}", entry.path().display());
            continue;
        }
        let path = entry.path().display().to_string();
        match hash_file(&path) {
            Ok(h) => by_hash.entry(h).or_insert_with(Vec::new).push(path),
            Err(e) => eprintln!("{}: failed {}", path, e),
        }
    }

    // The environment cannot change while we run, so read the switch once.
    let delete = std::env::var(ENV_VAR_TO_DELETE).is_ok();

    for (hash, mut paths) in by_hash {
        if paths.len() < 2 {
            continue;
        }
        // Put files in "Oldmail" at the end of the list. We keep only the first, and we prefer
        // to remove Oldmail over all else. `false < true`, so paths without "Oldmail" sort
        // first; ties are broken by plain string order.
        paths.sort_by(|a, b| {
            a.contains("Oldmail")
                .cmp(&b.contains("Oldmail"))
                .then_with(|| a.cmp(b))
        });

        println!("\n{hash}:");
        if let Some((keep, extras)) = paths.split_first() {
            println!(" keep: {}", keep);
            for p in extras {
                println!(" rm: {p}",);
                if delete {
                    println!("DELETING {p}");
                    if let Err(e) = remove_file(p) {
                        eprintln!("Failed to remove {p}: {e}");
                    }
                }
            }
        }
    }
    Ok(())
}

View File

@@ -1,6 +1,6 @@
use std::{env, error::Error, fs::File, io::prelude::*, process::exit, slice::Iter};
use std::{env, fs::File, io::prelude::*, process::exit, slice::Iter};
use mailparse::{dateparse, MailHeaderMap};
use mailparse::MailHeaderMap;
fn newline(b: &u8) -> bool {
*b == b'\n'
@@ -23,14 +23,14 @@ fn index_of(it: &mut Iter<u8>, needle: &[u8]) -> Option<usize> {
})
}
fn parse_mbox(mbox_bytes: &Vec<u8>) -> Result<(), Box<dyn Error>> {
fn parse_mbox(mbox_bytes: &Vec<u8>) {
let mut it = mbox_bytes.iter();
let mut ix = 0;
loop {
let mail_start = it.position(newline);
if mail_start.is_none() {
return Ok(());
return;
}
ix += mail_start.unwrap() + 1;
let start = ix;
@@ -74,10 +74,10 @@ fn main() {
}
let mut args = env::args();
args.next(); // drop executable name
args.for_each(|mbox_path| {
for mbox_path in args {
let mut mbox = File::open(mbox_path).unwrap();
let mut mails = Vec::new();
mbox.read_to_end(&mut mails).unwrap();
parse_mbox(&mails);
});
}
}