Implement cleanupdupes to remove duplicate emails in a Maildir.
This commit is contained in:
parent
7bbdaec84b
commit
88d6c6867d
126
Cargo.lock
generated
126
Cargo.lock
generated
@ -11,6 +11,12 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.69"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800"
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
version = "0.13.1"
|
||||
@ -81,9 +87,13 @@ dependencies = [
|
||||
name = "email"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"mailparse",
|
||||
"memmap",
|
||||
"regex",
|
||||
"sha1",
|
||||
"thiserror",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -128,6 +138,34 @@ version = "2.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
||||
|
||||
[[package]]
|
||||
name = "memmap"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.51"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quoted_printable"
|
||||
version = "0.4.7"
|
||||
@ -151,6 +189,15 @@ version = "0.6.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
|
||||
|
||||
[[package]]
|
||||
name = "same-file"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
|
||||
dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sha1"
|
||||
version = "0.10.5"
|
||||
@ -162,14 +209,93 @@ dependencies = [
|
||||
"digest",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.108"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d56e159d99e6c2b93995d171050271edb50ecc5288fbc7cc17de8fdce4e58c14"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.38"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.38"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typenum"
|
||||
version = "1.16.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
|
||||
|
||||
[[package]]
|
||||
name = "walkdir"
|
||||
version = "2.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56"
|
||||
dependencies = [
|
||||
"same-file",
|
||||
"winapi",
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
|
||||
dependencies = [
|
||||
"winapi-i686-pc-windows-gnu",
|
||||
"winapi-x86_64-pc-windows-gnu",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-i686-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
|
||||
[[package]]
|
||||
name = "winapi-util"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-x86_64-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
||||
@ -7,6 +7,10 @@ edition = "2018"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.69"
|
||||
mailparse = "0.14.0"
|
||||
memmap = "0.7.0"
|
||||
regex = "1.7.0"
|
||||
sha1 = "0.10.5"
|
||||
thiserror = "1.0.38"
|
||||
walkdir = "2.3.2"
|
||||
|
||||
69
src/bin/cleanupdupes.rs
Normal file
69
src/bin/cleanupdupes.rs
Normal file
@ -0,0 +1,69 @@
|
||||
use std::{collections::HashMap, fs::remove_file};
|
||||
|
||||
use email::hash_file;
|
||||
use walkdir::WalkDir;
|
||||
const ENV_VAR_TO_DELETE: &str = "DELETE_DUPES";
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let map = std::env::args()
|
||||
.skip(1)
|
||||
.nth(0)
|
||||
.map(|dir| {
|
||||
WalkDir::new(dir)
|
||||
.into_iter()
|
||||
.filter_map(|entry| entry.ok())
|
||||
.filter_map(|entry| {
|
||||
if entry.file_type().is_dir() {
|
||||
println!("{}", entry.path().display());
|
||||
return None;
|
||||
}
|
||||
let arg = entry.path().display().to_string();
|
||||
match hash_file(&arg) {
|
||||
Ok(h) => Some((h, arg)),
|
||||
Err(e) => {
|
||||
eprintln!("{}: failed {}", arg, e);
|
||||
None
|
||||
}
|
||||
}
|
||||
})
|
||||
.fold(HashMap::new(), |mut m, (h, arg)| {
|
||||
m.entry(h).or_insert(Vec::new()).push(arg);
|
||||
m
|
||||
})
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
for (hash, mut paths) in map {
|
||||
if paths.len() == 1 {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Put files in "Oldmail" at the end of the list. We keep only the first, and we prefer to
|
||||
// remove Oldmail over all else.
|
||||
paths.sort_by(|a, b| {
|
||||
if a.contains("Oldmail") && b.contains("Oldmail") {
|
||||
a.partial_cmp(b).unwrap()
|
||||
} else if a.contains("Oldmail") {
|
||||
std::cmp::Ordering::Greater
|
||||
} else if b.contains("Oldmail") {
|
||||
std::cmp::Ordering::Less
|
||||
} else {
|
||||
a.partial_cmp(b).unwrap()
|
||||
}
|
||||
});
|
||||
|
||||
let mut it = paths.iter();
|
||||
println!("\n{hash}:");
|
||||
println!(" keep: {}", it.next().unwrap());
|
||||
for p in it {
|
||||
println!(" rm: {p}",);
|
||||
if std::env::var(ENV_VAR_TO_DELETE).is_ok() {
|
||||
println!("DELETING {p}");
|
||||
if let Some(e) = remove_file(p).err() {
|
||||
eprintln!("Failed to remove {p}: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@ -1,6 +1,6 @@
|
||||
use std::{env, error::Error, fs::File, io::prelude::*, process::exit, slice::Iter};
|
||||
use std::{env, fs::File, io::prelude::*, process::exit, slice::Iter};
|
||||
|
||||
use mailparse::{dateparse, MailHeaderMap};
|
||||
use mailparse::MailHeaderMap;
|
||||
|
||||
fn newline(b: &u8) -> bool {
|
||||
*b == b'\n'
|
||||
@ -23,14 +23,14 @@ fn index_of(it: &mut Iter<u8>, needle: &[u8]) -> Option<usize> {
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_mbox(mbox_bytes: &Vec<u8>) -> Result<(), Box<dyn Error>> {
|
||||
fn parse_mbox(mbox_bytes: &Vec<u8>) {
|
||||
let mut it = mbox_bytes.iter();
|
||||
let mut ix = 0;
|
||||
|
||||
loop {
|
||||
let mail_start = it.position(newline);
|
||||
if mail_start.is_none() {
|
||||
return Ok(());
|
||||
return;
|
||||
}
|
||||
ix += mail_start.unwrap() + 1;
|
||||
let start = ix;
|
||||
@ -74,10 +74,10 @@ fn main() {
|
||||
}
|
||||
let mut args = env::args();
|
||||
args.next(); // drop executable name
|
||||
args.for_each(|mbox_path| {
|
||||
for mbox_path in args {
|
||||
let mut mbox = File::open(mbox_path).unwrap();
|
||||
let mut mails = Vec::new();
|
||||
mbox.read_to_end(&mut mails).unwrap();
|
||||
parse_mbox(&mails);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
23
src/lib.rs
23
src/lib.rs
@ -1,5 +1,17 @@
|
||||
use mailparse::{parse_headers, MailHeader, MailHeaderMap};
|
||||
use std::{fs::File, path::Path};
|
||||
|
||||
use mailparse::{parse_headers, MailHeader, MailHeaderMap, MailParseError};
|
||||
use memmap::MmapOptions;
|
||||
use sha1::{Digest, Sha1};
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum EmailError {
|
||||
#[error("file error: {0}")]
|
||||
FileError(#[from] std::io::Error),
|
||||
#[error("mail parse error: {0}")]
|
||||
MailParseError(#[from] MailParseError),
|
||||
}
|
||||
|
||||
// Keep these sorted to match Go implementation.
|
||||
const IMPORTANT_HEADERS: &[&str] = &[
|
||||
@ -12,13 +24,20 @@ const IMPORTANT_HEADERS: &[&str] = &[
|
||||
"to",
|
||||
];
|
||||
|
||||
pub fn hash_file<P: AsRef<Path>>(path: P) -> Result<String, EmailError> {
|
||||
let file = File::open(path)?;
|
||||
let mmap = unsafe { MmapOptions::new().map(&file)? };
|
||||
let (hdrs, _) = parse_headers(&mmap)?;
|
||||
Ok(hash_headers(&hdrs))
|
||||
}
|
||||
|
||||
pub fn hash_headers(hdrs: &[MailHeader]) -> String {
|
||||
// create a Sha1 object
|
||||
let mut hasher = Sha1::new();
|
||||
|
||||
for h in IMPORTANT_HEADERS {
|
||||
if let Some(v) = hdrs.get_first_value(h) {
|
||||
eprintln!("V [{}]", v);
|
||||
//eprintln!("{}: [{}]", h, v);
|
||||
hasher.update(v);
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user