Implement cleanupdupes to remove duplicate emails in a Maildir.
This commit is contained in:
parent
7bbdaec84b
commit
88d6c6867d
126
Cargo.lock
generated
126
Cargo.lock
generated
@ -11,6 +11,12 @@ dependencies = [
|
|||||||
"memchr",
|
"memchr",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anyhow"
|
||||||
|
version = "1.0.69"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "base64"
|
name = "base64"
|
||||||
version = "0.13.1"
|
version = "0.13.1"
|
||||||
@ -81,9 +87,13 @@ dependencies = [
|
|||||||
name = "email"
|
name = "email"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
"mailparse",
|
"mailparse",
|
||||||
|
"memmap",
|
||||||
"regex",
|
"regex",
|
||||||
"sha1",
|
"sha1",
|
||||||
|
"thiserror",
|
||||||
|
"walkdir",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -128,6 +138,34 @@ version = "2.5.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "memmap"
|
||||||
|
version = "0.7.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6585fd95e7bb50d6cc31e20d4cf9afb4e2ba16c5846fc76793f11218da9c475b"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"winapi",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "proc-macro2"
|
||||||
|
version = "1.0.51"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6"
|
||||||
|
dependencies = [
|
||||||
|
"unicode-ident",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "quote"
|
||||||
|
version = "1.0.23"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "quoted_printable"
|
name = "quoted_printable"
|
||||||
version = "0.4.7"
|
version = "0.4.7"
|
||||||
@ -151,6 +189,15 @@ version = "0.6.28"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
|
checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "same-file"
|
||||||
|
version = "1.0.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
|
||||||
|
dependencies = [
|
||||||
|
"winapi-util",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sha1"
|
name = "sha1"
|
||||||
version = "0.10.5"
|
version = "0.10.5"
|
||||||
@ -162,14 +209,93 @@ dependencies = [
|
|||||||
"digest",
|
"digest",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "syn"
|
||||||
|
version = "1.0.108"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d56e159d99e6c2b93995d171050271edb50ecc5288fbc7cc17de8fdce4e58c14"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"unicode-ident",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "thiserror"
|
||||||
|
version = "1.0.38"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0"
|
||||||
|
dependencies = [
|
||||||
|
"thiserror-impl",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "thiserror-impl"
|
||||||
|
version = "1.0.38"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "typenum"
|
name = "typenum"
|
||||||
version = "1.16.0"
|
version = "1.16.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
|
checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-ident"
|
||||||
|
version = "1.0.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "version_check"
|
name = "version_check"
|
||||||
version = "0.9.4"
|
version = "0.9.4"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
|
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "walkdir"
|
||||||
|
version = "2.3.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56"
|
||||||
|
dependencies = [
|
||||||
|
"same-file",
|
||||||
|
"winapi",
|
||||||
|
"winapi-util",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi"
|
||||||
|
version = "0.3.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
|
||||||
|
dependencies = [
|
||||||
|
"winapi-i686-pc-windows-gnu",
|
||||||
|
"winapi-x86_64-pc-windows-gnu",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi-i686-pc-windows-gnu"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi-util"
|
||||||
|
version = "0.1.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
|
||||||
|
dependencies = [
|
||||||
|
"winapi",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi-x86_64-pc-windows-gnu"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||||
|
|||||||
@ -7,6 +7,10 @@ edition = "2018"
|
|||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
anyhow = "1.0.69"
|
||||||
mailparse = "0.14.0"
|
mailparse = "0.14.0"
|
||||||
|
memmap = "0.7.0"
|
||||||
regex = "1.7.0"
|
regex = "1.7.0"
|
||||||
sha1 = "0.10.5"
|
sha1 = "0.10.5"
|
||||||
|
thiserror = "1.0.38"
|
||||||
|
walkdir = "2.3.2"
|
||||||
|
|||||||
69
src/bin/cleanupdupes.rs
Normal file
69
src/bin/cleanupdupes.rs
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
use std::{collections::HashMap, fs::remove_file};
|
||||||
|
|
||||||
|
use email::hash_file;
|
||||||
|
use walkdir::WalkDir;
|
||||||
|
const ENV_VAR_TO_DELETE: &str = "DELETE_DUPES";
|
||||||
|
|
||||||
|
fn main() -> anyhow::Result<()> {
|
||||||
|
let map = std::env::args()
|
||||||
|
.skip(1)
|
||||||
|
.nth(0)
|
||||||
|
.map(|dir| {
|
||||||
|
WalkDir::new(dir)
|
||||||
|
.into_iter()
|
||||||
|
.filter_map(|entry| entry.ok())
|
||||||
|
.filter_map(|entry| {
|
||||||
|
if entry.file_type().is_dir() {
|
||||||
|
println!("{}", entry.path().display());
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
let arg = entry.path().display().to_string();
|
||||||
|
match hash_file(&arg) {
|
||||||
|
Ok(h) => Some((h, arg)),
|
||||||
|
Err(e) => {
|
||||||
|
eprintln!("{}: failed {}", arg, e);
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.fold(HashMap::new(), |mut m, (h, arg)| {
|
||||||
|
m.entry(h).or_insert(Vec::new()).push(arg);
|
||||||
|
m
|
||||||
|
})
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
for (hash, mut paths) in map {
|
||||||
|
if paths.len() == 1 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Put files in "Oldmail" at the end of the list. We keep only the first, and we prefer to
|
||||||
|
// remove Oldmail over all else.
|
||||||
|
paths.sort_by(|a, b| {
|
||||||
|
if a.contains("Oldmail") && b.contains("Oldmail") {
|
||||||
|
a.partial_cmp(b).unwrap()
|
||||||
|
} else if a.contains("Oldmail") {
|
||||||
|
std::cmp::Ordering::Greater
|
||||||
|
} else if b.contains("Oldmail") {
|
||||||
|
std::cmp::Ordering::Less
|
||||||
|
} else {
|
||||||
|
a.partial_cmp(b).unwrap()
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
let mut it = paths.iter();
|
||||||
|
println!("\n{hash}:");
|
||||||
|
println!(" keep: {}", it.next().unwrap());
|
||||||
|
for p in it {
|
||||||
|
println!(" rm: {p}",);
|
||||||
|
if std::env::var(ENV_VAR_TO_DELETE).is_ok() {
|
||||||
|
println!("DELETING {p}");
|
||||||
|
if let Some(e) = remove_file(p).err() {
|
||||||
|
eprintln!("Failed to remove {p}: {e}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
@ -1,6 +1,6 @@
|
|||||||
use std::{env, error::Error, fs::File, io::prelude::*, process::exit, slice::Iter};
|
use std::{env, fs::File, io::prelude::*, process::exit, slice::Iter};
|
||||||
|
|
||||||
use mailparse::{dateparse, MailHeaderMap};
|
use mailparse::MailHeaderMap;
|
||||||
|
|
||||||
fn newline(b: &u8) -> bool {
|
fn newline(b: &u8) -> bool {
|
||||||
*b == b'\n'
|
*b == b'\n'
|
||||||
@ -23,14 +23,14 @@ fn index_of(it: &mut Iter<u8>, needle: &[u8]) -> Option<usize> {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_mbox(mbox_bytes: &Vec<u8>) -> Result<(), Box<dyn Error>> {
|
fn parse_mbox(mbox_bytes: &Vec<u8>) {
|
||||||
let mut it = mbox_bytes.iter();
|
let mut it = mbox_bytes.iter();
|
||||||
let mut ix = 0;
|
let mut ix = 0;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
let mail_start = it.position(newline);
|
let mail_start = it.position(newline);
|
||||||
if mail_start.is_none() {
|
if mail_start.is_none() {
|
||||||
return Ok(());
|
return;
|
||||||
}
|
}
|
||||||
ix += mail_start.unwrap() + 1;
|
ix += mail_start.unwrap() + 1;
|
||||||
let start = ix;
|
let start = ix;
|
||||||
@ -74,10 +74,10 @@ fn main() {
|
|||||||
}
|
}
|
||||||
let mut args = env::args();
|
let mut args = env::args();
|
||||||
args.next(); // drop executable name
|
args.next(); // drop executable name
|
||||||
args.for_each(|mbox_path| {
|
for mbox_path in args {
|
||||||
let mut mbox = File::open(mbox_path).unwrap();
|
let mut mbox = File::open(mbox_path).unwrap();
|
||||||
let mut mails = Vec::new();
|
let mut mails = Vec::new();
|
||||||
mbox.read_to_end(&mut mails).unwrap();
|
mbox.read_to_end(&mut mails).unwrap();
|
||||||
parse_mbox(&mails);
|
parse_mbox(&mails);
|
||||||
});
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
23
src/lib.rs
23
src/lib.rs
@ -1,5 +1,17 @@
|
|||||||
use mailparse::{parse_headers, MailHeader, MailHeaderMap};
|
use std::{fs::File, path::Path};
|
||||||
|
|
||||||
|
use mailparse::{parse_headers, MailHeader, MailHeaderMap, MailParseError};
|
||||||
|
use memmap::MmapOptions;
|
||||||
use sha1::{Digest, Sha1};
|
use sha1::{Digest, Sha1};
|
||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
#[derive(Error, Debug)]
|
||||||
|
pub enum EmailError {
|
||||||
|
#[error("file error: {0}")]
|
||||||
|
FileError(#[from] std::io::Error),
|
||||||
|
#[error("mail parse error: {0}")]
|
||||||
|
MailParseError(#[from] MailParseError),
|
||||||
|
}
|
||||||
|
|
||||||
// Keep these sorted to match Go implementation.
|
// Keep these sorted to match Go implementation.
|
||||||
const IMPORTANT_HEADERS: &[&str] = &[
|
const IMPORTANT_HEADERS: &[&str] = &[
|
||||||
@ -12,13 +24,20 @@ const IMPORTANT_HEADERS: &[&str] = &[
|
|||||||
"to",
|
"to",
|
||||||
];
|
];
|
||||||
|
|
||||||
|
pub fn hash_file<P: AsRef<Path>>(path: P) -> Result<String, EmailError> {
|
||||||
|
let file = File::open(path)?;
|
||||||
|
let mmap = unsafe { MmapOptions::new().map(&file)? };
|
||||||
|
let (hdrs, _) = parse_headers(&mmap)?;
|
||||||
|
Ok(hash_headers(&hdrs))
|
||||||
|
}
|
||||||
|
|
||||||
pub fn hash_headers(hdrs: &[MailHeader]) -> String {
|
pub fn hash_headers(hdrs: &[MailHeader]) -> String {
|
||||||
// create a Sha1 object
|
// create a Sha1 object
|
||||||
let mut hasher = Sha1::new();
|
let mut hasher = Sha1::new();
|
||||||
|
|
||||||
for h in IMPORTANT_HEADERS {
|
for h in IMPORTANT_HEADERS {
|
||||||
if let Some(v) = hdrs.get_first_value(h) {
|
if let Some(v) = hdrs.get_first_value(h) {
|
||||||
eprintln!("V [{}]", v);
|
//eprintln!("{}: [{}]", h, v);
|
||||||
hasher.update(v);
|
hasher.update(v);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user