Add helpers for parsing emails and understanding what are common structures.
This commit is contained in:
parent
093cc0cb56
commit
5a04c7cc28
244
Cargo.lock
generated
244
Cargo.lock
generated
@ -29,6 +29,12 @@ version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "1.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||
|
||||
[[package]]
|
||||
name = "block-buffer"
|
||||
version = "0.10.3"
|
||||
@ -38,6 +44,12 @@ dependencies = [
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.0.79"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
@ -54,6 +66,43 @@ dependencies = [
|
||||
"encoding_rs",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "4.1.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c3d7ae14b20b94cb02149ed21a86c423859cbe18dc7ed69845cace50e52b40a5"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"clap_derive",
|
||||
"clap_lex",
|
||||
"is-terminal",
|
||||
"once_cell",
|
||||
"strsim",
|
||||
"termcolor",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_derive"
|
||||
version = "4.1.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "44bec8e5c9d09e439c4335b1af0abaab56dcf3b94999a936e1bb47b9134288f0"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"proc-macro-error",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "clap_lex"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "350b9cf31731f9957399229e9b2adc51eeabdfbe9d71d9a0552275fd12710d09"
|
||||
dependencies = [
|
||||
"os_str_bytes",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cpufeatures"
|
||||
version = "0.2.5"
|
||||
@ -143,6 +192,7 @@ name = "email"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"mailparse",
|
||||
"memmap",
|
||||
"rayon",
|
||||
@ -161,6 +211,27 @@ dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "errno"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
|
||||
dependencies = [
|
||||
"errno-dragonfly",
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "errno-dragonfly"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "generic-array"
|
||||
version = "0.14.6"
|
||||
@ -171,6 +242,12 @@ dependencies = [
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.2.6"
|
||||
@ -180,12 +257,46 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
|
||||
|
||||
[[package]]
|
||||
name = "io-lifetimes"
|
||||
version = "1.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "is-terminal"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "21b6b32576413a8e69b90e952e4a026476040d81017b80445deda5f2d3921857"
|
||||
dependencies = [
|
||||
"hermit-abi 0.3.1",
|
||||
"io-lifetimes",
|
||||
"rustix",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.139"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
|
||||
|
||||
[[package]]
|
||||
name = "mailparse"
|
||||
version = "0.14.0"
|
||||
@ -228,10 +339,46 @@ version = "1.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"hermit-abi 0.2.6",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.17.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
|
||||
|
||||
[[package]]
|
||||
name = "os_str_bytes"
|
||||
version = "6.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
|
||||
dependencies = [
|
||||
"proc-macro-error-attr",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error-attr"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.51"
|
||||
@ -295,6 +442,20 @@ version = "0.6.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.36.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fd5c6ff11fecd55b40746d1995a02f2eb375bf8c00d192d521ee09f42bef37bc"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"errno",
|
||||
"io-lifetimes",
|
||||
"libc",
|
||||
"linux-raw-sys",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "same-file"
|
||||
version = "1.0.6"
|
||||
@ -321,6 +482,12 @@ dependencies = [
|
||||
"digest",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.108"
|
||||
@ -332,6 +499,15 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "termcolor"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
|
||||
dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.38"
|
||||
@ -411,3 +587,69 @@ name = "winapi-x86_64-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.45.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
|
||||
dependencies = [
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.42.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm",
|
||||
"windows_aarch64_msvc",
|
||||
"windows_i686_gnu",
|
||||
"windows_i686_msvc",
|
||||
"windows_x86_64_gnu",
|
||||
"windows_x86_64_gnullvm",
|
||||
"windows_x86_64_msvc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.42.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.42.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.42.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.42.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.42.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.42.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.42.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
|
||||
|
||||
@ -8,6 +8,7 @@ edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.69"
|
||||
clap = { version = "4.1.8", features = ["derive"] }
|
||||
mailparse = "0.14.0"
|
||||
memmap = "0.7.0"
|
||||
rayon = "1.6.1"
|
||||
|
||||
136
src/bin/fingerprint.rs
Normal file
136
src/bin/fingerprint.rs
Normal file
@ -0,0 +1,136 @@
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
fs::File,
|
||||
path::Path,
|
||||
time::{SystemTime, UNIX_EPOCH},
|
||||
};
|
||||
|
||||
use clap::Parser;
|
||||
use email::{fingerprint, should_skip, EmailError};
|
||||
use mailparse::{dateparse, parse_mail, MailHeaderMap};
|
||||
use memmap::MmapOptions;
|
||||
use rayon::{iter::ParallelBridge, prelude::ParallelIterator};
|
||||
use walkdir::WalkDir;
|
||||
|
||||
/// Simple program to greet a person
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
struct Args {
|
||||
/// Number years to search through
|
||||
#[arg(short, long, default_value_t = 1)]
|
||||
years: usize,
|
||||
|
||||
/// Enable verbose logging
|
||||
#[arg(short, long, default_value_t = false)]
|
||||
verbose: bool,
|
||||
|
||||
/// Number of example filenames to print
|
||||
#[arg(short, long, default_value_t = 1)]
|
||||
examples: usize,
|
||||
|
||||
/// Show top N fingerprints
|
||||
#[arg(short, long, default_value_t = 10)]
|
||||
top_n: usize,
|
||||
|
||||
/// List of input directories to recursively search
|
||||
input_dir: String,
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let args = Args::parse();
|
||||
let n = 1;
|
||||
// Just check messages from the last N years.
|
||||
let max_age = 60 * 60 * 24 * 365 * n;
|
||||
let start = std::time::Instant::now();
|
||||
let unix_secs = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.expect("couldn't get unix time");
|
||||
let youngest = unix_secs.as_secs() as i64 - max_age;
|
||||
let dir = args.input_dir.clone();
|
||||
let map = WalkDir::new(&dir)
|
||||
.into_iter()
|
||||
.par_bridge()
|
||||
.filter_map(|entry| entry.ok())
|
||||
.filter_map(|entry| {
|
||||
if entry.file_type().is_dir() {
|
||||
if args.verbose {
|
||||
println!(
|
||||
"{} ",
|
||||
entry
|
||||
.path()
|
||||
.strip_prefix(&dir)
|
||||
.expect("failed to strip dir")
|
||||
.display()
|
||||
);
|
||||
}
|
||||
return None;
|
||||
}
|
||||
let arg = entry.path().display().to_string();
|
||||
if should_skip(&arg) {
|
||||
return None;
|
||||
}
|
||||
match parse(&arg, youngest) {
|
||||
Ok(Some(h)) => Some((h, arg)),
|
||||
// Skip old emails
|
||||
Ok(None) => return None,
|
||||
Err(e) => {
|
||||
eprintln!("{}: failed {}", arg, e);
|
||||
None
|
||||
}
|
||||
}
|
||||
})
|
||||
.fold(
|
||||
|| HashMap::new(),
|
||||
|mut m, (h, arg)| {
|
||||
m.entry(h).or_insert(Vec::new()).push(arg);
|
||||
m
|
||||
},
|
||||
)
|
||||
// Merge maps created by parallel iteration.
|
||||
.reduce(
|
||||
|| HashMap::new(),
|
||||
|mut acc, m| {
|
||||
for (k, v) in m {
|
||||
acc.entry(k).or_insert(Vec::new()).extend(v);
|
||||
}
|
||||
acc
|
||||
},
|
||||
);
|
||||
|
||||
let mut res: Vec<_> = map
|
||||
.into_iter()
|
||||
.map(|(hash, paths)| {
|
||||
(
|
||||
paths.len(),
|
||||
hash,
|
||||
paths.into_iter().take(args.examples).collect::<Vec<_>>(),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
res.sort();
|
||||
res.reverse();
|
||||
for (cnt, hash, ex) in res.into_iter().take(args.top_n) {
|
||||
println!("{cnt} {}\n{hash}\n", ex.join("\n"));
|
||||
}
|
||||
println!("Runtime: {:.2}s", start.elapsed().as_secs_f32());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// If the date in the email is before youngest Ok(None) will be returned.
|
||||
fn parse<P: AsRef<Path>>(path: P, youngest: i64) -> Result<Option<String>, EmailError> {
|
||||
let file = File::open(&path)?;
|
||||
let mmap = unsafe { MmapOptions::new().map(&file)? };
|
||||
let m = parse_mail(&mmap)?;
|
||||
let d = dateparse(
|
||||
m.headers
|
||||
.get_first_value("Date")
|
||||
.unwrap_or("".to_string())
|
||||
.as_str(),
|
||||
)
|
||||
.unwrap_or(0);
|
||||
if d < youngest {
|
||||
return Ok(None);
|
||||
}
|
||||
//println!("{}: {:#?}", path.as_ref().display(), m.ctype);
|
||||
Ok(Some(fingerprint(&m).join("\n")))
|
||||
}
|
||||
25
src/bin/summarize.rs
Normal file
25
src/bin/summarize.rs
Normal file
@ -0,0 +1,25 @@
|
||||
use std::fs::File;
|
||||
|
||||
use clap::Parser;
|
||||
use email::fingerprint;
|
||||
use mailparse::parse_mail;
|
||||
use memmap::MmapOptions;
|
||||
|
||||
/// Use library to summarize information about given mail files
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
struct Args {
|
||||
/// List of files to summarize
|
||||
paths: Vec<String>,
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let args = Args::parse();
|
||||
for path in args.paths {
|
||||
let file = File::open(&path)?;
|
||||
let mmap = unsafe { MmapOptions::new().map(&file)? };
|
||||
let m = parse_mail(&mmap)?;
|
||||
println!("{path}\n{}", fingerprint(&m).join("\n"));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
34
src/lib.rs
34
src/lib.rs
@ -1,6 +1,6 @@
|
||||
use std::{fs::File, path::Path};
|
||||
|
||||
use mailparse::{parse_headers, MailHeader, MailHeaderMap, MailParseError};
|
||||
use mailparse::{parse_headers, MailHeader, MailHeaderMap, MailParseError, ParsedMail};
|
||||
use memmap::MmapOptions;
|
||||
use sha1::{Digest, Sha1};
|
||||
use thiserror::Error;
|
||||
@ -44,6 +44,38 @@ pub fn hash_headers(hdrs: &[MailHeader]) -> String {
|
||||
format!("{:x}", hasher.finalize())
|
||||
}
|
||||
|
||||
/// Names of non-mail files commonly found under Maildir/.
const SKIP_FILES: &[&str] = &[
    "docdata.glass",
    "flintlock",
    "iamglass",
    ".mbsyncstate",
    "position.glass",
    "postlist.glass",
    "termlist.glass",
    ".uidvalidity",
];

/// Returns true if the last `/`-separated component of `path` is listed in `SKIP_FILES`.
///
/// A `path` without any `/` is compared as a whole.
pub fn should_skip(path: &str) -> bool {
    // `rsplit` always yields at least one item, so the fallback is never taken.
    let last_component = path.rsplit('/').next().unwrap_or(path);
    SKIP_FILES.iter().any(|&known| known == last_component)
}
|
||||
|
||||
pub fn fingerprint(pm: &ParsedMail<'_>) -> Vec<String> {
|
||||
fingerprint_rec(pm, 0)
|
||||
}
|
||||
fn fingerprint_rec(pm: &ParsedMail<'_>, depth: usize) -> Vec<String> {
|
||||
let mut v = vec![format!("{}{}", " ".repeat(depth * 2), pm.ctype.mimetype)];
|
||||
for c in &pm.subparts {
|
||||
v.extend(fingerprint_rec(&c, depth + 1));
|
||||
}
|
||||
v
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use mailparse::{parse_headers, MailHeader, MailHeaderMap};
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user