From 65fcbd4b772ef8751b667dfb464ec0a82f9feb43 Mon Sep 17 00:00:00 2001 From: Bill Thiede Date: Sun, 21 Jul 2024 09:31:37 -0700 Subject: [PATCH] WIP move thread loading for notmuch into nm mod --- server/src/bin/server.rs | 5 +- server/src/error.rs | 12 + server/src/graphql.rs | 688 +++------------------------------------ server/src/lib.rs | 4 +- server/src/nm.rs | 638 +++++++++++++++++++++++++++++++++++- 5 files changed, 688 insertions(+), 659 deletions(-) diff --git a/server/src/bin/server.rs b/server/src/bin/server.rs index 9edbf61..6faee37 100644 --- a/server/src/bin/server.rs +++ b/server/src/bin/server.rs @@ -18,9 +18,8 @@ use rocket_cors::{AllowedHeaders, AllowedOrigins}; use serde::Deserialize; use server::{ error::ServerError, - graphql::{ - attachment_bytes, cid_attachment_bytes, Attachment, GraphqlSchema, Mutation, QueryRoot, - }, + graphql::{Attachment, GraphqlSchema, Mutation, QueryRoot}, + nm::{attachment_bytes, cid_attachment_bytes}, }; use sqlx::postgres::PgPool; diff --git a/server/src/error.rs b/server/src/error.rs index a0d15f8..b2f6b1b 100644 --- a/server/src/error.rs +++ b/server/src/error.rs @@ -1,6 +1,10 @@ +use std::{str::Utf8Error, string::FromUtf8Error}; + use mailparse::MailParseError; use thiserror::Error; +use crate::SanitizeError; + #[derive(Error, Debug)] pub enum ServerError { #[error("notmuch")] @@ -15,4 +19,12 @@ pub enum ServerError { PartNotFound, #[error("sqlx error")] SQLXError(#[from] sqlx::Error), + #[error("html sanitize error")] + SanitizeError(#[from] SanitizeError), + #[error("UTF8 error")] + Utf8Error(#[from] Utf8Error), + #[error("FromUTF8 error")] + FromUtf8Error(#[from] FromUtf8Error), + #[error("error")] + StringError(String), } diff --git a/server/src/graphql.rs b/server/src/graphql.rs index d637ef8..df85a3f 100644 --- a/server/src/graphql.rs +++ b/server/src/graphql.rs @@ -1,4 +1,3 @@ -const MAX_RAW_MESSAGE_SIZE: usize = 100_000; use std::fs::File; use async_graphql::{ @@ -19,14 +18,6 @@ pub type UnixTime = isize; /// # Thread ID, sans "thread:" pub type ThreadId = String; -const TEXT_PLAIN: &'static str = "text/plain"; -const TEXT_HTML: &'static str = "text/html"; -const IMAGE_JPEG: &'static str = "image/jpeg"; -const IMAGE_PNG: &'static str = "image/png"; -const MULTIPART_ALTERNATIVE: &'static str = "multipart/alternative"; -const MULTIPART_MIXED: &'static str = "multipart/mixed"; -const MULTIPART_RELATED: &'static str = "multipart/related"; - #[derive(Debug, SimpleObject)] pub struct ThreadSummary { pub thread: ThreadId, @@ -45,9 +36,9 @@ pub struct ThreadSummary { #[derive(Debug, SimpleObject)] pub struct Thread { - thread_id: String, - subject: String, - messages: Vec, + pub thread_id: String, + pub subject: String, + pub messages: Vec, } #[derive(Debug, SimpleObject)] @@ -91,16 +82,45 @@ pub struct Attachment { pub bytes: Vec, } +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct Disposition { + pub r#type: DispositionType, + pub filename: Option, + pub size: Option, +} + +#[derive(Debug, Enum, Copy, Clone, Eq, PartialEq)] +pub enum DispositionType { + Inline, + Attachment, +} + +impl From for DispositionType { + fn from(value: mailparse::DispositionType) -> Self { + match value { + mailparse::DispositionType::Inline => DispositionType::Inline, + mailparse::DispositionType::Attachment => DispositionType::Attachment, + dt => panic!("unhandled DispositionType {dt:?}"), + } + } +} + +impl Default for DispositionType { + fn default() -> Self { + DispositionType::Attachment + } +} + #[derive(Debug, SimpleObject)] pub struct Header { - key: String, - value: String, + pub key: String, + pub value: String, } #[derive(Debug)] pub struct UnhandledContentType { - text: String, - content_tree: String, + pub text: String, + pub content_tree: String, } #[Object] @@ -115,8 +135,8 @@ impl UnhandledContentType { #[derive(Debug)] pub struct PlainText { - text: String, - content_tree: String, + pub text: String, + pub content_tree: String, } #[Object] @@ -131,8 +151,8 @@ impl PlainText { #[derive(Debug)] pub struct Html { - html: String, - content_tree: String, + pub html: String, + pub content_tree: String, } #[Object] @@ -156,13 +176,13 @@ pub enum Body { } impl Body { - fn html(html: String) -> Body { + pub fn html(html: String) -> Body { Body::Html(Html { html, content_tree: "".to_string(), }) } - fn text(text: String) -> Body { + pub fn text(text: String) -> Body { Body::PlainText(PlainText { text, content_tree: "".to_string(), @@ -222,8 +242,6 @@ impl QueryRoot { Ok(tags) } async fn thread<'ctx>(&self, ctx: &Context<'ctx>, thread_id: String) -> Result { - // TODO(wathiede): normalize all email addresses through an address book with preferred - // display names (that default to the most commonly seen name). let nm = ctx.data_unchecked::(); let debug_content_tree = ctx .look_ahead() @@ -231,124 +249,7 @@ impl QueryRoot { .field("body") .field("contentTree") .exists(); - let mut messages = Vec::new(); - for (path, id) in std::iter::zip(nm.files(&thread_id)?, nm.message_ids(&thread_id)?) { - let tags = nm.tags_for_query(&format!("id:{id}"))?; - let file = File::open(&path)?; - let mmap = unsafe { MmapOptions::new().map(&file)? }; - let m = parse_mail(&mmap)?; - let from = email_addresses(&path, &m, "from")?; - let from = match from.len() { - 0 => None, - 1 => from.into_iter().next(), - _ => { - warn!( - "Got {} from addresses in message, truncating: {:?}", - from.len(), - from - ); - from.into_iter().next() - } - }; - let to = email_addresses(&path, &m, "to")?; - let cc = email_addresses(&path, &m, "cc")?; - let subject = m.headers.get_first_value("subject"); - let timestamp = m - .headers - .get_first_value("date") - .and_then(|d| mailparse::dateparse(&d).ok()); - let cid_prefix = shared::urls::cid_prefix(None, &id); - let body = match extract_body(&m, &id)? { - Body::PlainText(PlainText { text, content_tree }) => { - let text = if text.len() > MAX_RAW_MESSAGE_SIZE { - format!( - "{}...\n\nMESSAGE WAS TRUNCATED @ {} bytes", - &text[..MAX_RAW_MESSAGE_SIZE], - MAX_RAW_MESSAGE_SIZE - ) - } else { - text - }; - - Body::Html(Html { - html: format!( - r#"

{}

"#, - // Trim newlines to prevent excessive white space at the beginning/end of - // presenation. Leave tabs and spaces incase plain text attempts to center a - // header on the first line. - sanitize_html(&linkify_html(&text.trim_matches('\n')), &cid_prefix)? - ), - content_tree: if debug_content_tree { - render_content_type_tree(&m) - } else { - content_tree - }, - }) - } - Body::Html(Html { html, content_tree }) => Body::Html(Html { - html: sanitize_html(&html, &cid_prefix)?, - content_tree: if debug_content_tree { - render_content_type_tree(&m) - } else { - content_tree - }, - }), - - Body::UnhandledContentType(UnhandledContentType { content_tree, .. }) => { - let body_start = mmap - .windows(2) - .take(20_000) - .position(|w| w == b"\n\n") - .unwrap_or(0); - let body = mmap[body_start + 2..].to_vec(); - Body::UnhandledContentType(UnhandledContentType { - text: String::from_utf8(body)?, - content_tree: if debug_content_tree { - render_content_type_tree(&m) - } else { - content_tree - }, - }) - } - }; - let headers = m - .headers - .iter() - .map(|h| Header { - key: h.get_key(), - value: h.get_value(), - }) - .collect(); - // TODO(wathiede): parse message and fill out attachments - let attachments = extract_attachments(&m, &id)?; - messages.push(Message { - id, - from, - to, - cc, - subject, - tags, - timestamp, - headers, - body, - path, - attachments, - }); - } - messages.reverse(); - // Find the first subject that's set. After reversing the vec, this should be the oldest - // message. - let subject: String = messages - .iter() - .skip_while(|m| m.subject.is_none()) - .next() - .and_then(|m| m.subject.clone()) - .unwrap_or("(NO SUBJECT)".to_string()); - Ok(Thread { - thread_id, - subject, - messages, - }) + Ok(nm::thread(nm, thread_id, debug_content_tree).await?) } } @@ -395,506 +296,3 @@ impl Mutation { } pub type GraphqlSchema = Schema; - -#[derive(Debug, Clone, Eq, PartialEq)] -pub struct Disposition { - pub r#type: DispositionType, - pub filename: Option, - pub size: Option, -} - -#[derive(Debug, Enum, Copy, Clone, Eq, PartialEq)] -pub enum DispositionType { - Inline, - Attachment, -} - -impl From for DispositionType { - fn from(value: mailparse::DispositionType) -> Self { - match value { - mailparse::DispositionType::Inline => DispositionType::Inline, - mailparse::DispositionType::Attachment => DispositionType::Attachment, - dt => panic!("unhandled DispositionType {dt:?}"), - } - } -} - -impl Default for DispositionType { - fn default() -> Self { - DispositionType::Attachment - } -} - -fn extract_body(m: &ParsedMail, id: &str) -> Result { - let mut part_addr = Vec::new(); - part_addr.push(id.to_string()); - let body = m.get_body()?; - let ret = match m.ctype.mimetype.as_str() { - TEXT_PLAIN => return Ok(Body::text(body)), - TEXT_HTML => return Ok(Body::html(body)), - MULTIPART_MIXED => extract_mixed(m, &mut part_addr), - MULTIPART_ALTERNATIVE => extract_alternative(m, &mut part_addr), - MULTIPART_RELATED => extract_related(m, &mut part_addr), - _ => extract_unhandled(m), - }; - if let Err(err) = ret { - error!("Failed to extract body: {err:?}"); - return Ok(extract_unhandled(m)?); - } - ret -} - -fn extract_unhandled(m: &ParsedMail) -> Result { - let msg = format!( - "Unhandled body content type:\n{}\n{}", - render_content_type_tree(m), - m.get_body()?, - ); - Ok(Body::UnhandledContentType(UnhandledContentType { - text: msg, - content_tree: render_content_type_tree(m), - })) -} - -// multipart/alternative defines multiple representations of the same message, and clients should -// show the fanciest they can display. For this program, the priority is text/html, text/plain, -// then give up. -fn extract_alternative(m: &ParsedMail, part_addr: &mut Vec) -> Result { - let handled_types = vec![ - MULTIPART_ALTERNATIVE, - MULTIPART_MIXED, - MULTIPART_RELATED, - TEXT_HTML, - TEXT_PLAIN, - ]; - for sp in &m.subparts { - if sp.ctype.mimetype.as_str() == MULTIPART_ALTERNATIVE { - return extract_alternative(sp, part_addr); - } - } - for sp in &m.subparts { - if sp.ctype.mimetype.as_str() == MULTIPART_MIXED { - return extract_related(sp, part_addr); - } - } - for sp in &m.subparts { - if sp.ctype.mimetype.as_str() == MULTIPART_RELATED { - return extract_related(sp, part_addr); - } - } - for sp in &m.subparts { - if sp.ctype.mimetype.as_str() == TEXT_HTML { - let body = sp.get_body()?; - return Ok(Body::html(body)); - } - } - for sp in &m.subparts { - if sp.ctype.mimetype.as_str() == TEXT_PLAIN { - let body = sp.get_body()?; - return Ok(Body::text(body)); - } - } - Err(format!( - "extract_alternative failed to find suitable subpart, searched: {:?}", - handled_types - ) - .into()) -} - -// multipart/mixed defines multiple types of context all of which should be presented to the user -// 'serially'. -fn extract_mixed(m: &ParsedMail, part_addr: &mut Vec) -> Result { - let handled_types = vec![ - MULTIPART_ALTERNATIVE, - MULTIPART_RELATED, - TEXT_HTML, - TEXT_PLAIN, - IMAGE_JPEG, - IMAGE_PNG, - ]; - let mut unhandled_types: Vec<_> = m - .subparts - .iter() - .map(|sp| sp.ctype.mimetype.as_str()) - .filter(|mt| !handled_types.contains(&mt)) - .collect(); - unhandled_types.sort(); - if !unhandled_types.is_empty() { - warn!("{MULTIPART_MIXED} contains the following unhandled mimetypes {unhandled_types:?}"); - } - let mut parts = Vec::new(); - for (idx, sp) in m.subparts.iter().enumerate() { - part_addr.push(idx.to_string()); - match sp.ctype.mimetype.as_str() { - MULTIPART_RELATED => parts.push(extract_related(sp, part_addr)?), - MULTIPART_ALTERNATIVE => parts.push(extract_alternative(sp, part_addr)?), - TEXT_PLAIN => parts.push(Body::text(sp.get_body()?)), - TEXT_HTML => parts.push(Body::html(sp.get_body()?)), - IMAGE_JPEG | IMAGE_PNG => { - let pcd = sp.get_content_disposition(); - let filename = pcd - .params - .get("filename") - .map(|s| s.clone()) - .unwrap_or("".to_string()); - // Only add inline images, attachments are handled as an attribute of the top level Message and rendered separate client-side. - if pcd.disposition == mailparse::DispositionType::Inline { - parts.push(Body::html(format!( - r#""#, - part_addr[0], - part_addr - .iter() - .skip(1) - .map(|i| i.to_string()) - .collect::>() - .join(".") - ))); - } - } - _ => (), - } - part_addr.pop(); - } - Ok(flatten_body_parts(&parts)) -} - -fn flatten_body_parts(parts: &[Body]) -> Body { - let html = parts - .iter() - .map(|p| match p { - Body::PlainText(PlainText { text, .. }) => { - format!( - r#"

{}

"#, - // Trim newlines to prevent excessive white space at the beginning/end of - // presenation. Leave tabs and spaces incase plain text attempts to center a - // header on the first line. - linkify_html(&text.trim_matches('\n')) - ) - } - Body::Html(Html { html, .. }) => html.clone(), - Body::UnhandledContentType(UnhandledContentType { text, .. }) => { - error!("text len {}", text.len()); - format!( - r#"

{}

"#, - // Trim newlines to prevent excessive white space at the beginning/end of - // presenation. Leave tabs and spaces incase plain text attempts to center a - // header on the first line. - linkify_html(&text.trim_matches('\n')) - ) - } - }) - .collect::>() - .join("\n"); - - info!("flatten_body_parts {} {html}", parts.len()); - Body::html(html) -} - -fn extract_related(m: &ParsedMail, part_addr: &mut Vec) -> Result { - // TODO(wathiede): collect related things and change return type to new Body arm. - let handled_types = vec![ - MULTIPART_ALTERNATIVE, - TEXT_HTML, - TEXT_PLAIN, - IMAGE_JPEG, - IMAGE_PNG, - ]; - let mut unhandled_types: Vec<_> = m - .subparts - .iter() - .map(|sp| sp.ctype.mimetype.as_str()) - .filter(|mt| !handled_types.contains(&mt)) - .collect(); - unhandled_types.sort(); - if !unhandled_types.is_empty() { - warn!("{MULTIPART_RELATED} contains the following unhandled mimetypes {unhandled_types:?}"); - } - - for (i, sp) in m.subparts.iter().enumerate() { - if sp.ctype.mimetype == IMAGE_PNG || sp.ctype.mimetype == IMAGE_JPEG { - info!("sp.ctype {:#?}", sp.ctype); - //info!("sp.headers {:#?}", sp.headers); - if let Some(cid) = sp.headers.get_first_value("Content-Id") { - let mut part_id = part_addr.clone(); - part_id.push(i.to_string()); - info!("cid: {cid} part_id {part_id:?}"); - } - } - } - for sp in &m.subparts { - if sp.ctype.mimetype == MULTIPART_ALTERNATIVE { - return extract_alternative(m, part_addr); - } - } - for sp in &m.subparts { - if sp.ctype.mimetype == TEXT_HTML { - let body = sp.get_body()?; - return Ok(Body::html(body)); - } - } - for sp in &m.subparts { - if sp.ctype.mimetype == TEXT_PLAIN { - let body = sp.get_body()?; - return Ok(Body::text(body)); - } - } - Err(format!( - "extract_related failed to find suitable subpart, searched: {:?}", - handled_types - ) - .into()) -} - -fn walk_attachments Option + Copy>( - m: &ParsedMail, - visitor: F, -) -> Option { - let mut cur_addr = Vec::new(); - walk_attachments_inner(m, visitor, &mut cur_addr) -} - -fn walk_attachments_inner Option + Copy>( - m: &ParsedMail, - visitor: F, - cur_addr: &mut Vec, -) -> Option { - for (idx, sp) in m.subparts.iter().enumerate() { - cur_addr.push(idx); - let val = visitor(sp, &cur_addr); - if val.is_some() { - return val; - } - let val = walk_attachments_inner(sp, visitor, cur_addr); - if val.is_some() { - return val; - } - cur_addr.pop(); - } - None -} - -// TODO(wathiede): make this walk_attachments that takes a closure. -// Then implement one closure for building `Attachment` and imlement another that can be used to -// get the bytes for serving attachments of HTTP -fn extract_attachments(m: &ParsedMail, id: &str) -> Result, Error> { - let mut attachments = Vec::new(); - for (idx, sp) in m.subparts.iter().enumerate() { - if let Some(attachment) = extract_attachment(sp, id, &[idx]) { - // Filter out inline attachements, they're flattened into the body of the message. - if attachment.disposition == DispositionType::Attachment { - attachments.push(attachment); - } - } - } - Ok(attachments) -} - -fn extract_attachment(m: &ParsedMail, id: &str, idx: &[usize]) -> Option { - let pcd = m.get_content_disposition(); - // TODO: do we need to handle empty filename attachments, or should we change the definition of - // Attachment::filename? - let Some(filename) = pcd.params.get("filename").map(|f| f.clone()) else { - return None; - }; - - // TODO: grab this from somewhere - let content_id = None; - let bytes = match m.get_body_raw() { - Ok(bytes) => bytes, - Err(err) => { - error!("failed to get body for attachment: {err}"); - return None; - } - }; - return Some(Attachment { - id: id.to_string(), - idx: idx - .iter() - .map(|i| i.to_string()) - .collect::>() - .join("."), - disposition: pcd.disposition.into(), - filename: Some(filename), - size: bytes.len(), - // TODO: what is the default for ctype? - // TODO: do we want to use m.ctype.params for anything? - content_type: Some(m.ctype.mimetype.clone()), - content_id, - bytes, - }); -} - -pub fn get_attachment_filename(header_value: &str) -> &str { - info!("get_attachment_filename {header_value}"); - // Strip last " - let v = &header_value[..header_value.len() - 1]; - if let Some(idx) = v.rfind('"') { - &v[idx + 1..] - } else { - "" - } -} - -pub fn get_content_type<'a>(headers: &[MailHeader<'a>]) -> Option { - if let Some(v) = headers.get_first_value("Content-Type") { - if let Some(idx) = v.find(';') { - return Some(v[..idx].to_string()); - } else { - return Some(v); - } - } - None -} - -fn get_content_id<'a>(headers: &[MailHeader<'a>]) -> Option { - headers.get_first_value("Content-Id") -} - -fn render_content_type_tree(m: &ParsedMail) -> String { - const WIDTH: usize = 4; - const SKIP_HEADERS: [&str; 4] = [ - "Authentication-Results", - "DKIM-Signature", - "Received", - "Received-SPF", - ]; - fn render_ct_rec(m: &ParsedMail, depth: usize) -> String { - let mut parts = Vec::new(); - let msg = format!("{} {}", "-".repeat(depth * WIDTH), m.ctype.mimetype); - parts.push(msg); - for sp in &m.subparts { - parts.push(render_ct_rec(sp, depth + 1)) - } - parts.join("\n") - } - fn render_rec(m: &ParsedMail, depth: usize) -> String { - let mut parts = Vec::new(); - let msg = format!("{} {}", "-".repeat(depth * WIDTH), m.ctype.mimetype); - parts.push(msg); - let indent = " ".repeat(depth * WIDTH); - if !m.ctype.charset.is_empty() { - parts.push(format!("{indent} Character Set: {}", m.ctype.charset)); - } - for (k, v) in m.ctype.params.iter() { - parts.push(format!("{indent} {k}: {v}")); - } - if !m.headers.is_empty() { - parts.push(format!("{indent} == headers ==")); - for h in &m.headers { - if h.get_key().starts_with('X') { - continue; - } - if SKIP_HEADERS.contains(&h.get_key().as_str()) { - continue; - } - - parts.push(format!("{indent} {}: {}", h.get_key_ref(), h.get_value())); - } - } - for sp in &m.subparts { - parts.push(render_rec(sp, depth + 1)) - } - parts.join("\n") - } - format!( - "Outline:\n{}\n\nDetailed:\n{}\n\nNot showing headers:\n {}\n X.*", - render_ct_rec(m, 1), - render_rec(m, 1), - SKIP_HEADERS.join("\n ") - ) -} - -fn email_addresses(path: &str, m: &ParsedMail, header_name: &str) -> Result, Error> { - let mut addrs = Vec::new(); - for header_value in m.headers.get_all_values(header_name) { - match mailparse::addrparse(&header_value) { - Ok(mal) => { - for ma in mal.into_inner() { - match ma { - mailparse::MailAddr::Group(gi) => { - if !gi.group_name.contains("ndisclosed") { - println!("[{path}][{header_name}] Group: {gi}"); - } - } - mailparse::MailAddr::Single(s) => addrs.push(Email { - name: s.display_name, - addr: Some(s.addr), - }), //println!("Single: {s}"), - } - } - } - Err(_) => { - let v = header_value; - if v.matches('@').count() == 1 { - if v.matches('<').count() == 1 && v.ends_with('>') { - let idx = v.find('<').unwrap(); - let addr = &v[idx + 1..v.len() - 1].trim(); - let name = &v[..idx].trim(); - addrs.push(Email { - name: Some(name.to_string()), - addr: Some(addr.to_string()), - }); - } - } else { - addrs.push(Email { - name: Some(v), - addr: None, - }); - } - } - } - } - Ok(addrs) -} - -pub fn cid_attachment_bytes(nm: &Notmuch, id: &str, cid: &str) -> Result { - let files = nm.files(id)?; - let Some(path) = files.first() else { - warn!("failed to find files for message {id}"); - return Err(ServerError::PartNotFound); - }; - let file = File::open(&path)?; - let mmap = unsafe { MmapOptions::new().map(&file)? }; - let m = parse_mail(&mmap)?; - if let Some(attachment) = walk_attachments(&m, |sp, _cur_idx| { - info!("{cid} {:?}", get_content_id(&sp.headers)); - if let Some(h_cid) = get_content_id(&sp.headers) { - let h_cid = &h_cid[1..h_cid.len() - 1]; - if h_cid == cid { - let attachment = extract_attachment(&sp, id, &[]).unwrap_or(Attachment { - ..Attachment::default() - }); - return Some(attachment); - } - } - None - }) { - return Ok(attachment); - } - - Err(ServerError::PartNotFound) -} - -pub fn attachment_bytes(nm: &Notmuch, id: &str, idx: &[usize]) -> Result { - let files = nm.files(id)?; - let Some(path) = files.first() else { - warn!("failed to find files for message {id}"); - return Err(ServerError::PartNotFound); - }; - let file = File::open(&path)?; - let mmap = unsafe { MmapOptions::new().map(&file)? }; - let m = parse_mail(&mmap)?; - if let Some(attachment) = walk_attachments(&m, |sp, cur_idx| { - if cur_idx == idx { - let attachment = extract_attachment(&sp, id, idx).unwrap_or(Attachment { - ..Attachment::default() - }); - return Some(attachment); - } - None - }) { - return Ok(attachment); - } - - Err(ServerError::PartNotFound) -} diff --git a/server/src/lib.rs b/server/src/lib.rs index 538b840..b631921 100644 --- a/server/src/lib.rs +++ b/server/src/lib.rs @@ -1,7 +1,7 @@ pub mod error; pub mod graphql; -mod newsreader; -mod nm; +pub mod newsreader; +pub mod nm; use css_inline::{CSSInliner, InlineError, InlineOptions}; use linkify::{LinkFinder, LinkKind}; diff --git a/server/src/nm.rs b/server/src/nm.rs index 2cc60ae..f579ff5 100644 --- a/server/src/nm.rs +++ b/server/src/nm.rs @@ -1,23 +1,40 @@ use std::{ collections::HashMap, + fs::File, hash::{DefaultHasher, Hash, Hasher}, time::Instant, }; -use async_graphql::connection::{self, Connection, Edge}; -use log::info; +use async_graphql::{ + connection::{self, Connection, Edge}, + Context, EmptySubscription, Enum, FieldResult, Object, Schema, SimpleObject, Union, +}; +use log::{error, info, warn}; +use mailparse::{parse_mail, MailHeader, MailHeaderMap, ParsedMail}; +use memmap::MmapOptions; use notmuch::Notmuch; -use shared::Message; use crate::{ - error, - graphql::{Tag, ThreadSummary}, + error::ServerError, + graphql::{ + Attachment, Body, DispositionType, Email, Header, Html, Message, PlainText, Tag, Thread, + ThreadSummary, UnhandledContentType, + }, + linkify_html, newsreader, nm, sanitize_html, }; +const TEXT_PLAIN: &'static str = "text/plain"; +const TEXT_HTML: &'static str = "text/html"; +const IMAGE_JPEG: &'static str = "image/jpeg"; +const IMAGE_PNG: &'static str = "image/png"; +const MULTIPART_ALTERNATIVE: &'static str = "multipart/alternative"; +const MULTIPART_MIXED: &'static str = "multipart/mixed"; +const MULTIPART_RELATED: &'static str = "multipart/related"; + +const MAX_RAW_MESSAGE_SIZE: usize = 100_000; + // TODO(wathiede): decide good error type -pub fn threadset_to_messages( - thread_set: notmuch::ThreadSet, -) -> Result, error::ServerError> { +pub fn threadset_to_messages(thread_set: notmuch::ThreadSet) -> Result, ServerError> { for t in thread_set.0 { for _tn in t.0 {} } @@ -85,7 +102,7 @@ pub async fn search( .await } -pub fn tags(nm: &Notmuch, needs_unread: bool) -> Result, error::ServerError> { +pub fn tags(nm: &Notmuch, needs_unread: bool) -> Result, ServerError> { let now = Instant::now(); let unread_msg_cnt: HashMap = if needs_unread { // 10000 is an arbitrary number, if there's more than 10k unread messages, we'll @@ -125,3 +142,606 @@ pub fn tags(nm: &Notmuch, needs_unread: bool) -> Result, error::ServerE info!("Fetching tags took {} seconds", now.elapsed().as_secs_f32()); Ok(tags) } + +pub async fn thread( + nm: &Notmuch, + thread_id: String, + debug_content_tree: bool, +) -> Result { + // TODO(wathiede): normalize all email addresses through an address book with preferred + // display names (that default to the most commonly seen name). + let mut messages = Vec::new(); + for (path, id) in std::iter::zip(nm.files(&thread_id)?, nm.message_ids(&thread_id)?) { + let tags = nm.tags_for_query(&format!("id:{id}"))?; + let file = File::open(&path)?; + let mmap = unsafe { MmapOptions::new().map(&file)? }; + let m = parse_mail(&mmap)?; + let from = email_addresses(&path, &m, "from")?; + let from = match from.len() { + 0 => None, + 1 => from.into_iter().next(), + _ => { + warn!( + "Got {} from addresses in message, truncating: {:?}", + from.len(), + from + ); + from.into_iter().next() + } + }; + let to = email_addresses(&path, &m, "to")?; + let cc = email_addresses(&path, &m, "cc")?; + let subject = m.headers.get_first_value("subject"); + let timestamp = m + .headers + .get_first_value("date") + .and_then(|d| mailparse::dateparse(&d).ok()); + let cid_prefix = shared::urls::cid_prefix(None, &id); + let body = match extract_body(&m, &id)? { + Body::PlainText(PlainText { text, content_tree }) => { + let text = if text.len() > MAX_RAW_MESSAGE_SIZE { + format!( + "{}...\n\nMESSAGE WAS TRUNCATED @ {} bytes", + &text[..MAX_RAW_MESSAGE_SIZE], + MAX_RAW_MESSAGE_SIZE + ) + } else { + text + }; + + Body::Html(Html { + html: format!( + r#"

{}

"#, + // Trim newlines to prevent excessive white space at the beginning/end of + // presenation. Leave tabs and spaces incase plain text attempts to center a + // header on the first line. + sanitize_html(&linkify_html(&text.trim_matches('\n')), &cid_prefix)? + ), + content_tree: if debug_content_tree { + render_content_type_tree(&m) + } else { + content_tree + }, + }) + } + Body::Html(Html { html, content_tree }) => Body::Html(Html { + html: sanitize_html(&html, &cid_prefix)?, + content_tree: if debug_content_tree { + render_content_type_tree(&m) + } else { + content_tree + }, + }), + + Body::UnhandledContentType(UnhandledContentType { content_tree, .. }) => { + let body_start = mmap + .windows(2) + .take(20_000) + .position(|w| w == b"\n\n") + .unwrap_or(0); + let body = mmap[body_start + 2..].to_vec(); + Body::UnhandledContentType(UnhandledContentType { + text: String::from_utf8(body)?, + content_tree: if debug_content_tree { + render_content_type_tree(&m) + } else { + content_tree + }, + }) + } + }; + let headers = m + .headers + .iter() + .map(|h| Header { + key: h.get_key(), + value: h.get_value(), + }) + .collect(); + // TODO(wathiede): parse message and fill out attachments + let attachments = extract_attachments(&m, &id)?; + messages.push(Message { + id, + from, + to, + cc, + subject, + tags, + timestamp, + headers, + body, + path, + attachments, + }); + } + messages.reverse(); + // Find the first subject that's set. After reversing the vec, this should be the oldest + // message. + let subject: String = messages + .iter() + .skip_while(|m| m.subject.is_none()) + .next() + .and_then(|m| m.subject.clone()) + .unwrap_or("(NO SUBJECT)".to_string()); + Ok(Thread { + thread_id, + subject, + messages, + }) +} + +fn email_addresses( + path: &str, + m: &ParsedMail, + header_name: &str, +) -> Result, ServerError> { + let mut addrs = Vec::new(); + for header_value in m.headers.get_all_values(header_name) { + match mailparse::addrparse(&header_value) { + Ok(mal) => { + for ma in mal.into_inner() { + match ma { + mailparse::MailAddr::Group(gi) => { + if !gi.group_name.contains("ndisclosed") { + println!("[{path}][{header_name}] Group: {gi}"); + } + } + mailparse::MailAddr::Single(s) => addrs.push(Email { + name: s.display_name, + addr: Some(s.addr), + }), //println!("Single: {s}"), + } + } + } + Err(_) => { + let v = header_value; + if v.matches('@').count() == 1 { + if v.matches('<').count() == 1 && v.ends_with('>') { + let idx = v.find('<').unwrap(); + let addr = &v[idx + 1..v.len() - 1].trim(); + let name = &v[..idx].trim(); + addrs.push(Email { + name: Some(name.to_string()), + addr: Some(addr.to_string()), + }); + } + } else { + addrs.push(Email { + name: Some(v), + addr: None, + }); + } + } + } + } + Ok(addrs) +} + +pub fn cid_attachment_bytes(nm: &Notmuch, id: &str, cid: &str) -> Result { + let files = nm.files(id)?; + let Some(path) = files.first() else { + warn!("failed to find files for message {id}"); + return Err(ServerError::PartNotFound); + }; + let file = File::open(&path)?; + let mmap = unsafe { MmapOptions::new().map(&file)? }; + let m = parse_mail(&mmap)?; + if let Some(attachment) = walk_attachments(&m, |sp, _cur_idx| { + info!("{cid} {:?}", get_content_id(&sp.headers)); + if let Some(h_cid) = get_content_id(&sp.headers) { + let h_cid = &h_cid[1..h_cid.len() - 1]; + if h_cid == cid { + let attachment = extract_attachment(&sp, id, &[]).unwrap_or(Attachment { + ..Attachment::default() + }); + return Some(attachment); + } + } + None + }) { + return Ok(attachment); + } + + Err(ServerError::PartNotFound) +} + +pub fn attachment_bytes(nm: &Notmuch, id: &str, idx: &[usize]) -> Result { + let files = nm.files(id)?; + let Some(path) = files.first() else { + warn!("failed to find files for message {id}"); + return Err(ServerError::PartNotFound); + }; + let file = File::open(&path)?; + let mmap = unsafe { MmapOptions::new().map(&file)? }; + let m = parse_mail(&mmap)?; + if let Some(attachment) = walk_attachments(&m, |sp, cur_idx| { + if cur_idx == idx { + let attachment = extract_attachment(&sp, id, idx).unwrap_or(Attachment { + ..Attachment::default() + }); + return Some(attachment); + } + None + }) { + return Ok(attachment); + } + + Err(ServerError::PartNotFound) +} + +fn extract_body(m: &ParsedMail, id: &str) -> Result { + let mut part_addr = Vec::new(); + part_addr.push(id.to_string()); + let body = m.get_body()?; + let ret = match m.ctype.mimetype.as_str() { + TEXT_PLAIN => return Ok(Body::text(body)), + TEXT_HTML => return Ok(Body::html(body)), + MULTIPART_MIXED => extract_mixed(m, &mut part_addr), + MULTIPART_ALTERNATIVE => extract_alternative(m, &mut part_addr), + MULTIPART_RELATED => extract_related(m, &mut part_addr), + _ => extract_unhandled(m), + }; + if let Err(err) = ret { + error!("Failed to extract body: {err:?}"); + return Ok(extract_unhandled(m)?); + } + ret +} + +fn extract_unhandled(m: &ParsedMail) -> Result { + let msg = format!( + "Unhandled body content type:\n{}\n{}", + render_content_type_tree(m), + m.get_body()?, + ); + Ok(Body::UnhandledContentType(UnhandledContentType { + text: msg, + content_tree: render_content_type_tree(m), + })) +} + +// multipart/alternative defines multiple representations of the same message, and clients should +// show the fanciest they can display. For this program, the priority is text/html, text/plain, +// then give up. +fn extract_alternative(m: &ParsedMail, part_addr: &mut Vec) -> Result { + let handled_types = vec![ + MULTIPART_ALTERNATIVE, + MULTIPART_MIXED, + MULTIPART_RELATED, + TEXT_HTML, + TEXT_PLAIN, + ]; + for sp in &m.subparts { + if sp.ctype.mimetype.as_str() == MULTIPART_ALTERNATIVE { + return extract_alternative(sp, part_addr); + } + } + for sp in &m.subparts { + if sp.ctype.mimetype.as_str() == MULTIPART_MIXED { + return extract_related(sp, part_addr); + } + } + for sp in &m.subparts { + if sp.ctype.mimetype.as_str() == MULTIPART_RELATED { + return extract_related(sp, part_addr); + } + } + for sp in &m.subparts { + if sp.ctype.mimetype.as_str() == TEXT_HTML { + let body = sp.get_body()?; + return Ok(Body::html(body)); + } + } + for sp in &m.subparts { + if sp.ctype.mimetype.as_str() == TEXT_PLAIN { + let body = sp.get_body()?; + return Ok(Body::text(body)); + } + } + Err(ServerError::StringError(format!( + "extract_alternative failed to find suitable subpart, searched: {:?}", + handled_types + ))) +} + +// multipart/mixed defines multiple types of context all of which should be presented to the user +// 'serially'. +fn extract_mixed(m: &ParsedMail, part_addr: &mut Vec) -> Result { + let handled_types = vec![ + MULTIPART_ALTERNATIVE, + MULTIPART_RELATED, + TEXT_HTML, + TEXT_PLAIN, + IMAGE_JPEG, + IMAGE_PNG, + ]; + let mut unhandled_types: Vec<_> = m + .subparts + .iter() + .map(|sp| sp.ctype.mimetype.as_str()) + .filter(|mt| !handled_types.contains(&mt)) + .collect(); + unhandled_types.sort(); + if !unhandled_types.is_empty() { + warn!("{MULTIPART_MIXED} contains the following unhandled mimetypes {unhandled_types:?}"); + } + let mut parts = Vec::new(); + for (idx, sp) in m.subparts.iter().enumerate() { + part_addr.push(idx.to_string()); + match sp.ctype.mimetype.as_str() { + MULTIPART_RELATED => parts.push(extract_related(sp, part_addr)?), + MULTIPART_ALTERNATIVE => parts.push(extract_alternative(sp, part_addr)?), + TEXT_PLAIN => parts.push(Body::text(sp.get_body()?)), + TEXT_HTML => parts.push(Body::html(sp.get_body()?)), + IMAGE_JPEG | IMAGE_PNG => { + let pcd = sp.get_content_disposition(); + let filename = pcd + .params + .get("filename") + .map(|s| s.clone()) + .unwrap_or("".to_string()); + // Only add inline images, attachments are handled as an attribute of the top level Message and rendered separate client-side. + if pcd.disposition == mailparse::DispositionType::Inline { + parts.push(Body::html(format!( + r#""#, + part_addr[0], + part_addr + .iter() + .skip(1) + .map(|i| i.to_string()) + .collect::>() + .join(".") + ))); + } + } + _ => (), + } + part_addr.pop(); + } + Ok(flatten_body_parts(&parts)) +} + +fn flatten_body_parts(parts: &[Body]) -> Body { + let html = parts + .iter() + .map(|p| match p { + Body::PlainText(PlainText { text, .. }) => { + format!( + r#"

{}

"#, + // Trim newlines to prevent excessive white space at the beginning/end of + // presenation. Leave tabs and spaces incase plain text attempts to center a + // header on the first line. + linkify_html(&text.trim_matches('\n')) + ) + } + Body::Html(Html { html, .. }) => html.clone(), + Body::UnhandledContentType(UnhandledContentType { text, .. }) => { + error!("text len {}", text.len()); + format!( + r#"

{}

"#, + // Trim newlines to prevent excessive white space at the beginning/end of + // presenation. Leave tabs and spaces incase plain text attempts to center a + // header on the first line. + linkify_html(&text.trim_matches('\n')) + ) + } + }) + .collect::>() + .join("\n"); + + info!("flatten_body_parts {} {html}", parts.len()); + Body::html(html) +} + +fn extract_related(m: &ParsedMail, part_addr: &mut Vec) -> Result { + // TODO(wathiede): collect related things and change return type to new Body arm. + let handled_types = vec![ + MULTIPART_ALTERNATIVE, + TEXT_HTML, + TEXT_PLAIN, + IMAGE_JPEG, + IMAGE_PNG, + ]; + let mut unhandled_types: Vec<_> = m + .subparts + .iter() + .map(|sp| sp.ctype.mimetype.as_str()) + .filter(|mt| !handled_types.contains(&mt)) + .collect(); + unhandled_types.sort(); + if !unhandled_types.is_empty() { + warn!("{MULTIPART_RELATED} contains the following unhandled mimetypes {unhandled_types:?}"); + } + + for (i, sp) in m.subparts.iter().enumerate() { + if sp.ctype.mimetype == IMAGE_PNG || sp.ctype.mimetype == IMAGE_JPEG { + info!("sp.ctype {:#?}", sp.ctype); + //info!("sp.headers {:#?}", sp.headers); + if let Some(cid) = sp.headers.get_first_value("Content-Id") { + let mut part_id = part_addr.clone(); + part_id.push(i.to_string()); + info!("cid: {cid} part_id {part_id:?}"); + } + } + } + for sp in &m.subparts { + if sp.ctype.mimetype == MULTIPART_ALTERNATIVE { + return extract_alternative(m, part_addr); + } + } + for sp in &m.subparts { + if sp.ctype.mimetype == TEXT_HTML { + let body = sp.get_body()?; + return Ok(Body::html(body)); + } + } + for sp in &m.subparts { + if sp.ctype.mimetype == TEXT_PLAIN { + let body = sp.get_body()?; + return Ok(Body::text(body)); + } + } + Err(ServerError::StringError(format!( + "extract_related failed to find suitable subpart, searched: {:?}", + handled_types + ))) +} + +fn walk_attachments Option + Copy>( + m: &ParsedMail, + visitor: F, +) -> Option { + let mut cur_addr = Vec::new(); + walk_attachments_inner(m, visitor, &mut cur_addr) +} + +fn walk_attachments_inner Option + Copy>( + m: &ParsedMail, + visitor: F, + cur_addr: &mut Vec, +) -> Option { + for (idx, sp) in m.subparts.iter().enumerate() { + cur_addr.push(idx); + let val = visitor(sp, &cur_addr); + if val.is_some() { + return val; + } + let val = walk_attachments_inner(sp, visitor, cur_addr); + if val.is_some() { + return val; + } + cur_addr.pop(); + } + None +} + +// TODO(wathiede): make this walk_attachments that takes a closure. +// Then implement one closure for building `Attachment` and imlement another that can be used to +// get the bytes for serving attachments of HTTP +fn extract_attachments(m: &ParsedMail, id: &str) -> Result, ServerError> { + let mut attachments = Vec::new(); + for (idx, sp) in m.subparts.iter().enumerate() { + if let Some(attachment) = extract_attachment(sp, id, &[idx]) { + // Filter out inline attachements, they're flattened into the body of the message. + if attachment.disposition == DispositionType::Attachment { + attachments.push(attachment); + } + } + } + Ok(attachments) +} + +fn extract_attachment(m: &ParsedMail, id: &str, idx: &[usize]) -> Option { + let pcd = m.get_content_disposition(); + // TODO: do we need to handle empty filename attachments, or should we change the definition of + // Attachment::filename? + let Some(filename) = pcd.params.get("filename").map(|f| f.clone()) else { + return None; + }; + + // TODO: grab this from somewhere + let content_id = None; + let bytes = match m.get_body_raw() { + Ok(bytes) => bytes, + Err(err) => { + error!("failed to get body for attachment: {err}"); + return None; + } + }; + return Some(Attachment { + id: id.to_string(), + idx: idx + .iter() + .map(|i| i.to_string()) + .collect::>() + .join("."), + disposition: pcd.disposition.into(), + filename: Some(filename), + size: bytes.len(), + // TODO: what is the default for ctype? + // TODO: do we want to use m.ctype.params for anything? + content_type: Some(m.ctype.mimetype.clone()), + content_id, + bytes, + }); +} + +pub fn get_attachment_filename(header_value: &str) -> &str { + info!("get_attachment_filename {header_value}"); + // Strip last " + let v = &header_value[..header_value.len() - 1]; + if let Some(idx) = v.rfind('"') { + &v[idx + 1..] + } else { + "" + } +} + +pub fn get_content_type<'a>(headers: &[MailHeader<'a>]) -> Option { + if let Some(v) = headers.get_first_value("Content-Type") { + if let Some(idx) = v.find(';') { + return Some(v[..idx].to_string()); + } else { + return Some(v); + } + } + None +} + +fn get_content_id<'a>(headers: &[MailHeader<'a>]) -> Option { + headers.get_first_value("Content-Id") +} + +fn render_content_type_tree(m: &ParsedMail) -> String { + const WIDTH: usize = 4; + const SKIP_HEADERS: [&str; 4] = [ + "Authentication-Results", + "DKIM-Signature", + "Received", + "Received-SPF", + ]; + fn render_ct_rec(m: &ParsedMail, depth: usize) -> String { + let mut parts = Vec::new(); + let msg = format!("{} {}", "-".repeat(depth * WIDTH), m.ctype.mimetype); + parts.push(msg); + for sp in &m.subparts { + parts.push(render_ct_rec(sp, depth + 1)) + } + parts.join("\n") + } + fn render_rec(m: &ParsedMail, depth: usize) -> String { + let mut parts = Vec::new(); + let msg = format!("{} {}", "-".repeat(depth * WIDTH), m.ctype.mimetype); + parts.push(msg); + let indent = " ".repeat(depth * WIDTH); + if !m.ctype.charset.is_empty() { + parts.push(format!("{indent} Character Set: {}", m.ctype.charset)); + } + for (k, v) in m.ctype.params.iter() { + parts.push(format!("{indent} {k}: {v}")); + } + if !m.headers.is_empty() { + parts.push(format!("{indent} == headers ==")); + for h in &m.headers { + if h.get_key().starts_with('X') { + continue; + } + if SKIP_HEADERS.contains(&h.get_key().as_str()) { + continue; + } + + parts.push(format!("{indent} {}: {}", h.get_key_ref(), h.get_value())); + } + } + for sp in &m.subparts { + parts.push(render_rec(sp, depth + 1)) + } + parts.join("\n") + } + format!( + "Outline:\n{}\n\nDetailed:\n{}\n\nNot showing headers:\n {}\n X.*", + render_ct_rec(m, 1), + render_rec(m, 1), + SKIP_HEADERS.join("\n ") + ) +}