pub mod config; pub mod error; pub mod graphql; pub mod mail; pub mod newsreader; pub mod nm; #[cfg(feature = "tantivy")] pub mod tantivy; use std::{ collections::{HashMap, HashSet}, convert::Infallible, fmt, str::FromStr, sync::Arc, }; use async_trait::async_trait; use cacher::{Cacher, FilesystemCacher}; use css_inline::{CSSInliner, InlineError, InlineOptions}; use linkify::{LinkFinder, LinkKind}; use log::{debug, error, info, warn}; use lol_html::{ element, errors::RewritingError, html_content::ContentType, rewrite_str, text, RewriteStrSettings, }; use maplit::{hashmap, hashset}; use regex::Regex; use reqwest::StatusCode; use scraper::{Html, Selector}; use sqlx::types::time::PrimitiveDateTime; use thiserror::Error; use url::Url; use crate::{ error::ServerError, graphql::{Corpus, ThreadSummary}, newsreader::is_newsreader_thread, nm::is_notmuch_thread_or_id, }; const NEWSREADER_TAG_PREFIX: &'static str = "News/"; const NEWSREADER_THREAD_PREFIX: &'static str = "news:"; // TODO: figure out how to use Cow #[async_trait] trait Transformer: Send + Sync { fn should_run(&self, _addr: &Option, _html: &str) -> bool { true } // TODO: should html be something like `html_escape` uses: // >(text: &S) -> Cow async fn transform(&self, addr: &Option, html: &str) -> Result; } // TODO: how would we make this more generic to allow good implementations of Transformer outside // of this module? #[derive(Error, Debug)] pub enum TransformError { #[error("lol-html rewrite error: {0}")] RewritingError(#[from] RewritingError), #[error("css inline error: {0}")] InlineError(#[from] InlineError), #[error("failed to fetch url error: {0}")] ReqwestError(#[from] reqwest::Error), #[error("failed to parse HTML: {0}")] HtmlParsingError(String), #[error("got a retryable error code {0} for {1}")] RetryableHttpStatusError(StatusCode, String), } struct SanitizeHtml<'a> { cid_prefix: &'a str, base_url: &'a Option, } #[async_trait] impl<'a> Transformer for SanitizeHtml<'a> { async fn transform(&self, _: &Option, html: &str) -> Result { Ok(sanitize_html(html, self.cid_prefix, self.base_url)?) } } struct EscapeHtml; #[async_trait] impl Transformer for EscapeHtml { fn should_run(&self, _: &Option, html: &str) -> bool { html.contains("&") } async fn transform(&self, _: &Option, html: &str) -> Result { Ok(html_escape::decode_html_entities(html).to_string()) } } struct StripHtml; #[async_trait] impl Transformer for StripHtml { fn should_run(&self, link: &Option, html: &str) -> bool { debug!("StripHtml should_run {link:?} {}", html.contains("<")); // Lame test html.contains("<") } async fn transform(&self, link: &Option, html: &str) -> Result { debug!("StripHtml {link:?}"); let mut text = String::new(); let element_content_handlers = vec![ element!("style", |el| { el.remove(); Ok(()) }), element!("script", |el| { el.remove(); Ok(()) }), ]; let html = rewrite_str( html, RewriteStrSettings { element_content_handlers, ..RewriteStrSettings::default() }, )?; let element_content_handlers = vec![text!("*", |t| { text += t.as_str(); Ok(()) })]; let _ = rewrite_str( &html, RewriteStrSettings { element_content_handlers, ..RewriteStrSettings::default() }, )?; let re = Regex::new(r"\s+").expect("failed to parse regex"); let text = re.replace_all(&text, " ").to_string(); Ok(text) } } struct InlineStyle; #[async_trait] impl Transformer for InlineStyle { async fn transform(&self, _: &Option, html: &str) -> Result { let css = concat!( "/* chrome-default.css */\n", include_str!("chrome-default.css"), //"\n/* mvp.css */\n", //include_str!("mvp.css"), //"\n/* Xinu Specific overrides */\n", //include_str!("custom.css"), ); let inline_opts = InlineOptions { inline_style_tags: true, keep_style_tags: false, keep_link_tags: true, base_url: None, load_remote_stylesheets: true, extra_css: Some(css.into()), preallocate_node_capacity: 32, ..InlineOptions::default() }; //info!("HTML:\n{html}"); Ok(match CSSInliner::new(inline_opts).inline(&html) { Ok(inlined_html) => inlined_html, Err(err) => { error!("failed to inline CSS: {err}"); html.to_string() } }) } } /// Process images will extract any alt or title tags on images and place them as labels below said /// image. It also handles data-src and data-cfsrc attributes struct FrameImages; #[async_trait] impl Transformer for FrameImages { async fn transform(&self, _: &Option, html: &str) -> Result { Ok(rewrite_str( html, RewriteStrSettings { element_content_handlers: vec![ element!("img[data-src]", |el| { let src = el .get_attribute("data-src") .unwrap_or("https://placehold.co/600x400".to_string()); el.set_attribute("src", &src)?; Ok(()) }), element!("img[data-cfsrc]", |el| { let src = el .get_attribute("data-cfsrc") .unwrap_or("https://placehold.co/600x400".to_string()); el.set_attribute("src", &src)?; Ok(()) }), element!("img[alt], img[title]", |el| { let src = el .get_attribute("src") .unwrap_or("https://placehold.co/600x400".to_string()); let alt = el.get_attribute("alt"); let title = el.get_attribute("title"); let mut frags = vec!["
".to_string(), format!(r#""#)]; alt.map(|t| { if !t.is_empty() { frags.push(format!("
Alt: {t}
")) } }); title.map(|t| { if !t.is_empty() { frags.push(format!("
Title: {t}
")) } }); frags.push("
".to_string()); el.replace(&frags.join("\n"), ContentType::Html); Ok(()) }), ], ..RewriteStrSettings::default() }, )?) } } struct AddOutlink; #[async_trait] impl Transformer for AddOutlink { fn should_run(&self, link: &Option, html: &str) -> bool { if let Some(link) = link { link.scheme().starts_with("http") && !html.contains(link.as_str()) } else { false } } async fn transform(&self, link: &Option, html: &str) -> Result { if let Some(link) = link { Ok(format!( r#" {html} "#, link )) } else { Ok(html.to_string()) } } } struct SlurpContents<'c> { cacher: &'c FilesystemCacher, inline_css: bool, site_selectors: HashMap>, } impl<'c> SlurpContents<'c> { fn get_selectors(&self, link: &Url) -> Option<&[Selector]> { for (host, selector) in self.site_selectors.iter() { if link.host_str().map(|h| h.contains(host)).unwrap_or(false) { return Some(&selector); } } None } } #[async_trait] impl<'c> Transformer for SlurpContents<'c> { fn should_run(&self, link: &Option, html: &str) -> bool { debug!("SlurpContents should_run {link:?}"); let mut will_slurp = false; if let Some(link) = link { will_slurp = self.get_selectors(link).is_some(); } if !will_slurp && self.inline_css { return InlineStyle {}.should_run(link, html); } will_slurp } async fn transform(&self, link: &Option, html: &str) -> Result { debug!("SlurpContents {link:?}"); let retryable_status: HashSet = vec![ StatusCode::UNAUTHORIZED, StatusCode::FORBIDDEN, StatusCode::REQUEST_TIMEOUT, StatusCode::TOO_MANY_REQUESTS, ] .into_iter() .collect(); if let Some(test_link) = link { // If SlurpContents is configured for inline CSS, but no // configuration found for this site, use the local InlineStyle // transform. if self.inline_css && self.get_selectors(test_link).is_none() { debug!("local inline CSS for {link:?}"); return InlineStyle {}.transform(link, html).await; } } let Some(link) = link else { return Ok(html.to_string()); }; let Some(selectors) = self.get_selectors(&link) else { return Ok(html.to_string()); }; let cacher = self.cacher; let body = if let Some(body) = cacher.get(link.as_str()) { String::from_utf8_lossy(&body).to_string() } else { let resp = reqwest::get(link.as_str()).await?; let status = resp.status(); if status.is_server_error() || retryable_status.contains(&status) { return Err(TransformError::RetryableHttpStatusError( status, link.to_string(), )); } if !status.is_success() { return Ok(html.to_string()); } let body = resp.text().await?; cacher.set(link.as_str(), body.as_bytes()); body }; let body = Arc::new(body); let base_url = Some(link.clone()); let body = if self.inline_css { debug!("inlining CSS for {link}"); let inner_body = Arc::clone(&body); let res = tokio::task::spawn_blocking(move || { let css = concat!( "/* chrome-default.css */\n", include_str!("chrome-default.css"), "\n/* vars.css */\n", include_str!("../static/vars.css"), //"\n/* Xinu Specific overrides */\n", //include_str!("custom.css"), ); let res = CSSInliner::options() .base_url(base_url) .extra_css(Some(std::borrow::Cow::Borrowed(css))) .build() .inline(&inner_body); match res { Ok(inlined_html) => inlined_html, Err(err) => { error!("failed to inline remote CSS: {err}"); Arc::into_inner(inner_body).expect("failed to take body out of Arc") } } }) .await; match res { Ok(inlined_html) => inlined_html, Err(err) => { error!("failed to spawn inline remote CSS: {err}"); Arc::into_inner(body).expect("failed to take body out of Arc") } } } else { debug!("using body as-is for {link:?}"); Arc::into_inner(body).expect("failed to take body out of Arc") }; let doc = Html::parse_document(&body); let mut results = Vec::new(); for selector in selectors { for frag in doc.select(&selector) { results.push(frag.html()) // TODO: figure out how to warn if there were no hits //warn!("couldn't find '{:?}' in {}", selector, link); } } Ok(results.join("
")) } } pub fn linkify_html(text: &str) -> String { let mut finder = LinkFinder::new(); let finder = finder.url_must_have_scheme(false).kinds(&[LinkKind::Url]); let mut parts = Vec::new(); for span in finder.spans(text) { // TODO(wathiede): use Cow? match span.kind() { // Text as-is None => parts.push(span.as_str().to_string()), // Wrap in anchor tag Some(LinkKind::Url) => { let text = span.as_str(); let schema = if text.starts_with("http") { "" } else { "http://" }; let a = format!(r#"{0}"#, text); parts.push(a); } _ => todo!("unhandled kind: {:?}", span.kind().unwrap()), } } parts.join("") } // html contains the content to be cleaned, and cid_prefix is used to resolve mixed part image // referrences pub fn sanitize_html( html: &str, cid_prefix: &str, base_url: &Option, ) -> Result { let inline_opts = InlineOptions { inline_style_tags: true, keep_style_tags: true, keep_link_tags: false, base_url: None, load_remote_stylesheets: false, extra_css: None, preallocate_node_capacity: 32, ..InlineOptions::default() }; let html = match CSSInliner::new(inline_opts).inline(&html) { Ok(inlined_html) => inlined_html, Err(err) => { error!("failed to inline CSS: {err}"); html.to_string() } }; let mut element_content_handlers = vec![ // Open links in new tab element!("a[href]", |el| { el.set_attribute("target", "_blank").unwrap(); Ok(()) }), // Replace mixed part CID images with URL element!("img[src]", |el| { let src = el .get_attribute("src") .expect("src was required") .replace("cid:", cid_prefix); el.set_attribute("src", &src)?; Ok(()) }), // Only secure image URLs element!("img[src]", |el| { let src = el .get_attribute("src") .expect("src was required") .replace("http:", "https:"); el.set_attribute("src", &src)?; Ok(()) }), // Add https to href with // element!("link[href]", |el| { info!("found link[href] {el:?}"); let mut href = el.get_attribute("href").expect("href was required"); if href.starts_with("//") { warn!("adding https to {href}"); href.insert_str(0, "https:"); } el.set_attribute("href", &href)?; Ok(()) }), // Add https to src with // element!("style[src]", |el| { let mut src = el.get_attribute("src").expect("src was required"); if src.starts_with("//") { src.insert_str(0, "https:"); } el.set_attribute("src", &src)?; Ok(()) }), ]; if let Some(base_url) = base_url { element_content_handlers.extend(vec![ // Make links with relative URLs absolute element!("a[href]", |el| { if let Some(Ok(href)) = el.get_attribute("href").map(|href| base_url.join(&href)) { el.set_attribute("href", &href.as_str()).unwrap(); } Ok(()) }), // Make images with relative srcs absolute element!("img[src]", |el| { if let Some(Ok(src)) = el.get_attribute("src").map(|src| base_url.join(&src)) { el.set_attribute("src", &src.as_str()).unwrap(); } Ok(()) }), ]); } let html = rewrite_str( &html, RewriteStrSettings { element_content_handlers, ..RewriteStrSettings::default() }, )?; // Default's don't allow style, but we want to preserve that. // TODO: remove 'class' if rendering mails moves to a two phase process where abstract message // types are collected, santized, and then grouped together as one big HTML doc let attributes = hashset![ "align", "bgcolor", "class", "color", "height", "lang", "title", "width", "style", ]; let tags = hashset![ "a", "abbr", "acronym", "area", "article", "aside", "b", "bdi", "bdo", "blockquote", "br", "caption", "center", "cite", "code", "col", "colgroup", "data", "dd", "del", "details", "dfn", "div", "dl", "dt", "em", "figcaption", "figure", "footer", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hgroup", "hr", "i", "iframe", // wathiede "img", "ins", "kbd", "kbd", "li", "map", "mark", "nav", "noscript", // wathiede "ol", "p", "pre", "q", "rp", "rt", "rtc", "ruby", "s", "samp", "small", "span", "strike", "strong", "sub", "summary", "sup", "table", "tbody", "td", "th", "thead", "time", "title", // wathiede "tr", "tt", "u", "ul", "var", "wbr", ]; let tag_attributes = hashmap![ "a" => hashset![ "href", "hreflang", "target", ], "bdo" => hashset![ "dir" ], "blockquote" => hashset![ "cite" ], "col" => hashset![ "align", "char", "charoff", "span" ], "colgroup" => hashset![ "align", "char", "charoff", "span" ], "del" => hashset![ "cite", "datetime" ], "hr" => hashset![ "align", "size", "width" ], "iframe" => hashset![ "src", "allow", "allowfullscreen" ], "img" => hashset![ "align", "alt", "height", "src", "width" ], "ins" => hashset![ "cite", "datetime" ], "ol" => hashset![ "start" ], "q" => hashset![ "cite" ], "table" => hashset![ "align", "border", "cellpadding", "cellspacing", "char", "charoff", "summary", ], "tbody" => hashset![ "align", "char", "charoff" ], "td" => hashset![ "align", "char", "charoff", "colspan", "headers", "rowspan" ], "tfoot" => hashset![ "align", "char", "charoff" ], "th" => hashset![ "align", "char", "charoff", "colspan", "headers", "rowspan", "scope" ], "thead" => hashset![ "align", "char", "charoff" ], "tr" => hashset![ "align", "char", "charoff" ], ]; let html = ammonia::Builder::default() .tags(tags) .tag_attributes(tag_attributes) .generic_attributes(attributes) .clean(&html) .to_string(); Ok(html) } fn compute_offset_limit( after: Option, before: Option, first: Option, last: Option, ) -> (i32, i32) { let default_page_size = 100; match (after, before, first, last) { // Reasonable defaults (None, None, None, None) => (0, default_page_size), (None, None, Some(first), None) => (0, first), (Some(after), None, None, None) => (after + 1, default_page_size), (Some(after), None, Some(first), None) => (after + 1, first), (None, Some(before), None, None) => (0.max(before - default_page_size), default_page_size), (None, Some(before), None, Some(last)) => (0.max(before - last), last), (None, None, None, Some(_)) => { panic!("specifying last and no before doesn't make sense") } (None, None, Some(_), Some(_)) => { panic!("specifying first and last doesn't make sense") } (None, Some(_), Some(_), _) => { panic!("specifying before and first doesn't make sense") } (Some(_), Some(_), _, _) => { panic!("specifying after and before doesn't make sense") } (Some(_), None, None, Some(_)) => { panic!("specifying after and last doesn't make sense") } (Some(_), None, Some(_), Some(_)) => { panic!("specifying after, first and last doesn't make sense") } } } #[derive(Debug, Default)] pub struct Query { pub unread_only: bool, pub tags: Vec, pub uids: Vec, pub remainder: Vec, pub is_notmuch: bool, pub is_newsreader: bool, pub is_tantivy: bool, pub corpus: Option, } impl fmt::Display for Query { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { if self.unread_only { write!(f, "is:unread ")?; } for tag in &self.tags { write!(f, "tag:{tag} ")?; } for uid in &self.uids { write!(f, "id:{uid} ")?; } if self.is_notmuch { write!(f, "is:mail ")?; } if self.is_newsreader { write!(f, "is:newsreader ")?; } if self.is_newsreader { write!(f, "is:news ")?; } match self.corpus { Some(c) => write!(f, "corpus:{c:?}")?, _ => (), } for rem in &self.remainder { write!(f, "{rem} ")?; } Ok(()) } } impl Query { // Converts the internal state of Query to something suitable for notmuch queries. Removes and // letterbox specific ': String { let mut parts = Vec::new(); if !self.is_notmuch { return String::new(); } if self.unread_only { parts.push("is:unread".to_string()); } for tag in &self.tags { parts.push(format!("tag:{tag}")); } for uid in &self.uids { parts.push(uid.clone()); } parts.extend(self.remainder.clone()); parts.join(" ") } } impl FromStr for Query { type Err = Infallible; fn from_str(s: &str) -> Result { let mut unread_only = false; let mut tags = Vec::new(); let mut uids = Vec::new(); let mut remainder = Vec::new(); let mut is_notmuch = false; let mut is_newsreader = false; let mut is_tantivy = false; let mut corpus = None; for word in s.split_whitespace() { if word == "is:unread" { unread_only = true } else if word.starts_with("tag:") { tags.push(word["tag:".len()..].to_string()); /* } else if word.starts_with("tag:") { // Any tag that doesn't match site_prefix should explicitly set the site to something not in the // database site = Some(NON_EXISTENT_SITE_NAME.to_string()); */ } else if word.starts_with("corpus:") { let c = word["corpus:".len()..].to_string(); corpus = c.parse::().map(|c| Some(c)).unwrap_or_else(|e| { warn!("Error parsing corpus '{c}': {e:?}"); None }); } else if is_newsreader_thread(word) { uids.push(word.to_string()); } else if is_notmuch_thread_or_id(word) { uids.push(word.to_string()); } else if word == "is:mail" || word == "is:email" || word == "is:notmuch" { is_notmuch = true; } else if word == "is:news" { is_newsreader = true; } else if word == "is:newsreader" { is_newsreader = true; } else { remainder.push(word.to_string()); } } // If we don't see any explicit filters for a corpus, flip them all on if corpus.is_none() && !(is_notmuch || is_tantivy || is_newsreader) { is_notmuch = true; is_newsreader = true; is_tantivy = true; } Ok(Query { unread_only, tags, uids, remainder, is_notmuch, is_newsreader, is_tantivy, corpus, }) } } pub struct ThreadSummaryRecord { pub site: Option, pub date: Option, pub is_read: Option, pub title: Option, pub uid: String, pub name: Option, pub corpus: Corpus, } async fn thread_summary_from_row(r: ThreadSummaryRecord) -> ThreadSummary { let site = r.site.unwrap_or("UNKOWN TAG".to_string()); let mut tags = vec![format!("{NEWSREADER_TAG_PREFIX}{site}")]; if !r.is_read.unwrap_or(true) { tags.push("unread".to_string()); }; let mut title = r.title.unwrap_or("NO TITLE".to_string()); title = clean_title(&title).await.expect("failed to clean title"); ThreadSummary { thread: format!("{NEWSREADER_THREAD_PREFIX}{}", r.uid), timestamp: r .date .expect("post missing date") .assume_utc() .unix_timestamp() as isize, date_relative: format!("{:?}", r.date), //date_relative: "TODO date_relative".to_string(), matched: 0, total: 1, authors: r.name.unwrap_or_else(|| site.clone()), subject: title, tags, corpus: r.corpus, } } async fn clean_title(title: &str) -> Result { // Make title HTML so html parsers work let mut title = format!("{title}"); let title_tranformers: Vec> = vec![Box::new(EscapeHtml), Box::new(StripHtml)]; // Make title HTML so html parsers work title = format!("{title}"); for t in title_tranformers.iter() { if t.should_run(&None, &title) { title = t.transform(&None, &title).await?; } } Ok(title) }