679 lines
20 KiB
Rust

pub mod config;
pub mod error;
pub mod graphql;
pub mod newsreader;
pub mod nm;
use std::{collections::HashMap, convert::Infallible, str::FromStr, sync::Arc};
use async_trait::async_trait;
use cacher::{Cacher, FilesystemCacher};
use css_inline::{CSSInliner, InlineError, InlineOptions};
use linkify::{LinkFinder, LinkKind};
use log::{error, info, warn};
use lol_html::{
element, errors::RewritingError, html_content::ContentType, rewrite_str, text,
RewriteStrSettings,
};
use maplit::{hashmap, hashset};
use scraper::{Html, Selector};
use thiserror::Error;
use tokio::sync::Mutex;
use url::Url;
use crate::{
config::DeSelector,
newsreader::{extract_thread_id, is_newsreader_thread},
};
// TODO: figure out how to use Cow
/// Common interface for the HTML rewriting steps defined in this module (for example
/// `SanitizeHtml`, `StripHtml` and `InlineStyle`).
///
/// Implementations must be `Send + Sync`.
#[async_trait]
trait Transformer: Send + Sync {
/// Reports whether `transform` is worth calling for this input.
///
/// The default implementation always returns `true`; implementations override it to skip
/// inputs they have nothing to do for.
fn should_run(&self, _addr: &Option<Url>, _html: &str) -> bool {
true
}
// TODO: should html be something like `html_escape` uses:
// <S: ?Sized + AsRef<str>>(text: &S) -> Cow<str>
/// Produces a rewritten copy of `html`.
///
/// `addr` is the URL associated with the content, when there is one. Failures are reported
/// as a `TransformError`.
async fn transform(&self, addr: &Option<Url>, html: &str) -> Result<String, TransformError>;
}
// TODO: how would we make this more generic to allow good implementations of Transformer outside
// of this module?
/// Errors returned by `Transformer::transform` and `sanitize_html`.
///
/// Variants marked `#[from]` are created automatically when the wrapped error is propagated
/// with `?`.
#[derive(Error, Debug)]
pub enum TransformError {
/// The lol-html rewriter failed while processing the document.
#[error("lol-html rewrite error: {0}")]
RewritingError(#[from] RewritingError),
/// The CSS inliner reported an error.
#[error("css inline error: {0}")]
InlineError(#[from] InlineError),
/// An HTTP request made through `reqwest` failed.
#[error("failed to fetch url error: {0}")]
ReqwestError(#[from] reqwest::Error),
/// HTML could not be parsed; the payload is a human-readable description.
#[error("failed to parse HTML: {0}")]
HtmlParsingError(String),
}
/// `Transformer` adapter around `sanitize_html` that carries the arguments which stay fixed
/// for a whole message.
struct SanitizeHtml<'a> {
    /// Text substituted for `cid:` in image sources.
    cid_prefix: &'a str,
    /// Base used to make relative links and image sources absolute, when known.
    base_url: &'a Option<Url>,
}

#[async_trait]
impl<'a> Transformer for SanitizeHtml<'a> {
    /// Sanitizes `html`. The per-call address is ignored; `self.base_url` is used instead.
    async fn transform(&self, _addr: &Option<Url>, html: &str) -> Result<String, TransformError> {
        sanitize_html(html, self.cid_prefix, self.base_url)
    }
}
/// Despite the name, this transformer *decodes* HTML entities (it calls
/// `html_escape::decode_html_entities`).
struct EscapeHtml;

#[async_trait]
impl Transformer for EscapeHtml {
    /// Every entity starts with `&`, so input without one has nothing to decode.
    fn should_run(&self, _addr: &Option<Url>, html: &str) -> bool {
        html.contains('&')
    }

    /// Returns `html` with its HTML entities decoded.
    async fn transform(&self, _addr: &Option<Url>, html: &str) -> Result<String, TransformError> {
        let decoded = html_escape::decode_html_entities(html);
        Ok(decoded.into_owned())
    }
}
/// Reduces an HTML document to the concatenation of its text chunks.
struct StripHtml;

#[async_trait]
impl Transformer for StripHtml {
    /// Crude heuristic: anything containing `<` is assumed to contain markup.
    fn should_run(&self, _addr: &Option<Url>, html: &str) -> bool {
        html.contains('<')
    }

    /// Streams `html` through the rewriter, collecting every text chunk and discarding the
    /// rewritten markup itself.
    async fn transform(&self, _addr: &Option<Url>, html: &str) -> Result<String, TransformError> {
        let mut stripped = String::new();
        // Only the side effect of the text handler matters; the rewriter's output is dropped.
        rewrite_str(
            html,
            RewriteStrSettings {
                element_content_handlers: vec![text!("*", |chunk| {
                    stripped.push_str(chunk.as_str());
                    Ok(())
                })],
                ..RewriteStrSettings::default()
            },
        )?;
        Ok(stripped)
    }
}
/// Inlines the bundled stylesheets (plus any styles in the document) into `style` attributes.
struct InlineStyle;

#[async_trait]
impl Transformer for InlineStyle {
    /// Returns `html` with CSS inlined. Inlining is best effort: on failure the error is logged
    /// and the input is returned unchanged, so this never yields `Err`.
    async fn transform(&self, _addr: &Option<Url>, html: &str) -> Result<String, TransformError> {
        // Stylesheets are embedded at compile time, each preceded by a marker comment.
        let bundled_css = concat!(
            "/* chrome-default.css */\n",
            include_str!("chrome-default.css"),
            "\n/* mvp.css */\n",
            include_str!("mvp.css"),
            "\n/* Xinu Specific overrides */\n",
            include_str!("custom.css"),
        );
        let inliner = CSSInliner::new(InlineOptions {
            inline_style_tags: true,
            keep_style_tags: false,
            keep_link_tags: true,
            base_url: None,
            load_remote_stylesheets: true,
            extra_css: Some(bundled_css.into()),
            preallocate_node_capacity: 32,
            ..InlineOptions::default()
        });
        //info!("HTML:\n{html}");
        let inlined = inliner.inline(html).unwrap_or_else(|err| {
            error!("failed to inline CSS: {err}");
            html.to_string()
        });
        Ok(inlined)
    }
}
/// Process images will extract any alt or title tags on images and place them as labels below said
/// image. It also handles data-src and data-cfsrc attributes
struct FrameImages;
#[async_trait]
impl Transformer for FrameImages {
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
Ok(rewrite_str(
html,
RewriteStrSettings {
element_content_handlers: vec![
element!("img[data-src]", |el| {
let src = el
.get_attribute("data-src")
.unwrap_or("https://placehold.co/600x400".to_string());
el.set_attribute("src", &src)?;
Ok(())
}),
element!("img[data-cfsrc]", |el| {
let src = el
.get_attribute("data-cfsrc")
.unwrap_or("https://placehold.co/600x400".to_string());
el.set_attribute("src", &src)?;
Ok(())
}),
element!("img[alt], img[title]", |el| {
let src = el
.get_attribute("src")
.unwrap_or("https://placehold.co/600x400".to_string());
let alt = el.get_attribute("alt");
let title = el.get_attribute("title");
let mut frags =
vec!["<figure>".to_string(), format!(r#"<img src="{src}">"#)];
alt.map(|t| {
if !t.is_empty() {
frags.push(format!("<figcaption>Alt: {t}</figcaption>"))
}
});
title.map(|t| {
if !t.is_empty() {
frags.push(format!("<figcaption>Title: {t}</figcaption>"))
}
});
frags.push("</figure>".to_string());
el.replace(&frags.join("\n"), ContentType::Html);
Ok(())
}),
],
..RewriteStrSettings::default()
},
)?)
}
}
/// Appends a "View on site" link pointing at the content's own URL.
struct AddOutlink;

#[async_trait]
impl Transformer for AddOutlink {
    /// Runs only when there is a URL whose scheme starts with `http` and the document does not
    /// already contain that URL.
    fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
        link.as_ref().map_or(false, |url| {
            url.scheme().starts_with("http") && !html.contains(url.as_str())
        })
    }

    /// Returns `html` followed by the outlink, or `html` unchanged when there is no URL.
    async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
        let with_outlink = match link {
            // The raw string is reproduced exactly, including its leading and trailing newline.
            Some(url) => format!(
                r#"
{html}
<div><a href="{url}">View on site</a></div>
"#
            ),
            None => html.to_string(),
        };
        Ok(with_outlink)
    }
}
/// Replaces a document with fragments selected from the page it links to, for sites that have
/// selectors configured.
struct SlurpContents<'h> {
    /// Shared cache of fetched page bodies, keyed by URL.
    cacher: Arc<Mutex<FilesystemCacher>>,
    /// Selectors to extract, keyed by a string that must occur in the link's host name.
    site_selectors: &'h HashMap<String, Vec<DeSelector>>,
}

impl<'h> SlurpContents<'h> {
    /// Returns the selectors of an entry whose key is a substring of `link`'s host, or `None`
    /// when the link has no host or nothing matches.
    ///
    /// If several keys match, which entry wins depends on the map's iteration order.
    fn get_selectors(&self, link: &Url) -> Option<&[DeSelector]> {
        let host = link.host_str()?;
        self.site_selectors
            .iter()
            .find(|(site, _)| host.contains(site.as_str()))
            .map(|(_, selectors)| selectors.as_slice())
    }
}
#[async_trait]
impl<'h> Transformer for SlurpContents<'h> {
fn should_run(&self, link: &Option<Url>, _: &str) -> bool {
if let Some(link) = link {
return self.get_selectors(link).is_some();
}
false
}
async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
let Some(link) = link else {
return Ok(html.to_string());
};
let Some(selectors) = self.get_selectors(&link) else {
return Ok(html.to_string());
};
let mut cacher = self.cacher.lock().await;
let body = if let Some(body) = cacher.get(link.as_str()) {
info!("cache hit for {link}");
String::from_utf8_lossy(&body).to_string()
} else {
let body = reqwest::get(link.as_str()).await?.text().await?;
cacher.set(link.as_str(), body.as_bytes());
body
};
let doc = Html::parse_document(&body);
let mut results = Vec::new();
for selector in selectors {
for frag in doc.select(&selector.0) {
results.push(frag.html())
// TODO: figure out how to warn if there were no hits
//warn!("couldn't find '{:?}' in {}", selector, link);
}
}
Ok(results.join(""))
}
}
/// Wraps every URL found in `text` in an `<a>` element and returns the result.
///
/// URLs without a scheme (for example `example.com/page`) are detected too and get an `http://`
/// prefix in the generated `href`; URLs that already carry a scheme, whatever it is, are linked
/// as written. All other text is passed through unchanged.
pub fn linkify_html(text: &str) -> String {
    /// True when `candidate` begins with `<scheme>://`, where the scheme follows the URI
    /// grammar: a letter followed by letters, digits, `+`, `-` or `.`.
    fn has_scheme(candidate: &str) -> bool {
        candidate.split_once("://").map_or(false, |(scheme, _)| {
            let mut chars = scheme.chars();
            chars.next().map_or(false, |c| c.is_ascii_alphabetic())
                && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
        })
    }

    let mut finder = LinkFinder::new();
    let finder = finder.url_must_have_scheme(false).kinds(&[LinkKind::Url]);
    let mut out = String::with_capacity(text.len());
    for span in finder.spans(text) {
        let fragment = span.as_str();
        match span.kind() {
            // Wrap in anchor tag
            Some(LinkKind::Url) => {
                let scheme = if has_scheme(fragment) { "" } else { "http://" };
                out.push_str(&format!(
                    r#"<a href="{scheme}{fragment}">{fragment}</a>"#
                ));
            }
            // Plain text. Other link kinds are not requested from the finder, but if one ever
            // shows up it is emitted as-is rather than aborting.
            _ => out.push_str(fragment),
        }
    }
    out
}
/// Cleans untrusted HTML so it can be displayed.
///
/// `html` contains the content to be cleaned, `cid_prefix` is used to resolve mixed-part image
/// references (it replaces the `cid:` scheme of image sources), and `base_url`, when present,
/// is used to make relative links and image sources absolute.
///
/// The work happens in three passes:
/// 1. `<style>` blocks are inlined into `style` attributes without loading remote stylesheets.
///    If inlining fails the error is logged and the input is used as-is.
/// 2. Elements are rewritten: links open in a new tab, `cid:` and `http:` image sources are
///    redirected, protocol-relative `//host` references get `https:`, and relative URLs are
///    resolved against `base_url`.
/// 3. The result is filtered through an allowlist of tags and attributes.
///
/// # Errors
///
/// Returns [`TransformError::RewritingError`] if the rewrite pass fails.
pub fn sanitize_html(
    html: &str,
    cid_prefix: &str,
    base_url: &Option<Url>,
) -> Result<String, TransformError> {
    /// Returns `value` with its leading scheme `from` (such as `"http:"`) replaced by `to`, or
    /// `None` when `value` does not start with that scheme. Only the prefix is touched, so the
    /// same text appearing later in the URL (for example in a query string) is left alone.
    /// Schemes are compared case-insensitively.
    fn swap_scheme(value: &str, from: &str, to: &str) -> Option<String> {
        let value = value.trim_start();
        let head = value.get(..from.len())?;
        head.eq_ignore_ascii_case(from)
            .then(|| format!("{to}{}", &value[from.len()..]))
    }

    let inline_opts = InlineOptions {
        inline_style_tags: true,
        keep_style_tags: true,
        keep_link_tags: false,
        base_url: None,
        load_remote_stylesheets: false,
        extra_css: None,
        preallocate_node_capacity: 32,
        ..InlineOptions::default()
    };
    let html = CSSInliner::new(inline_opts)
        .inline(html)
        .unwrap_or_else(|err| {
            error!("failed to inline CSS: {err}");
            html.to_string()
        });

    // Handlers for the same element run in the order they are listed here.
    let mut element_content_handlers = vec![
        // Open links in new tab
        element!("a[href]", |el| {
            el.set_attribute("target", "_blank")?;
            Ok(())
        }),
        // Replace mixed part CID images with URL
        element!("img[src]", |el| {
            let src = el.get_attribute("src").expect("src was required");
            if let Some(src) = swap_scheme(&src, "cid:", cid_prefix) {
                el.set_attribute("src", &src)?;
            }
            Ok(())
        }),
        // Only secure image URLs
        element!("img[src]", |el| {
            let src = el.get_attribute("src").expect("src was required");
            if let Some(src) = swap_scheme(&src, "http:", "https:") {
                el.set_attribute("src", &src)?;
            }
            Ok(())
        }),
        // Add https to href with //<domain name>
        element!("link[href]", |el| {
            info!("found link[href] {el:?}");
            let mut href = el.get_attribute("href").expect("href was required");
            if href.starts_with("//") {
                warn!("adding https to {href}");
                href.insert_str(0, "https:");
            }
            el.set_attribute("href", &href)?;
            Ok(())
        }),
        // Add https to src with //<domain name>
        element!("style[src]", |el| {
            let mut src = el.get_attribute("src").expect("src was required");
            if src.starts_with("//") {
                src.insert_str(0, "https:");
            }
            el.set_attribute("src", &src)?;
            Ok(())
        }),
    ];
    if let Some(base_url) = base_url {
        element_content_handlers.extend(vec![
            // Make links with relative URLs absolute
            element!("a[href]", |el| {
                if let Some(Ok(href)) = el.get_attribute("href").map(|href| base_url.join(&href)) {
                    el.set_attribute("href", href.as_str())?;
                }
                Ok(())
            }),
            // Make images with relative srcs absolute
            element!("img[src]", |el| {
                if let Some(Ok(src)) = el.get_attribute("src").map(|src| base_url.join(&src)) {
                    el.set_attribute("src", src.as_str())?;
                }
                Ok(())
            }),
        ]);
    }
    let html = rewrite_str(
        &html,
        RewriteStrSettings {
            element_content_handlers,
            ..RewriteStrSettings::default()
        },
    )?;

    // Defaults don't allow style, but we want to preserve that.
    // TODO: remove 'class' if rendering mails moves to a two phase process where abstract message
    // types are collected, sanitized, and then grouped together as one big HTML doc
    let attributes = hashset![
        "align", "bgcolor", "class", "color", "height", "lang", "title", "width", "style",
    ];
    let tags = hashset![
        "a",
        "abbr",
        "acronym",
        "area",
        "article",
        "aside",
        "b",
        "bdi",
        "bdo",
        "blockquote",
        "br",
        "caption",
        "center",
        "cite",
        "code",
        "col",
        "colgroup",
        "data",
        "dd",
        "del",
        "details",
        "dfn",
        "div",
        "dl",
        "dt",
        "em",
        "figcaption",
        "figure",
        "footer",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "hgroup",
        "hr",
        "i",
        "iframe", // wathiede
        "img",
        "ins",
        "kbd",
        "li",
        "map",
        "mark",
        "nav",
        "noscript", // wathiede
        "ol",
        "p",
        "pre",
        "q",
        "rp",
        "rt",
        "rtc",
        "ruby",
        "s",
        "samp",
        "small",
        "span",
        "strike",
        "strong",
        "sub",
        "summary",
        "sup",
        "table",
        "tbody",
        "td",
        "th",
        "thead",
        "time",
        "title", // wathiede
        "tr",
        "tt",
        "u",
        "ul",
        "var",
        "wbr",
    ];
    let tag_attributes = hashmap![
        "a" => hashset![
            "href", "hreflang", "target",
        ],
        "bdo" => hashset![
            "dir"
        ],
        "blockquote" => hashset![
            "cite"
        ],
        "col" => hashset![
            "align", "char", "charoff", "span"
        ],
        "colgroup" => hashset![
            "align", "char", "charoff", "span"
        ],
        "del" => hashset![
            "cite", "datetime"
        ],
        "hr" => hashset![
            "align", "size", "width"
        ],
        "iframe" => hashset![
            "src", "allow", "allowfullscreen"
        ],
        "img" => hashset![
            "align", "alt", "height", "src", "width"
        ],
        "ins" => hashset![
            "cite", "datetime"
        ],
        "ol" => hashset![
            "start"
        ],
        "q" => hashset![
            "cite"
        ],
        "table" => hashset![
            "align", "border", "cellpadding", "cellspacing", "char", "charoff", "summary",
        ],
        "tbody" => hashset![
            "align", "char", "charoff"
        ],
        "td" => hashset![
            "align", "char", "charoff", "colspan", "headers", "rowspan"
        ],
        "tfoot" => hashset![
            "align", "char", "charoff"
        ],
        "th" => hashset![
            "align", "char", "charoff", "colspan", "headers", "rowspan", "scope"
        ],
        "thead" => hashset![
            "align", "char", "charoff"
        ],
        "tr" => hashset![
            "align", "char", "charoff"
        ],
    ];
    let html = ammonia::Builder::default()
        .tags(tags)
        .tag_attributes(tag_attributes)
        .generic_attributes(attributes)
        .clean(&html)
        .to_string();
    Ok(html)
}
/// Translates cursor-pagination arguments into an `(offset, limit)` pair.
///
/// Cursors are row indices. `after` is exclusive, so paging forward starts at `after + 1`.
/// `before` is exclusive too, so paging backward covers the rows that end just below it. When
/// neither `first` nor `last` is given, a page holds 100 rows.
///
/// Paging backward near the start of the list shrinks the limit as well as clamping the offset,
/// so the returned window never reaches `before` or anything after it. Cursor arithmetic
/// saturates instead of overflowing.
///
/// # Panics
///
/// Panics on argument combinations that do not describe a single direction: `last` without
/// `before`, `first` together with `last` or `before`, `after` together with `before` or
/// `last`.
fn compute_offset_limit(
    after: Option<i32>,
    before: Option<i32>,
    first: Option<i32>,
    last: Option<i32>,
) -> (i32, i32) {
    /// Window of at most `size` rows ending immediately below `before`.
    fn page_ending_before(before: i32, size: i32) -> (i32, i32) {
        let offset = before.saturating_sub(size).max(0);
        // Fewer than `size` rows exist below the cursor once the offset has been clamped.
        let limit = size.min(before.max(0));
        (offset, limit)
    }

    let default_page_size = 100;
    match (after, before, first, last) {
        // Reasonable defaults
        (None, None, None, None) => (0, default_page_size),
        (None, None, Some(first), None) => (0, first),
        (Some(after), None, None, None) => (after.saturating_add(1), default_page_size),
        (Some(after), None, Some(first), None) => (after.saturating_add(1), first),
        (None, Some(before), None, None) => page_ending_before(before, default_page_size),
        (None, Some(before), None, Some(last)) => page_ending_before(before, last),
        (None, None, None, Some(_)) => {
            panic!("specifying last and no before doesn't make sense")
        }
        (None, None, Some(_), Some(_)) => {
            panic!("specifying first and last doesn't make sense")
        }
        (None, Some(_), Some(_), _) => {
            panic!("specifying before and first doesn't make sense")
        }
        (Some(_), Some(_), _, _) => {
            panic!("specifying after and before doesn't make sense")
        }
        (Some(_), None, None, Some(_)) => {
            panic!("specifying after and last doesn't make sense")
        }
        (Some(_), None, Some(_), Some(_)) => {
            panic!("specifying after, first and last doesn't make sense")
        }
    }
}
/// A parsed search query.
#[derive(Debug)]
pub struct Query {
    /// Restrict results to unread items (`is:unread`).
    pub unread_only: bool,
    /// Value of a `tag:<name>` term, if one was given.
    pub tag: Option<String>,
    /// Thread identifier, if one was given.
    pub uid: Option<String>,
    /// Terms that were not recognised, in their original order.
    pub remainder: Vec<String>,
    /// Whether the notmuch corpus should be searched.
    pub is_notmuch: bool,
    /// Whether the newsreader corpus should be searched.
    pub is_newsreader: bool,
}

impl Query {
    /// Renders the query as a notmuch search string: the recognised filters followed by the
    /// unrecognised terms, joined by single spaces.
    ///
    /// Returns an empty string when the notmuch corpus is not selected.
    fn to_notmuch(&self) -> String {
        if !self.is_notmuch {
            return String::new();
        }
        let unread = self.unread_only.then(|| "is:unread".to_string());
        let tag = self.tag.as_ref().map(|name| format!("tag:{name}"));
        unread
            .into_iter()
            .chain(tag)
            .chain(self.uid.clone())
            .chain(self.remainder.iter().cloned())
            .collect::<Vec<_>>()
            .join(" ")
    }
}
impl FromStr for Query {
    type Err = Infallible;

    /// Parses a whitespace-separated query string. Parsing never fails: anything that is not a
    /// recognised filter ends up in `remainder`.
    ///
    /// Recognised terms, checked in this order:
    /// - `is:unread` sets `unread_only`
    /// - `tag:<name>` sets `tag` (a later occurrence overrides an earlier one)
    /// - anything `is_newsreader_thread` accepts sets `uid`
    /// - `is:mail`, `is:email`, `is:notmuch` select the notmuch corpus
    /// - `is:news`, `is:newsreader` select the newsreader corpus
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let mut query = Query {
            unread_only: false,
            tag: None,
            uid: None,
            remainder: Vec::new(),
            is_notmuch: false,
            is_newsreader: false,
        };
        for token in s.split_whitespace() {
            if token == "is:unread" {
                query.unread_only = true;
            } else if let Some(name) = token.strip_prefix("tag:") {
                query.tag = Some(name.to_string());
            } else if is_newsreader_thread(token) {
                query.uid = Some(extract_thread_id(token).to_string());
            } else {
                match token {
                    "is:mail" | "is:email" | "is:notmuch" => query.is_notmuch = true,
                    "is:news" | "is:newsreader" => query.is_newsreader = true,
                    other => query.remainder.push(other.to_string()),
                }
            }
        }
        // If we don't see any explicit filters for a corpus, flip them all on
        if !query.is_notmuch && !query.is_newsreader {
            query.is_notmuch = true;
            query.is_newsreader = true;
        }
        Ok(query)
    }
}