865 lines
26 KiB
Rust
865 lines
26 KiB
Rust
pub mod config;
|
|
pub mod error;
|
|
pub mod graphql;
|
|
pub mod mail;
|
|
pub mod newsreader;
|
|
pub mod nm;
|
|
#[cfg(feature = "tantivy")]
|
|
pub mod tantivy;
|
|
|
|
use std::{collections::HashMap, convert::Infallible, fmt, str::FromStr, sync::Arc};
|
|
|
|
use async_trait::async_trait;
|
|
use cacher::{Cacher, FilesystemCacher};
|
|
use css_inline::{CSSInliner, InlineError, InlineOptions};
|
|
use linkify::{LinkFinder, LinkKind};
|
|
use log::{debug, error, info, warn};
|
|
use lol_html::{
|
|
element, errors::RewritingError, html_content::ContentType, rewrite_str, text,
|
|
RewriteStrSettings,
|
|
};
|
|
use maplit::{hashmap, hashset};
|
|
use regex::Regex;
|
|
use scraper::{Html, Selector};
|
|
use sqlx::types::time::PrimitiveDateTime;
|
|
use thiserror::Error;
|
|
use url::Url;
|
|
|
|
use crate::{
|
|
error::ServerError,
|
|
graphql::{Corpus, ThreadSummary},
|
|
newsreader::is_newsreader_thread,
|
|
nm::is_notmuch_thread_or_id,
|
|
};
|
|
|
|
/// Tag prefix used to present newsreader sites as notmuch-style tags ("News/<site>").
// `'static` is implied on const string slices; the explicit lifetime was redundant.
const NEWSREADER_TAG_PREFIX: &str = "News/";
/// Prefix that marks a thread id as belonging to the newsreader corpus.
const NEWSREADER_THREAD_PREFIX: &str = "news:";
|
|
|
|
// TODO: figure out how to use Cow
/// One step in an HTML post-processing pipeline.
///
/// Implementations receive the (optional) source URL of the content and the
/// HTML text, and return a transformed copy of the HTML.
#[async_trait]
trait Transformer: Send + Sync {
    /// Cheap pre-check: return `false` to skip `transform` entirely.
    /// The default implementation always runs the transform.
    fn should_run(&self, _addr: &Option<Url>, _html: &str) -> bool {
        true
    }
    // TODO: should html be something like `html_escape` uses:
    // <S: ?Sized + AsRef<str>>(text: &S) -> Cow<str>
    /// Produce a transformed copy of `html`. `addr` is the URL the content
    /// came from (when known); transformers may use it for link resolution.
    async fn transform(&self, addr: &Option<Url>, html: &str) -> Result<String, TransformError>;
}
|
|
|
|
// TODO: how would we make this more generic to allow good implementations of Transformer outside
// of this module?
/// Errors that can occur while running a [`Transformer`] pipeline step.
#[derive(Error, Debug)]
pub enum TransformError {
    /// HTML rewriting via `lol_html` failed.
    #[error("lol-html rewrite error: {0}")]
    RewritingError(#[from] RewritingError),
    /// CSS inlining via `css-inline` failed.
    #[error("css inline error: {0}")]
    InlineError(#[from] InlineError),
    /// Fetching a remote resource over HTTP failed.
    #[error("failed to fetch url error: {0}")]
    ReqwestError(#[from] reqwest::Error),
    /// A fetched document could not be parsed as HTML.
    #[error("failed to parse HTML: {0}")]
    HtmlParsingError(String),
}
|
|
|
|
/// Transformer wrapper around the module-level [`sanitize_html`] function.
struct SanitizeHtml<'a> {
    // Prefix substituted for `cid:` image references (MIME multipart parts).
    cid_prefix: &'a str,
    // Base URL used to absolutize relative links/images, when known.
    base_url: &'a Option<Url>,
}
|
|
|
|
#[async_trait]
|
|
impl<'a> Transformer for SanitizeHtml<'a> {
|
|
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
|
Ok(sanitize_html(html, self.cid_prefix, self.base_url)?)
|
|
}
|
|
}
|
|
|
|
/// Decodes HTML entities (e.g. `&amp;` -> `&`) into literal characters.
/// NOTE(review): despite the name, the transform *decodes* entities rather
/// than escaping them — consider renaming in a follow-up.
struct EscapeHtml;
|
|
|
|
#[async_trait]
|
|
impl Transformer for EscapeHtml {
|
|
fn should_run(&self, _: &Option<Url>, html: &str) -> bool {
|
|
html.contains("&")
|
|
}
|
|
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
|
Ok(html_escape::decode_html_entities(html).to_string())
|
|
}
|
|
}
|
|
|
|
/// Removes all markup, keeping only text content with whitespace runs
/// collapsed to single spaces.
struct StripHtml;
|
|
|
|
#[async_trait]
|
|
impl Transformer for StripHtml {
|
|
fn should_run(&self, _: &Option<Url>, html: &str) -> bool {
|
|
// Lame test
|
|
html.contains("<")
|
|
}
|
|
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
|
let mut text = String::new();
|
|
let element_content_handlers = vec![text!("*", |t| {
|
|
text += t.as_str();
|
|
Ok(())
|
|
})];
|
|
let _ = rewrite_str(
|
|
html,
|
|
RewriteStrSettings {
|
|
element_content_handlers,
|
|
..RewriteStrSettings::default()
|
|
},
|
|
)?;
|
|
let re = Regex::new(r"\s+").expect("failed to parse regex");
|
|
let text = re.replace_all(&text, " ").to_string();
|
|
|
|
Ok(text)
|
|
}
|
|
}
|
|
|
|
/// Inlines stylesheets — including remote ones — resolving relative
/// references against a base URL.
struct InlineRemoteStyle<'a> {
    // Base URL used to resolve relative stylesheet references.
    base_url: &'a Option<Url>,
}
|
|
|
|
#[async_trait]
|
|
impl<'a> Transformer for InlineRemoteStyle<'a> {
|
|
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
|
//info!("HTML:\n{html}");
|
|
Ok(
|
|
match CSSInliner::options()
|
|
.base_url(self.base_url.clone())
|
|
.build()
|
|
.inline(&html)
|
|
{
|
|
Ok(inlined_html) => inlined_html,
|
|
Err(err) => {
|
|
error!("failed to inline remote CSS: {err}");
|
|
html.to_string()
|
|
}
|
|
},
|
|
)
|
|
}
|
|
}
|
|
/// Inlines `<style>` tags plus the bundled default stylesheet into element
/// `style` attributes; no base URL is used.
struct InlineStyle;
|
|
|
|
#[async_trait]
|
|
impl Transformer for InlineStyle {
|
|
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
|
let css = concat!(
|
|
"/* chrome-default.css */\n",
|
|
include_str!("chrome-default.css"),
|
|
//"\n/* mvp.css */\n",
|
|
//include_str!("mvp.css"),
|
|
//"\n/* Xinu Specific overrides */\n",
|
|
//include_str!("custom.css"),
|
|
);
|
|
let inline_opts = InlineOptions {
|
|
inline_style_tags: true,
|
|
keep_style_tags: false,
|
|
keep_link_tags: true,
|
|
base_url: None,
|
|
load_remote_stylesheets: true,
|
|
extra_css: Some(css.into()),
|
|
preallocate_node_capacity: 32,
|
|
..InlineOptions::default()
|
|
};
|
|
|
|
//info!("HTML:\n{html}");
|
|
Ok(match CSSInliner::new(inline_opts).inline(&html) {
|
|
Ok(inlined_html) => inlined_html,
|
|
Err(err) => {
|
|
error!("failed to inline CSS: {err}");
|
|
html.to_string()
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
/// Process images will extract any alt or title tags on images and place them as labels below said
/// image. It also handles data-src and data-cfsrc attributes (lazy-load /
/// Cloudflare deferred images) by promoting them to a real `src`.
struct FrameImages;
|
|
|
|
#[async_trait]
|
|
impl Transformer for FrameImages {
|
|
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
|
Ok(rewrite_str(
|
|
html,
|
|
RewriteStrSettings {
|
|
element_content_handlers: vec![
|
|
element!("img[data-src]", |el| {
|
|
let src = el
|
|
.get_attribute("data-src")
|
|
.unwrap_or("https://placehold.co/600x400".to_string());
|
|
el.set_attribute("src", &src)?;
|
|
|
|
Ok(())
|
|
}),
|
|
element!("img[data-cfsrc]", |el| {
|
|
let src = el
|
|
.get_attribute("data-cfsrc")
|
|
.unwrap_or("https://placehold.co/600x400".to_string());
|
|
el.set_attribute("src", &src)?;
|
|
|
|
Ok(())
|
|
}),
|
|
element!("img[alt], img[title]", |el| {
|
|
let src = el
|
|
.get_attribute("src")
|
|
.unwrap_or("https://placehold.co/600x400".to_string());
|
|
let alt = el.get_attribute("alt");
|
|
let title = el.get_attribute("title");
|
|
let mut frags =
|
|
vec!["<figure>".to_string(), format!(r#"<img src="{src}">"#)];
|
|
alt.map(|t| {
|
|
if !t.is_empty() {
|
|
frags.push(format!("<figcaption>Alt: {t}</figcaption>"))
|
|
}
|
|
});
|
|
title.map(|t| {
|
|
if !t.is_empty() {
|
|
frags.push(format!("<figcaption>Title: {t}</figcaption>"))
|
|
}
|
|
});
|
|
frags.push("</figure>".to_string());
|
|
el.replace(&frags.join("\n"), ContentType::Html);
|
|
|
|
Ok(())
|
|
}),
|
|
],
|
|
..RewriteStrSettings::default()
|
|
},
|
|
)?)
|
|
}
|
|
}
|
|
/// Appends a "View on site" link pointing back at the content's source URL.
struct AddOutlink;
|
|
|
|
#[async_trait]
impl Transformer for AddOutlink {
    /// Run only for http(s) links whose URL text is not already present in
    /// the HTML (avoids duplicating an existing outlink).
    fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
        if let Some(link) = link {
            link.scheme().starts_with("http") && !html.contains(link.as_str())
        } else {
            false
        }
    }
    /// Appends a "View on site" anchor after the content; a missing link
    /// passes the HTML through unchanged.
    async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
        if let Some(link) = link {
            Ok(format!(
                r#"
{html}
<div><a href="{}">View on site</a></div>
"#,
                link
            ))
        } else {
            Ok(html.to_string())
        }
    }
}
|
|
|
|
/// Fetches the full article body for a link and extracts configured CSS
/// selector fragments, with optional CSS inlining.
struct SlurpContents<'c> {
    // Filesystem cache for fetched page bodies, keyed by URL.
    cacher: &'c FilesystemCacher,
    // When true, inline CSS into the fetched document before extraction.
    inline_css: bool,
    // Map of host substring -> selectors to extract for that site.
    site_selectors: HashMap<String, Vec<Selector>>,
}
|
|
|
|
impl<'c> SlurpContents<'c> {
|
|
fn get_selectors(&self, link: &Url) -> Option<&[Selector]> {
|
|
for (host, selector) in self.site_selectors.iter() {
|
|
if link.host_str().map(|h| h.contains(host)).unwrap_or(false) {
|
|
return Some(&selector);
|
|
}
|
|
}
|
|
None
|
|
}
|
|
}
|
|
|
|
#[async_trait]
|
|
impl<'c> Transformer for SlurpContents<'c> {
|
|
fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
|
|
let mut will_slurp = false;
|
|
if let Some(link) = link {
|
|
will_slurp = self.get_selectors(link).is_some();
|
|
}
|
|
if !will_slurp && self.inline_css {
|
|
return InlineStyle {}.should_run(link, html);
|
|
}
|
|
will_slurp
|
|
}
|
|
async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
|
if let Some(test_link) = link {
|
|
// If SlurpContents is configured for inline CSS, but no
|
|
// configuration found for this site, use the local InlineStyle
|
|
// transform.
|
|
if self.inline_css && self.get_selectors(test_link).is_none() {
|
|
debug!("local inline CSS for {link:?}");
|
|
return InlineStyle {}.transform(link, html).await;
|
|
}
|
|
}
|
|
let Some(link) = link else {
|
|
return Ok(html.to_string());
|
|
};
|
|
let Some(selectors) = self.get_selectors(&link) else {
|
|
return Ok(html.to_string());
|
|
};
|
|
let cacher = self.cacher;
|
|
let body = if let Some(body) = cacher.get(link.as_str()) {
|
|
String::from_utf8_lossy(&body).to_string()
|
|
} else {
|
|
let body = reqwest::get(link.as_str()).await?.text().await?;
|
|
cacher.set(link.as_str(), body.as_bytes());
|
|
body
|
|
};
|
|
let body = Arc::new(body);
|
|
let base_url = Some(link.clone());
|
|
let body = if self.inline_css {
|
|
debug!("inlining CSS for {link}");
|
|
let inner_body = Arc::clone(&body);
|
|
let res = tokio::task::spawn_blocking(move || {
|
|
let css = concat!(
|
|
"/* chrome-default.css */\n",
|
|
include_str!("chrome-default.css"),
|
|
"\n/* vars.css */\n",
|
|
include_str!("../../web/static/vars.css"),
|
|
//"\n/* Xinu Specific overrides */\n",
|
|
//include_str!("custom.css"),
|
|
);
|
|
let res = CSSInliner::options()
|
|
.base_url(base_url)
|
|
.extra_css(Some(std::borrow::Cow::Borrowed(css)))
|
|
.build()
|
|
.inline(&inner_body);
|
|
|
|
match res {
|
|
Ok(inlined_html) => inlined_html,
|
|
Err(err) => {
|
|
error!("failed to inline remote CSS: {err}");
|
|
Arc::into_inner(inner_body).expect("failed to take body out of Arc")
|
|
}
|
|
}
|
|
})
|
|
.await;
|
|
match res {
|
|
Ok(inlined_html) => inlined_html,
|
|
Err(err) => {
|
|
error!("failed to spawn inline remote CSS: {err}");
|
|
Arc::into_inner(body).expect("failed to take body out of Arc")
|
|
}
|
|
}
|
|
} else {
|
|
debug!("using body as-is for {link:?}");
|
|
Arc::into_inner(body).expect("failed to take body out of Arc")
|
|
};
|
|
|
|
let doc = Html::parse_document(&body);
|
|
|
|
let mut results = Vec::new();
|
|
for selector in selectors {
|
|
for frag in doc.select(&selector) {
|
|
results.push(frag.html())
|
|
// TODO: figure out how to warn if there were no hits
|
|
//warn!("couldn't find '{:?}' in {}", selector, link);
|
|
}
|
|
}
|
|
Ok(results.join("<br>"))
|
|
}
|
|
}
|
|
|
|
pub fn linkify_html(text: &str) -> String {
|
|
let mut finder = LinkFinder::new();
|
|
let finder = finder.url_must_have_scheme(false).kinds(&[LinkKind::Url]);
|
|
let mut parts = Vec::new();
|
|
for span in finder.spans(text) {
|
|
// TODO(wathiede): use Cow<str>?
|
|
match span.kind() {
|
|
// Text as-is
|
|
None => parts.push(span.as_str().to_string()),
|
|
// Wrap in anchor tag
|
|
Some(LinkKind::Url) => {
|
|
let text = span.as_str();
|
|
let schema = if text.starts_with("http") {
|
|
""
|
|
} else {
|
|
"http://"
|
|
};
|
|
let a = format!(r#"<a href="{schema}{0}">{0}</a>"#, text);
|
|
parts.push(a);
|
|
}
|
|
_ => todo!("unhandled kind: {:?}", span.kind().unwrap()),
|
|
}
|
|
}
|
|
parts.join("")
|
|
}
|
|
|
|
// html contains the content to be cleaned, and cid_prefix is used to resolve mixed part image
// referrences
/// Sanitizes untrusted HTML for display.
///
/// Pipeline: (1) inline `<style>` blocks into element `style` attributes
/// (no network fetches), (2) rewrite links and images — open links in a new
/// tab, resolve `cid:` references via `cid_prefix`, upgrade http to https,
/// and absolutize relative URLs when `base_url` is given — then (3) clean
/// with ammonia using the allow-lists below.
pub fn sanitize_html(
    html: &str,
    cid_prefix: &str,
    base_url: &Option<Url>,
) -> Result<String, TransformError> {
    // Local-only inlining: no remote stylesheet fetches, no extra CSS.
    let inline_opts = InlineOptions {
        inline_style_tags: true,
        keep_style_tags: true,
        keep_link_tags: false,
        base_url: None,
        load_remote_stylesheets: false,
        extra_css: None,
        preallocate_node_capacity: 32,
        ..InlineOptions::default()
    };

    // Inlining failure is non-fatal: log and continue with the original.
    let html = match CSSInliner::new(inline_opts).inline(&html) {
        Ok(inlined_html) => inlined_html,
        Err(err) => {
            error!("failed to inline CSS: {err}");
            html.to_string()
        }
    };
    let mut element_content_handlers = vec![
        // Open links in new tab
        element!("a[href]", |el| {
            el.set_attribute("target", "_blank").unwrap();

            Ok(())
        }),
        // Replace mixed part CID images with URL
        element!("img[src]", |el| {
            let src = el
                .get_attribute("src")
                .expect("src was required")
                .replace("cid:", cid_prefix);

            el.set_attribute("src", &src)?;

            Ok(())
        }),
        // Only secure image URLs
        // NOTE(review): replaces "http:" anywhere in the URL string, not just
        // the scheme prefix — confirm this is intended.
        element!("img[src]", |el| {
            let src = el
                .get_attribute("src")
                .expect("src was required")
                .replace("http:", "https:");

            el.set_attribute("src", &src)?;

            Ok(())
        }),
        // Add https to href with //<domain name>
        element!("link[href]", |el| {
            info!("found link[href] {el:?}");
            let mut href = el.get_attribute("href").expect("href was required");
            if href.starts_with("//") {
                warn!("adding https to {href}");
                href.insert_str(0, "https:");
            }

            el.set_attribute("href", &href)?;

            Ok(())
        }),
        // Add https to src with //<domain name>
        // NOTE(review): <style> elements have no src attribute in HTML — was
        // <script> or <link> intended here? Confirm.
        element!("style[src]", |el| {
            let mut src = el.get_attribute("src").expect("src was required");
            if src.starts_with("//") {
                src.insert_str(0, "https:");
            }

            el.set_attribute("src", &src)?;

            Ok(())
        }),
    ];
    if let Some(base_url) = base_url {
        element_content_handlers.extend(vec![
            // Make links with relative URLs absolute
            element!("a[href]", |el| {
                if let Some(Ok(href)) = el.get_attribute("href").map(|href| base_url.join(&href)) {
                    el.set_attribute("href", &href.as_str()).unwrap();
                }

                Ok(())
            }),
            // Make images with relative srcs absolute
            element!("img[src]", |el| {
                if let Some(Ok(src)) = el.get_attribute("src").map(|src| base_url.join(&src)) {
                    el.set_attribute("src", &src.as_str()).unwrap();
                }

                Ok(())
            }),
        ]);
    }
    let html = rewrite_str(
        &html,
        RewriteStrSettings {
            element_content_handlers,
            ..RewriteStrSettings::default()
        },
    )?;
    // Default's don't allow style, but we want to preserve that.
    // TODO: remove 'class' if rendering mails moves to a two phase process where abstract message
    // types are collected, santized, and then grouped together as one big HTML doc
    // Attributes allowed on every tag.
    let attributes = hashset![
        "align", "bgcolor", "class", "color", "height", "lang", "title", "width", "style",
    ];

    // Tag allow-list. Entries marked "wathiede" are additions beyond
    // ammonia's defaults. ("kbd" appears twice; the hashset dedups it.)
    let tags = hashset![
        "a",
        "abbr",
        "acronym",
        "area",
        "article",
        "aside",
        "b",
        "bdi",
        "bdo",
        "blockquote",
        "br",
        "caption",
        "center",
        "cite",
        "code",
        "col",
        "colgroup",
        "data",
        "dd",
        "del",
        "details",
        "dfn",
        "div",
        "dl",
        "dt",
        "em",
        "figcaption",
        "figure",
        "footer",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "hgroup",
        "hr",
        "i",
        "iframe", // wathiede
        "img",
        "ins",
        "kbd",
        "kbd",
        "li",
        "map",
        "mark",
        "nav",
        "noscript", // wathiede
        "ol",
        "p",
        "pre",
        "q",
        "rp",
        "rt",
        "rtc",
        "ruby",
        "s",
        "samp",
        "small",
        "span",
        "strike",
        "strong",
        "sub",
        "summary",
        "sup",
        "table",
        "tbody",
        "td",
        "th",
        "thead",
        "time",
        "title", // wathiede
        "tr",
        "tt",
        "u",
        "ul",
        "var",
        "wbr",
    ];
    // Per-tag attribute allow-list (in addition to `attributes` above).
    let tag_attributes = hashmap![
        "a" => hashset![
            "href", "hreflang", "target",
        ],
        "bdo" => hashset![
            "dir"
        ],
        "blockquote" => hashset![
            "cite"
        ],
        "col" => hashset![
            "align", "char", "charoff", "span"
        ],
        "colgroup" => hashset![
            "align", "char", "charoff", "span"
        ],
        "del" => hashset![
            "cite", "datetime"
        ],
        "hr" => hashset![
            "align", "size", "width"
        ],
        "iframe" => hashset![
            "src", "allow", "allowfullscreen"
        ],
        "img" => hashset![
            "align", "alt", "height", "src", "width"
        ],
        "ins" => hashset![
            "cite", "datetime"
        ],
        "ol" => hashset![
            "start"
        ],
        "q" => hashset![
            "cite"
        ],
        "table" => hashset![
            "align", "border", "cellpadding", "cellspacing", "char", "charoff", "summary",
        ],
        "tbody" => hashset![
            "align", "char", "charoff"
        ],
        "td" => hashset![
            "align", "char", "charoff", "colspan", "headers", "rowspan"
        ],
        "tfoot" => hashset![
            "align", "char", "charoff"
        ],
        "th" => hashset![
            "align", "char", "charoff", "colspan", "headers", "rowspan", "scope"
        ],
        "thead" => hashset![
            "align", "char", "charoff"
        ],
        "tr" => hashset![
            "align", "char", "charoff"
        ],
    ];

    // Final pass: ammonia strips anything not on the allow-lists above.
    let html = ammonia::Builder::default()
        .tags(tags)
        .tag_attributes(tag_attributes)
        .generic_attributes(attributes)
        .clean(&html)
        .to_string();

    Ok(html)
}
|
|
|
|
/// Translates GraphQL-style cursor pagination arguments (`after`/`before`/
/// `first`/`last`) into an `(offset, limit)` pair, panicking on nonsensical
/// combinations.
fn compute_offset_limit(
    after: Option<i32>,
    before: Option<i32>,
    first: Option<i32>,
    last: Option<i32>,
) -> (i32, i32) {
    const DEFAULT_PAGE_SIZE: i32 = 100;
    match (after, before, first, last) {
        // Reasonable defaults
        (None, None, None, None) => (0, DEFAULT_PAGE_SIZE),
        (None, None, Some(first), None) => (0, first),
        (Some(after), None, None, None) => (after + 1, DEFAULT_PAGE_SIZE),
        (Some(after), None, Some(first), None) => (after + 1, first),
        // Page backwards from `before`, clamping the offset at zero.
        (None, Some(before), None, None) => {
            ((before - DEFAULT_PAGE_SIZE).max(0), DEFAULT_PAGE_SIZE)
        }
        (None, Some(before), None, Some(last)) => ((before - last).max(0), last),
        // Everything below is a contradictory combination of cursors.
        (None, None, None, Some(_)) => {
            panic!("specifying last and no before doesn't make sense")
        }
        (None, None, Some(_), Some(_)) => {
            panic!("specifying first and last doesn't make sense")
        }
        (None, Some(_), Some(_), _) => {
            panic!("specifying before and first doesn't make sense")
        }
        (Some(_), Some(_), _, _) => {
            panic!("specifying after and before doesn't make sense")
        }
        (Some(_), None, None, Some(_)) => {
            panic!("specifying after and last doesn't make sense")
        }
        (Some(_), None, Some(_), Some(_)) => {
            panic!("specifying after, first and last doesn't make sense")
        }
    }
}
|
|
|
|
/// Parsed representation of a free-form search query string.
#[derive(Debug, Default)]
pub struct Query {
    // `is:unread` was present.
    pub unread_only: bool,
    // Values of `tag:<value>` terms.
    pub tags: Vec<String>,
    // Thread/message ids recognized by the newsreader or notmuch backends.
    pub uids: Vec<String>,
    // Unrecognized words, passed through to the backend query.
    pub remainder: Vec<String>,
    // Search the notmuch (mail) corpus.
    pub is_notmuch: bool,
    // Search the newsreader corpus.
    pub is_newsreader: bool,
    // Search the tantivy full-text index.
    pub is_tantivy: bool,
    // Explicit `corpus:<name>` restriction, when given and parseable.
    pub corpus: Option<Corpus>,
}
|
|
|
|
impl fmt::Display for Query {
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
|
|
if self.unread_only {
|
|
write!(f, "is:unread ")?;
|
|
}
|
|
for tag in &self.tags {
|
|
write!(f, "tag:{tag} ")?;
|
|
}
|
|
for uid in &self.uids {
|
|
write!(f, "id:{uid} ")?;
|
|
}
|
|
if self.is_notmuch {
|
|
write!(f, "is:mail ")?;
|
|
}
|
|
if self.is_newsreader {
|
|
write!(f, "is:newsreader ")?;
|
|
}
|
|
if self.is_newsreader {
|
|
write!(f, "is:news ")?;
|
|
}
|
|
match self.corpus {
|
|
Some(c) => write!(f, "corpus:{c:?}")?,
|
|
_ => (),
|
|
}
|
|
for rem in &self.remainder {
|
|
write!(f, "{rem} ")?;
|
|
}
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
impl Query {
|
|
// Converts the internal state of Query to something suitable for notmuch queries. Removes and
|
|
// letterbox specific '<key>:<value' tags
|
|
fn to_notmuch(&self) -> String {
|
|
let mut parts = Vec::new();
|
|
if !self.is_notmuch {
|
|
return String::new();
|
|
}
|
|
|
|
if self.unread_only {
|
|
parts.push("is:unread".to_string());
|
|
}
|
|
for tag in &self.tags {
|
|
parts.push(format!("tag:{tag}"));
|
|
}
|
|
for uid in &self.uids {
|
|
parts.push(uid.clone());
|
|
}
|
|
parts.extend(self.remainder.clone());
|
|
parts.join(" ")
|
|
}
|
|
}
|
|
|
|
impl FromStr for Query {
|
|
type Err = Infallible;
|
|
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
|
let mut unread_only = false;
|
|
let mut tags = Vec::new();
|
|
let mut uids = Vec::new();
|
|
let mut remainder = Vec::new();
|
|
let mut is_notmuch = false;
|
|
let mut is_newsreader = false;
|
|
let mut is_tantivy = false;
|
|
let mut corpus = None;
|
|
for word in s.split_whitespace() {
|
|
if word == "is:unread" {
|
|
unread_only = true
|
|
} else if word.starts_with("tag:") {
|
|
tags.push(word["tag:".len()..].to_string());
|
|
|
|
/*
|
|
} else if word.starts_with("tag:") {
|
|
// Any tag that doesn't match site_prefix should explicitly set the site to something not in the
|
|
// database
|
|
site = Some(NON_EXISTENT_SITE_NAME.to_string());
|
|
*/
|
|
} else if word.starts_with("corpus:") {
|
|
let c = word["corpus:".len()..].to_string();
|
|
corpus = c.parse::<Corpus>().map(|c| Some(c)).unwrap_or_else(|e| {
|
|
warn!("Error parsing corpus '{c}': {e:?}");
|
|
None
|
|
});
|
|
} else if is_newsreader_thread(word) {
|
|
uids.push(word.to_string());
|
|
} else if is_notmuch_thread_or_id(word) {
|
|
uids.push(word.to_string());
|
|
} else if word == "is:mail" || word == "is:email" || word == "is:notmuch" {
|
|
is_notmuch = true;
|
|
} else if word == "is:news" {
|
|
is_newsreader = true;
|
|
} else if word == "is:newsreader" {
|
|
is_newsreader = true;
|
|
} else {
|
|
remainder.push(word.to_string());
|
|
}
|
|
}
|
|
// If we don't see any explicit filters for a corpus, flip them all on
|
|
if corpus.is_none() && !(is_notmuch || is_tantivy || is_newsreader) {
|
|
is_notmuch = true;
|
|
is_newsreader = true;
|
|
is_tantivy = true;
|
|
}
|
|
Ok(Query {
|
|
unread_only,
|
|
tags,
|
|
uids,
|
|
remainder,
|
|
is_notmuch,
|
|
is_newsreader,
|
|
is_tantivy,
|
|
corpus,
|
|
})
|
|
}
|
|
}
|
|
/// Raw database row from which a [`ThreadSummary`] is built.
pub struct ThreadSummaryRecord {
    // Site/feed name; None falls back to a placeholder downstream.
    pub site: Option<String>,
    // Post date; currently required downstream (conversion panics when None).
    pub date: Option<PrimitiveDateTime>,
    // Read state; None is treated as read.
    pub is_read: Option<bool>,
    // Post title; None falls back to a placeholder downstream.
    pub title: Option<String>,
    // Post/thread unique id (without the "news:" prefix).
    pub uid: String,
    // Author name; falls back to the site name downstream.
    pub name: Option<String>,
    pub corpus: Corpus,
}
|
|
|
|
async fn thread_summary_from_row(r: ThreadSummaryRecord) -> ThreadSummary {
|
|
let site = r.site.unwrap_or("UNKOWN TAG".to_string());
|
|
let mut tags = vec![format!("{NEWSREADER_TAG_PREFIX}{site}")];
|
|
if !r.is_read.unwrap_or(true) {
|
|
tags.push("unread".to_string());
|
|
};
|
|
let mut title = r.title.unwrap_or("NO TITLE".to_string());
|
|
title = clean_title(&title).await.expect("failed to clean title");
|
|
ThreadSummary {
|
|
thread: format!("{NEWSREADER_THREAD_PREFIX}{}", r.uid),
|
|
timestamp: r
|
|
.date
|
|
.expect("post missing date")
|
|
.assume_utc()
|
|
.unix_timestamp() as isize,
|
|
date_relative: format!("{:?}", r.date),
|
|
//date_relative: "TODO date_relative".to_string(),
|
|
matched: 0,
|
|
total: 1,
|
|
authors: r.name.unwrap_or_else(|| site.clone()),
|
|
subject: title,
|
|
tags,
|
|
corpus: r.corpus,
|
|
}
|
|
}
|
|
async fn clean_title(title: &str) -> Result<String, ServerError> {
|
|
// Make title HTML so html parsers work
|
|
let mut title = format!("<html>{title}</html>");
|
|
let title_tranformers: Vec<Box<dyn Transformer>> =
|
|
vec![Box::new(EscapeHtml), Box::new(StripHtml)];
|
|
// Make title HTML so html parsers work
|
|
title = format!("<html>{title}</html>");
|
|
for t in title_tranformers.iter() {
|
|
if t.should_run(&None, &title) {
|
|
title = t.transform(&None, &title).await?;
|
|
}
|
|
}
|
|
Ok(title)
|
|
}
|