diff --git a/Cargo.lock b/Cargo.lock
index 4c52617..6ddaa30 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1427,6 +1427,15 @@ dependencies = [
  "windows-sys 0.52.0",
 ]
 
+[[package]]
+name = "html-escape"
+version = "0.2.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
+dependencies = [
+ "utf8-width",
+]
+
 [[package]]
 name = "html5ever"
 version = "0.26.0"
@@ -3168,6 +3177,7 @@ dependencies = [
  "async-graphql-rocket",
  "css-inline",
  "glog",
+ "html-escape",
  "linkify",
  "log",
  "lol_html",
@@ -4139,6 +4149,12 @@ version = "0.7.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
 
+[[package]]
+name = "utf8-width"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
+
 [[package]]
 name = "uuid"
 version = "1.10.0"
diff --git a/server/Cargo.toml b/server/Cargo.toml
index b270683..d23b4f2 100644
--- a/server/Cargo.toml
+++ b/server/Cargo.toml
@@ -30,4 +30,5 @@ maplit = "1.0.2"
 linkify = "0.10.0"
 sqlx = { version = "0.7.4", features = ["postgres", "runtime-tokio", "time"] }
 url = "2.5.2"
+html-escape = "0.2.13"
diff --git a/server/src/error.rs b/server/src/error.rs
index bee0e9d..7797c16 100644
--- a/server/src/error.rs
+++ b/server/src/error.rs
@@ -3,7 +3,7 @@ use std::{convert::Infallible, str::Utf8Error, string::FromUtf8Error};
 use mailparse::MailParseError;
 use thiserror::Error;
 
-use crate::SanitizeError;
+use crate::TransformError;
 
 #[derive(Error, Debug)]
 pub enum ServerError {
@@ -19,8 +19,8 @@ pub enum ServerError {
     PartNotFound,
     #[error("sqlx error: {0}")]
     SQLXError(#[from] sqlx::Error),
-    #[error("html sanitize error: {0}")]
-    SanitizeError(#[from] SanitizeError),
+    #[error("html transform error: {0}")]
+    TransformError(#[from] TransformError),
     #[error("UTF8 error: {0}")]
     Utf8Error(#[from] Utf8Error),
     #[error("FromUTF8 error: {0}")]
diff --git a/server/src/lib.rs b/server/src/lib.rs
index 7b2018e..1c4cd51 100644
--- a/server/src/lib.rs
+++ b/server/src/lib.rs
@@ -11,14 +11,49 @@ use maplit::{hashmap, hashset};
 use thiserror::Error;
 use url::Url;
 
+// TODO: figure out how to use Cow
+trait Transformer {
+    fn should_run(&self, input: &str) -> bool;
+    // TODO: should input be something like `html_escape` uses:
+    // <S: ?Sized + AsRef<str>>(text: &S) -> Cow<str>
+    fn transform(&self, input: &str) -> Result<String, TransformError>;
+}
+
+// TODO: how would we make this more generic to allow good implementations of Transformer outside
+// of this module?
 #[derive(Error, Debug)]
-pub enum SanitizeError {
+pub enum TransformError {
     #[error("lol-html rewrite error")]
     RewritingError(#[from] RewritingError),
     #[error("css inline error")]
     InlineError(#[from] InlineError),
 }
 
+struct SanitizeHtml<'a> {
+    cid_prefix: &'a str,
+    base_url: &'a Option<Url>,
+}
+
+impl<'a> Transformer for SanitizeHtml<'a> {
+    fn should_run(&self, _input: &str) -> bool {
+        true
+    }
+    fn transform(&self, input: &str) -> Result<String, TransformError> {
+        Ok(sanitize_html(input, self.cid_prefix, self.base_url)?)
+    }
+}
+
+struct EscapeHtml;
+
+impl Transformer for EscapeHtml {
+    fn should_run(&self, input: &str) -> bool {
+        input.starts_with("<")
+    }
+    fn transform(&self, input: &str) -> Result<String, TransformError> {
+        Ok(html_escape::decode_html_entities(input).to_string())
+    }
+}
+
 pub fn linkify_html(text: &str) -> String {
     let mut finder = LinkFinder::new();
     let finder = finder.url_must_have_scheme(false).kinds(&[LinkKind::Url]);
@@ -51,7 +86,7 @@ pub fn sanitize_html(
     html: &str,
     cid_prefix: &str,
     base_url: &Option<Url>,
-) -> Result<String, SanitizeError> {
+) -> Result<String, TransformError> {
     let mut element_content_handlers = vec![
         // Open links in new tab
         element!("a[href]", |el| {
@@ -86,10 +121,7 @@
         element_content_handlers.extend(vec![
             // Make links with relative URLs absolute
             element!("a[href]", |el| {
-                if let Some(Ok(href)) = el.get_attribute("href").map(|href| {
-                    info!("href {href:?}");
-                    base_url.join(&href)
-                }) {
+                if let Some(Ok(href)) = el.get_attribute("href").map(|href| base_url.join(&href)) {
                     el.set_attribute("href", &href.as_str()).unwrap();
                 }
 
@@ -98,7 +130,6 @@
             // Make images with relative srcs absolute
             element!("img[src]", |el| {
                 if let Some(Ok(src)) = el.get_attribute("src").map(|src| base_url.join(&src)) {
-                    info!("src {src:?}");
                     el.set_attribute("src", &src.as_str()).unwrap();
                 }
 
diff --git a/server/src/newsreader.rs b/server/src/newsreader.rs
index dafd350..4fc579a 100644
--- a/server/src/newsreader.rs
+++ b/server/src/newsreader.rs
@@ -14,7 +14,7 @@ const THREAD_PREFIX: &'static str = "news:";
 use crate::{
     error::ServerError,
     graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary},
-    sanitize_html,
+    EscapeHtml, SanitizeHtml, Transformer,
 };
 
 pub fn is_newsreader_search(query: &str) -> bool {
@@ -207,13 +207,24 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerError> {
     // * Some sites appear to be HTML encoded, unencode them, i.e. imperialviolent
-    let html = sanitize_html(&html, "", &link)?;
+    let transformers: Vec<Box<dyn Transformer>> = vec![
+        Box::new(EscapeHtml),
+        Box::new(SanitizeHtml {
+            cid_prefix: "",
+            base_url: &link,
+        }),
+    ];
+    for t in transformers.iter() {
+        if t.should_run(&html) {
+            html = t.transform(&html)?;
+        }
+    }
 
     let body = Body::Html(Html {
-        html,
+        html: html.to_string(),
         content_tree: "".to_string(),
     });
     let title = r.title.unwrap_or("NO TITLE".to_string());
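
A possible direction for the `// TODO: figure out how to use Cow` note above: let `transform` borrow its input and return `Cow<str>`, so a transformer that changes nothing (or one like `html_escape::decode_html_entities`, which already returns a `Cow`) can hand the input back without allocating. This is only a sketch against crates the diff already pulls in (`thiserror`, `html-escape`); the `TransformError` below is a cut-down stand-in for the real enum in `server/src/lib.rs`, and `DecodeEntities` and `run_pipeline` are hypothetical names used purely for illustration, not part of the PR.

```rust
use std::borrow::Cow;

use thiserror::Error;

// Cut-down stand-in for the TransformError in server/src/lib.rs; the real
// enum wraps lol_html's RewritingError and css-inline's InlineError.
#[derive(Error, Debug)]
pub enum TransformError {
    #[error("transform failed: {0}")]
    Other(String),
}

// Cow-returning variant of the Transformer trait: a transform that changes
// nothing can return Cow::Borrowed, a rewriting one returns Cow::Owned.
trait Transformer {
    fn should_run(&self, input: &str) -> bool;
    fn transform<'a>(&self, input: &'a str) -> Result<Cow<'a, str>, TransformError>;
}

// Hypothetical example transformer that decodes HTML entities, mirroring
// what EscapeHtml does in the PR.
struct DecodeEntities;

impl Transformer for DecodeEntities {
    fn should_run(&self, input: &str) -> bool {
        // Only worth running when there is at least one entity-looking `&`.
        input.contains('&')
    }

    fn transform<'a>(&self, input: &'a str) -> Result<Cow<'a, str>, TransformError> {
        // decode_html_entities already returns a Cow: borrowed when nothing
        // needed decoding, owned otherwise.
        Ok(html_escape::decode_html_entities(input))
    }
}

// Hypothetical pipeline runner mirroring the loop added in newsreader.rs.
fn run_pipeline(
    transformers: &[Box<dyn Transformer>],
    input: &str,
) -> Result<String, TransformError> {
    let mut current = input.to_string();
    for t in transformers {
        if t.should_run(&current) {
            // into_owned() releases the borrow of `current` before the
            // reassignment, so the chain stays a simple loop.
            current = t.transform(&current)?.into_owned();
        }
    }
    Ok(current)
}
```

The trade-off is that the pipeline still ends up owning a `String` once any transformer rewrites; the borrowing signature mainly pays off at single-transformer call sites and in the common no-op case.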