server: unescape RSS feeds that are HTML escaped

Bill Thiede 2024-08-03 11:29:20 -07:00
parent e0863ac085
commit 56bc1cf7ed
5 changed files with 73 additions and 14 deletions

Cargo.lock (generated, 16 changed lines)

@@ -1427,6 +1427,15 @@ dependencies = [
"windows-sys 0.52.0", "windows-sys 0.52.0",
] ]
+[[package]]
+name = "html-escape"
+version = "0.2.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476"
+dependencies = [
+ "utf8-width",
+]
 [[package]]
 name = "html5ever"
 version = "0.26.0"
@@ -3168,6 +3177,7 @@ dependencies = [
"async-graphql-rocket", "async-graphql-rocket",
"css-inline", "css-inline",
"glog", "glog",
"html-escape",
"linkify", "linkify",
"log", "log",
"lol_html", "lol_html",
@@ -4139,6 +4149,12 @@ version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf8-width"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3"
[[package]] [[package]]
name = "uuid" name = "uuid"
version = "1.10.0" version = "1.10.0"

Cargo.toml

@@ -30,4 +30,5 @@ maplit = "1.0.2"
linkify = "0.10.0" linkify = "0.10.0"
sqlx = { version = "0.7.4", features = ["postgres", "runtime-tokio", "time"] } sqlx = { version = "0.7.4", features = ["postgres", "runtime-tokio", "time"] }
url = "2.5.2" url = "2.5.2"
html-escape = "0.2.13"


@@ -3,7 +3,7 @@ use std::{convert::Infallible, str::Utf8Error, string::FromUtf8Error};
 use mailparse::MailParseError;
 use thiserror::Error;
-use crate::SanitizeError;
+use crate::TransformError;
 #[derive(Error, Debug)]
 pub enum ServerError {
@@ -19,8 +19,8 @@ pub enum ServerError {
     PartNotFound,
     #[error("sqlx error: {0}")]
     SQLXError(#[from] sqlx::Error),
-    #[error("html sanitize error: {0}")]
-    SanitizeError(#[from] SanitizeError),
+    #[error("html transform error: {0}")]
+    TransformError(#[from] TransformError),
     #[error("UTF8 error: {0}")]
     Utf8Error(#[from] Utf8Error),
     #[error("FromUTF8 error: {0}")]


@@ -11,14 +11,49 @@ use maplit::{hashmap, hashset};
 use thiserror::Error;
 use url::Url;
+// TODO: figure out how to use Cow
+trait Transformer {
+    fn should_run(&self, input: &str) -> bool;
+    // TODO: should input be something like `html_escape` uses:
+    // <S: ?Sized + AsRef<str>>(text: &S) -> Cow<str>
+    fn transform(&self, input: &str) -> Result<String, TransformError>;
+}
+// TODO: how would we make this more generic to allow good implementations of Transformer outside
+// of this module?
 #[derive(Error, Debug)]
-pub enum SanitizeError {
+pub enum TransformError {
     #[error("lol-html rewrite error")]
     RewritingError(#[from] RewritingError),
     #[error("css inline error")]
     InlineError(#[from] InlineError),
 }
+struct SanitizeHtml<'a> {
+    cid_prefix: &'a str,
+    base_url: &'a Option<Url>,
+}
+impl<'a> Transformer for SanitizeHtml<'a> {
+    fn should_run(&self, _input: &str) -> bool {
+        true
+    }
+    fn transform(&self, input: &str) -> Result<String, TransformError> {
+        Ok(sanitize_html(input, self.cid_prefix, self.base_url)?)
+    }
+}
+struct EscapeHtml;
+impl Transformer for EscapeHtml {
+    fn should_run(&self, input: &str) -> bool {
+        input.starts_with("&lt")
+    }
+    fn transform(&self, input: &str) -> Result<String, TransformError> {
+        Ok(html_escape::decode_html_entities(input).to_string())
+    }
+}
 pub fn linkify_html(text: &str) -> String {
     let mut finder = LinkFinder::new();
     let finder = finder.url_must_have_scheme(false).kinds(&[LinkKind::Url]);
@@ -51,7 +86,7 @@ pub fn sanitize_html(
     html: &str,
     cid_prefix: &str,
     base_url: &Option<Url>,
-) -> Result<String, SanitizeError> {
+) -> Result<String, TransformError> {
     let mut element_content_handlers = vec![
         // Open links in new tab
         element!("a[href]", |el| {
@@ -86,10 +121,7 @@ pub fn sanitize_html(
     element_content_handlers.extend(vec![
         // Make links with relative URLs absolute
         element!("a[href]", |el| {
-            if let Some(Ok(href)) = el.get_attribute("href").map(|href| {
-                info!("href {href:?}");
-                base_url.join(&href)
-            }) {
+            if let Some(Ok(href)) = el.get_attribute("href").map(|href| base_url.join(&href)) {
                 el.set_attribute("href", &href.as_str()).unwrap();
             }
@@ -98,7 +130,6 @@ pub fn sanitize_html(
         // Make images with relative srcs absolute
         element!("img[src]", |el| {
             if let Some(Ok(src)) = el.get_attribute("src").map(|src| base_url.join(&src)) {
-                info!("src {src:?}");
                 el.set_attribute("src", &src.as_str()).unwrap();
             }


@@ -14,7 +14,7 @@ const THREAD_PREFIX: &'static str = "news:";
 use crate::{
     error::ServerError,
     graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary},
+    sanitize_html, EscapeHtml, SanitizeHtml, Transformer,
 };
 pub fn is_newsreader_search(query: &str) -> bool {
@@ -207,13 +207,24 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
             }
         }
     });
-    let html = r.summary.unwrap_or("NO SUMMARY".to_string());
+    let mut html = r.summary.unwrap_or("NO SUMMARY".to_string());
     // TODO: add site specific cleanups. For example:
     // * Grafana does <div class="image-wrapp"><img class="lazyload>"<img src="/media/...>"</img></div>
     // * Some sites appear to be HTML encoded, unencode them, i.e. imperialviolent
-    let html = sanitize_html(&html, "", &link)?;
+    let tranformers: Vec<Box<dyn Transformer>> = vec![
+        Box::new(EscapeHtml),
+        Box::new(SanitizeHtml {
+            cid_prefix: "",
+            base_url: &link,
+        }),
+    ];
+    for t in tranformers.iter() {
+        if t.should_run(&html) {
+            html = t.transform(&html)?;
+        }
+    }
     let body = Body::Html(Html {
-        html,
+        html: html.to_string(),
         content_tree: "".to_string(),
     });
     let title = r.title.unwrap_or("NO TITLE".to_string());