Normalize links and images based on post's URL

This commit is contained in:
Bill Thiede 2024-07-22 11:27:15 -07:00
parent b5468bced2
commit 1106377550
7 changed files with 89 additions and 15 deletions

1
Cargo.lock generated
View File

@ -3183,6 +3183,7 @@ dependencies = [
"sqlx",
"thiserror",
"tokio",
"url",
"urlencoding",
]

View File

@ -29,4 +29,5 @@ anyhow = "1.0.79"
maplit = "1.0.2"
linkify = "0.10.0"
sqlx = { version = "0.7.4", features = ["postgres", "runtime-tokio", "time"] }
url = "2.5.2"

View File

@ -5,7 +5,8 @@ SELECT
site,
summary,
title,
name
name,
homepage
FROM
post p
JOIN feed f ON p.site = f.slug

View File

@ -7,26 +7,28 @@ use crate::SanitizeError;
#[derive(Error, Debug)]
pub enum ServerError {
#[error("notmuch")]
#[error("notmuch: {0}")]
NotmuchError(#[from] notmuch::NotmuchError),
#[error("flatten")]
FlattenError,
#[error("mail parse error")]
#[error("mail parse error: {0}")]
MailParseError(#[from] MailParseError),
#[error("IO error")]
#[error("IO error: {0}")]
IoError(#[from] std::io::Error),
#[error("attachement not found")]
PartNotFound,
#[error("sqlx error")]
#[error("sqlx error: {0}")]
SQLXError(#[from] sqlx::Error),
#[error("html sanitize error")]
#[error("html sanitize error: {0}")]
SanitizeError(#[from] SanitizeError),
#[error("UTF8 error")]
#[error("UTF8 error: {0}")]
Utf8Error(#[from] Utf8Error),
#[error("FromUTF8 error")]
#[error("FromUTF8 error: {0}")]
FromUtf8Error(#[from] FromUtf8Error),
#[error("error")]
#[error("error: {0}")]
StringError(String),
#[error("impossible")]
#[error("invalid url: {0}")]
UrlParseError(#[from] url::ParseError),
#[error("impossible: {0}")]
InfaillibleError(#[from] Infallible),
}

View File

@ -9,6 +9,7 @@ use log::error;
use lol_html::{element, errors::RewritingError, rewrite_str, RewriteStrSettings};
use maplit::{hashmap, hashset};
use thiserror::Error;
use url::Url;
#[derive(Error, Debug)]
pub enum SanitizeError {
@ -46,7 +47,11 @@ pub fn linkify_html(text: &str) -> String {
// html contains the content to be cleaned, and cid_prefix is used to resolve mixed part image
// references
pub fn sanitize_html(html: &str, cid_prefix: &str) -> Result<String, SanitizeError> {
pub fn sanitize_html(
html: &str,
cid_prefix: &str,
base_url: &Url,
) -> Result<String, SanitizeError> {
let element_content_handlers = vec![
// Open links in new tab
element!("a[href]", |el| {
@ -54,6 +59,22 @@ pub fn sanitize_html(html: &str, cid_prefix: &str) -> Result<String, SanitizeErr
Ok(())
}),
// Make links with relative URLs absolute
element!("a[href]", |el| {
if let Some(Ok(href)) = el.get_attribute("href").map(|href| base_url.join(&href)) {
el.set_attribute("href", &href.as_str()).unwrap();
}
Ok(())
}),
// Make images with relative srcs absolute
element!("img[src]", |el| {
if let Some(Ok(src)) = el.get_attribute("src").map(|src| base_url.join(&src)) {
el.set_attribute("src", &src.as_str()).unwrap();
}
Ok(())
}),
// Replace mixed part CID images with URL
element!("img[src]", |el| {
let src = el

View File

@ -7,6 +7,7 @@ use std::{
use async_graphql::connection::{self, Connection, Edge};
use log::info;
use sqlx::postgres::PgPool;
use url::Url;
const TAG_PREFIX: &'static str = "News/";
const THREAD_PREFIX: &'static str = "news:";
@ -14,6 +15,7 @@ const THREAD_PREFIX: &'static str = "news:";
use crate::{
error::ServerError,
graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary},
sanitize_html,
};
pub fn is_newsreader_search(query: &str) -> bool {
@ -174,14 +176,54 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
} else {
vec!["unread".to_string(), site.clone()]
};
let default_homepage = "http://no-homepage";
let homepage = Url::parse(
&r.homepage
.map(|h| {
if h.is_empty() {
default_homepage.to_string()
} else {
h
}
})
.unwrap_or(default_homepage.to_string()),
)?;
let link = Url::parse(
&r.link
.as_ref()
.map(|h| {
if h.is_empty() {
default_homepage.to_string()
} else {
h.to_string()
}
})
.unwrap_or(default_homepage.to_string()),
)?;
let addr = r.link.as_ref().map(|link| {
if link.contains('@') {
link.clone()
} else {
if let Ok(url) = homepage.join(&link) {
url.to_string()
} else {
link.clone()
}
}
});
let html = r.summary.unwrap_or("NO SUMMARY".to_string());
// TODO: add site specific cleanups. For example:
// * Grafana does <div class="image-wrapp"><img class="lazyload"><img src="/media/..."></img></div>
// * Some sites appear to be HTML encoded, unencode them, e.g. imperialviolet
let html = sanitize_html(&html, "", &link)?;
let body = Body::Html(Html {
html: r.summary.unwrap_or("NO SUMMARY".to_string()),
html,
content_tree: "".to_string(),
});
let title = r.title.unwrap_or("NO TITLE".to_string());
let from = Some(Email {
name: r.name,
addr: r.link,
addr: addr.map(|a| a.to_string()),
});
Ok(Thread {
thread_id,

View File

@ -10,6 +10,7 @@ use log::{error, info, warn};
use mailparse::{parse_mail, MailHeader, MailHeaderMap, ParsedMail};
use memmap::MmapOptions;
use notmuch::Notmuch;
use url::Url;
use crate::{
error::ServerError,
@ -178,6 +179,7 @@ pub async fn thread(
.get_first_value("date")
.and_then(|d| mailparse::dateparse(&d).ok());
let cid_prefix = shared::urls::cid_prefix(None, &id);
let base_url = Url::parse("https://there-should-be-no-relative-urls-in-email").unwrap();
let body = match extract_body(&m, &id)? {
Body::PlainText(PlainText { text, content_tree }) => {
let text = if text.len() > MAX_RAW_MESSAGE_SIZE {
@ -196,7 +198,11 @@ pub async fn thread(
// Trim newlines to prevent excessive white space at the beginning/end of
// presentation. Leave tabs and spaces in case plain text attempts to center a
// header on the first line.
sanitize_html(&linkify_html(&text.trim_matches('\n')), &cid_prefix)?
sanitize_html(
&linkify_html(&text.trim_matches('\n')),
&cid_prefix,
&base_url
)?
),
content_tree: if debug_content_tree {
render_content_type_tree(&m)
@ -206,7 +212,7 @@ pub async fn thread(
})
}
Body::Html(Html { html, content_tree }) => Body::Html(Html {
html: sanitize_html(&html, &cid_prefix)?,
html: sanitize_html(&html, &cid_prefix, &base_url)?,
content_tree: if debug_content_tree {
render_content_type_tree(&m)
} else {