Normalize links and images based on post's URL
This commit is contained in:
parent b5468bced2
commit 1106377550

Cargo.lock (generated): 1 line changed
@@ -3183,6 +3183,7 @@ dependencies = [
  "sqlx",
  "thiserror",
  "tokio",
+ "url",
  "urlencoding",
 ]
@@ -29,4 +29,5 @@ anyhow = "1.0.79"
 maplit = "1.0.2"
 linkify = "0.10.0"
 sqlx = { version = "0.7.4", features = ["postgres", "runtime-tokio", "time"] }
+url = "2.5.2"
@@ -5,7 +5,8 @@ SELECT
     site,
     summary,
     title,
-    name
+    name,
+    homepage
 FROM
     post p
     JOIN feed f ON p.site = f.slug
@@ -7,26 +7,28 @@ use crate::SanitizeError;

 #[derive(Error, Debug)]
 pub enum ServerError {
-    #[error("notmuch")]
+    #[error("notmuch: {0}")]
     NotmuchError(#[from] notmuch::NotmuchError),
     #[error("flatten")]
     FlattenError,
-    #[error("mail parse error")]
+    #[error("mail parse error: {0}")]
     MailParseError(#[from] MailParseError),
-    #[error("IO error")]
+    #[error("IO error: {0}")]
     IoError(#[from] std::io::Error),
     #[error("attachement not found")]
     PartNotFound,
-    #[error("sqlx error")]
+    #[error("sqlx error: {0}")]
     SQLXError(#[from] sqlx::Error),
-    #[error("html sanitize error")]
+    #[error("html sanitize error: {0}")]
     SanitizeError(#[from] SanitizeError),
-    #[error("UTF8 error")]
+    #[error("UTF8 error: {0}")]
     Utf8Error(#[from] Utf8Error),
-    #[error("FromUTF8 error")]
+    #[error("FromUTF8 error: {0}")]
     FromUtf8Error(#[from] FromUtf8Error),
-    #[error("error")]
+    #[error("error: {0}")]
     StringError(String),
-    #[error("impossible")]
+    #[error("invalid url: {0}")]
+    UrlParseError(#[from] url::ParseError),
+    #[error("impossible: {0}")]
     InfaillibleError(#[from] Infallible),
 }
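The change from fixed messages to "{0}" placeholders makes thiserror interpolate the wrapped error's own Display output, so "sqlx error" becomes, for example, "sqlx error: <the underlying sqlx message>". A minimal sketch of that behavior with a standalone enum (not part of this repo, just thiserror's documented placeholder syntax):

    use std::io;
    use thiserror::Error;

    #[derive(Error, Debug)]
    enum DemoError {
        // "{0}" refers to the first (unnamed) field, here the wrapped io::Error.
        #[error("IO error: {0}")]
        Io(#[from] io::Error),
    }

    fn main() {
        let e: DemoError = io::Error::new(io::ErrorKind::NotFound, "missing file").into();
        println!("{e}"); // prints: IO error: missing file
    }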
@@ -9,6 +9,7 @@ use log::error;
 use lol_html::{element, errors::RewritingError, rewrite_str, RewriteStrSettings};
 use maplit::{hashmap, hashset};
 use thiserror::Error;
+use url::Url;

 #[derive(Error, Debug)]
 pub enum SanitizeError {
@@ -46,7 +47,11 @@ pub fn linkify_html(text: &str) -> String {

 // html contains the content to be cleaned, and cid_prefix is used to resolve mixed part image
 // referrences
-pub fn sanitize_html(html: &str, cid_prefix: &str) -> Result<String, SanitizeError> {
+pub fn sanitize_html(
+    html: &str,
+    cid_prefix: &str,
+    base_url: &Url,
+) -> Result<String, SanitizeError> {
     let element_content_handlers = vec![
         // Open links in new tab
         element!("a[href]", |el| {
@@ -54,6 +59,22 @@ pub fn sanitize_html(html: &str, cid_prefix: &str) -> Result<String, SanitizeErr

             Ok(())
         }),
+        // Make links with relative URLs absolute
+        element!("a[href]", |el| {
+            if let Some(Ok(href)) = el.get_attribute("href").map(|href| base_url.join(&href)) {
+                el.set_attribute("href", &href.as_str()).unwrap();
+            }
+
+            Ok(())
+        }),
+        // Make images with relative srcs absolute
+        element!("img[src]", |el| {
+            if let Some(Ok(src)) = el.get_attribute("src").map(|src| base_url.join(&src)) {
+                el.set_attribute("src", &src.as_str()).unwrap();
+            }
+
+            Ok(())
+        }),
         // Replace mixed part CID images with URL
         element!("img[src]", |el| {
             let src = el
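Both new handlers lean on url::Url::join to resolve a possibly-relative href/src against the post's base URL; absolute URLs come back unchanged, so only relative references are rewritten. A small sketch of that join behavior (the URLs are made up for illustration):

    use url::Url;

    fn main() {
        // Hypothetical post URL serving as the base.
        let base = Url::parse("https://example.com/blog/2024/post.html").unwrap();

        // Relative references resolve against the base document...
        assert_eq!(
            base.join("../images/diagram.png").unwrap().as_str(),
            "https://example.com/blog/images/diagram.png"
        );
        // ...while absolute URLs pass through untouched.
        assert_eq!(
            base.join("https://other.example/pic.png").unwrap().as_str(),
            "https://other.example/pic.png"
        );
    }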
@@ -7,6 +7,7 @@ use std::{
 use async_graphql::connection::{self, Connection, Edge};
 use log::info;
 use sqlx::postgres::PgPool;
+use url::Url;

 const TAG_PREFIX: &'static str = "News/";
 const THREAD_PREFIX: &'static str = "news:";
@@ -14,6 +15,7 @@ const THREAD_PREFIX: &'static str = "news:";
 use crate::{
     error::ServerError,
     graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary},
+    sanitize_html,
 };

 pub fn is_newsreader_search(query: &str) -> bool {
@@ -174,14 +176,54 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerEr
     } else {
         vec!["unread".to_string(), site.clone()]
     };
+    let default_homepage = "http://no-homepage";
+    let homepage = Url::parse(
+        &r.homepage
+            .map(|h| {
+                if h.is_empty() {
+                    default_homepage.to_string()
+                } else {
+                    h
+                }
+            })
+            .unwrap_or(default_homepage.to_string()),
+    )?;
+    let link = Url::parse(
+        &r.link
+            .as_ref()
+            .map(|h| {
+                if h.is_empty() {
+                    default_homepage.to_string()
+                } else {
+                    h.to_string()
+                }
+            })
+            .unwrap_or(default_homepage.to_string()),
+    )?;
+    let addr = r.link.as_ref().map(|link| {
+        if link.contains('@') {
+            link.clone()
+        } else {
+            if let Ok(url) = homepage.join(&link) {
+                url.to_string()
+            } else {
+                link.clone()
+            }
+        }
+    });
+    let html = r.summary.unwrap_or("NO SUMMARY".to_string());
+    // TODO: add site specific cleanups. For example:
+    // * Grafana does <div class="image-wrapp"><img class="lazyload>"<img src="/media/...>"</img></div>
+    // * Some sites appear to be HTML encoded, unencode them, i.e. imperialviolet
+    let html = sanitize_html(&html, "", &link)?;
     let body = Body::Html(Html {
-        html: r.summary.unwrap_or("NO SUMMARY".to_string()),
+        html,
         content_tree: "".to_string(),
     });
     let title = r.title.unwrap_or("NO TITLE".to_string());
     let from = Some(Email {
         name: r.name,
-        addr: r.link,
+        addr: addr.map(|a| a.to_string()),
     });
     Ok(Thread {
         thread_id,
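The addr computation keeps anything containing '@' as a plain mail address and otherwise resolves the item link against the feed homepage, falling back to the raw string if the join fails. A rough sketch of that decision as a free function (the helper name and URLs are illustrative, not code from this repo):

    use url::Url;

    // Mirrors the addr logic above for demonstration purposes only.
    fn resolve_addr(homepage: &Url, link: &str) -> String {
        if link.contains('@') {
            link.to_string()
        } else if let Ok(url) = homepage.join(link) {
            url.to_string()
        } else {
            link.to_string()
        }
    }

    fn main() {
        let homepage = Url::parse("https://example.org/").unwrap();
        // Relative item links become absolute against the homepage.
        assert_eq!(resolve_addr(&homepage, "posts/1.html"), "https://example.org/posts/1.html");
        // Mail-like links are passed through untouched.
        assert_eq!(resolve_addr(&homepage, "author@example.org"), "author@example.org");
    }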
@@ -10,6 +10,7 @@ use log::{error, info, warn};
 use mailparse::{parse_mail, MailHeader, MailHeaderMap, ParsedMail};
 use memmap::MmapOptions;
 use notmuch::Notmuch;
+use url::Url;

 use crate::{
     error::ServerError,
@@ -178,6 +179,7 @@ pub async fn thread(
         .get_first_value("date")
         .and_then(|d| mailparse::dateparse(&d).ok());
     let cid_prefix = shared::urls::cid_prefix(None, &id);
+    let base_url = Url::parse("https://there-should-be-no-relative-urls-in-email").unwrap();
     let body = match extract_body(&m, &id)? {
         Body::PlainText(PlainText { text, content_tree }) => {
             let text = if text.len() > MAX_RAW_MESSAGE_SIZE {
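Email bodies are not expected to contain relative URLs, so a placeholder base is passed to sanitize_html; since Url::join returns an already-absolute URL unchanged, the placeholder host only ever shows up if a relative reference does sneak in. A tiny sketch of that property (illustrative URLs):

    use url::Url;

    fn main() {
        let base = Url::parse("https://there-should-be-no-relative-urls-in-email").unwrap();
        // Absolute URLs are returned as-is, regardless of the base.
        assert_eq!(
            base.join("https://example.com/a.png").unwrap().as_str(),
            "https://example.com/a.png"
        );
        // Only an (unexpected) relative reference would pick up the placeholder host.
        assert_eq!(
            base.join("a.png").unwrap().as_str(),
            "https://there-should-be-no-relative-urls-in-email/a.png"
        );
    }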
@@ -196,7 +198,11 @@ pub async fn thread(
                 // Trim newlines to prevent excessive white space at the beginning/end of
                 // presenation. Leave tabs and spaces incase plain text attempts to center a
                 // header on the first line.
-                sanitize_html(&linkify_html(&text.trim_matches('\n')), &cid_prefix)?
+                sanitize_html(
+                    &linkify_html(&text.trim_matches('\n')),
+                    &cid_prefix,
+                    &base_url
+                )?
             ),
             content_tree: if debug_content_tree {
                 render_content_type_tree(&m)
@@ -206,7 +212,7 @@ pub async fn thread(
             })
         }
         Body::Html(Html { html, content_tree }) => Body::Html(Html {
-            html: sanitize_html(&html, &cid_prefix)?,
+            html: sanitize_html(&html, &cid_prefix, &base_url)?,
             content_tree: if debug_content_tree {
                 render_content_type_tree(&m)
             } else {