server: strip style & script tags, also handle some retryable errors on slurp

This commit is contained in:
Bill Thiede 2025-01-30 13:52:22 -08:00
parent 795029cb06
commit eecc4bc3ef
2 changed files with 70 additions and 27 deletions

View File

@ -7,7 +7,13 @@ pub mod nm;
#[cfg(feature = "tantivy")] #[cfg(feature = "tantivy")]
pub mod tantivy; pub mod tantivy;
use std::{collections::HashMap, convert::Infallible, fmt, str::FromStr, sync::Arc}; use std::{
collections::{HashMap, HashSet},
convert::Infallible,
fmt,
str::FromStr,
sync::Arc,
};
use async_trait::async_trait; use async_trait::async_trait;
use cacher::{Cacher, FilesystemCacher}; use cacher::{Cacher, FilesystemCacher};
@ -20,6 +26,8 @@ use lol_html::{
}; };
use maplit::{hashmap, hashset}; use maplit::{hashmap, hashset};
use regex::Regex; use regex::Regex;
use reqwest::StatusCode;
use rocket::response::status;
use scraper::{Html, Selector}; use scraper::{Html, Selector};
use sqlx::types::time::PrimitiveDateTime; use sqlx::types::time::PrimitiveDateTime;
use thiserror::Error; use thiserror::Error;
@ -58,6 +66,8 @@ pub enum TransformError {
ReqwestError(#[from] reqwest::Error), ReqwestError(#[from] reqwest::Error),
#[error("failed to parse HTML: {0}")] #[error("failed to parse HTML: {0}")]
HtmlParsingError(String), HtmlParsingError(String),
#[error("got a retryable error code {0} for {1}")]
RetryableHttpStatusError(StatusCode, String),
} }
struct SanitizeHtml<'a> { struct SanitizeHtml<'a> {
@ -88,18 +98,37 @@ struct StripHtml;
#[async_trait] #[async_trait]
impl Transformer for StripHtml { impl Transformer for StripHtml {
fn should_run(&self, _: &Option<Url>, html: &str) -> bool { fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
debug!("StripHtml should_run {link:?} {}", html.contains("<"));
// Lame test // Lame test
html.contains("<") html.contains("<")
} }
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> { async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
debug!("StripHtml {link:?}");
let mut text = String::new(); let mut text = String::new();
let element_content_handlers = vec![
element!("style", |el| {
el.remove();
Ok(())
}),
element!("script", |el| {
el.remove();
Ok(())
}),
];
let html = rewrite_str(
html,
RewriteStrSettings {
element_content_handlers,
..RewriteStrSettings::default()
},
)?;
let element_content_handlers = vec![text!("*", |t| { let element_content_handlers = vec![text!("*", |t| {
text += t.as_str(); text += t.as_str();
Ok(()) Ok(())
})]; })];
let _ = rewrite_str( let _ = rewrite_str(
html, &html,
RewriteStrSettings { RewriteStrSettings {
element_content_handlers, element_content_handlers,
..RewriteStrSettings::default() ..RewriteStrSettings::default()
@ -272,6 +301,7 @@ impl<'c> SlurpContents<'c> {
#[async_trait] #[async_trait]
impl<'c> Transformer for SlurpContents<'c> { impl<'c> Transformer for SlurpContents<'c> {
fn should_run(&self, link: &Option<Url>, html: &str) -> bool { fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
debug!("SlurpContents should_run {link:?}");
let mut will_slurp = false; let mut will_slurp = false;
if let Some(link) = link { if let Some(link) = link {
will_slurp = self.get_selectors(link).is_some(); will_slurp = self.get_selectors(link).is_some();
@ -282,6 +312,15 @@ impl<'c> Transformer for SlurpContents<'c> {
will_slurp will_slurp
} }
async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> { async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
debug!("SlurpContents {link:?}");
let retryable_status: HashSet<StatusCode> = vec![
StatusCode::UNAUTHORIZED,
StatusCode::FORBIDDEN,
StatusCode::REQUEST_TIMEOUT,
StatusCode::TOO_MANY_REQUESTS,
]
.into_iter()
.collect();
if let Some(test_link) = link { if let Some(test_link) = link {
// If SlurpContents is configured for inline CSS, but no // If SlurpContents is configured for inline CSS, but no
// configuration found for this site, use the local InlineStyle // configuration found for this site, use the local InlineStyle
@ -301,11 +340,18 @@ impl<'c> Transformer for SlurpContents<'c> {
let body = if let Some(body) = cacher.get(link.as_str()) { let body = if let Some(body) = cacher.get(link.as_str()) {
String::from_utf8_lossy(&body).to_string() String::from_utf8_lossy(&body).to_string()
} else { } else {
let body = reqwest::get(link.as_str()) let resp = reqwest::get(link.as_str()).await?;
.await? let status = resp.status();
.error_for_status()? if status.is_server_error() || retryable_status.contains(&status) {
.text() return Err(TransformError::RetryableHttpStatusError(
.await?; status,
link.to_string(),
));
}
if !status.is_success() {
return Ok(html.to_string());
}
let body = resp.text().await?;
cacher.set(link.as_str(), body.as_bytes()); cacher.set(link.as_str(), body.as_bytes());
body body
}; };

View File

@ -258,27 +258,28 @@ pub async fn refresh<'ctx>(pool: &PgPool, cacher: &FilesystemCacher) -> Result<b
async fn update_search_summary( async fn update_search_summary(
pool: &PgPool, pool: &PgPool,
cacher: &FilesystemCacher, cacher: &FilesystemCacher,
link: Url, link: String,
body: String, body: String,
id: i32, id: i32,
) -> Result<(), ServerError> { ) -> Result<(), ServerError> {
let body_transformers: Vec<Box<dyn Transformer>> = vec![ let slurp_contents = SlurpContents {
Box::new(SlurpContents { cacher,
cacher, inline_css: true,
inline_css: true, site_selectors: slurp_contents_selectors(),
site_selectors: slurp_contents_selectors(), };
}), let strip_html = StripHtml;
Box::new(StripHtml),
];
info!("adding {link} to search index"); info!("adding {link} to search index");
let mut body = body; let mut body = body;
let link = Some(link); if let Ok(link) = Url::parse(&link) {
for t in body_transformers.iter() { let link = Some(link);
if t.should_run(&link, &body) { if slurp_contents.should_run(&link, &body) {
body = t.transform(&link, &body).await?; body = slurp_contents.transform(&link, &body).await?;
} }
} else {
error!("failed to parse link: {}", link);
} }
body = strip_html.transform(&None, &body).await?;
sqlx::query!( sqlx::query!(
"UPDATE post SET search_summary = $1 WHERE id = $2", "UPDATE post SET search_summary = $1 WHERE id = $2",
body, body,
@ -294,16 +295,12 @@ pub async fn refresh<'ctx>(pool: &PgPool, cacher: &FilesystemCacher) -> Result<b
.await? .await?
.into_iter() .into_iter()
.filter_map(|r| { .filter_map(|r| {
let Ok(link) = Url::parse(&r.link) else {
error!("failed to parse link: {}", r.link);
return None;
};
let Some(body) = r.clean_summary else { let Some(body) = r.clean_summary else {
error!("clean_summary missing for {}", r.link); error!("clean_summary missing for {}", r.link);
return None; return None;
}; };
let id = r.id; let id = r.id;
Some(update_search_summary(pool, cacher, link, body, id)) Some(update_search_summary(pool, cacher, r.link, body, id))
}) })
.collect(); .collect();