server: strip style & script tags, also handle some retryable errors on slurp
This commit is contained in:
parent
795029cb06
commit
eecc4bc3ef
@ -7,7 +7,13 @@ pub mod nm;
|
|||||||
#[cfg(feature = "tantivy")]
|
#[cfg(feature = "tantivy")]
|
||||||
pub mod tantivy;
|
pub mod tantivy;
|
||||||
|
|
||||||
use std::{collections::HashMap, convert::Infallible, fmt, str::FromStr, sync::Arc};
|
use std::{
|
||||||
|
collections::{HashMap, HashSet},
|
||||||
|
convert::Infallible,
|
||||||
|
fmt,
|
||||||
|
str::FromStr,
|
||||||
|
sync::Arc,
|
||||||
|
};
|
||||||
|
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use cacher::{Cacher, FilesystemCacher};
|
use cacher::{Cacher, FilesystemCacher};
|
||||||
@ -20,6 +26,8 @@ use lol_html::{
|
|||||||
};
|
};
|
||||||
use maplit::{hashmap, hashset};
|
use maplit::{hashmap, hashset};
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
|
use reqwest::StatusCode;
|
||||||
|
use rocket::response::status;
|
||||||
use scraper::{Html, Selector};
|
use scraper::{Html, Selector};
|
||||||
use sqlx::types::time::PrimitiveDateTime;
|
use sqlx::types::time::PrimitiveDateTime;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
@ -58,6 +66,8 @@ pub enum TransformError {
|
|||||||
ReqwestError(#[from] reqwest::Error),
|
ReqwestError(#[from] reqwest::Error),
|
||||||
#[error("failed to parse HTML: {0}")]
|
#[error("failed to parse HTML: {0}")]
|
||||||
HtmlParsingError(String),
|
HtmlParsingError(String),
|
||||||
|
#[error("got a retryable error code {0} for {1}")]
|
||||||
|
RetryableHttpStatusError(StatusCode, String),
|
||||||
}
|
}
|
||||||
|
|
||||||
struct SanitizeHtml<'a> {
|
struct SanitizeHtml<'a> {
|
||||||
@ -88,18 +98,37 @@ struct StripHtml;
|
|||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl Transformer for StripHtml {
|
impl Transformer for StripHtml {
|
||||||
fn should_run(&self, _: &Option<Url>, html: &str) -> bool {
|
fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
|
||||||
|
debug!("StripHtml should_run {link:?} {}", html.contains("<"));
|
||||||
// Lame test
|
// Lame test
|
||||||
html.contains("<")
|
html.contains("<")
|
||||||
}
|
}
|
||||||
async fn transform(&self, _: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
||||||
|
debug!("StripHtml {link:?}");
|
||||||
let mut text = String::new();
|
let mut text = String::new();
|
||||||
|
let element_content_handlers = vec![
|
||||||
|
element!("style", |el| {
|
||||||
|
el.remove();
|
||||||
|
Ok(())
|
||||||
|
}),
|
||||||
|
element!("script", |el| {
|
||||||
|
el.remove();
|
||||||
|
Ok(())
|
||||||
|
}),
|
||||||
|
];
|
||||||
|
let html = rewrite_str(
|
||||||
|
html,
|
||||||
|
RewriteStrSettings {
|
||||||
|
element_content_handlers,
|
||||||
|
..RewriteStrSettings::default()
|
||||||
|
},
|
||||||
|
)?;
|
||||||
let element_content_handlers = vec![text!("*", |t| {
|
let element_content_handlers = vec![text!("*", |t| {
|
||||||
text += t.as_str();
|
text += t.as_str();
|
||||||
Ok(())
|
Ok(())
|
||||||
})];
|
})];
|
||||||
let _ = rewrite_str(
|
let _ = rewrite_str(
|
||||||
html,
|
&html,
|
||||||
RewriteStrSettings {
|
RewriteStrSettings {
|
||||||
element_content_handlers,
|
element_content_handlers,
|
||||||
..RewriteStrSettings::default()
|
..RewriteStrSettings::default()
|
||||||
@ -272,6 +301,7 @@ impl<'c> SlurpContents<'c> {
|
|||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl<'c> Transformer for SlurpContents<'c> {
|
impl<'c> Transformer for SlurpContents<'c> {
|
||||||
fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
|
fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
|
||||||
|
debug!("SlurpContents should_run {link:?}");
|
||||||
let mut will_slurp = false;
|
let mut will_slurp = false;
|
||||||
if let Some(link) = link {
|
if let Some(link) = link {
|
||||||
will_slurp = self.get_selectors(link).is_some();
|
will_slurp = self.get_selectors(link).is_some();
|
||||||
@ -282,6 +312,15 @@ impl<'c> Transformer for SlurpContents<'c> {
|
|||||||
will_slurp
|
will_slurp
|
||||||
}
|
}
|
||||||
async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
async fn transform(&self, link: &Option<Url>, html: &str) -> Result<String, TransformError> {
|
||||||
|
debug!("SlurpContents {link:?}");
|
||||||
|
let retryable_status: HashSet<StatusCode> = vec![
|
||||||
|
StatusCode::UNAUTHORIZED,
|
||||||
|
StatusCode::FORBIDDEN,
|
||||||
|
StatusCode::REQUEST_TIMEOUT,
|
||||||
|
StatusCode::TOO_MANY_REQUESTS,
|
||||||
|
]
|
||||||
|
.into_iter()
|
||||||
|
.collect();
|
||||||
if let Some(test_link) = link {
|
if let Some(test_link) = link {
|
||||||
// If SlurpContents is configured for inline CSS, but no
|
// If SlurpContents is configured for inline CSS, but no
|
||||||
// configuration found for this site, use the local InlineStyle
|
// configuration found for this site, use the local InlineStyle
|
||||||
@ -301,11 +340,18 @@ impl<'c> Transformer for SlurpContents<'c> {
|
|||||||
let body = if let Some(body) = cacher.get(link.as_str()) {
|
let body = if let Some(body) = cacher.get(link.as_str()) {
|
||||||
String::from_utf8_lossy(&body).to_string()
|
String::from_utf8_lossy(&body).to_string()
|
||||||
} else {
|
} else {
|
||||||
let body = reqwest::get(link.as_str())
|
let resp = reqwest::get(link.as_str()).await?;
|
||||||
.await?
|
let status = resp.status();
|
||||||
.error_for_status()?
|
if status.is_server_error() || retryable_status.contains(&status) {
|
||||||
.text()
|
return Err(TransformError::RetryableHttpStatusError(
|
||||||
.await?;
|
status,
|
||||||
|
link.to_string(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
if !status.is_success() {
|
||||||
|
return Ok(html.to_string());
|
||||||
|
}
|
||||||
|
let body = resp.text().await?;
|
||||||
cacher.set(link.as_str(), body.as_bytes());
|
cacher.set(link.as_str(), body.as_bytes());
|
||||||
body
|
body
|
||||||
};
|
};
|
||||||
|
|||||||
@ -258,27 +258,28 @@ pub async fn refresh<'ctx>(pool: &PgPool, cacher: &FilesystemCacher) -> Result<b
|
|||||||
async fn update_search_summary(
|
async fn update_search_summary(
|
||||||
pool: &PgPool,
|
pool: &PgPool,
|
||||||
cacher: &FilesystemCacher,
|
cacher: &FilesystemCacher,
|
||||||
link: Url,
|
link: String,
|
||||||
body: String,
|
body: String,
|
||||||
id: i32,
|
id: i32,
|
||||||
) -> Result<(), ServerError> {
|
) -> Result<(), ServerError> {
|
||||||
let body_transformers: Vec<Box<dyn Transformer>> = vec![
|
let slurp_contents = SlurpContents {
|
||||||
Box::new(SlurpContents {
|
|
||||||
cacher,
|
cacher,
|
||||||
inline_css: true,
|
inline_css: true,
|
||||||
site_selectors: slurp_contents_selectors(),
|
site_selectors: slurp_contents_selectors(),
|
||||||
}),
|
};
|
||||||
Box::new(StripHtml),
|
let strip_html = StripHtml;
|
||||||
];
|
|
||||||
|
|
||||||
info!("adding {link} to search index");
|
info!("adding {link} to search index");
|
||||||
let mut body = body;
|
let mut body = body;
|
||||||
|
if let Ok(link) = Url::parse(&link) {
|
||||||
let link = Some(link);
|
let link = Some(link);
|
||||||
for t in body_transformers.iter() {
|
if slurp_contents.should_run(&link, &body) {
|
||||||
if t.should_run(&link, &body) {
|
body = slurp_contents.transform(&link, &body).await?;
|
||||||
body = t.transform(&link, &body).await?;
|
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
error!("failed to parse link: {}", link);
|
||||||
}
|
}
|
||||||
|
body = strip_html.transform(&None, &body).await?;
|
||||||
sqlx::query!(
|
sqlx::query!(
|
||||||
"UPDATE post SET search_summary = $1 WHERE id = $2",
|
"UPDATE post SET search_summary = $1 WHERE id = $2",
|
||||||
body,
|
body,
|
||||||
@ -294,16 +295,12 @@ pub async fn refresh<'ctx>(pool: &PgPool, cacher: &FilesystemCacher) -> Result<b
|
|||||||
.await?
|
.await?
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.filter_map(|r| {
|
.filter_map(|r| {
|
||||||
let Ok(link) = Url::parse(&r.link) else {
|
|
||||||
error!("failed to parse link: {}", r.link);
|
|
||||||
return None;
|
|
||||||
};
|
|
||||||
let Some(body) = r.clean_summary else {
|
let Some(body) = r.clean_summary else {
|
||||||
error!("clean_summary missing for {}", r.link);
|
error!("clean_summary missing for {}", r.link);
|
||||||
return None;
|
return None;
|
||||||
};
|
};
|
||||||
let id = r.id;
|
let id = r.id;
|
||||||
Some(update_search_summary(pool, cacher, link, body, id))
|
Some(update_search_summary(pool, cacher, r.link, body, id))
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user