server: strip style & script tags; also handle some retryable errors on slurp

This commit is contained in:
2025-01-30 13:52:22 -08:00
parent 795029cb06
commit eecc4bc3ef
2 changed files with 70 additions and 27 deletions

View File

@@ -258,27 +258,28 @@ pub async fn refresh<'ctx>(pool: &PgPool, cacher: &FilesystemCacher) -> Result<b
async fn update_search_summary(
pool: &PgPool,
cacher: &FilesystemCacher,
link: Url,
link: String,
body: String,
id: i32,
) -> Result<(), ServerError> {
let body_transformers: Vec<Box<dyn Transformer>> = vec![
Box::new(SlurpContents {
cacher,
inline_css: true,
site_selectors: slurp_contents_selectors(),
}),
Box::new(StripHtml),
];
let slurp_contents = SlurpContents {
cacher,
inline_css: true,
site_selectors: slurp_contents_selectors(),
};
let strip_html = StripHtml;
info!("adding {link} to search index");
let mut body = body;
let link = Some(link);
for t in body_transformers.iter() {
if t.should_run(&link, &body) {
body = t.transform(&link, &body).await?;
if let Ok(link) = Url::parse(&link) {
let link = Some(link);
if slurp_contents.should_run(&link, &body) {
body = slurp_contents.transform(&link, &body).await?;
}
} else {
error!("failed to parse link: {}", link);
}
body = strip_html.transform(&None, &body).await?;
sqlx::query!(
"UPDATE post SET search_summary = $1 WHERE id = $2",
body,
@@ -294,16 +295,12 @@ pub async fn refresh<'ctx>(pool: &PgPool, cacher: &FilesystemCacher) -> Result<b
.await?
.into_iter()
.filter_map(|r| {
let Ok(link) = Url::parse(&r.link) else {
error!("failed to parse link: {}", r.link);
return None;
};
let Some(body) = r.clean_summary else {
error!("clean_summary missing for {}", r.link);
return None;
};
let id = r.id;
Some(update_search_summary(pool, cacher, link, body, id))
Some(update_search_summary(pool, cacher, r.link, body, id))
})
.collect();