diff --git a/server/src/lib.rs b/server/src/lib.rs index b12eaf3..d935d9e 100644 --- a/server/src/lib.rs +++ b/server/src/lib.rs @@ -7,7 +7,13 @@ pub mod nm; #[cfg(feature = "tantivy")] pub mod tantivy; -use std::{collections::HashMap, convert::Infallible, fmt, str::FromStr, sync::Arc}; +use std::{ + collections::{HashMap, HashSet}, + convert::Infallible, + fmt, + str::FromStr, + sync::Arc, +}; use async_trait::async_trait; use cacher::{Cacher, FilesystemCacher}; @@ -20,6 +26,8 @@ use lol_html::{ }; use maplit::{hashmap, hashset}; use regex::Regex; +use reqwest::StatusCode; +use rocket::response::status; use scraper::{Html, Selector}; use sqlx::types::time::PrimitiveDateTime; use thiserror::Error; @@ -58,6 +66,8 @@ pub enum TransformError { ReqwestError(#[from] reqwest::Error), #[error("failed to parse HTML: {0}")] HtmlParsingError(String), + #[error("got a retryable error code {0} for {1}")] + RetryableHttpStatusError(StatusCode, String), } struct SanitizeHtml<'a> { @@ -88,18 +98,37 @@ struct StripHtml; #[async_trait] impl Transformer for StripHtml { - fn should_run(&self, _: &Option, html: &str) -> bool { + fn should_run(&self, link: &Option, html: &str) -> bool { + debug!("StripHtml should_run {link:?} {}", html.contains("<")); // Lame test html.contains("<") } - async fn transform(&self, _: &Option, html: &str) -> Result { + async fn transform(&self, link: &Option, html: &str) -> Result { + debug!("StripHtml {link:?}"); let mut text = String::new(); + let element_content_handlers = vec![ + element!("style", |el| { + el.remove(); + Ok(()) + }), + element!("script", |el| { + el.remove(); + Ok(()) + }), + ]; + let html = rewrite_str( + html, + RewriteStrSettings { + element_content_handlers, + ..RewriteStrSettings::default() + }, + )?; let element_content_handlers = vec![text!("*", |t| { text += t.as_str(); Ok(()) })]; let _ = rewrite_str( - html, + &html, RewriteStrSettings { element_content_handlers, ..RewriteStrSettings::default() @@ -272,6 +301,7 @@ impl<'c> SlurpContents<'c> { #[async_trait] impl<'c> Transformer for SlurpContents<'c> { fn should_run(&self, link: &Option, html: &str) -> bool { + debug!("SlurpContents should_run {link:?}"); let mut will_slurp = false; if let Some(link) = link { will_slurp = self.get_selectors(link).is_some(); @@ -282,6 +312,15 @@ impl<'c> Transformer for SlurpContents<'c> { will_slurp } async fn transform(&self, link: &Option, html: &str) -> Result { + debug!("SlurpContents {link:?}"); + let retryable_status: HashSet = vec![ + StatusCode::UNAUTHORIZED, + StatusCode::FORBIDDEN, + StatusCode::REQUEST_TIMEOUT, + StatusCode::TOO_MANY_REQUESTS, + ] + .into_iter() + .collect(); if let Some(test_link) = link { // If SlurpContents is configured for inline CSS, but no // configuration found for this site, use the local InlineStyle @@ -301,11 +340,18 @@ impl<'c> Transformer for SlurpContents<'c> { let body = if let Some(body) = cacher.get(link.as_str()) { String::from_utf8_lossy(&body).to_string() } else { - let body = reqwest::get(link.as_str()) - .await? - .error_for_status()? - .text() - .await?; + let resp = reqwest::get(link.as_str()).await?; + let status = resp.status(); + if status.is_server_error() || retryable_status.contains(&status) { + return Err(TransformError::RetryableHttpStatusError( + status, + link.to_string(), + )); + } + if !status.is_success() { + return Ok(html.to_string()); + } + let body = resp.text().await?; cacher.set(link.as_str(), body.as_bytes()); body }; diff --git a/server/src/newsreader.rs b/server/src/newsreader.rs index 1e4e7ed..ac1f9bf 100644 --- a/server/src/newsreader.rs +++ b/server/src/newsreader.rs @@ -258,27 +258,28 @@ pub async fn refresh<'ctx>(pool: &PgPool, cacher: &FilesystemCacher) -> Result Result<(), ServerError> { - let body_transformers: Vec> = vec![ - Box::new(SlurpContents { - cacher, - inline_css: true, - site_selectors: slurp_contents_selectors(), - }), - Box::new(StripHtml), - ]; + let slurp_contents = SlurpContents { + cacher, + inline_css: true, + site_selectors: slurp_contents_selectors(), + }; + let strip_html = StripHtml; info!("adding {link} to search index"); let mut body = body; - let link = Some(link); - for t in body_transformers.iter() { - if t.should_run(&link, &body) { - body = t.transform(&link, &body).await?; + if let Ok(link) = Url::parse(&link) { + let link = Some(link); + if slurp_contents.should_run(&link, &body) { + body = slurp_contents.transform(&link, &body).await?; } + } else { + error!("failed to parse link: {}", link); } + body = strip_html.transform(&None, &body).await?; sqlx::query!( "UPDATE post SET search_summary = $1 WHERE id = $2", body, @@ -294,16 +295,12 @@ pub async fn refresh<'ctx>(pool: &PgPool, cacher: &FilesystemCacher) -> Result