server: use fetched contents of news for search index

This commit is contained in:
2025-01-29 14:08:20 -08:00
parent c7aa32b922
commit 12c8e0e33b
12 changed files with 168 additions and 87 deletions

View File

@@ -19,6 +19,7 @@ use lol_html::{
RewriteStrSettings,
};
use maplit::{hashmap, hashset};
use regex::Regex;
use scraper::{Html, Selector};
use sqlx::types::time::PrimitiveDateTime;
use thiserror::Error;
@@ -105,6 +106,8 @@ impl Transformer for StripHtml {
..RewriteStrSettings::default()
},
)?;
let re = Regex::new(r"\s+").expect("failed to parse regex");
let text = re.replace_all(&text, " ").to_string();
Ok(text)
}
@@ -250,13 +253,13 @@ impl Transformer for AddOutlink {
}
}
struct SlurpContents {
cacher: Arc<Mutex<FilesystemCacher>>,
struct SlurpContents<'c> {
cacher: &'c FilesystemCacher,
inline_css: bool,
site_selectors: HashMap<String, Vec<Selector>>,
}
impl SlurpContents {
impl<'c> SlurpContents<'c> {
fn get_selectors(&self, link: &Url) -> Option<&[Selector]> {
for (host, selector) in self.site_selectors.iter() {
if link.host_str().map(|h| h.contains(host)).unwrap_or(false) {
@@ -268,7 +271,7 @@ impl SlurpContents {
}
#[async_trait]
impl Transformer for SlurpContents {
impl<'c> Transformer for SlurpContents<'c> {
fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
let mut will_slurp = false;
if let Some(link) = link {
@@ -294,7 +297,7 @@ impl Transformer for SlurpContents {
let Some(selectors) = self.get_selectors(&link) else {
return Ok(html.to_string());
};
let cacher = self.cacher.lock().await;
let cacher = self.cacher;
let body = if let Some(body) = cacher.get(link.as_str()) {
String::from_utf8_lossy(&body).to_string()
} else {