server: use fetched contents of news for search index

2025-01-29 14:08:20 -08:00
parent c7aa32b922
commit 12c8e0e33b
12 changed files with 168 additions and 87 deletions
--- a/server/src/lib.rs
+++ b/server/src/lib.rs
@@ -19,6 +19,7 @@ use lol_html::{
    RewriteStrSettings,
 };
 use maplit::{hashmap, hashset};
+use regex::Regex;
 use scraper::{Html, Selector};
 use sqlx::types::time::PrimitiveDateTime;
 use thiserror::Error;
@@ -105,6 +106,8 @@ impl Transformer for StripHtml {
                ..RewriteStrSettings::default()
            },
        )?;
+        let re = Regex::new(r"\s+").expect("failed to parse regex");
+        let text = re.replace_all(&text, " ").to_string();

        Ok(text)
    }
@@ -250,13 +253,13 @@ impl Transformer for AddOutlink {
    }
 }

-struct SlurpContents {
-    cacher: Arc<Mutex<FilesystemCacher>>,
+struct SlurpContents<'c> {
+    cacher: &'c FilesystemCacher,
    inline_css: bool,
    site_selectors: HashMap<String, Vec<Selector>>,
 }

-impl SlurpContents {
+impl<'c> SlurpContents<'c> {
    fn get_selectors(&self, link: &Url) -> Option<&[Selector]> {
        for (host, selector) in self.site_selectors.iter() {
            if link.host_str().map(|h| h.contains(host)).unwrap_or(false) {
@@ -268,7 +271,7 @@ impl SlurpContents {
 }

 #[async_trait]
-impl Transformer for SlurpContents {
+impl<'c> Transformer for SlurpContents<'c> {
    fn should_run(&self, link: &Option<Url>, html: &str) -> bool {
        let mut will_slurp = false;
        if let Some(link) = link {
@@ -294,7 +297,7 @@ impl Transformer for SlurpContents {
        let Some(selectors) = self.get_selectors(&link) else {
            return Ok(html.to_string());
        };
-        let cacher = self.cacher.lock().await;
+        let cacher = self.cacher;
        let body = if let Some(body) = cacher.get(link.as_str()) {
            String::from_utf8_lossy(&body).to_string()
        } else {