web & server: using tantivy for news post search

This commit is contained in:
2024-09-29 16:28:05 -07:00
parent f36d1e0c29
commit 3ec1741f10
22 changed files with 737 additions and 170 deletions

View File

@@ -18,15 +18,16 @@ use lol_html::{
};
use maplit::{hashmap, hashset};
use scraper::{Html, Selector};
use sqlx::{postgres::PgPool, types::time::PrimitiveDateTime};
use sqlx::types::time::PrimitiveDateTime;
use thiserror::Error;
use tokio::sync::Mutex;
use url::Url;
use crate::{
error::ServerError,
graphql::ThreadSummary,
newsreader::{extract_thread_id, is_newsreader_thread},
graphql::{Corpus, ThreadSummary},
newsreader::is_newsreader_thread,
nm::is_notmuch_thread_or_id,
};
const NEWSREADER_TAG_PREFIX: &'static str = "News/";
@@ -607,12 +608,13 @@ fn compute_offset_limit(
#[derive(Debug)]
pub struct Query {
pub unread_only: bool,
pub tag: Option<String>,
pub uid: Option<String>,
pub tags: Vec<String>,
pub uids: Vec<String>,
pub remainder: Vec<String>,
pub is_notmuch: bool,
pub is_newsreader: bool,
pub is_tantivy: bool,
pub corpus: Option<Corpus>,
}
impl Query {
@@ -627,10 +629,10 @@ impl Query {
if self.unread_only {
parts.push("is:unread".to_string());
}
if let Some(site) = &self.tag {
parts.push(format!("tag:{site}"));
for tag in &self.tags {
parts.push(format!("tag:{tag}"));
}
if let Some(uid) = &self.uid {
for uid in &self.uids {
parts.push(uid.clone());
}
parts.extend(self.remainder.clone());
@@ -642,48 +644,60 @@ impl FromStr for Query {
type Err = Infallible;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let mut unread_only = false;
let mut tag = None;
let mut uid = None;
let mut tags = Vec::new();
let mut uids = Vec::new();
let mut remainder = Vec::new();
let mut is_notmuch = false;
let mut is_newsreader = false;
let is_newsreader = false;
let mut is_tantivy = false;
let mut corpus = None;
for word in s.split_whitespace() {
if word == "is:unread" {
unread_only = true
} else if word.starts_with("tag:") {
tag = Some(word["tag:".len()..].to_string())
tags.push(word["tag:".len()..].to_string());
/*
} else if word.starts_with("tag:") {
// Any tag that doesn't match site_prefix should explicitly set the site to something not in the
// database
site = Some(NON_EXISTENT_SITE_NAME.to_string());
*/
} else if word.starts_with("corpus:") {
let c = word["corpus:".len()..].to_string();
corpus = c.parse::<Corpus>().map(|c| Some(c)).unwrap_or_else(|e| {
warn!("Error parsing corpus '{c}': {e:?}");
None
});
} else if is_newsreader_thread(word) {
uid = Some(extract_thread_id(word).to_string())
uids.push(word.to_string());
} else if is_notmuch_thread_or_id(word) {
uids.push(word.to_string());
} else if word == "is:mail" || word == "is:email" || word == "is:notmuch" {
is_notmuch = true;
} else if word == "is:news" || word == "is:newsreader" {
is_newsreader = true;
is_tantivy = true;
} else {
remainder.push(word.to_string());
}
}
// If we don't see any explicit filters for a corpus, flip them all on
if !(is_notmuch || is_newsreader) {
is_newsreader = true;
if corpus.is_none() && !(is_newsreader || is_notmuch || is_tantivy) {
// Don't set is_newsreader unless debugging, assume tantivy can handle it.
// Explicitly setting corpus:newsreader will bypass this logic
// is_newsreader = true;
is_notmuch = true;
is_tantivy = true;
}
// TODO: decide if tantivy gets its own life or replaces newsreader
is_tantivy = is_newsreader;
Ok(Query {
unread_only,
tag,
uid,
tags,
uids,
remainder,
is_notmuch,
is_newsreader,
is_tantivy,
corpus,
})
}
}
@@ -694,6 +708,7 @@ pub struct ThreadSummaryRecord {
pub title: Option<String>,
pub uid: String,
pub name: Option<String>,
pub corpus: Corpus,
}
async fn thread_summary_from_row(r: ThreadSummaryRecord) -> ThreadSummary {
@@ -711,12 +726,14 @@ async fn thread_summary_from_row(r: ThreadSummaryRecord) -> ThreadSummary {
.expect("post missing date")
.assume_utc()
.unix_timestamp() as isize,
date_relative: "TODO date_relative".to_string(),
date_relative: format!("{:?}", r.date),
//date_relative: "TODO date_relative".to_string(),
matched: 0,
total: 1,
authors: r.name.unwrap_or_else(|| site.clone()),
subject: title,
tags,
corpus: r.corpus,
}
}
async fn clean_title(title: &str) -> Result<String, ServerError> {