From 13eaf33b1aa604421b218d1f6673e1d380ed62f8 Mon Sep 17 00:00:00 2001 From: Bill Thiede Date: Tue, 17 Dec 2024 09:31:51 -0800 Subject: [PATCH] server: add postgres based newsreader search and disable tantivy --- server/Cargo.toml | 6 +++- .../20241217021645_create-search-table.up.sql | 2 ++ server/sql/count.sql | 4 +++ server/sql/threads.sql | 4 +++ server/src/bin/server.rs | 14 +++++--- server/src/error.rs | 6 ++-- server/src/graphql.rs | 32 +++++++++++++---- server/src/lib.rs | 14 ++++++-- server/src/newsreader.rs | 36 ++++++++++--------- 9 files changed, 86 insertions(+), 32 deletions(-) diff --git a/server/Cargo.toml b/server/Cargo.toml index 5cab460..7fae15a 100644 --- a/server/Cargo.toml +++ b/server/Cargo.toml @@ -31,7 +31,7 @@ serde = { version = "1.0.147", features = ["derive"] } serde_json = "1.0.87" shared = { path = "../shared" } sqlx = { version = "0.7.4", features = ["postgres", "runtime-tokio", "time"] } -tantivy = "0.22.0" +tantivy = { version = "0.22.0", optional = true } thiserror = "1.0.37" tokio = "1.26.0" tracing = "0.1.41" @@ -42,3 +42,7 @@ xtracing = { git = "http://git-private.h.xinu.tv/wathiede/xtracing.git" } [build-dependencies] build-info-build = "0.0.38" + +[features] +default = [ "tantivy" ] +tantivy = [ "dep:tantivy" ] diff --git a/server/migrations/20241217021645_create-search-table.up.sql b/server/migrations/20241217021645_create-search-table.up.sql index 27a85a1..35f3614 100644 --- a/server/migrations/20241217021645_create-search-table.up.sql +++ b/server/migrations/20241217021645_create-search-table.up.sql @@ -1 +1,3 @@ CREATE INDEX post_summary_idx ON post USING GIN (to_tsvector('english', summary)); +CREATE INDEX post_site_idx ON post USING GIN (to_tsvector('english', site)); +CREATE INDEX post_title_idx ON post USING GIN (to_tsvector('english', title)); diff --git a/server/sql/count.sql b/server/sql/count.sql index 4801fdb..e527285 100644 --- a/server/sql/count.sql +++ b/server/sql/count.sql @@ -11,3 +11,7 @@ WHERE NOT $2 OR NOT is_read ) + AND ( + $3 :: text IS NULL + OR to_tsvector('english', summary) @@ websearch_to_tsquery('english', $3) + ) diff --git a/server/sql/threads.sql b/server/sql/threads.sql index 6d2da8f..6c80f50 100644 --- a/server/sql/threads.sql +++ b/server/sql/threads.sql @@ -14,6 +14,10 @@ WHERE NOT $2 OR NOT is_read ) + AND ( + $5 :: text IS NULL + OR to_tsvector('english', summary) @@ websearch_to_tsquery('english', $5) + ) ORDER BY date DESC, title OFFSET $3 diff --git a/server/src/bin/server.rs b/server/src/bin/server.rs index eb7f330..f9fc168 100644 --- a/server/src/bin/server.rs +++ b/server/src/bin/server.rs @@ -17,12 +17,13 @@ use rocket::{ Response, State, }; use rocket_cors::{AllowedHeaders, AllowedOrigins}; +#[cfg(feature = "tantivy")] +use server::tantivy::TantivyConnection; use server::{ config::Config, error::ServerError, graphql::{Attachment, GraphqlSchema, Mutation, QueryRoot}, nm::{attachment_bytes, cid_attachment_bytes}, - tantivy::TantivyConnection, }; use sqlx::postgres::PgPool; @@ -216,15 +217,18 @@ async fn main() -> Result<(), Box> { } let pool = PgPool::connect(&config.newsreader_database_url).await?; sqlx::migrate!("./migrations").run(&pool).await?; + #[cfg(feature = "tantivy")] let tantivy_conn = TantivyConnection::new(&config.newsreader_tantivy_db_path)?; let schema = Schema::build(QueryRoot, Mutation, EmptySubscription) .data(Notmuch::default()) .data(config) - .data(pool.clone()) - .data(tantivy_conn) - .extension(async_graphql::extensions::Logger) - .finish(); + .data(pool.clone()); + + #[cfg(feature = "tantivy")] + let schema = schema.data(tantivy_conn); + + let schema = schema.extension(async_graphql::extensions::Logger).finish(); let rkt = rkt.manage(schema).manage(pool).manage(Notmuch::default()); //.manage(Notmuch::with_config("../notmuch/testdata/notmuch.config")) diff --git a/server/src/error.rs b/server/src/error.rs index f07142c..b2dcd46 100644 --- a/server/src/error.rs +++ b/server/src/error.rs @@ -1,8 +1,8 @@ use std::{convert::Infallible, str::Utf8Error, string::FromUtf8Error}; use mailparse::MailParseError; -use tantivy::TantivyError; -use tantivy::query::QueryParserError; +#[cfg(feature = "tantivy")] +use tantivy::{query::QueryParserError, TantivyError}; use thiserror::Error; use crate::TransformError; @@ -31,8 +31,10 @@ pub enum ServerError { StringError(String), #[error("invalid url: {0}")] UrlParseError(#[from] url::ParseError), + #[cfg(feature = "tantivy")] #[error("tantivy error: {0}")] TantivyError(#[from] TantivyError), + #[cfg(feature = "tantivy")] #[error("tantivy query parse error: {0}")] QueryParseError(#[from] QueryParserError), #[error("impossible: {0}")] diff --git a/server/src/graphql.rs b/server/src/graphql.rs index 5c84e12..0e4bd60 100644 --- a/server/src/graphql.rs +++ b/server/src/graphql.rs @@ -12,7 +12,9 @@ use sqlx::postgres::PgPool; use tokio::join; use tracing::instrument; -use crate::{config::Config, newsreader, nm, tantivy::TantivyConnection, Query}; +#[cfg(feature = "tantivy")] +use crate::tantivy::TantivyConnection; +use crate::{config::Config, newsreader, nm, Query}; /// # Number of seconds since the Epoch pub type UnixTime = isize; @@ -275,13 +277,18 @@ impl QueryRoot { async fn count<'ctx>(&self, ctx: &Context<'ctx>, query: String) -> Result { let nm = ctx.data_unchecked::(); let pool = ctx.data_unchecked::(); + #[cfg(feature = "tantivy")] let tantivy = ctx.data_unchecked::(); let newsreader_query: Query = query.parse()?; let newsreader_count = newsreader::count(pool, &newsreader_query).await?; let notmuch_count = nm::count(nm, &newsreader_query).await?; + #[cfg(feature = "tantivy")] let tantivy_count = tantivy.count(&newsreader_query).await?; + #[cfg(not(feature = "tantivy"))] + let tantivy_count = 0; + let total = newsreader_count + notmuch_count + tantivy_count; info!("count {newsreader_query:?} newsreader count {newsreader_count} notmuch count {notmuch_count} tantivy count {tantivy_count} total {total}"); Ok(total) @@ -302,6 +309,7 @@ impl QueryRoot { info!("search({after:?} {before:?} {first:?} {last:?} {query:?})",); let nm = ctx.data_unchecked::(); let pool = ctx.data_unchecked::(); + #[cfg(feature = "tantivy")] let tantivy = ctx.data_unchecked::(); Ok(connection::query( @@ -341,6 +349,7 @@ impl QueryRoot { ); let notmuch_fut = notmuch_search(nm, notmuch_after, notmuch_before, first, last, &query); + #[cfg(feature = "tantivy")] let tantivy_fut = tantivy_search( tantivy, pool, @@ -350,6 +359,10 @@ impl QueryRoot { last, &query, ); + #[cfg(not(feature = "tantivy"))] + let tantivy_fut = + async { Ok::, async_graphql::Error>(Vec::new()) }; + let (newsreader_results, notmuch_results, tantivy_results) = join!(newsreader_fut, notmuch_fut, tantivy_fut); @@ -492,6 +505,7 @@ async fn notmuch_search( .collect()) } +#[cfg(feature = "tantivy")] async fn tantivy_search( tantivy: &TantivyConnection, pool: &PgPool, @@ -521,10 +535,12 @@ impl Mutation { ) -> Result { let nm = ctx.data_unchecked::(); let pool = ctx.data_unchecked::(); + #[cfg(feature = "tantivy")] let tantivy = ctx.data_unchecked::(); let query: Query = query.parse()?; newsreader::set_read_status(pool, &query, unread).await?; + #[cfg(feature = "tantivy")] tantivy.reindex_thread(pool, &query).await?; nm::set_read_status(nm, &query, unread).await?; Ok(true) @@ -554,6 +570,7 @@ impl Mutation { Ok(true) } /// Drop and recreate tantivy index. Warning this is slow + #[cfg(feature = "tantivy")] async fn drop_and_load_index<'ctx>(&self, ctx: &Context<'ctx>) -> Result { let tantivy = ctx.data_unchecked::(); let pool = ctx.data_unchecked::(); @@ -566,11 +583,14 @@ impl Mutation { #[instrument(skip_all)] async fn refresh<'ctx>(&self, ctx: &Context<'ctx>) -> Result { let nm = ctx.data_unchecked::(); - let tantivy = ctx.data_unchecked::(); - let pool = ctx.data_unchecked::(); - // TODO: parallelize - info!("{}", String::from_utf8_lossy(&nm.new()?)); - tantivy.refresh(pool).await?; + #[cfg(feature = "tantivy")] + { + let tantivy = ctx.data_unchecked::(); + let pool = ctx.data_unchecked::(); + // TODO: parallelize + info!("{}", String::from_utf8_lossy(&nm.new()?)); + tantivy.refresh(pool).await?; + } Ok(true) } } diff --git a/server/src/lib.rs b/server/src/lib.rs index b019c06..4c4056e 100644 --- a/server/src/lib.rs +++ b/server/src/lib.rs @@ -3,6 +3,7 @@ pub mod error; pub mod graphql; pub mod newsreader; pub mod nm; +#[cfg(feature = "tantivy")] pub mod tantivy; use std::{collections::HashMap, convert::Infallible, fmt, str::FromStr, sync::Arc}; @@ -612,6 +613,7 @@ pub struct Query { pub uids: Vec, pub remainder: Vec, pub is_notmuch: bool, + pub is_newsreader: bool, pub is_tantivy: bool, pub corpus: Option, } @@ -630,6 +632,9 @@ impl fmt::Display for Query { if self.is_notmuch { write!(f, "is:mail ")?; } + if self.is_newsreader { + write!(f, "is:newsreader ")?; + } if self.is_tantivy { write!(f, "is:news ")?; } @@ -675,6 +680,7 @@ impl FromStr for Query { let mut uids = Vec::new(); let mut remainder = Vec::new(); let mut is_notmuch = false; + let mut is_newsreader = false; let mut is_tantivy = false; let mut corpus = None; for word in s.split_whitespace() { @@ -701,15 +707,18 @@ impl FromStr for Query { uids.push(word.to_string()); } else if word == "is:mail" || word == "is:email" || word == "is:notmuch" { is_notmuch = true; - } else if word == "is:news" || word == "is:newsreader" { + } else if word == "is:news" { is_tantivy = true; + } else if word == "is:newsreader" { + is_newsreader = true; } else { remainder.push(word.to_string()); } } // If we don't see any explicit filters for a corpus, flip them all on - if corpus.is_none() && !(is_notmuch || is_tantivy) { + if corpus.is_none() && !(is_notmuch || is_tantivy || is_newsreader) { is_notmuch = true; + is_newsreader = true; is_tantivy = true; } Ok(Query { @@ -718,6 +727,7 @@ impl FromStr for Query { uids, remainder, is_notmuch, + is_newsreader, is_tantivy, corpus, }) diff --git a/server/src/newsreader.rs b/server/src/newsreader.rs index 92a7b70..de09fc0 100644 --- a/server/src/newsreader.rs +++ b/server/src/newsreader.rs @@ -21,7 +21,7 @@ use crate::{ }; pub fn is_newsreader_query(query: &Query) -> bool { - query.corpus == Some(Corpus::Newsreader) + query.is_newsreader || query.corpus == Some(Corpus::Newsreader) } pub fn is_newsreader_thread(query: &str) -> bool { @@ -58,12 +58,6 @@ pub async fn count(pool: &PgPool, query: &Query) -> Result { if !is_newsreader_query(query) { return Ok(0); } - if !query.remainder.is_empty() { - // TODO: handle full text search against all sites, for now, early return if search words - // are specified. - return Ok(0); - } - let site = site_from_tags(&query.tags); if !query.tags.is_empty() && site.is_none() { // Newsreader can only handle all sites read/unread queries, anything with a non-site tag @@ -71,7 +65,15 @@ pub async fn count(pool: &PgPool, query: &Query) -> Result { return Ok(0); } - let row = sqlx::query_file!("sql/count.sql", site, query.unread_only) + let search_term = query.remainder.join(" "); + let search_term = search_term.trim(); + let search_term = if search_term.is_empty() { + None + } else { + Some(search_term) + }; + // TODO: add support for looking for search_term in title and site + let row = sqlx::query_file!("sql/count.sql", site, query.unread_only, search_term) .fetch_one(pool) .await?; Ok(row.count.unwrap_or(0).try_into().unwrap_or(0)) @@ -90,12 +92,6 @@ pub async fn search( if !is_newsreader_query(query) { return Ok(Vec::new()); } - if !query.remainder.is_empty() { - // TODO: handle full text search against all sites, for now, early return if search words - // are specified. - return Ok(Vec::new()); - } - let site = site_from_tags(&query.tags); if !query.tags.is_empty() && site.is_none() { // Newsreader can only handle all sites read/unread queries, anything with a non-site tag @@ -115,14 +111,22 @@ pub async fn search( "search offset {offset} limit {limit} site {site:?} unread_only {}", query.unread_only ); + let search_term = query.remainder.join(" "); + let search_term = search_term.trim(); + let search_term = if search_term.is_empty() { + None + } else { + Some(search_term) + }; - // TODO: further limit results to include query.remainder if set + // TODO: add support for looking for search_term in title and site let rows = sqlx::query_file!( "sql/threads.sql", site, query.unread_only, offset as i64, - limit as i64 + limit as i64, + search_term ) .fetch_all(pool) .await?;