server: add postgres based newsreader search and disable tantivy

This commit is contained in:
Bill Thiede 2024-12-17 09:31:51 -08:00
parent e36f4f97f9
commit 13eaf33b1a
9 changed files with 86 additions and 32 deletions

View File

@ -31,7 +31,7 @@ serde = { version = "1.0.147", features = ["derive"] }
serde_json = "1.0.87"
shared = { path = "../shared" }
sqlx = { version = "0.7.4", features = ["postgres", "runtime-tokio", "time"] }
tantivy = "0.22.0"
tantivy = { version = "0.22.0", optional = true }
thiserror = "1.0.37"
tokio = "1.26.0"
tracing = "0.1.41"
@ -42,3 +42,7 @@ xtracing = { git = "http://git-private.h.xinu.tv/wathiede/xtracing.git" }
[build-dependencies]
build-info-build = "0.0.38"
[features]
default = [ "tantivy" ]
tantivy = [ "dep:tantivy" ]

View File

@ -1 +1,3 @@
CREATE INDEX post_summary_idx ON post USING GIN (to_tsvector('english', summary));
CREATE INDEX post_site_idx ON post USING GIN (to_tsvector('english', site));
CREATE INDEX post_title_idx ON post USING GIN (to_tsvector('english', title));

View File

@ -11,3 +11,7 @@ WHERE
NOT $2
OR NOT is_read
)
AND (
$3 :: text IS NULL
OR to_tsvector('english', summary) @@ websearch_to_tsquery('english', $3)
)

View File

@ -14,6 +14,10 @@ WHERE
NOT $2
OR NOT is_read
)
AND (
$5 :: text IS NULL
OR to_tsvector('english', summary) @@ websearch_to_tsquery('english', $5)
)
ORDER BY
date DESC,
title OFFSET $3

View File

@ -17,12 +17,13 @@ use rocket::{
Response, State,
};
use rocket_cors::{AllowedHeaders, AllowedOrigins};
#[cfg(feature = "tantivy")]
use server::tantivy::TantivyConnection;
use server::{
config::Config,
error::ServerError,
graphql::{Attachment, GraphqlSchema, Mutation, QueryRoot},
nm::{attachment_bytes, cid_attachment_bytes},
tantivy::TantivyConnection,
};
use sqlx::postgres::PgPool;
@ -216,15 +217,18 @@ async fn main() -> Result<(), Box<dyn Error>> {
}
let pool = PgPool::connect(&config.newsreader_database_url).await?;
sqlx::migrate!("./migrations").run(&pool).await?;
#[cfg(feature = "tantivy")]
let tantivy_conn = TantivyConnection::new(&config.newsreader_tantivy_db_path)?;
let schema = Schema::build(QueryRoot, Mutation, EmptySubscription)
.data(Notmuch::default())
.data(config)
.data(pool.clone())
.data(tantivy_conn)
.extension(async_graphql::extensions::Logger)
.finish();
.data(pool.clone());
#[cfg(feature = "tantivy")]
let schema = schema.data(tantivy_conn);
let schema = schema.extension(async_graphql::extensions::Logger).finish();
let rkt = rkt.manage(schema).manage(pool).manage(Notmuch::default());
//.manage(Notmuch::with_config("../notmuch/testdata/notmuch.config"))

View File

@ -1,8 +1,8 @@
use std::{convert::Infallible, str::Utf8Error, string::FromUtf8Error};
use mailparse::MailParseError;
use tantivy::TantivyError;
use tantivy::query::QueryParserError;
#[cfg(feature = "tantivy")]
use tantivy::{query::QueryParserError, TantivyError};
use thiserror::Error;
use crate::TransformError;
@ -31,8 +31,10 @@ pub enum ServerError {
StringError(String),
#[error("invalid url: {0}")]
UrlParseError(#[from] url::ParseError),
#[cfg(feature = "tantivy")]
#[error("tantivy error: {0}")]
TantivyError(#[from] TantivyError),
#[cfg(feature = "tantivy")]
#[error("tantivy query parse error: {0}")]
QueryParseError(#[from] QueryParserError),
#[error("impossible: {0}")]

View File

@ -12,7 +12,9 @@ use sqlx::postgres::PgPool;
use tokio::join;
use tracing::instrument;
use crate::{config::Config, newsreader, nm, tantivy::TantivyConnection, Query};
#[cfg(feature = "tantivy")]
use crate::tantivy::TantivyConnection;
use crate::{config::Config, newsreader, nm, Query};
/// # Number of seconds since the Epoch
pub type UnixTime = isize;
@ -275,13 +277,18 @@ impl QueryRoot {
async fn count<'ctx>(&self, ctx: &Context<'ctx>, query: String) -> Result<usize, Error> {
let nm = ctx.data_unchecked::<Notmuch>();
let pool = ctx.data_unchecked::<PgPool>();
#[cfg(feature = "tantivy")]
let tantivy = ctx.data_unchecked::<TantivyConnection>();
let newsreader_query: Query = query.parse()?;
let newsreader_count = newsreader::count(pool, &newsreader_query).await?;
let notmuch_count = nm::count(nm, &newsreader_query).await?;
#[cfg(feature = "tantivy")]
let tantivy_count = tantivy.count(&newsreader_query).await?;
#[cfg(not(feature = "tantivy"))]
let tantivy_count = 0;
let total = newsreader_count + notmuch_count + tantivy_count;
info!("count {newsreader_query:?} newsreader count {newsreader_count} notmuch count {notmuch_count} tantivy count {tantivy_count} total {total}");
Ok(total)
@ -302,6 +309,7 @@ impl QueryRoot {
info!("search({after:?} {before:?} {first:?} {last:?} {query:?})",);
let nm = ctx.data_unchecked::<Notmuch>();
let pool = ctx.data_unchecked::<PgPool>();
#[cfg(feature = "tantivy")]
let tantivy = ctx.data_unchecked::<TantivyConnection>();
Ok(connection::query(
@ -341,6 +349,7 @@ impl QueryRoot {
);
let notmuch_fut =
notmuch_search(nm, notmuch_after, notmuch_before, first, last, &query);
#[cfg(feature = "tantivy")]
let tantivy_fut = tantivy_search(
tantivy,
pool,
@ -350,6 +359,10 @@ impl QueryRoot {
last,
&query,
);
#[cfg(not(feature = "tantivy"))]
let tantivy_fut =
async { Ok::<Vec<ThreadSummaryCursor>, async_graphql::Error>(Vec::new()) };
let (newsreader_results, notmuch_results, tantivy_results) =
join!(newsreader_fut, notmuch_fut, tantivy_fut);
@ -492,6 +505,7 @@ async fn notmuch_search(
.collect())
}
#[cfg(feature = "tantivy")]
async fn tantivy_search(
tantivy: &TantivyConnection,
pool: &PgPool,
@ -521,10 +535,12 @@ impl Mutation {
) -> Result<bool, Error> {
let nm = ctx.data_unchecked::<Notmuch>();
let pool = ctx.data_unchecked::<PgPool>();
#[cfg(feature = "tantivy")]
let tantivy = ctx.data_unchecked::<TantivyConnection>();
let query: Query = query.parse()?;
newsreader::set_read_status(pool, &query, unread).await?;
#[cfg(feature = "tantivy")]
tantivy.reindex_thread(pool, &query).await?;
nm::set_read_status(nm, &query, unread).await?;
Ok(true)
@ -554,6 +570,7 @@ impl Mutation {
Ok(true)
}
/// Drop and recreate tantivy index. Warning this is slow
#[cfg(feature = "tantivy")]
async fn drop_and_load_index<'ctx>(&self, ctx: &Context<'ctx>) -> Result<bool, Error> {
let tantivy = ctx.data_unchecked::<TantivyConnection>();
let pool = ctx.data_unchecked::<PgPool>();
@ -566,11 +583,14 @@ impl Mutation {
#[instrument(skip_all)]
async fn refresh<'ctx>(&self, ctx: &Context<'ctx>) -> Result<bool, Error> {
let nm = ctx.data_unchecked::<Notmuch>();
#[cfg(feature = "tantivy")]
{
let tantivy = ctx.data_unchecked::<TantivyConnection>();
let pool = ctx.data_unchecked::<PgPool>();
// TODO: parallelize
info!("{}", String::from_utf8_lossy(&nm.new()?));
tantivy.refresh(pool).await?;
}
Ok(true)
}
}

View File

@ -3,6 +3,7 @@ pub mod error;
pub mod graphql;
pub mod newsreader;
pub mod nm;
#[cfg(feature = "tantivy")]
pub mod tantivy;
use std::{collections::HashMap, convert::Infallible, fmt, str::FromStr, sync::Arc};
@ -612,6 +613,7 @@ pub struct Query {
pub uids: Vec<String>,
pub remainder: Vec<String>,
pub is_notmuch: bool,
pub is_newsreader: bool,
pub is_tantivy: bool,
pub corpus: Option<Corpus>,
}
@ -630,6 +632,9 @@ impl fmt::Display for Query {
if self.is_notmuch {
write!(f, "is:mail ")?;
}
if self.is_newsreader {
write!(f, "is:newsreader ")?;
}
if self.is_tantivy {
write!(f, "is:news ")?;
}
@ -675,6 +680,7 @@ impl FromStr for Query {
let mut uids = Vec::new();
let mut remainder = Vec::new();
let mut is_notmuch = false;
let mut is_newsreader = false;
let mut is_tantivy = false;
let mut corpus = None;
for word in s.split_whitespace() {
@ -701,15 +707,18 @@ impl FromStr for Query {
uids.push(word.to_string());
} else if word == "is:mail" || word == "is:email" || word == "is:notmuch" {
is_notmuch = true;
} else if word == "is:news" || word == "is:newsreader" {
} else if word == "is:news" {
is_tantivy = true;
} else if word == "is:newsreader" {
is_newsreader = true;
} else {
remainder.push(word.to_string());
}
}
// If we don't see any explicit filters for a corpus, flip them all on
if corpus.is_none() && !(is_notmuch || is_tantivy) {
if corpus.is_none() && !(is_notmuch || is_tantivy || is_newsreader) {
is_notmuch = true;
is_newsreader = true;
is_tantivy = true;
}
Ok(Query {
@ -718,6 +727,7 @@ impl FromStr for Query {
uids,
remainder,
is_notmuch,
is_newsreader,
is_tantivy,
corpus,
})

View File

@ -21,7 +21,7 @@ use crate::{
};
pub fn is_newsreader_query(query: &Query) -> bool {
query.corpus == Some(Corpus::Newsreader)
query.is_newsreader || query.corpus == Some(Corpus::Newsreader)
}
pub fn is_newsreader_thread(query: &str) -> bool {
@ -58,12 +58,6 @@ pub async fn count(pool: &PgPool, query: &Query) -> Result<usize, ServerError> {
if !is_newsreader_query(query) {
return Ok(0);
}
if !query.remainder.is_empty() {
// TODO: handle full text search against all sites, for now, early return if search words
// are specified.
return Ok(0);
}
let site = site_from_tags(&query.tags);
if !query.tags.is_empty() && site.is_none() {
// Newsreader can only handle all sites read/unread queries, anything with a non-site tag
@ -71,7 +65,15 @@ pub async fn count(pool: &PgPool, query: &Query) -> Result<usize, ServerError> {
return Ok(0);
}
let row = sqlx::query_file!("sql/count.sql", site, query.unread_only)
let search_term = query.remainder.join(" ");
let search_term = search_term.trim();
let search_term = if search_term.is_empty() {
None
} else {
Some(search_term)
};
// TODO: add support for looking for search_term in title and site
let row = sqlx::query_file!("sql/count.sql", site, query.unread_only, search_term)
.fetch_one(pool)
.await?;
Ok(row.count.unwrap_or(0).try_into().unwrap_or(0))
@ -90,12 +92,6 @@ pub async fn search(
if !is_newsreader_query(query) {
return Ok(Vec::new());
}
if !query.remainder.is_empty() {
// TODO: handle full text search against all sites, for now, early return if search words
// are specified.
return Ok(Vec::new());
}
let site = site_from_tags(&query.tags);
if !query.tags.is_empty() && site.is_none() {
// Newsreader can only handle all sites read/unread queries, anything with a non-site tag
@ -115,14 +111,22 @@ pub async fn search(
"search offset {offset} limit {limit} site {site:?} unread_only {}",
query.unread_only
);
let search_term = query.remainder.join(" ");
let search_term = search_term.trim();
let search_term = if search_term.is_empty() {
None
} else {
Some(search_term)
};
// TODO: further limit results to include query.remainder if set
// TODO: add support for looking for search_term in title and site
let rows = sqlx::query_file!(
"sql/threads.sql",
site,
query.unread_only,
offset as i64,
limit as i64
limit as i64,
search_term
)
.fetch_all(pool)
.await?;