web & server: using tantivy for news post search

This commit is contained in:
2024-09-29 16:28:05 -07:00
parent f36d1e0c29
commit 3ec1741f10
22 changed files with 737 additions and 170 deletions

View File

@@ -1,3 +1,5 @@
use std::str::FromStr;
use async_graphql::{
connection::{self, Connection, Edge, OpaqueCursor},
Context, EmptySubscription, Enum, Error, FieldResult, InputObject, Object, Schema,
@@ -16,6 +18,26 @@ pub type UnixTime = isize;
/// # Thread ID, sans "thread:"
pub type ThreadId = String;
// One of the three search backends this schema knows about.
// Values are `Copy` and comparable with `==` (see the derives below), and can be
// parsed from their lowercase names through the `FromStr` impl for this type.
// NOTE(review): presumably records which backend produced a `ThreadSummary`
// (it is the type of that struct's `corpus` field) — confirm against the code
// that populates the field.
#[derive(Debug, Enum, Copy, Clone, Eq, PartialEq)]
pub enum Corpus {
// Parsed from the string "notmuch".
Notmuch,
// Parsed from the string "newsreader".
Newsreader,
// Parsed from the string "tantivy".
Tantivy,
}
impl FromStr for Corpus {
    type Err = String;

    /// Parses the exact, lowercase corpus name into its [`Corpus`] variant.
    ///
    /// # Errors
    /// Returns `unknown corpus: '<input>'` for any string that is not one of
    /// `"notmuch"`, `"newsreader"` or `"tantivy"` (matching is case-sensitive).
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Name -> variant table; the lookup below is an exact string comparison.
        const KNOWN: [(&str, Corpus); 3] = [
            ("notmuch", Corpus::Notmuch),
            ("newsreader", Corpus::Newsreader),
            ("tantivy", Corpus::Tantivy),
        ];

        KNOWN
            .iter()
            .find(|(name, _)| *name == s)
            .map(|&(_, corpus)| corpus)
            .ok_or_else(|| format!("unknown corpus: '{s}'"))
    }
}
// TODO: add is_read field and remove all use of 'tag:unread'
#[derive(Debug, SimpleObject)]
pub struct ThreadSummary {
pub thread: ThreadId,
@@ -30,6 +52,7 @@ pub struct ThreadSummary {
pub authors: String,
pub subject: String,
pub tags: Vec<String>,
pub corpus: Corpus,
}
#[derive(Debug, Union)]
@@ -237,13 +260,16 @@ impl QueryRoot {
async fn count<'ctx>(&self, ctx: &Context<'ctx>, query: String) -> Result<usize, Error> {
let nm = ctx.data_unchecked::<Notmuch>();
let pool = ctx.data_unchecked::<PgPool>();
let tantivy = ctx.data_unchecked::<TantivyConnection>();
let newsreader_query: Query = query.parse()?;
let newsreader_count = newsreader::count(pool, &newsreader_query).await?;
let notmuch_count = nm::count(nm, &newsreader_query.to_notmuch()).await?;
info!("count {newsreader_query:?} newsreader count {newsreader_count} notmuch count {notmuch_count}");
Ok(newsreader_count + notmuch_count)
let notmuch_count = nm::count(nm, &newsreader_query).await?;
let tantivy_count = tantivy.count(&newsreader_query).await?;
let total = newsreader_count + notmuch_count + tantivy_count;
info!("count {newsreader_query:?} newsreader count {newsreader_count} notmuch count {notmuch_count} tantivy count {tantivy_count} total {total}");
Ok(total)
}
async fn search<'ctx>(
@@ -255,18 +281,11 @@ impl QueryRoot {
last: Option<i32>,
query: String,
) -> Result<Connection<OpaqueCursor<SearchCursor>, ThreadSummary>, Error> {
// TODO: add keywords to limit search to one corpus, i.e. is:news or is:mail
info!("search({after:?} {before:?} {first:?} {last:?} {query:?})",);
let nm = ctx.data_unchecked::<Notmuch>();
let pool = ctx.data_unchecked::<PgPool>();
let tantivy = ctx.data_unchecked::<TantivyConnection>();
#[derive(Debug)]
enum ThreadSummaryCursor {
Newsreader(i32, ThreadSummary),
Notmuch(i32, ThreadSummary),
Tantivy(i32, ThreadSummary),
}
Ok(connection::query(
after,
before,
@@ -277,7 +296,7 @@ impl QueryRoot {
first: Option<usize>,
last: Option<usize>| async move {
info!(
"search({:?} {:?} {first:?} {last:?} {query:?})",
"search(after {:?} before {:?} first {first:?} last {last:?} query: {query:?})",
after.as_ref().map(|v| &v.0),
before.as_ref().map(|v| &v.0)
);
@@ -288,65 +307,40 @@ impl QueryRoot {
let newsreader_before = before.as_ref().map(|sc| sc.newsreader_offset);
let notmuch_before = before.as_ref().map(|sc| sc.notmuch_offset);
let tantivy_before = before.as_ref().map(|sc| sc.tantivy_offset);
let first = first.map(|v| v as i32);
let last = last.map(|v| v as i32);
let newsreader_query: Query = query.parse()?;
info!("newsreader_query {newsreader_query:?}");
let newsreader_results = if newsreader_query.is_newsreader {
newsreader::search(
pool,
newsreader_after,
newsreader_before,
first.map(|v| v as i32),
last.map(|v| v as i32),
&newsreader_query,
)
.await?
.into_iter()
.map(|(cur, ts)| ThreadSummaryCursor::Newsreader(cur, ts))
.collect()
} else {
Vec::new()
};
let notmuch_results = if newsreader_query.is_notmuch {
nm::search(
nm,
notmuch_after,
notmuch_before,
first.map(|v| v as i32),
last.map(|v| v as i32),
newsreader_query.to_notmuch(),
)
.await?
.into_iter()
.map(|(cur, ts)| ThreadSummaryCursor::Notmuch(cur, ts))
.collect()
} else {
Vec::new()
};
let tantivy_results = if newsreader_query.is_tantivy {
tantivy
.search(
pool,
tantivy_after,
tantivy_before,
first.map(|v| v as i32),
last.map(|v| v as i32),
&newsreader_query,
)
.await?
.into_iter()
.map(|(cur, ts)| ThreadSummaryCursor::Tantivy(cur, ts))
.collect()
} else {
Vec::new()
};
let query: Query = query.parse()?;
info!("newsreader_query {query:?}");
let newsreader_results = newsreader_search(
pool,
newsreader_after,
newsreader_before,
first,
last,
&query,
)
.await?;
let notmuch_results =
notmuch_search(nm, notmuch_after, notmuch_before, first, last, &query).await?;
let tantivy_results = tantivy_search(
tantivy,
pool,
tantivy_after,
tantivy_before,
first,
last,
&query,
)
.await?;
info!(
"tantivy results:\nis_tantivy:{} {tantivy_results:#?}",
newsreader_query.is_tantivy
"newsreader_results ({}) notmuch_results ({}) tantivy_results ({})",
newsreader_results.len(),
notmuch_results.len(),
tantivy_results.len()
);
let mut results: Vec<_> = newsreader_results
.into_iter()
.chain(notmuch_results)
@@ -362,6 +356,7 @@ impl QueryRoot {
let mut has_next_page = before.is_some();
if let Some(first) = first {
let first = first as usize;
if results.len() > first {
has_next_page = true;
results.truncate(first);
@@ -370,6 +365,7 @@ impl QueryRoot {
let mut has_previous_page = after.is_some();
if let Some(last) = last {
let last = last as usize;
if results.len() > last {
has_previous_page = true;
results.truncate(last);
@@ -437,6 +433,59 @@ impl QueryRoot {
}
}
/// A search hit tagged with the backend that produced it.
///
/// Each variant wraps the `(i32, ThreadSummary)` pair yielded by the matching
/// backend search; the per-backend helper functions in this file build these
/// values from `(cur, ts)` tuples.
/// NOTE(review): the `i32` is presumably the backend-specific offset used for
/// pagination cursors — confirm against the backends' `search` functions.
#[derive(Debug)]
enum ThreadSummaryCursor {
/// Hit returned by `newsreader::search`.
Newsreader(i32, ThreadSummary),
/// Hit returned by `nm::search`.
Notmuch(i32, ThreadSummary),
/// Hit returned by `TantivyConnection::search`.
Tantivy(i32, ThreadSummary),
}
/// Runs `query` through `newsreader::search` and tags every hit as
/// [`ThreadSummaryCursor::Newsreader`].
///
/// `pool`, `after`, `before`, `first` and `last` are forwarded to the backend
/// unchanged; this function does no pagination logic of its own.
///
/// # Errors
/// Any error returned by `newsreader::search` is propagated through `?`.
async fn newsreader_search(
    pool: &PgPool,
    after: Option<i32>,
    before: Option<i32>,
    first: Option<i32>,
    last: Option<i32>,
    query: &Query,
) -> Result<Vec<ThreadSummaryCursor>, async_graphql::Error> {
    // `query` is already a `&Query`; hand it over as-is rather than re-borrowing
    // it into a `&&Query` that only works through auto-deref.
    let hits = newsreader::search(pool, after, before, first, last, query).await?;
    Ok(hits
        .into_iter()
        .map(|(cursor, summary)| ThreadSummaryCursor::Newsreader(cursor, summary))
        .collect())
}
/// Runs `query` through `nm::search` and tags every hit as
/// [`ThreadSummaryCursor::Notmuch`].
///
/// `nm`, `after`, `before`, `first` and `last` are forwarded to the backend
/// unchanged; this function does no pagination logic of its own.
///
/// # Errors
/// Any error returned by `nm::search` is propagated through `?`.
async fn notmuch_search(
    nm: &Notmuch,
    after: Option<i32>,
    before: Option<i32>,
    first: Option<i32>,
    last: Option<i32>,
    query: &Query,
) -> Result<Vec<ThreadSummaryCursor>, async_graphql::Error> {
    // `query` is already a `&Query`; hand it over as-is rather than re-borrowing
    // it into a `&&Query` that only works through auto-deref.
    let hits = nm::search(nm, after, before, first, last, query).await?;
    Ok(hits
        .into_iter()
        .map(|(cursor, summary)| ThreadSummaryCursor::Notmuch(cursor, summary))
        .collect())
}
/// Runs `query` through `TantivyConnection::search` and tags every hit as
/// [`ThreadSummaryCursor::Tantivy`].
///
/// `pool`, `after`, `before`, `first` and `last` are forwarded to the backend
/// unchanged; this function does no pagination logic of its own.
///
/// # Errors
/// Any error returned by `tantivy.search` is propagated through `?`.
async fn tantivy_search(
    tantivy: &TantivyConnection,
    pool: &PgPool,
    after: Option<i32>,
    before: Option<i32>,
    first: Option<i32>,
    last: Option<i32>,
    query: &Query,
) -> Result<Vec<ThreadSummaryCursor>, async_graphql::Error> {
    // `query` is already a `&Query`; hand it over as-is rather than re-borrowing
    // it into a `&&Query` that only works through auto-deref.
    let hits = tantivy
        .search(pool, after, before, first, last, query)
        .await?;
    Ok(hits
        .into_iter()
        .map(|(cursor, summary)| ThreadSummaryCursor::Tantivy(cursor, summary))
        .collect())
}
pub struct Mutation;
#[Object]
impl Mutation {
@@ -448,14 +497,12 @@ impl Mutation {
) -> Result<bool, Error> {
let nm = ctx.data_unchecked::<Notmuch>();
let pool = ctx.data_unchecked::<PgPool>();
let tantivy = ctx.data_unchecked::<TantivyConnection>();
for q in query.split_whitespace() {
if newsreader::is_newsreader_thread(&q) {
newsreader::set_read_status(pool, &q, unread).await?;
} else {
nm::set_read_status(nm, q, unread).await?;
}
}
let query: Query = query.parse()?;
newsreader::set_read_status(pool, &query, unread).await?;
tantivy.reindex_thread(pool, &query).await?;
nm::set_read_status(nm, &query, unread).await?;
Ok(true)
}
async fn tag_add<'ctx>(
@@ -486,10 +533,19 @@ impl Mutation {
let pool = ctx.data_unchecked::<PgPool>();
tantivy.drop_and_load_index()?;
tantivy.reindex(pool).await?;
tantivy.reindex_all(pool).await?;
Ok(true)
}
// Calls `nm.new()` and logs its output at info level, then calls
// `tantivy.refresh(pool)`. Returns `Ok(true)` when both steps succeed; an error
// from either step is propagated and the later step is not run.
// Plain `//` comments are used on purpose: `///` docs on a resolver become part
// of the published GraphQL schema.
async fn refresh<'ctx>(&self, ctx: &Context<'ctx>) -> Result<bool, Error> {
    let nm = ctx.data_unchecked::<Notmuch>();
    let tantivy = ctx.data_unchecked::<TantivyConnection>();
    let pool = ctx.data_unchecked::<PgPool>();

    // TODO: parallelize
    let notmuch_output = nm.new()?;
    // The output is raw bytes; decode lossily so invalid UTF-8 cannot fail the request.
    info!("{}", String::from_utf8_lossy(&notmuch_output));

    tantivy.refresh(pool).await?;
    Ok(true)
}
}
pub type GraphqlSchema = Schema<QueryRoot, Mutation, EmptySubscription>;