server: improve tantivy performance by reusing IndexReader

Also improve a bunch of trace logging
2024-12-15 14:46:10 -08:00 · 2024-12-15 14:46:10 -08:00 · 6d8b2de608
commit 6d8b2de608
parent 05cdcec244
5 changed files with 67 additions and 42 deletions
--- a/server/src/graphql.rs
+++ b/server/src/graphql.rs
@ -505,7 +505,7 @@ async fn tantivy_search(
 pub struct Mutation;
 #[Object]
 impl Mutation {
-    #[instrument(skip_all, fields(query, bool))]
+    #[instrument(skip_all, fields(query=query, unread=unread))]
    async fn set_read_status<'ctx>(
        &self,
        ctx: &Context<'ctx>,
@ -522,7 +522,7 @@ impl Mutation {
        nm::set_read_status(nm, &query, unread).await?;
        Ok(true)
    }
-    #[instrument(skip_all, fields(query, tag))]
+    #[instrument(skip_all, fields(query=query, tag=tag))]
    async fn tag_add<'ctx>(
        &self,
        ctx: &Context<'ctx>,
@ -534,7 +534,7 @@ impl Mutation {
        nm.tag_add(&tag, &query)?;
        Ok(true)
    }
-    #[instrument(skip_all, fields(query, tag))]
+    #[instrument(skip_all, fields(query=query, tag=tag))]
    async fn tag_remove<'ctx>(
        &self,
        ctx: &Context<'ctx>,
--- a/server/src/lib.rs
+++ b/server/src/lib.rs
@ -5,7 +5,7 @@ pub mod newsreader;
 pub mod nm;
 pub mod tantivy;
-use std::{collections::HashMap, convert::Infallible, str::FromStr, sync::Arc};
+use std::{collections::HashMap, convert::Infallible, fmt, str::FromStr, sync::Arc};
 use async_trait::async_trait;
 use cacher::{Cacher, FilesystemCacher};
@ -612,11 +612,38 @@ pub struct Query {
    pub uids: Vec<String>,
    pub remainder: Vec<String>,
    pub is_notmuch: bool,
    pub is_newsreader: bool,
    pub is_tantivy: bool,
    pub corpus: Option<Corpus>,
 }
 /// Renders a `Query` as a space-separated search string (used by the `query=%query` tracing
 /// fields).
 ///
 /// Tokens are written in this order: `is:unread`, one `tag:<tag>` per tag, one `id:<uid>` per
 /// uid, `is:mail` when `is_notmuch` is set, `is:news` when `is_tantivy` is set,
 /// `corpus:<Debug form of the corpus>` when a corpus is set, then each remainder word.
 /// Every token is followed by a single space, so non-empty output ends with a space.
 /// `is_newsreader` is not rendered.
 impl fmt::Display for Query {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.unread_only {
            write!(f, "is:unread ")?;
        }
        for tag in &self.tags {
            write!(f, "tag:{tag} ")?;
        }
        for uid in &self.uids {
            write!(f, "id:{uid} ")?;
        }
        if self.is_notmuch {
            write!(f, "is:mail ")?;
        }
        if self.is_tantivy {
            write!(f, "is:news ")?;
        }
        if let Some(corpus) = &self.corpus {
            // The trailing space keeps the corpus token from fusing with the first remainder
            // word (previously rendered as e.g. `corpus:Newsreaderfoo`).
            write!(f, "corpus:{corpus:?} ")?;
        }
        for rem in &self.remainder {
            write!(f, "{rem} ")?;
        }
        Ok(())
    }
 }
 impl Query {
    // Converts the internal state of Query to something suitable for notmuch queries. Removes any
    // letterbox-specific '<key>:<value>' tags
@ -648,7 +675,6 @@ impl FromStr for Query {
        let mut uids = Vec::new();
        let mut remainder = Vec::new();
        let mut is_notmuch = false;
        let is_newsreader = false;
        let mut is_tantivy = false;
        let mut corpus = None;
        for word in s.split_whitespace() {
@ -682,10 +708,7 @@ impl FromStr for Query {
            }
        }
        // If we don't see any explicit filters for a corpus, flip them all on
-        if corpus.is_none() && !(is_newsreader || is_notmuch || is_tantivy) {
+        if corpus.is_none() && !(is_notmuch || is_tantivy) {
            // Don't set is_newsreader unless debugging, assume tantivy can handle it.
            // Explicitly setting corpus:newsreader will bypass this logic
            // is_newsreader = true;
            is_notmuch = true;
            is_tantivy = true;
        }
@ -695,7 +718,6 @@ impl FromStr for Query {
            uids,
            remainder,
            is_notmuch,
            is_newsreader,
            is_tantivy,
            corpus,
        })
--- a/server/src/newsreader.rs
+++ b/server/src/newsreader.rs
@ -20,7 +20,7 @@ use crate::{
 };
 pub fn is_newsreader_query(query: &Query) -> bool {
-    query.is_newsreader || query.corpus == Some(Corpus::Newsreader)
+    query.corpus == Some(Corpus::Newsreader)
 }
 pub fn is_newsreader_thread(query: &str) -> bool {
--- a/server/src/nm.rs
+++ b/server/src/nm.rs
@ -49,7 +49,7 @@ pub fn threadset_to_messages(thread_set: notmuch::ThreadSet) -> Result<Vec<Messa
    Ok(Vec::new())
 }
-#[instrument(name="nm::count", skip_all, fields(query=?query))]
+#[instrument(name="nm::count", skip_all, fields(query=%query))]
 pub async fn count(nm: &Notmuch, query: &Query) -> Result<usize, ServerError> {
    if !is_notmuch_query(query) {
        return Ok(0);
@ -58,7 +58,7 @@ pub async fn count(nm: &Notmuch, query: &Query) -> Result<usize, ServerError> {
    Ok(nm.count(&query)?)
 }
-#[instrument(name="nm::search", skip_all, fields(query=?query))]
+#[instrument(name="nm::search", skip_all, fields(query=%query))]
 pub async fn search(
    nm: &Notmuch,
    after: Option<i32>,
@ -856,7 +856,7 @@ fn render_content_type_tree(m: &ParsedMail) -> String {
    )
 }
-#[instrument(name="nm::set_read_status", skip_all, fields(query=?query, unread=unread))]
+#[instrument(name="nm::set_read_status", skip_all, fields(query=%query, unread=unread))]
 pub async fn set_read_status<'ctx>(
    nm: &Notmuch,
    query: &Query,
--- a/server/src/tantivy.rs
+++ b/server/src/tantivy.rs
@ -7,7 +7,7 @@ use tantivy::{
    doc, query,
    query::{AllQuery, BooleanQuery, Occur, QueryParser, TermQuery},
    schema::{Facet, IndexRecordOption, Value},
-    DocAddress, Index, Searcher, TantivyDocument, TantivyError, Term,
+    DocAddress, Index, IndexReader, Searcher, TantivyDocument, TantivyError, Term,
 };
 use tracing::instrument;
@ -24,23 +24,29 @@ pub fn is_tantivy_query(query: &Query) -> bool {
 }
 pub struct TantivyConnection {
    db_path: String,
-    //index: Index,
+    index: Index,
    reader: IndexReader,
 }
-impl TantivyConnection {
+fn get_index(db_path: &str) -> Result<Index, TantivyError> {
-    fn get_index(&self) -> Result<Index, TantivyError> {
+    Ok(match Index::open_in_dir(db_path) {
        Ok(match Index::open_in_dir(&self.db_path) {
        Ok(idx) => idx,
        Err(_) => {
-                create_news_db(&self.db_path)?;
+            create_news_db(db_path)?;
-                Index::open_in_dir(&self.db_path)?
+            Index::open_in_dir(db_path)?
        }
    })
 }
 impl TantivyConnection {
    /// Builds a connection for the index stored at `tantivy_db_path`.
    ///
    /// The `Index` is obtained through `get_index`, and a single `IndexReader` is created from
    /// it; both are kept on the connection so later calls can reuse them.
    ///
    /// # Errors
    ///
    /// Propagates any `TantivyError` returned by `get_index` or by `Index::reader`.
    pub fn new(tantivy_db_path: &str) -> Result<TantivyConnection, TantivyError> {
        let db_path = tantivy_db_path.to_string();
        let index = get_index(tantivy_db_path)?;
        let reader = index.reader()?;
        Ok(TantivyConnection {
            db_path,
            index,
            reader,
        })
    }
    #[instrument(name = "tantivy::refresh", skip_all)]
@ -61,7 +67,7 @@ impl TantivyConnection {
        let start_time = std::time::Instant::now();
        let (searcher, _query) = self.searcher_and_query(&Query::default())?;
        let docs = searcher.search(&AllQuery, &DocSetCollector)?;
-        let uid = self.get_index()?.schema().get_field("uid")?;
+        let uid = self.index.schema().get_field("uid")?;
        let t_uids: Vec<_> = docs
            .into_iter()
            .map(|doc_address| {
@ -112,9 +118,8 @@ impl TantivyConnection {
        let start_time = std::time::Instant::now();
        let pool: &PgPool = pool;
-        let index = self.get_index()?;
+        let mut index_writer = self.index.writer(50_000_000)?;
-        let mut index_writer = index.writer(50_000_000)?;
+        let schema = self.index.schema();
        let schema = index.schema();
        let site = schema.get_field("site")?;
        let title = schema.get_field("title")?;
        let summary = schema.get_field("summary")?;
@ -169,7 +174,7 @@ impl TantivyConnection {
        index_writer.commit()?;
        Ok(())
    }
-    #[instrument(name = "tantivy::reindex_thread", skip_all, fields(query=?query))]
+    #[instrument(name = "tantivy::reindex_thread", skip_all, fields(query=%query))]
    pub async fn reindex_thread(&self, pool: &PgPool, query: &Query) -> Result<(), ServerError> {
        let uids: Vec<_> = query
            .uids
@ -193,7 +198,6 @@ impl TantivyConnection {
        &self,
        query: &Query,
    ) -> Result<(Searcher, Box<dyn query::Query>), ServerError> {
        let index = self.get_index()?;
        // TODO: only create one reader
        // From https://tantivy-search.github.io/examples/basic_search.html
        // "For a search server you will typically create one reader for the entire lifetime of
@ -202,12 +206,11 @@ impl TantivyConnection {
        // I think there's some challenge in making the reader work if we reindex, so reader may
        // need to be stored indirectly, and be recreated on reindex
        // I think creating a reader takes 200-300 ms.
-        let reader = index.reader()?;
+        let schema = self.index.schema();
-        let schema = index.schema();
+        let searcher = self.reader.searcher();
        let searcher = reader.searcher();
        let title = schema.get_field("title")?;
        let summary = schema.get_field("summary")?;
-        let query_parser = QueryParser::for_index(&index, vec![title, summary]);
+        let query_parser = QueryParser::for_index(&self.index, vec![title, summary]);
        // Tantivy uses '*' to match all docs, not empty string
        let term = &query.remainder.join(" ");
        let term = if term.is_empty() { "*" } else { term };
@ -215,8 +218,8 @@ impl TantivyConnection {
        let tantivy_query = query_parser.parse_query(&term)?;
-        let tag = self.get_index()?.schema().get_field("tag")?;
+        let tag = schema.get_field("tag")?;
-        let is_read = self.get_index()?.schema().get_field("is_read")?;
+        let is_read = schema.get_field("is_read")?;
        let mut terms = vec![(Occur::Must, tantivy_query)];
        for t in &query.tags {
            let facet = Facet::from(&format!("/{t}"));
@ -236,7 +239,7 @@ impl TantivyConnection {
        Ok((searcher, Box::new(search_query)))
    }
-    #[instrument(name="tantivy::count", skip_all, fields(query=?query))]
+    #[instrument(name="tantivy::count", skip_all, fields(query=%query))]
    pub async fn count(&self, query: &Query) -> Result<usize, ServerError> {
        if !is_tantivy_query(query) {
            return Ok(0);
@ -246,7 +249,7 @@ impl TantivyConnection {
        let (searcher, query) = self.searcher_and_query(&query)?;
        Ok(searcher.search(&query, &Count)?)
    }
-    #[instrument(name="tantivy::search", skip_all, fields(query=?query))]
+    #[instrument(name="tantivy::search", skip_all, fields(query=%query))]
    pub async fn search(
        &self,
        pool: &PgPool,
@ -276,7 +279,7 @@ impl TantivyConnection {
                .order_by_u64_field("date", tantivy::index::Order::Desc),
        )?;
        info!("search found {} docs", top_docs.len());
-        let uid = self.get_index()?.schema().get_field("uid")?;
+        let uid = self.index.schema().get_field("uid")?;
        let uids = top_docs
            .into_iter()
            .map(|(_, doc_address): (u64, DocAddress)| {