server: improve tantivy performance by reusing IndexReader
Also improve a bunch of trace logging
This commit is contained in:
@@ -7,7 +7,7 @@ use tantivy::{
|
||||
doc, query,
|
||||
query::{AllQuery, BooleanQuery, Occur, QueryParser, TermQuery},
|
||||
schema::{Facet, IndexRecordOption, Value},
|
||||
DocAddress, Index, Searcher, TantivyDocument, TantivyError, Term,
|
||||
DocAddress, Index, IndexReader, Searcher, TantivyDocument, TantivyError, Term,
|
||||
};
|
||||
use tracing::instrument;
|
||||
|
||||
@@ -24,23 +24,29 @@ pub fn is_tantivy_query(query: &Query) -> bool {
|
||||
}
|
||||
pub struct TantivyConnection {
|
||||
db_path: String,
|
||||
//index: Index,
|
||||
index: Index,
|
||||
reader: IndexReader,
|
||||
}
|
||||
|
||||
fn get_index(db_path: &str) -> Result<Index, TantivyError> {
|
||||
Ok(match Index::open_in_dir(db_path) {
|
||||
Ok(idx) => idx,
|
||||
Err(_) => {
|
||||
create_news_db(db_path)?;
|
||||
Index::open_in_dir(db_path)?
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
impl TantivyConnection {
|
||||
fn get_index(&self) -> Result<Index, TantivyError> {
|
||||
Ok(match Index::open_in_dir(&self.db_path) {
|
||||
Ok(idx) => idx,
|
||||
Err(_) => {
|
||||
create_news_db(&self.db_path)?;
|
||||
Index::open_in_dir(&self.db_path)?
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn new(tantivy_db_path: &str) -> Result<TantivyConnection, TantivyError> {
|
||||
let index = get_index(tantivy_db_path)?;
|
||||
let reader = index.reader()?;
|
||||
|
||||
Ok(TantivyConnection {
|
||||
db_path: tantivy_db_path.to_string(),
|
||||
index,
|
||||
reader,
|
||||
})
|
||||
}
|
||||
#[instrument(name = "tantivy::refresh", skip_all)]
|
||||
@@ -61,7 +67,7 @@ impl TantivyConnection {
|
||||
let start_time = std::time::Instant::now();
|
||||
let (searcher, _query) = self.searcher_and_query(&Query::default())?;
|
||||
let docs = searcher.search(&AllQuery, &DocSetCollector)?;
|
||||
let uid = self.get_index()?.schema().get_field("uid")?;
|
||||
let uid = self.index.schema().get_field("uid")?;
|
||||
let t_uids: Vec<_> = docs
|
||||
.into_iter()
|
||||
.map(|doc_address| {
|
||||
@@ -112,9 +118,8 @@ impl TantivyConnection {
|
||||
let start_time = std::time::Instant::now();
|
||||
let pool: &PgPool = pool;
|
||||
|
||||
let index = self.get_index()?;
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
let schema = index.schema();
|
||||
let mut index_writer = self.index.writer(50_000_000)?;
|
||||
let schema = self.index.schema();
|
||||
let site = schema.get_field("site")?;
|
||||
let title = schema.get_field("title")?;
|
||||
let summary = schema.get_field("summary")?;
|
||||
@@ -169,7 +174,7 @@ impl TantivyConnection {
|
||||
index_writer.commit()?;
|
||||
Ok(())
|
||||
}
|
||||
#[instrument(name = "tantivy::reindex_thread", skip_all, fields(query=?query))]
|
||||
#[instrument(name = "tantivy::reindex_thread", skip_all, fields(query=%query))]
|
||||
pub async fn reindex_thread(&self, pool: &PgPool, query: &Query) -> Result<(), ServerError> {
|
||||
let uids: Vec<_> = query
|
||||
.uids
|
||||
@@ -193,7 +198,6 @@ impl TantivyConnection {
|
||||
&self,
|
||||
query: &Query,
|
||||
) -> Result<(Searcher, Box<dyn query::Query>), ServerError> {
|
||||
let index = self.get_index()?;
|
||||
// TODO: only create one reader
|
||||
// From https://tantivy-search.github.io/examples/basic_search.html
|
||||
// "For a search server you will typically create one reader for the entire lifetime of
|
||||
@@ -202,12 +206,11 @@ impl TantivyConnection {
|
||||
// I think there's some challenge in making the reader work if we reindex, so reader may
|
||||
// need to be stored indirectly, and be recreated on reindex
|
||||
// I think creating a reader takes 200-300 ms.
|
||||
let reader = index.reader()?;
|
||||
let schema = index.schema();
|
||||
let searcher = reader.searcher();
|
||||
let schema = self.index.schema();
|
||||
let searcher = self.reader.searcher();
|
||||
let title = schema.get_field("title")?;
|
||||
let summary = schema.get_field("summary")?;
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, summary]);
|
||||
let query_parser = QueryParser::for_index(&self.index, vec![title, summary]);
|
||||
// Tantivy uses '*' to match all docs, not empty string
|
||||
let term = &query.remainder.join(" ");
|
||||
let term = if term.is_empty() { "*" } else { term };
|
||||
@@ -215,8 +218,8 @@ impl TantivyConnection {
|
||||
|
||||
let tantivy_query = query_parser.parse_query(&term)?;
|
||||
|
||||
let tag = self.get_index()?.schema().get_field("tag")?;
|
||||
let is_read = self.get_index()?.schema().get_field("is_read")?;
|
||||
let tag = schema.get_field("tag")?;
|
||||
let is_read = schema.get_field("is_read")?;
|
||||
let mut terms = vec![(Occur::Must, tantivy_query)];
|
||||
for t in &query.tags {
|
||||
let facet = Facet::from(&format!("/{t}"));
|
||||
@@ -236,7 +239,7 @@ impl TantivyConnection {
|
||||
Ok((searcher, Box::new(search_query)))
|
||||
}
|
||||
|
||||
#[instrument(name="tantivy::count", skip_all, fields(query=?query))]
|
||||
#[instrument(name="tantivy::count", skip_all, fields(query=%query))]
|
||||
pub async fn count(&self, query: &Query) -> Result<usize, ServerError> {
|
||||
if !is_tantivy_query(query) {
|
||||
return Ok(0);
|
||||
@@ -246,7 +249,7 @@ impl TantivyConnection {
|
||||
let (searcher, query) = self.searcher_and_query(&query)?;
|
||||
Ok(searcher.search(&query, &Count)?)
|
||||
}
|
||||
#[instrument(name="tantivy::search", skip_all, fields(query=?query))]
|
||||
#[instrument(name="tantivy::search", skip_all, fields(query=%query))]
|
||||
pub async fn search(
|
||||
&self,
|
||||
pool: &PgPool,
|
||||
@@ -276,7 +279,7 @@ impl TantivyConnection {
|
||||
.order_by_u64_field("date", tantivy::index::Order::Desc),
|
||||
)?;
|
||||
info!("search found {} docs", top_docs.len());
|
||||
let uid = self.get_index()?.schema().get_field("uid")?;
|
||||
let uid = self.index.schema().get_field("uid")?;
|
||||
let uids = top_docs
|
||||
.into_iter()
|
||||
.map(|(_, doc_address): (u64, DocAddress)| {
|
||||
|
||||
Reference in New Issue
Block a user