server: WIP tantivy integration

This commit is contained in:
2024-09-28 11:17:52 -07:00
parent 005a457348
commit ebf32a9905
8 changed files with 285 additions and 99 deletions

View File

@@ -1,23 +1,31 @@
use log::info;
use sqlx::postgres::PgPool;
use tantivy::{Index, IndexWriter, TantivyError};
use tantivy::{schema::Value, Index, TantivyError};
use crate::error::ServerError;
use crate::{
error::ServerError, graphql::ThreadSummary, thread_summary_from_row, Query, ThreadSummaryRecord,
};
/// Handle through which the server reaches its tantivy index.
// NOTE(review): both an `index` field and a commented-out `index` field appear here; this
// looks like pre- and post-change lines of a rendered diff shown together — confirm which
// set of fields is current before relying on either.
pub struct TantivyConnection {
index: Index,
// Path of the index directory; `get_index` passes it to `Index::open_in_dir` and
// `create_news_db` each time an index handle is needed.
db_path: String,
//index: Index,
}
impl TantivyConnection {
pub fn new(tantivy_db_path: &str) -> Result<TantivyConnection, TantivyError> {
let index = match Index::open_in_dir(tantivy_db_path) {
/// Opens the tantivy index stored at `self.db_path`.
///
/// When the first `Index::open_in_dir` call fails, `create_news_db` is invoked for the
/// same path and the open is attempted a second time; an error from either of those two
/// calls is returned to the caller.
// NOTE(review): the `Err(_)` arm ignores why the open failed, so *any* failure leads to
// `create_news_db`, which calls `std::fs::remove_dir_all` on the path. An existing index
// would be deleted on what may be a transient error — confirm this is intended.
// NOTE(review): `std::fs::remove_dir_all` is documented to fail when the path does not
// exist, so this fallback may not work for a path that has never been created — verify.
fn get_index(&self) -> Result<Index, TantivyError> {
Ok(match Index::open_in_dir(&self.db_path) {
Ok(idx) => idx,
Err(_) => {
create_news_db(tantivy_db_path)?;
Index::open_in_dir(tantivy_db_path)?
create_news_db(&self.db_path)?;
Index::open_in_dir(&self.db_path)?
}
};
Ok(TantivyConnection { index })
})
}
/// Creates a connection that remembers `tantivy_db_path` for later use.
///
/// Only the path string is copied here; nothing is opened or created on disk by this
/// constructor. The body has no failing path, so the result is always `Ok`; the `Result`
/// wrapper is kept so existing callers using `?` continue to compile.
pub fn new(tantivy_db_path: &str) -> Result<TantivyConnection, TantivyError> {
    let db_path = String::from(tantivy_db_path);
    Ok(TantivyConnection { db_path })
}
pub async fn reindex(&self, pool: &PgPool) -> Result<(), ServerError> {
use tantivy::{doc, Term};
@@ -25,8 +33,9 @@ impl TantivyConnection {
let start_time = std::time::Instant::now();
let pool: &PgPool = pool;
let mut index_writer = self.index.writer(50_000_000)?;
let schema = self.index.schema();
let index = self.get_index()?;
let mut index_writer = index.writer(50_000_000)?;
let schema = index.schema();
let site = schema.get_field("site")?;
let title = schema.get_field("title")?;
let summary = schema.get_field("summary")?;
@@ -68,30 +77,76 @@ impl TantivyConnection {
info!("took {:.2}s to reindex", start_time.elapsed().as_secs_f32());
Ok(())
}
pub fn search(&self) -> Result<String, TantivyError> {
/// Runs a full-text query against the tantivy index and joins the hits with Postgres rows.
///
/// The strings in `query.remainder` are joined with single spaces and parsed against the
/// `title` and `summary` fields. The `uid` value of each of the top 10 hits is collected,
/// the list is bound to the query in `sql/threads-from-uid.sql`, and every returned row is
/// converted with `thread_summary_from_row`.
///
/// Returns `(index, summary)` pairs, where `index` is the row's position in the SQL result
/// plus `offset` (currently fixed at 0).
///
/// # Errors
/// Failures from opening the index, resolving schema fields, parsing the query, fetching
/// documents, or running the SQL query are propagated with `?`.
///
/// # Panics
/// Panics if a matching document has no `uid` value, or if `as_str()` on that value
/// yields `None`.
///
/// NOTE(review): `after`, `before`, `first`, `last` and the `date` field handle are not
/// read anywhere in the visible body, and the hit limit is hard-coded to 10 — pagination
/// appears unimplemented (see the TODO below).
pub async fn search(
&self,
pool: &PgPool,
after: Option<i32>,
before: Option<i32>,
first: Option<i32>,
last: Option<i32>,
query: &Query,
) -> Result<Vec<(i32, ThreadSummary)>, async_graphql::Error> {
use tantivy::{collector::TopDocs, query::QueryParser, Document, TantivyDocument};
// TODO: set based on function parameters
let offset = 0;
let reader = self.index.reader()?;
let schema = self.index.schema();
// The index is re-opened from `self.db_path` for this call via `get_index`.
let index = self.get_index()?;
let reader = index.reader()?;
let schema = index.schema();
let searcher = reader.searcher();
let site = schema.get_field("site")?;
let uid = schema.get_field("uid")?;
let title = schema.get_field("title")?;
let summary = schema.get_field("summary")?;
let query_parser = QueryParser::for_index(&self.index, vec![site, title, summary]);
let date = schema.get_field("date")?;
// Only `title` and `summary` are used as default fields for the parsed query.
let query_parser = QueryParser::for_index(&index, vec![title, summary]);
let query = query_parser.parse_query("grapheme")?;
let query = query_parser.parse_query(&query.remainder.join(" "))?;
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let mut results = vec![];
info!("search found {} docs", top_docs.len());
for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
results.push(format!("{}", retrieved_doc.to_json(&schema)));
// Read the `uid` value out of every hit; the first document-fetch error aborts the
// collection and is returned through the `?` below.
let uids = top_docs
.into_iter()
.map(|(_, doc_address)| {
searcher.doc(doc_address).map(|doc: TantivyDocument| {
doc.get_first(uid)
.expect("doc missing uid")
.as_str()
.expect("doc str missing")
.to_string()
})
})
.collect::<Result<Vec<String>, TantivyError>>()?;
//let uids = format!("'{}'", uids.join("','"));
info!("uids {uids:?}");
// The uid list is passed as a bound parameter to the SQL file rather than spliced into
// the query text.
let rows = sqlx::query_file!("sql/threads-from-uid.sql", &uids as &[String])
.fetch_all(pool)
.await?;
let mut res = Vec::new();
info!("found {} hits joining w/ tantivy", rows.len());
// Pair each row with its position in the SQL result (shifted by `offset`).
for (i, r) in rows.into_iter().enumerate() {
res.push((
i as i32 + offset,
thread_summary_from_row(ThreadSummaryRecord {
site: r.site,
date: r.date,
is_read: r.is_read,
title: r.title,
uid: r.uid,
name: r.name,
})
.await,
));
}
Ok(results.join(" "))
Ok(res)
}
/// Recreates the index directory at `self.db_path` by delegating to `create_news_db`.
///
/// Any error reported by `create_news_db` is returned unchanged.
// NOTE(review): despite the name, nothing is loaded into the index here — this method
// only forwards to `create_news_db`; presumably a separate step repopulates it.
pub fn drop_and_load_index(&self) -> Result<(), TantivyError> {
    let path = &self.db_path;
    create_news_db(path)
}
}
fn create_news_db(tantivy_db_path: &str) -> Result<(), TantivyError> {
info!("create_news_db");
std::fs::remove_dir_all(tantivy_db_path)?;
std::fs::create_dir_all(tantivy_db_path)?;
use tantivy::schema::*;
@@ -100,7 +155,7 @@ fn create_news_db(tantivy_db_path: &str) -> Result<(), TantivyError> {
schema_builder.add_text_field("title", TEXT | STORED);
schema_builder.add_text_field("summary", TEXT);
schema_builder.add_text_field("link", STRING | STORED);
schema_builder.add_date_field("date", FAST);
schema_builder.add_date_field("date", FAST | INDEXED | STORED);
schema_builder.add_bool_field("is_read", FAST);
schema_builder.add_text_field("uid", STRING | STORED);
schema_builder.add_i64_field("id", FAST);