server: move tantivy code to separate mod

This commit is contained in:
2024-09-22 10:26:45 -07:00
parent d1604f8e70
commit 3c8d7d4f81
3 changed files with 123 additions and 123 deletions

111
server/src/tantivy.rs Normal file
View File

@@ -0,0 +1,111 @@
use log::info;
use sqlx::postgres::PgPool;
use tantivy::{Index, IndexWriter, TantivyError};
use crate::error::ServerError;
/// Wrapper around an opened tantivy [`Index`].
///
/// Constructed with `TantivyConnection::new`; the methods on this type
/// rebuild the index from the database and run searches against it.
pub struct TantivyConnection {
/// The underlying tantivy index that writers and readers are created from.
index: Index,
}
impl TantivyConnection {
pub fn new(tantivy_db_path: &str) -> Result<TantivyConnection, TantivyError> {
let index = match Index::open_in_dir(tantivy_db_path) {
Ok(idx) => idx,
Err(_) => {
create_news_db(tantivy_db_path)?;
Index::open_in_dir(tantivy_db_path)?
}
};
Ok(TantivyConnection { index })
}
pub async fn reindex(&self, pool: &PgPool) -> Result<(), ServerError> {
use tantivy::{doc, Term};
let start_time = std::time::Instant::now();
let pool: &PgPool = pool;
let mut index_writer = self.index.writer(50_000_000)?;
let schema = self.index.schema();
let site = schema.get_field("site")?;
let title = schema.get_field("title")?;
let summary = schema.get_field("summary")?;
let link = schema.get_field("link")?;
let date = schema.get_field("date")?;
let is_read = schema.get_field("is_read")?;
let uid = schema.get_field("uid")?;
let id = schema.get_field("id")?;
let rows = sqlx::query_file!("sql/all-posts.sql")
.fetch_all(pool)
.await?;
let total = rows.len();
for (i, r) in rows.into_iter().enumerate() {
if i % 10_000 == 0 {
info!(
"{i}/{total} processed, elapsed {:.2}s",
start_time.elapsed().as_secs_f32()
);
}
let id_term = Term::from_field_text(uid, &r.uid);
index_writer.delete_term(id_term);
index_writer.add_document(doc!(
site => r.site.expect("UNKOWN_SITE"),
title => r.title.expect("UNKOWN_TITLE"),
// TODO: clean and extract text from HTML
summary => r.summary.expect("UNKNOWN_SUMMARY"),
link => r.link.expect("link"),
date => tantivy::DateTime::from_primitive(r.date.expect("date")),
is_read => r.is_read.expect("is_read"),
uid => r.uid,
id => r.id as i64,
))?;
}
index_writer.commit()?;
info!("took {:.2}s to reindex", start_time.elapsed().as_secs_f32());
Ok(())
}
pub fn search(&self) -> Result<String, TantivyError> {
use tantivy::{collector::TopDocs, query::QueryParser, Document, TantivyDocument};
let reader = self.index.reader()?;
let schema = self.index.schema();
let searcher = reader.searcher();
let site = schema.get_field("site")?;
let title = schema.get_field("title")?;
let summary = schema.get_field("summary")?;
let query_parser = QueryParser::for_index(&self.index, vec![site, title, summary]);
let query = query_parser.parse_query("grapheme")?;
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let mut results = vec![];
info!("search found {} docs", top_docs.len());
for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
results.push(format!("{}", retrieved_doc.to_json(&schema)));
}
Ok(results.join(" "))
}
}
/// Creates a fresh, empty news index in the directory `tantivy_db_path`.
///
/// Any existing directory at that path is removed first, so previously
/// indexed data is discarded. A path that does not exist yet is fine: the
/// directory (and any missing parents) is created.
///
/// # Errors
///
/// Returns a [`TantivyError`] if the old directory cannot be removed (for a
/// reason other than it not existing), the new directory cannot be created,
/// or tantivy fails to create the index.
fn create_news_db(tantivy_db_path: &str) -> Result<(), TantivyError> {
    use tantivy::schema::*;

    // A missing directory is the normal first-run case, not an error;
    // `remove_dir_all` reports it as `NotFound`, which must not abort creation.
    match std::fs::remove_dir_all(tantivy_db_path) {
        Ok(()) => {}
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
        Err(err) => return Err(err.into()),
    }
    std::fs::create_dir_all(tantivy_db_path)?;

    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("site", STRING | STORED);
    schema_builder.add_text_field("title", TEXT | STORED);
    // `summary` is searchable but not STORED, so it is not returned with hits.
    schema_builder.add_text_field("summary", TEXT);
    schema_builder.add_text_field("link", STRING | STORED);
    schema_builder.add_date_field("date", FAST);
    schema_builder.add_bool_field("is_read", FAST);
    schema_builder.add_text_field("uid", STRING | STORED);
    schema_builder.add_i64_field("id", FAST);
    let schema = schema_builder.build();

    Index::create_in_dir(tantivy_db_path, schema)?;
    Ok(())
}