diff --git a/server/src/bin/server.rs b/server/src/bin/server.rs
index 80f1cbf..3406a6f 100644
--- a/server/src/bin/server.rs
+++ b/server/src/bin/server.rs
@@ -23,9 +23,9 @@ use server::{
     error::ServerError,
     graphql::{Attachment, GraphqlSchema, Mutation, QueryRoot},
    nm::{attachment_bytes, cid_attachment_bytes},
+    tantivy::TantivyConnection,
 };
 use sqlx::postgres::PgPool;
-use tantivy::{Index, IndexWriter};
 
 #[get("/refresh")]
 async fn refresh(nm: &State<Notmuch>) -> Result<Vec<u8>, Debug<ServerError>> {
@@ -166,124 +166,19 @@ fn graphiql() -> content::RawHtml<String> {
     content::RawHtml(GraphiQLSource::build().endpoint("/api/graphql").finish())
 }
 
-#[rocket::post("/create-news-db")]
-fn create_news_db(config: &State<Config>) -> Result<String, Debug<ServerError>> {
-    create_news_db_impl(config)?;
-    Ok(format!(
-        "DB created in {}\n",
-        config.newsreader_tantivy_db_path
-    ))
-}
-fn create_news_db_impl(config: &Config) -> Result<(), ServerError> {
-    std::fs::remove_dir_all(&config.newsreader_tantivy_db_path).map_err(ServerError::from)?;
-    std::fs::create_dir_all(&config.newsreader_tantivy_db_path).map_err(ServerError::from)?;
-    use tantivy::schema::*;
-    let mut schema_builder = Schema::builder();
-    schema_builder.add_text_field("site", STRING | STORED);
-    schema_builder.add_text_field("title", TEXT | STORED);
-    schema_builder.add_text_field("summary", TEXT);
-    schema_builder.add_text_field("link", STRING | STORED);
-    schema_builder.add_date_field("date", FAST);
-    schema_builder.add_bool_field("is_read", FAST);
-    schema_builder.add_text_field("uid", STRING | STORED);
-    schema_builder.add_i64_field("id", FAST);
-
-    let schema = schema_builder.build();
-    Index::create_in_dir(&config.newsreader_tantivy_db_path, schema).map_err(ServerError::from)?;
-    Ok(())
-}
-
 #[rocket::post("/reindex-news-db")]
 async fn reindex_news_db(
     pool: &State<PgPool>,
-    config: &State<Config>,
+    tantivy_conn: &State<TantivyConnection>,
 ) -> Result<String, Debug<ServerError>> {
-    use tantivy::{doc, Term};
-
-    let start_time = std::time::Instant::now();
-    let pool: &PgPool = pool;
-
-    let index =
-        Index::open_in_dir(&config.newsreader_tantivy_db_path).map_err(ServerError::from)?;
-    let mut index_writer = index.writer(50_000_000).map_err(ServerError::from)?;
-    let schema = index.schema();
-    let site = schema.get_field("site").map_err(ServerError::from)?;
-    let title = schema.get_field("title").map_err(ServerError::from)?;
-    let summary = schema.get_field("summary").map_err(ServerError::from)?;
-    let link = schema.get_field("link").map_err(ServerError::from)?;
-    let date = schema.get_field("date").map_err(ServerError::from)?;
-    let is_read = schema.get_field("is_read").map_err(ServerError::from)?;
-    let uid = schema.get_field("uid").map_err(ServerError::from)?;
-    let id = schema.get_field("id").map_err(ServerError::from)?;
-
-    let rows = sqlx::query_file!("sql/all-posts.sql")
-        .fetch_all(pool)
-        .await
-        .map_err(ServerError::from)?;
-
-    let total = rows.len();
-    for (i, r) in rows.into_iter().enumerate() {
-        if i % 10_000 == 0 {
-            info!(
-                "{i}/{total} processed, elapsed {:.2}s",
-                start_time.elapsed().as_secs_f32()
-            );
-        }
-        let id_term = Term::from_field_text(uid, &r.uid);
-        index_writer.delete_term(id_term);
-        index_writer
-            .add_document(doc!(
-                site => r.site.expect("UNKOWN_SITE"),
-                title => r.title.expect("UNKOWN_TITLE"),
-                // TODO: clean and extract text from HTML
-                summary => r.summary.expect("UNKNOWN_SUMMARY"),
-                link => r.link.expect("link"),
-                date => tantivy::DateTime::from_primitive(r.date.expect("date")),
-                is_read => r.is_read.expect("is_read"),
-                uid => r.uid,
-                id => r.id as i64,
-            ))
-            .map_err(ServerError::from)?;
-    }
-
-    index_writer.commit().map_err(ServerError::from)?;
-
-    info!("took {:.2}s to reindex", start_time.elapsed().as_secs_f32());
-    Ok(format!(
-        "DB openned in {}\n",
-        config.newsreader_tantivy_db_path
-    ))
+    tantivy_conn.reindex(pool).await?;
+    Ok(format!("Reindexed tantivy\n"))
 }
 
 #[rocket::get("/search-news-db")]
-fn search_news_db(
-    index: &State<Index>,
-    reader: &State<IndexReader>,
-) -> Result<String, Debug<ServerError>> {
-    use tantivy::{collector::TopDocs, query::QueryParser, Document, TantivyDocument};
-
-    let searcher = reader.searcher();
-    let schema = index.schema();
-    let site = schema.get_field("site").map_err(ServerError::from)?;
-    let title = schema.get_field("title").map_err(ServerError::from)?;
-    let summary = schema.get_field("summary").map_err(ServerError::from)?;
-    let query_parser = QueryParser::for_index(&index, vec![site, title, summary]);
-
-    let query = query_parser
-        .parse_query("grapheme")
-        .map_err(ServerError::from)?;
-    let top_docs = searcher
-        .search(&query, &TopDocs::with_limit(10))
-        .map_err(ServerError::from)?;
-    let mut results = vec![];
-    info!("search found {} docs", top_docs.len());
-    for (_score, doc_address) in top_docs {
-        let retrieved_doc: TantivyDocument =
-            searcher.doc(doc_address).map_err(ServerError::from)?;
-        results.push(format!("{}", retrieved_doc.to_json(&schema)));
-    }
-
-    Ok(format!("{}", results.join(" ")))
+fn search_news_db(tantivy_conn: &State<TantivyConnection>) -> Result<String, Debug<ServerError>> {
+    let res = tantivy_conn.search().map_err(ServerError::from)?;
+    Ok(format!("{}", res))
 }
 
 #[rocket::get("/graphql?<query..>")]
@@ -328,7 +223,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
         .mount(
             shared::urls::MOUNT_POINT,
             routes![
-                create_news_db,
                 reindex_news_db,
                 search_news_db,
                 original,
@@ -352,14 +246,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
         std::fs::create_dir_all(&config.slurp_cache_path)?;
     }
     let pool = PgPool::connect(&config.newsreader_database_url).await?;
-    let tantivy_newsreader_index = match Index::open_in_dir(&config.newsreader_tantivy_db_path) {
-        Ok(idx) => idx,
-        Err(_) => {
-            create_news_db_impl(&config)?;
-            Index::open_in_dir(&config.newsreader_tantivy_db_path)?
-        }
-    };
-    let tantivy_newsreader_reader = tantivy_newsreader_index.reader()?;
+    let tantivy_conn =
+        TantivyConnection::new(&config.newsreader_tantivy_db_path)?;
+
     let schema = Schema::build(QueryRoot, Mutation, EmptySubscription)
         .data(Notmuch::default())
         .data(config)
@@ -371,8 +260,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
         .manage(schema)
         .manage(pool)
         .manage(Notmuch::default())
-        .manage(tantivy_newsreader_index)
-        .manage(tantivy_newsreader_reader);
+        .manage(tantivy_conn);
     //.manage(Notmuch::with_config("../notmuch/testdata/notmuch.config"))
 
     rkt.launch().await?;
diff --git a/server/src/lib.rs b/server/src/lib.rs
index 04d8b19..3146b16 100644
--- a/server/src/lib.rs
+++ b/server/src/lib.rs
@@ -3,6 +3,7 @@
 pub mod error;
 pub mod graphql;
 pub mod newsreader;
 pub mod nm;
+pub mod tantivy;
 
 use std::{collections::HashMap, convert::Infallible, str::FromStr, sync::Arc};
diff --git a/server/src/tantivy.rs b/server/src/tantivy.rs
new file mode 100644
index 0000000..031d11d
--- /dev/null
+++ b/server/src/tantivy.rs
@@ -0,0 +1,111 @@
+use log::info;
+use sqlx::postgres::PgPool;
+use tantivy::{Index, IndexWriter, TantivyError};
+
+use crate::error::ServerError;
+
+pub struct TantivyConnection {
+    index: Index,
+}
+
+impl TantivyConnection {
+    pub fn new(tantivy_db_path: &str) -> Result<Self, TantivyError> {
+        let index = match Index::open_in_dir(tantivy_db_path) {
+            Ok(idx) => idx,
+            Err(_) => {
+                create_news_db(tantivy_db_path)?;
+                Index::open_in_dir(tantivy_db_path)?
+            }
+        };
+        Ok(TantivyConnection { index })
+    }
+    pub async fn reindex(&self, pool: &PgPool) -> Result<(), ServerError> {
+        use tantivy::{doc, Term};
+
+        let start_time = std::time::Instant::now();
+        let pool: &PgPool = pool;
+
+        let mut index_writer = self.index.writer(50_000_000)?;
+        let schema = self.index.schema();
+        let site = schema.get_field("site")?;
+        let title = schema.get_field("title")?;
+        let summary = schema.get_field("summary")?;
+        let link = schema.get_field("link")?;
+        let date = schema.get_field("date")?;
+        let is_read = schema.get_field("is_read")?;
+        let uid = schema.get_field("uid")?;
+        let id = schema.get_field("id")?;
+
+        let rows = sqlx::query_file!("sql/all-posts.sql")
+            .fetch_all(pool)
+            .await?;
+
+        let total = rows.len();
+        for (i, r) in rows.into_iter().enumerate() {
+            if i % 10_000 == 0 {
+                info!(
+                    "{i}/{total} processed, elapsed {:.2}s",
+                    start_time.elapsed().as_secs_f32()
+                );
+            }
+            let id_term = Term::from_field_text(uid, &r.uid);
+            index_writer.delete_term(id_term);
+            index_writer.add_document(doc!(
+                site => r.site.expect("UNKOWN_SITE"),
+                title => r.title.expect("UNKOWN_TITLE"),
+                // TODO: clean and extract text from HTML
+                summary => r.summary.expect("UNKNOWN_SUMMARY"),
+                link => r.link.expect("link"),
+                date => tantivy::DateTime::from_primitive(r.date.expect("date")),
+                is_read => r.is_read.expect("is_read"),
+                uid => r.uid,
+                id => r.id as i64,
+            ))?;
+        }
+
+        index_writer.commit()?;
+
+        info!("took {:.2}s to reindex", start_time.elapsed().as_secs_f32());
+        Ok(())
+    }
+    pub fn search(&self) -> Result<String, TantivyError> {
+        use tantivy::{collector::TopDocs, query::QueryParser, Document, TantivyDocument};
+
+        let reader = self.index.reader()?;
+        let schema = self.index.schema();
+        let searcher = reader.searcher();
+        let site = schema.get_field("site")?;
+        let title = schema.get_field("title")?;
+        let summary = schema.get_field("summary")?;
+        let query_parser = QueryParser::for_index(&self.index, vec![site, title, summary]);
+
+        let query = query_parser.parse_query("grapheme")?;
+        let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
+        let mut results = vec![];
+        info!("search found {} docs", top_docs.len());
+        for (_score, doc_address) in top_docs {
+            let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
+            results.push(format!("{}", retrieved_doc.to_json(&schema)));
+        }
+        Ok(results.join(" "))
+    }
+}
+
+fn create_news_db(tantivy_db_path: &str) -> Result<(), TantivyError> {
+    std::fs::remove_dir_all(tantivy_db_path)?;
+    std::fs::create_dir_all(tantivy_db_path)?;
+    use tantivy::schema::*;
+    let mut schema_builder = Schema::builder();
+    schema_builder.add_text_field("site", STRING | STORED);
+    schema_builder.add_text_field("title", TEXT | STORED);
+    schema_builder.add_text_field("summary", TEXT);
+    schema_builder.add_text_field("link", STRING | STORED);
+    schema_builder.add_date_field("date", FAST);
+    schema_builder.add_bool_field("is_read", FAST);
+    schema_builder.add_text_field("uid", STRING | STORED);
+    schema_builder.add_i64_field("id", FAST);
+
+    let schema = schema_builder.build();
+    Index::create_in_dir(tantivy_db_path, schema)?;
+    Ok(())
+}