server: WIP tantivy, cache slurps, use shared::compute_color,

This commit is contained in:
2024-09-19 15:53:09 -07:00
parent e7cbf9cc45
commit 30f510bb03
10 changed files with 1341 additions and 177 deletions

View File

@@ -18,18 +18,14 @@ use rocket::{
Response, State,
};
use rocket_cors::{AllowedHeaders, AllowedOrigins};
use serde::Deserialize;
use server::{
config::Config,
error::ServerError,
graphql::{Attachment, GraphqlSchema, Mutation, QueryRoot},
nm::{attachment_bytes, cid_attachment_bytes},
};
use sqlx::postgres::PgPool;
/// Server settings that can be deserialized (via serde) from configuration.
///
/// NOTE(review): `server::config::Config` is also imported in the `use server::{...}`
/// block above, so this local definition looks like the pre-change version shown
/// by the diff — confirm which `Config` is actually live before relying on it.
#[derive(Deserialize)]
struct Config {
// Presumably the Postgres connection URL for the newsreader database; a field
// of this name is passed to `PgPool::connect` in `main` — TODO confirm.
newsreader_database_url: String,
}
use tantivy::{Index, IndexWriter};
#[get("/refresh")]
async fn refresh(nm: &State<Notmuch>) -> Result<Json<String>, Debug<NotmuchError>> {
@@ -170,6 +166,122 @@ fn graphiql() -> content::RawHtml<String> {
content::RawHtml(GraphiQLSource::build().endpoint("/api/graphql").finish())
}
/// `POST /create-news-db`: (re)creates an empty tantivy index for the newsreader.
///
/// Any existing directory at `config.newsreader_tantivy_db_path` is removed,
/// the directory is recreated, and a fresh index with the newsreader schema is
/// written into it.
///
/// # Errors
///
/// Returns a [`ServerError`] (wrapped in [`Debug`]) when the directory cannot
/// be removed or created, or when tantivy fails to create the index.
///
/// NOTE(review): `main` opens an `Index` and `IndexReader` on this same path at
/// startup and hands them to Rocket as managed state. After this endpoint
/// replaces the directory those handles presumably still refer to the old
/// files — confirm whether a restart is required.
#[rocket::post("/create-news-db")]
fn create_news_db(config: &State<Config>) -> Result<String, Debug<ServerError>> {
    use tantivy::schema::*;

    let db_path = &config.newsreader_tantivy_db_path;

    // Start from a clean slate. A missing directory is the normal state for a
    // brand-new database, so only genuine removal failures are reported.
    match std::fs::remove_dir_all(db_path) {
        Ok(()) => {}
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
        Err(e) => return Err(ServerError::from(e).into()),
    }
    std::fs::create_dir_all(db_path).map_err(ServerError::from)?;

    let mut schema_builder = Schema::builder();
    // Exact-match, retrievable identifiers.
    schema_builder.add_text_field("site", STRING | STORED);
    schema_builder.add_text_field("link", STRING | STORED);
    schema_builder.add_text_field("uid", STRING | STORED);
    // Full-text fields; only the title is stored for retrieval.
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("summary", TEXT);
    // Fast (columnar) fields.
    schema_builder.add_date_field("date", FAST);
    schema_builder.add_bool_field("is_read", FAST);
    schema_builder.add_i64_field("id", FAST);
    let schema = schema_builder.build();

    Index::create_in_dir(db_path, schema).map_err(ServerError::from)?;

    Ok(format!("DB created in {}\n", db_path))
}
#[rocket::post("/reindex-news-db")]
async fn reindex_news_db(
pool: &State<PgPool>,
config: &State<Config>,
) -> Result<String, Debug<ServerError>> {
use tantivy::{doc, Term};
let start_time = std::time::Instant::now();
let pool: &PgPool = pool;
let index =
Index::open_in_dir(&config.newsreader_tantivy_db_path).map_err(ServerError::from)?;
let mut index_writer = index.writer(50_000_000).map_err(ServerError::from)?;
let schema = index.schema();
let site = schema.get_field("site").map_err(ServerError::from)?;
let title = schema.get_field("title").map_err(ServerError::from)?;
let summary = schema.get_field("summary").map_err(ServerError::from)?;
let link = schema.get_field("link").map_err(ServerError::from)?;
let date = schema.get_field("date").map_err(ServerError::from)?;
let is_read = schema.get_field("is_read").map_err(ServerError::from)?;
let uid = schema.get_field("uid").map_err(ServerError::from)?;
let id = schema.get_field("id").map_err(ServerError::from)?;
let rows = sqlx::query_file!("sql/all-posts.sql")
.fetch_all(pool)
.await
.map_err(ServerError::from)?;
let total = rows.len();
for (i, r) in rows.into_iter().enumerate() {
if i % 10_000 == 0 {
info!(
"{i}/{total} processed, elapsed {:.2}s",
start_time.elapsed().as_secs_f32()
);
}
let id_term = Term::from_field_text(uid, &r.uid);
index_writer.delete_term(id_term);
index_writer
.add_document(doc!(
site => r.site.expect("UNKOWN_SITE"),
title => r.title.expect("UNKOWN_TITLE"),
// TODO: clean and extract text from HTML
summary => r.summary.expect("UNKNOWN_SUMMARY"),
link => r.link.expect("link"),
date => tantivy::DateTime::from_primitive(r.date.expect("date")),
is_read => r.is_read.expect("is_read"),
uid => r.uid,
id => r.id as i64,
))
.map_err(ServerError::from)?;
}
index_writer.commit().map_err(ServerError::from)?;
info!("took {:.2}s to reindex", start_time.elapsed().as_secs_f32());
Ok(format!(
"DB openned in {}\n",
config.newsreader_tantivy_db_path
))
}
/// `GET /search-news-db`: runs a fixed demo query against the newsreader index.
///
/// Searches the `site`, `title` and `summary` fields for the hard-coded term
/// `grapheme`, takes the top 10 hits, and returns each hit's JSON rendering
/// joined by single spaces.
///
/// # Errors
///
/// Returns a [`ServerError`] (wrapped in [`Debug`]) when a schema field is
/// missing, the query fails to parse, the search fails, or a hit cannot be
/// loaded.
#[rocket::get("/search-news-db")]
fn search_news_db(
    index: &State<tantivy::Index>,
    reader: &State<tantivy::IndexReader>,
) -> Result<String, Debug<ServerError>> {
    use tantivy::{collector::TopDocs, query::QueryParser, Document, TantivyDocument};

    let searcher = reader.searcher();
    let schema = index.schema();

    // Resolve the default search fields in the same order as before.
    let default_fields = vec![
        schema.get_field("site").map_err(ServerError::from)?,
        schema.get_field("title").map_err(ServerError::from)?,
        schema.get_field("summary").map_err(ServerError::from)?,
    ];

    let parser = QueryParser::for_index(index.inner(), default_fields);
    let parsed = parser.parse_query("grapheme").map_err(ServerError::from)?;

    let hits = searcher
        .search(&parsed, &TopDocs::with_limit(10))
        .map_err(ServerError::from)?;
    info!("search found {} docs", hits.len());

    // Load each hit and render it as JSON, stopping at the first failure.
    let rendered = hits
        .into_iter()
        .map(|(_score, address)| {
            let hit: TantivyDocument = searcher.doc(address).map_err(ServerError::from)?;
            Ok(hit.to_json(&schema))
        })
        .collect::<Result<Vec<String>, ServerError>>()?;

    Ok(rendered.join(" "))
}
#[rocket::get("/graphql?<query..>")]
async fn graphql_query(schema: &State<GraphqlSchema>, query: GraphQLQuery) -> GraphQLResponse {
query.execute(schema.inner()).await
@@ -183,7 +295,6 @@ async fn graphql_request(
request.execute(schema.inner()).await
}
#[rocket::main]
async fn main() -> Result<(), Box<dyn Error>> {
glog::new()
@@ -213,6 +324,9 @@ async fn main() -> Result<(), Box<dyn Error>> {
.mount(
shared::urls::MOUNT_POINT,
routes![
create_news_db,
reindex_news_db,
search_news_db,
original,
refresh,
show_pretty,
@@ -229,14 +343,26 @@ async fn main() -> Result<(), Box<dyn Error>> {
.attach(AdHoc::config::<Config>());
let config: Config = rkt.figment().extract()?;
if !std::fs::exists(&config.slurp_cache_path)? {
info!("Creating slurp cache @ '{}'", &config.slurp_cache_path);
std::fs::create_dir_all(&config.slurp_cache_path)?;
}
let pool = PgPool::connect(&config.newsreader_database_url).await?;
let tantivy_newsreader_index = Index::open_in_dir(&config.newsreader_tantivy_db_path)?;
let tantivy_newsreader_reader = tantivy_newsreader_index.reader()?;
let schema = Schema::build(QueryRoot, Mutation, EmptySubscription)
.data(Notmuch::default())
.data(config)
.data(pool.clone())
.extension(async_graphql::extensions::Logger)
.finish();
let rkt = rkt.manage(schema).manage(pool).manage(Notmuch::default());
let rkt = rkt
.manage(schema)
.manage(pool)
.manage(Notmuch::default())
.manage(tantivy_newsreader_index)
.manage(tantivy_newsreader_reader);
//.manage(Notmuch::with_config("../notmuch/testdata/notmuch.config"))
rkt.launch().await?;