server: WIP tantivy integration

Bill Thiede 2024-09-28 11:17:52 -07:00
parent 005a457348
commit ebf32a9905
8 changed files with 285 additions and 99 deletions

View File

@ -8,3 +8,4 @@ SELECT
uid,
id
FROM post
WHERE title ILIKE '%grapheme%' OR summary ILIKE '%grapheme%';

View File

@ -0,0 +1,13 @@
SELECT
site,
date,
is_read,
title,
uid,
name
FROM
post p
JOIN feed f ON p.site = f.slug
WHERE
uid = ANY ($1)
;
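The uid = ANY ($1) predicate lets a caller bind a whole Rust vector as a single Postgres array parameter, which is how tantivy.rs invokes this file further down. A minimal sketch of the call site, not part of this commit (the helper name is hypothetical):

use sqlx::postgres::PgPool;

// Hypothetical helper: fetch summary rows for a batch of uids in one round
// trip. query_file! checks the SQL against the live schema at compile time,
// and `&uids as &[String]` binds the Vec as a Postgres text[] for ANY ($1).
async fn rows_for_uids(pool: &PgPool, uids: Vec<String>) -> Result<usize, sqlx::Error> {
    let rows = sqlx::query_file!("sql/threads-from-uid.sql", &uids as &[String])
        .fetch_all(pool)
        .await?;
    Ok(rows.len())
}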

View File

@ -166,21 +166,6 @@ fn graphiql() -> content::RawHtml<String> {
content::RawHtml(GraphiQLSource::build().endpoint("/api/graphql").finish())
}
#[rocket::post("/reindex-news-db")]
async fn reindex_news_db(
pool: &State<PgPool>,
tantivy_conn: &State<TantivyConnection>,
) -> Result<String, Debug<ServerError>> {
tantivy_conn.reindex(pool).await?;
Ok(format!("Reindexed tantivy\n"))
}
#[rocket::get("/search-news-db")]
fn search_news_db(tantivy_conn: &State<TantivyConnection>) -> Result<String, Debug<ServerError>> {
let res = tantivy_conn.search().map_err(ServerError::from)?;
Ok(format!("{}", res))
}
#[rocket::get("/graphql?<query..>")]
async fn graphql_query(schema: &State<GraphqlSchema>, query: GraphQLQuery) -> GraphQLResponse {
query.execute(schema.inner()).await
@ -223,8 +208,6 @@ async fn main() -> Result<(), Box<dyn Error>> {
.mount(
shared::urls::MOUNT_POINT,
routes![
reindex_news_db,
search_news_db,
original,
refresh,
show_pretty,
@ -246,21 +229,17 @@ async fn main() -> Result<(), Box<dyn Error>> {
std::fs::create_dir_all(&config.slurp_cache_path)?;
}
let pool = PgPool::connect(&config.newsreader_database_url).await?;
let tantivy_conn =
TantivyConnection::new(&config.newsreader_tantivy_db_path)?;
let tantivy_conn = TantivyConnection::new(&config.newsreader_tantivy_db_path)?;
let schema = Schema::build(QueryRoot, Mutation, EmptySubscription)
.data(Notmuch::default())
.data(config)
.data(pool.clone())
.data(tantivy_conn)
.extension(async_graphql::extensions::Logger)
.finish();
let rkt = rkt
.manage(schema)
.manage(pool)
.manage(Notmuch::default())
.manage(tantivy_conn);
let rkt = rkt.manage(schema).manage(pool).manage(Notmuch::default());
//.manage(Notmuch::with_config("../notmuch/testdata/notmuch.config"))
rkt.launch().await?;

View File

@ -8,7 +8,7 @@ use notmuch::Notmuch;
use serde::{Deserialize, Serialize};
use sqlx::postgres::PgPool;
use crate::{config::Config, newsreader, nm, Query};
use crate::{config::Config, newsreader, nm, tantivy::TantivyConnection, Query};
/// # Number of seconds since the Epoch
pub type UnixTime = isize;
@ -224,6 +224,7 @@ pub struct Tag {
struct SearchCursor {
newsreader_offset: i32,
notmuch_offset: i32,
tantivy_offset: i32,
}
pub struct QueryRoot;
@ -258,10 +259,13 @@ impl QueryRoot {
info!("search({after:?} {before:?} {first:?} {last:?} {query:?})",);
let nm = ctx.data_unchecked::<Notmuch>();
let pool = ctx.data_unchecked::<PgPool>();
let tantivy = ctx.data_unchecked::<TantivyConnection>();
#[derive(Debug)]
enum ThreadSummaryCursor {
Newsreader(i32, ThreadSummary),
Notmuch(i32, ThreadSummary),
Tantivy(i32, ThreadSummary),
}
Ok(connection::query(
after,
@ -279,8 +283,11 @@ impl QueryRoot {
);
let newsreader_after = after.as_ref().map(|sc| sc.newsreader_offset);
let notmuch_after = after.as_ref().map(|sc| sc.notmuch_offset);
let tantivy_after = after.as_ref().map(|sc| sc.tantivy_offset);
let newsreader_before = before.as_ref().map(|sc| sc.newsreader_offset);
let notmuch_before = before.as_ref().map(|sc| sc.notmuch_offset);
let tantivy_before = before.as_ref().map(|sc| sc.tantivy_offset);
let newsreader_query: Query = query.parse()?;
info!("newsreader_query {newsreader_query:?}");
@ -318,15 +325,39 @@ impl QueryRoot {
Vec::new()
};
let tantivy_results = if newsreader_query.is_tantivy {
tantivy
.search(
pool,
tantivy_after,
tantivy_before,
first.map(|v| v as i32),
last.map(|v| v as i32),
&newsreader_query,
)
.await?
.into_iter()
.map(|(cur, ts)| ThreadSummaryCursor::Tantivy(cur, ts))
.collect()
} else {
Vec::new()
};
info!(
"tantivy results:\nis_tantivy:{} {tantivy_results:#?}",
newsreader_query.is_tantivy
);
let mut results: Vec<_> = newsreader_results
.into_iter()
.chain(notmuch_results)
.chain(tantivy_results)
.collect();
// The leading '-' is to reverse sort
results.sort_by_key(|item| match item {
ThreadSummaryCursor::Newsreader(_, ts) => -ts.timestamp,
ThreadSummaryCursor::Notmuch(_, ts) => -ts.timestamp,
ThreadSummaryCursor::Tantivy(_, ts) => -ts.timestamp,
});
let mut has_next_page = before.is_some();
@ -348,6 +379,7 @@ impl QueryRoot {
let mut connection = Connection::new(has_previous_page, has_next_page);
let mut newsreader_offset = 0;
let mut notmuch_offset = 0;
let mut tantivy_offset = 0;
connection.edges.extend(results.into_iter().map(|item| {
let thread_summary;
@ -360,10 +392,15 @@ impl QueryRoot {
thread_summary = ts;
notmuch_offset = offset;
}
ThreadSummaryCursor::Tantivy(offset, ts) => {
thread_summary = ts;
tantivy_offset = offset;
}
}
let cur = OpaqueCursor(SearchCursor {
newsreader_offset,
notmuch_offset,
tantivy_offset,
});
Edge::new(cur, thread_summary)
}));
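Aside: the three-way merge-and-cursor bookkeeping above can be read in isolation. A standalone sketch with simplified types (i64 timestamps stand in for ThreadSummary, Cursor for OpaqueCursor<SearchCursor>); not part of this commit:

#[derive(Clone, Copy, Debug)]
struct Cursor {
    newsreader: i32,
    notmuch: i32,
    tantivy: i32,
}

// (source-local offset, unix timestamp), mirroring ThreadSummaryCursor.
enum Item {
    Newsreader(i32, i64),
    Notmuch(i32, i64),
    Tantivy(i32, i64),
}

fn merge(mut items: Vec<Item>) -> Vec<(Cursor, i64)> {
    // Newest first; the leading '-' reverses the sort, as in search().
    items.sort_by_key(|it| match it {
        Item::Newsreader(_, ts) | Item::Notmuch(_, ts) | Item::Tantivy(_, ts) => -*ts,
    });
    let (mut nr, mut nm, mut tv) = (0, 0, 0);
    items
        .into_iter()
        .map(|item| {
            let ts = match item {
                Item::Newsreader(off, ts) => { nr = off; ts }
                Item::Notmuch(off, ts) => { nm = off; ts }
                Item::Tantivy(off, ts) => { tv = off; ts }
            };
            // Each edge snapshots the latest offset seen from every source, so a
            // single opaque cursor can resume all three streams on the next page.
            (Cursor { newsreader: nr, notmuch: nm, tantivy: tv }, ts)
        })
        .collect()
}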
@ -443,6 +480,16 @@ impl Mutation {
nm.tag_remove(&tag, &query)?;
Ok(true)
}
/// Drop and recreate the tantivy index. Warning: this is slow.
async fn drop_and_load_index<'ctx>(&self, ctx: &Context<'ctx>) -> Result<bool, Error> {
let tantivy = ctx.data_unchecked::<TantivyConnection>();
let pool = ctx.data_unchecked::<PgPool>();
tantivy.drop_and_load_index()?;
tantivy.reindex(pool).await?;
Ok(true)
}
}
pub type GraphqlSchema = Schema<QueryRoot, Mutation, EmptySubscription>;
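async-graphql exposes the new drop_and_load_index mutation under the camelCase name dropAndLoadIndex. A sketch of triggering it over HTTP, not part of this commit (assumes a reqwest dependency with the json feature, serde_json, and rocket's default port 8000):

// Hypothetical client; the endpoint matches the /api/graphql mount in main().
async fn trigger_reindex() -> Result<(), reqwest::Error> {
    let body = serde_json::json!({ "query": "mutation { dropAndLoadIndex }" });
    let resp = reqwest::Client::new()
        .post("http://localhost:8000/api/graphql")
        .json(&body)
        .send()
        .await?;
    println!("{}", resp.text().await?);
    Ok(())
}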

View File

@ -18,11 +18,19 @@ use lol_html::{
};
use maplit::{hashmap, hashset};
use scraper::{Html, Selector};
use sqlx::{postgres::PgPool, types::time::PrimitiveDateTime};
use thiserror::Error;
use tokio::sync::Mutex;
use url::Url;
use crate::newsreader::{extract_thread_id, is_newsreader_thread};
use crate::{
error::ServerError,
graphql::ThreadSummary,
newsreader::{extract_thread_id, is_newsreader_thread},
};
const NEWSREADER_TAG_PREFIX: &str = "News/";
const NEWSREADER_THREAD_PREFIX: &str = "news:";
// TODO: figure out how to use Cow
#[async_trait]
@ -604,6 +612,7 @@ pub struct Query {
pub remainder: Vec<String>,
pub is_notmuch: bool,
pub is_newsreader: bool,
pub is_tantivy: bool,
}
impl Query {
@ -638,6 +647,7 @@ impl FromStr for Query {
let mut remainder = Vec::new();
let mut is_notmuch = false;
let mut is_newsreader = false;
let mut is_tantivy = false;
for word in s.split_whitespace() {
if word == "is:unread" {
unread_only = true
@ -664,6 +674,8 @@ impl FromStr for Query {
is_newsreader = true;
is_notmuch = true;
}
// TODO: decide if tantivy gets its own life or replaces newsreader
is_tantivy = is_newsreader;
Ok(Query {
unread_only,
tag,
@ -671,6 +683,53 @@ impl FromStr for Query {
remainder,
is_notmuch,
is_newsreader,
is_tantivy,
})
}
}
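Assuming the tag: branch elided above sets is_newsreader and is_notmuch for News/ tags, a hypothetical round trip through this parser would look like:

#[test]
fn parses_news_query() {
    // Hypothetical expectations; the exact tag: handling is not shown in this diff.
    let q: Query = "is:unread tag:News/rust grapheme".parse().unwrap();
    assert!(q.unread_only);
    assert!(q.is_newsreader && q.is_notmuch);
    assert!(q.is_tantivy); // mirrors is_newsreader for now (see TODO above)
    assert_eq!(q.remainder, vec!["grapheme".to_string()]);
}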
pub struct ThreadSummaryRecord {
pub site: Option<String>,
pub date: Option<PrimitiveDateTime>,
pub is_read: Option<bool>,
pub title: Option<String>,
pub uid: String,
pub name: Option<String>,
}
async fn thread_summary_from_row(r: ThreadSummaryRecord) -> ThreadSummary {
let site = r.site.unwrap_or("UNKNOWN TAG".to_string());
let mut tags = vec![format!("{NEWSREADER_TAG_PREFIX}{site}")];
if !r.is_read.unwrap_or(true) {
tags.push("unread".to_string());
};
let mut title = r.title.unwrap_or("NO TITLE".to_string());
title = clean_title(&title).await.expect("failed to clean title");
ThreadSummary {
thread: format!("{NEWSREADER_THREAD_PREFIX}{}", r.uid),
timestamp: r
.date
.expect("post missing date")
.assume_utc()
.unix_timestamp() as isize,
date_relative: "TODO date_relative".to_string(),
matched: 0,
total: 1,
authors: r.name.unwrap_or_else(|| site.clone()),
subject: title,
tags,
}
}
async fn clean_title(title: &str) -> Result<String, ServerError> {
    // Wrap the title in <html> once so the HTML parsers accept it.
    let mut title = format!("<html>{title}</html>");
    let title_transformers: Vec<Box<dyn Transformer>> =
        vec![Box::new(EscapeHtml), Box::new(StripHtml)];
    for t in title_transformers.iter() {
        if t.should_run(&None, &title) {
            title = t.transform(&None, &title).await?;
        }
    }
    Ok(title)
}
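A sketch of the intended behavior (the expected output assumes StripHtml leaves only the inner text, which the transformer names suggest but this diff doesn't show):

#[tokio::test]
async fn strips_markup_from_titles() {
    // Hypothetical expectation for illustration only.
    let cleaned = clean_title("Rust <em>graphemes</em>").await.unwrap();
    assert_eq!(cleaned, "Rust graphemes");
}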

View File

@ -10,35 +10,33 @@ use tokio::sync::Mutex;
use url::Url;
use crate::{
compute_offset_limit,
clean_title, compute_offset_limit,
config::Config,
error::ServerError,
graphql::{NewsPost, Tag, Thread, ThreadSummary},
AddOutlink, EscapeHtml, FrameImages, InlineStyle, Query, SanitizeHtml, SlurpContents,
StripHtml, Transformer,
thread_summary_from_row, AddOutlink, EscapeHtml, FrameImages, InlineStyle, Query, SanitizeHtml,
SlurpContents, StripHtml, ThreadSummaryRecord, Transformer, NEWSREADER_TAG_PREFIX,
NEWSREADER_THREAD_PREFIX,
};
const TAG_PREFIX: &'static str = "News/";
const THREAD_PREFIX: &'static str = "news:";
pub fn is_newsreader_search(query: &str) -> bool {
query.contains(TAG_PREFIX)
query.contains(NEWSREADER_TAG_PREFIX)
}
pub fn is_newsreader_thread(query: &str) -> bool {
query.starts_with(THREAD_PREFIX)
query.starts_with(NEWSREADER_THREAD_PREFIX)
}
pub fn extract_thread_id(query: &str) -> &str {
&query[THREAD_PREFIX.len()..]
&query[NEWSREADER_THREAD_PREFIX.len()..]
}
pub fn extract_site(tag: &str) -> &str {
&tag[TAG_PREFIX.len()..]
&tag[NEWSREADER_TAG_PREFIX.len()..]
}
pub fn make_news_tag(tag: &str) -> String {
format!("tag:{TAG_PREFIX}{tag}")
format!("tag:{NEWSREADER_TAG_PREFIX}{tag}")
}
pub async fn count(pool: &PgPool, query: &Query) -> Result<usize, ServerError> {
@ -93,37 +91,23 @@ pub async fn search(
)
.fetch_all(pool)
.await?;
let mut res = Vec::new();
for (i, r) in rows.into_iter().enumerate() {
let site = r.site.unwrap_or("UNKOWN TAG".to_string());
let mut tags = vec![format!("{TAG_PREFIX}{site}")];
if !r.is_read.unwrap_or(true) {
tags.push("unread".to_string());
};
let mut title = r.title.unwrap_or("NO TITLE".to_string());
title = clean_title(&title).await.expect("failed to clean title");
res.push((
i as i32 + offset,
ThreadSummary {
thread: format!("{THREAD_PREFIX}{}", r.uid),
timestamp: r
.date
.expect("post missing date")
.assume_utc()
.unix_timestamp() as isize,
date_relative: "TODO date_relative".to_string(),
matched: 0,
total: 1,
authors: r.name.unwrap_or_else(|| site.clone()),
subject: title,
tags,
},
thread_summary_from_row(ThreadSummaryRecord {
site: r.site,
date: r.date,
is_read: r.is_read,
title: r.title,
uid: r.uid,
name: r.name,
})
.await,
));
}
Ok(res)
}
pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, ServerError> {
// TODO: optimize query by using needs_unread
let tags = sqlx::query_file!("sql/tags.sql").fetch_all(pool).await?;
@ -131,7 +115,10 @@ pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, Server
.into_iter()
.map(|tag| {
let unread = tag.unread.unwrap_or(0).try_into().unwrap_or(0);
let name = format!("{TAG_PREFIX}{}", tag.site.expect("tag must have site"));
let name = format!(
"{NEWSREADER_TAG_PREFIX}{}",
tag.site.expect("tag must have site")
);
let hex = compute_color(&name);
Tag {
name,
@ -150,8 +137,8 @@ pub async fn thread(
thread_id: String,
) -> Result<Thread, ServerError> {
let id = thread_id
.strip_prefix(THREAD_PREFIX)
.expect("news thread doesn't start with '{THREAD_PREFIX}'")
.strip_prefix(NEWSREADER_THREAD_PREFIX)
.expect("news thread doesn't start with '{NEWSREADER_THREAD_PREFIX}'")
.to_string();
let r = sqlx::query_file!("sql/thread.sql", id)
@ -265,17 +252,3 @@ pub async fn set_read_status<'ctx>(
.await?;
Ok(true)
}
async fn clean_title(title: &str) -> Result<String, ServerError> {
// Make title HTML so html parsers work
let mut title = format!("<html>{title}</html>");
let title_tranformers: Vec<Box<dyn Transformer>> =
vec![Box::new(EscapeHtml), Box::new(StripHtml)];
// Make title HTML so html parsers work
title = format!("<html>{title}</html>");
for t in title_tranformers.iter() {
if t.should_run(&None, &title) {
title = t.transform(&None, &title).await?;
}
}
Ok(title)
}

View File

@ -1,23 +1,31 @@
use log::info;
use sqlx::postgres::PgPool;
use tantivy::{Index, IndexWriter, TantivyError};
use tantivy::{schema::Value, Index, TantivyError};
use crate::error::ServerError;
use crate::{
error::ServerError, graphql::ThreadSummary, thread_summary_from_row, Query, ThreadSummaryRecord,
};
pub struct TantivyConnection {
index: Index,
db_path: String,
//index: Index,
}
impl TantivyConnection {
pub fn new(tantivy_db_path: &str) -> Result<TantivyConnection, TantivyError> {
let index = match Index::open_in_dir(tantivy_db_path) {
fn get_index(&self) -> Result<Index, TantivyError> {
Ok(match Index::open_in_dir(&self.db_path) {
Ok(idx) => idx,
Err(_) => {
create_news_db(tantivy_db_path)?;
Index::open_in_dir(tantivy_db_path)?
create_news_db(&self.db_path)?;
Index::open_in_dir(&self.db_path)?
}
};
Ok(TantivyConnection { index })
})
}
pub fn new(tantivy_db_path: &str) -> Result<TantivyConnection, TantivyError> {
Ok(TantivyConnection {
db_path: tantivy_db_path.to_string(),
})
}
pub async fn reindex(&self, pool: &PgPool) -> Result<(), ServerError> {
use tantivy::{doc, Term};
@ -25,8 +33,9 @@ impl TantivyConnection {
let start_time = std::time::Instant::now();
let pool: &PgPool = pool;
let mut index_writer = self.index.writer(50_000_000)?;
let schema = self.index.schema();
let index = self.get_index()?;
let mut index_writer = index.writer(50_000_000)?;
let schema = index.schema();
let site = schema.get_field("site")?;
let title = schema.get_field("title")?;
let summary = schema.get_field("summary")?;
@ -68,30 +77,76 @@ impl TantivyConnection {
info!("took {:.2}s to reindex", start_time.elapsed().as_secs_f32());
Ok(())
}
pub fn search(&self) -> Result<String, TantivyError> {
pub async fn search(
&self,
pool: &PgPool,
after: Option<i32>,
before: Option<i32>,
first: Option<i32>,
last: Option<i32>,
query: &Query,
) -> Result<Vec<(i32, ThreadSummary)>, async_graphql::Error> {
use tantivy::{collector::TopDocs, query::QueryParser, Document, TantivyDocument};
// TODO: set based on function parameters
let offset = 0;
let reader = self.index.reader()?;
let schema = self.index.schema();
let index = self.get_index()?;
let reader = index.reader()?;
let schema = index.schema();
let searcher = reader.searcher();
let site = schema.get_field("site")?;
let uid = schema.get_field("uid")?;
let title = schema.get_field("title")?;
let summary = schema.get_field("summary")?;
let query_parser = QueryParser::for_index(&self.index, vec![site, title, summary]);
let date = schema.get_field("date")?;
let query_parser = QueryParser::for_index(&index, vec![title, summary]);
let query = query_parser.parse_query("grapheme")?;
let query = query_parser.parse_query(&query.remainder.join(" "))?;
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let mut results = vec![];
info!("search found {} docs", top_docs.len());
for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
results.push(format!("{}", retrieved_doc.to_json(&schema)));
let uids = top_docs
.into_iter()
.map(|(_, doc_address)| {
searcher.doc(doc_address).map(|doc: TantivyDocument| {
doc.get_first(uid)
.expect("doc missing uid")
.as_str()
.expect("doc str missing")
.to_string()
})
})
.collect::<Result<Vec<String>, TantivyError>>()?;
//let uids = format!("'{}'", uids.join("','"));
info!("uids {uids:?}");
let rows = sqlx::query_file!("sql/threads-from-uid.sql", &uids as &[String])
.fetch_all(pool)
.await?;
let mut res = Vec::new();
info!("found {} hits joining w/ tantivy", rows.len());
for (i, r) in rows.into_iter().enumerate() {
res.push((
i as i32 + offset,
thread_summary_from_row(ThreadSummaryRecord {
site: r.site,
date: r.date,
is_read: r.is_read,
title: r.title,
uid: r.uid,
name: r.name,
})
.await,
));
}
Ok(results.join(" "))
Ok(res)
}
pub fn drop_and_load_index(&self) -> Result<(), TantivyError> {
create_news_db(&self.db_path)
}
}
fn create_news_db(tantivy_db_path: &str) -> Result<(), TantivyError> {
info!("create_news_db");
// Ignore a missing directory so a first-time index build doesn't fail.
if let Err(e) = std::fs::remove_dir_all(tantivy_db_path) {
    if e.kind() != std::io::ErrorKind::NotFound {
        return Err(e.into());
    }
}
std::fs::create_dir_all(tantivy_db_path)?;
use tantivy::schema::*;
@ -100,7 +155,7 @@ fn create_news_db(tantivy_db_path: &str) -> Result<(), TantivyError> {
schema_builder.add_text_field("title", TEXT | STORED);
schema_builder.add_text_field("summary", TEXT);
schema_builder.add_text_field("link", STRING | STORED);
schema_builder.add_date_field("date", FAST);
schema_builder.add_date_field("date", FAST | INDEXED | STORED);
schema_builder.add_bool_field("is_read", FAST);
schema_builder.add_text_field("uid", STRING | STORED);
schema_builder.add_i64_field("id", FAST);
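For orientation, the schema above supports an index-and-search round trip like the following sketch, which uses only a subset of the fields create_news_db declares and an in-memory index instead of Index::open_in_dir; not part of this commit:

use tantivy::{
    collector::TopDocs,
    doc,
    query::QueryParser,
    schema::{Schema, STORED, STRING, TEXT},
    Document, Index, TantivyDocument,
};

fn roundtrip() -> tantivy::Result<()> {
    let mut builder = Schema::builder();
    let title = builder.add_text_field("title", TEXT | STORED);
    let summary = builder.add_text_field("summary", TEXT);
    let uid = builder.add_text_field("uid", STRING | STORED);
    let schema = builder.build();

    // In-memory index for illustration; the 50MB writer budget matches reindex().
    let index = Index::create_in_ram(schema.clone());
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(
        title => "Grapheme clusters in Rust",
        summary => "Segmenting text by grapheme boundaries",
        uid => "post-1"
    ))?;
    writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();
    let query = QueryParser::for_index(&index, vec![title, summary]).parse_query("grapheme")?;
    for (_score, addr) in searcher.search(&query, &TopDocs::with_limit(10))? {
        let doc: TantivyDocument = searcher.doc(addr)?;
        println!("{}", doc.to_json(&schema));
    }
    Ok(())
}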

View File

@ -0,0 +1,59 @@
<!DOCTYPE html>
<html>
<head>
<meta charset=utf-8 />
<meta name="viewport" content="user-scalable=no, initial-scale=1.0, minimum-scale=1.0, maximum-scale=1.0, minimal-ui">
<title>GraphQL Playground</title>
<link rel="stylesheet" href="//cdn.jsdelivr.net/npm/graphql-playground-react/build/static/css/index.css" />
<link rel="shortcut icon" href="//cdn.jsdelivr.net/npm/graphql-playground-react/build/favicon.png" />
<script src="//cdn.jsdelivr.net/npm/graphql-playground-react/build/static/js/middleware.js"></script>
</head>
<body>
<div id="root">
<style>
body {
background-color: rgb(23, 42, 58);
font-family: Open Sans, sans-serif;
height: 90vh;
}
#root {
height: 100%;
width: 100%;
display: flex;
align-items: center;
justify-content: center;
}
.loading {
font-size: 32px;
font-weight: 200;
color: rgba(255, 255, 255, .6);
margin-left: 20px;
}
img {
width: 78px;
height: 78px;
}
.title {
font-weight: 400;
}
</style>
<img src='//cdn.jsdelivr.net/npm/graphql-playground-react/build/logo.png' alt=''>
<div class="loading"> Loading
<span class="title">GraphQL Playground</span>
</div>
</div>
<script>window.addEventListener('load', function (event) {
GraphQLPlayground.init(document.getElementById('root'), {
// options as 'endpoint' belong here
endpoint: "/api/graphql",
})
})</script>
</body>
</html>