web & server: using tantivy for news post search

Bill Thiede 2024-09-29 16:28:05 -07:00
parent f36d1e0c29
commit 3ec1741f10
22 changed files with 737 additions and 170 deletions
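For context on the new backend, a minimal sketch of the tantivy index-and-search flow this commit adopts, assuming a recent tantivy 0.2x API; field names loosely mirror the schema added in the tantivy module below, and an in-RAM index stands in for the on-disk one the server creates:

```rust
// A hypothetical, self-contained example; not the server's actual code.
use tantivy::{
    collector::TopDocs,
    doc,
    query::QueryParser,
    schema::{Schema, STORED, STRING, TEXT},
    Index, TantivyDocument,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Text fields similar to the news-post schema: title/summary are tokenized,
    // uid is stored verbatim so hits can be joined back to Postgres rows.
    let mut builder = Schema::builder();
    let title = builder.add_text_field("title", TEXT | STORED);
    let summary = builder.add_text_field("summary", TEXT);
    let uid = builder.add_text_field("uid", STRING | STORED);
    let schema = builder.build();

    // The server uses Index::create_in_dir; RAM keeps the sketch self-contained.
    let index = Index::create_in_ram(schema);
    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(
        title => "Using tantivy for news post search",
        summary => "Full-text search over feed posts stored in Postgres",
        uid => "example-uid-1",
    ))?;
    writer.commit()?;

    // Parse a free-text query against title/summary and fetch the top hits.
    let reader = index.reader()?;
    let searcher = reader.searcher();
    let query = QueryParser::for_index(&index, vec![title, summary]).parse_query("news search")?;
    for (_score, addr) in searcher.search(&query, &TopDocs::with_limit(10))? {
        let hit: TantivyDocument = searcher.doc(addr)?;
        println!("{}", hit.to_json(&index.schema()));
    }
    Ok(())
}
```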

View File

@ -1,6 +1,6 @@
{
"db_name": "PostgreSQL",
"query": "SELECT\n COUNT(*) count\nFROM\n post\nWHERE\n ($1::text IS NULL OR site = $1)\n AND (\n NOT $2\n OR NOT is_read\n )\n",
"query": "SELECT\n COUNT(*) count\nFROM\n post\nWHERE\n (\n $1 :: text IS NULL\n OR site = $1\n )\n AND (\n NOT $2\n OR NOT is_read\n )\n",
"describe": {
"columns": [
{
@ -19,5 +19,5 @@
null
]
},
"hash": "e28b890e308f483aa6bd08617548ae66294ae1e99b1cab49f5f4211e0fd7d419"
"hash": "13a9193d91264b678c18df032cc61b523e6a804ae9569da18455eb380eadb515"
}
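The cached query above backs sql/count.sql. A hedged sketch of a standalone caller, mirroring the call shape in the newsreader module: the optional site filter binds an Option<String> to $1, so passing None disables the site = $1 clause.

```rust
// Hypothetical standalone caller; the real call lives in the newsreader count path.
// sqlx::query_file! checks sql/count.sql at compile time, so an Option<String>
// binds to the "$1::text IS NULL OR site = $1" pattern and a bool drives $2.
async fn count_posts(
    pool: &sqlx::PgPool,
    site: Option<String>,
    unread_only: bool,
) -> Result<i64, sqlx::Error> {
    let row = sqlx::query_file!("sql/count.sql", site, unread_only)
        .fetch_one(pool)
        .await?;
    // COUNT(*) is inferred as nullable, hence the Option.
    Ok(row.count.unwrap_or(0))
}
```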

View File

@ -1,6 +1,6 @@
{
"db_name": "PostgreSQL",
"query": "SELECT\n site,\n title,\n summary,\n link,\n date,\n is_read,\n uid,\n id\nFROM post\n",
"query": "SELECT\n site,\n title,\n summary,\n link,\n date,\n is_read,\n uid,\n p.id id\nFROM\n post AS p\n JOIN feed AS f ON p.site = f.slug -- necessary to weed out nzb posts\nORDER BY\n date DESC;\n",
"describe": {
"columns": [
{
@ -58,5 +58,5 @@
false
]
},
"hash": "1b2244c9b9b64a1395d8d266f5df5352242bbe5efe481b0852e1c1d4b40584a7"
"hash": "4cbb6252a0b76f6a452ee09abb9f4b99308a74db33cdfb91baaa1bb98dc53a82"
}

View File

@ -0,0 +1,64 @@
{
"db_name": "PostgreSQL",
"query": "SELECT\n site AS \"site!\",\n title AS \"title!\",\n summary AS \"summary!\",\n link AS \"link!\",\n date AS \"date!\",\n is_read AS \"is_read!\",\n uid AS \"uid!\",\n p.id id\nFROM\n post p\n JOIN feed f ON p.site = f.slug\nWHERE\n uid = ANY ($1);\n",
"describe": {
"columns": [
{
"ordinal": 0,
"name": "site!",
"type_info": "Text"
},
{
"ordinal": 1,
"name": "title!",
"type_info": "Text"
},
{
"ordinal": 2,
"name": "summary!",
"type_info": "Text"
},
{
"ordinal": 3,
"name": "link!",
"type_info": "Text"
},
{
"ordinal": 4,
"name": "date!",
"type_info": "Timestamp"
},
{
"ordinal": 5,
"name": "is_read!",
"type_info": "Bool"
},
{
"ordinal": 6,
"name": "uid!",
"type_info": "Text"
},
{
"ordinal": 7,
"name": "id",
"type_info": "Int4"
}
],
"parameters": {
"Left": [
"TextArray"
]
},
"nullable": [
true,
true,
true,
true,
true,
true,
false,
false
]
},
"hash": "700757f6fb175b1aa246816ac60501576a38db222380c034ae7bc030ba6a915e"
}

View File

@ -0,0 +1,20 @@
{
"db_name": "PostgreSQL",
"query": "SELECT\n uid\nFROM\n post AS p\n JOIN feed AS f ON p.site = f.slug -- necessary to weed out nzb posts\n;\n",
"describe": {
"columns": [
{
"ordinal": 0,
"name": "uid",
"type_info": "Text"
}
],
"parameters": {
"Left": []
},
"nullable": [
false
]
},
"hash": "c16821d20c1e6012d6079008889af16eed687c74d9d77957ad08aaaeefcd1ff2"
}

View File

@ -0,0 +1,52 @@
{
"db_name": "PostgreSQL",
"query": "SELECT\n site,\n date,\n is_read,\n title,\n uid,\n name\nFROM\n post p\n JOIN feed f ON p.site = f.slug\nWHERE\n uid = ANY ($1)\nORDER BY\n date DESC;\n",
"describe": {
"columns": [
{
"ordinal": 0,
"name": "site",
"type_info": "Text"
},
{
"ordinal": 1,
"name": "date",
"type_info": "Timestamp"
},
{
"ordinal": 2,
"name": "is_read",
"type_info": "Bool"
},
{
"ordinal": 3,
"name": "title",
"type_info": "Text"
},
{
"ordinal": 4,
"name": "uid",
"type_info": "Text"
},
{
"ordinal": 5,
"name": "name",
"type_info": "Text"
}
],
"parameters": {
"Left": [
"TextArray"
]
},
"nullable": [
true,
true,
true,
true,
false,
true
]
},
"hash": "ed34545b5fc681e9df9fc47c6ff93e5eac874eb8f95f55a27859537c51bca4f4"
}

View File

@ -6,6 +6,9 @@ SELECT
date,
is_read,
uid,
id
FROM post
WHERE title ILIKE '%grapheme%' OR summary ILIKE '%grapheme%';
p.id id
FROM
post AS p
JOIN feed AS f ON p.site = f.slug -- necessary to weed out nzb posts
ORDER BY
date DESC;

server/sql/all-uids.sql (new file)
View File

@ -0,0 +1,6 @@
SELECT
uid
FROM
post AS p
JOIN feed AS f ON p.site = f.slug -- necessary to weed out nzb posts
;

View File

@ -3,7 +3,10 @@ SELECT
FROM
post
WHERE
($1::text IS NULL OR site = $1)
(
$1 :: text IS NULL
OR site = $1
)
AND (
NOT $2
OR NOT is_read

View File

@ -0,0 +1,14 @@
SELECT
site AS "site!",
title AS "title!",
summary AS "summary!",
link AS "link!",
date AS "date!",
is_read AS "is_read!",
uid AS "uid!",
p.id id
FROM
post p
JOIN feed f ON p.site = f.slug
WHERE
uid = ANY ($1);
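A hedged sketch of how this query is typically called: a Rust string slice binds to the text[] parameter consumed by "uid = ANY ($1)". The server itself uses query_file_as! into its PostgresDoc struct.

```rust
// Hypothetical standalone caller; field names come from the "title!" overrides above.
async fn titles_for_uids(
    pool: &sqlx::PgPool,
    uids: &[String],
) -> Result<Vec<String>, sqlx::Error> {
    let rows = sqlx::query_file!("sql/posts-from-uids.sql", uids)
        .fetch_all(pool)
        .await?;
    Ok(rows.into_iter().map(|r| r.title).collect())
}
```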

View File

@ -10,4 +10,5 @@ FROM
JOIN feed f ON p.site = f.slug
WHERE
uid = ANY ($1)
;
ORDER BY
date DESC;

View File

@ -27,11 +27,6 @@ use server::{
};
use sqlx::postgres::PgPool;
#[get("/refresh")]
async fn refresh(nm: &State<Notmuch>) -> Result<Json<String>, Debug<NotmuchError>> {
Ok(Json(String::from_utf8_lossy(&nm.new()?).to_string()))
}
#[get("/show/<query>/pretty")]
async fn show_pretty(
nm: &State<Notmuch>,
@ -209,7 +204,6 @@ async fn main() -> Result<(), Box<dyn Error>> {
shared::urls::MOUNT_POINT,
routes![
original,
refresh,
show_pretty,
show,
graphql_query,

View File

@ -1,3 +1,5 @@
use std::str::FromStr;
use async_graphql::{
connection::{self, Connection, Edge, OpaqueCursor},
Context, EmptySubscription, Enum, Error, FieldResult, InputObject, Object, Schema,
@ -16,6 +18,26 @@ pub type UnixTime = isize;
/// # Thread ID, sans "thread:"
pub type ThreadId = String;
#[derive(Debug, Enum, Copy, Clone, Eq, PartialEq)]
pub enum Corpus {
Notmuch,
Newsreader,
Tantivy,
}
impl FromStr for Corpus {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
Ok(match s {
"notmuch" => Corpus::Notmuch,
"newsreader" => Corpus::Newsreader,
"tantivy" => Corpus::Tantivy,
s => return Err(format!("unknown corpus: '{s}'")),
})
}
}
// TODO: add is_read field and remove all use of 'tag:unread'
#[derive(Debug, SimpleObject)]
pub struct ThreadSummary {
pub thread: ThreadId,
@ -30,6 +52,7 @@ pub struct ThreadSummary {
pub authors: String,
pub subject: String,
pub tags: Vec<String>,
pub corpus: Corpus,
}
#[derive(Debug, Union)]
@ -237,13 +260,16 @@ impl QueryRoot {
async fn count<'ctx>(&self, ctx: &Context<'ctx>, query: String) -> Result<usize, Error> {
let nm = ctx.data_unchecked::<Notmuch>();
let pool = ctx.data_unchecked::<PgPool>();
let tantivy = ctx.data_unchecked::<TantivyConnection>();
let newsreader_query: Query = query.parse()?;
let newsreader_count = newsreader::count(pool, &newsreader_query).await?;
let notmuch_count = nm::count(nm, &newsreader_query.to_notmuch()).await?;
info!("count {newsreader_query:?} newsreader count {newsreader_count} notmuch count {notmuch_count}");
Ok(newsreader_count + notmuch_count)
let notmuch_count = nm::count(nm, &newsreader_query).await?;
let tantivy_count = tantivy.count(&newsreader_query).await?;
let total = newsreader_count + notmuch_count + tantivy_count;
info!("count {newsreader_query:?} newsreader count {newsreader_count} notmuch count {notmuch_count} tantivy count {tantivy_count} total {total}");
Ok(total)
}
async fn search<'ctx>(
@ -255,18 +281,11 @@ impl QueryRoot {
last: Option<i32>,
query: String,
) -> Result<Connection<OpaqueCursor<SearchCursor>, ThreadSummary>, Error> {
// TODO: add keywords to limit search to one corpus, i.e. is:news or is:mail
info!("search({after:?} {before:?} {first:?} {last:?} {query:?})",);
let nm = ctx.data_unchecked::<Notmuch>();
let pool = ctx.data_unchecked::<PgPool>();
let tantivy = ctx.data_unchecked::<TantivyConnection>();
#[derive(Debug)]
enum ThreadSummaryCursor {
Newsreader(i32, ThreadSummary),
Notmuch(i32, ThreadSummary),
Tantivy(i32, ThreadSummary),
}
Ok(connection::query(
after,
before,
@ -277,7 +296,7 @@ impl QueryRoot {
first: Option<usize>,
last: Option<usize>| async move {
info!(
"search({:?} {:?} {first:?} {last:?} {query:?})",
"search(after {:?} before {:?} first {first:?} last {last:?} query: {query:?})",
after.as_ref().map(|v| &v.0),
before.as_ref().map(|v| &v.0)
);
@ -288,65 +307,40 @@ impl QueryRoot {
let newsreader_before = before.as_ref().map(|sc| sc.newsreader_offset);
let notmuch_before = before.as_ref().map(|sc| sc.notmuch_offset);
let tantivy_before = before.as_ref().map(|sc| sc.tantivy_offset);
let first = first.map(|v| v as i32);
let last = last.map(|v| v as i32);
let newsreader_query: Query = query.parse()?;
info!("newsreader_query {newsreader_query:?}");
let newsreader_results = if newsreader_query.is_newsreader {
newsreader::search(
pool,
newsreader_after,
newsreader_before,
first.map(|v| v as i32),
last.map(|v| v as i32),
&newsreader_query,
)
.await?
.into_iter()
.map(|(cur, ts)| ThreadSummaryCursor::Newsreader(cur, ts))
.collect()
} else {
Vec::new()
};
let notmuch_results = if newsreader_query.is_notmuch {
nm::search(
nm,
notmuch_after,
notmuch_before,
first.map(|v| v as i32),
last.map(|v| v as i32),
newsreader_query.to_notmuch(),
)
.await?
.into_iter()
.map(|(cur, ts)| ThreadSummaryCursor::Notmuch(cur, ts))
.collect()
} else {
Vec::new()
};
let tantivy_results = if newsreader_query.is_tantivy {
tantivy
.search(
pool,
tantivy_after,
tantivy_before,
first.map(|v| v as i32),
last.map(|v| v as i32),
&newsreader_query,
)
.await?
.into_iter()
.map(|(cur, ts)| ThreadSummaryCursor::Tantivy(cur, ts))
.collect()
} else {
Vec::new()
};
let query: Query = query.parse()?;
info!("newsreader_query {query:?}");
let newsreader_results = newsreader_search(
pool,
newsreader_after,
newsreader_before,
first,
last,
&query,
)
.await?;
let notmuch_results =
notmuch_search(nm, notmuch_after, notmuch_before, first, last, &query).await?;
let tantivy_results = tantivy_search(
tantivy,
pool,
tantivy_after,
tantivy_before,
first,
last,
&query,
)
.await?;
info!(
"tantivy results:\nis_tantivy:{} {tantivy_results:#?}",
newsreader_query.is_tantivy
"newsreader_results ({}) notmuch_results ({}) tantivy_results ({})",
newsreader_results.len(),
notmuch_results.len(),
tantivy_results.len()
);
let mut results: Vec<_> = newsreader_results
.into_iter()
.chain(notmuch_results)
@ -362,6 +356,7 @@ impl QueryRoot {
let mut has_next_page = before.is_some();
if let Some(first) = first {
let first = first as usize;
if results.len() > first {
has_next_page = true;
results.truncate(first);
@ -370,6 +365,7 @@ impl QueryRoot {
let mut has_previous_page = after.is_some();
if let Some(last) = last {
let last = last as usize;
if results.len() > last {
has_previous_page = true;
results.truncate(last);
@ -437,6 +433,59 @@ impl QueryRoot {
}
}
#[derive(Debug)]
enum ThreadSummaryCursor {
Newsreader(i32, ThreadSummary),
Notmuch(i32, ThreadSummary),
Tantivy(i32, ThreadSummary),
}
async fn newsreader_search(
pool: &PgPool,
after: Option<i32>,
before: Option<i32>,
first: Option<i32>,
last: Option<i32>,
query: &Query,
) -> Result<Vec<ThreadSummaryCursor>, async_graphql::Error> {
Ok(newsreader::search(pool, after, before, first, last, &query)
.await?
.into_iter()
.map(|(cur, ts)| ThreadSummaryCursor::Newsreader(cur, ts))
.collect())
}
async fn notmuch_search(
nm: &Notmuch,
after: Option<i32>,
before: Option<i32>,
first: Option<i32>,
last: Option<i32>,
query: &Query,
) -> Result<Vec<ThreadSummaryCursor>, async_graphql::Error> {
Ok(nm::search(nm, after, before, first, last, &query)
.await?
.into_iter()
.map(|(cur, ts)| ThreadSummaryCursor::Notmuch(cur, ts))
.collect())
}
async fn tantivy_search(
tantivy: &TantivyConnection,
pool: &PgPool,
after: Option<i32>,
before: Option<i32>,
first: Option<i32>,
last: Option<i32>,
query: &Query,
) -> Result<Vec<ThreadSummaryCursor>, async_graphql::Error> {
Ok(tantivy
.search(pool, after, before, first, last, &query)
.await?
.into_iter()
.map(|(cur, ts)| ThreadSummaryCursor::Tantivy(cur, ts))
.collect())
}
pub struct Mutation;
#[Object]
impl Mutation {
@ -448,14 +497,12 @@ impl Mutation {
) -> Result<bool, Error> {
let nm = ctx.data_unchecked::<Notmuch>();
let pool = ctx.data_unchecked::<PgPool>();
let tantivy = ctx.data_unchecked::<TantivyConnection>();
for q in query.split_whitespace() {
if newsreader::is_newsreader_thread(&q) {
newsreader::set_read_status(pool, &q, unread).await?;
} else {
nm::set_read_status(nm, q, unread).await?;
}
}
let query: Query = query.parse()?;
newsreader::set_read_status(pool, &query, unread).await?;
tantivy.reindex_thread(pool, &query).await?;
nm::set_read_status(nm, &query, unread).await?;
Ok(true)
}
async fn tag_add<'ctx>(
@ -486,10 +533,19 @@ impl Mutation {
let pool = ctx.data_unchecked::<PgPool>();
tantivy.drop_and_load_index()?;
tantivy.reindex(pool).await?;
tantivy.reindex_all(pool).await?;
Ok(true)
}
async fn refresh<'ctx>(&self, ctx: &Context<'ctx>) -> Result<bool, Error> {
let nm = ctx.data_unchecked::<Notmuch>();
let tantivy = ctx.data_unchecked::<TantivyConnection>();
let pool = ctx.data_unchecked::<PgPool>();
// TODO: parallelize
info!("{}", String::from_utf8_lossy(&nm.new()?));
tantivy.refresh(pool).await?;
Ok(true)
}
}
pub type GraphqlSchema = Schema<QueryRoot, Mutation, EmptySubscription>;
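A hedged test sketch for the new Corpus parser, assuming the test module sits alongside it in graphql.rs; unknown values error out, which Query::from_str downgrades to a warning and corpus = None:

```rust
#[cfg(test)]
mod corpus_tests {
    use super::Corpus;

    #[test]
    fn parses_known_corpora_and_rejects_unknown() {
        assert_eq!("notmuch".parse::<Corpus>(), Ok(Corpus::Notmuch));
        assert_eq!("newsreader".parse::<Corpus>(), Ok(Corpus::Newsreader));
        assert_eq!("tantivy".parse::<Corpus>(), Ok(Corpus::Tantivy));
        assert!("sqlite".parse::<Corpus>().is_err());
    }
}
```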

View File

@ -18,15 +18,16 @@ use lol_html::{
};
use maplit::{hashmap, hashset};
use scraper::{Html, Selector};
use sqlx::{postgres::PgPool, types::time::PrimitiveDateTime};
use sqlx::types::time::PrimitiveDateTime;
use thiserror::Error;
use tokio::sync::Mutex;
use url::Url;
use crate::{
error::ServerError,
graphql::ThreadSummary,
newsreader::{extract_thread_id, is_newsreader_thread},
graphql::{Corpus, ThreadSummary},
newsreader::is_newsreader_thread,
nm::is_notmuch_thread_or_id,
};
const NEWSREADER_TAG_PREFIX: &'static str = "News/";
@ -607,12 +608,13 @@ fn compute_offset_limit(
#[derive(Debug)]
pub struct Query {
pub unread_only: bool,
pub tag: Option<String>,
pub uid: Option<String>,
pub tags: Vec<String>,
pub uids: Vec<String>,
pub remainder: Vec<String>,
pub is_notmuch: bool,
pub is_newsreader: bool,
pub is_tantivy: bool,
pub corpus: Option<Corpus>,
}
impl Query {
@ -627,10 +629,10 @@ impl Query {
if self.unread_only {
parts.push("is:unread".to_string());
}
if let Some(site) = &self.tag {
parts.push(format!("tag:{site}"));
for tag in &self.tags {
parts.push(format!("tag:{tag}"));
}
if let Some(uid) = &self.uid {
for uid in &self.uids {
parts.push(uid.clone());
}
parts.extend(self.remainder.clone());
@ -642,48 +644,60 @@ impl FromStr for Query {
type Err = Infallible;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let mut unread_only = false;
let mut tag = None;
let mut uid = None;
let mut tags = Vec::new();
let mut uids = Vec::new();
let mut remainder = Vec::new();
let mut is_notmuch = false;
let mut is_newsreader = false;
let is_newsreader = false;
let mut is_tantivy = false;
let mut corpus = None;
for word in s.split_whitespace() {
if word == "is:unread" {
unread_only = true
} else if word.starts_with("tag:") {
tag = Some(word["tag:".len()..].to_string())
tags.push(word["tag:".len()..].to_string());
/*
} else if word.starts_with("tag:") {
// Any tag that doesn't match site_prefix should explicitly set the site to something not in the
// database
site = Some(NON_EXISTENT_SITE_NAME.to_string());
*/
} else if word.starts_with("corpus:") {
let c = word["corpus:".len()..].to_string();
corpus = c.parse::<Corpus>().map(|c| Some(c)).unwrap_or_else(|e| {
warn!("Error parsing corpus '{c}': {e:?}");
None
});
} else if is_newsreader_thread(word) {
uid = Some(extract_thread_id(word).to_string())
uids.push(word.to_string());
} else if is_notmuch_thread_or_id(word) {
uids.push(word.to_string());
} else if word == "is:mail" || word == "is:email" || word == "is:notmuch" {
is_notmuch = true;
} else if word == "is:news" || word == "is:newsreader" {
is_newsreader = true;
is_tantivy = true;
} else {
remainder.push(word.to_string());
}
}
// If we don't see any explicit filters for a corpus, flip them all on
if !(is_notmuch || is_newsreader) {
is_newsreader = true;
if corpus.is_none() && !(is_newsreader || is_notmuch || is_tantivy) {
// Don't set is_newsreader unless debugging, assume tantivy can handle it.
// Explicitly setting corpus:newsreader will bypass this logic
// is_newsreader = true;
is_notmuch = true;
is_tantivy = true;
}
// TODO: decide if tantivy gets its own life or replaces newsreader
is_tantivy = is_newsreader;
Ok(Query {
unread_only,
tag,
uid,
tags,
uids,
remainder,
is_notmuch,
is_newsreader,
is_tantivy,
corpus,
})
}
}
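To make the new defaulting behavior concrete, a hedged test sketch assuming the test module sits alongside Query: with no is:/corpus: hint the query targets notmuch and tantivy but skips the raw newsreader path, while corpus: narrows explicitly.

```rust
#[cfg(test)]
mod query_tests {
    use super::Query;
    use crate::graphql::Corpus;

    #[test]
    fn default_query_targets_notmuch_and_tantivy() {
        let q: Query = "is:unread tag:News/hn rust".parse().unwrap();
        assert!(q.unread_only);
        assert_eq!(q.tags, vec!["News/hn".to_string()]);
        assert_eq!(q.remainder, vec!["rust".to_string()]);
        assert!(q.is_notmuch && q.is_tantivy && !q.is_newsreader);
        assert_eq!(q.corpus, None);
    }

    #[test]
    fn corpus_keyword_narrows_the_search() {
        let q: Query = "corpus:newsreader".parse().unwrap();
        assert_eq!(q.corpus, Some(Corpus::Newsreader));
    }
}
```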
@ -694,6 +708,7 @@ pub struct ThreadSummaryRecord {
pub title: Option<String>,
pub uid: String,
pub name: Option<String>,
pub corpus: Corpus,
}
async fn thread_summary_from_row(r: ThreadSummaryRecord) -> ThreadSummary {
@ -711,12 +726,14 @@ async fn thread_summary_from_row(r: ThreadSummaryRecord) -> ThreadSummary {
.expect("post missing date")
.assume_utc()
.unix_timestamp() as isize,
date_relative: "TODO date_relative".to_string(),
date_relative: format!("{:?}", r.date),
//date_relative: "TODO date_relative".to_string(),
matched: 0,
total: 1,
authors: r.name.unwrap_or_else(|| site.clone()),
subject: title,
tags,
corpus: r.corpus,
}
}
async fn clean_title(title: &str) -> Result<String, ServerError> {

View File

@ -13,14 +13,14 @@ use crate::{
clean_title, compute_offset_limit,
config::Config,
error::ServerError,
graphql::{NewsPost, Tag, Thread, ThreadSummary},
graphql::{Corpus, NewsPost, Tag, Thread, ThreadSummary},
thread_summary_from_row, AddOutlink, EscapeHtml, FrameImages, InlineStyle, Query, SanitizeHtml,
SlurpContents, StripHtml, ThreadSummaryRecord, Transformer, NEWSREADER_TAG_PREFIX,
SlurpContents, ThreadSummaryRecord, Transformer, NEWSREADER_TAG_PREFIX,
NEWSREADER_THREAD_PREFIX,
};
pub fn is_newsreader_search(query: &str) -> bool {
query.contains(NEWSREADER_TAG_PREFIX)
pub fn is_newsreader_query(query: &Query) -> bool {
query.is_newsreader || query.corpus == Some(Corpus::Newsreader)
}
pub fn is_newsreader_thread(query: &str) -> bool {
@ -28,7 +28,11 @@ pub fn is_newsreader_thread(query: &str) -> bool {
}
pub fn extract_thread_id(query: &str) -> &str {
&query[NEWSREADER_THREAD_PREFIX.len()..]
if query.starts_with(NEWSREADER_THREAD_PREFIX) {
&query[NEWSREADER_THREAD_PREFIX.len()..]
} else {
query
}
}
pub fn extract_site(tag: &str) -> &str {
@ -39,14 +43,33 @@ pub fn make_news_tag(tag: &str) -> String {
format!("tag:{NEWSREADER_TAG_PREFIX}{tag}")
}
fn site_from_tags(tags: &[String]) -> Option<String> {
for t in tags {
if t.starts_with(NEWSREADER_TAG_PREFIX) {
return Some(extract_site(t).to_string());
}
}
None
}
pub async fn count(pool: &PgPool, query: &Query) -> Result<usize, ServerError> {
if !is_newsreader_query(query) {
return Ok(0);
}
if !query.remainder.is_empty() {
// TODO: handle full text search against all sites, for now, early return if search words
// are specified.
return Ok(0);
}
let row = sqlx::query_file!("sql/count.sql", query.tag, query.unread_only)
let site = site_from_tags(&query.tags);
if !query.tags.is_empty() && site.is_none() {
// The newsreader backend only handles read/unread queries scoped to a site (or all
// sites); anything with a non-site tag isn't supported.
return Ok(0);
}
let row = sqlx::query_file!("sql/count.sql", site, query.unread_only)
.fetch_one(pool)
.await?;
Ok(row.count.unwrap_or(0).try_into().unwrap_or(0))
@ -61,12 +84,22 @@ pub async fn search(
query: &Query,
) -> Result<Vec<(i32, ThreadSummary)>, async_graphql::Error> {
info!("search({after:?} {before:?} {first:?} {last:?} {query:?}");
if !is_newsreader_query(query) {
return Ok(Vec::new());
}
if !query.remainder.is_empty() {
// TODO: handle full text search against all sites, for now, early return if search words
// are specified.
return Ok(Vec::new());
}
let site = site_from_tags(&query.tags);
if !query.tags.is_empty() && site.is_none() {
// The newsreader backend only handles read/unread queries scoped to a site (or all
// sites); anything with a non-site tag isn't supported.
return Ok(Vec::new());
}
let (offset, mut limit) = compute_offset_limit(after, before, first, last);
if before.is_none() {
// When searching forward, the +1 is to see if there are more pages of data available.
@ -75,7 +108,6 @@ pub async fn search(
limit = limit + 1;
}
let site = query.tag.as_ref().map(|t| extract_site(&t).to_string());
info!(
"search offset {offset} limit {limit} site {site:?} unread_only {}",
query.unread_only
@ -102,6 +134,7 @@ pub async fn search(
title: r.title,
uid: r.uid,
name: r.name,
corpus: Corpus::Newsreader,
})
.await,
));
@ -243,12 +276,22 @@ pub async fn thread(
}
pub async fn set_read_status<'ctx>(
pool: &PgPool,
query: &str,
query: &Query,
unread: bool,
) -> Result<bool, ServerError> {
let query: Query = query.parse()?;
sqlx::query_file!("sql/set_unread.sql", !unread, query.uid)
.execute(pool)
.await?;
// TODO: make single query when query.uids.len() > 1
let uids: Vec<_> = query
.uids
.iter()
.filter(|uid| is_newsreader_thread(uid))
.map(
|uid| extract_thread_id(uid), // TODO strip prefix
)
.collect();
for uid in uids {
sqlx::query_file!("sql/set_unread.sql", !unread, uid)
.execute(pool)
.await?;
}
Ok(true)
}
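A hedged test sketch for the more lenient extract_thread_id, assuming the test module sits in the newsreader module and that a bare uid does not itself start with NEWSREADER_THREAD_PREFIX:

```rust
#[cfg(test)]
mod newsreader_tests {
    use super::extract_thread_id;
    use crate::NEWSREADER_THREAD_PREFIX;

    #[test]
    fn extract_thread_id_handles_bare_and_prefixed_uids() {
        // A uid without the thread prefix is returned unchanged.
        assert_eq!(extract_thread_id("abc-123"), "abc-123");
        // A prefixed thread id still has the prefix stripped.
        let prefixed = format!("{NEWSREADER_THREAD_PREFIX}abc-123");
        assert_eq!(extract_thread_id(&prefixed), "abc-123");
    }
}
```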

View File

@ -14,10 +14,10 @@ use crate::{
compute_offset_limit,
error::ServerError,
graphql::{
Attachment, Body, DispositionType, Email, EmailThread, Header, Html, Message, PlainText,
Tag, Thread, ThreadSummary, UnhandledContentType,
Attachment, Body, Corpus, DispositionType, Email, EmailThread, Header, Html, Message,
PlainText, Tag, Thread, ThreadSummary, UnhandledContentType,
},
linkify_html, InlineStyle, SanitizeHtml, Transformer,
linkify_html, InlineStyle, Query, SanitizeHtml, Transformer,
};
const TEXT_PLAIN: &'static str = "text/plain";
@ -31,6 +31,14 @@ const MULTIPART_RELATED: &'static str = "multipart/related";
const MAX_RAW_MESSAGE_SIZE: usize = 100_000;
fn is_notmuch_query(query: &Query) -> bool {
query.is_notmuch || query.corpus == Some(Corpus::Notmuch)
}
pub fn is_notmuch_thread_or_id(id: &str) -> bool {
id.starts_with("id:") || id.starts_with("thread:")
}
// TODO(wathiede): decide good error type
pub fn threadset_to_messages(thread_set: notmuch::ThreadSet) -> Result<Vec<Message>, ServerError> {
for t in thread_set.0 {
@ -39,8 +47,12 @@ pub fn threadset_to_messages(thread_set: notmuch::ThreadSet) -> Result<Vec<Messa
Ok(Vec::new())
}
pub async fn count(nm: &Notmuch, query: &str) -> Result<usize, ServerError> {
Ok(nm.count(query)?)
pub async fn count(nm: &Notmuch, query: &Query) -> Result<usize, ServerError> {
if !is_notmuch_query(query) {
return Ok(0);
}
let query = query.to_notmuch();
Ok(nm.count(&query)?)
}
pub async fn search(
@ -49,8 +61,12 @@ pub async fn search(
before: Option<i32>,
first: Option<i32>,
last: Option<i32>,
query: String,
query: &Query,
) -> Result<Vec<(i32, ThreadSummary)>, async_graphql::Error> {
if !is_notmuch_query(query) {
return Ok(Vec::new());
}
let query = query.to_notmuch();
let (offset, mut limit) = compute_offset_limit(after, before, first, last);
if before.is_none() {
// When searching forward, the +1 is to see if there are more pages of data available.
@ -75,6 +91,7 @@ pub async fn search(
authors: ts.authors,
subject: ts.subject,
tags: ts.tags,
corpus: Corpus::Notmuch,
},
)
})
@ -770,13 +787,21 @@ fn render_content_type_tree(m: &ParsedMail) -> String {
pub async fn set_read_status<'ctx>(
nm: &Notmuch,
query: &str,
query: &Query,
unread: bool,
) -> Result<bool, ServerError> {
if unread {
nm.tag_add("unread", &format!("{query}"))?;
} else {
nm.tag_remove("unread", &format!("{query}"))?;
let uids: Vec<_> = query
.uids
.iter()
.filter(|uid| is_notmuch_thread_or_id(uid))
.collect();
info!("set_read_status({unread} {uids:?})");
for uid in uids {
if unread {
nm.tag_add("unread", uid)?;
} else {
nm.tag_remove("unread", uid)?;
}
}
Ok(true)
}

View File

@ -1,11 +1,26 @@
use log::info;
use sqlx::postgres::PgPool;
use tantivy::{schema::Value, Index, TantivyError};
use std::collections::HashSet;
use crate::{
error::ServerError, graphql::ThreadSummary, thread_summary_from_row, Query, ThreadSummaryRecord,
use log::{debug, error, info};
use sqlx::{postgres::PgPool, types::time::PrimitiveDateTime};
use tantivy::{
collector::{DocSetCollector, TopDocs},
query,
query::{AllQuery, BooleanQuery, Occur, QueryParser, TermQuery},
schema::{Facet, IndexRecordOption, Value},
DocAddress, Index, Searcher, TantivyDocument, TantivyError, Term,
};
use crate::{
compute_offset_limit,
error::ServerError,
graphql::{Corpus, ThreadSummary},
newsreader::{extract_thread_id, is_newsreader_thread},
thread_summary_from_row, Query, ThreadSummaryRecord,
};
pub fn is_tantivy_query(query: &Query) -> bool {
query.is_tantivy || query.corpus == Some(Corpus::Tantivy)
}
pub struct TantivyConnection {
db_path: String,
//index: Index,
@ -27,7 +42,67 @@ impl TantivyConnection {
db_path: tantivy_db_path.to_string(),
})
}
pub async fn reindex(&self, pool: &PgPool) -> Result<(), ServerError> {
pub async fn refresh(&self, pool: &PgPool) -> Result<(), ServerError> {
let start_time = std::time::Instant::now();
let p_uids: Vec<_> = sqlx::query_file!("sql/all-uids.sql")
.fetch_all(pool)
.await?
.into_iter()
.map(|r| r.uid)
.collect();
info!(
"refresh from postgres got {} uids in {}",
p_uids.len(),
start_time.elapsed().as_secs_f32()
);
let start_time = std::time::Instant::now();
let (searcher, _query) = self.searcher_and_query("")?;
let docs = searcher.search(&AllQuery, &DocSetCollector)?;
let uid = self.get_index()?.schema().get_field("uid")?;
let t_uids: Vec<_> = docs
.into_iter()
.map(|doc_address| {
searcher
.doc(doc_address)
.map(|doc: TantivyDocument| {
debug!("doc: {doc:#?}");
doc.get_first(uid)
.expect("uid")
.as_str()
.expect("as_str")
.to_string()
})
.expect("searcher.doc")
})
.collect();
info!(
"refresh tantivy got {} uids in {}",
t_uids.len(),
start_time.elapsed().as_secs_f32()
);
let t_set: HashSet<_> = t_uids.into_iter().collect();
let need: Vec<_> = p_uids
.into_iter()
.filter(|uid| !t_set.contains(uid.as_str()))
.collect();
if !need.is_empty() {
info!(
"need to reindex {} uids: {:?}...",
need.len(),
&need[..need.len().min(10)]
);
}
let batch_size = 1000;
let uids: Vec<_> = need[..need.len().min(batch_size)]
.into_iter()
.cloned()
.collect();
self.reindex_uids(pool, &uids).await
}
async fn reindex_uids(&self, pool: &PgPool, uids: &[String]) -> Result<(), ServerError> {
// TODO: add SlurpContents and convert HTML to text
use tantivy::{doc, Term};
let start_time = std::time::Instant::now();
@ -44,11 +119,20 @@ impl TantivyConnection {
let is_read = schema.get_field("is_read")?;
let uid = schema.get_field("uid")?;
let id = schema.get_field("id")?;
let tag = schema.get_field("tag")?;
let rows = sqlx::query_file!("sql/all-posts.sql")
info!("reindexing {} posts", uids.len());
let rows = sqlx::query_file_as!(PostgresDoc, "sql/posts-from-uids.sql", uids)
.fetch_all(pool)
.await?;
if uids.len() != rows.len() {
error!(
"Had {} uids and only got {} rows: uids {uids:?}",
uids.len(),
rows.len()
);
}
let total = rows.len();
for (i, r) in rows.into_iter().enumerate() {
if i % 10_000 == 0 {
@ -57,26 +141,76 @@ impl TantivyConnection {
start_time.elapsed().as_secs_f32()
);
}
let id_term = Term::from_field_text(uid, &r.uid);
index_writer.delete_term(id_term);
let slug = r.site;
let tag_facet = Facet::from(&format!("/News/{slug}"));
index_writer.add_document(doc!(
site => r.site.expect("UNKOWN_SITE"),
title => r.title.expect("UNKOWN_TITLE"),
site => slug.clone(),
title => r.title,
// TODO: clean and extract text from HTML
summary => r.summary.expect("UNKNOWN_SUMMARY"),
link => r.link.expect("link"),
date => tantivy::DateTime::from_primitive(r.date.expect("date")),
is_read => r.is_read.expect("is_read"),
summary => r.summary,
link => r.link,
date => tantivy::DateTime::from_primitive(r.date),
is_read => r.is_read,
uid => r.uid,
id => r.id as i64,
id => r.id as u64,
tag => tag_facet,
))?;
}
index_writer.commit()?;
info!("took {:.2}s to reindex", start_time.elapsed().as_secs_f32());
index_writer.commit()?;
Ok(())
}
pub async fn reindex_thread(&self, pool: &PgPool, query: &Query) -> Result<(), ServerError> {
let uids: Vec<_> = query
.uids
.iter()
.filter(|uid| is_newsreader_thread(uid))
.map(|uid| extract_thread_id(uid).to_string())
.collect();
Ok(self.reindex_uids(pool, &uids).await?)
}
pub async fn reindex_all(&self, pool: &PgPool) -> Result<(), ServerError> {
let rows = sqlx::query_file!("sql/all-posts.sql")
.fetch_all(pool)
.await?;
let uids: Vec<String> = rows.into_iter().map(|r| r.uid).collect();
self.reindex_uids(pool, &uids).await?;
Ok(())
}
fn searcher_and_query(
&self,
term: &str,
) -> Result<(Searcher, Box<dyn query::Query>), ServerError> {
let index = self.get_index()?;
let reader = index.reader()?;
let schema = index.schema();
let searcher = reader.searcher();
let title = schema.get_field("title")?;
let summary = schema.get_field("summary")?;
let query_parser = QueryParser::for_index(&index, vec![title, summary]);
// Tantivy uses '*' to match all docs, not empty string
let term = if term.is_empty() { "*" } else { term };
info!("query_parser('{term}')");
let query = query_parser.parse_query(&term)?;
Ok((searcher, query))
}
pub async fn count(&self, query: &Query) -> Result<usize, ServerError> {
if !is_tantivy_query(query) {
return Ok(0);
}
use tantivy::collector::Count;
let term = query.remainder.join(" ");
let (searcher, query) = self.searcher_and_query(&term)?;
Ok(searcher.search(&query, &Count)?)
}
pub async fn search(
&self,
pool: &PgPool,
@ -86,28 +220,51 @@ impl TantivyConnection {
last: Option<i32>,
query: &Query,
) -> Result<Vec<(i32, ThreadSummary)>, async_graphql::Error> {
use tantivy::{collector::TopDocs, query::QueryParser, Document, TantivyDocument};
// TODO: set based on function parameters
let offset = 0;
if !is_tantivy_query(query) {
return Ok(Vec::new());
}
let (offset, mut limit) = compute_offset_limit(after, before, first, last);
if before.is_none() {
// When searching forward, the +1 is to see if there are more pages of data available.
// Searching backwards implies there's more pages forward, because the value represented by
// `before` is on the next page.
limit = limit + 1;
}
let index = self.get_index()?;
let reader = index.reader()?;
let schema = index.schema();
let searcher = reader.searcher();
let site = schema.get_field("site")?;
let uid = schema.get_field("uid")?;
let title = schema.get_field("title")?;
let summary = schema.get_field("summary")?;
let date = schema.get_field("date")?;
let query_parser = QueryParser::for_index(&index, vec![title, summary]);
let query = query_parser.parse_query(&query.remainder.join(" "))?;
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let term = query.remainder.join(" ");
let (searcher, tantivy_query) = self.searcher_and_query(&term)?;
let tag = self.get_index()?.schema().get_field("tag")?;
let is_read = self.get_index()?.schema().get_field("is_read")?;
let mut terms = vec![(Occur::Must, tantivy_query)];
for t in &query.tags {
let facet = Facet::from(&format!("/{t}"));
let facet_term = Term::from_facet(tag, &facet);
let facet_term_query = Box::new(TermQuery::new(facet_term, IndexRecordOption::Basic));
terms.push((Occur::Must, facet_term_query));
}
if query.unread_only {
info!("searching for unread only");
let term = Term::from_field_bool(is_read, false);
terms.push((
Occur::Must,
Box::new(TermQuery::new(term, IndexRecordOption::Basic)),
));
}
let search_query = BooleanQuery::new(terms);
info!("Tantivy::search(term '{term}', off {offset}, lim {limit}, search_query {search_query:?})");
let top_docs = searcher.search(
&search_query,
&TopDocs::with_limit(limit as usize)
.and_offset(offset as usize)
.order_by_u64_field("date", tantivy::index::Order::Desc),
)?;
info!("search found {} docs", top_docs.len());
let uid = self.get_index()?.schema().get_field("uid")?;
let uids = top_docs
.into_iter()
.map(|(_, doc_address)| {
.map(|(_, doc_address): (u64, DocAddress)| {
searcher.doc(doc_address).map(|doc: TantivyDocument| {
debug!("doc: {doc:#?}");
doc.get_first(uid)
.expect("doc missing uid")
.as_str()
@ -134,6 +291,7 @@ impl TantivyConnection {
title: r.title,
uid: r.uid,
name: r.name,
corpus: Corpus::Tantivy,
})
.await,
));
@ -157,11 +315,23 @@ fn create_news_db(tantivy_db_path: &str) -> Result<(), TantivyError> {
schema_builder.add_text_field("summary", TEXT);
schema_builder.add_text_field("link", STRING | STORED);
schema_builder.add_date_field("date", FAST | INDEXED | STORED);
schema_builder.add_bool_field("is_read", FAST);
schema_builder.add_bool_field("is_read", FAST | INDEXED | STORED);
schema_builder.add_text_field("uid", STRING | STORED);
schema_builder.add_i64_field("id", FAST);
schema_builder.add_u64_field("id", FAST);
schema_builder.add_facet_field("tag", FacetOptions::default());
let schema = schema_builder.build();
Index::create_in_dir(tantivy_db_path, schema)?;
Ok(())
}
struct PostgresDoc {
site: String,
title: String,
summary: String,
link: String,
date: PrimitiveDateTime,
is_read: bool,
uid: String,
id: i32,
}
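A hedged sketch of the filter composition search() now performs, assuming tantivy 0.2x APIs and passing field handles in for brevity: the parsed free-text query is AND-ed with a facet term per tag and, for unread-only searches, an is_read == false term.

```rust
use tantivy::{
    query::{BooleanQuery, Occur, Query as TantivyQuery, QueryParser, TermQuery},
    schema::{Facet, Field, IndexRecordOption},
    Index, Term,
};

// Hypothetical helper; the real logic lives in TantivyConnection::search().
fn unread_posts_in_facet(
    index: &Index,
    title: Field,
    summary: Field,
    tag: Field,
    is_read: Field,
    text: &str,
) -> Result<BooleanQuery, Box<dyn std::error::Error>> {
    // '*' matches all docs when no free-text terms were given.
    let parser = QueryParser::for_index(index, vec![title, summary]);
    let text_query = parser.parse_query(if text.is_empty() { "*" } else { text })?;
    let mut clauses: Vec<(Occur, Box<dyn TantivyQuery>)> = vec![(Occur::Must, text_query)];
    // Restrict to one news site via the hierarchical "tag" facet, e.g. /News/hn.
    let facet = Facet::from("/News/hn");
    clauses.push((
        Occur::Must,
        Box::new(TermQuery::new(Term::from_facet(tag, &facet), IndexRecordOption::Basic)),
    ));
    // Keep only unread posts.
    clauses.push((
        Occur::Must,
        Box::new(TermQuery::new(
            Term::from_field_bool(is_read, false),
            IndexRecordOption::Basic,
        )),
    ));
    Ok(BooleanQuery::new(clauses))
}
```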

View File

@ -14,6 +14,7 @@ query FrontPageQuery($query: String!, $after: String $before: String, $first: In
subject
authors
tags
corpus
}
}
tags {

View File

@ -0,0 +1,3 @@
mutation RefreshMutation {
refresh
}

View File

@ -232,6 +232,35 @@
"name": "Boolean",
"possibleTypes": null
},
{
"description": null,
"enumValues": [
{
"deprecationReason": null,
"description": null,
"isDeprecated": false,
"name": "NOTMUCH"
},
{
"deprecationReason": null,
"description": null,
"isDeprecated": false,
"name": "NEWSREADER"
},
{
"deprecationReason": null,
"description": null,
"isDeprecated": false,
"name": "TANTIVY"
}
],
"fields": null,
"inputFields": null,
"interfaces": null,
"kind": "ENUM",
"name": "Corpus",
"possibleTypes": null
},
{
"description": null,
"enumValues": [
@ -850,6 +879,38 @@
"ofType": null
}
}
},
{
"args": [],
"deprecationReason": null,
"description": "Drop and recreate tantivy index. Warning this is slow",
"isDeprecated": false,
"name": "dropAndLoadIndex",
"type": {
"kind": "NON_NULL",
"name": null,
"ofType": {
"kind": "SCALAR",
"name": "Boolean",
"ofType": null
}
}
},
{
"args": [],
"deprecationReason": null,
"description": null,
"isDeprecated": false,
"name": "refresh",
"type": {
"kind": "NON_NULL",
"name": null,
"ofType": {
"kind": "SCALAR",
"name": "Boolean",
"ofType": null
}
}
}
],
"inputFields": null,
@ -1536,6 +1597,22 @@
}
}
}
},
{
"args": [],
"deprecationReason": null,
"description": null,
"isDeprecated": false,
"name": "corpus",
"type": {
"kind": "NON_NULL",
"name": null,
"ofType": {
"kind": "ENUM",
"name": "Corpus",
"ofType": null
}
}
}
],
"inputFields": null,

View File

@ -44,6 +44,14 @@ pub struct AddTagMutation;
)]
pub struct RemoveTagMutation;
#[derive(GraphQLQuery)]
#[graphql(
schema_path = "graphql/schema.json",
query_path = "graphql/refresh.graphql",
response_derives = "Debug"
)]
pub struct RefreshMutation;
pub async fn send_graphql<Body, Resp>(body: Body) -> Result<graphql_client::Response<Resp>, Error>
where
Body: Serialize,

View File

@ -111,7 +111,17 @@ pub fn update(msg: Msg, model: &mut Model, orders: &mut impl Orders<Msg>) {
Msg::Noop => {}
Msg::RefreshStart => {
model.refreshing_state = RefreshingState::Loading;
orders.perform_cmd(async move { Msg::RefreshDone(api::refresh_request().await.err()) });
orders.perform_cmd(async move {
Msg::RefreshDone(
send_graphql::<_, graphql::refresh_mutation::ResponseData>(
graphql::RefreshMutation::build_query(
graphql::refresh_mutation::Variables {},
),
)
.await
.err(),
)
});
}
Msg::RefreshDone(err) => {
model.refreshing_state = if let Some(err) = err {

View File

@ -203,7 +203,7 @@ fn view_search_results(
}),
]],
td![
C!["from"],
C!["from", format!("corpus-{:?} ", r.corpus)],
a![
C!["has-text-light", "text"],
attrs! {