server: more news title/body cleanup, and don't search news so much
commit d1cfc77148
parent c314e3c798
@@ -8,7 +8,7 @@ use notmuch::Notmuch;
 use serde::{Deserialize, Serialize};
 use sqlx::postgres::PgPool;
 
-use crate::{newsreader, nm};
+use crate::{newsreader, nm, Query};
 
 /// # Number of seconds since the Epoch
 pub type UnixTime = isize;
@@ -215,7 +215,7 @@ impl QueryRoot {
         let nm = ctx.data_unchecked::<Notmuch>();
         let pool = ctx.data_unchecked::<PgPool>();
 
-        let newsreader_query: newsreader::Query = query.parse()?;
+        let newsreader_query: Query = query.parse()?;
 
         Ok(newsreader::count(pool, &newsreader_query).await? + nm::count(nm, &query).await?)
     }
@@ -257,32 +257,46 @@ impl QueryRoot {
         let newsreader_before = before.as_ref().map(|sc| sc.newsreader_offset);
         let notmuch_before = before.as_ref().map(|sc| sc.notmuch_offset);
 
-        let newsreader_query: newsreader::Query = query.parse()?;
-        let newsreader_results = newsreader::search(
-            pool,
-            newsreader_after,
-            newsreader_before,
-            first.map(|v| v as i32),
-            last.map(|v| v as i32),
-            &newsreader_query,
-        )
-        .await?
-        .into_iter()
-        .map(|(cur, ts)| ThreadSummaryCursor::Newsreader(cur, ts));
+        let newsreader_query: Query = query.parse()?;
+        info!("newsreader_query {newsreader_query:?}");
+        let newsreader_results = if newsreader_query.is_newsreader {
+            newsreader::search(
+                pool,
+                newsreader_after,
+                newsreader_before,
+                first.map(|v| v as i32),
+                last.map(|v| v as i32),
+                &newsreader_query,
+            )
+            .await?
+            .into_iter()
+            .map(|(cur, ts)| ThreadSummaryCursor::Newsreader(cur, ts))
+            .collect()
+        } else {
+            Vec::new()
+        };
 
-        let notmuch_results = nm::search(
-            nm,
-            notmuch_after,
-            notmuch_before,
-            first.map(|v| v as i32),
-            last.map(|v| v as i32),
-            query,
-        )
-        .await?
-        .into_iter()
-        .map(|(cur, ts)| ThreadSummaryCursor::Notmuch(cur, ts));
+        let notmuch_results = if newsreader_query.is_notmuch {
+            nm::search(
+                nm,
+                notmuch_after,
+                notmuch_before,
+                first.map(|v| v as i32),
+                last.map(|v| v as i32),
+                newsreader_query.to_notmuch(),
+            )
+            .await?
+            .into_iter()
+            .map(|(cur, ts)| ThreadSummaryCursor::Notmuch(cur, ts))
+            .collect()
+        } else {
+            Vec::new()
+        };
 
-        let mut results: Vec<_> = newsreader_results.chain(notmuch_results).collect();
+        let mut results: Vec<_> = newsreader_results
+            .into_iter()
+            .chain(notmuch_results)
+            .collect();
 
         // The leading '-' is to reverse sort
         results.sort_by_key(|item| match item {
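
The merge at the end of this hunk flattens both corpora into one list and sorts it newest-first. A minimal standalone sketch of that pattern (the (i32, isize) pairs are stand-ins for the cursor/timestamp types, not the commit's own definitions):

// Stand-in sketch: merge two (cursor, timestamp) lists, newest first.
fn merge_newest_first(
    news: Vec<(i32, isize)>,
    mail: Vec<(i32, isize)>,
) -> Vec<(i32, isize)> {
    let mut results: Vec<_> = news.into_iter().chain(mail).collect();
    results.sort_by_key(|&(_, ts)| -ts); // the leading '-' reverses the sort
    results
}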
@@ -3,14 +3,21 @@ pub mod graphql;
 pub mod newsreader;
 pub mod nm;
 
+use std::{convert::Infallible, str::FromStr};
+
 use css_inline::{CSSInliner, InlineError, InlineOptions};
 use linkify::{LinkFinder, LinkKind};
-use log::error;
-use lol_html::{element, errors::RewritingError, rewrite_str, RewriteStrSettings};
+use log::{error, info};
+use lol_html::{element, errors::RewritingError, rewrite_str, text, RewriteStrSettings};
 use maplit::{hashmap, hashset};
 use thiserror::Error;
 use url::Url;
 
+use crate::newsreader::{
+    extract_thread_id, is_newsreader_search, is_newsreader_thread, make_news_tag,
+};
+const NON_EXISTENT_SITE_NAME: &'static str = "NO-SUCH-SITE";
+
 // TODO: figure out how to use Cow
 trait Transformer {
     fn should_run(&self, _html: &str) -> bool {
@@ -46,13 +53,38 @@ struct EscapeHtml;
 
 impl Transformer for EscapeHtml {
     fn should_run(&self, html: &str) -> bool {
-        html.starts_with("<")
+        html.contains("&")
     }
     fn transform(&self, html: &str) -> Result<String, TransformError> {
         Ok(html_escape::decode_html_entities(html).to_string())
     }
 }
 
+struct StripHtml;
+
+impl Transformer for StripHtml {
+    fn should_run(&self, html: &str) -> bool {
+        // Lame test
+        html.contains("<")
+    }
+    fn transform(&self, html: &str) -> Result<String, TransformError> {
+        let mut text = String::new();
+        let element_content_handlers = vec![text!("*", |t| {
+            text += t.as_str();
+            Ok(())
+        })];
+        let _ = rewrite_str(
+            html,
+            RewriteStrSettings {
+                element_content_handlers,
+                ..RewriteStrSettings::default()
+            },
+        )?;
+
+        Ok(text)
+    }
+}
+
 struct InlineStyle;
 
 impl Transformer for InlineStyle {
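
A rough usage sketch of the new StripHtml transformer, approximated as a free function calling lol_html directly (the same calls as the impl above); the text!("*") handler accumulates every text chunk while the rewriter drops the markup:

use lol_html::{rewrite_str, text, RewriteStrSettings};

// Approximation of StripHtml::transform:
// "<p>Hello <b>world</b></p>" -> "Hello world".
fn strip_tags(html: &str) -> String {
    let mut out = String::new();
    let element_content_handlers = vec![text!("*", |t| {
        out += t.as_str();
        Ok(())
    })];
    let _ = rewrite_str(
        html,
        RewriteStrSettings {
            element_content_handlers,
            ..RewriteStrSettings::default()
        },
    );
    out
}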
@@ -381,3 +413,83 @@ fn compute_offset_limit(
         }
     }
 }
+
+#[derive(Debug)]
+pub struct Query {
+    pub unread_only: bool,
+    pub tag: Option<String>,
+    pub uid: Option<String>,
+    pub remainder: Vec<String>,
+    pub is_notmuch: bool,
+    pub is_newsreader: bool,
+}
+
+impl Query {
+    // Converts the internal state of Query to something suitable for notmuch queries.
+    // Removes any letterbox-specific '<key>:<value>' terms.
+    fn to_notmuch(&self) -> String {
+        let mut parts = Vec::new();
+        if !self.is_notmuch {
+            return String::new();
+        }
+
+        if self.unread_only {
+            parts.push("is:unread".to_string());
+        }
+        if let Some(site) = &self.tag {
+            parts.push(format!("tag:{site}"));
+        }
+        if let Some(uid) = &self.uid {
+            parts.push(uid.clone());
+        }
+        parts.extend(self.remainder.clone());
+        parts.join(" ")
+    }
+}
+
+impl FromStr for Query {
+    type Err = Infallible;
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let mut unread_only = false;
+        let mut tag = None;
+        let mut uid = None;
+        let mut remainder = Vec::new();
+        let site_prefix = make_news_tag("");
+        let mut is_notmuch = false;
+        let mut is_newsreader = false;
+        for word in s.split_whitespace() {
+            if word == "is:unread" {
+                unread_only = true
+            } else if word.starts_with("tag:") {
+                tag = Some(word["tag:".len()..].to_string())
+            /*
+            } else if word.starts_with("tag:") {
+                // Any tag that doesn't match site_prefix should explicitly set the site to
+                // something not in the database
+                site = Some(NON_EXISTENT_SITE_NAME.to_string());
+            */
+            } else if is_newsreader_thread(word) {
+                uid = Some(extract_thread_id(word).to_string())
+            } else if word == "is:mail" || word == "is:email" || word == "is:notmuch" {
+                is_notmuch = true;
+            } else if word == "is:news" || word == "is:newsreader" {
+                is_newsreader = true;
+            } else {
+                remainder.push(word.to_string());
+            }
+        }
+        // If we don't see any explicit filters for a corpus, flip them all on
+        if !(is_notmuch || is_newsreader) {
+            is_newsreader = true;
+            is_notmuch = true;
+        }
+        Ok(Query {
+            unread_only,
+            tag,
+            uid,
+            remainder,
+            is_notmuch,
+            is_newsreader,
+        })
+    }
+}
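
A hedged sketch of how the new parser routes queries (the tag value is hypothetical): with no explicit corpus term both flags default on, while an is:news term pins the search to the newsreader and leaves notmuch with an empty query.

// Illustrative expectations for Query::from_str / to_notmuch, per the diff above.
let q: Query = "is:unread rust".parse().unwrap();
assert!(q.is_newsreader && q.is_notmuch); // no corpus filter: search both

let q: Query = "is:news tag:News/lwn".parse().unwrap(); // hypothetical tag
assert!(q.is_newsreader && !q.is_notmuch);
assert_eq!(q.to_notmuch(), ""); // the notmuch side is skipped entirely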
@@ -1,22 +1,19 @@
-use std::{
-    convert::Infallible,
-    hash::{DefaultHasher, Hash, Hasher},
-    str::FromStr,
-};
+use std::hash::{DefaultHasher, Hash, Hasher};
 
 use log::info;
 use sqlx::postgres::PgPool;
 use url::Url;
 
+use crate::Query;
+
 const TAG_PREFIX: &'static str = "News/";
 const THREAD_PREFIX: &'static str = "news:";
-const NON_EXISTENT_SITE_NAME: &'static str = "NO-SUCH-SITE";
 
 use crate::{
     compute_offset_limit,
     error::ServerError,
     graphql::{Body, Email, Html, Message, Tag, Thread, ThreadSummary},
-    EscapeHtml, InlineStyle, SanitizeHtml, Transformer,
+    EscapeHtml, InlineStyle, SanitizeHtml, StripHtml, Transformer,
 };
 
 pub fn is_newsreader_search(query: &str) -> bool {
@@ -27,8 +24,20 @@ pub fn is_newsreader_thread(query: &str) -> bool {
     query.starts_with(THREAD_PREFIX)
 }
 
+pub fn extract_thread_id(query: &str) -> &str {
+    &query[THREAD_PREFIX.len()..]
+}
+
+pub fn extract_site(tag: &str) -> &str {
+    &tag[TAG_PREFIX.len()..]
+}
+
+pub fn make_news_tag(tag: &str) -> String {
+    format!("tag:{TAG_PREFIX}{tag}")
+}
+
 pub async fn count(pool: &PgPool, query: &Query) -> Result<usize, ServerError> {
-    let row = sqlx::query_file!("sql/count.sql", query.site, query.unread_only)
+    let row = sqlx::query_file!("sql/count.sql", query.tag, query.unread_only)
         .fetch_one(pool)
         .await?;
     Ok(row.count.unwrap_or(0).try_into().unwrap_or(0))
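
A quick round-trip of the helpers added above (site name and thread id are hypothetical):

// Hypothetical values; TAG_PREFIX is "News/" and THREAD_PREFIX is "news:".
assert_eq!(make_news_tag("lwn"), "tag:News/lwn");
assert_eq!(extract_site("News/lwn"), "lwn");
assert_eq!(extract_thread_id("news:1234"), "1234");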
@@ -43,6 +52,12 @@ pub async fn search(
     query: &Query,
 ) -> Result<Vec<(i32, ThreadSummary)>, async_graphql::Error> {
+    info!("search({after:?} {before:?} {first:?} {last:?} {query:?})");
+    if !query.remainder.is_empty() {
+        // TODO: handle full text search against all sites; for now, return early if search
+        // words are specified.
+        return Ok(Vec::new());
+    }
 
     let (offset, mut limit) = compute_offset_limit(after, before, first, last);
     if before.is_none() {
         // When searching forward, the +1 is to see if there are more pages of data available.
@@ -50,11 +65,17 @@ pub async fn search(
         // `before` is on the next page.
         limit = limit + 1;
     }
-    info!("search offset {offset} limit {limit}");
 
+    let site = query.tag.as_ref().map(|t| extract_site(&t).to_string());
+    info!(
+        "search offset {offset} limit {limit} site {site:?} unread_only {}",
+        query.unread_only
+    );
+
+    // TODO: further limit results to include query.remainder if set
     let rows = sqlx::query_file!(
         "sql/threads.sql",
-        query.site,
+        site,
         query.unread_only,
         offset as i64,
         limit as i64
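
The limit+1 idiom in this hunk fetches one extra row to learn whether another page exists; a self-contained sketch of that check with assumed values:

// Fetch limit+1 rows; receiving the extra row means there is a further page.
let limit = 10usize;
let mut rows: Vec<i32> = (0..=limit as i32).collect(); // pretend the query returned 11 rows
let has_more = rows.len() > limit;
rows.truncate(limit);
assert!(has_more && rows.len() == limit);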
@@ -66,12 +87,13 @@ pub async fn search(
         .into_iter()
         .enumerate()
         .map(|(i, r)| {
-            let site = r.site.unwrap_or("UNKOWN SITE".to_string());
-            let tags = if r.is_read.unwrap_or(false) {
-                vec![site.clone()]
-            } else {
-                vec!["unread".to_string(), site.clone()]
+            let site = r.site.unwrap_or("UNKNOWN TAG".to_string());
+            let mut tags = vec![format!("{TAG_PREFIX}{site}")];
+            if !r.is_read.unwrap_or(true) {
+                tags.push("unread".to_string());
             };
+            let mut title = r.title.unwrap_or("NO TITLE".to_string());
+            title = clean_title(&title).expect("failed to clean title");
             (
                 i as i32 + offset,
                 ThreadSummary {
@@ -85,7 +107,7 @@ pub async fn search(
                     matched: 0,
                     total: 1,
                     authors: r.name.unwrap_or_else(|| site.clone()),
-                    subject: r.title.unwrap_or("NO TITLE".to_string()),
+                    subject: title,
                     tags,
                 },
             )
@@ -125,11 +147,10 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerError> {
         .fetch_one(pool)
         .await?;
 
-    let site = r.site.unwrap_or("NO SITE".to_string());
-    let tags = if r.is_read.unwrap_or(false) {
-        vec![site.clone()]
-    } else {
-        vec!["unread".to_string(), site.clone()]
+    let site = r.site.unwrap_or("NO TAG".to_string());
+    let mut tags = vec![format!("{TAG_PREFIX}{site}")];
+    if !r.is_read.unwrap_or(true) {
+        tags.push("unread".to_string());
     };
     let default_homepage = "http://no-homepage";
     let homepage = Url::parse(
@@ -166,11 +187,11 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerError> {
             }
         }
     });
-    let mut html = r.summary.unwrap_or("NO SUMMARY".to_string());
+    let mut body = r.summary.unwrap_or("NO SUMMARY".to_string());
     // TODO: add site specific cleanups. For example:
     // * Grafana does <div class="image-wrapp"><img class="lazyload>"<img src="/media/...>"</img></div>
     // * Some sites appear to be HTML encoded, unencode them, i.e. imperialviolent
-    let transformers: Vec<Box<dyn Transformer>> = vec![
+    let body_transformers: Vec<Box<dyn Transformer>> = vec![
         Box::new(EscapeHtml),
         Box::new(InlineStyle),
         Box::new(SanitizeHtml {
@@ -178,16 +199,16 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerError> {
             base_url: &link,
         }),
     ];
-    for t in transformers.iter() {
-        if t.should_run(&html) {
-            html = t.transform(&html)?;
+    for t in body_transformers.iter() {
+        if t.should_run(&body) {
+            body = t.transform(&body)?;
         }
     }
     let body = Body::Html(Html {
-        html,
+        html: body,
         content_tree: "".to_string(),
     });
-    let title = r.title.unwrap_or("NO TITLE".to_string());
+    let title = clean_title(&r.title.unwrap_or("NO TITLE".to_string()))?;
     let from = Some(Email {
         name: r.name,
         addr: addr.map(|a| a.to_string()),
@@ -215,47 +236,6 @@ pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerError> {
         }],
     })
 }
-
-#[derive(Debug)]
-pub struct Query {
-    pub unread_only: bool,
-    pub site: Option<String>,
-    pub uid: Option<String>,
-    pub remainder: Vec<String>,
-}
-
-impl FromStr for Query {
-    type Err = Infallible;
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let mut unread_only = false;
-        let mut site = None;
-        let mut uid = None;
-        let mut remainder = Vec::new();
-        let site_prefix = format!("tag:{TAG_PREFIX}");
-        for word in s.split_whitespace() {
-            if word == "is:unread" {
-                unread_only = true
-            } else if word.starts_with(&site_prefix) {
-                site = Some(word[site_prefix.len()..].to_string())
-            } else if word.starts_with("tag:") {
-                // Any tag that doesn't match site_prefix should explicitly set the site to
-                // something not in the database
-                site = Some(NON_EXISTENT_SITE_NAME.to_string());
-            } else if word.starts_with(THREAD_PREFIX) {
-                uid = Some(word[THREAD_PREFIX.len()..].to_string())
-            } else {
-                remainder.push(word.to_string());
-            }
-        }
-        Ok(Query {
-            unread_only,
-            site,
-            uid,
-            remainder,
-        })
-    }
-}
-
 pub async fn set_read_status<'ctx>(
     pool: &PgPool,
     query: &str,
@@ -267,3 +247,15 @@ pub async fn set_read_status<'ctx>(
         .await?;
     Ok(true)
 }
+fn clean_title(title: &str) -> Result<String, ServerError> {
+    // Make the title an HTML document so the HTML parsers have something to work with
+    let mut title = format!("<html>{title}</html>");
+    let title_transformers: Vec<Box<dyn Transformer>> =
+        vec![Box::new(EscapeHtml), Box::new(StripHtml)];
+    for t in title_transformers.iter() {
+        if t.should_run(&title) {
+            title = t.transform(&title)?;
+        }
+    }
+    Ok(title)
+}
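
Assuming the transformers behave as defined earlier, a hypothetical before/after for the title path: EscapeHtml decodes entities, then StripHtml removes the tags (including the <html> wrapper).

// Hypothetical check of the title cleanup (inside a fn that returns ServerError):
let cleaned = clean_title("Rust &amp; <em>C</em>")?;
assert_eq!(cleaned, "Rust & C");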