Compare commits

No commits in common. "f8134dad7a646a6e9131fd8a5563411f8b20268c" and "e7cbf9cc45e4e478be78a0b481ee3ceaa525c086" have entirely different histories.

14 changed files with 189 additions and 1353 deletions

Cargo.lock (generated, 1263 lines): diff suppressed because it is too large.

Cargo.toml (notmuch):

@@ -1,6 +1,6 @@
 [package]
 name = "notmuch"
-version = "0.0.23"
+version = "0.0.22"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

Cargo.toml (procmail2notmuch):

@@ -1,6 +1,6 @@
 [package]
 name = "procmail2notmuch"
-version = "0.0.23"
+version = "0.0.22"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

Cargo.toml (server):

@@ -1,6 +1,6 @@
 [package]
 name = "server"
-version = "0.0.23"
+version = "0.0.22"
 edition = "2021"
 default-run = "server"
@@ -13,7 +13,6 @@ async-graphql = { version = "6.0.11", features = ["log"] }
 async-graphql-rocket = "6.0.11"
 async-trait = "0.1.81"
 build-info = "0.0.38"
-cacher = {git = "https://git-private.z.xinu.tv/wathiede/cacher"}
 css-inline = "0.13.0"
 glog = "0.1.0"
 html-escape = "0.2.13"

Rocket config (server, [release]/[debug]):

@@ -1,8 +1,6 @@
 [release]
 address = "0.0.0.0"
 port = 9345
-newsreader_database_url = "postgres://newsreader@nixos-07.h.xinu.tv/newsreader"
-newsreader_tantivy_db_path = "../target/database/newsreader"
 
 [debug]
 address = "0.0.0.0"
@@ -10,5 +8,3 @@ port = 9345
 # Uncomment to make it production like.
 #log_level = "critical"
 newsreader_database_url = "postgres://newsreader@nixos-07.h.xinu.tv/newsreader"
-newsreader_tantivy_db_path = "../target/database/newsreader"
-slurp_cache_path = "/net/nasx/x/letterbox/slurp"

sql/all-posts.sql (deleted):

@@ -1,10 +0,0 @@
-SELECT
-    site,
-    title,
-    summary,
-    link,
-    date,
-    is_read,
-    uid,
-    id
-FROM post

server: Rocket main binary:

@@ -18,14 +18,18 @@ use rocket::{
     Response, State,
 };
 use rocket_cors::{AllowedHeaders, AllowedOrigins};
+use serde::Deserialize;
 use server::{
-    config::Config,
     error::ServerError,
     graphql::{Attachment, GraphqlSchema, Mutation, QueryRoot},
     nm::{attachment_bytes, cid_attachment_bytes},
 };
 use sqlx::postgres::PgPool;
-use tantivy::{Index, IndexWriter};
+
+#[derive(Deserialize)]
+struct Config {
+    newsreader_database_url: String,
+}
 
 #[get("/refresh")]
 async fn refresh(nm: &State<Notmuch>) -> Result<Json<String>, Debug<NotmuchError>> {
@@ -166,122 +170,6 @@ fn graphiql() -> content::RawHtml<String> {
     content::RawHtml(GraphiQLSource::build().endpoint("/api/graphql").finish())
 }
 
-#[rocket::post("/create-news-db")]
-fn create_news_db(config: &State<Config>) -> Result<String, Debug<ServerError>> {
-    std::fs::remove_dir_all(&config.newsreader_tantivy_db_path).map_err(ServerError::from)?;
-    std::fs::create_dir_all(&config.newsreader_tantivy_db_path).map_err(ServerError::from)?;
-
-    use tantivy::schema::*;
-    let mut schema_builder = Schema::builder();
-    schema_builder.add_text_field("site", STRING | STORED);
-    schema_builder.add_text_field("title", TEXT | STORED);
-    schema_builder.add_text_field("summary", TEXT);
-    schema_builder.add_text_field("link", STRING | STORED);
-    schema_builder.add_date_field("date", FAST);
-    schema_builder.add_bool_field("is_read", FAST);
-    schema_builder.add_text_field("uid", STRING | STORED);
-    schema_builder.add_i64_field("id", FAST);
-    let schema = schema_builder.build();
-    Index::create_in_dir(&config.newsreader_tantivy_db_path, schema).map_err(ServerError::from)?;
-    Ok(format!(
-        "DB created in {}\n",
-        config.newsreader_tantivy_db_path
-    ))
-}
-
-#[rocket::post("/reindex-news-db")]
-async fn reindex_news_db(
-    pool: &State<PgPool>,
-    config: &State<Config>,
-) -> Result<String, Debug<ServerError>> {
-    use tantivy::{doc, Term};
-    let start_time = std::time::Instant::now();
-    let pool: &PgPool = pool;
-    let index =
-        Index::open_in_dir(&config.newsreader_tantivy_db_path).map_err(ServerError::from)?;
-    let mut index_writer = index.writer(50_000_000).map_err(ServerError::from)?;
-    let schema = index.schema();
-    let site = schema.get_field("site").map_err(ServerError::from)?;
-    let title = schema.get_field("title").map_err(ServerError::from)?;
-    let summary = schema.get_field("summary").map_err(ServerError::from)?;
-    let link = schema.get_field("link").map_err(ServerError::from)?;
-    let date = schema.get_field("date").map_err(ServerError::from)?;
-    let is_read = schema.get_field("is_read").map_err(ServerError::from)?;
-    let uid = schema.get_field("uid").map_err(ServerError::from)?;
-    let id = schema.get_field("id").map_err(ServerError::from)?;
-    let rows = sqlx::query_file!("sql/all-posts.sql")
-        .fetch_all(pool)
-        .await
-        .map_err(ServerError::from)?;
-    let total = rows.len();
-    for (i, r) in rows.into_iter().enumerate() {
-        if i % 10_000 == 0 {
-            info!(
-                "{i}/{total} processed, elapsed {:.2}s",
-                start_time.elapsed().as_secs_f32()
-            );
-        }
-        let id_term = Term::from_field_text(uid, &r.uid);
-        index_writer.delete_term(id_term);
-        index_writer
-            .add_document(doc!(
-                site => r.site.expect("UNKOWN_SITE"),
-                title => r.title.expect("UNKOWN_TITLE"),
-                // TODO: clean and extract text from HTML
-                summary => r.summary.expect("UNKNOWN_SUMMARY"),
-                link => r.link.expect("link"),
-                date => tantivy::DateTime::from_primitive(r.date.expect("date")),
-                is_read => r.is_read.expect("is_read"),
-                uid => r.uid,
-                id => r.id as i64,
-            ))
-            .map_err(ServerError::from)?;
-    }
-    index_writer.commit().map_err(ServerError::from)?;
-    info!("took {:.2}s to reindex", start_time.elapsed().as_secs_f32());
-    Ok(format!(
-        "DB openned in {}\n",
-        config.newsreader_tantivy_db_path
-    ))
-}
-
-#[rocket::get("/search-news-db")]
-fn search_news_db(
-    index: &State<tantivy::Index>,
-    reader: &State<tantivy::IndexReader>,
-) -> Result<String, Debug<ServerError>> {
-    use tantivy::{collector::TopDocs, query::QueryParser, Document, TantivyDocument};
-    let searcher = reader.searcher();
-    let schema = index.schema();
-    let site = schema.get_field("site").map_err(ServerError::from)?;
-    let title = schema.get_field("title").map_err(ServerError::from)?;
-    let summary = schema.get_field("summary").map_err(ServerError::from)?;
-    let query_parser = QueryParser::for_index(&index, vec![site, title, summary]);
-    let query = query_parser
-        .parse_query("grapheme")
-        .map_err(ServerError::from)?;
-    let top_docs = searcher
-        .search(&query, &TopDocs::with_limit(10))
-        .map_err(ServerError::from)?;
-    let mut results = vec![];
-    info!("search found {} docs", top_docs.len());
-    for (_score, doc_address) in top_docs {
-        let retrieved_doc: TantivyDocument =
-            searcher.doc(doc_address).map_err(ServerError::from)?;
-        results.push(format!("{}", retrieved_doc.to_json(&schema)));
-    }
-    Ok(format!("{}", results.join(" ")))
-}
-
 #[rocket::get("/graphql?<query..>")]
 async fn graphql_query(schema: &State<GraphqlSchema>, query: GraphQLQuery) -> GraphQLResponse {
     query.execute(schema.inner()).await
@@ -295,6 +183,7 @@ async fn graphql_request(
     request.execute(schema.inner()).await
 }
 
+
 #[rocket::main]
 async fn main() -> Result<(), Box<dyn Error>> {
     glog::new()
@@ -324,9 +213,6 @@ async fn main() -> Result<(), Box<dyn Error>> {
         .mount(
             shared::urls::MOUNT_POINT,
             routes![
-                create_news_db,
-                reindex_news_db,
-                search_news_db,
                 original,
                 refresh,
                 show_pretty,
@@ -343,26 +229,14 @@ async fn main() -> Result<(), Box<dyn Error>> {
         .attach(AdHoc::config::<Config>());
 
     let config: Config = rkt.figment().extract()?;
-    if !std::fs::exists(&config.slurp_cache_path)? {
-        info!("Creating slurp cache @ '{}'", &config.slurp_cache_path);
-        std::fs::create_dir_all(&config.slurp_cache_path)?;
-    }
     let pool = PgPool::connect(&config.newsreader_database_url).await?;
-    let tantivy_newsreader_index = Index::open_in_dir(&config.newsreader_tantivy_db_path)?;
-    let tantivy_newsreader_reader = tantivy_newsreader_index.reader()?;
     let schema = Schema::build(QueryRoot, Mutation, EmptySubscription)
         .data(Notmuch::default())
-        .data(config)
         .data(pool.clone())
         .extension(async_graphql::extensions::Logger)
         .finish();
 
-    let rkt = rkt
-        .manage(schema)
-        .manage(pool)
-        .manage(Notmuch::default())
-        .manage(tantivy_newsreader_index)
-        .manage(tantivy_newsreader_reader);
+    let rkt = rkt.manage(schema).manage(pool).manage(Notmuch::default());
     //.manage(Notmuch::with_config("../notmuch/testdata/notmuch.config"))
 
     rkt.launch().await?;

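Note: the inline Config above is populated through Rocket's provider chain instead of the deleted server::config module. A minimal sketch of that pattern, assuming Rocket 0.5's AdHoc::config fairing and the field name from this diff (everything else is illustrative, not from the repo):

    use rocket::fairing::AdHoc;
    use serde::Deserialize;

    // One field, filled from Rocket.toml / ROCKET_* env vars via figment.
    #[derive(Deserialize)]
    struct Config {
        newsreader_database_url: String,
    }

    #[rocket::launch]
    fn rocket() -> _ {
        rocket::build()
            // Aborts launch with a readable error if no Config can be
            // extracted from the merged configuration sources.
            .attach(AdHoc::config::<Config>())
    }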
server: config module (deleted):

@@ -1,7 +0,0 @@
-use serde::Deserialize;
-#[derive(Deserialize)]
-pub struct Config {
-    pub newsreader_database_url: String,
-    pub newsreader_tantivy_db_path: String,
-    pub slurp_cache_path: String,
-}

server: error module:

@@ -1,8 +1,6 @@
 use std::{convert::Infallible, str::Utf8Error, string::FromUtf8Error};
 
 use mailparse::MailParseError;
-use tantivy::TantivyError;
-use tantivy::query::QueryParserError;
 use thiserror::Error;
 
 use crate::TransformError;
@@ -31,10 +29,6 @@ pub enum ServerError {
     StringError(String),
     #[error("invalid url: {0}")]
     UrlParseError(#[from] url::ParseError),
-    #[error("tantivy error: {0}")]
-    TantivyError(#[from] TantivyError),
-    #[error("tantivy query parse error: {0}")]
-    QueryParseError(#[from] QueryParserError),
     #[error("impossible: {0}")]
     InfaillibleError(#[from] Infallible),
 }

server: graphql module:

@@ -8,7 +8,7 @@ use notmuch::Notmuch;
 use serde::{Deserialize, Serialize};
 use sqlx::postgres::PgPool;
 
-use crate::{config::Config, newsreader, nm, Query};
+use crate::{newsreader, nm, Query};
 
 /// # Number of seconds since the Epoch
 pub type UnixTime = isize;
@@ -384,7 +384,6 @@ impl QueryRoot {
     async fn thread<'ctx>(&self, ctx: &Context<'ctx>, thread_id: String) -> Result<Thread, Error> {
         let nm = ctx.data_unchecked::<Notmuch>();
         let pool = ctx.data_unchecked::<PgPool>();
-        let config = ctx.data_unchecked::<Config>();
         let debug_content_tree = ctx
             .look_ahead()
             .field("messages")
@@ -393,7 +392,7 @@ impl QueryRoot {
             .exists();
         // TODO: look at thread_id and conditionally load newsreader
         if newsreader::is_newsreader_thread(&thread_id) {
-            Ok(newsreader::thread(config, pool, thread_id).await?)
+            Ok(newsreader::thread(pool, thread_id).await?)
         } else {
             Ok(nm::thread(nm, thread_id, debug_content_tree).await?)
         }

server: crate root (lib.rs):

@@ -1,16 +1,14 @@
-pub mod config;
 pub mod error;
 pub mod graphql;
 pub mod newsreader;
 pub mod nm;
 
-use std::{collections::HashMap, convert::Infallible, str::FromStr, sync::Arc};
+use std::{collections::HashMap, convert::Infallible, str::FromStr};
 
 use async_trait::async_trait;
-use cacher::{Cacher, FilesystemCacher};
 use css_inline::{CSSInliner, InlineError, InlineOptions};
 use linkify::{LinkFinder, LinkKind};
-use log::{error, info, warn};
+use log::{error, warn};
 use lol_html::{
     element, errors::RewritingError, html_content::ContentType, rewrite_str, text,
     RewriteStrSettings,
@@ -18,7 +16,6 @@ use lol_html::{
 use maplit::{hashmap, hashset};
 use scraper::{Html, Selector};
 use thiserror::Error;
-use tokio::sync::Mutex;
 use url::Url;
 
 use crate::newsreader::{extract_thread_id, is_newsreader_thread};
@@ -112,17 +109,16 @@ impl Transformer for InlineStyle {
             include_str!("custom.css"),
         );
         let inline_opts = InlineOptions {
-            inline_style_tags: true,
+            inline_style_tags: false,
             keep_style_tags: false,
-            keep_link_tags: true,
+            keep_link_tags: false,
             base_url: None,
-            load_remote_stylesheets: true,
+            load_remote_stylesheets: false,
             extra_css: Some(css.into()),
             preallocate_node_capacity: 32,
             ..InlineOptions::default()
         };
-        //info!("HTML:\n{html}");
 
         Ok(match CSSInliner::new(inline_opts).inline(&html) {
             Ok(inlined_html) => inlined_html,
             Err(err) => {
@@ -216,7 +212,6 @@ impl Transformer for AddOutlink {
 }
 
 struct SlurpContents {
-    cacher: Arc<Mutex<FilesystemCacher>>,
     site_selectors: HashMap<String, Vec<Selector>>,
 }
 
@@ -246,26 +241,19 @@ impl Transformer for SlurpContents {
         let Some(selectors) = self.get_selectors(&link) else {
             return Ok(html.to_string());
         };
-        let mut cacher = self.cacher.lock().await;
-        let body = if let Some(body) = cacher.get(link.as_str()) {
-            info!("cache hit for {link}");
-            String::from_utf8_lossy(&body).to_string()
-        } else {
-            let body = reqwest::get(link.as_str()).await?.text().await?;
-            cacher.set(link.as_str(), body.as_bytes());
-            body
-        };
+        let body = reqwest::get(link.as_str()).await?.text().await?;
         let doc = Html::parse_document(&body);
         let mut results = Vec::new();
         for selector in selectors {
-            for frag in doc.select(&selector) {
+            if let Some(frag) = doc.select(&selector).next() {
                 results.push(frag.html())
-                // TODO: figure out how to warn if there were no hits
-                //warn!("couldn't find '{:?}' in {}", selector, link);
+            } else {
+                warn!("couldn't find '{:?}' in {}", selector, link);
+                return Ok(html.to_string());
             }
         }
-        Ok(results.join(""))
+        Ok(results.join("<br><br>"))
     }
 }
@@ -304,7 +292,7 @@ pub fn sanitize_html(
 ) -> Result<String, TransformError> {
     let inline_opts = InlineOptions {
         inline_style_tags: true,
-        keep_style_tags: true,
+        keep_style_tags: false,
         keep_link_tags: false,
         base_url: None,
         load_remote_stylesheets: false,
@@ -347,30 +335,6 @@ pub fn sanitize_html(
         el.set_attribute("src", &src)?;
-            Ok(())
-        }),
-        // Add https to href with //<domain name>
-        element!("link[href]", |el| {
-            info!("found link[href] {el:?}");
-            let mut href = el.get_attribute("href").expect("href was required");
-            if href.starts_with("//") {
-                warn!("adding https to {href}");
-                href.insert_str(0, "https:");
-            }
-            el.set_attribute("href", &href)?;
-            Ok(())
-        }),
-        // Add https to src with //<domain name>
-        element!("style[src]", |el| {
-            let mut src = el.get_attribute("src").expect("src was required");
-            if src.starts_with("//") {
-                src.insert_str(0, "https:");
-            }
-            el.set_attribute("src", &src)?;
             Ok(())
         }),
     ];

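Note: the SlurpContents change above swaps cached, all-matches extraction for a single uncached fetch that keeps only the first node per selector and falls back to the original body when any selector misses. A self-contained sketch of that selection logic, assuming the scraper crate (the slurp and fallback names are illustrative):

    use scraper::{Html, Selector};

    // One fragment per selector; the caller's original HTML wins if any
    // selector has no match, mirroring the new transform's early return.
    fn slurp(body: &str, selectors: &[Selector], fallback: &str) -> String {
        let doc = Html::parse_document(body);
        let mut results = Vec::new();
        for selector in selectors {
            if let Some(frag) = doc.select(selector).next() {
                results.push(frag.html());
            } else {
                return fallback.to_string();
            }
        }
        results.join("<br><br>")
    }

    fn main() {
        let selectors = vec![Selector::parse("div.entry-content").unwrap()];
        let html = r#"<div class="entry-content"><p>post body</p></div>"#;
        println!("{}", slurp(html, &selectors, html));
    }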
server: newsreader module:

@@ -1,26 +1,24 @@
-use std::sync::Arc;
+use std::hash::{DefaultHasher, Hash, Hasher};
 
-use cacher::FilesystemCacher;
 use log::info;
 use maplit::hashmap;
 use scraper::Selector;
-use shared::compute_color;
 use sqlx::postgres::PgPool;
-use tokio::sync::Mutex;
 use url::Url;
 
-use crate::{
-    compute_offset_limit,
-    config::Config,
-    error::ServerError,
-    graphql::{NewsPost, Tag, Thread, ThreadSummary},
-    AddOutlink, EscapeHtml, FrameImages, InlineStyle, Query, SanitizeHtml, SlurpContents,
-    StripHtml, Transformer,
-};
+use crate::Query;
 
 const TAG_PREFIX: &'static str = "News/";
 const THREAD_PREFIX: &'static str = "news:";
 
+use crate::{
+    compute_offset_limit,
+    error::ServerError,
+    graphql::{NewsPost, Tag, Thread, ThreadSummary},
+    AddOutlink, EscapeHtml, FrameImages, InlineStyle, SanitizeHtml, SlurpContents, StripHtml,
+    Transformer,
+};
+
 pub fn is_newsreader_search(query: &str) -> bool {
     query.contains(TAG_PREFIX)
 }
@@ -130,9 +128,11 @@ pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, ServerError> {
     let tags = tags
         .into_iter()
         .map(|tag| {
+            let mut hasher = DefaultHasher::new();
+            tag.site.hash(&mut hasher);
+            let hex = format!("#{:06x}", hasher.finish() % (1 << 24));
             let unread = tag.unread.unwrap_or(0).try_into().unwrap_or(0);
             let name = format!("{TAG_PREFIX}{}", tag.site.expect("tag must have site"));
-            let hex = compute_color(&name);
             Tag {
                 name,
                 fg_color: "white".to_string(),
@@ -144,11 +144,7 @@ pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, ServerError> {
     Ok(tags)
 }
 
-pub async fn thread(
-    config: &Config,
-    pool: &PgPool,
-    thread_id: String,
-) -> Result<Thread, ServerError> {
+pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerError> {
     let id = thread_id
         .strip_prefix(THREAD_PREFIX)
         .expect("news thread doesn't start with '{THREAD_PREFIX}'")
@@ -177,10 +173,8 @@ pub async fn thread(
     // TODO: add site specific cleanups. For example:
     // * Grafana does <div class="image-wrapp"><img class="lazyload>"<img src="/media/...>"</img></div>
     // * Some sites appear to be HTML encoded, unencode them, i.e. imperialviolent
-    let cacher = Arc::new(Mutex::new(FilesystemCacher::new(&config.slurp_cache_path)?));
     let body_tranformers: Vec<Box<dyn Transformer>> = vec![
         Box::new(SlurpContents {
-            cacher,
            site_selectors: hashmap![
                "atmeta.com".to_string() => vec![
                    Selector::parse("div.entry-content").unwrap(),

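Note: the tags change above replaces the shared::compute_color helper with an inline hash of the site name. A standalone sketch of that derivation (the site_color name is illustrative):

    use std::hash::{DefaultHasher, Hash, Hasher};

    // Keep the low 24 bits of the hash so the value always fits "#rrggbb".
    fn site_color(site: &str) -> String {
        let mut hasher = DefaultHasher::new();
        site.hash(&mut hasher);
        format!("#{:06x}", hasher.finish() % (1 << 24))
    }

    fn main() {
        // Deterministic for a given toolchain; DefaultHasher's algorithm
        // is unspecified and may change between Rust releases.
        assert_eq!(site_color("lwn.net"), site_color("lwn.net"));
        println!("{}", site_color("lwn.net"));
    }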
Cargo.toml (shared):

@@ -1,6 +1,6 @@
 [package]
 name = "shared"
-version = "0.0.23"
+version = "0.0.22"
 edition = "2021"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

Cargo.toml (letterbox):

@@ -1,5 +1,5 @@
 [package]
-version = "0.0.23"
+version = "0.0.22"
 name = "letterbox"
 repository = "https://github.com/seed-rs/seed-quickstart"
 authors = ["Bill Thiede <git@xinu.tv>"]