Compare commits

f8134dad7a646a6e9131fd8a5563411f8b20268c..e7cbf9cc45e4e478be78a0b481ee3ceaa525c086

No commits in common. "f8134dad7a646a6e9131fd8a5563411f8b20268c" and "e7cbf9cc45e4e478be78a0b481ee3ceaa525c086" have entirely different histories.

14 changed files with 189 additions and 1353 deletions

Cargo.lock (generated)

1,263 changed lines. File diff suppressed because it is too large.

notmuch/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "notmuch"
-version = "0.0.23"
+version = "0.0.22"
 edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

procmail2notmuch/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "procmail2notmuch"
-version = "0.0.23"
+version = "0.0.22"
 edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

server/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "server"
-version = "0.0.23"
+version = "0.0.22"
 edition = "2021"
 default-run = "server"
@@ -13,7 +13,6 @@ async-graphql = { version = "6.0.11", features = ["log"] }
 async-graphql-rocket = "6.0.11"
 async-trait = "0.1.81"
 build-info = "0.0.38"
-cacher = {git = "https://git-private.z.xinu.tv/wathiede/cacher"}
 css-inline = "0.13.0"
 glog = "0.1.0"
 html-escape = "0.2.13"

server/Rocket.toml

@@ -1,8 +1,6 @@
 [release]
 address = "0.0.0.0"
 port = 9345
-newsreader_database_url = "postgres://newsreader@nixos-07.h.xinu.tv/newsreader"
-newsreader_tantivy_db_path = "../target/database/newsreader"
 
 [debug]
 address = "0.0.0.0"
@@ -10,5 +8,3 @@ port = 9345
 # Uncomment to make it production like.
 #log_level = "critical"
 newsreader_database_url = "postgres://newsreader@nixos-07.h.xinu.tv/newsreader"
-newsreader_tantivy_db_path = "../target/database/newsreader"
-slurp_cache_path = "/net/nasx/x/letterbox/slurp"
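For context on how these keys are consumed: Rocket 0.5 deserializes extra `Rocket.toml` keys into any `Deserialize` struct attached with `AdHoc::config`, which is what `main.rs` below does after this change. A minimal sketch of the pattern (only the `newsreader_database_url` field comes from this diff; the rest is generic boilerplate, not code from this repo):

```rust
use rocket::fairing::AdHoc;
use serde::Deserialize;

// Custom keys in the [release]/[debug] tables of Rocket.toml are read
// from Rocket's figment alongside the built-in address/port keys.
#[derive(Deserialize)]
struct Config {
    newsreader_database_url: String,
}

#[rocket::launch]
fn rocket() -> _ {
    rocket::build()
        // Fails at startup if a required key is missing from the
        // active profile (or from ROCKET_* environment overrides).
        .attach(AdHoc::config::<Config>())
}
```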

server/sql/all-posts.sql

@@ -1,10 +0,0 @@
-SELECT
-    site,
-    title,
-    summary,
-    link,
-    date,
-    is_read,
-    uid,
-    id
-FROM post
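This query backed the `sqlx::query_file!("sql/all-posts.sql")` call in the removed `reindex_news_db` route below; once the route goes, the file can go too, because the macro embeds and checks the SQL at compile time. A small sketch of that mechanism (hypothetical wrapper function, not repo code):

```rust
use sqlx::postgres::PgPool;

// sqlx::query_file! reads the SQL file at compile time (path relative
// to the crate root) and type-checks the returned columns against the
// database named by DATABASE_URL during the build.
async fn count_posts(pool: &PgPool) -> Result<usize, sqlx::Error> {
    let rows = sqlx::query_file!("sql/all-posts.sql")
        .fetch_all(pool)
        .await?;
    Ok(rows.len())
}
```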

server/src/bin/main.rs

@@ -18,14 +18,18 @@ use rocket::{
     Response, State,
 };
 use rocket_cors::{AllowedHeaders, AllowedOrigins};
+use serde::Deserialize;
 use server::{
-    config::Config,
-    error::ServerError,
     graphql::{Attachment, GraphqlSchema, Mutation, QueryRoot},
     nm::{attachment_bytes, cid_attachment_bytes},
 };
 use sqlx::postgres::PgPool;
-use tantivy::{Index, IndexWriter};
+
+#[derive(Deserialize)]
+struct Config {
+    newsreader_database_url: String,
+}
 
 #[get("/refresh")]
 async fn refresh(nm: &State<Notmuch>) -> Result<Json<String>, Debug<NotmuchError>> {
@@ -166,122 +170,6 @@ fn graphiql() -> content::RawHtml<String> {
     content::RawHtml(GraphiQLSource::build().endpoint("/api/graphql").finish())
 }
 
-#[rocket::post("/create-news-db")]
-fn create_news_db(config: &State<Config>) -> Result<String, Debug<ServerError>> {
-    std::fs::remove_dir_all(&config.newsreader_tantivy_db_path).map_err(ServerError::from)?;
-    std::fs::create_dir_all(&config.newsreader_tantivy_db_path).map_err(ServerError::from)?;
-    use tantivy::schema::*;
-    let mut schema_builder = Schema::builder();
-    schema_builder.add_text_field("site", STRING | STORED);
-    schema_builder.add_text_field("title", TEXT | STORED);
-    schema_builder.add_text_field("summary", TEXT);
-    schema_builder.add_text_field("link", STRING | STORED);
-    schema_builder.add_date_field("date", FAST);
-    schema_builder.add_bool_field("is_read", FAST);
-    schema_builder.add_text_field("uid", STRING | STORED);
-    schema_builder.add_i64_field("id", FAST);
-    let schema = schema_builder.build();
-    Index::create_in_dir(&config.newsreader_tantivy_db_path, schema).map_err(ServerError::from)?;
-    Ok(format!(
-        "DB created in {}\n",
-        config.newsreader_tantivy_db_path
-    ))
-}
-
-#[rocket::post("/reindex-news-db")]
-async fn reindex_news_db(
-    pool: &State<PgPool>,
-    config: &State<Config>,
-) -> Result<String, Debug<ServerError>> {
-    use tantivy::{doc, Term};
-    let start_time = std::time::Instant::now();
-    let pool: &PgPool = pool;
-    let index =
-        Index::open_in_dir(&config.newsreader_tantivy_db_path).map_err(ServerError::from)?;
-    let mut index_writer = index.writer(50_000_000).map_err(ServerError::from)?;
-    let schema = index.schema();
-    let site = schema.get_field("site").map_err(ServerError::from)?;
-    let title = schema.get_field("title").map_err(ServerError::from)?;
-    let summary = schema.get_field("summary").map_err(ServerError::from)?;
-    let link = schema.get_field("link").map_err(ServerError::from)?;
-    let date = schema.get_field("date").map_err(ServerError::from)?;
-    let is_read = schema.get_field("is_read").map_err(ServerError::from)?;
-    let uid = schema.get_field("uid").map_err(ServerError::from)?;
-    let id = schema.get_field("id").map_err(ServerError::from)?;
-    let rows = sqlx::query_file!("sql/all-posts.sql")
-        .fetch_all(pool)
-        .await
-        .map_err(ServerError::from)?;
-    let total = rows.len();
-    for (i, r) in rows.into_iter().enumerate() {
-        if i % 10_000 == 0 {
-            info!(
-                "{i}/{total} processed, elapsed {:.2}s",
-                start_time.elapsed().as_secs_f32()
-            );
-        }
-        let id_term = Term::from_field_text(uid, &r.uid);
-        index_writer.delete_term(id_term);
-        index_writer
-            .add_document(doc!(
-                site => r.site.expect("UNKOWN_SITE"),
-                title => r.title.expect("UNKOWN_TITLE"),
-                // TODO: clean and extract text from HTML
-                summary => r.summary.expect("UNKNOWN_SUMMARY"),
-                link => r.link.expect("link"),
-                date => tantivy::DateTime::from_primitive(r.date.expect("date")),
-                is_read => r.is_read.expect("is_read"),
-                uid => r.uid,
-                id => r.id as i64,
-            ))
-            .map_err(ServerError::from)?;
-    }
-    index_writer.commit().map_err(ServerError::from)?;
-    info!("took {:.2}s to reindex", start_time.elapsed().as_secs_f32());
-    Ok(format!(
-        "DB openned in {}\n",
-        config.newsreader_tantivy_db_path
-    ))
-}
-
-#[rocket::get("/search-news-db")]
-fn search_news_db(
-    index: &State<tantivy::Index>,
-    reader: &State<tantivy::IndexReader>,
-) -> Result<String, Debug<ServerError>> {
-    use tantivy::{collector::TopDocs, query::QueryParser, Document, TantivyDocument};
-    let searcher = reader.searcher();
-    let schema = index.schema();
-    let site = schema.get_field("site").map_err(ServerError::from)?;
-    let title = schema.get_field("title").map_err(ServerError::from)?;
-    let summary = schema.get_field("summary").map_err(ServerError::from)?;
-    let query_parser = QueryParser::for_index(&index, vec![site, title, summary]);
-    let query = query_parser
-        .parse_query("grapheme")
-        .map_err(ServerError::from)?;
-    let top_docs = searcher
-        .search(&query, &TopDocs::with_limit(10))
-        .map_err(ServerError::from)?;
-    let mut results = vec![];
-    info!("search found {} docs", top_docs.len());
-    for (_score, doc_address) in top_docs {
-        let retrieved_doc: TantivyDocument =
-            searcher.doc(doc_address).map_err(ServerError::from)?;
-        results.push(format!("{}", retrieved_doc.to_json(&schema)));
-    }
-    Ok(format!("{}", results.join(" ")))
-}
-
 #[rocket::get("/graphql?<query..>")]
 async fn graphql_query(schema: &State<GraphqlSchema>, query: GraphQLQuery) -> GraphQLResponse {
     query.execute(schema.inner()).await
@@ -295,6 +183,7 @@ async fn graphql_request(
     request.execute(schema.inner()).await
 }
 
 #[rocket::main]
 async fn main() -> Result<(), Box<dyn Error>> {
     glog::new()
@@ -324,9 +213,6 @@ async fn main() -> Result<(), Box<dyn Error>> {
         .mount(
             shared::urls::MOUNT_POINT,
             routes![
-                create_news_db,
-                reindex_news_db,
-                search_news_db,
                 original,
                 refresh,
                 show_pretty,
@@ -343,26 +229,14 @@ async fn main() -> Result<(), Box<dyn Error>> {
         .attach(AdHoc::config::<Config>());
 
     let config: Config = rkt.figment().extract()?;
-    if !std::fs::exists(&config.slurp_cache_path)? {
-        info!("Creating slurp cache @ '{}'", &config.slurp_cache_path);
-        std::fs::create_dir_all(&config.slurp_cache_path)?;
-    }
     let pool = PgPool::connect(&config.newsreader_database_url).await?;
-    let tantivy_newsreader_index = Index::open_in_dir(&config.newsreader_tantivy_db_path)?;
-    let tantivy_newsreader_reader = tantivy_newsreader_index.reader()?;
 
     let schema = Schema::build(QueryRoot, Mutation, EmptySubscription)
         .data(Notmuch::default())
-        .data(config)
         .data(pool.clone())
         .extension(async_graphql::extensions::Logger)
         .finish();
-    let rkt = rkt
-        .manage(schema)
-        .manage(pool)
-        .manage(Notmuch::default())
-        .manage(tantivy_newsreader_index)
-        .manage(tantivy_newsreader_reader);
+    let rkt = rkt.manage(schema).manage(pool).manage(Notmuch::default());
     //.manage(Notmuch::with_config("../notmuch/testdata/notmuch.config"))
     rkt.launch().await?;
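The heart of the deleted `reindex_news_db` route is tantivy's delete-then-add upsert: segments are immutable, so replacing a post means deleting every document that carries its `uid` term, adding a fresh copy, and committing before readers see either change. A self-contained sketch of just that pattern, assuming the tantivy 0.2x API the removed code used (the field names mirror the deleted schema; everything else is illustrative):

```rust
use tantivy::{
    doc,
    schema::{Schema, STORED, STRING, TEXT},
    Index, Term,
};

fn main() -> tantivy::Result<()> {
    let mut builder = Schema::builder();
    let uid = builder.add_text_field("uid", STRING | STORED);
    let title = builder.add_text_field("title", TEXT | STORED);
    let index = Index::create_in_ram(builder.build());

    // Same 50 MB writer budget the deleted route used.
    let mut writer = index.writer(50_000_000)?;

    // Upsert: drop every document holding this uid term, then add the
    // replacement. Neither step is visible to readers until commit().
    writer.delete_term(Term::from_field_text(uid, "post-123"));
    writer.add_document(doc!(uid => "post-123", title => "fresh copy"))?;
    writer.commit()?;
    Ok(())
}
```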

server/src/config.rs

@@ -1,7 +0,0 @@
-use serde::Deserialize;
-#[derive(Deserialize)]
-pub struct Config {
-    pub newsreader_database_url: String,
-    pub newsreader_tantivy_db_path: String,
-    pub slurp_cache_path: String,
-}

server/src/error.rs

@@ -1,8 +1,6 @@
 use std::{convert::Infallible, str::Utf8Error, string::FromUtf8Error};
 
 use mailparse::MailParseError;
-use tantivy::TantivyError;
-use tantivy::query::QueryParserError;
 use thiserror::Error;
 
 use crate::TransformError;
@@ -31,10 +29,6 @@ pub enum ServerError {
     StringError(String),
     #[error("invalid url: {0}")]
     UrlParseError(#[from] url::ParseError),
-    #[error("tantivy error: {0}")]
-    TantivyError(#[from] TantivyError),
-    #[error("tantivy query parse error: {0}")]
-    QueryParseError(#[from] QueryParserError),
     #[error("impossible: {0}")]
     InfaillibleError(#[from] Infallible),
 }
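The two removed variants are what let every tantivy call in the deleted routes end with `.map_err(ServerError::from)?`: thiserror's `#[from]` derives the corresponding `From` impl. A reduced, standalone sketch of the mechanism (one variant only, not the full enum from this file):

```rust
use thiserror::Error;

#[derive(Debug, Error)]
enum ServerError {
    // #[from] generates `impl From<tantivy::TantivyError> for
    // ServerError`, which is what `?` and ServerError::from rely on.
    #[error("tantivy error: {0}")]
    Tantivy(#[from] tantivy::TantivyError),
}

fn open_index(path: &str) -> Result<tantivy::Index, ServerError> {
    // The TantivyError from open_in_dir converts automatically via `?`.
    Ok(tantivy::Index::open_in_dir(path)?)
}
```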

server/src/graphql.rs

@@ -8,7 +8,7 @@ use notmuch::Notmuch;
 use serde::{Deserialize, Serialize};
 use sqlx::postgres::PgPool;
 
-use crate::{config::Config, newsreader, nm, Query};
+use crate::{newsreader, nm, Query};
 
 /// # Number of seconds since the Epoch
 pub type UnixTime = isize;
@@ -384,7 +384,6 @@ impl QueryRoot {
     async fn thread<'ctx>(&self, ctx: &Context<'ctx>, thread_id: String) -> Result<Thread, Error> {
         let nm = ctx.data_unchecked::<Notmuch>();
         let pool = ctx.data_unchecked::<PgPool>();
-        let config = ctx.data_unchecked::<Config>();
         let debug_content_tree = ctx
             .look_ahead()
             .field("messages")
@@ -393,7 +392,7 @@
             .exists();
         // TODO: look at thread_id and conditionally load newsreader
         if newsreader::is_newsreader_thread(&thread_id) {
-            Ok(newsreader::thread(config, pool, thread_id).await?)
+            Ok(newsreader::thread(pool, thread_id).await?)
         } else {
             Ok(nm::thread(nm, thread_id, debug_content_tree).await?)
         }
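The resolver still dispatches on the thread-id prefix; only the `config` argument changes. For reference, the check reduces to a prefix test against the `news:` constant defined in `newsreader.rs` (a sketch of the idea; the real `is_newsreader_thread` and id extraction live in that module and may differ in detail):

```rust
const THREAD_PREFIX: &str = "news:";

// "news:42" routes to the newsreader backend; anything else falls
// through to notmuch.
fn is_newsreader_thread(thread_id: &str) -> bool {
    thread_id.starts_with(THREAD_PREFIX)
}

fn extract_id(thread_id: &str) -> Option<&str> {
    thread_id.strip_prefix(THREAD_PREFIX)
}
```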

server/src/lib.rs

@@ -1,16 +1,14 @@
-pub mod config;
 pub mod error;
 pub mod graphql;
 pub mod newsreader;
 pub mod nm;
 
-use std::{collections::HashMap, convert::Infallible, str::FromStr, sync::Arc};
+use std::{collections::HashMap, convert::Infallible, str::FromStr};
 
 use async_trait::async_trait;
-use cacher::{Cacher, FilesystemCacher};
 use css_inline::{CSSInliner, InlineError, InlineOptions};
 use linkify::{LinkFinder, LinkKind};
-use log::{error, info, warn};
+use log::{error, warn};
 use lol_html::{
     element, errors::RewritingError, html_content::ContentType, rewrite_str, text,
     RewriteStrSettings,
@@ -18,7 +16,6 @@ use lol_html::{
 use maplit::{hashmap, hashset};
 use scraper::{Html, Selector};
 use thiserror::Error;
-use tokio::sync::Mutex;
 use url::Url;
 
 use crate::newsreader::{extract_thread_id, is_newsreader_thread};
@@ -112,17 +109,16 @@ impl Transformer for InlineStyle {
             include_str!("custom.css"),
         );
         let inline_opts = InlineOptions {
-            inline_style_tags: true,
+            inline_style_tags: false,
             keep_style_tags: false,
-            keep_link_tags: true,
+            keep_link_tags: false,
             base_url: None,
-            load_remote_stylesheets: true,
+            load_remote_stylesheets: false,
             extra_css: Some(css.into()),
-            preallocate_node_capacity: 32,
+            ..InlineOptions::default()
         };
-        //info!("HTML:\n{html}");
         Ok(match CSSInliner::new(inline_opts).inline(&html) {
             Ok(inlined_html) => inlined_html,
             Err(err) => {
@@ -216,7 +212,6 @@ impl Transformer for AddOutlink {
 }
 
 struct SlurpContents {
-    cacher: Arc<Mutex<FilesystemCacher>>,
     site_selectors: HashMap<String, Vec<Selector>>,
 }
@@ -246,26 +241,19 @@ impl Transformer for SlurpContents {
         let Some(selectors) = self.get_selectors(&link) else {
             return Ok(html.to_string());
         };
-        let mut cacher = self.cacher.lock().await;
-        let body = if let Some(body) = cacher.get(link.as_str()) {
-            info!("cache hit for {link}");
-            String::from_utf8_lossy(&body).to_string()
-        } else {
-            let body = reqwest::get(link.as_str()).await?.text().await?;
-            cacher.set(link.as_str(), body.as_bytes());
-            body
-        };
+        let body = reqwest::get(link.as_str()).await?.text().await?;
         let doc = Html::parse_document(&body);
         let mut results = Vec::new();
         for selector in selectors {
-            for frag in doc.select(&selector) {
+            if let Some(frag) = doc.select(&selector).next() {
                 results.push(frag.html())
-                // TODO: figure out how to warn if there were no hits
-                //warn!("couldn't find '{:?}' in {}", selector, link);
+            } else {
+                warn!("couldn't find '{:?}' in {}", selector, link);
+                return Ok(html.to_string());
             }
         }
-        Ok(results.join(""))
+        Ok(results.join("<br><br>"))
     }
 }
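The lines dropped above had wrapped the page fetch in a read-through cache. As a standalone sketch of that pattern, mirroring how the removed code called the private `cacher` crate (its `get`/`set` signatures are inferred from this diff, so treat them as assumptions):

```rust
use std::sync::Arc;

use tokio::sync::Mutex;

// Read-through cache around the page fetch: serve the body from the
// FilesystemCacher when present, otherwise hit the network and
// populate the cache for next time.
async fn fetch_cached(
    cacher: Arc<Mutex<cacher::FilesystemCacher>>,
    link: &url::Url,
) -> Result<String, reqwest::Error> {
    let mut cacher = cacher.lock().await;
    if let Some(body) = cacher.get(link.as_str()) {
        return Ok(String::from_utf8_lossy(&body).to_string());
    }
    let body = reqwest::get(link.as_str()).await?.text().await?;
    cacher.set(link.as_str(), body.as_bytes());
    Ok(body)
}
```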
@@ -304,7 +292,7 @@ pub fn sanitize_html(
 ) -> Result<String, TransformError> {
     let inline_opts = InlineOptions {
         inline_style_tags: true,
-        keep_style_tags: true,
+        keep_style_tags: false,
         keep_link_tags: false,
         base_url: None,
         load_remote_stylesheets: false,
@@ -347,30 +335,6 @@ pub fn sanitize_html(
             el.set_attribute("src", &src)?;
             Ok(())
         }),
-        // Add https to href with //<domain name>
-        element!("link[href]", |el| {
-            info!("found link[href] {el:?}");
-            let mut href = el.get_attribute("href").expect("href was required");
-            if href.starts_with("//") {
-                warn!("adding https to {href}");
-                href.insert_str(0, "https:");
-            }
-            el.set_attribute("href", &href)?;
-            Ok(())
-        }),
-        // Add https to src with //<domain name>
-        element!("style[src]", |el| {
-            let mut src = el.get_attribute("src").expect("src was required");
-            if src.starts_with("//") {
-                src.insert_str(0, "https:");
-            }
-            el.set_attribute("src", &src)?;
-            Ok(())
-        }),
     ];
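The two deleted handlers normalized protocol-relative URLs before inlining. A standalone version of that rewrite using the same `lol_html` entry points this file already imports (`element!`, `rewrite_str`); the function name is illustrative, not from the repo:

```rust
use lol_html::{element, rewrite_str, RewriteStrSettings};

// Give protocol-relative URLs ("//cdn.example.com/app.css") an explicit
// https: scheme so they resolve outside a browser, as the deleted
// link[href] handler did.
fn https_for_protocol_relative(html: &str) -> Result<String, lol_html::errors::RewritingError> {
    rewrite_str(
        html,
        RewriteStrSettings {
            element_content_handlers: vec![element!("link[href]", |el| {
                let mut href = el.get_attribute("href").expect("selector guarantees href");
                if href.starts_with("//") {
                    href.insert_str(0, "https:");
                    el.set_attribute("href", &href)?;
                }
                Ok(())
            })],
            ..RewriteStrSettings::default()
        },
    )
}
```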

server/src/newsreader.rs

@@ -1,26 +1,24 @@
-use std::sync::Arc;
+use std::hash::{DefaultHasher, Hash, Hasher};
 
-use cacher::FilesystemCacher;
 use log::info;
 use maplit::hashmap;
 use scraper::Selector;
-use shared::compute_color;
 use sqlx::postgres::PgPool;
-use tokio::sync::Mutex;
 use url::Url;
 
-use crate::{
-    compute_offset_limit,
-    config::Config,
-    error::ServerError,
-    graphql::{NewsPost, Tag, Thread, ThreadSummary},
-    AddOutlink, EscapeHtml, FrameImages, InlineStyle, Query, SanitizeHtml, SlurpContents,
-    StripHtml, Transformer,
-};
+use crate::Query;
 
 const TAG_PREFIX: &'static str = "News/";
 const THREAD_PREFIX: &'static str = "news:";
+use crate::{
+    compute_offset_limit,
+    error::ServerError,
+    graphql::{NewsPost, Tag, Thread, ThreadSummary},
+    AddOutlink, EscapeHtml, FrameImages, InlineStyle, SanitizeHtml, SlurpContents, StripHtml,
+    Transformer,
+};
 
 pub fn is_newsreader_search(query: &str) -> bool {
     query.contains(TAG_PREFIX)
 }
@@ -130,9 +128,11 @@ pub async fn tags(pool: &PgPool, _needs_unread: bool) -> Result<Vec<Tag>, ServerError> {
     let tags = tags
         .into_iter()
         .map(|tag| {
+            let mut hasher = DefaultHasher::new();
+            tag.site.hash(&mut hasher);
+            let hex = format!("#{:06x}", hasher.finish() % (1 << 24));
             let unread = tag.unread.unwrap_or(0).try_into().unwrap_or(0);
             let name = format!("{TAG_PREFIX}{}", tag.site.expect("tag must have site"));
-            let hex = compute_color(&name);
             Tag {
                 name,
                 fg_color: "white".to_string(),
@@ -144,11 +144,7 @@
     Ok(tags)
 }
 
-pub async fn thread(
-    config: &Config,
-    pool: &PgPool,
-    thread_id: String,
-) -> Result<Thread, ServerError> {
+pub async fn thread(pool: &PgPool, thread_id: String) -> Result<Thread, ServerError> {
     let id = thread_id
         .strip_prefix(THREAD_PREFIX)
         .expect("news thread doesn't start with '{THREAD_PREFIX}'")
@@ -177,10 +173,8 @@ pub async fn thread(
     // TODO: add site specific cleanups. For example:
     // * Grafana does <div class="image-wrapp"><img class="lazyload>"<img src="/media/...>"</img></div>
     // * Some sites appear to be HTML encoded, unencode them, i.e. imperialviolent
-    let cacher = Arc::new(Mutex::new(FilesystemCacher::new(&config.slurp_cache_path)?));
     let body_tranformers: Vec<Box<dyn Transformer>> = vec![
         Box::new(SlurpContents {
-            cacher,
             site_selectors: hashmap![
                 "atmeta.com".to_string() => vec![
                     Selector::parse("div.entry-content").unwrap(),
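One behavioral wrinkle in the `tags()` hunk above: the `-` side colored a tag via `shared::compute_color(&name)`, hashing the `News/`-prefixed name, while the `+` side hashes the raw site with `DefaultHasher` and keeps the low 24 bits, so the same feed can render a different color across the two commits. The inline derivation, lifted into a runnable function (names are illustrative):

```rust
use std::hash::{DefaultHasher, Hash, Hasher};

// Hash the site name and keep the low 24 bits as an RGB hex color,
// exactly as the inline code in tags() does.
fn color_for(site: &str) -> String {
    let mut hasher = DefaultHasher::new();
    site.hash(&mut hasher);
    format!("#{:06x}", hasher.finish() % (1 << 24))
}

fn main() {
    // Deterministic for a given std release; prints something like
    // "#3ac1f2".
    println!("{}", color_for("lwn.net"));
}
```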

shared/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "shared"
-version = "0.0.23"
+version = "0.0.22"
 edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

Cargo.toml (package "letterbox")

@@ -1,5 +1,5 @@
 [package]
-version = "0.0.23"
+version = "0.0.22"
 name = "letterbox"
 repository = "https://github.com/seed-rs/seed-quickstart"
 authors = ["Bill Thiede <git@xinu.tv>"]