Compare commits

6 commits, 10 changed files with 3317 additions and 207 deletions

.gitignore (vendored): 1 addition

@@ -1,2 +1,3 @@
/target
mastodon-data.toml
http-cacache

Cargo.lock (generated): 868 lines changed. File diff suppressed because it is too large.

Cargo.toml

@@ -1,13 +1,12 @@
[package]
name = "corobel"
version = "0.4.0"
version = "0.5.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
clap = { version = "4.0.18", features = [ "derive" ] }
eggbug = { version = "0.1.2", features = [ "tokio" ] }
reqwest = "0.11.12"
rocket = { version = "0.5.0-rc.2", features = [ "json" ] }
serde = { version = "1.0.147", features = [ "derive" ] }
@@ -18,3 +17,8 @@ chrono = { version = "0.4.22", features = [ "serde" ] }
rss = { path = "./rss/", version = "2.0.1-atom-link-fix", features = [ "builders", "atom", "chrono" ] }
pulldown-cmark = "0.9.2"
atom_syndication = "0.11.0"
http-cache-reqwest = "0.5.0"
reqwest-middleware = "0.1.6"
cached = "0.40.0"
mime = "0.3.16"
mime_guess = "2.0.4"


@@ -16,11 +16,15 @@ ports to use for development and deployment.
- [ ] Handle redirects
- [x] RSS feeds for projects
- [x] Index page explaining what's going on
- [x] Better support for transparent shares
- [x] Add feed without shares
- [ ] More robust parsing (defaults for all!)
- [ ] RSS feeds for tags
- [x] Atom Extension pagination support
- [x] Disable pagination and just go for it lmao
- [x] Disable pagination
- [x] HTTP Caching
- [x] Data caching
- [x] Nicer theme
- [ ] Read More support
- [ ] Dublin Core support
- [ ] Media Envelope support
- [x] Media Envelope support

File diff suppressed because it is too large.


@@ -3,7 +3,7 @@ use serde::Deserialize;
/// The API URL from whence Cohost serves JSON project definitions
pub const COHOST_ACCOUNT_API_URL: &str = "https://cohost.org/api/v1/project/";
#[derive(Debug, Deserialize, PartialEq, Eq)]
#[derive(Debug, Clone, Deserialize, PartialEq, Eq)]
pub struct CohostAccount {
#[serde(rename = "projectId")]
pub project_id: u64,

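The doc comment above covers the project-lookup endpoint: appending a project name to COHOST_ACCOUNT_API_URL returns JSON that serde maps onto CohostAccount through the camelCase renames (for example "projectId"). A minimal, hypothetical sketch of that round trip with a bare reqwest client (the crate itself routes the same request through its shared, cached client):

    use reqwest::Client;

    // Illustrative only: fetch a project definition and let serde map the
    // camelCase JSON fields (e.g. "projectId") onto CohostAccount.
    async fn fetch_account(project: &str) -> Result<CohostAccount, reqwest::Error> {
        let url = format!("{}{}", COHOST_ACCOUNT_API_URL, project);
        Client::new().get(url).send().await?.json::<CohostAccount>().await
    }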

@@ -11,7 +11,7 @@ pub fn cohost_posts_api_url(project: impl AsRef<str>, page: u64) -> String {
// Cohost doesn't give us Next links ("rel: next") for further pages, so we'll have to ALWAYS populate the rel=next field
#[derive(Debug, Deserialize)]
#[derive(Debug, Clone, Deserialize)]
pub struct CohostPostsPage {
#[serde(rename = "nItems")]
pub number_items: usize,
@@ -22,7 +22,7 @@ pub struct CohostPostsPage {
pub links: Vec<CohostPostLink>,
}
#[derive(Debug, Deserialize)]
#[derive(Debug, Clone, Deserialize)]
pub struct CohostPost {
#[serde(rename = "postId")]
pub id: u64,
@@ -44,13 +44,17 @@ pub struct CohostPost {
default
)]
pub url: String,
#[serde(deserialize_with = "deserialize_null_default", default)]
pub blocks: Vec<CohostPostBlock>,
#[serde(rename = "transparentShareOfPostId")]
pub transparent_share_of_post_id: Option<u64>,
#[serde(rename = "postingProject")]
pub poster: CohostPostingProject,
#[serde(rename = "shareTree")]
pub share_tree: Vec<CohostPost>,
}
#[derive(Debug, Deserialize)]
#[derive(Debug, Clone, Deserialize)]
pub struct CohostPostingProject {
#[serde(rename = "projectId")]
pub id: u64,
@@ -70,7 +74,7 @@ pub struct CohostPostingProject {
pub pronouns: String,
}
#[derive(Debug, Deserialize)]
#[derive(Debug, Clone, Deserialize)]
pub struct CohostPostLink {
#[serde(deserialize_with = "deserialize_null_default", default)]
pub href: String,
@@ -84,6 +88,21 @@ pub struct CohostPostLink {
pub t_type: String,
}
#[derive(Debug, Clone, Deserialize)]
pub struct CohostPostBlock {
pub attachment: Option<CohostPostAttachment>,
}
#[derive(Debug, Clone, Deserialize)]
pub struct CohostPostAttachment {
#[serde(
rename = "fileURL",
deserialize_with = "deserialize_null_default",
default
)]
pub file_url: String,
}
fn deserialize_null_default<'de, D, T>(deserializer: D) -> Result<T, D::Error>
where
T: Default + Deserialize<'de>,

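The hunk ends before the body of deserialize_null_default, but the signature shown matches the usual serde idiom for treating JSON null as a default value. A typical implementation of a helper with this shape is sketched below; the actual body in the repository may differ:

    use serde::{Deserialize, Deserializer};

    // Map a JSON `null` onto T::default() instead of failing deserialization.
    fn deserialize_null_default<'de, D, T>(deserializer: D) -> Result<T, D::Error>
    where
        T: Default + Deserialize<'de>,
        D: Deserializer<'de>,
    {
        let opt = Option::<T>::deserialize(deserializer)?;
        Ok(opt.unwrap_or_default())
    }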

@@ -3,7 +3,10 @@ use std::collections::HashMap;
use std::error::Error;
#[macro_use]
extern crate rocket;
use reqwest::{Client, StatusCode};
use cached::proc_macro::cached;
use http_cache_reqwest::{CACacheManager, Cache, CacheMode, HttpCache};
use reqwest::StatusCode;
use reqwest_middleware::{ClientBuilder, ClientWithMiddleware};
use rocket::response::content::RawHtml;
use rocket::serde::json::Json;
@@ -40,6 +43,20 @@ fn user_agent() -> String {
}
static ARGS: once_cell::sync::Lazy<Args> = once_cell::sync::Lazy::new(|| Args::parse());
static CLIENT: once_cell::sync::Lazy<ClientWithMiddleware> = once_cell::sync::Lazy::new(|| {
ClientBuilder::new(
reqwest::Client::builder()
.user_agent(user_agent())
.build()
.unwrap(),
)
.with(Cache(HttpCache {
mode: CacheMode::Default,
manager: CACacheManager::default(),
options: None,
}))
.build()
});
#[get("/")]
fn index() -> RawHtml<&'static str> {
@@ -52,13 +69,13 @@ struct MdResponse {
inner: String,
}
#[derive(Responder)]
#[derive(Debug, Clone, Responder)]
#[response(content_type = "application/rss+xml")]
struct RssResponse {
inner: String,
}
#[derive(Responder)]
#[derive(Debug, Responder)]
#[response(content_type = "text/plain")]
enum ErrorResponse {
#[response(status = 404)]
@@ -67,14 +84,11 @@ enum ErrorResponse {
InternalError(String),
}
async fn get_post_from_page(
client: &mut Client,
project_id: &str,
post_id: u64,
) -> Result<CohostPost, ErrorResponse> {
#[cached(time = 60, result)]
async fn get_post_from_page(project_id: String, post_id: u64) -> Result<CohostPost, ErrorResponse> {
let mut page = 0;
loop {
let new_page = get_page_data(client, project_id, page).await?;
let new_page = get_page_data(project_id.clone(), page).await?;
if new_page.items.is_empty() {
// Once there are no posts, we're done.
return Err(ErrorResponse::NotFound(
@@ -89,14 +103,12 @@
}
}
async fn get_full_post_data(
client: &mut Client,
project_id: &str,
) -> Result<CohostPostsPage, ErrorResponse> {
#[cached(time = 120, result)]
async fn get_full_post_data(project_id: String) -> Result<CohostPostsPage, ErrorResponse> {
let mut page = 0;
let mut merged_page = get_page_data(client, project_id, page).await?;
let mut merged_page = get_page_data(project_id.clone(), page).await?;
loop {
let mut new_page = get_page_data(client, project_id, page).await?;
let mut new_page = get_page_data(project_id.clone(), page).await?;
if new_page.items.is_empty() {
// Once there are no posts, we're done.
break;
@@ -109,14 +121,11 @@
Ok(merged_page)
}
async fn get_page_data(
client: &mut Client,
project_id: &str,
page: u64,
) -> Result<CohostPostsPage, ErrorResponse> {
let posts_url = cohost_posts_api_url(project_id, page);
// Not cached because it's never used individually.
async fn get_page_data(project_id: String, page: u64) -> Result<CohostPostsPage, ErrorResponse> {
let posts_url = cohost_posts_api_url(&project_id, page);
eprintln!("making request to {}", posts_url);
match client.get(posts_url).send().await {
match CLIENT.get(posts_url).send().await {
Ok(v) => match v.status() {
StatusCode::OK => match v.json::<CohostPostsPage>().await {
Ok(page_data) => Ok(page_data),
@@ -147,35 +156,11 @@ async fn get_page_data(
}
}
#[get("/<project>/feed.rss")]
async fn syndication_rss_route(project: &str) -> Result<RssResponse, ErrorResponse> {
let mut client = get_client()?;
let project_data = get_project_data(&mut client, project).await?;
let page_data = get_full_post_data(&mut client, project).await?;
Ok(RssResponse {
inner: syndication::channel_for_posts_page(project, project_data, page_data).to_string(),
})
}
#[get("/<project>/<id>")]
async fn post_md_route(project: &str, id: u64) -> Result<MdResponse, ErrorResponse> {
let mut client = get_client()?;
let _project_data = get_project_data(&mut client, project).await?;
let post_data = get_post_from_page(&mut client, project, id).await?;
Ok(MdResponse {
inner: post_data.plain_body,
})
}
async fn get_project_data(
client: &mut Client,
project_id: &str,
) -> Result<CohostAccount, ErrorResponse> {
#[cached(time = 60, result)]
async fn get_project_data(project_id: String) -> Result<CohostAccount, ErrorResponse> {
let project_url = format!("{}{}", COHOST_ACCOUNT_API_URL, project_id);
eprintln!("making request to {}", project_url);
match client.get(project_url).send().await {
match CLIENT.get(project_url).send().await {
Ok(v) => match v.status() {
StatusCode::OK => match v.json::<CohostAccount>().await {
Ok(a) => Ok(a),
@@ -209,15 +194,33 @@ async fn get_project_data(
}
}
fn get_client() -> Result<Client, ErrorResponse> {
match Client::builder().user_agent(user_agent()).build() {
Ok(v) => Ok(v),
Err(e) => {
let err = format!("Couldn't build a reqwest client: {:?}", e);
eprintln!("{}", err);
Err(ErrorResponse::InternalError(err))
#[get("/<project>/originals.rss")]
async fn syndication_originals_rss_route(project: String) -> Result<RssResponse, ErrorResponse> {
let project_data = get_project_data(project.clone()).await?;
let page_data = get_full_post_data(project.clone()).await?;
Ok(RssResponse {
inner: syndication::channel_for_posts_page(project.clone(), project_data, page_data, true)
.to_string(),
})
}
#[get("/<project>/feed.rss")]
async fn syndication_rss_route(project: String) -> Result<RssResponse, ErrorResponse> {
let project_data = get_project_data(project.clone()).await?;
let page_data = get_full_post_data(project.clone()).await?;
Ok(RssResponse {
inner: syndication::channel_for_posts_page(project.clone(), project_data, page_data, false)
.to_string(),
})
}
#[get("/<project>/<id>")]
async fn post_md_route(project: String, id: u64) -> Result<MdResponse, ErrorResponse> {
let _project_data = get_project_data(project.clone()).await?;
let post_data = get_post_from_page(project.clone(), id).await?;
Ok(MdResponse {
inner: post_data.plain_body,
})
}
#[get("/.well-known/webfinger?<params..>")]
@@ -232,9 +235,8 @@ async fn webfinger_route(
eprintln!("{}", err);
return Err(ErrorResponse::InternalError(err));
}
let mut client = get_client()?;
if let Some(param) = params.iter().next() {
let _project_data = get_project_data(&mut client, param.0.as_str()).await?;
let _project_data = get_project_data(param.0.clone()).await?;
Ok(Json(CohostWebfingerResource::new(
param.0.as_str(),
&ARGS.domain,
@@ -252,7 +254,13 @@ async fn main() -> Result<(), Box<dyn Error>> {
let _rocket = rocket::build()
.mount(
&ARGS.base_url,
routes![index, webfinger_route, syndication_rss_route, post_md_route],
routes![
index,
webfinger_route,
syndication_rss_route,
syndication_originals_rss_route,
post_md_route
],
)
.ignite()
.await?

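Two layers of caching are introduced here. The shared CLIENT routes every request through http-cache-reqwest, whose CACacheManager persists responses on disk (the http-cacache directory newly added to .gitignore) and, under CacheMode::Default, follows the upstream cache headers. On top of that, the cached proc macro memoizes the lookup helpers in memory for the TTLs given in the attributes; with the result flag only Ok values are stored and the arguments form the cache key, which is presumably why the helpers now take owned String project IDs and why Clone is derived on the Cohost data types. A minimal, hypothetical example of the same attribute, not taken from the repository:

    use cached::proc_macro::cached;

    // Memoize a fallible async lookup for 60 seconds. With `result = true`,
    // only Ok values are cached, so the Ok type must be Clone and the owned
    // argument acts as the cache key.
    #[cached(time = 60, result = true)]
    async fn lookup(project_id: String) -> Result<String, String> {
        // an expensive network call would normally go here
        Ok(format!("data for {}", project_id))
    }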

@@ -22,11 +22,16 @@ pub fn channel_for_posts_page(
project_name: impl AsRef<str>,
project: CohostAccount,
mut page: CohostPostsPage,
originals_only: bool,
) -> Channel {
let project_name = project_name.as_ref().clone();
let mut builder = rss::ChannelBuilder::default();
builder
.title(format!("{} Cohost Posts", project.display_name))
.title(format!(
"{} Cohost Posts{}",
project.display_name,
if originals_only { "" } else { " and Shares" }
))
.description(project.description)
.generator(Some(format!(
"{} {}",
@@ -72,7 +77,15 @@ pub fn channel_for_posts_page(
let mut body_text = String::new();
if item.share_tree.len() == 1 {
if let Some(shared_post_id) = item.transparent_share_of_post_id {
if originals_only {
continue;
}
body_text.push_str(&format!(
"(share of post {} without any commentary)\n\n---\n\n",
shared_post_id
));
} else if item.share_tree.len() == 1 {
body_text.push_str("(in reply to another post)\n\n---\n\n")
} else if item.share_tree.len() > 1 {
body_text.push_str(&format!(
@@ -108,9 +121,22 @@ pub fn channel_for_posts_page(
let parser = pulldown_cmark::Parser::new_ext(&body_text, options);
let mut html_output = String::new();
pulldown_cmark::html::push_html(&mut html_output, parser);
item_builder.content(html_output);
for attachment in item.blocks.into_iter().filter_map(|block| block.attachment) {
use mime_guess::from_path as guess_mime_from_path;
use rss::EnclosureBuilder;
let enclosure = EnclosureBuilder::default()
.mime_type(
guess_mime_from_path(&attachment.file_url)
.first_or_octet_stream()
.to_string(),
)
.url(attachment.file_url)
.build();
item_builder.enclosure(enclosure);
}
items.push(item_builder.build());
}

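The enclosure's MIME type is guessed from the attachment's file URL: mime_guess keys off the path extension and falls back to application/octet-stream when it cannot tell. A small illustrative helper showing that behaviour (the function name and URLs here are not from the repository):

    use mime_guess::from_path;

    // The extension drives the guess; unknown or missing extensions
    // become application/octet-stream.
    fn enclosure_mime(file_url: &str) -> String {
        from_path(file_url).first_or_octet_stream().to_string()
    }

    // enclosure_mime("https://example.test/attachment.png") == "image/png"
    // enclosure_mime("https://example.test/attachment")     == "application/octet-stream"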

@@ -20,28 +20,67 @@
line-height: 1.75;
font-size: 1.25em;
}
h1,h2,h3,h4,h5,h6 {
font-family: sans-serif;
}
h1 {
text-align: center;
}
code {
font-family: monospace;
background-color: black;
color: white;
display: inline-block;
padding: 0px 4px;
border-radius: 4px;
}
a code {
color: white;
background-color: darkblue;
}
a:hover code {
color: darkblue;
background-color: white;
}
</style>
</head>
<body>
<h1>corobel</h1>
<h2>RSS feeds from Cohost pages</h2>
<h2>Standard Data from Cohost Posts and Projects</h2>
<p>
Go to <code>/project_name/feed.rss</code> to get a feed for a project.
For example, <a href="/noracodes/feed.rss"><code>/noracodes/feed.rss</code></a> will give you the feed for my page.
<h3>Project RSS Feeds</h3>
Go to <code>/project_name/feed.rss</code> to get a feed for a project, or <code>/project_name/originals.rss</code> for just original posts (including shared posts with commentary).
For example, <a href="/noracodes/feed.rss"><code>/noracodes/feed.rss</code></a> will give you the feed for my page,
or <a href="/noracodes/originals.rss"><code>/noracodes/originals.rss</code></a> for just my original posts.
</p>
<p>
<h3>Markdown Extraction</h3>
You can also get a particular post's original plain-text body at <code>/project_name/post_id/</code>, such as
<a href="/noracodes/169186/"><code>/noracodes/169186/</code></a>. (In a Cohost post URL, the ID is the numerical part after <code>/post/</code>.
For instance, in <code>https://cohost.org/noracodes/post/169186-october-update</code>, the ID is "169186".)
Or, drag this bookmarklet: <a href="javascript:(function(){const regex = /^https:\/\/cohost.org\/([a-zA-Z_0-9]*)\/post\/([0-9]*)-.*/;const new_loc = window.location.href.replace(regex, 'https://corobel.nora.codes/$1/$2');window.open(new_loc);})()">
Or, drag this bookmarklet: <a href="javascript:(function(){const regex = /^https:\/\/cohost.org\/([a-zA-Z_\-0-9]*)\/post\/([0-9]*)-.*/;const new_loc = window.location.href.replace(regex, 'https://corobel.nora.codes/$1/$2');window.open(new_loc);})()">
Cohost: Extract Source
</a> to your bookmarks bar and then click on it when you're on a Cohost individual post page to download that post's source.
</p>
<p>
<h3>Webfinger Resources</h3>
Webfinger resources for accounts are provided at the Webfinger well-known URL <code>/.well-known/webfinger?project_name</code>.
</p>
<p>
<h3>Technical Details</h3>
Since 0.5.0, Corobel caches various responses to provide better service.
<ul>
<li>Project/account data for <b>60 seconds</b></li>
<li>Individual posts for <b>60 seconds</b></li>
<li>Whole RSS feeds for <b>120 seconds</b></li>
<li>Internal HTTP responses <b>according to Cohost's settings</b></li>
</ul>
This means that if you update a post and then immediately request its source, you might get the old source. Just wait a few seconds.
</p>
<p>
Brought to you by <a href="https://nora.codes">Leonora Tindall</a>, written in Rust with Rocket. Code is <a href="https://git.nora.codes/nora/corobel">online</a>, bug reports should go to my email nora@nora.codes.
</p>
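The feed routes described above serve plain RSS, so the output can be checked from any reader. A hypothetical consumer using reqwest and the rss crate; the URL follows the corobel.nora.codes host used by the bookmarklet, and tokio, reqwest, and rss would be the consumer's own dependencies:

    use rss::Channel;

    // Fetch a project's feed and print the item titles.
    #[tokio::main]
    async fn main() -> Result<(), Box<dyn std::error::Error>> {
        let url = "https://corobel.nora.codes/noracodes/feed.rss";
        let body = reqwest::get(url).await?.bytes().await?;
        let channel = Channel::read_from(&body[..])?;
        for item in channel.items() {
            println!("{}", item.title().unwrap_or("(untitled)"));
        }
        Ok(())
    }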