fix: flatten content and then apply markup rules

This commit is contained in:
Aravinth Manivannan 2022-08-21 23:10:40 +05:30
parent 1c1224b405
commit bd401d888e
Signed by: realaravinth
GPG key ID: AD9F0F08E855ED88
5 changed files with 403 additions and 77 deletions

View file

@ -25,6 +25,7 @@ use log::info;
mod data; mod data;
mod meta; mod meta;
mod post;
mod proxy; mod proxy;
mod render_html; mod render_html;
mod routes; mod routes;

370
src/post.rs Normal file
View file

@ -0,0 +1,370 @@
/*
* Copyright (C) 2021 Aravinth Manivannan <realaravinth@batsense.net>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
use std::{collections::HashMap, hash::Hash};
use crate::data::*;
use crate::proxy::StringUtils;
use get_post::*;
/// Which edge of a markup range a `Markup` entry stands for.
///
/// `Markup::apply_markup` compares against `Start` to decide between emitting
/// an opening tag and a closing tag.
#[derive(Eq, PartialEq)]
enum PostitionType {
/// The entry sits at the markup's `start` offset.
Start,
/// The entry sits at the markup's `end` offset.
End,
}
/// One edge (start or end) of a single markup range inside a paragraph.
///
/// The public `apply_markup` function creates two of these per markup entry,
/// one for each edge, and stores them in a `PositionMap` keyed by offset.
struct Markup<'a, 'b> {
// The markup entry this edge belongs to (type, anchor details, offsets).
markup: &'a GetPostPostContentBodyModelParagraphsMarkups,
// The paragraph that owns the markup; stored but not read by the methods in this file.
p: &'a GetPostPostContentBodyModelParagraphs,
// Whether this entry is the opening or the closing edge of the range.
pos_type: PostitionType,
// Gist contents paired with an id; stored but not read by the methods in this file.
gists: &'b Option<Vec<(String, crate::data::GistContent)>>,
}
impl<'a, 'b> Markup<'a, 'b> {
    /// Notice rendered when a paragraph or markup type is not recognised.
    ///
    /// It ends with an opening `<span>`; the matching closing edge emits
    /// `</span>`.
    const UNKNOWN_MARKUP_NOTICE: &'static str = r#"
<p class="libmedium__meta">
<b>From LibMedium:</b> LibMedium is built by reverse
engineering the Meduim's internal API. This post contains
markup(formatting rules) that we are unaware of.
Please report this URL <a
href="https://github.com/realaravinth/libmedium/issues/1"
rel="noreferrer">on our bug tracker</a> so that we can
improve page rendering.
<br />
Alternatively, you can also email me at realaravinth at batsense dot net!
</p>
<span>"#;

    /// Escape `raw` so it can be placed inside a double-quoted HTML attribute
    /// without terminating the attribute or the tag.
    fn escape_attr(raw: &str) -> String {
        let mut escaped = String::with_capacity(raw.len());
        for c in raw.chars() {
            match c {
                '&' => escaped.push_str("&amp;"),
                '<' => escaped.push_str("&lt;"),
                '>' => escaped.push_str("&gt;"),
                '"' => escaped.push_str("&quot;"),
                '\'' => escaped.push_str("&#x27;"),
                _ => escaped.push(c),
            }
        }
        escaped
    }

    /// Build the HTML that opens paragraph `p`.
    ///
    /// * `gists` - gist contents paired with their id, consulted for `IFRAME`
    ///   paragraphs that point at `gist.github.com`.
    /// * `pindex` - index of the paragraph within the post; an `H3` at index 0
    ///   produces no tag.
    /// * `in_oli` - set to `true` when this call opens a list for an `OLI`
    ///   paragraph, so that [`Markup::end`] knows it has to close it.
    ///
    /// # Panics
    ///
    /// Panics when an `IMG` paragraph has no metadata or an `IFRAME` paragraph
    /// has no media resource.
    fn start(
        p: &GetPostPostContentBodyModelParagraphs,
        gists: &'b Option<Vec<(String, crate::data::GistContent)>>,
        pindex: usize,
        in_oli: &mut bool,
    ) -> String {
        match p.type_.as_str() {
            "IMG" => {
                let metadata = p
                    .metadata
                    .as_ref()
                    .expect("IMG paragraph carries image metadata");
                // A missing width only drops the attribute instead of aborting
                // the whole page render.
                let width = metadata
                    .original_width
                    .as_ref()
                    .map(|w| format!(r#" width="{w}""#))
                    .unwrap_or_default();
                format!(
                    r#"<figure><img{} src="{}" /> <figcaption>"#,
                    width,
                    crate::V1_API_ROUTES.proxy.get_medium_asset(&metadata.id)
                )
            }
            "P" => "<p>".into(),
            "PRE" => "<pre>".into(),
            "BQ" => "<blockquote>".into(),
            "H3" if pindex == 0 => {
                log::debug!("caught heading");
                String::new()
            }
            "H1" | "H2" | "H3" | "H4" | "H5" | "H6" => {
                format!("<{}>", p.type_.to_ascii_lowercase())
            }
            "IFRAME" => {
                let src = &p
                    .iframe
                    .as_ref()
                    .expect("IFRAME paragraph carries an iframe")
                    .media_resource
                    .as_ref()
                    .expect("iframe carries a media resource")
                    .href;
                if src.contains("gist.github.com") {
                    let gist_id = crate::data::Data::get_gist_id(src);
                    let found = gists
                        .as_ref()
                        .and_then(|all| all.iter().find(|(id, _)| id == gist_id));
                    match found {
                        Some((_, gist)) => {
                            let mut files = String::default();
                            for file in &gist.files {
                                files += &format!(
                                    r#"<div class="code-block gist-block">{}</div>"#,
                                    file.get_html_content()
                                );
                            }
                            format!(
                                r#"<div class="gist_container">{files}
<a class="gist_link" href="{}" target="_blank">See gist on GitHub</a>"#,
                                &gist.html_url
                            )
                        }
                        None => {
                            // The gist content is not available: still open the
                            // container (`end` closes it with `</div>`) and
                            // link to the gist rather than panicking.
                            log::warn!("gist content unavailable, linking to it instead");
                            format!(
                                r#"<div class="gist_container">
<a class="gist_link" href="{}" target="_blank">See gist on GitHub</a>"#,
                                Self::escape_attr(src)
                            )
                        }
                    }
                } else {
                    format!(
                        r#"<iframe src="{}" frameborder="0">"#,
                        Self::escape_attr(src)
                    )
                }
            }
            "OLI" => {
                // NOTE(review): the previous template rendered OLI inside <ol>.
                // <ul> is kept because `end` closes the list after every item,
                // so each item is its own list and <ol> would number every
                // entry "1.".
                if *in_oli {
                    "<li>".into()
                } else {
                    *in_oli = true;
                    "<ul><li>".into()
                }
            }
            _ => {
                log::info!("Unknown type");
                Self::UNKNOWN_MARKUP_NOTICE.into()
            }
        }
    }

    /// Build the HTML that closes paragraph `p`, mirroring [`Markup::start`].
    ///
    /// When `in_oli` is set, the list opened by `start` is closed after the
    /// paragraph's own closing tag and the flag is cleared.
    ///
    /// # Panics
    ///
    /// Panics when an `IFRAME` paragraph has no media resource.
    fn end(p: &GetPostPostContentBodyModelParagraphs, pindex: usize, in_oli: &mut bool) -> String {
        let mut closing: String = match p.type_.as_str() {
            "IMG" => "</figcaption></figure>".into(),
            "P" => "</p>".into(),
            "PRE" => "</pre>".into(),
            "BQ" => "</blockquote>".into(),
            "H3" if pindex == 0 => {
                log::debug!("caught heading");
                String::new()
            }
            "H1" | "H2" | "H3" | "H4" | "H5" | "H6" => {
                format!("</{}>", p.type_.to_ascii_lowercase())
            }
            "IFRAME" => {
                let src = &p
                    .iframe
                    .as_ref()
                    .expect("IFRAME paragraph carries an iframe")
                    .media_resource
                    .as_ref()
                    .expect("iframe carries a media resource")
                    .href;
                if src.contains("gist.github.com") {
                    "</div>".into()
                } else {
                    "</iframe>".into()
                }
            }
            "OLI" => "</li>".into(),
            _ => "</span>".into(),
        };
        if *in_oli {
            // The list must be closed after the item. The previous
            // "</ul></li>" order produced mis-nested HTML. The old guard
            // against the literal "OLL" matched no paragraph type handled
            // here, so it never suppressed this branch and is dropped.
            *in_oli = false;
            closing.push_str("</ul>");
        }
        closing
    }

    /// Render this edge of the markup as an opening or closing inline tag.
    ///
    /// Anchors of type `LINK` and `USER` become `<a rel="noreferrer">` links
    /// with an escaped target. Any other anchor, or one whose target is
    /// missing, becomes a `<span>` on both edges so the tags stay balanced.
    /// `pindex` is accepted for parity with the paragraph-level helpers and is
    /// not consulted.
    fn apply_markup(&self, pindex: usize) -> String {
        let opening = self.pos_type == PostitionType::Start;
        let tag = |name: &str| {
            if opening {
                format!("<{name}>")
            } else {
                format!("</{name}>")
            }
        };
        match self.markup.type_.as_str() {
            "A" => {
                // Both edges of one markup evaluate the same fields, so they
                // always agree on whether this is a real link or a <span>.
                let target: Option<String> = match &self.markup.anchor_type {
                    Some(kind) if kind == "LINK" => {
                        self.markup.href.as_ref().map(|h| Self::escape_attr(h))
                    }
                    Some(kind) if kind == "USER" => self.markup.user_id.as_ref().map(|id| {
                        format!("https://medium.com/u/{}", Self::escape_attr(id))
                    }),
                    _ => None,
                };
                match (target, opening) {
                    (Some(href), true) => format!(r#"<a rel="noreferrer" href="{href}">"#),
                    (Some(_), false) => "</a>".into(),
                    (None, _) => tag("span"),
                }
            }
            "PRE" => tag("pre"),
            "EM" => tag("em"),
            "STRONG" => tag("strong"),
            "CODE" => tag("code"),
            _ => {
                if opening {
                    log::info!("Unknown type");
                    Self::UNKNOWN_MARKUP_NOTICE.into()
                } else {
                    "</span>".into()
                }
            }
        }
    }
}
/// Groups markup edges by the text offset at which they apply.
#[derive(Default)]
struct PositionMap<'a, 'b> {
// Offset -> every markup edge recorded for that offset, in insertion order.
map: HashMap<i64, Vec<Markup<'a, 'b>>>,
// Each distinct offset once, in first-insertion order; the caller sorts it before walking.
arr: Vec<i64>,
}
impl<'a, 'b> PositionMap<'a, 'b> {
fn insert_if_not_exists(&mut self, pos: i64, m: Markup<'a, 'b>) {
if let Some(markups) = self.map.get_mut(&pos) {
markups.push(m);
} else {
self.map.insert(pos, vec![m]);
self.arr.push(pos);
}
}
}
/// Render every paragraph of a post into a self-contained HTML fragment.
///
/// For each paragraph the markup ranges are flattened into a sorted list of
/// offsets. The paragraph text is then emitted slice by slice, with the
/// opening or closing tags of every markup inserted at its offset, and the
/// whole thing is wrapped in the paragraph-level tags from `Markup::start`
/// and `Markup::end`.
///
/// Paragraph text is HTML-escaped before it is appended, because the
/// returned fragments are meant to be written into the page verbatim.
///
/// * `data` - the post whose `content.body_model.paragraphs` are rendered.
/// * `gists` - gist contents paired with their id, forwarded to
///   `Markup::start` for embedded gists.
///
/// Returns one string per rendered paragraph, in document order. A leading
/// `H3` paragraph (index 0) is skipped and contributes no entry.
pub fn apply_markup<'b>(
    data: &PostResp,
    gists: &'b Option<Vec<(String, crate::data::GistContent)>>,
) -> Vec<String> {
    /// Escape text so it is displayed literally instead of being parsed as HTML.
    fn escape_html(raw: &str) -> String {
        let mut escaped = String::with_capacity(raw.len());
        for c in raw.chars() {
            match c {
                '&' => escaped.push_str("&amp;"),
                '<' => escaped.push_str("&lt;"),
                '>' => escaped.push_str("&gt;"),
                '"' => escaped.push_str("&quot;"),
                '\'' => escaped.push_str("&#x27;"),
                _ => escaped.push(c),
            }
        }
        escaped
    }

    let source = &data.content.body_model.paragraphs;
    let mut paragraphs: Vec<String> = Vec::with_capacity(source.len());

    for (pindex, p) in source.iter().enumerate() {
        if p.type_ == "H3" && pindex == 0 {
            log::debug!("FOUND TOP LEVEL H3. Breaking");
            continue;
        }

        // Flatten the markups: one entry per edge, grouped by offset.
        let mut pos = PositionMap::default();
        for m in p.markups.iter() {
            pos.insert_if_not_exists(
                m.start,
                Markup {
                    markup: m,
                    p,
                    gists,
                    pos_type: PostitionType::Start,
                },
            );
            pos.insert_if_not_exists(
                m.end,
                Markup {
                    markup: m,
                    p,
                    gists,
                    pos_type: PostitionType::End,
                },
            );
        }
        pos.arr.sort_unstable();

        // NOTE(review): the flag lives per paragraph, so a list opened by
        // `Markup::start` is always closed by `Markup::end` of the same
        // paragraph.
        let mut in_oli = false;
        let mut content = String::with_capacity(p.text.len());
        content.push_str(&Markup::start(p, gists, pindex, &mut in_oli));

        // `cur` is the offset up to which the paragraph text has been emitted.
        let mut cur: usize = 0;
        for point in pos.arr.iter() {
            // A negative offset would turn into a huge index when cast;
            // treat it as the start of the text instead.
            let offset = (*point).max(0) as usize;
            if offset > cur {
                log::debug!("cur before incr: {cur}, post incr: {offset}");
                content.push_str(&escape_html(p.text.slice(cur..offset)));
                cur = offset;
            }
            if let Some(edges) = pos.map.get(point) {
                for edge in edges {
                    content.push_str(&edge.apply_markup(pindex));
                }
            }
        }

        log::debug!("LAST");
        content.push_str(&escape_html(p.text.slice(cur..)));
        content.push_str(&Markup::end(p, pindex, &mut in_oli));
        paragraphs.push(content);
    }
    paragraphs
}

View file

@ -23,6 +23,7 @@ use reqwest::header::CONTENT_TYPE;
use sailfish::TemplateOnce; use sailfish::TemplateOnce;
use crate::data::PostResp; use crate::data::PostResp;
use crate::post::apply_markup;
use crate::AppData; use crate::AppData;
const CACHE_AGE: u32 = 60 * 60 * 24; const CACHE_AGE: u32 = 60 * 60 * 24;
@ -99,11 +100,14 @@ impl StringUtils for str {
Bound::Included(bound) | Bound::Excluded(bound) => *bound, Bound::Included(bound) | Bound::Excluded(bound) => *bound,
Bound::Unbounded => 0, Bound::Unbounded => 0,
}; };
log::debug!("{}", self);
log::debug!("start: {start}");
let len = match range.end_bound() { let len = match range.end_bound() {
Bound::Included(bound) => *bound + 1, Bound::Included(bound) => *bound + 1,
Bound::Excluded(bound) => *bound, Bound::Excluded(bound) => *bound,
Bound::Unbounded => self.len(), Bound::Unbounded => self.len(),
} - start; } - start;
log::debug!("len {len}");
self.substring(start, len) self.substring(start, len)
} }
} }
@ -118,6 +122,7 @@ pub struct Post {
pub reading_time: usize, pub reading_time: usize,
pub id: String, pub id: String,
pub gists: Option<Vec<(String, crate::data::GistContent)>>, pub gists: Option<Vec<(String, crate::data::GistContent)>>,
pub paragraphs: Vec<String>,
} }
const INDEX: &str = include_str!("../templates/index.html"); const INDEX: &str = include_str!("../templates/index.html");
@ -211,6 +216,8 @@ async fn page(path: web::Path<(String, String)>, data: AppData) -> impl Responde
.unwrap(); .unwrap();
let preview_img = crate::V1_API_ROUTES.proxy.get_medium_asset(preview_img); let preview_img = crate::V1_API_ROUTES.proxy.get_medium_asset(preview_img);
let paragraphs = apply_markup(&post_data, &gists);
let page = Post { let page = Post {
id: id.to_owned(), id: id.to_owned(),
data: post_data, data: post_data,
@ -218,6 +225,7 @@ async fn page(path: web::Path<(String, String)>, data: AppData) -> impl Responde
gists, gists,
reading_time, reading_time,
preview_img, preview_img,
paragraphs,
}; };
let page = page.render_once().unwrap(); let page = page.render_once().unwrap();

View file

@ -3,13 +3,29 @@
padding: 0; padding: 0;
} }
body {
width: 100%;
display: flex;
flex-direction: column;
}
main {
width: 35em;
margin: auto;
display: flex;
flex-direction: column;
}
h1, h1,
h2, h2,
h3, h3,
h4, h4,
h5, h5,
h6 { h6 {
font-family: "Times New Roman", Times, serif; font-family: sohne, "Helvetica Neue", Helvetica, Arial, sans-serif !important;
} }
a { a {
@ -27,24 +43,10 @@ a:hover {
html { html {
color: #333; color: #333;
font-family: Georgia, "Times New Roman", Times, serif; font-family: charter, Georgia, Cambria, "Times New Roman", Times, serif;
font-size: 26px; font-size: 26px;
line-height: 1.55rem; line-height: 1.55rem;
} }
body {
width: 100%;
display: flex;
flex-direction: column;
}
main {
width: 35em;
margin: auto;
display: flex;
flex-direction: column;
}
p { p {
margin: 20px 0; margin: 20px 0;
} }
@ -72,19 +74,19 @@ code {
} }
*/ */
/*
pre { pre {
font-family: monospace; font-family: Menlo, Monaco, "Courier New", Courier, monospace;
font-size: 15px; font-size: 15px;
white-space: pre-wrap; white-space: pre-wrap;
/*
font-weight: 600; font-weight: 600;
*/
line-height: 1rem; line-height: 1rem;
padding: 5px; padding: 20px;
border-radius: 6px; border-radius: 6px;
background-color: rgba(175, 184, 193, 0.2); background-color: rgba(175, 184, 193, 0.2);
} }
*/
.code-block { .code-block {
display: block; display: block;

View file

@ -23,63 +23,8 @@
</p> </p>
<article> <article>
<. let paragraphs = &data.content.body_model.paragraphs; .> <. for (_pindex, p) in paragraphs.iter().enumerate() {.>
<. for (pindex, p) in paragraphs.iter().enumerate() {.> <.- p .>
<. if open_list && p.type_ != "OLI" { .>
</ol>
<. } .>
<. if pindex == 0 && p.type_ == "H3" {.>
<. continue; .>
<.}.>
<. if p.type_ == "IMG" {.>
<. include!("./img.html"); .>
<.} else if p.type_ == "P" {.>
<p><. include!("./_markup.html"); .></p>
<.} else if p.type_ == "BQ" {.>
<blockquote><. include!("./_markup.html"); .></blockquote>
<.} else if p.type_ == "H2" {.>
<h2><.= p.text .></h2>
<.} else if p.type_ == "H3" {.>
<h3><.= p.text .></h3>
<.} else if p.type_ == "H4" {.>
<h4><.= p.text .></h4>
<.} else if p.type_ == "H5" {.>
<h5><.= p.text .></h5>
<.} else if p.type_ == "H6" {.>
<h6><.= p.text .></h6>
<.} else if p.type_ == "IFRAME" {.>
<. let src = &p.iframe.as_ref().unwrap().media_resource.as_ref().unwrap().href; .>
<. if src.contains("gist.github.com"){.>
<. include!("./gist_insert.html"); .>
<!--
<iframe src="<.#= crate::V1_API_ROUTES.proxy.get_gist(&src) .>" frameborder="0"></iframe>
<a href="<.= src .>">Click here to open gist on GitHub</a>
-->
<.} else {.>
<iframe src="<.= src .>" frameborder="0"></iframe>
<.}.>
<.} else if p.type_ == "OLI" {.>
<. if !open_list { .>
<. open_list = true;.>
<ol>
<. } .>
<li><.= p.text .></li>
<.} else {.>
<p>
<. include!("./_markup.html"); .>
</p>
<p class="libmedium__meta">
<b>From LibMedium:</b> LibMedium is built by reverse
engineering the Meduim's internal API. This post contains
markup(formatting rules) that we are unaware of.
Please report this URL <a
href="https://github.com/realaravinth/libmedium/issues/1"
rel="noreferrer">on our bug tracker</a> so that we can
improve page rendering.
<br />
Alternatively, you can also email me at realaravinth at batsense dot net!
</p>
<.}.>
<.}.> <.}.>
</article> </article>
</main> </main>