Begin a parser implementation from HTML into an internal representation.

This commit is contained in:
Danila Fedorin 2021-07-02 00:18:37 -07:00
parent db202b23ae
commit 94f6c99ea6
4 changed files with 142 additions and 30 deletions

View file

@ -4,6 +4,12 @@
inspiration from [Pandoc's AST](https://hackage.haskell.org/package/pandoc-types-1.22/docs/Text-Pandoc-Definition.html#t:Block).
Then, much like Pandoc AST, we can turn our representation into
markdown in the editor, or HTML in the view.
* This is good for both serialization and deserialization.
We can use the representation when going input -> formatted
message, which would allow other, non-web platforms
to rely on non-Markdown input types. We can also
use it going formatted message -> display, since a frontend
can then choose to use non-HTML output (GTK JS bindings?).
* As such, we represent formatting by nesting parts
(we'd have an `ItalicsPart`, `BoldPart`, etc.)
* We keep the "inline"/"block" distinction, but only

View file

@ -12,7 +12,7 @@ export function parsePlainBody(body) {
// create callback outside of loop
const linkifyCallback = (text, isLink) => {
if (isLink) {
parts.push(new LinkPart(text, text));
parts.push(new LinkPart(text, [new TextPart(text)]));
} else {
parts.push(new TextPart(text));
}
@ -36,65 +36,63 @@ export function stringAsBody(body) {
return new MessageBody(body, [new TextPart(body)]);
}
class HeaderBlock {
export class HeaderBlock {
constructor(level, inlines) {
this.level = level;
this.inlines = inlines;
}
get type() { return "header"; }
isBlock() { return true; }
}
class CodeBlock {
constructor(text) {
export class CodeBlock {
constructor(language, text) {
this.language = language;
this.text = text;
}
get type() { return "codeblock"; }
isBlock() { return true; }
}
class NewLinePart {
/**
 * Block-level part holding the entries of a list.
 */
export class ListBlock {
    /**
     * @param {*} startOffset value stored as the list's starting offset
     * @param {Array} items the list entries
     */
    constructor(startOffset, items) {
        this.items = items;
        this.startOffset = startOffset;
    }

    // Every other part exposes a `type` discriminator; without one a list
    // cannot be dispatched on in the same way as its siblings.
    get type() { return "list"; }
}
/**
 * Part that carries no data of its own; it is identified purely by
 * its "rule" type.
 */
export class RulePart {
    get type() {
        return "rule";
    }
}
/**
 * Data-less part identified by its "newline" type; reports itself as
 * a non-block part.
 */
export class NewLinePart {
    isBlock() {
        return false;
    }

    get type() {
        return "newline";
    }
}
class EmphPart {
constructor(wraps) {
this.wraps = wraps;
export class FormatPart {
constructor(format, children) {
this.format = format;
this.children = children;
}
get type() { return "emph"; }
isBlock() { return false; }
get type() { return "format"; }
}
/**
 * Non-block part of type "code" that stores its content as a string.
 */
class CodePart {
    /**
     * @param {string} text the content of this part
     */
    constructor(text) {
        this.text = text;
    }

    isBlock() {
        return false;
    }

    get type() {
        return "code";
    }
}
class LinkPart {
constructor(url, text) {
export class LinkPart {
constructor(url, inlines) {
this.url = url;
this.text = text;
this.inlines = inlines;
}
get type() { return "link"; }
isBlock() { return false; }
}
class TextPart {
/**
 * Non-block part of type "text" that stores a plain string.
 */
export class TextPart {
    /**
     * @param {string} text the string carried by this part
     */
    constructor(text) {
        this.text = text;
    }

    isBlock() {
        return false;
    }

    get type() {
        return "text";
    }
}
class MessageBody {

View file

@ -0,0 +1,108 @@
import { HeaderBlock, ListBlock, CodeBlock, FormatPart, NewLinePart, RulePart, TextPart, LinkPart } from "../../../domain/session/room/timeline/MessageBody.js"
/* At the time of writing (Jul 1 2021), Matrix Spec recommends
* allowing the following HTML tags:
* font, del, h1, h2, h3, h4, h5, h6, blockquote, p, a, ul, ol, sup, sub, li, b, i, u,
* strong, em, strike, code, hr, br, div, table, thead, tbody, tr, th, td, caption, pre, span, img
*/
// Upper-case tag names that buildNodeMap registers with a basicWrapper handler.
const basicNodes = [
    "EM",
    "STRONG",
    "CODE",
    "DEL",
    "P",
    "DIV",
    "SPAN",
];
/**
 * Create a parse function that wraps already-parsed children in a FormatPart.
 *
 * @param {string} tag value stored as the FormatPart's format
 * @returns {function(*, Array): FormatPart} handler taking (node, children);
 *          the node argument is ignored
 */
function basicWrapper(tag) {
    return function wrapInFormat(_node, children) {
        return new FormatPart(tag, children);
    };
}
/**
 * Create a parse function that wraps already-parsed children in a HeaderBlock.
 *
 * @param {number} level header level stored on the resulting HeaderBlock
 * @returns {function(*, Array): HeaderBlock} handler taking (node, children);
 *          the node argument is ignored
 */
function headerWrapper(level) {
    return function wrapInHeader(_node, children) {
        return new HeaderBlock(level, children);
    };
}
/**
 * Convert an anchor element into a part.
 *
 * The HTML being parsed comes from message content and is therefore
 * untrusted, so the href is only kept when it starts with one of the
 * schemes the Matrix spec allows for links (https, http, ftp, mailto,
 * magnet). Anything else (e.g. `javascript:` URLs or a missing href)
 * keeps its text but is not turned into a link.
 *
 * @param {{href: string}} node the anchor element
 * @param {Array} children the already-parsed children of the anchor
 * @returns {LinkPart|FormatPart} a LinkPart for safe URLs, otherwise a
 *          "SPAN" FormatPart wrapping the same children
 */
function parseLink(node, children) {
    const url = node.href;
    const hasSafeScheme = typeof url === "string" && /^(https?|ftp|mailto|magnet):/i.test(url);
    if (!hasSafeScheme) {
        // keep the link text readable without producing a clickable, unsafe anchor
        return new FormatPart("SPAN", children);
    }
    return new LinkPart(url, children);
}
/**
 * Convert a list element into a ListBlock.
 *
 * Only direct `LI` children become items; every other child node
 * (text, comments, other elements) is skipped. Each item is the array
 * returned by parseNodes for that `LI`'s children.
 *
 * @param {Element} node the list element
 * @returns {ListBlock} list whose start offset is always an integer
 */
function parseList(node) {
    // getAttribute returns a string (or null when absent); normalise it so
    // the start offset has one type regardless of whether "start" is present.
    const parsedStart = Number.parseInt(node.getAttribute("start"), 10);
    const start = Number.isNaN(parsedStart) ? 1 : parsedStart;
    const items = Array.from(node.childNodes)
        .filter((child) => child.tagName === "LI")
        .map((item) => parseNodes(item.childNodes));
    return new ListBlock(start, items);
}
/**
 * Convert an element whose first child is a `CODE` node into a CodeBlock.
 *
 * The language is taken from the first class on the `CODE` node that starts
 * with "language-" but not with "language-_"; it is the empty string when
 * no such class exists.
 *
 * @param {Node} node the element expected to wrap a `CODE` node
 * @returns {CodeBlock|null} null when the first child is missing or is not `CODE`
 */
function parseCodeBlock(node) {
    const codeNode = node.firstChild;
    if (!codeNode || codeNode.nodeName !== "CODE") {
        return null;
    }
    const prefix = "language-";
    const languageClass = [...codeNode.classList].find(
        (name) => name.startsWith(prefix) && !name.startsWith("language-_")
    );
    const language = languageClass === undefined ? "" : languageClass.slice(prefix.length);
    return new CodeBlock(language, codeNode.textContent);
}
/**
 * Handler registered for image elements. It currently produces no part:
 * the node is ignored and null is always returned.
 *
 * @param {Node} node the image element (unused)
 * @returns {null}
 */
function parseImage(node) {
    // no conversion is implemented here; null means "no part"
    return null;
}
/**
 * Build the lookup table from element name to its handler.
 *
 * Each entry has the shape `{ descend, parsefn }`: `parsefn` turns the
 * element into a part, and `descend` states whether the element's children
 * should be parsed and handed to `parsefn` as its second argument.
 *
 * Keys are upper-case because the table is indexed with `node.nodeName`,
 * which is upper-case for HTML elements.
 *
 * @returns {Object<string, {descend: boolean, parsefn: function}>}
 */
function buildNodeMap() {
    const map = {
        A: { descend: true, parsefn: parseLink },
        UL: { descend: false, parsefn: parseList },
        OL: { descend: false, parsefn: parseList },
        PRE: { descend: false, parsefn: parseCodeBlock },
        BR: { descend: false, parsefn: () => new NewLinePart() },
        HR: { descend: false, parsefn: () => new RulePart() },
        IMG: { descend: false, parsefn: parseImage },
    };
    for (const tag of basicNodes) {
        map[tag] = { descend: true, parsefn: basicWrapper(tag) };
    }
    for (let level = 1; level <= 6; level++) {
        // must be "H1".."H6": a lower-case key would never match nodeName,
        // silently dropping every header from the parsed output
        const tag = "H" + level;
        map[tag] = { descend: true, parsefn: headerWrapper(level) };
    }
    return map;
}
// Handler table built once at module load; parseNode indexes it by node.nodeName.
const nodes = buildNodeMap();
/**
 * Convert a single DOM node into a part.
 *
 * Text nodes become a TextPart. Element nodes are looked up by nodeName in
 * the handler table; when the handler asks to descend, the element's
 * children are parsed first and passed along, otherwise null is passed.
 *
 * @param {Node} node the DOM node to convert
 * @returns {*} the handler's result, or null for elements without a handler
 *          and for any other node type
 */
function parseNode(node) {
    switch (node.nodeType) {
        case Node.TEXT_NODE:
            return new TextPart(node.nodeValue);
        case Node.ELEMENT_NODE: {
            const handler = nodes[node.nodeName];
            if (!handler) {
                return null;
            }
            const children = handler.descend ? parseNodes(node.childNodes) : null;
            return handler.parsefn(node, children);
        }
        default:
            return null;
    }
}
/**
 * Convert a list of DOM nodes into parts, in order.
 *
 * Nodes for which parseNode yields a falsy result are left out.
 *
 * @param {ArrayLike<Node>} nodes the nodes to convert
 * @returns {Array} the parts produced for the given nodes
 */
function parseNodes(nodes) {
    return Array.from(nodes, (child) => parseNode(child)).filter((part) => part);
}
/**
 * Parse an HTML string into the internal representation.
 *
 * The string is parsed as a "text/html" document with DOMParser, and the
 * child nodes of the resulting body are converted with parseNodes.
 *
 * @param {string} html the HTML source
 * @returns {Array} the parts for the top-level nodes of the parsed body
 */
export function parse(html) {
    const parser = new DOMParser();
    const parsedDocument = parser.parseFromString(html, "text/html");
    return parseNodes(parsedDocument.body.childNodes);
}

View file

@ -38,10 +38,10 @@ export class TextMessageView extends BaseMessageView {
const formatFunction = {
header: headerBlock => tag["h" + Math.min(6,headerBlock.level)]({}, renderParts(headerBlock.inlines)),
codeblock: codeBlock => tag.pre({}, tag.code({}, text(codeBlock.text))),
emph: emphPart => tag.em({}, [renderPart(emphPart.wraps)]),
emph: emphPart => tag.em({}, renderPart(emphPart.inlines)),
code: codePart => tag.code({}, text(codePart.text)),
text: textPart => text(textPart.text),
link: linkPart => tag.a({ href: linkPart.url, target: "_blank", rel: "noopener" }, [linkPart.text]),
link: linkPart => tag.a({ href: linkPart.url, target: "_blank", rel: "noopener" }, renderParts(linkPart.inlines)),
newline: () => tag.br()
};