Begin a parser implementation from HTML into an internal representation.

This commit is contained in:
Danila Fedorin 2021-07-02 00:18:37 -07:00
parent db202b23ae
commit 94f6c99ea6
4 changed files with 142 additions and 30 deletions

View file

@ -4,6 +4,12 @@
inspiration from [Pandoc's AST](https://hackage.haskell.org/package/pandoc-types-1.22/docs/Text-Pandoc-Definition.html#t:Block). inspiration from [Pandoc's AST](https://hackage.haskell.org/package/pandoc-types-1.22/docs/Text-Pandoc-Definition.html#t:Block).
Then, much like Pandoc AST, we can turn our representation into Then, much like Pandoc AST, we can turn our representation into
markdown in the editor, or HTML in the view. markdown in the editor, or HTML in the view.
* This is good for both serialization and deserialization.
We can use the representation when going input -> formatted
message, which would allow other, non-web platforms
to rely on non-Markdown input types. We can also
use it going formatted message -> display, since a frontend
can then choose to use non-HTML output (GTK JS bindings?).
* As such, we represent formatting by nesting parts * As such, we represent formatting by nesting parts
(we'd have a `ItalicsPart`, `BoldPart`, etc.) (we'd have a `ItalicsPart`, `BoldPart`, etc.)
* We keep the "inline"/"block" distinction, but only * We keep the "inline"/"block" distinction, but only

View file

@ -12,7 +12,7 @@ export function parsePlainBody(body) {
// create callback outside of loop // create callback outside of loop
const linkifyCallback = (text, isLink) => { const linkifyCallback = (text, isLink) => {
if (isLink) { if (isLink) {
parts.push(new LinkPart(text, text)); parts.push(new LinkPart(text, [new TextPart(text)]));
} else { } else {
parts.push(new TextPart(text)); parts.push(new TextPart(text));
} }
@ -36,65 +36,63 @@ export function stringAsBody(body) {
return new MessageBody(body, [new TextPart(body)]); return new MessageBody(body, [new TextPart(body)]);
} }
class HeaderBlock { export class HeaderBlock {
constructor(level, inlines) { constructor(level, inlines) {
this.level = level; this.level = level;
this.inlines = inlines; this.inlines = inlines;
} }
get type() { return "header"; } get type() { return "header"; }
isBlock() { return true; }
} }
class CodeBlock { export class CodeBlock {
constructor(text) { constructor(language, text) {
this.language = language;
this.text = text; this.text = text;
} }
get type() { return "codeblock"; } get type() { return "codeblock"; }
isBlock() { return true; }
} }
class NewLinePart { export class ListBlock {
constructor(startOffset, items) {
this.items = items;
this.startOffset = startOffset;
}
}
export class RulePart {
get type( ) { return "rule"; }
}
export class NewLinePart {
get type() { return "newline"; } get type() { return "newline"; }
isBlock() { return false; }
} }
class EmphPart { export class FormatPart {
constructor(wraps) { constructor(format, children) {
this.wraps = wraps; this.format = format;
this.children = children;
} }
get type() { return "emph"; } get type() { return "format"; }
isBlock() { return false; }
} }
class CodePart { export class LinkPart {
constructor(text) { constructor(url, inlines) {
this.text = text;
}
get type() { return "code"; }
isBlock() { return false; }
}
class LinkPart {
constructor(url, text) {
this.url = url; this.url = url;
this.text = text; this.inlines = inlines;
} }
get type() { return "link"; } get type() { return "link"; }
isBlock() { return false; }
} }
class TextPart { export class TextPart {
constructor(text) { constructor(text) {
this.text = text; this.text = text;
} }
get type() { return "text"; } get type() { return "text"; }
isBlock() { return false; }
} }
class MessageBody { class MessageBody {

View file

@ -0,0 +1,108 @@
import { HeaderBlock, ListBlock, CodeBlock, FormatPart, NewLinePart, RulePart, TextPart, LinkPart } from "../../../domain/session/room/timeline/MessageBody.js"
/* At the time of writing (Jul 1 2021), Matrix Spec recommends
* allowing the following HTML tags:
* font, del, h1, h2, h3, h4, h5, h6, blockquote, p, a, ul, ol, sup, sub, li, b, i, u,
* strong, em, strike, code, hr, br, div, table, thead, tbody, tr, th, td, caption, pre, span, img
*/
// Tags that need no per-tag data: buildNodeMap registers each one with
// basicWrapper, so they all become a FormatPart keyed by the tag name.
const basicNodes = [
    "EM",
    "STRONG",
    "CODE",
    "DEL",
    "P",
    "DIV",
    "SPAN",
];
/**
 * Builds the parse function used for tags that only wrap their children.
 *
 * @param {string} tag - value stored as the `format` of the produced FormatPart.
 * @returns {function} a `(node, children)` function; the node argument is
 *     ignored and the already-parsed children are wrapped in a FormatPart.
 */
function basicWrapper(tag) {
    function wrapChildren(_node, children) {
        return new FormatPart(tag, children);
    }
    return wrapChildren;
}
/**
 * Builds the parse function used for a header tag of a fixed level.
 *
 * @param {number} level - header level stored on the produced HeaderBlock.
 * @returns {function} a `(node, children)` function; the node argument is
 *     ignored and the already-parsed children become the header's inlines.
 */
function headerWrapper(level) {
    function wrapHeader(_node, children) {
        return new HeaderBlock(level, children);
    }
    return wrapHeader;
}
/**
 * Converts an anchor element into a LinkPart.
 *
 * @param {object} node - the anchor node; only its `href` property is read.
 * @param {Array} children - already-parsed child parts, used as the link's inlines.
 * @returns {LinkPart} link pointing at `node.href`.
 */
function parseLink(node, children) {
    // NOTE(review): href comes from message HTML and is passed through
    // unchecked here; confirm that the URL scheme is validated elsewhere.
    const { href } = node;
    return new LinkPart(href, children);
}
/**
 * Converts a list element (UL or OL) into a ListBlock.
 *
 * Only direct LI children contribute items; any other child node (for example
 * whitespace text between items) is skipped. Each item is the array of parts
 * parsed from that LI's children.
 *
 * @param {object} node - the list element.
 * @returns {ListBlock} block whose start offset is always an integer: the
 *     parsed `start` attribute, or 1 when it is missing or not a number.
 */
function parseList(node) {
    // getAttribute yields a string (or null when absent). Normalise it so that
    // consumers always get a number instead of "3" in one case and 1 in another.
    const parsedStart = Number.parseInt(node.getAttribute("start"), 10);
    const start = Number.isNaN(parsedStart) ? 1 : parsedStart;

    const items = [];
    const children = node.childNodes;
    const count = children.length;
    for (let i = 0; i < count; i += 1) {
        const child = children[i];
        if (child.tagName !== "LI") {
            continue;
        }
        items.push(parseNodes(child.childNodes));
    }
    return new ListBlock(start, items);
}
/**
 * Converts a PRE element into a CodeBlock.
 *
 * The PRE is only accepted when its first child is a CODE element; otherwise
 * null is returned. The language is taken from the first class of the CODE
 * element that starts with "language-" but not with "language-_"; it is the
 * empty string when no class qualifies.
 *
 * @param {object} node - the PRE element.
 * @returns {CodeBlock|null} the code block, or null for an unsupported shape.
 */
function parseCodeBlock(node) {
    const codeNode = node.firstChild;
    if (!codeNode || codeNode.nodeName !== "CODE") {
        return null;
    }

    const prefix = "language-";
    const languageClass = Array.from(codeNode.classList).find(
        (name) => name.startsWith(prefix) && !name.startsWith("language-_")
    );
    const language = languageClass ? languageClass.substring(prefix.length) : "";

    return new CodeBlock(language, codeNode.textContent);
}
// Handler registered for IMG elements. It ignores the node and always returns
// null; parseNodes drops falsy results, so images currently produce no part.
// NOTE(review): presumably a placeholder until image support is added - confirm.
function parseImage(node) {
return null;
}
/**
 * Builds the lookup table that drives parsing, keyed by element nodeName.
 *
 * Each entry has the shape `{ descend, parsefn }`:
 *  - `descend`: whether the element's children are parsed first and handed to
 *    `parsefn` as its second argument (otherwise `parsefn` receives null).
 *  - `parsefn`: `(node, children)` function producing the part for the element.
 *
 * @returns {object} map from upper-case tag name to its entry.
 */
function buildNodeMap() {
    const map = {
        A: { descend: true, parsefn: parseLink },
        UL: { descend: false, parsefn: parseList },
        OL: { descend: false, parsefn: parseList },
        PRE: { descend: false, parsefn: parseCodeBlock },
        BR: { descend: false, parsefn: () => new NewLinePart() },
        HR: { descend: false, parsefn: () => new RulePart() },
        IMG: { descend: false, parsefn: parseImage },
    };
    for (const tag of basicNodes) {
        map[tag] = { descend: true, parsefn: basicWrapper(tag) };
    }
    for (let level = 1; level <= 6; level++) {
        // Lookups use node.nodeName, which is upper-case for HTML elements,
        // so the header keys must be "H1".."H6" (lower-case keys never match).
        const tag = `H${level}`;
        map[tag] = { descend: true, parsefn: headerWrapper(level) };
    }
    return map;
}
// Lookup table from element nodeName to its { descend, parsefn } entry.
// Built once when the module is evaluated and read by parseNode.
const nodes = buildNodeMap();
/**
 * Converts a single DOM node into a message part.
 *
 * Text nodes become a TextPart. Element nodes are dispatched through the
 * `nodes` table by nodeName; when the entry asks for it, the element's
 * children are parsed first and passed to the handler.
 *
 * @param {object} node - DOM node to convert.
 * @returns {object|null} the part produced by the handler, or null for node
 *     types and elements that have no handler (or whose handler returns null).
 */
function parseNode(node) {
    if (node.nodeType === Node.TEXT_NODE) {
        return new TextPart(node.nodeValue);
    }
    if (node.nodeType !== Node.ELEMENT_NODE) {
        return null;
    }

    // Only own keys count as handlers. Element names come from untrusted HTML,
    // and names such as "constructor" or "toString" (possible for elements in
    // foreign namespaces, which keep their lower-case name) would otherwise
    // resolve to Object.prototype members and crash on `parsefn`.
    const name = node.nodeName;
    const handler = Object.prototype.hasOwnProperty.call(nodes, name) ? nodes[name] : undefined;
    if (!handler) {
        return null;
    }

    const children = handler.descend ? parseNodes(node.childNodes) : null;
    return handler.parsefn(node, children);
}
/**
 * Converts a list of DOM nodes into message parts, in order.
 *
 * @param {ArrayLike} nodes - node list (or array) to convert; note that inside
 *     this function the parameter shadows the module-level `nodes` table.
 * @returns {Array} the parts for every node that parseNode could convert;
 *     nodes for which parseNode returns a falsy value are left out.
 */
function parseNodes(nodes) {
    return Array.from(nodes, (child) => parseNode(child)).filter(Boolean);
}
/**
 * Parses an HTML string into the internal message representation.
 *
 * The string is parsed with DOMParser as a "text/html" document and the
 * children of the resulting body are converted with parseNodes.
 *
 * @param {string} html - HTML source of the message.
 * @returns {Array} the parts parsed from the top-level nodes of the body.
 */
export function parse(html) {
    const parser = new DOMParser();
    const { body } = parser.parseFromString(html, "text/html");
    return parseNodes(body.childNodes);
}

View file

@ -38,10 +38,10 @@ export class TextMessageView extends BaseMessageView {
const formatFunction = { const formatFunction = {
header: headerBlock => tag["h" + Math.min(6,headerBlock.level)]({}, renderParts(headerBlock.inlines)), header: headerBlock => tag["h" + Math.min(6,headerBlock.level)]({}, renderParts(headerBlock.inlines)),
codeblock: codeBlock => tag.pre({}, tag.code({}, text(codeBlock.text))), codeblock: codeBlock => tag.pre({}, tag.code({}, text(codeBlock.text))),
emph: emphPart => tag.em({}, [renderPart(emphPart.wraps)]), emph: emphPart => tag.em({}, renderPart(emphPart.inlines)),
code: codePart => tag.code({}, text(codePart.text)), code: codePart => tag.code({}, text(codePart.text)),
text: textPart => text(textPart.text), text: textPart => text(textPart.text),
link: linkPart => tag.a({ href: linkPart.url, target: "_blank", rel: "noopener" }, [linkPart.text]), link: linkPart => tag.a({ href: linkPart.url, target: "_blank", rel: "noopener" }, renderParts(linkPart.inlines)),
newline: () => tag.br() newline: () => tag.br()
}; };