Begin a parser implementation from HTML into an internal representation.
This commit is contained in:
parent
db202b23ae
commit
94f6c99ea6
4 changed files with 142 additions and 30 deletions
|
@ -4,6 +4,12 @@
|
||||||
inspiration from [Pandoc's AST](https://hackage.haskell.org/package/pandoc-types-1.22/docs/Text-Pandoc-Definition.html#t:Block).
|
inspiration from [Pandoc's AST](https://hackage.haskell.org/package/pandoc-types-1.22/docs/Text-Pandoc-Definition.html#t:Block).
|
||||||
Then, much like Pandoc AST, we can turn our representation into
|
Then, much like Pandoc AST, we can turn our representation into
|
||||||
markdown in the editor, or HTML in the view.
|
markdown in the editor, or HTML in the view.
|
||||||
|
* This is good for both serialization and deserialization.
|
||||||
|
We can use the representation when going input -> formatted
|
||||||
|
message, which would allow other, non-web platforms
|
||||||
|
to rely on non-Markdown input types. We can also
|
||||||
|
use it going formatted message -> display, since a frontend
|
||||||
|
can then choose to use non-HTML output (GTK JS bindings?).
|
||||||
* As such, we represent formatting by nesting parts
|
* As such, we represent formatting by nesting parts
|
||||||
(we'd have an `ItalicsPart`, `BoldPart`, etc.)
|
(we'd have an `ItalicsPart`, `BoldPart`, etc.)
|
||||||
* We keep the "inline"/"block" distinction, but only
|
* We keep the "inline"/"block" distinction, but only
|
||||||
|
|
|
@ -12,7 +12,7 @@ export function parsePlainBody(body) {
|
||||||
// create callback outside of loop
|
// create callback outside of loop
|
||||||
const linkifyCallback = (text, isLink) => {
|
const linkifyCallback = (text, isLink) => {
|
||||||
if (isLink) {
|
if (isLink) {
|
||||||
parts.push(new LinkPart(text, text));
|
parts.push(new LinkPart(text, [new TextPart(text)]));
|
||||||
} else {
|
} else {
|
||||||
parts.push(new TextPart(text));
|
parts.push(new TextPart(text));
|
||||||
}
|
}
|
||||||
|
@ -36,65 +36,63 @@ export function stringAsBody(body) {
|
||||||
return new MessageBody(body, [new TextPart(body)]);
|
return new MessageBody(body, [new TextPart(body)]);
|
||||||
}
|
}
|
||||||
|
|
||||||
class HeaderBlock {
|
export class HeaderBlock {
|
||||||
constructor(level, inlines) {
|
constructor(level, inlines) {
|
||||||
this.level = level;
|
this.level = level;
|
||||||
this.inlines = inlines;
|
this.inlines = inlines;
|
||||||
}
|
}
|
||||||
|
|
||||||
get type() { return "header"; }
|
get type() { return "header"; }
|
||||||
isBlock() { return true; }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class CodeBlock {
|
export class CodeBlock {
|
||||||
constructor(text) {
|
constructor(language, text) {
|
||||||
|
this.language = language;
|
||||||
this.text = text;
|
this.text = text;
|
||||||
}
|
}
|
||||||
|
|
||||||
get type() { return "codeblock"; }
|
get type() { return "codeblock"; }
|
||||||
isBlock() { return true; }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class NewLinePart {
|
export class ListBlock {
|
||||||
|
constructor(startOffset, items) {
|
||||||
|
this.items = items;
|
||||||
|
this.startOffset = startOffset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
export class RulePart {
|
||||||
|
get type( ) { return "rule"; }
|
||||||
|
}
|
||||||
|
|
||||||
|
export class NewLinePart {
|
||||||
get type() { return "newline"; }
|
get type() { return "newline"; }
|
||||||
isBlock() { return false; }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class EmphPart {
|
export class FormatPart {
|
||||||
constructor(wraps) {
|
constructor(format, children) {
|
||||||
this.wraps = wraps;
|
this.format = format;
|
||||||
|
this.children = children;
|
||||||
}
|
}
|
||||||
|
|
||||||
get type() { return "emph"; }
|
get type() { return "format"; }
|
||||||
isBlock() { return false; }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class CodePart {
|
export class LinkPart {
|
||||||
constructor(text) {
|
constructor(url, inlines) {
|
||||||
this.text = text;
|
|
||||||
}
|
|
||||||
|
|
||||||
get type() { return "code"; }
|
|
||||||
isBlock() { return false; }
|
|
||||||
}
|
|
||||||
|
|
||||||
class LinkPart {
|
|
||||||
constructor(url, text) {
|
|
||||||
this.url = url;
|
this.url = url;
|
||||||
this.text = text;
|
this.inlines = inlines;
|
||||||
}
|
}
|
||||||
|
|
||||||
get type() { return "link"; }
|
get type() { return "link"; }
|
||||||
isBlock() { return false; }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class TextPart {
|
export class TextPart {
|
||||||
constructor(text) {
|
constructor(text) {
|
||||||
this.text = text;
|
this.text = text;
|
||||||
}
|
}
|
||||||
|
|
||||||
get type() { return "text"; }
|
get type() { return "text"; }
|
||||||
isBlock() { return false; }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class MessageBody {
|
class MessageBody {
|
||||||
|
|
108
src/platform/web/dom/deserialize.js
Normal file
108
src/platform/web/dom/deserialize.js
Normal file
|
@ -0,0 +1,108 @@
|
||||||
|
import { HeaderBlock, ListBlock, CodeBlock, FormatPart, NewLinePart, RulePart, TextPart, LinkPart } from "../../../domain/session/room/timeline/MessageBody.js"
|
||||||
|
|
||||||
|
|
||||||
|
/* At the time of writing (Jul 1 2021), Matrix Spec recommends
|
||||||
|
* allowing the following HTML tags:
|
||||||
|
* font, del, h1, h2, h3, h4, h5, h6, blockquote, p, a, ul, ol, sup, sub, li, b, i, u,
|
||||||
|
* strong, em, strike, code, hr, br, div, table, thead, tbody, tr, th, td, caption, pre, span, img
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Tag names (upper-case) that buildNodeMap registers with basicWrapper:
// each one becomes a FormatPart carrying the tag name, and its children are parsed first.
const basicNodes = ["EM", "STRONG", "CODE", "DEL", "P", "DIV", "SPAN" ]
|
||||||
|
|
||||||
|
/**
 * Creates the parse callback for a simple formatting tag.
 *
 * @param {string} tag - tag name stored as the FormatPart's format
 * @returns {function(*, Array): FormatPart} callback that ignores the DOM node
 *   and wraps the already-parsed children in a FormatPart
 */
function basicWrapper(tag) {
    return function (_node, children) {
        return new FormatPart(tag, children);
    };
}
|
||||||
|
|
||||||
|
/**
 * Creates the parse callback for a header of the given level.
 *
 * @param {number} level - header level passed to HeaderBlock
 * @returns {function(*, Array): HeaderBlock} callback that ignores the DOM node
 *   and wraps the already-parsed children in a HeaderBlock
 */
function headerWrapper(level) {
    return function (_node, children) {
        return new HeaderBlock(level, children);
    };
}
|
||||||
|
|
||||||
|
/**
 * Builds a LinkPart from an anchor element.
 *
 * @param {*} node - element whose `href` property supplies the link target
 *   (presumably an HTMLAnchorElement; verify against caller)
 * @param {Array} children - already-parsed child parts used as the link's inlines
 * @returns {LinkPart}
 */
function parseLink(node, children) {
    const url = node.href;
    return new LinkPart(url, children);
}
|
||||||
|
|
||||||
|
/**
 * Converts a list element into a ListBlock.
 *
 * Only direct children whose tagName is "LI" become items; every other child
 * (text nodes, stray elements) is skipped. Each item is the array of parts
 * produced by parseNodes for that <li>'s children.
 *
 * @param {*} node - list element; must provide getAttribute() and childNodes
 * @returns {ListBlock} block whose start offset is always a number
 */
function parseList(node) {
    // getAttribute returns a string or null. Normalise it so the start offset is
    // a number in every case, falling back to 1 when missing or not numeric.
    const parsedStart = Number.parseInt(node.getAttribute("start"), 10);
    const start = Number.isNaN(parsedStart) ? 1 : parsedStart;
    const items = [];
    for (const child of node.childNodes) {
        if (child.tagName !== "LI") {
            continue;
        }
        items.push(parseNodes(child.childNodes));
    }
    return new ListBlock(start, items);
}
|
||||||
|
|
||||||
|
/**
 * Converts a <pre> element into a CodeBlock.
 *
 * The element is only accepted when its first child node is a CODE element;
 * otherwise null is returned. The language is taken from the first class on
 * that CODE element that starts with "language-" but not with "language-_";
 * when no class qualifies the language is the empty string.
 *
 * @param {*} node - element whose firstChild is inspected
 * @returns {CodeBlock|null} the code block, or null when the first child is not CODE
 */
function parseCodeBlock(node) {
    const codeNode = node.firstChild;
    if (!codeNode || codeNode.nodeName !== "CODE") {
        return null;
    }
    const prefix = "language-";
    let language = "";
    for (const clname of codeNode.classList) {
        if (clname.startsWith(prefix) && !clname.startsWith("language-_")) {
            language = clname.substring(prefix.length);
            break;
        }
    }
    return new CodeBlock(language, codeNode.textContent);
}
|
||||||
|
|
||||||
|
/**
 * Placeholder handler for IMG elements.
 *
 * @param {*} node - the image element (currently unused)
 * @returns {null} always null, so no part is produced for the node
 */
function parseImage(node) {
    // No image part is produced yet; returning null makes the caller skip the node.
    return null;
}
|
||||||
|
|
||||||
|
/**
 * Builds the lookup table from element nodeName to its handler.
 *
 * Each entry is `{ descend, parsefn }`: `parsefn` is the handler for the
 * element and `descend` records whether its children should be parsed before
 * the handler is invoked.
 *
 * Keys are upper-case because HTML element node names are reported in upper
 * case. The header entries were previously keyed "h1".."h6", which such a
 * lookup can never match.
 *
 * @returns {Object<string, {descend: boolean, parsefn: Function}>} handler table
 *   with a null prototype, so inherited names like "constructor" never resolve
 */
function buildNodeMap() {
    const map = Object.assign(Object.create(null), {
        A: { descend: true, parsefn: parseLink },
        UL: { descend: false, parsefn: parseList },
        OL: { descend: false, parsefn: parseList },
        PRE: { descend: false, parsefn: parseCodeBlock },
        BR: { descend: false, parsefn: () => new NewLinePart() },
        HR: { descend: false, parsefn: () => new RulePart() },
        IMG: { descend: false, parsefn: parseImage },
    });
    for (const tag of basicNodes) {
        map[tag] = { descend: true, parsefn: basicWrapper(tag) };
    }
    for (let level = 1; level <= 6; level++) {
        map["H" + level] = { descend: true, parsefn: headerWrapper(level) };
    }
    return map;
}
|
||||||
|
|
||||||
|
// Handler table keyed by element nodeName, built once when the module is evaluated.
// Note: parseNodes declares a parameter with this same name, which shadows it there.
const nodes = buildNodeMap();
|
||||||
|
|
||||||
|
/**
 * Converts a single DOM node into a message part.
 *
 * Text nodes become a TextPart. Element nodes are dispatched through the
 * `nodes` handler table by nodeName; when the handler's `descend` flag is set,
 * the element's children are parsed first and passed to the handler, otherwise
 * the handler receives null for the children. Elements without a handler and
 * all other node types yield null; an unhandled element's children are not
 * visited.
 *
 * @param {*} node - DOM node to convert
 * @returns {*} the parsed part, or null when the node is not handled
 */
function parseNode(node) {
    if (node.nodeType === Node.TEXT_NODE) {
        return new TextPart(node.nodeValue);
    }
    if (node.nodeType !== Node.ELEMENT_NODE) {
        return null;
    }
    // Only own entries count: a plain-object table would otherwise resolve names
    // such as "constructor" to inherited members that have no parsefn.
    if (!Object.prototype.hasOwnProperty.call(nodes, node.nodeName)) {
        return null;
    }
    const handler = nodes[node.nodeName];
    if (!handler) {
        return null;
    }
    const children = handler.descend ? parseNodes(node.childNodes) : null;
    return handler.parsefn(node, children);
}
|
||||||
|
|
||||||
|
/**
 * Converts a list of DOM nodes into message parts, in order.
 *
 * Each node goes through parseNode; nodes for which it returns a falsy value
 * are left out of the result.
 *
 * @param {*} nodes - indexable collection of DOM nodes with a `length`
 * @returns {Array} the parts produced for the handled nodes
 */
function parseNodes(nodes) {
    const result = [];
    const count = nodes.length;
    let index = 0;
    while (index < count) {
        const part = parseNode(nodes[index]);
        if (part) {
            result.push(part);
        }
        index += 1;
    }
    return result;
}
|
||||||
|
|
||||||
|
/**
 * Parses an HTML string into message parts.
 *
 * The string is parsed as a "text/html" document with DOMParser, and the child
 * nodes of the resulting document body are converted with parseNodes.
 *
 * @param {string} html - HTML source to parse
 * @returns {Array} parts for the top-level nodes of the parsed body
 */
export function parse(html) {
    const parser = new DOMParser();
    const parsedDocument = parser.parseFromString(html, "text/html");
    const rootNode = parsedDocument.body;
    return parseNodes(rootNode.childNodes);
}
|
|
@ -38,10 +38,10 @@ export class TextMessageView extends BaseMessageView {
|
||||||
const formatFunction = {
|
const formatFunction = {
|
||||||
header: headerBlock => tag["h" + Math.min(6,headerBlock.level)]({}, renderParts(headerBlock.inlines)),
|
header: headerBlock => tag["h" + Math.min(6,headerBlock.level)]({}, renderParts(headerBlock.inlines)),
|
||||||
codeblock: codeBlock => tag.pre({}, tag.code({}, text(codeBlock.text))),
|
codeblock: codeBlock => tag.pre({}, tag.code({}, text(codeBlock.text))),
|
||||||
emph: emphPart => tag.em({}, [renderPart(emphPart.wraps)]),
|
emph: emphPart => tag.em({}, renderPart(emphPart.inlines)),
|
||||||
code: codePart => tag.code({}, text(codePart.text)),
|
code: codePart => tag.code({}, text(codePart.text)),
|
||||||
text: textPart => text(textPart.text),
|
text: textPart => text(textPart.text),
|
||||||
link: linkPart => tag.a({ href: linkPart.url, target: "_blank", rel: "noopener" }, [linkPart.text]),
|
link: linkPart => tag.a({ href: linkPart.url, target: "_blank", rel: "noopener" }, renderParts(linkPart.inlines)),
|
||||||
newline: () => tag.br()
|
newline: () => tag.br()
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Reference in a new issue