Begin a parser implementation from HTML into an internal representation.
This commit is contained in:
parent
db202b23ae
commit
94f6c99ea6
4 changed files with 142 additions and 30 deletions
|
@ -4,6 +4,12 @@
|
|||
inspiration from [Pandoc's AST](https://hackage.haskell.org/package/pandoc-types-1.22/docs/Text-Pandoc-Definition.html#t:Block).
|
||||
Then, much like Pandoc AST, we can turn our representation into
|
||||
markdown in the editor, or HTML in the view.
|
||||
* This is good for both serialization and deserialization.
|
||||
We can use the representation when going input -> formatted
|
||||
message, which would allow other, non-web platforms
|
||||
to rely on non-Markdown input types. We can also
|
||||
use it going formatted message -> display, since a frontend
|
||||
can then choose to use non-HTML output (GTK JS bindings?).
|
||||
* As such, we represent formatting by nesting parts
|
||||
(we'd have an `ItalicsPart`, `BoldPart`, etc.)
|
||||
* We keep the "inline"/"block" distinction, but only
|
||||
|
|
|
@ -12,7 +12,7 @@ export function parsePlainBody(body) {
|
|||
// create callback outside of loop
|
||||
const linkifyCallback = (text, isLink) => {
|
||||
if (isLink) {
|
||||
parts.push(new LinkPart(text, text));
|
||||
parts.push(new LinkPart(text, [new TextPart(text)]));
|
||||
} else {
|
||||
parts.push(new TextPart(text));
|
||||
}
|
||||
|
@ -36,65 +36,63 @@ export function stringAsBody(body) {
|
|||
return new MessageBody(body, [new TextPart(body)]);
|
||||
}
|
||||
|
||||
class HeaderBlock {
|
||||
/**
 * Heading node of the message representation.
 *
 * Stores the heading `level` and the `inlines` (the parts that make up the
 * heading's content). Reports the type "header" and identifies itself as a
 * block-level node.
 */
export class HeaderBlock {
    /**
     * @param level heading level
     * @param inlines parts contained in the heading
     */
    constructor(level, inlines) {
        Object.assign(this, { level, inlines });
    }

    get type() {
        return "header";
    }

    isBlock() {
        return true;
    }
}
|
||||
|
||||
class CodeBlock {
|
||||
constructor(text) {
|
||||
/**
 * Block of preformatted code.
 *
 * Keeps the `language` label and the raw `text` of the code. Reports the
 * type "codeblock" and identifies itself as a block-level node.
 */
export class CodeBlock {
    /**
     * @param language language label for the code
     * @param text the code itself
     */
    constructor(language, text) {
        Object.assign(this, { language, text });
    }

    get type() {
        return "codeblock";
    }

    isBlock() {
        return true;
    }
}
|
||||
|
||||
class NewLinePart {
|
||||
/**
 * List node of the message representation.
 *
 * `items` holds one entry per list item and `startOffset` is the number the
 * list starts counting from. Like the other block classes in this file it
 * exposes a `type` (so a renderer keyed on `type` can dispatch on it) and
 * identifies itself as block-level.
 */
export class ListBlock {
    /**
     * @param startOffset number the list starts at
     * @param items parsed content of each list item
     */
    constructor(startOffset, items) {
        this.items = items;
        this.startOffset = startOffset;
    }

    get type() { return "list"; }
    isBlock() { return true; }
}
|
||||
|
||||
/**
 * Horizontal-rule node. Carries no data; it only reports the type "rule".
 */
export class RulePart {
    get type() {
        return "rule";
    }
}
|
||||
|
||||
/**
 * Line-break node. Carries no data; reports the type "newline" and
 * identifies itself as an inline (non-block) node.
 */
export class NewLinePart {
    get type() {
        return "newline";
    }

    isBlock() {
        return false;
    }
}
|
||||
|
||||
class EmphPart {
|
||||
constructor(wraps) {
|
||||
this.wraps = wraps;
|
||||
/**
 * Inline formatting node: wraps `children` in the formatting named by
 * `format` (the HTML parser passes the tag name here).
 *
 * The class previously carried two `type` getters; the stale "emph" one was
 * silently overridden by "format", so only the effective getter is kept.
 */
export class FormatPart {
    /**
     * @param format name of the formatting applied to the children
     * @param children parts wrapped by this formatting
     */
    constructor(format, children) {
        this.format = format;
        this.children = children;
    }

    get type() { return "format"; }
    isBlock() { return false; }
}
|
||||
|
||||
/**
 * Inline code node. Holds the code `text`, reports the type "code" and
 * identifies itself as an inline (non-block) node.
 */
class CodePart {
    /**
     * @param text the inline code content
     */
    constructor(text) {
        Object.assign(this, { text });
    }

    get type() {
        return "code";
    }

    isBlock() {
        return false;
    }
}
|
||||
|
||||
class LinkPart {
|
||||
constructor(url, text) {
|
||||
/**
 * Hyperlink node: a target `url` plus the `inlines` rendered as the link's
 * content.
 *
 * The constructor no longer assigns `this.text`: `text` stopped being a
 * parameter when the signature became (url, inlines), so that assignment
 * referenced an undefined name and threw a ReferenceError on construction.
 */
export class LinkPart {
    /**
     * @param url link target
     * @param inlines parts that make up the link's content
     */
    constructor(url, inlines) {
        this.url = url;
        this.inlines = inlines;
    }

    get type() { return "link"; }
    isBlock() { return false; }
}
|
||||
|
||||
class TextPart {
|
||||
/**
 * Plain-text node. Holds the `text`, reports the type "text" and identifies
 * itself as an inline (non-block) node.
 */
export class TextPart {
    /**
     * @param text the text content
     */
    constructor(text) {
        Object.assign(this, { text });
    }

    get type() {
        return "text";
    }

    isBlock() {
        return false;
    }
}
|
||||
|
||||
class MessageBody {
|
||||
|
|
108
src/platform/web/dom/deserialize.js
Normal file
108
src/platform/web/dom/deserialize.js
Normal file
|
@ -0,0 +1,108 @@
|
|||
import { HeaderBlock, ListBlock, CodeBlock, FormatPart, NewLinePart, RulePart, TextPart, LinkPart } from "../../../domain/session/room/timeline/MessageBody.js"
|
||||
|
||||
|
||||
/* At the time of writing (Jul 1 2021), Matrix Spec recommends
|
||||
* allowing the following HTML tags:
|
||||
* font, del, h1, h2, h3, h4, h5, h6, blockquote, p, a, ul, ol, sup, sub, li, b, i, u,
|
||||
* strong, em, strike, code, hr, br, div, table, thead, tbody, tr, th, td, caption, pre, span, img
|
||||
*/
|
||||
|
||||
// Tags that buildNodeMap() registers with basicWrapper(): each one is parsed
// into a FormatPart carrying the tag name and the element's parsed children.
// The names are matched against node.nodeName in parseNode().
const basicNodes = ["EM", "STRONG", "CODE", "DEL", "P", "DIV", "SPAN" ]
|
||||
|
||||
/**
 * Builds a parse function for a simple formatting tag.
 *
 * @param tag tag name stored as the resulting FormatPart's format
 * @returns a function (node, children) that ignores the node and wraps the
 *          already-parsed children in a FormatPart for `tag`
 */
function basicWrapper(tag) {
    return function wrapInFormat(_node, children) {
        return new FormatPart(tag, children);
    };
}
|
||||
|
||||
/**
 * Builds a parse function for a heading of a fixed level.
 *
 * @param level heading level passed on to HeaderBlock
 * @returns a function (node, children) that ignores the node and wraps the
 *          already-parsed children in a HeaderBlock of that level
 */
function headerWrapper(level) {
    return function wrapInHeader(_node, children) {
        return new HeaderBlock(level, children);
    };
}
|
||||
|
||||
/**
 * Turns an anchor element into a LinkPart.
 *
 * @param node the anchor element; its `href` property becomes the link url
 * @param children already-parsed content of the anchor
 * @returns a LinkPart for the anchor
 */
function parseLink(node, children) {
    // NOTE(review): the href is taken as-is with no scheme filtering here.
    // The input is presumably untrusted message HTML — confirm the url is
    // sanitized before it ends up in a rendered link.
    const { href } = node;
    return new LinkPart(href, children);
}
|
||||
|
||||
/**
 * Turns a list element into a ListBlock.
 *
 * Only direct `LI` children contribute items; any other child node (for
 * example whitespace text between items) is skipped. Each item is the parsed
 * content of that `LI`.
 *
 * @param node the list element (registered for both UL and OL)
 * @returns a ListBlock whose start offset is always an integer
 */
function parseList(node) {
    // getAttribute returns a string (or null when absent), so parse it to keep
    // the start offset numeric in every case; fall back to 1 when the
    // attribute is missing or not a number.
    const parsedStart = Number.parseInt(node.getAttribute("start"), 10);
    const start = Number.isNaN(parsedStart) ? 1 : parsedStart;
    const items = Array.from(node.childNodes)
        .filter(child => child.tagName === "LI")
        .map(item => parseNodes(item.childNodes));
    return new ListBlock(start, items);
}
|
||||
|
||||
/**
 * Turns a `pre` element into a CodeBlock.
 *
 * When the first child of the `pre` is a `code` element, the language is
 * taken from the first of its classes that starts with "language-" and the
 * block text is the `code` element's text. Otherwise the `pre`'s own text is
 * used with an empty language.
 *
 * @param node the `pre` element
 * @returns a CodeBlock holding the detected language ("" if none) and text
 */
function parseCodeBlock(node) {
    const codeNode = node.firstChild;
    if (!codeNode || codeNode.nodeName !== "CODE") {
        // Returning null here would make parseNodes() drop the node, losing
        // the preformatted text entirely; keep it as an unlabelled block.
        return new CodeBlock("", node.textContent);
    }
    const prefix = "language-";
    let language = "";
    for (const clname of codeNode.classList) {
        // Classes starting with "language-_" are deliberately not treated as
        // a language name (presumably reserved; rule kept as it was).
        if (clname.startsWith(prefix) && !clname.startsWith("language-_")) {
            language = clname.substring(prefix.length);
            break;
        }
    }
    return new CodeBlock(language, codeNode.textContent);
}
|
||||
|
||||
/**
 * Placeholder handler for `img` elements: images are not converted yet.
 *
 * @param node the image element (currently unused)
 * @returns always null, which makes parseNodes() leave the image out
 */
function parseImage(node) {
    const noPart = null;
    return noPart;
}
|
||||
|
||||
/**
 * Builds the dispatch table that maps an element name to its handler.
 *
 * Each entry is `{ descend, parsefn }`: when `descend` is true, parseNode()
 * parses the element's children first and passes them to `parsefn`;
 * otherwise `parsefn` receives null and handles the node itself.
 *
 * Keys are upper-case because parseNode() looks entries up by
 * `node.nodeName`, which is upper-case for HTML elements. Heading entries
 * were previously registered as "h1".."h6" and therefore never matched.
 *
 * @returns the tag-name -> handler map
 */
function buildNodeMap() {
    const map = {
        A: { descend: true, parsefn: parseLink },
        UL: { descend: false, parsefn: parseList },
        OL: { descend: false, parsefn: parseList },
        PRE: { descend: false, parsefn: parseCodeBlock },
        BR: { descend: false, parsefn: () => new NewLinePart() },
        HR: { descend: false, parsefn: () => new RulePart() },
        IMG: { descend: false, parsefn: parseImage },
    };
    for (const tag of basicNodes) {
        map[tag] = { descend: true, parsefn: basicWrapper(tag) };
    }
    for (let level = 1; level <= 6; level++) {
        const tag = "H" + level;
        map[tag] = { descend: true, parsefn: headerWrapper(level) };
    }
    return map;
}
|
||||
|
||||
// Tag-name -> { descend, parsefn } dispatch table consulted by parseNode();
// built once when the module is evaluated.
const nodes = buildNodeMap();
|
||||
|
||||
/**
 * Converts a single DOM node into a part of the message representation.
 *
 * Text nodes become a TextPart. Element nodes are dispatched through the
 * `nodes` table by `nodeName`; if the handler has `descend` set, the
 * element's children are parsed first and handed to the handler, otherwise
 * it receives null. Elements without a handler and all other node types
 * yield null.
 *
 * @param node the DOM node to convert
 * @returns the parsed part, or null when the node is not converted
 */
function parseNode(node) {
    const { nodeType } = node;
    if (nodeType === Node.TEXT_NODE) {
        return new TextPart(node.nodeValue);
    }
    if (nodeType !== Node.ELEMENT_NODE) {
        return null;
    }
    const handler = nodes[node.nodeName];
    if (!handler) {
        return null;
    }
    const children = handler.descend ? parseNodes(node.childNodes) : null;
    return handler.parsefn(node, children);
}
|
||||
|
||||
/**
 * Converts a list of DOM nodes, keeping only those that produce a part.
 *
 * @param nodes array-like collection of DOM nodes (e.g. `childNodes`)
 * @returns array of parsed parts in document order; nodes for which
 *          parseNode() returns a falsy value are left out
 */
function parseNodes(nodes) {
    return Array.from(nodes, child => parseNode(child)).filter(Boolean);
}
|
||||
|
||||
/**
 * Parses an HTML string into the internal message representation.
 *
 * The string is parsed as a "text/html" document with DOMParser and the
 * children of the resulting body are converted with parseNodes().
 *
 * @param html the HTML source
 * @returns array of parsed parts for the top-level nodes
 */
export function parse(html) {
    const parser = new DOMParser();
    const doc = parser.parseFromString(html, "text/html");
    return parseNodes(doc.body.childNodes);
}
|
|
@ -38,10 +38,10 @@ export class TextMessageView extends BaseMessageView {
|
|||
// Renderers keyed by a part's `type`. Each takes the part and returns the
// rendered output built with `tag` / `text`.
// The `emph` and `link` keys were each listed twice; in an object literal
// the last definition wins, so only those effective definitions are kept.
// NOTE(review): FormatPart reports type "format" and stores `children`, but
// no entry here handles "format", and `emph` reads `inlines` through
// renderPart (singular) — confirm the intended wiring.
const formatFunction = {
    header: headerBlock => {
        const level = Math.min(6, headerBlock.level);
        return tag["h" + level]({}, renderParts(headerBlock.inlines));
    },
    codeblock: codeBlock => {
        const code = tag.code({}, text(codeBlock.text));
        return tag.pre({}, code);
    },
    emph: emphPart => tag.em({}, renderPart(emphPart.inlines)),
    code: codePart => tag.code({}, text(codePart.text)),
    text: textPart => text(textPart.text),
    link: linkPart => {
        const attributes = { href: linkPart.url, target: "_blank", rel: "noopener" };
        return tag.a(attributes, renderParts(linkPart.inlines));
    },
    newline: () => tag.br()
};
|
||||
|
||||
|
|
Reference in a new issue