From ad868818c71e61e53f8374f0ec90d09743e93a8e Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Thu, 1 Jul 2021 00:18:39 -0700 Subject: [PATCH 01/69] Add some initial thoughts on the implementation. --- src/domain/session/room/timeline/FORMATTED.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 src/domain/session/room/timeline/FORMATTED.md diff --git a/src/domain/session/room/timeline/FORMATTED.md b/src/domain/session/room/timeline/FORMATTED.md new file mode 100644 index 00000000..1d72a71a --- /dev/null +++ b/src/domain/session/room/timeline/FORMATTED.md @@ -0,0 +1,19 @@ +# Ideas for formatted messages + +* Seems like a good idea to take some + inspiration from [Pandoc's AST](https://hackage.haskell.org/package/pandoc-types-1.22/docs/Text-Pandoc-Definition.html#t:Block). + Then, much like Pandoc AST, we can turn our representation into + markdown in the editor, or HTML in the view. +* As such, we represent formatting by nesting parts + (we'd have a `ItalicsPart`, `BoldPart`, etc.) +* We keep the "inline"/"block" distinction, but only + track it as a property, so that we can avoid adding + block parts to elements that cannot contain blocks + (headers, for instance, cannot contain blocks, but + lists -- themselves blocks -- can). +* When parsing, we may need some sort of "permanent" context: + if we're parsing a header, and we are inside 3 layers of other + "inline" things (italics, strikethrough, and bold, for example), + and we encounter a block, that's still not valid. + * Element seems to not care at all about the validity of what + it's parsing. Do we assume the HTML is well-formatted, then? From fd12baae3b6e1adcbede51635240698b91d91ffa Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Thu, 1 Jul 2021 00:19:01 -0700 Subject: [PATCH 02/69] Add some other message parts as demo. 
--- .../session/room/timeline/MessageBody.js | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/domain/session/room/timeline/MessageBody.js b/src/domain/session/room/timeline/MessageBody.js index 42f3c951..98a54634 100644 --- a/src/domain/session/room/timeline/MessageBody.js +++ b/src/domain/session/room/timeline/MessageBody.js @@ -36,8 +36,46 @@ export function stringAsBody(body) { return new MessageBody(body, [new TextPart(body)]); } +class HeaderBlock { + constructor(level, inlines) { + this.level = level; + this.inlines = inlines; + } + + get type() { return "header"; } + isBlock() { return true; } +} + +class CodeBlock { + constructor(text) { + this.text = text; + } + + get type() { return "codeblock"; } + isBlock() { return true; } +} + class NewLinePart { get type() { return "newline"; } + isBlock() { return false; } +} + +class EmphPart { + constructor(wraps) { + this.wraps = wraps; + } + + get type() { return "emph"; } + isBlock() { return false; } +} + +class CodePart { + constructor(wraps) { + this.wraps = wraps; + } + + get type() { return "code"; } + isBlock() { return false; } } class LinkPart { @@ -47,6 +85,7 @@ class LinkPart { } get type() { return "link"; } + isBlock() { return false; } } class TextPart { @@ -55,6 +94,7 @@ class TextPart { } get type() { return "text"; } + isBlock() { return false; } } class MessageBody { From 4ee15005f5e064cffacbd204d3c84bd15bada977 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Thu, 1 Jul 2021 00:44:41 -0700 Subject: [PATCH 03/69] Inline code blocks don't contain other elements --- src/domain/session/room/timeline/MessageBody.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/domain/session/room/timeline/MessageBody.js b/src/domain/session/room/timeline/MessageBody.js index 98a54634..33f5d196 100644 --- a/src/domain/session/room/timeline/MessageBody.js +++ b/src/domain/session/room/timeline/MessageBody.js @@ -70,8 +70,8 @@ class EmphPart { } class CodePart { 
- constructor(wraps) { - this.wraps = wraps; + constructor(text) { + this.text = text; } get type() { return "code"; } From cc506756a16515c4d150d837fef5d09e7c33dd0e Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Thu, 1 Jul 2021 00:45:04 -0700 Subject: [PATCH 04/69] Add code tag to HTML --- src/platform/web/ui/general/html.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/platform/web/ui/general/html.js b/src/platform/web/ui/general/html.js index 19b670d7..e4d6b383 100644 --- a/src/platform/web/ui/general/html.js +++ b/src/platform/web/ui/general/html.js @@ -94,7 +94,7 @@ export const TAG_NAMES = { [HTML_NS]: [ "br", "a", "ol", "ul", "li", "div", "h1", "h2", "h3", "h4", "h5", "h6", "p", "strong", "em", "span", "img", "section", "main", "article", "aside", - "pre", "button", "time", "input", "textarea", "label", "form", "progress", "output", "video"], + "pre", "code", "button", "time", "input", "textarea", "label", "form", "progress", "output", "video"], [SVG_NS]: ["svg", "circle"] }; From db202b23ae602665268df69e1728105134df3cf6 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Thu, 1 Jul 2021 00:45:46 -0700 Subject: [PATCH 05/69] Add some prototype rendering implementations. 
--- .../ui/session/room/timeline/TextMessageView.js | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/platform/web/ui/session/room/timeline/TextMessageView.js b/src/platform/web/ui/session/room/timeline/TextMessageView.js index 88d97b3f..9bad828e 100644 --- a/src/platform/web/ui/session/room/timeline/TextMessageView.js +++ b/src/platform/web/ui/session/room/timeline/TextMessageView.js @@ -36,18 +36,29 @@ export class TextMessageView extends BaseMessageView { * Map from part to function that outputs DOM for the part */ const formatFunction = { + header: headerBlock => tag["h" + Math.min(6,headerBlock.level)]({}, renderParts(headerBlock.inlines)), + codeblock: codeBlock => tag.pre({}, tag.code({}, text(codeBlock.text))), + emph: emphPart => tag.em({}, [renderPart(emphPart.wraps)]), + code: codePart => tag.code({}, text(codePart.text)), text: textPart => text(textPart.text), link: linkPart => tag.a({ href: linkPart.url, target: "_blank", rel: "noopener" }, [linkPart.text]), newline: () => tag.br() }; +function renderPart(part) { + const f = formatFunction[part.type]; + return f(part); +} + +function renderParts(parts) { + return Array.from(parts, renderPart); +} + class BodyView extends StaticView { render(t, messageBody) { const container = t.span(); for (const part of messageBody.parts) { - const f = formatFunction[part.type]; - const element = f(part); - container.appendChild(element); + container.appendChild(renderPart(part)); } return container; } From 94f6c99ea6df91a13162e5dae4a989bf46faf282 Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Fri, 2 Jul 2021 00:18:37 -0700 Subject: [PATCH 06/69] Begin a parser implementation from HTML into an internal representation. 
--- src/domain/session/room/timeline/FORMATTED.md | 6 + .../session/room/timeline/MessageBody.js | 54 +++++---- src/platform/web/dom/deserialize.js | 108 ++++++++++++++++++ .../session/room/timeline/TextMessageView.js | 4 +- 4 files changed, 142 insertions(+), 30 deletions(-) create mode 100644 src/platform/web/dom/deserialize.js diff --git a/src/domain/session/room/timeline/FORMATTED.md b/src/domain/session/room/timeline/FORMATTED.md index 1d72a71a..576d2729 100644 --- a/src/domain/session/room/timeline/FORMATTED.md +++ b/src/domain/session/room/timeline/FORMATTED.md @@ -4,6 +4,12 @@ inspiration from [Pandoc's AST](https://hackage.haskell.org/package/pandoc-types-1.22/docs/Text-Pandoc-Definition.html#t:Block). Then, much like Pandoc AST, we can turn our representation into markdown in the editor, or HTML in the view. + * This is good for both serialization and deserialization. + We can use the representation when going input -> formatted + message, which would allow other, non-web platforms + to rely on non-Markdown input types. We can also + use it going formatted message -> display, since a frontend + can then choose to use non-HTML output (GTK JS bindings?). * As such, we represent formatting by nesting parts (we'd have a `ItalicsPart`, `BoldPart`, etc.) 
* We keep the "inline"/"block" distinction, but only diff --git a/src/domain/session/room/timeline/MessageBody.js b/src/domain/session/room/timeline/MessageBody.js index 33f5d196..d8efb4d9 100644 --- a/src/domain/session/room/timeline/MessageBody.js +++ b/src/domain/session/room/timeline/MessageBody.js @@ -12,7 +12,7 @@ export function parsePlainBody(body) { // create callback outside of loop const linkifyCallback = (text, isLink) => { if (isLink) { - parts.push(new LinkPart(text, text)); + parts.push(new LinkPart(text, [new TextPart(text)])); } else { parts.push(new TextPart(text)); } @@ -36,65 +36,63 @@ export function stringAsBody(body) { return new MessageBody(body, [new TextPart(body)]); } -class HeaderBlock { +export class HeaderBlock { constructor(level, inlines) { this.level = level; this.inlines = inlines; } get type() { return "header"; } - isBlock() { return true; } } -class CodeBlock { - constructor(text) { +export class CodeBlock { + constructor(language, text) { + this.language = language; this.text = text; } get type() { return "codeblock"; } - isBlock() { return true; } } -class NewLinePart { +export class ListBlock { + constructor(startOffset, items) { + this.items = items; + this.startOffset = startOffset; + } +} + +export class RulePart { + get type( ) { return "rule"; } +} + +export class NewLinePart { get type() { return "newline"; } - isBlock() { return false; } } -class EmphPart { - constructor(wraps) { - this.wraps = wraps; +export class FormatPart { + constructor(format, children) { + this.format = format; + this.children = children; } - get type() { return "emph"; } - isBlock() { return false; } + get type() { return "format"; } } -class CodePart { - constructor(text) { - this.text = text; - } - - get type() { return "code"; } - isBlock() { return false; } -} - -class LinkPart { - constructor(url, text) { +export class LinkPart { + constructor(url, inlines) { this.url = url; - this.text = text; + this.inlines = inlines; } get type() { 
return "link"; } - isBlock() { return false; } } -class TextPart { +export class TextPart { constructor(text) { this.text = text; } get type() { return "text"; } - isBlock() { return false; } } class MessageBody { diff --git a/src/platform/web/dom/deserialize.js b/src/platform/web/dom/deserialize.js new file mode 100644 index 00000000..a848cb16 --- /dev/null +++ b/src/platform/web/dom/deserialize.js @@ -0,0 +1,108 @@ +import { HeaderBlock, ListBlock, CodeBlock, FormatPart, NewLinePart, RulePart, TextPart, LinkPart } from "../../../domain/session/room/timeline/MessageBody.js" + + +/* At the time of writing (Jul 1 2021), Matrix Spec recommends + * allowing the following HTML tags: + * font, del, h1, h2, h3, h4, h5, h6, blockquote, p, a, ul, ol, sup, sub, li, b, i, u, + * strong, em, strike, code, hr, br, div, table, thead, tbody, tr, th, td, caption, pre, span, img + */ + +const basicNodes = ["EM", "STRONG", "CODE", "DEL", "P", "DIV", "SPAN" ] + +function basicWrapper(tag) { + return (_, children) => new FormatPart(tag, children); +} + +function headerWrapper(level) { + return (_, children) => new HeaderBlock(level, children); +} + +function parseLink(node, children) { + return new LinkPart(node.href, children); +} + +function parseList(node) { + const start = node.getAttribute("start") || 1; + const nodes = []; + const len = node.childNodes.length; + for (let i = 0; i < len; i += 1) { + const child = node.childNodes[i]; + if (child.tagName !== "LI") { + continue; + } + nodes.push(parseNodes(child.childNodes)); + } + return new ListBlock(start, nodes); +} + +function parseCodeBlock(node) { + let codeNode; + if (!((codeNode = node.firstChild) && codeNode.nodeName == "CODE")) { + return null; + } + let language = ""; + for (const clname of codeNode.classList) { + if (clname.startsWith("language-") && !clname.startsWith("language-_")) { + language = clname.substring(9) // "language-".length + break; + } + } + return new CodeBlock(language, codeNode.textContent); +} + 
+function parseImage(node) { + return null; +} + +function buildNodeMap() { + let map = { + A: { descend: true, parsefn: parseLink }, + UL: { descend: false, parsefn: parseList }, + OL: { descend: false, parsefn: parseList }, + PRE: { descend: false, parsefn: parseCodeBlock }, + BR: { descend: false, parsefn: () => new NewLinePart() }, + HR: { descend: false, parsefn: () => new RulePart() }, + IMG: { descend: false, parsefn: parseImage } + } + for (const tag of basicNodes) { + map[tag] = { descend: true, parsefn: basicWrapper(tag) } + } + for (let level = 1; level <= 6; level++) { + const tag = "h" + level; + map[tag] = { descend: true, parsefn: headerWrapper(level) } + } + return map; +} + +const nodes = buildNodeMap(); + +function parseNode(node) { + if (node.nodeType === Node.TEXT_NODE) { + return new TextPart(node.nodeValue); + } else if (node.nodeType == Node.ELEMENT_NODE) { + const f = nodes[node.nodeName]; + if (!f) { + return null; + } + let result = f.parsefn(node, f.descend ? parseNodes(node.childNodes) : null); + return result; + } + return null; +} + +function parseNodes(nodes) { + const len = nodes.length; + const parsed = []; + for (let i = 0; i < len; i ++) { + let node = parseNode(nodes[i]); + if (node) { + parsed.push(node); + } + } + return parsed; +} + +export function parse(html) { + const rootNode = new DOMParser().parseFromString(html, "text/html").body; + return parseNodes(rootNode.childNodes); +} diff --git a/src/platform/web/ui/session/room/timeline/TextMessageView.js b/src/platform/web/ui/session/room/timeline/TextMessageView.js index 9bad828e..9c9277fb 100644 --- a/src/platform/web/ui/session/room/timeline/TextMessageView.js +++ b/src/platform/web/ui/session/room/timeline/TextMessageView.js @@ -38,10 +38,10 @@ export class TextMessageView extends BaseMessageView { const formatFunction = { header: headerBlock => tag["h" + Math.min(6,headerBlock.level)]({}, renderParts(headerBlock.inlines)), codeblock: codeBlock => tag.pre({}, tag.code({}, 
text(codeBlock.text))), - emph: emphPart => tag.em({}, [renderPart(emphPart.wraps)]), + emph: emphPart => tag.em({}, renderPart(emphPart.inlines)), code: codePart => tag.code({}, text(codePart.text)), text: textPart => text(textPart.text), - link: linkPart => tag.a({ href: linkPart.url, target: "_blank", rel: "noopener" }, [linkPart.text]), + link: linkPart => tag.a({ href: linkPart.url, target: "_blank", rel: "noopener" }, renderParts(linkPart.inlines)), newline: () => tag.br() }; From 824e66a62f7d899201e9e8edf3cc8cd683c5be9d Mon Sep 17 00:00:00 2001 From: Danila Fedorin Date: Fri, 2 Jul 2021 00:23:59 -0700 Subject: [PATCH 07/69] Add some comments. --- src/platform/web/dom/deserialize.js | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/platform/web/dom/deserialize.js b/src/platform/web/dom/deserialize.js index a848cb16..41acf792 100644 --- a/src/platform/web/dom/deserialize.js +++ b/src/platform/web/dom/deserialize.js @@ -7,12 +7,22 @@ import { HeaderBlock, ListBlock, CodeBlock, FormatPart, NewLinePart, RulePart, T * strong, em, strike, code, hr, br, div, table, thead, tbody, tr, th, td, caption, pre, span, img */ +/** + * Nodes that don't have any properties to them other than their tag. + * While <a> has `href`, and <img> has `src`, these have... themselves. + */ const basicNodes = ["EM", "STRONG", "CODE", "DEL", "P", "DIV", "SPAN" ] +/** + * Return a builder function for a particular tag. + */ function basicWrapper(tag) { return (_, children) => new FormatPart(tag, children); } +/** + * Return a builder function for a particular header level. + */ function headerWrapper(level) { return (_, children) => new HeaderBlock(level, children); } @@ -74,6 +84,18 @@ function buildNodeMap() { return map; } +/** + * Handlers for various nodes. + * + * Each handler has two properties: `descend` and `parsefn`. + * If `descend` is true, the node's children should be + * parsed just like any other node, and fed as a second argument + * to `parsefn`.
If not, the node's children are either to be ignored + * (as in <br>) or processed specially (as in