Do some additional validation, blocking block nodes inside inline nodes.

This commit is contained in:
Danila Fedorin 2021-07-13 14:47:10 -07:00
parent 4b92903ddd
commit 5e39eb8f6c

View file

@ -10,7 +10,8 @@ import { MessageBody, HeaderBlock, ListBlock, CodeBlock, FormatPart, NewLinePart
* Nodes that don't have any properties to them other than their tag. * Nodes that don't have any properties to them other than their tag.
* While <a> has `href`, and <img> has `src`, these have... themselves. * While <a> has `href`, and <img> has `src`, these have... themselves.
*/ */
const basicNodes = ["EM", "STRONG", "CODE", "DEL", "P", "DIV", "SPAN" ] const basicInline = ["EM", "STRONG", "CODE", "DEL", "SPAN" ];
const basicBlock = ["DIV"];
class Deserializer { class Deserializer {
constructor(result, mediaRepository) { constructor(result, mediaRepository) {
@ -21,7 +22,7 @@ class Deserializer {
parseLink(node, children) { parseLink(node, children) {
// TODO Not equivalent to `node.href`! // TODO Not equivalent to `node.href`!
// Add another HTMLParseResult method? // Add another HTMLParseResult method?
let href = this.result.getAttributeValue(node, "href"); const href = this.result.getAttributeValue(node, "href");
return new LinkPart(href, children); return new LinkPart(href, children);
} }
@ -37,7 +38,7 @@ class Deserializer {
if (result.getNodeElementName(child) !== "LI") { if (result.getNodeElementName(child) !== "LI") {
continue; continue;
} }
const item = this.parseNodes(result.getChildNodes(child)); const item = this.parseAnyNodes(result.getChildNodes(child));
nodes.push(item); nodes.push(item);
} }
return new ListBlock(start, nodes); return new ListBlock(start, nodes);
@ -79,9 +80,59 @@ class Deserializer {
return new ImagePart(url, width, height, alt, title); return new ImagePart(url, width, height, alt, title);
} }
parseElement(node) { /** Once a node is known to be an element,
* attempt to interpret it as an inline element.
*
* @returns the inline message part, or null if the element
* is not inline or not allowed.
*/
parseInlineElement(node) {
const result = this.result; const result = this.result;
const tag = result.getNodeElementName(node); const tag = result.getNodeElementName(node);
const children = result.getChildNodes(node);
switch (tag) {
case "A": {
const inlines = this.parseInlineNodes(children);
return this.parseLink(node, inlines);
}
case "BR":
return new NewLinePart();
default: {
if (!basicInline.includes(tag)) {
return null;
}
const inlines = this.parseInlineNodes(children);
return new FormatPart(tag, inlines);
}
}
}
/** Attempt to interpret a node as inline.
*
* Text nodes are wrapped in a TextPart holding the node's text.
* Element nodes are delegated to parseInlineElement, which itself
* returns null for tags that are not inline. Any node that is
* neither text nor an element yields null.
*
* @param node the parsed HTML node to interpret, as understood by
* `this.result` (the HTML parse result).
* @returns the inline message part, or null if the
* element is not inline or not allowed.
*/
parseInlineNode(node) {
const result = this.result;
if (result.isTextNode(node)) {
// TODO Linkify
return new TextPart(result.getNodeText(node));
} else if (result.isElementNode(node)) {
return this.parseInlineElement(node);
}
// Neither a text node nor an element node: nothing to produce.
return null;
}
/** Once a node is known to be an element, attempt
* to interpret it as a block element.
*
* @returns the block message part, or null if the
* element is not a block or not allowed.
*/
parseBlockElement(node) {
const result = this.result;
const tag = result.getNodeElementName(node);
const children = result.getChildNodes(node);
switch (tag) { switch (tag) {
case "H1": case "H1":
case "H2": case "H2":
@ -89,66 +140,87 @@ class Deserializer {
case "H4": case "H4":
case "H5": case "H5":
case "H6": { case "H6": {
const children = this.parseChildNodes(node); const inlines = this.parseInlineNodes(children);
return new HeaderBlock(parseInt(tag[1]), children) return new HeaderBlock(parseInt(tag[1]), inlines)
}
case "A": {
const children = this.parseChildNodes(node);
return this.parseLink(node, children);
} }
case "UL": case "UL":
case "OL": case "OL":
return this.parseList(node); return this.parseList(node);
case "PRE": case "PRE":
return this.parseCodeBlock(node); return this.parseCodeBlock(node);
case "BR":
return new NewLinePart();
case "HR": case "HR":
return new RulePart(); return new RulePart();
case "IMG": case "IMG":
return this.parseImage(node); return this.parseImage(node);
case "P": {
const inlines = this.parseInlineNodes(children);
return new FormatPart(tag, inlines);
}
default: { default: {
if (!basicNodes.includes(tag)) { if (!basicBlock.includes(tag)) {
return null; return null;
} }
const children = this.parseChildNodes(node); const blocks = this.parseAnyNodes(children);
return new FormatPart(tag, children); return new FormatPart(tag, blocks);
} }
} }
} }
parseNode(node) { /** Attempt to parse a node as a block.
*
* @returns the block message part, or null if the node
* is not a block element.
*/
parseBlockNode(node) {
const result = this.result; const result = this.result;
if (result.isTextNode(node)) { if (result.isElementNode(node)) {
return new TextPart(result.getNodeText(node)); return this.parseBlockElement(node);
} else if (result.isElementNode(node)) {
return this.parseElement(node);
} }
return null; return null;
} }
parseChildNodes(node) { _parseInlineNodes(nodes, into) {
const childNodes = this.result.getChildNodes(node); for (const htmlNode of nodes) {
return this.parseNodes(childNodes); const node = this.parseInlineNode(htmlNode);
if (node) {
into.push(node);
continue;
}
// Node is either block or unrecognized. In
// both cases, just move on to its children.
this._parseInlineNodes(this.result.getChildNodes(htmlNode), into);
}
} }
parseNodes(nodes) { parseInlineNodes(nodes) {
const parsed = []; const into = [];
this._parseInlineNodes(nodes, into);
return into;
}
_parseAnyNodes(nodes, into) {
for (const htmlNode of nodes) { for (const htmlNode of nodes) {
let node = this.parseNode(htmlNode); const node = this.parseInlineNode(htmlNode) || this.parseBlockNode(htmlNode);
// Just ignore invalid / unknown tags.
if (node) { if (node) {
parsed.push(node); into.push(node);
continue;
}
// Node is unrecognized. Just move on to its children.
this._parseAnyNodes(this.result.getChildNodes(htmlNode), into);
} }
} }
return parsed;
parseAnyNodes(nodes) {
const into = [];
this._parseAnyNodes(nodes, into);
return into;
} }
} }
export function parseHTMLBody(platform, mediaRepository, html) { export function parseHTMLBody(platform, mediaRepository, html) {
const parseResult = platform.parseHTML(html); const parseResult = platform.parseHTML(html);
const deserializer = new Deserializer(parseResult, mediaRepository); const deserializer = new Deserializer(parseResult, mediaRepository);
const parts = deserializer.parseNodes(parseResult.rootNodes); const parts = deserializer.parseAnyNodes(parseResult.rootNodes);
return new MessageBody(html, parts); return new MessageBody(html, parts);
} }
@ -256,6 +328,23 @@ export function tests() {
]; ];
test(assert, input, output); test(assert, input, output);
}, },
"Block elements ignored inside inline elements": assert => {
const input = '<span><p><code>Hello</code></p></span>';
const output = [
new FormatPart("span", [new FormatPart("code", [new TextPart("Hello")])])
];
test(assert, input, output);
},
"Unknown tags are ignored, but their children are kept": assert => {
const input = '<span><dfn><code>Hello</code></dfn><footer><em>World</em></footer></span>';
const output = [
new FormatPart("span", [
new FormatPart("code", [new TextPart("Hello")]),
new FormatPart("em", [new TextPart("World")])
])
];
test(assert, input, output);
},
/* Doesn't work: HTML library doesn't handle <pre><code> properly. /* Doesn't work: HTML library doesn't handle <pre><code> properly.
"Text with code block": assert => { "Text with code block": assert => {
const code = 'main :: IO ()\nmain = putStrLn "Hello"' const code = 'main :: IO ()\nmain = putStrLn "Hello"'