From c6d7cef491d1202c3d535ca24d92e746bee50d99 Mon Sep 17 00:00:00 2001
From: RMidhunSuresh
Date: Tue, 11 May 2021 22:21:37 +0530
Subject: [PATCH] Refactor regex to improve readability

- Split regex into components
- Add informative comments

Signed-off-by: RMidhunSuresh
---
 .../session/room/timeline/linkify/regex.js    | 30 ++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/domain/session/room/timeline/linkify/regex.js b/src/domain/session/room/timeline/linkify/regex.js
index fe8c94eb..7530d301 100644
--- a/src/domain/session/room/timeline/linkify/regex.js
+++ b/src/domain/session/room/timeline/linkify/regex.js
@@ -1 +1,29 @@
-export const regex = /(?:https|http|ftp):\/\/[a-zA-Z0-9:.\[\]#-]+(?:[\/#][^\s]*[^\s.,?!]|[^\s\u{80}-\u{10ffff}.,?!])/gui
+const scheme = "(?:https|http|ftp):\\/\\/";
+const host = "[a-zA-Z0-9:.\\[\\]-]";
+
+/*
+A URL containing a path (/) or fragment (#) component
+is allowed to end with any character that is neither
+whitespace nor punctuation. The ending character may be
+non-ASCII.
+*/
+const end = "[^\\s.,?!]";
+const additional = `[\\/#][^\\s]*${end}`;
+
+/*
+Similarly, a URL not containing a path or fragment must
+also end with a character that is neither whitespace nor punctuation.
+However, the ending character must also be ASCII.
+*/
+const nonASCII = "\\u{80}-\\u{10ffff}";
+const endASCII = `[^\\s${nonASCII}.,?!]`;
+
+/*
+A URL must not contain non-ASCII characters in the host but may contain
+them in the path or fragment components.
+https://matrix.org/ö - valid
+https://matrix.orgö - invalid
+*/
+const urlRegex = `${scheme}${host}+(?:${additional}|${endASCII})`;
+
+export const regex = new RegExp(urlRegex, "gui");