Improve SHA1 link detection (#6526)

This improves the SHA1 link detection so that it no longer picks up
extraneous non-whitespace characters at the end of the URL. A trailing
'.' is a special case handled in the code itself, because Go's regexp
package does not support lookahead.

Regex test cases: https://regex101.com/r/xUMlqh/3
This commit is contained in:
silverwind 2019-04-06 20:28:45 +02:00 committed by Lauris BH
parent 0bdd81df9d
commit 2242a9f82e
2 changed files with 39 additions and 24 deletions

View file

@ -54,7 +54,7 @@ var (
shortLinkPattern = regexp.MustCompile(`\[\[(.*?)\]\](\w*)`) shortLinkPattern = regexp.MustCompile(`\[\[(.*?)\]\](\w*)`)
// anySHA1Pattern allows to split url containing SHA into parts // anySHA1Pattern allows to split url containing SHA into parts
anySHA1Pattern = regexp.MustCompile(`https?://(?:\S+/){4}([0-9a-f]{40})/?([^#\s]+)?(?:#(\S+))?`) anySHA1Pattern = regexp.MustCompile(`https?://(?:\S+/){4}([0-9a-f]{40})(/[^#\s]+)?(#\S+)?`)
validLinksPattern = regexp.MustCompile(`^[a-z][\w-]+://`) validLinksPattern = regexp.MustCompile(`^[a-z][\w-]+://`)
@ -594,31 +594,46 @@ func fullSha1PatternProcessor(ctx *postProcessCtx, node *html.Node) {
if m == nil { if m == nil {
return return
} }
// take out what's relevant
urlFull := node.Data[m[0]:m[1]] urlFull := node.Data[m[0]:m[1]]
hash := node.Data[m[2]:m[3]] text := base.ShortSha(node.Data[m[2]:m[3]])
var subtree, line string // 3rd capture group matches a optional path
subpath := ""
// optional, we do them depending on the length.
if m[7] > 0 {
line = node.Data[m[6]:m[7]]
}
if m[5] > 0 { if m[5] > 0 {
subtree = node.Data[m[4]:m[5]] subpath = node.Data[m[4]:m[5]]
} }
text := base.ShortSha(hash) // 4th capture group matches a optional url hash
if subtree != "" { hash := ""
text += "/" + subtree if m[7] > 0 {
} hash = node.Data[m[6]:m[7]][1:]
if line != "" {
text += " ("
text += line
text += ")"
} }
replaceContent(node, m[0], m[1], createLink(urlFull, text)) start := m[0]
end := m[1]
// If url ends in '.', it's very likely that it is not part of the
// actual url but used to finish a sentence.
if strings.HasSuffix(urlFull, ".") {
end--
urlFull = urlFull[:len(urlFull)-1]
if hash != "" {
hash = hash[:len(hash)-1]
} else if subpath != "" {
subpath = subpath[:len(subpath)-1]
}
}
if subpath != "" {
text += subpath
}
if hash != "" {
text += " (" + hash + ")"
}
replaceContent(node, start, end, createLink(urlFull, text))
} }
// sha1CurrentPatternProcessor renders SHA1 strings to corresponding links that // sha1CurrentPatternProcessor renders SHA1 strings to corresponding links that

View file

@ -273,12 +273,12 @@ func TestRegExp_anySHA1Pattern(t *testing.T) {
testCases := map[string][]string{ testCases := map[string][]string{
"https://github.com/jquery/jquery/blob/a644101ed04d0beacea864ce805e0c4f86ba1cd1/test/unit/event.js#L2703": { "https://github.com/jquery/jquery/blob/a644101ed04d0beacea864ce805e0c4f86ba1cd1/test/unit/event.js#L2703": {
"a644101ed04d0beacea864ce805e0c4f86ba1cd1", "a644101ed04d0beacea864ce805e0c4f86ba1cd1",
"test/unit/event.js", "/test/unit/event.js",
"L2703", "#L2703",
}, },
"https://github.com/jquery/jquery/blob/a644101ed04d0beacea864ce805e0c4f86ba1cd1/test/unit/event.js": { "https://github.com/jquery/jquery/blob/a644101ed04d0beacea864ce805e0c4f86ba1cd1/test/unit/event.js": {
"a644101ed04d0beacea864ce805e0c4f86ba1cd1", "a644101ed04d0beacea864ce805e0c4f86ba1cd1",
"test/unit/event.js", "/test/unit/event.js",
"", "",
}, },
"https://github.com/jquery/jquery/commit/0705be475092aede1eddae01319ec931fb9c65fc": { "https://github.com/jquery/jquery/commit/0705be475092aede1eddae01319ec931fb9c65fc": {
@ -288,13 +288,13 @@ func TestRegExp_anySHA1Pattern(t *testing.T) {
}, },
"https://github.com/jquery/jquery/tree/0705be475092aede1eddae01319ec931fb9c65fc/src": { "https://github.com/jquery/jquery/tree/0705be475092aede1eddae01319ec931fb9c65fc/src": {
"0705be475092aede1eddae01319ec931fb9c65fc", "0705be475092aede1eddae01319ec931fb9c65fc",
"src", "/src",
"", "",
}, },
"https://try.gogs.io/gogs/gogs/commit/d8a994ef243349f321568f9e36d5c3f444b99cae#diff-2": { "https://try.gogs.io/gogs/gogs/commit/d8a994ef243349f321568f9e36d5c3f444b99cae#diff-2": {
"d8a994ef243349f321568f9e36d5c3f444b99cae", "d8a994ef243349f321568f9e36d5c3f444b99cae",
"", "",
"diff-2", "#diff-2",
}, },
} }