debian-mirror-gitlab/lib/banzai/filter/autolink_filter.rb

134 lines
4.4 KiB
Ruby
Raw Normal View History

2018-11-18 11:00:15 +05:30
# frozen_string_literal: true
2015-09-11 14:41:01 +05:30
require 'uri'
2015-12-23 02:04:40 +05:30
module Banzai
  module Filter
    # HTML Filter for auto-linking URLs in HTML.
    #
    # Based on HTML::Pipeline::AutolinkFilter
    #
    # Note that our CommonMark parser, `commonmarker` (using the autolink extension)
    # handles standard autolinking, like http/https. We detect additional
    # schemes (smb, rdar, etc).
    #
    # Context options:
    #   :autolink  - Boolean, skips all processing done by this filter when false
    #   :link_attr - Hash of attributes for the generated links
    #
    class AutolinkFilter < HTML::Pipeline::Filter
      include ActionView::Helpers::TagHelper
      include Gitlab::Utils::SanitizeNodeLink

      # Pattern to match text that should be autolinked.
      #
      # A URI scheme begins with a letter and may contain letters, numbers,
      # plus, period and hyphen. Schemes are case-insensitive but we're being
      # picky here and allowing only lowercase for autolinks.
      #
      # See http://en.wikipedia.org/wiki/URI_scheme
      #
      # The negative lookbehind ensures that users can paste a URL followed by
      # punctuation without those characters being included in the generated
      # link. It matches the behaviour of Rinku 2.0.1:
      # https://github.com/vmg/rinku/blob/v2.0.1/ext/rinku/autolink.c#L65
      #
      # Rubular: http://rubular.com/r/nrL3r9yUiq
      LINK_PATTERN = %r{([a-z][a-z0-9\+\.-]+://[^\s>]+)(?<!\?|!|\.|,|:)}.freeze

      # Text matching LINK_PATTERN inside these elements will not be linked
      IGNORE_PARENTS = %w(a code kbd pre script style).to_set

      # The XPath query to use for finding text nodes to parse.
      #
      # Selects text nodes that contain '://' and have none of the
      # IGNORE_PARENTS elements as an ancestor. The line breaks and
      # indentation inside the literal are only whitespace between XPath
      # tokens.
      TEXT_QUERY = %Q(descendant-or-self::text()[
        not(#{IGNORE_PARENTS.map { |p| "ancestor::#{p}" }.join(' or ')})
        and contains(., '://')
      ]).freeze

      # Closing punctuation character => the opening character it pairs with.
      # Quotes map to themselves.
      PUNCTUATION_PAIRS = {
        "'" => "'",
        '"' => '"',
        ')' => '(',
        ']' => '[',
        '}' => '{'
      }.freeze

      # Replaces every text node selected by TEXT_QUERY whose serialized
      # content matches LINK_PATTERN with its autolinked markup.
      #
      # Returns `doc` in every case.
      def call
        # Only an explicit `false` disables the filter; a missing (nil)
        # :autolink option leaves it enabled.
        return doc if context[:autolink] == false

        doc.xpath(TEXT_QUERY).each do |node|
          content = node.to_html

          # TEXT_QUERY only checks for '://'; make sure there is a real match
          # before doing the more expensive marking work.
          next unless content.match?(LINK_PATTERN)

          html = autolink_filter(content)

          # Nothing was linked, so leave the node untouched.
          next if html == content

          node.replace(html)
        end

        doc
      end

      private

      # Builds the markup for a single piece of text matched by LINK_PATTERN.
      #
      # match - the matched text. It is modified in place: trailing HTML
      #         entities are stripped from it with `gsub!`.
      #         NOTE(review): callers see that mutation; confirm none of them
      #         reuse the string afterwards.
      #
      # Returns:
      #   * `match` unchanged when it cannot be parsed as a URI or when
      #     `safe_protocol?` rejects its scheme (`safe_protocol?` is not
      #     defined in this file; presumably it comes from
      #     Gitlab::Utils::SanitizeNodeLink),
      #   * the parsed URI as a String when the link text cannot be encoded
      #     into an href,
      #   * otherwise an `<a>` element followed by any trailing characters
      #     that were moved outside of the link.
      def autolink_match(match)
        # start by stripping out dangerous links
        begin
          uri = Addressable::URI.parse(match)
          return match unless safe_protocol?(uri.scheme)
        rescue Addressable::URI::InvalidURIError
          return match
        end

        # Remove any trailing HTML entities and store them for appending
        # outside the link element. The entity must be marked HTML safe in
        # order to be output literally rather than escaped.
        match.gsub!(/((?:&[\w#]+;)+)\z/, '')
        # When nothing was stripped there is no capture, hence the `|| ''`.
        dropped = (Regexp.last_match(1) || '').html_safe

        # To match the behaviour of Rinku, if the matched link ends with a
        # closing part of a matched pair of punctuation, we remove that trailing
        # character unless there are an equal number of closing and opening
        # characters in the link.
        if match.end_with?(*PUNCTUATION_PAIRS.keys)
          close_character = match[-1]
          close_count = match.count(close_character)
          open_character = PUNCTUATION_PAIRS[close_character]
          open_count = match.count(open_character)

          # Quotes pair with themselves, so their counts are always equal;
          # a trailing quote is therefore always dropped.
          if open_count != close_count || open_character == close_character
            dropped += close_character
            match = match[0..-2]
          end
        end

        # Since this came from a Text node, make sure the new href is encoded.
        # `commonmarker` percent encodes the domains of links it handles, so
        # do the same (instead of using `normalized_encode`).
        begin
          href_safe = Addressable::URI.encode(match).html_safe
        rescue Addressable::URI::InvalidURIError
          # Give up on linking and hand back the text form of the URI parsed
          # at the top of the method.
          return uri.to_s
        end

        html_safe_match = match.html_safe
        options = link_options.merge(href: href_safe)

        content_tag(:a, html_safe_match, options) + dropped
      end

      # Autolinks every LINK_PATTERN match found in `text`.
      #
      # text - the serialized (HTML-escaped) content of a text node.
      #
      # The marker is given both the HTML-unescaped text and the original
      # text; each span it yields is replaced with the output of
      # `autolink_match`.
      # NOTE(review): presumably the marker finds matches in the unescaped
      # text and maps them back onto the escaped markup - confirm against
      # Gitlab::StringRegexMarker.
      #
      # Returns whatever `Gitlab::StringRegexMarker#mark` returns.
      def autolink_filter(text)
        # `left:` and `right:` are declared as block keywords here but are
        # not used by this filter.
        Gitlab::StringRegexMarker.new(CGI.unescapeHTML(text), text.html_safe).mark(LINK_PATTERN) do |link, left:, right:|
          autolink_match(link).html_safe
        end
      end

      # Attributes to set on every generated link, taken from
      # `context[:link_attr]` (an empty Hash when that is not set). Memoized.
      def link_options
        @link_options ||= context[:link_attr] || {}
      end
    end
  end
end