debian-mirror-gitlab/lib/gitlab/robots_txt/parser.rb

# frozen_string_literal: true

module Gitlab
  module RobotsTxt
    # Minimal robots.txt parser.
    #
    # Only `Disallow:` and `Allow:` lines are understood; every other line
    # (user-agent groups, comments, sitemaps, ...) is ignored, so all rules
    # are treated as one global set. Directive names are matched case
    # insensitively. `Allow` rules take precedence over `Disallow` rules.
    #
    # Supported pattern syntax:
    #   * `*` matches zero or more characters
    #   * a trailing `$` anchors the pattern to the end of the path
    class Parser
      DISALLOW_REGEX = /^disallow: /i.freeze
      ALLOW_REGEX = /^allow: /i.freeze

      # @return [Array<Regexp>] compiled patterns, in file order
      attr_reader :disallow_rules, :allow_rules

      # @param content [String] raw robots.txt content
      def initialize(content)
        @raw_content = content

        @disallow_rules, @allow_rules = parse_raw_content!
      end

      # Tells whether +path+ is blocked by the parsed rules.
      #
      # @param path [String, nil] request path to check; nil never matches
      # @return [Boolean] true when a Disallow rule matches and no Allow rule does
      def disallowed?(path)
        # Allow rules win over Disallow rules, so check them first.
        return false if allow_rules.any? { |rule| rule.match?(path) }

        disallow_rules.any? { |rule| rule.match?(path) }
      end

      private

      # Walks the raw content line by line and compiles every recognized rule.
      #
      # @return [Array(Array<Regexp>, Array<Regexp>)] disallow rules, allow rules
      def parse_raw_content!
        disallowed = []
        allowed = []

        @raw_content.each_line do |line|
          if disallow_rule?(line)
            pattern = get_disallow_pattern(line)
            disallowed << pattern if pattern
          elsif allow_rule?(line)
            pattern = get_allow_pattern(line)
            allowed << pattern if pattern
          end
        end

        [disallowed, allowed]
      end

      def disallow_rule?(line)
        line.match?(DISALLOW_REGEX)
      end

      def get_disallow_pattern(line)
        get_pattern(line, DISALLOW_REGEX)
      end

      def allow_rule?(line)
        line.match?(ALLOW_REGEX)
      end

      def get_allow_pattern(line)
        get_pattern(line, ALLOW_REGEX)
      end

      # Converts the value of a rule line into a Regexp anchored at the start
      # of the path.
      #
      # @param line [String] a line already known to match +rule_regex+
      # @param rule_regex [Regexp] the directive prefix to strip from the line
      # @return [Regexp, nil] nil when the rule has no value
      def get_pattern(line, rule_regex)
        value = line.sub(rule_regex, '').strip

        # An empty value (e.g. "Disallow: ") imposes no restriction. Compiling
        # it would yield a pattern that matches every path.
        return if value.empty?

        escaped = Regexp.escape(value).gsub('\*', '.*')

        # A trailing `$` (escaped to `\$` above) means "end of path".
        escaped = "#{escaped.delete_suffix('\$')}\\z" if escaped.end_with?('\$')

        # Use string anchors rather than line anchors so a path containing a
        # newline cannot match a rule on one of its inner lines.
        Regexp.new("\\A#{escaped}")
      end
    end
  end
end
New upstream version 13.4.6 2020-11-24 15:15:51 +05:30			`# frozen_string_literal: true`

			`module Gitlab`
			`module RobotsTxt`
			`class Parser`
New upstream version 13.6.5 2021-01-29 00:20:46 +05:30			`DISALLOW_REGEX = /^disallow: /i.freeze`
			`ALLOW_REGEX = /^allow: /i.freeze`

			`attr_reader :disallow_rules, :allow_rules`
New upstream version 13.4.6 2020-11-24 15:15:51 +05:30
			`def initialize(content)`
			`@raw_content = content`

New upstream version 13.6.5 2021-01-29 00:20:46 +05:30			`@disallow_rules, @allow_rules = parse_raw_content!`
New upstream version 13.4.6 2020-11-24 15:15:51 +05:30			`end`

			`def disallowed?(path)`
New upstream version 13.6.5 2021-01-29 00:20:46 +05:30			`return false if allow_rules.any? { \|rule\| path =~ rule }`

New upstream version 13.4.6 2020-11-24 15:15:51 +05:30			`disallow_rules.any? { \|rule\| path =~ rule }`
			`end`

			`private`

New upstream version 13.6.5 2021-01-29 00:20:46 +05:30			# This parser is very basic as it only knows about `Disallow:`
			# and `Allow:` lines, and simply ignores all other lines.
New upstream version 13.4.6 2020-11-24 15:15:51 +05:30			`#`
New upstream version 13.6.5 2021-01-29 00:20:46 +05:30			# Patterns ending in `$`, and `*` for 0 or more characters are recognized.
			`#`
			# It is case insensitive and `Allow` rules takes precedence
			# over `Disallow`.
New upstream version 13.4.6 2020-11-24 15:15:51 +05:30			`def parse_raw_content!`
New upstream version 13.6.5 2021-01-29 00:20:46 +05:30			`disallowed = []`
			`allowed = []`

			`@raw_content.each_line.each do \|line\|`
			`if disallow_rule?(line)`
			`disallowed << get_disallow_pattern(line)`
			`elsif allow_rule?(line)`
			`allowed << get_allow_pattern(line)`
New upstream version 13.4.6 2020-11-24 15:15:51 +05:30			`end`
New upstream version 13.6.5 2021-01-29 00:20:46 +05:30			`end`

			`[disallowed, allowed]`
			`end`

			`def disallow_rule?(line)`
			`line =~ DISALLOW_REGEX`
			`end`

			`def get_disallow_pattern(line)`
			`get_pattern(line, DISALLOW_REGEX)`
			`end`

			`def allow_rule?(line)`
			`line =~ ALLOW_REGEX`
			`end`

			`def get_allow_pattern(line)`
			`get_pattern(line, ALLOW_REGEX)`
			`end`

			`def get_pattern(line, rule_regex)`
			`value = line.sub(rule_regex, '').strip`
			`value = Regexp.escape(value).gsub('\*', '.*')`
			`value = value.sub(/\\\$$/, '$')`
			`Regexp.new("^#{value}")`
New upstream version 13.4.6 2020-11-24 15:15:51 +05:30			`end`
			`end`
			`end`
			`end`