debian-mirror-gitlab/lib/gitlab/blob_helper.rb

# frozen_string_literal: true

# This has been extracted from https://github.com/github/linguist/blob/master/lib/linguist/blob_helper.rb
module Gitlab
  module BlobHelper
    include Gitlab::Utils::StrongMemoize

    # Extension of the blob's file name, including the leading dot (for
    # example ".rb"). `name` is coerced with `to_s` first, so a nil name
    # yields an empty string.
    def extname
      filename = name.to_s
      File.extname(filename)
    end

    # Whether the blob's extension (exactly as returned by `extname`, with no
    # case normalization) appears in `LanguageData.extensions`.
    def known_extension?
      extension = extname
      LanguageData.extensions.include?(extension)
    end

    # A blob is viewable when it is not large and is text according to
    # `text_in_repo?`.
    def viewable?
      return false if large?

      text_in_repo?
    end

    # Number of bytes in one megabyte; the size above which a blob counts as
    # large.
    MEGABYTE = 1024 * 1024

    # Whether `size` (coerced with `to_i`, so nil counts as 0) is strictly
    # greater than one megabyte.
    def large?
      byte_count = size.to_i
      byte_count > MEGABYTE
    end

    # Whether the blob's content should be treated as binary.
    #
    # Checks run in this order:
    #   1. no data at all (large blobs aren't even loaded into memory) => binary
    #   2. blank data                                                 => text
    #   3. no detected encoding (Charlock doesn't know what to think)  => binary
    #   4. otherwise, binary only if the detector reported type :binary
    def binary_in_repo?
      return true if data.nil?
      return false if data == ""
      return true if encoding.nil?

      detect_encoding[:type] == :binary
    end

    # Logical inverse of `binary_in_repo?`.
    def text_in_repo?
      binary = binary_in_repo?
      !binary
    end

    # Whether the blob's extension, compared case-insensitively, is one of
    # the image extensions listed below.
    def image?
      extension = extname.downcase
      %w[.png .jpg .jpeg .gif .svg].include?(extension)
    end

    # Internal: Look up the MIME type registered for the blob's extension.
    #
    # The result is computed once and cached in @_mime_type, including when
    # it is nil. Among the candidates returned by MIME::Types, the first one
    # answering true to `ascii?` wins (text is preferred over binary);
    # otherwise the first candidate is used.
    #
    # Returns a MIME::Type, or nil when there are no candidates.
    # rubocop:disable Gitlab/ModuleWithInstanceVariables
    def _mime_type
      return @_mime_type if defined?(@_mime_type)

      candidates = ::MIME::Types.type_for(extname.to_s)
      @_mime_type = candidates.find(&:ascii?) || candidates.first
    end
    # rubocop:enable Gitlab/ModuleWithInstanceVariables

    # Public: Get the blob's MIME type as a string.
    #
    # Uses the type found by `_mime_type`; when that lookup finds nothing the
    # result defaults to 'text/plain'.
    #
    # Returns a mime type String.
    def mime_type
      type = _mime_type
      return 'text/plain' unless type

      type.to_s
    end

    # Whether the MIME type found for the extension reports itself as
    # binary. Returns false when no MIME type was found.
    def binary_mime_type?
      type = _mime_type
      return false unless type

      type.binary?
    end

    # The blob's content split into lines, cached in @lines.
    #
    # Returns an empty array when the blob is not viewable or has no data.
    # Trailing empty strings are kept (split limit of -1).
    def lines
      return @lines if @lines
      return (@lines = []) unless viewable? && data

      # `data` is usually tagged ASCII-8BIT even when its content was detected
      # as some other encoding, and every entry of `lines` is implicitly
      # promised to carry the same encoding as `data`, so `data` itself must
      # not be re-encoded.
      #
      # `encoded_newlines_re` therefore encodes each newline sequence in the
      # detected encoding and then forces it back to the encoding of `data`.
      # The separator bytes then match how newlines are likely stored in the
      # file, and the split needs no re-encoded copy of the (potentially
      # large) content.
      @lines =
        begin
          data.split(encoded_newlines_re, -1)
        rescue Encoding::ConverterNotFoundError
          # The data cannot be split in the detected encoding; treat it as
          # one big line.
          [data]
        end
    end

    # Content type string for the blob, cached in @content_type.
    #
    # When either the extension's MIME type or the content itself is binary,
    # this is `mime_type`. Otherwise it is "text/plain", with a lower-cased
    # charset parameter appended when an encoding was detected.
    def content_type
      @content_type ||=
        if binary_mime_type? || binary_in_repo?
          mime_type
        elsif encoding
          "text/plain; charset=#{encoding.downcase}"
        else
          "text/plain"
        end
    end

    # Regexp matching any newline sequence ("\r\n", "\r" or "\n"), memoized
    # through `strong_memoize`.
    #
    # When both a detected Ruby encoding and the encoding of `data` are
    # available, each separator is first encoded into the detected encoding
    # (treating the literal as ASCII-8BIT) and then re-tagged with the
    # encoding of `data`, so the pattern can be applied to `data` directly.
    def encoded_newlines_re
      strong_memoize(:encoded_newlines_re) do
        data_encoding = data&.encoding
        detected_encoding = ruby_encoding
        separators = ["\r\n", "\r", "\n"]

        if detected_encoding && data_encoding
          separators = separators.map do |separator|
            separator.encode(detected_encoding, "ASCII-8BIT").force_encoding(data_encoding)
          end
        end

        Regexp.union(separators)
      end
    end

    # The :ruby_encoding entry of the detection result, or nil when there is
    # no detection result.
    def ruby_encoding
      detected = detect_encoding
      detected[:ruby_encoding] if detected
    end

    # The :encoding entry of the detection result, or nil when there is no
    # detection result.
    def encoding
      detected = detect_encoding
      detected[:encoding] if detected
    end

    # Run CharlockHolmes encoding detection on `data`, at most once.
    #
    # Returns whatever the detector returns for `data`, or nil when `data` is
    # nil. Nothing is cached while `data` is nil, so detection can still run
    # once data becomes available.
    #
    # The cache is checked with `defined?` rather than `||=` so that a nil
    # detection result is remembered as well. Otherwise every call to
    # `encoding`, `ruby_encoding` or `binary_in_repo?` would run the detector
    # over the whole blob again.
    # rubocop:disable Gitlab/ModuleWithInstanceVariables
    def detect_encoding
      return unless data
      return @detect_encoding if defined?(@detect_encoding)

      @detect_encoding = CharlockHolmes::EncodingDetector.new.detect(data)
    end
    # rubocop:enable Gitlab/ModuleWithInstanceVariables

    # Whether the blob has no content: `data` is nil or the empty string.
    def empty?
      contents = data
      contents.nil? || contents == ""
    end
  end
end
New upstream version 11.5.3+dfsg 2018-12-13 13:39:08 +05:30			`# frozen_string_literal: true`

			`# This has been extracted from https://github.com/github/linguist/blob/master/lib/linguist/blob_helper.rb`
			`module Gitlab`
			`module BlobHelper`
New upstream version 13.0.0 2020-05-24 23:13:21 +05:30			`include Gitlab::Utils::StrongMemoize`

New upstream version 11.5.3+dfsg 2018-12-13 13:39:08 +05:30			`def extname`
			`File.extname(name.to_s)`
			`end`

			`def known_extension?`
			`LanguageData.extensions.include?(extname)`
			`end`

			`def viewable?`
New upstream version 11.7.5 2019-02-15 15:39:39 +05:30			`!large? && text_in_repo?`
New upstream version 11.5.3+dfsg 2018-12-13 13:39:08 +05:30			`end`

			`MEGABYTE = 1024 * 1024`

			`def large?`
			`size.to_i > MEGABYTE`
			`end`

New upstream version 11.7.5 2019-02-15 15:39:39 +05:30			`def binary_in_repo?`
New upstream version 11.5.3+dfsg 2018-12-13 13:39:08 +05:30			`# Large blobs aren't even loaded into memory`
			`if data.nil?`
			`true`

			`# Treat blank files as text`
			`elsif data == ""`
			`false`

			`# Charlock doesn't know what to think`
			`elsif encoding.nil?`
			`true`

			`# If Charlock says its binary`
			`else`
			`detect_encoding[:type] == :binary`
			`end`
			`end`

New upstream version 11.7.5 2019-02-15 15:39:39 +05:30			`def text_in_repo?`
			`!binary_in_repo?`
New upstream version 11.5.3+dfsg 2018-12-13 13:39:08 +05:30			`end`

			`def image?`
New upstream version 12.2.8 2019-10-12 21:52:04 +05:30			`['.png', '.jpg', '.jpeg', '.gif', '.svg'].include?(extname.downcase)`
New upstream version 11.5.3+dfsg 2018-12-13 13:39:08 +05:30			`end`

			`# Internal: Lookup mime type for extension.`
			`#`
			`# Returns a MIME::Type`
			`# rubocop:disable Gitlab/ModuleWithInstanceVariables`
			`def _mime_type`
			`if defined? @_mime_type`
			`@_mime_type`
			`else`
			`guesses = ::MIME::Types.type_for(extname.to_s)`

			`# Prefer text mime types over binary`
			`@_mime_type = guesses.detect { \|type\| type.ascii? } \|\| guesses.first`
			`end`
			`end`
			`# rubocop:enable Gitlab/ModuleWithInstanceVariables`

			`# Public: Get the actual blob mime type`
			`#`
			`# Examples`
			`#`
			`# # => 'text/plain'`
			`# # => 'text/html'`
			`#`
			`# Returns a mime type String.`
			`def mime_type`
			`_mime_type ? _mime_type.to_s : 'text/plain'`
			`end`

			`def binary_mime_type?`
			`_mime_type ? _mime_type.binary? : false`
			`end`

			`def lines`
			`@lines \|\|=`
			`if viewable? && data`
			# `data` is usually encoded as ASCII-8BIT even when the content has
			`# been detected as a different encoding. However, we are not allowed`
			# to change the encoding of `data` because we've made the implicit
			# guarantee that each entry in `lines` is encoded the same way as
			# `data`.
			`#`
			`# Instead, we re-encode each possible newline sequence as the`
			# detected encoding, then force them back to the encoding of `data`
			`# (usually a binary encoding like ASCII-8BIT). This means that the`
			`# byte sequence will match how newlines are likely encoded in the`
			# file, but we don't have to change the encoding of `data` as far as
			`# Ruby is concerned. This allows us to correctly parse out each line`
			# without changing the encoding of `data`, and
			`# also--importantly--without having to duplicate many (potentially`
			`# large) strings.`
			`begin`
			`data.split(encoded_newlines_re, -1)`
			`rescue Encoding::ConverterNotFoundError`
			`# The data is not splittable in the detected encoding. Assume it's`
			`# one big line.`
			`[data]`
			`end`
			`else`
			`[]`
			`end`
			`end`

			`def content_type`
			`# rubocop:disable Style/MultilineTernaryOperator`
			`# rubocop:disable Style/NestedTernaryOperator`
New upstream version 11.7.5 2019-02-15 15:39:39 +05:30			`@content_type \|\|= binary_mime_type? \|\| binary_in_repo? ? mime_type :`
New upstream version 11.5.3+dfsg 2018-12-13 13:39:08 +05:30			`(encoding ? "text/plain; charset=#{encoding.downcase}" : "text/plain")`
			`# rubocop:enable Style/NestedTernaryOperator`
			`# rubocop:enable Style/MultilineTernaryOperator`
			`end`

			`def encoded_newlines_re`
New upstream version 13.0.0 2020-05-24 23:13:21 +05:30			`strong_memoize(:encoded_newlines_re) do`
			`newlines = ["\r\n", "\r", "\n"]`
			`data_encoding = data&.encoding`

			`if ruby_encoding && data_encoding`
			`newlines.map! do \|nl\|`
			`nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data_encoding)`
			`end`
			`end`

			`Regexp.union(newlines)`
			`end`
New upstream version 11.5.3+dfsg 2018-12-13 13:39:08 +05:30			`end`

			`def ruby_encoding`
			`if hash = detect_encoding`
			`hash[:ruby_encoding]`
			`end`
			`end`

			`def encoding`
			`if hash = detect_encoding`
			`hash[:encoding]`
			`end`
			`end`

			`def detect_encoding`
			`@detect_encoding \|\|= CharlockHolmes::EncodingDetector.new.detect(data) if data # rubocop:disable Gitlab/ModuleWithInstanceVariables`
			`end`

			`def empty?`
			`data.nil? \|\| data == ""`
			`end`
			`end`
			`end`