debian-mirror-gitlab/ruby-statistics/lib/statistics/statistical_test/f_test.rb

module Statistics
  module StatisticalTest
    # F-test helpers: the F statistic for two or more groups of observations
    # and the one-way ANOVA decision built on top of it.
    class FTest
      # Calculates the F statistic for the specified groups.
      #
      # Every argument is assumed to be an array of numeric observations that
      # responds to +mean+ and +variance+ (those helpers are defined outside
      # this file).
      #
      # * Exactly two groups: the score is the ratio between the larger and the
      #   smaller group variance.
      # * More than two groups: one-way ANOVA, i.e. the variance between groups
      #   divided by the variance within groups.
      # * Fewer than two groups: nothing is computed.
      #
      # Returns an array with three elements:
      #   [F-statistic or F-score, degrees of freedom numerator, degrees of freedom denominator]
      # or [nil, nil, nil] when fewer than two groups were specified.
      #
      # Formulas extracted from:
      # https://courses.lumenlearning.com/boundless-statistics/chapter/one-way-anova/
      # http://sphweb.bumc.bu.edu/otlt/MPH-Modules/BS/BS704_HypothesisTesting-ANOVA/BS704_HypothesisTesting-Anova_print.html
      def self.anova_f_score(*args)
        total_groups = args.size
        return [nil, nil, nil] if total_groups < 2

        # Flatten once: the result feeds both the element count and the overall mean.
        observations = args.flatten
        df1 = total_groups - 1                 # k - 1
        df2 = observations.size - total_groups # N - k

        if total_groups == 2
          # NOTE(review): a variance-ratio F-test is usually paired with
          # (n1 - 1, n2 - 1) degrees of freedom; this method has always reported
          # the ANOVA ones (k - 1, N - k) and that is preserved - confirm intent.
          variances = args.map(&:variance)

          return [variances.max / variances.min.to_f, df1, df2]
        end

        overall_mean = observations.mean
        groups_with_means = args.zip(args.map(&:mean))

        # Each term is divided before being accumulated (instead of dividing the
        # final sum once) so the floating point results stay exactly the same as
        # in previous versions of this method.
        between_divisor = df1.to_f
        variance_between_groups = groups_with_means.reduce(0) do |summation, (group, group_mean)|
          summation + ((group.size * ((group_mean - overall_mean)**2)) / between_divisor)
        end

        within_divisor = df2.to_f
        variance_within_groups = groups_with_means.reduce(0) do |summation, (group, group_mean)|
          group_contribution = group.reduce(0) do |group_summation, observation|
            group_summation + (((observation - group_mean)**2) / within_divisor)
          end

          summation + group_contribution
        end

        [variance_between_groups / variance_within_groups.to_f, df1, df2]
      end

      # This method expects the alpha value and the groups to calculate the one-way ANOVA test.
      # It returns a hash with the cumulative probability of the F-score, the p-value, the alpha
      # that was specified, the confidence level (1 - alpha) and the test result (whether to reject
      # the null hypothesis or not). It returns nil when no F-score could be calculated, which
      # happens when fewer than two groups are specified.
      #
      # Keep in mind that the values for the :null and :alternative keys (true/false) do not imply
      # that the hypothesis is TRUE or FALSE. They are a minor notation advantage to decide whether
      # to reject the null hypothesis or not.
      def self.one_way_anova(alpha, *args)
        f_score, df1, df2 = anova_f_score(*args)

        return if [f_score, df1, df2].any?(&:nil?)

        # Distribution::F is resolved through the enclosing namespaces; it is not defined in this file.
        probability = Distribution::F.new(df1, df2).cumulative_function(f_score)
        p_value = 1 - probability

        # According to https://stats.stackexchange.com/questions/29158/do-you-reject-the-null-hypothesis-when-p-alpha-or-p-leq-alpha
        # We can assume that if p_value <= alpha, we can safely reject the null hypothesis, ie. accept the alternative hypothesis.
        { probability: probability,
          p_value: p_value,
          alpha: alpha,
          null: alpha < p_value,
          alternative: p_value <= alpha,
          confidence_level: 1 - alpha }
      end
    end
  end
end