Merge different languages for language stats (#24900)

Fix #24896. If users set the same language with different letter cases via `linguist-language`, the `stats` map could contain, for example, `java: 100, Java: 200`. Language stats are stored case-insensitively in the database, and the table has a unique key, so the differently cased names must be merged into one unique name: `Java: 300`.
2023-05-25 03:37:36 +08:00 · 2023-05-25 03:37:36 +08:00 · 395bb33e4c
commit 395bb33e4c
parent 63d5e762d8
5 changed files with 59 additions and 6 deletions
--- a/modules/git/repo_language_stats.go
+++ b/modules/git/repo_language_stats.go
@ -3,7 +3,46 @@
 package git
 import (
 	"strings"
 	"unicode"
 )
 const (
 	fileSizeLimit int64 = 16 * 1024   // 16 KiB
 	bigFileSize   int64 = 1024 * 1024 // 1 MiB
 )
 // mergeLanguageStats merges language names that differ only by letter case
 // into a single entry whose size is the sum of all the variants.
 //
 // For every case-insensitive group the spelling with the most upper-case
 // letters is kept. If several spellings tie on that count, the
 // lexicographically smallest one wins, so the result never depends on Go's
 // randomized map iteration order. The input map is not modified; a new map
 // is returned.
 func mergeLanguageStats(stats map[string]int64) map[string]int64 {
 	// candidate is the spelling currently chosen for a lower-cased name.
 	type candidate struct {
 		uniqueName string
 		upperCount int
 	}
 	names := make(map[string]candidate, len(stats))
 	countUpper := func(s string) (count int) {
 		for _, r := range s {
 			if unicode.IsUpper(r) {
 				count++
 			}
 		}
 		return count
 	}
 	// First pass: pick one canonical spelling per lower-cased name.
 	for name := range stats {
 		cnt := countUpper(name)
 		lower := strings.ToLower(name)
 		cur, seen := names[lower]
 		// The explicit tie-break on the name keeps the choice deterministic
 		// when two spellings have the same number of upper-case letters.
 		if !seen || cnt > cur.upperCount || (cnt == cur.upperCount && name < cur.uniqueName) {
 			names[lower] = candidate{uniqueName: name, upperCount: cnt}
 		}
 	}
 	// Second pass: accumulate every variant's size under its canonical spelling.
 	res := make(map[string]int64, len(names))
 	for name, num := range stats {
 		res[names[strings.ToLower(name)].uniqueName] += num
 	}
 	return res
 }
--- a/modules/git/repo_language_stats_gogit.go
+++ b/modules/git/repo_language_stats_gogit.go
@ -156,7 +156,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 		sizes[firstExcludedLanguage] = firstExcludedLanguageSize
 	}
-	return sizes, nil
+	return mergeLanguageStats(sizes), nil
 }
 func readFile(f *object.File, limit int64) ([]byte, error) {
--- a/modules/git/repo_language_stats_nogogit.go
+++ b/modules/git/repo_language_stats_nogogit.go
@ -180,7 +180,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 		// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
 		// - eg. do the all the detection tests using filename first before reading content.
 		language := analyze.GetCodeLanguage(f.Name(), content)
-		if language == enry.OtherLanguage || language == "" {
+		if language == "" {
 			continue
 		}
@ -192,8 +192,8 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 		included, checked := includedLanguage[language]
 		if !checked {
-			langtype := enry.GetLanguageType(language)
+			langType := enry.GetLanguageType(language)
-			included = langtype == enry.Programming || langtype == enry.Markup
+			included = langType == enry.Programming || langType == enry.Markup
 			includedLanguage[language] = included
 		}
 		if included {
@ -210,7 +210,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
 		sizes[firstExcludedLanguage] = firstExcludedLanguageSize
 	}
-	return sizes, nil
+	return mergeLanguageStats(sizes), nil
 }
 func discardFull(rd *bufio.Reader, discard int64) error {
--- a/modules/git/repo_language_stats_test.go
+++ b/modules/git/repo_language_stats_test.go
@ -30,3 +30,17 @@ func TestRepository_GetLanguageStats(t *testing.T) {
 		"Java":   112,
 	}, stats)
 }
 func TestMergeLanguageStats(t *testing.T) {
 	assert.EqualValues(t, map[string]int64{
 		"PHP":    1,
 		"python": 10,
 		"JAVA":   700,
 	}, mergeLanguageStats(map[string]int64{
 		"PHP":    1,
 		"python": 10,
 		"Java":   100,
 		"java":   200,
 		"JAVA":   400,
 	}))
 }
--- a/modules/log/logger_global.go
+++ b/modules/log/logger_global.go
@ -10,7 +10,7 @@ import (
 // FallbackErrorf is the last chance to show an error if the logger has internal errors
 func FallbackErrorf(format string, args ...any) {
-	_, _ = fmt.Fprintf(os.Stderr, format+"\n", args)
+	_, _ = fmt.Fprintf(os.Stderr, format+"\n", args...)
 }
 func GetLevel() Level {