Merge different languages for language stats (#24900)
Fix #24896. If users set different languages via `linguist-language`, the `stats` map could contain entries such as `java: 100, Java: 200`. Language stats are stored case-insensitively in the database under a unique key, so language names that differ only by case should be merged into one unique name: `Java: 300`.
This commit is contained in:
parent
63d5e762d8
commit
395bb33e4c
5 changed files with 59 additions and 6 deletions
|
@ -3,7 +3,46 @@
|
||||||
|
|
||||||
package git
|
package git
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
"unicode"
|
||||||
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
fileSizeLimit int64 = 16 * 1024 // 16 KiB
|
fileSizeLimit int64 = 16 * 1024 // 16 KiB
|
||||||
bigFileSize int64 = 1024 * 1024 // 1 MiB
|
bigFileSize int64 = 1024 * 1024 // 1 MiB
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// mergeLanguageStats merges language names that differ only by letter case
// into a single entry and sums their sizes.
//
// For every group of names sharing the same lower-cased form, the name with
// the most upper case letters is used as the unique name. If several names in
// a group have the same number of upper case letters, the lexicographically
// smallest one is used, so the result does not depend on map iteration order.
//
// The input map is not modified; a new map is returned. A nil or empty input
// yields an empty, non-nil map.
func mergeLanguageStats(stats map[string]int64) map[string]int64 {
	type candidate struct {
		uniqueName string
		upperCount int
	}

	// countUpper returns the number of upper case runes in s.
	countUpper := func(s string) (count int) {
		for _, r := range s {
			if unicode.IsUpper(r) {
				count++
			}
		}
		return count
	}

	// First pass: choose the unique name for each lower-cased group.
	names := make(map[string]candidate, len(stats))
	for name := range stats {
		cnt := countUpper(name)
		lower := strings.ToLower(name)
		best, seen := names[lower]
		// Ranging over a map visits keys in an unspecified order, so a tie on
		// the upper case count needs an explicit, order-independent rule.
		if !seen || cnt > best.upperCount || (cnt == best.upperCount && name < best.uniqueName) {
			names[lower] = candidate{uniqueName: name, upperCount: cnt}
		}
	}

	// Second pass: accumulate every size under its group's unique name.
	res := make(map[string]int64, len(names))
	for name, num := range stats {
		res[names[strings.ToLower(name)].uniqueName] += num
	}
	return res
}
|
||||||
|
|
|
@ -156,7 +156,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
|
||||||
sizes[firstExcludedLanguage] = firstExcludedLanguageSize
|
sizes[firstExcludedLanguage] = firstExcludedLanguageSize
|
||||||
}
|
}
|
||||||
|
|
||||||
return sizes, nil
|
return mergeLanguageStats(sizes), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func readFile(f *object.File, limit int64) ([]byte, error) {
|
func readFile(f *object.File, limit int64) ([]byte, error) {
|
||||||
|
|
|
@ -180,7 +180,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
|
||||||
// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
|
// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
|
||||||
// - eg. do the all the detection tests using filename first before reading content.
|
// - eg. do the all the detection tests using filename first before reading content.
|
||||||
language := analyze.GetCodeLanguage(f.Name(), content)
|
language := analyze.GetCodeLanguage(f.Name(), content)
|
||||||
if language == enry.OtherLanguage || language == "" {
|
if language == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -192,8 +192,8 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
|
||||||
|
|
||||||
included, checked := includedLanguage[language]
|
included, checked := includedLanguage[language]
|
||||||
if !checked {
|
if !checked {
|
||||||
langtype := enry.GetLanguageType(language)
|
langType := enry.GetLanguageType(language)
|
||||||
included = langtype == enry.Programming || langtype == enry.Markup
|
included = langType == enry.Programming || langType == enry.Markup
|
||||||
includedLanguage[language] = included
|
includedLanguage[language] = included
|
||||||
}
|
}
|
||||||
if included {
|
if included {
|
||||||
|
@ -210,7 +210,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
|
||||||
sizes[firstExcludedLanguage] = firstExcludedLanguageSize
|
sizes[firstExcludedLanguage] = firstExcludedLanguageSize
|
||||||
}
|
}
|
||||||
|
|
||||||
return sizes, nil
|
return mergeLanguageStats(sizes), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func discardFull(rd *bufio.Reader, discard int64) error {
|
func discardFull(rd *bufio.Reader, discard int64) error {
|
||||||
|
|
|
@ -30,3 +30,17 @@ func TestRepository_GetLanguageStats(t *testing.T) {
|
||||||
"Java": 112,
|
"Java": 112,
|
||||||
}, stats)
|
}, stats)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestMergeLanguageStats(t *testing.T) {
|
||||||
|
assert.EqualValues(t, map[string]int64{
|
||||||
|
"PHP": 1,
|
||||||
|
"python": 10,
|
||||||
|
"JAVA": 700,
|
||||||
|
}, mergeLanguageStats(map[string]int64{
|
||||||
|
"PHP": 1,
|
||||||
|
"python": 10,
|
||||||
|
"Java": 100,
|
||||||
|
"java": 200,
|
||||||
|
"JAVA": 400,
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
|
@ -10,7 +10,7 @@ import (
|
||||||
|
|
||||||
// FallbackErrorf is the last chance to show an error if the logger has internal errors
|
// FallbackErrorf is the last chance to show an error if the logger has internal errors
|
||||||
func FallbackErrorf(format string, args ...any) {
|
func FallbackErrorf(format string, args ...any) {
|
||||||
_, _ = fmt.Fprintf(os.Stderr, format+"\n", args)
|
_, _ = fmt.Fprintf(os.Stderr, format+"\n", args...)
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetLevel() Level {
|
func GetLevel() Level {
|
||||||
|
|
Loading…
Reference in a new issue