Add doctor command for full GC of LFS (#21978)

The recent PR adding orphaned checks to the LFS storage is not
sufficient to completely GC LFS, as it is possible for LFSMetaObjects to
remain associated with repos but still need to be garbage collected.

Imagine a situation where a branch is uploaded containing LFS files but
that branch is later completely deleted. The LFSMetaObjects will remain
associated with the Repository but the Repository will no longer contain
any pointers to the object.

This PR adds a second doctor command to perform a full GC.

Signed-off-by: Andrew Thornton <art27@cantab.net>
This commit is contained in:
zeripath 2022-12-15 20:44:16 +00:00 committed by GitHub
parent 3243dbe1a9
commit 651fe4bb7d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 245 additions and 39 deletions

View file

@ -6,6 +6,7 @@ package git
import ( import (
"context" "context"
"fmt" "fmt"
"time"
"code.gitea.io/gitea/models/db" "code.gitea.io/gitea/models/db"
"code.gitea.io/gitea/models/perm" "code.gitea.io/gitea/models/perm"
@ -14,6 +15,7 @@ import (
user_model "code.gitea.io/gitea/models/user" user_model "code.gitea.io/gitea/models/user"
"code.gitea.io/gitea/modules/lfs" "code.gitea.io/gitea/modules/lfs"
"code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil" "code.gitea.io/gitea/modules/timeutil"
"code.gitea.io/gitea/modules/util" "code.gitea.io/gitea/modules/util"
@ -180,6 +182,12 @@ func GetLFSMetaObjectByOid(repoID int64, oid string) (*LFSMetaObject, error) {
// RemoveLFSMetaObjectByOid removes a LFSMetaObject entry from database by its OID. // RemoveLFSMetaObjectByOid removes a LFSMetaObject entry from database by its OID.
// It may return ErrLFSObjectNotExist or a database error. // It may return ErrLFSObjectNotExist or a database error.
func RemoveLFSMetaObjectByOid(repoID int64, oid string) (int64, error) { func RemoveLFSMetaObjectByOid(repoID int64, oid string) (int64, error) {
return RemoveLFSMetaObjectByOidFn(repoID, oid, nil)
}
// RemoveLFSMetaObjectByOidFn removes a LFSMetaObject entry from database by its OID.
// It may return ErrLFSObjectNotExist or a database error. It will run Fn with the current count within the transaction
func RemoveLFSMetaObjectByOidFn(repoID int64, oid string, fn func(count int64) error) (int64, error) {
if len(oid) == 0 { if len(oid) == 0 {
return 0, ErrLFSObjectNotExist return 0, ErrLFSObjectNotExist
} }
@ -200,6 +208,12 @@ func RemoveLFSMetaObjectByOid(repoID int64, oid string) (int64, error) {
return count, err return count, err
} }
if fn != nil {
if err := fn(count); err != nil {
return count, err
}
}
return count, committer.Commit() return count, committer.Commit()
} }
@ -319,3 +333,43 @@ func GetRepoLFSSize(ctx context.Context, repoID int64) (int64, error) {
} }
return lfsSize, nil return lfsSize, nil
} }
type IterateLFSMetaObjectsForRepoOptions struct {
OlderThan time.Time
}
// IterateLFSMetaObjectsForRepo provides a iterator for LFSMetaObjects per Repo
func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(context.Context, *LFSMetaObject, int64) error, opts *IterateLFSMetaObjectsForRepoOptions) error {
var start int
batchSize := setting.Database.IterateBufferSize
engine := db.GetEngine(ctx)
type CountLFSMetaObject struct {
Count int64
LFSMetaObject
}
for {
beans := make([]*CountLFSMetaObject, 0, batchSize)
// SELECT `lfs_meta_object`.*, COUNT(`l1`.id) as `count` FROM lfs_meta_object INNER JOIN lfs_meta_object AS l1 ON l1.oid = lfs_meta_object.oid WHERE lfs_meta_object.repository_id = ? GROUP BY lfs_meta_object.id
sess := engine.Select("`lfs_meta_object`.*, COUNT(`l1`.oid) AS `count`").
Join("INNER", "`lfs_meta_object` AS l1", "`lfs_meta_object`.oid = `l1`.oid").
Where("`lfs_meta_object`.repository_id = ?", repoID)
if !opts.OlderThan.IsZero() {
sess.And("`lfs_meta_object`.created_unix < ?", opts.OlderThan)
}
sess.GroupBy("`lfs_meta_object`.id")
if err := sess.Limit(batchSize, start).Find(&beans); err != nil {
return err
}
if len(beans) == 0 {
return nil
}
start += len(beans)
for _, bean := range beans {
if err := f(ctx, &bean.LFSMetaObject, bean.Count); err != nil {
return err
}
}
}
}

37
modules/doctor/lfs.go Normal file
View file

@ -0,0 +1,37 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package doctor
import (
"context"
"fmt"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/services/repository"
)
func init() {
Register(&Check{
Title: "Garbage collect LFS",
Name: "gc-lfs",
IsDefault: false,
Run: garbageCollectLFSCheck,
AbortIfFailed: false,
SkipDatabaseInitialization: false,
Priority: 1,
})
}
func garbageCollectLFSCheck(ctx context.Context, logger log.Logger, autofix bool) error {
if !setting.LFS.StartServer {
return fmt.Errorf("LFS support is disabled")
}
if err := repository.GarbageCollectLFSMetaObjects(ctx, logger, autofix); err != nil {
return err
}
return checkStorage(&checkStorageOptions{LFS: true})(ctx, logger, autofix)
}

View file

@ -63,7 +63,7 @@ func registerRepoHealthCheck() {
for _, arg := range rhcConfig.Args { for _, arg := range rhcConfig.Args {
args = append(args, git.CmdArg(arg)) args = append(args, git.CmdArg(arg))
} }
return repo_service.GitFsck(ctx, rhcConfig.Timeout, args) return repo_service.GitFsckRepos(ctx, rhcConfig.Timeout, args)
}) })
} }

View file

@ -22,8 +22,8 @@ import (
"xorm.io/builder" "xorm.io/builder"
) )
// GitFsck calls 'git fsck' to check repository health. // GitFsckRepos calls 'git fsck' to check repository health.
func GitFsck(ctx context.Context, timeout time.Duration, args []git.CmdArg) error { func GitFsckRepos(ctx context.Context, timeout time.Duration, args []git.CmdArg) error {
log.Trace("Doing: GitFsck") log.Trace("Doing: GitFsck")
if err := db.Iterate( if err := db.Iterate(
@ -35,15 +35,7 @@ func GitFsck(ctx context.Context, timeout time.Duration, args []git.CmdArg) erro
return db.ErrCancelledf("before fsck of %s", repo.FullName()) return db.ErrCancelledf("before fsck of %s", repo.FullName())
default: default:
} }
log.Trace("Running health check on repository %v", repo) return GitFsckRepo(ctx, repo, timeout, args)
repoPath := repo.RepoPath()
if err := git.Fsck(ctx, repoPath, timeout, args...); err != nil {
log.Warn("Failed to health check repository (%v): %v", repo, err)
if err = system_model.CreateRepositoryNotice("Failed to health check repository (%s): %v", repo.FullName(), err); err != nil {
log.Error("CreateRepositoryNotice: %v", err)
}
}
return nil
}, },
); err != nil { ); err != nil {
log.Trace("Error: GitFsck: %v", err) log.Trace("Error: GitFsck: %v", err)
@ -54,6 +46,19 @@ func GitFsck(ctx context.Context, timeout time.Duration, args []git.CmdArg) erro
return nil return nil
} }
// GitFsckRepo calls 'git fsck' to check an individual repository's health.
func GitFsckRepo(ctx context.Context, repo *repo_model.Repository, timeout time.Duration, args []git.CmdArg) error {
log.Trace("Running health check on repository %-v", repo)
repoPath := repo.RepoPath()
if err := git.Fsck(ctx, repoPath, timeout, args...); err != nil {
log.Warn("Failed to health check repository (%-v): %v", repo, err)
if err = system_model.CreateRepositoryNotice("Failed to health check repository (%s): %v", repo.FullName(), err); err != nil {
log.Error("CreateRepositoryNotice: %v", err)
}
}
return nil
}
// GitGcRepos calls 'git gc' to remove unnecessary files and optimize the local repository // GitGcRepos calls 'git gc' to remove unnecessary files and optimize the local repository
func GitGcRepos(ctx context.Context, timeout time.Duration, args ...git.CmdArg) error { func GitGcRepos(ctx context.Context, timeout time.Duration, args ...git.CmdArg) error {
log.Trace("Doing: GitGcRepos") log.Trace("Doing: GitGcRepos")
@ -68,7 +73,19 @@ func GitGcRepos(ctx context.Context, timeout time.Duration, args ...git.CmdArg)
return db.ErrCancelledf("before GC of %s", repo.FullName()) return db.ErrCancelledf("before GC of %s", repo.FullName())
default: default:
} }
log.Trace("Running git gc on %v", repo) return GitGcRepo(ctx, repo, timeout, args)
},
); err != nil {
return err
}
log.Trace("Finished: GitGcRepos")
return nil
}
// GitGcRepo calls 'git gc' to remove unnecessary files and optimize the local repository
func GitGcRepo(ctx context.Context, repo *repo_model.Repository, timeout time.Duration, args []git.CmdArg) error {
log.Trace("Running git gc on %-v", repo)
command := git.NewCommand(ctx, args...). command := git.NewCommand(ctx, args...).
SetDescription(fmt.Sprintf("Repository Garbage Collection: %s", repo.FullName())) SetDescription(fmt.Sprintf("Repository Garbage Collection: %s", repo.FullName()))
var stdout string var stdout string
@ -86,7 +103,7 @@ func GitGcRepos(ctx context.Context, timeout time.Duration, args ...git.CmdArg)
// Now update the size of the repository // Now update the size of the repository
if err := repo_module.UpdateRepoSize(ctx, repo); err != nil { if err := repo_module.UpdateRepoSize(ctx, repo); err != nil {
log.Error("Updating size as part of garbage collection failed for %v. Stdout: %s\nError: %v", repo, stdout, err) log.Error("Updating size as part of garbage collection failed for %-v. Stdout: %s\nError: %v", repo, stdout, err)
desc := fmt.Sprintf("Updating size as part of garbage collection failed for %s. Stdout: %s\nError: %v", repo.RepoPath(), stdout, err) desc := fmt.Sprintf("Updating size as part of garbage collection failed for %s. Stdout: %s\nError: %v", repo.RepoPath(), stdout, err)
if err = system_model.CreateRepositoryNotice(desc); err != nil { if err = system_model.CreateRepositoryNotice(desc); err != nil {
log.Error("CreateRepositoryNotice: %v", err) log.Error("CreateRepositoryNotice: %v", err)
@ -94,13 +111,6 @@ func GitGcRepos(ctx context.Context, timeout time.Duration, args ...git.CmdArg)
return fmt.Errorf("Updating size as part of garbage collection failed in repo: %s: Error: %w", repo.FullName(), err) return fmt.Errorf("Updating size as part of garbage collection failed in repo: %s: Error: %w", repo.FullName(), err)
} }
return nil
},
); err != nil {
return err
}
log.Trace("Finished: GitGcRepos")
return nil return nil
} }

105
services/repository/lfs.go Normal file
View file

@ -0,0 +1,105 @@
// Copyright 2022 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package repository
import (
"context"
"fmt"
"time"
"code.gitea.io/gitea/models/db"
git_model "code.gitea.io/gitea/models/git"
repo_model "code.gitea.io/gitea/models/repo"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/lfs"
"code.gitea.io/gitea/modules/log"
"xorm.io/builder"
)
func GarbageCollectLFSMetaObjects(ctx context.Context, logger log.Logger, autofix bool) error {
log.Trace("Doing: GarbageCollectLFSMetaObjects")
if err := db.Iterate(
ctx,
builder.And(builder.Gt{"id": 0}),
func(ctx context.Context, repo *repo_model.Repository) error {
return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, logger, autofix)
},
); err != nil {
return err
}
log.Trace("Finished: GarbageCollectLFSMetaObjects")
return nil
}
func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.Repository, logger log.Logger, autofix bool) error {
if logger != nil {
logger.Info("Checking %-v", repo)
}
total, orphaned, collected, deleted := 0, 0, 0, 0
if logger != nil {
defer func() {
if orphaned == 0 {
logger.Info("Found %d total LFSMetaObjects in %-v", total, repo)
} else if !autofix {
logger.Info("Found %d/%d orphaned LFSMetaObjects in %-v", orphaned, total, repo)
} else {
logger.Info("Collected %d/%d orphaned/%d total LFSMetaObjects in %-v. %d removed from storage.", collected, orphaned, total, repo, deleted)
}
}()
}
gitRepo, err := git.OpenRepository(ctx, repo.RepoPath())
if err != nil {
log.Error("Unable to open git repository %-v: %v", repo, err)
return err
}
defer gitRepo.Close()
store := lfs.NewContentStore()
return git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject, count int64) error {
total++
pointerSha := git.ComputeBlobHash([]byte(metaObject.Pointer.StringContent()))
if gitRepo.IsObjectExist(pointerSha.String()) {
return nil
}
orphaned++
if !autofix {
return nil
}
// Non-existent pointer file
_, err = git_model.RemoveLFSMetaObjectByOidFn(repo.ID, metaObject.Oid, func(count int64) error {
if count > 0 {
return nil
}
if err := store.Delete(metaObject.RelativePath()); err != nil {
log.Error("Unable to remove lfs metaobject %s from store: %v", metaObject.Oid, err)
}
deleted++
return nil
})
if err != nil {
return fmt.Errorf("unable to remove meta-object %s in %s: %w", metaObject.Oid, repo.FullName(), err)
}
collected++
return nil
}, &git_model.IterateLFSMetaObjectsForRepoOptions{
// Only attempt to garbage collect lfs meta objects older than a week as the order of git lfs upload
// and git object upload is not necessarily guaranteed. It's possible to imagine a situation whereby
// an LFS object is uploaded but the git branch is not uploaded immediately, or there are some rapid
// changes in new branches that might lead to lfs objects becoming temporarily unassociated with git
// objects.
//
// It is likely that a week is potentially excessive but it should definitely be enough that any
// unassociated LFS object is genuinely unassociated.
OlderThan: time.Now().Add(-24 * 7 * time.Hour),
})
}