From a2a1309fd91957fc2c0d1a338a7c5de81ec3931a Mon Sep 17 00:00:00 2001 From: Winfried Plappert Date: Tue, 18 Feb 2025 16:54:44 +0000 Subject: [PATCH] prune: make small pack size configurable for `prune` all changes together cmd_prune.go: added option `--repack-smaller-than` prune.go: added field `SmallPackBytes` to `PruneOptions`, including checking and processing prune_test.go: added test `TestPruneSmall` doc/060_forget.rst: added description of enhancement changelog/unreleased/issue-5109: description of enhancement --- changelog/unreleased/issue-5109 | 8 +++ cmd/restic/cmd_prune.go | 18 +++++++ doc/060_forget.rst | 24 ++++++--- internal/repository/prune.go | 9 +++- internal/repository/prune_test.go | 84 +++++++++++++++++++++++++++++++ 5 files changed, 134 insertions(+), 9 deletions(-) create mode 100644 changelog/unreleased/issue-5109 diff --git a/changelog/unreleased/issue-5109 b/changelog/unreleased/issue-5109 new file mode 100644 index 000000000..59e2330ef --- /dev/null +++ b/changelog/unreleased/issue-5109 @@ -0,0 +1,8 @@ +Enhancement: Make small pack size configurable for `prune` + +The `prune` command now supports the `--repack-smaller-than` option that +allows repacking pack files smaller than the given size. It has to be used in +conjunction with option `--repack-small`. 
+ +https://github.com/restic/restic/issues/5109 +https://github.com/restic/restic/pull/5183 diff --git a/cmd/restic/cmd_prune.go b/cmd/restic/cmd_prune.go index a613f2255..10b465f39 100644 --- a/cmd/restic/cmd_prune.go +++ b/cmd/restic/cmd_prune.go @@ -67,6 +67,9 @@ type PruneOptions struct { RepackCacheableOnly bool RepackSmall bool RepackUncompressed bool + + SmallPackSize string + SmallPackBytes uint64 } func (opts *PruneOptions) AddFlags(f *pflag.FlagSet) { @@ -81,6 +84,7 @@ func (opts *PruneOptions) AddLimitedFlags(f *pflag.FlagSet) { f.BoolVar(&opts.RepackCacheableOnly, "repack-cacheable-only", false, "only repack packs which are cacheable") f.BoolVar(&opts.RepackSmall, "repack-small", false, "repack pack files below 80% of target pack size") f.BoolVar(&opts.RepackUncompressed, "repack-uncompressed", false, "repack all uncompressed data") + f.StringVar(&opts.SmallPackSize, "repack-smaller-than", "", "pack `below-limit` packfiles (allowed suffixes: k/K, m/M)") } func verifyPruneOptions(opts *PruneOptions) error { @@ -139,6 +143,20 @@ func verifyPruneOptions(opts *PruneOptions) error { } } + if !opts.RepackSmall && opts.SmallPackSize != "" { + return errors.Fatal("option --repack-small and --repack-smaller-than: must be specified together") + } + + opts.SmallPackBytes = uint64(0) + if opts.SmallPackSize != "" { + size, err := ui.ParseBytes(opts.SmallPackSize) + if err != nil { + return errors.Fatalf("invalid number of bytes %q for --repack-smaller-than: %v", opts.SmallPackSize, err) + } + opts.SmallPackBytes = uint64(size) + opts.RepackSmall = true + } + return nil } diff --git a/doc/060_forget.rst b/doc/060_forget.rst index b211148cb..484e0c6bd 100644 --- a/doc/060_forget.rst +++ b/doc/060_forget.rst @@ -88,14 +88,14 @@ command must be run: searching used packs... 
collecting packs for deletion and repacking [0:00] 100.00% 5 / 5 packs processed - + to repack: 69 blobs / 1.078 MiB this removes: 67 blobs / 1.047 MiB to delete: 7 blobs / 25.726 KiB total prune: 74 blobs / 1.072 MiB remaining: 16 blobs / 38.003 KiB unused size after prune: 0 B (0.00% of remaining size) - + repacking packs [0:00] 100.00% 2 / 2 packs repacked rebuilding index @@ -134,14 +134,14 @@ to ``forget``: searching used packs... collecting packs for deletion and repacking [0:00] 100.00% 5 / 5 packs processed - + to repack: 69 blobs / 1.078 MiB this removes 67 blobs / 1.047 MiB to delete: 7 blobs / 25.726 KiB total prune: 74 blobs / 1.072 MiB remaining: 16 blobs / 38.003 KiB unused size after prune: 0 B (0.00% of remaining size) - + repacking packs [0:00] 100.00% 2 / 2 packs repacked rebuilding index @@ -214,7 +214,7 @@ The ``forget`` command accepts the following policy options: run) and these snapshots will hence not be removed. .. note:: If there are not enough snapshots to keep one for each duration related - ``--keep-{within-,}*`` option, the oldest snapshot is kept additionally and + ``--keep-{within-,}*`` option, the oldest snapshot is kept additionally and marked as ``oldest`` in the output (e.g. ``oldest hourly snapshot``). .. note:: Specifying ``--keep-tag ''`` will match untagged snapshots only. @@ -331,7 +331,7 @@ kept, depending on whether one of them ends up being the same as an already kept snapshot. All other snapshots are removed. You might want to maintain the same policy as in the example above, but have -irregular backups. For example, the 7 snapshots specified with ``--keep-daily 7`` +irregular backups. For example, the 7 snapshots specified with ``--keep-daily 7`` might be spread over a longer period. 
If what you want is to keep daily snapshots for the last week, weekly for the last month, monthly for the last year and yearly for the last 75 years, you can instead specify ``forget @@ -448,13 +448,13 @@ The ``prune`` command accepts the following options: you want to minimize the time and bandwidth used by the ``prune`` operation. Note that metadata will still be repacked. - Restic tries to repack as little data as possible while still ensuring this + Restic tries to repack as little data as possible while still ensuring this limit for unused data. The default value is 5%. - ``--max-repack-size size`` if set limits the total size of files to repack. As ``prune`` first stores all repacked files and deletes the obsolete files at the end, this option might be handy if you expect many files to be repacked and fear to run low - on storage. + on storage. - ``--repack-cacheable-only`` if set to true only files which contain metadata and would be stored in the cache are repacked. Other pack files are @@ -463,6 +463,14 @@ The ``prune`` command accepts the following options: your repository exceeds the value given by ``--max-unused``. The default value is false. +- ``--repack-small`` if set will repack pack files below 80% of target pack size. + The default value is false. + +- ``--repack-smaller-than`` in conjunction with ``--repack-small`` will repack all + packfiles smaller than the given size. This allows + packfiles which initially came from a repository with a smaller ``--pack-size`` + to be compacted into larger packfiles. + - ``--dry-run`` only show what ``prune`` would do. - ``--verbose`` increased verbosity shows additional statistics for ``prune``. 
diff --git a/internal/repository/prune.go b/internal/repository/prune.go index ba13ba1a3..e22dd4c20 100644 --- a/internal/repository/prune.go +++ b/internal/repository/prune.go @@ -24,6 +24,7 @@ type PruneOptions struct { MaxUnusedBytes func(used uint64) (unused uint64) // calculates the number of unused bytes after repacking, according to MaxUnused MaxRepackBytes uint64 + SmallPackBytes uint64 RepackCacheableOnly bool RepackSmall bool @@ -104,6 +105,9 @@ func PlanPrune(ctx context.Context, opts PruneOptions, repo *Repository, getUsed if repo.Config().Version < 2 && opts.RepackUncompressed { return nil, fmt.Errorf("compression requires at least repository format version 2") } + if opts.SmallPackBytes > uint64(repo.packSize()) { + return nil, fmt.Errorf("repack-smaller-than exceeds repository packsize") + } usedBlobs := index.NewAssociatedSet[uint8](repo.idx) err := getUsedBlobs(ctx, repo, usedBlobs) @@ -326,7 +330,9 @@ func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository, repoVersion := repo.Config().Version // only repack very small files by default targetPackSize := repo.packSize() / 25 - if opts.RepackSmall { + if opts.SmallPackBytes > 0 { + targetPackSize = uint(opts.SmallPackBytes) + } else if opts.RepackSmall { // consider files with at least 80% of the target size as large enough targetPackSize = repo.packSize() / 5 * 4 } @@ -402,6 +408,7 @@ func decidePackAction(ctx context.Context, opts PruneOptions, repo *Repository, bar.Add(1) return nil }) + bar.Done() if err != nil { return PrunePlan{}, err diff --git a/internal/repository/prune_test.go b/internal/repository/prune_test.go index 3234622f4..63aa939cf 100644 --- a/internal/repository/prune_test.go +++ b/internal/repository/prune_test.go @@ -2,6 +2,7 @@ package repository_test import ( "context" + "fmt" "math" "math/rand" "testing" @@ -9,6 +10,7 @@ import ( "github.com/restic/restic/internal/checker" "github.com/restic/restic/internal/repository" + 
"github.com/restic/restic/internal/repository/pack" "github.com/restic/restic/internal/restic" rtest "github.com/restic/restic/internal/test" "github.com/restic/restic/internal/ui/progress" @@ -191,3 +193,85 @@ func TestPruneMaxUnusedDuplicate(t *testing.T) { rtest.Equals(t, rsize.Unref, uint64(0)) rtest.Equals(t, rsize.Uncompressed, uint64(0)) } + +/* +1.) create repository with packsize of 2M. +2.) create enough data for 11 packfiles (31 packs) +3.) run a repository.PlanPrune(...) with a packsize of 16M (current default). +4.) run plan.Execute(...), extract plan.Stats() and check. +5.) Check that all blobs are contained in the new packfiles. +6.) The result should be less packfiles than before +*/ +func TestPruneSmall(t *testing.T) { + seed := time.Now().UnixNano() + random := rand.New(rand.NewSource(seed)) + t.Logf("rand initialized with seed %d", seed) + + be := repository.TestBackend(t) + repo, _ := repository.TestRepositoryWithBackend(t, be, 0, repository.Options{PackSize: repository.MinPackSize}) + + const blobSize = 1000 * 1000 + const numBlobsCreated = 55 + + var wg errgroup.Group + repo.StartPackUploader(context.TODO(), &wg) + keep := restic.NewBlobSet() + // we need a minum of 11 packfiles, each packfile will be about 5 Mb long + for i := 0; i < numBlobsCreated; i++ { + buf := make([]byte, blobSize) + random.Read(buf) + + id, _, _, err := repo.SaveBlob(context.TODO(), restic.DataBlob, buf, restic.ID{}, false) + rtest.OK(t, err) + keep.Insert(restic.BlobHandle{Type: restic.DataBlob, ID: id}) + } + + rtest.OK(t, repo.Flush(context.Background())) + + // gather number of packfiles + repoPacks, err := pack.Size(context.TODO(), repo, false) + rtest.OK(t, err) + lenPackfilesBefore := len(repoPacks) + rtest.OK(t, repo.Close()) + + // and reopen repository with default packsize + repo = repository.TestOpenBackend(t, be) + rtest.OK(t, repo.LoadIndex(context.TODO(), nil)) + + opts := repository.PruneOptions{ + MaxRepackBytes: math.MaxUint64, + MaxUnusedBytes: 
func(used uint64) (unused uint64) { return blobSize / 4 }, + SmallPackBytes: 5 * 1024 * 1024, + RepackSmall: true, + } + plan, err := repository.PlanPrune(context.TODO(), opts, repo, func(ctx context.Context, repo restic.Repository, usedBlobs restic.FindBlobSet) error { + for blob := range keep { + usedBlobs.Insert(blob) + } + return nil + }, &progress.NoopPrinter{}) + rtest.OK(t, err) + rtest.OK(t, plan.Execute(context.TODO(), &progress.NoopPrinter{})) + + stats := plan.Stats() + rtest.Equals(t, stats.Size.Used/blobSize, uint64(numBlobsCreated), fmt.Sprintf("total size of blobs should be %d but is %d", + numBlobsCreated, stats.Size.Used/blobSize)) + rtest.Equals(t, stats.Blobs.Used, stats.Blobs.Repack, "the number of blobs should be identical after a repack") + + // repopen repository + repo = repository.TestOpenBackend(t, be) + checker.TestCheckRepo(t, repo, true) + + // load all blobs + for blob := range keep { + _, err := repo.LoadBlob(context.TODO(), blob.Type, blob.ID, nil) + rtest.OK(t, err) + } + + repoPacks, err = pack.Size(context.TODO(), repo, false) + rtest.OK(t, err) + lenPackfilesAfter := len(repoPacks) + + rtest.Equals(t, lenPackfilesBefore > lenPackfilesAfter, true, + fmt.Sprintf("the number packfiles before %d and after repack %d", lenPackfilesBefore, lenPackfilesAfter)) +}