Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 25 additions & 30 deletions path-walk.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,24 @@ static void push_to_stack(struct path_walk_context *ctx,
prio_queue_put(&ctx->path_stack, xstrdup(path));
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the Git mailing list, Elijah Newren wrote (reply to this):

On Wed, Aug 20, 2025 at 11:40 AM Derrick Stolee via GitGitGadget
<[email protected]> wrote:
>
> From: Derrick Stolee <[email protected]>
>
> The previous change fixed a bug in 'git repack -adf --path-walk' that
> was due to an update to how path lists are initialized and missing some
> important cases when processing the pending objects.
>
> This change takes the three critical places where path lists are
> initialized and combines them into a static method. This simplifies the
> callers somewhat while also helping to avoid a missed update in the
> future.
>
> The other places where a path list (struct type_and_oid_list) is
> initialized is for the following "fixed" lists:
>
>  * Tag objects.
>  * Commit objects.
>  * Root trees.
>  * Tagged trees.
>  * Tagged blobs.
>
> These lists are created and consumed in different ways, with only the
> root trees being passed into the logic that cares about the
> "maybe_interesting" bit. It is appropriate to keep these uses separate.
>
> Signed-off-by: Derrick Stolee <[email protected]>
> ---
>  path-walk.c | 57 +++++++++++++++++++++++------------------------------
>  1 file changed, 25 insertions(+), 32 deletions(-)
>
> diff --git a/path-walk.c b/path-walk.c
> index 1215ed398f4f..f1ceed99e94c 100644
> --- a/path-walk.c
> +++ b/path-walk.c
> @@ -105,6 +105,24 @@ static void push_to_stack(struct path_walk_context *ctx,
>         prio_queue_put(&ctx->path_stack, xstrdup(path));
>  }
>
> +static void add_path_to_list(struct path_walk_context *ctx,
> +                            const char *path,
> +                            enum object_type type,
> +                            struct object_id *oid,
> +                            int interesting)
> +{
> +       struct type_and_oid_list *list = strmap_get(&ctx->paths_to_lists, path);
> +
> +       if (!list) {
> +               CALLOC_ARRAY(list, 1);
> +               list->type = type;
> +               strmap_put(&ctx->paths_to_lists, path, list);
> +       }
> +
> +       list->maybe_interesting |= interesting;
> +       oid_array_append(&list->oids, oid);
> +}
> +
>  static int add_tree_entries(struct path_walk_context *ctx,
>                             const char *base_path,
>                             struct object_id *oid)
> @@ -129,7 +147,6 @@ static int add_tree_entries(struct path_walk_context *ctx,
>
>         init_tree_desc(&desc, &tree->object.oid, tree->buffer, tree->size);
>         while (tree_entry(&desc, &entry)) {
> -               struct type_and_oid_list *list;
>                 struct object *o;
>                 /* Not actually true, but we will ignore submodules later. */
>                 enum object_type type = S_ISDIR(entry.mode) ? OBJ_TREE : OBJ_BLOB;
> @@ -190,17 +207,10 @@ static int add_tree_entries(struct path_walk_context *ctx,
>                                 continue;
>                 }
>
> -               if (!(list = strmap_get(&ctx->paths_to_lists, path.buf))) {
> -                       CALLOC_ARRAY(list, 1);
> -                       list->type = type;
> -                       strmap_put(&ctx->paths_to_lists, path.buf, list);
> -               }
> -               push_to_stack(ctx, path.buf);
> -
> -               if (!(o->flags & UNINTERESTING))
> -                       list->maybe_interesting = 1;
> +               add_path_to_list(ctx, path.buf, type, &entry.oid,
> +                                !(o->flags & UNINTERESTING));
>
> -               oid_array_append(&list->oids, &entry.oid);
> +               push_to_stack(ctx, path.buf);
>         }
>
>         free_tree_buffer(tree);
> @@ -377,16 +387,9 @@ static int setup_pending_objects(struct path_walk_info *info,
>                         if (!info->trees)
>                                 continue;
>                         if (pending->path) {
> -                               struct type_and_oid_list *list;
>                                 char *path = *pending->path ? xstrfmt("%s/", pending->path)
>                                                             : xstrdup("");
> -                               if (!(list = strmap_get(&ctx->paths_to_lists, path))) {
> -                                       CALLOC_ARRAY(list, 1);
> -                                       list->type = OBJ_TREE;
> -                                       strmap_put(&ctx->paths_to_lists, path, list);
> -                               }
> -                               list->maybe_interesting = 1;
> -                               oid_array_append(&list->oids, &obj->oid);
> +                               add_path_to_list(ctx, path, OBJ_TREE, &obj->oid, 1);
>                                 free(path);
>                         } else {
>                                 /* assume a root tree, such as a lightweight tag. */
> @@ -397,20 +400,10 @@ static int setup_pending_objects(struct path_walk_info *info,
>                 case OBJ_BLOB:
>                         if (!info->blobs)
>                                 continue;
> -                       if (pending->path) {
> -                               struct type_and_oid_list *list;
> -                               char *path = pending->path;
> -                               if (!(list = strmap_get(&ctx->paths_to_lists, path))) {
> -                                       CALLOC_ARRAY(list, 1);
> -                                       list->type = OBJ_BLOB;
> -                                       strmap_put(&ctx->paths_to_lists, path, list);
> -                               }
> -                               list->maybe_interesting = 1;
> -                               oid_array_append(&list->oids, &obj->oid);
> -                       } else {
> -                               /* assume a root tree, such as a lightweight tag. */
> +                       if (pending->path)
> +                               add_path_to_list(ctx, pending->path, OBJ_BLOB, &obj->oid, 1);
> +                       else
>                                 oid_array_append(&tagged_blobs->oids, &obj->oid);
> -                       }
>                         break;
>
>                 case OBJ_COMMIT:
> --
> gitgitgadget

Looks like a straightforward collapsing of common code into a new function.

}

static void add_path_to_list(struct path_walk_context *ctx,
const char *path,
enum object_type type,
struct object_id *oid,
int interesting)
{
struct type_and_oid_list *list = strmap_get(&ctx->paths_to_lists, path);

if (!list) {
CALLOC_ARRAY(list, 1);
list->type = type;
strmap_put(&ctx->paths_to_lists, path, list);
}

list->maybe_interesting |= interesting;
oid_array_append(&list->oids, oid);
}

static int add_tree_entries(struct path_walk_context *ctx,
const char *base_path,
struct object_id *oid)
Expand All @@ -129,7 +147,6 @@ static int add_tree_entries(struct path_walk_context *ctx,

init_tree_desc(&desc, &tree->object.oid, tree->buffer, tree->size);
while (tree_entry(&desc, &entry)) {
struct type_and_oid_list *list;
struct object *o;
/* Not actually true, but we will ignore submodules later. */
enum object_type type = S_ISDIR(entry.mode) ? OBJ_TREE : OBJ_BLOB;
Expand Down Expand Up @@ -190,17 +207,10 @@ static int add_tree_entries(struct path_walk_context *ctx,
continue;
}

if (!(list = strmap_get(&ctx->paths_to_lists, path.buf))) {
CALLOC_ARRAY(list, 1);
list->type = type;
strmap_put(&ctx->paths_to_lists, path.buf, list);
}
push_to_stack(ctx, path.buf);

if (!(o->flags & UNINTERESTING))
list->maybe_interesting = 1;
add_path_to_list(ctx, path.buf, type, &entry.oid,
!(o->flags & UNINTERESTING));

oid_array_append(&list->oids, &entry.oid);
push_to_stack(ctx, path.buf);
}

free_tree_buffer(tree);
Expand Down Expand Up @@ -377,15 +387,9 @@ static int setup_pending_objects(struct path_walk_info *info,
if (!info->trees)
continue;
if (pending->path) {
struct type_and_oid_list *list;
char *path = *pending->path ? xstrfmt("%s/", pending->path)
: xstrdup("");
if (!(list = strmap_get(&ctx->paths_to_lists, path))) {
CALLOC_ARRAY(list, 1);
list->type = OBJ_TREE;
strmap_put(&ctx->paths_to_lists, path, list);
}
oid_array_append(&list->oids, &obj->oid);
add_path_to_list(ctx, path, OBJ_TREE, &obj->oid, 1);
free(path);
} else {
/* assume a root tree, such as a lightweight tag. */
Expand All @@ -396,19 +400,10 @@ static int setup_pending_objects(struct path_walk_info *info,
case OBJ_BLOB:
if (!info->blobs)
continue;
if (pending->path) {
struct type_and_oid_list *list;
char *path = pending->path;
if (!(list = strmap_get(&ctx->paths_to_lists, path))) {
CALLOC_ARRAY(list, 1);
list->type = OBJ_BLOB;
strmap_put(&ctx->paths_to_lists, path, list);
}
oid_array_append(&list->oids, &obj->oid);
} else {
/* assume a root tree, such as a lightweight tag. */
if (pending->path)
add_path_to_list(ctx, pending->path, OBJ_BLOB, &obj->oid, 1);
else
oid_array_append(&tagged_blobs->oids, &obj->oid);
}
break;

case OBJ_COMMIT:
Expand Down
63 changes: 63 additions & 0 deletions t/t7700-repack.sh
Original file line number Diff line number Diff line change
Expand Up @@ -838,4 +838,67 @@ test_expect_success '-n overrides repack.updateServerInfo=true' '
test_server_info_missing
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the Git mailing list, Patrick Steinhardt wrote (reply to this):

On Wed, Aug 20, 2025 at 06:39:54PM +0000, Derrick Stolee via GitGitGadget wrote:
> diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
> index 611755cc139b..1998d9bf291c 100755
> --- a/t/t7700-repack.sh
> +++ b/t/t7700-repack.sh
> @@ -838,4 +838,47 @@ test_expect_success '-n overrides repack.updateServerInfo=true' '

Tiny nit: I would've probably squashed this patch into the second patch,
as we usually don't use the add-failing-test-and-then-fix-it-later
dance. On the other hand though it gives some nice context, so I
ultimately don't mind it all that much. So please feel free to ignore
this nit.

>  	test_server_info_missing
>  '
>  
> +test_expect_failure 'pending objects are repacked appropriately' '
> +	git init pending &&

We probably also want `test_when_finished "rm -rf pending"` before
calling git-init(1).

> +
> +	(
> +		cd pending &&
> +
> +		mkdir -p a/b &&
> +		echo singleton >file &&
> +		echo stuff >a/b/c &&
> +		echo more >a/d &&
> +		git add file a &&
> +		git commit -m "single blobs" &&
> +
> +		echo d >a/d &&
> +		echo e >a/e &&
> +		git add a &&
> +		git commit -m "more blobs" &&
> +
> +		# This use of a sparse index helps to force
> +		# test that the cache-tree is walked, too.
> +		git sparse-checkout set --sparse-index a x &&
> +
> +		# Just _stage_ the changes.
> +		echo f >a/d &&
> +		echo h >a/e &&
> +		echo i >a/i &&
> +		mkdir x &&
> +		echo y >x/y &&
> +		git add a x &&

Nit: I think I would've moved the explanations you have in the commit
message into these hunks so that the test becomes a bit more
self-explanatory.

> +		# Bring the loose objects into a packfile to avoid
> +		# leftovers in next test. Without this, the loose
> +		# objects persist and the test succeeds for other
> +		# reasons.
> +		git repack -adf &&
> +		git fsck &&
> +
> +		# Test path walk version with pack.useSparse.
> +		git -c pack.useSparse=true repack -adf --path-walk &&
> +		git fsck
> +	)
> +'

Patrick

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the Git mailing list, Derrick Stolee wrote (reply to this):

On 8/21/2025 4:00 AM, Patrick Steinhardt wrote:
> On Wed, Aug 20, 2025 at 06:39:54PM +0000, Derrick Stolee via GitGitGadget wrote:
>> diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
>> index 611755cc139b..1998d9bf291c 100755
>> --- a/t/t7700-repack.sh
>> +++ b/t/t7700-repack.sh
>> @@ -838,4 +838,47 @@ test_expect_success '-n overrides repack.updateServerInfo=true' '
> 
> Tiny nit: I would've probably squashed this patch into the second patch,
> as we usually don't use the add-failing-test-and-then-fix-it-later
> dance. On the other hand though it gives some nice context, so I
> ultimately don't mind it all that much. So please feel free to ignore
> this nit.

I'm probably the person who is always asking folks to create a test
that either fails or demonstrates the "before" behavior before making
the actual change that updates the case. This allows the ability to
"test the test" by checking it in place to confirm that it is indeed
failing.

Using test_expect_failure allows us to avoid breaking bisect. 
>>  	test_server_info_missing
>>  '
>>  
>> +test_expect_failure 'pending objects are repacked appropriately' '
>> +	git init pending &&
> 
> We probably also want `test_when_finished "rm -rf pending"` before
> calling git-init(1).

Good idea.

>> +
>> +	(
>> +		cd pending &&
>> +
>> +		mkdir -p a/b &&
>> +		echo singleton >file &&
>> +		echo stuff >a/b/c &&
>> +		echo more >a/d &&
>> +		git add file a &&
>> +		git commit -m "single blobs" &&
>> +
>> +		echo d >a/d &&
>> +		echo e >a/e &&
>> +		git add a &&
>> +		git commit -m "more blobs" &&
>> +
>> +		# This use of a sparse index helps to force
>> +		# test that the cache-tree is walked, too.
>> +		git sparse-checkout set --sparse-index a x &&
>> +
>> +		# Just _stage_ the changes.
>> +		echo f >a/d &&
>> +		echo h >a/e &&
>> +		echo i >a/i &&
>> +		mkdir x &&
>> +		echo y >x/y &&
>> +		git add a x &&
> 
> Nit: I think I would've moved the explanations you have in the commit
> message into these hunks so that the test becomes a bit more
> self-explanatory.

Hm. That seems to go against our typical pattern of leaving comments
sparse and having the longer explanation available in commit messages
but maybe I'm out of date or tests are a different beast. I'll see
what I can do to make the test more self-documented.

Thanks,
-Stolee

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the Git mailing list, Junio C Hamano wrote (reply to this):

Derrick Stolee <[email protected]> writes:

> On 8/21/2025 4:00 AM, Patrick Steinhardt wrote:
>> On Wed, Aug 20, 2025 at 06:39:54PM +0000, Derrick Stolee via GitGitGadget wrote:
>>> diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh
>>> index 611755cc139b..1998d9bf291c 100755
>>> --- a/t/t7700-repack.sh
>>> +++ b/t/t7700-repack.sh
>>> @@ -838,4 +838,47 @@ test_expect_success '-n overrides repack.updateServerInfo=true' '
>> 
>> Tiny nit: I would've probably squashed this patch into the second patch,
>> as we usually don't use the add-failing-test-and-then-fix-it-later
>> dance. On the other hand though it gives some nice context, so I
>> ultimately don't mind it all that much. So please feel free to ignore
>> this nit.
>
> I'm probably the person who is always asking folks to create a test
> that either fails or demonstrates the "before" behavior before making
> the actual change that updates the case. This allows the ability to
> "test the test" by checking it in place to confirm that it is indeed
> failing.
> Using test_expect_failure allows us to avoid breaking bisect. 

Yes, you can develop that way, but on the reviewing and receiving
end, the second patch that shows only the change from expect_failure
to expect_success pushes the more important "what behaviour was this
thing testing?" out of the hunk context, if you split them into two.

If we really wanted to verify the claim that without the fix this
was broken and we have a test to demonstrate a failure on the
receiving end, we can "checkout" paths touched by that commit
outside t/ from HEAD^ to build to see how the system behaved without
the code change just fine, so such a split does not buy us much.

Unless there is a strong reason not to, please always present such
test in the same patch as the code change to fix that breakage.

Thanks.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the Git mailing list, Elijah Newren wrote (reply to this):

On Wed, Aug 20, 2025 at 11:39 AM Derrick Stolee via GitGitGadget
<[email protected]> wrote:
>
> From: Derrick Stolee <[email protected]>
>
> Users reported an issue where objects were missing from their local
> enlistments after a full repack using 'git repack -adf --path-walk'.

What is an enlistment?

> This was alarming, but took a while to create a reproducer.

but => and ?

> The root cause is that certain objects existed in the index and had no
> second versions. These objects are usually blobs, though trees can be
> included if a cache-tree exists. The issue is that the revision walk
> adds these objects to the "pending" list and the path-walk API forgets
> to mark the lists it creates at this point as "maybe_interesting". If
> these paths only ever have a single version in the history of the repo
> (including the current staged version) then the parent directory never
> tries to add a new object to the list and mark the list as
> "maybe_interesting". Thus, when walking the list later, the group is
> skipped as it is expected that no objects are interesting. This happens
> even when there are actually no UNINTERESTING objects at all! This is
> based on the optimization enabled by the pack.useSparse=true config
> option, which is the default.
>
> Thus, we create a test case that demonstrates the many cases of this
> issue for reproducibility:
>
>  1. File a/b/c has only one committed version.
>  2. Files a/i and x/y only exists as staged changes.

exists => exist


I didn't have any questions or spot any issues on the rest of the patch.

'

test_expect_success 'pending objects are repacked appropriately' '
test_when_finished rm -rf pending &&
git init pending &&

(
cd pending &&

# Commit file, a/b/c and never change them.
mkdir -p a/b &&
echo singleton >file &&
echo stuff >a/b/c &&
echo more >a/d &&
git add file a &&
git commit -m "single blobs" &&

# Files a/d and a/e will not be singletons.
echo d >a/d &&
echo e >a/e &&
git add a &&
git commit -m "more blobs" &&

# This use of a sparse index helps to force
# test that the cache-tree is walked, too.
git sparse-checkout set --sparse-index a x &&

# Create staged changes:
# * a/e now has multiple versions.
# * a/i now has only one version.
echo f >a/d &&
echo h >a/e &&
echo i >a/i &&
git add a &&

# Stage and unstage a change to make use of
# resolve-undo cache and how that impacts fsck.
mkdir x &&
echo y >x/y &&
git add x &&
xy=$(git rev-parse :x/y) &&
git rm --cached x/y &&

# The blob for x/y must persist through repacks,
# but fsck currently ignores the REUC extension
# for finding links to the blob.
cat >expect <<-EOF &&
dangling blob $xy
EOF

# Bring the loose objects into a packfile to avoid
# leftovers in next test. Without this, the loose
# objects persist and the test succeeds for other
# reasons.
git repack -adf &&
git fsck >out &&
test_cmp expect out &&

# Test path walk version with pack.useSparse.
git -c pack.useSparse=true repack -adf --path-walk &&
git fsck >out &&
test_cmp expect out
)
'

test_done
Loading