diff --git a/doc/manual/rl-next/tarball-fixes.md b/doc/manual/rl-next/tarball-fixes.md new file mode 100644 index 000000000..c938e9db6 --- /dev/null +++ b/doc/manual/rl-next/tarball-fixes.md @@ -0,0 +1,9 @@ +--- +synopsis: "Improve handling of tarballs that don't consist of a single top-level directory" +prs: +- 11195 +--- + +In previous Nix releases, the tarball fetcher (used by `builtins.fetchTarball`) erroneously merged top-level directories into a single directory, and silently discarded top-level files that are not directories. This is no longer the case. The new behaviour is that *only* if the tarball consists of a single directory, the top-level path component of the files in the tarball is removed (similar to `tar`'s `--strip-components=1`). + +Author: [**Eelco Dolstra (@edolstra)**](https://github.com/edolstra) diff --git a/src/libexpr/primops/fetchTree.cc b/src/libexpr/primops/fetchTree.cc index e59d7fe67..f79b6b7b8 100644 --- a/src/libexpr/primops/fetchTree.cc +++ b/src/libexpr/primops/fetchTree.cc @@ -559,11 +559,11 @@ static RegisterPrimOp primop_fetchTarball({ .doc = R"( Download the specified URL, unpack it and return the path of the unpacked tree. The file must be a tape archive (`.tar`) compressed - with `gzip`, `bzip2` or `xz`. The top-level path component of the - files in the tarball is removed, so it is best if the tarball - contains a single directory at top level. The typical use of the - function is to obtain external Nix expression dependencies, such as - a particular version of Nixpkgs, e.g. + with `gzip`, `bzip2` or `xz`. If the tarball consists of a + single directory, then the top-level path component of the files + in the tarball is removed. The typical use of the function is to + obtain external Nix expression dependencies, such as a + particular version of Nixpkgs, e.g. ```nix with import (fetchTarball https://github.com/NixOS/nixpkgs/archive/nixos-14.12.tar.gz) {}; diff --git a/src/libfetchers/git-utils.cc b/src/libfetchers/git-utils.cc index ecc71ae47..114aa4ec0 100644 --- a/src/libfetchers/git-utils.cc +++ b/src/libfetchers/git-utils.cc @@ -126,16 +126,39 @@ Object lookupObject(git_repository * repo, const git_oid & oid, git_object_t typ } template -T peelObject(git_repository * repo, git_object * obj, git_object_t type) +T peelObject(git_object * obj, git_object_t type) { T obj2; if (git_object_peel((git_object * *) (typename T::pointer *) Setter(obj2), obj, type)) { auto err = git_error_last(); - throw Error("peeling Git object '%s': %s", git_object_id(obj), err->message); + throw Error("peeling Git object '%s': %s", *git_object_id(obj), err->message); } return obj2; } +template +T dupObject(typename T::pointer obj) +{ + T obj2; + if (git_object_dup((git_object * *) (typename T::pointer *) Setter(obj2), (git_object *) obj)) + throw Error("duplicating object '%s': %s", *git_object_id((git_object *) obj), git_error_last()->message); + return obj2; +} + +/** + * Peel the specified object (i.e. follow tag and commit objects) to + * either a blob or a tree. + */ +static Object peelToTreeOrBlob(git_object * obj) +{ + /* git_object_peel() doesn't handle blob objects, so handle those + specially. */ + if (git_object_type(obj) == GIT_OBJECT_BLOB) + return dupObject(obj); + else + return peelObject(obj, GIT_OBJECT_TREE); +} + struct GitRepoImpl : GitRepo, std::enable_shared_from_this { /** Location of the repository on disk. */ @@ -166,7 +189,7 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this std::unordered_set done; std::queue todo; - todo.push(peelObject(*this, lookupObject(*this, hashToOID(rev)).get(), GIT_OBJECT_COMMIT)); + todo.push(peelObject(lookupObject(*this, hashToOID(rev)).get(), GIT_OBJECT_COMMIT)); while (auto commit = pop(todo)) { if (!done.insert(*git_commit_id(commit->get())).second) continue; @@ -184,7 +207,7 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this uint64_t getLastModified(const Hash & rev) override { - auto commit = peelObject(*this, lookupObject(*this, hashToOID(rev)).get(), GIT_OBJECT_COMMIT); + auto commit = peelObject(lookupObject(*this, hashToOID(rev)).get(), GIT_OBJECT_COMMIT); return git_commit_time(commit.get()); } @@ -463,6 +486,23 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this return narHash; } + + Hash dereferenceSingletonDirectory(const Hash & oid_) override + { + auto oid = hashToOID(oid_); + + auto _tree = lookupObject(*this, oid, GIT_OBJECT_TREE); + auto tree = (const git_tree *) &*_tree; + + if (git_tree_entrycount(tree) == 1) { + auto entry = git_tree_entry_byindex(tree, 0); + auto mode = git_tree_entry_filemode(entry); + if (mode == GIT_FILEMODE_TREE) + oid = *git_tree_entry_id(entry); + } + + return toHash(oid); + } }; ref GitRepo::openRepo(const std::filesystem::path & path, bool create, bool bare) @@ -476,11 +516,11 @@ ref GitRepo::openRepo(const std::filesystem::path & path, bool create, struct GitSourceAccessor : SourceAccessor { ref repo; - Tree root; + Object root; GitSourceAccessor(ref repo_, const Hash & rev) : repo(repo_) - , root(peelObject(*repo, lookupObject(*repo, hashToOID(rev)).get(), GIT_OBJECT_TREE)) + , root(peelToTreeOrBlob(lookupObject(*repo, hashToOID(rev)).get())) { } @@ -506,7 +546,7 @@ struct GitSourceAccessor : SourceAccessor std::optional maybeLstat(const CanonPath & path) override { if (path.isRoot()) - return Stat { .type = tDirectory }; + return Stat { .type = git_object_type(root.get()) == GIT_OBJECT_TREE ? tDirectory : tRegular }; auto entry = lookup(path); if (!entry) @@ -616,10 +656,10 @@ struct GitSourceAccessor : SourceAccessor std::optional lookupTree(const CanonPath & path) { if (path.isRoot()) { - Tree tree; - if (git_tree_dup(Setter(tree), root.get())) - throw Error("duplicating directory '%s': %s", showPath(path), git_error_last()->message); - return tree; + if (git_object_type(root.get()) == GIT_OBJECT_TREE) + return dupObject((git_tree *) &*root); + else + return std::nullopt; } auto entry = lookup(path); @@ -646,10 +686,10 @@ struct GitSourceAccessor : SourceAccessor std::variant getTree(const CanonPath & path) { if (path.isRoot()) { - Tree tree; - if (git_tree_dup(Setter(tree), root.get())) - throw Error("duplicating directory '%s': %s", showPath(path), git_error_last()->message); - return tree; + if (git_object_type(root.get()) == GIT_OBJECT_TREE) + return dupObject((git_tree *) &*root); + else + throw Error("Git root object '%s' is not a directory", *git_object_id(root.get())); } auto entry = need(path); @@ -669,6 +709,9 @@ struct GitSourceAccessor : SourceAccessor Blob getBlob(const CanonPath & path, bool expectSymlink) { + if (!expectSymlink && git_object_type(root.get()) == GIT_OBJECT_BLOB) + return dupObject((git_blob *) &*root); + auto notExpected = [&]() { throw Error( @@ -782,8 +825,6 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink std::vector pendingDirs; - size_t componentsToStrip = 1; - void pushBuilder(std::string name) { git_treebuilder * b; @@ -839,9 +880,6 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink { std::span pathComponents2{pathComponents}; - if (pathComponents2.size() <= componentsToStrip) return false; - pathComponents2 = pathComponents2.subspan(componentsToStrip); - updateBuilders( isDir ? pathComponents2 @@ -964,7 +1002,8 @@ struct GitFileSystemObjectSinkImpl : GitFileSystemObjectSink git_tree_entry_filemode(entry)); } - Hash sync() override { + Hash sync() override + { updateBuilders({}); auto [oid, _name] = popBuilder(); diff --git a/src/libfetchers/git-utils.hh b/src/libfetchers/git-utils.hh index 495916f62..915252868 100644 --- a/src/libfetchers/git-utils.hh +++ b/src/libfetchers/git-utils.hh @@ -98,6 +98,13 @@ struct GitRepo * serialisation. This is memoised on-disk. */ virtual Hash treeHashToNarHash(const Hash & treeHash) = 0; + + /** + * If the specified Git object is a directory with a single entry + * that is a directory, return the ID of that object. + * Otherwise, return the passed ID unchanged. + */ + virtual Hash dereferenceSingletonDirectory(const Hash & oid) = 0; }; ref getTarballCache(); diff --git a/src/libfetchers/github.cc b/src/libfetchers/github.cc index 5710b94d5..2e914164a 100644 --- a/src/libfetchers/github.cc +++ b/src/libfetchers/github.cc @@ -258,13 +258,14 @@ struct GitArchiveInputScheme : InputScheme fmt("unpacking '%s' into the Git cache", input.to_string())); TarArchive archive { *source }; - auto parseSink = getTarballCache()->getFileSystemObjectSink(); + auto tarballCache = getTarballCache(); + auto parseSink = tarballCache->getFileSystemObjectSink(); auto lastModified = unpackTarfileToSink(archive, *parseSink); act.reset(); TarballInfo tarballInfo { - .treeHash = parseSink->sync(), + .treeHash = tarballCache->dereferenceSingletonDirectory(parseSink->sync()), .lastModified = lastModified }; diff --git a/src/libfetchers/tarball.cc b/src/libfetchers/tarball.cc index 5837cf8a4..457210542 100644 --- a/src/libfetchers/tarball.cc +++ b/src/libfetchers/tarball.cc @@ -167,7 +167,8 @@ DownloadTarballResult downloadTarball( TarArchive{path}; }) : TarArchive{*source}; - auto parseSink = getTarballCache()->getFileSystemObjectSink(); + auto tarballCache = getTarballCache(); + auto parseSink = tarballCache->getFileSystemObjectSink(); auto lastModified = unpackTarfileToSink(archive, *parseSink); act.reset(); @@ -182,7 +183,8 @@ DownloadTarballResult downloadTarball( infoAttrs = cached->value; } else { infoAttrs.insert_or_assign("etag", res->etag); - infoAttrs.insert_or_assign("treeHash", parseSink->sync().gitRev()); + infoAttrs.insert_or_assign("treeHash", + tarballCache->dereferenceSingletonDirectory(parseSink->sync()).gitRev()); infoAttrs.insert_or_assign("lastModified", uint64_t(lastModified)); if (res->immutableUrl) infoAttrs.insert_or_assign("immutableUrl", *res->immutableUrl); diff --git a/tests/functional/tarball.sh b/tests/functional/tarball.sh index ab357ac78..4d8945625 100755 --- a/tests/functional/tarball.sh +++ b/tests/functional/tarball.sh @@ -83,3 +83,20 @@ path="$(nix flake prefetch --json "tarball+file://$(pwd)/tree.tar.gz" | jq -r .s [[ $(cat "$path/a/zzz") = bar ]] [[ $(cat "$path/c/aap") = bar ]] [[ $(cat "$path/fnord") = bar ]] + +# Test a tarball that has multiple top-level directories. +rm -rf "$TEST_ROOT/tar_root" +mkdir -p "$TEST_ROOT/tar_root" "$TEST_ROOT/tar_root/foo" "$TEST_ROOT/tar_root/bar" +tar cvf "$TEST_ROOT/tar.tar" -C "$TEST_ROOT/tar_root" . +path="$(nix flake prefetch --json "tarball+file://$TEST_ROOT/tar.tar" | jq -r .storePath)" +[[ -d "$path/foo" ]] +[[ -d "$path/bar" ]] + +# Test a tarball that has a single regular file. +rm -rf "$TEST_ROOT/tar_root" +mkdir -p "$TEST_ROOT/tar_root" +echo bar > "$TEST_ROOT/tar_root/foo" +chmod +x "$TEST_ROOT/tar_root/foo" +tar cvf "$TEST_ROOT/tar.tar" -C "$TEST_ROOT/tar_root" . +path="$(nix flake prefetch --refresh --json "tarball+file://$TEST_ROOT/tar.tar" | jq -r .storePath)" +[[ $(cat "$path/foo") = bar ]] diff --git a/tests/unit/libfetchers/git-utils.cc b/tests/unit/libfetchers/git-utils.cc index d3547ec6a..de5110cc3 100644 --- a/tests/unit/libfetchers/git-utils.cc +++ b/tests/unit/libfetchers/git-utils.cc @@ -77,7 +77,7 @@ TEST_F(GitUtilsTest, sink_basic) // sink->createHardlink("foo-1.1/links/foo-2", CanonPath("foo-1.1/hello")); - auto result = sink->sync(); + auto result = repo->dereferenceSingletonDirectory(sink->sync()); auto accessor = repo->getAccessor(result, false); auto entries = accessor->readDirectory(CanonPath::root); ASSERT_EQ(entries.size(), 5); @@ -103,7 +103,7 @@ TEST_F(GitUtilsTest, sink_hardlink) sink->createHardlink(CanonPath("foo-1.1/link"), CanonPath("hello")); FAIL() << "Expected an exception"; } catch (const nix::Error & e) { - ASSERT_THAT(e.msg(), testing::HasSubstr("invalid hard link target")); + ASSERT_THAT(e.msg(), testing::HasSubstr("cannot find hard link target")); ASSERT_THAT(e.msg(), testing::HasSubstr("/hello")); ASSERT_THAT(e.msg(), testing::HasSubstr("foo-1.1/link")); }