From d88106df24869104cc6c29c726ddfbbfda9dae10 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 27 Oct 2023 18:39:00 +0200 Subject: [PATCH] Git fetcher: Improve submodule handling Instead of making a complete copy of the repo, fetching the submodules, and writing the result to the store (which is all superexpensive), we now fetch the submodules recursively using the Git fetcher, and return a union accessor that "mounts" the accessors for the submodules on top of the root accessor. --- src/libfetchers/git-utils.cc | 78 +++++++++++++++++ src/libfetchers/git-utils.hh | 12 +++ src/libfetchers/git.cc | 109 +++++++----------------- src/libfetchers/union-input-accessor.cc | 80 +++++++++++++++++ src/libfetchers/union-input-accessor.hh | 9 ++ tests/functional/fetchGitSubmodules.sh | 8 -- 6 files changed, 212 insertions(+), 84 deletions(-) create mode 100644 src/libfetchers/union-input-accessor.cc create mode 100644 src/libfetchers/union-input-accessor.hh diff --git a/src/libfetchers/git-utils.cc b/src/libfetchers/git-utils.cc index 68e39580f..5e3e6dae4 100644 --- a/src/libfetchers/git-utils.cc +++ b/src/libfetchers/git-utils.cc @@ -1,11 +1,13 @@ #include "git-utils.hh" #include "input-accessor.hh" #include "cache.hh" +#include "finally.hh" #include #include #include +#include #include #include #include @@ -14,6 +16,7 @@ #include #include #include +#include #include #include @@ -63,6 +66,8 @@ typedef std::unique_ptr> Reference; typedef std::unique_ptr> DescribeResult; typedef std::unique_ptr> StatusList; typedef std::unique_ptr> Remote; +typedef std::unique_ptr> GitConfig; +typedef std::unique_ptr> ConfigIterator; // A helper to ensure that we don't leak objects returned by libgit2. template @@ -256,6 +261,17 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this return std::nullopt; } + std::vector getSubmodules(const Hash & rev) override; + + std::string resolveSubmoduleUrl(const std::string & url) override + { + git_buf buf = GIT_BUF_INIT; + if (git_submodule_resolve_url(&buf, *this, url.c_str())) + throw Error("resolving Git submodule URL '%s'", url); + Finally cleanup = [&]() { git_buf_dispose(&buf); }; + return buf.ptr; + } + bool hasObject(const Hash & oid_) override { auto oid = hashToOID(oid_); @@ -400,6 +416,16 @@ struct GitInputAccessor : InputAccessor return readBlob(path, true); } + Hash getSubmoduleRev(const CanonPath & path) + { + auto entry = need(path); + + if (git_tree_entry_type(entry) != GIT_OBJECT_COMMIT) + throw Error("'%s' is not a submodule", showPath(path)); + + return toHash(*git_tree_entry_id(entry)); + } + std::map lookupCache; /* Recursively look up 'path' relative to the root. */ @@ -495,4 +521,56 @@ ref GitRepoImpl::getAccessor(const Hash & rev) return make_ref(ref(shared_from_this()), rev); } +std::vector GitRepoImpl::getSubmodules(const Hash & rev) +{ + /* Read the .gitmodules files from this revision. */ + CanonPath modulesFile(".gitmodules"); + + auto accessor = getAccessor(rev); + if (!accessor->pathExists(modulesFile)) return {}; + + /* Parse it. */ + auto configS = accessor->readFile(modulesFile); + + auto [fdTemp, pathTemp] = createTempFile("nix-git-submodules"); + writeFull(fdTemp.get(), configS); + + GitConfig config; + if (git_config_open_ondisk(Setter(config), pathTemp.c_str())) + throw Error("parsing .gitmodules file: %s", git_error_last()->message); + + ConfigIterator it; + if (git_config_iterator_glob_new(Setter(it), config.get(), "^submodule\\..*\\.(path|url|branch)$")) + throw Error("iterating over .gitmodules: %s", git_error_last()->message); + + std::map entries; + + while (true) { + git_config_entry * entry = nullptr; + if (auto err = git_config_next(&entry, it.get())) { + if (err == GIT_ITEROVER) break; + throw Error("iterating over .gitmodules: %s", git_error_last()->message); + } + entries.emplace(entry->name + 10, entry->value); + } + + std::vector result; + + for (auto & [key, value] : entries) { + if (!hasSuffix(key, ".path")) continue; + std::string key2(key, 0, key.size() - 5); + auto path = CanonPath(value); + auto rev = accessor.dynamic_pointer_cast()->getSubmoduleRev(path); + result.push_back(Submodule { + .path = path, + .url = entries[key2 + ".url"], + .branch = entries[key2 + ".branch"], + .rev = rev, + }); + } + + return result; +} + + } diff --git a/src/libfetchers/git-utils.hh b/src/libfetchers/git-utils.hh index dd2c06672..55e7ef969 100644 --- a/src/libfetchers/git-utils.hh +++ b/src/libfetchers/git-utils.hh @@ -38,6 +38,18 @@ struct GitRepo /* Get the ref that HEAD points to. */ virtual std::optional getWorkdirRef() = 0; + struct Submodule + { + CanonPath path; + std::string url; + std::string branch; + Hash rev; + }; + + virtual std::vector getSubmodules(const Hash & rev) = 0; + + virtual std::string resolveSubmoduleUrl(const std::string & url) = 0; + struct TarballInfo { Hash treeHash; diff --git a/src/libfetchers/git.cc b/src/libfetchers/git.cc index 55d3a8ebe..42b4aa23a 100644 --- a/src/libfetchers/git.cc +++ b/src/libfetchers/git.cc @@ -8,6 +8,7 @@ #include "util.hh" #include "git.hh" #include "fs-input-accessor.hh" +#include "union-input-accessor.hh" #include "git-utils.hh" #include "fetch-settings.hh" @@ -134,11 +135,6 @@ std::optional readHeadCached(const std::string & actualUrl) return std::nullopt; } -bool isNotDotGitDirectory(const Path & path) -{ - return baseNameOf(path) != ".git"; -} - } // end namespace struct GitInputScheme : InputScheme @@ -413,7 +409,7 @@ struct GitInputScheme : InputScheme std::string name = input.getName(); - auto makeResult2 = [&](const Attrs & infoAttrs, ref accessor) -> std::pair, Input> + auto makeResult = [&](const Attrs & infoAttrs, ref accessor) -> std::pair, Input> { assert(input.getRev()); assert(!origRev || origRev == input.getRev()); @@ -424,18 +420,6 @@ struct GitInputScheme : InputScheme return {accessor, std::move(input)}; }; - auto makeResult = [&](const Attrs & infoAttrs, const StorePath & storePath) -> std::pair, Input> - { - // FIXME: remove? - //input.attrs.erase("narHash"); - auto narHash = store->queryPathInfo(storePath)->narHash; - input.attrs.insert_or_assign("narHash", narHash.to_string(HashFormat::SRI, true)); - - auto accessor = makeStorePathAccessor(store, storePath, makeNotAllowedError(repoInfo.url)); - - return makeResult2(infoAttrs, accessor); - }; - auto originalRef = input.getRef(); auto ref = originalRef ? *originalRef : getDefaultRef(repoInfo); input.attrs.insert_or_assign("ref", ref); @@ -542,66 +526,39 @@ struct GitInputScheme : InputScheme printTalkative("using revision %s of repo '%s'", rev.gitRev(), repoInfo.url); - if (!repoInfo.submodules) { - auto accessor = GitRepo::openRepo(CanonPath(repoDir))->getAccessor(rev); - return makeResult2(infoAttrs, accessor); + auto repo = GitRepo::openRepo(CanonPath(repoDir)); + + auto accessor = repo->getAccessor(rev); + + /* If the repo has submodules, fetch them and return a union + input accessor consisting of the accessor for the top-level + repo and the accessors for the submodules. */ + if (repoInfo.submodules) { + std::map> mounts; + + for (auto & submodule : repo->getSubmodules(rev)) { + auto resolved = repo->resolveSubmoduleUrl(submodule.url); + debug("Git submodule %s: %s %s %s -> %s", + submodule.path, submodule.url, submodule.branch, submodule.rev.gitRev(), resolved); + fetchers::Attrs attrs; + attrs.insert_or_assign("type", "git"); + attrs.insert_or_assign("url", resolved); + if (submodule.branch != "") + attrs.insert_or_assign("ref", submodule.branch); + attrs.insert_or_assign("rev", submodule.rev.gitRev()); + auto submoduleInput = fetchers::Input::fromAttrs(std::move(attrs)); + auto [submoduleAccessor, submoduleInput2] = + submoduleInput.scheme->getAccessor(store, submoduleInput); + mounts.insert_or_assign(submodule.path, submoduleAccessor); + } + + if (!mounts.empty()) { + mounts.insert_or_assign(CanonPath::root, accessor); + accessor = makeUnionInputAccessor(std::move(mounts)); + } } - else { - // FIXME: use libgit2 - Path tmpDir = createTempDir(); - AutoDelete delTmpDir(tmpDir, true); - PathFilter filter = defaultPathFilter; - - Activity act(*logger, lvlChatty, actUnknown, fmt("copying Git tree '%s' to the store", input.to_string())); - - Path tmpGitDir = createTempDir(); - AutoDelete delTmpGitDir(tmpGitDir, true); - - runProgram("git", true, { "-c", "init.defaultBranch=" + gitInitialBranch, "init", tmpDir, "--separate-git-dir", tmpGitDir }); - - { - // TODO: repoDir might lack the ref (it only checks if rev - // exists, see FIXME above) so use a big hammer and fetch - // everything to ensure we get the rev. - Activity act(*logger, lvlTalkative, actUnknown, fmt("making temporary clone of '%s'", repoDir)); - runProgram("git", true, { "-C", tmpDir, "fetch", "--quiet", "--force", - "--update-head-ok", "--", repoDir, "refs/*:refs/*" }, {}, true); - } - - runProgram("git", true, { "-C", tmpDir, "checkout", "--quiet", rev.gitRev() }); - - /* Ensure that we use the correct origin for fetching - submodules. This matters for submodules with relative - URLs. */ - if (repoInfo.isLocal) { - writeFile(tmpGitDir + "/config", readFile(repoDir + "/" + repoInfo.gitDir + "/config")); - - /* Restore the config.bare setting we may have just - copied erroneously from the user's repo. */ - runProgram("git", true, { "-C", tmpDir, "config", "core.bare", "false" }); - } else - runProgram("git", true, { "-C", tmpDir, "config", "remote.origin.url", repoInfo.url }); - - /* As an optimisation, copy the modules directory of the - source repo if it exists. */ - auto modulesPath = repoDir + "/" + repoInfo.gitDir + "/modules"; - if (pathExists(modulesPath)) { - Activity act(*logger, lvlTalkative, actUnknown, fmt("copying submodules of '%s'", repoInfo.url)); - runProgram("cp", true, { "-R", "--", modulesPath, tmpGitDir + "/modules" }); - } - - { - Activity act(*logger, lvlTalkative, actUnknown, fmt("fetching submodules of '%s'", repoInfo.url)); - runProgram("git", true, { "-C", tmpDir, "submodule", "--quiet", "update", "--init", "--recursive" }, {}, true); - } - - filter = isNotDotGitDirectory; - - auto storePath = store->addToStore(name, tmpDir, FileIngestionMethod::Recursive, htSHA256, filter); - - return makeResult(infoAttrs, std::move(storePath)); - } + return makeResult(infoAttrs, accessor); } std::pair, Input> getAccessorFromWorkdir( diff --git a/src/libfetchers/union-input-accessor.cc b/src/libfetchers/union-input-accessor.cc new file mode 100644 index 000000000..940c0e06c --- /dev/null +++ b/src/libfetchers/union-input-accessor.cc @@ -0,0 +1,80 @@ +#include "union-input-accessor.hh" + +namespace nix { + +struct UnionInputAccessor : InputAccessor +{ + std::map> mounts; + + UnionInputAccessor(std::map> _mounts) + : mounts(std::move(_mounts)) + { + // Currently we require a root filesystem. This could be relaxed. + assert(mounts.contains(CanonPath::root)); + + // FIXME: should check that every mount point exists. Or we + // could return dummy parent directories automatically. + } + + std::string readFile(const CanonPath & path) override + { + auto [accessor, subpath] = resolve(path); + return accessor->readFile(subpath); + } + + bool pathExists(const CanonPath & path) override + { + auto [accessor, subpath] = resolve(path); + return accessor->pathExists(subpath); + } + + Stat lstat(const CanonPath & path) override + { + auto [accessor, subpath] = resolve(path); + return accessor->lstat(subpath); + } + + DirEntries readDirectory(const CanonPath & path) override + { + auto [accessor, subpath] = resolve(path); + return accessor->readDirectory(subpath); + } + + std::string readLink(const CanonPath & path) override + { + auto [accessor, subpath] = resolve(path); + return accessor->readLink(subpath); + } + + std::string showPath(const CanonPath & path) override + { + auto [accessor, subpath] = resolve(path); + return accessor->showPath(subpath); + } + + std::pair, CanonPath> resolve(CanonPath path) + { + // Find the nearest parent of `path` that is a mount point. + std::vector ss; + while (true) { + auto i = mounts.find(path); + if (i != mounts.end()) { + auto subpath = CanonPath::root; + for (auto j = ss.rbegin(); j != ss.rend(); ++j) + subpath.push(*j); + return {i->second, std::move(subpath)}; + } + + assert(!path.isRoot()); + ss.push_back(std::string(*path.baseName())); + path.pop(); + } + } +}; + +ref makeUnionInputAccessor(std::map> mounts) +{ + return make_ref(std::move(mounts)); +} + +} diff --git a/src/libfetchers/union-input-accessor.hh b/src/libfetchers/union-input-accessor.hh new file mode 100644 index 000000000..6a1649c1d --- /dev/null +++ b/src/libfetchers/union-input-accessor.hh @@ -0,0 +1,9 @@ +#pragma once + +#include "input-accessor.hh" + +namespace nix { + +ref makeUnionInputAccessor(std::map> mounts); + +} diff --git a/tests/functional/fetchGitSubmodules.sh b/tests/functional/fetchGitSubmodules.sh index df81232e5..369cdc5db 100644 --- a/tests/functional/fetchGitSubmodules.sh +++ b/tests/functional/fetchGitSubmodules.sh @@ -118,11 +118,3 @@ cloneRepo=$TEST_ROOT/a/b/gitSubmodulesClone # NB /a/b to make the relative path git clone $rootRepo $cloneRepo pathIndirect=$(nix eval --raw --expr "(builtins.fetchGit { url = file://$cloneRepo; rev = \"$rev2\"; submodules = true; }).outPath") [[ $pathIndirect = $pathWithRelative ]] - -# Test that if the clone has the submodule already, we're not fetching -# it again. -git -C $cloneRepo submodule update --init -rm $TEST_HOME/.cache/nix/fetcher-cache* -rm -rf $subRepo -pathSubmoduleGone=$(nix eval --raw --expr "(builtins.fetchGit { url = file://$cloneRepo; rev = \"$rev2\"; submodules = true; }).outPath") -[[ $pathSubmoduleGone = $pathWithRelative ]]