From b36857ac8d60cbf9a78c3c69f6370d38a14facbc Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 29 Nov 2023 12:35:08 +0100 Subject: [PATCH] Add a Git-based content-addressed tarball cache GitArchiveInputScheme now streams tarballs into a Git repository. This deduplicates data a lot, e.g. when you're fetching different revisions of the Nixpkgs repo. It also warns if the tree hash returned by GitHub doesn't match the tree hash of the imported tarball. --- src/libfetchers/attrs.cc | 5 + src/libfetchers/attrs.hh | 2 + src/libfetchers/git-utils.cc | 178 +++++++++++++++++++++++++++++++++++ src/libfetchers/git-utils.hh | 10 ++ src/libfetchers/github.cc | 114 ++++++++++++++-------- 5 files changed, 272 insertions(+), 37 deletions(-) diff --git a/src/libfetchers/attrs.cc b/src/libfetchers/attrs.cc index a565d19d4..e3fa1d26a 100644 --- a/src/libfetchers/attrs.cc +++ b/src/libfetchers/attrs.cc @@ -104,4 +104,9 @@ std::map attrsToQuery(const Attrs & attrs) return query; } +Hash getRevAttr(const Attrs & attrs, const std::string & name) +{ + return Hash::parseAny(getStrAttr(attrs, name), htSHA1); +} + } diff --git a/src/libfetchers/attrs.hh b/src/libfetchers/attrs.hh index b9a2c824e..97a74bce0 100644 --- a/src/libfetchers/attrs.hh +++ b/src/libfetchers/attrs.hh @@ -39,4 +39,6 @@ bool getBoolAttr(const Attrs & attrs, const std::string & name); std::map attrsToQuery(const Attrs & attrs); +Hash getRevAttr(const Attrs & attrs, const std::string & name); + } diff --git a/src/libfetchers/git-utils.cc b/src/libfetchers/git-utils.cc index 19eae0e1d..abad42c29 100644 --- a/src/libfetchers/git-utils.cc +++ b/src/libfetchers/git-utils.cc @@ -4,6 +4,7 @@ #include "finally.hh" #include "processes.hh" #include "signals.hh" +#include "users.hh" #include @@ -21,6 +22,9 @@ #include #include +#include "tarfile.hh" +#include + #include #include #include @@ -307,6 +311,158 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this return std::nullopt; } + TarballInfo importTarball(Source & source) override + { + TarArchive archive(source); + + struct PendingDir + { + std::string name; + TreeBuilder builder; + }; + + std::vector pendingDirs; + + auto pushBuilder = [&](std::string name) + { + git_treebuilder * b; + if (git_treebuilder_new(&b, *this, nullptr)) + throw Error("creating a tree builder: %s", git_error_last()->message); + pendingDirs.push_back({ .name = std::move(name), .builder = TreeBuilder(b) }); + }; + + auto popBuilder = [&]() -> std::pair + { + assert(!pendingDirs.empty()); + auto pending = std::move(pendingDirs.back()); + git_oid oid; + if (git_treebuilder_write(&oid, pending.builder.get())) + throw Error("creating a tree object: %s", git_error_last()->message); + pendingDirs.pop_back(); + return {oid, pending.name}; + }; + + auto addToTree = [&](const std::string & name, const git_oid & oid, git_filemode_t mode) + { + assert(!pendingDirs.empty()); + auto & pending = pendingDirs.back(); + if (git_treebuilder_insert(nullptr, pending.builder.get(), name.c_str(), &oid, mode)) + throw Error("adding a file to a tree builder: %s", git_error_last()->message); + }; + + auto updateBuilders = [&](boost::span names) + { + // Find the common prefix of pendingDirs and names. + size_t prefixLen = 0; + for (; prefixLen < names.size() && prefixLen + 1 < pendingDirs.size(); ++prefixLen) + if (names[prefixLen] != pendingDirs[prefixLen + 1].name) + break; + + // Finish the builders that are not part of the common prefix. + for (auto n = pendingDirs.size(); n > prefixLen + 1; --n) { + auto [oid, name] = popBuilder(); + addToTree(name, oid, GIT_FILEMODE_TREE); + } + + // Create builders for the new directories. + for (auto n = prefixLen; n < names.size(); ++n) + pushBuilder(names[n]); + }; + + pushBuilder(""); + + size_t componentsToStrip = 1; + + time_t lastModified = 0; + + for (;;) { + // FIXME: merge with extract_archive + struct archive_entry * entry; + int r = archive_read_next_header(archive.archive, &entry); + if (r == ARCHIVE_EOF) break; + auto path = archive_entry_pathname(entry); + if (!path) + throw Error("cannot get archive member name: %s", archive_error_string(archive.archive)); + if (r == ARCHIVE_WARN) + warn(archive_error_string(archive.archive)); + else + archive.check(r); + + lastModified = std::max(lastModified, archive_entry_mtime(entry)); + + auto pathComponents = tokenizeString>(path, "/"); + + boost::span pathComponents2{pathComponents}; + + if (pathComponents2.size() <= componentsToStrip) continue; + pathComponents2 = pathComponents2.subspan(componentsToStrip); + + updateBuilders( + archive_entry_filetype(entry) == AE_IFDIR + ? pathComponents2 + : pathComponents2.first(pathComponents2.size() - 1)); + + switch (archive_entry_filetype(entry)) { + + case AE_IFDIR: + // Nothing to do right now. + break; + + case AE_IFREG: { + + git_writestream * stream = nullptr; + if (git_blob_create_from_stream(&stream, *this, nullptr)) + throw Error("creating a blob stream object: %s", git_error_last()->message); + + while (true) { + std::vector buf(128 * 1024); + auto n = archive_read_data(archive.archive, buf.data(), buf.size()); + if (n < 0) + throw Error("cannot read file '%s' from tarball", path); + if (n == 0) break; + if (stream->write(stream, (const char *) buf.data(), n)) + throw Error("writing a blob for tarball member '%s': %s", path, git_error_last()->message); + } + + git_oid oid; + if (git_blob_create_from_stream_commit(&oid, stream)) + throw Error("creating a blob object for tarball member '%s': %s", path, git_error_last()->message); + + addToTree(*pathComponents.rbegin(), oid, + archive_entry_mode(entry) & S_IXUSR + ? GIT_FILEMODE_BLOB_EXECUTABLE + : GIT_FILEMODE_BLOB); + + break; + } + + case AE_IFLNK: { + auto target = archive_entry_symlink(entry); + + git_oid oid; + if (git_blob_create_from_buffer(&oid, *this, target, strlen(target))) + throw Error("creating a blob object for tarball symlink member '%s': %s", path, git_error_last()->message); + + addToTree(*pathComponents.rbegin(), oid, GIT_FILEMODE_LINK); + + break; + } + + default: + throw Error("file '%s' in tarball has unsupported file type", path); + } + } + + updateBuilders({}); + + auto [oid, _name] = popBuilder(); + + return TarballInfo { + .treeHash = toHash(oid), + .lastModified = lastModified + }; + } + std::vector> getSubmodules(const Hash & rev) override; std::string resolveSubmoduleUrl( @@ -449,6 +605,22 @@ struct GitRepoImpl : GitRepo, std::enable_shared_from_this else throw Error("Commit signature verification on commit %s failed: %s", rev.gitRev(), output); } + + Hash treeHashToNarHash(const Hash & treeHash) override + { + auto accessor = getAccessor(treeHash); + + fetchers::Attrs cacheKey({{"_what", "treeHashToNarHash"}, {"treeHash", treeHash.gitRev()}}); + + if (auto res = fetchers::getCache()->lookup(cacheKey)) + return Hash::parseAny(fetchers::getStrAttr(*res, "narHash"), htSHA256); + + auto narHash = accessor->hashPath(CanonPath::root); + + fetchers::getCache()->upsert(cacheKey, fetchers::Attrs({{"narHash", narHash.to_string(HashFormat::SRI, true)}})); + + return narHash; + } }; ref GitRepo::openRepo(const CanonPath & path, bool create, bool bare) @@ -673,5 +845,11 @@ std::vector> GitRepoImpl::getSubmodules return result; } +ref getTarballCache() +{ + static CanonPath repoDir(getCacheDir() + "/nix/tarball-cache"); + + return make_ref(repoDir, true, true); +} } diff --git a/src/libfetchers/git-utils.hh b/src/libfetchers/git-utils.hh index 1def82071..b8b31530a 100644 --- a/src/libfetchers/git-utils.hh +++ b/src/libfetchers/git-utils.hh @@ -69,6 +69,8 @@ struct GitRepo time_t lastModified; }; + virtual TarballInfo importTarball(Source & source) = 0; + virtual bool hasObject(const Hash & oid) = 0; virtual ref getAccessor(const Hash & rev) = 0; @@ -85,6 +87,14 @@ struct GitRepo virtual void verifyCommit( const Hash & rev, const std::vector & publicKeys) = 0; + + /** + * Given a Git tree hash, compute the hash of its NAR + * serialisation. This is memoised on-disk. + */ + virtual Hash treeHashToNarHash(const Hash & treeHash) = 0; }; +ref getTarballCache(); + } diff --git a/src/libfetchers/github.cc b/src/libfetchers/github.cc index 661ad4884..877f6378b 100644 --- a/src/libfetchers/github.cc +++ b/src/libfetchers/github.cc @@ -8,6 +8,7 @@ #include "fetchers.hh" #include "fetch-settings.hh" #include "tarball.hh" +#include "git-utils.hh" #include #include @@ -180,49 +181,87 @@ struct GitArchiveInputScheme : InputScheme return headers; } - virtual Hash getRevFromRef(nix::ref store, const Input & input) const = 0; + struct RefInfo + { + Hash rev; + std::optional treeHash; + }; + + virtual RefInfo getRevFromRef(nix::ref store, const Input & input) const = 0; virtual DownloadUrl getDownloadUrl(const Input & input) const = 0; - std::pair fetch(ref store, const Input & _input) override + std::pair downloadArchive(ref store, Input input) const { - Input input(_input); - if (!maybeGetStrAttr(input.attrs, "ref")) input.attrs.insert_or_assign("ref", "HEAD"); + std::optional upstreamTreeHash; + auto rev = input.getRev(); - if (!rev) rev = getRevFromRef(store, input); + if (!rev) { + auto refInfo = getRevFromRef(store, input); + rev = refInfo.rev; + upstreamTreeHash = refInfo.treeHash; + debug("HEAD revision for '%s' is %s", input.to_string(), refInfo.rev.gitRev()); + } input.attrs.erase("ref"); input.attrs.insert_or_assign("rev", rev->gitRev()); - Attrs lockedAttrs({ - {"type", "git-tarball"}, - {"rev", rev->gitRev()}, - }); + auto cache = getCache(); - if (auto res = getCache()->lookup(store, lockedAttrs)) { - input.attrs.insert_or_assign("lastModified", getIntAttr(res->first, "lastModified")); - return {std::move(res->second), input}; + Attrs treeHashKey{{"_what", "gitRevToTreeHash"}, {"rev", rev->gitRev()}}; + Attrs lastModifiedKey{{"_what", "gitRevToLastModified"}, {"rev", rev->gitRev()}}; + + if (auto treeHashAttrs = cache->lookup(treeHashKey)) { + if (auto lastModifiedAttrs = cache->lookup(lastModifiedKey)) { + auto treeHash = getRevAttr(*treeHashAttrs, "treeHash"); + auto lastModified = getIntAttr(*lastModifiedAttrs, "lastModified"); + if (getTarballCache()->hasObject(treeHash)) + return {std::move(input), GitRepo::TarballInfo { .treeHash = treeHash, .lastModified = (time_t) lastModified }}; + else + debug("Git tree with hash '%s' has disappeared from the cache, refetching...", treeHash.gitRev()); + } } + /* Stream the tarball into the tarball cache. */ auto url = getDownloadUrl(input); - auto result = downloadTarball(store, url.url, input.getName(), true, url.headers); + auto source = sinkToSource([&](Sink & sink) { + FileTransferRequest req(url.url); + req.headers = url.headers; + getFileTransfer()->download(std::move(req), sink); + }); - input.attrs.insert_or_assign("lastModified", uint64_t(result.lastModified)); + auto tarballInfo = getTarballCache()->importTarball(*source); - getCache()->add( - store, - lockedAttrs, - { - {"rev", rev->gitRev()}, - {"lastModified", uint64_t(result.lastModified)} - }, - result.storePath, - true); + cache->upsert(treeHashKey, Attrs{{"treeHash", tarballInfo.treeHash.gitRev()}}); + cache->upsert(lastModifiedKey, Attrs{{"lastModified", (uint64_t) tarballInfo.lastModified}}); - return {result.storePath, input}; + if (upstreamTreeHash != tarballInfo.treeHash) + warn( + "Git tree hash mismatch for revision '%s' of '%s': " + "expected '%s', got '%s'. " + "This can happen if the Git repository uses submodules.", + rev->gitRev(), input.to_string(), upstreamTreeHash->gitRev(), tarballInfo.treeHash.gitRev()); + + return {std::move(input), tarballInfo}; + } + + std::pair, Input> getAccessor(ref store, const Input & _input) const override + { + auto [input, tarballInfo] = downloadArchive(store, _input); + + input.attrs.insert_or_assign("treeHash", tarballInfo.treeHash.gitRev()); + input.attrs.insert_or_assign("lastModified", uint64_t(tarballInfo.lastModified)); + + auto accessor = getTarballCache()->getAccessor(tarballInfo.treeHash); + + accessor->setPathDisplay("«" + input.to_string() + "»"); + + accessor->fingerprint = input.getFingerprint(store); + + return {accessor, input}; } std::optional experimentalFeature() const override @@ -269,7 +308,7 @@ struct GitHubInputScheme : GitArchiveInputScheme return getStrAttr(input.attrs, "repo"); } - Hash getRevFromRef(nix::ref store, const Input & input) const override + RefInfo getRevFromRef(nix::ref store, const Input & input) const override { auto host = getHost(input); auto url = fmt( @@ -284,9 +323,10 @@ struct GitHubInputScheme : GitArchiveInputScheme readFile( store->toRealPath( downloadFile(store, url, "source", false, headers).storePath))); - auto rev = Hash::parseAny(std::string { json["sha"] }, htSHA1); - debug("HEAD revision for '%s' is %s", url, rev.gitRev()); - return rev; + return RefInfo { + .rev = Hash::parseAny(std::string { json["sha"] }, htSHA1), + .treeHash = Hash::parseAny(std::string { json["commit"]["tree"]["sha"] }, htSHA1) + }; } DownloadUrl getDownloadUrl(const Input & input) const override @@ -343,7 +383,7 @@ struct GitLabInputScheme : GitArchiveInputScheme return std::make_pair(token.substr(0,fldsplit), token.substr(fldsplit+1)); } - Hash getRevFromRef(nix::ref store, const Input & input) const override + RefInfo getRevFromRef(nix::ref store, const Input & input) const override { auto host = maybeGetStrAttr(input.attrs, "host").value_or("gitlab.com"); // See rate limiting note below @@ -356,9 +396,9 @@ struct GitLabInputScheme : GitArchiveInputScheme readFile( store->toRealPath( downloadFile(store, url, "source", false, headers).storePath))); - auto rev = Hash::parseAny(std::string(json[0]["id"]), htSHA1); - debug("HEAD revision for '%s' is %s", url, rev.gitRev()); - return rev; + return RefInfo { + .rev = Hash::parseAny(std::string(json[0]["id"]), htSHA1) + }; } DownloadUrl getDownloadUrl(const Input & input) const override @@ -402,7 +442,7 @@ struct SourceHutInputScheme : GitArchiveInputScheme // Once it is implemented, however, should work as expected. } - Hash getRevFromRef(nix::ref store, const Input & input) const override + RefInfo getRevFromRef(nix::ref store, const Input & input) const override { // TODO: In the future, when the sourcehut graphql API is implemented for mercurial // and with anonymous access, this method should use it instead. @@ -445,12 +485,12 @@ struct SourceHutInputScheme : GitArchiveInputScheme id = parsedLine->target; } - if(!id) + if (!id) throw BadURL("in '%d', couldn't find ref '%d'", input.to_string(), ref); - auto rev = Hash::parseAny(*id, htSHA1); - debug("HEAD revision for '%s' is %s", fmt("%s/%s", base_url, ref), rev.gitRev()); - return rev; + return RefInfo { + .rev = Hash::parseAny(*id, htSHA1) + }; } DownloadUrl getDownloadUrl(const Input & input) const override