From d4ad1fcf303f6f34ebb30a82ebe6f99c26bef8cb Mon Sep 17 00:00:00 2001 From: John Ericson Date: Thu, 18 Jan 2024 23:57:26 -0500 Subject: [PATCH] Avoid creating temporary store object for git over the wire Instead, serialize as NAR and send that over, then rehash sever side. This is alorithmically simpler, but comes at the cost of a newer parameter to `Store::addToStoreFromDump`. Co-authored-by: Eelco Dolstra --- src/libexpr/primops.cc | 2 +- src/libstore/binary-cache-store.cc | 27 +++++--- src/libstore/binary-cache-store.hh | 3 +- src/libstore/build/local-derivation-goal.cc | 5 +- src/libstore/daemon.cc | 59 ++++++++-------- src/libstore/derivations.cc | 2 +- src/libstore/dummy-store.cc | 3 +- src/libstore/legacy-ssh-store.hh | 3 +- src/libstore/local-store.cc | 77 ++++++++------------- src/libstore/local-store.hh | 3 +- src/libstore/remote-store.cc | 20 +++++- src/libstore/remote-store.hh | 3 +- src/libstore/store-api.cc | 47 ++++--------- src/libstore/store-api.hh | 17 +++-- src/nix-env/user-env.cc | 2 +- src/nix/develop.cc | 2 +- 16 files changed, 137 insertions(+), 138 deletions(-) diff --git a/src/libexpr/primops.cc b/src/libexpr/primops.cc index 9ea266cf9..78f7f71ed 100644 --- a/src/libexpr/primops.cc +++ b/src/libexpr/primops.cc @@ -2092,7 +2092,7 @@ static void prim_toFile(EvalState & state, const PosIdx pos, Value * * args, Val }) : ({ StringSource s { contents }; - state.store->addToStoreFromDump(s, name, TextIngestionMethod {}, HashAlgorithm::SHA256, refs, state.repair); + state.store->addToStoreFromDump(s, name, FileSerialisationMethod::Flat, TextIngestionMethod {}, HashAlgorithm::SHA256, refs, state.repair); }); /* Note: we don't need to add `context' to the context of the diff --git a/src/libstore/binary-cache-store.cc b/src/libstore/binary-cache-store.cc index d6047dd7e..bea2bb370 100644 --- a/src/libstore/binary-cache-store.cc +++ b/src/libstore/binary-cache-store.cc @@ -305,7 +305,8 @@ void BinaryCacheStore::addToStore(const ValidPathInfo & info, Source & narSource StorePath BinaryCacheStore::addToStoreFromDump( Source & dump, std::string_view name, - ContentAddressMethod method, + FileSerialisationMethod dumpMethod, + ContentAddressMethod hashMethod, HashAlgorithm hashAlgo, const StorePathSet & references, RepairFlag repair) @@ -313,17 +314,26 @@ StorePath BinaryCacheStore::addToStoreFromDump( std::optional caHash; std::string nar; + // Calculating Git hash from NAR stream not yet implemented. May not + // be possible to implement in single-pass if the NAR is in an + // inconvenient order. Could fetch after uploading, however. + if (hashMethod.getFileIngestionMethod() == FileIngestionMethod::Git) + unsupported("addToStoreFromDump"); + if (auto * dump2p = dynamic_cast(&dump)) { auto & dump2 = *dump2p; // Hack, this gives us a "replayable" source so we can compute // multiple hashes more easily. - caHash = hashString(HashAlgorithm::SHA256, dump2.s); - switch (method.getFileIngestionMethod()) { - case FileIngestionMethod::Recursive: + // + // Only calculate if the dump is in the right format, however. + if (static_cast(dumpMethod) == hashMethod.getFileIngestionMethod()) + caHash = hashString(HashAlgorithm::SHA256, dump2.s); + switch (dumpMethod) { + case FileSerialisationMethod::Recursive: // The dump is already NAR in this case, just use it. nar = dump2.s; break; - case FileIngestionMethod::Flat: + case FileSerialisationMethod::Flat: { // The dump is Flat, so we need to convert it to NAR with a // single file. @@ -332,14 +342,11 @@ StorePath BinaryCacheStore::addToStoreFromDump( nar = std::move(s.s); break; } - case FileIngestionMethod::Git: - unsupported("addToStoreFromDump"); - break; } } else { // Otherwise, we have to do th same hashing as NAR so our single // hash will suffice for both purposes. - if (method != FileIngestionMethod::Recursive || hashAlgo != HashAlgorithm::SHA256) + if (dumpMethod != FileSerialisationMethod::Recursive || hashAlgo != HashAlgorithm::SHA256) unsupported("addToStoreFromDump"); } StringSource narDump { nar }; @@ -354,7 +361,7 @@ StorePath BinaryCacheStore::addToStoreFromDump( *this, name, ContentAddressWithReferences::fromParts( - method, + hashMethod, caHash ? *caHash : nar.first, { .others = references, diff --git a/src/libstore/binary-cache-store.hh b/src/libstore/binary-cache-store.hh index 76de2d11a..7c2828309 100644 --- a/src/libstore/binary-cache-store.hh +++ b/src/libstore/binary-cache-store.hh @@ -125,7 +125,8 @@ public: StorePath addToStoreFromDump( Source & dump, std::string_view name, - ContentAddressMethod method, + FileSerialisationMethod dumpMethod, + ContentAddressMethod hashMethod, HashAlgorithm hashAlgo, const StorePathSet & references, RepairFlag repair) override; diff --git a/src/libstore/build/local-derivation-goal.cc b/src/libstore/build/local-derivation-goal.cc index d92966a74..a9b8de123 100644 --- a/src/libstore/build/local-derivation-goal.cc +++ b/src/libstore/build/local-derivation-goal.cc @@ -1312,12 +1312,13 @@ struct RestrictedStore : public virtual RestrictedStoreConfig, public virtual In StorePath addToStoreFromDump( Source & dump, std::string_view name, - ContentAddressMethod method, + FileSerialisationMethod dumpMethod, + ContentAddressMethod hashMethod, HashAlgorithm hashAlgo, const StorePathSet & references, RepairFlag repair) override { - auto path = next->addToStoreFromDump(dump, name, method, hashAlgo, references, repair); + auto path = next->addToStoreFromDump(dump, name, dumpMethod, hashMethod, hashAlgo, references, repair); goal.addDependency(path); return path; } diff --git a/src/libstore/daemon.cc b/src/libstore/daemon.cc index 873065e14..e1337f51d 100644 --- a/src/libstore/daemon.cc +++ b/src/libstore/daemon.cc @@ -401,11 +401,23 @@ static void performOp(TunnelLogger * logger, ref store, logger->startWork(); auto pathInfo = [&]() { // NB: FramedSource must be out of scope before logger->stopWork(); - auto [contentAddressMethod, hashAlgo_] = ContentAddressMethod::parseWithAlgo(camStr); - auto hashAlgo = hashAlgo_; // work around clang bug + auto [contentAddressMethod, hashAlgo] = ContentAddressMethod::parseWithAlgo(camStr); FramedSource source(from); + FileSerialisationMethod dumpMethod; + switch (contentAddressMethod.getFileIngestionMethod()) { + case FileIngestionMethod::Flat: + dumpMethod = FileSerialisationMethod::Flat; + break; + case FileIngestionMethod::Recursive: + dumpMethod = FileSerialisationMethod::Recursive; + break; + case FileIngestionMethod::Git: + // Use NAR; Git is not a serialization method + dumpMethod = FileSerialisationMethod::Recursive; + break; + } // TODO these two steps are essentially RemoteStore::addCAToStore. Move it up to Store. - auto path = store->addToStoreFromDump(source, name, contentAddressMethod, hashAlgo, refs, repair); + auto path = store->addToStoreFromDump(source, name, dumpMethod, contentAddressMethod, hashAlgo, refs, repair); return store->queryPathInfo(path); }(); logger->stopWork(); @@ -431,34 +443,23 @@ static void performOp(TunnelLogger * logger, ref store, hashAlgo = parseHashAlgo(hashAlgoRaw); } + // Old protocol always sends NAR, regardless of hashing method auto dumpSource = sinkToSource([&](Sink & saved) { - if (method == FileIngestionMethod::Recursive) { - /* We parse the NAR dump through into `saved` unmodified, - so why all this extra work? We still parse the NAR so - that we aren't sending arbitrary data to `saved` - unwittingly`, and we know when the NAR ends so we don't - consume the rest of `from` and can't parse another - command. (We don't trust `addToStoreFromDump` to not - eagerly consume the entire stream it's given, past the - length of the Nar. */ - TeeSource savedNARSource(from, saved); - NullFileSystemObjectSink sink; /* just parse the NAR */ - parseDump(sink, savedNARSource); - } else if (method == FileIngestionMethod::Flat) { - /* Incrementally parse the NAR file, stripping the - metadata, and streaming the sole file we expect into - `saved`. */ - RegularFileSink savedRegular { saved }; - parseDump(savedRegular, from); - if (!savedRegular.regular) throw Error("regular file expected"); - } else { - /* Should have validated above that no other file ingestion - method was used. */ - assert(false); - } + /* We parse the NAR dump through into `saved` unmodified, + so why all this extra work? We still parse the NAR so + that we aren't sending arbitrary data to `saved` + unwittingly`, and we know when the NAR ends so we don't + consume the rest of `from` and can't parse another + command. (We don't trust `addToStoreFromDump` to not + eagerly consume the entire stream it's given, past the + length of the Nar. */ + TeeSource savedNARSource(from, saved); + NullFileSystemObjectSink sink; /* just parse the NAR */ + parseDump(sink, savedNARSource); }); logger->startWork(); - auto path = store->addToStoreFromDump(*dumpSource, baseName, method, hashAlgo); + auto path = store->addToStoreFromDump( + *dumpSource, baseName, FileSerialisationMethod::Recursive, method, hashAlgo); logger->stopWork(); to << store->printStorePath(path); @@ -490,7 +491,7 @@ static void performOp(TunnelLogger * logger, ref store, logger->startWork(); auto path = ({ StringSource source { s }; - store->addToStoreFromDump(source, suffix, TextIngestionMethod {}, HashAlgorithm::SHA256, refs, NoRepair); + store->addToStoreFromDump(source, suffix, FileSerialisationMethod::Flat, TextIngestionMethod {}, HashAlgorithm::SHA256, refs, NoRepair); }); logger->stopWork(); to << store->printStorePath(path); diff --git a/src/libstore/derivations.cc b/src/libstore/derivations.cc index 305ed5b42..df14e979f 100644 --- a/src/libstore/derivations.cc +++ b/src/libstore/derivations.cc @@ -150,7 +150,7 @@ StorePath writeDerivation(Store & store, }) : ({ StringSource s { contents }; - store.addToStoreFromDump(s, suffix, TextIngestionMethod {}, HashAlgorithm::SHA256, references, repair); + store.addToStoreFromDump(s, suffix, FileSerialisationMethod::Flat, TextIngestionMethod {}, HashAlgorithm::SHA256, references, repair); }); } diff --git a/src/libstore/dummy-store.cc b/src/libstore/dummy-store.cc index e4f13b8f4..30f23cff9 100644 --- a/src/libstore/dummy-store.cc +++ b/src/libstore/dummy-store.cc @@ -61,7 +61,8 @@ struct DummyStore : public virtual DummyStoreConfig, public virtual Store virtual StorePath addToStoreFromDump( Source & dump, std::string_view name, - ContentAddressMethod method = FileIngestionMethod::Recursive, + FileSerialisationMethod dumpMethod = FileSerialisationMethod::Recursive, + ContentAddressMethod hashMethod = FileIngestionMethod::Recursive, HashAlgorithm hashAlgo = HashAlgorithm::SHA256, const StorePathSet & references = StorePathSet(), RepairFlag repair = NoRepair) override diff --git a/src/libstore/legacy-ssh-store.hh b/src/libstore/legacy-ssh-store.hh index ae890177b..ca2f115d2 100644 --- a/src/libstore/legacy-ssh-store.hh +++ b/src/libstore/legacy-ssh-store.hh @@ -72,7 +72,8 @@ struct LegacySSHStore : public virtual LegacySSHStoreConfig, public virtual Stor virtual StorePath addToStoreFromDump( Source & dump, std::string_view name, - ContentAddressMethod method = FileIngestionMethod::Recursive, + FileSerialisationMethod dumpMethod = FileSerialisationMethod::Recursive, + ContentAddressMethod hashMethod = FileIngestionMethod::Recursive, HashAlgorithm hashAlgo = HashAlgorithm::SHA256, const StorePathSet & references = StorePathSet(), RepairFlag repair = NoRepair) override diff --git a/src/libstore/local-store.cc b/src/libstore/local-store.cc index 5f35cf3a8..56f8c5dd8 100644 --- a/src/libstore/local-store.cc +++ b/src/libstore/local-store.cc @@ -1148,7 +1148,8 @@ void LocalStore::addToStore(const ValidPathInfo & info, Source & source, StorePath LocalStore::addToStoreFromDump( Source & source0, std::string_view name, - ContentAddressMethod method, + FileSerialisationMethod dumpMethod, + ContentAddressMethod hashMethod, HashAlgorithm hashAlgo, const StorePathSet & references, RepairFlag repair) @@ -1201,7 +1202,13 @@ StorePath LocalStore::addToStoreFromDump( Path tempDir; AutoCloseFD tempDirFd; - if (!inMemory) { + bool methodsMatch = (FileIngestionMethod) dumpMethod == hashMethod; + + /* If the methods don't match, our streaming hash of the dump is the + wrong sort, and we need to rehash. */ + bool inMemoryAndDontNeedRestore = inMemory && methodsMatch; + + if (!inMemoryAndDontNeedRestore) { /* Drain what we pulled so far, and then keep on pulling */ StringSource dumpSource { dump }; ChainSource bothSource { dumpSource, source }; @@ -1210,40 +1217,23 @@ StorePath LocalStore::addToStoreFromDump( delTempDir = std::make_unique(tempDir); tempPath = tempDir + "/x"; - auto fim = method.getFileIngestionMethod(); - switch (fim) { - case FileIngestionMethod::Flat: - case FileIngestionMethod::Recursive: - restorePath(tempPath, bothSource, (FileSerialisationMethod) fim); - break; - case FileIngestionMethod::Git: { - RestoreSink sink; - sink.dstPath = tempPath; - auto accessor = getFSAccessor(); - git::restore(sink, bothSource, [&](Hash childHash) { - return std::pair { - &*accessor, - CanonPath { - printStorePath(this->makeFixedOutputPath("git", FixedOutputInfo { - .method = FileIngestionMethod::Git, - .hash = childHash, - })) - }, - }; - }); - break; - } - } + restorePath(tempPath, bothSource, dumpMethod); dumpBuffer.reset(); dump = {}; } - auto [hash, size] = hashSink->finish(); + auto [dumpHash, size] = hashSink->finish(); + + PosixSourceAccessor accessor; auto desc = ContentAddressWithReferences::fromParts( - method, - hash, + hashMethod, + methodsMatch + ? dumpHash + : hashPath( + accessor, CanonPath { tempPath }, + hashMethod.getFileIngestionMethod(), hashAlgo), { .others = references, // caller is not capable of creating a self-reference, because this is content-addressed without modulus @@ -1269,32 +1259,19 @@ StorePath LocalStore::addToStoreFromDump( autoGC(); - if (inMemory) { + if (inMemoryAndDontNeedRestore) { StringSource dumpSource { dump }; /* Restore from the buffer in memory. */ - auto fim = method.getFileIngestionMethod(); + auto fim = hashMethod.getFileIngestionMethod(); switch (fim) { case FileIngestionMethod::Flat: case FileIngestionMethod::Recursive: restorePath(realPath, dumpSource, (FileSerialisationMethod) fim); break; - case FileIngestionMethod::Git: { - RestoreSink sink; - sink.dstPath = realPath; - auto accessor = getFSAccessor(); - git::restore(sink, dumpSource, [&](Hash childHash) { - return std::pair { - &*accessor, - CanonPath { - printStorePath(this->makeFixedOutputPath("git", FixedOutputInfo { - .method = FileIngestionMethod::Git, - .hash = childHash, - })) - }, - }; - }); - break; - } + case FileIngestionMethod::Git: + // doesn't correspond to serialization method, so + // this should be unreachable + assert(false); } } else { /* Move the temporary path we restored above. */ @@ -1303,8 +1280,8 @@ StorePath LocalStore::addToStoreFromDump( /* For computing the nar hash. In recursive SHA-256 mode, this is the same as the store hash, so no need to do it again. */ - auto narHash = std::pair { hash, size }; - if (method != FileIngestionMethod::Recursive || hashAlgo != HashAlgorithm::SHA256) { + auto narHash = std::pair { dumpHash, size }; + if (dumpMethod != FileSerialisationMethod::Recursive || hashAlgo != HashAlgorithm::SHA256) { HashSink narSink { HashAlgorithm::SHA256 }; dumpPath(realPath, narSink); narHash = narSink.finish(); diff --git a/src/libstore/local-store.hh b/src/libstore/local-store.hh index ba56d3ead..7eff1d690 100644 --- a/src/libstore/local-store.hh +++ b/src/libstore/local-store.hh @@ -180,7 +180,8 @@ public: StorePath addToStoreFromDump( Source & dump, std::string_view name, - ContentAddressMethod method, + FileSerialisationMethod dumpMethod, + ContentAddressMethod hashMethod, HashAlgorithm hashAlgo, const StorePathSet & references, RepairFlag repair) override; diff --git a/src/libstore/remote-store.cc b/src/libstore/remote-store.cc index 0cae84828..8dfe8adda 100644 --- a/src/libstore/remote-store.cc +++ b/src/libstore/remote-store.cc @@ -509,12 +509,28 @@ ref RemoteStore::addCAToStore( StorePath RemoteStore::addToStoreFromDump( Source & dump, std::string_view name, - ContentAddressMethod method, + FileSerialisationMethod dumpMethod, + ContentAddressMethod hashMethod, HashAlgorithm hashAlgo, const StorePathSet & references, RepairFlag repair) { - return addCAToStore(dump, name, method, hashAlgo, references, repair)->path; + FileSerialisationMethod fsm; + switch (hashMethod.getFileIngestionMethod()) { + case FileIngestionMethod::Flat: + fsm = FileSerialisationMethod::Flat; + break; + case FileIngestionMethod::Recursive: + fsm = FileSerialisationMethod::Recursive; + break; + case FileIngestionMethod::Git: + // Use NAR; Git is not a serialization method + fsm = FileSerialisationMethod::Recursive; + break; + } + if (fsm != dumpMethod) + unsupported("RemoteStore::addToStoreFromDump doesn't support this `dumpMethod` `hashMethod` combination"); + return addCAToStore(dump, name, hashMethod, hashAlgo, references, repair)->path; } diff --git a/src/libstore/remote-store.hh b/src/libstore/remote-store.hh index c51a21375..d630adc08 100644 --- a/src/libstore/remote-store.hh +++ b/src/libstore/remote-store.hh @@ -87,7 +87,8 @@ public: StorePath addToStoreFromDump( Source & dump, std::string_view name, - ContentAddressMethod method = FileIngestionMethod::Recursive, + FileSerialisationMethod dumpMethod = FileSerialisationMethod::Recursive, + ContentAddressMethod hashMethod = FileIngestionMethod::Recursive, HashAlgorithm hashAlgo = HashAlgorithm::SHA256, const StorePathSet & references = StorePathSet(), RepairFlag repair = NoRepair) override; diff --git a/src/libstore/store-api.cc b/src/libstore/store-api.cc index c44612ec5..4356296d4 100644 --- a/src/libstore/store-api.cc +++ b/src/libstore/store-api.cc @@ -197,40 +197,23 @@ StorePath Store::addToStore( PathFilter & filter, RepairFlag repair) { + FileSerialisationMethod fsm; + switch (method.getFileIngestionMethod()) { + case FileIngestionMethod::Flat: + fsm = FileSerialisationMethod::Flat; + break; + case FileIngestionMethod::Recursive: + fsm = FileSerialisationMethod::Recursive; + break; + case FileIngestionMethod::Git: + // Use NAR; Git is not a serialization method + fsm = FileSerialisationMethod::Recursive; + break; + } auto source = sinkToSource([&](Sink & sink) { - auto fim = method.getFileIngestionMethod(); - switch (fim) { - case FileIngestionMethod::Flat: - case FileIngestionMethod::Recursive: - { - dumpPath(accessor, path, sink, (FileSerialisationMethod) fim, filter); - break; - } - case FileIngestionMethod::Git: - { - git::dump( - accessor, path, - sink, - // recursively add to store if path is a directory - [&](const CanonPath & path) -> git::TreeEntry { - auto storePath = addToStore("git", accessor, path, method, hashAlgo, references, filter, repair); - auto info = queryPathInfo(storePath); - assert(info->ca); - assert(info->ca->method == FileIngestionMethod::Git); - auto stat = getFSAccessor()->lstat(CanonPath(printStorePath(storePath))); - auto gitModeOpt = git::convertMode(stat.type); - assert(gitModeOpt); - return { - .mode = *gitModeOpt, - .hash = info->ca->hash, - }; - }, - filter); - break; - } - } + dumpPath(accessor, path, sink, fsm, filter); }); - return addToStoreFromDump(*source, name, method, hashAlgo, references, repair); + return addToStoreFromDump(*source, name, fsm, method, hashAlgo, references, repair); } void Store::addMultipleToStore( diff --git a/src/libstore/store-api.hh b/src/libstore/store-api.hh index 5163070b2..5f683a211 100644 --- a/src/libstore/store-api.hh +++ b/src/libstore/store-api.hh @@ -466,14 +466,23 @@ public: * in `dump`, which is either a NAR serialisation (if recursive == * true) or simply the contents of a regular file (if recursive == * false). - * `dump` may be drained * - * \todo remove? + * `dump` may be drained. + * + * @param dumpMethod What serialisation format is `dump`, i.e. how + * to deserialize it. Must either match hashMethod or be + * `FileSerialisationMethod::Recursive`. + * + * @param hashMethod How content addressing? Need not match be the + * same as `dumpMethod`. + * + * @todo remove? */ virtual StorePath addToStoreFromDump( Source & dump, std::string_view name, - ContentAddressMethod method = FileIngestionMethod::Recursive, + FileSerialisationMethod dumpMethod = FileSerialisationMethod::Recursive, + ContentAddressMethod hashMethod = FileIngestionMethod::Recursive, HashAlgorithm hashAlgo = HashAlgorithm::SHA256, const StorePathSet & references = StorePathSet(), RepairFlag repair = NoRepair) = 0; @@ -772,7 +781,7 @@ protected: * Helper for methods that are not unsupported: this is used for * default definitions for virtual methods that are meant to be overriden. * - * \todo Using this should be a last resort. It is better to make + * @todo Using this should be a last resort. It is better to make * the method "virtual pure" and/or move it to a subclass. */ [[noreturn]] void unsupported(const std::string & op) diff --git a/src/nix-env/user-env.cc b/src/nix-env/user-env.cc index 2f9c988d5..8bebe2b9e 100644 --- a/src/nix-env/user-env.cc +++ b/src/nix-env/user-env.cc @@ -113,7 +113,7 @@ bool createUserEnv(EvalState & state, PackageInfos & elems, std::string str2 = str.str(); StringSource source { str2 }; state.store->addToStoreFromDump( - source, "env-manifest.nix", TextIngestionMethod {}, HashAlgorithm::SHA256, references); + source, "env-manifest.nix", FileSerialisationMethod::Flat, TextIngestionMethod {}, HashAlgorithm::SHA256, references); }); /* Get the environment builder expression. */ diff --git a/src/nix/develop.cc b/src/nix/develop.cc index 403178a5d..c1842f2d5 100644 --- a/src/nix/develop.cc +++ b/src/nix/develop.cc @@ -226,7 +226,7 @@ static StorePath getDerivationEnvironment(ref store, ref evalStore auto getEnvShPath = ({ StringSource source { getEnvSh }; evalStore->addToStoreFromDump( - source, "get-env.sh", TextIngestionMethod {}, HashAlgorithm::SHA256, {}); + source, "get-env.sh", FileSerialisationMethod::Flat, TextIngestionMethod {}, HashAlgorithm::SHA256, {}); }); drv.args = {store->printStorePath(getEnvShPath)};