From 177bf25d648092826a75369191503a3f2bb763a4 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 15 Apr 2016 14:28:00 +0200 Subject: [PATCH] Queue monitor: Bail out earlier if a step has failed previously Currently, the hydra.nixos.org queue contains 1000s of Darwin builds that all depend on a stdenv-darwin that previously failed. However, before, first createStep() would construct a dependency graph for each build, then getQueuedBuilds() would discover that one of the steps had failed previously and discard all those steps. Since the graph construction involves a lot of uncached calls to isValidPath(), this took several seconds per build. Now createStep() detects the previous failure right away and bails out. --- src/hydra-queue-runner/queue-monitor.cc | 111 +++++++++++++----------- 1 file changed, 61 insertions(+), 50 deletions(-) diff --git a/src/hydra-queue-runner/queue-monitor.cc b/src/hydra-queue-runner/queue-monitor.cc index 1de0b45d..3b3fb997 100644 --- a/src/hydra-queue-runner/queue-monitor.cc +++ b/src/hydra-queue-runner/queue-monitor.cc @@ -63,6 +63,12 @@ void State::queueMonitorLoop() } +struct PreviousFailure : public std::exception { + Step::ptr step; + PreviousFailure(Step::ptr step) : step(step) { } +}; + + bool State::getQueuedBuilds(Connection & conn, ref localStore, ref destStore, unsigned int & lastBuildId) { @@ -139,8 +145,55 @@ bool State::getQueuedBuilds(Connection & conn, ref localStore, } std::set newSteps; - Step::ptr step = createStep(destStore, conn, build, build->drvPath, - build, 0, finishedDrvs, newSteps, newRunnable); + Step::ptr step; + + /* Create steps for this derivation and its dependencies. */ + try { + step = createStep(destStore, conn, build, build->drvPath, + build, 0, finishedDrvs, newSteps, newRunnable); + } catch (PreviousFailure & ex) { + + /* Some step previously failed, so mark the build as + failed right away. */ + printMsg(lvlError, format("marking build %d as cached failure due to ā€˜%sā€™") % build->id % ex.step->drvPath); + if (!build->finishedInDB) { + auto mc = startDbUpdate(); + pqxx::work txn(conn); + + /* Find the previous build step record, first by + derivation path, then by output path. */ + BuildID propagatedFrom = 0; + + auto res = txn.parameterized + ("select max(build) from BuildSteps where drvPath = $1 and startTime != 0 and stopTime != 0 and status = 1") + (ex.step->drvPath).exec(); + if (!res[0][0].is_null()) propagatedFrom = res[0][0].as(); + + if (!propagatedFrom) { + for (auto & output : ex.step->drv.outputs) { + auto res = txn.parameterized + ("select max(s.build) from BuildSteps s join BuildStepOutputs o on s.build = o.build where path = $1 and startTime != 0 and stopTime != 0 and status = 1") + (output.second.path).exec(); + if (!res[0][0].is_null()) { + propagatedFrom = res[0][0].as(); + break; + } + } + } + + createBuildStep(txn, 0, build, ex.step, "", bsCachedFailure, "", propagatedFrom); + txn.parameterized + ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $3, isCachedBuild = 1 where id = $1 and finished = 0") + (build->id) + ((int) (ex.step->drvPath == build->drvPath ? bsFailed : bsDepFailed)) + (time(0)).exec(); + txn.commit(); + build->finishedInDB = true; + nrBuildsDone++; + } + + return; + } /* Some of the new steps may be the top level of builds that we haven't processed yet. So do them now. This ensures that @@ -174,53 +227,6 @@ bool State::getQueuedBuilds(Connection & conn, ref localStore, return; } - /* If any step has a previously failed output path, then fail - the build right away. */ - bool badStep = false; - for (auto & r : newSteps) - if (checkCachedFailure(r, conn)) { - printMsg(lvlError, format("marking build %1% as cached failure") % build->id); - if (!build->finishedInDB) { - auto mc = startDbUpdate(); - pqxx::work txn(conn); - - /* Find the previous build step record, first by - derivation path, then by output path. */ - BuildID propagatedFrom = 0; - - auto res = txn.parameterized - ("select max(build) from BuildSteps where drvPath = $1 and startTime != 0 and stopTime != 0 and status = 1") - (r->drvPath).exec(); - if (!res[0][0].is_null()) propagatedFrom = res[0][0].as(); - - if (!propagatedFrom) { - for (auto & output : r->drv.outputs) { - auto res = txn.parameterized - ("select max(s.build) from BuildSteps s join BuildStepOutputs o on s.build = o.build where path = $1 and startTime != 0 and stopTime != 0 and status = 1") - (output.second.path).exec(); - if (!res[0][0].is_null()) { - propagatedFrom = res[0][0].as(); - break; - } - } - } - - createBuildStep(txn, 0, build, r, "", bsCachedFailure, "", propagatedFrom); - txn.parameterized - ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $3, isCachedBuild = 1 where id = $1 and finished = 0") - (build->id) - ((int) (step == r ? bsFailed : bsDepFailed)) - (time(0)).exec(); - txn.commit(); - build->finishedInDB = true; - nrBuildsDone++; - } - badStep = true; - break; - } - - if (badStep) return; - /* Note: if we exit this scope prior to this, the build and all newly created steps are destroyed. */ @@ -401,6 +407,10 @@ Step::ptr State::createStep(ref destStore, } } + /* If this derivation failed previously, give up. */ + if (checkCachedFailure(step, conn)) + throw PreviousFailure{step}; + /* Are all outputs valid? */ bool valid = true; PathSet outputs = step->drv.outputPaths(); @@ -453,7 +463,6 @@ Step::ptr State::createStep(ref destStore, /* No, we need to build. */ printMsg(lvlDebug, format("creating build step ā€˜%1%ā€™") % drvPath); - newSteps.insert(step); /* Create steps for the dependencies. */ for (auto & i : step->drv.inputDrvs) { @@ -474,6 +483,8 @@ Step::ptr State::createStep(ref destStore, newRunnable.insert(step); } + newSteps.insert(step); + return step; }