From 177bf25d648092826a75369191503a3f2bb763a4 Mon Sep 17 00:00:00 2001
From: Eelco Dolstra <eelco.dolstra@logicblox.com>
Date: Fri, 15 Apr 2016 14:28:00 +0200
Subject: [PATCH] Queue monitor: Bail out earlier if a step has failed
 previously

Currently, the hydra.nixos.org queue contains 1000s of Darwin builds
that all depend on a stdenv-darwin that previously failed. However,
before, first createStep() would construct a dependency graph for each
build, then getQueuedBuilds() would discover that one of the steps had
failed previously and discard all those steps. Since the graph
construction involves a lot of uncached calls to isValidPath(), this
took several seconds per build.

Now createStep() detects the previous failure right away and bails
out.
---
 src/hydra-queue-runner/queue-monitor.cc | 111 +++++++++++++-----------
 1 file changed, 61 insertions(+), 50 deletions(-)
diff --git a/src/hydra-queue-runner/queue-monitor.cc b/src/hydra-queue-runner/queue-monitor.cc
index 1de0b45d..3b3fb997 100644
--- a/src/hydra-queue-runner/queue-monitor.cc
+++ b/src/hydra-queue-runner/queue-monitor.cc
@@ -63,6 +63,12 @@ void State::queueMonitorLoop()
 }
 
 
+struct PreviousFailure : public std::exception {
+    Step::ptr step;
+    PreviousFailure(Step::ptr step) : step(step) { }
+};
+
+
 bool State::getQueuedBuilds(Connection & conn, ref<Store> localStore,
     ref<Store> destStore, unsigned int & lastBuildId)
 {
@@ -139,8 +145,55 @@ bool State::getQueuedBuilds(Connection & conn, ref<Store> localStore,
         }
 
         std::set<Step::ptr> newSteps;
-        Step::ptr step = createStep(destStore, conn, build, build->drvPath,
-            build, 0, finishedDrvs, newSteps, newRunnable);
+        Step::ptr step;
+
+        /* Create steps for this derivation and its dependencies. */
+        try {
+            step = createStep(destStore, conn, build, build->drvPath,
+                build, 0, finishedDrvs, newSteps, newRunnable);
+        } catch (PreviousFailure & ex) {
+
+            /* Some step previously failed, so mark the build as
+               failed right away. */
+            printMsg(lvlError, format("marking build %d as cached failure due to ‘%s’") % build->id % ex.step->drvPath);
+            if (!build->finishedInDB) {
+                auto mc = startDbUpdate();
+                pqxx::work txn(conn);
+
+                /* Find the previous build step record, first by
+                   derivation path, then by output path. */
+                BuildID propagatedFrom = 0;
+
+                auto res = txn.parameterized
+                    ("select max(build) from BuildSteps where drvPath = $1 and startTime != 0 and stopTime != 0 and status = 1")
+                    (ex.step->drvPath).exec();
+                if (!res[0][0].is_null()) propagatedFrom = res[0][0].as<BuildID>();
+
+                if (!propagatedFrom) {
+                    for (auto & output : ex.step->drv.outputs) {
+                        auto res = txn.parameterized
+                            ("select max(s.build) from BuildSteps s join BuildStepOutputs o on s.build = o.build where path = $1 and startTime != 0 and stopTime != 0 and status = 1")
+                            (output.second.path).exec();
+                        if (!res[0][0].is_null()) {
+                            propagatedFrom = res[0][0].as<BuildID>();
+                            break;
+                        }
+                    }
+                }
+
+                createBuildStep(txn, 0, build, ex.step, "", bsCachedFailure, "", propagatedFrom);
+                txn.parameterized
+                    ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $3, isCachedBuild = 1 where id = $1 and finished = 0")
+                    (build->id)
+                    ((int) (ex.step->drvPath == build->drvPath ? bsFailed : bsDepFailed))
+                    (time(0)).exec();
+                txn.commit();
+                build->finishedInDB = true;
+                nrBuildsDone++;
+            }
+
+            return;
+        }
 
         /* Some of the new steps may be the top level of builds that
            we haven't processed yet. So do them now. This ensures that
@@ -174,53 +227,6 @@ bool State::getQueuedBuilds(Connection & conn, ref<Store> localStore,
             return;
         }
 
-        /* If any step has a previously failed output path, then fail
-           the build right away. */
-        bool badStep = false;
-        for (auto & r : newSteps)
-            if (checkCachedFailure(r, conn)) {
-                printMsg(lvlError, format("marking build %1% as cached failure") % build->id);
-                if (!build->finishedInDB) {
-                    auto mc = startDbUpdate();
-                    pqxx::work txn(conn);
-
-                    /* Find the previous build step record, first by
-                       derivation path, then by output path. */
-                    BuildID propagatedFrom = 0;
-
-                    auto res = txn.parameterized
-                        ("select max(build) from BuildSteps where drvPath = $1 and startTime != 0 and stopTime != 0 and status = 1")
-                        (r->drvPath).exec();
-                    if (!res[0][0].is_null()) propagatedFrom = res[0][0].as<BuildID>();
-
-                    if (!propagatedFrom) {
-                        for (auto & output : r->drv.outputs) {
-                            auto res = txn.parameterized
-                                ("select max(s.build) from BuildSteps s join BuildStepOutputs o on s.build = o.build where path = $1 and startTime != 0 and stopTime != 0 and status = 1")
-                                (output.second.path).exec();
-                            if (!res[0][0].is_null()) {
-                                propagatedFrom = res[0][0].as<BuildID>();
-                                break;
-                            }
-                        }
-                    }
-
-                    createBuildStep(txn, 0, build, r, "", bsCachedFailure, "", propagatedFrom);
-                    txn.parameterized
-                        ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $3, isCachedBuild = 1 where id = $1 and finished = 0")
-                        (build->id)
-                        ((int) (step == r ? bsFailed : bsDepFailed))
-                        (time(0)).exec();
-                    txn.commit();
-                    build->finishedInDB = true;
-                    nrBuildsDone++;
-                }
-                badStep = true;
-                break;
-            }
-
-        if (badStep) return;
-
         /* Note: if we exit this scope prior to this, the build and
            all newly created steps are destroyed. */
 
@@ -401,6 +407,10 @@ Step::ptr State::createStep(ref<Store> destStore,
         }
     }
 
+    /* If this derivation failed previously, give up. */
+    if (checkCachedFailure(step, conn))
+        throw PreviousFailure{step};
+
     /* Are all outputs valid? */
     bool valid = true;
     PathSet outputs = step->drv.outputPaths();
@@ -453,7 +463,6 @@ Step::ptr State::createStep(ref<Store> destStore,
 
     /* No, we need to build. */
     printMsg(lvlDebug, format("creating build step ‘%1%’") % drvPath);
-    newSteps.insert(step);
 
     /* Create steps for the dependencies. */
     for (auto & i : step->drv.inputDrvs) {
@@ -474,6 +483,8 @@ Step::ptr State::createStep(ref<Store> destStore,
             newRunnable.insert(step);
     }
 
+    newSteps.insert(step);
+
     return step;
 }