Blob Blame History Raw
From 0ce7bf2cdb236c453af5f2d6c1b021c3a47a3a77 Mon Sep 17 00:00:00 2001
From: Adam Williamson <awilliam@redhat.com>
Date: Tue, 19 Feb 2019 12:08:32 -0800
Subject: [PATCH] Setting to not cancel parallel parents with still-pending
 children

As discussed extensively in
https://progress.opensuse.org/issues/46295 , openQA job logic
makes an assumption that, any time a parallel child fails or is
cancelled, its parent and any other pending children of that
parent ought to be cancelled. This is the behaviour SUSE's tests
expect, but it is not the behaviour Fedora's tests expect. In
Fedora we have several cases of clusters where a parallel parent
acts as a server to multiple unrelated child tests; if one of
the children fails, that does not mean the parent and all other
children must be cancelled.

This patch adds a job setting to set whether parallel parents
with other pending children (and hence those children) will be
cancelled when one child fails or is cancelled. The default is
the current behaviour. For the parent and the other pending
children *not* to be canceled, the parent must have the setting
`PARALLEL_CANCEL_WHOLE_CLUSTER` set to 0 (or anything false-y;
empty string also works).

Signed-off-by: Adam Williamson <awilliam@redhat.com>
---
 docs/WritingTests.asciidoc       |  5 +++
 lib/OpenQA/Schema/Result/Jobs.pm | 47 ++++++++++++++++++------
 t/05-scheduler-cancel.t          | 63 ++++++++++++++++++++++++++++++++
 3 files changed, 104 insertions(+), 11 deletions(-)

diff --git a/docs/WritingTests.asciidoc b/docs/WritingTests.asciidoc
index 67851937..152c10ff 100644
--- a/docs/WritingTests.asciidoc
+++ b/docs/WritingTests.asciidoc
@@ -298,6 +298,11 @@ Then, in job templates, add test suite(s) and all of its dependent test suite(s)
 have been explicitly defined in a variable for each dependent test suite.
 Checkout the example sections to get a better understanding:
 
+===== Handling of related jobs on failure / cancellation / restart
+openQA tries to handle things sensibly when jobs with relations like this either fail, or are manually cancelled or restarted. When a chained or parallel parent fails or is cancelled, all children will be cancelled; if the parent is restarted, all children are also restarted. When a parallel child is restarted, the parent and any other children will also be restarted. When a chained child is restarted, the parent is not restarted; this will usually be fine, but be aware that if an asset uploaded by the chained parent has been cleaned up, the child may fail immediately. To deal with this case, just restart the parent.
+
+By default, when a parallel *child* fails or is cancelled, the parent and all other children are also cancelled. This behaviour is intended for closely-related clusters of jobs, e.g. high availability tests, where it's sensible to assume the entire test is invalid if any of its components fails. A special variable can be used to change this behaviour. Setting a parallel parent job's PARALLEL_CANCEL_WHOLE_CLUSTER to any truth-y value (e.g. 1 or 'true') changes this so that, if one of its children fails or is cancelled but the parent has other pending or active children, the parent and the other children will not be cancelled. This behaviour makes more sense if the parent is providing services to the various children but the children themselves are not closely related and a failure of one does not imply that the tests run by the other children and the parent are invalid.
+
 ====== Example: correct dependency and machine placed
 ----
 There is a test suite A placed on machine 64bit-8G,
diff --git a/lib/OpenQA/Schema/Result/Jobs.pm b/lib/OpenQA/Schema/Result/Jobs.pm
index f5ec2a1d..bc4f5c39 100644
--- a/lib/OpenQA/Schema/Result/Jobs.pm
+++ b/lib/OpenQA/Schema/Result/Jobs.pm
@@ -729,9 +729,17 @@ sub create_clones {
 # internal (recursive) function for duplicate - returns hash of all jobs in the
 # cluster of the current job (in no order but with relations)
 sub cluster_jobs {
-    my ($self, $jobs) = @_;
+    my $self = shift;
+    my %args = (
+        jobs => {},
+        # set to 1 when called on a cluster job being cancelled or failing;
+        # affects whether we include parallel parents with
+        # PARALLEL_CANCEL_WHOLE_CLUSTER set if they have other pending children
+        cancelmode => 0,
+        @_
+    );
 
-    $jobs ||= {};
+    my $jobs = $args{jobs};
     return $jobs if defined $jobs->{$self->id};
     $jobs->{$self->id} = {
         parallel_parents  => [],
@@ -741,7 +749,7 @@ sub cluster_jobs {
 
     ## if we have a parallel parent, go up recursively
     my $parents = $self->parents;
-    while (my $pd = $parents->next) {
+  PARENT: while (my $pd = $parents->next) {
         my $p = $pd->parent;
 
         if ($pd->dependency eq OpenQA::Schema::Result::JobDependencies->CHAINED) {
@@ -751,7 +759,23 @@ sub cluster_jobs {
         }
         else {
             push(@{$jobs->{$self->id}->{parallel_parents}}, $p->id);
-            $p->cluster_jobs($jobs);
+            my $cancelwhole = 1;
+            # check if the setting to disable cancelwhole is set: the var
+            # must exist and be set to something false-y
+            my $cwset = $p->settings_hash->{PARALLEL_CANCEL_WHOLE_CLUSTER};
+            $cancelwhole = 0 if (defined $cwset && !$cwset);
+            if ($args{cancelmode} && !$cancelwhole) {
+                # skip calling cluster_jobs (so cancelling it and its other
+                # related jobs) if job has pending children we are not
+                # cancelling
+                my $otherchildren = $p->children;
+              CHILD: while (my $childr = $otherchildren->next) {
+                    my $child = $childr->child;
+                    next CHILD unless grep { $child->state eq $_ } PENDING_STATES;
+                    next PARENT unless $jobs->{$child->id};
+                }
+            }
+            $p->cluster_jobs(jobs => $jobs);
         }
     }
 
@@ -772,7 +796,7 @@ sub cluster_children {
         next if $c->clone_id;
 
         # do not fear the recursion
-        $c->cluster_jobs($jobs);
+        $c->cluster_jobs(jobs => $jobs);
         if ($cd->dependency eq OpenQA::Schema::Result::JobDependencies->PARALLEL) {
             push(@{$jobs->{$self->id}->{parallel_children}}, $c->id);
         }
@@ -1714,8 +1738,9 @@ sub store_column {
     return $self->SUPER::store_column(%args);
 }
 
-# parent job failed, handle running children - send stop command
-sub _job_stop_child {
+# used to stop jobs with some kind of dependency relationship to another
+# job that failed or was cancelled, see cluster_jobs(), cancel() and done()
+sub _job_stop_cluster {
     my ($self, $job) = @_;
 
     # skip ourselves
@@ -1811,9 +1836,9 @@ sub done {
     $self->update(\%new_val);
 
     if (defined $new_val{result} && !grep { $result eq $_ } OK_RESULTS) {
-        my $jobs = $self->cluster_jobs;
+        my $jobs = $self->cluster_jobs(cancelmode => 1);
         for my $job (sort keys %$jobs) {
-            $self->_job_stop_child($job);
+            $self->_job_stop_cluster($job);
         }
     }
 
@@ -1841,9 +1866,9 @@ sub cancel {
     if ($self->worker) {
         $self->worker->send_command(command => 'cancel', job_id => $self->id);
     }
-    my $jobs = $self->cluster_jobs;
+    my $jobs = $self->cluster_jobs(cancelmode => 1);
     for my $job (sort keys %$jobs) {
-        $count += $self->_job_stop_child($job);
+        $count += $self->_job_stop_cluster($job);
     }
 
     return $count;
diff --git a/t/05-scheduler-cancel.t b/t/05-scheduler-cancel.t
index b95a322d..917fc28f 100644
--- a/t/05-scheduler-cancel.t
+++ b/t/05-scheduler-cancel.t
@@ -304,4 +304,67 @@ subtest 'chained parent fails -> parallel parents of children are cancelled (ski
     is($jobD->result, OpenQA::Jobs::Constants::SKIPPED,   'D result is skipped');
 };
 
+subtest 'parallel child with one parent fails -> parent is cancelled' => sub {
+    my %settingsA = %settings;
+    my %settingsB = %settings;
+    $settingsA{TEST} = 'A';
+    $settingsB{TEST} = 'B';
+    my $jobA = _job_create(\%settingsA);
+    $settingsB{_PARALLEL_JOBS} = [$jobA->id];
+    my $jobB = _job_create(\%settingsB);
+
+    # set B as failed and reload A from database
+    $jobB->done(result => OpenQA::Jobs::Constants::FAILED);
+    $jobA->discard_changes;
+
+    is($jobA->state, OpenQA::Jobs::Constants::CANCELLED, 'A state is cancelled');
+};
+
+subtest 'failure behaviour for multiple parallel children' => sub {
+    my %settingsA = %settings;
+    my %settingsB = %settings;
+    my %settingsC = %settings;
+    $settingsA{TEST} = 'A';
+    $settingsB{TEST} = 'B';
+    $settingsC{TEST} = 'C';
+    my $jobA = _job_create(\%settingsA);
+    $settingsB{_PARALLEL_JOBS} = [$jobA->id];
+    $settingsC{_PARALLEL_JOBS} = [$jobA->id];
+    my $jobB = _job_create(\%settingsB);
+    my $jobC = _job_create(\%settingsC);
+
+    # set B as failed and reload A and C from database
+    $jobB->done(result => OpenQA::Jobs::Constants::FAILED);
+    $jobA->discard_changes;
+    $jobC->discard_changes;
+
+    # A and C should be cancelled
+    is($jobA->state, OpenQA::Jobs::Constants::CANCELLED, 'A state is cancelled');
+    is($jobC->state, OpenQA::Jobs::Constants::CANCELLED, 'C state is cancelled');
+
+    # now test in 'do not cancel parent and other children' mode
+    $settingsA{PARALLEL_CANCEL_WHOLE_CLUSTER} = '0';
+    $jobA                                     = _job_create(\%settingsA);
+    $settingsB{_PARALLEL_JOBS}                = [$jobA->id];
+    $settingsC{_PARALLEL_JOBS}                = [$jobA->id];
+    $jobB                                     = _job_create(\%settingsB);
+    $jobC                                     = _job_create(\%settingsC);
+
+    # set B as failed and reload A and C from database
+    $jobB->done(result => OpenQA::Jobs::Constants::FAILED);
+    $jobA->discard_changes;
+    $jobC->discard_changes;
+
+    # this time A and C should still be scheduled (*not* cancelled)
+    is($jobA->state, OpenQA::Jobs::Constants::SCHEDULED, 'new A state is scheduled');
+    is($jobC->state, OpenQA::Jobs::Constants::SCHEDULED, 'new C state is scheduled');
+
+    # now set C as failed and reload A
+    $jobC->done(result => OpenQA::Jobs::Constants::FAILED);
+    $jobA->discard_changes;
+
+    # now A *should* be cancelled
+    is($jobA->state, OpenQA::Jobs::Constants::CANCELLED, 'new A state is cancelled');
+};
+
 done_testing();
-- 
2.21.0