[ASTERIXDB-3554][CONF]: Increasing timeout for aborting tasks
- user model changes: yes
- storage format changes: no
- interface changes: no
Details:
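Increase the timeout for aborting tasks from 2 minutes to 10 minutes, and make it configurable through the new ABORT_TASKS_TIMEOUT NC option (value in milliseconds) instead of a hard-coded constant.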
Ext-ref: MB-64974
Change-Id: I1e597eb280e996370f6be604bef28691fc9acd2c
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/19357
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Ritik Raj <raj.ritik9835@gmail.com>
Reviewed-by: Murtadha Hubail <mhubail@apache.org>
diff --git a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/NCConfig.java b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/NCConfig.java
index bb40e2b..44b2fdc 100644
--- a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/NCConfig.java
+++ b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-common/src/main/java/org/apache/hyracks/control/common/controllers/NCConfig.java
@@ -30,6 +30,7 @@
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.List;
+import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import org.apache.hyracks.api.config.IApplicationConfig;
@@ -72,6 +73,7 @@
REPLICATION_PUBLIC_ADDRESS(STRING, PUBLIC_ADDRESS),
REPLICATION_PUBLIC_PORT(NONNEGATIVE_INTEGER, REPLICATION_LISTEN_PORT),
CLUSTER_CONNECT_RETRIES(NONNEGATIVE_INTEGER, 5),
+ ABORT_TASKS_TIMEOUT(POSITIVE_INTEGER, (int) TimeUnit.MINUTES.toMillis(10)),
IODEVICES(
STRING_ARRAY,
appConfig -> new String[] {
@@ -253,6 +255,8 @@
return "Path to systemd socket for fenced Python UDFs. Requires JDK17+, *nix operating system, and ";
case CREDENTIAL_FILE:
return "Path to HTTP basic credentials";
+ case ABORT_TASKS_TIMEOUT:
+ return "The maximum time to wait for the tasks to be aborted";
default:
throw new IllegalStateException("Not yet implemented: " + this);
}
@@ -628,4 +632,8 @@
return getAppConfig().getString(Option.CREDENTIAL_FILE);
}
+ public int getAbortedTasksTimeout() {
+ return appConfig.getInt(Option.ABORT_TASKS_TIMEOUT);
+ }
+
}
diff --git a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/work/EnsureAllCcTasksCompleted.java b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/work/EnsureAllCcTasksCompleted.java
index 9e090f2..fcc2aa2 100644
--- a/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/work/EnsureAllCcTasksCompleted.java
+++ b/hyracks-fullstack/hyracks/hyracks-control/hyracks-control-nc/src/main/java/org/apache/hyracks/control/nc/work/EnsureAllCcTasksCompleted.java
@@ -35,7 +35,6 @@
public class EnsureAllCcTasksCompleted implements Runnable {
private static final Logger LOGGER = LogManager.getLogger();
- private static final long TIMEOUT = TimeUnit.MINUTES.toMillis(2);
private final NodeControllerService ncs;
private final CcId ccId;
private final Deque<Task> runningTasks;
@@ -58,7 +57,8 @@
}
private void waitForTaskCompletion() throws InterruptedException {
- final Span maxWaitTime = Span.start(TIMEOUT, TimeUnit.MILLISECONDS);
+ long taskTimeout = ncs.getConfiguration().getAbortedTasksTimeout();
+ final Span maxWaitTime = Span.start(taskTimeout, TimeUnit.MILLISECONDS);
while (!maxWaitTime.elapsed()) {
removeCompleted();
if (runningTasks.isEmpty()) {
@@ -81,7 +81,7 @@
}
} else {
LOGGER.error("{} tasks associated with CC {} failed to complete after {}ms. Giving up", runningTasks.size(),
- ccId, TIMEOUT);
+ ccId, taskTimeout);
logPendingTasks();
ExitUtil.halt(ExitUtil.EC_NC_FAILED_TO_ABORT_ALL_PREVIOUS_TASKS);
}