[NO ISSUE][CLUS] Interrupt global recovery on node failure
- user model changes: no
- storage format changes: no
- interface changes: no
Details:
- When a node fails while global recovery is ongoing, interrupt
recovery to avoid unnecessary waiting.
Change-Id: I58852e046ff4021f4c5d115f5c3488b249fc61a2
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/14025
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Murtadha Hubail <mhubail@apache.org>
Reviewed-by: Ali Alsuliman <ali.al.solaiman@gmail.com>
diff --git a/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/GlobalRecoveryManager.java b/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/GlobalRecoveryManager.java
index 9438b16..e6ef8df 100644
--- a/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/GlobalRecoveryManager.java
+++ b/asterixdb/asterix-app/src/main/java/org/apache/asterix/hyracks/bootstrap/GlobalRecoveryManager.java
@@ -23,6 +23,7 @@
import java.util.Collections;
import java.util.List;
import java.util.Set;
+import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.apache.asterix.app.message.StorageCleanupRequestMessage;
@@ -64,6 +65,7 @@
protected final IHyracksClientConnection hcc;
protected volatile boolean recoveryCompleted;
protected volatile boolean recovering;
+ protected Future<?> recoveryFuture;
public GlobalRecoveryManager(ICCServiceContext serviceCtx, IHyracksClientConnection hcc,
IStorageComponentProvider componentProvider) {
@@ -98,7 +100,7 @@
* Perform recovery on a different thread to avoid deadlocks in
* {@link org.apache.asterix.common.cluster.IClusterStateManager}
*/
- serviceCtx.getControllerService().getExecutor().submit(() -> {
+ recoveryFuture = serviceCtx.getControllerService().getExecutor().submit(() -> {
try {
recover(appCtx);
} catch (Throwable e) {
@@ -127,6 +129,9 @@
MetadataManager.INSTANCE.commitTransaction(mdTxnCtx);
recoveryCompleted = true;
recovering = false;
+ synchronized (this) {
+ recoveryFuture = null;
+ }
LOGGER.info("Global Recovery Completed. Refreshing cluster state...");
appCtx.getClusterStateManager().refreshState();
}
@@ -166,6 +171,12 @@
@Override
public void notifyStateChange(ClusterState newState) {
+ synchronized (this) {
+ if (recovering && newState == ClusterState.UNUSABLE && recoveryFuture != null) {
+ // interrupt the recovery attempt since the cluster became unusable during global recovery
+ recoveryFuture.cancel(true);
+ }
+ }
if (newState != ClusterState.ACTIVE && newState != ClusterState.RECOVERING) {
recoveryCompleted = false;
}