[ASTERIXDB-2783] Fix hash collision for hash join/groupby
- user model changes: no
- storage format changes: no
- interface changes: no
Details:
- Use a randomly chosen (but fixed) non-zero seed for hash join/groupby
to avoid hash collisions with the hash partitioning
- Increase compiler.joinmemory in the cc*.conf files from 256KB to 768KB
(1024KB in cc3.conf) so that the large object join test case can still
pass.
Change-Id: If2aa02384129293e80015efc3d1f60b57f98909c
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/8123
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Dmitry Lychagin <dmitry.lychagin@couchbase.com>
diff --git a/asterixdb/asterix-app/src/main/resources/cc.conf b/asterixdb/asterix-app/src/main/resources/cc.conf
index ccd35f8..d5da6d4 100644
--- a/asterixdb/asterix-app/src/main/resources/cc.conf
+++ b/asterixdb/asterix-app/src/main/resources/cc.conf
@@ -55,7 +55,7 @@
compiler.framesize=32KB
compiler.sortmemory=320KB
compiler.groupmemory=160KB
-compiler.joinmemory=256KB
+compiler.joinmemory=768KB
compiler.textsearchmemory=160KB
compiler.windowmemory=192KB
compiler.sort.parallel=false
diff --git a/asterixdb/asterix-app/src/main/resources/cc3.conf b/asterixdb/asterix-app/src/main/resources/cc3.conf
index 88362aa..d2a8556 100644
--- a/asterixdb/asterix-app/src/main/resources/cc3.conf
+++ b/asterixdb/asterix-app/src/main/resources/cc3.conf
@@ -51,7 +51,7 @@
compiler.framesize=32KB
compiler.sortmemory=320KB
compiler.groupmemory=160KB
-compiler.joinmemory=256KB
+compiler.joinmemory=1024KB
compiler.textsearchmemory=160KB
compiler.windowmemory=192KB
compiler.parallelism=3
diff --git a/asterixdb/asterix-app/src/test/resources/cc-compression.conf b/asterixdb/asterix-app/src/test/resources/cc-compression.conf
index c8d9780..a3047a0 100644
--- a/asterixdb/asterix-app/src/test/resources/cc-compression.conf
+++ b/asterixdb/asterix-app/src/test/resources/cc-compression.conf
@@ -50,7 +50,7 @@
compiler.framesize=32KB
compiler.sortmemory=320KB
compiler.groupmemory=160KB
-compiler.joinmemory=256KB
+compiler.joinmemory=768KB
compiler.textsearchmemory=160KB
compiler.windowmemory=192KB
messaging.frame.size=4096
diff --git a/asterixdb/asterix-app/src/test/resources/cc-ssl.conf b/asterixdb/asterix-app/src/test/resources/cc-ssl.conf
index 499e9fc..1c0a68f 100644
--- a/asterixdb/asterix-app/src/test/resources/cc-ssl.conf
+++ b/asterixdb/asterix-app/src/test/resources/cc-ssl.conf
@@ -62,7 +62,7 @@
compiler.framesize=32KB
compiler.sortmemory=320KB
compiler.groupmemory=160KB
-compiler.joinmemory=256KB
+compiler.joinmemory=768KB
compiler.textsearchmemory=160KB
compiler.windowmemory=192KB
messaging.frame.size=4096
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1/cluster_state_1.1.regexadm b/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1/cluster_state_1.1.regexadm
index e377fd1..cc96921 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1/cluster_state_1.1.regexadm
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1/cluster_state_1.1.regexadm
@@ -12,7 +12,7 @@
"compiler\.groupmemory" : 163840,
"compiler\.indexonly" : true,
"compiler\.internal\.sanitycheck" : true,
- "compiler\.joinmemory" : 262144,
+ "compiler\.joinmemory" : 786432,
"compiler\.parallelism" : 0,
"compiler\.sort\.parallel" : false,
"compiler\.sort\.samples" : 100,
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1_less/cluster_state_1_less.1.regexadm b/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1_less/cluster_state_1_less.1.regexadm
index e51d12f..68d3079 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1_less/cluster_state_1_less.1.regexadm
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/api/cluster_state_1_less/cluster_state_1_less.1.regexadm
@@ -12,7 +12,7 @@
"compiler\.groupmemory" : 163840,
"compiler\.indexonly" : true,
"compiler\.internal\.sanitycheck" : false,
- "compiler\.joinmemory" : 262144,
+ "compiler\.joinmemory" : 1048576,
"compiler\.parallelism" : 3,
"compiler\.sort\.parallel" : true,
"compiler\.sort\.samples" : 100,
diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupBuildOperatorNodePushable.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupBuildOperatorNodePushable.java
index 43f57af..20d223e 100644
--- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupBuildOperatorNodePushable.java
+++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/group/external/ExternalGroupBuildOperatorNodePushable.java
@@ -38,6 +38,11 @@
public class ExternalGroupBuildOperatorNodePushable extends AbstractUnaryInputSinkOperatorNodePushable
implements IRunFileWriterGenerator {
+ /**
+ * Use a fixed, randomly chosen seed to avoid hash collisions with the hash exchange operator.
+ * See https://issues.apache.org/jira/browse/ASTERIXDB-2783 for more details.
+ */
+ private static final int INIT_SEED = 573275022;
private static final Logger LOGGER = LogManager.getLogger();
private final IHyracksTaskContext ctx;
@@ -85,7 +90,7 @@
state = new ExternalGroupState(ctx.getJobletContext().getJobId(), stateId);
ISpillableTable table = spillableTableFactory.buildSpillableTable(ctx, tableSize, fileSize, keyFields,
comparators, firstNormalizerComputer, aggregatorFactory, inRecordDescriptor, outRecordDescriptor,
- framesLimit, 0);
+ framesLimit, INIT_SEED);
RunFileWriter[] runFileWriters = new RunFileWriter[table.getNumPartitions()];
this.externalGroupBy = new ExternalHashGroupBy(this, table, runFileWriters, inRecordDescriptor);
diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/join/OptimizedHybridHashJoinOperatorDescriptor.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/join/OptimizedHybridHashJoinOperatorDescriptor.java
index 97f9c24..c142113 100644
--- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/join/OptimizedHybridHashJoinOperatorDescriptor.java
+++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/join/OptimizedHybridHashJoinOperatorDescriptor.java
@@ -107,6 +107,12 @@
*/
public class OptimizedHybridHashJoinOperatorDescriptor extends AbstractOperatorDescriptor {
+ /**
+ * Use a fixed, randomly chosen seed to avoid hash collisions with the hash exchange operator.
+ * See https://issues.apache.org/jira/browse/ASTERIXDB-2783 for more details.
+ */
+ private static final int INIT_SEED = 982028031;
+
private static final int BUILD_AND_PARTITION_ACTIVITY_ID = 0;
private static final int PARTITION_AND_JOIN_ACTIVITY_ID = 1;
@@ -269,10 +275,11 @@
ctx.getJobletContext().getJobId(), new TaskId(getActivityId(), partition));
ITuplePartitionComputer probeHpc =
- new FieldHashPartitionComputerFamily(probeKeys, propHashFunctionFactories).createPartitioner(0);
+ new FieldHashPartitionComputerFamily(probeKeys, propHashFunctionFactories)
+ .createPartitioner(INIT_SEED);
ITuplePartitionComputer buildHpc =
new FieldHashPartitionComputerFamily(buildKeys, buildHashFunctionFactories)
- .createPartitioner(0);
+ .createPartitioner(INIT_SEED);
boolean failed = false;
@Override