Several bug fixes in HHJ, NLJ, and tokenizer
- in HHJ handle the case when it spills and skipInMemoryHJ is set to false,
- check for memsize in NLJ and correctly set memsize in HHJ,
- make counthashed-ngram-token() to skip the bits for length & type
Change-Id: I908345f993019b0bfd0ac0bcb3e497a42295b623
Reviewed-on: http://fulliautomatix.ics.uci.edu:8443/96
Reviewed-by: Pouria Pirzadeh <pouria.pirzadeh@gmail.com>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/join/NestedLoopJoin.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/join/NestedLoopJoin.java
index 52f1198..eab60bc 100644
--- a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/join/NestedLoopJoin.java
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/join/NestedLoopJoin.java
@@ -64,6 +64,9 @@
this.appender.reset(outBuffer, true);
this.outBuffers = new ArrayList<ByteBuffer>();
this.memSize = memSize;
+ if (memSize < 3) {
+ throw new HyracksDataException("Not enough memory is available for Nested Loop Join");
+ }
this.predEvaluator = predEval;
this.isReversed = false;
this.ctx = ctx;
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/join/OptimizedHybridHashJoinOperatorDescriptor.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/join/OptimizedHybridHashJoinOperatorDescriptor.java
index 276f60f..744f939 100644
--- a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/join/OptimizedHybridHashJoinOperatorDescriptor.java
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/join/OptimizedHybridHashJoinOperatorDescriptor.java
@@ -437,30 +437,33 @@
LOGGER.fine("\n>>>Joining Partition Pairs (pid "+pid+") - (level "+level+") - wasReversed "+wasReversed+" - BuildSize:\t"+buildPartSize+"\tProbeSize:\t"+probePartSize+" - MemForJoin "+(state.memForJoin)+" - LeftOuter is "+isLeftOuter);
//Apply in-Mem HJ if possible
- if(!skipInMemoryHJ){
- if ((buildPartSize < state.memForJoin) || (probePartSize < state.memForJoin && !isLeftOuter)) {
- int tabSize = -1;
- if (!forceRR && (isLeftOuter || (buildPartSize < probePartSize)) ) { //Case 1.1 - InMemHJ (wout Role-Reversal)
- LOGGER.fine("\t>>>Case 1.1 (IsLeftOuter || buildSize<probe) AND ApplyInMemHJ - [Level "+level+"]");
- tabSize = wasReversed ? ohhj.getProbePartitionSizeInTup(pid) : ohhj.getBuildPartitionSizeInTup(pid);
- if (tabSize == 0) {
- throw new HyracksDataException(
- "Trying to join an empty partition. Invalid table size for inMemoryHashJoin.");
- }
- //Build Side is smaller
- applyInMemHashJoin(buildKeys, probeKeys, tabSize, probeRd, buildRd, hpcRep0, hpcRep1,
- buildSideReader, probeSideReader, false, pid); //checked-confirmed
- } else { //Case 1.2 - InMemHJ with Role Reversal
- LOGGER.fine("\t>>>Case 1.2. (NoIsLeftOuter || probe<build) AND ApplyInMemHJ WITH RoleReversal - [Level "+level+"]");
- tabSize = wasReversed ? ohhj.getBuildPartitionSizeInTup(pid) : ohhj.getProbePartitionSizeInTup(pid);
- if (tabSize == 0) {
- throw new HyracksDataException(
- "Trying to join an empty partition. Invalid table size for inMemoryHashJoin.");
- }
- //Probe Side is smaller
- applyInMemHashJoin(probeKeys, buildKeys, tabSize, buildRd, probeRd, hpcRep1, hpcRep0,
- probeSideReader, buildSideReader, true, pid); //checked-confirmed
+ if (!skipInMemoryHJ && (buildPartSize < state.memForJoin)
+ || (probePartSize < state.memForJoin && !isLeftOuter)) {
+ int tabSize = -1;
+ if (!forceRR && (isLeftOuter || (buildPartSize < probePartSize))) { //Case 1.1 - InMemHJ (wout Role-Reversal)
+ LOGGER.fine("\t>>>Case 1.1 (IsLeftOuter || buildSize<probe) AND ApplyInMemHJ - [Level "
+ + level + "]");
+ tabSize = wasReversed ? ohhj.getProbePartitionSizeInTup(pid) : ohhj
+ .getBuildPartitionSizeInTup(pid);
+ if (tabSize == 0) {
+ throw new HyracksDataException(
+ "Trying to join an empty partition. Invalid table size for inMemoryHashJoin.");
}
+ //Build Side is smaller
+ applyInMemHashJoin(buildKeys, probeKeys, tabSize, probeRd, buildRd, hpcRep0, hpcRep1,
+ buildSideReader, probeSideReader, false, pid); //checked-confirmed
+ } else { //Case 1.2 - InMemHJ with Role Reversal
+ LOGGER.fine("\t>>>Case 1.2. (NoIsLeftOuter || probe<build) AND ApplyInMemHJ WITH RoleReversal - [Level "
+ + level + "]");
+ tabSize = wasReversed ? ohhj.getBuildPartitionSizeInTup(pid) : ohhj
+ .getProbePartitionSizeInTup(pid);
+ if (tabSize == 0) {
+ throw new HyracksDataException(
+ "Trying to join an empty partition. Invalid table size for inMemoryHashJoin.");
+ }
+ //Probe Side is smaller
+ applyInMemHashJoin(probeKeys, buildKeys, tabSize, buildRd, probeRd, hpcRep1, hpcRep0,
+ probeSideReader, buildSideReader, true, pid); //checked-confirmed
}
}
//Apply (Recursive) HHJ
@@ -523,10 +526,10 @@
int buildSideInTups = rHHj.getBuildPartitionSizeInTup(rPid);
int probeSideInTups = rHHj.getProbePartitionSizeInTup(rPid);
if (isLeftOuter || buildSideInTups < probeSideInTups) {
- applyNestedLoopJoin(buildRd, probeRd, state.memForJoin, rprfw, rbrfw,
+ applyNestedLoopJoin(buildRd, probeRd, memsize, rprfw, rbrfw,
nljComparator0, false); //checked-modified
} else {
- applyNestedLoopJoin(probeRd, buildRd, state.memForJoin, rbrfw, rprfw,
+ applyNestedLoopJoin(probeRd, buildRd, memsize, rbrfw, rprfw,
nljComparator1, true); //checked-modified
}
}
@@ -585,10 +588,10 @@
long buildSideSize = rbrfw.getFileSize();
long probeSideSize = rprfw.getFileSize();
if (buildSideSize > probeSideSize) {
- applyNestedLoopJoin(buildRd, probeRd, state.memForJoin, rbrfw, rprfw,
+ applyNestedLoopJoin(buildRd, probeRd, memsize, rbrfw, rprfw,
nljComparator0, true); //checked-modified
} else {
- applyNestedLoopJoin(probeRd, buildRd, state.memForJoin, rprfw, rbrfw,
+ applyNestedLoopJoin(probeRd, buildRd, memsize, rprfw, rbrfw,
nljComparator1, true); //checked-modified
}
}
diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
index 847a923..b1d722e 100644
--- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
+++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
@@ -64,7 +64,10 @@
// compute token count
// ignore pre and post grams for duplicate detection
if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
- int tmpIndex = start;
+ int tmpIndex = start + 2; // skip utf8 length indicator
+ if (sourceHasTypeTag) {
+ tmpIndex++; // skip type tag
+ }
while (tmpIndex < currentTokenStart) {
tokenCount++; // assume found
int offset = 0;