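Fix issue 378.

Summary of changes: drop the homogeneous-list restriction from the Jaccard similarity evaluator and build its hash map from the list-item hash function and comparator factories; handle temporal and spatial item types in the list-item comparator and hash function; have the list iterator track each item's length; let BinaryHashMap take separate hash functions for put and get; uncomment the "order by" clause in the fuzzyjoin/opentype query and add that test case to testsuite.xml.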
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/opentype/opentype.3.query.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/opentype/opentype.3.query.aql
index 26d1b83..957de80 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/opentype/opentype.3.query.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/opentype/opentype.3.query.aql
@@ -4,7 +4,7 @@
set simthreshold "0.3";
for $t in dataset TweetMessages
-//order by $t.tweetid
+order by $t.tweetid
return {
"tweet": $t,
"similar-tweets": for $t2 in dataset TweetMessages
diff --git a/asterix-app/src/test/resources/runtimets/testsuite.xml b/asterix-app/src/test/resources/runtimets/testsuite.xml
index 9e6c308..8585570 100644
--- a/asterix-app/src/test/resources/runtimets/testsuite.xml
+++ b/asterix-app/src/test/resources/runtimets/testsuite.xml
@@ -1790,6 +1790,11 @@
</compilation-unit>
</test-case>
-->
+ <test-case FilePath="fuzzyjoin">
+ <compilation-unit name="opentype">
+ <output-dir compare="Text">opentype</output-dir>
+ </compilation-unit>
+ </test-case>
</test-group>
<test-group name="index-join">
<test-case FilePath="index-join">
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java
index 4efc600..379a7e3 100644
--- a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java
@@ -39,11 +39,20 @@
.createBinaryComparator();
final IBinaryComparator ascDoubleComp = new PointableBinaryComparatorFactory(DoublePointable.FACTORY)
.createBinaryComparator();
- final IBinaryComparator ascRectangleComp = RectangleBinaryComparatorFactory.INSTANCE
+ final IBinaryComparator ascRectangleComp = ARectanglePartialBinaryComparatorFactory.INSTANCE
.createBinaryComparator();
- final IBinaryComparator ascDateTimeComp = ADateTimeAscBinaryComparatorFactory.INSTANCE
+ final IBinaryComparator ascCircleComp = ACirclePartialBinaryComparatorFactory.INSTANCE
.createBinaryComparator();
- final IBinaryComparator ascDateOrTimeComp = ADateOrTimeAscBinaryComparatorFactory.INSTANCE
+ final IBinaryComparator ascDurationComp = ADurationPartialBinaryComparatorFactory.INSTANCE
+ .createBinaryComparator();
+ final IBinaryComparator ascIntervalComp = AIntervalPartialBinaryComparatorFactory.INSTANCE
+ .createBinaryComparator();
+ final IBinaryComparator ascLineComp = ALinePartialBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+ final IBinaryComparator ascPointComp = APointPartialBinaryComparatorFactory.INSTANCE
+ .createBinaryComparator();
+ final IBinaryComparator ascPoint3DComp = APoint3DPartialBinaryComparatorFactory.INSTANCE
+ .createBinaryComparator();
+ final IBinaryComparator ascPolygonComp = APolygonPartialBinaryComparatorFactory.INSTANCE
.createBinaryComparator();
final IBinaryComparator rawComp = RawBinaryComparatorFactory.INSTANCE.createBinaryComparator();
@@ -82,9 +91,14 @@
case BOOLEAN: {
return ascBoolComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
}
+ case TIME:
+ case DATE:
+ case YEARMONTHDURATION:
case INT32: {
return ascIntComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
}
+ case DATETIME:
+ case DAYTIMEDURATION:
case INT64: {
return ascLongComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
}
@@ -104,12 +118,26 @@
case RECTANGLE: {
return ascRectangleComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
}
- case DATETIME: {
- return ascDateTimeComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ case CIRCLE: {
+ return ascCircleComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
}
- case TIME:
- case DATE: {
- return ascDateOrTimeComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ case POINT: {
+ return ascPointComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case POINT3D: {
+ return ascPoint3DComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case LINE: {
+ return ascLineComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case POLYGON: {
+ return ascPolygonComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case DURATION: {
+ return ascDurationComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case INTERVAL: {
+ return ascIntervalComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
}
default: {
return rawComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java
index cf7fc8a..1c0773c 100644
--- a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java
@@ -5,6 +5,7 @@
import edu.uci.ics.asterix.om.types.EnumDeserializer;
import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunction;
import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunctionFactory;
+import edu.uci.ics.hyracks.data.std.accessors.MurmurHash3BinaryHashFunctionFamily;
import edu.uci.ics.hyracks.data.std.accessors.PointableBinaryHashFunctionFactory;
import edu.uci.ics.hyracks.data.std.primitive.FloatPointable;
import edu.uci.ics.hyracks.data.std.primitive.IntegerPointable;
@@ -39,9 +40,8 @@
.createBinaryHashFunction();
private IBinaryHashFunction doubleHash = DoubleBinaryHashFunctionFactory.INSTANCE
.createBinaryHashFunction();
- private IBinaryHashFunction rectangleHash = RectangleBinaryHashFunctionFactory.INSTANCE
- .createBinaryHashFunction();
- private IBinaryHashFunction rawHash = RawBinaryHashFunctionFactory.INSTANCE.createBinaryHashFunction();
+ private IBinaryHashFunction genericBinaryHash = MurmurHash3BinaryHashFunctionFamily.INSTANCE
+ .createBinaryHashFunction(0);
@Override
public int hash(byte[] bytes, int offset, int length) {
@@ -55,9 +55,14 @@
case BOOLEAN: {
return boolHash.hash(bytes, offset + skip, length - skip);
}
+ case TIME:
+ case DATE:
+ case YEARMONTHDURATION:
case INT32: {
return intHash.hash(bytes, offset + skip, length - skip);
}
+ case DATETIME:
+ case DAYTIMEDURATION:
case INT64: {
return longHash.hash(bytes, offset + skip, length - skip);
}
@@ -74,14 +79,11 @@
return stringHash.hash(bytes, offset + skip, length - skip);
}
}
- case RECTANGLE: {
- return rectangleHash.hash(bytes, offset + skip, length - skip);
- }
case NULL: {
return 0;
}
default: {
- return rawHash.hash(bytes, offset + skip, length - skip);
+ return genericBinaryHash.hash(bytes, offset + skip, length - skip);
}
}
}
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
index efae5f9..84eef1d 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
@@ -13,6 +13,8 @@
protected byte[] data;
protected int count = 0;
protected int pos = -1;
+ protected int nextPos = -1;
+ protected int itemLen = -1;
protected int size = -1;
protected int startOff = -1;
protected IBinaryComparator cmp;
@@ -44,11 +46,21 @@
public int getPos() {
return pos;
}
+
+ public int getItemLen() {
+ return itemLen;
+ }
@Override
public void next() {
try {
- pos = getItemOffset(data, startOff, ++count);
+ pos = nextPos;
+ ++count;
+ nextPos = data.length;
+ if (count + 1 < size) {
+ nextPos = getItemOffset(data, startOff, count + 1);
+ }
+ itemLen = nextPos - pos;
} catch (AsterixException e) {
throw new AsterixRuntimeException(e);
}
@@ -59,6 +71,11 @@
count = 0;
try {
pos = getItemOffset(data, startOff, count);
+ nextPos = data.length;
+ if (count + 1 < size) {
+ nextPos = getItemOffset(data, startOff, count + 1);
+ }
+ itemLen = nextPos - pos;
} catch (AsterixException e) {
throw new AsterixRuntimeException(e);
}
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/SimilarityJaccardCheckEvaluator.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/SimilarityJaccardCheckEvaluator.java
index ab73df2..11c4d6b 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/SimilarityJaccardCheckEvaluator.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/SimilarityJaccardCheckEvaluator.java
@@ -62,7 +62,7 @@
probeListCount++;
byte[] buf = probeIter.getData();
int off = probeIter.getPos();
- int len = getItemLen(buf, off);
+ int len = probeIter.getItemLen();
keyEntry.set(buf, off, len);
BinaryEntry entry = hashMap.get(keyEntry);
if (entry != null) {
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/SimilarityJaccardEvaluator.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/SimilarityJaccardEvaluator.java
index 9f5c9c8..c610bde 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/SimilarityJaccardEvaluator.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/SimilarityJaccardEvaluator.java
@@ -4,8 +4,6 @@
import java.io.IOException;
import java.util.Arrays;
-import edu.uci.ics.asterix.formats.nontagged.AqlBinaryComparatorFactoryProvider;
-import edu.uci.ics.asterix.formats.nontagged.AqlBinaryHashFunctionFactoryProvider;
import edu.uci.ics.asterix.formats.nontagged.AqlSerializerDeserializerProvider;
import edu.uci.ics.asterix.om.base.AFloat;
import edu.uci.ics.asterix.om.base.AMutableFloat;
@@ -22,9 +20,10 @@
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
import edu.uci.ics.hyracks.data.std.api.IDataOutputProvider;
import edu.uci.ics.hyracks.data.std.primitive.IntegerPointable;
-import edu.uci.ics.hyracks.data.std.primitive.UTF8StringPointable;
import edu.uci.ics.hyracks.data.std.util.ArrayBackedValueStorage;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import edu.uci.ics.asterix.dataflow.data.nontagged.comparators.ListItemBinaryComparatorFactory;
+import edu.uci.ics.asterix.dataflow.data.nontagged.hash.ListItemBinaryHashFunctionFactory;
public class SimilarityJaccardEvaluator implements ICopyEvaluator {
@@ -58,7 +57,8 @@
protected int firstStart = -1;
protected int secondStart = -1;
protected float jaccSim = 0.0f;
- protected ATypeTag itemTypeTag;
+ protected ATypeTag firstItemTypeTag;
+ protected ATypeTag secondItemTypeTag;
protected BinaryHashMap hashMap;
protected BinaryEntry keyEntry = new BinaryEntry();
@@ -105,6 +105,9 @@
firstTypeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argOut.getByteArray()[firstStart]);
secondTypeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argOut.getByteArray()[secondStart]);
+
+ firstItemTypeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argOut.getByteArray()[firstStart + 1]);
+ secondItemTypeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argOut.getByteArray()[secondStart + 1]);
}
protected boolean prepareLists(byte[] bytes, int firstStart, int secondStart, ATypeTag argType)
@@ -116,17 +119,12 @@
if (firstListIter.size() == 0 || secondListIter.size() == 0) {
return false;
}
- if (firstTypeTag == ATypeTag.ANY || secondTypeTag == ATypeTag.ANY) {
- throw new AlgebricksException("\n Jaccard can only be called on homogenous lists");
- }
// TODO: Check item types are compatible.
- itemTypeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(bytes[firstStart + 1]);
return true;
}
protected float computeResult(byte[] bytes, int firstStart, int secondStart, ATypeTag argType)
throws AlgebricksException {
- setHashMap(bytes, firstStart, secondStart);
// We will subtract the intersection size later to get the real union size.
int firstListSize = firstListIter.size();
int secondListSize = secondListIter.size();
@@ -136,7 +134,10 @@
AbstractAsterixListIterator probeList = (buildList == firstListIter) ? secondListIter : firstListIter;
int buildListSize = (buildList == firstListIter) ? firstListSize : secondListSize;
int probeListSize = (probeList == firstListIter) ? firstListSize : secondListSize;
+ ATypeTag buildItemTypeTag = (buildList == firstListIter) ? firstItemTypeTag : secondItemTypeTag;
+ ATypeTag probeItemTypeTag = (probeList == firstListIter) ? firstItemTypeTag : secondItemTypeTag;
+ setHashMap(bytes, buildItemTypeTag, probeItemTypeTag);
buildHashMap(buildList);
int intersectionSize = probeHashMap(probeList, buildListSize, probeListSize);
// Special indicator for the "check" version of jaccard.
@@ -154,7 +155,7 @@
while (buildIter.hasNext()) {
byte[] buf = buildIter.getData();
int off = buildIter.getPos();
- int len = getItemLen(buf, off);
+ int len = buildIter.getItemLen();
keyEntry.set(buf, off, len);
BinaryEntry entry = hashMap.put(keyEntry, valEntry);
if (entry != null) {
@@ -172,7 +173,7 @@
while (probeIter.hasNext()) {
byte[] buf = probeIter.getData();
int off = probeIter.getPos();
- int len = getItemLen(buf, off);
+ int len = probeIter.getItemLen();
keyEntry.set(buf, off, len);
BinaryEntry entry = hashMap.get(keyEntry);
if (entry != null) {
@@ -195,69 +196,19 @@
return intersectionSize;
}
- protected void setHashMap(byte[] bytes, int firstStart, int secondStart) {
+ protected void setHashMap(byte[] bytes, ATypeTag buildItemTypeTag, ATypeTag probeItemTypeTag) {
if (hashMap != null) {
hashMap.clear();
return;
}
- IBinaryHashFunction hashFunc = null;
- IBinaryComparator cmp = null;
- switch (itemTypeTag) {
- case INT32: {
- hashFunc = AqlBinaryHashFunctionFactoryProvider.INTEGER_POINTABLE_INSTANCE.createBinaryHashFunction();
- cmp = AqlBinaryComparatorFactoryProvider.INTEGER_POINTABLE_INSTANCE.createBinaryComparator();
- break;
- }
- case FLOAT: {
- hashFunc = AqlBinaryHashFunctionFactoryProvider.FLOAT_POINTABLE_INSTANCE.createBinaryHashFunction();
- cmp = AqlBinaryComparatorFactoryProvider.FLOAT_POINTABLE_INSTANCE.createBinaryComparator();
- break;
- }
- case DOUBLE: {
- hashFunc = AqlBinaryHashFunctionFactoryProvider.DOUBLE_POINTABLE_INSTANCE.createBinaryHashFunction();
- cmp = AqlBinaryComparatorFactoryProvider.DOUBLE_POINTABLE_INSTANCE.createBinaryComparator();
- break;
- }
- case STRING: {
- if (ignoreCase) {
- // Ignore case in comparisons and hashing.
- hashFunc = AqlBinaryHashFunctionFactoryProvider.UTF8STRING_LOWERCASE_POINTABLE_INSTANCE
- .createBinaryHashFunction();
- cmp = AqlBinaryComparatorFactoryProvider.UTF8STRING_LOWERCASE_POINTABLE_INSTANCE
- .createBinaryComparator();
- } else {
- hashFunc = AqlBinaryHashFunctionFactoryProvider.UTF8STRING_POINTABLE_INSTANCE
- .createBinaryHashFunction();
- cmp = AqlBinaryComparatorFactoryProvider.UTF8STRING_POINTABLE_INSTANCE.createBinaryComparator();
- }
- break;
- }
- default: {
- break;
- }
- }
- hashMap = new BinaryHashMap(TABLE_SIZE, TABLE_FRAME_SIZE, hashFunc, cmp);
- }
-
- protected int getItemLen(byte[] bytes, int itemOff) {
- switch (itemTypeTag) {
- case INT32: {
- return 4;
- }
- case FLOAT: {
- return 4;
- }
- case DOUBLE: {
- return 8;
- }
- case STRING: {
- // 2 bytes for the UTF8 len, plus the string data.
- return 2 + UTF8StringPointable.getUTFLength(bytes, itemOff);
- }
- default: {
- return -1;
- }
- }
+
+ IBinaryHashFunction putHashFunc = ListItemBinaryHashFunctionFactory.INSTANCE
+ .createBinaryHashFunction(buildItemTypeTag, ignoreCase);
+ IBinaryHashFunction getHashFunc = ListItemBinaryHashFunctionFactory.INSTANCE
+ .createBinaryHashFunction(probeItemTypeTag, ignoreCase);
+ IBinaryComparator cmp = ListItemBinaryComparatorFactory.INSTANCE
+ .createBinaryComparator(buildItemTypeTag, probeItemTypeTag, ignoreCase);
+ hashMap = new BinaryHashMap(TABLE_SIZE, TABLE_FRAME_SIZE, putHashFunc, getHashFunc, cmp);
}
protected boolean checkArgTypes(ATypeTag typeTag1, ATypeTag typeTag2) throws AlgebricksException {
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/BinaryHashMap.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/BinaryHashMap.java
index 240f8c7..f69a54e 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/BinaryHashMap.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/BinaryHashMap.java
@@ -33,7 +33,8 @@
private static final int PTR_SIZE = 8;
private static final int SLOT_SIZE = 2;
private static final int ENTRY_HEADER_SIZE = PTR_SIZE + 2 * SLOT_SIZE;
- private final IBinaryHashFunction hashFunc;
+ private final IBinaryHashFunction putHashFunc;
+ private final IBinaryHashFunction getHashFunc;
private final IBinaryComparator cmp;
private final BinaryEntry returnValue = new BinaryEntry();
@@ -65,10 +66,11 @@
}
}
- public BinaryHashMap(int tableSize, int frameSize, IBinaryHashFunction hashFunc, IBinaryComparator cmp) {
+ public BinaryHashMap(int tableSize, int frameSize, IBinaryHashFunction putHashFunc, IBinaryHashFunction getHashFunc, IBinaryComparator cmp) {
listHeads = new long[tableSize];
this.frameSize = frameSize;
- this.hashFunc = hashFunc;
+ this.putHashFunc = putHashFunc;
+ this.getHashFunc = getHashFunc;
this.cmp = cmp;
frames.add(ByteBuffer.allocate(frameSize));
clear();
@@ -98,7 +100,12 @@
}
private BinaryEntry getPutInternal(BinaryEntry key, BinaryEntry value, boolean put) {
- int bucket = Math.abs(hashFunc.hash(key.buf, key.off, key.len) % listHeads.length);
+ int bucket;
+ if (put) {
+ bucket = Math.abs(putHashFunc.hash(key.buf, key.off, key.len) % listHeads.length);
+ } else {
+ bucket = Math.abs(getHashFunc.hash(key.buf, key.off, key.len) % listHeads.length);
+ }
long headPtr = listHeads[bucket];
if (headPtr == NULL_PTR) {
// Key definitely doesn't exist yet.