Merge branch 'master' of https://code.google.com/p/asterixdb into icetindil/issue_378
diff --git a/asterix-app/data/tinysocial/twm.adm b/asterix-app/data/tinysocial/twm.adm
new file mode 100644
index 0000000..d18c70f
--- /dev/null
+++ b/asterix-app/data/tinysocial/twm.adm
@@ -0,0 +1,12 @@
+{"tweetid":"1","user":{"screen-name":"NathanGiesen@211","lang":"en","friends_count":39339,"statuses_count":473,"name":"Nathan Giesen","followers_count":49416},"sender-location":point("47.44,80.65"),"send-time":datetime("2008-04-26T10:10:00"),"referred-topics":{{"t-mobile","customization"}},"message-text":" love t-mobile its customization is good:)"}
+{"tweetid":"2","user":{"screen-name":"ColineGeyer@63","lang":"en","friends_count":121,"statuses_count":362,"name":"Coline Geyer","followers_count":17159},"sender-location":point("32.84,67.14"),"send-time":datetime("2010-05-13T10:10:00"),"referred-topics":{{"verizon","shortcut-menu"}},"message-text":" like verizon its shortcut-menu is awesome:)"}
+{"tweetid":"3","user":{"screen-name":"NathanGiesen@211","lang":"en","friends_count":39339,"statuses_count":473,"name":"Nathan Giesen","followers_count":49416},"sender-location":point("29.72,75.8"),"send-time":datetime("2006-11-04T10:10:00"),"referred-topics":{{"motorola","speed"}},"message-text":" like motorola the speed is good:)"}
+{"tweetid":"4","user":{"screen-name":"NathanGiesen@211","lang":"en","friends_count":39339,"statuses_count":473,"name":"Nathan Giesen","followers_count":49416},"sender-location":point("39.28,70.48"),"send-time":datetime("2011-12-26T10:10:00"),"referred-topics":{{"sprint","voice-command"}},"message-text":" like sprint the voice-command is mind-blowing:)"}
+{"tweetid":"5","user":{"screen-name":"NathanGiesen@211","lang":"en","friends_count":39339,"statuses_count":473,"name":"Nathan Giesen","followers_count":49416},"sender-location":point("40.09,92.69"),"send-time":datetime("2006-08-04T10:10:00"),"referred-topics":{{"motorola","speed"}},"message-text":" can't stand motorola its speed is terrible:("}
+{"tweetid":"6","user":{"screen-name":"ColineGeyer@63","lang":"en","friends_count":121,"statuses_count":362,"name":"Coline Geyer","followers_count":17159},"sender-location":point("47.51,83.99"),"send-time":datetime("2010-05-07T10:10:00"),"referred-topics":{{"iphone","voice-clarity"}},"message-text":" like iphone the voice-clarity is good:)"}
+{"tweetid":"7","user":{"screen-name":"ChangEwing_573","lang":"en","friends_count":182,"statuses_count":394,"name":"Chang Ewing","followers_count":32136},"sender-location":point("36.21,72.6"),"send-time":datetime("2011-08-25T10:10:00"),"referred-topics":{{"samsung","platform"}},"message-text":" like samsung the platform is good"}
+{"tweetid":"8","user":{"screen-name":"NathanGiesen@211","lang":"en","friends_count":39339,"statuses_count":473,"name":"Nathan Giesen","followers_count":49416},"sender-location":point("46.05,93.34"),"send-time":datetime("2005-10-14T10:10:00"),"referred-topics":{{"t-mobile","shortcut-menu"}},"message-text":" like t-mobile the shortcut-menu is awesome:)"}
+{"tweetid":"9","user":{"screen-name":"NathanGiesen@211","lang":"en","friends_count":39339,"statuses_count":473,"name":"Nathan Giesen","followers_count":49416},"sender-location":point("36.86,74.62"),"send-time":datetime("2012-07-21T10:10:00"),"referred-topics":{{"verizon","voicemail-service"}},"message-text":" love verizon its voicemail-service is awesome"}
+{"tweetid":"10","user":{"screen-name":"ColineGeyer@63","lang":"en","friends_count":121,"statuses_count":362,"name":"Coline Geyer","followers_count":17159},"sender-location":point("29.15,76.53"),"send-time":datetime("2008-01-26T10:10:00"),"referred-topics":{{"verizon","voice-clarity"}},"message-text":" hate verizon its voice-clarity is OMG:("}
+{"tweetid":"11","user":{"screen-name":"NilaMilliron_tw","lang":"en","friends_count":445,"statuses_count":164,"name":"Nila Milliron","followers_count":22649},"sender-location":point("37.59,68.42"),"send-time":datetime("2008-03-09T10:10:00"),"referred-topics":{{"iphone","platform"}},"message-text":" can't stand iphone its platform is terrible"}
+{"tweetid":"12","user":{"screen-name":"OliJackson_512","lang":"en","friends_count":445,"statuses_count":164,"name":"Oli Jackson","followers_count":22649},"sender-location":point("24.82,94.63"),"send-time":datetime("2010-02-13T10:10:00"),"referred-topics":{{"samsung","voice-command"}},"message-text":" like samsung the voice-command is amazing:)"}
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/opentype/opentype.1.ddl.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/opentype/opentype.1.ddl.aql
new file mode 100644
index 0000000..17da24b
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/opentype/opentype.1.ddl.aql
@@ -0,0 +1,13 @@
+drop dataverse TinySocial if exists;
+create dataverse TinySocial;
+use dataverse TinySocial;
+
+create type TweetMessageType as open {
+ tweetid: string
+}
+
+create dataset TweetMessages(TweetMessageType)
+primary key tweetid
+hints(cardinality=100);
+
+
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/opentype/opentype.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/opentype/opentype.2.update.aql
new file mode 100644
index 0000000..627623a
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/opentype/opentype.2.update.aql
@@ -0,0 +1,5 @@
+use dataverse TinySocial;
+
+load dataset TweetMessages
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/tinysocial/twm.adm"),("format"="adm"));
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/opentype/opentype.3.query.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/opentype/opentype.3.query.aql
new file mode 100644
index 0000000..f40e884
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/opentype/opentype.3.query.aql
@@ -0,0 +1,15 @@
+use dataverse TinySocial;
+
+set simfunction "jaccard";
+set simthreshold "0.3";
+
+for $t in dataset TweetMessages
+order by $t.tweetid
+return {
+ "tweet": $t,
+ "similar-tweets": for $t2 in dataset TweetMessages
+ order by $t2.tweetid
+ where $t2.referred-topics ~= $t.referred-topics
+ and $t2.tweetid != $t.tweetid
+ return $t2.referred-topics
+};
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/results/fuzzyjoin/opentype/opentype.1.adm b/asterix-app/src/test/resources/runtimets/results/fuzzyjoin/opentype/opentype.1.adm
new file mode 100644
index 0000000..ae0c4cb
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/results/fuzzyjoin/opentype/opentype.1.adm
@@ -0,0 +1,12 @@
+{ "tweet": { "tweetid": "1", "user": { "screen-name": "NathanGiesen@211", "lang": "en", "friends_count": 39339, "statuses_count": 473, "name": "Nathan Giesen", "followers_count": 49416 }, "sender-location": point("47.44,80.65"), "send-time": datetime("2008-04-26T10:10:00.000Z"), "referred-topics": {{ "t-mobile", "customization" }}, "message-text": " love t-mobile its customization is good:)" }, "similar-tweets": [ {{ "t-mobile", "shortcut-menu" }} ] }
+{ "tweet": { "tweetid": "10", "user": { "screen-name": "ColineGeyer@63", "lang": "en", "friends_count": 121, "statuses_count": 362, "name": "Coline Geyer", "followers_count": 17159 }, "sender-location": point("29.15,76.53"), "send-time": datetime("2008-01-26T10:10:00.000Z"), "referred-topics": {{ "verizon", "voice-clarity" }}, "message-text": " hate verizon its voice-clarity is OMG:(" }, "similar-tweets": [ {{ "verizon", "shortcut-menu" }}, {{ "iphone", "voice-clarity" }}, {{ "verizon", "voicemail-service" }} ] }
+{ "tweet": { "tweetid": "11", "user": { "screen-name": "NilaMilliron_tw", "lang": "en", "friends_count": 445, "statuses_count": 164, "name": "Nila Milliron", "followers_count": 22649 }, "sender-location": point("37.59,68.42"), "send-time": datetime("2008-03-09T10:10:00.000Z"), "referred-topics": {{ "iphone", "platform" }}, "message-text": " can't stand iphone its platform is terrible" }, "similar-tweets": [ {{ "iphone", "voice-clarity" }}, {{ "samsung", "platform" }} ] }
+{ "tweet": { "tweetid": "12", "user": { "screen-name": "OliJackson_512", "lang": "en", "friends_count": 445, "statuses_count": 164, "name": "Oli Jackson", "followers_count": 22649 }, "sender-location": point("24.82,94.63"), "send-time": datetime("2010-02-13T10:10:00.000Z"), "referred-topics": {{ "samsung", "voice-command" }}, "message-text": " like samsung the voice-command is amazing:)" }, "similar-tweets": [ {{ "sprint", "voice-command" }}, {{ "samsung", "platform" }} ] }
+{ "tweet": { "tweetid": "2", "user": { "screen-name": "ColineGeyer@63", "lang": "en", "friends_count": 121, "statuses_count": 362, "name": "Coline Geyer", "followers_count": 17159 }, "sender-location": point("32.84,67.14"), "send-time": datetime("2010-05-13T10:10:00.000Z"), "referred-topics": {{ "verizon", "shortcut-menu" }}, "message-text": " like verizon its shortcut-menu is awesome:)" }, "similar-tweets": [ {{ "verizon", "voice-clarity" }}, {{ "t-mobile", "shortcut-menu" }}, {{ "verizon", "voicemail-service" }} ] }
+{ "tweet": { "tweetid": "3", "user": { "screen-name": "NathanGiesen@211", "lang": "en", "friends_count": 39339, "statuses_count": 473, "name": "Nathan Giesen", "followers_count": 49416 }, "sender-location": point("29.72,75.8"), "send-time": datetime("2006-11-04T10:10:00.000Z"), "referred-topics": {{ "motorola", "speed" }}, "message-text": " like motorola the speed is good:)" }, "similar-tweets": [ {{ "motorola", "speed" }} ] }
+{ "tweet": { "tweetid": "4", "user": { "screen-name": "NathanGiesen@211", "lang": "en", "friends_count": 39339, "statuses_count": 473, "name": "Nathan Giesen", "followers_count": 49416 }, "sender-location": point("39.28,70.48"), "send-time": datetime("2011-12-26T10:10:00.000Z"), "referred-topics": {{ "sprint", "voice-command" }}, "message-text": " like sprint the voice-command is mind-blowing:)" }, "similar-tweets": [ {{ "samsung", "voice-command" }} ] }
+{ "tweet": { "tweetid": "5", "user": { "screen-name": "NathanGiesen@211", "lang": "en", "friends_count": 39339, "statuses_count": 473, "name": "Nathan Giesen", "followers_count": 49416 }, "sender-location": point("40.09,92.69"), "send-time": datetime("2006-08-04T10:10:00.000Z"), "referred-topics": {{ "motorola", "speed" }}, "message-text": " can't stand motorola its speed is terrible:(" }, "similar-tweets": [ {{ "motorola", "speed" }} ] }
+{ "tweet": { "tweetid": "6", "user": { "screen-name": "ColineGeyer@63", "lang": "en", "friends_count": 121, "statuses_count": 362, "name": "Coline Geyer", "followers_count": 17159 }, "sender-location": point("47.51,83.99"), "send-time": datetime("2010-05-07T10:10:00.000Z"), "referred-topics": {{ "iphone", "voice-clarity" }}, "message-text": " like iphone the voice-clarity is good:)" }, "similar-tweets": [ {{ "verizon", "voice-clarity" }}, {{ "iphone", "platform" }} ] }
+{ "tweet": { "tweetid": "7", "user": { "screen-name": "ChangEwing_573", "lang": "en", "friends_count": 182, "statuses_count": 394, "name": "Chang Ewing", "followers_count": 32136 }, "sender-location": point("36.21,72.6"), "send-time": datetime("2011-08-25T10:10:00.000Z"), "referred-topics": {{ "samsung", "platform" }}, "message-text": " like samsung the platform is good" }, "similar-tweets": [ {{ "iphone", "platform" }}, {{ "samsung", "voice-command" }} ] }
+{ "tweet": { "tweetid": "8", "user": { "screen-name": "NathanGiesen@211", "lang": "en", "friends_count": 39339, "statuses_count": 473, "name": "Nathan Giesen", "followers_count": 49416 }, "sender-location": point("46.05,93.34"), "send-time": datetime("2005-10-14T10:10:00.000Z"), "referred-topics": {{ "t-mobile", "shortcut-menu" }}, "message-text": " like t-mobile the shortcut-menu is awesome:)" }, "similar-tweets": [ {{ "t-mobile", "customization" }}, {{ "verizon", "shortcut-menu" }} ] }
+{ "tweet": { "tweetid": "9", "user": { "screen-name": "NathanGiesen@211", "lang": "en", "friends_count": 39339, "statuses_count": 473, "name": "Nathan Giesen", "followers_count": 49416 }, "sender-location": point("36.86,74.62"), "send-time": datetime("2012-07-21T10:10:00.000Z"), "referred-topics": {{ "verizon", "voicemail-service" }}, "message-text": " love verizon its voicemail-service is awesome" }, "similar-tweets": [ {{ "verizon", "voice-clarity" }}, {{ "verizon", "shortcut-menu" }} ] }
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/testsuite.xml b/asterix-app/src/test/resources/runtimets/testsuite.xml
index 1fb101d..0d523b0 100644
--- a/asterix-app/src/test/resources/runtimets/testsuite.xml
+++ b/asterix-app/src/test/resources/runtimets/testsuite.xml
@@ -1750,6 +1750,11 @@
</compilation-unit>
</test-case>
-->
+ <test-case FilePath="fuzzyjoin">
+ <compilation-unit name="opentype">
+ <output-dir compare="Text">opentype</output-dir>
+ </compilation-unit>
+ </test-case>
</test-group>
<test-group name="index-join">
<test-case FilePath="index-join">
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java
new file mode 100644
index 0000000..01e34b0
--- /dev/null
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/nontagged/comparators/ListItemBinaryComparatorFactory.java
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.asterix.dataflow.data.nontagged.comparators;
+
+import edu.uci.ics.asterix.formats.nontagged.UTF8StringLowercasePointable;
+import edu.uci.ics.asterix.om.types.ATypeTag;
+import edu.uci.ics.asterix.om.types.EnumDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
+import edu.uci.ics.hyracks.data.std.accessors.PointableBinaryComparatorFactory;
+import edu.uci.ics.hyracks.data.std.primitive.DoublePointable;
+import edu.uci.ics.hyracks.data.std.primitive.FloatPointable;
+import edu.uci.ics.hyracks.data.std.primitive.IntegerPointable;
+import edu.uci.ics.hyracks.data.std.primitive.UTF8StringPointable;
+
+public class ListItemBinaryComparatorFactory implements IBinaryComparatorFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ public static final ListItemBinaryComparatorFactory INSTANCE = new ListItemBinaryComparatorFactory();
+
+ private ListItemBinaryComparatorFactory() {
+ }
+
+ @Override
+ public IBinaryComparator createBinaryComparator() {
+ return createBinaryComparator(ATypeTag.NULL, ATypeTag.NULL, false);
+ }
+
+ public IBinaryComparator createBinaryComparator(final ATypeTag firstItemTypeTag, final ATypeTag secondItemTypeTag, final boolean ignoreCase) {
+ return new IBinaryComparator() {
+ final IBinaryComparator ascBoolComp = BooleanBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+ final IBinaryComparator ascIntComp = new PointableBinaryComparatorFactory(IntegerPointable.FACTORY)
+ .createBinaryComparator();
+ final IBinaryComparator ascLongComp = LongBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+ final IBinaryComparator ascStrComp = new PointableBinaryComparatorFactory(UTF8StringPointable.FACTORY)
+ .createBinaryComparator();
+ final IBinaryComparator ascLowerCaseStrComp = new PointableBinaryComparatorFactory(UTF8StringLowercasePointable.FACTORY)
+ .createBinaryComparator();
+ final IBinaryComparator ascFloatComp = new PointableBinaryComparatorFactory(FloatPointable.FACTORY)
+ .createBinaryComparator();
+ final IBinaryComparator ascDoubleComp = new PointableBinaryComparatorFactory(DoublePointable.FACTORY)
+ .createBinaryComparator();
+ final IBinaryComparator ascRectangleComp = ARectanglePartialBinaryComparatorFactory.INSTANCE
+ .createBinaryComparator();
+ final IBinaryComparator ascCircleComp = ACirclePartialBinaryComparatorFactory.INSTANCE
+ .createBinaryComparator();
+ final IBinaryComparator ascDurationComp = ADurationPartialBinaryComparatorFactory.INSTANCE
+ .createBinaryComparator();
+ final IBinaryComparator ascIntervalComp = AIntervalPartialBinaryComparatorFactory.INSTANCE
+ .createBinaryComparator();
+ final IBinaryComparator ascLineComp = ALinePartialBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+ final IBinaryComparator ascPointComp = APointPartialBinaryComparatorFactory.INSTANCE
+ .createBinaryComparator();
+ final IBinaryComparator ascPoint3DComp = APoint3DPartialBinaryComparatorFactory.INSTANCE
+ .createBinaryComparator();
+ final IBinaryComparator ascPolygonComp = APolygonPartialBinaryComparatorFactory.INSTANCE
+ .createBinaryComparator();
+ final IBinaryComparator rawComp = RawBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+
+ @Override
+ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
+
+ if (b1[s1] == ATypeTag.NULL.serialize()) {
+ if (b2[s2] == ATypeTag.NULL.serialize())
+ return 0;
+ else
+ return -1;
+ } else {
+ if (b2[s2] == ATypeTag.NULL.serialize())
+ return 1;
+ }
+
+ ATypeTag tag1 = firstItemTypeTag;
+ int skip1 = 0;
+ if (firstItemTypeTag == ATypeTag.ANY) {
+ tag1 = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(b1[s1]);
+ skip1 = 1;
+ }
+
+ ATypeTag tag2 = secondItemTypeTag;
+ int skip2 = 0;
+ if (secondItemTypeTag == ATypeTag.ANY) {
+ tag2 = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(b2[s2]);
+ skip2 = 1;
+ }
+
+ if (tag1 != tag2) {
+ return rawComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+
+ switch (tag1) {
+ case BOOLEAN: {
+ return ascBoolComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case TIME:
+ case DATE:
+ case YEARMONTHDURATION:
+ case INT32: {
+ return ascIntComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case DATETIME:
+ case DAYTIMEDURATION:
+ case INT64: {
+ return ascLongComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case FLOAT: {
+ return ascFloatComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case DOUBLE: {
+ return ascDoubleComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case STRING: {
+ if (ignoreCase) {
+ return ascLowerCaseStrComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ } else {
+ return ascStrComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ }
+ case RECTANGLE: {
+ return ascRectangleComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case CIRCLE: {
+ return ascCircleComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case POINT: {
+ return ascPointComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case POINT3D: {
+ return ascPoint3DComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case LINE: {
+ return ascLineComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case POLYGON: {
+ return ascPolygonComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case DURATION: {
+ return ascDurationComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ case INTERVAL: {
+ return ascIntervalComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ default: {
+ return rawComp.compare(b1, s1 + skip1, l1 - skip1, b2, s2 + skip2, l2 - skip2);
+ }
+ }
+ }
+ };
+ }
+}
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java
new file mode 100644
index 0000000..67878a3
--- /dev/null
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/nontagged/hash/ListItemBinaryHashFunctionFactory.java
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.asterix.dataflow.data.nontagged.hash;
+
+import edu.uci.ics.asterix.formats.nontagged.UTF8StringLowercasePointable;
+import edu.uci.ics.asterix.om.types.ATypeTag;
+import edu.uci.ics.asterix.om.types.EnumDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunction;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunctionFactory;
+import edu.uci.ics.hyracks.data.std.accessors.MurmurHash3BinaryHashFunctionFamily;
+import edu.uci.ics.hyracks.data.std.accessors.PointableBinaryHashFunctionFactory;
+
+/**
+ * This hash function factory is introduced to be able to hash heterogeneous list items.
+ * The item type tag is also included in the hash computation to distinguish the different
+ * types with the same raw bytes.
+ *
+ */
+public class ListItemBinaryHashFunctionFactory implements IBinaryHashFunctionFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ public static final ListItemBinaryHashFunctionFactory INSTANCE = new ListItemBinaryHashFunctionFactory();
+
+ private ListItemBinaryHashFunctionFactory() {
+ }
+
+ @Override
+ public IBinaryHashFunction createBinaryHashFunction() {
+ return createBinaryHashFunction(ATypeTag.ANY, false);
+ }
+
+ public IBinaryHashFunction createBinaryHashFunction(final ATypeTag itemTypeTag, final boolean ignoreCase) {
+ return new IBinaryHashFunction() {
+
+ private IBinaryHashFunction lowerCaseStringHash = new PointableBinaryHashFunctionFactory(UTF8StringLowercasePointable.FACTORY)
+ .createBinaryHashFunction();
+ private IBinaryHashFunction genericBinaryHash = MurmurHash3BinaryHashFunctionFamily.INSTANCE
+ .createBinaryHashFunction(0);
+
+ @Override
+ public int hash(byte[] bytes, int offset, int length) {
+ ATypeTag tag = itemTypeTag;
+ int skip = 0;
+ if (itemTypeTag == ATypeTag.ANY) {
+ tag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(bytes[offset]);
+ skip = 1;
+ }
+ switch (tag) {
+ case STRING: {
+ if (ignoreCase) {
+ return lowerCaseStringHash.hash(bytes, offset + skip, length - skip);
+ }
+ }
+ default: {
+ if (itemTypeTag != ATypeTag.ANY) {
+ // add the type tag
+ byte[] taggedBytes = new byte[length + 1];
+ taggedBytes[0] = itemTypeTag.serialize();
+ System.arraycopy(bytes, offset, taggedBytes, 1, length);
+ return genericBinaryHash.hash(taggedBytes, 0, length + 1);
+ } else {
+ return genericBinaryHash.hash(bytes, offset, length);
+ }
+ }
+ }
+ }
+ };
+ }
+}
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
index efae5f9..2f9cfc7 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
@@ -13,7 +13,10 @@
protected byte[] data;
protected int count = 0;
protected int pos = -1;
- protected int size = -1;
+ protected int nextPos = -1;
+ protected int itemLen = -1;
+ protected int numberOfItems = -1;
+ protected int listLength = -1;
protected int startOff = -1;
protected IBinaryComparator cmp;
@@ -27,12 +30,12 @@
@Override
public boolean hasNext() {
- return count < size;
+ return count < numberOfItems;
}
@Override
public int size() {
- return size;
+ return numberOfItems;
}
@Override
@@ -44,11 +47,21 @@
public int getPos() {
return pos;
}
+
+ public int getItemLen() {
+ return itemLen;
+ }
@Override
public void next() {
try {
- pos = getItemOffset(data, startOff, ++count);
+ pos = nextPos;
+ ++count;
+ nextPos = startOff + listLength;
+ if (count + 1 < numberOfItems) {
+ nextPos = getItemOffset(data, startOff, count + 1);
+ }
+ itemLen = nextPos - pos;
} catch (AsterixException e) {
throw new AsterixRuntimeException(e);
}
@@ -59,6 +72,11 @@
count = 0;
try {
pos = getItemOffset(data, startOff, count);
+ nextPos = startOff + listLength;
+ if (count + 1 < numberOfItems) {
+ nextPos = getItemOffset(data, startOff, count + 1);
+ }
+ itemLen = nextPos - pos;
} catch (AsterixException e) {
throw new AsterixRuntimeException(e);
}
@@ -67,7 +85,8 @@
public void reset(byte[] data, int startOff) {
this.data = data;
this.startOff = startOff;
- size = getNumberOfItems(data, startOff);
+ this.numberOfItems = getNumberOfItems(data, startOff);
+ this.listLength = getListLength(data, startOff);
ATypeTag tag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(data[startOff + 1]);
switch (tag) {
case INT32: {
@@ -102,4 +121,6 @@
protected abstract int getItemOffset(byte[] serOrderedList, int offset, int itemIndex) throws AsterixException;
protected abstract int getNumberOfItems(byte[] serOrderedList, int offset);
+
+ protected abstract int getListLength(byte[] serOrderedList, int offset);
}
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/AsterixOrderedListIterator.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/AsterixOrderedListIterator.java
index d3714c1..6a73b6d 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/AsterixOrderedListIterator.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/AsterixOrderedListIterator.java
@@ -14,4 +14,9 @@
protected int getNumberOfItems(byte[] serOrderedList, int offset) {
return AOrderedListSerializerDeserializer.getNumberOfItems(serOrderedList, offset);
}
+
+ @Override
+ protected int getListLength(byte[] serOrderedList, int offset) {
+ return AOrderedListSerializerDeserializer.getOrderedListLength(serOrderedList, offset + 1);
+ }
}
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/AsterixUnorderedListIterator.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/AsterixUnorderedListIterator.java
index de7742b..90d9ae3 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/AsterixUnorderedListIterator.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/AsterixUnorderedListIterator.java
@@ -14,4 +14,9 @@
protected int getNumberOfItems(byte[] serOrderedList, int offset) {
return AUnorderedListSerializerDeserializer.getNumberOfItems(serOrderedList, offset);
}
+
+ @Override
+ protected int getListLength(byte[] serOrderedList, int offset) {
+ return AUnorderedListSerializerDeserializer.getUnorderedListLength(serOrderedList, offset + 1);
+ }
}
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/SimilarityJaccardCheckEvaluator.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/SimilarityJaccardCheckEvaluator.java
index ab73df2..11c4d6b 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/SimilarityJaccardCheckEvaluator.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/SimilarityJaccardCheckEvaluator.java
@@ -62,7 +62,7 @@
probeListCount++;
byte[] buf = probeIter.getData();
int off = probeIter.getPos();
- int len = getItemLen(buf, off);
+ int len = probeIter.getItemLen();
keyEntry.set(buf, off, len);
BinaryEntry entry = hashMap.get(keyEntry);
if (entry != null) {
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/SimilarityJaccardEvaluator.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/SimilarityJaccardEvaluator.java
index 9f5c9c8..c610bde 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/SimilarityJaccardEvaluator.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/SimilarityJaccardEvaluator.java
@@ -4,8 +4,6 @@
import java.io.IOException;
import java.util.Arrays;
-import edu.uci.ics.asterix.formats.nontagged.AqlBinaryComparatorFactoryProvider;
-import edu.uci.ics.asterix.formats.nontagged.AqlBinaryHashFunctionFactoryProvider;
import edu.uci.ics.asterix.formats.nontagged.AqlSerializerDeserializerProvider;
import edu.uci.ics.asterix.om.base.AFloat;
import edu.uci.ics.asterix.om.base.AMutableFloat;
@@ -22,9 +20,10 @@
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
import edu.uci.ics.hyracks.data.std.api.IDataOutputProvider;
import edu.uci.ics.hyracks.data.std.primitive.IntegerPointable;
-import edu.uci.ics.hyracks.data.std.primitive.UTF8StringPointable;
import edu.uci.ics.hyracks.data.std.util.ArrayBackedValueStorage;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import edu.uci.ics.asterix.dataflow.data.nontagged.comparators.ListItemBinaryComparatorFactory;
+import edu.uci.ics.asterix.dataflow.data.nontagged.hash.ListItemBinaryHashFunctionFactory;
public class SimilarityJaccardEvaluator implements ICopyEvaluator {
@@ -58,7 +57,8 @@
protected int firstStart = -1;
protected int secondStart = -1;
protected float jaccSim = 0.0f;
- protected ATypeTag itemTypeTag;
+ protected ATypeTag firstItemTypeTag;
+ protected ATypeTag secondItemTypeTag;
protected BinaryHashMap hashMap;
protected BinaryEntry keyEntry = new BinaryEntry();
@@ -105,6 +105,9 @@
firstTypeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argOut.getByteArray()[firstStart]);
secondTypeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argOut.getByteArray()[secondStart]);
+
+ firstItemTypeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argOut.getByteArray()[firstStart + 1]);
+ secondItemTypeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argOut.getByteArray()[secondStart + 1]);
}
protected boolean prepareLists(byte[] bytes, int firstStart, int secondStart, ATypeTag argType)
@@ -116,17 +119,12 @@
if (firstListIter.size() == 0 || secondListIter.size() == 0) {
return false;
}
- if (firstTypeTag == ATypeTag.ANY || secondTypeTag == ATypeTag.ANY) {
- throw new AlgebricksException("\n Jaccard can only be called on homogenous lists");
- }
// TODO: Check item types are compatible.
- itemTypeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(bytes[firstStart + 1]);
return true;
}
protected float computeResult(byte[] bytes, int firstStart, int secondStart, ATypeTag argType)
throws AlgebricksException {
- setHashMap(bytes, firstStart, secondStart);
// We will subtract the intersection size later to get the real union size.
int firstListSize = firstListIter.size();
int secondListSize = secondListIter.size();
@@ -136,7 +134,10 @@
AbstractAsterixListIterator probeList = (buildList == firstListIter) ? secondListIter : firstListIter;
int buildListSize = (buildList == firstListIter) ? firstListSize : secondListSize;
int probeListSize = (probeList == firstListIter) ? firstListSize : secondListSize;
+ ATypeTag buildItemTypeTag = (buildList == firstListIter) ? firstItemTypeTag : secondItemTypeTag;
+ ATypeTag probeItemTypeTag = (probeList == firstListIter) ? firstItemTypeTag : secondItemTypeTag;
+ setHashMap(bytes, buildItemTypeTag, probeItemTypeTag);
buildHashMap(buildList);
int intersectionSize = probeHashMap(probeList, buildListSize, probeListSize);
// Special indicator for the "check" version of jaccard.
@@ -154,7 +155,7 @@
while (buildIter.hasNext()) {
byte[] buf = buildIter.getData();
int off = buildIter.getPos();
- int len = getItemLen(buf, off);
+ int len = buildIter.getItemLen();
keyEntry.set(buf, off, len);
BinaryEntry entry = hashMap.put(keyEntry, valEntry);
if (entry != null) {
@@ -172,7 +173,7 @@
while (probeIter.hasNext()) {
byte[] buf = probeIter.getData();
int off = probeIter.getPos();
- int len = getItemLen(buf, off);
+ int len = probeIter.getItemLen();
keyEntry.set(buf, off, len);
BinaryEntry entry = hashMap.get(keyEntry);
if (entry != null) {
@@ -195,69 +196,19 @@
return intersectionSize;
}
- protected void setHashMap(byte[] bytes, int firstStart, int secondStart) {
+ protected void setHashMap(byte[] bytes, ATypeTag buildItemTypeTag, ATypeTag probeItemTypeTag) {
if (hashMap != null) {
hashMap.clear();
return;
}
- IBinaryHashFunction hashFunc = null;
- IBinaryComparator cmp = null;
- switch (itemTypeTag) {
- case INT32: {
- hashFunc = AqlBinaryHashFunctionFactoryProvider.INTEGER_POINTABLE_INSTANCE.createBinaryHashFunction();
- cmp = AqlBinaryComparatorFactoryProvider.INTEGER_POINTABLE_INSTANCE.createBinaryComparator();
- break;
- }
- case FLOAT: {
- hashFunc = AqlBinaryHashFunctionFactoryProvider.FLOAT_POINTABLE_INSTANCE.createBinaryHashFunction();
- cmp = AqlBinaryComparatorFactoryProvider.FLOAT_POINTABLE_INSTANCE.createBinaryComparator();
- break;
- }
- case DOUBLE: {
- hashFunc = AqlBinaryHashFunctionFactoryProvider.DOUBLE_POINTABLE_INSTANCE.createBinaryHashFunction();
- cmp = AqlBinaryComparatorFactoryProvider.DOUBLE_POINTABLE_INSTANCE.createBinaryComparator();
- break;
- }
- case STRING: {
- if (ignoreCase) {
- // Ignore case in comparisons and hashing.
- hashFunc = AqlBinaryHashFunctionFactoryProvider.UTF8STRING_LOWERCASE_POINTABLE_INSTANCE
- .createBinaryHashFunction();
- cmp = AqlBinaryComparatorFactoryProvider.UTF8STRING_LOWERCASE_POINTABLE_INSTANCE
- .createBinaryComparator();
- } else {
- hashFunc = AqlBinaryHashFunctionFactoryProvider.UTF8STRING_POINTABLE_INSTANCE
- .createBinaryHashFunction();
- cmp = AqlBinaryComparatorFactoryProvider.UTF8STRING_POINTABLE_INSTANCE.createBinaryComparator();
- }
- break;
- }
- default: {
- break;
- }
- }
- hashMap = new BinaryHashMap(TABLE_SIZE, TABLE_FRAME_SIZE, hashFunc, cmp);
- }
-
- protected int getItemLen(byte[] bytes, int itemOff) {
- switch (itemTypeTag) {
- case INT32: {
- return 4;
- }
- case FLOAT: {
- return 4;
- }
- case DOUBLE: {
- return 8;
- }
- case STRING: {
- // 2 bytes for the UTF8 len, plus the string data.
- return 2 + UTF8StringPointable.getUTFLength(bytes, itemOff);
- }
- default: {
- return -1;
- }
- }
+
+ IBinaryHashFunction putHashFunc = ListItemBinaryHashFunctionFactory.INSTANCE
+ .createBinaryHashFunction(buildItemTypeTag, ignoreCase);
+ IBinaryHashFunction getHashFunc = ListItemBinaryHashFunctionFactory.INSTANCE
+ .createBinaryHashFunction(probeItemTypeTag, ignoreCase);
+ IBinaryComparator cmp = ListItemBinaryComparatorFactory.INSTANCE
+ .createBinaryComparator(buildItemTypeTag, probeItemTypeTag, ignoreCase);
+ hashMap = new BinaryHashMap(TABLE_SIZE, TABLE_FRAME_SIZE, putHashFunc, getHashFunc, cmp);
}
protected boolean checkArgTypes(ATypeTag typeTag1, ATypeTag typeTag2) throws AlgebricksException {
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/BinaryHashMap.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/BinaryHashMap.java
index 240f8c7..f69a54e 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/BinaryHashMap.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/BinaryHashMap.java
@@ -33,7 +33,8 @@
private static final int PTR_SIZE = 8;
private static final int SLOT_SIZE = 2;
private static final int ENTRY_HEADER_SIZE = PTR_SIZE + 2 * SLOT_SIZE;
- private final IBinaryHashFunction hashFunc;
+ private final IBinaryHashFunction putHashFunc;
+ private final IBinaryHashFunction getHashFunc;
private final IBinaryComparator cmp;
private final BinaryEntry returnValue = new BinaryEntry();
@@ -65,10 +66,11 @@
}
}
- public BinaryHashMap(int tableSize, int frameSize, IBinaryHashFunction hashFunc, IBinaryComparator cmp) {
+ public BinaryHashMap(int tableSize, int frameSize, IBinaryHashFunction putHashFunc, IBinaryHashFunction getHashFunc, IBinaryComparator cmp) {
listHeads = new long[tableSize];
this.frameSize = frameSize;
- this.hashFunc = hashFunc;
+ this.putHashFunc = putHashFunc;
+ this.getHashFunc = getHashFunc;
this.cmp = cmp;
frames.add(ByteBuffer.allocate(frameSize));
clear();
@@ -98,7 +100,12 @@
}
private BinaryEntry getPutInternal(BinaryEntry key, BinaryEntry value, boolean put) {
- int bucket = Math.abs(hashFunc.hash(key.buf, key.off, key.len) % listHeads.length);
+ int bucket;
+ if (put) {
+ bucket = Math.abs(putHashFunc.hash(key.buf, key.off, key.len) % listHeads.length);
+ } else {
+ bucket = Math.abs(getHashFunc.hash(key.buf, key.off, key.len) % listHeads.length);
+ }
long headPtr = listHeads[bucket];
if (headPtr == NULL_PTR) {
// Key definitely doesn't exist yet.