Make edit-distance-contains() work with lists, add null-argument handling, and handle the asymmetric arguments of contains() and edit-distance-contains()
diff --git a/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/am/AccessMethodUtils.java b/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/am/AccessMethodUtils.java
index fc0c562..f2211e6 100644
--- a/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/am/AccessMethodUtils.java
+++ b/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/am/AccessMethodUtils.java
@@ -110,6 +110,10 @@
// One of the args must be a constant, and the other arg must be a variable.
if (arg1.getExpressionTag() == LogicalExpressionTag.CONSTANT
&& arg2.getExpressionTag() == LogicalExpressionTag.VARIABLE) {
+ // The arguments of contains() are asymmetric, so the index can only be used when it is on the first argument.
+ if (funcExpr.getFunctionIdentifier() == AsterixBuiltinFunctions.CONTAINS) {
+ return false;
+ }
ConstantExpression constExpr = (ConstantExpression) arg1;
constFilterVal = constExpr.getValue();
VariableReferenceExpression varExpr = (VariableReferenceExpression) arg2;
diff --git a/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/am/InvertedIndexAccessMethod.java b/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/am/InvertedIndexAccessMethod.java
index 6e0c1f7..fce97f4 100644
--- a/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/am/InvertedIndexAccessMethod.java
+++ b/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/am/InvertedIndexAccessMethod.java
@@ -71,6 +71,7 @@
import edu.uci.ics.hyracks.algebricks.core.algebra.operators.logical.visitors.VariableUtilities;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearchModifierFactory;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.ConjunctiveEditDistanceSearchModifierFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.ConjunctiveListEditDistanceSearchModifierFactory;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.ConjunctiveSearchModifierFactory;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.EditDistanceSearchModifierFactory;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.JaccardSearchModifierFactory;
@@ -258,6 +259,10 @@
ILogicalExpression nonConstArg = null;
if (arg1.getExpressionTag() == LogicalExpressionTag.CONSTANT
&& arg2.getExpressionTag() != LogicalExpressionTag.CONSTANT) {
+ // The arguments of edit-distance-contains() are asymmetric, so the index can only be used when it is on the first argument.
+ if (funcExpr.getFunctionIdentifier() == AsterixBuiltinFunctions.EDIT_DISTANCE_CONTAINS) {
+ return false;
+ }
constArg = arg1;
nonConstArg = arg2;
} else if (arg2.getExpressionTag() == LogicalExpressionTag.CONSTANT
@@ -404,6 +409,7 @@
public boolean applyJoinPlanTransformation(Mutable<ILogicalOperator> joinRef,
OptimizableOperatorSubTree leftSubTree, OptimizableOperatorSubTree rightSubTree, Index chosenIndex,
AccessMethodAnalysisContext analysisCtx, IOptimizationContext context) throws AlgebricksException {
+ IOptimizableFuncExpr optFuncExpr = AccessMethodUtils.chooseFirstOptFuncExpr(chosenIndex, analysisCtx);
// Figure out if the index is applicable on the left or right side (if both, we arbitrarily prefer the left side).
Dataset dataset = analysisCtx.indexDatasetMap.get(chosenIndex);
// Determine probe and index subtrees based on chosen index.
@@ -413,10 +419,13 @@
indexSubTree = leftSubTree;
probeSubTree = rightSubTree;
} else if (dataset.getDatasetName().equals(rightSubTree.dataset.getDatasetName())) {
+ // The arguments of edit-distance-contains() are asymmetric, so the index can only be used when it is on the left side.
+ if (optFuncExpr.getFuncExpr().getFunctionIdentifier() == AsterixBuiltinFunctions.EDIT_DISTANCE_CONTAINS) {
+ return false;
+ }
indexSubTree = rightSubTree;
probeSubTree = leftSubTree;
}
- IOptimizableFuncExpr optFuncExpr = AccessMethodUtils.chooseFirstOptFuncExpr(chosenIndex, analysisCtx);
InnerJoinOperator join = (InnerJoinOperator) joinRef.getValue();
// Remember the original probe subtree, and its primary-key variables,
@@ -898,18 +907,27 @@
float jaccThresh = ((AFloat) simThresh).getFloatValue();
return new JaccardSearchModifierFactory(jaccThresh);
}
- case EDIT_DISTANCE: {
+ case EDIT_DISTANCE:
+ case CONJUNCTIVE_EDIT_DISTANCE: {
int edThresh = ((AInt32) simThresh).getIntegerValue();
switch (index.getIndexType()) {
case SINGLE_PARTITION_NGRAM_INVIX:
case LENGTH_PARTITIONED_NGRAM_INVIX: {
// Edit distance on strings, filtered with overlapping grams.
- return new EditDistanceSearchModifierFactory(index.getGramLength(), edThresh);
+ if (searchModifierType == SearchModifierType.EDIT_DISTANCE) {
+ return new EditDistanceSearchModifierFactory(index.getGramLength(), edThresh);
+ } else {
+ return new ConjunctiveEditDistanceSearchModifierFactory(index.getGramLength(), edThresh);
+ }
}
case SINGLE_PARTITION_WORD_INVIX:
case LENGTH_PARTITIONED_WORD_INVIX: {
// Edit distance on two lists. The list-elements are non-overlapping.
- return new ListEditDistanceSearchModifierFactory(edThresh);
+ if (searchModifierType == SearchModifierType.EDIT_DISTANCE) {
+ return new ListEditDistanceSearchModifierFactory(edThresh);
+ } else {
+ return new ConjunctiveListEditDistanceSearchModifierFactory(edThresh);
+ }
}
default: {
throw new AlgebricksException("Incompatible search modifier '" + searchModifierType
@@ -917,10 +935,6 @@
}
}
}
- case CONJUNCTIVE_EDIT_DISTANCE: {
- int edThresh = ((AInt32) simThresh).getIntegerValue();
- return new ConjunctiveEditDistanceSearchModifierFactory(index.getGramLength(), edThresh);
- }
default: {
throw new AlgebricksException("Unknown search modifier type '" + searchModifierType + "'.");
}
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
index 9b9dfc6..ab60b2c 100644
--- a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
@@ -104,6 +104,55 @@
}
}
+    /**
+     * Checks whether {@code exprList} contains a contiguous run of items whose edit distance to
+     * {@code patternList} is at most {@code simThresh} (approximate sub-list matching).
+     * <p>
+     * Row {@code i} of the dynamic-programming matrix corresponds to the first {@code i} items of
+     * {@code exprList}, column {@code j} to the first {@code j} items of {@code patternList}.
+     * Column 0 is always 0 because a match may start at any position of {@code exprList}; the
+     * answer is the minimum of the last column over all rows. Only matrix rows 0 and 1 are used,
+     * alternating between "previous" and "current".
+     *
+     * @param exprList
+     *            iterator over the list that is searched; it is advanced but never reset here, so
+     *            the caller must have positioned it on the first item
+     * @param patternList
+     *            iterator over the pattern list; it is reset before every row
+     * @param simThresh
+     *            largest edit distance that still counts as a match
+     * @return the smallest edit distance found if it does not exceed {@code simThresh}, otherwise -1
+     */
+    public int getSimilarityContains(IListIterator exprList, IListIterator patternList, int simThresh) {
+        int exprLen = exprList.size();
+        int patternLen = patternList.size();
+
+        // reuse existing matrix if possible
+        if (patternLen >= cols) {
+            cols = patternLen + 1;
+            matrix = new int[rows][cols];
+        }
+
+        // init matrix: matching the first j pattern items against nothing costs j
+        for (int j = 0; j <= patternLen; j++) {
+            matrix[0][j] = j;
+        }
+
+        int currRow = 1;
+        int prevRow = 0;
+        // Row 0 already yields patternLen (pattern vs. an empty sub-list). Seeding the minimum with it
+        // instead of Integer.MAX_VALUE keeps the result correct when either list is empty. For
+        // non-empty lists every later row's last column is <= patternLen, so the minimum is unchanged.
+        int minEd = patternLen;
+        // expand dynamic programming matrix row by row
+        for (int i = 1; i <= exprLen; i++) {
+            matrix[currRow][0] = 0;
+
+            patternList.reset();
+            for (int j = 1; j <= patternLen; j++) {
+                matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
+                        matrix[prevRow][j - 1] + (exprList.compare(patternList) == 0 ? 0 : 1));
+                patternList.next();
+            }
+
+            // the last column holds the best match that ends at the i-th item of exprList
+            minEd = Math.min(minEd, matrix[currRow][patternLen]);
+
+            exprList.next();
+
+            // swap the roles of the two rows
+            int tmp = currRow;
+            currRow = prevRow;
+            prevRow = tmp;
+        }
+
+        return (minEd > simThresh) ? -1 : minEd;
+    }
+
// faster implementation for common case of string edit distance
public int UTF8StringEditDistance(byte[] bytes, int fsStart, int ssStart) {
@@ -215,49 +264,49 @@
}
// checks whether the first string contains a similar string to the second string
- public int UTF8StringEditDistanceContains(byte[] bytes, int fsStart, int ssStart, int edThresh) {
+ public int UTF8StringEditDistanceContains(byte[] bytes, int stringStart, int patternStart, int edThresh) {
- int fsLen = StringUtils.getStrLen(bytes, fsStart);
- int ssLen = StringUtils.getStrLen(bytes, ssStart);
+ int stringLen = StringUtils.getStrLen(bytes, stringStart);
+ int patternLen = StringUtils.getStrLen(bytes, patternStart);
// reuse existing matrix if possible
- if (ssLen >= cols) {
- cols = ssLen + 1;
+ if (patternLen >= cols) {
+ cols = patternLen + 1;
matrix = new int[rows][cols];
}
- int fsDataStart = fsStart + utf8SizeIndicatorSize;
- int ssDataStart = ssStart + utf8SizeIndicatorSize;
+ int stringDataStart = stringStart + utf8SizeIndicatorSize;
+ int patternDataStart = patternStart + utf8SizeIndicatorSize;
// init matrix
- for (int i = 0; i <= ssLen; i++) {
- matrix[0][i] = 0;
+ for (int i = 0; i <= patternLen; i++) {
+ matrix[0][i] = i;
}
int currRow = 1;
int prevRow = 0;
int minEd = Integer.MAX_VALUE;
// expand dynamic programming matrix row by row
- int fsPos = fsDataStart;
- for (int i = 1; i <= fsLen; i++) {
- matrix[currRow][0] = i;
- char fsChar = StringUtils.toLowerCase(StringUtils.charAt(bytes, fsPos));
+ int stringPos = stringDataStart;
+ for (int i = 1; i <= stringLen; i++) {
+ matrix[currRow][0] = 0;
+ char stringChar = StringUtils.toLowerCase(StringUtils.charAt(bytes, stringPos));
- int ssPos = ssDataStart;
- for (int j = 1; j <= ssLen; j++) {
- char ssChar = StringUtils.toLowerCase(StringUtils.charAt(bytes, ssPos));
+ int patternPos = patternDataStart;
+ for (int j = 1; j <= patternLen; j++) {
+ char patternChar = StringUtils.toLowerCase(StringUtils.charAt(bytes, patternPos));
matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
- matrix[prevRow][j - 1] + (fsChar == ssChar ? 0 : 1));
+ matrix[prevRow][j - 1] + (stringChar == patternChar ? 0 : 1));
- ssPos += StringUtils.charSize(bytes, ssPos);
+ patternPos += StringUtils.charSize(bytes, patternPos);
- if (i == fsLen && matrix[currRow][j] < minEd) {
- minEd = matrix[currRow][j];
+ if (j == patternLen && matrix[currRow][patternLen] < minEd) {
+ minEd = matrix[currRow][patternLen];
}
}
- fsPos += StringUtils.charSize(bytes, fsPos);
+ stringPos += StringUtils.charSize(bytes, stringPos);
int tmp = currRow;
currRow = prevRow;
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
new file mode 100644
index 0000000..1db0569
--- /dev/null
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
@@ -0,0 +1,107 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.asterix.runtime.evaluators.common;
+
+import java.io.IOException;
+
+import edu.uci.ics.asterix.builders.OrderedListBuilder;
+import edu.uci.ics.asterix.formats.nontagged.AqlSerializerDeserializerProvider;
+import edu.uci.ics.asterix.om.base.ABoolean;
+import edu.uci.ics.asterix.om.functions.AsterixBuiltinFunctions;
+import edu.uci.ics.asterix.om.types.AOrderedListType;
+import edu.uci.ics.asterix.om.types.ATypeTag;
+import edu.uci.ics.asterix.om.types.BuiltinType;
+import edu.uci.ics.asterix.om.types.EnumDeserializer;
+import edu.uci.ics.hyracks.algebricks.common.exceptions.AlgebricksException;
+import edu.uci.ics.hyracks.algebricks.runtime.base.ICopyEvaluator;
+import edu.uci.ics.hyracks.algebricks.runtime.base.ICopyEvaluatorFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.data.std.api.IDataOutputProvider;
+import edu.uci.ics.hyracks.data.std.util.ArrayBackedValueStorage;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+
+/**
+ * Evaluator for {@code edit-distance-check}. On top of the two arguments handled by
+ * {@link EditDistanceEvaluator}, it evaluates a third argument, the INT32 edit-distance threshold,
+ * and writes its result as a two-item ordered list {@code [matches, editDistance]}.
+ */
+public class EditDistanceCheckEvaluator extends EditDistanceEvaluator {
+
+    protected final ICopyEvaluator edThreshEval;
+    protected int edThresh = -1;
+    protected final OrderedListBuilder listBuilder;
+    protected ArrayBackedValueStorage listItemVal;
+    @SuppressWarnings("unchecked")
+    protected final ISerializerDeserializer<ABoolean> booleanSerde = AqlSerializerDeserializerProvider.INSTANCE
+            .getSerializerDeserializer(BuiltinType.ABOOLEAN);
+    protected final static byte SER_INT32_TYPE_TAG = ATypeTag.INT32.serialize();
+    // Type of the result list; created once instead of on every writeResult() call.
+    private final AOrderedListType resultListType = new AOrderedListType(BuiltinType.ANY, "list");
+
+    /**
+     * @param args
+     *            evaluator factories; {@code args[2]} produces the threshold, the first two are
+     *            consumed by the superclass
+     * @param output
+     *            provider of the output the result is written to
+     */
+    public EditDistanceCheckEvaluator(ICopyEvaluatorFactory[] args, IDataOutputProvider output)
+            throws AlgebricksException {
+        super(args, output);
+        edThreshEval = args[2].createEvaluator(argOut);
+        listBuilder = new OrderedListBuilder();
+        listItemVal = new ArrayBackedValueStorage();
+    }
+
+    /**
+     * Runs the superclass argument evaluators, then evaluates the threshold argument into
+     * {@code argOut} and stores it in {@link #edThresh}.
+     *
+     * @throws AlgebricksException
+     *             if the threshold is not tagged as INT32
+     */
+    @Override
+    protected void runArgEvals(IFrameTupleReference tuple) throws AlgebricksException {
+        super.runArgEvals(tuple);
+        int edThreshStart = argOut.getLength();
+        edThreshEval.evaluate(tuple);
+        // fetch the backing array only after the evaluation that appended the threshold
+        byte[] argBytes = argOut.getByteArray();
+        byte thresholdTag = argBytes[edThreshStart];
+        if (thresholdTag != SER_INT32_TYPE_TAG) {
+            throw new AlgebricksException("Invalid threshold type, expected INT32 but got "
+                    + EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(thresholdTag) + ".");
+        }
+        edThresh = IntegerSerializerDeserializer.getInt(argBytes, edThreshStart + typeIndicatorSize);
+    }
+
+    /**
+     * Computes the thresholded edit distance of the two arguments located in {@code bytes}.
+     *
+     * @return the value produced by the edit-distance metric; {@link #writeResult(int)} treats a
+     *         negative value as "no match"
+     * @throws AlgebricksException
+     *             if {@code argType} is neither STRING nor ORDEREDLIST
+     */
+    @Override
+    protected int computeResult(byte[] bytes, int firstStart, int secondStart, ATypeTag argType)
+            throws AlgebricksException {
+        switch (argType) {
+
+            case STRING: {
+                return ed.UTF8StringEditDistance(bytes, firstStart + typeIndicatorSize, secondStart
+                        + typeIndicatorSize, edThresh);
+            }
+
+            case ORDEREDLIST: {
+                firstOrdListIter.reset(bytes, firstStart);
+                secondOrdListIter.reset(bytes, secondStart);
+                return (int) ed.getSimilarity(firstOrdListIter, secondOrdListIter, edThresh);
+            }
+
+            default: {
+                throw new AlgebricksException(AsterixBuiltinFunctions.EDIT_DISTANCE_CHECK.getName()
+                        + ": expects input type as STRING or ORDEREDLIST but got " + argType + ".");
+            }
+
+        }
+    }
+
+    /**
+     * Writes the ordered list {@code [matches, distance]}: {@code matches} is true when {@code ed}
+     * is non-negative, and {@code distance} is {@code ed} on a match or
+     * {@link Integer#MAX_VALUE} otherwise.
+     * <p>
+     * Note: the parameter shadows the inherited {@code ed} metric object inside this method.
+     */
+    @Override
+    protected void writeResult(int ed) throws IOException {
+        boolean matches = ed >= 0;
+        listBuilder.reset(resultListType);
+
+        listItemVal.reset();
+        booleanSerde.serialize(matches ? ABoolean.TRUE : ABoolean.FALSE, listItemVal.getDataOutput());
+        listBuilder.addItem(listItemVal);
+
+        listItemVal.reset();
+        aInt32.setValue(matches ? ed : Integer.MAX_VALUE);
+        int32Serde.serialize(aInt32, listItemVal.getDataOutput());
+        listBuilder.addItem(listItemVal);
+
+        listBuilder.write(out, true);
+    }
+}
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/EditDistanceContainsEvaluator.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/EditDistanceContainsEvaluator.java
new file mode 100644
index 0000000..1dac595
--- /dev/null
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/EditDistanceContainsEvaluator.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.asterix.runtime.evaluators.common;
+
+import edu.uci.ics.asterix.om.functions.AsterixBuiltinFunctions;
+import edu.uci.ics.asterix.om.types.ATypeTag;
+import edu.uci.ics.hyracks.algebricks.common.exceptions.AlgebricksException;
+import edu.uci.ics.hyracks.algebricks.runtime.base.ICopyEvaluatorFactory;
+import edu.uci.ics.hyracks.data.std.api.IDataOutputProvider;
+
+/**
+ * Evaluator for {@code edit-distance-contains}. It reuses the argument evaluation and result
+ * writing of {@link EditDistanceCheckEvaluator} and only replaces the distance computation with
+ * the "contains" variants of the edit-distance metric, for both strings and ordered lists.
+ */
+public class EditDistanceContainsEvaluator extends EditDistanceCheckEvaluator {
+
+    public EditDistanceContainsEvaluator(ICopyEvaluatorFactory[] args, IDataOutputProvider output)
+            throws AlgebricksException {
+        super(args, output);
+    }
+
+    /**
+     * Computes the thresholded "contains" edit distance, with the first argument as the searched
+     * value and the second as the pattern.
+     *
+     * @return the value produced by the edit-distance metric for the given threshold
+     * @throws AlgebricksException
+     *             if {@code argType} is neither STRING nor ORDEREDLIST
+     */
+    @Override
+    protected int computeResult(byte[] bytes, int firstStart, int secondStart, ATypeTag argType)
+            throws AlgebricksException {
+        switch (argType) {
+
+            case STRING: {
+                // Read from the supplied buffer, like the ORDEREDLIST branch and the parent class do,
+                // because firstStart/secondStart are offsets into 'bytes'.
+                return ed.UTF8StringEditDistanceContains(bytes, firstStart + typeIndicatorSize, secondStart
+                        + typeIndicatorSize, edThresh);
+            }
+
+            case ORDEREDLIST: {
+                firstOrdListIter.reset(bytes, firstStart);
+                secondOrdListIter.reset(bytes, secondStart);
+                return ed.getSimilarityContains(firstOrdListIter, secondOrdListIter, edThresh);
+            }
+
+            default: {
+                throw new AlgebricksException(AsterixBuiltinFunctions.EDIT_DISTANCE_CONTAINS.getName()
+                        + ": expects input type as STRING or ORDEREDLIST but got " + argType + ".");
+            }
+
+        }
+    }
+}
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/EditDistanceEvaluator.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
index 0a5f251..7ddd957 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
@@ -20,6 +20,7 @@
import edu.uci.ics.asterix.formats.nontagged.AqlSerializerDeserializerProvider;
import edu.uci.ics.asterix.om.base.AInt32;
import edu.uci.ics.asterix.om.base.AMutableInt32;
+import edu.uci.ics.asterix.om.base.ANull;
import edu.uci.ics.asterix.om.types.ATypeTag;
import edu.uci.ics.asterix.om.types.BuiltinType;
import edu.uci.ics.asterix.om.types.EnumDeserializer;
@@ -49,6 +50,9 @@
@SuppressWarnings("unchecked")
protected final ISerializerDeserializer<AInt32> int32Serde = AqlSerializerDeserializerProvider.INSTANCE
.getSerializerDeserializer(BuiltinType.AINT32);
+ @SuppressWarnings("unchecked")
+ private final ISerializerDeserializer<ANull> nullSerde = AqlSerializerDeserializerProvider.INSTANCE
+ .getSerializerDeserializer(BuiltinType.ANULL);
protected ATypeTag itemTypeTag;
protected int firstStart = -1;
@@ -123,10 +127,10 @@
}
protected boolean checkArgTypes(ATypeTag typeTag1, ATypeTag typeTag2) throws AlgebricksException {
- // edit distance between null and anything else is 0
+ // edit distance between null and anything else is undefined
if (typeTag1 == ATypeTag.NULL || typeTag2 == ATypeTag.NULL) {
try {
- writeResult(0);
+ nullSerde.serialize(ANull.NULL, out);
} catch (IOException e) {
throw new AlgebricksException(e);
}
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/AbstractStringContainsEval.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/AbstractStringContainsEval.java
index ae66d22..e587f32 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/AbstractStringContainsEval.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/AbstractStringContainsEval.java
@@ -18,6 +18,7 @@
import edu.uci.ics.asterix.formats.nontagged.AqlSerializerDeserializerProvider;
import edu.uci.ics.asterix.om.base.ABoolean;
+import edu.uci.ics.asterix.om.base.ANull;
import edu.uci.ics.asterix.om.types.ATypeTag;
import edu.uci.ics.asterix.om.types.BuiltinType;
import edu.uci.ics.asterix.om.types.EnumDeserializer;
@@ -45,6 +46,9 @@
@SuppressWarnings("rawtypes")
private ISerializerDeserializer boolSerde = AqlSerializerDeserializerProvider.INSTANCE
.getSerializerDeserializer(BuiltinType.ABOOLEAN);
+ @SuppressWarnings("unchecked")
+ private final ISerializerDeserializer<ANull> nullSerde = AqlSerializerDeserializerProvider.INSTANCE
+ .getSerializerDeserializer(BuiltinType.ANULL);
private final FunctionIdentifier funcID;
@@ -65,17 +69,9 @@
evalString.evaluate(tuple);
try {
- if (array1.getByteArray()[0] == SER_NULL_TYPE_TAG) {
- if (array0.getByteArray()[0] == SER_NULL_TYPE_TAG || array0.getByteArray()[0] == SER_STRING_TYPE_TAG) {
- boolSerde.serialize(ABoolean.TRUE, dout);
- return;
- }
- } else if (array0.getByteArray()[0] == SER_NULL_TYPE_TAG) {
- if (array1.getByteArray()[0] == SER_STRING_TYPE_TAG) {
- boolSerde.serialize(ABoolean.FALSE, dout);
- return;
- }
-
+ if (array0.getByteArray()[0] == SER_NULL_TYPE_TAG || array1.getByteArray()[0] == SER_NULL_TYPE_TAG) {
+ nullSerde.serialize(ANull.NULL, dout);
+ return;
}
if (array0.getByteArray()[0] != SER_STRING_TYPE_TAG || array1.getByteArray()[0] != SER_STRING_TYPE_TAG) {
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/EditDistanceCheckDescriptor.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/EditDistanceCheckDescriptor.java
index c258f67..09ef550 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/EditDistanceCheckDescriptor.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/EditDistanceCheckDescriptor.java
@@ -14,28 +14,16 @@
*/
package edu.uci.ics.asterix.runtime.evaluators.functions;
-import java.io.IOException;
-
-import edu.uci.ics.asterix.builders.OrderedListBuilder;
-import edu.uci.ics.asterix.formats.nontagged.AqlSerializerDeserializerProvider;
-import edu.uci.ics.asterix.om.base.ABoolean;
import edu.uci.ics.asterix.om.functions.AsterixBuiltinFunctions;
import edu.uci.ics.asterix.om.functions.IFunctionDescriptor;
import edu.uci.ics.asterix.om.functions.IFunctionDescriptorFactory;
-import edu.uci.ics.asterix.om.types.AOrderedListType;
-import edu.uci.ics.asterix.om.types.ATypeTag;
-import edu.uci.ics.asterix.om.types.BuiltinType;
import edu.uci.ics.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
-import edu.uci.ics.asterix.runtime.evaluators.common.EditDistanceEvaluator;
+import edu.uci.ics.asterix.runtime.evaluators.common.EditDistanceCheckEvaluator;
import edu.uci.ics.hyracks.algebricks.common.exceptions.AlgebricksException;
import edu.uci.ics.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
import edu.uci.ics.hyracks.algebricks.runtime.base.ICopyEvaluator;
import edu.uci.ics.hyracks.algebricks.runtime.base.ICopyEvaluatorFactory;
-import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
import edu.uci.ics.hyracks.data.std.api.IDataOutputProvider;
-import edu.uci.ics.hyracks.data.std.util.ArrayBackedValueStorage;
-import edu.uci.ics.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
public class EditDistanceCheckDescriptor extends AbstractScalarFunctionDynamicDescriptor {
@@ -62,73 +50,4 @@
public FunctionIdentifier getIdentifier() {
return AsterixBuiltinFunctions.EDIT_DISTANCE_CHECK;
}
-
- private static class EditDistanceCheckEvaluator extends EditDistanceEvaluator {
-
- private final ICopyEvaluator edThreshEval;
- private int edThresh = -1;
- private final OrderedListBuilder listBuilder;
- private ArrayBackedValueStorage inputVal;
- @SuppressWarnings("unchecked")
- private final ISerializerDeserializer<ABoolean> booleanSerde = AqlSerializerDeserializerProvider.INSTANCE
- .getSerializerDeserializer(BuiltinType.ABOOLEAN);
-
- public EditDistanceCheckEvaluator(ICopyEvaluatorFactory[] args, IDataOutputProvider output)
- throws AlgebricksException {
- super(args, output);
- edThreshEval = args[2].createEvaluator(argOut);
- listBuilder = new OrderedListBuilder();
- inputVal = new ArrayBackedValueStorage();
- }
-
- @Override
- protected void runArgEvals(IFrameTupleReference tuple) throws AlgebricksException {
- super.runArgEvals(tuple);
- int edThreshStart = argOut.getLength();
- edThreshEval.evaluate(tuple);
- edThresh = IntegerSerializerDeserializer.getInt(argOut.getByteArray(), edThreshStart + typeIndicatorSize);
- }
-
- @Override
- protected int computeResult(byte[] bytes, int firstStart, int secondStart, ATypeTag argType)
- throws AlgebricksException {
- switch (argType) {
-
- case STRING: {
- return ed.UTF8StringEditDistance(bytes, firstStart + typeIndicatorSize, secondStart
- + typeIndicatorSize, edThresh);
- }
-
- case ORDEREDLIST: {
- firstOrdListIter.reset(bytes, firstStart);
- secondOrdListIter.reset(bytes, secondStart);
- return (int) ed.getSimilarity(firstOrdListIter, secondOrdListIter, edThresh);
- }
-
- default: {
- throw new AlgebricksException(AsterixBuiltinFunctions.EDIT_DISTANCE_CHECK.getName()
- + ": expects input type as STRING or ORDEREDLIST but got " + argType + ".");
- }
-
- }
- }
-
- @Override
- protected void writeResult(int ed) throws IOException {
-
- listBuilder.reset(new AOrderedListType(BuiltinType.ANY, "list"));
- boolean matches = (ed < 0) ? false : true;
- inputVal.reset();
- booleanSerde.serialize(matches ? ABoolean.TRUE : ABoolean.FALSE, inputVal.getDataOutput());
- listBuilder.addItem(inputVal);
-
- inputVal.reset();
- aInt32.setValue((matches) ? ed : Integer.MAX_VALUE);
- int32Serde.serialize(aInt32, inputVal.getDataOutput());
- listBuilder.addItem(inputVal);
-
- listBuilder.write(out, true);
- }
- }
-
}
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/EditDistanceContainsDescriptor.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/EditDistanceContainsDescriptor.java
index 679a9bc..d34012e 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/EditDistanceContainsDescriptor.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/EditDistanceContainsDescriptor.java
@@ -14,31 +14,16 @@
*/
package edu.uci.ics.asterix.runtime.evaluators.functions;
-import java.io.DataOutput;
-import edu.uci.ics.asterix.builders.OrderedListBuilder;
-import edu.uci.ics.asterix.formats.nontagged.AqlSerializerDeserializerProvider;
-import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityMetricEditDistance;
-import edu.uci.ics.asterix.om.base.ABoolean;
-import edu.uci.ics.asterix.om.base.AInt32;
-import edu.uci.ics.asterix.om.base.AMutableInt32;
import edu.uci.ics.asterix.om.functions.AsterixBuiltinFunctions;
import edu.uci.ics.asterix.om.functions.IFunctionDescriptor;
import edu.uci.ics.asterix.om.functions.IFunctionDescriptorFactory;
-import edu.uci.ics.asterix.om.types.AOrderedListType;
-import edu.uci.ics.asterix.om.types.ATypeTag;
-import edu.uci.ics.asterix.om.types.BuiltinType;
-import edu.uci.ics.asterix.om.types.EnumDeserializer;
import edu.uci.ics.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import edu.uci.ics.asterix.runtime.evaluators.common.EditDistanceContainsEvaluator;
import edu.uci.ics.hyracks.algebricks.common.exceptions.AlgebricksException;
import edu.uci.ics.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
import edu.uci.ics.hyracks.algebricks.runtime.base.ICopyEvaluator;
import edu.uci.ics.hyracks.algebricks.runtime.base.ICopyEvaluatorFactory;
-import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
-import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.data.std.api.IDataOutputProvider;
-import edu.uci.ics.hyracks.data.std.util.ArrayBackedValueStorage;
-import edu.uci.ics.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
public class EditDistanceContainsDescriptor extends AbstractScalarFunctionDynamicDescriptor {
@@ -65,100 +50,4 @@
public FunctionIdentifier getIdentifier() {
return AsterixBuiltinFunctions.EDIT_DISTANCE_CONTAINS;
}
-
- private static class EditDistanceContainsEvaluator implements ICopyEvaluator {
-
- // assuming type indicator in serde format
- private final int typeIndicatorSize = 1;
-
- private DataOutput out;
- private final ArrayBackedValueStorage argOut = new ArrayBackedValueStorage();
- private final ICopyEvaluator evalString;
- private final ICopyEvaluator evalPattern;
- private final ICopyEvaluator evalEdThresh;
- private final SimilarityMetricEditDistance ed = new SimilarityMetricEditDistance();
- private int edThresh = -1;
- private final OrderedListBuilder listBuilder;
- private ArrayBackedValueStorage inputVal;
- protected final AMutableInt32 aInt32 = new AMutableInt32(-1);
- @SuppressWarnings("unchecked")
- private final ISerializerDeserializer<ABoolean> booleanSerde = AqlSerializerDeserializerProvider.INSTANCE
- .getSerializerDeserializer(BuiltinType.ABOOLEAN);
- @SuppressWarnings("unchecked")
- private final ISerializerDeserializer<AInt32> int32Serde = AqlSerializerDeserializerProvider.INSTANCE
- .getSerializerDeserializer(BuiltinType.AINT32);
-
- // allowed input types
- private final static byte SER_NULL_TYPE_TAG = ATypeTag.NULL.serialize();
- private final static byte SER_STRING_TYPE_TAG = ATypeTag.STRING.serialize();
- private final static byte SER_INT32_TYPE_TAG = ATypeTag.INT32.serialize();
-
- public EditDistanceContainsEvaluator(ICopyEvaluatorFactory[] args, IDataOutputProvider output)
- throws AlgebricksException {
- out = output.getDataOutput();
- evalString = args[0].createEvaluator(argOut);
- evalPattern = args[1].createEvaluator(argOut);
- evalEdThresh = args[2].createEvaluator(argOut);
- listBuilder = new OrderedListBuilder();
- inputVal = new ArrayBackedValueStorage();
- }
-
- @Override
- public void evaluate(IFrameTupleReference tuple) throws AlgebricksException {
- argOut.reset();
- int patternStart = argOut.getLength();
- evalString.evaluate(tuple);
- int stringStart = argOut.getLength();
- evalPattern.evaluate(tuple);
- int edThreshStart = argOut.getLength();
- evalEdThresh.evaluate(tuple);
-
- try {
- // edit distance between null and anything else is 0
- if (argOut.getByteArray()[stringStart] == SER_NULL_TYPE_TAG
- || argOut.getByteArray()[patternStart] == SER_NULL_TYPE_TAG) {
- writeResult(0);
- return;
- }
-
- if (argOut.getByteArray()[stringStart] != SER_STRING_TYPE_TAG
- || argOut.getByteArray()[patternStart] != SER_STRING_TYPE_TAG
- || argOut.getByteArray()[edThreshStart] != SER_INT32_TYPE_TAG) {
- throw new AlgebricksException(AsterixBuiltinFunctions.EDIT_DISTANCE_CONTAINS
- + ": expects input type (STRING/NULL, STRING/NULL, INT), but got ("
- + EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argOut.getByteArray()[stringStart])
- + ", "
- + EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argOut.getByteArray()[patternStart])
- + ", "
- + EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argOut.getByteArray()[edThreshStart])
- + ").");
- }
- } catch (HyracksDataException e) {
- throw new AlgebricksException(e);
- }
-
- edThresh = IntegerSerializerDeserializer.getInt(argOut.getByteArray(), edThreshStart + typeIndicatorSize);
- int minEd = ed.UTF8StringEditDistanceContains(argOut.getByteArray(), stringStart + typeIndicatorSize,
- patternStart + typeIndicatorSize, edThresh);
-
- try {
- writeResult(minEd);
- } catch (HyracksDataException e) {
- throw new AlgebricksException(e);
- }
- }
-
- protected void writeResult(int minEd) throws HyracksDataException {
- boolean contains = (minEd < 0) ? false : true;
- listBuilder.reset(new AOrderedListType(BuiltinType.ANY, "list"));
- inputVal.reset();
- booleanSerde.serialize(contains ? ABoolean.TRUE : ABoolean.FALSE, inputVal.getDataOutput());
- listBuilder.addItem(inputVal);
- inputVal.reset();
- aInt32.setValue((contains) ? minEd : Integer.MAX_VALUE);
- int32Serde.serialize(aInt32, inputVal.getDataOutput());
- listBuilder.addItem(inputVal);
- listBuilder.write(out, true);
- }
- }
-}
+}
\ No newline at end of file