edit-distance-contains function implementation
diff --git a/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/am/InvertedIndexAccessMethod.java b/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/am/InvertedIndexAccessMethod.java
index 39b9480..6e0c1f7 100644
--- a/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/am/InvertedIndexAccessMethod.java
+++ b/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/am/InvertedIndexAccessMethod.java
@@ -105,6 +105,7 @@
static {
secondLevelFuncIdents.add(AsterixBuiltinFunctions.SIMILARITY_JACCARD_CHECK);
secondLevelFuncIdents.add(AsterixBuiltinFunctions.EDIT_DISTANCE_CHECK);
+ secondLevelFuncIdents.add(AsterixBuiltinFunctions.EDIT_DISTANCE_CONTAINS);
}
public static InvertedIndexAccessMethod INSTANCE = new InvertedIndexAccessMethod();
@@ -115,16 +116,16 @@
}
@Override
- public boolean analyzeFuncExprArgs(AbstractFunctionCallExpression funcExpr, List<AbstractLogicalOperator> assignsAndUnnests,
- AccessMethodAnalysisContext analysisCtx) {
+ public boolean analyzeFuncExprArgs(AbstractFunctionCallExpression funcExpr,
+ List<AbstractLogicalOperator> assignsAndUnnests, AccessMethodAnalysisContext analysisCtx) {
if (funcExpr.getFunctionIdentifier() == AsterixBuiltinFunctions.CONTAINS) {
return AccessMethodUtils.analyzeFuncExprArgsForOneConstAndVar(funcExpr, analysisCtx);
}
return analyzeGetItemFuncExpr(funcExpr, assignsAndUnnests, analysisCtx);
}
- public boolean analyzeGetItemFuncExpr(AbstractFunctionCallExpression funcExpr, List<AbstractLogicalOperator> assignsAndUnnests,
- AccessMethodAnalysisContext analysisCtx) {
+ public boolean analyzeGetItemFuncExpr(AbstractFunctionCallExpression funcExpr,
+ List<AbstractLogicalOperator> assignsAndUnnests, AccessMethodAnalysisContext analysisCtx) {
if (funcExpr.getFunctionIdentifier() != AsterixBuiltinFunctions.GET_ITEM) {
return false;
}
@@ -169,8 +170,7 @@
matchedFuncExpr = (AbstractFunctionCallExpression) matchedExpr;
break;
}
- }
- else {
+ } else {
UnnestOperator unnest = (UnnestOperator) op;
LogicalVariable var = unnest.getVariable();
if (var == varRefExpr.getVariableReference()) {
@@ -182,7 +182,8 @@
if (unnestFuncExpr.getFunctionIdentifier() != AsterixBuiltinFunctions.SCAN_COLLECTION) {
return false;
}
- matchedFuncExpr = (AbstractFunctionCallExpression) unnestFuncExpr.getArguments().get(0).getValue();
+ matchedFuncExpr = (AbstractFunctionCallExpression) unnestFuncExpr.getArguments().get(0)
+ .getValue();
}
}
// We've already found a match.
@@ -198,8 +199,8 @@
}
boolean selectMatchFound = analyzeSelectSimilarityCheckFuncExprArgs(matchedFuncExpr, assignsAndUnnests,
matchedAssignOrUnnestIndex, analysisCtx);
- boolean joinMatchFound = analyzeJoinSimilarityCheckFuncExprArgs(matchedFuncExpr, assignsAndUnnests, matchedAssignOrUnnestIndex,
- analysisCtx);
+ boolean joinMatchFound = analyzeJoinSimilarityCheckFuncExprArgs(matchedFuncExpr, assignsAndUnnests,
+ matchedAssignOrUnnestIndex, analysisCtx);
if (selectMatchFound || joinMatchFound) {
return true;
}
@@ -207,7 +208,8 @@
}
private boolean analyzeJoinSimilarityCheckFuncExprArgs(AbstractFunctionCallExpression funcExpr,
- List<AbstractLogicalOperator> assignsAndUnnests, int matchedAssignOrUnnestIndex, AccessMethodAnalysisContext analysisCtx) {
+ List<AbstractLogicalOperator> assignsAndUnnests, int matchedAssignOrUnnestIndex,
+ AccessMethodAnalysisContext analysisCtx) {
// There should be exactly three arguments.
// The last function argument is assumed to be the similarity threshold.
IAlgebricksConstantValue constThreshVal = null;
@@ -223,11 +225,13 @@
|| arg2.getExpressionTag() == LogicalExpressionTag.CONSTANT) {
return false;
}
- LogicalVariable fieldVar1 = getNonConstArgFieldVar(arg1, funcExpr, assignsAndUnnests, matchedAssignOrUnnestIndex);
+ LogicalVariable fieldVar1 = getNonConstArgFieldVar(arg1, funcExpr, assignsAndUnnests,
+ matchedAssignOrUnnestIndex);
if (fieldVar1 == null) {
return false;
}
- LogicalVariable fieldVar2 = getNonConstArgFieldVar(arg2, funcExpr, assignsAndUnnests, matchedAssignOrUnnestIndex);
+ LogicalVariable fieldVar2 = getNonConstArgFieldVar(arg2, funcExpr, assignsAndUnnests,
+ matchedAssignOrUnnestIndex);
if (fieldVar2 == null) {
return false;
}
@@ -237,7 +241,8 @@
}
private boolean analyzeSelectSimilarityCheckFuncExprArgs(AbstractFunctionCallExpression funcExpr,
- List<AbstractLogicalOperator> assignsAndUnnests, int matchedAssignOrUnnestIndex, AccessMethodAnalysisContext analysisCtx) {
+ List<AbstractLogicalOperator> assignsAndUnnests, int matchedAssignOrUnnestIndex,
+ AccessMethodAnalysisContext analysisCtx) {
// There should be exactly three arguments.
// The last function argument is assumed to be the similarity threshold.
IAlgebricksConstantValue constThreshVal = null;
@@ -264,7 +269,8 @@
}
ConstantExpression constExpr = (ConstantExpression) constArg;
IAlgebricksConstantValue constFilterVal = constExpr.getValue();
- LogicalVariable fieldVar = getNonConstArgFieldVar(nonConstArg, funcExpr, assignsAndUnnests, matchedAssignOrUnnestIndex);
+ LogicalVariable fieldVar = getNonConstArgFieldVar(nonConstArg, funcExpr, assignsAndUnnests,
+ matchedAssignOrUnnestIndex);
if (fieldVar == null) {
return false;
}
@@ -274,7 +280,8 @@
}
private LogicalVariable getNonConstArgFieldVar(ILogicalExpression nonConstArg,
- AbstractFunctionCallExpression funcExpr, List<AbstractLogicalOperator> assignsAndUnnests, int matchedAssignOrUnnestIndex) {
+ AbstractFunctionCallExpression funcExpr, List<AbstractLogicalOperator> assignsAndUnnests,
+ int matchedAssignOrUnnestIndex) {
LogicalVariable fieldVar = null;
// Analyze nonConstArg depending on similarity function.
if (funcExpr.getFunctionIdentifier() == AsterixBuiltinFunctions.SIMILARITY_JACCARD_CHECK) {
@@ -290,7 +297,8 @@
nonConstArg = nonConstFuncExpr.getArguments().get(0).getValue();
}
}
- if (funcExpr.getFunctionIdentifier() == AsterixBuiltinFunctions.EDIT_DISTANCE_CHECK) {
+ if (funcExpr.getFunctionIdentifier() == AsterixBuiltinFunctions.EDIT_DISTANCE_CHECK
+ || funcExpr.getFunctionIdentifier() == AsterixBuiltinFunctions.EDIT_DISTANCE_CONTAINS) {
while (nonConstArg.getExpressionTag() == LogicalExpressionTag.FUNCTION_CALL) {
AbstractFunctionCallExpression nonConstFuncExpr = (AbstractFunctionCallExpression) nonConstArg;
if (nonConstFuncExpr.getFunctionIdentifier() != AsterixBuiltinFunctions.WORD_TOKENS
@@ -431,7 +439,8 @@
// Create "panic" (non indexed) nested-loop join path if necessary.
Mutable<ILogicalOperator> panicJoinRef = null;
Map<LogicalVariable, LogicalVariable> panicVarMap = null;
- if (optFuncExpr.getFuncExpr().getFunctionIdentifier() == AsterixBuiltinFunctions.EDIT_DISTANCE_CHECK) {
+ if (optFuncExpr.getFuncExpr().getFunctionIdentifier() == AsterixBuiltinFunctions.EDIT_DISTANCE_CHECK
+ || optFuncExpr.getFuncExpr().getFunctionIdentifier() == AsterixBuiltinFunctions.EDIT_DISTANCE_CONTAINS) {
panicJoinRef = new MutableObject<ILogicalOperator>(joinRef.getValue());
panicVarMap = new HashMap<LogicalVariable, LogicalVariable>();
Mutable<ILogicalOperator> newProbeRootRef = createPanicNestedLoopJoinPlan(panicJoinRef, indexSubTree,
@@ -658,9 +667,9 @@
.getConstantVal(0))));
isFilterableArgs.add(new MutableObject<ILogicalExpression>(AccessMethodUtils
.createInt32Constant(chosenIndex.getGramLength())));
- // TODO: Currently usePrePost is hardcoded to be true.
+ boolean usePrePost = optFuncExpr.containsPartialField() ? false : true;
isFilterableArgs.add(new MutableObject<ILogicalExpression>(AccessMethodUtils
- .createBooleanConstant(true)));
+ .createBooleanConstant(usePrePost)));
isFilterableExpr = new ScalarFunctionCallExpression(
FunctionUtils.getFunctionInfo(AsterixBuiltinFunctions.EDIT_DISTANCE_STRING_IS_FILTERABLE),
isFilterableArgs);
@@ -743,7 +752,8 @@
// Add the similarity threshold which, by convention, is the last constant value.
jobGenParams.setSimilarityThreshold(optFuncExpr.getConstantVal(optFuncExpr.getNumConstantVals() - 1));
}
- if (optFuncExpr.getFuncExpr().getFunctionIdentifier() == AsterixBuiltinFunctions.EDIT_DISTANCE_CHECK) {
+ if (optFuncExpr.getFuncExpr().getFunctionIdentifier() == AsterixBuiltinFunctions.EDIT_DISTANCE_CHECK
+ || optFuncExpr.getFuncExpr().getFunctionIdentifier() == AsterixBuiltinFunctions.EDIT_DISTANCE_CONTAINS) {
if (optFuncExpr.containsPartialField()) {
jobGenParams.setSearchModifierType(SearchModifierType.CONJUNCTIVE_EDIT_DISTANCE);
} else {
@@ -771,7 +781,8 @@
.containsKey(SkipSecondaryIndexSearchExpressionAnnotation.INSTANCE)) {
return false;
}
- if (optFuncExpr.getFuncExpr().getFunctionIdentifier() == AsterixBuiltinFunctions.EDIT_DISTANCE_CHECK) {
+ if (optFuncExpr.getFuncExpr().getFunctionIdentifier() == AsterixBuiltinFunctions.EDIT_DISTANCE_CHECK
+ || optFuncExpr.getFuncExpr().getFunctionIdentifier() == AsterixBuiltinFunctions.EDIT_DISTANCE_CONTAINS) {
// Must be for a join query.
if (optFuncExpr.getNumConstantVals() == 1) {
return true;
@@ -794,7 +805,7 @@
- edThresh.getIntegerValue() * index.getGramLength();
} else {
mergeThreshold = (astr.getStringValue().length() + index.getGramLength() - 1)
- - edThresh.getIntegerValue() * index.getGramLength();
+ - edThresh.getIntegerValue() * index.getGramLength();
}
}
// We can only optimize edit distance on lists using a word index.
@@ -866,7 +877,8 @@
case SINGLE_PARTITION_NGRAM_INVIX:
case LENGTH_PARTITIONED_NGRAM_INVIX: {
// Make sure not to use pre- and postfixing for conjunctive searches.
- boolean prePost = (searchModifierType == SearchModifierType.CONJUNCTIVE || searchModifierType == SearchModifierType.CONJUNCTIVE_EDIT_DISTANCE) ? false : true;
+ boolean prePost = (searchModifierType == SearchModifierType.CONJUNCTIVE || searchModifierType == SearchModifierType.CONJUNCTIVE_EDIT_DISTANCE) ? false
+ : true;
return AqlBinaryTokenizerFactoryProvider.INSTANCE.getNGramTokenizerFactory(searchKeyType,
index.getGramLength(), prePost, false);
}
diff --git a/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/am/OptimizableFuncExpr.java b/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/am/OptimizableFuncExpr.java
index 78acc26..79ccfc0 100644
--- a/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/am/OptimizableFuncExpr.java
+++ b/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/am/OptimizableFuncExpr.java
@@ -14,6 +14,7 @@
*/
package edu.uci.ics.asterix.optimizer.rules.am;
+import edu.uci.ics.asterix.om.functions.AsterixBuiltinFunctions;
import edu.uci.ics.hyracks.algebricks.core.algebra.base.LogicalVariable;
import edu.uci.ics.hyracks.algebricks.core.algebra.expressions.AbstractFunctionCallExpression;
import edu.uci.ics.hyracks.algebricks.core.algebra.expressions.IAlgebricksConstantValue;
@@ -36,8 +37,12 @@
this.logicalVars = logicalVars;
this.constantVals = constantVals;
this.fieldNames = new String[logicalVars.length];
- this.partialField = false;
this.subTrees = new OptimizableOperatorSubTree[logicalVars.length];
+ if (funcExpr.getFunctionIdentifier() == AsterixBuiltinFunctions.EDIT_DISTANCE_CONTAINS) {
+ this.partialField = true;
+ } else {
+ this.partialField = false;
+ }
}
// Special, more convenient c'tor for simple binary functions.
@@ -45,10 +50,14 @@
IAlgebricksConstantValue constantVal) {
this.funcExpr = funcExpr;
this.logicalVars = new LogicalVariable[] { logicalVar };
- this.partialField = false;
this.constantVals = new IAlgebricksConstantValue[] { constantVal };
this.fieldNames = new String[logicalVars.length];
this.subTrees = new OptimizableOperatorSubTree[logicalVars.length];
+ if (funcExpr.getFunctionIdentifier() == AsterixBuiltinFunctions.EDIT_DISTANCE_CONTAINS) {
+ this.partialField = true;
+ } else {
+ this.partialField = false;
+ }
}
@Override
diff --git a/asterix-app/src/test/resources/optimizerts/queries/inverted-index-join/ngram-edit-distance-contains.aql b/asterix-app/src/test/resources/optimizerts/queries/inverted-index-join/ngram-edit-distance-contains.aql
new file mode 100644
index 0000000..5d61ede
--- /dev/null
+++ b/asterix-app/src/test/resources/optimizerts/queries/inverted-index-join/ngram-edit-distance-contains.aql
@@ -0,0 +1,38 @@
+/*
+ * Description : Fuzzy joins two datasets, DBLP and CSX, based on the edit-distance-contains function of their authors.
+ * DBLP has a 3-gram index on authors, and we expect the join to be transformed into an indexed nested-loop join.
+ * Success : Yes
+ */
+
+drop dataverse test if exists;
+create dataverse test;
+use dataverse test;
+
+create type DBLPType as closed {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create type CSXType as closed {
+ id: int32,
+ csxid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create dataset DBLP(DBLPType) primary key id;
+
+create dataset CSX(CSXType) primary key id;
+
+create index ngram_index on DBLP(authors) type ngram(3);
+
+write output to nc1:"rttest/inverted-index-join_ngram-edit-distance-contains.adm";
+
+for $a in dataset('DBLP')
+for $b in dataset('CSX')
+where edit-distance-contains($a.authors, $b.authors, 3)[0] and $a.id < $b.id
+return {"arec": $a, "brec": $b }
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/optimizerts/results/inverted-index-join/ngram-edit-distance-contains.plan b/asterix-app/src/test/resources/optimizerts/results/inverted-index-join/ngram-edit-distance-contains.plan
new file mode 100644
index 0000000..56b5df4
--- /dev/null
+++ b/asterix-app/src/test/resources/optimizerts/results/inverted-index-join/ngram-edit-distance-contains.plan
@@ -0,0 +1,55 @@
+-- DISTRIBUTE_RESULT |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- STREAM_PROJECT |PARTITIONED|
+ -- ASSIGN |PARTITIONED|
+ -- STREAM_PROJECT |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- HYBRID_HASH_JOIN [$$21][$$14] |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- DATASOURCE_SCAN |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- EMPTY_TUPLE_SOURCE |PARTITIONED|
+ -- HASH_PARTITION_EXCHANGE [$$14] |PARTITIONED|
+ -- UNION_ALL |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- STREAM_PROJECT |PARTITIONED|
+ -- STREAM_SELECT |PARTITIONED|
+ -- STREAM_PROJECT |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- BTREE_SEARCH |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- STABLE_SORT [$$26(ASC)] |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- LENGTH_PARTITIONED_INVERTED_INDEX_SEARCH |PARTITIONED|
+ -- BROADCAST_EXCHANGE |PARTITIONED|
+ -- STREAM_SELECT |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- SPLIT |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- STREAM_PROJECT |PARTITIONED|
+ -- ASSIGN |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- DATASOURCE_SCAN |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- EMPTY_TUPLE_SOURCE |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- STREAM_PROJECT |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- NESTED_LOOP |PARTITIONED|
+ -- BROADCAST_EXCHANGE |PARTITIONED|
+ -- ASSIGN |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- DATASOURCE_SCAN |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- EMPTY_TUPLE_SOURCE |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- STREAM_SELECT |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- SPLIT |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- STREAM_PROJECT |PARTITIONED|
+ -- ASSIGN |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- DATASOURCE_SCAN |PARTITIONED|
+ -- ONE_TO_ONE_EXCHANGE |PARTITIONED|
+ -- EMPTY_TUPLE_SOURCE |PARTITIONED|
diff --git a/asterix-app/src/test/resources/runtimets/queries/index-selection/inverted-index-ngram-edit-distance-contains/inverted-index-ngram-edit-distance-contains.1.ddl.aql b/asterix-app/src/test/resources/runtimets/queries/index-selection/inverted-index-ngram-edit-distance-contains/inverted-index-ngram-edit-distance-contains.1.ddl.aql
new file mode 100644
index 0000000..36d37d8
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/index-selection/inverted-index-ngram-edit-distance-contains/inverted-index-ngram-edit-distance-contains.1.ddl.aql
@@ -0,0 +1,17 @@
+drop dataverse test if exists;
+create dataverse test;
+use dataverse test;
+
+create type DBLPType as closed {
+ id: int32,
+ dblpid: string,
+ title: string,
+ authors: string,
+ misc: string
+}
+
+create nodegroup group1 if not exists on nc1, nc2;
+
+create dataset DBLP(DBLPType)
+ primary key id on group1;
+
diff --git a/asterix-app/src/test/resources/runtimets/queries/index-selection/inverted-index-ngram-edit-distance-contains/inverted-index-ngram-edit-distance-contains.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/index-selection/inverted-index-ngram-edit-distance-contains/inverted-index-ngram-edit-distance-contains.2.update.aql
new file mode 100644
index 0000000..f590a97
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/index-selection/inverted-index-ngram-edit-distance-contains/inverted-index-ngram-edit-distance-contains.2.update.aql
@@ -0,0 +1,6 @@
+use dataverse test;
+
+load dataset DBLP
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+
diff --git a/asterix-app/src/test/resources/runtimets/queries/index-selection/inverted-index-ngram-edit-distance-contains/inverted-index-ngram-edit-distance-contains.3.ddl.aql b/asterix-app/src/test/resources/runtimets/queries/index-selection/inverted-index-ngram-edit-distance-contains/inverted-index-ngram-edit-distance-contains.3.ddl.aql
new file mode 100644
index 0000000..4209874
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/index-selection/inverted-index-ngram-edit-distance-contains/inverted-index-ngram-edit-distance-contains.3.ddl.aql
@@ -0,0 +1,3 @@
+use dataverse test;
+
+create index ngram_index on DBLP(title) type ngram(3);
diff --git a/asterix-app/src/test/resources/runtimets/queries/index-selection/inverted-index-ngram-edit-distance-contains/inverted-index-ngram-edit-distance-contains.4.query.aql b/asterix-app/src/test/resources/runtimets/queries/index-selection/inverted-index-ngram-edit-distance-contains/inverted-index-ngram-edit-distance-contains.4.query.aql
new file mode 100644
index 0000000..9749da8
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/index-selection/inverted-index-ngram-edit-distance-contains/inverted-index-ngram-edit-distance-contains.4.query.aql
@@ -0,0 +1,9 @@
+use dataverse test;
+
+for $paper in dataset('DBLP')
+where edit-distance-contains($paper.title, "Multmedia", 1)[0]
+order by $paper.id
+return {
+ "id" : $paper.id,
+ "title" : $paper.title
+}
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/results/index-selection/inverted-index-ngram-edit-distance-contains/inverted-index-ngram-edit-distance-contains.1.adm b/asterix-app/src/test/resources/runtimets/results/index-selection/inverted-index-ngram-edit-distance-contains/inverted-index-ngram-edit-distance-contains.1.adm
new file mode 100644
index 0000000..12593bf
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/results/index-selection/inverted-index-ngram-edit-distance-contains/inverted-index-ngram-edit-distance-contains.1.adm
@@ -0,0 +1,3 @@
+{ "id": 4, "title": "Multimedia Information Systems Issues and Approaches." }
+{ "id": 89, "title": "VORTEX Video Retrieval and Tracking from Compressed Multimedia Databases." }
+{ "id": 90, "title": "VORTEX Video Retrieval and Tracking from Compressed Multimedia Databases ¾ Visual Search Engine." }
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/testsuite.xml b/asterix-app/src/test/resources/runtimets/testsuite.xml
index 39f01c8..33227b3 100644
--- a/asterix-app/src/test/resources/runtimets/testsuite.xml
+++ b/asterix-app/src/test/resources/runtimets/testsuite.xml
@@ -2340,6 +2340,11 @@
</compilation-unit>
</test-case>
<test-case FilePath="index-selection">
+ <compilation-unit name="inverted-index-ngram-edit-distance-contains">
+ <output-dir compare="Text">inverted-index-ngram-edit-distance-contains</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="index-selection">
<compilation-unit name="inverted-index-olist-edit-distance-panic">
<output-dir compare="Text">inverted-index-olist-edit-distance-panic</output-dir>
</compilation-unit>
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
index 247bbd0..9b9dfc6 100644
--- a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
@@ -213,4 +213,60 @@
return ed;
}
}
+
+ // checks whether the first string contains a similar string to the second string
+ public int UTF8StringEditDistanceContains(byte[] bytes, int fsStart, int ssStart, int edThresh) {
+
+ int fsLen = StringUtils.getStrLen(bytes, fsStart);
+ int ssLen = StringUtils.getStrLen(bytes, ssStart);
+
+ // reuse existing matrix if possible
+ if (ssLen >= cols) {
+ cols = ssLen + 1;
+ matrix = new int[rows][cols];
+ }
+
+ int fsDataStart = fsStart + utf8SizeIndicatorSize;
+ int ssDataStart = ssStart + utf8SizeIndicatorSize;
+
+ // init matrix
+ for (int i = 0; i <= ssLen; i++) {
+ matrix[0][i] = 0;
+ }
+
+ int currRow = 1;
+ int prevRow = 0;
+ int minEd = Integer.MAX_VALUE;
+ // expand dynamic programming matrix row by row
+ int fsPos = fsDataStart;
+ for (int i = 1; i <= fsLen; i++) {
+ matrix[currRow][0] = i;
+ char fsChar = StringUtils.toLowerCase(StringUtils.charAt(bytes, fsPos));
+
+ int ssPos = ssDataStart;
+ for (int j = 1; j <= ssLen; j++) {
+ char ssChar = StringUtils.toLowerCase(StringUtils.charAt(bytes, ssPos));
+
+ matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
+ matrix[prevRow][j - 1] + (fsChar == ssChar ? 0 : 1));
+
+ ssPos += StringUtils.charSize(bytes, ssPos);
+
+ if (i == fsLen && matrix[currRow][j] < minEd) {
+ minEd = matrix[currRow][j];
+ }
+ }
+
+ fsPos += StringUtils.charSize(bytes, fsPos);
+
+ int tmp = currRow;
+ currRow = prevRow;
+ prevRow = tmp;
+ }
+ if (minEd > edThresh) {
+ return -1;
+ } else {
+ return minEd;
+ }
+ }
}
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/om/functions/AsterixBuiltinFunctions.java b/asterix-om/src/main/java/edu/uci/ics/asterix/om/functions/AsterixBuiltinFunctions.java
index 8f0bab3..e8472a0 100644
--- a/asterix-om/src/main/java/edu/uci/ics/asterix/om/functions/AsterixBuiltinFunctions.java
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/om/functions/AsterixBuiltinFunctions.java
@@ -390,6 +390,8 @@
FunctionConstants.ASTERIX_NS, "edit-distance-list-is-filterable", 2);
public final static FunctionIdentifier EDIT_DISTANCE_STRING_IS_FILTERABLE = new FunctionIdentifier(
FunctionConstants.ASTERIX_NS, "edit-distance-string-is-filterable", 4);
+ public final static FunctionIdentifier EDIT_DISTANCE_CONTAINS = new FunctionIdentifier(FunctionConstants.ASTERIX_NS,
+ "edit-distance-contains", 3);
// tokenizers:
public final static FunctionIdentifier WORD_TOKENS = new FunctionIdentifier(FunctionConstants.ASTERIX_NS,
@@ -827,6 +829,7 @@
addPrivateFunction(SERIAL_LOCAL_AVG, NonTaggedLocalAvgTypeComputer.INSTANCE, true);
addPrivateFunction(SERIAL_SUM, NonTaggedNumericAggTypeComputer.INSTANCE, true);
addPrivateFunction(SERIAL_LOCAL_SUM, NonTaggedNumericAggTypeComputer.INSTANCE, true);
+ addFunction(EDIT_DISTANCE_CONTAINS, OrderedListOfAnyTypeComputer.INSTANCE, true);
addFunction(SIMILARITY_JACCARD, AFloatTypeComputer.INSTANCE, true);
addFunction(SIMILARITY_JACCARD_CHECK, OrderedListOfAnyTypeComputer.INSTANCE, true);
addPrivateFunction(SIMILARITY_JACCARD_SORTED, AFloatTypeComputer.INSTANCE, true);
@@ -1277,6 +1280,7 @@
similarityFunctions.add(getAsterixFunctionInfo(SIMILARITY_JACCARD_CHECK));
similarityFunctions.add(getAsterixFunctionInfo(EDIT_DISTANCE));
similarityFunctions.add(getAsterixFunctionInfo(EDIT_DISTANCE_CHECK));
+ similarityFunctions.add(getAsterixFunctionInfo(EDIT_DISTANCE_CONTAINS));
}
public static boolean isSimilarityFunction(FunctionIdentifier fi) {
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/EditDistanceContainsDescriptor.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/EditDistanceContainsDescriptor.java
new file mode 100644
index 0000000..679a9bc
--- /dev/null
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/functions/EditDistanceContainsDescriptor.java
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2009-2013 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.asterix.runtime.evaluators.functions;
+
+import java.io.DataOutput;
+import edu.uci.ics.asterix.builders.OrderedListBuilder;
+import edu.uci.ics.asterix.formats.nontagged.AqlSerializerDeserializerProvider;
+import edu.uci.ics.asterix.fuzzyjoin.similarity.SimilarityMetricEditDistance;
+import edu.uci.ics.asterix.om.base.ABoolean;
+import edu.uci.ics.asterix.om.base.AInt32;
+import edu.uci.ics.asterix.om.base.AMutableInt32;
+import edu.uci.ics.asterix.om.functions.AsterixBuiltinFunctions;
+import edu.uci.ics.asterix.om.functions.IFunctionDescriptor;
+import edu.uci.ics.asterix.om.functions.IFunctionDescriptorFactory;
+import edu.uci.ics.asterix.om.types.AOrderedListType;
+import edu.uci.ics.asterix.om.types.ATypeTag;
+import edu.uci.ics.asterix.om.types.BuiltinType;
+import edu.uci.ics.asterix.om.types.EnumDeserializer;
+import edu.uci.ics.asterix.runtime.evaluators.base.AbstractScalarFunctionDynamicDescriptor;
+import edu.uci.ics.hyracks.algebricks.common.exceptions.AlgebricksException;
+import edu.uci.ics.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import edu.uci.ics.hyracks.algebricks.runtime.base.ICopyEvaluator;
+import edu.uci.ics.hyracks.algebricks.runtime.base.ICopyEvaluatorFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.data.std.api.IDataOutputProvider;
+import edu.uci.ics.hyracks.data.std.util.ArrayBackedValueStorage;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+
+public class EditDistanceContainsDescriptor extends AbstractScalarFunctionDynamicDescriptor {
+
+ private static final long serialVersionUID = 1L;
+ public static final IFunctionDescriptorFactory FACTORY = new IFunctionDescriptorFactory() {
+ public IFunctionDescriptor createFunctionDescriptor() {
+ return new EditDistanceContainsDescriptor();
+ }
+ };
+
+ @Override
+ public ICopyEvaluatorFactory createEvaluatorFactory(final ICopyEvaluatorFactory[] args) throws AlgebricksException {
+ return new ICopyEvaluatorFactory() {
+ private static final long serialVersionUID = 1L;
+
+ @Override
+ public ICopyEvaluator createEvaluator(IDataOutputProvider output) throws AlgebricksException {
+ return new EditDistanceContainsEvaluator(args, output);
+ }
+ };
+ }
+
+ @Override
+ public FunctionIdentifier getIdentifier() {
+ return AsterixBuiltinFunctions.EDIT_DISTANCE_CONTAINS;
+ }
+
+ private static class EditDistanceContainsEvaluator implements ICopyEvaluator {
+
+ // assuming type indicator in serde format
+ private final int typeIndicatorSize = 1;
+
+ private DataOutput out;
+ private final ArrayBackedValueStorage argOut = new ArrayBackedValueStorage();
+ private final ICopyEvaluator evalString;
+ private final ICopyEvaluator evalPattern;
+ private final ICopyEvaluator evalEdThresh;
+ private final SimilarityMetricEditDistance ed = new SimilarityMetricEditDistance();
+ private int edThresh = -1;
+ private final OrderedListBuilder listBuilder;
+ private ArrayBackedValueStorage inputVal;
+ protected final AMutableInt32 aInt32 = new AMutableInt32(-1);
+ @SuppressWarnings("unchecked")
+ private final ISerializerDeserializer<ABoolean> booleanSerde = AqlSerializerDeserializerProvider.INSTANCE
+ .getSerializerDeserializer(BuiltinType.ABOOLEAN);
+ @SuppressWarnings("unchecked")
+ private final ISerializerDeserializer<AInt32> int32Serde = AqlSerializerDeserializerProvider.INSTANCE
+ .getSerializerDeserializer(BuiltinType.AINT32);
+
+ // allowed input types
+ private final static byte SER_NULL_TYPE_TAG = ATypeTag.NULL.serialize();
+ private final static byte SER_STRING_TYPE_TAG = ATypeTag.STRING.serialize();
+ private final static byte SER_INT32_TYPE_TAG = ATypeTag.INT32.serialize();
+
+ public EditDistanceContainsEvaluator(ICopyEvaluatorFactory[] args, IDataOutputProvider output)
+ throws AlgebricksException {
+ out = output.getDataOutput();
+ evalString = args[0].createEvaluator(argOut);
+ evalPattern = args[1].createEvaluator(argOut);
+ evalEdThresh = args[2].createEvaluator(argOut);
+ listBuilder = new OrderedListBuilder();
+ inputVal = new ArrayBackedValueStorage();
+ }
+
+ @Override
+ public void evaluate(IFrameTupleReference tuple) throws AlgebricksException {
+ argOut.reset();
+ int patternStart = argOut.getLength();
+ evalString.evaluate(tuple);
+ int stringStart = argOut.getLength();
+ evalPattern.evaluate(tuple);
+ int edThreshStart = argOut.getLength();
+ evalEdThresh.evaluate(tuple);
+
+ try {
+ // edit distance between null and anything else is 0
+ if (argOut.getByteArray()[stringStart] == SER_NULL_TYPE_TAG
+ || argOut.getByteArray()[patternStart] == SER_NULL_TYPE_TAG) {
+ writeResult(0);
+ return;
+ }
+
+ if (argOut.getByteArray()[stringStart] != SER_STRING_TYPE_TAG
+ || argOut.getByteArray()[patternStart] != SER_STRING_TYPE_TAG
+ || argOut.getByteArray()[edThreshStart] != SER_INT32_TYPE_TAG) {
+ throw new AlgebricksException(AsterixBuiltinFunctions.EDIT_DISTANCE_CONTAINS
+ + ": expects input type (STRING/NULL, STRING/NULL, INT), but got ("
+ + EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argOut.getByteArray()[stringStart])
+ + ", "
+ + EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argOut.getByteArray()[patternStart])
+ + ", "
+ + EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argOut.getByteArray()[edThreshStart])
+ + ").");
+ }
+ } catch (HyracksDataException e) {
+ throw new AlgebricksException(e);
+ }
+
+ edThresh = IntegerSerializerDeserializer.getInt(argOut.getByteArray(), edThreshStart + typeIndicatorSize);
+ int minEd = ed.UTF8StringEditDistanceContains(argOut.getByteArray(), stringStart + typeIndicatorSize,
+ patternStart + typeIndicatorSize, edThresh);
+
+ try {
+ writeResult(minEd);
+ } catch (HyracksDataException e) {
+ throw new AlgebricksException(e);
+ }
+ }
+
+ protected void writeResult(int minEd) throws HyracksDataException {
+ boolean contains = (minEd < 0) ? false : true;
+ listBuilder.reset(new AOrderedListType(BuiltinType.ANY, "list"));
+ inputVal.reset();
+ booleanSerde.serialize(contains ? ABoolean.TRUE : ABoolean.FALSE, inputVal.getDataOutput());
+ listBuilder.addItem(inputVal);
+ inputVal.reset();
+ aInt32.setValue((contains) ? minEd : Integer.MAX_VALUE);
+ int32Serde.serialize(aInt32, inputVal.getDataOutput());
+ listBuilder.addItem(inputVal);
+ listBuilder.write(out, true);
+ }
+ }
+}
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/formats/NonTaggedDataFormat.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/formats/NonTaggedDataFormat.java
index 835e7e1..4501fda 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/formats/NonTaggedDataFormat.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/formats/NonTaggedDataFormat.java
@@ -166,6 +166,7 @@
import edu.uci.ics.asterix.runtime.evaluators.functions.CreateRectangleDescriptor;
import edu.uci.ics.asterix.runtime.evaluators.functions.CreateUUIDDescriptor;
import edu.uci.ics.asterix.runtime.evaluators.functions.EditDistanceCheckDescriptor;
+import edu.uci.ics.asterix.runtime.evaluators.functions.EditDistanceContainsDescriptor;
import edu.uci.ics.asterix.runtime.evaluators.functions.EditDistanceDescriptor;
import edu.uci.ics.asterix.runtime.evaluators.functions.EditDistanceListIsFilterable;
import edu.uci.ics.asterix.runtime.evaluators.functions.EditDistanceStringIsFilterable;
@@ -533,6 +534,7 @@
temp.add(EditDistanceCheckDescriptor.FACTORY);
temp.add(EditDistanceStringIsFilterable.FACTORY);
temp.add(EditDistanceListIsFilterable.FACTORY);
+ temp.add(EditDistanceContainsDescriptor.FACTORY);
temp.add(SimilarityJaccardDescriptor.FACTORY);
temp.add(SimilarityJaccardCheckDescriptor.FACTORY);