ASTERIXDB-1778: Optimize the edit-distance-check function
- Only calculate 2 * (threshold + 1) cells, rather than all cells per row.
- Terminate the calculation steps early when it become obvious that
the possible edit-distance value is greater than the given threshold.
There is no reason to compute all cells in the 2 dimensional array.
- Move the location of IListIterator to Hyracks since we now have
a CharacterIterator in a String. Change the name to ISequenceIterator.
- Add the section for the function in the manual.
- Remove letter counting filtering method since it is only applicable for
the string in ASCII range (0 ~ 127).
Change-Id: Ibc8729c4514bb87c347dd7d50358fd897b769977
Reviewed-on: https://asterix-gerrit.ics.uci.edu/1481
Sonar-Qube: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
BAD: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Jianfeng Jia <jianfeng.jia@gmail.com>
diff --git a/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md b/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
index 89ef0f7..cb3318f 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/5_similarity.md
@@ -47,6 +47,36 @@
2
+### edit_distance_check ###
+* Syntax:
+
+ edit_distance_check(expression1, expression2, threshold)
+
+* Checks whether the edit distance of `expression1` and `expression2` is within a given threshold.
+
+* Arguments:
+ * `expression1` : a `string` or a homogeneous `array` of a comparable item type.
+ * `expression2` : The same type as `expression1`.
+ * `threshold` : a `bigint` that represents the distance threshold.
+* Return Value:
+ * an `array` with two items:
+ * The first item contains a `boolean` value representing whether the edit distance of `expression1` and `expression2` is within the given threshold.
+ * The second item contains an `integer` that represents the edit distance of `expression1` and `expression2` if the first item is true.
+ * If the first item is false, then the second item is set to 2147483647.
+ * `missing` if any argument is a `missing` value,
+ * `null` if any argument is a `null` value but no argument is a `missing` value,
+ * a type error will be raised if:
+ * the first or second argument is any other non-string value,
+ * or, the third argument is any other non-bigint value.
+* Note: an [n_gram index](similarity.html#UsingIndexesToSupportSimilarityQueries) can be utilized for this function.
+* Example:
+
+ edit_distance_check("happy","hapr",2);
+
+
+* The expected result is:
+
+ [ true, 2 ]
### edit_distance_contains ###
* Syntax:
diff --git a/asterixdb/asterix-fuzzyjoin/pom.xml b/asterixdb/asterix-fuzzyjoin/pom.xml
index 0539782..9485852 100644
--- a/asterixdb/asterix-fuzzyjoin/pom.xml
+++ b/asterixdb/asterix-fuzzyjoin/pom.xml
@@ -82,6 +82,10 @@
<groupId>org.apache.hyracks</groupId>
<artifactId>hyracks-util</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.apache.hyracks</groupId>
+ <artifactId>hyracks-data-std</artifactId>
+ </dependency>
</dependencies>
</project>
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
index ac4a3dd..b213df2 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IGenericSimilarityMetric.java
@@ -20,13 +20,33 @@
package org.apache.asterix.fuzzyjoin.similarity;
import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
public interface IGenericSimilarityMetric {
- // returns similarity
- public float getSimilarity(IListIterator firstList, IListIterator secondList) throws HyracksDataException;
+ /**
+ * Returns the similarity value for the given two lists.
+ *
+ * @param firstSequence
+ * an instance of {@link org.apache.hyracks.data.std.util.ISequenceIterator}
+ * @param secondSequence
+ * an instance of {@link org.apache.hyracks.data.std.util.ISequenceIterator}
+ * @return a float similarity value
+ * @throws HyracksDataException
+ */
+ public float computeSimilarity(ISequenceIterator firstSequence, ISequenceIterator secondSequence)
+ throws HyracksDataException;
- // returns -1 if does not satisfy threshold
- // else returns similarity
- public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh)
+ /**
+ * Returns the similarity value for the given two lists. If the calculated similarity value
+ * doesn't satisfy the given simThresh value based on the function's check condition, this returns -1.
+ *
+ * @param firstSequence
+ * an instance of {@link org.apache.hyracks.data.std.util.ISequenceIterator}
+ * @param secondSequence
+ * an instance of {@link org.apache.hyracks.data.std.util.ISequenceIterator}
+ * @return a float similarity value.
+ * @throws HyracksDataException
+ */
+ public float computeSimilarity(ISequenceIterator firstSequence, ISequenceIterator secondSequence, float simThresh)
throws HyracksDataException;
}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
index d36d60d..3348d4c 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetric.java
@@ -21,10 +21,12 @@
import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
public abstract class SimilarityMetric {
- public static int getIntersectSize(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException {
+ public static int getIntersectSize(ISequenceIterator tokensX, ISequenceIterator tokensY)
+ throws HyracksDataException {
int intersectSize = 0;
while (tokensX.hasNext() && tokensY.hasNext()) {
int cmp = tokensX.compare(tokensY);
@@ -64,23 +66,6 @@
}
public static int getIntersectSize(int[] tokensX, int startX, int[] tokensY, int startY) {
- // int intersectSize = 0;
- //
- // while (startX < tokensX.length && startY < tokensY.length) {
- // int tokenX = tokensX[startX];
- // int tokenY = tokensY[startY];
- // if (tokenX > tokenY) {
- // startY++;
- // } else if (tokenX < tokenY) {
- // startX++;
- // } else {
- // intersectSize++;
- // startX++;
- // startY++;
- // }
- // }
- //
- // return intersectSize;
return getIntersectSize(tokensX, startX, tokensX.length, tokensY, startY, tokensY.length);
}
@@ -131,52 +116,6 @@
return getPartialIntersectSize(tokensX, 0, tokensX.length, tokensY, 0, tokensY.length, tokenStop);
}
- // @SuppressWarnings("unchecked")
- // public static int getIntersectSize(DataBag tokensX, DataBag tokensY) {
- // int intersectSize = 0;
- //
- // Iterator<Tuple> iteratorX = tokensX.iterator();
- // Iterator<Tuple> iteratorY = tokensY.iterator();
- //
- // Tuple nextX = null;
- // Tuple nextY = null;
- //
- // while ((nextX != null || iteratorX.hasNext())
- // && (nextY != null || iteratorY.hasNext())) {
- // if (nextX == null) {
- // nextX = iteratorX.next();
- // }
- // if (nextY == null) {
- // nextY = iteratorY.next();
- // }
- //
- // int cmp = nextX.compareTo(nextY);
- // if (cmp > 0) {
- // nextY = null;
- // } else if (cmp < 0) {
- // nextX = null;
- // } else {
- // intersectSize++;
- // nextX = null;
- // nextY = null;
- // }
- // }
- //
- // return intersectSize;
- // }
-
- // public abstract float getSimilarity(DataBag tokensX, DataBag tokensY);
-
- // public abstract float getSimilarity(DataBag tokensX, int lengthX,
- // DataBag tokensY, int lengthY);
-
- public float getSimilarity(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException {
- int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
- int totalSize = tokensX.size() + tokensY.size();
-
- return (float) intersectionSize / (totalSize - intersectionSize);
- }
-
public abstract float getSimilarity(int[] tokensX, int startX, int lengthX, int[] tokensY, int startY, int lengthY);
public abstract float getSimilarity(int[] tokensX, int[] tokensY);
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
index 9dce89e..8003767 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
@@ -19,39 +19,59 @@
package org.apache.asterix.fuzzyjoin.similarity;
-import java.util.Arrays;
-
import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
+import org.apache.hyracks.data.std.util.UTF8StringCharByCharIterator;
import org.apache.hyracks.util.string.UTF8StringUtil;
public class SimilarityMetricEditDistance implements IGenericSimilarityMetric {
- // dp implementation only needs 2 rows
+ // This Dynamic Programming implementation only needs 2 rows.
private final int rows = 2;
private int cols;
private int[][] matrix;
- // for letter count filtering
- private final int[] fsLcCount = new int[128];
- private final int[] ssLcCount = new int[128];
+ // for string edit-distance calculation
+ private final UTF8StringCharByCharIterator leftIt = new UTF8StringCharByCharIterator();
+ private final UTF8StringCharByCharIterator rightIt = new UTF8StringCharByCharIterator();
+
+ public static final int SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE = -1;
public SimilarityMetricEditDistance() {
cols = 100; // arbitrary default value
matrix = new int[rows][cols];
}
- @Override
- public float getSimilarity(IListIterator firstList, IListIterator secondList) throws HyracksDataException {
- int flLen = firstList.size();
- int slLen = secondList.size();
+ /**
+ * Gets the edit distance value for the given two sequences using a Dynamic Programming approach.
+ * If a positive simThresh value is provided, this method only calculates 2 * (simThresh + 1) cells per row,
+ * not all the cells in a row as an optimization. Refer to https://en.wikipedia.org/wiki/Wagner–Fischer_algorithm
+ * for more details. Also, as one more optimization, during the calculation steps, if this method finds out
+ * that the final edit distance value cannot be within simThresh, this method stops the calculation
+ * and immediately returns -1.
+ * If the final edit distance value is less than or equal to simThresh, then that value will be returned.
+ * If a non-positive simThresh is given, then it calculates all cells and rows and returns
+ * the final edit distance value.
+ *
+ * @return the edit distance of the two lists. -1 if a positive simThresh value is given and the edit distance
+ * value is greater than the given simThresh.
+ */
+ private float computeActualSimilarity(ISequenceIterator firstSequence, ISequenceIterator secondSequence,
+ float simThresh) throws HyracksDataException {
+ int flLen = firstSequence.size();
+ int slLen = secondSequence.size();
- // reuse existing matrix if possible
+ // When a positive threshold is given, then we can apply two optimizations.
+ int edThresh = (int) simThresh;
+ boolean canTerminateEarly = edThresh >= 0;
+
+ // Reuses the existing matrix if possible.
if (slLen >= cols) {
cols = slLen + 1;
matrix = new int[rows][cols];
}
- // init matrix
+ // Inits the matrix.
for (int i = 0; i <= slLen; i++) {
matrix[0][i] = i;
}
@@ -59,20 +79,54 @@
int currRow = 1;
int prevRow = 0;
- // expand dynamic programming matrix row by row
+ int from = 1;
+ int to = slLen;
+ int minDistance = -1;
+
+ // Expands the dynamic programming matrix row by row.
for (int i = 1; i <= flLen; i++) {
matrix[currRow][0] = i;
- secondList.reset();
- for (int j = 1; j <= slLen; j++) {
+ secondSequence.reset();
- matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
- matrix[prevRow][j - 1] + (firstList.compare(secondList) == 0 ? 0 : 1));
-
- secondList.next();
+ // Only calculates 2 * (simThresh + 1) cells per row as an optimization.
+ // Also keeps minDistance to see whether the possible edit distance after
+ // each row calculation is greater than the simThresh.
+ if (canTerminateEarly) {
+ minDistance = edThresh + 1;
+ from = Math.max(i - edThresh - 1, 1);
+ to = Math.min(i + edThresh + 1, slLen);
+ for (int j = 1; j < from; j++) {
+ // Moves the pointer of the second list to the point where the calculation starts for this row.
+ secondSequence.next();
+ }
+ if (from > 1) {
+ // Sets the left Boundary cell value to make sure that the calculation is correct.
+ matrix[currRow][from - 1] = edThresh + 1;
+ }
+ if (to < slLen) {
+ // Sets the right Boundary cell value to make sure that the calculation is correct.
+ matrix[currRow][to + 1] = edThresh + 1;
+ }
}
- firstList.next();
+ for (int j = from; j <= to; j++) {
+
+ matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
+ matrix[prevRow][j - 1] + (firstSequence.compare(secondSequence) == 0 ? 0 : 1));
+
+ // Replaces minDistance after each cell computation if we find a smaller value than that.
+ if (canTerminateEarly && matrix[currRow][j] < minDistance) {
+ minDistance = matrix[currRow][j];
+ }
+
+ secondSequence.next();
+ }
+ // If the minimum distance value is greater than the given threshold, no reason to process next row.
+ if (canTerminateEarly && minDistance > edThresh) {
+ return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
+ }
+ firstSequence.next();
int tmp = currRow;
currRow = prevRow;
@@ -82,8 +136,12 @@
return matrix[prevRow][slLen];
}
+ /**
+ * Gets the similarity value for the given two sequences. If the value doesn't satisfy the given simThresh,
+ * this method returns -1. Else, this returns the real similarity value.
+ */
@Override
- public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh)
+ public float computeSimilarity(ISequenceIterator firstList, ISequenceIterator secondList, float simThresh)
throws HyracksDataException {
int edThresh = (int) simThresh;
@@ -93,18 +151,18 @@
// length filter
if (Math.abs(flLen - slLen) > edThresh) {
- return -1;
+ return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
}
- float ed = getSimilarity(firstList, secondList);
- if (ed > edThresh) {
- return -1;
+ float ed = computeActualSimilarity(firstList, secondList, simThresh);
+ if (ed > edThresh || ed < 0) {
+ return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
} else {
return ed;
}
}
- public int getSimilarityContains(IListIterator exprList, IListIterator patternList, int simThresh)
+ public int getSimilarityContains(ISequenceIterator exprList, ISequenceIterator patternList, int simThresh)
throws HyracksDataException {
int exprLen = exprList.size();
int patternLen = patternList.size();
@@ -148,182 +206,50 @@
}
if (minEd > simThresh) {
- return -1;
+ return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
} else {
return minEd;
}
}
// faster implementation for common case of string edit distance
- public int UTF8StringEditDistance(byte[] leftBytes, int fsStart, byte[] rightBytes, int ssStart) {
- int fsLen = UTF8StringUtil.getStringLength(leftBytes, fsStart);
- int ssLen = UTF8StringUtil.getStringLength(rightBytes, ssStart);
-
- int fsUtfLen = UTF8StringUtil.getUTFLength(leftBytes, fsStart);
- int ssUtfLen = UTF8StringUtil.getUTFLength(rightBytes, ssStart);
- int fsMetaLen = UTF8StringUtil.getNumBytesToStoreLength(fsUtfLen);
- int ssMetaLen = UTF8StringUtil.getNumBytesToStoreLength(ssUtfLen);
-
- // reuse existing matrix if possible
- if (ssLen >= cols) {
- cols = ssLen + 1;
- matrix = new int[rows][cols];
- }
-
- int fsDataStart = fsStart + fsMetaLen;
- int ssDataStart = ssStart + ssMetaLen;
-
- // init matrix
- for (int i = 0; i <= ssLen; i++) {
- matrix[0][i] = i;
- }
-
- int currRow = 1;
- int prevRow = 0;
-
- // expand dynamic programming matrix row by row
- int fsPos = fsDataStart;
- for (int i = 1; i <= fsLen; i++) {
- matrix[currRow][0] = i;
- char fsChar = Character.toLowerCase(UTF8StringUtil.charAt(leftBytes, fsPos));
- int ssPos = ssDataStart;
- for (int j = 1; j <= ssLen; j++) {
- char ssChar = Character.toLowerCase(UTF8StringUtil.charAt(rightBytes, ssPos));
-
- matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
- matrix[prevRow][j - 1] + (fsChar == ssChar ? 0 : 1));
-
- ssPos += UTF8StringUtil.charSize(rightBytes, ssPos);
- }
- fsPos += UTF8StringUtil.charSize(leftBytes, fsPos);
- int tmp = currRow;
- currRow = prevRow;
- prevRow = tmp;
- }
- return matrix[prevRow][ssLen];
+ public int getActualUTF8StringEditDistanceVal(byte[] leftBytes, int fsStart, byte[] rightBytes, int ssStart,
+ int edThresh) throws HyracksDataException {
+ leftIt.reset(leftBytes, fsStart);
+ rightIt.reset(rightBytes, ssStart);
+ return (int) computeActualSimilarity(leftIt, rightIt, edThresh);
}
- public int UTF8StringEditDistance(byte[] bytesLeft, int fsStart, byte[] bytesRight, int ssStart, int edThresh) {
+ public int UTF8StringEditDistance(byte[] bytesLeft, int fsStart, byte[] bytesRight, int ssStart, int edThresh)
+ throws HyracksDataException {
int fsStrLen = UTF8StringUtil.getStringLength(bytesLeft, fsStart);
int ssStrLen = UTF8StringUtil.getStringLength(bytesRight, ssStart);
- int fsUtfLen = UTF8StringUtil.getUTFLength(bytesLeft, fsStart);
- int ssUtfLen = UTF8StringUtil.getUTFLength(bytesRight, ssStart);
- int fsMetaLen = UTF8StringUtil.getNumBytesToStoreLength(fsUtfLen);
- int ssMetaLen = UTF8StringUtil.getNumBytesToStoreLength(ssUtfLen);
-
// length filter
if (Math.abs(fsStrLen - ssStrLen) > edThresh) {
- return -1;
+ return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
}
- // initialize letter count filtering
- Arrays.fill(fsLcCount, 0);
- Arrays.fill(ssLcCount, 0);
-
- // compute letter counts for first string
- int fsPos = fsStart + fsMetaLen;
- int fsEnd = fsPos + fsUtfLen;;
- while (fsPos < fsEnd) {
- char c = Character.toLowerCase(UTF8StringUtil.charAt(bytesLeft, fsPos));
- if (c < 128) {
- fsLcCount[c]++;
- }
- fsPos += UTF8StringUtil.charSize(bytesLeft, fsPos);
- }
-
- // compute letter counts for second string
- int ssPos = ssStart + ssMetaLen;
- int ssEnd = ssPos + ssUtfLen;
- while (ssPos < ssEnd) {
- char c = Character.toLowerCase(UTF8StringUtil.charAt(bytesRight, ssPos));
- if (c < 128) {
- ssLcCount[c]++;
- }
- ssPos += UTF8StringUtil.charSize(bytesRight, ssPos);
- }
-
- // apply filter
- int gtSum = 0;
- int ltSum = 0;
- for (int i = 0; i < 128; i++) {
- if (fsLcCount[i] > ssLcCount[i]) {
- gtSum += fsLcCount[i] - ssLcCount[i];
- if (gtSum > edThresh) {
- return -1;
- }
- } else {
- ltSum += ssLcCount[i] - fsLcCount[i];
- if (ltSum > edThresh) {
- return -1;
- }
- }
- }
-
- int ed = UTF8StringEditDistance(bytesLeft, fsStart, bytesRight, ssStart);
- if (ed > edThresh) {
- return -1;
+ int ed = getActualUTF8StringEditDistanceVal(bytesLeft, fsStart, bytesRight, ssStart, edThresh);
+ if (ed > edThresh || ed < 0) {
+ return SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE;
} else {
return ed;
}
}
// checks whether the first string contains a similar string to the second string
- public int UTF8StringEditDistanceContains(byte[] strBytes, int stringStart, byte[] pattenBytes, int patternStart,
- int edThresh) {
+ public int UTF8StringEditDistanceContains(byte[] strBytes, int stringStart, byte[] patternBytes, int patternStart,
+ int edThresh) throws HyracksDataException {
+ leftIt.reset(strBytes, stringStart);
+ rightIt.reset(patternBytes, patternStart);
+ return getSimilarityContains(leftIt, rightIt, edThresh);
+ }
- int stringLen = UTF8StringUtil.getStringLength(strBytes, stringStart);
- int patternLen = UTF8StringUtil.getStringLength(pattenBytes, patternStart);
-
- int stringUTFLen = UTF8StringUtil.getUTFLength(strBytes, stringStart);
- int stringMetaLen = UTF8StringUtil.getNumBytesToStoreLength(stringUTFLen);
-
- int patternUTFLen = UTF8StringUtil.getUTFLength(pattenBytes, patternStart);
- int patternMetaLen = UTF8StringUtil.getNumBytesToStoreLength(patternUTFLen);
-
- // reuse existing matrix if possible
- if (patternLen >= cols) {
- cols = patternLen + 1;
- matrix = new int[rows][cols];
- }
-
- int stringDataStart = stringStart + stringMetaLen;
- int patternDataStart = patternStart + patternMetaLen;
-
- // init matrix
- for (int i = 0; i <= patternLen; i++) {
- matrix[0][i] = i;
- }
-
- int currRow = 1;
- int prevRow = 0;
- int minEd = Integer.MAX_VALUE;
- // expand dynamic programming matrix row by row
- int stringPos = stringDataStart;
- for (int i = 1; i <= stringLen; i++) {
- matrix[currRow][0] = 0;
- char stringChar = Character.toLowerCase(UTF8StringUtil.charAt(strBytes, stringPos));
-
- int patternPos = patternDataStart;
- for (int j = 1; j <= patternLen; j++) {
- char patternChar = Character.toLowerCase(UTF8StringUtil.charAt(pattenBytes, patternPos));
- matrix[currRow][j] = Math.min(Math.min(matrix[prevRow][j] + 1, matrix[currRow][j - 1] + 1),
- matrix[prevRow][j - 1] + (stringChar == patternChar ? 0 : 1));
- patternPos += UTF8StringUtil.charSize(pattenBytes, patternPos);
- if (j == patternLen && matrix[currRow][patternLen] < minEd) {
- minEd = matrix[currRow][patternLen];
- }
- }
-
- stringPos += UTF8StringUtil.charSize(strBytes, stringPos);
- int tmp = currRow;
- currRow = prevRow;
- prevRow = tmp;
- }
- if (minEd > edThresh) {
- return -1;
- } else {
- return minEd;
- }
+ @Override
+ public float computeSimilarity(ISequenceIterator firstSequence, ISequenceIterator secondSequence)
+ throws HyracksDataException {
+ // Passes -1 as the simThresh to calculate the edit distance without applying any calculation optimizations.
+ return computeActualSimilarity(firstSequence, secondSequence, -1);
}
}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
index f4162c7..4a31b8b 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
+++ b/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricJaccard.java
@@ -24,6 +24,7 @@
import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
public class SimilarityMetricJaccard extends SimilarityMetric implements IGenericSimilarityMetric {
@@ -44,24 +45,8 @@
return ((float) setX.size()) / (tokensX.length + tokensY.length - setX.size());
}
- // @Override
- // public float getSimilarity(DataBag tokensX, DataBag tokensY) {
- // return getSimilarity(tokensX, (int) tokensX.size(), tokensY,
- // (int) tokensY.size());
- // }
-
- // @Override
- // public float getSimilarity(DataBag tokensX, int lengthX, DataBag tokensY,
- // int lengthY) {
- // int intersectionSize = SimilarityMetric.getIntersectSize(tokensX,
- // tokensY);
- // int totalSize = lengthX + lengthY;
- //
- // return (float) intersectionSize / (totalSize - intersectionSize);
- // }
-
@Override
- public float getSimilarity(IListIterator tokensX, IListIterator tokensY) throws HyracksDataException {
+ public float computeSimilarity(ISequenceIterator tokensX, ISequenceIterator tokensY) throws HyracksDataException {
int intersectionSize = SimilarityMetric.getIntersectSize(tokensX, tokensY);
int totalSize = tokensX.size() + tokensY.size();
@@ -69,7 +54,7 @@
}
@Override
- public float getSimilarity(IListIterator firstList, IListIterator secondList, float simThresh)
+ public float computeSimilarity(ISequenceIterator firstList, ISequenceIterator secondList, float simThresh)
throws HyracksDataException {
// apply length filter
@@ -81,7 +66,7 @@
return -1f;
}
- float jacc = getSimilarity(firstList, secondList);
+ float jacc = computeSimilarity(firstList, secondList);
if (jacc < simThresh) {
return -1f;
} else {
diff --git a/asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java b/asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java
new file mode 100644
index 0000000..caa80bc
--- /dev/null
+++ b/asterixdb/asterix-fuzzyjoin/src/test/java/org/apache/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistanceTest.java
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.asterix.fuzzyjoin.similarity;
+
+import static org.apache.hyracks.data.std.primitive.UTF8StringPointable.generateUTF8Pointable;
+import static org.junit.Assert.assertEquals;
+
+import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
+import org.junit.Test;
+
+public class SimilarityMetricEditDistanceTest {
+
+ private static final SimilarityMetricEditDistance ed = new SimilarityMetricEditDistance();
+
+ @Test
+ public void test() throws Exception {
+ // For this case, the edit-distance of two strings is 3.
+ UTF8StringPointable leftStrPointable1 = generateUTF8Pointable("coupon not available in store");
+ UTF8StringPointable rightStrPointable1 = generateUTF8Pointable("coupon is available in store");
+
+ // The edit-distance between leftStrPointable1 and the following is 14.
+ UTF8StringPointable rightStrPointable2 = generateUTF8Pointable("coupon in store");
+
+ byte[] leftBytes1 = leftStrPointable1.getByteArray();
+ int leftStartOffset1 = leftStrPointable1.getStartOffset();
+ byte[] rightBytes1 = rightStrPointable1.getByteArray();
+ int rightStartOffset1 = rightStrPointable1.getStartOffset();
+ byte[] rightBytes2 = rightStrPointable2.getByteArray();
+ int rightStartOffset2 = rightStrPointable2.getStartOffset();
+
+ // Case 1 - normal - no early termination
+ int edThresh = 3;
+ int edVal = ed.UTF8StringEditDistance(leftBytes1, leftStartOffset1, rightBytes1, rightStartOffset1, edThresh);
+ assertEquals(edThresh, edVal);
+
+ // Case 2 - the length difference between two strings is greater than edThresh.
+ // Even without calculating the distance, the method should return -1.
+ edVal = ed.UTF8StringEditDistance(leftBytes1, leftStartOffset1, rightBytes2, rightStartOffset2, edThresh);
+ assertEquals(SimilarityMetricEditDistance.SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE, edVal);
+
+ // Case 3 - the edit distance is 14, but the threshold is 1.
+ // The early termination should happen and the returned value should be -1.
+ edThresh = 1;
+ edVal = ed.UTF8StringEditDistance(leftBytes1, leftStartOffset1, rightBytes2, rightStartOffset2, edThresh);
+ assertEquals(SimilarityMetricEditDistance.SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE, edVal);
+
+ // Case 4 - the edit distance is 14, but the threshold is 13.
+ // The early termination will not happen. But, the resulting edit distance is greater than the given threshold.
+ // So, the final returned value should be -1.
+ edThresh = 13;
+ edVal = ed.UTF8StringEditDistance(leftBytes1, leftStartOffset1, rightBytes2, rightStartOffset2, edThresh);
+ assertEquals(SimilarityMetricEditDistance.SIMILARITY_THRESHOLD_NOT_SATISFIED_VALUE, edVal);
+ }
+
+}
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
index b929854..2435047 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/AbstractAsterixListIterator.java
@@ -20,13 +20,13 @@
import org.apache.asterix.common.exceptions.AsterixException;
import org.apache.asterix.formats.nontagged.BinaryComparatorFactoryProvider;
-import org.apache.asterix.fuzzyjoin.similarity.IListIterator;
import org.apache.asterix.om.types.ATypeTag;
import org.apache.asterix.om.types.EnumDeserializer;
import org.apache.hyracks.api.dataflow.value.IBinaryComparator;
import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.data.std.util.ISequenceIterator;
-public abstract class AbstractAsterixListIterator implements IListIterator {
+public abstract class AbstractAsterixListIterator implements ISequenceIterator {
protected byte[] data;
protected int count = 0;
@@ -42,7 +42,7 @@
protected final boolean ignoreCase = true;
@Override
- public int compare(IListIterator cmpIter) throws HyracksDataException {
+ public int compare(ISequenceIterator cmpIter) throws HyracksDataException {
return cmp.compare(data, pos, -1, cmpIter.getData(), cmpIter.getPos(), -1);
}
@@ -100,6 +100,7 @@
}
}
+ @Override
public void reset(byte[] data, int startOff) throws HyracksDataException {
this.data = data;
this.startOff = startOff;
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
index fee34b9..63e9e44 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceCheckEvaluator.java
@@ -21,6 +21,8 @@
import java.io.IOException;
import org.apache.asterix.builders.OrderedListBuilder;
+import org.apache.asterix.common.exceptions.ErrorCode;
+import org.apache.asterix.common.exceptions.RuntimeDataException;
import org.apache.asterix.formats.nontagged.SerializerDeserializerProvider;
import org.apache.asterix.om.base.ABoolean;
import org.apache.asterix.om.functions.BuiltinFunctions;
@@ -77,6 +79,10 @@
try {
edThresh = ATypeHierarchy.getIntegerValue(BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(), 2,
argPtrThreshold.getByteArray(), argPtrThreshold.getStartOffset());
+ if (edThresh < 0) {
+ throw new RuntimeDataException(ErrorCode.NEGATIVE_VALUE, BuiltinFunctions.EDIT_DISTANCE_CHECK.getName(),
+ 3, edThresh);
+ }
editDistance = computeResult(argPtr1, argPtr2, firstTypeTag);
writeResult(editDistance);
} catch (IOException e) {
@@ -101,7 +107,7 @@
case ORDEREDLIST: {
firstOrdListIter.reset(leftBytes, leftStartOffset);
secondOrdListIter.reset(rightBytes, rightStartOffset);
- return (int) ed.getSimilarity(firstOrdListIter, secondOrdListIter, edThresh);
+ return (int) ed.computeSimilarity(firstOrdListIter, secondOrdListIter, edThresh);
}
default: {
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
index c9d3731..dbe99b9 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/EditDistanceEvaluator.java
@@ -105,13 +105,15 @@
switch (argType) {
case STRING: {
- return ed.UTF8StringEditDistance(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
- rightStartOffset + typeIndicatorSize);
+ // Passes -1 as the simThresh to calculate the edit distance
+ // without applying any calculation optimizations.
+ return ed.getActualUTF8StringEditDistanceVal(leftBytes, leftStartOffset + typeIndicatorSize, rightBytes,
+ rightStartOffset + typeIndicatorSize, -1);
}
case ORDEREDLIST: {
firstOrdListIter.reset(leftBytes, leftStartOffset);
secondOrdListIter.reset(rightBytes, rightStartOffset);
- return (int) ed.getSimilarity(firstOrdListIter, secondOrdListIter);
+ return (int) ed.computeSimilarity(firstOrdListIter, secondOrdListIter);
}
default: {
throw new TypeMismatchException(BuiltinFunctions.EDIT_DISTANCE, 0, argType.serialize(),
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
index 19e9395..0decd9e 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedCheckEvaluator.java
@@ -34,6 +34,6 @@
@Override
protected float computeResult() throws HyracksDataException {
- return jaccard.getSimilarity(firstListIter, secondListIter, jaccThresh);
+ return jaccard.computeSimilarity(firstListIter, secondListIter, jaccThresh);
}
}
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
index d40cb67..1cd32c8 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/common/SimilarityJaccardSortedEvaluator.java
@@ -35,6 +35,6 @@
@Override
protected float computeResult() throws HyracksDataException {
- return jaccard.getSimilarity(firstListIter, secondListIter);
+ return jaccard.computeSimilarity(firstListIter, secondListIter);
}
}
diff --git a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IListIterator.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
similarity index 81%
rename from asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IListIterator.java
rename to hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
index 6c3d22e..9441453 100644
--- a/asterixdb/asterix-fuzzyjoin/src/main/java/org/apache/asterix/fuzzyjoin/similarity/IListIterator.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ISequenceIterator.java
@@ -17,12 +17,12 @@
* under the License.
*/
-package org.apache.asterix.fuzzyjoin.similarity;
+package org.apache.hyracks.data.std.util;
import org.apache.hyracks.api.exceptions.HyracksDataException;
-public interface IListIterator {
- public int compare(IListIterator cmpIter) throws HyracksDataException;
+public interface ISequenceIterator {
+ public int compare(ISequenceIterator cmpIter) throws HyracksDataException;
public byte[] getData();
@@ -34,5 +34,7 @@
public void reset() throws HyracksDataException;
+ public void reset(byte[] data, int startOff) throws HyracksDataException;
+
public int size();
}
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
new file mode 100644
index 0000000..237d291
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringCharByCharIterator.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.data.std.util;
+
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.util.string.UTF8StringUtil;
+
+/**
+ * An iterator class for a String. This class iterates a char by char in the given String.
+ */
+public class UTF8StringCharByCharIterator implements ISequenceIterator {
+
+ protected byte[] data;
+ protected int pos = -1;
+ protected int length = -1;
+ protected int utfByteLength = -1;
+ protected int metaLength = -1;
+ protected int startOffset = -1;
+
+ @Override
+ public int compare(ISequenceIterator cmpIter) throws HyracksDataException {
+ char thisChar = Character.toLowerCase(UTF8StringUtil.charAt(data, pos));
+ char thatChar = Character.toLowerCase(UTF8StringUtil.charAt(cmpIter.getData(), cmpIter.getPos()));
+ if (thisChar == thatChar) {
+ return 0;
+ }
+ return -1;
+ }
+
+ @Override
+ public boolean hasNext() {
+ return pos < utfByteLength;
+ }
+
+ @Override
+ public int size() {
+ return length;
+ }
+
+ @Override
+ public byte[] getData() {
+ return data;
+ }
+
+ @Override
+ public int getPos() {
+ return pos;
+ }
+
+ @Override
+ public void next() throws HyracksDataException {
+ pos += UTF8StringUtil.charSize(data, pos);
+ }
+
+ @Override
+ public void reset() throws HyracksDataException {
+ pos = startOffset + metaLength;
+ }
+
+ @Override
+ public void reset(byte[] data, int startOff) throws HyracksDataException {
+ this.data = data;
+ this.startOffset = startOff;
+ this.length = UTF8StringUtil.getStringLength(data, startOffset);
+ this.utfByteLength = UTF8StringUtil.getUTFLength(data, startOffset);
+ this.metaLength = UTF8StringUtil.getNumBytesToStoreLength(utfByteLength);
+ reset();
+ }
+
+}