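Bump asterix-fuzzyjoin snapshot version to 0.8.4-SNAPSHOT; fix issue 703: the length filter in UTF8StringEditDistance now uses getStrLen instead of getUTFLen, and a Unicode edit-distance-check test is added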
diff --git a/asterix-app/src/test/resources/runtimets/queries/similarity/edit-distance-check_unicode/edit-distance-check_unicode.1.ddl.aql b/asterix-app/src/test/resources/runtimets/queries/similarity/edit-distance-check_unicode/edit-distance-check_unicode.1.ddl.aql
new file mode 100644
index 0000000..754ea81
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/similarity/edit-distance-check_unicode/edit-distance-check_unicode.1.ddl.aql
@@ -0,0 +1,3 @@
+drop dataverse test if exists;
+create dataverse test;
+
diff --git a/asterix-app/src/test/resources/runtimets/queries/similarity/edit-distance-check_unicode/edit-distance-check_unicode.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/similarity/edit-distance-check_unicode/edit-distance-check_unicode.2.update.aql
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/similarity/edit-distance-check_unicode/edit-distance-check_unicode.2.update.aql
diff --git a/asterix-app/src/test/resources/runtimets/queries/similarity/edit-distance-check_unicode/edit-distance-check_unicode.3.query.aql b/asterix-app/src/test/resources/runtimets/queries/similarity/edit-distance-check_unicode/edit-distance-check_unicode.3.query.aql
new file mode 100644
index 0000000..ee84ba4
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/similarity/edit-distance-check_unicode/edit-distance-check_unicode.3.query.aql
@@ -0,0 +1,15 @@
+use dataverse test;
+
+let $a := "사랑"
+let $b := "사랑해"
+let $c := "사과"
+
+let $results :=
+[
+ edit-distance-check($a, $b, 1), // TRUE
+ edit-distance-check($b, $a, 1), // TRUE
+ edit-distance-check($b, $c, 1), // FALSE
+ edit-distance-check($c, $b, 2) // TRUE
+]
+for $i in $results
+return $i
diff --git a/asterix-app/src/test/resources/runtimets/results/similarity/edit-distance-check_unicode/edit-distance-check_unicode.1.adm b/asterix-app/src/test/resources/runtimets/results/similarity/edit-distance-check_unicode/edit-distance-check_unicode.1.adm
new file mode 100644
index 0000000..56de037
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/results/similarity/edit-distance-check_unicode/edit-distance-check_unicode.1.adm
@@ -0,0 +1,4 @@
+[ true, 1 ]
+[ true, 1 ]
+[ false, 2147483647 ]
+[ true, 2 ]
diff --git a/asterix-app/src/test/resources/runtimets/testsuite.xml b/asterix-app/src/test/resources/runtimets/testsuite.xml
index d7b4c75..067fddb 100644
--- a/asterix-app/src/test/resources/runtimets/testsuite.xml
+++ b/asterix-app/src/test/resources/runtimets/testsuite.xml
@@ -3160,6 +3160,11 @@
</compilation-unit>
</test-case>
<test-case FilePath="similarity">
+ <compilation-unit name="edit-distance-check_unicode">
+ <output-dir compare="Text">edit-distance-check_unicode</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="similarity">
<compilation-unit name="edit-distance-list-is-filterable">
<output-dir compare="Text">edit-distance-list-is-filterable</output-dir>
</compilation-unit>
diff --git a/asterix-fuzzyjoin/pom.xml b/asterix-fuzzyjoin/pom.xml
index 42dd773..95ecfd8 100644
--- a/asterix-fuzzyjoin/pom.xml
+++ b/asterix-fuzzyjoin/pom.xml
@@ -3,11 +3,11 @@
<parent>
<artifactId>asterix</artifactId>
<groupId>edu.uci.ics.asterix</groupId>
- <version>0.8.1-SNAPSHOT</version>
+ <version>0.8.4-SNAPSHOT</version>
</parent>
<groupId>edu.uci.ics.asterix</groupId>
<artifactId>asterix-fuzzyjoin</artifactId>
- <version>0.8.1-SNAPSHOT</version>
+ <version>0.8.4-SNAPSHOT</version>
<build>
<plugins>
diff --git a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
index b99d6f7..247bbd0 100644
--- a/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
+++ b/asterix-fuzzyjoin/src/main/java/edu/uci/ics/asterix/fuzzyjoin/similarity/SimilarityMetricEditDistance.java
@@ -155,11 +155,11 @@
public int UTF8StringEditDistance(byte[] bytes, int fsStart, int ssStart, int edThresh) {
- int fsUtfLen = StringUtils.getUTFLen(bytes, fsStart);
- int ssUtfLen = StringUtils.getUTFLen(bytes, ssStart);
+ int fsStrLen = StringUtils.getStrLen(bytes, fsStart);
+ int ssStrLen = StringUtils.getStrLen(bytes, ssStart);
// length filter
- if (Math.abs(fsUtfLen - ssUtfLen) > edThresh) {
+ if (Math.abs(fsStrLen - ssStrLen) > edThresh) {
return -1;
}
@@ -169,7 +169,7 @@
// compute letter counts for first string
int fsPos = fsStart + utf8SizeIndicatorSize;
- int fsEnd = fsPos + fsUtfLen;
+ int fsEnd = fsPos + StringUtils.getUTFLen(bytes, fsStart);;
while (fsPos < fsEnd) {
char c = StringUtils.toLowerCase(StringUtils.charAt(bytes, fsPos));
if (c < 128) {
@@ -180,7 +180,7 @@
// compute letter counts for second string
int ssPos = ssStart + utf8SizeIndicatorSize;
- int ssEnd = ssPos + ssUtfLen;
+ int ssEnd = ssPos + StringUtils.getUTFLen(bytes, ssStart);
while (ssPos < ssEnd) {
char c = StringUtils.toLowerCase(StringUtils.charAt(bytes, ssPos));
if (c < 128) {
diff --git a/asterix-runtime/pom.xml b/asterix-runtime/pom.xml
index 06ac7fa..8e29c7d 100644
--- a/asterix-runtime/pom.xml
+++ b/asterix-runtime/pom.xml
@@ -139,7 +139,7 @@
<dependency>
<groupId>edu.uci.ics.asterix</groupId>
<artifactId>asterix-fuzzyjoin</artifactId>
- <version>0.8.1-SNAPSHOT</version>
+ <version>0.8.4-SNAPSHOT</version>
<scope>compile</scope>
</dependency>
<dependency>