[ASTERIXDB-2762] Count code points in string length()
This commit makes the string length() built-in function count the
number of code points instead of the number of Java chars in a string.
Change-Id: I3ff25840adc94b4a688c53a06816d5934c6418ad
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/7304
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Dmitry Lychagin <dmitry.lychagin@couchbase.com>
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/length_multi_code_point_01/length_multi_code_point_01.1.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/length_multi_code_point_01/length_multi_code_point_01.1.query.sqlpp
new file mode 100644
index 0000000..082b827
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/length_multi_code_point_01/length_multi_code_point_01.1.query.sqlpp
@@ -0,0 +1,5 @@
+[
+ length("👩‍👩‍👧‍👦"),
+ length("👩‍👩‍👧‍👦🇨🇳"),
+ length("한글")
+];
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/length_multi_code_point_01/length_multi_code_point_01.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/length_multi_code_point_01/length_multi_code_point_01.1.adm
new file mode 100644
index 0000000..721a7e1
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/length_multi_code_point_01/length_multi_code_point_01.1.adm
@@ -0,0 +1 @@
+[ 7, 9, 6 ]
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
index 7e389dc..4fd8b31 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
@@ -9381,6 +9381,11 @@
</compilation-unit>
</test-case>
<test-case FilePath="string">
+ <compilation-unit name="length_multi_code_point_01">
+ <output-dir compare="Text">length_multi_code_point_01</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="string">
<compilation-unit name="like_01">
<output-dir compare="Text">like_01</output-dir>
</compilation-unit>
diff --git a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
index b7a1aca..86695bc 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
@@ -132,6 +132,8 @@
length(string)
* Returns the length of the string `string`.
+ Note that the length is measured in Unicode code points.
+ See the following examples for more details.
* Arguments:
* `string` : a `string` or `null` that represents the string to be checked.
* Return Value:
@@ -144,11 +146,18 @@
length("test string");
-
* The expected result is:
11
+ * Example:
+
+ length("👩‍👩‍👧‍👦");
+
+ * The expected result is (the emoji character 👩‍👩‍👧‍👦 has 7 code points):
+
+ 7
+
### lower ###
* Syntax:
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
index 72b6a47..a4fd667 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
@@ -91,7 +91,7 @@
ATypeTag.STRING);
return;
}
- int len = UTF8StringUtil.getUTFLength(serString, offset + 1);
+ int len = UTF8StringUtil.getNumCodePoint(serString, offset + 1);
result.setValue(len);
int64Serde.serialize(result, out);
resultPointable.set(resultStorage);
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index c3ee97c..ebc1301 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -181,6 +181,7 @@
}
}
+ // The result is the number of Java chars (16-bit UTF-16 code units) in the string
public static int getStringLength(byte[] b, int s) {
int len = getUTFLength(b, s);
int pos = s + getNumBytesToStoreLength(len);
@@ -193,6 +194,36 @@
return charCount;
}
+    /**
+     * Counts the Unicode code points in the length-prefixed string at b[s]; a surrogate pair counts as one.
+     * @throws IllegalArgumentException if a surrogate is unpaired, including a high surrogate at the very end
+     */
+    public static int getNumCodePoint(byte[] b, int s) {
+        int len = getUTFLength(b, s);
+        int pos = s + getNumBytesToStoreLength(len);
+        int end = pos + len;
+        int codePointCount = 0;
+        while (pos < end) {
+            char ch = charAt(b, pos);
+            if (Character.isLowSurrogate(ch)) {
+                throw new IllegalArgumentException(
+                        "Decoding error: get a low surrogate without a leading high surrogate when counting number of code points");
+            }
+            pos += charSize(b, pos);
+            if (Character.isHighSurrogate(ch)) {
+                // Check the bound before decoding: a trailing high surrogate must not cause a read past the end
+                if (pos >= end || !Character.isLowSurrogate(charAt(b, pos))) {
+                    throw new IllegalArgumentException(
+                            "Decoding error: get a high surrogate without a following low surrogate when counting number of code points");
+                }
+                pos += charSize(b, pos);
+            }
+            codePointCount++;
+        }
+
+        return codePointCount;
+    }
+
public static int getUTFLength(byte[] b, int s) {
return VarLenIntEncoderDecoder.decode(b, s);
}
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
index 2c99104..c7468d2 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
@@ -30,6 +30,7 @@
import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo;
import static org.apache.hyracks.util.string.UTF8StringUtil.getModifiedUTF8Len;
import static org.apache.hyracks.util.string.UTF8StringUtil.getNumBytesToStoreLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getNumCodePoint;
import static org.apache.hyracks.util.string.UTF8StringUtil.getStringLength;
import static org.apache.hyracks.util.string.UTF8StringUtil.getUTF8StringInArray;
import static org.apache.hyracks.util.string.UTF8StringUtil.getUTFLength;
@@ -176,4 +177,13 @@
assertEquals("23333", getUTF8StringInArray(bytes, 52, 5));
}
+    @Test
+    public void testGetNumCodePoint() {
+        // Family emoji: four surrogate pairs joined by three U+200D joiners, i.e. 7 code points.
+        String str = "\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66";
+        assertEquals(7, getNumCodePoint(writeStringToBytes(str), 0));
+        // The same emoji followed by two more surrogate pairs (U+1F1E8, U+1F1F3), i.e. 9 code points.
+        str = "\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66\uD83C\uDDE8\uD83C\uDDF3";
+        assertEquals(9, getNumCodePoint(writeStringToBytes(str), 0));
+    }
}