[ASTERIXDB-2762] Count code points in string length()

This commit makes the built-in string length() function count the
number of code points instead of the number of Java chars in a string.

Change-Id: I3ff25840adc94b4a688c53a06816d5934c6418ad
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/7304
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Dmitry Lychagin <dmitry.lychagin@couchbase.com>
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/length_multi_code_point_01/length_multi_code_point_01.1.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/length_multi_code_point_01/length_multi_code_point_01.1.query.sqlpp
new file mode 100644
index 0000000..082b827
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/length_multi_code_point_01/length_multi_code_point_01.1.query.sqlpp
@@ -0,0 +1,5 @@
+[
+    length("👩‍👩‍👧‍👦"),
+    length("👩‍👩‍👧‍👦🇨🇳"),
+    length("한글")
+];
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/length_multi_code_point_01/length_multi_code_point_01.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/length_multi_code_point_01/length_multi_code_point_01.1.adm
new file mode 100644
index 0000000..721a7e1
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/length_multi_code_point_01/length_multi_code_point_01.1.adm
@@ -0,0 +1 @@
+[ 7, 9, 6 ]
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
index 7e389dc..4fd8b31 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
@@ -9381,6 +9381,11 @@
       </compilation-unit>
     </test-case>
     <test-case FilePath="string">
+      <compilation-unit name="length_multi_code_point_01">
+        <output-dir compare="Text">length_multi_code_point_01</output-dir>
+      </compilation-unit>
+    </test-case>
+    <test-case FilePath="string">
       <compilation-unit name="like_01">
         <output-dir compare="Text">like_01</output-dir>
       </compilation-unit>
diff --git a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
index b7a1aca..86695bc 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
@@ -132,6 +132,8 @@
         length(string)
 
  * Returns the length of the string `string`.
+ Note that the length is measured in Unicode code points.
+ See the following examples for more details.
  * Arguments:
     * `string` : a `string` or `null` that represents the string to be checked.
  * Return Value:
@@ -144,11 +146,18 @@
 
         length("test string");
 
-
  * The expected result is:
 
         11
 
+ * Example:
+
+        length("👩‍👩‍👧‍👦");
+
+ * The expected result is (the emoji character 👩‍👩‍👧‍👦 has 7 code points):
+
+        7
+
 
 ### lower ###
  * Syntax:
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
index 72b6a47..a4fd667 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
@@ -91,7 +91,7 @@
                                         ATypeTag.STRING);
                                 return;
                             }
-                            int len = UTF8StringUtil.getUTFLength(serString, offset + 1);
+                            int len = UTF8StringUtil.getNumCodePoint(serString, offset + 1);
                             result.setValue(len);
                             int64Serde.serialize(result, out);
                             resultPointable.set(resultStorage);
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index c3ee97c..ebc1301 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -181,6 +181,7 @@
         }
     }
 
+    // The result is the number of Java chars (16-bit UTF-16 code units) in the string
     public static int getStringLength(byte[] b, int s) {
         int len = getUTFLength(b, s);
         int pos = s + getNumBytesToStoreLength(len);
@@ -193,6 +194,36 @@
         return charCount;
     }
 
+    /**
+     * Returns the number of Unicode code points in the length-prefixed string at {@code b[s]};
+     * a valid high/low surrogate pair counts as one code point.
+     * @throws IllegalArgumentException if a surrogate char is unpaired or the string ends mid-pair
+     */
+    public static int getNumCodePoint(byte[] b, int s) {
+        int len = getUTFLength(b, s);
+        int pos = s + getNumBytesToStoreLength(len);
+        int end = pos + len;
+        int codePointCount = 0;
+        while (pos < end) {
+            char ch = charAt(b, pos);
+            if (Character.isLowSurrogate(ch)) {
+                throw new IllegalArgumentException(
+                        "Decoding error: get a low surrogate without a leading high surrogate when counting number of code points");
+            }
+            pos += charSize(b, pos);
+            // A high surrogate must be followed by a low surrogate that still lies inside [pos, end)
+            if (Character.isHighSurrogate(ch)) {
+                if (pos >= end || !Character.isLowSurrogate(charAt(b, pos))) {
+                    throw new IllegalArgumentException(
+                            "Decoding error: get a high surrogate without a following low surrogate when counting number of code points");
+                }
+                pos += charSize(b, pos);
+            }
+            codePointCount++;
+        }
+        return codePointCount;
+    }
+
     public static int getUTFLength(byte[] b, int s) {
         return VarLenIntEncoderDecoder.decode(b, s);
     }
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
index 2c99104..c7468d2 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
@@ -30,6 +30,7 @@
 import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo;
 import static org.apache.hyracks.util.string.UTF8StringUtil.getModifiedUTF8Len;
 import static org.apache.hyracks.util.string.UTF8StringUtil.getNumBytesToStoreLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getNumCodePoint;
 import static org.apache.hyracks.util.string.UTF8StringUtil.getStringLength;
 import static org.apache.hyracks.util.string.UTF8StringUtil.getUTF8StringInArray;
 import static org.apache.hyracks.util.string.UTF8StringUtil.getUTFLength;
@@ -176,4 +177,13 @@
         assertEquals("23333", getUTF8StringInArray(bytes, 52, 5));
     }
 
+    @Test
+    public void testGetNumCodePoint() {
+        // 4 surrogate-pair emoji joined by 3 zero-width joiners (U+200D) = 7 code points; 2 more pairs appended = 9
+        String str = "\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66";
+        assertEquals(7, getNumCodePoint(writeStringToBytes(str), 0));
+        str = "\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66\uD83C\uDDE8\uD83C\uDDF3";
+        assertEquals(9, getNumCodePoint(writeStringToBytes(str), 0));
+    }
+
 }