[ASTERIXDB-2762] Fix str_to_codepoint() and codepoint_to_str() This commit fixes bugs in the two functions. Previously, they did not work correctly for surrogate-pair characters (those that take 4 bytes, i.e. 2 Java chars, in UTF-16 instead of 2 bytes, i.e. 1 Java char). The code point of such a character was returned as a pair of integers (one for each of the two Java chars in the encoding) instead of a single integer, which was not the expected result. Change-Id: I93563b90e8d4f77886e1cb3ed67519fd0968c95d Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/7306 Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu> Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu> Reviewed-by: Dmitry Lychagin <dmitry.lychagin@couchbase.com>

commit: c961938ea6187350b39f1bf432b6e7b124299524 [log] [tgz]
author: Rui Guo <ruig2@uci.edu> Tue Jul 28 12:40:09 2020 -0700
committer: Dmitry Lychagin <dmitry.lychagin@couchbase.com> Tue Jul 28 23:34:20 2020 +0000
tree: f954b3bd8a27655ada5f0b1c4dfcb3c2ccb9a352
parent: ba678bc51772b5b332dcd75fc08085fda9d299af [diff]
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/string_to_codepoint_multi_codepoints_01/string_to_codepoint_multi_codepoints_01.1.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/string_to_codepoint_multi_codepoints_01/string_to_codepoint_multi_codepoints_01.1.query.sqlpp
new file mode 100644
index 0000000..4261cda
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/string_to_codepoint_multi_codepoints_01/string_to_codepoint_multi_codepoints_01.1.query.sqlpp

@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+[
+    string_to_codepoint("👩‍👩‍👧‍👦"),
+    codepoint_to_string([ 128105, 8205, 128105, 8205, 128103, 8205, 128102 ])
+];
\ No newline at end of file

diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/string_to_codepoint_multi_codepoints_01/string_to_codepoint_multi_codepoints_01.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/string_to_codepoint_multi_codepoints_01/string_to_codepoint_multi_codepoints_01.1.adm
new file mode 100644
index 0000000..2e6f72f
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/string_to_codepoint_multi_codepoints_01/string_to_codepoint_multi_codepoints_01.1.adm

@@ -0,0 +1 @@
+[ [ 128105, 8205, 128105, 8205, 128103, 8205, 128102 ], "👩‍👩‍👧‍👦" ]

diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
index ad6bbe5..e04f341 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml

@@ -9843,6 +9843,11 @@
       </compilation-unit>
     </test-case>
     <test-case FilePath="string">
+      <compilation-unit name="string_to_codepoint_multi_codepoints_01">
+        <output-dir compare="Text">string_to_codepoint_multi_codepoints_01</output-dir>
+      </compilation-unit>
+    </test-case>
+    <test-case FilePath="string">
       <compilation-unit name="strlen02">
         <output-dir compare="Text">strlen02</output-dir>
       </compilation-unit>

diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/CodePointToStringDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/CodePointToStringDescriptor.java
index fcfab72..3147a9c 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/CodePointToStringDescriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/CodePointToStringDescriptor.java

@@ -70,6 +70,7 @@
                     private final byte[] currentUTF8 = new byte[6];
                     private final byte[] tempStoreForLength = new byte[5];
                     private final FunctionIdentifier fid = getIdentifier();
+                    private final char[] tempCharPair = new char[2];
 
                     @Override
                     public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
@@ -121,7 +122,7 @@
                                 if (!returnNull) {
                                     int codePoint = ATypeHierarchy.getIntegerValueWithDifferentTypeTagPosition(
                                             fid.getName(), 0, serOrderedList, itemOffset, itemTagPosition);
-                                    utf_8_len += UTF8StringUtil.codePointToUTF8(codePoint, currentUTF8);
+                                    utf_8_len += UTF8StringUtil.codePointToUTF8(codePoint, tempCharPair, currentUTF8);
                                 }
                             }
                             if (returnNull) {
@@ -142,7 +143,7 @@
                                 }
                                 int codePoint = ATypeHierarchy.getIntegerValueWithDifferentTypeTagPosition(
                                         fid.getName(), 0, serOrderedList, itemOffset, itemTagPosition);
-                                utf_8_len = UTF8StringUtil.codePointToUTF8(codePoint, currentUTF8);
+                                utf_8_len = UTF8StringUtil.codePointToUTF8(codePoint, tempCharPair, currentUTF8);
                                 for (int j = 0; j < utf_8_len; j++) {
                                     out.writeByte(currentUTF8[j]);
                                 }

diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java
index e4fb37b..8d03352 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java

@@ -105,8 +105,8 @@
                             int pos = 0;
                             listBuilder.reset(intListType);
                             while (pos < len) {
-                                int codePoint = UTF8StringUtil.UTF8ToCodePoint(serString, start + pos);
-                                pos += UTF8StringUtil.charSize(serString, start + pos);
+                                int codePoint = UTF8StringUtil.codePointAt(serString, start + pos);
+                                pos += UTF8StringUtil.codePointSize(serString, start + pos);
 
                                 inputVal.reset();
                                 aInt64.setValue(codePoint);

diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index d2dfc23..f96ed72 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java

@@ -224,68 +224,14 @@
         return VarLenIntEncoderDecoder.getBytesRequired(strlen);
     }
 
-    public static int UTF8ToCodePoint(byte[] b, int s) {
-        if (b[s] >> 7 == 0) {
-            // 1 byte
-            return b[s];
-        } else if ((b[s] & 0xe0) == 0xc0) { /*0xe0 = 0b1110000*/
-            // 2 bytes
-            return (b[s] & 0x1f) << 6 | /*0x3f = 0b00111111*/
-                    (b[s + 1] & 0x3f);
-        } else if ((b[s] & 0xf0) == 0xe0) {
-            // 3bytes
-            return (b[s] & 0xf) << 12 | (b[s + 1] & 0x3f) << 6 | (b[s + 2] & 0x3f);
-        } else if ((b[s] & 0xf8) == 0xf0) {
-            // 4bytes
-            return (b[s] & 0x7) << 18 | (b[s + 1] & 0x3f) << 12 | (b[s + 2] & 0x3f) << 6 | (b[s + 3] & 0x3f);
-        } else if ((b[s] & 0xfc) == 0xf8) {
-            // 5bytes
-            return (b[s] & 0x3) << 24 | (b[s + 1] & 0x3f) << 18 | (b[s + 2] & 0x3f) << 12 | (b[s + 3] & 0x3f) << 6
-                    | (b[s + 4] & 0x3f);
-        } else if ((b[s] & 0xfe) == 0xfc) {
-            // 6bytes
-            return (b[s] & 0x1) << 30 | (b[s + 1] & 0x3f) << 24 | (b[s + 2] & 0x3f) << 18 | (b[s + 3] & 0x3f) << 12
-                    | (b[s + 4] & 0x3f) << 6 | (b[s + 5] & 0x3f);
+    public static int codePointToUTF8(int codePoint, char[] tempChars, byte[] outputUTF8) {
+        int len = 0;
+        int numChars = Character.toChars(codePoint, tempChars, 0);
+        for (int i = 0; i < numChars; i++) {
+            len += writeToBytes(outputUTF8, len, tempChars[i]);
         }
-        return 0;
-    }
 
-    public static int codePointToUTF8(int c, byte[] outputUTF8) {
-        if (c < 0x80) {
-            outputUTF8[0] = (byte) (c & 0x7F /* mask 7 lsb: 0b1111111 */);
-            return 1;
-        } else if (c < 0x0800) {
-            outputUTF8[0] = (byte) (c >> 6 & 0x1F | 0xC0);
-            outputUTF8[1] = (byte) (c & 0x3F | 0x80);
-            return 2;
-        } else if (c < 0x010000) {
-            outputUTF8[0] = (byte) (c >> 12 & 0x0F | 0xE0);
-            outputUTF8[1] = (byte) (c >> 6 & 0x3F | 0x80);
-            outputUTF8[2] = (byte) (c & 0x3F | 0x80);
-            return 3;
-        } else if (c < 0x200000) {
-            outputUTF8[0] = (byte) (c >> 18 & 0x07 | 0xF0);
-            outputUTF8[1] = (byte) (c >> 12 & 0x3F | 0x80);
-            outputUTF8[2] = (byte) (c >> 6 & 0x3F | 0x80);
-            outputUTF8[3] = (byte) (c & 0x3F | 0x80);
-            return 4;
-        } else if (c < 0x4000000) {
-            outputUTF8[0] = (byte) (c >> 24 & 0x03 | 0xF8);
-            outputUTF8[1] = (byte) (c >> 18 & 0x3F | 0x80);
-            outputUTF8[2] = (byte) (c >> 12 & 0x3F | 0x80);
-            outputUTF8[3] = (byte) (c >> 6 & 0x3F | 0x80);
-            outputUTF8[4] = (byte) (c & 0x3F | 0x80);
-            return 5;
-        } else if (c < 0x80000000) {
-            outputUTF8[0] = (byte) (c >> 30 & 0x01 | 0xFC);
-            outputUTF8[1] = (byte) (c >> 24 & 0x3F | 0x80);
-            outputUTF8[2] = (byte) (c >> 18 & 0x3F | 0x80);
-            outputUTF8[3] = (byte) (c >> 12 & 0x3F | 0x80);
-            outputUTF8[4] = (byte) (c >> 6 & 0x3F | 0x80);
-            outputUTF8[5] = (byte) (c & 0x3F | 0x80);
-            return 6;
-        }
-        return 0;
+        return len;
     }
 
     /**
commit	c961938ea6187350b39f1bf432b6e7b124299524	[log] [tgz]
author	Rui Guo <ruig2@uci.edu>	Tue Jul 28 12:40:09 2020 -0700
committer	Dmitry Lychagin <dmitry.lychagin@couchbase.com>	Tue Jul 28 23:34:20 2020 +0000
tree	f954b3bd8a27655ada5f0b1c4dfcb3c2ccb9a352
parent	ba678bc51772b5b332dcd75fc08085fda9d299af [diff]