[ASTERIXDB-2762] Fix string_to_codepoint() and codepoint_to_string()
This commit fixes bugs in the two functions.
Previously, they did not handle surrogate-pair characters correctly
(characters that take 4 bytes, i.e. 2 Java chars, in UTF-16 instead of
2 bytes, i.e. 1 Java char).
The code point of such a character was returned as a pair of integers
(one per Java char of the surrogate pair) instead of the expected single integer.
Change-Id: I93563b90e8d4f77886e1cb3ed67519fd0968c95d
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/7306
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Dmitry Lychagin <dmitry.lychagin@couchbase.com>
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/string_to_codepoint_multi_codepoints_01/string_to_codepoint_multi_codepoints_01.1.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/string_to_codepoint_multi_codepoints_01/string_to_codepoint_multi_codepoints_01.1.query.sqlpp
new file mode 100644
index 0000000..4261cda
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/string_to_codepoint_multi_codepoints_01/string_to_codepoint_multi_codepoints_01.1.query.sqlpp
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+[
+ string_to_codepoint("👩👩👧👦"),
+ codepoint_to_string([ 128105, 8205, 128105, 8205, 128103, 8205, 128102 ])
+];
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/string_to_codepoint_multi_codepoints_01/string_to_codepoint_multi_codepoints_01.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/string_to_codepoint_multi_codepoints_01/string_to_codepoint_multi_codepoints_01.1.adm
new file mode 100644
index 0000000..2e6f72f
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/string_to_codepoint_multi_codepoints_01/string_to_codepoint_multi_codepoints_01.1.adm
@@ -0,0 +1 @@
+[ [ 128105, 8205, 128105, 8205, 128103, 8205, 128102 ], "👩👩👧👦" ]
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
index ad6bbe5..e04f341 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
@@ -9843,6 +9843,11 @@
</compilation-unit>
</test-case>
<test-case FilePath="string">
+ <compilation-unit name="string_to_codepoint_multi_codepoints_01">
+ <output-dir compare="Text">string_to_codepoint_multi_codepoints_01</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="string">
<compilation-unit name="strlen02">
<output-dir compare="Text">strlen02</output-dir>
</compilation-unit>
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/CodePointToStringDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/CodePointToStringDescriptor.java
index fcfab72..3147a9c 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/CodePointToStringDescriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/CodePointToStringDescriptor.java
@@ -70,6 +70,7 @@
private final byte[] currentUTF8 = new byte[6];
private final byte[] tempStoreForLength = new byte[5];
private final FunctionIdentifier fid = getIdentifier();
+ private final char[] tempCharPair = new char[2];
@Override
public void evaluate(IFrameTupleReference tuple, IPointable result) throws HyracksDataException {
@@ -121,7 +122,7 @@
if (!returnNull) {
int codePoint = ATypeHierarchy.getIntegerValueWithDifferentTypeTagPosition(
fid.getName(), 0, serOrderedList, itemOffset, itemTagPosition);
- utf_8_len += UTF8StringUtil.codePointToUTF8(codePoint, currentUTF8);
+ utf_8_len += UTF8StringUtil.codePointToUTF8(codePoint, tempCharPair, currentUTF8);
}
}
if (returnNull) {
@@ -142,7 +143,7 @@
}
int codePoint = ATypeHierarchy.getIntegerValueWithDifferentTypeTagPosition(
fid.getName(), 0, serOrderedList, itemOffset, itemTagPosition);
- utf_8_len = UTF8StringUtil.codePointToUTF8(codePoint, currentUTF8);
+ utf_8_len = UTF8StringUtil.codePointToUTF8(codePoint, tempCharPair, currentUTF8);
for (int j = 0; j < utf_8_len; j++) {
out.writeByte(currentUTF8[j]);
}
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java
index e4fb37b..8d03352 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java
@@ -105,8 +105,8 @@
int pos = 0;
listBuilder.reset(intListType);
while (pos < len) {
- int codePoint = UTF8StringUtil.UTF8ToCodePoint(serString, start + pos);
- pos += UTF8StringUtil.charSize(serString, start + pos);
+ int codePoint = UTF8StringUtil.codePointAt(serString, start + pos);
+ pos += UTF8StringUtil.codePointSize(serString, start + pos);
inputVal.reset();
aInt64.setValue(codePoint);
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index d2dfc23..f96ed72 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -224,68 +224,14 @@
return VarLenIntEncoderDecoder.getBytesRequired(strlen);
}
- public static int UTF8ToCodePoint(byte[] b, int s) {
- if (b[s] >> 7 == 0) {
- // 1 byte
- return b[s];
- } else if ((b[s] & 0xe0) == 0xc0) { /*0xe0 = 0b1110000*/
- // 2 bytes
- return (b[s] & 0x1f) << 6 | /*0x3f = 0b00111111*/
- (b[s + 1] & 0x3f);
- } else if ((b[s] & 0xf0) == 0xe0) {
- // 3bytes
- return (b[s] & 0xf) << 12 | (b[s + 1] & 0x3f) << 6 | (b[s + 2] & 0x3f);
- } else if ((b[s] & 0xf8) == 0xf0) {
- // 4bytes
- return (b[s] & 0x7) << 18 | (b[s + 1] & 0x3f) << 12 | (b[s + 2] & 0x3f) << 6 | (b[s + 3] & 0x3f);
- } else if ((b[s] & 0xfc) == 0xf8) {
- // 5bytes
- return (b[s] & 0x3) << 24 | (b[s + 1] & 0x3f) << 18 | (b[s + 2] & 0x3f) << 12 | (b[s + 3] & 0x3f) << 6
- | (b[s + 4] & 0x3f);
- } else if ((b[s] & 0xfe) == 0xfc) {
- // 6bytes
- return (b[s] & 0x1) << 30 | (b[s + 1] & 0x3f) << 24 | (b[s + 2] & 0x3f) << 18 | (b[s + 3] & 0x3f) << 12
- | (b[s + 4] & 0x3f) << 6 | (b[s + 5] & 0x3f);
+ public static int codePointToUTF8(int codePoint, char[] tempChars, byte[] outputUTF8) {
+ int len = 0;
+ int numChars = Character.toChars(codePoint, tempChars, 0);
+ for (int i = 0; i < numChars; i++) {
+ len += writeToBytes(outputUTF8, len, tempChars[i]);
}
- return 0;
- }
- public static int codePointToUTF8(int c, byte[] outputUTF8) {
- if (c < 0x80) {
- outputUTF8[0] = (byte) (c & 0x7F /* mask 7 lsb: 0b1111111 */);
- return 1;
- } else if (c < 0x0800) {
- outputUTF8[0] = (byte) (c >> 6 & 0x1F | 0xC0);
- outputUTF8[1] = (byte) (c & 0x3F | 0x80);
- return 2;
- } else if (c < 0x010000) {
- outputUTF8[0] = (byte) (c >> 12 & 0x0F | 0xE0);
- outputUTF8[1] = (byte) (c >> 6 & 0x3F | 0x80);
- outputUTF8[2] = (byte) (c & 0x3F | 0x80);
- return 3;
- } else if (c < 0x200000) {
- outputUTF8[0] = (byte) (c >> 18 & 0x07 | 0xF0);
- outputUTF8[1] = (byte) (c >> 12 & 0x3F | 0x80);
- outputUTF8[2] = (byte) (c >> 6 & 0x3F | 0x80);
- outputUTF8[3] = (byte) (c & 0x3F | 0x80);
- return 4;
- } else if (c < 0x4000000) {
- outputUTF8[0] = (byte) (c >> 24 & 0x03 | 0xF8);
- outputUTF8[1] = (byte) (c >> 18 & 0x3F | 0x80);
- outputUTF8[2] = (byte) (c >> 12 & 0x3F | 0x80);
- outputUTF8[3] = (byte) (c >> 6 & 0x3F | 0x80);
- outputUTF8[4] = (byte) (c & 0x3F | 0x80);
- return 5;
- } else if (c < 0x80000000) {
- outputUTF8[0] = (byte) (c >> 30 & 0x01 | 0xFC);
- outputUTF8[1] = (byte) (c >> 24 & 0x3F | 0x80);
- outputUTF8[2] = (byte) (c >> 18 & 0x3F | 0x80);
- outputUTF8[3] = (byte) (c >> 12 & 0x3F | 0x80);
- outputUTF8[4] = (byte) (c >> 6 & 0x3F | 0x80);
- outputUTF8[5] = (byte) (c & 0x3F | 0x80);
- return 6;
- }
- return 0;
+ return len;
}
/**