[ASTERIXDB-2303][API] Fix Supplementary Chars Printing
- user model changes: no
- storage format changes: no
- interface changes: no
Details:
- Properly print supplementary chars as UTF-8
by converting their Java surrogate pairs to a string.
- Add test case.
Change-Id: I59e825c11ff750d5b651fb86712023c52e98367e
Reviewed-on: https://asterix-gerrit.ics.uci.edu/2429
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Contrib: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Michael Blow <mblow@apache.org>
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/utf8/utf8.1.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/utf8/utf8.1.query.sqlpp
new file mode 100644
index 0000000..88909ef
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/utf8/utf8.1.query.sqlpp
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+select value "\uD83D\uDE22\uD83D\uDE22\uD83D\uDC89\uD83D\uDC89 = 😢😢💉💉. Coffee ☕‼️😃. حسنا";
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/utf8/utf8.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/utf8/utf8.1.adm
new file mode 100644
index 0000000..89c6334
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/utf8/utf8.1.adm
@@ -0,0 +1 @@
+"😢😢💉💉 = 😢😢💉💉. Coffee ☕‼️😃. حسنا"
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
index 4265163..9fc0b4b 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
@@ -6445,6 +6445,11 @@
<output-dir compare="Text">varlen-encoding</output-dir>
</compilation-unit>
</test-case>
+ <test-case FilePath="string">
+ <compilation-unit name="utf8">
+ <output-dir compare="Text">utf8</output-dir>
+ </compilation-unit>
+ </test-case>
</test-group>
<test-group name="subquery">
<test-case FilePath="subquery">
diff --git a/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java b/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java
index b1039a5..8d05f0f 100644
--- a/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java
+++ b/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java
@@ -357,6 +357,13 @@
break;
}
break;
+ case 3:
+ // special treatment for surrogates
+ if (Character.isHighSurrogate(c)) {
+ position += writeSupplementaryChar(os, b, maxPosition, position, c, sz);
+ sz = 0;
+ }
+ break;
}
while (sz > 0) {
os.write(b[position]);
@@ -378,4 +385,22 @@
os.write(HexPrinter.hex(c & 0x0f, HexPrinter.CASE.LOWER_CASE));
}
+ /**
+  * Writes the char pair ({@code highSurrogate} + the char following it in {@code src}) to {@code os} as UTF-8.
+  * @return {@code highSurrogateSize} plus the following char's size; the caller advances its position by it
+  * @throws IllegalStateException if the position right after the high surrogate is not below {@code limit}
+  * @throws IOException if writing to {@code os} fails
+  */
+ private static int writeSupplementaryChar(OutputStream os, byte[] src, int limit, int highSurrogatePos,
+         char highSurrogate, int highSurrogateSize) throws IOException {
+     final int lowSurrogatePos = highSurrogatePos + highSurrogateSize;
+     if (lowSurrogatePos >= limit) {
+         throw new IllegalStateException("malformed utf8 input");
+     }
+     final char[] pair = { highSurrogate, UTF8StringUtil.charAt(src, lowSurrogatePos) };
+     final int lowSurrogateSize = UTF8StringUtil.charSize(src, lowSurrogatePos);
+     os.write(new String(pair).getBytes(java.nio.charset.StandardCharsets.UTF_8)); // not platform default
+     return highSurrogateSize + lowSurrogateSize;
+ }
+
}