[ASTERIXDB-2762] reverse() per code point
This commit changes the reverse() function so that it reverses a string
per code point instead of per Java char (UTF-16 code unit).
Change-Id: I437903b8bc668c836e781f4a965e6039305b8654
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/7303
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Dmitry Lychagin <dmitry.lychagin@couchbase.com>
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/reverse/reverse.1.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/reverse/reverse.1.query.sqlpp
index e127372..f450868 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/reverse/reverse.1.query.sqlpp
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/reverse/reverse.1.query.sqlpp
@@ -22,5 +22,6 @@
"t2": reverse(""),
"t3": reverse("abcd"),
"t4": string_to_codepoint(reverse("a\u00D7\u2103\u00F7\u2109b")),
- "t5": ( from ["ab", "abc", "abcd"] t select value reverse(t) order by t )
- }
\ No newline at end of file
+ "t5": ( from ["ab", "abc", "abcd"] t select value reverse(t) order by t ),
+ "t6": reverse("🇨🇳")
+ };
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/reverse/reverse.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/reverse/reverse.1.adm
index a2b8b2c..d0669c4 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/reverse/reverse.1.adm
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/reverse/reverse.1.adm
@@ -1 +1 @@
-{ "t1": [ true, true ], "t2": "", "t3": "dcba", "t4": [ 98, 8457, 247, 8451, 215, 97 ], "t5": [ "ba", "cba", "dcba" ] }
\ No newline at end of file
+{ "t1": [ true, true ], "t2": "", "t3": "dcba", "t4": [ 98, 8457, 247, 8451, 215, 97 ], "t5": [ "ba", "cba", "dcba" ], "t6": "🇳🇨" }
diff --git a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
index 1c713b0..b7a1aca 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
@@ -460,6 +460,8 @@
reverse(string)
* Returns a string formed by reversing characters in the input `string`.
+ For characters composed of multiple code points, the code point is the smallest unit that is reversed.
+ See the examples below for more details.
* Arguments:
* `string` : a `string` to be reversed
* Return Value:
@@ -473,11 +475,19 @@
reverse("hello");
-
* The expected result is:
"olleh"
+* Example of a multi-code-point character (Korean):
+
+ reverse("한글");
+
+* The expected result is
+ (the Korean characters are split into code points, and the code points are then reversed):
+
+ "ᆯᅳᄀᆫᅡᄒ"
+
### rtrim ###
* Syntax:
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
index 3b1f18b..9a38a4e 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
@@ -656,7 +656,27 @@
int srcEnd = srcPtr.getStartOffset() + srcPtr.getLength() - 1;
for (int cursorIndex = srcEnd; cursorIndex >= srcStart; cursorIndex--) {
if (UTF8StringUtil.isCharStart(srcPtr.bytes, cursorIndex)) {
+ char ch = UTF8StringUtil.charAt(srcPtr.bytes, cursorIndex);
int charSize = UTF8StringUtil.charSize(srcPtr.bytes, cursorIndex);
+
+ if (Character.isLowSurrogate(ch)) {
+ while (cursorIndex >= srcStart) {
+ cursorIndex--;
+ if (UTF8StringUtil.isCharStart(srcPtr.bytes, cursorIndex)) {
+ ch = UTF8StringUtil.charAt(srcPtr.bytes, cursorIndex);
+ if (Character.isHighSurrogate(ch) == false) {
+ throw new IllegalArgumentException(
+ "Decoding Error: no corresponding high surrogate found for the following low surrogate");
+ }
+
+ charSize += UTF8StringUtil.charSize(srcPtr.bytes, cursorIndex);
+ break;
+ }
+ }
+ } else if (Character.isHighSurrogate(ch)) {
+ throw new IllegalArgumentException("Decoding Error: get a high surrogate without low surrogate");
+ }
+
builder.appendUtf8StringPointable(srcPtr, cursorIndex, charSize);
}
}
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
index 22be7ca..387bc03 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
@@ -325,4 +325,30 @@
assertEquals(0, expected.compareTo(result));
}
+    @Test
+    public void testReverse() throws Exception {
+        // Plain ASCII input: the reversed output must equal the hand-written mirror image.
+        final UTF8StringPointable source = generateUTF8Pointable(" I'd like to reverse ");
+        final UTF8StringPointable want = generateUTF8Pointable(" esrever ot ekil d'I ");
+        final GrowableArray out = new GrowableArray();
+        UTF8StringPointable.reverse(source, new UTF8StringBuilder(), out);
+
+        final UTF8StringPointable actual = new UTF8StringPointable();
+        actual.set(out.getByteArray(), 0, out.getLength());
+        assertEquals(0, want.compareTo(actual));
+    }
+
+    @Test
+    public void testReverseWithEmoji() throws IOException {
+        // Input is two surrogate pairs; the pairs must swap places while each pair stays intact.
+        final UTF8StringPointable source = generateUTF8Pointable("\uD83C\uDDE8\uD83C\uDDF3"); // CN flag
+        final UTF8StringPointable want = generateUTF8Pointable("\uD83C\uDDF3\uD83C\uDDE8"); // NC flag
+        final GrowableArray out = new GrowableArray();
+        UTF8StringPointable.reverse(source, new UTF8StringBuilder(), out);
+
+        final UTF8StringPointable actual = new UTF8StringPointable();
+        actual.set(out.getByteArray(), 0, out.getLength());
+        assertEquals(0, want.compareTo(actual));
+    }
+
}