[ASTERIXDB-2762] Use code point as unit in position()
Change-Id: Icf1b8b3401599e4332dd09534bdf4787cd9d85d6
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/7305
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Dmitry Lychagin <dmitry.lychagin@couchbase.com>
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.query.sqlpp
new file mode 100644
index 0000000..e50ada6
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.query.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+pos0("👩👩👧👦🏀", "🏀");
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.query.sqlpp
new file mode 100644
index 0000000..55af74b
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.query.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+pos1("👩👩👧👦🏀", "🏀");
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.adm
new file mode 100644
index 0000000..7f8f011
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.adm
@@ -0,0 +1 @@
+7
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.adm
new file mode 100644
index 0000000..45a4fb7
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.adm
@@ -0,0 +1 @@
+8
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
index 4fd8b31..ad6bbe5 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
@@ -9498,6 +9498,11 @@
</compilation-unit>
</test-case>
<test-case FilePath="string">
+ <compilation-unit name="position/offset0/pos0_multi_code_point">
+ <output-dir compare="Text">position/offset0/pos0_multi_code_point</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="string">
<compilation-unit name="position/offset1/position1">
<output-dir compare="Text">position/offset1/position1</output-dir>
</compilation-unit>
@@ -9508,6 +9513,11 @@
</compilation-unit>
</test-case>
<test-case FilePath="string">
+ <compilation-unit name="position/offset1/pos1_multi_code_point">
+ <output-dir compare="Text">position/offset1/pos1_multi_code_point</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="string">
<compilation-unit name="regexp_contains/regexp_contains">
<output-dir compare="Text">regexp_contains/regexp_contains</output-dir>
</compilation-unit>
diff --git a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
index 86695bc..b4a7a87 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
@@ -226,10 +226,13 @@
position(string, string_pattern)
- * Returns the first position of `string_pattern` within `string`. The function returns the 0-based position. Another
+ * Returns the first position of `string_pattern` within `string`.
+ The position is counted in Unicode code points, not in bytes.
+ See the multi-code-point example below for more details.
+
+ * The function returns the 0-based position. Another
version of the function returns the 1-based position. Below are the aliases for each version:
- * Aliases:
* 0-based: `position`, `pos`, `position0`, `pos0`.
* 1-based: `position1`, `pos1`.
@@ -249,14 +252,21 @@
"v1": position("ppphonepp", "phone"),
"v2": position("hone", "phone"),
"v3": position1("ppphonepp", "phone"),
- "v4": position1("hone", "phone"),
+ "v4": position1("hone", "phone")
};
-
* The expected result is:
{ "v1": 2, "v2": -1, v3": 3, "v4": -1 }
+ * Example of multi-code-point character:
+
+ position("👩👩👧👦🏀", "🏀");
+
+ * The expected result is (the emoji family character has 7 code points):
+
+ 7
+
### regexp_contains ###
* Syntax:
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionDescriptor.java
index f7177fd..6c06056 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionDescriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionDescriptor.java
@@ -48,7 +48,7 @@
@Override
protected int compute(UTF8StringPointable left, UTF8StringPointable right) {
- return UTF8StringPointable.find(left, right, false);
+ return UTF8StringPointable.findInCodePoint(left, right, false);
}
};
}
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionOffset1Descriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionOffset1Descriptor.java
index 10cc779..93ada0f 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionOffset1Descriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionOffset1Descriptor.java
@@ -48,7 +48,7 @@
@Override
protected int compute(UTF8StringPointable left, UTF8StringPointable right) {
- int pos = UTF8StringPointable.find(left, right, false);
+ int pos = UTF8StringPointable.findInCodePoint(left, right, false);
return pos < 0 ? pos : pos + 1;
}
};
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
index 9a38a4e..21c8a36 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
@@ -18,6 +18,9 @@
*/
package org.apache.hyracks.data.std.primitive;
+import static org.apache.hyracks.util.string.UTF8StringUtil.HIGH_SURROGATE_WITHOUT_LOW_SURROGATE;
+import static org.apache.hyracks.util.string.UTF8StringUtil.LOW_SURROGATE_WITHOUT_HIGH_SURROGATE;
+
import java.io.IOException;
import java.nio.charset.Charset;
@@ -235,19 +238,56 @@
* the pattern string.
* @param ignoreCase,
* to ignore case or not.
+ * @return the offset, in code points, of the first character of the first match (not including the MetaLength), or -1 if there is no match.
+ */
+ public static int findInCodePoint(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase) {
+ return findInByteOrCodePoint(src, pattern, ignoreCase, 0, false);
+ }
+
+ /**
+ * @param src,
+ * the source string.
+ * @param pattern,
+ * the pattern string.
+ * @param ignoreCase,
+ * to ignore case or not.
* @param startMatch,
* the start offset.
* @return the byte offset of the first character of the matching string after <code>startMatchPos}</code>.
* Not including the MetaLength.
*/
public static int find(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase, int startMatch) {
+ return findInByteOrCodePoint(src, pattern, ignoreCase, startMatch, true);
+ }
+
+ /**
+ * @param src,
+ * the source string.
+ * @param pattern,
+ * the pattern string.
+ * @param ignoreCase,
+ * to ignore case or not.
+ * @param startMatch,
+ * the start offset in bytes, not including the MetaLength.
+ * @return the offset, in code points counted from <code>startMatch</code>, of the first character of the first match, or -1 if there is no match.
+ */
+ public static int findInCodePoint(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase,
+ int startMatch) {
+ return findInByteOrCodePoint(src, pattern, ignoreCase, startMatch, false);
+ }
+
+ // If resultInByte is true, then return the position in bytes, otherwise return the position in code points
+ private static int findInByteOrCodePoint(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase,
+ int startMatch, boolean resultInByte) {
int startMatchPos = startMatch;
final int srcUtfLen = src.getUTF8Length();
final int pttnUtfLen = pattern.getUTF8Length();
final int srcStart = src.getMetaDataLength();
final int pttnStart = pattern.getMetaDataLength();
+ int codePointCount = 0;
int maxStart = srcUtfLen - pttnUtfLen;
+ boolean prevHighSurrogate = false;
while (startMatchPos <= maxStart) {
int c1 = startMatchPos;
int c2 = 0;
@@ -256,6 +296,14 @@
char ch2 = pattern.charAt(pttnStart + c2);
if (ch1 != ch2) {
+ // Currently, ignoreCase only works for characters in the Basic Multilingual Plane,
+ // i.e. characters whose UTF-16 encoding is 2 bytes (1 Java char) rather than 4 bytes (2 Java chars).
+ // Supplementary characters (encoded as a surrogate pair) may need to be supported in the future.
+ //
+ // Another edge case is that a letter may have different lowercase forms in different languages.
+ // For example, the lowercase form of the letter I is "i" in English but "ı" in Turkish.
+ // We may need locale-aware methods such as String.toLowerCase(Locale locale) to support other languages in the future.
+ // Reference: https://stackoverflow.com/questions/11063102/using-locales-with-javas-tolowercase-and-touppercase
if (!ignoreCase || Character.toLowerCase(ch1) != Character.toLowerCase(ch2)) {
break;
}
@@ -263,9 +311,35 @@
c1 += src.charSize(srcStart + c1);
c2 += pattern.charSize(pttnStart + c2);
}
+
if (c2 == pttnUtfLen) {
- return startMatchPos;
+ if (resultInByte) {
+ return startMatchPos;
+ } else {
+ if (prevHighSurrogate == true) {
+ throw new IllegalArgumentException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
+ }
+ return codePointCount;
+ }
}
+
+ // When the result is reported in code points instead of bytes, count the code point that is about to be skipped
+ if (resultInByte == false) {
+ char ch = src.charAt(srcStart + startMatchPos);
+ if (Character.isHighSurrogate(ch)) {
+ prevHighSurrogate = true;
+ } else if (Character.isLowSurrogate(ch)) {
+ if (prevHighSurrogate == true) {
+ codePointCount++;
+ prevHighSurrogate = false;
+ } else {
+ throw new IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+ }
+ } else {
+ codePointCount++;
+ }
+ }
+
startMatchPos += src.charSize(srcStart + startMatchPos);
}
return -1;
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
index 387bc03..8b62765 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
@@ -20,6 +20,7 @@
package org.apache.hyracks.data.std.primitive;
import static org.apache.hyracks.data.std.primitive.UTF8StringPointable.generateUTF8Pointable;
+import static org.apache.hyracks.util.string.UTF8StringSample.EMOJI_BASKETBALL;
import static org.apache.hyracks.util.string.UTF8StringSample.STRING_EMOJI_FAMILY_OF_2;
import static org.apache.hyracks.util.string.UTF8StringSample.STRING_EMOJI_FAMILY_OF_4;
import static org.junit.Assert.assertEquals;
@@ -53,7 +54,7 @@
generateUTF8Pointable(STRING_EMOJI_FAMILY_OF_2);
@Test
- public void testGetStringLength() throws Exception {
+ public void testGetStringUTF8Length() throws Exception {
UTF8StringPointable utf8Ptr = generateUTF8Pointable(UTF8StringSample.STRING_LEN_127);
assertEquals(127, utf8Ptr.getUTF8Length());
assertEquals(1, utf8Ptr.getMetaDataLength());
@@ -67,6 +68,16 @@
}
@Test
+ public void testFindInCodePoint() {
+ UTF8StringPointable strp = generateUTF8Pointable(STRING_EMOJI_FAMILY_OF_4 + EMOJI_BASKETBALL);
+ UTF8StringPointable pattern = generateUTF8Pointable(EMOJI_BASKETBALL);
+
+ assertEquals(UTF8StringPointable.findInCodePoint(strp, pattern, false), 7);
+
+ assertEquals(UTF8StringPointable.findInCodePoint(strp, pattern, true), 7);
+ }
+
+ @Test
public void testContains() throws Exception {
assertTrue(STRING_UTF8_MIX.contains(STRING_UTF8_MIX, false));
assertTrue(STRING_UTF8_MIX.contains(STRING_UTF8_MIX, true));
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index 53271e4..d2cd050 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -34,6 +34,11 @@
*/
public class UTF8StringUtil {
+ public static final String LOW_SURROGATE_WITHOUT_HIGH_SURROGATE =
+ "Decoding error: got a low surrogate without a leading high surrogate";
+ public static final String HIGH_SURROGATE_WITHOUT_LOW_SURROGATE =
+ "Decoding error: got a high surrogate without a following low surrogate";
+
private UTF8StringUtil() {
}
@@ -95,7 +100,7 @@
if (Character.isLowSurrogate(c1)) {
// In this case, the index s doesn't point to a correct position
- throw new IllegalArgumentException("decoding error: got a low surrogate without a high surrogate");
+ throw new IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
}
if (Character.isHighSurrogate(c1)) {
@@ -106,8 +111,7 @@
if (Character.isLowSurrogate(c2)) {
return Character.toCodePoint(c1, c2);
} else {
- throw new IllegalArgumentException(
- "decoding error: the high surrogate is not followed by a low surrogate");
+ throw new IllegalArgumentException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
}
}
@@ -119,7 +123,7 @@
int size1 = charSize(b, s);
if (Character.isLowSurrogate(c1)) {
- throw new IllegalArgumentException("decoding error: got a low surrogate without a high surrogate");
+ throw new IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
}
if (Character.isHighSurrogate(c1)) {
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
index 1502f25..b114351 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
@@ -42,6 +42,7 @@
public static final String STRING_EMOJI_FAMILY_OF_4 =
"\uD83D\uDC68\u200D\uD83D\uDC68\u200D\uD83D\uDC66\u200D\uD83D\uDC66";
public static final String STRING_EMOJI_FAMILY_OF_2 = "\uD83D\uDC68\u200D\uD83D\uDC66";
+ public static final String EMOJI_BASKETBALL = "\uD83C\uDFC0";
public static final String STRING_LEN_127 = generateStringRepeatBy(ONE_ASCII_CHAR, 127);
public static final String STRING_LEN_128 = generateStringRepeatBy(ONE_ASCII_CHAR, 128);