[ASTERIXDB-2762] Use code point as unit in position() Change-Id: Icf1b8b3401599e4332dd09534bdf4787cd9d85d6 Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/7305 Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu> Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu> Reviewed-by: Dmitry Lychagin <dmitry.lychagin@couchbase.com>

commit: 120d7eac49ad855eb1ae8a295683c0250aa4fe9e [log] [tgz]
author: Rui Guo <ruig2@uci.edu> Mon Jul 27 13:33:21 2020 -0700
committer: Dmitry Lychagin <dmitry.lychagin@couchbase.com> Tue Jul 28 01:35:19 2020 +0000
tree: d1c33f0759573187dd236ab6bf5b25d3cca1a116
parent: 963b52e1d511d70141134154e08b285cdf6e0ac2 [diff]
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.query.sqlpp
new file mode 100644
index 0000000..e50ada6
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.query.sqlpp

@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+pos0("👩‍👩‍👧‍👦🏀", "🏀");
\ No newline at end of file

diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.query.sqlpp
new file mode 100644
index 0000000..55af74b
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.query.sqlpp

@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+pos1("👩‍👩‍👧‍👦🏀", "🏀");
\ No newline at end of file

diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.adm
new file mode 100644
index 0000000..7f8f011
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset0/pos0_multi_code_point/pos0_multi_code_point.1.adm

@@ -0,0 +1 @@
+7

diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.adm
new file mode 100644
index 0000000..45a4fb7
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/position/offset1/pos1_multi_code_point/pos1_multi_code_point.1.adm

@@ -0,0 +1 @@
+8

diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
index 4fd8b31..ad6bbe5 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml

@@ -9498,6 +9498,11 @@
       </compilation-unit>
     </test-case>
     <test-case FilePath="string">
+      <compilation-unit name="position/offset0/pos0_multi_code_point">
+        <output-dir compare="Text">position/offset0/pos0_multi_code_point</output-dir>
+      </compilation-unit>
+    </test-case>
+    <test-case FilePath="string">
       <compilation-unit name="position/offset1/position1">
         <output-dir compare="Text">position/offset1/position1</output-dir>
       </compilation-unit>
@@ -9508,6 +9513,11 @@
       </compilation-unit>
     </test-case>
     <test-case FilePath="string">
+      <compilation-unit name="position/offset1/pos1_multi_code_point">
+        <output-dir compare="Text">position/offset1/pos1_multi_code_point</output-dir>
+      </compilation-unit>
+    </test-case>
+    <test-case FilePath="string">
       <compilation-unit name="regexp_contains/regexp_contains">
         <output-dir compare="Text">regexp_contains/regexp_contains</output-dir>
       </compilation-unit>

diff --git a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
index 86695bc..b4a7a87 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md

@@ -226,10 +226,13 @@
 
         position(string, string_pattern)
 
- * Returns the first position of `string_pattern` within `string`. The function returns the 0-based position. Another
+ * Returns the first position of `string_pattern` within `string`.
+ The position is counted in units of code points.
+ See the multi-code-point example below for more details.
+
+ * The function returns the 0-based position. Another
  version of the function returns the 1-based position. Below are the aliases for each version:
 
- * Aliases:
     * 0-based: `position`, `pos`, `position0`, `pos0`.
     * 1-based: `position1`, `pos1`.
 
@@ -249,14 +252,21 @@
           "v1": position("ppphonepp", "phone"),
           "v2": position("hone", "phone"),
           "v3": position1("ppphonepp", "phone"),
-          "v4": position1("hone", "phone"),
+          "v4": position1("hone", "phone")
         };
 
-
  * The expected result is:
 
         { "v1": 2, "v2": -1, "v3": 3, "v4": -1 }
 
+ * Example of multi-code-point character:
+
+        position("👩‍👩‍👧‍👦🏀", "🏀");
+
+ * The expected result is (the emoji family character has 7 code points):
+
+        7
+
 
 ### regexp_contains ###
  * Syntax:

diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionDescriptor.java
index f7177fd..6c06056 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionDescriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionDescriptor.java

@@ -48,7 +48,7 @@
 
                     @Override
                     protected int compute(UTF8StringPointable left, UTF8StringPointable right) {
-                        return UTF8StringPointable.find(left, right, false);
+                        return UTF8StringPointable.findInCodePoint(left, right, false);
                     }
                 };
             }

diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionOffset1Descriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionOffset1Descriptor.java
index 10cc779..93ada0f 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionOffset1Descriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringPositionOffset1Descriptor.java

@@ -48,7 +48,7 @@
 
                     @Override
                     protected int compute(UTF8StringPointable left, UTF8StringPointable right) {
-                        int pos = UTF8StringPointable.find(left, right, false);
+                        int pos = UTF8StringPointable.findInCodePoint(left, right, false);
                         return pos < 0 ? pos : pos + 1;
                     }
                 };

diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
index 9a38a4e..21c8a36 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java

@@ -18,6 +18,9 @@
  */
 package org.apache.hyracks.data.std.primitive;
 
+import static org.apache.hyracks.util.string.UTF8StringUtil.HIGH_SURROGATE_WITHOUT_LOW_SURROGATE;
+import static org.apache.hyracks.util.string.UTF8StringUtil.LOW_SURROGATE_WITHOUT_HIGH_SURROGATE;
+
 import java.io.IOException;
 import java.nio.charset.Charset;
 
@@ -235,19 +238,56 @@
      *            the pattern string.
      * @param ignoreCase,
      *            to ignore case or not.
+     * @return the offset, in units of code points, of the first character of the matching string
+     *         (not including the MetaLength), or -1 if the pattern is not found.
+     */
+    public static int findInCodePoint(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase) {
+        return findInByteOrCodePoint(src, pattern, ignoreCase, 0, false);
+    }
+
+    /**
+     * @param src,
+     *            the source string.
+     * @param pattern,
+     *            the pattern string.
+     * @param ignoreCase,
+     *            to ignore case or not.
      * @param startMatch,
      *            the start offset.
      * @return the byte offset of the first character of the matching string after <code>startMatch</code>.
      *         Not including the MetaLength.
      */
     public static int find(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase, int startMatch) {
+        return findInByteOrCodePoint(src, pattern, ignoreCase, startMatch, true);
+    }
+
+    /**
+     * @param src,
+     *            the source string.
+     * @param pattern,
+     *            the pattern string.
+     * @param ignoreCase,
+     *            to ignore case or not.
+     * @param startMatch,
+     *            the start offset.
+     * @return the offset, in units of code points counted from <code>startMatch</code>, of the first character
+     *         of the matching string (not including the MetaLength), or -1 if the pattern is not found.
+     */
+    public static int findInCodePoint(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase,
+            int startMatch) {
+        return findInByteOrCodePoint(src, pattern, ignoreCase, startMatch, false);
+    }
+
+    // If resultInByte is true, then return the position in bytes, otherwise return the position in code points
+    private static int findInByteOrCodePoint(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase,
+            int startMatch, boolean resultInByte) {
         int startMatchPos = startMatch;
         final int srcUtfLen = src.getUTF8Length();
         final int pttnUtfLen = pattern.getUTF8Length();
         final int srcStart = src.getMetaDataLength();
         final int pttnStart = pattern.getMetaDataLength();
+        int codePointCount = 0;
 
         int maxStart = srcUtfLen - pttnUtfLen;
+        boolean prevHighSurrogate = false;
         while (startMatchPos <= maxStart) {
             int c1 = startMatchPos;
             int c2 = 0;
@@ -256,6 +296,14 @@
                 char ch2 = pattern.charAt(pttnStart + c2);
 
                 if (ch1 != ch2) {
+                    // Currently, ignoreCase only works for characters that are encoded as a single Java char
+                    // (i.e. characters whose UTF-16 encoding is 2 bytes (1 Java char) instead of 4 bytes (2 Java chars)).
+                    // We may need to support characters encoded as a surrogate pair in the future.
+                    //
+                    // Another edge case is that one letter may have different forms of lower cases in different languages
+                    // For example, the letter I may have "i" as the lower case in English but "ı" in Turkish.
+                    // We may need to use methods such as String.toLowerCase(Locale locale) to support other languages in the future
+                    // Reference: https://stackoverflow.com/questions/11063102/using-locales-with-javas-tolowercase-and-touppercase
                     if (!ignoreCase || Character.toLowerCase(ch1) != Character.toLowerCase(ch2)) {
                         break;
                     }
@@ -263,9 +311,35 @@
                 c1 += src.charSize(srcStart + c1);
                 c2 += pattern.charSize(pttnStart + c2);
             }
+
             if (c2 == pttnUtfLen) {
-                return startMatchPos;
+                if (resultInByte) {
+                    return startMatchPos;
+                } else {
+                    if (prevHighSurrogate == true) {
+                        throw new IllegalArgumentException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
+                    }
+                    return codePointCount;
+                }
             }
+
+            // The result is counted in code point instead of bytes
+            if (resultInByte == false) {
+                char ch = src.charAt(srcStart + startMatchPos);
+                if (Character.isHighSurrogate(ch)) {
+                    prevHighSurrogate = true;
+                } else if (Character.isLowSurrogate(ch)) {
+                    if (prevHighSurrogate == true) {
+                        codePointCount++;
+                        prevHighSurrogate = false;
+                    } else {
+                        throw new IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+                    }
+                } else {
+                    codePointCount++;
+                }
+            }
+
             startMatchPos += src.charSize(srcStart + startMatchPos);
         }
         return -1;

diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
index 387bc03..8b62765 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java

@@ -20,6 +20,7 @@
 package org.apache.hyracks.data.std.primitive;
 
 import static org.apache.hyracks.data.std.primitive.UTF8StringPointable.generateUTF8Pointable;
+import static org.apache.hyracks.util.string.UTF8StringSample.EMOJI_BASKETBALL;
 import static org.apache.hyracks.util.string.UTF8StringSample.STRING_EMOJI_FAMILY_OF_2;
 import static org.apache.hyracks.util.string.UTF8StringSample.STRING_EMOJI_FAMILY_OF_4;
 import static org.junit.Assert.assertEquals;
@@ -53,7 +54,7 @@
             generateUTF8Pointable(STRING_EMOJI_FAMILY_OF_2);
 
     @Test
-    public void testGetStringLength() throws Exception {
+    public void testGetStringUTF8Length() throws Exception {
         UTF8StringPointable utf8Ptr = generateUTF8Pointable(UTF8StringSample.STRING_LEN_127);
         assertEquals(127, utf8Ptr.getUTF8Length());
         assertEquals(1, utf8Ptr.getMetaDataLength());
@@ -67,6 +68,16 @@
     }
 
     @Test
+    public void testFindInCodePoint() {
+        // Source string: the family-of-4 emoji followed by the basketball emoji.
+        UTF8StringPointable strp = generateUTF8Pointable(STRING_EMOJI_FAMILY_OF_4 + EMOJI_BASKETBALL);
+        UTF8StringPointable pattern = generateUTF8Pointable(EMOJI_BASKETBALL);
+
+        // STRING_EMOJI_FAMILY_OF_4 is four surrogate-pair emoji joined by three zero-width joiners,
+        // i.e. 7 code points, so the basketball is expected at code point offset 7.
+        // assertEquals takes (expected, actual); keeping that order makes failure messages accurate.
+        assertEquals(7, UTF8StringPointable.findInCodePoint(strp, pattern, false));
+
+        // Same expectation with case-insensitive matching enabled.
+        assertEquals(7, UTF8StringPointable.findInCodePoint(strp, pattern, true));
+    }
+
+    @Test
     public void testContains() throws Exception {
         assertTrue(STRING_UTF8_MIX.contains(STRING_UTF8_MIX, false));
         assertTrue(STRING_UTF8_MIX.contains(STRING_UTF8_MIX, true));

diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index 53271e4..d2cd050 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java

@@ -34,6 +34,11 @@
  */
 public class UTF8StringUtil {
 
+    public static final String LOW_SURROGATE_WITHOUT_HIGH_SURROGATE =
+            "Decoding error: got a low surrogate without a leading high surrogate";
+    public static final String HIGH_SURROGATE_WITHOUT_LOW_SURROGATE =
+            "Decoding error: got a high surrogate without a following low surrogate";
+
     private UTF8StringUtil() {
     }
 
@@ -95,7 +100,7 @@
 
         if (Character.isLowSurrogate(c1)) {
             // In this case, the index s doesn't point to a correct position
-            throw new IllegalArgumentException("decoding error: got a low surrogate without a high surrogate");
+            throw new IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
         }
 
         if (Character.isHighSurrogate(c1)) {
@@ -106,8 +111,7 @@
             if (Character.isLowSurrogate(c2)) {
                 return Character.toCodePoint(c1, c2);
             } else {
-                throw new IllegalArgumentException(
-                        "decoding error: the high surrogate is not followed by a low surrogate");
+                throw new IllegalArgumentException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
             }
         }
 
@@ -119,7 +123,7 @@
         int size1 = charSize(b, s);
 
         if (Character.isLowSurrogate(c1)) {
-            throw new IllegalArgumentException("decoding error: got a low surrogate without a high surrogate");
+            throw new IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
         }
 
         if (Character.isHighSurrogate(c1)) {

diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
index 1502f25..b114351 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java

@@ -42,6 +42,7 @@
     public static final String STRING_EMOJI_FAMILY_OF_4 =
             "\uD83D\uDC68\u200D\uD83D\uDC68\u200D\uD83D\uDC66\u200D\uD83D\uDC66";
     public static final String STRING_EMOJI_FAMILY_OF_2 = "\uD83D\uDC68\u200D\uD83D\uDC66";
+    public static final String EMOJI_BASKETBALL = "\uD83C\uDFC0";
 
     public static final String STRING_LEN_127 = generateStringRepeatBy(ONE_ASCII_CHAR, 127);
     public static final String STRING_LEN_128 = generateStringRepeatBy(ONE_ASCII_CHAR, 128);
commit	120d7eac49ad855eb1ae8a295683c0250aa4fe9e	[log] [tgz]
author	Rui Guo <ruig2@uci.edu>	Mon Jul 27 13:33:21 2020 -0700
committer	Dmitry Lychagin <dmitry.lychagin@couchbase.com>	Tue Jul 28 01:35:19 2020 +0000
tree	d1c33f0759573187dd236ab6bf5b25d3cca1a116
parent	963b52e1d511d70141134154e08b285cdf6e0ac2 [diff]