[NO ISSUE]: Move StringUtils to hyracks-api module Change-Id: Iea1b7db9374332315dfaf56d49f24217f7c0834c Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17990 Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu> Reviewed-by: Michael Blow <mblow@apache.org> Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>

commit: b4a7d81b9925f94f1e06827e95cb54650ad6625f [log] [tgz]
author: Hussain Towaileb <Hussain.Towaileb@couchbase.com> Fri Dec 01 07:54:00 2023 +0300
committer: Hussain Towaileb <hussainht@gmail.com> Fri Dec 01 16:19:48 2023 +0000
tree: f6991376c4023d39df0d8ed34dfe09368133f379
parent: 62578ad6659762f5e7fda13421abe9c38cae3605 [diff]
diff --git a/hyracks-fullstack/hyracks/hyracks-api/pom.xml b/hyracks-fullstack/hyracks/hyracks-api/pom.xml
index 047f066..131731d 100644
--- a/hyracks-fullstack/hyracks/hyracks-api/pom.xml
+++ b/hyracks-fullstack/hyracks/hyracks-api/pom.xml

@@ -65,6 +65,13 @@
       <version>${project.version}</version>
     </dependency>
     <dependency>
+      <groupId>org.apache.hyracks</groupId>
+      <artifactId>hyracks-util</artifactId>
+      <version>${project.version}</version>
+      <type>test-jar</type>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
       <groupId>org.mockito</groupId>
       <artifactId>mockito-core</artifactId>
       <scope>test</scope>

diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/ErrorCode.java b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/ErrorCode.java
index 7291473..8170f07 100644
--- a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/ErrorCode.java
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/ErrorCode.java

@@ -154,6 +154,7 @@
     PARSING_ERROR(124),
     INVALID_INVERTED_LIST_TYPE_TRAITS(125),
     ILLEGAL_STATE(126),
+    INVALID_STRING_UNICODE(127),
 
     // Compilation error codes.
     RULECOLLECTION_NOT_INSTANCE_OF_LIST(10000),

diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/HyracksException.java b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/HyracksException.java
index 977e5d2..12f1095 100644
--- a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/HyracksException.java
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/HyracksException.java

@@ -158,6 +158,10 @@
         return message;
     }
 
+    /**
+     * Returns this exception's message without its leading code prefix.
+     * The actual stripping is delegated to {@link ErrorMessageUtil#getMessageNoCode(String, String)},
+     * which receives this exception's component and its full message.
+     *
+     * @return the message as returned by the utility for this component/message pair
+     */
+    public String getMessageNoCode() {
+        final String owningComponent = component;
+        final String fullMessage = getMessage();
+        return ErrorMessageUtil.getMessageNoCode(owningComponent, fullMessage);
+    }
+
     @Override
     public String toString() {
         return getLocalizedMessage();

diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/util/ErrorMessageUtil.java b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/util/ErrorMessageUtil.java
index 70b13fa..cb0d579 100644
--- a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/util/ErrorMessageUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/util/ErrorMessageUtil.java

@@ -125,6 +125,13 @@
         }
     }
 
+    /**
+     * Strips the leading prefix (everything up to and including the first {@code ':'} and the
+     * single character that follows it) from a formatted error message.
+     * Presumably the prefix is the "component + code" part followed by ": " -- confirm against the formatter.
+     *
+     * @param component the component of the error; when it equals {@code NONE} the message is returned untouched
+     * @param message the (possibly prefixed) message, may be {@code null}
+     * @return the message without its prefix; the unchanged message when it is {@code null}, when the
+     *         component is {@code NONE}, or when it contains no {@code ':'}
+     */
+    public static String getMessageNoCode(String component, String message) {
+        if (message == null || NONE.equals(component)) {
+            return message;
+        }
+        int colon = message.indexOf(':');
+        if (colon < 0) {
+            // no prefix separator: previously this silently dropped the first character
+            return message;
+        }
+        // clamp so that a message ending right after the ':' yields "" instead of throwing
+        return message.substring(Math.min(colon + 2, message.length()));
+    }
+
     public static String getCauseMessage(Throwable t) {
         if (t instanceof IFormattedException) {
             return t.getMessage();

diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/util/ExceptionUtils.java b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/util/ExceptionUtils.java
index 7147542..e07cdd4 100644
--- a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/util/ExceptionUtils.java
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/util/ExceptionUtils.java

@@ -207,4 +207,16 @@
     public static String getMessageOrToString(Throwable e) {
         return e instanceof IFormattedException ? e.getMessage() : e.toString();
     }
+
+    /**
+     * Tells whether the supplied exception carries exactly the given error code.
+     *
+     * @param throwable the exception whose (optional) error is inspected
+     * @param code the error code to compare against
+     *
+     * @return true when the exception has an error and that error is the very same instance as
+     *         {@code code}; false when no error is present or it is a different one
+     */
+    public static boolean isErrorCode(HyracksDataException throwable, ErrorCode code) {
+        if (!throwable.getError().isPresent()) {
+            return false;
+        }
+        // identity comparison, as in the original check
+        return throwable.getError().get() == code;
+    }
 }

diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringReader.java b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringReader.java
new file mode 100644
index 0000000..43faf18
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringReader.java

@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.string;
+
+import java.io.DataInput;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.Serializable;
+import java.io.UTFDataFormatException;
+
+/**
+ * Reusable reader for strings handled by {@link UTF8StringUtil#readUTF8(DataInput, UTF8StringReader)}.
+ * The instance only holds the scratch buffers that the utility allocates and reuses between calls.
+ */
+public class UTF8StringReader implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    // Scratch buffers owned by this reader and (re)allocated by UTF8StringUtil.readUTF8;
+    // transient, so they are not part of the serialized form.
+    transient byte[] bytearr;
+    transient char[] chararr;
+
+    /**
+     * Reads one string from <code>in</code> by delegating to
+     * {@link UTF8StringUtil#readUTF8(DataInput, UTF8StringReader)}, passing this reader so that its
+     * buffers can be reused. The character bytes are decoded as 1-, 2- and 3-byte sequences in the
+     * style of the modified UTF-8 format described by <code>DataInput</code>.
+     *
+     * @param in
+     *            a data input stream.
+     * @return the decoded string.
+     * @throws EOFException
+     *             if the input stream reaches the end
+     *             before all the bytes were read.
+     * @throws IOException
+     *             if an I/O error occurs while reading from <code>in</code>.
+     * @throws UTFDataFormatException
+     *             if the bytes do not represent a
+     *             valid encoding of a string.
+     */
+    public final String readUTF(DataInput in) throws IOException {
+        final UTF8StringReader bufferOwner = this;
+        return UTF8StringUtil.readUTF8(in, bufferOwner);
+    }
+}

diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
new file mode 100644
index 0000000..4fc503d
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java

@@ -0,0 +1,711 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.string;
+
+import static org.apache.hyracks.api.exceptions.ErrorCode.INVALID_STRING_UNICODE;
+
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.UTFDataFormatException;
+import java.lang.ref.SoftReference;
+
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder;
+
+/**
+ * A helper package to operate the UTF8String in Hyracks.
+ * Most of the codes were migrated from asterix-fuzzyjoin and hyracks-storage-am-invertedindex
+ */
+public class UTF8StringUtil {
+
+    public static final String MALFORMED_BYTES = "malformed bytes";
+    public static final String LOW_SURROGATE_WITHOUT_HIGH_SURROGATE =
+            "got a low surrogate without a leading high surrogate";
+    public static final String HIGH_SURROGATE_WITHOUT_LOW_SURROGATE =
+            "got a high surrogate without a following low surrogate";
+
+    private UTF8StringUtil() {
+    }
+
+    /**
+     * Decodes the single Java char whose encoding starts at index {@code s} of {@code b}.
+     *
+     * @param b the byte array holding the encoded characters
+     * @param s index of the first byte of the character
+     * @return the decoded char (1-, 2- or 3-byte sequence)
+     * @throws ArrayIndexOutOfBoundsException if {@code s} is not below {@code b.length}
+     * @throws IllegalArgumentException if the byte at {@code s} is not a valid leading byte
+     */
+    public static char charAt(byte[] b, int s) {
+        if (s >= b.length) {
+            throw new ArrayIndexOutOfBoundsException(s);
+        }
+        final int lead = b[s] & 0xff;
+        final int highNibble = lead >> 4;
+        if (highNibble <= 7) {
+            // 0xxxxxxx: the byte is the character
+            return (char) lead;
+        }
+        if (highNibble == 12 || highNibble == 13) {
+            // 110xxxxx 10xxxxxx
+            return (char) (((lead & 0x1F) << 6) | ((b[s + 1]) & 0x3F));
+        }
+        if (highNibble == 14) {
+            // 1110xxxx 10xxxxxx 10xxxxxx
+            return (char) (((lead & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6) | (b[s + 2] & 0x3F));
+        }
+        // 10xxxxxx (continuation byte) or 1111xxxx cannot start a character here
+        throw new IllegalArgumentException();
+    }
+
+    /**
+     * Returns how many bytes the character starting at index {@code s} occupies.
+     *
+     * @param b the byte array holding the encoded characters
+     * @param s index of the first byte of the character
+     * @return 1, 2 or 3 depending on the leading byte
+     * @throws IllegalStateException if the byte at {@code s} is not a valid leading byte
+     */
+    public static int charSize(byte[] b, int s) {
+        final int highNibble = (b[s] & 0xff) >> 4;
+        if (highNibble <= 7) {
+            return 1;
+        }
+        if (highNibble == 12 || highNibble == 13) {
+            return 2;
+        }
+        if (highNibble == 14) {
+            return 3;
+        }
+        throw new IllegalStateException();
+    }
+
+    /**
+     * Returns the Unicode code point whose encoding starts at index {@code s}.
+     * A code point outside the BMP is stored as a surrogate pair, i.e. two consecutive encoded chars.
+     *
+     * @param b the byte array holding the encoded characters
+     * @param s index of the first byte of the code point
+     * @return the code point
+     * @throws HyracksDataException with INVALID_STRING_UNICODE if {@code s} points at a low surrogate,
+     *         or at a high surrogate that is not followed by a low surrogate
+     */
+    public static int codePointAt(byte[] b, int s) throws HyracksDataException {
+        final char first = charAt(b, s);
+        if (Character.isLowSurrogate(first)) {
+            // the index s does not point to the beginning of a code point
+            throw HyracksDataException.create(INVALID_STRING_UNICODE, LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+        }
+        if (!Character.isHighSurrogate(first)) {
+            return first;
+        }
+        // If the high surrogate is the last char in the byte array (an illegal string), reading the
+        // second char below throws because no low surrogate is available.
+        final int secondStart = s + charSize(b, s);
+        final char second = charAt(b, secondStart);
+        if (!Character.isLowSurrogate(second)) {
+            throw HyracksDataException.create(INVALID_STRING_UNICODE, HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
+        }
+        return Character.toCodePoint(first, second);
+    }
+
+    /**
+     * Returns the number of bytes taken by the code point whose encoding starts at index {@code s}.
+     *
+     * @param b the byte array holding the encoded characters
+     * @param s index of the first byte of the code point
+     * @return the size of one encoded char, or of two when the first char is a high surrogate
+     * @throws HyracksDataException with INVALID_STRING_UNICODE if {@code s} points at a low surrogate
+     */
+    public static int codePointSize(byte[] b, int s) throws HyracksDataException {
+        final char first = charAt(b, s);
+        final int firstSize = charSize(b, s);
+        if (Character.isLowSurrogate(first)) {
+            throw HyracksDataException.create(INVALID_STRING_UNICODE, LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+        }
+        if (!Character.isHighSurrogate(first)) {
+            return firstSize;
+        }
+        // As in codePointAt(): if the high surrogate is the last char in the byte array (an illegal
+        // string), sizing the following char throws because there is nothing left to read.
+        return firstSize + charSize(b, s + firstSize);
+    }
+
+    /**
+     * Tells whether the byte at index {@code s} can begin a character, i.e. it is not a
+     * continuation byte of the form {@code 10xxxxxx}.
+     *
+     * @param b the byte array holding the encoded characters
+     * @param s index of the byte to test
+     * @return false for a continuation byte, true otherwise
+     */
+    public static boolean isCharStart(byte[] b, int s) {
+        return (b[s] & 0xC0) != 0x80;
+    }
+
+    /**
+     * Returns the number of bytes used to encode {@code c} in modified UTF-8.
+     *
+     * @param c the char to size
+     * @return 1 for U+0001..U+007F, 2 for U+0000 and U+0080..U+07FF, 3 otherwise
+     */
+    public static int getModifiedUTF8Len(char c) {
+        if (c > 0x07FF) {
+            return 3;
+        }
+        // U+0000 is deliberately encoded with two bytes
+        return (c == 0x0000 || c > 0x007F) ? 2 : 1;
+    }
+
+    /**
+     * Writes {@code c} to {@code dos} encoded as modified UTF-8.
+     *
+     * @param c the char to write
+     * @param dos the destination
+     * @return the number of bytes written (1, 2 or 3)
+     * @throws IOException if writing to {@code dos} fails
+     */
+    public static int writeCharAsModifiedUTF8(char c, DataOutput dos) throws IOException {
+        if (c >= 0x0001 && c <= 0x007F) {
+            dos.writeByte(c);
+            return 1;
+        }
+        if (c > 0x07FF) {
+            dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
+            dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
+            dos.writeByte((byte) (0x80 | (c & 0x3F)));
+            return 3;
+        }
+        // U+0000 and U+0080..U+07FF take two bytes
+        dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x3F)));
+        dos.writeByte((byte) (0x80 | (c & 0x3F)));
+        return 2;
+    }
+
+    /**
+     * Writes {@code c} to the stream {@code dos} encoded as modified UTF-8.
+     *
+     * @param c the char to write
+     * @param dos the destination stream
+     * @return the number of bytes written (1, 2 or 3)
+     * @throws IOException if writing to {@code dos} fails
+     */
+    public static int writeCharAsModifiedUTF8(char c, OutputStream dos) throws IOException {
+        if (c >= 0x0001 && c <= 0x007F) {
+            dos.write(c);
+            return 1;
+        }
+        if (c > 0x07FF) {
+            dos.write((byte) (0xE0 | ((c >> 12) & 0x0F)));
+            dos.write((byte) (0x80 | ((c >> 6) & 0x3F)));
+            dos.write((byte) (0x80 | (c & 0x3F)));
+            return 3;
+        }
+        // U+0000 and U+0080..U+07FF take two bytes
+        dos.write((byte) (0xC0 | ((c >> 6) & 0x3F)));
+        dos.write((byte) (0x80 | (c & 0x3F)));
+        return 2;
+    }
+
+    /**
+     * Counts the Java chars (16-bit UTF-16 code units) of the length-prefixed string starting at {@code s}.
+     *
+     * @param b the byte array holding the string
+     * @param s index of the length prefix of the string
+     * @return the number of encoded chars in the string
+     */
+    public static int getStringLength(byte[] b, int s) {
+        final int utfLen = getUTFLength(b, s);
+        final int dataStart = s + getNumBytesToStoreLength(utfLen);
+        return getStringLength(b, dataStart, utfLen);
+    }
+
+    /**
+     * Counts the encoded chars found in the {@code len} bytes that start at {@code offs}
+     * (no length prefix is expected at {@code offs}).
+     *
+     * @param b the byte array holding the characters
+     * @param offs index of the first character byte
+     * @param len number of bytes to scan
+     * @return the number of chars whose encoding starts inside the scanned range
+     */
+    public static int getStringLength(byte[] b, int offs, int len) {
+        int chars = 0;
+        for (int cursor = offs, limit = offs + len; cursor < limit; cursor += charSize(b, cursor)) {
+            chars++;
+        }
+        return chars;
+    }
+
+    /**
+     * Counts the Unicode code points of the length-prefixed string starting at {@code s}.
+     *
+     * @param b the byte array holding the string
+     * @param s index of the length prefix of the string
+     * @return the number of code points (a surrogate pair counts once)
+     * @throws HyracksDataException if {@link #codePointSize(byte[], int)} rejects a position
+     */
+    public static int getNumCodePoint(byte[] b, int s) throws HyracksDataException {
+        final int utfLen = getUTFLength(b, s);
+        final int dataStart = s + getNumBytesToStoreLength(utfLen);
+        final int limit = dataStart + utfLen;
+        int codePoints = 0;
+        for (int cursor = dataStart; cursor < limit; cursor += codePointSize(b, cursor)) {
+            codePoints++;
+        }
+        return codePoints;
+    }
+
+    /**
+     * Reads the length prefix stored at index {@code s}, as decoded by {@link VarLenIntEncoderDecoder}.
+     *
+     * @param b the byte array holding the string
+     * @param s index of the length prefix
+     * @return the decoded length (used throughout this class as the byte length of the characters)
+     */
+    public static int getUTFLength(byte[] b, int s) {
+        final int utfLen = VarLenIntEncoderDecoder.decode(b, s);
+        return utfLen;
+    }
+
+    /**
+     * Returns the size of the length prefix needed for {@code strlen}, as reported by
+     * {@link VarLenIntEncoderDecoder#getBytesRequired}.
+     *
+     * @param strlen the length value to be stored
+     * @return the number of bytes the encoded length occupies
+     */
+    public static int getNumBytesToStoreLength(int strlen) {
+        final int prefixSize = VarLenIntEncoderDecoder.getBytesRequired(strlen);
+        return prefixSize;
+    }
+
+    /**
+     * Encodes {@code codePoint} into {@code outputUTF8}, starting at index 0.
+     *
+     * @param codePoint the code point to encode
+     * @param tempChars scratch array receiving the char(s) of the code point
+     * @param outputUTF8 destination of the encoded bytes
+     * @return the number of bytes written to {@code outputUTF8}
+     */
+    public static int codePointToUTF8(int codePoint, char[] tempChars, byte[] outputUTF8) {
+        int written = 0;
+        final int charCount = Character.toChars(codePoint, tempChars, 0);
+        int idx = 0;
+        while (idx < charCount) {
+            written += writeToBytes(outputUTF8, written, tempChars[idx]);
+            idx++;
+        }
+        return written;
+    }
+
+    /**
+     * Compute the normalized key of the UTF8 string.
+     * The normalized key in Hyracks is mainly used to speedup the comparison between pointable data.
+     * In the UTF8StringPTR case, we compute the integer value by using the first 2 chars.
+     * The comparator will first use this integer to get the result ( <,>, or =), it will check
+     * the actual bytes only if the normalized key is equal. Thus this normalized key must be
+     * consistent with the comparison result.
+     *
+     * @param bytes the byte array holding the length-prefixed string
+     * @param start index of the length prefix of the string
+     * @return a non-negative key built from the first two chars (missing chars count as 0)
+     */
+    public static int normalize(byte[] bytes, int start) {
+        int len = getUTFLength(bytes, start);
+        long nk = 0;
+        int offset = start + getNumBytesToStoreLength(len);
+        // len is a byte count, not a char count: bound the reads by byte position so that a
+        // single 2- or 3-byte character is not followed by a read past the end of the string
+        int end = offset + len;
+        for (int i = 0; i < 2; ++i) {
+            nk <<= 16;
+            if (offset < end) {
+                nk += (charAt(bytes, offset)) & 0xffff;
+                offset += charSize(bytes, offset);
+            }
+        }
+        return (int) (nk >> 1); // make it always positive.
+    }
+
+    /**
+     * Compares two length-prefixed strings char by char (case-sensitive, decoded chars).
+     *
+     * @param thisBytes array holding the first string
+     * @param thisStart index of the first string's length prefix
+     * @param thatBytes array holding the second string
+     * @param thatStart index of the second string's length prefix
+     * @return a negative, zero or positive value following the usual comparison convention
+     */
+    public static int compareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart) {
+        final boolean useLowerCase = false;
+        final boolean useRawByte = false;
+        return compareTo(thisBytes, thisStart, thatBytes, thatStart, useLowerCase, useRawByte);
+    }
+
+    /**
+     * Compares two strings given without length prefix (case-sensitive, decoded chars).
+     * The starts and lengths are the ones calculated by UTF8StringPointable; the caller must provide
+     * proper values.
+     *
+     * @return a negative, zero or positive value following the usual comparison convention
+     */
+    public static int compareTo(byte[] thisBytes, int thisStart, int thisLength, byte[] thatBytes, int thatStart,
+            int thatLength) {
+        final boolean useLowerCase = false;
+        final boolean useRawByte = false;
+        return compareTo(thisBytes, thisStart, thisLength, thatBytes, thatStart, thatLength, useLowerCase,
+                useRawByte);
+    }
+
+    /**
+     * Raw-bytes-based comparison of two length-prefixed UTF8 strings.
+     * Note that this comparison may not deliver the correct ordering for languages whose characters
+     * take 2 or 3 bytes, but it works for single-byte character languages.
+     *
+     * @return a negative, zero or positive value following the usual comparison convention
+     */
+    public static int rawByteCompareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart) {
+        final boolean useLowerCase = false;
+        final boolean useRawByte = true;
+        return compareTo(thisBytes, thisStart, thatBytes, thatStart, useLowerCase, useRawByte);
+    }
+
+    /**
+     * Compares two length-prefixed strings after lower-casing each decoded char.
+     *
+     * @return a negative, zero or positive value following the usual comparison convention
+     */
+    public static int lowerCaseCompareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart) {
+        final boolean useLowerCase = true;
+        final boolean useRawByte = false;
+        return compareTo(thisBytes, thisStart, thatBytes, thatStart, useLowerCase, useRawByte);
+    }
+
+    /**
+     * Lower-case comparison for strings that carry no length prefix: the length of each string is
+     * passed explicitly as a parameter (e.g., a token inside a string).
+     *
+     * @return a negative, zero or positive value following the usual comparison convention
+     */
+    public static int lowerCaseCompareTo(byte[] thisBytes, int thisStart, int thisLength, byte[] thatBytes,
+            int thatStart, int thatLength) {
+        final boolean useLowerCase = true;
+        final boolean useRawByte = false;
+        return compareTo(thisBytes, thisStart, thisLength, thatBytes, thatStart, thatLength, useLowerCase,
+                useRawByte);
+    }
+
+    /**
+     * Hashes the length-prefixed string at {@code start} (case-sensitive, decoded chars) with the
+     * given polynomial coefficient and modulus.
+     *
+     * @param bytes array holding the string
+     * @param start index of the string's length prefix
+     * @param coefficient multiplier applied to the running hash for each char
+     * @param r modulus applied after each char
+     * @return the hash value
+     */
+    public static int hash(byte[] bytes, int start, int coefficient, int r) {
+        final boolean useLowerCase = false;
+        final boolean useRawByte = false;
+        return hash(bytes, start, useLowerCase, useRawByte, coefficient, r);
+    }
+
+    /**
+     * Hashes the length-prefixed string at {@code start} (case-sensitive, decoded chars) using
+     * coefficient 31 and modulus {@link Integer#MAX_VALUE}.
+     *
+     * @param bytes array holding the string
+     * @param start index of the string's length prefix
+     * @return the hash value
+     */
+    public static int hash(byte[] bytes, int start) {
+        final boolean useLowerCase = false;
+        final boolean useRawByte = false;
+        return hash(bytes, start, useLowerCase, useRawByte, 31, Integer.MAX_VALUE);
+    }
+
+    // Resolves the length prefix at 'start' and hashes the characters that follow it.
+    private static int hash(byte[] bytes, int start, boolean useLowerCase, boolean useRawByte, int coefficient, int r) {
+        final int byteLength = getUTFLength(bytes, start);
+        final int dataStart = start + getNumBytesToStoreLength(byteLength);
+        return hash(bytes, dataStart, byteLength, useLowerCase, useRawByte, coefficient, r);
+    }
+
+    /**
+     * Raw-bytes-based hash function for length-prefixed UTF8 strings: only the first byte of each
+     * character is fed to the hash, so it is intended for single-byte character languages rather
+     * than for languages whose characters take 2 or 3 bytes.
+     *
+     * @param bytes array holding the string
+     * @param start index of the string's length prefix
+     * @return the hash value (coefficient 31, modulus {@link Integer#MAX_VALUE})
+     */
+    public static int rawBytehash(byte[] bytes, int start) {
+        final boolean useLowerCase = false;
+        final boolean useRawByte = true;
+        return hash(bytes, start, useLowerCase, useRawByte, 31, Integer.MAX_VALUE);
+    }
+
+    /**
+     * Hashes the length-prefixed string at {@code start} after lower-casing each decoded char.
+     *
+     * @param bytes array holding the string
+     * @param start index of the string's length prefix
+     * @return the hash value (coefficient 31, modulus {@link Integer#MAX_VALUE})
+     */
+    public static int lowerCaseHash(byte[] bytes, int start) {
+        final boolean useLowerCase = true;
+        final boolean useRawByte = false;
+        return hash(bytes, start, useLowerCase, useRawByte, 31, Integer.MAX_VALUE);
+    }
+
+    /**
+     * Lower-case hash for strings that carry no length prefix: the byte length of the string is
+     * passed explicitly as a parameter.
+     *
+     * @param bytes array holding the characters
+     * @param start index of the first character byte
+     * @param length number of character bytes
+     * @return the hash value (coefficient 31, modulus {@link Integer#MAX_VALUE})
+     */
+    public static int lowerCaseHash(byte[] bytes, int start, int length) {
+        final boolean useLowerCase = true;
+        final boolean useRawByte = false;
+        return hash(bytes, start, length, useLowerCase, useRawByte, 31, Integer.MAX_VALUE);
+    }
+
+    /**
+     * Decodes the length-prefixed string at {@code start} into a Java {@link String}.
+     *
+     * @param bytes array holding the string
+     * @param start index of the string's length prefix
+     * @return the decoded string
+     */
+    public static String toString(byte[] bytes, int start) {
+        return toString(new StringBuilder(), bytes, start).toString();
+    }
+
+    /**
+     * Appends the chars of the length-prefixed string at {@code start} to {@code builder}.
+     *
+     * @param builder the builder to append to
+     * @param bytes array holding the string
+     * @param start index of the string's length prefix
+     * @return {@code builder}, for chaining
+     */
+    public static StringBuilder toString(StringBuilder builder, byte[] bytes, int start) {
+        final int byteLength = getUTFLength(bytes, start);
+        int cursor = start + getNumBytesToStoreLength(byteLength);
+        int remaining = byteLength;
+        while (remaining > 0) {
+            final char decoded = charAt(bytes, cursor);
+            builder.append(decoded);
+            // advance by the encoded size of the char just decoded
+            final int encodedSize = getModifiedUTF8Len(decoded);
+            cursor += encodedSize;
+            remaining -= encodedSize;
+        }
+        return builder;
+    }
+
+    /**
+     * Decodes {@code len} bytes starting at {@code start} into a Java {@link String}.
+     * Unlike the toString() methods, the bytes are assumed to carry no length prefix; this is used
+     * by the string tokenizer, e.g. to get "hello" and "world" out of the bytes of "hello world".
+     *
+     * @param b array holding the characters
+     * @param start index of the first character byte
+     * @param len number of bytes to decode
+     * @return the decoded string
+     */
+    public static String getUTF8StringInArray(byte[] b, int start, int len) {
+        final StringBuilder decoded = new StringBuilder();
+        final int limit = start + len;
+        int cursor = start;
+        while (cursor < limit) {
+            decoded.append(charAt(b, cursor));
+            cursor += charSize(b, cursor);
+        }
+        return decoded.toString();
+    }
+
+    /**
+     * Writes the length-prefixed string at {@code s} to {@code os}, surrounded by double quotes.
+     *
+     * @param b array holding the string
+     * @param s index of the string's length prefix
+     * @param l passed through to the private printer, which does not read it
+     * @param os the destination stream
+     * @throws IOException if writing to {@code os} fails
+     */
+    public static void printUTF8StringWithQuotes(byte[] b, int s, int l, OutputStream os) throws IOException {
+        final boolean useQuotes = true;
+        printUTF8String(b, s, l, os, useQuotes);
+    }
+
+    /**
+     * Writes the length-prefixed string at {@code s} to {@code os} without surrounding quotes.
+     *
+     * @param b array holding the string
+     * @param s index of the string's length prefix
+     * @param l passed through to the private printer, which does not read it
+     * @param os the destination stream
+     * @throws IOException if writing to {@code os} fails
+     */
+    public static void printUTF8StringNoQuotes(byte[] b, int s, int l, OutputStream os) throws IOException {
+        final boolean useQuotes = false;
+        printUTF8String(b, s, l, os, useQuotes);
+    }
+
+    /**
+     * Writes {@code str} to {@code os} as modified UTF-8, surrounded by double quotes.
+     *
+     * @param str the string to write
+     * @param os the destination stream
+     * @throws IOException if writing to {@code os} fails
+     */
+    public static void printUTF8StringWithQuotes(String str, OutputStream os) throws IOException {
+        final boolean useQuotes = true;
+        printUTF8String(str, os, useQuotes);
+    }
+
+    /**
+     * Writes {@code str} to {@code os} as modified UTF-8 without surrounding quotes.
+     *
+     * @param str the string to write
+     * @param os the destination stream
+     * @throws IOException if writing to {@code os} fails
+     */
+    public static void printUTF8StringNoQuotes(String str, OutputStream os) throws IOException {
+        final boolean useQuotes = false;
+        printUTF8String(str, os, useQuotes);
+    }
+
+    /**
+     * Encodes {@code length} into {@code bytes} at index {@code start} using
+     * {@link VarLenIntEncoderDecoder#encode}.
+     *
+     * @param length the length value to encode
+     * @param bytes destination array
+     * @param start index at which the encoded length begins
+     * @return the value returned by the encoder (used by callers as the number of bytes written)
+     */
+    public static int encodeUTF8Length(int length, byte[] bytes, int start) {
+        final int encodedSize = VarLenIntEncoderDecoder.encode(length, bytes, start);
+        return encodedSize;
+    }
+
+    /**
+     * Encodes {@code length} at the beginning of the scratch array {@code bytes} and writes the
+     * encoded bytes to {@code out}.
+     *
+     * @param length the length value to write
+     * @param bytes scratch array that receives the encoded length at index 0
+     * @param out the destination
+     * @return the number of bytes written to {@code out}
+     * @throws IOException if writing to {@code out} fails
+     */
+    public static int writeUTF8Length(int length, byte[] bytes, DataOutput out) throws IOException {
+        final int encodedSize = encodeUTF8Length(length, bytes, 0);
+        out.write(bytes, 0, encodedSize);
+        return encodedSize;
+    }
+
+    // Copies the encoded bytes of the length-prefixed string at 's' to 'os', escaping every
+    // backslash and double quote with a backslash, optionally wrapping the output in double quotes.
+    // The parameter 'l' is not read.
+    private static void printUTF8String(byte[] b, int s, int l, OutputStream os, boolean useQuotes) throws IOException {
+        final int byteLength = getUTFLength(b, s);
+        int cursor = s + getNumBytesToStoreLength(byteLength);
+        final int limit = cursor + byteLength;
+        if (useQuotes) {
+            os.write('\"');
+        }
+        while (cursor < limit) {
+            final char current = charAt(b, cursor);
+            if (current == '\\' || current == '"') {
+                // escape
+                os.write('\\');
+            }
+            // emit the character's bytes one by one, exactly as stored
+            for (int remaining = charSize(b, cursor); remaining > 0; remaining--) {
+                os.write(b[cursor]);
+                cursor++;
+            }
+        }
+        if (useQuotes) {
+            os.write('\"');
+        }
+    }
+
+    // Writes every char of 'string' to 'os' as modified UTF-8 (no escaping), optionally wrapping
+    // the output in double quotes.
+    private static void printUTF8String(String string, OutputStream os, boolean useQuotes) throws IOException {
+        if (useQuotes) {
+            os.write('\"');
+        }
+        int idx = 0;
+        while (idx < string.length()) {
+            writeCharAsModifiedUTF8(string.charAt(idx), os);
+            idx++;
+        }
+        if (useQuotes) {
+            os.write('\"');
+        }
+    }
+
+    // Resolves the length prefix of both strings and compares the characters that follow.
+    private static int compareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart, boolean useLowerCase,
+            boolean useRawByte) {
+        final int thisByteLength = getUTFLength(thisBytes, thisStart);
+        final int thatByteLength = getUTFLength(thatBytes, thatStart);
+        final int thisDataStart = thisStart + getNumBytesToStoreLength(thisByteLength);
+        final int thatDataStart = thatStart + getNumBytesToStoreLength(thatByteLength);
+        return compareTo(thisBytes, thisDataStart, thisByteLength, thatBytes, thatDataStart, thatByteLength,
+                useLowerCase, useRawByte);
+    }
+
+    // Core comparison: walks both strings one character at a time and returns the difference of
+    // the first pair of chars that differ; when one string is exhausted without a difference, the
+    // byte lengths decide. In raw-byte mode only the first byte of each character is compared.
+    private static int compareTo(byte[] thisBytes, int thisActualStart, int thisLength, byte[] thatBytes,
+            int thatActualStart, int thatLength, boolean useLowerCase, boolean useRawByte) {
+        int thisOffset = 0;
+        int thatOffset = 0;
+        while (thisOffset < thisLength && thatOffset < thatLength) {
+            final int thisIdx = thisActualStart + thisOffset;
+            final int thatIdx = thatActualStart + thatOffset;
+            final char left;
+            final char right;
+            if (useRawByte) {
+                left = (char) thisBytes[thisIdx];
+                right = (char) thatBytes[thatIdx];
+            } else if (useLowerCase) {
+                left = Character.toLowerCase(charAt(thisBytes, thisIdx));
+                right = Character.toLowerCase(charAt(thatBytes, thatIdx));
+            } else {
+                left = charAt(thisBytes, thisIdx);
+                right = charAt(thatBytes, thatIdx);
+            }
+            if (left != right) {
+                return left - right;
+            }
+            thisOffset += charSize(thisBytes, thisIdx);
+            thatOffset += charSize(thatBytes, thatIdx);
+        }
+        return thisLength - thatLength;
+    }
+
+    // Core hash: folds every character of the 'length' bytes at 'start' into
+    // h = (coefficient * h + ch) % r. In raw-byte mode only the first byte of each character is used.
+    private static int hash(byte[] bytes, int start, int length, boolean useLowerCase, boolean useRawByte,
+            int coefficient, int r) {
+        int result = 0;
+        for (int offset = 0; offset < length; offset += charSize(bytes, start + offset)) {
+            final char ch;
+            if (useRawByte) {
+                ch = (char) bytes[start + offset];
+            } else if (useLowerCase) {
+                ch = Character.toLowerCase(charAt(bytes, start + offset));
+            } else {
+                ch = charAt(bytes, start + offset);
+            }
+            result = (coefficient * result + ch) % r;
+        }
+        return result;
+    }
+
+    /**
+     * Serializes {@code string} (length prefix followed by the encoded chars) into a new byte array.
+     *
+     * @param string the string to serialize
+     * @return the serialized bytes
+     * @throws RuntimeException wrapping any {@link IOException} raised while writing
+     */
+    public static byte[] writeStringToBytes(String string) {
+        final UTF8StringWriter stringWriter = new UTF8StringWriter();
+        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+        final DataOutputStream sink = new DataOutputStream(buffer);
+        try {
+            stringWriter.writeUTF8(string, sink);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+        return buffer.toByteArray();
+    }
+
+    /**
+     * Reads one string from the stream <code>in</code> without reusing any buffers: this is
+     * {@link #readUTF8(DataInput, UTF8StringReader)} called with no reader.
+     * The character bytes are decoded as 1-, 2- and 3-byte sequences in the style of the
+     * modified UTF-8 format described by <code>DataInput</code>.
+     *
+     * @param in
+     *            a data input stream.
+     * @return the decoded string.
+     * @throws EOFException
+     *             if the input stream reaches the end
+     *             before all the bytes were read.
+     * @throws IOException
+     *             if an I/O error occurs while reading from <code>in</code>.
+     * @throws UTFDataFormatException
+     *             if the bytes do not represent a
+     *             valid encoding of a string.
+     */
+    public static String readUTF8(DataInput in) throws IOException {
+        final UTF8StringReader noReader = null;
+        return readUTF8(in, noReader);
+    }
+
+    /**
+     * Reads one string from <code>in</code>: first the length is decoded with
+     * {@link VarLenIntEncoderDecoder}, then that many bytes are read and decoded into chars.
+     *
+     * @param in
+     *            a data input stream.
+     * @param reader
+     *            optional holder of reusable scratch buffers; when <code>null</code>, fresh buffers
+     *            are allocated for this call.
+     * @return the decoded string.
+     * @throws IOException
+     *             if reading from <code>in</code> fails.
+     * @throws UTFDataFormatException
+     *             if a multi-byte character is cut off at the end, a continuation byte is
+     *             malformed, or a byte cannot start a character.
+     */
+    public static String readUTF8(DataInput in, UTF8StringReader reader) throws IOException {
+        int utflen = VarLenIntEncoderDecoder.decode(in);
+        byte[] bytearr;
+        char[] chararr;
+
+        if (reader == null) {
+            bytearr = new byte[utflen * 2];
+            chararr = new char[utflen * 2];
+        } else {
+            // grow the reader's buffers (both together) only when the byte buffer is missing or too small
+            if (reader.bytearr == null || reader.bytearr.length < utflen) {
+                reader.bytearr = new byte[utflen * 2];
+                reader.chararr = new char[utflen * 2];
+            }
+            bytearr = reader.bytearr;
+            chararr = reader.chararr;
+        }
+
+        int c, char2, char3;
+        int count = 0;
+        int chararr_count = 0;
+
+        in.readFully(bytearr, 0, utflen);
+
+        // fast path: copy the leading run of single-byte (<= 127) characters
+        while (count < utflen) {
+            c = bytearr[count] & 0xff;
+            if (c > 127) {
+                break;
+            }
+            count++;
+            chararr[chararr_count++] = (char) c;
+        }
+
+        // general path: decode the rest, dispatching on the high nibble of the leading byte
+        while (count < utflen) {
+            c = bytearr[count] & 0xff;
+            switch (c >> 4) {
+                case 0:
+                case 1:
+                case 2:
+                case 3:
+                case 4:
+                case 5:
+                case 6:
+                case 7:
+                    /* 0xxxxxxx*/
+                    count++;
+                    chararr[chararr_count++] = (char) c;
+                    break;
+                case 12:
+                case 13:
+                    /* 110x xxxx   10xx xxxx*/
+                    count += 2;
+                    if (count > utflen) {
+                        throw new UTFDataFormatException("malformed input: partial character at end");
+                    }
+                    char2 = bytearr[count - 1];
+                    if ((char2 & 0xC0) != 0x80) {
+                        throw new UTFDataFormatException("malformed input around byte " + count);
+                    }
+                    chararr[chararr_count++] = (char) (((c & 0x1F) << 6) | (char2 & 0x3F));
+                    break;
+                case 14:
+                    /* 1110 xxxx  10xx xxxx  10xx xxxx */
+                    count += 3;
+                    if (count > utflen) {
+                        throw new UTFDataFormatException("malformed input: partial character at end");
+                    }
+                    char2 = bytearr[count - 2];
+                    char3 = bytearr[count - 1];
+                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) {
+                        throw new UTFDataFormatException("malformed input around byte " + (count - 1));
+                    }
+                    chararr[chararr_count++] = (char) (((c & 0x0F) << 12) | ((char2 & 0x3F) << 6) | (char3 & 0x3F));
+                    break;
+                default:
+                    /* 10xx xxxx,  1111 xxxx */
+                    throw new UTFDataFormatException("malformed input around byte " + count);
+            }
+        }
+        // The number of chars produced may be less than utflen
+        return new String(chararr, 0, chararr_count);
+    }
+
+    /**
+     * Writes the string <code>str</code> (length prefix followed by the encoded chars) into the
+     * DataOutput <code>out</code>, without reusing any buffer.
+     *
+     * @param str
+     *            the chars to write.
+     * @param out
+     *            the data output to write to.
+     * @throws IOException
+     *             if writing to <code>out</code> fails.
+     */
+    public static void writeUTF8(CharSequence str, DataOutput out) throws IOException {
+        final UTF8StringWriter noWriter = null;
+        writeUTF8(str, out, noWriter);
+    }
+
+    /**
+     * Writes <code>str</code> into <code>out</code> as a length prefix followed by the chars encoded
+     * as modified UTF-8. Everything is assembled in a temporary buffer and written with one call.
+     *
+     * @param str
+     *            the chars to write.
+     * @param out
+     *            the data output to write to.
+     * @param writer
+     *            optional holder of a reusable temporary buffer; may be <code>null</code>.
+     * @throws IOException
+     *             if writing to <code>out</code> fails.
+     */
+    public static void writeUTF8(CharSequence str, DataOutput out, UTF8StringWriter writer) throws IOException {
+        final int strlen = str.length();
+        // TODO: encoding char by char via str.charAt(i) is not proper UTF-8 for supplementary
+        // characters: such a character is two Java chars (a surrogate pair), each Java char can take
+        // up to 3 bytes here, so e.g. an emoji ends up as 6 bytes instead of the 4 bytes of UTF-8.
+        int utflen = 0;
+        for (int i = 0; i < strlen; i++) {
+            utflen += UTF8StringUtil.getModifiedUTF8Len(str.charAt(i));
+        }
+        final byte[] tempBytes = getTempBytes(writer, utflen);
+        int count = VarLenIntEncoderDecoder.encode(utflen, tempBytes, 0);
+        int i = 0;
+        // fast path: copy the leading run of single-byte characters directly
+        while (i < strlen) {
+            final char c = str.charAt(i);
+            if (c < 0x0001 || c > 0x007F) {
+                break;
+            }
+            tempBytes[count++] = (byte) c;
+            i++;
+        }
+        // general path for the remaining characters
+        while (i < strlen) {
+            count += writeToBytes(tempBytes, count, str.charAt(i));
+            i++;
+        }
+        out.write(tempBytes, 0, count);
+    }
+
+    /**
+     * Writes <code>length</code> chars of <code>buffer</code>, starting at <code>start</code>, into
+     * <code>out</code> as a length prefix followed by the chars encoded as modified UTF-8.
+     *
+     * @param buffer
+     *            source of the chars.
+     * @param start
+     *            index of the first char to write.
+     * @param length
+     *            number of chars to write.
+     * @param out
+     *            the data output to write to.
+     * @param writer
+     *            optional holder of a reusable temporary buffer; may be <code>null</code>.
+     * @throws IOException
+     *             if writing to <code>out</code> fails.
+     */
+    public static void writeUTF8(char[] buffer, int start, int length, DataOutput out, UTF8StringWriter writer)
+            throws IOException {
+        int utflen = 0;
+        for (int i = 0; i < length; i++) {
+            utflen += UTF8StringUtil.getModifiedUTF8Len(buffer[i + start]);
+        }
+        final byte[] tempBytes = getTempBytes(writer, utflen);
+        int count = VarLenIntEncoderDecoder.encode(utflen, tempBytes, 0);
+        int i = 0;
+        // fast path: copy the leading run of single-byte characters directly
+        while (i < length) {
+            final char c = buffer[i + start];
+            if (c < 0x0001 || c > 0x007F) {
+                break;
+            }
+            tempBytes[count++] = (byte) c;
+            i++;
+        }
+        // general path for the remaining characters
+        while (i < length) {
+            count += writeToBytes(tempBytes, count, buffer[i + start]);
+            i++;
+        }
+        out.write(tempBytes, 0, count);
+    }
+
+    // Encodes 'c' as modified UTF-8 into 'tempBytes' starting at index 'count' and returns the
+    // number of bytes written (1, 2 or 3).
+    private static int writeToBytes(byte[] tempBytes, int count, char c) {
+        if (c > 0x07FF) {
+            tempBytes[count] = (byte) (0xE0 | ((c >> 12) & 0x0F));
+            tempBytes[count + 1] = (byte) (0x80 | ((c >> 6) & 0x3F));
+            tempBytes[count + 2] = (byte) (0x80 | (c & 0x3F));
+            return 3;
+        }
+        if ((c >= 0x0001) && (c <= 0x007F)) {
+            tempBytes[count] = (byte) c;
+            return 1;
+        }
+        // U+0000 and U+0080..U+07FF take two bytes
+        tempBytes[count] = (byte) (0xC0 | ((c >> 6) & 0x1F));
+        tempBytes[count + 1] = (byte) (0x80 | (c & 0x3F));
+        return 2;
+    }
+
+    // Returns a buffer of at least utflen + 5 bytes: a fresh one when there is no writer, otherwise
+    // the writer's softly-referenced buffer, replaced when it is missing, collected or too small.
+    // NOTE(review): the 5 extra bytes presumably leave room for the encoded length prefix -- confirm.
+    private static byte[] getTempBytes(UTF8StringWriter writer, int utflen) {
+        final int required = utflen + 5;
+        if (writer == null) {
+            return new byte[required];
+        }
+        byte[] cached = writer.tempBytesRef == null ? null : writer.tempBytesRef.get();
+        if (cached == null || cached.length < required) {
+            cached = new byte[required];
+            writer.tempBytesRef = new SoftReference<>(cached);
+        }
+        return cached;
+    }
+}

diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringWriter.java b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringWriter.java
new file mode 100644
index 0000000..a0cc7d0
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringWriter.java

@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hyracks.util.string;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.Serializable;
+import java.lang.ref.SoftReference;
+
+public class UTF8StringWriter implements Serializable {
+    // Instance-level entry point to UTF8StringUtil.writeUTF8; every call passes this writer along.
+    private static final long serialVersionUID = 1L;
+    transient SoftReference<byte[]> tempBytesRef; // transient: left out of Java serialization
+    /** Forwards {@code str} and {@code out} to {@code UTF8StringUtil.writeUTF8}, together with this writer. */
+    public final void writeUTF8(CharSequence str, DataOutput out) throws IOException {
+        UTF8StringUtil.writeUTF8(str, out, this);
+    }
+    /** Same forwarding for {@code length} chars of {@code buffer} beginning at index {@code start}. */
+    public final void writeUTF8(char[] buffer, int start, int length, DataOutput out) throws IOException {
+        UTF8StringUtil.writeUTF8(buffer, start, length, out, this);
+    }
+    // tempBytesRef is package-private: UTF8StringUtil.getTempBytes reads and replaces it to reuse a scratch array.
+}

diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/main/resources/errormsg/en.properties b/hyracks-fullstack/hyracks/hyracks-api/src/main/resources/errormsg/en.properties
index 4d9c60b..7db5d49 100644
--- a/hyracks-fullstack/hyracks/hyracks-api/src/main/resources/errormsg/en.properties
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/main/resources/errormsg/en.properties

@@ -144,6 +144,8 @@
 124 = Parsing error %s: %s
 125 = Invalid inverted list type traits: %1$s
 126 = Illegal state. %1$s
+127 = Decoding error - %1$s
+
 
 10000 = The given rule collection %1$s is not an instance of the List class.
 10001 = Cannot compose partition constraint %1$s with %2$s

diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringReaderWriterTest.java b/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringReaderWriterTest.java
new file mode 100644
index 0000000..abba958
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringReaderWriterTest.java

@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.api.string;
+
+import static org.apache.hyracks.util.string.UTF8StringSample.EMPTY_STRING;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_127;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_128;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_3;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_LARGE;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_LARGE_SUB_1;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_MEDIUM;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_MEDIUM_SUB_1;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import org.apache.hyracks.util.string.UTF8StringReader;
+import org.apache.hyracks.util.string.UTF8StringWriter;
+import org.junit.Test;
+
+/**
+ * Round-trip tests: each sample string is written with {@link UTF8StringWriter}, once through the
+ * {@code CharSequence} overload and once through the {@code char[]} overload, and read back with
+ * {@link UTF8StringReader}.
+ */
+public class UTF8StringReaderWriterTest {
+
+    private final UTF8StringWriter writer = new UTF8StringWriter();
+    private final UTF8StringReader reader = new UTF8StringReader();
+
+    /** Round-trips the empty string and the samples up to {@code STRING_LEN_MEDIUM_SUB_1}. */
+    @Test
+    public void testWriterReader() throws IOException {
+        writeAndReadOneString(EMPTY_STRING);
+        writeAndReadOneString(STRING_LEN_3);
+
+        writeAndReadOneString(STRING_LEN_127);
+        writeAndReadOneString(STRING_LEN_128);
+        writeAndReadOneString(STRING_LEN_MEDIUM_SUB_1);
+    }
+
+    /** Round-trips {@code STRING_LEN_MEDIUM} and {@code STRING_LEN_LARGE_SUB_1}. */
+    @Test
+    public void testMedium() throws IOException {
+        writeAndReadOneString(STRING_LEN_MEDIUM);
+        writeAndReadOneString(STRING_LEN_LARGE_SUB_1);
+    }
+
+    /** Round-trips {@code STRING_LEN_LARGE}. */
+    @Test
+    public void testLarge() throws IOException {
+        writeAndReadOneString(STRING_LEN_LARGE);
+    }
+
+    /** Round-trips the non-ASCII samples. */
+    @Test
+    public void testUTF8() throws IOException {
+        writeAndReadOneString(STRING_UTF8_3);
+        writeAndReadOneString(STRING_UTF8_MIX);
+    }
+
+    /**
+     * Writes {@code testString} twice into one stream, first as a {@code CharSequence} and then as
+     * a {@code char[]}, and asserts after each write that reading it back yields the same string.
+     *
+     * @param testString the string to round-trip
+     * @throws IOException if writing or reading fails
+     */
+    private void writeAndReadOneString(String testString) throws IOException {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        DataOutputStream dos = new DataOutputStream(bos);
+        writer.writeUTF8(testString, dos);
+
+        ByteArrayInputStream bis = new ByteArrayInputStream(bos.toByteArray(), 0, bos.size());
+        assertEquals(testString, reader.readUTF(new DataInputStream(bis)));
+
+        int lastOffset = bos.size();
+        char[] charArray = testString.toCharArray();
+        writer.writeUTF8(charArray, 0, charArray.length, dos);
+
+        // The third constructor argument is a length, not an end offset, so only the bytes of the
+        // second copy are exposed to the reader.
+        bis = new ByteArrayInputStream(bos.toByteArray(), lastOffset, bos.size() - lastOffset);
+        assertEquals(testString, reader.readUTF(new DataInputStream(bis)));
+    }
+
+}

diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringUtilTest.java b/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringUtilTest.java
new file mode 100644
index 0000000..6f3782b
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringUtilTest.java

@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.api.string;
+
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_127;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_128;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_MEDIUM;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX_LOWERCASE;
+import static org.apache.hyracks.util.string.UTF8StringUtil.charAt;
+import static org.apache.hyracks.util.string.UTF8StringUtil.charSize;
+import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getModifiedUTF8Len;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getNumBytesToStoreLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getNumCodePoint;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getStringLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getUTF8StringInArray;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getUTFLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.hash;
+import static org.apache.hyracks.util.string.UTF8StringUtil.lowerCaseCompareTo;
+import static org.apache.hyracks.util.string.UTF8StringUtil.lowerCaseHash;
+import static org.apache.hyracks.util.string.UTF8StringUtil.normalize;
+import static org.apache.hyracks.util.string.UTF8StringUtil.rawByteCompareTo;
+import static org.apache.hyracks.util.string.UTF8StringUtil.writeStringToBytes;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.util.string.UTF8StringUtil;
+import org.junit.Test;
+
+/**
+ * Unit tests for the static helpers of {@link UTF8StringUtil}, driven by the shared sample strings.
+ */
+public class UTF8StringUtilTest {
+
+    /** Walks the encoded bytes char by char and checks {@code charAt} and {@code charSize} at each position. */
+    @Test
+    public void testCharAtCharSizeGetLen() throws Exception {
+        char[] utf8Mix = STRING_UTF8_MIX.toCharArray();
+        byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
+        // Skip the length prefix so that pos points at the first encoded char.
+        int pos = getNumBytesToStoreLength(getUTFLength(buffer, 0));
+        for (char c : utf8Mix) {
+            assertEquals(c, charAt(buffer, pos));
+            assertEquals(getModifiedUTF8Len(c), charSize(buffer, pos));
+            pos += charSize(buffer, pos);
+        }
+    }
+
+    /** {@code getStringLength} must agree with {@link String#length()}. */
+    @Test
+    public void testGetStringLength() throws Exception {
+        byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
+        assertEquals(STRING_UTF8_MIX.length(), getStringLength(buffer, 0));
+    }
+
+    /** Two different single-character Chinese strings must not compare as equal. */
+    @Test
+    public void testChinese() {
+        byte[] bufferDe = writeStringToBytes("的");
+        byte[] bufferLi = writeStringToBytes("离");
+        int ret = compareTo(bufferDe, 0, bufferLi, 0);
+        assertTrue(ret != 0);
+    }
+
+    /** Checks {@code compareTo} and {@code normalize} against {@link String#compareTo(String)}. */
+    @Test
+    public void testCompareToAndNormolize() throws Exception {
+        testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX, OPTION.STANDARD);
+        testCompare(STRING_UTF8_3, STRING_UTF8_MIX, OPTION.STANDARD);
+        testCompare(STRING_LEN_MEDIUM, STRING_UTF8_MIX, OPTION.STANDARD);
+    }
+
+    /**
+     * Tells whether two comparison results point the same way.
+     *
+     * @param r1 first comparison result
+     * @param r2 second comparison result
+     * @return {@code true} when both are positive, both are negative, or both are zero
+     */
+    public boolean isSameSign(int r1, int r2) {
+        return Integer.signum(r1) == Integer.signum(r2);
+    }
+
+    /** Selects which comparison routine {@link #testCompare} exercises. */
+    enum OPTION {
+        STANDARD,
+        RAW_BYTE,
+        LOWERCASE
+    }
+
+    /**
+     * Encodes both strings and checks the selected comparison routine against the matching
+     * {@link String} comparison.
+     *
+     * @param str1 left operand
+     * @param str2 right operand
+     * @param option the comparison routine to exercise
+     * @throws IOException declared for the callers; nothing visible in this method throws it
+     */
+    public void testCompare(String str1, String str2, OPTION option) throws IOException {
+        byte[] buffer1 = writeStringToBytes(str1);
+        byte[] buffer2 = writeStringToBytes(str2);
+
+        switch (option) {
+            case STANDARD:
+                assertEquals(str1.compareTo(str2), compareTo(buffer1, 0, buffer2, 0));
+                // The normalized keys only have to order the strings the same way.
+                int n1 = normalize(buffer1, 0);
+                int n2 = normalize(buffer2, 0);
+                assertTrue(isSameSign(str1.compareTo(str2), n1 - n2));
+                break;
+            case RAW_BYTE:
+                assertEquals(str1.compareTo(str2), rawByteCompareTo(buffer1, 0, buffer2, 0));
+                break;
+            case LOWERCASE:
+                assertEquals(str1.compareToIgnoreCase(str2), lowerCaseCompareTo(buffer1, 0, buffer2, 0));
+                break;
+        }
+
+    }
+
+    /** Checks {@code rawByteCompareTo} against {@link String#compareTo(String)}. */
+    @Test
+    public void testRawByteCompareTo() throws Exception {
+        testCompare(STRING_LEN_MEDIUM, STRING_LEN_MEDIUM, OPTION.RAW_BYTE);
+        testCompare(STRING_LEN_127, STRING_LEN_128, OPTION.RAW_BYTE);
+    }
+
+    /** Checks {@code lowerCaseCompareTo} against {@link String#compareToIgnoreCase(String)}. */
+    @Test
+    public void testLowerCaseCompareTo() throws Exception {
+        testCompare(STRING_LEN_127, STRING_LEN_128, OPTION.LOWERCASE);
+        testCompare(STRING_LEN_127, STRING_UTF8_MIX, OPTION.LOWERCASE);
+        testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX_LOWERCASE, OPTION.LOWERCASE);
+        testCompare(STRING_UTF8_MIX_LOWERCASE, STRING_UTF8_MIX, OPTION.LOWERCASE);
+    }
+
+    /** Decoding the encoded bytes must give back the original string. */
+    @Test
+    public void testToString() throws Exception {
+
+        StringBuilder sb = new StringBuilder();
+        byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
+        assertEquals(STRING_UTF8_MIX, UTF8StringUtil.toString(sb, buffer, 0).toString());
+    }
+
+    /** Compares {@code hash} with {@code lowerCaseHash} and checks two seeded hashes differ. */
+    @Test
+    public void testHash() throws IOException {
+        byte[] buffer = writeStringToBytes(STRING_UTF8_MIX_LOWERCASE);
+        int lowerHash = hash(buffer, 0);
+
+        // NOTE(review): both hashes are computed from the already-lowercase sample, so case folding
+        // is never exercised here; STRING_UTF8_MIX looks like the intended input - confirm.
+        buffer = writeStringToBytes(STRING_UTF8_MIX_LOWERCASE);
+        int upperHash = lowerCaseHash(buffer, 0);
+        assertEquals(lowerHash, upperHash);
+
+        int familyOne = hash(buffer, 0, 7, 297);
+        int familyTwo = hash(buffer, 0, 8, 297);
+        assertTrue(familyOne != familyTwo);
+    }
+
+    /** Extracts several substrings by byte offset and length from one encoded string. */
+    @Test
+    public void testGetUTF8StringInArray() {
+        String str = null;
+        byte[] bytes = null;
+        List<String> answer = null;
+
+        str = "database group at university of California, Irvine 23333";
+        bytes = writeStringToBytes(str);
+        // First byte in bytes is for the number of bytes of the entire string,
+        // and it should be skipped in getUTF8StringInArray
+        assertEquals("database", getUTF8StringInArray(bytes, 1, 8));
+        assertEquals("at", getUTF8StringInArray(bytes, 16, 2));
+        // test upper case
+        assertEquals("California", getUTF8StringInArray(bytes, 33, 10));
+        // test punctuation
+        assertEquals(",", getUTF8StringInArray(bytes, 43, 1));
+        assertEquals("Irvine", getUTF8StringInArray(bytes, 45, 6));
+        // test number
+        assertEquals("23333", getUTF8StringInArray(bytes, 52, 5));
+    }
+
+    /** Surrogate pairs must count as one code point each, zero-width joiners as one each. */
+    @Test
+    public void testGetNumCodePoint() throws HyracksDataException {
+        // Four emoji joined by three zero-width joiners: 7 code points.
+        String str = "\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66";
+        assertEquals(7, getNumCodePoint(writeStringToBytes(str), 0));
+
+        // Same sequence followed by two more surrogate pairs: 9 code points.
+        str = "\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66\uD83C\uDDE8\uD83C\uDDF3";
+        assertEquals(9, getNumCodePoint(writeStringToBytes(str), 0));
+    }
+
+}
commit	b4a7d81b9925f94f1e06827e95cb54650ad6625f	[log] [tgz]
author	Hussain Towaileb <Hussain.Towaileb@couchbase.com>	Fri Dec 01 07:54:00 2023 +0300
committer	Hussain Towaileb <hussainht@gmail.com>	Fri Dec 01 16:19:48 2023 +0000
tree	f6991376c4023d39df0d8ed34dfe09368133f379
parent	62578ad6659762f5e7fda13421abe9c38cae3605 [diff]