[NO ISSUE]: Move StringUtils to hyracks-api module
Change-Id: Iea1b7db9374332315dfaf56d49f24217f7c0834c
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17990
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Michael Blow <mblow@apache.org>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
diff --git a/hyracks-fullstack/hyracks/hyracks-api/pom.xml b/hyracks-fullstack/hyracks/hyracks-api/pom.xml
index 047f066..131731d 100644
--- a/hyracks-fullstack/hyracks/hyracks-api/pom.xml
+++ b/hyracks-fullstack/hyracks/hyracks-api/pom.xml
@@ -65,6 +65,13 @@
<version>${project.version}</version>
</dependency>
<dependency>
+ <groupId>org.apache.hyracks</groupId>
+ <artifactId>hyracks-util</artifactId>
+ <version>${project.version}</version>
+ <type>test-jar</type>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<scope>test</scope>
diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/ErrorCode.java b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/ErrorCode.java
index 7291473..8170f07 100644
--- a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/ErrorCode.java
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/ErrorCode.java
@@ -154,6 +154,7 @@
PARSING_ERROR(124),
INVALID_INVERTED_LIST_TYPE_TRAITS(125),
ILLEGAL_STATE(126),
+ INVALID_STRING_UNICODE(127),
// Compilation error codes.
RULECOLLECTION_NOT_INSTANCE_OF_LIST(10000),
diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/HyracksException.java b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/HyracksException.java
index 977e5d2..12f1095 100644
--- a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/HyracksException.java
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/exceptions/HyracksException.java
@@ -158,6 +158,10 @@
return message;
}
+ /**
+  * Returns this exception's message without its error-code prefix.
+  * The work is delegated to {@code ErrorMessageUtil.getMessageNoCode} using this exception's component.
+  *
+  * @return the value produced by {@code ErrorMessageUtil.getMessageNoCode(component, getMessage())}
+  */
+ public String getMessageNoCode() {
+     final String fullMessage = getMessage();
+     return ErrorMessageUtil.getMessageNoCode(component, fullMessage);
+ }
+
@Override
public String toString() {
return getLocalizedMessage();
diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/util/ErrorMessageUtil.java b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/util/ErrorMessageUtil.java
index 70b13fa..cb0d579 100644
--- a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/util/ErrorMessageUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/util/ErrorMessageUtil.java
@@ -125,6 +125,13 @@
}
}
+ /**
+  * Removes the leading error-code prefix from a formatted message, i.e. everything up to the first
+  * {@code ':'} plus the one character that follows it.
+  *
+  * @param component the component the message belongs to; when it equals {@code NONE} the message
+  *            is returned untouched
+  * @param message the formatted message, possibly starting with a code prefix
+  * @return the message without its prefix, or {@code message} itself when it is {@code null} or
+  *         contains no {@code ':'}
+  */
+ public static String getMessageNoCode(String component, String message) {
+     if (NONE.equals(component) || message == null) {
+         return message;
+     }
+     final int separator = message.indexOf(':');
+     if (separator < 0) {
+         // no prefix to strip: "-1 + 2" would otherwise silently drop the first character
+         return message;
+     }
+     // skip the ':' and the character after it, without running past the end of the message
+     return message.substring(Math.min(separator + 2, message.length()));
+ }
+
public static String getCauseMessage(Throwable t) {
if (t instanceof IFormattedException) {
return t.getMessage();
diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/util/ExceptionUtils.java b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/util/ExceptionUtils.java
index 7147542..e07cdd4 100644
--- a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/util/ExceptionUtils.java
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/api/util/ExceptionUtils.java
@@ -207,4 +207,16 @@
public static String getMessageOrToString(Throwable e) {
return e instanceof IFormattedException ? e.getMessage() : e.toString();
}
+
+ /**
+ * Checks if the error code of the throwable is of the provided type
+ *
+ * @param throwable throwable with error code
+ * @param code error code to match against
+ *
+ * @return true if error code matches, false otherwise
+ */
+ public static boolean isErrorCode(HyracksDataException throwable, ErrorCode code) {
+ return throwable.getError().isPresent() && throwable.getError().get() == code;
+ }
}
diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringReader.java b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringReader.java
new file mode 100644
index 0000000..43faf18
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringReader.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.string;
+
+import java.io.DataInput;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.Serializable;
+import java.io.UTFDataFormatException;
+
+/**
+ * Serializable helper that reads strings through {@link UTF8StringUtil#readUTF8(DataInput, UTF8StringReader)}
+ * and lends it two scratch buffers so they can be reused between calls.
+ */
+public class UTF8StringReader implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    // Scratch buffers (re)allocated by UTF8StringUtil.readUTF8 when missing or too small.
+    // They are transient, so they are not part of the serialized form.
+    transient byte[] bytearr;
+    transient char[] chararr;
+
+    /**
+     * Reads one string from {@code in} by delegating to
+     * {@link UTF8StringUtil#readUTF8(DataInput, UTF8StringReader)} with this reader as buffer holder.
+     *
+     * @param in
+     *            the input to read the encoded string from
+     * @return the decoded string
+     * @throws EOFException
+     *             if the input ends before all the bytes of the string were read
+     * @throws UTFDataFormatException
+     *             if the bytes are not a valid encoding of a string
+     * @throws IOException
+     *             if any other I/O error occurs
+     */
+    public final String readUTF(DataInput in) throws IOException {
+        return UTF8StringUtil.readUTF8(in, this);
+    }
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
new file mode 100644
index 0000000..4fc503d
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -0,0 +1,711 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.string;
+
+import static org.apache.hyracks.api.exceptions.ErrorCode.INVALID_STRING_UNICODE;
+
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.UTFDataFormatException;
+import java.lang.ref.SoftReference;
+
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder;
+
+/**
+ * A helper class for operating on UTF-8 strings in Hyracks.
+ * Most of the code was migrated from asterix-fuzzyjoin and hyracks-storage-am-invertedindex.
+ */
+public class UTF8StringUtil {
+
+ public static final String MALFORMED_BYTES = "malformed bytes"; // NOTE(review): not referenced in this class; presumably used by callers
+ public static final String LOW_SURROGATE_WITHOUT_HIGH_SURROGATE =
+ "got a low surrogate without a leading high surrogate"; // detail passed with INVALID_STRING_UNICODE
+ public static final String HIGH_SURROGATE_WITHOUT_LOW_SURROGATE =
+ "got a high surrogate without a following low surrogate"; // detail passed with INVALID_STRING_UNICODE
+
+ private UTF8StringUtil() { // static helpers only: private constructor prevents instantiation
+ }
+
+ /**
+  * Decodes the single Java char whose encoding starts at {@code b[s]}.
+  *
+  * @param b the encoded bytes
+  * @param s index of the first byte of the character
+  * @return the decoded char (1-, 2- or 3-byte sequence)
+  * @throws ArrayIndexOutOfBoundsException if {@code s} (or a required continuation byte) is outside {@code b}
+  * @throws IllegalArgumentException if {@code b[s]} is not the first byte of a 1-, 2- or 3-byte sequence
+  */
+ public static char charAt(byte[] b, int s) {
+     if (s >= b.length) {
+         throw new ArrayIndexOutOfBoundsException(s);
+     }
+     final int lead = b[s] & 0xff;
+     final int highNibble = lead >> 4;
+     if (highNibble <= 7) {
+         // 0xxxxxxx: the byte is the char
+         return (char) lead;
+     }
+     if (highNibble == 12 || highNibble == 13) {
+         // 110xxxxx 10xxxxxx
+         return (char) (((lead & 0x1F) << 6) | (b[s + 1] & 0x3F));
+     }
+     if (highNibble == 14) {
+         // 1110xxxx 10xxxxxx 10xxxxxx
+         return (char) (((lead & 0x0F) << 12) | ((b[s + 1] & 0x3F) << 6) | (b[s + 2] & 0x3F));
+     }
+     // 10xxxxxx (continuation byte) or 1111xxxx
+     throw new IllegalArgumentException();
+ }
+
+ /**
+  * Returns how many bytes the character starting at {@code b[s]} occupies, judged from its first byte only.
+  *
+  * @param b the encoded bytes
+  * @param s index of the first byte of the character
+  * @return 1, 2 or 3
+  * @throws IllegalStateException if {@code b[s]} is not the first byte of a 1-, 2- or 3-byte sequence
+  */
+ public static int charSize(byte[] b, int s) {
+     final int highNibble = (b[s] & 0xff) >> 4;
+     if (highNibble <= 7) {
+         return 1;
+     }
+     if (highNibble == 12 || highNibble == 13) {
+         return 2;
+     }
+     if (highNibble == 14) {
+         return 3;
+     }
+     throw new IllegalStateException();
+ }
+
+ /**
+  * Decodes the Unicode code point whose encoding starts at {@code b[s]}; a high surrogate is combined
+  * with the low surrogate that follows it.
+  *
+  * @param b the encoded bytes
+  * @param s index of the first byte of the code point
+  * @return the code point
+  * @throws HyracksDataException with {@code INVALID_STRING_UNICODE} if the char at {@code s} is a low
+  *             surrogate, or is a high surrogate that is not followed by a low surrogate
+  */
+ public static int codePointAt(byte[] b, int s) throws HyracksDataException {
+     final char first = charAt(b, s);
+     if (Character.isLowSurrogate(first)) {
+         // s does not point at the start of a code point
+         throw HyracksDataException.create(INVALID_STRING_UNICODE, LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+     }
+     if (!Character.isHighSurrogate(first)) {
+         return first;
+     }
+     // If the high surrogate is the last char in the array, the charAt call below throws because
+     // there are no bytes left to decode a low surrogate from.
+     final char second = charAt(b, s + charSize(b, s));
+     if (!Character.isLowSurrogate(second)) {
+         throw HyracksDataException.create(INVALID_STRING_UNICODE, HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
+     }
+     return Character.toCodePoint(first, second);
+ }
+
+ /**
+  * Returns the number of bytes occupied by the code point starting at {@code b[s]}. For a high
+  * surrogate this is its own size plus the size of the char that follows; that following char is
+  * sized but not checked to be a low surrogate.
+  *
+  * @param b the encoded bytes
+  * @param s index of the first byte of the code point
+  * @return the encoded size of the code point in bytes
+  * @throws HyracksDataException with {@code INVALID_STRING_UNICODE} if the char at {@code s} is a low surrogate
+  */
+ public static int codePointSize(byte[] b, int s) throws HyracksDataException {
+     final char first = charAt(b, s);
+     final int firstSize = charSize(b, s);
+     if (Character.isLowSurrogate(first)) {
+         throw HyracksDataException.create(INVALID_STRING_UNICODE, LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+     }
+     if (!Character.isHighSurrogate(first)) {
+         return firstSize;
+     }
+     // As in codePointAt: if the high surrogate is the last char in the array, sizing the next
+     // char fails because there are no bytes left.
+     return firstSize + charSize(b, s + firstSize);
+ }
+
+ /**
+  * Tells whether {@code b[s]} can start a character, i.e. is not a continuation byte of the form
+  * {@code 10xxxxxx}.
+  *
+  * @param b the encoded bytes
+  * @param s index of the byte to test
+  * @return {@code false} for a continuation byte, {@code true} otherwise
+  */
+ public static boolean isCharStart(byte[] b, int s) {
+     // keep only the two most significant bits and compare them with the continuation marker "10"
+     return (b[s] & 0xC0) != 0x80;
+ }
+
+ /**
+  * Returns the number of bytes used to encode {@code c} in modified UTF-8.
+  *
+  * @param c the char to measure
+  * @return 1 for {@code 0x0001..0x007F}, 2 for {@code 0x0000} and {@code 0x0080..0x07FF}, 3 otherwise
+  */
+ public static int getModifiedUTF8Len(char c) {
+     if (c > 0x07FF) {
+         return 3;
+     }
+     final boolean singleByte = c >= 0x0001 && c <= 0x007F;
+     return singleByte ? 1 : 2;
+ }
+
+ /**
+  * Writes {@code c} to {@code dos} in modified UTF-8, one {@code writeByte} call per encoded byte.
+  *
+  * @param c the char to encode
+  * @param dos the destination
+  * @return the number of bytes written (1, 2 or 3)
+  * @throws IOException if writing to {@code dos} fails
+  */
+ public static int writeCharAsModifiedUTF8(char c, DataOutput dos) throws IOException {
+     if (c >= 0x0001 && c <= 0x007F) {
+         dos.writeByte(c);
+         return 1;
+     }
+     // every multi-byte form ends with the same continuation byte holding the low six bits
+     final byte lastByte = (byte) (0x80 | (c & 0x3F));
+     if (c > 0x07FF) {
+         dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
+         dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
+         dos.writeByte(lastByte);
+         return 3;
+     }
+     // 0x0000 and 0x0080..0x07FF
+     dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x3F)));
+     dos.writeByte(lastByte);
+     return 2;
+ }
+
+ /**
+  * Writes {@code c} to {@code dos} in modified UTF-8, one {@code write(int)} call per encoded byte.
+  *
+  * @param c the char to encode
+  * @param dos the destination stream
+  * @return the number of bytes written (1, 2 or 3)
+  * @throws IOException if writing to {@code dos} fails
+  */
+ public static int writeCharAsModifiedUTF8(char c, OutputStream dos) throws IOException {
+     if (c >= 0x0001 && c <= 0x007F) {
+         dos.write(c);
+         return 1;
+     }
+     // every multi-byte form ends with the same continuation byte holding the low six bits
+     final byte lastByte = (byte) (0x80 | (c & 0x3F));
+     if (c > 0x07FF) {
+         dos.write((byte) (0xE0 | ((c >> 12) & 0x0F)));
+         dos.write((byte) (0x80 | ((c >> 6) & 0x3F)));
+         dos.write(lastByte);
+         return 3;
+     }
+     // 0x0000 and 0x0080..0x07FF
+     dos.write((byte) (0xC0 | ((c >> 6) & 0x3F)));
+     dos.write(lastByte);
+     return 2;
+ }
+
+ /**
+  * Counts the Java chars (16-bit code units) of the length-prefixed string stored at {@code b[s]}.
+  *
+  * @param b the bytes holding the string
+  * @param s index of the first byte of the length prefix
+  * @return the number of chars in the string
+  */
+ public static int getStringLength(byte[] b, int s) {
+     final int utfLength = getUTFLength(b, s);
+     final int dataStart = s + getNumBytesToStoreLength(utfLength);
+     return getStringLength(b, dataStart, utfLength);
+ }
+
+ /**
+  * Counts the chars encoded in {@code len} bytes starting at {@code b[offs]} (no length prefix).
+  *
+  * @param b the encoded bytes
+  * @param offs index of the first encoded byte
+  * @param len number of encoded bytes
+  * @return the number of chars found by stepping from character start to character start
+  */
+ public static int getStringLength(byte[] b, int offs, int len) {
+     final int end = offs + len;
+     int chars = 0;
+     for (int pos = offs; pos < end; pos += charSize(b, pos)) {
+         chars++;
+     }
+     return chars;
+ }
+
+ /**
+  * Counts the Unicode code points of the length-prefixed string stored at {@code b[s]}.
+  *
+  * @param b the bytes holding the string
+  * @param s index of the first byte of the length prefix
+  * @return the number of code points
+  * @throws HyracksDataException if {@code codePointSize} rejects a position (unpaired low surrogate)
+  */
+ public static int getNumCodePoint(byte[] b, int s) throws HyracksDataException {
+     final int utfLength = getUTFLength(b, s);
+     final int dataStart = s + getNumBytesToStoreLength(utfLength);
+     final int end = dataStart + utfLength;
+     int codePoints = 0;
+     for (int pos = dataStart; pos < end; pos += codePointSize(b, pos)) {
+         codePoints++;
+     }
+     return codePoints;
+ }
+
+ /**
+  * Reads the length prefix stored at {@code b[s]} by delegating to {@code VarLenIntEncoderDecoder.decode}.
+  *
+  * @param b the bytes holding the string
+  * @param s index of the first byte of the length prefix
+  * @return the decoded length
+  */
+ public static int getUTFLength(byte[] b, int s) {
+     final int utfLength = VarLenIntEncoderDecoder.decode(b, s);
+     return utfLength;
+ }
+
+ /**
+  * Returns how many bytes the length prefix for {@code strlen} takes, as reported by
+  * {@code VarLenIntEncoderDecoder.getBytesRequired}.
+  *
+  * @param strlen the length value to be stored
+  * @return the size of the encoded length prefix in bytes
+  */
+ public static int getNumBytesToStoreLength(int strlen) {
+     final int prefixBytes = VarLenIntEncoderDecoder.getBytesRequired(strlen);
+     return prefixBytes;
+ }
+
+ /**
+  * Encodes one code point into {@code outputUTF8}, starting at index 0. The code point is first
+  * split into Java chars with {@link Character#toChars(int, char[], int)}, then every char is encoded.
+  *
+  * @param codePoint the code point to encode
+  * @param tempChars scratch array receiving the one or two chars of the code point
+  * @param outputUTF8 destination of the encoded bytes
+  * @return the number of bytes written to {@code outputUTF8}
+  */
+ public static int codePointToUTF8(int codePoint, char[] tempChars, byte[] outputUTF8) {
+     final int numChars = Character.toChars(codePoint, tempChars, 0);
+     int written = 0;
+     int next = 0;
+     while (next < numChars) {
+         written += writeToBytes(outputUTF8, written, tempChars[next]);
+         next++;
+     }
+     return written;
+ }
+
+ /**
+  * Computes the normalized key of the length-prefixed UTF8 string stored at {@code bytes[start]}.
+  * The normalized key in Hyracks is mainly used to speed up the comparison between pointable data:
+  * the comparator first compares the keys and looks at the actual bytes only when the keys are equal,
+  * so the key must be consistent with the comparison result.
+  * The key packs the first two chars of the string (a missing char counts as 0) into 32 bits and
+  * shifts the result right by one so that it is never negative.
+  *
+  * @param bytes the bytes holding the string
+  * @param start index of the first byte of the length prefix
+  * @return the non-negative normalized key
+  */
+ public static int normalize(byte[] bytes, int start) {
+     final int len = getUTFLength(bytes, start);
+     int offset = start + getNumBytesToStoreLength(len);
+     final int end = offset + len;
+     long nk = 0;
+     for (int i = 0; i < 2; ++i) {
+         nk <<= 16;
+         // Bound by the byte offset, not by the char index: len is a byte count, so for a string made
+         // of a single multi-byte char "i < len" would decode a second char from bytes past the string.
+         if (offset < end) {
+             nk += charAt(bytes, offset) & 0xffff;
+             offset += charSize(bytes, offset);
+         }
+     }
+     return (int) (nk >> 1); // make it always positive.
+ }
+
+ /**
+  * Compares two length-prefixed strings char by char, case-sensitively.
+  *
+  * @param thisBytes bytes holding the first string
+  * @param thisStart index of the first string's length prefix
+  * @param thatBytes bytes holding the second string
+  * @param thatStart index of the second string's length prefix
+  * @return a negative, zero or positive value as the first string sorts before, equal to or after the second
+  */
+ public static int compareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart) {
+     final boolean useLowerCase = false;
+     final boolean useRawByte = false;
+     return compareTo(thisBytes, thisStart, thatBytes, thatStart, useLowerCase, useRawByte);
+ }
+
+ /**
+  * Compares two strings given without length prefix, char by char and case-sensitively.
+  * The starts and lengths are the ones calculated by UTF8StringPointable; the caller must provide
+  * proper values.
+  *
+  * @param thisBytes bytes holding the first string
+  * @param thisStart index of the first encoded byte of the first string
+  * @param thisLength encoded length of the first string in bytes
+  * @param thatBytes bytes holding the second string
+  * @param thatStart index of the first encoded byte of the second string
+  * @param thatLength encoded length of the second string in bytes
+  * @return a negative, zero or positive comparison result
+  */
+ public static int compareTo(byte[] thisBytes, int thisStart, int thisLength, byte[] thatBytes, int thatStart,
+         int thatLength) {
+     final boolean useLowerCase = false;
+     final boolean useRawByte = false;
+     return compareTo(thisBytes, thisStart, thisLength, thatBytes, thatStart, thatLength, useLowerCase,
+             useRawByte);
+ }
+
+ /**
+  * Raw-byte comparison of two length-prefixed UTF8 strings: the first byte of each encoded
+  * character is compared instead of the decoded char.
+  * Note that this may not deliver the correct ordering for languages that use 2- or 3-byte
+  * characters, but it works for single-byte character languages.
+  *
+  * @param thisBytes bytes holding the first string
+  * @param thisStart index of the first string's length prefix
+  * @param thatBytes bytes holding the second string
+  * @param thatStart index of the second string's length prefix
+  * @return a negative, zero or positive comparison result
+  */
+ public static int rawByteCompareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart) {
+     final boolean useLowerCase = false;
+     final boolean useRawByte = true;
+     return compareTo(thisBytes, thisStart, thatBytes, thatStart, useLowerCase, useRawByte);
+ }
+
+ /**
+  * Compares two length-prefixed strings char by char after lower-casing each char.
+  *
+  * @param thisBytes bytes holding the first string
+  * @param thisStart index of the first string's length prefix
+  * @param thatBytes bytes holding the second string
+  * @param thatStart index of the second string's length prefix
+  * @return a negative, zero or positive comparison result
+  */
+ public static int lowerCaseCompareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart) {
+     final boolean useLowerCase = true;
+     final boolean useRawByte = false;
+     return compareTo(thisBytes, thisStart, thatBytes, thatStart, useLowerCase, useRawByte);
+ }
+
+ /**
+  * Lower-case comparison for strings that carry no length prefix and whose length is therefore
+  * given explicitly (e.g. a token inside a string).
+  *
+  * @param thisBytes bytes holding the first string
+  * @param thisStart index of the first encoded byte of the first string
+  * @param thisLength encoded length of the first string in bytes
+  * @param thatBytes bytes holding the second string
+  * @param thatStart index of the first encoded byte of the second string
+  * @param thatLength encoded length of the second string in bytes
+  * @return a negative, zero or positive comparison result
+  */
+ public static int lowerCaseCompareTo(byte[] thisBytes, int thisStart, int thisLength, byte[] thatBytes,
+         int thatStart, int thatLength) {
+     final boolean useLowerCase = true;
+     final boolean useRawByte = false;
+     return compareTo(thisBytes, thisStart, thisLength, thatBytes, thatStart, thatLength, useLowerCase,
+             useRawByte);
+ }
+
+ /**
+  * Hashes the length-prefixed string at {@code bytes[start]} char by char, case-sensitively, with a
+  * caller-chosen multiplier and modulus.
+  *
+  * @param bytes bytes holding the string
+  * @param start index of the length prefix
+  * @param coefficient multiplier applied to the running hash before each char is added
+  * @param r modulus applied after each char
+  * @return the hash value
+  */
+ public static int hash(byte[] bytes, int start, int coefficient, int r) {
+     final boolean useLowerCase = false;
+     final boolean useRawByte = false;
+     return hash(bytes, start, useLowerCase, useRawByte, coefficient, r);
+ }
+
+ /**
+  * Hashes the length-prefixed string at {@code bytes[start]} char by char, case-sensitively, using
+  * multiplier 31 and modulus {@code Integer.MAX_VALUE}.
+  *
+  * @param bytes bytes holding the string
+  * @param start index of the length prefix
+  * @return the hash value
+  */
+ public static int hash(byte[] bytes, int start) {
+     final boolean useLowerCase = false;
+     final boolean useRawByte = false;
+     return hash(bytes, start, useLowerCase, useRawByte, 31, Integer.MAX_VALUE);
+ }
+
+ private static int hash(byte[] bytes, int start, boolean useLowerCase, boolean useRawByte, int coefficient, int r) {
+ int utflen = getUTFLength(bytes, start);
+ int sStart = start + getNumBytesToStoreLength(utflen);
+ return hash(bytes, sStart, utflen, useLowerCase, useRawByte, coefficient, r);
+ }
+
+ /**
+  * Raw-byte hash of a length-prefixed UTF8 string: the first byte of each encoded character is
+  * hashed instead of the decoded char, with multiplier 31 and modulus {@code Integer.MAX_VALUE}.
+  * Like the raw-byte comparison, this is only faithful for single-byte character languages.
+  *
+  * @param bytes bytes holding the string
+  * @param start index of the length prefix
+  * @return the hash value
+  */
+ public static int rawBytehash(byte[] bytes, int start) {
+     final boolean useLowerCase = false;
+     final boolean useRawByte = true;
+     return hash(bytes, start, useLowerCase, useRawByte, 31, Integer.MAX_VALUE);
+ }
+
+ /**
+  * Hashes the length-prefixed string at {@code bytes[start]} after lower-casing each char, using
+  * multiplier 31 and modulus {@code Integer.MAX_VALUE}.
+  *
+  * @param bytes bytes holding the string
+  * @param start index of the length prefix
+  * @return the hash value
+  */
+ public static int lowerCaseHash(byte[] bytes, int start) {
+     final boolean useLowerCase = true;
+     final boolean useRawByte = false;
+     return hash(bytes, start, useLowerCase, useRawByte, 31, Integer.MAX_VALUE);
+ }
+
+ /**
+  * Lower-case hash for strings that carry no length prefix and whose length is therefore given
+  * explicitly; uses multiplier 31 and modulus {@code Integer.MAX_VALUE}.
+  *
+  * @param bytes bytes holding the string
+  * @param start index of the first encoded byte
+  * @param length encoded length in bytes
+  * @return the hash value
+  */
+ public static int lowerCaseHash(byte[] bytes, int start, int length) {
+     final boolean useLowerCase = true;
+     final boolean useRawByte = false;
+     return hash(bytes, start, length, useLowerCase, useRawByte, 31, Integer.MAX_VALUE);
+ }
+
+ /**
+  * Decodes the length-prefixed string stored at {@code bytes[start]} into a {@link String}.
+  *
+  * @param bytes bytes holding the string
+  * @param start index of the length prefix
+  * @return the decoded string
+  */
+ public static String toString(byte[] bytes, int start) {
+     return toString(new StringBuilder(), bytes, start).toString();
+ }
+
+ /**
+  * Decodes the length-prefixed string stored at {@code bytes[start]} and appends its chars to
+  * {@code builder}. The position advances by the modified UTF-8 length of each decoded char.
+  *
+  * @param builder receives the decoded chars
+  * @param bytes bytes holding the string
+  * @param start index of the length prefix
+  * @return {@code builder}, for chaining
+  */
+ public static StringBuilder toString(StringBuilder builder, byte[] bytes, int start) {
+     int remaining = getUTFLength(bytes, start);
+     int position = start + getNumBytesToStoreLength(remaining);
+     while (remaining > 0) {
+         final char decoded = charAt(bytes, position);
+         builder.append(decoded);
+         final int consumed = getModifiedUTF8Len(decoded);
+         position += consumed;
+         remaining -= consumed;
+     }
+     return builder;
+ }
+
+ /**
+  * Decodes {@code len} bytes starting at {@code b[start]} into a {@link String}. Unlike the
+  * {@code toString} methods, the bytes are assumed not to start with a length prefix. This is used
+  * by the string tokenizer: get "hello" and "world" from the bytes of "hello world".
+  *
+  * @param b the encoded bytes
+  * @param start index of the first encoded byte
+  * @param len number of encoded bytes to decode
+  * @return the decoded string
+  */
+ public static String getUTF8StringInArray(byte[] b, int start, int len) {
+     final StringBuilder decoded = new StringBuilder();
+     int position = start;
+     while (position < start + len) {
+         decoded.append(UTF8StringUtil.charAt(b, position));
+         position += UTF8StringUtil.charSize(b, position);
+     }
+     return decoded.toString();
+ }
+
+ /**
+  * Writes the length-prefixed string at {@code b[s]} to {@code os}, surrounded by double quotes.
+  *
+  * @param b bytes holding the string
+  * @param s index of the length prefix
+  * @param l passed through to the private helper
+  * @param os destination stream
+  * @throws IOException if writing to {@code os} fails
+  */
+ public static void printUTF8StringWithQuotes(byte[] b, int s, int l, OutputStream os) throws IOException {
+     final boolean useQuotes = true;
+     printUTF8String(b, s, l, os, useQuotes);
+ }
+
+ /**
+  * Writes the length-prefixed string at {@code b[s]} to {@code os} without surrounding quotes.
+  *
+  * @param b bytes holding the string
+  * @param s index of the length prefix
+  * @param l passed through to the private helper
+  * @param os destination stream
+  * @throws IOException if writing to {@code os} fails
+  */
+ public static void printUTF8StringNoQuotes(byte[] b, int s, int l, OutputStream os) throws IOException {
+     final boolean useQuotes = false;
+     printUTF8String(b, s, l, os, useQuotes);
+ }
+
+ /**
+  * Writes {@code str} to {@code os} in modified UTF-8, surrounded by double quotes.
+  *
+  * @param str the string to write
+  * @param os destination stream
+  * @throws IOException if writing to {@code os} fails
+  */
+ public static void printUTF8StringWithQuotes(String str, OutputStream os) throws IOException {
+     final boolean useQuotes = true;
+     printUTF8String(str, os, useQuotes);
+ }
+
+ /**
+  * Writes {@code str} to {@code os} in modified UTF-8 without surrounding quotes.
+  *
+  * @param str the string to write
+  * @param os destination stream
+  * @throws IOException if writing to {@code os} fails
+  */
+ public static void printUTF8StringNoQuotes(String str, OutputStream os) throws IOException {
+     final boolean useQuotes = false;
+     printUTF8String(str, os, useQuotes);
+ }
+
+ /**
+  * Encodes {@code length} as a length prefix into {@code bytes} at {@code start} by delegating to
+  * {@code VarLenIntEncoderDecoder.encode}.
+  *
+  * @param length the length value to encode
+  * @param bytes destination array
+  * @param start index in {@code bytes} where the prefix begins
+  * @return the value returned by the encoder (used by callers as the number of bytes written)
+  */
+ public static int encodeUTF8Length(int length, byte[] bytes, int start) {
+     final int prefixBytes = VarLenIntEncoderDecoder.encode(length, bytes, start);
+     return prefixBytes;
+ }
+
+ /**
+  * Encodes {@code length} as a length prefix into the scratch array {@code bytes} (from index 0)
+  * and writes the encoded prefix to {@code out}.
+  *
+  * @param length the length value to write
+  * @param bytes scratch array that receives the encoded prefix
+  * @param out destination of the encoded prefix
+  * @return the number of bytes written to {@code out}
+  * @throws IOException if writing to {@code out} fails
+  */
+ public static int writeUTF8Length(int length, byte[] bytes, DataOutput out) throws IOException {
+     final int prefixBytes = encodeUTF8Length(length, bytes, 0);
+     out.write(bytes, 0, prefixBytes);
+     return prefixBytes;
+ }
+
+ private static void printUTF8String(byte[] b, int s, int l, OutputStream os, boolean useQuotes) throws IOException {
+ int stringLength = getUTFLength(b, s);
+ int position = s + getNumBytesToStoreLength(stringLength);
+ int maxPosition = position + stringLength;
+ if (useQuotes) {
+ os.write('\"');
+ }
+ while (position < maxPosition) {
+ char c = charAt(b, position);
+ if (c == '\\' || c == '"') {
+ // escape
+ os.write('\\');
+ }
+ int sz = charSize(b, position);
+ while (sz > 0) {
+ os.write(b[position]);
+ position++;
+ sz--;
+ }
+ }
+ if (useQuotes) {
+ os.write('\"');
+ }
+ }
+
+ private static void printUTF8String(String string, OutputStream os, boolean useQuotes) throws IOException {
+ if (useQuotes) {
+ os.write('\"');
+ }
+ for (int i = 0; i < string.length(); i++) {
+ char ch = string.charAt(i);
+ writeCharAsModifiedUTF8(ch, os);
+ }
+ if (useQuotes) {
+ os.write('\"');
+ }
+ }
+
+ private static int compareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart, boolean useLowerCase,
+ boolean useRawByte) {
+ int thisLength = getUTFLength(thisBytes, thisStart);
+ int thatLength = getUTFLength(thatBytes, thatStart);
+ int thisActualStart = thisStart + getNumBytesToStoreLength(thisLength);
+ int thatActualStart = thatStart + getNumBytesToStoreLength(thatLength);
+ return compareTo(thisBytes, thisActualStart, thisLength, thatBytes, thatActualStart, thatLength, useLowerCase,
+ useRawByte);
+ }
+
+ private static int compareTo(byte[] thisBytes, int thisActualStart, int thisLength, byte[] thatBytes,
+ int thatActualStart, int thatLength, boolean useLowerCase, boolean useRawByte) {
+ int c1 = 0;
+ int c2 = 0;
+
+ while (c1 < thisLength && c2 < thatLength) {
+ char ch1, ch2;
+ if (useRawByte) {
+ ch1 = (char) thisBytes[thisActualStart + c1];
+ ch2 = (char) thatBytes[thatActualStart + c2];
+ } else {
+ ch1 = charAt(thisBytes, thisActualStart + c1);
+ ch2 = charAt(thatBytes, thatActualStart + c2);
+
+ if (useLowerCase) {
+ ch1 = Character.toLowerCase(ch1);
+ ch2 = Character.toLowerCase(ch2);
+ }
+ }
+
+ if (ch1 != ch2) {
+ return ch1 - ch2;
+ }
+ c1 += charSize(thisBytes, thisActualStart + c1);
+ c2 += charSize(thatBytes, thatActualStart + c2);
+ }
+ return thisLength - thatLength;
+ }
+
+ private static int hash(byte[] bytes, int start, int length, boolean useLowerCase, boolean useRawByte,
+ int coefficient, int r) {
+ int h = 0;
+ int c = 0;
+
+ while (c < length) {
+ char ch;
+ if (useRawByte) {
+ ch = (char) bytes[start + c];
+ } else {
+ ch = charAt(bytes, start + c);
+ if (useLowerCase) {
+ ch = Character.toLowerCase(ch);
+ }
+ }
+ h = (coefficient * h + ch) % r;
+ c += charSize(bytes, start + c);
+ }
+ return h;
+ }
+
+ /**
+  * Serializes {@code string} with a fresh {@code UTF8StringWriter} into a new byte array.
+  *
+  * @param string the string to serialize
+  * @return the bytes produced by {@code UTF8StringWriter.writeUTF8}
+  * @throws RuntimeException wrapping any {@link IOException} raised while writing
+  */
+ public static byte[] writeStringToBytes(String string) {
+     final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+     final DataOutputStream sink = new DataOutputStream(buffer);
+     try {
+         new UTF8StringWriter().writeUTF8(string, sink);
+     } catch (IOException e) {
+         throw new RuntimeException(e);
+     }
+     return buffer.toByteArray();
+ }
+
+ /**
+  * Reads one string from {@code in} without reusing buffers: delegates to
+  * {@link #readUTF8(DataInput, UTF8StringReader)} with a {@code null} reader, so fresh scratch
+  * arrays are allocated for this call. The character bytes are decoded like the modified UTF-8 of
+  * {@code DataInput.readUTF}; the length, however, is read with {@code VarLenIntEncoderDecoder}.
+  *
+  * @param in
+  *            a data input stream.
+  * @return a Unicode string.
+  * @throws EOFException
+  *             if the input stream reaches the end before all the bytes.
+  * @throws IOException
+  *             if an I/O error occurs.
+  * @throws UTFDataFormatException
+  *             if the bytes do not represent a valid modified UTF-8 encoding of a Unicode string.
+  */
+ public static String readUTF8(DataInput in) throws IOException {
+     final UTF8StringReader noBufferHolder = null;
+     return readUTF8(in, noBufferHolder);
+ }
+
+ public static String readUTF8(DataInput in, UTF8StringReader reader) throws IOException { // reads a length-prefixed string from in; reader may be null
+ int utflen = VarLenIntEncoderDecoder.decode(in); // length prefix: number of encoded bytes that follow
+ byte[] bytearr;
+ char[] chararr;
+
+ if (reader == null) { // no buffer holder: allocate scratch arrays for this call only
+ bytearr = new byte[utflen * 2];
+ chararr = new char[utflen * 2];
+ } else {
+ if (reader.bytearr == null || reader.bytearr.length < utflen) { // only bytearr is checked; both arrays are always (re)allocated together below
+ reader.bytearr = new byte[utflen * 2];
+ reader.chararr = new char[utflen * 2];
+ }
+ bytearr = reader.bytearr;
+ chararr = reader.chararr;
+ }
+
+ int c, char2, char3;
+ int count = 0; // bytes consumed from bytearr
+ int chararr_count = 0; // chars produced into chararr
+
+ in.readFully(bytearr, 0, utflen);
+
+ while (count < utflen) { // fast path: copy leading single-byte chars until the first byte above 127
+ c = bytearr[count] & 0xff;
+ if (c > 127) {
+ break;
+ }
+ count++;
+ chararr[chararr_count++] = (char) c;
+ }
+
+ while (count < utflen) { // general path: decode 1-, 2- and 3-byte sequences
+ c = bytearr[count] & 0xff;
+ switch (c >> 4) {
+ case 0:
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ case 7:
+ /* 0xxxxxxx*/
+ count++;
+ chararr[chararr_count++] = (char) c;
+ break;
+ case 12:
+ case 13:
+ /* 110x xxxx 10xx xxxx*/
+ count += 2;
+ if (count > utflen) { // the sequence would extend past the declared length
+ throw new UTFDataFormatException("malformed input: partial character at end");
+ }
+ char2 = bytearr[count - 1];
+ if ((char2 & 0xC0) != 0x80) { // second byte must be a continuation byte (10xxxxxx)
+ throw new UTFDataFormatException("malformed input around byte " + count);
+ }
+ chararr[chararr_count++] = (char) (((c & 0x1F) << 6) | (char2 & 0x3F));
+ break;
+ case 14:
+ /* 1110 xxxx 10xx xxxx 10xx xxxx */
+ count += 3;
+ if (count > utflen) { // the sequence would extend past the declared length
+ throw new UTFDataFormatException("malformed input: partial character at end");
+ }
+ char2 = bytearr[count - 2];
+ char3 = bytearr[count - 1];
+ if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) { // both trailing bytes must be continuation bytes
+ throw new UTFDataFormatException("malformed input around byte " + (count - 1));
+ }
+ chararr[chararr_count++] = (char) (((c & 0x0F) << 12) | ((char2 & 0x3F) << 6) | (char3 & 0x3F));
+ break;
+ default:
+ /* 10xx xxxx, 1111 xxxx */
+ throw new UTFDataFormatException("malformed input around byte " + count);
+ }
+ }
+ // The number of chars produced may be less than utflen
+ return new String(chararr, 0, chararr_count);
+ }
+
+ /**
+  * Writes {@code str} to {@code out} as a length-prefixed modified UTF-8 string, using a scratch
+  * buffer allocated for this call (no {@code UTF8StringWriter} is involved).
+  *
+  * @param str
+  *            a Unicode string;
+  * @param out
+  *            a Data output stream.
+  * @throws IOException
+  *             if writing to {@code out} fails
+  */
+ public static void writeUTF8(CharSequence str, DataOutput out) throws IOException {
+     final UTF8StringWriter noBufferHolder = null;
+     writeUTF8(str, out, noBufferHolder);
+ }
+
+ /**
+  * Writes {@code str} to {@code out} as a length prefix followed by the modified UTF-8 encoding of
+  * its chars. Everything is assembled in a scratch buffer and handed to {@code out} in one write.
+  *
+  * @param str the chars to write
+  * @param out destination
+  * @param writer holder of a reusable scratch buffer, or {@code null} to allocate one for this call
+  * @throws IOException if writing to {@code out} fails
+  */
+ public static void writeUTF8(CharSequence str, DataOutput out, UTF8StringWriter writer) throws IOException {
+     final int strlen = str.length();
+     // First pass: size of the encoded data.
+     // TODO: str.charAt(i) yields 16-bit Java chars, so a code point outside the BMP (e.g. an emoji)
+     // arrives as two surrogate chars that are encoded separately, 3 bytes each (6 bytes in total),
+     // instead of as one 4-byte UTF-8 sequence.
+     int utflen = 0;
+     for (int k = 0; k < strlen; k++) {
+         utflen += UTF8StringUtil.getModifiedUTF8Len(str.charAt(k));
+     }
+     final byte[] tempBytes = getTempBytes(writer, utflen);
+     int count = VarLenIntEncoderDecoder.encode(utflen, tempBytes, 0);
+     int i = 0;
+     // Second pass, fast path: copy the leading run of single-byte chars directly.
+     while (i < strlen) {
+         final char c = str.charAt(i);
+         if (c < 0x0001 || c > 0x007F) {
+             break;
+         }
+         tempBytes[count++] = (byte) c;
+         i++;
+     }
+     // Second pass, general path: encode whatever follows the first multi-byte char.
+     while (i < strlen) {
+         count += writeToBytes(tempBytes, count, str.charAt(i));
+         i++;
+     }
+     out.write(tempBytes, 0, count);
+ }
+
+ /**
+  * Writes {@code length} chars of {@code buffer}, starting at {@code start}, to {@code out} as a
+  * length prefix followed by their modified UTF-8 encoding. Everything is assembled in a scratch
+  * buffer and handed to {@code out} in one write.
+  *
+  * @param buffer source chars
+  * @param start index of the first char to write
+  * @param length number of chars to write
+  * @param out destination
+  * @param writer holder of a reusable scratch buffer, or {@code null} to allocate one for this call
+  * @throws IOException if writing to {@code out} fails
+  */
+ public static void writeUTF8(char[] buffer, int start, int length, DataOutput out, UTF8StringWriter writer)
+         throws IOException {
+     // First pass: size of the encoded data.
+     int utflen = 0;
+     for (int k = 0; k < length; k++) {
+         utflen += UTF8StringUtil.getModifiedUTF8Len(buffer[k + start]);
+     }
+     final byte[] tempBytes = getTempBytes(writer, utflen);
+     int count = VarLenIntEncoderDecoder.encode(utflen, tempBytes, 0);
+     int i = 0;
+     // Second pass, fast path: copy the leading run of single-byte chars directly.
+     while (i < length) {
+         final char c = buffer[i + start];
+         if (c < 0x0001 || c > 0x007F) {
+             break;
+         }
+         tempBytes[count++] = (byte) c;
+         i++;
+     }
+     // Second pass, general path: encode whatever follows the first multi-byte char.
+     while (i < length) {
+         count += writeToBytes(tempBytes, count, buffer[i + start]);
+         i++;
+     }
+     out.write(tempBytes, 0, count);
+ }
+
+ /**
+  * Encodes {@code c} in modified UTF-8 into {@code tempBytes}, starting at index {@code count}.
+  *
+  * @param tempBytes destination array
+  * @param count index of the first byte to write
+  * @param c the char to encode
+  * @return the number of bytes written (1, 2 or 3)
+  */
+ private static int writeToBytes(byte[] tempBytes, int count, char c) {
+     if ((c >= 0x0001) && (c <= 0x007F)) {
+         tempBytes[count] = (byte) c;
+         return 1;
+     }
+     if (c > 0x07FF) {
+         tempBytes[count] = (byte) (0xE0 | ((c >> 12) & 0x0F));
+         tempBytes[count + 1] = (byte) (0x80 | ((c >> 6) & 0x3F));
+         tempBytes[count + 2] = (byte) (0x80 | (c & 0x3F));
+         return 3;
+     }
+     // 0x0000 and 0x0080..0x07FF
+     tempBytes[count] = (byte) (0xC0 | ((c >> 6) & 0x1F));
+     tempBytes[count + 1] = (byte) (0x80 | (c & 0x3F));
+     return 2;
+ }
+
+ private static byte[] getTempBytes(UTF8StringWriter writer, int utflen) {
+ byte[] tempBytes;
+ if (writer == null) {
+ tempBytes = new byte[utflen + 5];
+ } else {
+ byte[] writerTempBytes = writer.tempBytesRef != null ? writer.tempBytesRef.get() : null;
+ if (writerTempBytes == null || writerTempBytes.length < utflen + 5) {
+ writerTempBytes = new byte[utflen + 5];
+ writer.tempBytesRef = new SoftReference<>(writerTempBytes);
+ }
+ tempBytes = writerTempBytes;
+ }
+ return tempBytes;
+ }
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringWriter.java b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringWriter.java
new file mode 100644
index 0000000..a0cc7d0
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringWriter.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hyracks.util.string;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.Serializable;
+import java.lang.ref.SoftReference;
+
+/**
+ * Serializable helper that writes strings through {@link UTF8StringUtil} and holds the scratch
+ * buffer those writes reuse between calls.
+ */
+public class UTF8StringWriter implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    // Softly referenced scratch buffer, created and replaced by UTF8StringUtil.
+    // It is transient, so it is not part of the serialized form.
+    transient SoftReference<byte[]> tempBytesRef;
+
+    /**
+     * Writes {@code str} to {@code out} via
+     * {@link UTF8StringUtil#writeUTF8(CharSequence, DataOutput, UTF8StringWriter)}.
+     *
+     * @param str the chars to write
+     * @param out destination
+     * @throws IOException if writing to {@code out} fails
+     */
+    public final void writeUTF8(CharSequence str, DataOutput out) throws IOException {
+        UTF8StringUtil.writeUTF8(str, out, this);
+    }
+
+    /**
+     * Writes {@code length} chars of {@code buffer}, starting at {@code start}, to {@code out} via
+     * {@link UTF8StringUtil#writeUTF8(char[], int, int, DataOutput, UTF8StringWriter)}.
+     *
+     * @param buffer source chars
+     * @param start index of the first char to write
+     * @param length number of chars to write
+     * @param out destination
+     * @throws IOException if writing to {@code out} fails
+     */
+    public final void writeUTF8(char[] buffer, int start, int length, DataOutput out) throws IOException {
+        UTF8StringUtil.writeUTF8(buffer, start, length, out, this);
+    }
+
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/main/resources/errormsg/en.properties b/hyracks-fullstack/hyracks/hyracks-api/src/main/resources/errormsg/en.properties
index 4d9c60b..7db5d49 100644
--- a/hyracks-fullstack/hyracks/hyracks-api/src/main/resources/errormsg/en.properties
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/main/resources/errormsg/en.properties
@@ -144,6 +144,8 @@
124 = Parsing error %s: %s
125 = Invalid inverted list type traits: %1$s
126 = Illegal state. %1$s
+127 = Decoding error - %1$s
+
10000 = The given rule collection %1$s is not an instance of the List class.
10001 = Cannot compose partition constraint %1$s with %2$s
diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringReaderWriterTest.java b/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringReaderWriterTest.java
new file mode 100644
index 0000000..abba958
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringReaderWriterTest.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.api.string;
+
+import static org.apache.hyracks.util.string.UTF8StringSample.EMPTY_STRING;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_127;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_128;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_3;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_LARGE;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_LARGE_SUB_1;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_MEDIUM;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_MEDIUM_SUB_1;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import org.apache.hyracks.util.string.UTF8StringReader;
+import org.apache.hyracks.util.string.UTF8StringWriter;
+import org.junit.Test;
+
+public class UTF8StringReaderWriterTest {
+    // Round-trip tests: each sample is encoded by UTF8StringWriter and decoded again by UTF8StringReader.
+    private final UTF8StringWriter writer = new UTF8StringWriter();
+    private final UTF8StringReader reader = new UTF8StringReader();
+    /** Round-trips EMPTY_STRING, STRING_LEN_3, STRING_LEN_127, STRING_LEN_128 and STRING_LEN_MEDIUM_SUB_1. */
+    @Test
+    public void testWriterReader() throws IOException {
+        writeAndReadOneString(EMPTY_STRING);
+        writeAndReadOneString(STRING_LEN_3);
+        writeAndReadOneString(STRING_LEN_127);
+        writeAndReadOneString(STRING_LEN_128);
+        writeAndReadOneString(STRING_LEN_MEDIUM_SUB_1);
+    }
+
+    /** Round-trips STRING_LEN_MEDIUM and STRING_LEN_LARGE_SUB_1. */
+    @Test
+    public void testMedium() throws IOException {
+        writeAndReadOneString(STRING_LEN_MEDIUM);
+        writeAndReadOneString(STRING_LEN_LARGE_SUB_1);
+    }
+
+    /** Round-trips STRING_LEN_LARGE. */
+    @Test
+    public void testLarge() throws IOException {
+        writeAndReadOneString(STRING_LEN_LARGE);
+    }
+
+    /** Round-trips the STRING_UTF8_3 and STRING_UTF8_MIX samples. */
+    @Test
+    public void testUTF8() throws IOException {
+        writeAndReadOneString(STRING_UTF8_3);
+        writeAndReadOneString(STRING_UTF8_MIX);
+    }
+
+    /** Writes testString through both writeUTF8 overloads and asserts that each encoding reads back unchanged. */
+    private void writeAndReadOneString(String testString) throws IOException {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        DataOutputStream dos = new DataOutputStream(bos);
+        writer.writeUTF8(testString, dos);
+        ByteArrayInputStream bis = new ByteArrayInputStream(bos.toByteArray(), 0, bos.size());
+        assertEquals(testString, reader.readUTF(new DataInputStream(bis)));
+        int lastOffset = bos.size();
+        char[] charArray = testString.toCharArray();
+        writer.writeUTF8(charArray, 0, charArray.length, dos);
+        // The third constructor argument is a length, not an end offset, hence the subtraction.
+        bis = new ByteArrayInputStream(bos.toByteArray(), lastOffset, bos.size() - lastOffset);
+        assertEquals(testString, reader.readUTF(new DataInputStream(bis)));
+    }
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringUtilTest.java b/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringUtilTest.java
new file mode 100644
index 0000000..6f3782b
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringUtilTest.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.api.string;
+
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_127;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_128;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_MEDIUM;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX_LOWERCASE;
+import static org.apache.hyracks.util.string.UTF8StringUtil.charAt;
+import static org.apache.hyracks.util.string.UTF8StringUtil.charSize;
+import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getModifiedUTF8Len;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getNumBytesToStoreLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getNumCodePoint;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getStringLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getUTF8StringInArray;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getUTFLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.hash;
+import static org.apache.hyracks.util.string.UTF8StringUtil.lowerCaseCompareTo;
+import static org.apache.hyracks.util.string.UTF8StringUtil.lowerCaseHash;
+import static org.apache.hyracks.util.string.UTF8StringUtil.normalize;
+import static org.apache.hyracks.util.string.UTF8StringUtil.rawByteCompareTo;
+import static org.apache.hyracks.util.string.UTF8StringUtil.writeStringToBytes;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.util.string.UTF8StringUtil;
+import org.junit.Test;
+
+public class UTF8StringUtilTest {
+
+ @Test
+ public void testCharAtCharSizeGetLen() throws Exception {
+ char[] utf8Mix = STRING_UTF8_MIX.toCharArray();
+ byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
+ int pos = getNumBytesToStoreLength(getUTFLength(buffer, 0));
+ for (char c : utf8Mix) {
+ assertEquals(c, charAt(buffer, pos));
+ assertEquals(getModifiedUTF8Len(c), charSize(buffer, pos));
+ pos += charSize(buffer, pos);
+ }
+ }
+
+ @Test
+ public void testGetStringLength() throws Exception {
+ byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
+ assertEquals(STRING_UTF8_MIX.length(), getStringLength(buffer, 0));
+ }
+
+ @Test
+ public void testChinese() {
+ byte[] bufferDe = writeStringToBytes("的");
+ byte[] bufferLi = writeStringToBytes("离");
+ int ret = compareTo(bufferDe, 0, bufferLi, 0);
+ assertTrue(ret != 0);
+ }
+
+ @Test
+ public void testCompareToAndNormolize() throws Exception {
+ testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX, OPTION.STANDARD);
+ testCompare(STRING_UTF8_3, STRING_UTF8_MIX, OPTION.STANDARD);
+ testCompare(STRING_LEN_MEDIUM, STRING_UTF8_MIX, OPTION.STANDARD);
+ }
+
+ public boolean isSameSign(int r1, int r2) {
+ if (r1 > 0) {
+ return r2 > 0;
+ }
+ if (r1 < 0) {
+ return r2 < 0;
+ }
+ return r2 == 0;
+ }
+
+ enum OPTION {
+ STANDARD,
+ RAW_BYTE,
+ LOWERCASE
+ }
+
+ public void testCompare(String str1, String str2, OPTION option) throws IOException {
+ byte[] buffer1 = writeStringToBytes(str1);
+ byte[] buffer2 = writeStringToBytes(str2);
+
+ switch (option) {
+ case STANDARD:
+ assertEquals(str1.compareTo(str2), compareTo(buffer1, 0, buffer2, 0));
+ int n1 = normalize(buffer1, 0);
+ int n2 = normalize(buffer2, 0);
+ assertTrue(isSameSign(str1.compareTo(str2), n1 - n2));
+ break;
+ case RAW_BYTE:
+ assertEquals(str1.compareTo(str2), rawByteCompareTo(buffer1, 0, buffer2, 0));
+ break;
+ case LOWERCASE:
+ assertEquals(str1.compareToIgnoreCase(str2), lowerCaseCompareTo(buffer1, 0, buffer2, 0));
+ break;
+ }
+
+ }
+
+ @Test
+ public void testRawByteCompareTo() throws Exception {
+ testCompare(STRING_LEN_MEDIUM, STRING_LEN_MEDIUM, OPTION.RAW_BYTE);
+ testCompare(STRING_LEN_127, STRING_LEN_128, OPTION.RAW_BYTE);
+ }
+
+ @Test
+ public void testLowerCaseCompareTo() throws Exception {
+ testCompare(STRING_LEN_127, STRING_LEN_128, OPTION.LOWERCASE);
+ testCompare(STRING_LEN_127, STRING_UTF8_MIX, OPTION.LOWERCASE);
+ testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX_LOWERCASE, OPTION.LOWERCASE);
+ testCompare(STRING_UTF8_MIX_LOWERCASE, STRING_UTF8_MIX, OPTION.LOWERCASE);
+ }
+
+ @Test
+ public void testToString() throws Exception {
+
+ StringBuilder sb = new StringBuilder();
+ byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
+ assertEquals(STRING_UTF8_MIX, UTF8StringUtil.toString(sb, buffer, 0).toString());
+ }
+
+ @Test
+ public void testHash() throws IOException {
+ byte[] buffer = writeStringToBytes(STRING_UTF8_MIX_LOWERCASE);
+ int lowerHash = hash(buffer, 0);
+
+ buffer = writeStringToBytes(STRING_UTF8_MIX_LOWERCASE);
+ int upperHash = lowerCaseHash(buffer, 0);
+ assertEquals(lowerHash, upperHash);
+
+ int familyOne = hash(buffer, 0, 7, 297);
+ int familyTwo = hash(buffer, 0, 8, 297);
+ assertTrue(familyOne != familyTwo);
+ }
+
+ @Test
+ public void testGetUTF8StringInArray() {
+ String str = null;
+ byte[] bytes = null;
+ List<String> answer = null;
+
+ str = "database group at university of California, Irvine 23333";
+ bytes = writeStringToBytes(str);
+ // First byte in bytes is for the number of bytes of the entire string,
+ // and it should be skipped in getUTF8StringInArray
+ assertEquals("database", getUTF8StringInArray(bytes, 1, 8));
+ assertEquals("at", getUTF8StringInArray(bytes, 16, 2));
+ // test upper case
+ assertEquals("California", getUTF8StringInArray(bytes, 33, 10));
+ // test non-english char
+ assertEquals(",", getUTF8StringInArray(bytes, 43, 1));
+ assertEquals("Irvine", getUTF8StringInArray(bytes, 45, 6));
+ // test number
+ assertEquals("23333", getUTF8StringInArray(bytes, 52, 5));
+ }
+
+ @Test
+ public void testGetNumCodePoint() throws HyracksDataException {
+ String str = "\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66";
+ assertEquals(getNumCodePoint(writeStringToBytes(str), 0), 7);
+
+ str = "\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66\uD83C\uDDE8\uD83C\uDDF3";
+ assertEquals(getNumCodePoint(writeStringToBytes(str), 0), 9);
+ }
+
+}