[NO ISSUE] Add UTF8StringUtil.getUTF8StringInArray() for tokenizer scenario
Change-Id: I273a776f14a2846e5380f2bdc4a3168a1dac052c
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/5565
Contrib: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Ian Maxon <imaxon@uci.edu>
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index f50fa90..2b0e49e 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -324,6 +324,20 @@
return builder;
}
+ /**
+  * Builds a {@link String} from the characters encoded in a slice of a byte array.
+  * <p>
+  * Different from the toString() methods above, the slice is assumed NOT to contain the leading bytes that
+  * store the string length (NumBytesToStoreLength). This is intended for the string tokenizer, e.g. to get
+  * "hello" and "world" out of the bytes of "hello world".
+  * <p>
+  * NOTE(review): neither the array bounds nor the character boundaries are validated here; the slice is
+  * presumably expected to start and end on character boundaries -- confirm against the callers.
+  *
+  * @param b
+  *            the byte array holding the encoded characters
+  * @param start
+  *            the offset in {@code b} of the first byte to read
+  * @param len
+  *            the number of bytes to read, counted from {@code start}
+  * @return the characters whose encoding starts within {@code [start, start + len)}
+  */
+ public static String getUTF8StringInArray(byte[] b, int start, int len) {
+     final int end = start + len;
+     StringBuilder decoded = new StringBuilder();
+
+     int pos = start;
+     while (pos < end) {
+         decoded.append(UTF8StringUtil.charAt(b, pos));
+         // advance by the encoded size of the character that was just read
+         pos += UTF8StringUtil.charSize(b, pos);
+     }
+
+     return decoded.toString();
+ }
+
public static void printUTF8StringWithQuotes(byte[] b, int s, int l, OutputStream os) throws IOException {
printUTF8String(b, s, l, os, true);
}
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
index b75d68c..2c99104 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
@@ -31,6 +31,7 @@
import static org.apache.hyracks.util.string.UTF8StringUtil.getModifiedUTF8Len;
import static org.apache.hyracks.util.string.UTF8StringUtil.getNumBytesToStoreLength;
import static org.apache.hyracks.util.string.UTF8StringUtil.getStringLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getUTF8StringInArray;
import static org.apache.hyracks.util.string.UTF8StringUtil.getUTFLength;
import static org.apache.hyracks.util.string.UTF8StringUtil.hash;
import static org.apache.hyracks.util.string.UTF8StringUtil.lowerCaseCompareTo;
@@ -42,6 +43,7 @@
import static org.junit.Assert.assertTrue;
import java.io.IOException;
+import java.util.List;
import org.junit.Test;
@@ -153,4 +155,25 @@
assertTrue(familyOne != familyTwo);
}
+ /**
+  * Checks that getUTF8StringInArray() extracts individual tokens from the serialized bytes of a sentence,
+  * given the byte offset and the byte length of each token.
+  */
+ @Test
+ public void testGetUTF8StringInArray() {
+     String str = "database group at university of California, Irvine 23333";
+     byte[] bytes = writeStringToBytes(str);
+
+     // The first byte in bytes stores the number of bytes of the entire string. getUTF8StringInArray()
+     // takes offsets into the raw array, so every offset below is the token's index in str plus one.
+     assertEquals("database", getUTF8StringInArray(bytes, 1, 8));
+     assertEquals("at", getUTF8StringInArray(bytes, 16, 2));
+     // test upper case
+     assertEquals("California", getUTF8StringInArray(bytes, 33, 10));
+     // test punctuation
+     assertEquals(",", getUTF8StringInArray(bytes, 43, 1));
+     assertEquals("Irvine", getUTF8StringInArray(bytes, 45, 6));
+     // test number
+     assertEquals("23333", getUTF8StringInArray(bytes, 52, 5));
+ }
+
}