[ASTERIXDB-2129][RT] Fix normalizing non-ascii strings
- user model changes: no
- storage format changes: no
- interface changes: no
Details:
For example, normalizing a single-char string whose only char is 3 bytes
long can read past the string's buffer boundary.
Change-Id: Ic169d5ff20f9bf5ce2ca36bab4ebd241bbc50dca
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17230
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Ali Alsuliman <ali.al.solaiman@gmail.com>
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index 3eb8687..c0475b1 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -244,12 +244,13 @@
* consistent with the comparison result.
*/
public static int normalize(byte[] bytes, int start) {
- int len = getUTFLength(bytes, start);
long nk = 0;
+ int len = getUTFLength(bytes, start);
int offset = start + getNumBytesToStoreLength(len);
+ int end = offset + len;
for (int i = 0; i < 2; ++i) {
nk <<= 16;
- if (i < len) {
+ if (offset < end) {
nk += (charAt(bytes, offset)) & 0xffff;
offset += charSize(bytes, offset);
}
@@ -498,19 +499,15 @@
* are exactly the same as for the <code>readUTF</code>
* method of <code>DataInput</code>.
*
- * @param in
- * a data input stream.
+ * @param in a data input stream.
* @return a Unicode string.
- * @throws EOFException
- * if the input stream reaches the end
- * before all the bytes.
- * @throws IOException
- * the stream has been closed and the contained
- * input stream does not support reading after close, or
- * another I/O error occurs.
- * @throws UTFDataFormatException
- * if the bytes do not represent a
- * valid modified UTF-8 encoding of a Unicode string.
+ * @throws EOFException if the input stream reaches the end
+ * before all the bytes.
+ * @throws IOException the stream has been closed and the contained
+ * input stream does not support reading after close, or
+ * another I/O error occurs.
+ * @throws UTFDataFormatException if the bytes do not represent a
+ * valid modified UTF-8 encoding of a Unicode string.
* @see java.io.DataInputStream#readUnsignedShort()
*/
public static String readUTF8(DataInput in) throws IOException {
@@ -602,10 +599,8 @@
/**
* Write a UTF8 String <code>str</code> into the DataOutput <code>out</code>
*
- * @param str,
- * a Unicode string;
- * @param out,
- * a Data output stream.
+ * @param str, a Unicode string;
+ * @param out, a Data output stream.
* @throws IOException
*/
public static void writeUTF8(CharSequence str, DataOutput out) throws IOException {
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
index b114351..eb3a5b6 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
@@ -35,7 +35,8 @@
public static final String STRING_LEN_3 = "xyz";
public static final String STRING_UTF8_3 = "锟斤拷";
- public static final String STRING_UTF8_MIX = "\uD841\uDF0E\uD841\uDF31锟X斤Y拷Zà"; // one, two, three, and four bytes
+ // one, two, three, and four bytes
+ public static final String STRING_UTF8_MIX = "\uD841\uDF0E\uD841\uDF31锟X斤Y拷Zà";
public static final String STRING_UTF8_MIX_LOWERCASE = "\uD841\uDF0E\uD841\uDF31锟x斤y拷zà";
public static final String STRING_NEEDS_2_JAVA_CHARS_1 = "\uD83D\uDE22\uD83D\uDE22\uD83D\uDC89\uD83D\uDC89";
public static final String STRING_NEEDS_2_JAVA_CHARS_2 = "😢😢💉💉";
@@ -44,6 +45,8 @@
public static final String STRING_EMOJI_FAMILY_OF_2 = "\uD83D\uDC68\u200D\uD83D\uDC66";
public static final String EMOJI_BASKETBALL = "\uD83C\uDFC0";
+ public static final String THREE_BYTES_UTF8_CHAR = "ह";
+
public static final String STRING_LEN_127 = generateStringRepeatBy(ONE_ASCII_CHAR, 127);
public static final String STRING_LEN_128 = generateStringRepeatBy(ONE_ASCII_CHAR, 128);
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
index c7468d2..4eb1fc3 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
@@ -25,6 +25,7 @@
import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3;
import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX;
import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX_LOWERCASE;
+import static org.apache.hyracks.util.string.UTF8StringSample.THREE_BYTES_UTF8_CHAR;
import static org.apache.hyracks.util.string.UTF8StringUtil.charAt;
import static org.apache.hyracks.util.string.UTF8StringUtil.charSize;
import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo;
@@ -77,13 +78,14 @@
}
@Test
- public void testCompareToAndNormolize() throws Exception {
+ public void testCompareToAndNormalize() throws Exception {
testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX, OPTION.STANDARD);
testCompare(STRING_UTF8_3, STRING_UTF8_MIX, OPTION.STANDARD);
testCompare(STRING_LEN_MEDIUM, STRING_UTF8_MIX, OPTION.STANDARD);
+ testCompare(THREE_BYTES_UTF8_CHAR, THREE_BYTES_UTF8_CHAR, OPTION.STANDARD);
}
- public boolean isSameSign(int r1, int r2) {
+ private static boolean isSameSign(int r1, int r2) {
if (r1 > 0) {
return r2 > 0;
}
@@ -99,7 +101,7 @@
LOWERCASE
}
- public void testCompare(String str1, String str2, OPTION option) throws IOException {
+ private static void testCompare(String str1, String str2, OPTION option) {
byte[] buffer1 = writeStringToBytes(str1);
byte[] buffer2 = writeStringToBytes(str2);
@@ -117,7 +119,6 @@
assertEquals(str1.compareToIgnoreCase(str2), lowerCaseCompareTo(buffer1, 0, buffer2, 0));
break;
}
-
}
@Test