[ASTERIXDB-2762] Use code point as the unit in trim()
This commit makes trim() operate on code points instead of Java chars.
Currently, a Java char (2 bytes) is used as the unit in trim().
However, for non-English characters such as emoji and Korean,
one character may span more than one Java char, so trimming
char by char can cut it apart and produce an illegal string.
A code point is a more natural unit for trimming.
Change-Id: If14092be9c2a654dba392bb2b773db81c9e47ae6
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/7283
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Dmitry Lychagin <dmitry.lychagin@couchbase.com>
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml
index c27d884..df1f94e 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml
@@ -32,10 +32,6 @@
</properties>
<dependencies>
<dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-lang3</artifactId>
- </dependency>
- <dependency>
<groupId>org.apache.hyracks</groupId>
<artifactId>hyracks-util</artifactId>
<version>${project.version}</version>
@@ -56,5 +52,9 @@
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
+ <dependency>
+ <groupId>it.unimi.dsi</groupId>
+ <artifactId>fastutil</artifactId>
+ </dependency>
</dependencies>
</project>
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
index 944b317..3b1f18b 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
@@ -19,10 +19,8 @@
package org.apache.hyracks.data.std.primitive;
import java.io.IOException;
-import java.io.UnsupportedEncodingException;
-import java.nio.charset.StandardCharsets;
+import java.nio.charset.Charset;
-import org.apache.commons.lang3.CharSet;
import org.apache.hyracks.api.dataflow.value.ITypeTraits;
import org.apache.hyracks.api.exceptions.HyracksDataException;
import org.apache.hyracks.api.io.IJsonSerializable;
@@ -38,6 +36,8 @@
import com.fasterxml.jackson.databind.JsonNode;
+import it.unimi.dsi.fastutil.ints.IntCollection;
+
public final class UTF8StringPointable extends AbstractPointable implements IHashable, IComparable {
public static final UTF8StringPointableFactory FACTORY = new UTF8StringPointableFactory();
@@ -50,6 +50,9 @@
private int hashValue;
private int stringLength;
+ public static final UTF8StringPointable SPACE_STRING_POINTABLE = generateUTF8Pointable(" ");
+ public static final Charset CESU8_CHARSET = Charset.forName("CESU8");
+
/**
* reset those meta length.
* Since the {@code utf8Length} and the {@code metaLength} are often used, we compute those two values in advance.
@@ -122,6 +125,18 @@
return UTF8StringUtil.codePointSize(bytes, start + offset);
}
+ /**
+ * Walks the UTF-8 payload of this string and adds every decoded code point to the given collection.
+ * The walk starts right after the length metadata ({@code metaLength}) and covers {@code utf8Length} bytes.
+ * Whether duplicates are kept depends on the collection that is passed in.
+ *
+ * @param codePointSet
+ * , the collection that receives the code points of this string.
+ * @throws IllegalArgumentException
+ * if the walk does not end exactly at {@code utf8Length}; code points added before
+ * the mismatch is detected stay in the collection.
+ */
+ public void getCodePoints(IntCollection codePointSet) {
+ int consumed = 0;
+ while (consumed < utf8Length) {
+ // Offset of the current code point, relative to the start of this pointable.
+ final int offset = metaLength + consumed;
+ codePointSet.add(codePointAt(offset));
+ consumed += codePointSize(offset);
+ }
+
+ if (consumed == utf8Length) {
+ return;
+ }
+ // The last code point claimed more bytes than the string holds.
+ throw new IllegalArgumentException("Decoding error: malformed bytes");
+ }
+
/**
* Gets the length of the string in characters.
* The first time call will need to go through the entire string, the following call will just return the pre-caculated result
@@ -176,11 +191,7 @@
@Override
public String toString() {
- try {
- return new String(bytes, getCharStartOffset(), getUTF8Length(), StandardCharsets.UTF_8.name());
- } catch (UnsupportedEncodingException e) {
- throw new IllegalStateException(e);
- }
+ return new String(bytes, getCharStartOffset(), getUTF8Length(), CESU8_CHARSET);
}
public int ignoreCaseCompareTo(UTF8StringPointable other) {
@@ -553,16 +564,11 @@
builder.finish();
}
- public void trim(UTF8StringBuilder builder, GrowableArray out, boolean left, boolean right, CharSet charSet)
- throws IOException {
- trim(this, builder, out, left, right, charSet);
- }
-
/**
* Generates a trimmed string of an input source string.
*
* @param srcPtr
- * , the input source string.
+ * , the input source string
* @param builder
* , the result string builder.
* @param out
@@ -571,23 +577,23 @@
* , whether to trim the left side.
* @param right
* , whether to trim the right side.
- * @param charSet
- * , the chars that should be trimmed.
+ * @param codePointSet
+ * , the set of code points that should be trimmed.
* @throws IOException
*/
public static void trim(UTF8StringPointable srcPtr, UTF8StringBuilder builder, GrowableArray out, boolean left,
- boolean right, CharSet charSet) throws IOException {
+ boolean right, IntCollection codePointSet) throws IOException {
final int srcUtfLen = srcPtr.getUTF8Length();
final int srcStart = srcPtr.getMetaDataLength();
// Finds the start Index (inclusive).
int startIndex = 0;
if (left) {
while (startIndex < srcUtfLen) {
- char ch = srcPtr.charAt(srcStart + startIndex);
- if (!charSet.contains(ch)) {
+ int codepoint = srcPtr.codePointAt(srcStart + startIndex);
+ if (!codePointSet.contains(codepoint)) {
break;
}
- startIndex += srcPtr.charSize(srcStart + startIndex);
+ startIndex += srcPtr.codePointSize(srcStart + startIndex);
}
}
@@ -597,9 +603,9 @@
endIndex = startIndex;
int cursorIndex = startIndex;
while (cursorIndex < srcUtfLen) {
- char ch = srcPtr.charAt(srcStart + cursorIndex);
- cursorIndex += srcPtr.charSize(srcStart + cursorIndex);
- if (!charSet.contains(ch)) {
+ int codePoint = srcPtr.codePointAt(srcStart + cursorIndex);
+ cursorIndex += srcPtr.codePointSize(srcStart + cursorIndex);
+ if (!codePointSet.contains(codePoint)) {
endIndex = cursorIndex;
}
}
@@ -613,6 +619,26 @@
}
/**
+ * Generates a trimmed string from the original string.
+ *
+ * @param builder
+ * , the result string builder.
+ * @param out
+ * , the storage for the output string.
+ * @param left
+ * , whether to trim the left side.
+ * @param right
+ * , whether to trim the right side.
+ * @param codePointSet
+ * , the set of code points that should be trimmed.
+ * @throws IOException
+ */
+ public void trim(UTF8StringBuilder builder, GrowableArray out, boolean left, boolean right,
+ IntCollection codePointSet) throws IOException {
+ trim(this, builder, out, left, right, codePointSet);
+ }
+
+ /**
* Generates a reversed string from an input source string
*
* @param srcPtr
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
index fa93003..22be7ca 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
@@ -27,14 +27,17 @@
import static org.junit.Assert.assertTrue;
import java.io.IOException;
+import java.util.Arrays;
-import org.apache.commons.lang3.CharSet;
import org.apache.hyracks.data.std.util.GrowableArray;
import org.apache.hyracks.data.std.util.UTF8StringBuilder;
import org.apache.hyracks.util.string.UTF8StringSample;
import org.apache.hyracks.util.string.UTF8StringUtil;
import org.junit.Test;
+import it.unimi.dsi.fastutil.ints.IntArraySet;
+import it.unimi.dsi.fastutil.ints.IntCollection;
+
public class UTF8StringPointableTest {
public static UTF8StringPointable STRING_EMPTY = generateUTF8Pointable(UTF8StringSample.EMPTY_STRING);
public static UTF8StringPointable STRING_UTF8_MIX = generateUTF8Pointable(UTF8StringSample.STRING_UTF8_MIX);
@@ -229,26 +232,35 @@
GrowableArray storage = new GrowableArray();
UTF8StringPointable result = new UTF8StringPointable();
UTF8StringPointable input = generateUTF8Pointable(" this is it.i am;here. ");
+ IntCollection spaceCodePointSet = new IntArraySet(Arrays.asList((int) ' '));
// Trims both sides.
- input.trim(builder, storage, true, true, CharSet.getInstance(" "));
+ input.trim(builder, storage, true, true, spaceCodePointSet);
result.set(storage.getByteArray(), 0, storage.getLength());
UTF8StringPointable expected = generateUTF8Pointable("this is it.i am;here.");
assertEquals(0, expected.compareTo(result));
// Only trims the right side.
storage.reset();
- input.trim(builder, storage, false, true, CharSet.getInstance(" "));
+ input.trim(builder, storage, false, true, spaceCodePointSet);
result.set(storage.getByteArray(), 0, storage.getLength());
expected = generateUTF8Pointable(" this is it.i am;here.");
assertEquals(0, expected.compareTo(result));
// Only trims the left side.
storage.reset();
- input.trim(builder, storage, true, false, CharSet.getInstance(" "));
+ input.trim(builder, storage, true, false, spaceCodePointSet);
result.set(storage.getByteArray(), 0, storage.getLength());
expected = generateUTF8Pointable("this is it.i am;here. ");
assertEquals(0, expected.compareTo(result));
+
+ // Only trims the left side in case of emoji
+ input = STRING_POINTABLE_EMOJI_FAMILY_OF_4;
+ storage.reset();
+ input.trim(builder, storage, true, false, spaceCodePointSet);
+ result.set(storage.getByteArray(), 0, storage.getLength());
+ expected = STRING_POINTABLE_EMOJI_FAMILY_OF_4;
+ assertEquals(0, expected.compareTo(result));
}
@Test
@@ -258,25 +270,59 @@
UTF8StringPointable result = new UTF8StringPointable();
UTF8StringPointable input = generateUTF8Pointable(" this is it.i am;here. ");
+ String pattern = " hert.";
+ UTF8StringPointable patternPointable = generateUTF8Pointable(pattern);
+ IntCollection codePointSet = new IntArraySet();
+ codePointSet.clear();
+ patternPointable.getCodePoints(codePointSet);
+
// Trims both sides.
- input.trim(builder, storage, true, true, CharSet.getInstance(" hert."));
+ input.trim(builder, storage, true, true, codePointSet);
result.set(storage.getByteArray(), 0, storage.getLength());
UTF8StringPointable expected = generateUTF8Pointable("is is it.i am;");
assertEquals(0, expected.compareTo(result));
// Only trims the right side.
storage.reset();
- input.trim(builder, storage, false, true, CharSet.getInstance(" hert."));
+ input.trim(builder, storage, false, true, codePointSet);
result.set(storage.getByteArray(), 0, storage.getLength());
expected = generateUTF8Pointable(" this is it.i am;");
assertEquals(0, expected.compareTo(result));
// Only trims the left side.
storage.reset();
- input.trim(builder, storage, true, false, CharSet.getInstance(" hert."));
+ input.trim(builder, storage, true, false, codePointSet);
result.set(storage.getByteArray(), 0, storage.getLength());
expected = generateUTF8Pointable("is is it.i am;here. ");
assertEquals(0, expected.compareTo(result));
+
+ // Test Emoji trim
+ input = STRING_POINTABLE_EMOJI_FAMILY_OF_4;
+ pattern = "👨👦";
+ patternPointable = generateUTF8Pointable(pattern);
+ codePointSet.clear();
+ patternPointable.getCodePoints(codePointSet);
+
+ // Trim left
+ storage.reset();
+ input.trim(builder, storage, true, false, codePointSet);
+ result.set(storage.getByteArray(), 0, storage.getLength());
+ expected = generateUTF8Pointable("\u200D" + "👨👦👦");
+ assertEquals(0, expected.compareTo(result));
+
+ // Trim right
+ storage.reset();
+ input.trim(builder, storage, false, true, codePointSet);
+ result.set(storage.getByteArray(), 0, storage.getLength());
+ expected = generateUTF8Pointable("👨👨👦" + "\u200D");
+ assertEquals(0, expected.compareTo(result));
+
+ // Trim left and right
+ storage.reset();
+ input.trim(builder, storage, true, true, codePointSet);
+ result.set(storage.getByteArray(), 0, storage.getLength());
+ expected = generateUTF8Pointable("\u200D" + "👨👦" + "\u200D");
+ assertEquals(0, expected.compareTo(result));
}
}
diff --git a/hyracks-fullstack/pom.xml b/hyracks-fullstack/pom.xml
index fb801b9..d7b6829 100644
--- a/hyracks-fullstack/pom.xml
+++ b/hyracks-fullstack/pom.xml
@@ -283,6 +283,11 @@
<artifactId>maven-plugin-api</artifactId>
<version>3.6.3</version>
</dependency>
+ <dependency>
+ <groupId>it.unimi.dsi</groupId>
+ <artifactId>fastutil</artifactId>
+ <version>8.3.0</version>
+ </dependency>
</dependencies>
</dependencyManagement>
<build>