ASTERIXDB-1102: VarSize Encoding to store length of String and ByteArray
This patch changes the encoding format that stores the length value of
variable-length types (e.g. String, ByteArray) from fixed-size encoding
(2 bytes) to variable-size encoding (1 to 5 bytes).
It resolves issue 1102 by enabling us to store a String that is longer
than 64K. Also, for the common case of storing a short string (<=
127), it saves one byte per string.
Some important changes include:
1. Add a hyracks-util package to consolidate all the Hyracks-independent
utility functions. This reduces the chance of having duplicate utils in
different packages.
2. Move parts of the Asterix string functions down to the Hyracks
UTF8StringPointable object, which will benefit other dependent projects,
such as VXQuery.
Change-Id: I7e95df0f06984b784ebac2c84b97e56a50207d27
Reviewed-on: https://asterix-gerrit.ics.uci.edu/449
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Taewoo Kim <wangsaeu@gmail.com>
Reviewed-by: Jianfeng Jia <jianfeng.jia@gmail.com>
diff --git a/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/impl/UTF8StringPrinterFactory.java b/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/impl/UTF8StringPrinterFactory.java
index 8aa646e..1aa3370 100644
--- a/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/impl/UTF8StringPrinterFactory.java
+++ b/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/impl/UTF8StringPrinterFactory.java
@@ -18,12 +18,13 @@
*/
package org.apache.hyracks.algebricks.data.impl;
+import java.io.IOException;
import java.io.PrintStream;
import org.apache.hyracks.algebricks.common.exceptions.AlgebricksException;
import org.apache.hyracks.algebricks.data.IPrinter;
import org.apache.hyracks.algebricks.data.IPrinterFactory;
-import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
+import org.apache.hyracks.util.string.UTF8StringUtil;
public class UTF8StringPrinterFactory implements IPrinterFactory {
@@ -40,22 +41,11 @@
@Override
public void print(byte[] b, int s, int l, PrintStream ps) throws AlgebricksException {
- int strlen = UTF8StringPointable.getUTFLength(b, s);
- int pos = s + 2;
- int maxPos = pos + strlen;
- ps.print("\"");
- while (pos < maxPos) {
- char c = UTF8StringPointable.charAt(b, pos);
- switch (c) {
- case '\\':
- case '"':
- ps.print('\\');
- break;
- }
- ps.print(c);
- pos += UTF8StringPointable.charSize(b, pos);
+ try {
+ UTF8StringUtil.printUTF8StringWithQuotes(b, s, l, ps);
+ } catch (IOException e) {
+ throw new AlgebricksException(e);
}
- ps.print("\"");
}
@Override
diff --git a/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/utils/WriteValueTools.java b/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/utils/WriteValueTools.java
index 8a96ea6..97e7d95 100644
--- a/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/utils/WriteValueTools.java
+++ b/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/utils/WriteValueTools.java
@@ -20,14 +20,16 @@
import java.io.IOException;
import java.io.OutputStream;
+import java.io.PrintStream;
-import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
+import org.apache.hyracks.util.string.UTF8StringUtil;
public final class WriteValueTools {
private final static int[] INT_INTERVALS = { 9, 99, 999, 9999, 99999, 999999, 9999999, 99999999, 999999999,
Integer.MAX_VALUE };
- private final static int[] INT_DIVIDERS = { 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 };
+ private final static int[] INT_DIVIDERS = { 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000,
+ 1000000000 };
private final static int[] DIGITS = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' };
public static void writeInt(int i, OutputStream os) throws IOException {
@@ -75,50 +77,11 @@
os.write(DIGITS[(int) (d % 10)]);
}
- public static void writeUTF8String(byte[] b, int s, int l, OutputStream os) throws IOException {
- int stringLength = UTF8StringPointable.getUTFLength(b, s);
- int position = s + 2;
- int maxPosition = position + stringLength;
- os.write('\"');
- while (position < maxPosition) {
- char c = UTF8StringPointable.charAt(b, position);
- switch (c) {
- // escape
- case '\\':
- case '"':
- os.write('\\');
- break;
- }
- int sz = UTF8StringPointable.charSize(b, position);
- while (sz > 0) {
- os.write(b[position]);
- position++;
- sz--;
- }
- }
- os.write('\"');
+ public static void writeUTF8StringWithQuotes(String string, OutputStream ps) throws IOException {
+ UTF8StringUtil.printUTF8StringWithQuotes(string, ps);
}
- public static void writeUTF8StringNoQuotes(byte[] b, int s, int l, OutputStream os) throws IOException {
- int stringLength = UTF8StringPointable.getUTFLength(b, s);
- int position = s + 2;
- int maxPosition = position + stringLength;
- while (position < maxPosition) {
- char c = UTF8StringPointable.charAt(b, position);
- switch (c) {
- // escape
- case '\\':
- case '"':
- os.write('\\');
- break;
- }
- int sz = UTF8StringPointable.charSize(b, position);
- while (sz > 0) {
- os.write(b[position]);
- position++;
- sz--;
- }
- }
+ public static void writeUTF8StringNoQuotes(String string, OutputStream ps) throws IOException {
+ UTF8StringUtil.printUTF8StringNoQuotes(string, ps);
}
-
}
diff --git a/algebricks/algebricks-examples/piglet-example/pom.xml b/algebricks/algebricks-examples/piglet-example/pom.xml
index a037db5..ae2ec51 100644
--- a/algebricks/algebricks-examples/piglet-example/pom.xml
+++ b/algebricks/algebricks-examples/piglet-example/pom.xml
@@ -111,5 +111,10 @@
<artifactId>algebricks-compiler</artifactId>
<version>0.2.17-SNAPSHOT</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.hyracks</groupId>
+ <artifactId>hyracks-util</artifactId>
+ <version>0.2.17-SNAPSHOT</version>
+ </dependency>
</dependencies>
</project>
diff --git a/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/compiler/PigletPrinterFactoryProvider.java b/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/compiler/PigletPrinterFactoryProvider.java
index 6d64741..8049594 100644
--- a/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/compiler/PigletPrinterFactoryProvider.java
+++ b/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/compiler/PigletPrinterFactoryProvider.java
@@ -29,7 +29,9 @@
import org.apache.hyracks.algebricks.data.utils.WriteValueTools;
import org.apache.hyracks.algebricks.examples.piglet.types.Type;
import org.apache.hyracks.data.std.primitive.FloatPointable;
+import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
import org.apache.hyracks.dataflow.common.data.marshalling.FloatSerializerDeserializer;
+import org.apache.hyracks.util.string.UTF8StringUtil;
public class PigletPrinterFactoryProvider implements IPrinterFactoryProvider {
@@ -73,7 +75,7 @@
@Override
public void print(byte[] b, int s, int l, PrintStream ps) throws AlgebricksException {
try {
- WriteValueTools.writeUTF8String(b, s, l, ps);
+ UTF8StringUtil.printUTF8StringWithQuotes(b, s, l, ps);
} catch (IOException e) {
throw new AlgebricksException(e);
}
diff --git a/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/metadata/PigletMetadataProvider.java b/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/metadata/PigletMetadataProvider.java
index 7d9b3db..8f9ab9f 100644
--- a/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/metadata/PigletMetadataProvider.java
+++ b/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/metadata/PigletMetadataProvider.java
@@ -110,7 +110,7 @@
case CHAR_ARRAY:
vpf = UTF8StringParserFactory.INSTANCE;
- serDeser = UTF8StringSerializerDeserializer.INSTANCE;
+ serDeser = new UTF8StringSerializerDeserializer();
break;
case FLOAT:
diff --git a/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/runtime/PigletExpressionJobGen.java b/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/runtime/PigletExpressionJobGen.java
index 6c173b2..1c3f9b8 100644
--- a/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/runtime/PigletExpressionJobGen.java
+++ b/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/runtime/PigletExpressionJobGen.java
@@ -53,6 +53,8 @@
import org.apache.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
public class PigletExpressionJobGen implements ILogicalExpressionJobGen {
+ private final UTF8StringSerializerDeserializer utf8SerDer = new UTF8StringSerializerDeserializer();
+
@Override
public ICopyEvaluatorFactory createEvaluatorFactory(ILogicalExpression expr, IVariableTypeEnvironment env,
IOperatorSchema[] inputSchemas, JobGenContext context) throws AlgebricksException {
@@ -74,7 +76,7 @@
case CHAR_ARRAY:
try {
- UTF8StringSerializerDeserializer.INSTANCE.serialize(image, dos);
+ utf8SerDer.serialize(image, dos);
} catch (Exception e) {
throw new AlgebricksException(e);
}
diff --git a/algebricks/algebricks-examples/pom.xml b/algebricks/algebricks-examples/pom.xml
index 7ba1b5b..968db33 100644
--- a/algebricks/algebricks-examples/pom.xml
+++ b/algebricks/algebricks-examples/pom.xml
@@ -22,8 +22,15 @@
<artifactId>algebricks-examples</artifactId>
<packaging>pom</packaging>
<name>algebricks-examples</name>
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.hyracks</groupId>
+ <artifactId>algebricks-core</artifactId>
+ <version>0.2.17-SNAPSHOT</version>
+ </dependency>
+ </dependencies>
- <parent>
+ <parent>
<groupId>org.apache.hyracks</groupId>
<artifactId>algebricks</artifactId>
<version>0.2.17-SNAPSHOT</version>
diff --git a/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/pushruntime/PushRuntimeTest.java b/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/pushruntime/PushRuntimeTest.java
index 3c97878..7fcab17 100644
--- a/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/pushruntime/PushRuntimeTest.java
+++ b/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/pushruntime/PushRuntimeTest.java
@@ -275,10 +275,10 @@
IFileSplitProvider splitProvider = new ConstantFileSplitProvider(fileSplits);
RecordDescriptor scannerDesc = new RecordDescriptor(new ISerializerDeserializer[] {
- IntegerSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE,
- UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE,
- UTF8StringSerializerDeserializer.INSTANCE, FloatSerializerDeserializer.INSTANCE,
- UTF8StringSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE });
+ IntegerSerializerDeserializer.INSTANCE, new UTF8StringSerializerDeserializer(),
+ new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE,
+ new UTF8StringSerializerDeserializer(), FloatSerializerDeserializer.INSTANCE,
+ new UTF8StringSerializerDeserializer(), new UTF8StringSerializerDeserializer() });
IValueParserFactory[] valueParsers = new IValueParserFactory[] { IntegerParserFactory.INSTANCE,
UTF8StringParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE, IntegerParserFactory.INSTANCE,
UTF8StringParserFactory.INSTANCE, FloatParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE,
@@ -355,10 +355,10 @@
"data/tpch0.001/customer-part1.tbl")));
IFileSplitProvider splitProvider = new ConstantFileSplitProvider(fileSplits);
RecordDescriptor scannerDesc = new RecordDescriptor(new ISerializerDeserializer[] {
- IntegerSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE,
- UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE,
- UTF8StringSerializerDeserializer.INSTANCE, FloatSerializerDeserializer.INSTANCE,
- UTF8StringSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE });
+ IntegerSerializerDeserializer.INSTANCE, new UTF8StringSerializerDeserializer(),
+ new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE,
+ new UTF8StringSerializerDeserializer(), FloatSerializerDeserializer.INSTANCE,
+ new UTF8StringSerializerDeserializer(), new UTF8StringSerializerDeserializer() });
IValueParserFactory[] valueParsers = new IValueParserFactory[] { IntegerParserFactory.INSTANCE,
UTF8StringParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE, IntegerParserFactory.INSTANCE,
UTF8StringParserFactory.INSTANCE, FloatParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE,
@@ -407,10 +407,10 @@
"data/tpch0.001/customer.tbl")));
IFileSplitProvider splitProvider = new ConstantFileSplitProvider(fileSplits);
RecordDescriptor scannerDesc = new RecordDescriptor(new ISerializerDeserializer[] {
- IntegerSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE,
- UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE,
- UTF8StringSerializerDeserializer.INSTANCE, FloatSerializerDeserializer.INSTANCE,
- UTF8StringSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE });
+ IntegerSerializerDeserializer.INSTANCE, new UTF8StringSerializerDeserializer(),
+ new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE,
+ new UTF8StringSerializerDeserializer(), FloatSerializerDeserializer.INSTANCE,
+ new UTF8StringSerializerDeserializer(), new UTF8StringSerializerDeserializer() });
IValueParserFactory[] valueParsers = new IValueParserFactory[] { IntegerParserFactory.INSTANCE,
UTF8StringParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE, IntegerParserFactory.INSTANCE,
UTF8StringParserFactory.INSTANCE, FloatParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE,
@@ -492,10 +492,10 @@
"data/tpch0.001/customer.tbl")));
IFileSplitProvider splitProvider = new ConstantFileSplitProvider(fileSplits);
RecordDescriptor scannerDesc = new RecordDescriptor(new ISerializerDeserializer[] {
- IntegerSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE,
- UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE,
- UTF8StringSerializerDeserializer.INSTANCE, FloatSerializerDeserializer.INSTANCE,
- UTF8StringSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE });
+ IntegerSerializerDeserializer.INSTANCE, new UTF8StringSerializerDeserializer(),
+ new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE,
+ new UTF8StringSerializerDeserializer(), FloatSerializerDeserializer.INSTANCE,
+ new UTF8StringSerializerDeserializer(), new UTF8StringSerializerDeserializer() });
IValueParserFactory[] valueParsers = new IValueParserFactory[] { IntegerParserFactory.INSTANCE,
UTF8StringParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE, IntegerParserFactory.INSTANCE,
UTF8StringParserFactory.INSTANCE, FloatParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE,
@@ -663,7 +663,7 @@
DelimitedDataTupleParserFactory stringParser = new DelimitedDataTupleParserFactory(
new IValueParserFactory[] { UTF8StringParserFactory.INSTANCE }, '\u0000');
RecordDescriptor stringRec = new RecordDescriptor(
- new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE, });
+ new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer(), });
FileScanOperatorDescriptor scanOp = new FileScanOperatorDescriptor(spec, new ConstantFileSplitProvider(
inputSplits), stringParser, stringRec);
@@ -709,8 +709,8 @@
"data/tpch0.001/nation.tbl")));
IFileSplitProvider splitProvider = new ConstantFileSplitProvider(fileSplits);
RecordDescriptor scannerDesc = new RecordDescriptor(new ISerializerDeserializer[] {
- IntegerSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE,
- IntegerSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE });
+ IntegerSerializerDeserializer.INSTANCE, new UTF8StringSerializerDeserializer(),
+ IntegerSerializerDeserializer.INSTANCE, new UTF8StringSerializerDeserializer() });
IValueParserFactory[] valueParsers = new IValueParserFactory[] { IntegerParserFactory.INSTANCE,
UTF8StringParserFactory.INSTANCE, IntegerParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE };
FileScanOperatorDescriptor scanner = new FileScanOperatorDescriptor(spec, splitProvider,
@@ -817,10 +817,10 @@
"data/tpch0.001/customer.tbl")));
IFileSplitProvider splitProvider = new ConstantFileSplitProvider(fileSplits);
RecordDescriptor scannerDesc = new RecordDescriptor(new ISerializerDeserializer[] {
- IntegerSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE,
- UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE,
- UTF8StringSerializerDeserializer.INSTANCE, FloatSerializerDeserializer.INSTANCE,
- UTF8StringSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE });
+ IntegerSerializerDeserializer.INSTANCE, new UTF8StringSerializerDeserializer(),
+ new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE,
+ new UTF8StringSerializerDeserializer(), FloatSerializerDeserializer.INSTANCE,
+ new UTF8StringSerializerDeserializer(), new UTF8StringSerializerDeserializer() });
IValueParserFactory[] valueParsers = new IValueParserFactory[] { IntegerParserFactory.INSTANCE,
UTF8StringParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE, IntegerParserFactory.INSTANCE,
UTF8StringParserFactory.INSTANCE, FloatParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE,
diff --git a/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/tools/WriteValueTest.java b/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/tools/WriteValueTest.java
index 0968478..6770494 100644
--- a/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/tools/WriteValueTest.java
+++ b/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/tools/WriteValueTest.java
@@ -97,7 +97,7 @@
interm.reset();
dout.writeUTF(str);
baaos.reset();
- WriteValueTools.writeUTF8String(interm.getByteArray(), 0, interm.size(), baaos);
+ WriteValueTools.writeUTF8StringWithQuotes(str, baaos);
byte[] b = str.getBytes("UTF-8");
if (baaos.size() != b.length + 2) {
throw new Exception("Expecting to write " + b + " in " + b.length + " bytes, but found " + baaos.size()