Merged hyracks_asterix_stabilization r2462:r2562.
git-svn-id: https://hyracks.googlecode.com/svn/branches/hyracks_lsm_tree@2563 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-algebricks/hyracks-algebricks-core/src/main/java/edu/uci/ics/hyracks/algebricks/core/algebra/expressions/ScalarFunctionCallExpression.java b/hyracks-algebricks/hyracks-algebricks-core/src/main/java/edu/uci/ics/hyracks/algebricks/core/algebra/expressions/ScalarFunctionCallExpression.java
index e3307cd..b284b22 100644
--- a/hyracks-algebricks/hyracks-algebricks-core/src/main/java/edu/uci/ics/hyracks/algebricks/core/algebra/expressions/ScalarFunctionCallExpression.java
+++ b/hyracks-algebricks/hyracks-algebricks-core/src/main/java/edu/uci/ics/hyracks/algebricks/core/algebra/expressions/ScalarFunctionCallExpression.java
@@ -42,6 +42,7 @@
List<Mutable<ILogicalExpression>> clonedArgs = cloneArguments();
ScalarFunctionCallExpression funcExpr = new ScalarFunctionCallExpression(finfo, clonedArgs);
funcExpr.getAnnotations().putAll(cloneAnnotations());
+ funcExpr.setOpaqueParameters(this.getOpaqueParameters());
return funcExpr;
}
diff --git a/hyracks-algebricks/hyracks-algebricks-core/src/main/java/edu/uci/ics/hyracks/algebricks/core/algebra/expressions/UnnestingFunctionCallExpression.java b/hyracks-algebricks/hyracks-algebricks-core/src/main/java/edu/uci/ics/hyracks/algebricks/core/algebra/expressions/UnnestingFunctionCallExpression.java
index 71932d8..652f9b0 100644
--- a/hyracks-algebricks/hyracks-algebricks-core/src/main/java/edu/uci/ics/hyracks/algebricks/core/algebra/expressions/UnnestingFunctionCallExpression.java
+++ b/hyracks-algebricks/hyracks-algebricks-core/src/main/java/edu/uci/ics/hyracks/algebricks/core/algebra/expressions/UnnestingFunctionCallExpression.java
@@ -45,6 +45,7 @@
List<Mutable<ILogicalExpression>> clonedArgs = cloneArguments();
UnnestingFunctionCallExpression ufce = new UnnestingFunctionCallExpression(finfo, clonedArgs);
ufce.setReturnsUniqueValues(returnsUniqueValues);
+ ufce.setOpaqueParameters(this.getOpaqueParameters());
return ufce;
}
diff --git a/hyracks-algebricks/hyracks-algebricks-rewriter/src/main/java/edu/uci/ics/hyracks/algebricks/rewriter/rules/ComplexUnnestToProductRule.java b/hyracks-algebricks/hyracks-algebricks-rewriter/src/main/java/edu/uci/ics/hyracks/algebricks/rewriter/rules/ComplexUnnestToProductRule.java
index 48361c8..fa5000e 100644
--- a/hyracks-algebricks/hyracks-algebricks-rewriter/src/main/java/edu/uci/ics/hyracks/algebricks/rewriter/rules/ComplexUnnestToProductRule.java
+++ b/hyracks-algebricks/hyracks-algebricks-rewriter/src/main/java/edu/uci/ics/hyracks/algebricks/rewriter/rules/ComplexUnnestToProductRule.java
@@ -217,7 +217,8 @@
for (LogicalVariable producedVar : producedVars) {
if (outerUsedVars.contains(producedVar)) {
outerMatches++;
- } else if (innerUsedVars.contains(producedVar)) {
+ }
+ if (innerUsedVars.contains(producedVar)) {
innerMatches++;
}
}
@@ -227,24 +228,30 @@
// All produced vars used by outer partition.
outerOps.add(op);
targetUsedVars = outerUsedVars;
- } else if (innerMatches == producedVars.size() && !producedVars.isEmpty()) {
+ }
+ if (innerMatches == producedVars.size() && !producedVars.isEmpty()) {
// All produced vars used by inner partition.
innerOps.add(op);
targetUsedVars = innerUsedVars;
- } else if (innerMatches == 0 && outerMatches == 0) {
+ }
+ if (innerMatches == 0 && outerMatches == 0) {
// Op produces variables that are not used in the part of the plan we've seen (or it doesn't produce any vars).
// Try to figure out where it belongs by analyzing the used variables.
List<LogicalVariable> usedVars = new ArrayList<LogicalVariable>();
VariableUtilities.getUsedVariables(op, usedVars);
for (LogicalVariable usedVar : usedVars) {
+ boolean canBreak = false;
if (outerUsedVars.contains(usedVar)) {
outerOps.add(op);
targetUsedVars = outerUsedVars;
- break;
+ canBreak = true;
}
if (innerUsedVars.contains(usedVar)) {
innerOps.add(op);
targetUsedVars = innerUsedVars;
+ canBreak = true;
+ }
+ if (canBreak) {
break;
}
}
diff --git a/hyracks-algebricks/hyracks-algebricks-rewriter/src/main/java/edu/uci/ics/hyracks/algebricks/rewriter/rules/PushFunctionsBelowJoin.java b/hyracks-algebricks/hyracks-algebricks-rewriter/src/main/java/edu/uci/ics/hyracks/algebricks/rewriter/rules/PushFunctionsBelowJoin.java
new file mode 100644
index 0000000..16b010e
--- /dev/null
+++ b/hyracks-algebricks/hyracks-algebricks-rewriter/src/main/java/edu/uci/ics/hyracks/algebricks/rewriter/rules/PushFunctionsBelowJoin.java
@@ -0,0 +1,208 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.algebricks.rewriter.rules;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.lang3.mutable.Mutable;
+import org.apache.commons.lang3.mutable.MutableObject;
+
+import edu.uci.ics.hyracks.algebricks.common.exceptions.AlgebricksException;
+import edu.uci.ics.hyracks.algebricks.core.algebra.base.ILogicalExpression;
+import edu.uci.ics.hyracks.algebricks.core.algebra.base.ILogicalOperator;
+import edu.uci.ics.hyracks.algebricks.core.algebra.base.IOptimizationContext;
+import edu.uci.ics.hyracks.algebricks.core.algebra.base.LogicalExpressionTag;
+import edu.uci.ics.hyracks.algebricks.core.algebra.base.LogicalOperatorTag;
+import edu.uci.ics.hyracks.algebricks.core.algebra.base.LogicalVariable;
+import edu.uci.ics.hyracks.algebricks.core.algebra.expressions.AbstractFunctionCallExpression;
+import edu.uci.ics.hyracks.algebricks.core.algebra.expressions.AbstractLogicalExpression;
+import edu.uci.ics.hyracks.algebricks.core.algebra.expressions.VariableReferenceExpression;
+import edu.uci.ics.hyracks.algebricks.core.algebra.functions.FunctionIdentifier;
+import edu.uci.ics.hyracks.algebricks.core.algebra.operators.logical.AbstractBinaryJoinOperator;
+import edu.uci.ics.hyracks.algebricks.core.algebra.operators.logical.AbstractLogicalOperator;
+import edu.uci.ics.hyracks.algebricks.core.algebra.operators.logical.AssignOperator;
+import edu.uci.ics.hyracks.algebricks.core.algebra.operators.logical.visitors.VariableUtilities;
+import edu.uci.ics.hyracks.algebricks.core.rewriter.base.IAlgebraicRewriteRule;
+
+/**
+ * Pushes function-call expressions below a join if possible.
+ * Assigns the result of such function-calls expressions to new variables, and replaces the original
+ * expression with a corresponding variable reference expression.
+ * This rule can help reduce the cost of computing expensive functions by pushing them below
+ * a join (which may blow up the cardinality).
+ * Also, this rule may help to enable other rules such as common subexpression elimination, again to reduce
+ * the number of calls to expensive functions.
+ *
+ * Example: (we are pushing pushMeFunc)
+ *
+ * Before plan:
+ * assign [$$10] <- [funcA(funcB(pushMeFunc($$3, $$4)))]
+ * join (some condition)
+ * join_branch_0 where $$3 and $$4 are not live
+ * ...
+ * join_branch_1 where $$3 and $$4 are live
+ * ...
+ *
+ * After plan:
+ * assign [$$10] <- [funcA(funcB($$11))]
+ * join (some condition)
+ * join_branch_0 where $$3 and $$4 are not live
+ * ...
+ * join_branch_1 where $$3 and $$4 are live
+ * assign[$$11] <- [pushMeFunc($$3, $$4)]
+ * ...
+ */
+public class PushFunctionsBelowJoin implements IAlgebraicRewriteRule {
+
+ private final Set<FunctionIdentifier> toPushFuncIdents;
+ private final List<Mutable<ILogicalExpression>> funcExprs = new ArrayList<Mutable<ILogicalExpression>>();
+ private final List<LogicalVariable> usedVars = new ArrayList<LogicalVariable>();
+ private final List<LogicalVariable> liveVars = new ArrayList<LogicalVariable>();
+
+ public PushFunctionsBelowJoin(Set<FunctionIdentifier> toPushFuncIdents) {
+ this.toPushFuncIdents = toPushFuncIdents;
+ }
+
+ @Override
+ public boolean rewritePre(Mutable<ILogicalOperator> opRef, IOptimizationContext context) throws AlgebricksException {
+ return false;
+ }
+
+ @Override
+ public boolean rewritePost(Mutable<ILogicalOperator> opRef, IOptimizationContext context)
+ throws AlgebricksException {
+ AbstractLogicalOperator op = (AbstractLogicalOperator) opRef.getValue();
+ if (op.getOperatorTag() != LogicalOperatorTag.ASSIGN) {
+ return false;
+ }
+ AssignOperator assignOp = (AssignOperator) op;
+
+ // Find a join operator below this assign.
+ Mutable<ILogicalOperator> joinOpRef = findJoinOp(assignOp.getInputs().get(0));
+ if (joinOpRef == null) {
+ return false;
+ }
+ AbstractBinaryJoinOperator joinOp = (AbstractBinaryJoinOperator) joinOpRef.getValue();
+
+ // Check if the assign uses a function that we wish to push below the join if possible.
+ funcExprs.clear();
+ gatherFunctionCalls(assignOp, funcExprs);
+ if (funcExprs.isEmpty()) {
+ return false;
+ }
+
+ // Try to push the functions down the input branches of the join.
+ boolean modified = false;
+ if (pushDownFunctions(joinOp, 0, funcExprs, context)) {
+ modified = true;
+ }
+ if (pushDownFunctions(joinOp, 1, funcExprs, context)) {
+ modified = true;
+ }
+ if (modified) {
+ context.computeAndSetTypeEnvironmentForOperator(joinOp);
+ }
+ return modified;
+ }
+
+ private Mutable<ILogicalOperator> findJoinOp(Mutable<ILogicalOperator> opRef) {
+ AbstractLogicalOperator op = (AbstractLogicalOperator) opRef.getValue();
+ switch (op.getOperatorTag()) {
+ case INNERJOIN:
+ case LEFTOUTERJOIN: {
+ return opRef;
+ }
+ // Bail on these operators.
+ case GROUP:
+ case AGGREGATE:
+ case DISTINCT:
+ case UNNEST_MAP: {
+ return null;
+ }
+ // Traverse children.
+ default: {
+ for (Mutable<ILogicalOperator> childOpRef : op.getInputs()) {
+ return findJoinOp(childOpRef);
+ }
+ }
+ }
+ return null;
+ }
+
+ private void gatherFunctionCalls(AssignOperator assignOp, List<Mutable<ILogicalExpression>> funcExprs) {
+ for (Mutable<ILogicalExpression> exprRef : assignOp.getExpressions()) {
+ gatherFunctionCalls(exprRef, funcExprs);
+ }
+ }
+
+ private void gatherFunctionCalls(Mutable<ILogicalExpression> exprRef, List<Mutable<ILogicalExpression>> funcExprs) {
+ AbstractLogicalExpression expr = (AbstractLogicalExpression) exprRef.getValue();
+ if (expr.getExpressionTag() != LogicalExpressionTag.FUNCTION_CALL) {
+ return;
+ }
+ // Check whether the function is a function we want to push.
+ AbstractFunctionCallExpression funcExpr = (AbstractFunctionCallExpression) expr;
+ if (toPushFuncIdents.contains(funcExpr.getFunctionIdentifier())) {
+ funcExprs.add(exprRef);
+ }
+ // Traverse arguments.
+ for (Mutable<ILogicalExpression> funcArg : funcExpr.getArguments()) {
+ gatherFunctionCalls(funcArg, funcExprs);
+ }
+ }
+
+ private boolean pushDownFunctions(AbstractBinaryJoinOperator joinOp, int inputIndex,
+ List<Mutable<ILogicalExpression>> funcExprs, IOptimizationContext context) throws AlgebricksException {
+ ILogicalOperator joinInputOp = joinOp.getInputs().get(inputIndex).getValue();
+ liveVars.clear();
+ VariableUtilities.getLiveVariables(joinInputOp, liveVars);
+ Iterator<Mutable<ILogicalExpression>> funcIter = funcExprs.iterator();
+ List<LogicalVariable> assignVars = null;
+ List<Mutable<ILogicalExpression>> assignExprs = null;
+ while (funcIter.hasNext()) {
+ Mutable<ILogicalExpression> funcExprRef = funcIter.next();
+ ILogicalExpression funcExpr = funcExprRef.getValue();
+ usedVars.clear();
+ funcExpr.getUsedVariables(usedVars);
+ // Check if we can push the function down this branch.
+ if (liveVars.containsAll(usedVars)) {
+ if (assignVars == null) {
+ assignVars = new ArrayList<LogicalVariable>();
+ assignExprs = new ArrayList<Mutable<ILogicalExpression>>();
+ }
+ // Replace the original expression with a variable reference expression.
+ LogicalVariable replacementVar = context.newVar();
+ assignVars.add(replacementVar);
+ assignExprs.add(new MutableObject<ILogicalExpression>(funcExpr));
+ funcExprRef.setValue(new VariableReferenceExpression(replacementVar));
+ funcIter.remove();
+ }
+ }
+ // Create new assign operator below the join if any functions can be pushed.
+ if (assignVars != null) {
+ AssignOperator newAssign = new AssignOperator(assignVars, assignExprs);
+ newAssign.getInputs().add(new MutableObject<ILogicalOperator>(joinInputOp));
+ newAssign.setExecutionMode(joinOp.getExecutionMode());
+ joinOp.getInputs().get(inputIndex).setValue(newAssign);
+ context.computeAndSetTypeEnvironmentForOperator(newAssign);
+ return true;
+ }
+ return false;
+ }
+}
diff --git a/hyracks-data/hyracks-data-std/src/main/java/edu/uci/ics/hyracks/data/std/util/ArrayBackedValueStorage.java b/hyracks-data/hyracks-data-std/src/main/java/edu/uci/ics/hyracks/data/std/util/ArrayBackedValueStorage.java
index e8dc9b4..7d7d2c1 100644
--- a/hyracks-data/hyracks-data-std/src/main/java/edu/uci/ics/hyracks/data/std/util/ArrayBackedValueStorage.java
+++ b/hyracks-data/hyracks-data-std/src/main/java/edu/uci/ics/hyracks/data/std/util/ArrayBackedValueStorage.java
@@ -1,34 +1,28 @@
package edu.uci.ics.hyracks.data.std.util;
import java.io.DataOutput;
-import java.io.DataOutputStream;
import java.io.IOException;
import edu.uci.ics.hyracks.data.std.api.IMutableValueStorage;
import edu.uci.ics.hyracks.data.std.api.IValueReference;
public class ArrayBackedValueStorage implements IMutableValueStorage {
- private final ByteArrayAccessibleOutputStream baaos;
- private final DataOutputStream dos;
-
- public ArrayBackedValueStorage() {
- baaos = new ByteArrayAccessibleOutputStream();
- dos = new DataOutputStream(baaos);
- }
+
+ private final GrowableArray data = new GrowableArray();
@Override
public void reset() {
- baaos.reset();
+ data.reset();
}
@Override
public DataOutput getDataOutput() {
- return dos;
+ return data.getDataOutput();
}
@Override
public byte[] getByteArray() {
- return baaos.getByteArray();
+ return data.getByteArray();
}
@Override
@@ -38,12 +32,12 @@
@Override
public int getLength() {
- return baaos.size();
+ return data.getLength();
}
public void append(IValueReference value) {
try {
- dos.write(value.getByteArray(), value.getStartOffset(), value.getLength());
+ data.append(value);
} catch (IOException e) {
e.printStackTrace();
}
diff --git a/hyracks-data/hyracks-data-std/src/main/java/edu/uci/ics/hyracks/data/std/util/GrowableArray.java b/hyracks-data/hyracks-data-std/src/main/java/edu/uci/ics/hyracks/data/std/util/GrowableArray.java
new file mode 100644
index 0000000..c174d4e
--- /dev/null
+++ b/hyracks-data/hyracks-data-std/src/main/java/edu/uci/ics/hyracks/data/std/util/GrowableArray.java
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.data.std.util;
+
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.data.std.api.IDataOutputProvider;
+import edu.uci.ics.hyracks.data.std.api.IValueReference;
+
+public class GrowableArray implements IDataOutputProvider {
+ private final ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
+ private final DataOutputStream dos = new DataOutputStream(baaos);
+
+ @Override
+ public DataOutput getDataOutput() {
+ return dos;
+ }
+
+ public void reset() {
+ baaos.reset();
+ }
+
+ public byte[] getByteArray() {
+ return baaos.getByteArray();
+ }
+
+ public int getLength() {
+ return baaos.size();
+ }
+
+ public void append(IValueReference value) throws IOException {
+ dos.write(value.getByteArray(), value.getStartOffset(), value.getLength());
+ }
+}
diff --git a/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/comm/io/ArrayTupleBuilder.java b/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/comm/io/ArrayTupleBuilder.java
index 989bc6b..8c865c4 100644
--- a/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/comm/io/ArrayTupleBuilder.java
+++ b/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/comm/io/ArrayTupleBuilder.java
@@ -15,14 +15,13 @@
package edu.uci.ics.hyracks.dataflow.common.comm.io;
import java.io.DataOutput;
-import java.io.DataOutputStream;
import java.io.IOException;
import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.data.std.api.IDataOutputProvider;
-import edu.uci.ics.hyracks.data.std.util.ByteArrayAccessibleOutputStream;
+import edu.uci.ics.hyracks.data.std.util.GrowableArray;
/**
* Array backed tuple builder.
@@ -30,25 +29,21 @@
* @author vinayakb
*/
public class ArrayTupleBuilder implements IDataOutputProvider {
- private final ByteArrayAccessibleOutputStream baaos;
- private final DataOutputStream dos;
+ private final GrowableArray fieldData = new GrowableArray();
private final int[] fEndOffsets;
private int nextField;
public ArrayTupleBuilder(int nFields) {
- baaos = new ByteArrayAccessibleOutputStream();
- dos = new DataOutputStream(baaos);
fEndOffsets = new int[nFields];
}
/**
* Resets the builder.
- *
* reset() must be called before attempting to create a new tuple.
*/
public void reset() {
nextField = 0;
- baaos.reset();
+ fieldData.reset();
}
/**
@@ -66,7 +61,7 @@
* @return Data byte array.
*/
public byte[] getByteArray() {
- return baaos.getByteArray();
+ return fieldData.getByteArray();
}
/**
@@ -75,7 +70,7 @@
* @return data area size.
*/
public int getSize() {
- return baaos.size();
+ return fieldData.getLength();
}
/**
@@ -96,14 +91,15 @@
int fStartOffset = accessor.getFieldStartOffset(tIndex, fIndex);
int fLen = accessor.getFieldEndOffset(tIndex, fIndex) - fStartOffset;
try {
- dos.write(accessor.getBuffer().array(), startOffset + accessor.getFieldSlotsLength() + fStartOffset, fLen);
+ fieldData.getDataOutput().write(accessor.getBuffer().array(),
+ startOffset + accessor.getFieldSlotsLength() + fStartOffset, fLen);
if (FrameConstants.DEBUG_FRAME_IO) {
- dos.writeInt(FrameConstants.FRAME_FIELD_MAGIC);
+ fieldData.getDataOutput().writeInt(FrameConstants.FRAME_FIELD_MAGIC);
}
} catch (IOException e) {
throw new HyracksDataException(e);
}
- fEndOffsets[nextField++] = baaos.size();
+ fEndOffsets[nextField++] = fieldData.getLength();
}
/**
@@ -117,8 +113,8 @@
* @throws HyracksDataException
*/
public <T> void addField(ISerializerDeserializer<T> serDeser, T instance) throws HyracksDataException {
- serDeser.serialize(instance, dos);
- fEndOffsets[nextField++] = baaos.size();
+ serDeser.serialize(instance, fieldData.getDataOutput());
+ fEndOffsets[nextField++] = fieldData.getLength();
}
/**
@@ -134,11 +130,11 @@
*/
public void addField(byte[] bytes, int start, int length) throws HyracksDataException {
try {
- dos.write(bytes, start, length);
+ fieldData.getDataOutput().write(bytes, start, length);
} catch (IOException e) {
throw new HyracksDataException(e);
}
- fEndOffsets[nextField++] = baaos.size();
+ fEndOffsets[nextField++] = fieldData.getLength();
}
/**
@@ -146,7 +142,14 @@
*/
@Override
public DataOutput getDataOutput() {
- return dos;
+ return fieldData.getDataOutput();
+ }
+
+ /**
+ * Get the growable array storing the field data.
+ */
+ public GrowableArray getFieldData() {
+ return fieldData;
}
/**
@@ -156,6 +159,6 @@
* data.
*/
public void addFieldEndOffset() {
- fEndOffsets[nextField++] = baaos.size();
+ fEndOffsets[nextField++] = fieldData.getLength();
}
}
\ No newline at end of file
diff --git a/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java b/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java
index a29209c..8a30a71 100644
--- a/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java
+++ b/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java
@@ -18,17 +18,19 @@
import java.io.IOException;
public class StringUtils {
- public static void writeCharAsModifiedUTF8(char c, DataOutput dos) throws IOException {
-
+ public static int writeCharAsModifiedUTF8(char c, DataOutput dos) throws IOException {
if (c >= 0x0000 && c <= 0x007F) {
dos.writeByte(c);
+ return 1;
} else if (c <= 0x07FF) {
dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x3F)));
dos.writeByte((byte) (0x80 | (c & 0x3F)));
+ return 2;
} else {
dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
dos.writeByte((byte) (0x80 | (c & 0x3F)));
+ return 3;
}
}
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
index 9495516..0ad10a7 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
@@ -15,13 +15,13 @@
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.dataflow;
-import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.data.std.util.GrowableArray;
import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
@@ -42,7 +42,7 @@
private FrameTupleAccessor accessor;
private ArrayTupleBuilder builder;
- private DataOutput builderDos;
+ private GrowableArray builderData;
private FrameTupleAppender appender;
private ByteBuffer writeBuffer;
@@ -63,7 +63,7 @@
accessor = new FrameTupleAccessor(ctx.getFrameSize(), inputRecDesc);
writeBuffer = ctx.allocateFrame();
builder = new ArrayTupleBuilder(outputRecDesc.getFieldCount());
- builderDos = builder.getDataOutput();
+ builderData = builder.getFieldData();
appender = new FrameTupleAppender(ctx.getFrameSize());
appender.reset(writeBuffer, true);
writer.open();
@@ -97,7 +97,7 @@
builder.reset();
try {
IToken token = tokenizer.getToken();
- token.serializeToken(builderDos);
+ token.serializeToken(builderData);
builder.addFieldEndOffset();
// Add number of tokens if requested.
if (addNumTokensKey) {
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/AbstractTOccurrenceSearcher.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/AbstractTOccurrenceSearcher.java
index af597bc..d973967 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/AbstractTOccurrenceSearcher.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/AbstractTOccurrenceSearcher.java
@@ -103,7 +103,7 @@
queryTokenBuilder.reset();
try {
IToken token = queryTokenizer.getToken();
- token.serializeToken(queryTokenBuilder.getDataOutput());
+ token.serializeToken(queryTokenBuilder.getFieldData());
queryTokenBuilder.addFieldEndOffset();
// WARNING: assuming one frame is big enough to hold all tokens
queryTokenAppender.append(queryTokenBuilder.getFieldEndOffsets(), queryTokenBuilder.getByteArray(), 0,
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
index 7eb62ac..7c0ec4d 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
@@ -1,16 +1,20 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8Token.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8Token.java
index c775955..c9b6e1f 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8Token.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8Token.java
@@ -1,16 +1,20 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
@@ -18,6 +22,7 @@
import java.io.IOException;
import edu.uci.ics.hyracks.data.std.primitive.UTF8StringPointable;
+import edu.uci.ics.hyracks.data.std.util.GrowableArray;
public abstract class AbstractUTF8Token implements IToken {
public static final int GOLDEN_RATIO_32 = 0x09e3779b9;
@@ -93,8 +98,8 @@
}
@Override
- public void serializeTokenCount(DataOutput dos) throws IOException {
- handleCountTypeTag(dos);
- dos.writeInt(tokenCount);
+ public void serializeTokenCount(GrowableArray out) throws IOException {
+ handleCountTypeTag(out.getDataOutput());
+ out.getDataOutput().writeInt(tokenCount);
}
}
\ No newline at end of file
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8TokenFactory.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8TokenFactory.java
index ed8935c..1507613 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8TokenFactory.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8TokenFactory.java
@@ -1,16 +1,20 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
index 2fa71a5..4c11523 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
@@ -1,16 +1,20 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
index 986d938..08b962b 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
@@ -1,36 +1,42 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
-public class DelimitedUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
+public class DelimitedUTF8StringBinaryTokenizerFactory implements
+ IBinaryTokenizerFactory {
- private static final long serialVersionUID = 1L;
- private final boolean ignoreTokenCount;
- private final boolean sourceHasTypeTag;
- private final ITokenFactory tokenFactory;
+ private static final long serialVersionUID = 1L;
+ private final boolean ignoreTokenCount;
+ private final boolean sourceHasTypeTag;
+ private final ITokenFactory tokenFactory;
- public DelimitedUTF8StringBinaryTokenizerFactory(boolean ignoreTokenCount, boolean sourceHasTypeTag,
- ITokenFactory tokenFactory) {
- this.ignoreTokenCount = ignoreTokenCount;
- this.sourceHasTypeTag = sourceHasTypeTag;
- this.tokenFactory = tokenFactory;
- }
+ public DelimitedUTF8StringBinaryTokenizerFactory(boolean ignoreTokenCount,
+ boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+ this.ignoreTokenCount = ignoreTokenCount;
+ this.sourceHasTypeTag = sourceHasTypeTag;
+ this.tokenFactory = tokenFactory;
+ }
- @Override
- public IBinaryTokenizer createTokenizer() {
- return new DelimitedUTF8StringBinaryTokenizer(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
- }
+ @Override
+ public IBinaryTokenizer createTokenizer() {
+ return new DelimitedUTF8StringBinaryTokenizer(ignoreTokenCount,
+ sourceHasTypeTag, tokenFactory);
+ }
}
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8NGramToken.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8NGramToken.java
index ac0a73c..632bf9a 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8NGramToken.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8NGramToken.java
@@ -1,24 +1,28 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
-import java.io.DataOutput;
import java.io.IOException;
import edu.uci.ics.hyracks.data.std.primitive.UTF8StringPointable;
+import edu.uci.ics.hyracks.data.std.util.GrowableArray;
public class HashedUTF8NGramToken extends UTF8NGramToken {
public HashedUTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
@@ -26,8 +30,8 @@
}
@Override
- public void serializeToken(DataOutput dos) throws IOException {
- handleTokenTypeTag(dos);
+ public void serializeToken(GrowableArray out) throws IOException {
+ handleTokenTypeTag(out.getDataOutput());
int hash = GOLDEN_RATIO_32;
@@ -55,6 +59,6 @@
// token count
hash += tokenCount;
- dos.writeInt(hash);
+ out.getDataOutput().writeInt(hash);
}
}
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java
index 22efc92..e1d8e31 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java
@@ -1,16 +1,20 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8WordToken.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8WordToken.java
index c7854d7..32954f9 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8WordToken.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8WordToken.java
@@ -1,24 +1,28 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
-import java.io.DataOutput;
import java.io.IOException;
import edu.uci.ics.hyracks.data.std.primitive.UTF8StringPointable;
+import edu.uci.ics.hyracks.data.std.util.GrowableArray;
public class HashedUTF8WordToken extends UTF8WordToken {
@@ -72,12 +76,12 @@
}
@Override
- public void serializeToken(DataOutput dos) throws IOException {
+ public void serializeToken(GrowableArray out) throws IOException {
if (tokenTypeTag > 0) {
- dos.write(tokenTypeTag);
+ out.getDataOutput().write(tokenTypeTag);
}
// serialize hash value
- dos.writeInt(hash);
+ out.getDataOutput().writeInt(hash);
}
}
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java
index f551e30..a4788c4 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java
@@ -1,21 +1,24 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
-
public class HashedUTF8WordTokenFactory extends AbstractUTF8TokenFactory {
private static final long serialVersionUID = 1L;
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
index 281f2c8..f88e744 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
@@ -1,16 +1,20 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizerFactory.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizerFactory.java
index 9bf1a69..5890124 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizerFactory.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizerFactory.java
@@ -1,16 +1,20 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/INGramToken.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/INGramToken.java
index ea7d5b4..40351c4 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/INGramToken.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/INGramToken.java
@@ -1,16 +1,20 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IToken.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IToken.java
index 63358b6..7b1a130 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IToken.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IToken.java
@@ -1,23 +1,28 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
-import java.io.DataOutput;
import java.io.IOException;
+import edu.uci.ics.hyracks.data.std.util.GrowableArray;
+
public interface IToken {
public byte[] getData();
@@ -30,7 +35,7 @@
public void reset(byte[] data, int start, int length, int tokenLength,
int tokenCount);
- public void serializeToken(DataOutput dos) throws IOException;
+ public void serializeToken(GrowableArray out) throws IOException;
- public void serializeTokenCount(DataOutput dos) throws IOException;
+ public void serializeTokenCount(GrowableArray out) throws IOException;
}
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/ITokenFactory.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/ITokenFactory.java
index cf94baa..5765263 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/ITokenFactory.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/ITokenFactory.java
@@ -1,16 +1,20 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IntArray.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IntArray.java
index 6ad2bf9..6bae90b 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IntArray.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IntArray.java
@@ -1,16 +1,20 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
index de47c33..0af0335 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
@@ -1,16 +1,20 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/UTF8NGramToken.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/UTF8NGramToken.java
index d2a1ed4..8713499 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/UTF8NGramToken.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/UTF8NGramToken.java
@@ -1,24 +1,28 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
-import java.io.DataOutput;
import java.io.IOException;
import edu.uci.ics.hyracks.data.std.primitive.UTF8StringPointable;
+import edu.uci.ics.hyracks.data.std.util.GrowableArray;
import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
public class UTF8NGramToken extends AbstractUTF8Token implements INGramToken {
@@ -45,34 +49,39 @@
}
@Override
- public void serializeToken(DataOutput dos) throws IOException {
- handleTokenTypeTag(dos);
+ public void serializeToken(GrowableArray out) throws IOException {
+ handleTokenTypeTag(out.getDataOutput());
+ int tokenUTF8LenOff = out.getLength();
// regular chars
int numRegChars = tokenLength - numPreChars - numPostChars;
// assuming pre and post char need 1-byte each in utf8
- int tokenUTF8Len = getLowerCaseUTF8Len(numRegChars) + numPreChars + numPostChars;
+ int tokenUTF8Len = numPreChars + numPostChars;
- // write utf8 length indicator
- StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+ // Write dummy UTF length which will be correctly set later.
+ out.getDataOutput().writeShort(0);
// pre chars
for (int i = 0; i < numPreChars; i++) {
- StringUtils.writeCharAsModifiedUTF8(PRECHAR, dos);
+ StringUtils.writeCharAsModifiedUTF8(PRECHAR, out.getDataOutput());
}
int pos = start;
for (int i = 0; i < numRegChars; i++) {
char c = Character.toLowerCase(UTF8StringPointable.charAt(data, pos));
- StringUtils.writeCharAsModifiedUTF8(c, dos);
+ tokenUTF8Len += StringUtils.writeCharAsModifiedUTF8(c, out.getDataOutput());
pos += UTF8StringPointable.charSize(data, pos);
}
// post chars
for (int i = 0; i < numPostChars; i++) {
- StringUtils.writeCharAsModifiedUTF8(POSTCHAR, dos);
+ StringUtils.writeCharAsModifiedUTF8(POSTCHAR, out.getDataOutput());
}
+
+ // Set UTF length of token.
+ out.getByteArray()[tokenUTF8LenOff] = (byte) ((tokenUTF8Len >>> 8) & 0xFF);
+ out.getByteArray()[tokenUTF8LenOff + 1] = (byte) ((tokenUTF8Len >>> 0) & 0xFF);
}
public void setNumPrePostChars(int numPreChars, int numPostChars) {
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/UTF8NGramTokenFactory.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/UTF8NGramTokenFactory.java
index 299e678..d26a409 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/UTF8NGramTokenFactory.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/UTF8NGramTokenFactory.java
@@ -1,16 +1,20 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/UTF8WordToken.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/UTF8WordToken.java
index e29d3bb..dbfc76f 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/UTF8WordToken.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/UTF8WordToken.java
@@ -1,24 +1,28 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
-import java.io.DataOutput;
import java.io.IOException;
import edu.uci.ics.hyracks.data.std.primitive.UTF8StringPointable;
+import edu.uci.ics.hyracks.data.std.util.GrowableArray;
import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
public class UTF8WordToken extends AbstractUTF8Token {
@@ -28,16 +32,20 @@
}
@Override
- public void serializeToken(DataOutput dos) throws IOException {
- handleTokenTypeTag(dos);
-
- int tokenUTF8Len = getLowerCaseUTF8Len(tokenLength);
- StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+ public void serializeToken(GrowableArray out) throws IOException {
+ handleTokenTypeTag(out.getDataOutput());
+ int tokenUTF8LenOff = out.getLength();
+ int tokenUTF8Len = 0;
+ // Write dummy UTF length which will be correctly set later.
+ out.getDataOutput().writeShort(0);
int pos = start;
for (int i = 0; i < tokenLength; i++) {
char c = Character.toLowerCase(UTF8StringPointable.charAt(data, pos));
- StringUtils.writeCharAsModifiedUTF8(c, dos);
+ tokenUTF8Len += StringUtils.writeCharAsModifiedUTF8(c, out.getDataOutput());
pos += UTF8StringPointable.charSize(data, pos);
}
+ // Set UTF length of token.
+ out.getByteArray()[tokenUTF8LenOff] = (byte) ((tokenUTF8Len >>> 8) & 0xFF);
+ out.getByteArray()[tokenUTF8LenOff + 1] = (byte) ((tokenUTF8Len >>> 0) & 0xFF);
}
}
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/UTF8WordTokenFactory.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/UTF8WordTokenFactory.java
index 45020e9..023e957 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/UTF8WordTokenFactory.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/UTF8WordTokenFactory.java
@@ -1,16 +1,20 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTokenizingTupleIterator.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTokenizingTupleIterator.java
index 4f8f635..b9f9362 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTokenizingTupleIterator.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTokenizingTupleIterator.java
@@ -58,7 +58,7 @@
tupleBuilder.reset();
// Add token field.
try {
- token.serializeToken(tupleBuilder.getDataOutput());
+ token.serializeToken(tupleBuilder.getFieldData());
} catch (IOException e) {
throw new HyracksDataException(e);
}
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/PartitionedInvertedIndexTokenizingTupleIterator.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/PartitionedInvertedIndexTokenizingTupleIterator.java
index a6a4bc1..8a18a91 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/PartitionedInvertedIndexTokenizingTupleIterator.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/PartitionedInvertedIndexTokenizingTupleIterator.java
@@ -49,7 +49,7 @@
tupleBuilder.reset();
try {
// Add token field.
- token.serializeToken(tupleBuilder.getDataOutput());
+ token.serializeToken(tupleBuilder.getFieldData());
tupleBuilder.addFieldEndOffset();
// Add field with number of tokens.
tupleBuilder.getDataOutput().writeShort(numTokens);
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramTokenizerTest.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramTokenizerTest.java
index 13bbc35..33ea4f5 100644
--- a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramTokenizerTest.java
+++ b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramTokenizerTest.java
@@ -29,215 +29,200 @@
import org.junit.Before;
import org.junit.Test;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.AbstractUTF8Token;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.HashedUTF8NGramTokenFactory;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8NGramTokenFactory;
+import edu.uci.ics.hyracks.data.std.util.GrowableArray;
public class NGramTokenizerTest {
- private char PRECHAR = '#';
- private char POSTCHAR = '$';
+ private char PRECHAR = '#';
+ private char POSTCHAR = '$';
- private String str = "Jürgen S. Generic's Car";
- private byte[] inputBuffer;
+ private String str = "Jürgen S. Generic's Car";
+ private byte[] inputBuffer;
- private int gramLength = 3;
+ private int gramLength = 3;
- private void getExpectedGrams(String s, int gramLength,
- ArrayList<String> grams, boolean prePost) {
+ private void getExpectedGrams(String s, int gramLength, ArrayList<String> grams, boolean prePost) {
- String tmp = s.toLowerCase();
- if (prePost) {
- StringBuilder preBuilder = new StringBuilder();
- for (int i = 0; i < gramLength - 1; i++) {
- preBuilder.append(PRECHAR);
- }
- String pre = preBuilder.toString();
+ String tmp = s.toLowerCase();
+ if (prePost) {
+ StringBuilder preBuilder = new StringBuilder();
+ for (int i = 0; i < gramLength - 1; i++) {
+ preBuilder.append(PRECHAR);
+ }
+ String pre = preBuilder.toString();
- StringBuilder postBuilder = new StringBuilder();
- for (int i = 0; i < gramLength - 1; i++) {
- postBuilder.append(POSTCHAR);
- }
- String post = postBuilder.toString();
+ StringBuilder postBuilder = new StringBuilder();
+ for (int i = 0; i < gramLength - 1; i++) {
+ postBuilder.append(POSTCHAR);
+ }
+ String post = postBuilder.toString();
- tmp = pre + s.toLowerCase() + post;
- }
+ tmp = pre + s.toLowerCase() + post;
+ }
- for (int i = 0; i < tmp.length() - gramLength + 1; i++) {
- String gram = tmp.substring(i, i + gramLength);
- grams.add(gram);
- }
- }
+ for (int i = 0; i < tmp.length() - gramLength + 1; i++) {
+ String gram = tmp.substring(i, i + gramLength);
+ grams.add(gram);
+ }
+ }
- @Before
- public void init() throws Exception {
- // serialize string into bytes
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutput dos = new DataOutputStream(baos);
- dos.writeUTF(str);
- inputBuffer = baos.toByteArray();
- }
+ @Before
+ public void init() throws Exception {
+ // serialize string into bytes
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ DataOutput dos = new DataOutputStream(baos);
+ dos.writeUTF(str);
+ inputBuffer = baos.toByteArray();
+ }
- void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost)
- throws IOException {
- HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
- NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(
- gramLength, prePost, false, false, tokenFactory);
- tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+ void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost) throws IOException {
+ HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+ NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, false,
+ false, tokenFactory);
+ tokenizer.reset(inputBuffer, 0, inputBuffer.length);
- ArrayList<String> expectedGrams = new ArrayList<String>();
- getExpectedGrams(str, gramLength, expectedGrams, prePost);
- ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
- HashMap<String, Integer> gramCounts = new HashMap<String, Integer>();
- for (String s : expectedGrams) {
- Integer count = gramCounts.get(s);
- if (count == null) {
- count = 1;
- gramCounts.put(s, count);
- } else {
- count++;
- }
+ ArrayList<String> expectedGrams = new ArrayList<String>();
+ getExpectedGrams(str, gramLength, expectedGrams, prePost);
+ ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
+ HashMap<String, Integer> gramCounts = new HashMap<String, Integer>();
+ for (String s : expectedGrams) {
+ Integer count = gramCounts.get(s);
+ if (count == null) {
+ count = 1;
+ gramCounts.put(s, count);
+ } else {
+ count++;
+ }
- int hash = tokenHash(s, count);
- expectedHashedGrams.add(hash);
- }
+ int hash = tokenHash(s, count);
+ expectedHashedGrams.add(hash);
+ }
- int tokenCount = 0;
+ int tokenCount = 0;
- while (tokenizer.hasNext()) {
- tokenizer.next();
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
- // serialize hashed token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
+ // serialize hashed token
+ GrowableArray tokenData = new GrowableArray();
- IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
+ IToken token = tokenizer.getToken();
+ token.serializeToken(tokenData);
- // deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(
- tokenBaos.toByteArray());
- DataInput in = new DataInputStream(bais);
+ // deserialize token
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
+ DataInput in = new DataInputStream(bais);
- Integer hashedGram = in.readInt();
+ Integer hashedGram = in.readInt();
- // System.out.println(hashedGram);
+ // System.out.println(hashedGram);
- Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
+ Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
- tokenCount++;
- }
- // System.out.println("---------");
- }
+ tokenCount++;
+ }
+ // System.out.println("---------");
+ }
- void runTestNGramTokenizerWithHashedUTF8Tokens(boolean prePost)
- throws IOException {
- HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
- NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(
- gramLength, prePost, true, false, tokenFactory);
- tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+ void runTestNGramTokenizerWithHashedUTF8Tokens(boolean prePost) throws IOException {
+ HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+ NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
+ tokenFactory);
+ tokenizer.reset(inputBuffer, 0, inputBuffer.length);
- ArrayList<String> expectedGrams = new ArrayList<String>();
- getExpectedGrams(str, gramLength, expectedGrams, prePost);
- ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
- for (String s : expectedGrams) {
- int hash = tokenHash(s, 1);
- expectedHashedGrams.add(hash);
- }
+ ArrayList<String> expectedGrams = new ArrayList<String>();
+ getExpectedGrams(str, gramLength, expectedGrams, prePost);
+ ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
+ for (String s : expectedGrams) {
+ int hash = tokenHash(s, 1);
+ expectedHashedGrams.add(hash);
+ }
- int tokenCount = 0;
+ int tokenCount = 0;
- while (tokenizer.hasNext()) {
- tokenizer.next();
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
- // serialize hashed token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
+ // serialize hashed token
+ GrowableArray tokenData = new GrowableArray();
- IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
+ IToken token = tokenizer.getToken();
+ token.serializeToken(tokenData);
- // deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(
- tokenBaos.toByteArray());
- DataInput in = new DataInputStream(bais);
+ // deserialize token
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
+ DataInput in = new DataInputStream(bais);
- Integer hashedGram = in.readInt();
+ Integer hashedGram = in.readInt();
- // System.out.println(hashedGram);
+ // System.out.println(hashedGram);
- Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
+ Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
- tokenCount++;
- }
- // System.out.println("---------");
- }
+ tokenCount++;
+ }
+ // System.out.println("---------");
+ }
- void runTestNGramTokenizerWithUTF8Tokens(boolean prePost)
- throws IOException {
- UTF8NGramTokenFactory tokenFactory = new UTF8NGramTokenFactory();
- NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(
- gramLength, prePost, true, false, tokenFactory);
- tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+ void runTestNGramTokenizerWithUTF8Tokens(boolean prePost) throws IOException {
+ UTF8NGramTokenFactory tokenFactory = new UTF8NGramTokenFactory();
+ NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(gramLength, prePost, true, false,
+ tokenFactory);
+ tokenizer.reset(inputBuffer, 0, inputBuffer.length);
- ArrayList<String> expectedGrams = new ArrayList<String>();
- getExpectedGrams(str, gramLength, expectedGrams, prePost);
+ ArrayList<String> expectedGrams = new ArrayList<String>();
+ getExpectedGrams(str, gramLength, expectedGrams, prePost);
- int tokenCount = 0;
+ int tokenCount = 0;
- while (tokenizer.hasNext()) {
- tokenizer.next();
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
- // serialize hashed token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
+ // serialize hashed token
+ GrowableArray tokenData = new GrowableArray();
- IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
+ IToken token = tokenizer.getToken();
+ token.serializeToken(tokenData);
- // deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(
- tokenBaos.toByteArray());
- DataInput in = new DataInputStream(bais);
+ // deserialize token
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
+ DataInput in = new DataInputStream(bais);
- String strGram = in.readUTF();
+ String strGram = in.readUTF();
- // System.out.println("\"" + strGram + "\"");
+ // System.out.println("\"" + strGram + "\"");
- Assert.assertEquals(expectedGrams.get(tokenCount), strGram);
+ Assert.assertEquals(expectedGrams.get(tokenCount), strGram);
- tokenCount++;
- }
- // System.out.println("---------");
- }
+ tokenCount++;
+ }
+ // System.out.println("---------");
+ }
- @Test
- public void testNGramTokenizerWithCountedHashedUTF8Tokens()
- throws Exception {
- runTestNGramTokenizerWithCountedHashedUTF8Tokens(false);
- runTestNGramTokenizerWithCountedHashedUTF8Tokens(true);
- }
+ @Test
+ public void testNGramTokenizerWithCountedHashedUTF8Tokens() throws Exception {
+ runTestNGramTokenizerWithCountedHashedUTF8Tokens(false);
+ runTestNGramTokenizerWithCountedHashedUTF8Tokens(true);
+ }
- @Test
- public void testNGramTokenizerWithHashedUTF8Tokens() throws Exception {
- runTestNGramTokenizerWithHashedUTF8Tokens(false);
- runTestNGramTokenizerWithHashedUTF8Tokens(true);
- }
+ @Test
+ public void testNGramTokenizerWithHashedUTF8Tokens() throws Exception {
+ runTestNGramTokenizerWithHashedUTF8Tokens(false);
+ runTestNGramTokenizerWithHashedUTF8Tokens(true);
+ }
- @Test
- public void testNGramTokenizerWithUTF8Tokens() throws IOException {
- runTestNGramTokenizerWithUTF8Tokens(false);
- runTestNGramTokenizerWithUTF8Tokens(true);
- }
+ @Test
+ public void testNGramTokenizerWithUTF8Tokens() throws IOException {
+ runTestNGramTokenizerWithUTF8Tokens(false);
+ runTestNGramTokenizerWithUTF8Tokens(true);
+ }
- public int tokenHash(String token, int tokenCount) {
- int h = AbstractUTF8Token.GOLDEN_RATIO_32;
- for (int i = 0; i < token.length(); i++) {
- h ^= token.charAt(i);
- h *= AbstractUTF8Token.GOLDEN_RATIO_32;
- }
- return h + tokenCount;
- }
+ public int tokenHash(String token, int tokenCount) {
+ int h = AbstractUTF8Token.GOLDEN_RATIO_32;
+ for (int i = 0; i < token.length(); i++) {
+ h ^= token.charAt(i);
+ h *= AbstractUTF8Token.GOLDEN_RATIO_32;
+ }
+ return h + tokenCount;
+ }
}
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/WordTokenizerTest.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/WordTokenizerTest.java
index f52eb54..3ff9304 100644
--- a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/WordTokenizerTest.java
+++ b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/WordTokenizerTest.java
@@ -30,11 +30,7 @@
import org.junit.Before;
import org.junit.Test;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.AbstractUTF8Token;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.HashedUTF8WordTokenFactory;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8WordTokenFactory;
+import edu.uci.ics.hyracks.data.std.util.GrowableArray;
public class WordTokenizerTest {
@@ -48,33 +44,33 @@
private boolean isSeparator(char c) {
return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER || Character.getType(c) == Character.OTHER_NUMBER);
}
-
+
private void tokenize(String text, ArrayList<String> tokens) {
- String lowerCaseText = text.toLowerCase();
- int startIx = 0;
-
- // Skip separators at beginning of string.
- while(isSeparator(lowerCaseText.charAt(startIx))) {
- startIx++;
- }
- while(startIx < lowerCaseText.length()) {
- while(startIx < lowerCaseText.length() && isSeparator(lowerCaseText.charAt(startIx))) {
- startIx++;
- }
- int tokenStart = startIx;
-
- while(startIx < lowerCaseText.length() && !isSeparator(lowerCaseText.charAt(startIx))) {
- startIx++;
- }
- int tokenEnd = startIx;
-
- // Emit token.
- String token = lowerCaseText.substring(tokenStart, tokenEnd);
-
- tokens.add(token);
- }
+ String lowerCaseText = text.toLowerCase();
+ int startIx = 0;
+
+ // Skip separators at beginning of string.
+ while (isSeparator(lowerCaseText.charAt(startIx))) {
+ startIx++;
+ }
+ while (startIx < lowerCaseText.length()) {
+ while (startIx < lowerCaseText.length() && isSeparator(lowerCaseText.charAt(startIx))) {
+ startIx++;
+ }
+ int tokenStart = startIx;
+
+ while (startIx < lowerCaseText.length() && !isSeparator(lowerCaseText.charAt(startIx))) {
+ startIx++;
+ }
+ int tokenEnd = startIx;
+
+ // Emit token.
+ String token = lowerCaseText.substring(tokenStart, tokenEnd);
+
+ tokens.add(token);
+ }
}
-
+
@Before
public void init() throws IOException {
// serialize text into bytes
@@ -82,10 +78,10 @@
DataOutput dos = new DataOutputStream(baos);
dos.writeUTF(text);
inputBuffer = baos.toByteArray();
-
+
// init expected string tokens
tokenize(text, expectedUTF8Tokens);
-
+
// hashed tokens ignoring token count
for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
int hash = tokenHash(expectedUTF8Tokens.get(i), 1);
@@ -122,15 +118,14 @@
while (tokenizer.hasNext()) {
tokenizer.next();
- // serialize token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
+ // serialize hashed token
+ GrowableArray tokenData = new GrowableArray();
IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
+ token.serializeToken(tokenData);
// deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
DataInput in = new DataInputStream(bais);
Integer hashedToken = in.readInt();
@@ -154,15 +149,14 @@
while (tokenizer.hasNext()) {
tokenizer.next();
- // serialize token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
+ // serialize hashed token
+ GrowableArray tokenData = new GrowableArray();
IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
+ token.serializeToken(tokenData);
// deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
DataInput in = new DataInputStream(bais);
Integer hashedToken = in.readInt();
@@ -187,14 +181,13 @@
tokenizer.next();
// serialize hashed token
- ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
- DataOutput tokenDos = new DataOutputStream(tokenBaos);
+ GrowableArray tokenData = new GrowableArray();
IToken token = tokenizer.getToken();
- token.serializeToken(tokenDos);
+ token.serializeToken(tokenData);
// deserialize token
- ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+ ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
DataInput in = new DataInputStream(bais);
String strToken = in.readUTF();
@@ -209,7 +202,7 @@
public int tokenHash(String token, int tokenCount) {
int h = AbstractUTF8Token.GOLDEN_RATIO_32;
for (int i = 0; i < token.length(); i++) {
- h ^= token.charAt(i);
+ h ^= token.charAt(i);
h *= AbstractUTF8Token.GOLDEN_RATIO_32;
}
return h + tokenCount;
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java
index 3bd8129..7b54884 100644
--- a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java
+++ b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java
@@ -20,8 +20,6 @@
import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
@@ -35,7 +33,7 @@
import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
-import edu.uci.ics.hyracks.data.std.util.ByteArrayAccessibleOutputStream;
+import edu.uci.ics.hyracks.data.std.util.GrowableArray;
import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleReference;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
@@ -411,7 +409,7 @@
Arrays.fill(scanCountArray, 0);
expectedResults.clear();
- ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
+ GrowableArray tokenData = new GrowableArray();
tokenizer.reset(searchDocument.getFieldData(0), searchDocument.getFieldStart(0),
searchDocument.getFieldLength(0));
// Run though tokenizer to get number of tokens.
@@ -434,10 +432,9 @@
while (tokenizer.hasNext()) {
tokenizer.next();
IToken token = tokenizer.getToken();
- baaos.reset();
- DataOutput out = new DataOutputStream(baaos);
- token.serializeToken(out);
- ByteArrayInputStream inStream = new ByteArrayInputStream(baaos.getByteArray(), 0, baaos.size());
+ tokenData.reset();
+ token.serializeToken(tokenData);
+ ByteArrayInputStream inStream = new ByteArrayInputStream(tokenData.getByteArray(), 0, tokenData.getLength());
DataInput dataIn = new DataInputStream(inStream);
Comparable tokenObj = (Comparable) tokenSerde.deserialize(dataIn);
CheckTuple lowKey;