Optimized the binary tokenizer: obtain the total number of tokens via getTokensCount() instead of a separate pass over all tokens
Change-Id: Ifa9a18a43a097766da22633bb48371ffc78406ae
Reviewed-on: https://asterix-gerrit.ics.uci.edu/348
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Young-Seok Kim <kisskys@gmail.com>
diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
index 231adbd..8ac1e3c 100644
--- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
+++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
@@ -31,8 +31,7 @@
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
-public class BinaryTokenizerOperatorNodePushable extends
- AbstractUnaryInputUnaryOutputOperatorNodePushable {
+public class BinaryTokenizerOperatorNodePushable extends AbstractUnaryInputUnaryOutputOperatorNodePushable {
private final IHyracksTaskContext ctx;
private final IBinaryTokenizer tokenizer;
@@ -48,9 +47,8 @@
private GrowableArray builderData;
private FrameTupleAppender appender;
- public BinaryTokenizerOperatorNodePushable(IHyracksTaskContext ctx,
- RecordDescriptor inputRecDesc, RecordDescriptor outputRecDesc,
- IBinaryTokenizer tokenizer, int docField, int[] keyFields,
+ public BinaryTokenizerOperatorNodePushable(IHyracksTaskContext ctx, RecordDescriptor inputRecDesc,
+ RecordDescriptor outputRecDesc, IBinaryTokenizer tokenizer, int docField, int[] keyFields,
boolean addNumTokensKey, boolean writeKeyFieldsFirst) {
this.ctx = ctx;
this.tokenizer = tokenizer;
@@ -78,26 +76,16 @@
for (int i = 0; i < tupleCount; i++) {
short numTokens = 0;
- if (addNumTokensKey) {
- // Run through the tokens to get the total number of tokens.
- tokenizer.reset(
- accessor.getBuffer().array(),
- accessor.getTupleStartOffset(i)
- + accessor.getFieldSlotsLength()
- + accessor.getFieldStartOffset(i, docField),
- accessor.getFieldLength(i, docField));
- while (tokenizer.hasNext()) {
- tokenizer.next();
- numTokens++;
- }
- }
tokenizer.reset(
accessor.getBuffer().array(),
- accessor.getTupleStartOffset(i)
- + accessor.getFieldSlotsLength()
- + accessor.getFieldStartOffset(i, docField),
- accessor.getFieldLength(i, docField));
+ accessor.getTupleStartOffset(i) + accessor.getFieldSlotsLength()
+ + accessor.getFieldStartOffset(i, docField), accessor.getFieldLength(i, docField));
+
+ if (addNumTokensKey) {
+ // Get the total number of tokens.
+ numTokens = tokenizer.getTokensCount();
+ }
// Write token and data into frame by following the order specified
// in the writeKeyFieldsFirst field.
@@ -151,8 +139,8 @@
}
- FrameUtils.appendToWriter(writer, appender, builder.getFieldEndOffsets(),
- builder.getByteArray(), 0, builder.getSize());
+ FrameUtils.appendToWriter(writer, appender, builder.getFieldEndOffsets(), builder.getByteArray(), 0,
+ builder.getSize());
}
diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
index af20ad2..5ac4aa4 100644
--- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
+++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
@@ -3,9 +3,9 @@
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,7 +24,10 @@
protected int length;
protected int tokenLength;
protected int index;
+ protected int originalIndex;
protected int utf8Length;
+ protected boolean tokenCountCalculated = false;
+ protected short tokenCount;
protected final IntArray tokensStart;
protected final IntArray tokensLength;
@@ -69,5 +72,10 @@
tokensStart.reset();
tokensLength.reset();
}
+
+ // Needed for calculating the number of tokens
+ originalIndex = index;
+ tokenCountCalculated = false;
+ tokenCount = 0;
}
}
diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
index daf853a..c4a6994 100644
--- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
+++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
@@ -3,9 +3,9 @@
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -74,4 +74,37 @@
// set token
token.reset(data, currentTokenStart, index, tokenLength, tokenCount);
}
+
+    /**
+     * Returns the number of tokens found between {@code originalIndex} and
+     * {@code length}, where a token is a maximal run of characters for which
+     * {@code isSeparator} returns false.
+     * <p>
+     * The scan runs only while {@code tokenCountCalculated} is false; once it
+     * has run, later calls return the cached {@code tokenCount}.
+     *
+     * @return the cached token count
+     */
+    @Override
+    public short getTokensCount() {
+        if (!tokenCountCalculated) {
+            tokenCount = 0;
+            boolean previousCharIsSeparator = true;
+            while (originalIndex < length) {
+                if (isSeparator(UTF8StringPointable.charAt(data, originalIndex))) {
+                    previousCharIsSeparator = true;
+                } else if (previousCharIsSeparator) {
+                    // First non-separator after a separator (or at the start) opens a new token.
+                    tokenCount++;
+                    previousCharIsSeparator = false;
+                }
+                originalIndex += UTF8StringPointable.charSize(data, originalIndex);
+            }
+            // Mark the result as cached. The scan consumes originalIndex, so
+            // without this flag a second call would zero tokenCount, skip the
+            // loop and wrongly return 0.
+            tokenCountCalculated = true;
+        }
+        return tokenCount;
+    }
}
\ No newline at end of file
diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
index 207df81..206175b 100644
--- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
+++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
@@ -3,9 +3,9 @@
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,11 +16,14 @@
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
public interface IBinaryTokenizer {
- public IToken getToken();
+ public IToken getToken();
- public boolean hasNext();
+ public boolean hasNext();
- public void next();
+ public void next();
- public void reset(byte[] data, int start, int length);
+ public void reset(byte[] data, int start, int length);
+
+ /** Returns the total number of tokens in the input; expected to be called after {@link #reset(byte[], int, int)}. */
+ public short getTokensCount();
}
diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
index b1d722e..d19da58 100644
--- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
+++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
@@ -3,9 +3,9 @@
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -114,4 +114,9 @@
public void setPrePost(boolean usePrePost) {
this.usePrePost = usePrePost;
}
+
+ @Override
+ public short getTokensCount() {
+ return (short) totalGrams; // NOTE(review): cast narrows totalGrams to short; confirm it cannot exceed Short.MAX_VALUE
+ }
}