Optimized the binary tokenizer - get the total number of tokens directly from the tokenizer instead of iterating over all tokens first
Change-Id: Ifa9a18a43a097766da22633bb48371ffc78406ae
Reviewed-on: https://asterix-gerrit.ics.uci.edu/348
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Young-Seok Kim <kisskys@gmail.com>
diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
index 231adbd..8ac1e3c 100644
--- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
+++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
@@ -31,8 +31,7 @@
 import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
 import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
 
-public class BinaryTokenizerOperatorNodePushable extends
-        AbstractUnaryInputUnaryOutputOperatorNodePushable {
+public class BinaryTokenizerOperatorNodePushable extends AbstractUnaryInputUnaryOutputOperatorNodePushable {
 
     private final IHyracksTaskContext ctx;
     private final IBinaryTokenizer tokenizer;
@@ -48,9 +47,8 @@
     private GrowableArray builderData;
     private FrameTupleAppender appender;
 
-    public BinaryTokenizerOperatorNodePushable(IHyracksTaskContext ctx,
-            RecordDescriptor inputRecDesc, RecordDescriptor outputRecDesc,
-            IBinaryTokenizer tokenizer, int docField, int[] keyFields,
+    public BinaryTokenizerOperatorNodePushable(IHyracksTaskContext ctx, RecordDescriptor inputRecDesc,
+            RecordDescriptor outputRecDesc, IBinaryTokenizer tokenizer, int docField, int[] keyFields,
             boolean addNumTokensKey, boolean writeKeyFieldsFirst) {
         this.ctx = ctx;
         this.tokenizer = tokenizer;
@@ -78,26 +76,16 @@
 
         for (int i = 0; i < tupleCount; i++) {
             short numTokens = 0;
-            if (addNumTokensKey) {
-                // Run through the tokens to get the total number of tokens.
-                tokenizer.reset(
-                        accessor.getBuffer().array(),
-                        accessor.getTupleStartOffset(i)
-                                + accessor.getFieldSlotsLength()
-                                + accessor.getFieldStartOffset(i, docField),
-                        accessor.getFieldLength(i, docField));
-                while (tokenizer.hasNext()) {
-                    tokenizer.next();
-                    numTokens++;
-                }
-            }
 
             tokenizer.reset(
                     accessor.getBuffer().array(),
-                    accessor.getTupleStartOffset(i)
-                            + accessor.getFieldSlotsLength()
-                            + accessor.getFieldStartOffset(i, docField),
-                    accessor.getFieldLength(i, docField));
+                    accessor.getTupleStartOffset(i) + accessor.getFieldSlotsLength()
+                            + accessor.getFieldStartOffset(i, docField), accessor.getFieldLength(i, docField));
+
+            if (addNumTokensKey) {
+                // Get the total number of tokens.
+                numTokens = tokenizer.getTokensCount();
+            }
 
             // Write token and data into frame by following the order specified
             // in the writeKeyFieldsFirst field.
@@ -151,8 +139,8 @@
 
                 }
 
-                FrameUtils.appendToWriter(writer, appender, builder.getFieldEndOffsets(),
-                        builder.getByteArray(), 0, builder.getSize());
+                FrameUtils.appendToWriter(writer, appender, builder.getFieldEndOffsets(), builder.getByteArray(), 0,
+                        builder.getSize());
 
             }
 
diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
index af20ad2..5ac4aa4 100644
--- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
+++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
@@ -3,9 +3,9 @@
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * you may obtain a copy of the License from
- * 
+ *
  *     http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,7 +24,10 @@
     protected int length;
     protected int tokenLength;
     protected int index;
+    protected int originalIndex;
     protected int utf8Length;
+    protected boolean tokenCountCalculated = false;
+    protected short tokenCount;
 
     protected final IntArray tokensStart;
     protected final IntArray tokensLength;
@@ -69,5 +72,10 @@
             tokensStart.reset();
             tokensLength.reset();
         }
+
+        // Needed for calculating the number of tokens
+        originalIndex = index;
+        tokenCountCalculated = false;
+        tokenCount = 0;
     }
 }
diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
index daf853a..c4a6994 100644
--- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
+++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
@@ -3,9 +3,9 @@
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * you may obtain a copy of the License from
- * 
+ *
  *     http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -74,4 +74,24 @@
         // set token
         token.reset(data, currentTokenStart, index, tokenLength, tokenCount);
     }
+
+    @Override
+    public short getTokensCount() {
+        // Scan once and cache: originalIndex is consumed by the scan, so an unguarded repeat call would return 0.
+        if (!tokenCountCalculated) {
+            tokenCount = 0;
+            boolean previousCharIsSeparator = true;
+            while (originalIndex < length) {
+                if (isSeparator(UTF8StringPointable.charAt(data, originalIndex))) {
+                    previousCharIsSeparator = true;
+                } else if (previousCharIsSeparator) {
+                    tokenCount++;
+                    previousCharIsSeparator = false;
+                }
+                originalIndex += UTF8StringPointable.charSize(data, originalIndex);
+            }
+            tokenCountCalculated = true;
+        }
+        return tokenCount;
+    }
 }
\ No newline at end of file
diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
index 207df81..206175b 100644
--- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
+++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
@@ -3,9 +3,9 @@
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * you may obtain a copy of the License from
- * 
+ *
  *     http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,11 +16,14 @@
 package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
 
 public interface IBinaryTokenizer {
-	public IToken getToken();
+    public IToken getToken();
 
-	public boolean hasNext();
+    public boolean hasNext();
 
-	public void next();
+    public void next();
 
-	public void reset(byte[] data, int start, int length);
+    public void reset(byte[] data, int start, int length);
+
+    /** Returns the total number of tokens in the data most recently supplied via {@code reset}. */
+    public short getTokensCount();
 }
diff --git a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
index b1d722e..d19da58 100644
--- a/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
+++ b/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
@@ -3,9 +3,9 @@
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * you may obtain a copy of the License from
- * 
+ *
  *     http://www.apache.org/licenses/LICENSE-2.0
- * 
+ *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -114,4 +114,9 @@
     public void setPrePost(boolean usePrePost) {
         this.usePrePost = usePrePost;
     }
+
+    @Override
+    public short getTokensCount() {
+        return (short) totalGrams; // NOTE(review): narrowing cast - confirm totalGrams always fits in a short
+    }
 }