Started work on improved inverted index. Implemented bulk load procedure.

git-svn-id: https://hyracks.googlecode.com/svn/branches/hyracks_indexes@374 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-examples/hyracks-integration-tests/src/test/java/edu/uci/ics/hyracks/tests/invertedindex/InvertedIndexOperatorsTest.java b/hyracks-examples/hyracks-integration-tests/src/test/java/edu/uci/ics/hyracks/tests/invertedindex/InvertedIndexOperatorsTest.java
index 2cfe438..0e19298 100644
--- a/hyracks-examples/hyracks-integration-tests/src/test/java/edu/uci/ics/hyracks/tests/invertedindex/InvertedIndexOperatorsTest.java
+++ b/hyracks-examples/hyracks-integration-tests/src/test/java/edu/uci/ics/hyracks/tests/invertedindex/InvertedIndexOperatorsTest.java
@@ -4,6 +4,10 @@
 
 import org.junit.Test;
 
+import edu.uci.ics.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizerFactory;
+import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizerFactory;
+import edu.uci.ics.fuzzyjoin.tokenizer.ITokenFactory;
+import edu.uci.ics.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
 import edu.uci.ics.hyracks.api.constraints.PartitionConstraintHelper;
 import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
 import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
@@ -21,9 +25,7 @@
 import edu.uci.ics.hyracks.dataflow.std.file.FileSplit;
 import edu.uci.ics.hyracks.dataflow.std.file.IFileSplitProvider;
 import edu.uci.ics.hyracks.dataflow.std.misc.PrinterOperatorDescriptor;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
 import edu.uci.ics.hyracks.storage.am.invertedindex.dataflow.BinaryTokenizerOperatorDescriptor;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizerFactory;
 import edu.uci.ics.hyracks.tests.integration.AbstractIntegrationTest;
 
 public class InvertedIndexOperatorsTest extends AbstractIntegrationTest {
@@ -45,7 +47,9 @@
 
         RecordDescriptor tokenizerRecDesc = new RecordDescriptor(new ISerializerDeserializer[] {
                 UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE });
-        IBinaryTokenizerFactory tokenizerFactory = new DelimitedUTF8StringBinaryTokenizerFactory(' ');
+        
+        ITokenFactory tokenFactory = new UTF8WordTokenFactory();
+        IBinaryTokenizerFactory tokenizerFactory = new DelimitedUTF8StringBinaryTokenizerFactory(true, false, tokenFactory);
         int[] tokenFields = { 1 };
         int[] projFields = { 0 };
         BinaryTokenizerOperatorDescriptor binaryTokenizer = new BinaryTokenizerOperatorDescriptor(spec,
diff --git a/hyracks-storage-am-invertedindex/pom.xml b/hyracks-storage-am-invertedindex/pom.xml
index df6f6d1..9a464bf 100644
--- a/hyracks-storage-am-invertedindex/pom.xml
+++ b/hyracks-storage-am-invertedindex/pom.xml
@@ -23,7 +23,7 @@
       </plugin>
     </plugins>
   </build>
-  <dependencies>
+  <dependencies>  	
   	<dependency>
   		<groupId>edu.uci.ics.hyracks</groupId>
   		<artifactId>hyracks-storage-common</artifactId>
@@ -60,6 +60,13 @@
   		<scope>compile</scope>
   	</dependency>  	
   	<dependency>
+  		<groupId>edu.uci.ics.fuzzyjoin</groupId>
+  		<artifactId>fuzzyjoin-core</artifactId>
+  		<version>0.0.2-SNAPSHOT</version>
+  		<type>jar</type>
+  		<scope>compile</scope>
+  	</dependency>
+  	<dependency>
   		<groupId>junit</groupId>
   		<artifactId>junit</artifactId>
   		<version>4.8.1</version>
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java
deleted file mode 100644
index 40cb7da..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.api;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
-
-public interface IBinaryTokenizer {
-
-    public void reset(byte[] data, int start, int length);
-
-    public boolean hasNext();
-
-    public void next();
-
-    public int getTokenStartOff();
-
-    public int getTokenLength();
-
-    public void writeToken(DataOutput dos) throws IOException;
-
-    public RecordDescriptor getTokenSchema();
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
deleted file mode 100644
index 7e91fd4..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.api;
-
-import java.io.Serializable;
-
-public interface IBinaryTokenizerFactory extends Serializable {
-    public IBinaryTokenizer createBinaryTokenizer();
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListBuilder.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListBuilder.java
new file mode 100644
index 0000000..7cdd9ac
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListBuilder.java
@@ -0,0 +1,39 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+
+/**
+ * Writes inverted lists into a caller-supplied byte buffer.
+ */
+public interface IInvertedListBuilder {
+
+    /**
+     * Begins a new inverted list for the token held in field {@code tokenField} of {@code tuple}.
+     *
+     * @return whether the list could be started. NOTE(review): FixedSizeElementInvertedListBuilder
+     *         returns false when the target buffer has no room left -- confirm this is the contract.
+     */
+    boolean startNewList(ITupleReference tuple, int tokenField);
+
+    /**
+     * Appends the fields of {@code tuple} selected by {@code elementFields} to the current list.
+     *
+     * @return true if successfully appended, false if not enough space in the target buffer
+     */
+    boolean appendElement(ITupleReference tuple, int[] elementFields);
+
+    /**
+     * Sets the buffer that lists are written into and the position at which writing starts.
+     */
+    void setTargetBuffer(byte[] targetBuf, int startPos);
+
+    /**
+     * Returns the size of the current list. NOTE(review): presumably a count of elements -- confirm.
+     */
+    int getListSize();
+
+    /**
+     * Returns the builder's current position in the target buffer.
+     */
+    int getPos();
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
index 059a1e2..1533307 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
@@ -15,6 +15,7 @@
 
 package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;
 
+import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizerFactory;
 import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
 import edu.uci.ics.hyracks.api.dataflow.IOperatorNodePushable;
 import edu.uci.ics.hyracks.api.dataflow.value.IRecordDescriptorProvider;
@@ -23,7 +24,6 @@
 import edu.uci.ics.hyracks.api.job.IOperatorEnvironment;
 import edu.uci.ics.hyracks.api.job.JobSpecification;
 import edu.uci.ics.hyracks.dataflow.std.base.AbstractSingleActivityOperatorDescriptor;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
 
 public class BinaryTokenizerOperatorDescriptor extends AbstractSingleActivityOperatorDescriptor {
 
@@ -50,6 +50,6 @@
     public IOperatorNodePushable createPushRuntime(IHyracksStageletContext ctx, IOperatorEnvironment env,
             IRecordDescriptorProvider recordDescProvider, int partition, int nPartitions) throws HyracksDataException {
         return new BinaryTokenizerOperatorNodePushable(ctx, recordDescProvider.getInputRecordDescriptor(odId, 0),
-                recordDescriptors[0], tokenizerFactory.createBinaryTokenizer(), tokenFields, projFields);
+                recordDescriptors[0], tokenizerFactory.createTokenizer(), tokenFields, projFields);
     }
 }
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
index 9dac535..d470513 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
@@ -19,6 +19,8 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 
+import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
+import edu.uci.ics.fuzzyjoin.tokenizer.IToken;
 import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
 import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
 import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
@@ -27,7 +29,6 @@
 import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
 import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils;
 import edu.uci.ics.hyracks.dataflow.std.base.AbstractUnaryInputUnaryOutputOperatorNodePushable;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
 
 public class BinaryTokenizerOperatorNodePushable extends AbstractUnaryInputUnaryOutputOperatorNodePushable {
 
@@ -84,7 +85,8 @@
 
                     builder.reset();
                     try {
-                        tokenizer.writeToken(builderDos);
+                        IToken token = tokenizer.getToken();
+                        token.serializeToken(builderDos);
                         builder.addFieldEndOffset();
                     } catch (IOException e) {
                         throw new HyracksDataException(e.getMessage());
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListBuilder.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListBuilder.java
new file mode 100644
index 0000000..b733103
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListBuilder.java
@@ -0,0 +1,91 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
+
+/**
+ * {@link IInvertedListBuilder} for lists whose elements all occupy the same number of bytes.
+ * Elements are copied back to back into the buffer passed to {@link #setTargetBuffer}.
+ */
+public class FixedSizeElementInvertedListBuilder implements IInvertedListBuilder {
+    private final int listElementSize;
+    private int listSize = 0;
+
+    private byte[] targetBuf;
+    private int pos;
+
+    /**
+     * @param listElementSize
+     *            number of bytes each list element occupies in the target buffer
+     */
+    public FixedSizeElementInvertedListBuilder(int listElementSize) {
+        this.listElementSize = listElementSize;
+    }
+
+    /**
+     * Resets the element count for a new list, provided the target buffer can hold at least one
+     * more element at the current position. {@code tuple} and {@code tokenField} are not used.
+     *
+     * @return false if no further element fits into the target buffer, true otherwise
+     */
+    @Override
+    public boolean startNewList(ITupleReference tuple, int tokenField) {
+        if (!hasRoomForElement()) {
+            return false;
+        }
+        listSize = 0;
+        return true;
+    }
+
+    /**
+     * Copies the selected fields of {@code tuple} into the target buffer as one list element and
+     * advances the write position by {@code listElementSize}.
+     *
+     * @return true if the element was appended, false if not enough space is left in the
+     *         target buffer
+     */
+    @Override
+    public boolean appendElement(ITupleReference tuple, int[] elementFields) {
+        if (!hasRoomForElement()) {
+            return false;
+        }
+
+        // Lay the fields out one after another; writing every field at pos would make each
+        // field overwrite the one before it.
+        int writePos = pos;
+        for (int i = 0; i < elementFields.length; i++) {
+            int field = elementFields[i];
+            int fieldLength = tuple.getFieldLength(field);
+            System.arraycopy(tuple.getFieldData(field), tuple.getFieldStart(field), targetBuf, writePos, fieldLength);
+            writePos += fieldLength;
+        }
+
+        // Elements are fixed-size, so the next one starts listElementSize bytes further on.
+        // NOTE(review): assumes the field lengths add up to listElementSize -- confirm with callers.
+        pos += listElementSize;
+        listSize++;
+
+        return true;
+    }
+
+    @Override
+    public void setTargetBuffer(byte[] targetBuf, int startPos) {
+        this.pos = startPos;
+        this.targetBuf = targetBuf;
+    }
+
+    @Override
+    public int getListSize() {
+        return listSize;
+    }
+
+    @Override
+    public int getPos() {
+        return pos;
+    }
+
+    // An element that ends exactly at the end of the buffer still fits.
+    private boolean hasRoomForElement() {
+        return pos + listElementSize <= targetBuf.length;
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java
new file mode 100644
index 0000000..34a2825
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java
@@ -0,0 +1,192 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ByteArrayAccessibleOutputStream;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
+import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
+import edu.uci.ics.hyracks.storage.common.buffercache.ICachedPage;
+import edu.uci.ics.hyracks.storage.common.file.BufferedFileHandle;
+
+/**
+ * Inverted index whose lists are written page by page through the buffer cache. Only the
+ * bulk-load path exists so far.
+ * <p>
+ * NOTE(review): nothing in this class assigns {@code bufferCache} or {@code fileId} yet, so
+ * {@link #beginBulkLoad} dereferences a null buffer cache until initialization is added.
+ */
+public class InvertedIndex {
+    private int rootPageId = 0;
+    private IBufferCache bufferCache;
+    private int fileId;
+
+    /**
+     * Creates a bulk-load context and pins the page the first list is written into.
+     *
+     * @param invListBuilder
+     *            builder that writes list elements into the pinned page's buffer
+     * @param tokenCmp
+     *            comparator used to detect that a tuple's token differs from the current list's token
+     * @return the context to pass to {@link #bulkLoadAddTuple} and {@link #endBulkLoad}
+     */
+    public BulkLoadContext beginBulkLoad(IInvertedListBuilder invListBuilder, IBinaryComparator tokenCmp)
+            throws HyracksDataException {
+        BulkLoadContext ctx = new BulkLoadContext(invListBuilder, tokenCmp);
+        ctx.init(rootPageId, fileId);
+        return ctx;
+    }
+
+    /**
+     * Adds one tuple to the bulk load. A tuple whose token differs from the current list's token
+     * closes that list (its token, start page, start offset and size become the "last list"
+     * values of {@code ctx}) and opens a new one; the tuple's element fields are then appended.
+     * NOTE(review): this only groups correctly if tuples arrive ordered by token -- confirm with callers.
+     *
+     * @param tokenField
+     *            index of the tuple field holding the token
+     * @param listElementFields
+     *            indexes of the tuple fields that form the list element
+     * @throws IllegalStateException
+     *             if the builder still rejects the list or element after moving to a fresh page
+     */
+    public void bulkLoadAddTuple(BulkLoadContext ctx, ITupleReference tuple, int tokenField, int[] listElementFields)
+            throws HyracksDataException {
+        if (ctx.currentInvListTokenBaaos.size() == 0) {
+            // no token recorded yet, so this tuple opens the first inverted list
+            startList(ctx, tuple, tokenField, "Failed to create first inverted list.");
+        } else if (ctx.tokenCmp.compare(tuple.getFieldData(tokenField), tuple.getFieldStart(tokenField),
+                tuple.getFieldLength(tokenField), ctx.currentInvListTokenBaaos.getByteArray(), 0,
+                ctx.currentInvListTokenBaaos.size()) != 0) {
+            // token changed: keep the finished list's details before the builder is reset
+            rememberCurrentListAsLast(ctx);
+            startList(ctx, tuple, tokenField,
+                    "Failed to start new inverted list after switching to a new page.");
+        }
+
+        // append to current inverted list
+        if (!ctx.invListBuilder.appendElement(tuple, listElementFields)) {
+            switchToNextPage(ctx);
+            if (!ctx.invListBuilder.appendElement(tuple, listElementFields)) {
+                throw new IllegalStateException(
+                        "Failed to append element to inverted list after switching to a new page.");
+            }
+        }
+    }
+
+    /**
+     * Finishes the bulk load: publishes the final list's token, start page, start offset and size
+     * as the "last list" values of {@code ctx} and unpins the current page.
+     *
+     * @return size of the last inverted list
+     */
+    public int endBulkLoad(BulkLoadContext ctx) throws HyracksDataException {
+        rememberCurrentListAsLast(ctx);
+        ctx.deinit();
+        return ctx.lastInvListSize;
+    }
+
+    // Copies the current list's token, start location and size into the "last list" fields.
+    private void rememberCurrentListAsLast(BulkLoadContext ctx) {
+        ctx.lastInvListStartPageId = ctx.currentInvListStartPageId;
+        ctx.lastInvListStartOffset = ctx.currentInvListStartOffset;
+        ctx.lastInvListSize = ctx.invListBuilder.getListSize();
+
+        ctx.lastInvListTokenBaaos.reset();
+        ctx.lastInvListTokenBaaos.write(ctx.currentInvListTokenBaaos.getByteArray(), 0,
+                ctx.currentInvListTokenBaaos.size());
+    }
+
+    // Records the tuple's token as the current list's token and opens a list in the builder,
+    // moving to a fresh page first if the current one is full.
+    private void startList(BulkLoadContext ctx, ITupleReference tuple, int tokenField, String failureMessage)
+            throws HyracksDataException {
+        ctx.currentInvListTokenBaaos.reset();
+        ctx.currentInvListTokenBaaos.write(tuple.getFieldData(tokenField), tuple.getFieldStart(tokenField),
+                tuple.getFieldLength(tokenField));
+
+        if (!ctx.invListBuilder.startNewList(tuple, tokenField)) {
+            switchToNextPage(ctx);
+            if (!ctx.invListBuilder.startNewList(tuple, tokenField)) {
+                throw new IllegalStateException(failureMessage);
+            }
+        }
+
+        // Taken only after startNewList, which may have moved the builder to another page.
+        ctx.currentInvListStartPageId = ctx.currentPageId;
+        ctx.currentInvListStartOffset = ctx.invListBuilder.getPos();
+    }
+
+    // Pins the next page and points the builder at the start of its buffer.
+    private void switchToNextPage(BulkLoadContext ctx) throws HyracksDataException {
+        ctx.pinNextPage();
+        ctx.invListBuilder.setTargetBuffer(ctx.currentPage.getBuffer().array(), 0);
+    }
+
+    /**
+     * State of one bulk load: the pinned page being written, the token and start location of
+     * the list under construction, and the same details plus size for the last list recorded.
+     */
+    public final class BulkLoadContext {
+        private int lastInvListSize;
+        private int lastInvListStartPageId;
+        private int lastInvListStartOffset;
+        private final ByteArrayAccessibleOutputStream lastInvListTokenBaaos = new ByteArrayAccessibleOutputStream();
+
+        private int currentInvListStartPageId;
+        private int currentInvListStartOffset;
+        private final ByteArrayAccessibleOutputStream currentInvListTokenBaaos = new ByteArrayAccessibleOutputStream();
+
+        private int currentPageId;
+        private ICachedPage currentPage;
+        private final IInvertedListBuilder invListBuilder;
+        private final IBinaryComparator tokenCmp;
+
+        public BulkLoadContext(IInvertedListBuilder invListBuilder, IBinaryComparator tokenCmp) {
+            this.invListBuilder = invListBuilder;
+            this.tokenCmp = tokenCmp;
+        }
+
+        /** Pins page {@code startPageId} of file {@code fileId} and points the builder at its buffer. */
+        public void init(int startPageId, int fileId) throws HyracksDataException {
+            currentPageId = startPageId;
+            currentPage = bufferCache.pin(BufferedFileHandle.getDiskPageId(fileId, currentPageId), true);
+            invListBuilder.setTargetBuffer(currentPage.getBuffer().array(), 0);
+        }
+
+        /** Unpins the current page, if one is pinned. */
+        public void deinit() throws HyracksDataException {
+            if (currentPage != null) {
+                ICachedPage page = currentPage;
+                // cleared first so that a repeated call cannot unpin the same page twice
+                currentPage = null;
+                bufferCache.unpin(page);
+            }
+        }
+
+        /** Unpins the current page and pins the page with the next id. */
+        public void pinNextPage() throws HyracksDataException {
+            bufferCache.unpin(currentPage);
+            // cleared so deinit() does not unpin this page again if the pin below throws
+            currentPage = null;
+            currentPageId++;
+            currentPage = bufferCache.pin(BufferedFileHandle.getDiskPageId(fileId, currentPageId), true);
+        }
+
+        public ByteArrayAccessibleOutputStream getLastInvListTokenBaaos() {
+            return lastInvListTokenBaaos;
+        }
+
+        public int getLastInvListStartPageId() {
+            return lastInvListStartPageId;
+        }
+
+        public int getLastInvListStartOffset() {
+            return lastInvListStartOffset;
+        }
+
+        public int getLastInvListSize() {
+            return lastInvListSize;
+        }
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java
index dc5bd0c..47b4310 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java
@@ -21,6 +21,8 @@
 import java.util.ArrayList;
 import java.util.List;
 
+import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
+import edu.uci.ics.fuzzyjoin.tokenizer.IToken;
 import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
 import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
 import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
@@ -41,7 +43,6 @@
 import edu.uci.ics.hyracks.storage.am.btree.impls.RangeSearchCursor;
 import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
 import edu.uci.ics.hyracks.storage.am.common.ophelpers.TreeIndexOp;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
 import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
 import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearcher;
 
@@ -135,7 +136,8 @@
 
             queryTokenBuilder.reset();
             try {
-                queryTokenizer.writeToken(queryTokenDos);
+                IToken token = queryTokenizer.getToken();
+            	token.serializeToken(queryTokenDos);
                 queryTokenBuilder.addFieldEndOffset();
             } catch (IOException e) {
                 throw new HyracksDataException(e);
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
deleted file mode 100644
index 73635f9..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
-import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
-import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-
-public class DelimitedUTF8StringBinaryTokenizer implements IBinaryTokenizer {
-
-    private static final RecordDescriptor tokenSchema = new RecordDescriptor(
-            new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE });
-
-    private final char delimiter;
-    private byte[] data;
-    private int start;
-    private int length;
-
-    private int tokenLength;
-    private int tokenStart;
-    private int pos;
-
-    public DelimitedUTF8StringBinaryTokenizer(char delimiter) {
-        this.delimiter = delimiter;
-    }
-
-    @Override
-    public int getTokenLength() {
-        return tokenLength;
-    }
-
-    @Override
-    public int getTokenStartOff() {
-        return tokenStart;
-    }
-
-    @Override
-    public boolean hasNext() {
-        if (pos >= start + length)
-            return false;
-        else
-            return true;
-    }
-
-    @Override
-    public void next() {
-        tokenLength = 0;
-        tokenStart = pos;
-        while (pos < start + length) {
-            int len = StringUtils.charSize(data, pos);
-            char ch = StringUtils.charAt(data, pos);
-            pos += len;
-            if (ch == delimiter) {
-                break;
-            }
-            tokenLength += len;
-        }
-    }
-
-    @Override
-    public void reset(byte[] data, int start, int length) {
-        this.data = data;
-        this.start = start;
-        this.pos = start;
-        this.length = length;
-        this.tokenLength = 0;
-        this.tokenStart = 0;
-        pos += 2; // UTF-8 specific
-    }
-
-    @Override
-    public void writeToken(DataOutput dos) throws IOException {
-        // WARNING: 2-byte length indicator is specific to UTF-8
-        dos.writeShort((short) tokenLength);
-        dos.write(data, tokenStart, tokenLength);
-    }
-
-    @Override
-    public RecordDescriptor getTokenSchema() {
-        return tokenSchema;
-    }
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
deleted file mode 100644
index e3e0be3..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
-
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
-
-public class DelimitedUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
-
-    private static final long serialVersionUID = 1L;
-    private final char delimiter;
-
-    public DelimitedUTF8StringBinaryTokenizerFactory(char delimiter) {
-        this.delimiter = delimiter;
-    }
-
-    @Override
-    public IBinaryTokenizer createBinaryTokenizer() {
-        return new DelimitedUTF8StringBinaryTokenizer(delimiter);
-    }
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java
deleted file mode 100644
index 54fc371..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
-import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
-import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-
-public class HashedQGramUTF8StringBinaryTokenizer implements IBinaryTokenizer {
-
-    private static final RecordDescriptor tokenSchema = new RecordDescriptor(
-            new ISerializerDeserializer[] { IntegerSerializerDeserializer.INSTANCE });
-
-    private final boolean prePost;
-    private final int q;
-    private byte[] data;
-    private int start;
-    private int length;
-    private int gramNum;
-    private int utflen;
-
-    private final char PRECHAR = '#';
-    private final char POSTCHAR = '$';
-
-    private int charPos;
-    private int pos;
-    private int hashedGram;
-
-    HashedQGramUTF8StringBinaryTokenizer(int q, boolean prePost) {
-        this.prePost = prePost;
-        this.q = q;
-    }
-
-    @Override
-    public int getTokenLength() {
-        // the produced token (hashed q-gram) is derived from data
-        // but not contained in it
-        // therefore this call does not make sense
-        return -1;
-    }
-
-    @Override
-    public int getTokenStartOff() {
-        // the produced token (hashed q-gram) is derived from data
-        // but not contained in it
-        // therefore this call does not make sense
-        return -1;
-    }
-
-    @Override
-    public boolean hasNext() {
-        if ((prePost && pos >= start + length) || (!prePost && pos >= start + length - q))
-            return false;
-        else
-            return true;
-    }
-
-    @Override
-    public void next() {
-        hashedGram = 0;
-        if (prePost) {
-            if (gramNum < q) {
-                for (int i = 0; i < q - gramNum; i++) {
-                    hashedGram = 31 * hashedGram + PRECHAR;
-                }
-
-                int tmpPos = pos;
-                for (int i = 0; i < gramNum; i++) {
-                    hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
-                    tmpPos += StringUtils.charSize(data, tmpPos);
-                }
-            } else {
-                int stopStr = Math.min(charPos + q, utflen);
-                int tmpPos = pos;
-                for (int i = charPos; i < stopStr; i++) {
-                    hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
-                    tmpPos += StringUtils.charSize(data, tmpPos);
-                }
-
-                int stopPost = (charPos + q) - (utflen);
-                for (int i = 0; i < stopPost; i++) {
-                    hashedGram = 31 * hashedGram + POSTCHAR;
-                }
-                pos += StringUtils.charSize(data, pos);
-                charPos++;
-            }
-            gramNum++;
-        } else {
-            int tmpPos = pos;
-            for (int i = charPos; i < charPos + q; i++) {
-                hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
-                tmpPos += StringUtils.charSize(data, tmpPos);
-            }
-            pos += StringUtils.charSize(data, pos);
-            charPos++;
-        }
-    }
-
-    @Override
-    public void reset(byte[] data, int start, int length) {
-        this.data = data;
-        this.start = start;
-        this.length = length;
-        this.utflen = StringUtils.getUTFLen(data, start);
-        this.pos = start + 2; // UTF-8 specific
-        this.gramNum = 1;
-        this.charPos = 0;
-    }
-
-    @Override
-    public void writeToken(DataOutput dos) throws IOException {
-        dos.writeInt(hashedGram);
-    }
-
-    public char getPreChar() {
-        return PRECHAR;
-    }
-
-    public char getPostChar() {
-        return POSTCHAR;
-    }
-
-    @Override
-    public RecordDescriptor getTokenSchema() {
-        return tokenSchema;
-    }
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java
deleted file mode 100644
index a11fe8a..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
-
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
-
-public class HashedQGramUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
-
-    private static final long serialVersionUID = 1L;
-    private final int q;
-    private final boolean prePost;
-
-    public HashedQGramUTF8StringBinaryTokenizerFactory(int q, boolean prePost) {
-        this.q = q;
-        this.prePost = prePost;
-    }
-
-    @Override
-    public IBinaryTokenizer createBinaryTokenizer() {
-        return new HashedQGramUTF8StringBinaryTokenizer(q, prePost);
-    }
-}
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java
index cba3c79..373e3b6 100644
--- a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java
@@ -27,6 +27,10 @@
 
 import org.junit.Test;
 
+import edu.uci.ics.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizer;
+import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
+import edu.uci.ics.fuzzyjoin.tokenizer.ITokenFactory;
+import edu.uci.ics.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
 import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
 import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
 import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
@@ -59,10 +63,8 @@
 import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
 import edu.uci.ics.hyracks.storage.am.common.ophelpers.TreeIndexOp;
 import edu.uci.ics.hyracks.storage.am.common.tuples.TypeAwareTupleWriterFactory;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
 import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
 import edu.uci.ics.hyracks.storage.am.invertedindex.impls.SimpleConjunctiveSearcher;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer;
 import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
 import edu.uci.ics.hyracks.storage.common.buffercache.ICacheMemoryAllocator;
 import edu.uci.ics.hyracks.storage.common.file.IFileMapProvider;
@@ -208,8 +210,9 @@
         FrameTupleReference queryTuple = new FrameTupleReference();
 
         String query = "computer hyracks fast";
-        char queryDelimiter = ' ';
-        IBinaryTokenizer queryTokenizer = new DelimitedUTF8StringBinaryTokenizer(queryDelimiter);
+        
+        ITokenFactory tokenFactory = new UTF8WordTokenFactory();
+        IBinaryTokenizer queryTokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
 
         queryTb.reset();
         UTF8StringSerializerDeserializer.INSTANCE.serialize(query, queryDos);
@@ -255,26 +258,26 @@
             }
         }
 
-        /*
-         * IBinaryComparator[] searchCmps = new IBinaryComparator[1];
-         * searchCmps[0] =
-         * UTF8StringBinaryComparatorFactory.INSTANCE.createBinaryComparator();
-         * MultiComparator searchCmp = new MultiComparator(typeTraits,
-         * searchCmps);
-         * 
-         * // ordered scan IBTreeCursor scanCursor = new
-         * RangeSearchCursor(leafFrame); RangePredicate nullPred = new
-         * RangePredicate(true, null, null, true, true, null); BTreeOpContext
-         * searchOpCtx = btree.createOpContext(BTreeOp.BTO_SEARCH, leafFrame,
-         * interiorFrame, metaFrame); btree.search(scanCursor, nullPred,
-         * searchOpCtx);
-         * 
-         * try { while (scanCursor.hasNext()) { scanCursor.next();
-         * ITupleReference frameTuple = scanCursor.getTuple(); String rec =
-         * cmp.printTuple(frameTuple, btreeSerde); System.out.println(rec); } }
-         * catch (Exception e) { e.printStackTrace(); } finally {
-         * scanCursor.close(); }
-         */
+//        
+//         IBinaryComparator[] searchCmps = new IBinaryComparator[1];
+//         searchCmps[0] =
+//         UTF8StringBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+//         MultiComparator searchCmp = new MultiComparator(typeTraits,
+//         searchCmps);
+//         
+//         // ordered scan IBTreeCursor scanCursor = new
+//         RangeSearchCursor(leafFrame); RangePredicate nullPred = new
+//         RangePredicate(true, null, null, true, true, null); BTreeOpContext
+//         searchOpCtx = btree.createOpContext(BTreeOp.BTO_SEARCH, leafFrame,
+//         interiorFrame, metaFrame); btree.search(scanCursor, nullPred,
+//         searchOpCtx);
+//         
+//         try { while (scanCursor.hasNext()) { scanCursor.next();
+//         ITupleReference frameTuple = scanCursor.getTuple(); String rec =
+//         cmp.printTuple(frameTuple, btreeSerde); System.out.println(rec); } }
+//         catch (Exception e) { e.printStackTrace(); } finally {
+//         scanCursor.close(); }
+        
 
         btree.close();
         bufferCache.closeFile(fileId);
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java
index 47c75cf..7181b77 100644
--- a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java
@@ -19,16 +19,16 @@
 import java.io.DataInput;
 import java.io.DataInputStream;
 import java.io.DataOutputStream;
-import java.util.ArrayList;
 import java.util.Random;
 
 import org.junit.Assert;
 import org.junit.Test;
 
-import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunction;
+import edu.uci.ics.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizer;
+import edu.uci.ics.fuzzyjoin.tokenizer.IToken;
+import edu.uci.ics.fuzzyjoin.tokenizer.ITokenFactory;
+import edu.uci.ics.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
 import edu.uci.ics.hyracks.dataflow.common.comm.io.ByteArrayAccessibleOutputStream;
-import edu.uci.ics.hyracks.dataflow.common.data.hash.UTF8StringBinaryHashFunctionFactory;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
 import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
 
 public class TokenizerTest {
@@ -43,7 +43,8 @@
         int maxWordLength = 50;
         char delimiter = ' ';
 
-        DelimitedUTF8StringBinaryTokenizer tok = new DelimitedUTF8StringBinaryTokenizer(delimiter);
+        ITokenFactory tokenFactory = new UTF8WordTokenFactory();
+        DelimitedUTF8StringBinaryTokenizer tok = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
 
         // create a bunch of documents
         for (int i = 0; i < numDocs; i++) {
@@ -78,7 +79,8 @@
                 // write token to outputstream
                 ByteArrayAccessibleOutputStream baaosWrite = new ByteArrayAccessibleOutputStream();
                 DataOutputStream dosWrite = new DataOutputStream(baaosWrite);
-                tok.writeToken(dosWrite);
+                IToken token = tok.getToken();
+                token.serializeToken(dosWrite);
 
                 // deserialize token to get string object
                 ByteArrayInputStream inStream = new ByteArrayInputStream(baaosWrite.toByteArray());
@@ -90,7 +92,8 @@
         }
     }
 
-    // testing HashedQGramUTF8StringBinaryTokenizer
+    /*
+    // testing HashedNGramUTF8StringBinaryTokenizer
     @Test
     public void test02() throws Exception {
         Random rnd = new Random(50);
@@ -115,27 +118,16 @@
 
             // randomly choose pre and postfixing
             boolean prePost = false;
-            if (Math.abs(rnd.nextInt()) % 2 == 0)
-                prePost = true;
+            //if (Math.abs(rnd.nextInt()) % 2 == 0)
+              //  prePost = true;
 
-            HashedQGramUTF8StringBinaryTokenizer qgramTok = new HashedQGramUTF8StringBinaryTokenizer(q, prePost);
-
-            String extendedString = str;
-            if (prePost) {
-                // pre and postfix string
-                StringBuilder strBuilder = new StringBuilder();
-                for (int j = 0; j < q - 1; j++)
-                    strBuilder.append(qgramTok.getPreChar());
-                strBuilder.append(str);
-                for (int j = 0; j < q - 1; j++)
-                    strBuilder.append(qgramTok.getPostChar());
-                extendedString = strBuilder.toString();
-            }
-
+            ITokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+            NGramUTF8StringBinaryTokenizer qgramTok = new NGramUTF8StringBinaryTokenizer(q, prePost, true, false, tokenFactory);
+            
             // generate q-grams in deserialized form
             ArrayList<String> javaGrams = new ArrayList<String>();
-            for (int j = 0; j < extendedString.length() - q + 1; j++) {
-                javaGrams.add(extendedString.substring(j, j + q));
+            for (int j = 0; j < str.length() - q + 1; j++) {
+                javaGrams.add(str.substring(j, j + q).toLowerCase());
             }
 
             // serialize string for use in binary gram tokenizer
@@ -153,7 +145,8 @@
                 // write token to outputstream
                 ByteArrayAccessibleOutputStream baaosWrite = new ByteArrayAccessibleOutputStream();
                 DataOutputStream dosWrite = new DataOutputStream(baaosWrite);
-                qgramTok.writeToken(dosWrite);
+                IToken token = qgramTok.getToken();
+                token.serializeToken(dosWrite);
 
                 // deserialize token to get hashed gram
                 ByteArrayInputStream inStream = new ByteArrayInputStream(baaosWrite.toByteArray());
@@ -175,6 +168,7 @@
             }
         }
     }
+    */
 
     public static String randomString(int length, Random random) {
         int maxAttempts = 1000;