Merged 500:541 from trunk

git-svn-id: https://hyracks.googlecode.com/svn/branches/hyracks_scheduling@542 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-storage-am-invertedindex/pom.xml b/hyracks-storage-am-invertedindex/pom.xml
index 78a053c..4578a46 100644
--- a/hyracks-storage-am-invertedindex/pom.xml
+++ b/hyracks-storage-am-invertedindex/pom.xml
@@ -2,12 +2,12 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>edu.uci.ics.hyracks</groupId>
   <artifactId>hyracks-storage-am-invertedindex</artifactId>
-  <version>0.1.7-SNAPSHOT</version>
+  <version>0.1.8-SNAPSHOT</version>
 
   <parent>
     <groupId>edu.uci.ics.hyracks</groupId>
     <artifactId>hyracks</artifactId>
-    <version>0.1.7-SNAPSHOT</version>
+    <version>0.1.8-SNAPSHOT</version>
   </parent>
 
   <build>
@@ -27,38 +27,38 @@
   	<dependency>
   		<groupId>edu.uci.ics.hyracks</groupId>
   		<artifactId>hyracks-storage-common</artifactId>
-  		<version>0.1.7-SNAPSHOT</version>
+  		<version>0.1.8-SNAPSHOT</version>
   		<type>jar</type>
   		<scope>compile</scope>
   	</dependency>  	
   	<dependency>
   		<groupId>edu.uci.ics.hyracks</groupId>
   		<artifactId>hyracks-dataflow-common</artifactId>
-  		<version>0.1.7-SNAPSHOT</version>
+  		<version>0.1.8-SNAPSHOT</version>
   		<type>jar</type>
   		<scope>compile</scope>
   	</dependency>  	
   	<dependency>
   		<groupId>edu.uci.ics.hyracks</groupId>
   		<artifactId>hyracks-dataflow-std</artifactId>
-  		<version>0.1.7-SNAPSHOT</version>
+  		<version>0.1.8-SNAPSHOT</version>
   		<type>jar</type>
   		<scope>compile</scope>
   	</dependency>  	
   	<dependency>
   		<groupId>edu.uci.ics.hyracks</groupId>
   		<artifactId>hyracks-control-nc</artifactId>
-  		<version>0.1.7-SNAPSHOT</version>
+  		<version>0.1.8-SNAPSHOT</version>
   		<type>jar</type>
   		<scope>compile</scope>
   	</dependency>  	
   	<dependency>
   		<groupId>edu.uci.ics.hyracks</groupId>
   		<artifactId>hyracks-storage-am-btree</artifactId>
-  		<version>0.1.7-SNAPSHOT</version>
+  		<version>0.1.8-SNAPSHOT</version>
   		<type>jar</type>
   		<scope>compile</scope>
-  	</dependency>  	
+  	</dependency>    
   	<dependency>
   		<groupId>junit</groupId>
   		<artifactId>junit</artifactId>
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java
deleted file mode 100644
index 0d5dc8f..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.api;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
-
-public interface IBinaryTokenizer {
-
-    public void reset(byte[] data, int start, int length);
-
-    public boolean hasNext();
-
-    public void next();
-
-    public int getTokenStartOff();
-
-    public int getTokenLength();
-
-    public int getNumTokens();
-
-    public void writeToken(DataOutput dos) throws IOException;
-
-    public RecordDescriptor getTokenSchema();
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexOperatorDescriptorHelper.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexOperatorDescriptorHelper.java
new file mode 100644
index 0000000..b3afe4a
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexOperatorDescriptorHelper.java
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.dataflow.std.file.IFileSplitProvider;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.ITreeIndexOperatorDescriptorHelper;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.IIndexRegistryProvider;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.InvertedIndex;
+
+public interface IInvertedIndexOperatorDescriptorHelper extends ITreeIndexOperatorDescriptorHelper {
+    public IFileSplitProvider getInvIndexFileSplitProvider();
+
+    public IBinaryComparatorFactory[] getInvIndexComparatorFactories();
+
+    public ITypeTrait[] getInvIndexTypeTraits();
+
+    public IIndexRegistryProvider<InvertedIndex> getInvIndexRegistryProvider();
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java
index 990936e..fb73a65 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java
@@ -15,14 +15,14 @@
 
 package edu.uci.ics.hyracks.storage.am.invertedindex.api;
 
-import java.nio.ByteBuffer;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
 
 public interface IInvertedIndexResultCursor {
     public boolean hasNext();
 
     public void next();
 
-    public ByteBuffer getBuffer();
+    public ITupleReference getTuple();
 
-    public void reset();
+    public void reset(IInvertedIndexSearcher invIndexSearcher);
 }
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearchModifier.java
similarity index 76%
rename from hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
rename to hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearchModifier.java
index 7e91fd4..8f7a3e3 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearchModifier.java
@@ -15,8 +15,10 @@
 
 package edu.uci.ics.hyracks.storage.am.invertedindex.api;
 
-import java.io.Serializable;
+import java.util.List;
 
-public interface IBinaryTokenizerFactory extends Serializable {
-    public IBinaryTokenizer createBinaryTokenizer();
+public interface IInvertedIndexSearchModifier {
+    int getOccurrenceThreshold(List<IInvertedListCursor> invListCursors);
+
+    int getPrefixLists(List<IInvertedListCursor> invListCursors);
 }
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java
index 98dc3b9..36d64ea 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java
@@ -15,10 +15,21 @@
 
 package edu.uci.ics.hyracks.storage.am.invertedindex.api;
 
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
 import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
 
 public interface IInvertedIndexSearcher {
-    public void search(ITupleReference queryTuple, int queryFieldIndex) throws Exception;
+    public void search(IInvertedIndexResultCursor resultCursor, ITupleReference queryTuple, int queryFieldIndex,
+            IInvertedIndexSearchModifier searchModifier) throws Exception;
 
-    public IInvertedIndexResultCursor getResultCursor();
+    public IFrameTupleAccessor createResultFrameTupleAccessor();
+
+    public ITupleReference createResultTupleReference();
+
+    public List<ByteBuffer> getResultBuffers();
+
+    public int getNumValidResultBuffers();
 }
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListBuilder.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListBuilder.java
new file mode 100644
index 0000000..aaaef56
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListBuilder.java
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+
+public interface IInvertedListBuilder {
+    public boolean startNewList(ITupleReference tuple, int numTokenFields);
+
+    // returns true if successfully appended
+    // returns false if not enough space in targetBuf
+    public boolean appendElement(ITupleReference tuple, int numTokenFields, int numElementFields);
+
+    public void setTargetBuffer(byte[] targetBuf, int startPos);
+
+    public int getListSize();
+
+    public int getPos();
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListCursor.java
new file mode 100644
index 0000000..9435f3c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListCursor.java
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+
+public interface IInvertedListCursor extends Comparable<IInvertedListCursor> {
+    void reset(int startPageId, int endPageId, int startOff, int numElements);
+
+    void pinPagesSync() throws HyracksDataException;
+
+    void pinPagesAsync() throws HyracksDataException;
+
+    void unpinPages() throws HyracksDataException;
+
+    boolean hasNext();
+
+    void next();
+
+    ITupleReference getTuple();
+
+    // getters
+    int getNumElements();
+
+    int getStartPageId();
+
+    int getEndPageId();
+
+    int getStartOff();
+
+    // jump to a specific element
+    void positionCursor(int elementIx);
+
+    boolean containsKey(ITupleReference searchTuple, MultiComparator invListCmp);
+
+    // for debugging
+    String printInvList(ISerializerDeserializer[] serdes) throws HyracksDataException;
+
+    String printCurrentElement(ISerializerDeserializer[] serdes) throws HyracksDataException;
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/AbstractInvertedIndexOperatorDescriptor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/AbstractInvertedIndexOperatorDescriptor.java
new file mode 100644
index 0000000..0d0c5bf
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/AbstractInvertedIndexOperatorDescriptor.java
@@ -0,0 +1,151 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;
+
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.job.JobSpecification;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractSingleActivityOperatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.file.IFileSplitProvider;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndex;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexFrameFactory;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.IIndexRegistryProvider;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.ITreeIndexOpHelperFactory;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexOperatorDescriptorHelper;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.InvertedIndex;
+import edu.uci.ics.hyracks.storage.common.IStorageManagerInterface;
+
+public abstract class AbstractInvertedIndexOperatorDescriptor extends AbstractSingleActivityOperatorDescriptor
+        implements IInvertedIndexOperatorDescriptorHelper {
+
+    private static final long serialVersionUID = 1L;
+
+    // general
+    protected final IStorageManagerInterface storageManager;
+
+    // btree
+    protected final IFileSplitProvider btreeFileSplitProvider;
+    protected final IIndexRegistryProvider<ITreeIndex> treeIndexRegistryProvider;
+    protected final ITreeIndexFrameFactory interiorFrameFactory;
+    protected final ITreeIndexFrameFactory leafFrameFactory;
+    protected final ITypeTrait[] btreeTypeTraits;
+    protected final IBinaryComparatorFactory[] btreeComparatorFactories;
+    protected final ITreeIndexOpHelperFactory opHelperFactory;
+
+    // inverted index
+    protected final IFileSplitProvider invIndexFileSplitProvider;
+    protected final IIndexRegistryProvider<InvertedIndex> invIndexRegistryProvider;
+    protected final ITypeTrait[] invIndexTypeTraits;
+    protected final IBinaryComparatorFactory[] invIndexComparatorFactories;
+
+    public AbstractInvertedIndexOperatorDescriptor(JobSpecification spec, int inputArity, int outputArity,
+            RecordDescriptor recDesc, IStorageManagerInterface storageManager,
+            IFileSplitProvider btreeFileSplitProvider, IIndexRegistryProvider<ITreeIndex> treeIndexRegistryProvider,
+            ITreeIndexFrameFactory interiorFrameFactory, ITreeIndexFrameFactory leafFrameFactory,
+            ITypeTrait[] btreeTypeTraits, IBinaryComparatorFactory[] btreeComparatorFactories, float btreeFillFactor,
+            ITreeIndexOpHelperFactory opHelperFactory, IFileSplitProvider invIndexFileSplitProvider,
+            IIndexRegistryProvider<InvertedIndex> invIndexRegistryProvider, ITypeTrait[] invIndexTypeTraits,
+            IBinaryComparatorFactory[] invIndexComparatorFactories) {
+        super(spec, inputArity, outputArity);
+
+        // general
+        this.storageManager = storageManager;
+
+        // btree
+        this.btreeFileSplitProvider = btreeFileSplitProvider;
+        this.treeIndexRegistryProvider = treeIndexRegistryProvider;
+        this.interiorFrameFactory = interiorFrameFactory;
+        this.leafFrameFactory = leafFrameFactory;
+        this.btreeTypeTraits = btreeTypeTraits;
+        this.btreeComparatorFactories = btreeComparatorFactories;
+        this.opHelperFactory = opHelperFactory;
+
+        // inverted index
+        this.invIndexFileSplitProvider = invIndexFileSplitProvider;
+        this.invIndexRegistryProvider = invIndexRegistryProvider;
+        this.invIndexTypeTraits = invIndexTypeTraits;
+        this.invIndexComparatorFactories = invIndexComparatorFactories;
+
+        if (outputArity > 0)
+            recordDescriptors[0] = recDesc;
+    }
+
+    @Override
+    public IFileSplitProvider getTreeIndexFileSplitProvider() {
+        return btreeFileSplitProvider;
+    }
+
+    @Override
+    public IBinaryComparatorFactory[] getTreeIndexComparatorFactories() {
+        return btreeComparatorFactories;
+    }
+
+    @Override
+    public ITypeTrait[] getTreeIndexTypeTraits() {
+        return btreeTypeTraits;
+    }
+
+    @Override
+    public ITreeIndexFrameFactory getTreeIndexInteriorFactory() {
+        return interiorFrameFactory;
+    }
+
+    @Override
+    public ITreeIndexFrameFactory getTreeIndexLeafFactory() {
+        return leafFrameFactory;
+    }
+
+    @Override
+    public IStorageManagerInterface getStorageManager() {
+        return storageManager;
+    }
+
+    @Override
+    public IIndexRegistryProvider<ITreeIndex> getTreeIndexRegistryProvider() {
+        return treeIndexRegistryProvider;
+    }
+
+    @Override
+    public RecordDescriptor getRecordDescriptor() {
+        return recordDescriptors[0];
+    }
+
+    @Override
+    public IIndexRegistryProvider<InvertedIndex> getInvIndexRegistryProvider() {
+        return invIndexRegistryProvider;
+    }
+
+    @Override
+    public IBinaryComparatorFactory[] getInvIndexComparatorFactories() {
+        return invIndexComparatorFactories;
+    }
+
+    @Override
+    public IFileSplitProvider getInvIndexFileSplitProvider() {
+        return invIndexFileSplitProvider;
+    }
+
+    @Override
+    public ITypeTrait[] getInvIndexTypeTraits() {
+        return invIndexTypeTraits;
+    }
+
+    @Override
+    public ITreeIndexOpHelperFactory getTreeIndexOpHelperFactory() {
+        return opHelperFactory;
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
index 6e7e0d3..b5b1393 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
@@ -23,7 +23,7 @@
 import edu.uci.ics.hyracks.api.job.IOperatorEnvironment;
 import edu.uci.ics.hyracks.api.job.JobSpecification;
 import edu.uci.ics.hyracks.dataflow.std.base.AbstractSingleActivityOperatorDescriptor;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizerFactory;
 
 public class BinaryTokenizerOperatorDescriptor extends AbstractSingleActivityOperatorDescriptor {
 
@@ -50,6 +50,6 @@
     public IOperatorNodePushable createPushRuntime(IHyracksTaskContext ctx, IOperatorEnvironment env,
             IRecordDescriptorProvider recordDescProvider, int partition, int nPartitions) throws HyracksDataException {
         return new BinaryTokenizerOperatorNodePushable(ctx, recordDescProvider.getInputRecordDescriptor(odId, 0),
-                recordDescriptors[0], tokenizerFactory.createBinaryTokenizer(), tokenFields, projFields);
+                recordDescriptors[0], tokenizerFactory.createTokenizer(), tokenFields, projFields);
     }
 }
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
index 37e7bd6..1a0091a 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
@@ -27,7 +27,8 @@
 import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
 import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils;
 import edu.uci.ics.hyracks.dataflow.std.base.AbstractUnaryInputUnaryOutputOperatorNodePushable;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
 
 public class BinaryTokenizerOperatorNodePushable extends AbstractUnaryInputUnaryOutputOperatorNodePushable {
 
@@ -85,7 +86,8 @@
 
                     builder.reset();
                     try {
-                        tokenizer.writeToken(builderDos);
+                        IToken token = tokenizer.getToken();
+                        token.serializeToken(builderDos);
                         builder.addFieldEndOffset();
                     } catch (IOException e) {
                         throw new HyracksDataException(e.getMessage());
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexBulkLoadOperatorDescriptor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexBulkLoadOperatorDescriptor.java
new file mode 100644
index 0000000..1ab72fe
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexBulkLoadOperatorDescriptor.java
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;
+
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.IOperatorNodePushable;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.IRecordDescriptorProvider;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.api.job.IOperatorEnvironment;
+import edu.uci.ics.hyracks.api.job.JobSpecification;
+import edu.uci.ics.hyracks.dataflow.std.file.IFileSplitProvider;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndex;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexFrameFactory;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.IIndexRegistryProvider;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.ITreeIndexOpHelperFactory;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.InvertedIndex;
+import edu.uci.ics.hyracks.storage.common.IStorageManagerInterface;
+
+public class InvertedIndexBulkLoadOperatorDescriptor extends AbstractInvertedIndexOperatorDescriptor {
+
+    private static final long serialVersionUID = 1L;
+
+    private final int[] fieldPermutation;
+    private final float btreeFillFactor;
+    private final IInvertedListBuilder invListBuilder;
+
+    public InvertedIndexBulkLoadOperatorDescriptor(JobSpecification spec, IStorageManagerInterface storageManager,
+            int[] fieldPermutation, IFileSplitProvider btreeFileSplitProvider,
+            IIndexRegistryProvider<ITreeIndex> treeIndexRegistryProvider, ITreeIndexFrameFactory interiorFrameFactory,
+            ITreeIndexFrameFactory leafFrameFactory, ITypeTrait[] btreeTypeTraits,
+            IBinaryComparatorFactory[] btreeComparatorFactories, float btreeFillFactor,
+            ITreeIndexOpHelperFactory opHelperFactory, IFileSplitProvider invIndexFileSplitProvider,
+            IIndexRegistryProvider<InvertedIndex> invIndexRegistryProvider, ITypeTrait[] invIndexTypeTraits,
+            IBinaryComparatorFactory[] invIndexComparatorFactories, IInvertedListBuilder invListBuilder) {
+        super(spec, 1, 0, null, storageManager, btreeFileSplitProvider, treeIndexRegistryProvider,
+                interiorFrameFactory, leafFrameFactory, btreeTypeTraits, btreeComparatorFactories, btreeFillFactor,
+                opHelperFactory, invIndexFileSplitProvider, invIndexRegistryProvider, invIndexTypeTraits,
+                invIndexComparatorFactories);
+        this.fieldPermutation = fieldPermutation;
+        this.btreeFillFactor = btreeFillFactor;
+        this.invListBuilder = invListBuilder;
+    }
+
+    @Override
+    public IOperatorNodePushable createPushRuntime(IHyracksTaskContext ctx, IOperatorEnvironment env,
+            IRecordDescriptorProvider recordDescProvider, int partition, int nPartitions) {
+        return new InvertedIndexBulkLoadOperatorNodePushable(this, ctx, partition, fieldPermutation, btreeFillFactor,
+                invListBuilder, recordDescProvider);
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexBulkLoadOperatorNodePushable.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexBulkLoadOperatorNodePushable.java
new file mode 100644
index 0000000..d2bd395
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexBulkLoadOperatorNodePushable.java
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;
+
+import java.nio.ByteBuffer;
+
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.IRecordDescriptorProvider;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractUnaryInputSinkOperatorNodePushable;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.IndexHelperOpenMode;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.PermutingFrameTupleReference;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.TreeIndexOpHelper;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.InvertedIndex;
+
+public class InvertedIndexBulkLoadOperatorNodePushable extends AbstractUnaryInputSinkOperatorNodePushable {
+    private final TreeIndexOpHelper treeIndexOpHelper;
+    private float btreeFillFactor;
+
+    private final InvertedIndexOpHelper invIndexOpHelper;
+    protected final IInvertedListBuilder invListBuilder;
+    private InvertedIndex.BulkLoadContext bulkLoadCtx;
+
+    private final IHyracksTaskContext ctx;
+
+    private FrameTupleAccessor accessor;
+    private PermutingFrameTupleReference tuple = new PermutingFrameTupleReference();
+
+    private IRecordDescriptorProvider recordDescProvider;
+
+    public InvertedIndexBulkLoadOperatorNodePushable(AbstractInvertedIndexOperatorDescriptor opDesc,
+            IHyracksTaskContext ctx, int partition, int[] fieldPermutation, float btreeFillFactor,
+            IInvertedListBuilder invListBuilder, IRecordDescriptorProvider recordDescProvider) {
+        treeIndexOpHelper = opDesc.getTreeIndexOpHelperFactory().createTreeIndexOpHelper(opDesc, ctx, partition,
+                IndexHelperOpenMode.CREATE);
+        invIndexOpHelper = new InvertedIndexOpHelper(opDesc, ctx, partition, IndexHelperOpenMode.CREATE);
+        this.btreeFillFactor = btreeFillFactor;
+        this.recordDescProvider = recordDescProvider;
+        this.ctx = ctx;
+        this.invListBuilder = invListBuilder;
+        tuple.setFieldPermutation(fieldPermutation);
+    }
+
+    @Override
+    public void open() throws HyracksDataException {
+        AbstractInvertedIndexOperatorDescriptor opDesc = (AbstractInvertedIndexOperatorDescriptor) treeIndexOpHelper
+                .getOperatorDescriptor();
+        RecordDescriptor recDesc = recordDescProvider.getInputRecordDescriptor(opDesc.getOperatorId(), 0);
+        accessor = new FrameTupleAccessor(treeIndexOpHelper.getHyracksTaskContext().getFrameSize(), recDesc);
+
+        // btree
+        try {
+            treeIndexOpHelper.init();
+            treeIndexOpHelper.getTreeIndex().open(treeIndexOpHelper.getIndexFileId());
+        } catch (Exception e) {
+            // cleanup in case of failure
+            treeIndexOpHelper.deinit();
+            throw new HyracksDataException(e);
+        }
+
+        // inverted index
+        try {
+            invIndexOpHelper.init();
+            invIndexOpHelper.getInvIndex().open(invIndexOpHelper.getInvIndexFileId());
+            bulkLoadCtx = invIndexOpHelper.getInvIndex().beginBulkLoad(invListBuilder, ctx.getFrameSize(),
+                    btreeFillFactor);
+        } catch (Exception e) {
+            // cleanup in case of failure
+            invIndexOpHelper.deinit();
+            throw new HyracksDataException(e);
+        }
+    }
+
+    @Override
+    public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
+        accessor.reset(buffer);
+        int tupleCount = accessor.getTupleCount();
+        for (int i = 0; i < tupleCount; i++) {
+            tuple.reset(accessor, i);
+            invIndexOpHelper.getInvIndex().bulkLoadAddTuple(bulkLoadCtx, tuple);
+        }
+    }
+
+    @Override
+    public void close() throws HyracksDataException {
+        try {
+            invIndexOpHelper.getInvIndex().endBulkLoad(bulkLoadCtx);
+        } finally {
+            treeIndexOpHelper.deinit();
+        }
+    }
+
+    @Override
+    public void flush() throws HyracksDataException {
+    }
+}
\ No newline at end of file
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexOpHelper.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexOpHelper.java
new file mode 100644
index 0000000..1eb6757
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexOpHelper.java
@@ -0,0 +1,154 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;
+
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.api.io.FileReference;
+import edu.uci.ics.hyracks.dataflow.std.file.IFileSplitProvider;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndex;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.ITreeIndexOperatorDescriptorHelper;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.IndexHelperOpenMode;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.IndexRegistry;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexOperatorDescriptorHelper;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.InvertedIndex;
+import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
+import edu.uci.ics.hyracks.storage.common.file.IFileMapProvider;
+
+public final class InvertedIndexOpHelper {
+
+    private InvertedIndex invIndex;
+    private int invIndexFileId = -1;
+    private int partition;
+
+    private IInvertedIndexOperatorDescriptorHelper opDesc;
+    private IHyracksTaskContext ctx;
+
+    private IndexHelperOpenMode mode;
+
+    public InvertedIndexOpHelper(IInvertedIndexOperatorDescriptorHelper opDesc, final IHyracksTaskContext ctx,
+            int partition, IndexHelperOpenMode mode) {
+        this.opDesc = opDesc;
+        this.ctx = ctx;
+        this.mode = mode;
+        this.partition = partition;
+    }
+
+    public void init() throws HyracksDataException {
+        IBufferCache bufferCache = opDesc.getStorageManager().getBufferCache(ctx);
+        IFileMapProvider fileMapProvider = opDesc.getStorageManager().getFileMapProvider(ctx);
+        IFileSplitProvider fileSplitProvider = opDesc.getInvIndexFileSplitProvider();
+
+        FileReference f = fileSplitProvider.getFileSplits()[partition].getLocalFile();
+        boolean fileIsMapped = fileMapProvider.isMapped(f);
+
+        switch (mode) {
+
+            case OPEN: {
+                if (!fileIsMapped) {
+                    throw new HyracksDataException("Trying to open inverted index from unmapped file " + f.toString());
+                }
+            }
+                break;
+
+            case CREATE:
+            case ENLIST: {
+                if (!fileIsMapped) {
+                    bufferCache.createFile(f);
+                }
+            }
+                break;
+
+        }
+
+        int fileId = fileMapProvider.lookupFileId(f);
+        try {
+            bufferCache.openFile(fileId);
+        } catch (HyracksDataException e) {
+            // revert state of buffer cache since file failed to open
+            if (!fileIsMapped) {
+                bufferCache.deleteFile(fileId);
+            }
+            throw e;
+        }
+
+        // only set btreeFileId member when openFile() succeeds,
+        // otherwise deinit() will try to close the file that failed to open
+        invIndexFileId = fileId;
+        IndexRegistry<InvertedIndex> invIndexRegistry = opDesc.getInvIndexRegistryProvider().getRegistry(ctx);
+        invIndex = invIndexRegistry.get(invIndexFileId);
+        if (invIndex == null) {
+
+            // create new inverted index and register it
+            invIndexRegistry.lock();
+            try {
+                // check if inverted index has already been registered by
+                // another thread
+                invIndex = invIndexRegistry.get(invIndexFileId);
+                if (invIndex == null) {
+                    // this thread should create and register the inverted index
+
+                    IBinaryComparator[] comparators = new IBinaryComparator[opDesc.getInvIndexComparatorFactories().length];
+                    for (int i = 0; i < opDesc.getInvIndexComparatorFactories().length; i++) {
+                        comparators[i] = opDesc.getInvIndexComparatorFactories()[i].createBinaryComparator();
+                    }
+
+                    MultiComparator cmp = new MultiComparator(opDesc.getInvIndexTypeTraits(), comparators);
+
+                    // assumes btree has already been registered
+                    IFileSplitProvider btreeFileSplitProvider = opDesc.getTreeIndexFileSplitProvider();
+                    IndexRegistry<ITreeIndex> treeIndexRegistry = opDesc.getTreeIndexRegistryProvider()
+                            .getRegistry(ctx);
+                    FileReference btreeFile = btreeFileSplitProvider.getFileSplits()[partition].getLocalFile();
+                    boolean btreeFileIsMapped = fileMapProvider.isMapped(btreeFile);
+                    if (!btreeFileIsMapped) {
+                        throw new HyracksDataException(
+                                "Trying to create inverted index, but associated BTree file has not been mapped");
+                    }
+                    int btreeFileId = fileMapProvider.lookupFileId(f);
+                    BTree btree = (BTree) treeIndexRegistry.get(btreeFileId);
+
+                    invIndex = new InvertedIndex(bufferCache, btree, cmp);
+                    invIndex.open(invIndexFileId);
+                    invIndexRegistry.register(invIndexFileId, invIndex);
+                }
+            } finally {
+                invIndexRegistry.unlock();
+            }
+        }
+    }
+
+    public void deinit() throws HyracksDataException {
+        if (invIndexFileId != -1) {
+            IBufferCache bufferCache = opDesc.getStorageManager().getBufferCache(ctx);
+            bufferCache.closeFile(invIndexFileId);
+        }
+    }
+
+    public InvertedIndex getInvIndex() {
+        return invIndex;
+    }
+
+    public ITreeIndexOperatorDescriptorHelper getOperatorDescriptor() {
+        return opDesc;
+    }
+
+    public int getInvIndexFileId() {
+        return invIndexFileId;
+    }
+}
\ No newline at end of file
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListBuilder.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListBuilder.java
new file mode 100644
index 0000000..03fc9a1
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListBuilder.java
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
+
+public class FixedSizeElementInvertedListBuilder implements IInvertedListBuilder {
+    private final int listElementSize;
+    private int listSize = 0;
+
+    private byte[] targetBuf;
+    private int pos;
+
+    public FixedSizeElementInvertedListBuilder(ITypeTrait[] invListFields) {
+        int tmp = 0;
+        for (int i = 0; i < invListFields.length; i++) {
+            tmp += invListFields[i].getStaticallyKnownDataLength();
+        }
+        listElementSize = tmp;
+    }
+
+    @Override
+    public boolean startNewList(ITupleReference tuple, int tokenField) {
+        if (pos + listElementSize > targetBuf.length)
+            return false;
+        else {
+            listSize = 0;
+            return true;
+        }
+    }
+
+    @Override
+    public boolean appendElement(ITupleReference tuple, int numTokenFields, int numElementFields) {
+        if (pos + listElementSize > targetBuf.length)
+            return false;
+
+        for (int i = 0; i < numElementFields; i++) {
+            int field = numTokenFields + i;
+            System.arraycopy(tuple.getFieldData(field), tuple.getFieldStart(field), targetBuf, pos,
+                    tuple.getFieldLength(field));
+        }
+
+        listSize++;
+        pos += listElementSize;
+
+        return true;
+    }
+
+    @Override
+    public void setTargetBuffer(byte[] targetBuf, int startPos) {
+        this.targetBuf = targetBuf;
+        this.pos = startPos;
+    }
+
+    @Override
+    public int getListSize() {
+        return listSize;
+    }
+
+    @Override
+    public int getPos() {
+        return pos;
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListCursor.java
new file mode 100644
index 0000000..f7ef56e
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListCursor.java
@@ -0,0 +1,280 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
+import edu.uci.ics.hyracks.storage.common.buffercache.ICachedPage;
+import edu.uci.ics.hyracks.storage.common.file.BufferedFileHandle;
+
+public class FixedSizeElementInvertedListCursor implements IInvertedListCursor {
+
+    private final IBufferCache bufferCache;
+    private final int fileId;
+    private final int elementSize;
+    private int currentElementIx;
+    private int currentOff;
+    private int currentPageIx;
+
+    private int startPageId;
+    private int endPageId;
+    private int startOff;
+    private int numElements;
+
+    private final FixedSizeTupleReference tuple;
+    private ICachedPage[] pages = new ICachedPage[10];
+    private int[] elementIndexes = new int[10];
+
+    public FixedSizeElementInvertedListCursor(IBufferCache bufferCache, int fileId, ITypeTrait[] invListFields) {
+        this.bufferCache = bufferCache;
+        this.fileId = fileId;
+        this.currentElementIx = 0;
+        this.currentPageIx = 0;
+
+        int tmp = 0;
+        for (int i = 0; i < invListFields.length; i++) {
+            tmp += invListFields[i].getStaticallyKnownDataLength();
+        }
+        elementSize = tmp;
+        this.currentOff = -elementSize;
+        this.tuple = new FixedSizeTupleReference(invListFields);
+    }
+
+    @Override
+    public boolean hasNext() {
+        if (currentElementIx < numElements)
+            return true;
+        else
+            return false;
+    }
+
+    @Override
+    public void next() {
+        if (currentOff + elementSize >= bufferCache.getPageSize()) {
+            currentPageIx++;
+            currentOff = 0;
+        } else {
+            currentOff += elementSize;
+        }
+
+        currentElementIx++;
+        tuple.reset(pages[currentPageIx].getBuffer().array(), currentOff);
+    }
+
+    @Override
+    public void pinPagesAsync() {
+        // TODO: implement
+    }
+
+    @Override
+    public void pinPagesSync() throws HyracksDataException {
+        int pix = 0;
+        for (int i = startPageId; i <= endPageId; i++) {
+            pages[pix] = bufferCache.pin(BufferedFileHandle.getDiskPageId(fileId, i), false);
+            pages[pix].acquireReadLatch();
+            pix++;
+        }
+    }
+
+    @Override
+    public void unpinPages() throws HyracksDataException {
+        int numPages = endPageId - startPageId + 1;
+        for (int i = 0; i < numPages; i++) {
+            pages[i].releaseReadLatch();
+            bufferCache.unpin(pages[i]);
+        }
+    }
+
+    @Override
+    public void positionCursor(int elementIx) {
+        int numPages = endPageId - startPageId + 1;
+
+        currentPageIx = binarySearch(elementIndexes, 0, numPages, elementIx);
+        if (currentPageIx < 0) {
+            throw new IndexOutOfBoundsException("Requested index: " + elementIx + " from array with numElements: "
+                    + numElements);
+        }
+
+        if (currentPageIx == 0) {
+            currentOff = startOff + elementIx * elementSize;
+        } else {
+            int relativeElementIx = elementIx - elementIndexes[currentPageIx - 1] - 1;
+            currentOff = relativeElementIx * elementSize;
+        }
+
+        currentElementIx = elementIx;
+        tuple.reset(pages[currentPageIx].getBuffer().array(), currentOff);
+    }
+
+    @Override
+    public boolean containsKey(ITupleReference searchTuple, MultiComparator invListCmp) {
+        int mid;
+        int begin = 0;
+        int end = numElements - 1;
+
+        while (begin <= end) {
+            mid = (begin + end) / 2;
+            positionCursor(mid);
+            int cmp = invListCmp.compare(searchTuple, tuple);
+            if (cmp < 0) {
+                end = mid - 1;
+            } else if (cmp > 0) {
+                begin = mid + 1;
+            } else {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    @Override
+    public void reset(int startPageId, int endPageId, int startOff, int numElements) {
+        this.startPageId = startPageId;
+        this.endPageId = endPageId;
+        this.startOff = startOff;
+        this.numElements = numElements;
+        this.currentElementIx = 0;
+        this.currentPageIx = 0;
+        this.currentOff = startOff - elementSize;
+
+        int numPages = endPageId - startPageId + 1;
+        if (numPages > pages.length) {
+            pages = new ICachedPage[endPageId - startPageId + 1];
+            elementIndexes = new int[endPageId - startPageId + 1];
+        }
+
+        // fill elementIndexes
+        // first page
+        int cumulElements = (bufferCache.getPageSize() - startOff) / elementSize;
+        elementIndexes[0] = cumulElements - 1;
+
+        // middle, full pages
+        for (int i = 1; i < numPages - 1; i++) {
+            elementIndexes[i] = elementIndexes[i - 1] + (bufferCache.getPageSize() / elementSize);
+        }
+
+        // last page
+        elementIndexes[numPages - 1] = numElements - 1;
+    }
+
+    @Override
+    public String printInvList(ISerializerDeserializer[] serdes) throws HyracksDataException {
+        int oldCurrentOff = currentOff;
+        int oldCurrentPageId = currentPageIx;
+        int oldCurrentElementIx = currentElementIx;
+
+        currentOff = startOff - elementSize;
+        currentPageIx = 0;
+        currentElementIx = 0;
+
+        StringBuilder strBuilder = new StringBuilder();
+
+        int count = 0;
+        while (hasNext()) {
+            next();
+            count++;
+            for (int i = 0; i < tuple.getFieldCount(); i++) {
+                ByteArrayInputStream inStream = new ByteArrayInputStream(tuple.getFieldData(i), tuple.getFieldStart(i),
+                        tuple.getFieldLength(i));
+                DataInput dataIn = new DataInputStream(inStream);
+                Object o = serdes[i].deserialize(dataIn);
+                strBuilder.append(o.toString());
+                if (i + 1 < tuple.getFieldCount())
+                    strBuilder.append(",");
+            }
+            strBuilder.append(" ");
+        }
+
+        // reset previous state
+        currentOff = oldCurrentOff;
+        currentPageIx = oldCurrentPageId;
+        currentElementIx = oldCurrentElementIx;
+
+        return strBuilder.toString();
+    }
+
+    public String printCurrentElement(ISerializerDeserializer[] serdes) throws HyracksDataException {
+        StringBuilder strBuilder = new StringBuilder();
+        for (int i = 0; i < tuple.getFieldCount(); i++) {
+            ByteArrayInputStream inStream = new ByteArrayInputStream(tuple.getFieldData(i), tuple.getFieldStart(i),
+                    tuple.getFieldLength(i));
+            DataInput dataIn = new DataInputStream(inStream);
+            Object o = serdes[i].deserialize(dataIn);
+            strBuilder.append(o.toString());
+            if (i + 1 < tuple.getFieldCount())
+                strBuilder.append(",");
+        }
+        return strBuilder.toString();
+    }
+
+    private int binarySearch(int[] arr, int arrStart, int arrLength, int key) {
+        int mid;
+        int begin = arrStart;
+        int end = arrStart + arrLength - 1;
+
+        while (begin <= end) {
+            mid = (begin + end) / 2;
+            int cmp = (key - arr[mid]);
+            if (cmp < 0) {
+                end = mid - 1;
+            } else if (cmp > 0) {
+                begin = mid + 1;
+            } else {
+                return mid;
+            }
+        }
+
+        if (begin > arr.length - 1)
+            return -1;
+        if (key < arr[begin])
+            return begin;
+        else
+            return -1;
+    }
+
+    @Override
+    public int compareTo(IInvertedListCursor invListCursor) {
+        return numElements - invListCursor.getNumElements();
+    }
+
+    @Override
+    public int getEndPageId() {
+        return endPageId;
+    }
+
+    @Override
+    public int getNumElements() {
+        return numElements;
+    }
+
+    @Override
+    public int getStartOff() {
+        return startOff;
+    }
+
+    @Override
+    public int getStartPageId() {
+        return startPageId;
+    }
+
+    public int getOffset() {
+        return currentOff;
+    }
+
+    public ICachedPage getPage() {
+        return pages[currentPageIx];
+    }
+
+    @Override
+    public ITupleReference getTuple() {
+        return tuple;
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeFrameTupleAccessor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeFrameTupleAccessor.java
new file mode 100644
index 0000000..9858eb0
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeFrameTupleAccessor.java
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.nio.ByteBuffer;
+
+import edu.uci.ics.hyracks.api.comm.FrameHelper;
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+
+public class FixedSizeFrameTupleAccessor implements IFrameTupleAccessor {
+
+    private final int frameSize;
+    private ByteBuffer buffer;
+
+    private final ITypeTrait[] fields;
+    private final int[] fieldStartOffsets;
+    private final int tupleSize;
+
+    public FixedSizeFrameTupleAccessor(int frameSize, ITypeTrait[] fields) {
+        this.frameSize = frameSize;
+        this.fields = fields;
+        this.fieldStartOffsets = new int[fields.length];
+        this.fieldStartOffsets[0] = 0;
+        for (int i = 1; i < fields.length; i++) {
+            fieldStartOffsets[i] = fieldStartOffsets[i - 1] + fields[i - 1].getStaticallyKnownDataLength();
+        }
+
+        int tmp = 0;
+        for (int i = 0; i < fields.length; i++) {
+            tmp += fields[i].getStaticallyKnownDataLength();
+        }
+        tupleSize = tmp;
+    }
+
+    @Override
+    public ByteBuffer getBuffer() {
+        return buffer;
+    }
+
+    @Override
+    public int getFieldCount() {
+        return fields.length;
+    }
+
+    @Override
+    public int getFieldEndOffset(int tupleIndex, int fIdx) {
+        return getTupleStartOffset(tupleIndex) + fieldStartOffsets[fIdx] + fields[fIdx].getStaticallyKnownDataLength();
+    }
+
+    @Override
+    public int getFieldLength(int tupleIndex, int fIdx) {
+        return fields[fIdx].getStaticallyKnownDataLength();
+    }
+
+    @Override
+    public int getFieldSlotsLength() {
+        return 0;
+    }
+
+    @Override
+    public int getFieldStartOffset(int tupleIndex, int fIdx) {
+        return tupleIndex * tupleSize + fieldStartOffsets[fIdx];
+    }
+
+    @Override
+    public int getTupleCount() {
+        return buffer.getInt(FrameHelper.getTupleCountOffset(frameSize));
+    }
+
+    @Override
+    public int getTupleEndOffset(int tupleIndex) {
+        return getFieldEndOffset(tupleIndex, fields.length - 1);
+    }
+
+    @Override
+    public int getTupleStartOffset(int tupleIndex) {
+        return tupleIndex * tupleSize;
+    }
+
+    @Override
+    public void reset(ByteBuffer buffer) {
+        this.buffer = buffer;
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeFrameTupleAppender.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeFrameTupleAppender.java
new file mode 100644
index 0000000..edc2304
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeFrameTupleAppender.java
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.nio.ByteBuffer;
+
+import edu.uci.ics.hyracks.api.comm.FrameHelper;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+
+public class FixedSizeFrameTupleAppender {
+
+    private static final int TUPLE_COUNT_SIZE = 4;
+    private final int frameSize;
+    private final int tupleSize;
+    private ByteBuffer buffer;
+    private int tupleCount;
+    private int tupleDataEndOffset;
+
+    public FixedSizeFrameTupleAppender(int frameSize, ITypeTrait[] fields) {
+        this.frameSize = frameSize;
+        int tmp = 0;
+        for (int i = 0; i < fields.length; i++) {
+            tmp += fields[i].getStaticallyKnownDataLength();
+        }
+        tupleSize = tmp;
+    }
+
+    public void reset(ByteBuffer buffer, boolean clear) {
+        this.buffer = buffer;
+        if (clear) {
+            buffer.putInt(FrameHelper.getTupleCountOffset(frameSize), 0);
+            tupleCount = 0;
+            tupleDataEndOffset = 0;
+        }
+    }
+
+    public boolean append(byte[] bytes, int offset) {
+        if (tupleDataEndOffset + tupleSize + TUPLE_COUNT_SIZE <= frameSize) {
+            System.arraycopy(bytes, offset, buffer.array(), tupleDataEndOffset, tupleSize);
+            tupleDataEndOffset += tupleSize;
+            tupleCount++;
+            return true;
+        }
+        return false;
+    }
+
+    public boolean append(byte[] bytes, int offset, int length) {
+        if (tupleDataEndOffset + length + TUPLE_COUNT_SIZE <= frameSize) {
+            System.arraycopy(bytes, offset, buffer.array(), tupleDataEndOffset, length);
+            tupleDataEndOffset += length;
+            return true;
+        }
+        return false;
+    }
+
+    public boolean append(int fieldValue) {
+        if (tupleDataEndOffset + 4 + TUPLE_COUNT_SIZE <= frameSize) {
+            buffer.putInt(tupleDataEndOffset, fieldValue);
+            tupleDataEndOffset += 4;
+            tupleCount++;
+            return true;
+        }
+        return false;
+    }
+
+    public boolean append(long fieldValue) {
+        if (tupleDataEndOffset + 8 + TUPLE_COUNT_SIZE <= frameSize) {
+            buffer.putLong(tupleDataEndOffset, fieldValue);
+            tupleDataEndOffset += 8;
+            tupleCount++;
+            return true;
+        }
+        return false;
+    }
+
+    public boolean append(char fieldValue) {
+        if (tupleDataEndOffset + 2 + TUPLE_COUNT_SIZE <= frameSize) {
+            buffer.putLong(tupleDataEndOffset, fieldValue);
+            tupleDataEndOffset += 2;
+            tupleCount++;
+            return true;
+        }
+        return false;
+    }
+
+    public boolean append(byte fieldValue) {
+        if (tupleDataEndOffset + 1 + TUPLE_COUNT_SIZE <= frameSize) {
+            buffer.put(tupleDataEndOffset, fieldValue);
+            tupleDataEndOffset += 1;
+            tupleCount++;
+            return true;
+        }
+        return false;
+    }
+
+    // returns true if an entire tuple fits
+    // returns false otherwise
+    public boolean hasSpace() {
+        return tupleDataEndOffset + tupleSize + TUPLE_COUNT_SIZE <= frameSize;
+    }
+
+    public void incrementTupleCount(int count) {
+        buffer.putInt(FrameHelper.getTupleCountOffset(frameSize),
+                buffer.getInt(FrameHelper.getTupleCountOffset(frameSize)) + count);
+    }
+
+    public int getTupleCount() {
+        return tupleCount;
+    }
+
+    public ByteBuffer getBuffer() {
+        return buffer;
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeTupleReference.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeTupleReference.java
new file mode 100644
index 0000000..248b81e
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeTupleReference.java
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+
+public class FixedSizeTupleReference implements ITupleReference {
+
+    private final ITypeTrait[] typeTraits;
+    private final int[] fieldStartOffsets;
+    private byte[] data;
+    private int startOff;
+
+    public FixedSizeTupleReference(ITypeTrait[] typeTraits) {
+        this.typeTraits = typeTraits;
+        this.fieldStartOffsets = new int[typeTraits.length];
+        this.fieldStartOffsets[0] = 0;
+        for (int i = 1; i < typeTraits.length; i++) {
+            fieldStartOffsets[i] = fieldStartOffsets[i - 1] + typeTraits[i - 1].getStaticallyKnownDataLength();
+        }
+    }
+
+    public void reset(byte[] data, int startOff) {
+        this.data = data;
+        this.startOff = startOff;
+    }
+
+    @Override
+    public int getFieldCount() {
+        return typeTraits.length;
+    }
+
+    @Override
+    public byte[] getFieldData(int fIdx) {
+        return data;
+    }
+
+    @Override
+    public int getFieldLength(int fIdx) {
+        return typeTraits[fIdx].getStaticallyKnownDataLength();
+    }
+
+    @Override
+    public int getFieldStart(int fIdx) {
+        return startOff + fieldStartOffsets[fIdx];
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java
new file mode 100644
index 0000000..9eab110
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java
@@ -0,0 +1,289 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.nio.ByteBuffer;
+
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ByteArrayAccessibleOutputStream;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOpContext;
+import edu.uci.ics.hyracks.storage.am.btree.impls.RangePredicate;
+import edu.uci.ics.hyracks.storage.am.common.api.IIndexBulkLoadContext;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexCursor;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
+import edu.uci.ics.hyracks.storage.common.buffercache.ICachedPage;
+import edu.uci.ics.hyracks.storage.common.file.BufferedFileHandle;
+
+/**
+ * An inverted index consists of two files: 1. a file storing (paginated)
+ * inverted lists 2. a BTree-file mapping from tokens to inverted lists
+ * 
+ * Implemented features: bulk loading and searching (based on T-Occurrence) Not
+ * implemented features: updates (insert/update/delete) Limitations: a query
+ * cannot exceed the size of a Hyracks frame
+ */
+public class InvertedIndex {
+
+    private BTree btree;
+    private int rootPageId = 0;
+    private IBufferCache bufferCache;
+    private int fileId;
+    private final MultiComparator invListCmp;
+    private final int numTokenFields;
+    private final int numInvListKeys;
+
+    public InvertedIndex(IBufferCache bufferCache, BTree btree, MultiComparator invListCmp) {
+        this.bufferCache = bufferCache;
+        this.btree = btree;
+        this.invListCmp = invListCmp;
+        this.numTokenFields = btree.getMultiComparator().getKeyFieldCount();
+        this.numInvListKeys = invListCmp.getKeyFieldCount();
+    }
+
+    public void open(int fileId) {
+        this.fileId = fileId;
+    }
+
+    public void close() {
+        this.fileId = -1;
+    }
+
+    public BulkLoadContext beginBulkLoad(IInvertedListBuilder invListBuilder, int hyracksFrameSize,
+            float btreeFillFactor) throws HyracksDataException {
+        BulkLoadContext ctx = new BulkLoadContext(invListBuilder, hyracksFrameSize, btreeFillFactor);
+        ctx.init(rootPageId, fileId);
+        return ctx;
+    }
+
+    // ASSUMPTIONS:
+    // the first btree.getMultiComparator().getKeyFieldCount() fields in tuple
+    // are btree keys (e.g., a string token)
+    // the next invListCmp.getKeyFieldCount() fields in tuple are keys of the
+    // inverted list (e.g., primary key)
+    // key fields of inverted list are fixed size
+    public void bulkLoadAddTuple(BulkLoadContext ctx, ITupleReference tuple) throws HyracksDataException {
+
+        // first inverted list, copy token to baaos and start new list
+        if (ctx.currentInvListTokenBaaos.size() == 0) {
+            ctx.currentInvListStartPageId = ctx.currentPageId;
+            ctx.currentInvListStartOffset = ctx.invListBuilder.getPos();
+
+            // remember current token
+            ctx.currentInvListTokenBaaos.reset();
+            for (int i = 0; i < numTokenFields; i++) {
+                ctx.currentInvListTokenBaaos.write(tuple.getFieldData(i), tuple.getFieldStart(i),
+                        tuple.getFieldLength(i));
+            }
+
+            if (!ctx.invListBuilder.startNewList(tuple, numTokenFields)) {
+                ctx.pinNextPage();
+                ctx.invListBuilder.setTargetBuffer(ctx.currentPage.getBuffer().array(), 0);
+                if (!ctx.invListBuilder.startNewList(tuple, numTokenFields)) {
+                    throw new IllegalStateException("Failed to create first inverted list.");
+                }
+            }
+        }
+
+        // create new inverted list?
+        ctx.currentInvListToken.reset(ctx.currentInvListTokenBaaos.getByteArray(), 0);
+        if (ctx.tokenCmp.compare(tuple, ctx.currentInvListToken) != 0) {
+
+            // create entry in btree for last inverted list
+            createAndInsertBTreeTuple(ctx);
+
+            // remember new token
+            ctx.currentInvListTokenBaaos.reset();
+            for (int i = 0; i < numTokenFields; i++) {
+                ctx.currentInvListTokenBaaos.write(tuple.getFieldData(i), tuple.getFieldStart(i),
+                        tuple.getFieldLength(i));
+            }
+
+            // start new list
+            if (!ctx.invListBuilder.startNewList(tuple, numTokenFields)) {
+                ctx.pinNextPage();
+                ctx.invListBuilder.setTargetBuffer(ctx.currentPage.getBuffer().array(), 0);
+                if (!ctx.invListBuilder.startNewList(tuple, numTokenFields)) {
+                    throw new IllegalStateException("Failed to start new inverted list after switching to a new page.");
+                }
+            }
+
+            ctx.currentInvListStartPageId = ctx.currentPageId;
+            ctx.currentInvListStartOffset = ctx.invListBuilder.getPos();
+        }
+
+        // append to current inverted list
+        if (!ctx.invListBuilder.appendElement(tuple, numTokenFields, numInvListKeys)) {
+            ctx.pinNextPage();
+            ctx.invListBuilder.setTargetBuffer(ctx.currentPage.getBuffer().array(), 0);
+            if (!ctx.invListBuilder.appendElement(tuple, numTokenFields, numInvListKeys)) {
+                throw new IllegalStateException(
+                        "Failed to append element to inverted list after switching to a new page.");
+            }
+        }
+    }
+
+    public boolean openCursor(ITreeIndexCursor btreeCursor, RangePredicate btreePred, BTreeOpContext btreeOpCtx,
+            IInvertedListCursor invListCursor) throws Exception {
+        btree.search(btreeCursor, btreePred, btreeOpCtx);
+
+        boolean ret = false;
+        if (btreeCursor.hasNext()) {
+
+            btreeCursor.next();
+            ITupleReference frameTuple = btreeCursor.getTuple();
+
+            // hardcoded mapping of btree fields
+            int startPageId = IntegerSerializerDeserializer.getInt(frameTuple.getFieldData(1),
+                    frameTuple.getFieldStart(1));
+            int endPageId = IntegerSerializerDeserializer.getInt(frameTuple.getFieldData(2),
+                    frameTuple.getFieldStart(2));
+            int startOff = IntegerSerializerDeserializer
+                    .getInt(frameTuple.getFieldData(3), frameTuple.getFieldStart(3));
+            int numElements = IntegerSerializerDeserializer.getInt(frameTuple.getFieldData(4),
+                    frameTuple.getFieldStart(4));
+
+            invListCursor.reset(startPageId, endPageId, startOff, numElements);
+            ret = true;
+        } else {
+            invListCursor.reset(0, 0, 0, 0);
+        }
+
+        btreeCursor.close();
+        btreeCursor.reset();
+
+        return ret;
+    }
+
+    public void createAndInsertBTreeTuple(BulkLoadContext ctx) throws HyracksDataException {
+        // build tuple
+        ctx.btreeTupleBuilder.reset();
+        ctx.btreeTupleBuilder.addField(ctx.currentInvListTokenBaaos.getByteArray(), 0,
+                ctx.currentInvListTokenBaaos.size());
+        ctx.btreeTupleBuilder.addField(IntegerSerializerDeserializer.INSTANCE, ctx.currentInvListStartPageId);
+        ctx.btreeTupleBuilder.addField(IntegerSerializerDeserializer.INSTANCE, ctx.currentPageId);
+        ctx.btreeTupleBuilder.addField(IntegerSerializerDeserializer.INSTANCE, ctx.currentInvListStartOffset);
+        ctx.btreeTupleBuilder.addField(IntegerSerializerDeserializer.INSTANCE, ctx.invListBuilder.getListSize());
+
+        // append to buffer
+        ctx.btreeTupleAppender.reset(ctx.btreeTupleBuffer, true);
+        ctx.btreeTupleAppender.append(ctx.btreeTupleBuilder.getFieldEndOffsets(), ctx.btreeTupleBuilder.getByteArray(),
+                0, ctx.btreeTupleBuilder.getSize());
+
+        // reset tuple reference
+        ctx.btreeFrameTupleReference.reset(ctx.btreeFrameTupleAccessor, 0);
+
+        btree.bulkLoadAddTuple(ctx.btreeBulkLoadCtx, ctx.btreeFrameTupleReference);
+    }
+
+    public void endBulkLoad(BulkLoadContext ctx) throws HyracksDataException {
+        // create entry in btree for last inverted list
+        createAndInsertBTreeTuple(ctx);
+        btree.endBulkLoad(ctx.btreeBulkLoadCtx);
+        ctx.deinit();
+    }
+
+    public IBufferCache getBufferCache() {
+        return bufferCache;
+    }
+
+    public int getInvListsFileId() {
+        return fileId;
+    }
+
+    public MultiComparator getInvListElementCmp() {
+        return invListCmp;
+    }
+
+    public BTree getBTree() {
+        return btree;
+    }
+
+    public final class BulkLoadContext {
+        private final ByteBuffer btreeTupleBuffer;
+        private final ArrayTupleBuilder btreeTupleBuilder;
+        private final FrameTupleAppender btreeTupleAppender;
+        private final FrameTupleAccessor btreeFrameTupleAccessor;
+        private final FrameTupleReference btreeFrameTupleReference = new FrameTupleReference();
+        private final float btreeFillFactor;
+        private IIndexBulkLoadContext btreeBulkLoadCtx;
+
+        private int currentInvListStartPageId;
+        private int currentInvListStartOffset;
+        private final ByteArrayAccessibleOutputStream currentInvListTokenBaaos = new ByteArrayAccessibleOutputStream();
+        private final FixedSizeTupleReference currentInvListToken = new FixedSizeTupleReference(
+                invListCmp.getTypeTraits());
+
+        private int currentPageId;
+        private ICachedPage currentPage;
+        private final IInvertedListBuilder invListBuilder;
+        private final MultiComparator tokenCmp;
+
+        public BulkLoadContext(IInvertedListBuilder invListBuilder, int hyracksFrameSize, float btreeFillFactor) {
+            this.invListBuilder = invListBuilder;
+            this.tokenCmp = btree.getMultiComparator();
+            this.btreeTupleBuffer = ByteBuffer.allocate(hyracksFrameSize);
+            this.btreeTupleBuilder = new ArrayTupleBuilder(tokenCmp.getFieldCount());
+            this.btreeTupleAppender = new FrameTupleAppender(hyracksFrameSize);
+            // TODO: serde never used, only need correct number of fields
+            // tuple contains (token, start page, end page, start offset, num
+            // elements)
+            RecordDescriptor recDesc = new RecordDescriptor(new ISerializerDeserializer[] {
+                    IntegerSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE,
+                    IntegerSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE,
+                    IntegerSerializerDeserializer.INSTANCE });
+            this.btreeFrameTupleAccessor = new FrameTupleAccessor(hyracksFrameSize, recDesc);
+            this.btreeFillFactor = btreeFillFactor;
+        }
+
+        public void init(int startPageId, int fileId) throws HyracksDataException {
+            btreeBulkLoadCtx = btree.beginBulkLoad(BTree.DEFAULT_FILL_FACTOR,
+                    btree.getLeafFrameFactory().createFrame(), btree.getInteriorFrameFactory().createFrame(), btree
+                            .getFreePageManager().getMetaDataFrameFactory().createFrame());
+            currentPageId = startPageId;
+            currentPage = bufferCache.pin(BufferedFileHandle.getDiskPageId(fileId, currentPageId), true);
+            currentPage.acquireWriteLatch();
+            invListBuilder.setTargetBuffer(currentPage.getBuffer().array(), 0);
+            btreeFrameTupleAccessor.reset(btreeTupleBuffer);
+        }
+
+        public void deinit() throws HyracksDataException {
+            if (currentPage != null) {
+                currentPage.releaseWriteLatch();
+                bufferCache.unpin(currentPage);
+            }
+        }
+
+        public void pinNextPage() throws HyracksDataException {
+            currentPage.releaseWriteLatch();
+            bufferCache.unpin(currentPage);
+            currentPageId++;
+            currentPage = bufferCache.pin(BufferedFileHandle.getDiskPageId(fileId, currentPageId), true);
+            currentPage.acquireWriteLatch();
+        }
+    };
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndexException.java
similarity index 72%
copy from hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
copy to hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndexException.java
index 7e91fd4..494ac0c 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndexException.java
@@ -13,10 +13,12 @@
  * limitations under the License.
  */
 
-package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
 
-import java.io.Serializable;
+public class InvertedIndexException extends Exception {
+    private static final long serialVersionUID = 1L;
 
-public interface IBinaryTokenizerFactory extends Serializable {
-    public IBinaryTokenizer createBinaryTokenizer();
+    public InvertedIndexException(String msg) {
+        super(msg);
+    }
 }
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/ListResultCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/ListResultCursor.java
deleted file mode 100644
index adc5221..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/ListResultCursor.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
-
-import java.nio.ByteBuffer;
-import java.util.List;
-
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
-
-public class ListResultCursor implements IInvertedIndexResultCursor {
-
-    private List<ByteBuffer> resultBuffers;
-    private int numResultBuffers;
-    private int currentPos = 0;
-
-    public void setResults(List<ByteBuffer> resultBuffers, int numResultBuffers) {
-        this.resultBuffers = resultBuffers;
-        this.numResultBuffers = numResultBuffers;
-        reset();
-    }
-
-    @Override
-    public boolean hasNext() {
-        if (currentPos + 1 < numResultBuffers)
-            return true;
-        else
-            return false;
-    }
-
-    @Override
-    public void next() {
-        currentPos++;
-    }
-
-    @Override
-    public ByteBuffer getBuffer() {
-        return resultBuffers.get(currentPos);
-    }
-
-    @Override
-    public void reset() {
-        currentPos = -1;
-    }
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/OccurrenceThresholdPanicException.java
similarity index 69%
copy from hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
copy to hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/OccurrenceThresholdPanicException.java
index 7e91fd4..b0e737c 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/OccurrenceThresholdPanicException.java
@@ -13,10 +13,12 @@
  * limitations under the License.
  */
 
-package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
 
-import java.io.Serializable;
+public class OccurrenceThresholdPanicException extends InvertedIndexException {
+    private static final long serialVersionUID = 1L;
 
-public interface IBinaryTokenizerFactory extends Serializable {
-    public IBinaryTokenizer createBinaryTokenizer();
+    public OccurrenceThresholdPanicException(String msg) {
+        super(msg);
+    }
 }
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SearchResultCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SearchResultCursor.java
new file mode 100644
index 0000000..5a7230c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SearchResultCursor.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearcher;
+
+public class SearchResultCursor implements IInvertedIndexResultCursor {
+
+    private List<ByteBuffer> resultBuffers;
+    private IFrameTupleAccessor fta;
+    private FixedSizeTupleReference resultTuple;
+    private int numResultBuffers;
+    private int currentBufferIndex = 0;
+    private int tupleIndex = 0;
+
+    public SearchResultCursor(IFrameTupleAccessor fta, ITupleReference resultTuple) {
+        this.fta = fta;
+        this.resultTuple = (FixedSizeTupleReference) resultTuple;
+    }
+
+    @Override
+    public boolean hasNext() {
+        if (currentBufferIndex < numResultBuffers && tupleIndex < fta.getTupleCount())
+            return true;
+        else
+            return false;
+    }
+
+    @Override
+    public void next() {
+        resultTuple.reset(fta.getBuffer().array(), fta.getTupleStartOffset(tupleIndex));
+        tupleIndex++;
+        if (tupleIndex >= fta.getTupleCount()) {
+            if (currentBufferIndex + 1 < numResultBuffers) {
+                currentBufferIndex++;
+                fta.reset(resultBuffers.get(currentBufferIndex));
+                tupleIndex = 0;
+            }
+        }
+    }
+
+    @Override
+    public ITupleReference getTuple() {
+        return resultTuple;
+    }
+
+    @Override
+    public void reset(IInvertedIndexSearcher invIndexSearcher) {
+        currentBufferIndex = 0;
+        tupleIndex = 0;
+        resultBuffers = invIndexSearcher.getResultBuffers();
+        numResultBuffers = invIndexSearcher.getNumValidResultBuffers();
+        if (numResultBuffers > 0) {
+            fta.reset(resultBuffers.get(0));
+        }
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java
deleted file mode 100644
index f88439c..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java
+++ /dev/null
@@ -1,294 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
-
-import java.io.DataOutput;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-
-import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
-import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
-import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
-import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
-import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
-import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
-import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
-import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeCursor;
-import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrame;
-import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrame;
-import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
-import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOp;
-import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOpContext;
-import edu.uci.ics.hyracks.storage.am.btree.impls.MultiComparator;
-import edu.uci.ics.hyracks.storage.am.btree.impls.RangePredicate;
-import edu.uci.ics.hyracks.storage.am.btree.impls.RangeSearchCursor;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearcher;
-
-public class SimpleConjunctiveSearcher implements IInvertedIndexSearcher {
-
-    private final int numKeyFields;
-    private final int numValueFields;
-
-    private final IBinaryComparator[] keyCmps;
-    private final IBinaryComparator[] valueCmps;
-
-    private final BTree btree;
-    private final IHyracksTaskContext ctx;
-    private final ArrayTupleBuilder resultTupleBuilder;
-    private final FrameTupleAppender resultTupleAppender;
-    private final FrameTupleAccessor resultFrameAccessor;
-
-    private List<ByteBuffer> newResultBuffers = new ArrayList<ByteBuffer>();
-    private List<ByteBuffer> prevResultBuffers = new ArrayList<ByteBuffer>();
-    private List<ByteBuffer> swap = null;
-    private final ListResultCursor resultCursor = new ListResultCursor();
-    private int maxResultBufIdx = 0;
-
-    private final IBTreeLeafFrame leafFrame;
-    private final IBTreeInteriorFrame interiorFrame;
-    private final IBTreeCursor btreeCursor;
-    private final FrameTupleReference searchKey = new FrameTupleReference();
-    private final RangePredicate pred = new RangePredicate(true, null, null, true, true, null, null);
-
-    private final IBinaryTokenizer queryTokenizer;
-
-    public SimpleConjunctiveSearcher(IHyracksTaskContext ctx, BTree btree, RecordDescriptor btreeRecDesc,
-            IBinaryTokenizer queryTokenizer, int numKeyFields, int numValueFields) {
-        this.ctx = ctx;
-        this.btree = btree;
-        this.queryTokenizer = queryTokenizer;
-        this.numKeyFields = numKeyFields;
-        this.numValueFields = numValueFields;
-
-        leafFrame = btree.getLeafFrameFactory().getFrame();
-        interiorFrame = btree.getInteriorFrameFactory().getFrame();
-        btreeCursor = new RangeSearchCursor(leafFrame);
-        resultTupleAppender = new FrameTupleAppender(ctx.getFrameSize());
-        resultTupleBuilder = new ArrayTupleBuilder(numValueFields);
-        newResultBuffers.add(ctx.allocateFrame());
-        prevResultBuffers.add(ctx.allocateFrame());
-
-        MultiComparator btreeCmp = btree.getMultiComparator();
-
-        keyCmps = new IBinaryComparator[numKeyFields];
-        for (int i = 0; i < numKeyFields; i++) {
-            keyCmps[i] = btreeCmp.getComparators()[i];
-        }
-
-        valueCmps = new IBinaryComparator[numValueFields];
-        for (int i = 0; i < numValueFields; i++) {
-            valueCmps[i] = btreeCmp.getComparators()[numKeyFields + i];
-        }
-
-        MultiComparator searchCmp = new MultiComparator(btreeCmp.getTypeTraits(), keyCmps);
-        pred.setLowKeyComparator(searchCmp);
-        pred.setHighKeyComparator(searchCmp);
-        pred.setLowKey(searchKey, true);
-        pred.setHighKey(searchKey, true);
-
-        ISerializerDeserializer[] valueSerde = new ISerializerDeserializer[numValueFields];
-        for (int i = 0; i < numValueFields; i++) {
-            valueSerde[i] = btreeRecDesc.getFields()[numKeyFields + i];
-
-        }
-        RecordDescriptor valueRecDesc = new RecordDescriptor(valueSerde);
-        resultFrameAccessor = new FrameTupleAccessor(ctx.getFrameSize(), valueRecDesc);
-    }
-
-    public void search(ITupleReference queryTuple, int queryFieldIndex) throws Exception {
-
-        // parse query, TODO: this parsing is too simple
-        RecordDescriptor queryTokenRecDesc = new RecordDescriptor(
-                new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE });
-
-        ArrayTupleBuilder queryTokenBuilder = new ArrayTupleBuilder(queryTokenRecDesc.getFields().length);
-        DataOutput queryTokenDos = queryTokenBuilder.getDataOutput();
-        FrameTupleAppender queryTokenAppender = new FrameTupleAppender(ctx.getFrameSize());
-        ByteBuffer queryTokenFrame = ctx.allocateFrame();
-        queryTokenAppender.reset(queryTokenFrame, true);
-
-        queryTokenizer.reset(queryTuple.getFieldData(queryFieldIndex), queryTuple.getFieldStart(queryFieldIndex),
-                queryTuple.getFieldLength(queryFieldIndex));
-        while (queryTokenizer.hasNext()) {
-            queryTokenizer.next();
-
-            queryTokenBuilder.reset();
-            try {
-                queryTokenizer.writeToken(queryTokenDos);
-                queryTokenBuilder.addFieldEndOffset();
-            } catch (IOException e) {
-                throw new HyracksDataException(e);
-            }
-
-            // WARNING: assuming one frame is enough to hold all tokens
-            queryTokenAppender.append(queryTokenBuilder.getFieldEndOffsets(), queryTokenBuilder.getByteArray(), 0,
-                    queryTokenBuilder.getSize());
-        }
-
-        FrameTupleAccessor queryTokenAccessor = new FrameTupleAccessor(ctx.getFrameSize(), queryTokenRecDesc);
-        queryTokenAccessor.reset(queryTokenFrame);
-        int numQueryTokens = queryTokenAccessor.getTupleCount();
-
-        maxResultBufIdx = 0;
-
-        BTreeOpContext opCtx = btree.createOpContext(BTreeOp.BTO_SEARCH, leafFrame, interiorFrame, null);
-
-        resultTupleAppender.reset(newResultBuffers.get(0), true);
-        try {
-            // append first inverted list to temporary results
-            searchKey.reset(queryTokenAccessor, 0);
-            btree.search(btreeCursor, pred, opCtx);
-            while (btreeCursor.hasNext()) {
-                btreeCursor.next();
-                maxResultBufIdx = appendTupleToNewResults(btreeCursor, maxResultBufIdx);
-            }
-            btreeCursor.close();
-            btreeCursor.reset();
-        } catch (Exception e) {
-            throw new HyracksDataException(e);
-        }
-
-        resultFrameAccessor.reset(newResultBuffers.get(0));
-
-        // intersect temporary results with remaining inverted lists
-        for (int i = 1; i < numQueryTokens; i++) {
-            swap = prevResultBuffers;
-            prevResultBuffers = newResultBuffers;
-            newResultBuffers = swap;
-            try {
-                searchKey.reset(queryTokenAccessor, i);
-                btree.search(btreeCursor, pred, opCtx);
-                maxResultBufIdx = intersectList(btreeCursor, prevResultBuffers, maxResultBufIdx, newResultBuffers);
-            } catch (Exception e) {
-                throw new HyracksDataException(e);
-            }
-            btreeCursor.close();
-            btreeCursor.reset();
-        }
-    }
-
-    private int appendTupleToNewResults(IBTreeCursor btreeCursor, int newBufIdx) throws IOException {
-        ByteBuffer newCurrentBuffer = newResultBuffers.get(newBufIdx);
-
-        ITupleReference tuple = btreeCursor.getTuple();
-        resultTupleBuilder.reset();
-        DataOutput dos = resultTupleBuilder.getDataOutput();
-        for (int i = 0; i < numValueFields; i++) {
-            int fIdx = numKeyFields + i;
-            dos.write(tuple.getFieldData(fIdx), tuple.getFieldStart(fIdx), tuple.getFieldLength(fIdx));
-            resultTupleBuilder.addFieldEndOffset();
-        }
-
-        if (!resultTupleAppender.append(resultTupleBuilder.getFieldEndOffsets(), resultTupleBuilder.getByteArray(), 0,
-                resultTupleBuilder.getSize())) {
-            newBufIdx++;
-            if (newBufIdx >= newResultBuffers.size()) {
-                newResultBuffers.add(ctx.allocateFrame());
-            }
-            newCurrentBuffer = newResultBuffers.get(newBufIdx);
-            resultTupleAppender.reset(newCurrentBuffer, true);
-            if (!resultTupleAppender.append(resultTupleBuilder.getFieldEndOffsets(), resultTupleBuilder.getByteArray(),
-                    0, resultTupleBuilder.getSize())) {
-                throw new IllegalStateException();
-            }
-        }
-
-        return newBufIdx;
-    }
-
-    private int intersectList(IBTreeCursor btreeCursor, List<ByteBuffer> prevResultBuffers, int maxPrevBufIdx,
-            List<ByteBuffer> newResultBuffers) throws IOException, Exception {
-
-        int newBufIdx = 0;
-        ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
-
-        int prevBufIdx = 0;
-        ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
-
-        resultTupleBuilder.reset();
-        resultTupleAppender.reset(newCurrentBuffer, true);
-        resultFrameAccessor.reset(prevCurrentBuffer);
-
-        // WARNING: not very efficient but good enough for the first cut
-        boolean advanceCursor = true;
-        boolean advancePrevResult = false;
-        int resultTidx = 0;
-
-        while ((!advanceCursor || btreeCursor.hasNext()) && prevBufIdx <= maxPrevBufIdx
-                && resultTidx < resultFrameAccessor.getTupleCount()) {
-
-            if (advanceCursor)
-                btreeCursor.next();
-            ITupleReference tuple = btreeCursor.getTuple();
-
-            int cmp = 0;
-            for (int i = 0; i < valueCmps.length; i++) {
-                int tupleFidx = numKeyFields + i;
-                cmp = valueCmps[i].compare(tuple.getFieldData(tupleFidx), tuple.getFieldStart(tupleFidx),
-                        tuple.getFieldLength(tupleFidx), resultFrameAccessor.getBuffer().array(),
-                        resultFrameAccessor.getTupleStartOffset(resultTidx) + resultFrameAccessor.getFieldSlotsLength()
-                                + resultFrameAccessor.getFieldStartOffset(resultTidx, i),
-                        resultFrameAccessor.getFieldLength(resultTidx, i));
-                if (cmp != 0)
-                    break;
-            }
-
-            // match found
-            if (cmp == 0) {
-                newBufIdx = appendTupleToNewResults(btreeCursor, newBufIdx);
-
-                advanceCursor = true;
-                advancePrevResult = true;
-            } else {
-                if (cmp < 0) {
-                    advanceCursor = true;
-                    advancePrevResult = false;
-                } else {
-                    advanceCursor = false;
-                    advancePrevResult = true;
-                }
-            }
-
-            if (advancePrevResult) {
-                resultTidx++;
-                if (resultTidx >= resultFrameAccessor.getTupleCount()) {
-                    prevBufIdx++;
-                    if (prevBufIdx <= maxPrevBufIdx) {
-                        prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
-                        resultFrameAccessor.reset(prevCurrentBuffer);
-                        resultTidx = 0;
-                    }
-                }
-            }
-        }
-
-        return newBufIdx;
-    }
-
-    @Override
-    public IInvertedIndexResultCursor getResultCursor() {
-        resultCursor.setResults(newResultBuffers, maxResultBufIdx + 1);
-        return resultCursor;
-    }
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
new file mode 100644
index 0000000..7505ae0
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
@@ -0,0 +1,543 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.dataflow.value.TypeTrait;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrame;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOpContext;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeRangeSearchCursor;
+import edu.uci.ics.hyracks.storage.am.btree.impls.RangePredicate;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexCursor;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexFrame;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.IndexOp;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearcher;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
+
+public class TOccurrenceSearcher implements IInvertedIndexSearcher {
+
+    protected final IHyracksTaskContext ctx;
+    protected final FixedSizeFrameTupleAppender resultFrameTupleApp;
+    protected final FixedSizeFrameTupleAccessor resultFrameTupleAcc;
+    protected final FixedSizeTupleReference resultTuple;
+    protected final int invListKeyLength;
+    protected int currentNumResults;
+
+    protected List<ByteBuffer> newResultBuffers = new ArrayList<ByteBuffer>();
+    protected List<ByteBuffer> prevResultBuffers = new ArrayList<ByteBuffer>();
+    protected List<ByteBuffer> swap = null;
+    protected int maxResultBufIdx = 0;
+
+    protected final ITreeIndexFrame leafFrame;
+    protected final ITreeIndexFrame interiorFrame;
+    protected final ITreeIndexCursor btreeCursor;
+    protected final FrameTupleReference searchKey = new FrameTupleReference();
+    protected final RangePredicate btreePred = new RangePredicate(true, null, null, true, true, null, null);
+    protected final BTreeOpContext btreeOpCtx;
+
+    protected RecordDescriptor queryTokenRecDesc = new RecordDescriptor(
+            new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE });
+    protected ArrayTupleBuilder queryTokenBuilder = new ArrayTupleBuilder(queryTokenRecDesc.getFields().length);
+    protected DataOutput queryTokenDos = queryTokenBuilder.getDataOutput();
+    protected FrameTupleAppender queryTokenAppender;
+    protected ByteBuffer queryTokenFrame;
+
+    protected final InvertedIndex invIndex;
+    protected final IBinaryTokenizer queryTokenizer;
+    protected final ITypeTrait[] invListFieldsWithCount;
+    protected int occurrenceThreshold;
+
+    protected final int cursorCacheSize = 10;
+    protected List<IInvertedListCursor> invListCursorCache = new ArrayList<IInvertedListCursor>(cursorCacheSize);
+    protected List<IInvertedListCursor> invListCursors = new ArrayList<IInvertedListCursor>(cursorCacheSize);
+
+    public TOccurrenceSearcher(IHyracksTaskContext ctx, InvertedIndex invIndex, IBinaryTokenizer queryTokenizer) {
+        this.ctx = ctx;
+        this.invIndex = invIndex;
+        this.queryTokenizer = queryTokenizer;
+
+        leafFrame = invIndex.getBTree().getLeafFrameFactory().createFrame();
+        interiorFrame = invIndex.getBTree().getInteriorFrameFactory().createFrame();
+
+        btreeCursor = new BTreeRangeSearchCursor((IBTreeLeafFrame) leafFrame);
+        ITypeTrait[] invListFields = invIndex.getInvListElementCmp().getTypeTraits();
+        invListFieldsWithCount = new TypeTrait[invListFields.length + 1];
+        int tmp = 0;
+        for (int i = 0; i < invListFields.length; i++) {
+            invListFieldsWithCount[i] = invListFields[i];
+            tmp += invListFields[i].getStaticallyKnownDataLength();
+        }
+        // using an integer for counting occurrences
+        invListFieldsWithCount[invListFields.length] = new TypeTrait(4);
+        invListKeyLength = tmp;
+
+        btreeOpCtx = invIndex.getBTree().createOpContext(IndexOp.SEARCH, leafFrame, interiorFrame, null);
+
+        resultFrameTupleApp = new FixedSizeFrameTupleAppender(ctx.getFrameSize(), invListFieldsWithCount);
+        resultFrameTupleAcc = new FixedSizeFrameTupleAccessor(ctx.getFrameSize(), invListFieldsWithCount);
+        resultTuple = new FixedSizeTupleReference(invListFieldsWithCount);
+        newResultBuffers.add(ctx.allocateFrame());
+        prevResultBuffers.add(ctx.allocateFrame());
+
+        MultiComparator searchCmp = invIndex.getBTree().getMultiComparator();
+        btreePred.setLowKeyComparator(searchCmp);
+        btreePred.setHighKeyComparator(searchCmp);
+        btreePred.setLowKey(searchKey, true);
+        btreePred.setHighKey(searchKey, true);
+
+        // pre-create cursor objects
+        for (int i = 0; i < cursorCacheSize; i++) {
+            invListCursorCache.add(new FixedSizeElementInvertedListCursor(invIndex.getBufferCache(), invIndex
+                    .getInvListsFileId(), invIndex.getInvListElementCmp().getTypeTraits()));
+        }
+
+        queryTokenAppender = new FrameTupleAppender(ctx.getFrameSize());
+        queryTokenFrame = ctx.allocateFrame();
+
+        currentNumResults = 0;
+    }
+
+    public void reset() {
+        for (ByteBuffer b : newResultBuffers) {
+            resultFrameTupleApp.reset(b, true);
+        }
+        for (ByteBuffer b : prevResultBuffers) {
+            resultFrameTupleApp.reset(b, true);
+        }
+        currentNumResults = 0;
+    }
+
+    public void search(IInvertedIndexResultCursor resultCursor, ITupleReference queryTuple, int queryFieldIndex,
+            IInvertedIndexSearchModifier searchModifier) throws Exception {
+
+        queryTokenAppender.reset(queryTokenFrame, true);
+        queryTokenizer.reset(queryTuple.getFieldData(queryFieldIndex), queryTuple.getFieldStart(queryFieldIndex),
+                queryTuple.getFieldLength(queryFieldIndex));
+
+        while (queryTokenizer.hasNext()) {
+            queryTokenizer.next();
+            queryTokenBuilder.reset();
+            try {
+                IToken token = queryTokenizer.getToken();
+                token.serializeToken(queryTokenDos);
+                queryTokenBuilder.addFieldEndOffset();
+                // WARNING: assuming one frame is big enough to hold all tokens
+                queryTokenAppender.append(queryTokenBuilder.getFieldEndOffsets(), queryTokenBuilder.getByteArray(), 0,
+                        queryTokenBuilder.getSize());
+            } catch (IOException e) {
+                throw new HyracksDataException(e);
+            }
+        }
+
+        FrameTupleAccessor queryTokenAccessor = new FrameTupleAccessor(ctx.getFrameSize(), queryTokenRecDesc);
+        queryTokenAccessor.reset(queryTokenFrame);
+        int numQueryTokens = queryTokenAccessor.getTupleCount();
+
+        // expand cursor cache if necessary
+        if (numQueryTokens > invListCursorCache.size()) {
+            int diff = numQueryTokens - invListCursorCache.size();
+            for (int i = 0; i < diff; i++) {
+                invListCursorCache.add(new FixedSizeElementInvertedListCursor(invIndex.getBufferCache(), invIndex
+                        .getInvListsFileId(), invIndex.getInvListElementCmp().getTypeTraits()));
+            }
+        }
+
+        invListCursors.clear();
+        for (int i = 0; i < numQueryTokens; i++) {
+            searchKey.reset(queryTokenAccessor, i);
+            invIndex.openCursor(btreeCursor, btreePred, btreeOpCtx, invListCursorCache.get(i));
+            invListCursors.add(invListCursorCache.get(i));
+        }
+
+        occurrenceThreshold = searchModifier.getOccurrenceThreshold(invListCursors);
+
+        // TODO: deal with panic cases properly
+        if (occurrenceThreshold <= 0) {
+            throw new OccurrenceThresholdPanicException("Merge Threshold is <= 0. Failing Search.");
+        }
+
+        int numPrefixLists = searchModifier.getPrefixLists(invListCursors);
+        maxResultBufIdx = mergePrefixLists(numPrefixLists, numQueryTokens);
+        maxResultBufIdx = mergeSuffixLists(numPrefixLists, numQueryTokens, maxResultBufIdx);
+
+        resultCursor.reset(this);
+    }
+
+    protected int mergePrefixLists(int numPrefixTokens, int numQueryTokens) throws IOException {
+        int maxPrevBufIdx = 0;
+        for (int i = 0; i < numPrefixTokens; i++) {
+            swap = prevResultBuffers;
+            prevResultBuffers = newResultBuffers;
+            newResultBuffers = swap;
+            currentNumResults = 0;
+
+            invListCursors.get(i).pinPagesSync();
+            maxPrevBufIdx = mergePrefixList(invListCursors.get(i), prevResultBuffers, maxPrevBufIdx, newResultBuffers);
+            invListCursors.get(i).unpinPages();
+        }
+        return maxPrevBufIdx;
+    }
+
+    protected int mergeSuffixLists(int numPrefixTokens, int numQueryTokens, int maxPrevBufIdx) throws IOException {
+        for (int i = numPrefixTokens; i < numQueryTokens; i++) {
+            swap = prevResultBuffers;
+            prevResultBuffers = newResultBuffers;
+            newResultBuffers = swap;
+
+            invListCursors.get(i).pinPagesSync();
+            int numInvListElements = invListCursors.get(i).getNumElements();
+            // should we binary search the next list or should we sort-merge it?
+            if (currentNumResults * Math.log(numInvListElements) < currentNumResults + numInvListElements) {
+                maxPrevBufIdx = mergeSuffixListProbe(invListCursors.get(i), prevResultBuffers, maxPrevBufIdx,
+                        newResultBuffers, i, numQueryTokens);
+            } else {
+                maxPrevBufIdx = mergeSuffixListScan(invListCursors.get(i), prevResultBuffers, maxPrevBufIdx,
+                        newResultBuffers, i, numQueryTokens);
+            }
+            invListCursors.get(i).unpinPages();
+        }
+        return maxPrevBufIdx;
+    }
+
+    protected int mergeSuffixListProbe(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers,
+            int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws IOException {
+
+        int newBufIdx = 0;
+        ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
+
+        int prevBufIdx = 0;
+        ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
+
+        int resultTidx = 0;
+
+        currentNumResults = 0;
+
+        MultiComparator invListCmp = invIndex.getInvListElementCmp();
+
+        resultFrameTupleAcc.reset(prevCurrentBuffer);
+        resultFrameTupleApp.reset(newCurrentBuffer, true);
+
+        while (resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+            resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+            int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+                    resultTuple.getFieldStart(resultTuple.getFieldCount() - 1));
+
+            if (invListCursor.containsKey(resultTuple, invListCmp)) {
+                count++;
+                newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+            } else {
+                if (count + numQueryTokens - invListIx > occurrenceThreshold) {
+                    newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+                }
+            }
+
+            resultTidx++;
+            if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+                prevBufIdx++;
+                if (prevBufIdx <= maxPrevBufIdx) {
+                    prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+                    resultFrameTupleAcc.reset(prevCurrentBuffer);
+                    resultTidx = 0;
+                }
+            }
+        }
+
+        return newBufIdx;
+    }
+
+    protected int mergeSuffixListScan(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers,
+            int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws IOException {
+        int newBufIdx = 0;
+        ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
+
+        int prevBufIdx = 0;
+        ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
+
+        boolean advanceCursor = true;
+        boolean advancePrevResult = false;
+        int resultTidx = 0;
+
+        MultiComparator invListCmp = invIndex.getInvListElementCmp();
+
+        resultFrameTupleAcc.reset(prevCurrentBuffer);
+        resultFrameTupleApp.reset(newCurrentBuffer, true);
+
+        int invListTidx = 0;
+        int invListNumTuples = invListCursor.getNumElements();
+
+        if (invListCursor.hasNext())
+            invListCursor.next();
+
+        while (invListTidx < invListNumTuples && resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+            ITupleReference invListTuple = invListCursor.getTuple();
+
+            resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+
+            int cmp = invListCmp.compare(invListTuple, resultTuple);
+            if (cmp == 0) {
+                int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+                        resultTuple.getFieldStart(resultTuple.getFieldCount() - 1)) + 1;
+                newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+                advanceCursor = true;
+                advancePrevResult = true;
+            } else {
+                if (cmp < 0) {
+                    advanceCursor = true;
+                    advancePrevResult = false;
+                } else {
+                    int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+                            resultTuple.getFieldStart(resultTuple.getFieldCount() - 1));
+                    if (count + numQueryTokens - invListIx > occurrenceThreshold) {
+                        newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+                    }
+                    advanceCursor = false;
+                    advancePrevResult = true;
+                }
+            }
+
+            if (advancePrevResult) {
+                resultTidx++;
+                if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+                    prevBufIdx++;
+                    if (prevBufIdx <= maxPrevBufIdx) {
+                        prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+                        resultFrameTupleAcc.reset(prevCurrentBuffer);
+                        resultTidx = 0;
+                    }
+                }
+            }
+
+            if (advanceCursor) {
+                invListTidx++;
+                invListCursor.next();
+            }
+        }
+
+        // append remaining elements from previous result set
+        while (resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+            resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+
+            int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+                    resultTuple.getFieldStart(resultTuple.getFieldCount() - 1));
+            if (count + numQueryTokens - invListIx > occurrenceThreshold) {
+                newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+            }
+
+            resultTidx++;
+            if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+                prevBufIdx++;
+                if (prevBufIdx <= maxPrevBufIdx) {
+                    prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+                    resultFrameTupleAcc.reset(prevCurrentBuffer);
+                    resultTidx = 0;
+                }
+            }
+        }
+
+        return newBufIdx;
+    }
+
+    protected int mergePrefixList(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers,
+            int maxPrevBufIdx, List<ByteBuffer> newResultBuffers) throws IOException {
+        int newBufIdx = 0;
+        ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
+
+        int prevBufIdx = 0;
+        ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
+
+        boolean advanceCursor = true;
+        boolean advancePrevResult = false;
+        int resultTidx = 0;
+
+        MultiComparator invListCmp = invIndex.getInvListElementCmp();
+
+        resultFrameTupleAcc.reset(prevCurrentBuffer);
+        resultFrameTupleApp.reset(newCurrentBuffer, true);
+
+        int invListTidx = 0;
+        int invListNumTuples = invListCursor.getNumElements();
+
+        if (invListCursor.hasNext())
+            invListCursor.next();
+
+        while (invListTidx < invListNumTuples && resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+            ITupleReference invListTuple = invListCursor.getTuple();
+            resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+
+            int cmp = invListCmp.compare(invListTuple, resultTuple);
+            if (cmp == 0) {
+                int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+                        resultTuple.getFieldStart(resultTuple.getFieldCount() - 1)) + 1;
+                newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+                advanceCursor = true;
+                advancePrevResult = true;
+            } else {
+                if (cmp < 0) {
+                    int count = 1;
+                    newBufIdx = appendTupleToNewResults(invListTuple, count, newBufIdx);
+                    advanceCursor = true;
+                    advancePrevResult = false;
+                } else {
+                    int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+                            resultTuple.getFieldStart(resultTuple.getFieldCount() - 1));
+                    newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+                    advanceCursor = false;
+                    advancePrevResult = true;
+                }
+            }
+
+            if (advancePrevResult) {
+                resultTidx++;
+                if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+                    prevBufIdx++;
+                    if (prevBufIdx <= maxPrevBufIdx) {
+                        prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+                        resultFrameTupleAcc.reset(prevCurrentBuffer);
+                        resultTidx = 0;
+                    }
+                }
+            }
+
+            if (advanceCursor) {
+                invListTidx++;
+                invListCursor.next();
+            }
+        }
+
+        // append remaining new elements from inverted list
+        while (invListTidx < invListNumTuples) {
+            ITupleReference invListTuple = invListCursor.getTuple();
+            newBufIdx = appendTupleToNewResults(invListTuple, 1, newBufIdx);
+            invListTidx++;
+            invListCursor.next();
+        }
+
+        // append remaining elements from previous result set
+        while (resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+            resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+
+            int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+                    resultTuple.getFieldStart(resultTuple.getFieldCount() - 1));
+            newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+
+            resultTidx++;
+            if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+                prevBufIdx++;
+                if (prevBufIdx <= maxPrevBufIdx) {
+                    prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+                    resultFrameTupleAcc.reset(prevCurrentBuffer);
+                    resultTidx = 0;
+                }
+            }
+        }
+
+        return newBufIdx;
+    }
+
+    protected int appendTupleToNewResults(ITupleReference tuple, int newCount, int newBufIdx) throws IOException {
+        ByteBuffer newCurrentBuffer = newResultBuffers.get(newBufIdx);
+
+        if (!resultFrameTupleApp.hasSpace()) {
+            newBufIdx++;
+            if (newBufIdx >= newResultBuffers.size()) {
+                newResultBuffers.add(ctx.allocateFrame());
+            }
+            newCurrentBuffer = newResultBuffers.get(newBufIdx);
+            resultFrameTupleApp.reset(newCurrentBuffer, true);
+        }
+
+        // append key
+        if (!resultFrameTupleApp.append(tuple.getFieldData(0), tuple.getFieldStart(0), invListKeyLength)) {
+            throw new IllegalStateException();
+        }
+
+        // append new count
+        if (!resultFrameTupleApp.append(newCount)) {
+            throw new IllegalStateException();
+        }
+
+        resultFrameTupleApp.incrementTupleCount(1);
+
+        currentNumResults++;
+
+        return newBufIdx;
+    }
+
+    public IFrameTupleAccessor createResultFrameTupleAccessor() {
+        return new FixedSizeFrameTupleAccessor(ctx.getFrameSize(), invListFieldsWithCount);
+    }
+
+    public ITupleReference createResultTupleReference() {
+        return new FixedSizeTupleReference(invListFieldsWithCount);
+    }
+
+    @Override
+    public List<ByteBuffer> getResultBuffers() {
+        return newResultBuffers;
+    }
+
+    @Override
+    public int getNumValidResultBuffers() {
+        return maxResultBufIdx + 1;
+    }
+
+    public int getOccurrenceThreshold() {
+        return occurrenceThreshold;
+    }
+
+    public void printNewResults(int maxResultBufIdx) {
+        StringBuffer strBuffer = new StringBuffer();
+        for (int i = 0; i <= maxResultBufIdx; i++) {
+            ByteBuffer testBuf = newResultBuffers.get(i);
+            resultFrameTupleAcc.reset(testBuf);
+            for (int j = 0; j < resultFrameTupleAcc.getTupleCount(); j++) {
+                strBuffer.append(IntegerSerializerDeserializer.getInt(resultFrameTupleAcc.getBuffer().array(),
+                        resultFrameTupleAcc.getFieldStartOffset(j, 0)) + ",");
+                strBuffer.append(IntegerSerializerDeserializer.getInt(resultFrameTupleAcc.getBuffer().array(),
+                        resultFrameTupleAcc.getFieldStartOffset(j, 1)) + " ");
+            }
+        }
+        System.out.println(strBuffer.toString());
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java
new file mode 100644
index 0000000..0f5439b
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
+
+public class TOccurrenceSearcherSuffixProbeOnly extends TOccurrenceSearcher {
+
+    public TOccurrenceSearcherSuffixProbeOnly(IHyracksTaskContext ctx, InvertedIndex invIndex,
+            IBinaryTokenizer queryTokenizer) {
+        super(ctx, invIndex, queryTokenizer);
+    }
+
+    protected int mergeSuffixLists(int numPrefixTokens, int numQueryTokens, int maxPrevBufIdx) throws IOException {
+        for (int i = numPrefixTokens; i < numQueryTokens; i++) {
+            swap = prevResultBuffers;
+            prevResultBuffers = newResultBuffers;
+            newResultBuffers = swap;
+            currentNumResults = 0;
+
+            invListCursors.get(i).pinPagesSync();
+            maxPrevBufIdx = mergeSuffixListProbe(invListCursors.get(i), prevResultBuffers, maxPrevBufIdx,
+                    newResultBuffers, i, numQueryTokens);
+            invListCursors.get(i).unpinPages();
+        }
+        return maxPrevBufIdx;
+    }
+
+    protected int mergeSuffixListProbe(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers,
+            int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws IOException {
+
+        int newBufIdx = 0;
+        ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
+
+        int prevBufIdx = 0;
+        ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
+
+        int resultTidx = 0;
+
+        MultiComparator invListCmp = invIndex.getInvListElementCmp();
+
+        resultFrameTupleAcc.reset(prevCurrentBuffer);
+        resultFrameTupleApp.reset(newCurrentBuffer, true);
+
+        while (resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+            resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+            int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+                    resultTuple.getFieldStart(resultTuple.getFieldCount() - 1));
+
+            if (invListCursor.containsKey(resultTuple, invListCmp)) {
+                count++;
+                newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+            } else {
+                if (count + numQueryTokens - invListIx > occurrenceThreshold) {
+                    newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+                }
+            }
+
+            resultTidx++;
+            if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+                prevBufIdx++;
+                if (prevBufIdx <= maxPrevBufIdx) {
+                    prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+                    resultFrameTupleAcc.reset(prevCurrentBuffer);
+                    resultTidx = 0;
+                }
+            }
+        }
+
+        return newBufIdx;
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java
new file mode 100644
index 0000000..b997dc9
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
+
+public class TOccurrenceSearcherSuffixScanOnly extends TOccurrenceSearcher {
+
+    public TOccurrenceSearcherSuffixScanOnly(IHyracksTaskContext ctx, InvertedIndex invIndex,
+            IBinaryTokenizer queryTokenizer) {
+        super(ctx, invIndex, queryTokenizer);
+    }
+
+    protected int mergeSuffixLists(int numPrefixTokens, int numQueryTokens, int maxPrevBufIdx) throws IOException {
+        for (int i = numPrefixTokens; i < numQueryTokens; i++) {
+            swap = prevResultBuffers;
+            prevResultBuffers = newResultBuffers;
+            newResultBuffers = swap;
+            currentNumResults = 0;
+
+            invListCursors.get(i).pinPagesSync();
+            maxPrevBufIdx = mergeSuffixListScan(invListCursors.get(i), prevResultBuffers, maxPrevBufIdx,
+                    newResultBuffers, i, numQueryTokens);
+            invListCursors.get(i).unpinPages();
+        }
+        return maxPrevBufIdx;
+    }
+
+    protected int mergeSuffixListScan(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers,
+            int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws IOException {
+
+        int newBufIdx = 0;
+        ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
+
+        int prevBufIdx = 0;
+        ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
+
+        boolean advanceCursor = true;
+        boolean advancePrevResult = false;
+        int resultTidx = 0;
+
+        MultiComparator invListCmp = invIndex.getInvListElementCmp();
+
+        resultFrameTupleAcc.reset(prevCurrentBuffer);
+        resultFrameTupleApp.reset(newCurrentBuffer, true);
+
+        while (invListCursor.hasNext() && resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+            if (advanceCursor)
+                invListCursor.next();
+
+            ITupleReference invListTuple = invListCursor.getTuple();
+
+            resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+
+            int cmp = invListCmp.compare(invListTuple, resultTuple);
+            if (cmp == 0) {
+                int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+                        resultTuple.getFieldStart(resultTuple.getFieldCount() - 1)) + 1;
+                newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+                advanceCursor = true;
+                advancePrevResult = true;
+            } else {
+                if (cmp < 0) {
+                    advanceCursor = true;
+                    advancePrevResult = false;
+                } else {
+                    int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+                            resultTuple.getFieldStart(resultTuple.getFieldCount() - 1));
+                    if (count + numQueryTokens - invListIx > occurrenceThreshold) {
+                        newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+                    }
+                    advanceCursor = false;
+                    advancePrevResult = true;
+                }
+            }
+
+            if (advancePrevResult) {
+                resultTidx++;
+                if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+                    prevBufIdx++;
+                    if (prevBufIdx <= maxPrevBufIdx) {
+                        prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+                        resultFrameTupleAcc.reset(prevCurrentBuffer);
+                        resultTidx = 0;
+                    }
+                }
+            }
+        }
+
+        // append remaining elements from previous result set
+        while (resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+            int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+                    resultTuple.getFieldStart(resultTuple.getFieldCount() - 1));
+            newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+
+            resultTidx++;
+            if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+                prevBufIdx++;
+                if (prevBufIdx <= maxPrevBufIdx) {
+                    prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+                    resultFrameTupleAcc.reset(prevCurrentBuffer);
+                    resultTidx = 0;
+                }
+            }
+        }
+
+        return newBufIdx;
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/ConjunctiveSearchModifier.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/ConjunctiveSearchModifier.java
new file mode 100644
index 0000000..55945be
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/ConjunctiveSearchModifier.java
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers;
+
+import java.util.Collections;
+import java.util.List;
+
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
+
+public class ConjunctiveSearchModifier implements IInvertedIndexSearchModifier {
+
+    @Override
+    public int getOccurrenceThreshold(List<IInvertedListCursor> invListCursors) {
+        return invListCursors.size();
+    }
+
+    @Override
+    public int getPrefixLists(List<IInvertedListCursor> invListCursors) {
+        Collections.sort(invListCursors);
+        return 1;
+    }
+
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/EditDistanceSearchModifier.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/EditDistanceSearchModifier.java
new file mode 100644
index 0000000..ac109b6
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/EditDistanceSearchModifier.java
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers;
+
+import java.util.Collections;
+import java.util.List;
+
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
+
+public class EditDistanceSearchModifier implements IInvertedIndexSearchModifier {
+
+    private int gramLength;
+    private int edThresh;
+
+    public EditDistanceSearchModifier(int gramLength, int edThresh) {
+        this.gramLength = gramLength;
+        this.edThresh = edThresh;
+    }
+
+    @Override
+    public int getOccurrenceThreshold(List<IInvertedListCursor> invListCursors) {
+        return invListCursors.size() - edThresh * gramLength;
+    }
+
+    @Override
+    public int getPrefixLists(List<IInvertedListCursor> invListCursors) {
+        Collections.sort(invListCursors);
+        return invListCursors.size() - getOccurrenceThreshold(invListCursors) + 1;
+    }
+
+    public int getGramLength() {
+        return gramLength;
+    }
+
+    public void setGramLength(int gramLength) {
+        this.gramLength = gramLength;
+    }
+
+    public int getEdThresh() {
+        return edThresh;
+    }
+
+    public void setEdThresh(int edThresh) {
+        this.edThresh = edThresh;
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/JaccardSearchModifier.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/JaccardSearchModifier.java
new file mode 100644
index 0000000..f42841c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/JaccardSearchModifier.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ * 
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers;
+
+import java.util.Collections;
+import java.util.List;
+
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
+
+public class JaccardSearchModifier implements IInvertedIndexSearchModifier {
+
+    private float jaccThresh;
+
+    public JaccardSearchModifier(float jaccThresh) {
+        this.jaccThresh = jaccThresh;
+    }
+
+    @Override
+    public int getOccurrenceThreshold(List<IInvertedListCursor> invListCursors) {
+        return (int) Math.floor((float) invListCursors.size() * jaccThresh);
+    }
+
+    @Override
+    public int getPrefixLists(List<IInvertedListCursor> invListCursors) {
+        Collections.sort(invListCursors);
+        if (invListCursors.size() == 0) {
+            return 0;
+        }
+        return invListCursors.size() - getOccurrenceThreshold(invListCursors) + 1;
+    }
+
+    public float getJaccThresh() {
+        return jaccThresh;
+    }
+
+    public void setJaccThresh(float jaccThresh) {
+        this.jaccThresh = jaccThresh;
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..bbb32d6
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
@@ -0,0 +1,78 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public abstract class AbstractUTF8StringBinaryTokenizer implements
+		IBinaryTokenizer {
+
+	protected byte[] data;
+	protected int start;
+	protected int length;
+	protected int tokenLength;
+	protected int index;
+	protected int utf8Length;
+
+	protected final IntArray tokensStart;
+	protected final IntArray tokensLength;
+	protected final IToken token;
+
+	protected final boolean ignoreTokenCount;
+	protected final boolean sourceHasTypeTag;
+
+	public AbstractUTF8StringBinaryTokenizer(boolean ignoreTokenCount,
+			boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+		this.ignoreTokenCount = ignoreTokenCount;
+		this.sourceHasTypeTag = sourceHasTypeTag;
+		if (!ignoreTokenCount) {
+			tokensStart = new IntArray();
+			tokensLength = new IntArray();
+		} else {
+			tokensStart = null;
+			tokensLength = null;
+		}
+		token = tokenFactory.createToken();
+	}
+
+	@Override
+	public IToken getToken() {
+		return token;
+	}
+
+	@Override
+	public void reset(byte[] data, int start, int length) {
+		this.start = start;
+		index = this.start;
+		if (sourceHasTypeTag) {
+			index++; // skip type tag
+		}
+		utf8Length = StringUtils.getUTFLen(data, index);
+		index += 2; // skip utf8 length indicator
+		this.data = data;
+		this.length = length + start;
+
+		tokenLength = 0;
+		if (!ignoreTokenCount) {
+			tokensStart.reset();
+			tokensLength.reset();
+		}
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java
new file mode 100644
index 0000000..92d6ac2
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java
@@ -0,0 +1,106 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public abstract class AbstractUTF8Token implements IToken {
+	public static final int GOLDEN_RATIO_32 = 0x09e3779b9;
+
+	protected int length;
+	protected int tokenLength;
+	protected int start;
+	protected int tokenCount;
+	protected byte[] data;
+	protected final byte tokenTypeTag;
+	protected final byte countTypeTag;
+
+	public AbstractUTF8Token() {
+		tokenTypeTag = -1;
+		countTypeTag = -1;
+	}
+
+	public AbstractUTF8Token(byte tokenTypeTag, byte countTypeTag) {
+		this.tokenTypeTag = tokenTypeTag;
+		this.countTypeTag = countTypeTag;
+	}
+
+	@Override
+	public byte[] getData() {
+		return data;
+	}
+
+	@Override
+	public int getLength() {
+		return length;
+	}
+
+	public int getLowerCaseUTF8Len(int size) {
+		int lowerCaseUTF8Len = 0;
+		int pos = start;
+		for (int i = 0; i < size; i++) {
+			char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+			lowerCaseUTF8Len += StringUtils.getModifiedUTF8Len(c);
+			pos += StringUtils.charSize(data, pos);
+		}
+		return lowerCaseUTF8Len;
+	}
+
+	@Override
+	public int getStart() {
+		return start;
+	}
+
+	@Override
+	public int getTokenLength() {
+		return tokenLength;
+	}
+
+	public void handleCountTypeTag(DataOutput dos) throws IOException {
+		if (countTypeTag > 0) {
+			dos.write(countTypeTag);
+		}
+	}
+
+	public void handleTokenTypeTag(DataOutput dos) throws IOException {
+		if (tokenTypeTag > 0) {
+			dos.write(tokenTypeTag);
+		}
+	}
+
+	@Override
+	public void reset(byte[] data, int start, int length, int tokenLength,
+			int tokenCount) {
+		this.data = data;
+		this.start = start;
+		this.length = length;
+		this.tokenLength = tokenLength;
+		this.tokenCount = tokenCount;
+	}
+
+	@Override
+	public void serializeTokenCount(DataOutput dos) throws IOException {
+		handleCountTypeTag(dos);
+		dos.writeInt(tokenCount);
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8TokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8TokenFactory.java
new file mode 100644
index 0000000..3b0b82d
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8TokenFactory.java
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public abstract class AbstractUTF8TokenFactory implements ITokenFactory {
+	private static final long serialVersionUID = 1L;
+	protected final byte tokenTypeTag;
+	protected final byte countTypeTag;
+
+	public AbstractUTF8TokenFactory() {
+		tokenTypeTag = -1;
+		countTypeTag = -1;
+	}
+
+	public AbstractUTF8TokenFactory(byte tokenTypeTag, byte countTypeTag) {
+		this.tokenTypeTag = tokenTypeTag;
+		this.countTypeTag = countTypeTag;
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
index 425436b..de9ad2c 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
@@ -1,116 +1,87 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- * 
+ * You may obtain a copy of the License at
+ *
  *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
  * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
  */
 
 package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
 
-import java.io.DataOutput;
-import java.io.IOException;
-
-import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
-import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
 import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
 
-public class DelimitedUTF8StringBinaryTokenizer implements IBinaryTokenizer {
+public class DelimitedUTF8StringBinaryTokenizer extends
+		AbstractUTF8StringBinaryTokenizer {
 
-    private static final RecordDescriptor tokenSchema = new RecordDescriptor(
-            new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE });
+	public DelimitedUTF8StringBinaryTokenizer(boolean ignoreTokenCount,
+			boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+		super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+	}
 
-    private final char delimiter;
-    private final byte typeTag;
-    private byte[] data;
-    private int start;
-    private int length;
+	@Override
+	public boolean hasNext() {
+		// skip delimiters
+		while (index < length && isSeparator(StringUtils.charAt(data, index))) {
+			index += StringUtils.charSize(data, index);
+		}
+		return index < length;
+	}
 
-    private int tokenLength;
-    private int tokenStart;
-    private int pos;
+	private boolean isSeparator(char c) {
+		return !(Character.isLetterOrDigit(c)
+				|| Character.getType(c) == Character.OTHER_LETTER || Character
+				.getType(c) == Character.OTHER_NUMBER);
+	}
 
-    public DelimitedUTF8StringBinaryTokenizer(char delimiter, byte typeTag) {
-        this.delimiter = delimiter;
-        this.typeTag = typeTag;
-    }
+	@Override
+	public void next() {
+		tokenLength = 0;
+		int currentTokenStart = index;
+		while (index < length && !isSeparator(StringUtils.charAt(data, index))) {
+			index += StringUtils.charSize(data, index);
+			tokenLength++;
+		}
+		int tokenCount = 1;
+		if (tokenLength > 0 && !ignoreTokenCount) {
+			// search if we got the same token before
+			for (int i = 0; i < tokensStart.length(); ++i) {
+				if (tokenLength == tokensLength.get(i)) {
+					int tokenStart = tokensStart.get(i);
+					tokenCount++; // assume we found it
+					int offset = 0;
+					int currLength = 0;
+					while (currLength < tokenLength) {
+						// case insensitive comparison
+						if (StringUtils.toLowerCase(StringUtils.charAt(data,
+								currentTokenStart + offset)) != StringUtils
+								.toLowerCase(StringUtils.charAt(data,
+										tokenStart + offset))) {
+							tokenCount--;
+							break;
+						}
+						offset += StringUtils.charSize(data, currentTokenStart
+								+ offset);
+						currLength++;
+					}
+				}
+			}
+			// add the new token to the list of seen tokens
+			tokensStart.add(currentTokenStart);
+			tokensLength.add(tokenLength);
+		}
 
-    public DelimitedUTF8StringBinaryTokenizer(char delimiter) {
-        this.delimiter = delimiter;
-        this.typeTag = -1;
-    }
-
-    @Override
-    public int getTokenLength() {
-        return tokenLength;
-    }
-
-    @Override
-    public int getTokenStartOff() {
-        return tokenStart;
-    }
-
-    @Override
-    public boolean hasNext() {
-        if (pos >= start + length)
-            return false;
-        else
-            return true;
-    }
-
-    @Override
-    public void next() {
-        tokenLength = 0;
-        tokenStart = pos;
-        while (pos < start + length) {
-            int len = StringUtils.charSize(data, pos);
-            char ch = StringUtils.charAt(data, pos);
-            pos += len;
-            if (ch == delimiter) {
-                break;
-            }
-            tokenLength += len;
-        }
-    }
-
-    @Override
-    public void reset(byte[] data, int start, int length) {
-        this.data = data;
-        this.start = start;
-        this.pos = start;
-        this.length = length;
-        this.tokenLength = 0;
-        this.tokenStart = 0;
-        pos += 2; // UTF-8 specific
-    }
-
-    @Override
-    public void writeToken(DataOutput dos) throws IOException {
-        if (typeTag > 0)
-            dos.write(typeTag);
-
-        // WARNING: 2-byte length indicator is specific to UTF-8
-        dos.writeShort((short) tokenLength);
-        dos.write(data, tokenStart, tokenLength);
-    }
-
-    @Override
-    public RecordDescriptor getTokenSchema() {
-        return tokenSchema;
-    }
-
-    // cannot be implemented for this tokenizer
-    @Override
-    public int getNumTokens() {
-        return -1;
-    }
+		// set token
+		token.reset(data, currentTokenStart, index, tokenLength, tokenCount);
+	}
 }
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
index 2e85db5..4a350b3 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
@@ -1,41 +1,42 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- * 
+ * You may obtain a copy of the License at
+ *
  *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
  * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
  */
 
 package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
 
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
+public class DelimitedUTF8StringBinaryTokenizerFactory implements
+		IBinaryTokenizerFactory {
 
-public class DelimitedUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
+	private static final long serialVersionUID = 1L;
+	private final boolean ignoreTokenCount;
+	private final boolean sourceHasTypeTag;
+	private final ITokenFactory tokenFactory;
 
-    private static final long serialVersionUID = 1L;
-    private final char delimiter;
-    private final byte typeTag;
+	public DelimitedUTF8StringBinaryTokenizerFactory(boolean ignoreTokenCount,
+			boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+		this.ignoreTokenCount = ignoreTokenCount;
+		this.sourceHasTypeTag = sourceHasTypeTag;
+		this.tokenFactory = tokenFactory;
+	}
 
-    public DelimitedUTF8StringBinaryTokenizerFactory(char delimiter, byte typeTag) {
-        this.delimiter = delimiter;
-        this.typeTag = typeTag;
-    }
-
-    public DelimitedUTF8StringBinaryTokenizerFactory(char delimiter) {
-        this.delimiter = delimiter;
-        this.typeTag = -1;
-    }
-
-    @Override
-    public IBinaryTokenizer createBinaryTokenizer() {
-        return new DelimitedUTF8StringBinaryTokenizer(delimiter, typeTag);
-    }
+	@Override
+	public IBinaryTokenizer createTokenizer() {
+		return new DelimitedUTF8StringBinaryTokenizer(ignoreTokenCount,
+				sourceHasTypeTag, tokenFactory);
+	}
 }
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java
deleted file mode 100644
index 2cc0b6c..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
-import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
-import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-
-public class HashedQGramUTF8StringBinaryTokenizer implements IBinaryTokenizer {
-
-    private static final RecordDescriptor tokenSchema = new RecordDescriptor(
-            new ISerializerDeserializer[] { IntegerSerializerDeserializer.INSTANCE });
-
-    private final boolean prePost;
-    private final int q;
-    private byte[] data;
-    private int start;
-    private int length;
-    private int gramNum;
-    private int utflen;
-
-    private final char PRECHAR = '#';
-    private final char POSTCHAR = '$';
-
-    private int charPos;
-    private int pos;
-    private int hashedGram;
-
-    HashedQGramUTF8StringBinaryTokenizer(int q, boolean prePost) {
-        this.prePost = prePost;
-        this.q = q;
-    }
-
-    @Override
-    public int getTokenLength() {
-        // the produced token (hashed q-gram) is derived from data
-        // but not contained in it
-        // therefore this call does not make sense
-        return -1;
-    }
-
-    @Override
-    public int getTokenStartOff() {
-        // the produced token (hashed q-gram) is derived from data
-        // but not contained in it
-        // therefore this call does not make sense
-        return -1;
-    }
-
-    @Override
-    public boolean hasNext() {
-        if ((prePost && pos >= start + length) || (!prePost && pos >= start + length - q))
-            return false;
-        else
-            return true;
-    }
-
-    @Override
-    public void next() {
-        hashedGram = 0;
-        if (prePost) {
-            if (gramNum < q) {
-                for (int i = 0; i < q - gramNum; i++) {
-                    hashedGram = 31 * hashedGram + PRECHAR;
-                }
-
-                int tmpPos = pos;
-                for (int i = 0; i < gramNum; i++) {
-                    hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
-                    tmpPos += StringUtils.charSize(data, tmpPos);
-                }
-            } else {
-                int stopStr = Math.min(charPos + q, utflen);
-                int tmpPos = pos;
-                for (int i = charPos; i < stopStr; i++) {
-                    hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
-                    tmpPos += StringUtils.charSize(data, tmpPos);
-                }
-
-                int stopPost = (charPos + q) - (utflen);
-                for (int i = 0; i < stopPost; i++) {
-                    hashedGram = 31 * hashedGram + POSTCHAR;
-                }
-                pos += StringUtils.charSize(data, pos);
-                charPos++;
-            }
-            gramNum++;
-        } else {
-            int tmpPos = pos;
-            for (int i = charPos; i < charPos + q; i++) {
-                hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
-                tmpPos += StringUtils.charSize(data, tmpPos);
-            }
-            pos += StringUtils.charSize(data, pos);
-            charPos++;
-        }
-    }
-
-    @Override
-    public void reset(byte[] data, int start, int length) {
-        this.data = data;
-        this.start = start;
-        this.length = length;
-        this.utflen = StringUtils.getUTFLen(data, start);
-        this.pos = start + 2; // UTF-8 specific
-        this.gramNum = 1;
-        this.charPos = 0;
-    }
-
-    @Override
-    public void writeToken(DataOutput dos) throws IOException {
-        dos.writeInt(hashedGram);
-    }
-
-    public char getPreChar() {
-        return PRECHAR;
-    }
-
-    public char getPostChar() {
-        return POSTCHAR;
-    }
-
-    @Override
-    public RecordDescriptor getTokenSchema() {
-        return tokenSchema;
-    }
-
-    @Override
-    public int getNumTokens() {
-        return 0;
-    }
-}
\ No newline at end of file
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java
deleted file mode 100644
index a11fe8a..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
-
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
-
-public class HashedQGramUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
-
-    private static final long serialVersionUID = 1L;
-    private final int q;
-    private final boolean prePost;
-
-    public HashedQGramUTF8StringBinaryTokenizerFactory(int q, boolean prePost) {
-        this.q = q;
-        this.prePost = prePost;
-    }
-
-    @Override
-    public IBinaryTokenizer createBinaryTokenizer() {
-        return new HashedQGramUTF8StringBinaryTokenizer(q, prePost);
-    }
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java
new file mode 100644
index 0000000..25d1a2c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java
@@ -0,0 +1,64 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class HashedUTF8NGramToken extends UTF8NGramToken {
+	public HashedUTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public void serializeToken(DataOutput dos) throws IOException {
+		handleTokenTypeTag(dos);
+
+		int hash = GOLDEN_RATIO_32;
+
+		// pre chars
+		for (int i = 0; i < numPreChars; i++) {
+			hash ^= PRECHAR;
+			hash *= GOLDEN_RATIO_32;
+		}
+
+		// regular chars
+		int numRegGrams = tokenLength - numPreChars - numPostChars;
+		int pos = start;
+		for (int i = 0; i < numRegGrams; i++) {
+			hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+			hash *= GOLDEN_RATIO_32;
+			pos += StringUtils.charSize(data, pos);
+		}
+
+		// post chars
+		for (int i = 0; i < numPostChars; i++) {
+			hash ^= POSTCHAR;
+			hash *= GOLDEN_RATIO_32;
+		}
+
+		// token count
+		hash += tokenCount;
+
+		dos.writeInt(hash);
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java
new file mode 100644
index 0000000..4a87793
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class HashedUTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
+
+	private static final long serialVersionUID = 1L;
+
+	public HashedUTF8NGramTokenFactory() {
+		super();
+	}
+
+	public HashedUTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public IToken createToken() {
+		return new HashedUTF8NGramToken(tokenTypeTag, countTypeTag);
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java
new file mode 100644
index 0000000..55237ce
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java
@@ -0,0 +1,88 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class HashedUTF8WordToken extends UTF8WordToken {
+
+	private int hash = 0;
+
+	public HashedUTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public boolean equals(Object o) {
+		if (o == null) {
+			return false;
+		}
+		if (!(o instanceof IToken)) {
+			return false;
+		}
+		IToken t = (IToken) o;
+		if (t.getTokenLength() != tokenLength) {
+			return false;
+		}
+		int offset = 0;
+		for (int i = 0; i < tokenLength; i++) {
+			if (StringUtils.charAt(t.getData(), t.getStart() + offset) != StringUtils
+					.charAt(data, start + offset)) {
+				return false;
+			}
+			offset += StringUtils.charSize(data, start + offset);
+		}
+		return true;
+	}
+
+	@Override
+	public int hashCode() {
+		return hash;
+	}
+
+	@Override
+	public void reset(byte[] data, int start, int length, int tokenLength,
+			int tokenCount) {
+		super.reset(data, start, length, tokenLength, tokenCount);
+
+		// pre-compute hash value using JAQL-like string hashing
+		int pos = start;
+		hash = GOLDEN_RATIO_32;
+		for (int i = 0; i < tokenLength; i++) {
+			hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+			hash *= GOLDEN_RATIO_32;
+			pos += StringUtils.charSize(data, pos);
+		}
+		hash += tokenCount;
+	}
+
+	@Override
+	public void serializeToken(DataOutput dos) throws IOException {
+		if (tokenTypeTag > 0) {
+			dos.write(tokenTypeTag);
+		}
+
+		// serialize hash value
+		dos.writeInt(hash);
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java
new file mode 100644
index 0000000..318f041
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class HashedUTF8WordTokenFactory extends AbstractUTF8TokenFactory {
+
+	private static final long serialVersionUID = 1L;
+
+	public HashedUTF8WordTokenFactory() {
+		super();
+	}
+
+	public HashedUTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public IToken createToken() {
+		return new HashedUTF8WordToken(tokenTypeTag, countTypeTag);
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizer.java
new file mode 100644
index 0000000..05c6d0b
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizer.java
@@ -0,0 +1,30 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public interface IBinaryTokenizer {
+	public IToken getToken();
+
+	public boolean hasNext();
+
+	public void next();
+
+	public void reset(byte[] data, int start, int length);
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizerFactory.java
new file mode 100644
index 0000000..bfe78ee
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizerFactory.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.Serializable;
+
+public interface IBinaryTokenizerFactory extends Serializable {
+	public IBinaryTokenizer createTokenizer();
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/INGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/INGramToken.java
new file mode 100644
index 0000000..befc6d2
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/INGramToken.java
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public interface INGramToken {
+	public int getNumPostChars();
+
+	public int getNumPreChars();
+
+	public void setNumPrePostChars(int numPreChars, int numPostChars);
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IToken.java
new file mode 100644
index 0000000..c1840d7
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IToken.java
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public interface IToken {
+	public byte[] getData();
+
+	public int getLength();
+
+	public int getStart();
+
+	public int getTokenLength();
+
+	public void reset(byte[] data, int start, int length, int tokenLength,
+			int tokenCount);
+
+	public void serializeToken(DataOutput dos) throws IOException;
+
+	public void serializeTokenCount(DataOutput dos) throws IOException;
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/ITokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/ITokenFactory.java
new file mode 100644
index 0000000..8b5d71d
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/ITokenFactory.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.Serializable;
+
+public interface ITokenFactory extends Serializable {
+    public IToken createToken();
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IntArray.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IntArray.java
new file mode 100644
index 0000000..2eb9ff4
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IntArray.java
@@ -0,0 +1,80 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.util.Arrays;
+
+public class IntArray {
+    private static final int SIZE = 128;
+
+    private int[] data;
+    private int length;
+
+    public IntArray() {
+        data = new int[SIZE];
+        length = 0;
+    }
+
+    public void add(int d) {
+        if (length == data.length) {
+            data = Arrays.copyOf(data, data.length << 1);
+        }
+        data[length++] = d;
+    }
+
+    public int[] get() {
+        return data;
+    }
+
+    public int get(int i) {
+        return data[i];
+    }
+
+    public int length() {
+        return length;
+    }
+
+    public void reset() {
+        length = 0;
+    }
+
+    public void sort() {
+        sort(0, length);
+    }
+
+    public void sort(int start, int end) {
+        Arrays.sort(data, start, end);
+    }
+
+    @Override
+    public String toString() {
+        StringBuilder out = new StringBuilder();
+        out.append('[');
+        for (int i = 0; i < length; ++i) {
+            out.append(data[i]);
+            if (i < length - 1) {
+                out.append(',');
+                out.append(' ');
+            }
+        }
+        out.append(']');
+        return out.toString();
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..2a13f83
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
@@ -0,0 +1,123 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class NGramUTF8StringBinaryTokenizer extends
+		AbstractUTF8StringBinaryTokenizer {
+
+	private int gramLength;
+	private boolean usePrePost;
+
+	private int gramNum;
+	private int totalGrams;
+
+	private final INGramToken concreteToken;
+
+	public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost,
+			boolean ignoreTokenCount, boolean sourceHasTypeTag,
+			ITokenFactory tokenFactory) {
+		super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+		this.gramLength = gramLength;
+		this.usePrePost = usePrePost;
+		concreteToken = (INGramToken) token;
+	}
+
+	@Override
+	public boolean hasNext() {
+		if (gramNum < totalGrams) {
+			return true;
+		} else {
+			return false;
+		}
+	}
+
+	@Override
+	public void next() {
+		int currentTokenStart = index;
+		int tokenCount = 1;
+		int numPreChars = 0;
+		int numPostChars = 0;
+		if (usePrePost) {
+			numPreChars = Math.max(gramLength - gramNum - 1, 0);
+			numPostChars = (gramNum > totalGrams - gramLength) ? gramLength
+					- totalGrams + gramNum : 0;
+		}
+		gramNum++;
+
+		concreteToken.setNumPrePostChars(numPreChars, numPostChars);
+		if (numPreChars == 0) {
+			index += StringUtils.charSize(data, index);
+		}
+
+		// compute token count
+		// ignore pre and post grams for duplicate detection
+		if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
+			int tmpIndex = start;
+			while (tmpIndex < currentTokenStart) {
+				tokenCount++; // assume found
+				int offset = 0;
+				for (int j = 0; j < gramLength; j++) {
+					if (StringUtils.toLowerCase(StringUtils.charAt(data,
+							currentTokenStart + offset)) != StringUtils
+							.toLowerCase(StringUtils.charAt(data, tmpIndex
+									+ offset))) {
+						tokenCount--;
+						break;
+					}
+					offset += StringUtils.charSize(data, tmpIndex + offset);
+				}
+				tmpIndex += StringUtils.charSize(data, tmpIndex);
+			}
+		}
+
+		// set token
+		token.reset(data, currentTokenStart, length, gramLength, tokenCount);
+	}
+
+	@Override
+	public void reset(byte[] data, int start, int length) {
+		super.reset(data, start, length);
+		gramNum = 0;
+
+		int numChars = 0;
+		int pos = index;
+		int end = pos + utf8Length;
+		while (pos < end) {
+			numChars++;
+			pos += StringUtils.charSize(data, pos);
+		}
+
+		if (usePrePost) {
+			totalGrams = numChars + gramLength - 1;
+		} else {
+			totalGrams = numChars - gramLength + 1;
+		}
+	}
+
+	public void setGramlength(int gramLength) {
+		this.gramLength = gramLength;
+	}
+
+	public void setPrePost(boolean usePrePost) {
+		this.usePrePost = usePrePost;
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java
new file mode 100644
index 0000000..6b6406f
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java
@@ -0,0 +1,86 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class UTF8NGramToken extends AbstractUTF8Token implements INGramToken {
+
+	public final static char PRECHAR = '#';
+
+	public final static char POSTCHAR = '$';
+
+	protected int numPreChars;
+	protected int numPostChars;
+
+	public UTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public int getNumPostChars() {
+		return numPreChars;
+	}
+
+	@Override
+	public int getNumPreChars() {
+		return numPostChars;
+	}
+
+	@Override
+	public void serializeToken(DataOutput dos) throws IOException {
+		handleTokenTypeTag(dos);
+
+		// regular chars
+		int numRegChars = tokenLength - numPreChars - numPostChars;
+
+		// assuming pre and post char need 1-byte each in utf8
+		int tokenUTF8Len = getLowerCaseUTF8Len(numRegChars) + numPreChars
+				+ numPostChars;
+
+		// write utf8 length indicator
+		StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+
+		// pre chars
+		for (int i = 0; i < numPreChars; i++) {
+			StringUtils.writeCharAsModifiedUTF8(PRECHAR, dos);
+		}
+
+		int pos = start;
+		for (int i = 0; i < numRegChars; i++) {
+			char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+			StringUtils.writeCharAsModifiedUTF8(c, dos);
+			pos += StringUtils.charSize(data, pos);
+		}
+
+		// post chars
+		for (int i = 0; i < numPostChars; i++) {
+			StringUtils.writeCharAsModifiedUTF8(POSTCHAR, dos);
+		}
+	}
+
+	public void setNumPrePostChars(int numPreChars, int numPostChars) {
+		this.numPreChars = numPreChars;
+		this.numPostChars = numPostChars;
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramTokenFactory.java
new file mode 100644
index 0000000..968d8e1
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramTokenFactory.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class UTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
+
+	private static final long serialVersionUID = 1L;
+
+	public UTF8NGramTokenFactory() {
+		super();
+	}
+
+	public UTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public IToken createToken() {
+		return new UTF8NGramToken(tokenTypeTag, countTypeTag);
+	}
+
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java
new file mode 100644
index 0000000..25e0cd3
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class UTF8WordToken extends AbstractUTF8Token {
+
+	public UTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public void serializeToken(DataOutput dos) throws IOException {
+		handleTokenTypeTag(dos);
+
+		int tokenUTF8Len = getLowerCaseUTF8Len(tokenLength);
+		StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+		int pos = start;
+		for (int i = 0; i < tokenLength; i++) {
+			char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+			StringUtils.writeCharAsModifiedUTF8(c, dos);
+			pos += StringUtils.charSize(data, pos);
+		}
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordTokenFactory.java
new file mode 100644
index 0000000..4358254
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordTokenFactory.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class UTF8WordTokenFactory extends AbstractUTF8TokenFactory {
+
+	private static final long serialVersionUID = 1L;
+
+	public UTF8WordTokenFactory() {
+		super();
+	}
+
+	public UTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public IToken createToken() {
+		return new UTF8WordToken(tokenTypeTag, countTypeTag);
+	}
+
+}