Removed dependency on fuzzyjoin. Moved tokenizers into invertedindex package.

git-svn-id: https://hyracks.googlecode.com/svn/branches/hyracks_indexes@523 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-storage-am-invertedindex/pom.xml b/hyracks-storage-am-invertedindex/pom.xml
index 34f1135..769a3f4 100644
--- a/hyracks-storage-am-invertedindex/pom.xml
+++ b/hyracks-storage-am-invertedindex/pom.xml
@@ -58,14 +58,7 @@
   		<version>0.1.5</version>
   		<type>jar</type>
   		<scope>compile</scope>
-  	</dependency>  	
-    <dependency>
-        <groupId>edu.uci.ics.fuzzyjoin</groupId>
-        <artifactId>fuzzyjoin-core</artifactId>
-        <version>0.0.3</version>
-        <type>jar</type>
-        <scope>compile</scope>
-    </dependency>
+  	</dependency>    
   	<dependency>
   		<groupId>junit</groupId>
   		<artifactId>junit</artifactId>
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
index 1533307..83246d6 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
@@ -15,7 +15,6 @@
 
 package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;
 
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizerFactory;
 import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
 import edu.uci.ics.hyracks.api.dataflow.IOperatorNodePushable;
 import edu.uci.ics.hyracks.api.dataflow.value.IRecordDescriptorProvider;
@@ -24,6 +23,7 @@
 import edu.uci.ics.hyracks.api.job.IOperatorEnvironment;
 import edu.uci.ics.hyracks.api.job.JobSpecification;
 import edu.uci.ics.hyracks.dataflow.std.base.AbstractSingleActivityOperatorDescriptor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizerFactory;
 
 public class BinaryTokenizerOperatorDescriptor extends AbstractSingleActivityOperatorDescriptor {
 
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
index d470513..0647f45 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
@@ -19,8 +19,6 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
-import edu.uci.ics.fuzzyjoin.tokenizer.IToken;
 import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
 import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
 import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
@@ -29,6 +27,8 @@
 import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
 import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils;
 import edu.uci.ics.hyracks.dataflow.std.base.AbstractUnaryInputUnaryOutputOperatorNodePushable;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
 
 public class BinaryTokenizerOperatorNodePushable extends AbstractUnaryInputUnaryOutputOperatorNodePushable {
 
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
index 5585856..d1fba3b 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
@@ -15,17 +15,12 @@
 
 package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
 
-import java.io.ByteArrayInputStream;
-import java.io.DataInput;
-import java.io.DataInputStream;
 import java.io.DataOutput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.List;
 
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
-import edu.uci.ics.fuzzyjoin.tokenizer.IToken;
 import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
 import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
 import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
@@ -52,6 +47,8 @@
 import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
 import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearcher;
 import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
 
 public class TOccurrenceSearcher implements IInvertedIndexSearcher {
 
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java
index 18b870b..30d67f0 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java
@@ -19,11 +19,11 @@
 import java.nio.ByteBuffer;
 import java.util.List;
 
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
 import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
 import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
 import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
 import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
 
 public class TOccurrenceSearcherSuffixProbeOnly extends TOccurrenceSearcher {
 
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java
index 604c68d..f8bc1ab 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java
@@ -19,12 +19,12 @@
 import java.nio.ByteBuffer;
 import java.util.List;
 
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
 import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
 import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
 import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
 import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
 import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
 
 public class TOccurrenceSearcherSuffixScanOnly extends TOccurrenceSearcher {
 
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..bbb32d6
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
@@ -0,0 +1,78 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public abstract class AbstractUTF8StringBinaryTokenizer implements
+		IBinaryTokenizer {
+
+	protected byte[] data;
+	protected int start;
+	protected int length;
+	protected int tokenLength;
+	protected int index;
+	protected int utf8Length;
+
+	protected final IntArray tokensStart;
+	protected final IntArray tokensLength;
+	protected final IToken token;
+
+	protected final boolean ignoreTokenCount;
+	protected final boolean sourceHasTypeTag;
+
+	public AbstractUTF8StringBinaryTokenizer(boolean ignoreTokenCount,
+			boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+		this.ignoreTokenCount = ignoreTokenCount;
+		this.sourceHasTypeTag = sourceHasTypeTag;
+		if (!ignoreTokenCount) {
+			tokensStart = new IntArray();
+			tokensLength = new IntArray();
+		} else {
+			tokensStart = null;
+			tokensLength = null;
+		}
+		token = tokenFactory.createToken();
+	}
+
+	@Override
+	public IToken getToken() {
+		return token;
+	}
+
+	@Override
+	public void reset(byte[] data, int start, int length) {
+		this.start = start;
+		index = this.start;
+		if (sourceHasTypeTag) {
+			index++; // skip type tag
+		}
+		utf8Length = StringUtils.getUTFLen(data, index);
+		index += 2; // skip utf8 length indicator
+		this.data = data;
+		this.length = length + start;
+
+		tokenLength = 0;
+		if (!ignoreTokenCount) {
+			tokensStart.reset();
+			tokensLength.reset();
+		}
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java
new file mode 100644
index 0000000..92d6ac2
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java
@@ -0,0 +1,106 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public abstract class AbstractUTF8Token implements IToken {
+	public static final int GOLDEN_RATIO_32 = 0x09e3779b9;
+
+	protected int length;
+	protected int tokenLength;
+	protected int start;
+	protected int tokenCount;
+	protected byte[] data;
+	protected final byte tokenTypeTag;
+	protected final byte countTypeTag;
+
+	public AbstractUTF8Token() {
+		tokenTypeTag = -1;
+		countTypeTag = -1;
+	}
+
+	public AbstractUTF8Token(byte tokenTypeTag, byte countTypeTag) {
+		this.tokenTypeTag = tokenTypeTag;
+		this.countTypeTag = countTypeTag;
+	}
+
+	@Override
+	public byte[] getData() {
+		return data;
+	}
+
+	@Override
+	public int getLength() {
+		return length;
+	}
+
+	public int getLowerCaseUTF8Len(int size) {
+		int lowerCaseUTF8Len = 0;
+		int pos = start;
+		for (int i = 0; i < size; i++) {
+			char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+			lowerCaseUTF8Len += StringUtils.getModifiedUTF8Len(c);
+			pos += StringUtils.charSize(data, pos);
+		}
+		return lowerCaseUTF8Len;
+	}
+
+	@Override
+	public int getStart() {
+		return start;
+	}
+
+	@Override
+	public int getTokenLength() {
+		return tokenLength;
+	}
+
+	public void handleCountTypeTag(DataOutput dos) throws IOException {
+		if (countTypeTag > 0) {
+			dos.write(countTypeTag);
+		}
+	}
+
+	public void handleTokenTypeTag(DataOutput dos) throws IOException {
+		if (tokenTypeTag > 0) {
+			dos.write(tokenTypeTag);
+		}
+	}
+
+	@Override
+	public void reset(byte[] data, int start, int length, int tokenLength,
+			int tokenCount) {
+		this.data = data;
+		this.start = start;
+		this.length = length;
+		this.tokenLength = tokenLength;
+		this.tokenCount = tokenCount;
+	}
+
+	@Override
+	public void serializeTokenCount(DataOutput dos) throws IOException {
+		handleCountTypeTag(dos);
+		dos.writeInt(tokenCount);
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8TokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8TokenFactory.java
new file mode 100644
index 0000000..3b0b82d
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8TokenFactory.java
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public abstract class AbstractUTF8TokenFactory implements ITokenFactory {
+	private static final long serialVersionUID = 1L;
+	protected final byte tokenTypeTag;
+	protected final byte countTypeTag;
+
+	public AbstractUTF8TokenFactory() {
+		tokenTypeTag = -1;
+		countTypeTag = -1;
+	}
+
+	public AbstractUTF8TokenFactory(byte tokenTypeTag, byte countTypeTag) {
+		this.tokenTypeTag = tokenTypeTag;
+		this.countTypeTag = countTypeTag;
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..de9ad2c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
@@ -0,0 +1,87 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class DelimitedUTF8StringBinaryTokenizer extends
+		AbstractUTF8StringBinaryTokenizer {
+
+	public DelimitedUTF8StringBinaryTokenizer(boolean ignoreTokenCount,
+			boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+		super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+	}
+
+	@Override
+	public boolean hasNext() {
+		// skip delimiters
+		while (index < length && isSeparator(StringUtils.charAt(data, index))) {
+			index += StringUtils.charSize(data, index);
+		}
+		return index < length;
+	}
+
+	private boolean isSeparator(char c) {
+		return !(Character.isLetterOrDigit(c)
+				|| Character.getType(c) == Character.OTHER_LETTER || Character
+				.getType(c) == Character.OTHER_NUMBER);
+	}
+
+	@Override
+	public void next() {
+		tokenLength = 0;
+		int currentTokenStart = index;
+		while (index < length && !isSeparator(StringUtils.charAt(data, index))) {
+			index += StringUtils.charSize(data, index);
+			tokenLength++;
+		}
+		int tokenCount = 1;
+		if (tokenLength > 0 && !ignoreTokenCount) {
+			// search if we got the same token before
+			for (int i = 0; i < tokensStart.length(); ++i) {
+				if (tokenLength == tokensLength.get(i)) {
+					int tokenStart = tokensStart.get(i);
+					tokenCount++; // assume we found it
+					int offset = 0;
+					int currLength = 0;
+					while (currLength < tokenLength) {
+						// case insensitive comparison
+						if (StringUtils.toLowerCase(StringUtils.charAt(data,
+								currentTokenStart + offset)) != StringUtils
+								.toLowerCase(StringUtils.charAt(data,
+										tokenStart + offset))) {
+							tokenCount--;
+							break;
+						}
+						offset += StringUtils.charSize(data, currentTokenStart
+								+ offset);
+						currLength++;
+					}
+				}
+			}
+			// add the new token to the list of seen tokens
+			tokensStart.add(currentTokenStart);
+			tokensLength.add(tokenLength);
+		}
+
+		// set token
+		token.reset(data, currentTokenStart, index, tokenLength, tokenCount);
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
new file mode 100644
index 0000000..4a350b3
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
@@ -0,0 +1,42 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class DelimitedUTF8StringBinaryTokenizerFactory implements
+		IBinaryTokenizerFactory {
+
+	private static final long serialVersionUID = 1L;
+	private final boolean ignoreTokenCount;
+	private final boolean sourceHasTypeTag;
+	private final ITokenFactory tokenFactory;
+
+	public DelimitedUTF8StringBinaryTokenizerFactory(boolean ignoreTokenCount,
+			boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+		this.ignoreTokenCount = ignoreTokenCount;
+		this.sourceHasTypeTag = sourceHasTypeTag;
+		this.tokenFactory = tokenFactory;
+	}
+
+	@Override
+	public IBinaryTokenizer createTokenizer() {
+		return new DelimitedUTF8StringBinaryTokenizer(ignoreTokenCount,
+				sourceHasTypeTag, tokenFactory);
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java
new file mode 100644
index 0000000..25d1a2c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java
@@ -0,0 +1,64 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class HashedUTF8NGramToken extends UTF8NGramToken {
+	public HashedUTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public void serializeToken(DataOutput dos) throws IOException {
+		handleTokenTypeTag(dos);
+
+		int hash = GOLDEN_RATIO_32;
+
+		// pre chars
+		for (int i = 0; i < numPreChars; i++) {
+			hash ^= PRECHAR;
+			hash *= GOLDEN_RATIO_32;
+		}
+
+		// regular chars
+		int numRegGrams = tokenLength - numPreChars - numPostChars;
+		int pos = start;
+		for (int i = 0; i < numRegGrams; i++) {
+			hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+			hash *= GOLDEN_RATIO_32;
+			pos += StringUtils.charSize(data, pos);
+		}
+
+		// post chars
+		for (int i = 0; i < numPostChars; i++) {
+			hash ^= POSTCHAR;
+			hash *= GOLDEN_RATIO_32;
+		}
+
+		// token count
+		hash += tokenCount;
+
+		dos.writeInt(hash);
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java
new file mode 100644
index 0000000..4a87793
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class HashedUTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
+
+	private static final long serialVersionUID = 1L;
+
+	public HashedUTF8NGramTokenFactory() {
+		super();
+	}
+
+	public HashedUTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public IToken createToken() {
+		return new HashedUTF8NGramToken(tokenTypeTag, countTypeTag);
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java
new file mode 100644
index 0000000..55237ce
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java
@@ -0,0 +1,88 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class HashedUTF8WordToken extends UTF8WordToken {
+
+	private int hash = 0;
+
+	public HashedUTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public boolean equals(Object o) {
+		if (o == null) {
+			return false;
+		}
+		if (!(o instanceof IToken)) {
+			return false;
+		}
+		IToken t = (IToken) o;
+		if (t.getTokenLength() != tokenLength) {
+			return false;
+		}
+		int offset = 0;
+		for (int i = 0; i < tokenLength; i++) {
+			if (StringUtils.charAt(t.getData(), t.getStart() + offset) != StringUtils
+					.charAt(data, start + offset)) {
+				return false;
+			}
+			offset += StringUtils.charSize(data, start + offset);
+		}
+		return true;
+	}
+
+	@Override
+	public int hashCode() {
+		return hash;
+	}
+
+	@Override
+	public void reset(byte[] data, int start, int length, int tokenLength,
+			int tokenCount) {
+		super.reset(data, start, length, tokenLength, tokenCount);
+
+		// pre-compute hash value using JAQL-like string hashing
+		int pos = start;
+		hash = GOLDEN_RATIO_32;
+		for (int i = 0; i < tokenLength; i++) {
+			hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+			hash *= GOLDEN_RATIO_32;
+			pos += StringUtils.charSize(data, pos);
+		}
+		hash += tokenCount;
+	}
+
+	@Override
+	public void serializeToken(DataOutput dos) throws IOException {
+		if (tokenTypeTag > 0) {
+			dos.write(tokenTypeTag);
+		}
+
+		// serialize hash value
+		dos.writeInt(hash);
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java
new file mode 100644
index 0000000..318f041
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class HashedUTF8WordTokenFactory extends AbstractUTF8TokenFactory {
+
+	private static final long serialVersionUID = 1L;
+
+	public HashedUTF8WordTokenFactory() {
+		super();
+	}
+
+	public HashedUTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public IToken createToken() {
+		return new HashedUTF8WordToken(tokenTypeTag, countTypeTag);
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizer.java
new file mode 100644
index 0000000..05c6d0b
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizer.java
@@ -0,0 +1,30 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public interface IBinaryTokenizer {
+	public IToken getToken();
+
+	public boolean hasNext();
+
+	public void next();
+
+	public void reset(byte[] data, int start, int length);
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizerFactory.java
new file mode 100644
index 0000000..bfe78ee
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizerFactory.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.Serializable;
+
+public interface IBinaryTokenizerFactory extends Serializable {
+	public IBinaryTokenizer createTokenizer();
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/INGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/INGramToken.java
new file mode 100644
index 0000000..befc6d2
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/INGramToken.java
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public interface INGramToken {
+	public int getNumPostChars();
+
+	public int getNumPreChars();
+
+	public void setNumPrePostChars(int numPreChars, int numPostChars);
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IToken.java
new file mode 100644
index 0000000..c1840d7
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IToken.java
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public interface IToken {
+	public byte[] getData();
+
+	public int getLength();
+
+	public int getStart();
+
+	public int getTokenLength();
+
+	public void reset(byte[] data, int start, int length, int tokenLength,
+			int tokenCount);
+
+	public void serializeToken(DataOutput dos) throws IOException;
+
+	public void serializeTokenCount(DataOutput dos) throws IOException;
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/ITokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/ITokenFactory.java
new file mode 100644
index 0000000..8b5d71d
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/ITokenFactory.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.Serializable;
+
+public interface ITokenFactory extends Serializable {
+    public IToken createToken();
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IntArray.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IntArray.java
new file mode 100644
index 0000000..2eb9ff4
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IntArray.java
@@ -0,0 +1,80 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.util.Arrays;
+
+public class IntArray {
+    private static final int SIZE = 128;
+
+    private int[] data;
+    private int length;
+
+    public IntArray() {
+        data = new int[SIZE];
+        length = 0;
+    }
+
+    public void add(int d) {
+        if (length == data.length) {
+            data = Arrays.copyOf(data, data.length << 1);
+        }
+        data[length++] = d;
+    }
+
+    public int[] get() {
+        return data;
+    }
+
+    public int get(int i) {
+        return data[i];
+    }
+
+    public int length() {
+        return length;
+    }
+
+    public void reset() {
+        length = 0;
+    }
+
+    public void sort() {
+        sort(0, length);
+    }
+
+    public void sort(int start, int end) {
+        Arrays.sort(data, start, end);
+    }
+
+    @Override
+    public String toString() {
+        StringBuilder out = new StringBuilder();
+        out.append('[');
+        for (int i = 0; i < length; ++i) {
+            out.append(data[i]);
+            if (i < length - 1) {
+                out.append(',');
+                out.append(' ');
+            }
+        }
+        out.append(']');
+        return out.toString();
+    }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..2a13f83
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
@@ -0,0 +1,123 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class NGramUTF8StringBinaryTokenizer extends
+		AbstractUTF8StringBinaryTokenizer {
+
+	private int gramLength;
+	private boolean usePrePost;
+
+	private int gramNum;
+	private int totalGrams;
+
+	private final INGramToken concreteToken;
+
+	public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost,
+			boolean ignoreTokenCount, boolean sourceHasTypeTag,
+			ITokenFactory tokenFactory) {
+		super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+		this.gramLength = gramLength;
+		this.usePrePost = usePrePost;
+		concreteToken = (INGramToken) token;
+	}
+
+	@Override
+	public boolean hasNext() {
+		if (gramNum < totalGrams) {
+			return true;
+		} else {
+			return false;
+		}
+	}
+
+	@Override
+	public void next() {
+		int currentTokenStart = index;
+		int tokenCount = 1;
+		int numPreChars = 0;
+		int numPostChars = 0;
+		if (usePrePost) {
+			numPreChars = Math.max(gramLength - gramNum - 1, 0);
+			numPostChars = (gramNum > totalGrams - gramLength) ? gramLength
+					- totalGrams + gramNum : 0;
+		}
+		gramNum++;
+
+		concreteToken.setNumPrePostChars(numPreChars, numPostChars);
+		if (numPreChars == 0) {
+			index += StringUtils.charSize(data, index);
+		}
+
+		// compute token count
+		// ignore pre and post grams for duplicate detection
+		if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
+			int tmpIndex = start;
+			while (tmpIndex < currentTokenStart) {
+				tokenCount++; // assume found
+				int offset = 0;
+				for (int j = 0; j < gramLength; j++) {
+					if (StringUtils.toLowerCase(StringUtils.charAt(data,
+							currentTokenStart + offset)) != StringUtils
+							.toLowerCase(StringUtils.charAt(data, tmpIndex
+									+ offset))) {
+						tokenCount--;
+						break;
+					}
+					offset += StringUtils.charSize(data, tmpIndex + offset);
+				}
+				tmpIndex += StringUtils.charSize(data, tmpIndex);
+			}
+		}
+
+		// set token
+		token.reset(data, currentTokenStart, length, gramLength, tokenCount);
+	}
+
+	@Override
+	public void reset(byte[] data, int start, int length) {
+		super.reset(data, start, length);
+		gramNum = 0;
+
+		int numChars = 0;
+		int pos = index;
+		int end = pos + utf8Length;
+		while (pos < end) {
+			numChars++;
+			pos += StringUtils.charSize(data, pos);
+		}
+
+		if (usePrePost) {
+			totalGrams = numChars + gramLength - 1;
+		} else {
+			totalGrams = numChars - gramLength + 1;
+		}
+	}
+
+	public void setGramlength(int gramLength) {
+		this.gramLength = gramLength;
+	}
+
+	public void setPrePost(boolean usePrePost) {
+		this.usePrePost = usePrePost;
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java
new file mode 100644
index 0000000..6b6406f
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java
@@ -0,0 +1,86 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class UTF8NGramToken extends AbstractUTF8Token implements INGramToken {
+
+	public final static char PRECHAR = '#';
+
+	public final static char POSTCHAR = '$';
+
+	protected int numPreChars;
+	protected int numPostChars;
+
+	public UTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public int getNumPostChars() {
+		return numPreChars;
+	}
+
+	@Override
+	public int getNumPreChars() {
+		return numPostChars;
+	}
+
+	@Override
+	public void serializeToken(DataOutput dos) throws IOException {
+		handleTokenTypeTag(dos);
+
+		// regular chars
+		int numRegChars = tokenLength - numPreChars - numPostChars;
+
+		// assuming pre and post char need 1-byte each in utf8
+		int tokenUTF8Len = getLowerCaseUTF8Len(numRegChars) + numPreChars
+				+ numPostChars;
+
+		// write utf8 length indicator
+		StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+
+		// pre chars
+		for (int i = 0; i < numPreChars; i++) {
+			StringUtils.writeCharAsModifiedUTF8(PRECHAR, dos);
+		}
+
+		int pos = start;
+		for (int i = 0; i < numRegChars; i++) {
+			char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+			StringUtils.writeCharAsModifiedUTF8(c, dos);
+			pos += StringUtils.charSize(data, pos);
+		}
+
+		// post chars
+		for (int i = 0; i < numPostChars; i++) {
+			StringUtils.writeCharAsModifiedUTF8(POSTCHAR, dos);
+		}
+	}
+
+	public void setNumPrePostChars(int numPreChars, int numPostChars) {
+		this.numPreChars = numPreChars;
+		this.numPostChars = numPostChars;
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramTokenFactory.java
new file mode 100644
index 0000000..968d8e1
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramTokenFactory.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class UTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
+
+	private static final long serialVersionUID = 1L;
+
+	public UTF8NGramTokenFactory() {
+		super();
+	}
+
+	public UTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public IToken createToken() {
+		return new UTF8NGramToken(tokenTypeTag, countTypeTag);
+	}
+
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java
new file mode 100644
index 0000000..25e0cd3
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class UTF8WordToken extends AbstractUTF8Token {
+
+	public UTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public void serializeToken(DataOutput dos) throws IOException {
+		handleTokenTypeTag(dos);
+
+		int tokenUTF8Len = getLowerCaseUTF8Len(tokenLength);
+		StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+		int pos = start;
+		for (int i = 0; i < tokenLength; i++) {
+			char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+			StringUtils.writeCharAsModifiedUTF8(c, dos);
+			pos += StringUtils.charSize(data, pos);
+		}
+	}
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordTokenFactory.java
new file mode 100644
index 0000000..4358254
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordTokenFactory.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class UTF8WordTokenFactory extends AbstractUTF8TokenFactory {
+
+	private static final long serialVersionUID = 1L;
+
+	public UTF8WordTokenFactory() {
+		super();
+	}
+
+	public UTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public IToken createToken() {
+		return new UTF8WordToken(tokenTypeTag, countTypeTag);
+	}
+
+}