Removed dependency on fuzzyjoin. Moved tokenizers into invertedindex package. git-svn-id: https://hyracks.googlecode.com/svn/branches/hyracks_indexes@523 123451ca-8445-de46-9d55-352943316053

commit: 7c4db76fc323b3b8405898298ea43ab9732ca72e [log] [tgz]
author: alexander.behm <alexander.behm@123451ca-8445-de46-9d55-352943316053> Thu Aug 04 06:50:55 2011 +0000
committer: alexander.behm <alexander.behm@123451ca-8445-de46-9d55-352943316053> Thu Aug 04 06:50:55 2011 +0000
tree: 7c9e0f02160851b17e05e950896ef03939a5148d
parent: 8ee0fee8dfcc4a7f9dec619f8961b1c67e52a644 [diff]
diff --git a/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java b/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java
index dbe6386..d11245b 100644
--- a/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java
+++ b/hyracks-dataflow-common/src/main/java/edu/uci/ics/hyracks/dataflow/common/data/util/StringUtils.java

@@ -14,56 +14,199 @@
  */
 package edu.uci.ics.hyracks.dataflow.common.data.util;
 
+import java.io.DataOutput;
+import java.io.IOException;
+
 public class StringUtils {
-    public static int charSize(byte[] b, int s) {
-        int c = (int) b[s] & 0xff;
-        switch (c >> 4) {
-            case 0:
-            case 1:
-            case 2:
-            case 3:
-            case 4:
-            case 5:
-            case 6:
-            case 7:
-                return 1;
+	public static char charAt(byte[] b, int s) {
+		int c = b[s] & 0xff;
+		switch (c >> 4) {
+		case 0:
+		case 1:
+		case 2:
+		case 3:
+		case 4:
+		case 5:
+		case 6:
+		case 7:
+			return (char) c;
 
-            case 12:
-            case 13:
-                return 2;
+		case 12:
+		case 13:
+			return (char) (((c & 0x1F) << 6) | ((b[s + 1]) & 0x3F));
 
-            case 14:
-                return 3;
-        }
-        throw new IllegalStateException();
-    }
+		case 14:
+			return (char) (((c & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6) | (((b[s + 2]) & 0x3F) << 0));
 
-    public static char charAt(byte[] b, int s) {
-        int c = (int) b[s] & 0xff;
-        switch (c >> 4) {
-            case 0:
-            case 1:
-            case 2:
-            case 3:
-            case 4:
-            case 5:
-            case 6:
-            case 7:
-                return (char) c;
+		default:
+			throw new IllegalArgumentException();
+		}
+	}
 
-            case 12:
-            case 13:
-                return (char) (((c & 0x1F) << 6) | (((int) b[s + 1]) & 0x3F));
+	public static int charSize(byte[] b, int s) {
+		int c = b[s] & 0xff;
+		switch (c >> 4) {
+		case 0:
+		case 1:
+		case 2:
+		case 3:
+		case 4:
+		case 5:
+		case 6:
+		case 7:
+			return 1;
 
-            case 14:
-                return (char) (((c & 0x0F) << 12) | ((((int) b[s + 1]) & 0x3F) << 6) | ((((int) b[s + 2]) & 0x3F) << 0));
+		case 12:
+		case 13:
+			return 2;
 
-            default:
-                throw new IllegalArgumentException();
-        }
-    }
+		case 14:
+			return 3;
+		}
+		throw new IllegalStateException();
+	}
 
-    public static int getUTFLen(byte[] b, int s) {
-        return ((b[s] & 0xff) << 8) + ((b[s + 1] & 0xff) << 0);
-    }
+	public static int getModifiedUTF8Len(char c) {
+		if (c >= 0x0000 && c <= 0x007F) {
+			return 1;
+		} else if (c <= 0x07FF) {
+			return 2;
+		} else {
+			return 3;
+		}
+	}
+
+	public static int getStrLen(byte[] b, int s) {
+		int pos = s + 2;
+		int end = pos + getUTFLen(b, s);
+		int charCount = 0;
+		while (pos < end) {
+			charCount++;
+			pos += charSize(b, pos);
+		}
+		return charCount;
+	}
+
+	public static int getUTFLen(byte[] b, int s) {
+		return ((b[s] & 0xff) << 8) + ((b[s + 1] & 0xff) << 0);
+	}
+
+	public static char toLowerCase(char c) {
+		switch (c) {
+		case 'A':
+			return 'a';
+		case 'B':
+			return 'b';
+		case 'C':
+			return 'c';
+		case 'D':
+			return 'd';
+		case 'E':
+			return 'e';
+		case 'F':
+			return 'f';
+		case 'G':
+			return 'g';
+		case 'H':
+			return 'h';
+		case 'I':
+			return 'i';
+		case 'J':
+			return 'j';
+		case 'K':
+			return 'k';
+		case 'L':
+			return 'l';
+		case 'M':
+			return 'm';
+		case 'N':
+			return 'n';
+		case 'O':
+			return 'o';
+		case 'P':
+			return 'p';
+		case 'Q':
+			return 'q';
+		case 'R':
+			return 'r';
+		case 'S':
+			return 's';
+		case 'T':
+			return 't';
+		case 'U':
+			return 'u';
+		case 'V':
+			return 'v';
+		case 'W':
+			return 'w';
+		case 'X':
+			return 'x';
+		case 'Y':
+			return 'y';
+		case 'Z':
+			return 'z';
+		case 'Ä':
+			return 'ä';
+		case 'Ǟ':
+			return 'ǟ';
+		case 'Ë':
+			return 'ë';
+		case 'Ḧ':
+			return 'ḧ';
+		case 'Ï':
+			return 'ï';
+		case 'Ḯ':
+			return 'ḯ';
+		case 'Ö':
+			return 'ö';
+		case 'Ȫ':
+			return 'ȫ';
+		case 'Ṏ':
+			return 'ṏ';
+		case 'Ü':
+			return 'ü';
+		case 'Ǖ':
+			return 'ǖ';
+		case 'Ǘ':
+			return 'ǘ';
+		case 'Ǚ':
+			return 'ǚ';
+		case 'Ǜ':
+			return 'ǜ';
+		case 'Ṳ':
+			return 'ṳ';
+		case 'Ṻ':
+			return 'ṻ';
+		case 'Ẅ':
+			return 'ẅ';
+		case 'Ẍ':
+			return 'ẍ';
+		case 'Ÿ':
+			return 'ÿ';
+		default:
+			// since I probably missed some chars above
+			// use Java to convert to lower case to be safe
+			return Character.toLowerCase(c);
+		}
+	}
+
+	public static void writeCharAsModifiedUTF8(char c, DataOutput dos)
+			throws IOException {
+
+		if (c >= 0x0000 && c <= 0x007F) {
+			dos.writeByte(c);
+		} else if (c <= 0x07FF) {
+			dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x3F)));
+			dos.writeByte((byte) (0x80 | (c & 0x3F)));
+		} else {
+			dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
+			dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
+			dos.writeByte((byte) (0x80 | (c & 0x3F)));
+		}
+	}
+
+	public static void writeUTF8Len(int len, DataOutput dos) throws IOException {
+		dos.write((len >>> 8) & 0xFF);
+		dos.write((len >>> 0) & 0xFF);
+	}
 }
\ No newline at end of file

diff --git a/hyracks-examples/hyracks-integration-tests/src/test/java/edu/uci/ics/hyracks/tests/invertedindex/InvertedIndexOperatorsTest.java b/hyracks-examples/hyracks-integration-tests/src/test/java/edu/uci/ics/hyracks/tests/invertedindex/InvertedIndexOperatorsTest.java
index 1d8357e..c84ef74 100644
--- a/hyracks-examples/hyracks-integration-tests/src/test/java/edu/uci/ics/hyracks/tests/invertedindex/InvertedIndexOperatorsTest.java
+++ b/hyracks-examples/hyracks-integration-tests/src/test/java/edu/uci/ics/hyracks/tests/invertedindex/InvertedIndexOperatorsTest.java

@@ -4,10 +4,6 @@
 
 import org.junit.Test;
 
-import edu.uci.ics.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizerFactory;
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizerFactory;
-import edu.uci.ics.fuzzyjoin.tokenizer.ITokenFactory;
-import edu.uci.ics.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
 import edu.uci.ics.hyracks.api.constraints.PartitionConstraintHelper;
 import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
 import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
@@ -26,6 +22,10 @@
 import edu.uci.ics.hyracks.dataflow.std.file.IFileSplitProvider;
 import edu.uci.ics.hyracks.dataflow.std.misc.PrinterOperatorDescriptor;
 import edu.uci.ics.hyracks.storage.am.invertedindex.dataflow.BinaryTokenizerOperatorDescriptor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizerFactory;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizerFactory;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.ITokenFactory;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.UTF8WordTokenFactory;
 import edu.uci.ics.hyracks.tests.integration.AbstractIntegrationTest;
 
 public class InvertedIndexOperatorsTest extends AbstractIntegrationTest {

diff --git a/hyracks-storage-am-invertedindex/pom.xml b/hyracks-storage-am-invertedindex/pom.xml
index 34f1135..769a3f4 100644
--- a/hyracks-storage-am-invertedindex/pom.xml
+++ b/hyracks-storage-am-invertedindex/pom.xml

@@ -58,14 +58,7 @@
   		<version>0.1.5</version>
   		<type>jar</type>
   		<scope>compile</scope>
-  	</dependency>  	
-    <dependency>
-        <groupId>edu.uci.ics.fuzzyjoin</groupId>
-        <artifactId>fuzzyjoin-core</artifactId>
-        <version>0.0.3</version>
-        <type>jar</type>
-        <scope>compile</scope>
-    </dependency>
+  	</dependency>    
   	<dependency>
   		<groupId>junit</groupId>
   		<artifactId>junit</artifactId>

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
index 1533307..83246d6 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java

@@ -15,7 +15,6 @@
 
 package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;
 
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizerFactory;
 import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
 import edu.uci.ics.hyracks.api.dataflow.IOperatorNodePushable;
 import edu.uci.ics.hyracks.api.dataflow.value.IRecordDescriptorProvider;
@@ -24,6 +23,7 @@
 import edu.uci.ics.hyracks.api.job.IOperatorEnvironment;
 import edu.uci.ics.hyracks.api.job.JobSpecification;
 import edu.uci.ics.hyracks.dataflow.std.base.AbstractSingleActivityOperatorDescriptor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizerFactory;
 
 public class BinaryTokenizerOperatorDescriptor extends AbstractSingleActivityOperatorDescriptor {
 

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
index d470513..0647f45 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java

@@ -19,8 +19,6 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
-import edu.uci.ics.fuzzyjoin.tokenizer.IToken;
 import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
 import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
 import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
@@ -29,6 +27,8 @@
 import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
 import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils;
 import edu.uci.ics.hyracks.dataflow.std.base.AbstractUnaryInputUnaryOutputOperatorNodePushable;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
 
 public class BinaryTokenizerOperatorNodePushable extends AbstractUnaryInputUnaryOutputOperatorNodePushable {
 

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
index 5585856..d1fba3b 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java

@@ -15,17 +15,12 @@
 
 package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
 
-import java.io.ByteArrayInputStream;
-import java.io.DataInput;
-import java.io.DataInputStream;
 import java.io.DataOutput;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.List;
 
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
-import edu.uci.ics.fuzzyjoin.tokenizer.IToken;
 import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
 import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
 import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
@@ -52,6 +47,8 @@
 import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
 import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearcher;
 import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
 
 public class TOccurrenceSearcher implements IInvertedIndexSearcher {
 

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java
index 18b870b..30d67f0 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java

@@ -19,11 +19,11 @@
 import java.nio.ByteBuffer;
 import java.util.List;
 
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
 import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
 import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
 import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
 import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
 
 public class TOccurrenceSearcherSuffixProbeOnly extends TOccurrenceSearcher {
 

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java
index 604c68d..f8bc1ab 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java

@@ -19,12 +19,12 @@
 import java.nio.ByteBuffer;
 import java.util.List;
 
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
 import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
 import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
 import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
 import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
 import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
 
 public class TOccurrenceSearcherSuffixScanOnly extends TOccurrenceSearcher {
 

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..bbb32d6
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java

@@ -0,0 +1,78 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public abstract class AbstractUTF8StringBinaryTokenizer implements
+		IBinaryTokenizer {
+
+	protected byte[] data;
+	protected int start;
+	protected int length;
+	protected int tokenLength;
+	protected int index;
+	protected int utf8Length;
+
+	protected final IntArray tokensStart;
+	protected final IntArray tokensLength;
+	protected final IToken token;
+
+	protected final boolean ignoreTokenCount;
+	protected final boolean sourceHasTypeTag;
+
+	public AbstractUTF8StringBinaryTokenizer(boolean ignoreTokenCount,
+			boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+		this.ignoreTokenCount = ignoreTokenCount;
+		this.sourceHasTypeTag = sourceHasTypeTag;
+		if (!ignoreTokenCount) {
+			tokensStart = new IntArray();
+			tokensLength = new IntArray();
+		} else {
+			tokensStart = null;
+			tokensLength = null;
+		}
+		token = tokenFactory.createToken();
+	}
+
+	@Override
+	public IToken getToken() {
+		return token;
+	}
+
+	@Override
+	public void reset(byte[] data, int start, int length) {
+		this.start = start;
+		index = this.start;
+		if (sourceHasTypeTag) {
+			index++; // skip type tag
+		}
+		utf8Length = StringUtils.getUTFLen(data, index);
+		index += 2; // skip utf8 length indicator
+		this.data = data;
+		this.length = length + start;
+
+		tokenLength = 0;
+		if (!ignoreTokenCount) {
+			tokensStart.reset();
+			tokensLength.reset();
+		}
+	}
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java
new file mode 100644
index 0000000..92d6ac2
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java

@@ -0,0 +1,106 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public abstract class AbstractUTF8Token implements IToken {
+	public static final int GOLDEN_RATIO_32 = 0x09e3779b9;
+
+	protected int length;
+	protected int tokenLength;
+	protected int start;
+	protected int tokenCount;
+	protected byte[] data;
+	protected final byte tokenTypeTag;
+	protected final byte countTypeTag;
+
+	public AbstractUTF8Token() {
+		tokenTypeTag = -1;
+		countTypeTag = -1;
+	}
+
+	public AbstractUTF8Token(byte tokenTypeTag, byte countTypeTag) {
+		this.tokenTypeTag = tokenTypeTag;
+		this.countTypeTag = countTypeTag;
+	}
+
+	@Override
+	public byte[] getData() {
+		return data;
+	}
+
+	@Override
+	public int getLength() {
+		return length;
+	}
+
+	public int getLowerCaseUTF8Len(int size) {
+		int lowerCaseUTF8Len = 0;
+		int pos = start;
+		for (int i = 0; i < size; i++) {
+			char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+			lowerCaseUTF8Len += StringUtils.getModifiedUTF8Len(c);
+			pos += StringUtils.charSize(data, pos);
+		}
+		return lowerCaseUTF8Len;
+	}
+
+	@Override
+	public int getStart() {
+		return start;
+	}
+
+	@Override
+	public int getTokenLength() {
+		return tokenLength;
+	}
+
+	public void handleCountTypeTag(DataOutput dos) throws IOException {
+		if (countTypeTag > 0) {
+			dos.write(countTypeTag);
+		}
+	}
+
+	public void handleTokenTypeTag(DataOutput dos) throws IOException {
+		if (tokenTypeTag > 0) {
+			dos.write(tokenTypeTag);
+		}
+	}
+
+	@Override
+	public void reset(byte[] data, int start, int length, int tokenLength,
+			int tokenCount) {
+		this.data = data;
+		this.start = start;
+		this.length = length;
+		this.tokenLength = tokenLength;
+		this.tokenCount = tokenCount;
+	}
+
+	@Override
+	public void serializeTokenCount(DataOutput dos) throws IOException {
+		handleCountTypeTag(dos);
+		dos.writeInt(tokenCount);
+	}
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8TokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8TokenFactory.java
new file mode 100644
index 0000000..3b0b82d
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8TokenFactory.java

@@ -0,0 +1,36 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public abstract class AbstractUTF8TokenFactory implements ITokenFactory {
+	private static final long serialVersionUID = 1L;
+	protected final byte tokenTypeTag;
+	protected final byte countTypeTag;
+
+	public AbstractUTF8TokenFactory() {
+		tokenTypeTag = -1;
+		countTypeTag = -1;
+	}
+
+	public AbstractUTF8TokenFactory(byte tokenTypeTag, byte countTypeTag) {
+		this.tokenTypeTag = tokenTypeTag;
+		this.countTypeTag = countTypeTag;
+	}
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..de9ad2c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java

@@ -0,0 +1,87 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class DelimitedUTF8StringBinaryTokenizer extends
+		AbstractUTF8StringBinaryTokenizer {
+
+	public DelimitedUTF8StringBinaryTokenizer(boolean ignoreTokenCount,
+			boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+		super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+	}
+
+	@Override
+	public boolean hasNext() {
+		// skip delimiters
+		while (index < length && isSeparator(StringUtils.charAt(data, index))) {
+			index += StringUtils.charSize(data, index);
+		}
+		return index < length;
+	}
+
+	private boolean isSeparator(char c) {
+		return !(Character.isLetterOrDigit(c)
+				|| Character.getType(c) == Character.OTHER_LETTER || Character
+				.getType(c) == Character.OTHER_NUMBER);
+	}
+
+	@Override
+	public void next() {
+		tokenLength = 0;
+		int currentTokenStart = index;
+		while (index < length && !isSeparator(StringUtils.charAt(data, index))) {
+			index += StringUtils.charSize(data, index);
+			tokenLength++;
+		}
+		int tokenCount = 1;
+		if (tokenLength > 0 && !ignoreTokenCount) {
+			// search if we got the same token before
+			for (int i = 0; i < tokensStart.length(); ++i) {
+				if (tokenLength == tokensLength.get(i)) {
+					int tokenStart = tokensStart.get(i);
+					tokenCount++; // assume we found it
+					int offset = 0;
+					int currLength = 0;
+					while (currLength < tokenLength) {
+						// case insensitive comparison
+						if (StringUtils.toLowerCase(StringUtils.charAt(data,
+								currentTokenStart + offset)) != StringUtils
+								.toLowerCase(StringUtils.charAt(data,
+										tokenStart + offset))) {
+							tokenCount--;
+							break;
+						}
+						offset += StringUtils.charSize(data, currentTokenStart
+								+ offset);
+						currLength++;
+					}
+				}
+			}
+			// add the new token to the list of seen tokens
+			tokensStart.add(currentTokenStart);
+			tokensLength.add(tokenLength);
+		}
+
+		// set token
+		token.reset(data, currentTokenStart, index, tokenLength, tokenCount);
+	}
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
new file mode 100644
index 0000000..4a350b3
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java

@@ -0,0 +1,42 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class DelimitedUTF8StringBinaryTokenizerFactory implements
+		IBinaryTokenizerFactory {
+
+	private static final long serialVersionUID = 1L;
+	private final boolean ignoreTokenCount;
+	private final boolean sourceHasTypeTag;
+	private final ITokenFactory tokenFactory;
+
+	public DelimitedUTF8StringBinaryTokenizerFactory(boolean ignoreTokenCount,
+			boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+		this.ignoreTokenCount = ignoreTokenCount;
+		this.sourceHasTypeTag = sourceHasTypeTag;
+		this.tokenFactory = tokenFactory;
+	}
+
+	@Override
+	public IBinaryTokenizer createTokenizer() {
+		return new DelimitedUTF8StringBinaryTokenizer(ignoreTokenCount,
+				sourceHasTypeTag, tokenFactory);
+	}
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java
new file mode 100644
index 0000000..25d1a2c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java

@@ -0,0 +1,64 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class HashedUTF8NGramToken extends UTF8NGramToken {
+	public HashedUTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	@Override
+	public void serializeToken(DataOutput dos) throws IOException {
+		handleTokenTypeTag(dos);
+
+		int hash = GOLDEN_RATIO_32;
+
+		// pre chars
+		for (int i = 0; i < numPreChars; i++) {
+			hash ^= PRECHAR;
+			hash *= GOLDEN_RATIO_32;
+		}
+
+		// regular chars
+		int numRegGrams = tokenLength - numPreChars - numPostChars;
+		int pos = start;
+		for (int i = 0; i < numRegGrams; i++) {
+			hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+			hash *= GOLDEN_RATIO_32;
+			pos += StringUtils.charSize(data, pos);
+		}
+
+		// post chars
+		for (int i = 0; i < numPostChars; i++) {
+			hash ^= POSTCHAR;
+			hash *= GOLDEN_RATIO_32;
+		}
+
+		// token count
+		hash += tokenCount;
+
+		dos.writeInt(hash);
+	}
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java
new file mode 100644
index 0000000..4a87793
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java

@@ -0,0 +1,38 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+/**
+ * Factory producing {@link HashedUTF8NGramToken} instances that carry this
+ * factory's token type tag and count type tag.
+ */
+public class HashedUTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
+
+	private static final long serialVersionUID = 1L;
+
+	public HashedUTF8NGramTokenFactory() {
+		super();
+	}
+
+	public HashedUTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	/** Returns a new token built with the inherited type tags. */
+	@Override
+	public IToken createToken() {
+		return new HashedUTF8NGramToken(tokenTypeTag, countTypeTag);
+	}
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java
new file mode 100644
index 0000000..55237ce
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java

@@ -0,0 +1,88 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+/**
+ * Word token that pre-computes a 32-bit hash in reset() and serializes that
+ * hash instead of the word's characters.
+ */
+public class HashedUTF8WordToken extends UTF8WordToken {
+
+	// hash of the current token contents; recomputed on every reset()
+	private int hash = 0;
+
+	public HashedUTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	/**
+	 * Equal when the other object is an IToken with the same token length
+	 * and the same characters, compared exactly (case-sensitively).
+	 * 
+	 * NOTE(review): hashCode() is built from lower-cased characters plus the
+	 * token count, and the count is not compared here, so two equal tokens
+	 * with different counts report different hash codes -- confirm whether
+	 * that is intended.
+	 */
+	@Override
+	public boolean equals(Object o) {
+		// instanceof is false for null, so this guard also rejects null
+		if (!(o instanceof IToken)) {
+			return false;
+		}
+		IToken other = (IToken) o;
+		if (other.getTokenLength() != tokenLength) {
+			return false;
+		}
+		int offset = 0;
+		int remaining = tokenLength;
+		while (remaining-- > 0) {
+			char theirs = StringUtils.charAt(other.getData(), other.getStart()
+					+ offset);
+			char mine = StringUtils.charAt(data, start + offset);
+			if (theirs != mine) {
+				return false;
+			}
+			// step by the byte width of this token's character
+			offset += StringUtils.charSize(data, start + offset);
+		}
+		return true;
+	}
+
+	/** Returns the hash computed by the most recent reset(). */
+	@Override
+	public int hashCode() {
+		return hash;
+	}
+
+	/**
+	 * Delegates to the superclass and then recomputes the hash from the
+	 * lower-cased characters of the token, adding the token count at the end.
+	 */
+	@Override
+	public void reset(byte[] data, int start, int length, int tokenLength,
+			int tokenCount) {
+		super.reset(data, start, length, tokenLength, tokenCount);
+
+		// pre-compute hash value using JAQL-like string hashing
+		hash = GOLDEN_RATIO_32;
+		int cursor = start;
+		int remaining = tokenLength;
+		while (remaining-- > 0) {
+			char lowered = StringUtils.toLowerCase(StringUtils.charAt(data,
+					cursor));
+			hash = (hash ^ lowered) * GOLDEN_RATIO_32;
+			cursor += StringUtils.charSize(data, cursor);
+		}
+		hash += tokenCount;
+	}
+
+	/**
+	 * Writes the token type tag when it is positive, then the pre-computed
+	 * hash as a single int.
+	 */
+	@Override
+	public void serializeToken(DataOutput dos) throws IOException {
+		boolean hasTypeTag = tokenTypeTag > 0;
+		if (hasTypeTag) {
+			dos.write(tokenTypeTag);
+		}
+
+		// serialize hash value
+		dos.writeInt(hash);
+	}
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java
new file mode 100644
index 0000000..318f041
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java

@@ -0,0 +1,38 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+/**
+ * Factory producing {@link HashedUTF8WordToken} instances that carry this
+ * factory's token type tag and count type tag.
+ */
+public class HashedUTF8WordTokenFactory extends AbstractUTF8TokenFactory {
+
+	private static final long serialVersionUID = 1L;
+
+	public HashedUTF8WordTokenFactory() {
+		super();
+	}
+
+	public HashedUTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	/** Returns a new token built with the inherited type tags. */
+	@Override
+	public IToken createToken() {
+		return new HashedUTF8WordToken(tokenTypeTag, countTypeTag);
+	}
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizer.java
new file mode 100644
index 0000000..05c6d0b
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizer.java

@@ -0,0 +1,30 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+/**
+ * Iterator-style tokenizer over a region of a byte array: reset() supplies
+ * the input, hasNext()/next() step through it, and getToken() exposes the
+ * tokenizer's token object.
+ */
+public interface IBinaryTokenizer {
+	/** Returns the token; presumably the one positioned by the last next(). */
+	public IToken getToken();
+
+	/** Returns true while there are more tokens to produce. */
+	public boolean hasNext();
+
+	/** Advances to the next token. */
+	public void next();
+
+	/** Points the tokenizer at new input: a byte array, a start offset and a length. */
+	public void reset(byte[] data, int start, int length);
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizerFactory.java
new file mode 100644
index 0000000..bfe78ee
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizerFactory.java

@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.Serializable;
+
+/** Serializable factory for {@link IBinaryTokenizer} instances. */
+public interface IBinaryTokenizerFactory extends Serializable {
+	public IBinaryTokenizer createTokenizer();
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/INGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/INGramToken.java
new file mode 100644
index 0000000..befc6d2
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/INGramToken.java

@@ -0,0 +1,28 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+/**
+ * Extra accessors for n-gram tokens that may be padded with synthetic
+ * characters before and/or after the characters taken from the input.
+ */
+public interface INGramToken {
+	/** Number of padding characters at the end of the gram. */
+	public int getNumPostChars();
+
+	/** Number of padding characters at the start of the gram. */
+	public int getNumPreChars();
+
+	/** Sets both padding counts for the current gram. */
+	public void setNumPrePostChars(int numPreChars, int numPostChars);
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IToken.java
new file mode 100644
index 0000000..c1840d7
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IToken.java

@@ -0,0 +1,40 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+/**
+ * A token that refers to a region of a byte array and can serialize itself
+ * and its count to a DataOutput.
+ */
+public interface IToken {
+	/** Returns the byte array the token refers to. */
+	public byte[] getData();
+
+	// NOTE(review): presumably a byte length; its exact meaning is not
+	// visible from this interface -- confirm against the implementations.
+	public int getLength();
+
+	/** Returns the offset into getData() at which the token begins. */
+	public int getStart();
+
+	/** Returns the token length; implementations here iterate it per character. */
+	public int getTokenLength();
+
+	/** Re-points this token at new data with the given lengths and count. */
+	public void reset(byte[] data, int start, int length, int tokenLength,
+			int tokenCount);
+
+	/** Writes the token to the given output. */
+	public void serializeToken(DataOutput dos) throws IOException;
+
+	/** Writes the token count to the given output. */
+	public void serializeTokenCount(DataOutput dos) throws IOException;
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/ITokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/ITokenFactory.java
new file mode 100644
index 0000000..8b5d71d
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/ITokenFactory.java

@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.Serializable;
+
+/** Serializable factory for {@link IToken} instances. */
+public interface ITokenFactory extends Serializable {
+    public IToken createToken();
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IntArray.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IntArray.java
new file mode 100644
index 0000000..2eb9ff4
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IntArray.java

@@ -0,0 +1,80 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.util.Arrays;
+
+/**
+ * Growable array of primitive ints. Starts with room for SIZE entries and
+ * doubles its backing array whenever it is full.
+ */
+public class IntArray {
+    private static final int SIZE = 128;
+
+    private int[] data;
+    private int length;
+
+    public IntArray() {
+        length = 0;
+        data = new int[SIZE];
+    }
+
+    /** Appends d, doubling the backing array first if it is full. */
+    public void add(int d) {
+        if (data.length == length) {
+            data = Arrays.copyOf(data, data.length * 2);
+        }
+        data[length] = d;
+        length++;
+    }
+
+    /**
+     * Returns the backing array itself (not a copy); only the first
+     * length() entries hold added values.
+     */
+    public int[] get() {
+        return data;
+    }
+
+    /** Returns the entry at index i of the backing array (no length check). */
+    public int get(int i) {
+        return data[i];
+    }
+
+    /** Returns the number of values added since the last reset(). */
+    public int length() {
+        return length;
+    }
+
+    /** Forgets all values while keeping the backing array. */
+    public void reset() {
+        length = 0;
+    }
+
+    /** Sorts all added values in ascending order. */
+    public void sort() {
+        sort(0, length);
+    }
+
+    /** Sorts the range [start, end) of the backing array in ascending order. */
+    public void sort(int start, int end) {
+        Arrays.sort(data, start, end);
+    }
+
+    /** Formats the added values as "[a, b, c]". */
+    @Override
+    public String toString() {
+        StringBuilder out = new StringBuilder();
+        out.append('[');
+        for (int i = 0; i < length; i++) {
+            if (i > 0) {
+                out.append(", ");
+            }
+            out.append(data[i]);
+        }
+        return out.append(']').toString();
+    }
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..2a13f83
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java

@@ -0,0 +1,123 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+/**
+ * Tokenizer that produces character n-grams of a UTF-8 string, optionally
+ * padding the string with gramLength - 1 synthetic positions on each side
+ * (usePrePost). Unless token counts are ignored, each unpadded gram is given
+ * a count equal to one plus the number of earlier positions in the same
+ * string holding the same gram (compared case-insensitively).
+ */
+public class NGramUTF8StringBinaryTokenizer extends
+		AbstractUTF8StringBinaryTokenizer {
+
+	private int gramLength;
+	private boolean usePrePost;
+
+	private int gramNum;
+	private int totalGrams;
+
+	// byte offset of the first character of the current string; captured in
+	// reset() so the duplicate scan never starts before the character data
+	private int firstCharIndex;
+
+	private final INGramToken concreteToken;
+
+	/**
+	 * @param gramLength
+	 *            number of characters (including padding) per gram
+	 * @param usePrePost
+	 *            whether to emit padded grams at both ends of the string
+	 * @param ignoreTokenCount
+	 *            passed to the superclass; when true every gram gets count 1
+	 * @param sourceHasTypeTag
+	 *            passed to the superclass
+	 * @param tokenFactory
+	 *            passed to the superclass; the resulting token must
+	 *            implement INGramToken
+	 */
+	public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost,
+			boolean ignoreTokenCount, boolean sourceHasTypeTag,
+			ITokenFactory tokenFactory) {
+		super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+		this.gramLength = gramLength;
+		this.usePrePost = usePrePost;
+		concreteToken = (INGramToken) token;
+	}
+
+	@Override
+	public boolean hasNext() {
+		return gramNum < totalGrams;
+	}
+
+	/**
+	 * Positions the token on the next gram. The byte cursor only moves once
+	 * the leading padding is exhausted, because padded grams all begin at
+	 * the first real character.
+	 */
+	@Override
+	public void next() {
+		int currentTokenStart = index;
+		int tokenCount = 1;
+		int numPreChars = 0;
+		int numPostChars = 0;
+		if (usePrePost) {
+			numPreChars = Math.max(gramLength - gramNum - 1, 0);
+			numPostChars = (gramNum > totalGrams - gramLength) ? gramLength
+					- totalGrams + gramNum : 0;
+		}
+		gramNum++;
+
+		concreteToken.setNumPrePostChars(numPreChars, numPostChars);
+		if (numPreChars == 0) {
+			index += StringUtils.charSize(data, index);
+		}
+
+		// compute token count
+		// ignore pre and post grams for duplicate detection
+		if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
+			tokenCount += countEarlierOccurrences(currentTokenStart);
+		}
+
+		// set token
+		token.reset(data, currentTokenStart, length, gramLength, tokenCount);
+	}
+
+	/**
+	 * Counts the character positions before gramStart whose gram equals the
+	 * gram at gramStart. The scan begins at the first character recorded by
+	 * reset() rather than at the raw start offset, so bytes preceding the
+	 * character data are never decoded as characters.
+	 */
+	private int countEarlierOccurrences(int gramStart) {
+		int matches = 0;
+		int candidate = firstCharIndex;
+		while (candidate < gramStart) {
+			if (sameGramIgnoreCase(gramStart, candidate)) {
+				matches++;
+			}
+			candidate += StringUtils.charSize(data, candidate);
+		}
+		return matches;
+	}
+
+	/**
+	 * Compares gramLength characters starting at the two byte offsets,
+	 * ignoring case. Each side advances by the width of its own character so
+	 * the comparison stays aligned even if case-equal characters differ in
+	 * encoded size.
+	 */
+	private boolean sameGramIgnoreCase(int posA, int posB) {
+		for (int j = 0; j < gramLength; j++) {
+			char a = StringUtils.toLowerCase(StringUtils.charAt(data, posA));
+			char b = StringUtils.toLowerCase(StringUtils.charAt(data, posB));
+			if (a != b) {
+				return false;
+			}
+			posA += StringUtils.charSize(data, posA);
+			posB += StringUtils.charSize(data, posB);
+		}
+		return true;
+	}
+
+	/**
+	 * Points the tokenizer at a new string, counts its characters and
+	 * derives how many grams will be produced.
+	 */
+	@Override
+	public void reset(byte[] data, int start, int length) {
+		super.reset(data, start, length);
+		gramNum = 0;
+
+		// after super.reset() the cursor is where character counting starts,
+		// so remember it as the first character of the string
+		firstCharIndex = index;
+
+		int numChars = 0;
+		int pos = index;
+		int end = pos + utf8Length;
+		while (pos < end) {
+			numChars++;
+			pos += StringUtils.charSize(data, pos);
+		}
+
+		if (usePrePost) {
+			totalGrams = numChars + gramLength - 1;
+		} else {
+			totalGrams = numChars - gramLength + 1;
+		}
+	}
+
+	/**
+	 * Changes the gram length. The gram total is only recomputed in reset(),
+	 * so the new value takes full effect from the next reset().
+	 */
+	public void setGramlength(int gramLength) {
+		this.gramLength = gramLength;
+	}
+
+	/**
+	 * Enables or disables padded grams. The gram total is only recomputed in
+	 * reset(), so the new value takes full effect from the next reset().
+	 */
+	public void setPrePost(boolean usePrePost) {
+		this.usePrePost = usePrePost;
+	}
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java
new file mode 100644
index 0000000..6b6406f
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java

@@ -0,0 +1,86 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+/**
+ * N-gram token over UTF-8 bytes. A gram consists of numPreChars PRECHAR
+ * padding characters, then characters read from the underlying data, then
+ * numPostChars POSTCHAR padding characters; tokenLength counts all three.
+ */
+public class UTF8NGramToken extends AbstractUTF8Token implements INGramToken {
+
+	public final static char PRECHAR = '#';
+
+	public final static char POSTCHAR = '$';
+
+	protected int numPreChars;
+	protected int numPostChars;
+
+	public UTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	/** Returns the number of POSTCHAR padding characters at the end. */
+	@Override
+	public int getNumPostChars() {
+		return numPostChars;
+	}
+
+	/** Returns the number of PRECHAR padding characters at the start. */
+	@Override
+	public int getNumPreChars() {
+		return numPreChars;
+	}
+
+	/**
+	 * Calls handleTokenTypeTag, then writes the gram as a length-prefixed
+	 * string: the length indicator, the pre-padding, the lower-cased regular
+	 * characters and the post-padding.
+	 */
+	@Override
+	public void serializeToken(DataOutput dos) throws IOException {
+		handleTokenTypeTag(dos);
+
+		// regular chars
+		int numRegChars = tokenLength - numPreChars - numPostChars;
+
+		// assuming pre and post char need 1-byte each in utf8
+		int tokenUTF8Len = getLowerCaseUTF8Len(numRegChars) + numPreChars
+				+ numPostChars;
+
+		// write utf8 length indicator
+		StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+
+		// pre chars
+		for (int i = 0; i < numPreChars; i++) {
+			StringUtils.writeCharAsModifiedUTF8(PRECHAR, dos);
+		}
+
+		// regular chars, lower-cased one at a time
+		int pos = start;
+		for (int i = 0; i < numRegChars; i++) {
+			char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+			StringUtils.writeCharAsModifiedUTF8(c, dos);
+			pos += StringUtils.charSize(data, pos);
+		}
+
+		// post chars
+		for (int i = 0; i < numPostChars; i++) {
+			StringUtils.writeCharAsModifiedUTF8(POSTCHAR, dos);
+		}
+	}
+
+	/** Sets the number of padding characters before and after the gram. */
+	@Override
+	public void setNumPrePostChars(int numPreChars, int numPostChars) {
+		this.numPreChars = numPreChars;
+		this.numPostChars = numPostChars;
+	}
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramTokenFactory.java
new file mode 100644
index 0000000..968d8e1
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramTokenFactory.java

@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+/**
+ * Factory producing {@link UTF8NGramToken} instances that carry this
+ * factory's token type tag and count type tag.
+ */
+public class UTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
+
+	private static final long serialVersionUID = 1L;
+
+	public UTF8NGramTokenFactory() {
+		super();
+	}
+
+	public UTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	/** Returns a new token built with the inherited type tags. */
+	@Override
+	public IToken createToken() {
+		return new UTF8NGramToken(tokenTypeTag, countTypeTag);
+	}
+
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java
new file mode 100644
index 0000000..25e0cd3
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java

@@ -0,0 +1,46 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+/**
+ * Word token over UTF-8 bytes that serializes itself as a length-prefixed,
+ * lower-cased string.
+ */
+public class UTF8WordToken extends AbstractUTF8Token {
+
+	public UTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	/**
+	 * Calls handleTokenTypeTag, writes the UTF-8 length of the lower-cased
+	 * token, then writes each of the tokenLength characters lower-cased.
+	 */
+	@Override
+	public void serializeToken(DataOutput dos) throws IOException {
+		handleTokenTypeTag(dos);
+
+		// length indicator first
+		StringUtils.writeUTF8Len(getLowerCaseUTF8Len(tokenLength), dos);
+
+		// then the characters, decoded and lower-cased one at a time
+		int cursor = start;
+		int remaining = tokenLength;
+		while (remaining-- > 0) {
+			char lowered = StringUtils.toLowerCase(StringUtils.charAt(data,
+					cursor));
+			StringUtils.writeCharAsModifiedUTF8(lowered, dos);
+			cursor += StringUtils.charSize(data, cursor);
+		}
+	}
+}

diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordTokenFactory.java
new file mode 100644
index 0000000..4358254
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordTokenFactory.java

@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+/**
+ * Factory producing {@link UTF8WordToken} instances that carry this
+ * factory's token type tag and count type tag.
+ */
+public class UTF8WordTokenFactory extends AbstractUTF8TokenFactory {
+
+	private static final long serialVersionUID = 1L;
+
+	public UTF8WordTokenFactory() {
+		super();
+	}
+
+	public UTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+		super(tokenTypeTag, countTypeTag);
+	}
+
+	/** Returns a new token built with the inherited type tags. */
+	@Override
+	public IToken createToken() {
+		return new UTF8WordToken(tokenTypeTag, countTypeTag);
+	}
+
+}

diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/AbstractInvIndexSearchTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/AbstractInvIndexSearchTest.java
index 31f06a1..b42001d 100644
--- a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/AbstractInvIndexSearchTest.java
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/AbstractInvIndexSearchTest.java

@@ -9,8 +9,6 @@
 import org.junit.After;
 import org.junit.Before;
 
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
-import edu.uci.ics.fuzzyjoin.tokenizer.ITokenFactory;
 import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
 import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
 import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
@@ -43,6 +41,8 @@
 import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
 import edu.uci.ics.hyracks.storage.am.invertedindex.impls.InvertedIndex;
 import edu.uci.ics.hyracks.storage.am.invertedindex.impls.TOccurrenceSearcher;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.ITokenFactory;
 import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
 import edu.uci.ics.hyracks.storage.common.file.IFileMapProvider;
 import edu.uci.ics.hyracks.test.support.TestStorageManagerComponentHolder;

diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/NGramTokenizerTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/NGramTokenizerTest.java
new file mode 100644
index 0000000..5f15a91
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/NGramTokenizerTest.java

@@ -0,0 +1,247 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.AbstractUTF8Token;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.HashedUTF8NGramTokenFactory;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.UTF8NGramTokenFactory;
+
+public class NGramTokenizerTest {
+
+	private char PRECHAR = '#';
+	private char POSTCHAR = '$';
+
+	private String str = "Jürgen S. Generic's Car";
+	private byte[] inputBuffer;
+
+	private int gramLength = 3;
+
+	/**
+	 * Appends to grams every gramLength-character substring of the
+	 * lower-cased s, ordered by start position. When prePost is set, the
+	 * string is first framed by gramLength - 1 copies of PRECHAR in front
+	 * and gramLength - 1 copies of POSTCHAR behind.
+	 */
+	private void getExpectedGrams(String s, int gramLength,
+			ArrayList<String> grams, boolean prePost) {
+		int padLength = prePost ? gramLength - 1 : 0;
+		StringBuilder padded = new StringBuilder();
+		for (int p = 0; p < padLength; p++) {
+			padded.append(PRECHAR);
+		}
+		padded.append(s.toLowerCase());
+		for (int p = 0; p < padLength; p++) {
+			padded.append(POSTCHAR);
+		}
+		String source = padded.toString();
+
+		// slide a gramLength-wide window over the source, one char at a time
+		int lastStart = source.length() - gramLength;
+		for (int start = 0; start <= lastStart; start++) {
+			grams.add(source.substring(start, start + gramLength));
+		}
+	}
+
+	/** Stores the writeUTF encoding of str in inputBuffer before each test. */
+	@Before
+	public void init() throws Exception {
+		ByteArrayOutputStream bytes = new ByteArrayOutputStream();
+		new DataOutputStream(bytes).writeUTF(str);
+		// writeUTF output starts with a two-byte length prefix
+		inputBuffer = bytes.toByteArray();
+	}
+
+	/**
+	 * Tokenizes inputBuffer with a HashedUTF8NGramTokenFactory and compares
+	 * each serialized token, read back as an int, with tokenHash(gram, k),
+	 * where k is how often that gram has occurred so far in the expected
+	 * gram list. Also checks that the tokenizer emits every expected gram.
+	 */
+	void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost)
+			throws IOException {
+		HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+		NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(
+				gramLength, prePost, false, false, tokenFactory);
+		tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+		ArrayList<String> expectedGrams = new ArrayList<String>();
+		getExpectedGrams(str, gramLength, expectedGrams, prePost);
+		ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
+		HashMap<String, Integer> gramCounts = new HashMap<String, Integer>();
+		for (String s : expectedGrams) {
+			Integer previous = gramCounts.get(s);
+			int count = (previous == null) ? 1 : previous + 1;
+			// Integer is immutable: the running count must be stored back,
+			// otherwise a third occurrence would be counted as the second
+			gramCounts.put(s, count);
+
+			int hash = tokenHash(s, count);
+			expectedHashedGrams.add(hash);
+		}
+
+		int tokenCount = 0;
+		while (tokenizer.hasNext()) {
+			tokenizer.next();
+
+			// serialize hashed token
+			ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+			DataOutput tokenDos = new DataOutputStream(tokenBaos);
+
+			IToken token = tokenizer.getToken();
+			token.serializeToken(tokenDos);
+
+			// deserialize token
+			ByteArrayInputStream bais = new ByteArrayInputStream(
+					tokenBaos.toByteArray());
+			DataInput in = new DataInputStream(bais);
+			Integer hashedGram = in.readInt();
+
+			Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
+			tokenCount++;
+		}
+		// without this, a tokenizer that stops early would pass unnoticed
+		Assert.assertEquals(expectedHashedGrams.size(), tokenCount);
+	}
+
+	/**
+	 * Tokenizes inputBuffer with a HashedUTF8NGramTokenFactory and compares
+	 * each serialized token, read back as an int, with tokenHash(gram, 1).
+	 * Also checks that the tokenizer emits every expected gram.
+	 */
+	void runTestNGramTokenizerWithHashedUTF8Tokens(boolean prePost)
+			throws IOException {
+		HashedUTF8NGramTokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+		NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(
+				gramLength, prePost, true, false, tokenFactory);
+		tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+		ArrayList<String> expectedGrams = new ArrayList<String>();
+		getExpectedGrams(str, gramLength, expectedGrams, prePost);
+		ArrayList<Integer> expectedHashedGrams = new ArrayList<Integer>();
+		for (String s : expectedGrams) {
+			expectedHashedGrams.add(tokenHash(s, 1));
+		}
+
+		int tokenCount = 0;
+
+		while (tokenizer.hasNext()) {
+			tokenizer.next();
+
+			// serialize hashed token
+			ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+			DataOutput tokenDos = new DataOutputStream(tokenBaos);
+			IToken token = tokenizer.getToken();
+			token.serializeToken(tokenDos);
+
+			// deserialize token
+			ByteArrayInputStream bais = new ByteArrayInputStream(
+					tokenBaos.toByteArray());
+			DataInput in = new DataInputStream(bais);
+			Integer hashedGram = in.readInt();
+
+			Assert.assertEquals(expectedHashedGrams.get(tokenCount), hashedGram);
+			tokenCount++;
+		}
+		// without this, a tokenizer that stops early would pass unnoticed
+		Assert.assertEquals(expectedHashedGrams.size(), tokenCount);
+	}
+
+	/**
+	 * Tokenizes inputBuffer with a UTF8NGramTokenFactory and compares each
+	 * serialized token, read back with readUTF, with the expected gram.
+	 * Also checks that the tokenizer emits every expected gram.
+	 */
+	void runTestNGramTokenizerWithUTF8Tokens(boolean prePost)
+			throws IOException {
+		UTF8NGramTokenFactory tokenFactory = new UTF8NGramTokenFactory();
+		NGramUTF8StringBinaryTokenizer tokenizer = new NGramUTF8StringBinaryTokenizer(
+				gramLength, prePost, true, false, tokenFactory);
+		tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+		ArrayList<String> expectedGrams = new ArrayList<String>();
+		getExpectedGrams(str, gramLength, expectedGrams, prePost);
+
+		int tokenCount = 0;
+		while (tokenizer.hasNext()) {
+			tokenizer.next();
+
+			// serialize string token
+			ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+			DataOutput tokenDos = new DataOutputStream(tokenBaos);
+			IToken token = tokenizer.getToken();
+			token.serializeToken(tokenDos);
+
+			// deserialize token
+			ByteArrayInputStream bais = new ByteArrayInputStream(
+					tokenBaos.toByteArray());
+			DataInput in = new DataInputStream(bais);
+			String strGram = in.readUTF();
+
+			Assert.assertEquals(expectedGrams.get(tokenCount), strGram);
+			tokenCount++;
+		}
+		// without this, a tokenizer that stops early would pass unnoticed
+		Assert.assertEquals(expectedGrams.size(), tokenCount);
+	}
+
+	@Test
+	public void testNGramTokenizerWithCountedHashedUTF8Tokens()
+			throws Exception {
+		runTestNGramTokenizerWithCountedHashedUTF8Tokens(false); // expected grams from the unpadded string
+		runTestNGramTokenizerWithCountedHashedUTF8Tokens(true); // expected grams include PRECHAR/POSTCHAR padding
+	}
+
+	@Test
+	public void testNGramTokenizerWithHashedUTF8Tokens() throws Exception {
+		runTestNGramTokenizerWithHashedUTF8Tokens(false); // expected grams from the unpadded string
+		runTestNGramTokenizerWithHashedUTF8Tokens(true); // expected grams include PRECHAR/POSTCHAR padding
+	}
+
+	@Test
+	public void testNGramTokenizerWithUTF8Tokens() throws IOException {
+		runTestNGramTokenizerWithUTF8Tokens(false); // expected grams from the unpadded string
+		runTestNGramTokenizerWithUTF8Tokens(true); // expected grams include PRECHAR/POSTCHAR padding
+	}
+
+	/** Xor-multiply hash of token seeded with GOLDEN_RATIO_32, plus tokenCount. */
+	public int tokenHash(String token, int tokenCount) {
+		int hash = AbstractUTF8Token.GOLDEN_RATIO_32;
+		for (char c : token.toCharArray()) {
+			hash = (hash ^ c) * AbstractUTF8Token.GOLDEN_RATIO_32;
+		}
+		return hash + tokenCount;
+	}
+}

diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SearchPerfTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SearchPerfTest.java
index 4d3942f..161f20f 100644
--- a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SearchPerfTest.java
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SearchPerfTest.java

@@ -17,17 +17,11 @@
 
 import java.util.ArrayList;
 import java.util.List;
-import java.util.Random;
 
-import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 
-import edu.uci.ics.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizer;
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
-import edu.uci.ics.fuzzyjoin.tokenizer.ITokenFactory;
-import edu.uci.ics.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
 import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
 import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
 import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
@@ -42,6 +36,8 @@
 import edu.uci.ics.hyracks.storage.am.invertedindex.impls.TOccurrenceSearcher;
 import edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers.ConjunctiveSearchModifier;
 import edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers.JaccardSearchModifier;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.UTF8WordTokenFactory;
 
 /**
  * The purpose of this test is to evaluate the performance of searches against

diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SearchTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SearchTest.java
index 376735f..d9fef2c 100644
--- a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SearchTest.java
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SearchTest.java

@@ -9,9 +9,6 @@
 import org.junit.Before;
 import org.junit.Test;
 
-import edu.uci.ics.fuzzyjoin.tokenizer.IToken;
-import edu.uci.ics.fuzzyjoin.tokenizer.NGramUTF8StringBinaryTokenizer;
-import edu.uci.ics.fuzzyjoin.tokenizer.UTF8NGramTokenFactory;
 import edu.uci.ics.hyracks.dataflow.common.comm.io.ByteArrayAccessibleOutputStream;
 import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
 import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
@@ -27,6 +24,9 @@
 import edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers.ConjunctiveSearchModifier;
 import edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers.EditDistanceSearchModifier;
 import edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers.JaccardSearchModifier;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.UTF8NGramTokenFactory;
 
 public class SearchTest extends AbstractInvIndexSearchTest {
 

diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java
new file mode 100644
index 0000000..57fe306
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java

@@ -0,0 +1,222 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ * 
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import junit.framework.Assert;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.AbstractUTF8Token;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.HashedUTF8WordTokenFactory;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.UTF8WordTokenFactory;
+
+public class WordTokenizerTest {
+
+	private String text = "Hello World, I would like to inform you of the importance of Foo Bar. Yes, Foo Bar. Jürgen.";
+	private byte[] inputBuffer;
+
+	private ArrayList<String> expectedUTF8Tokens = new ArrayList<String>();
+	private ArrayList<Integer> expectedHashedUTF8Tokens = new ArrayList<Integer>();
+	private ArrayList<Integer> expectedCountedHashedUTF8Tokens = new ArrayList<Integer>();
+
+	/**
+	 * Serializes text with writeUTF into inputBuffer and fills the three
+	 * expected lists: the word tokens, their hashes with count 1, and
+	 * their hashes with a per-word running occurrence count.
+	 */
+	@Before
+	public void init() throws IOException {
+		// serialize text into bytes
+		ByteArrayOutputStream baos = new ByteArrayOutputStream();
+		DataOutput dos = new DataOutputStream(baos);
+		dos.writeUTF(text);
+		inputBuffer = baos.toByteArray();
+
+		// init expected string tokens
+		expectedUTF8Tokens.add("hello");
+		expectedUTF8Tokens.add("world");
+		expectedUTF8Tokens.add("i");
+		expectedUTF8Tokens.add("would");
+		expectedUTF8Tokens.add("like");
+		expectedUTF8Tokens.add("to");
+		expectedUTF8Tokens.add("inform");
+		expectedUTF8Tokens.add("you");
+		expectedUTF8Tokens.add("of");
+		expectedUTF8Tokens.add("the");
+		expectedUTF8Tokens.add("importance");
+		expectedUTF8Tokens.add("of");
+		expectedUTF8Tokens.add("foo");
+		expectedUTF8Tokens.add("bar");
+		expectedUTF8Tokens.add("yes");
+		expectedUTF8Tokens.add("foo");
+		expectedUTF8Tokens.add("bar");
+		expectedUTF8Tokens.add("jürgen");
+
+		// hashed tokens ignoring token count
+		for (String word : expectedUTF8Tokens) {
+			expectedHashedUTF8Tokens.add(tokenHash(word, 1));
+		}
+
+		// hashed tokens using token count
+		HashMap<String, Integer> tokenCounts = new HashMap<String, Integer>();
+		for (String word : expectedUTF8Tokens) {
+			Integer previous = tokenCounts.get(word);
+			int count = (previous == null) ? 1 : previous + 1;
+			// Integer is immutable: the running count must be stored back,
+			// otherwise a third occurrence would be counted as the second
+			tokenCounts.put(word, count);
+			expectedCountedHashedUTF8Tokens.add(tokenHash(word, count));
+		}
+	}
+
+	/**
+	 * Tokenizes inputBuffer with a HashedUTF8WordTokenFactory and compares
+	 * each serialized token, read back as an int, with the matching entry of
+	 * expectedCountedHashedUTF8Tokens. Also checks the number of tokens.
+	 */
+	@Test
+	public void testWordTokenizerWithCountedHashedUTF8Tokens()
+			throws IOException {
+		HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
+		DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(
+				false, false, tokenFactory);
+		tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+		int tokenCount = 0;
+		while (tokenizer.hasNext()) {
+			tokenizer.next();
+
+			// serialize token
+			ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+			DataOutput tokenDos = new DataOutputStream(tokenBaos);
+			IToken token = tokenizer.getToken();
+			token.serializeToken(tokenDos);
+
+			// deserialize token
+			ByteArrayInputStream bais = new ByteArrayInputStream(
+					tokenBaos.toByteArray());
+			DataInput in = new DataInputStream(bais);
+			Integer hashedToken = in.readInt();
+
+			// expected value goes first so a failure message reads correctly
+			Assert.assertEquals(expectedCountedHashedUTF8Tokens.get(tokenCount),
+					hashedToken);
+			tokenCount++;
+		}
+		// without this, a tokenizer that stops early would pass unnoticed
+		Assert.assertEquals(expectedCountedHashedUTF8Tokens.size(), tokenCount);
+	}
+
+	/**
+	 * Tokenizes inputBuffer with a HashedUTF8WordTokenFactory and compares
+	 * each serialized token, read back as an int, with the matching entry of
+	 * expectedHashedUTF8Tokens. Also checks the number of tokens.
+	 */
+	@Test
+	public void testWordTokenizerWithHashedUTF8Tokens() throws IOException {
+		HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
+		DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(
+				true, false, tokenFactory);
+		tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+		int tokenCount = 0;
+
+		while (tokenizer.hasNext()) {
+			tokenizer.next();
+
+			// serialize token
+			ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+			DataOutput tokenDos = new DataOutputStream(tokenBaos);
+			IToken token = tokenizer.getToken();
+			token.serializeToken(tokenDos);
+
+			// deserialize token
+			ByteArrayInputStream bais = new ByteArrayInputStream(
+					tokenBaos.toByteArray());
+			DataInput in = new DataInputStream(bais);
+			Integer hashedToken = in.readInt();
+
+			Assert.assertEquals(expectedHashedUTF8Tokens.get(tokenCount),
+					hashedToken);
+			tokenCount++;
+		}
+		// without this, a tokenizer that stops early would pass unnoticed
+		Assert.assertEquals(expectedHashedUTF8Tokens.size(), tokenCount);
+	}
+
+	/**
+	 * Tokenizes inputBuffer with a UTF8WordTokenFactory and compares each
+	 * serialized token, read back with readUTF, with the matching entry of
+	 * expectedUTF8Tokens. Also checks the number of tokens.
+	 */
+	@Test
+	public void testWordTokenizerWithUTF8Tokens() throws IOException {
+		UTF8WordTokenFactory tokenFactory = new UTF8WordTokenFactory();
+		DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(
+				true, false, tokenFactory);
+		tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+
+		int tokenCount = 0;
+
+		while (tokenizer.hasNext()) {
+			tokenizer.next();
+
+			// serialize string token
+			ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+			DataOutput tokenDos = new DataOutputStream(tokenBaos);
+			IToken token = tokenizer.getToken();
+			token.serializeToken(tokenDos);
+
+			// deserialize token
+			ByteArrayInputStream bais = new ByteArrayInputStream(
+					tokenBaos.toByteArray());
+			DataInput in = new DataInputStream(bais);
+			String strToken = in.readUTF();
+
+			Assert.assertEquals(expectedUTF8Tokens.get(tokenCount), strToken);
+			tokenCount++;
+		}
+		// without this, a tokenizer that stops early would pass unnoticed
+		Assert.assertEquals(expectedUTF8Tokens.size(), tokenCount);
+	}
+
+	// JAQL
+	public int tokenHash(String token, int tokenCount) {
+		// seed, then xor-and-multiply each char; tokenCount is added last
+		int hash = AbstractUTF8Token.GOLDEN_RATIO_32;
+		for (char c : token.toCharArray()) {
+			hash = (hash ^ c) * AbstractUTF8Token.GOLDEN_RATIO_32;
+		}
+		return hash + tokenCount;
+	}
+}
commit	7c4db76fc323b3b8405898298ea43ab9732ca72e	[log] [tgz]
author	alexander.behm <alexander.behm@123451ca-8445-de46-9d55-352943316053>	Thu Aug 04 06:50:55 2011 +0000
committer	alexander.behm <alexander.behm@123451ca-8445-de46-9d55-352943316053>	Thu Aug 04 06:50:55 2011 +0000
tree	7c9e0f02160851b17e05e950896ef03939a5148d
parent	8ee0fee8dfcc4a7f9dec619f8961b1c67e52a644 [diff]