Removed dependency on fuzzyjoin. Moved tokenizers into invertedindex package.
git-svn-id: https://hyracks.googlecode.com/svn/branches/hyracks_indexes@523 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-storage-am-invertedindex/pom.xml b/hyracks-storage-am-invertedindex/pom.xml
index 34f1135..769a3f4 100644
--- a/hyracks-storage-am-invertedindex/pom.xml
+++ b/hyracks-storage-am-invertedindex/pom.xml
@@ -58,14 +58,7 @@
<version>0.1.5</version>
<type>jar</type>
<scope>compile</scope>
- </dependency>
- <dependency>
- <groupId>edu.uci.ics.fuzzyjoin</groupId>
- <artifactId>fuzzyjoin-core</artifactId>
- <version>0.0.3</version>
- <type>jar</type>
- <scope>compile</scope>
- </dependency>
+ </dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
index 1533307..83246d6 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
@@ -15,7 +15,6 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizerFactory;
import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
import edu.uci.ics.hyracks.api.dataflow.IOperatorNodePushable;
import edu.uci.ics.hyracks.api.dataflow.value.IRecordDescriptorProvider;
@@ -24,6 +23,7 @@
import edu.uci.ics.hyracks.api.job.IOperatorEnvironment;
import edu.uci.ics.hyracks.api.job.JobSpecification;
import edu.uci.ics.hyracks.dataflow.std.base.AbstractSingleActivityOperatorDescriptor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizerFactory;
public class BinaryTokenizerOperatorDescriptor extends AbstractSingleActivityOperatorDescriptor {
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
index d470513..0647f45 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
@@ -19,8 +19,6 @@
import java.io.IOException;
import java.nio.ByteBuffer;
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
-import edu.uci.ics.fuzzyjoin.tokenizer.IToken;
import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
@@ -29,6 +27,8 @@
import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils;
import edu.uci.ics.hyracks.dataflow.std.base.AbstractUnaryInputUnaryOutputOperatorNodePushable;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
public class BinaryTokenizerOperatorNodePushable extends AbstractUnaryInputUnaryOutputOperatorNodePushable {
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
index 5585856..d1fba3b 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
@@ -15,17 +15,12 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
-import java.io.ByteArrayInputStream;
-import java.io.DataInput;
-import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
-import edu.uci.ics.fuzzyjoin.tokenizer.IToken;
import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
@@ -52,6 +47,8 @@
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearcher;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
public class TOccurrenceSearcher implements IInvertedIndexSearcher {
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java
index 18b870b..30d67f0 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java
@@ -19,11 +19,11 @@
import java.nio.ByteBuffer;
import java.util.List;
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
public class TOccurrenceSearcherSuffixProbeOnly extends TOccurrenceSearcher {
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java
index 604c68d..f8bc1ab 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java
@@ -19,12 +19,12 @@
import java.nio.ByteBuffer;
import java.util.List;
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
public class TOccurrenceSearcherSuffixScanOnly extends TOccurrenceSearcher {
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..bbb32d6
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
@@ -0,0 +1,78 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public abstract class AbstractUTF8StringBinaryTokenizer implements
+ IBinaryTokenizer {
+
+ protected byte[] data;
+ protected int start;
+ protected int length;
+ protected int tokenLength;
+ protected int index;
+ protected int utf8Length;
+
+ protected final IntArray tokensStart;
+ protected final IntArray tokensLength;
+ protected final IToken token;
+
+ protected final boolean ignoreTokenCount;
+ protected final boolean sourceHasTypeTag;
+
+ public AbstractUTF8StringBinaryTokenizer(boolean ignoreTokenCount,
+ boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+ this.ignoreTokenCount = ignoreTokenCount;
+ this.sourceHasTypeTag = sourceHasTypeTag;
+ if (!ignoreTokenCount) {
+ tokensStart = new IntArray();
+ tokensLength = new IntArray();
+ } else {
+ tokensStart = null;
+ tokensLength = null;
+ }
+ token = tokenFactory.createToken();
+ }
+
+ @Override
+ public IToken getToken() {
+ return token;
+ }
+
+ @Override
+ public void reset(byte[] data, int start, int length) {
+ this.start = start;
+ index = this.start;
+ if (sourceHasTypeTag) {
+ index++; // skip type tag
+ }
+ utf8Length = StringUtils.getUTFLen(data, index);
+ index += 2; // skip utf8 length indicator
+ this.data = data;
+ this.length = length + start;
+
+ tokenLength = 0;
+ if (!ignoreTokenCount) {
+ tokensStart.reset();
+ tokensLength.reset();
+ }
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java
new file mode 100644
index 0000000..92d6ac2
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java
@@ -0,0 +1,106 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public abstract class AbstractUTF8Token implements IToken {
+ public static final int GOLDEN_RATIO_32 = 0x09e3779b9;
+
+ protected int length;
+ protected int tokenLength;
+ protected int start;
+ protected int tokenCount;
+ protected byte[] data;
+ protected final byte tokenTypeTag;
+ protected final byte countTypeTag;
+
+ public AbstractUTF8Token() {
+ tokenTypeTag = -1;
+ countTypeTag = -1;
+ }
+
+ public AbstractUTF8Token(byte tokenTypeTag, byte countTypeTag) {
+ this.tokenTypeTag = tokenTypeTag;
+ this.countTypeTag = countTypeTag;
+ }
+
+ @Override
+ public byte[] getData() {
+ return data;
+ }
+
+ @Override
+ public int getLength() {
+ return length;
+ }
+
+ public int getLowerCaseUTF8Len(int size) {
+ int lowerCaseUTF8Len = 0;
+ int pos = start;
+ for (int i = 0; i < size; i++) {
+ char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+ lowerCaseUTF8Len += StringUtils.getModifiedUTF8Len(c);
+ pos += StringUtils.charSize(data, pos);
+ }
+ return lowerCaseUTF8Len;
+ }
+
+ @Override
+ public int getStart() {
+ return start;
+ }
+
+ @Override
+ public int getTokenLength() {
+ return tokenLength;
+ }
+
+ public void handleCountTypeTag(DataOutput dos) throws IOException {
+ if (countTypeTag > 0) {
+ dos.write(countTypeTag);
+ }
+ }
+
+ public void handleTokenTypeTag(DataOutput dos) throws IOException {
+ if (tokenTypeTag > 0) {
+ dos.write(tokenTypeTag);
+ }
+ }
+
+ @Override
+ public void reset(byte[] data, int start, int length, int tokenLength,
+ int tokenCount) {
+ this.data = data;
+ this.start = start;
+ this.length = length;
+ this.tokenLength = tokenLength;
+ this.tokenCount = tokenCount;
+ }
+
+ @Override
+ public void serializeTokenCount(DataOutput dos) throws IOException {
+ handleCountTypeTag(dos);
+ dos.writeInt(tokenCount);
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8TokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8TokenFactory.java
new file mode 100644
index 0000000..3b0b82d
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8TokenFactory.java
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public abstract class AbstractUTF8TokenFactory implements ITokenFactory {
+ private static final long serialVersionUID = 1L;
+ protected final byte tokenTypeTag;
+ protected final byte countTypeTag;
+
+ public AbstractUTF8TokenFactory() {
+ tokenTypeTag = -1;
+ countTypeTag = -1;
+ }
+
+ public AbstractUTF8TokenFactory(byte tokenTypeTag, byte countTypeTag) {
+ this.tokenTypeTag = tokenTypeTag;
+ this.countTypeTag = countTypeTag;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..de9ad2c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
@@ -0,0 +1,87 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class DelimitedUTF8StringBinaryTokenizer extends
+ AbstractUTF8StringBinaryTokenizer {
+
+ public DelimitedUTF8StringBinaryTokenizer(boolean ignoreTokenCount,
+ boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+ super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+ }
+
+ @Override
+ public boolean hasNext() {
+ // skip delimiters
+ while (index < length && isSeparator(StringUtils.charAt(data, index))) {
+ index += StringUtils.charSize(data, index);
+ }
+ return index < length;
+ }
+
+ private boolean isSeparator(char c) {
+ return !(Character.isLetterOrDigit(c)
+ || Character.getType(c) == Character.OTHER_LETTER || Character
+ .getType(c) == Character.OTHER_NUMBER);
+ }
+
+ @Override
+ public void next() {
+ tokenLength = 0;
+ int currentTokenStart = index;
+ while (index < length && !isSeparator(StringUtils.charAt(data, index))) {
+ index += StringUtils.charSize(data, index);
+ tokenLength++;
+ }
+ int tokenCount = 1;
+ if (tokenLength > 0 && !ignoreTokenCount) {
+ // search if we got the same token before
+ for (int i = 0; i < tokensStart.length(); ++i) {
+ if (tokenLength == tokensLength.get(i)) {
+ int tokenStart = tokensStart.get(i);
+ tokenCount++; // assume we found it
+ int offset = 0;
+ int currLength = 0;
+ while (currLength < tokenLength) {
+ // case insensitive comparison
+ if (StringUtils.toLowerCase(StringUtils.charAt(data,
+ currentTokenStart + offset)) != StringUtils
+ .toLowerCase(StringUtils.charAt(data,
+ tokenStart + offset))) {
+ tokenCount--;
+ break;
+ }
+ offset += StringUtils.charSize(data, currentTokenStart
+ + offset);
+ currLength++;
+ }
+ }
+ }
+ // add the new token to the list of seen tokens
+ tokensStart.add(currentTokenStart);
+ tokensLength.add(tokenLength);
+ }
+
+ // set token
+ token.reset(data, currentTokenStart, index, tokenLength, tokenCount);
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
new file mode 100644
index 0000000..4a350b3
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
@@ -0,0 +1,42 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class DelimitedUTF8StringBinaryTokenizerFactory implements
+ IBinaryTokenizerFactory {
+
+ private static final long serialVersionUID = 1L;
+ private final boolean ignoreTokenCount;
+ private final boolean sourceHasTypeTag;
+ private final ITokenFactory tokenFactory;
+
+ public DelimitedUTF8StringBinaryTokenizerFactory(boolean ignoreTokenCount,
+ boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+ this.ignoreTokenCount = ignoreTokenCount;
+ this.sourceHasTypeTag = sourceHasTypeTag;
+ this.tokenFactory = tokenFactory;
+ }
+
+ @Override
+ public IBinaryTokenizer createTokenizer() {
+ return new DelimitedUTF8StringBinaryTokenizer(ignoreTokenCount,
+ sourceHasTypeTag, tokenFactory);
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java
new file mode 100644
index 0000000..25d1a2c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java
@@ -0,0 +1,64 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class HashedUTF8NGramToken extends UTF8NGramToken {
+ public HashedUTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public void serializeToken(DataOutput dos) throws IOException {
+ handleTokenTypeTag(dos);
+
+ int hash = GOLDEN_RATIO_32;
+
+ // pre chars
+ for (int i = 0; i < numPreChars; i++) {
+ hash ^= PRECHAR;
+ hash *= GOLDEN_RATIO_32;
+ }
+
+ // regular chars
+ int numRegGrams = tokenLength - numPreChars - numPostChars;
+ int pos = start;
+ for (int i = 0; i < numRegGrams; i++) {
+ hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+ hash *= GOLDEN_RATIO_32;
+ pos += StringUtils.charSize(data, pos);
+ }
+
+ // post chars
+ for (int i = 0; i < numPostChars; i++) {
+ hash ^= POSTCHAR;
+ hash *= GOLDEN_RATIO_32;
+ }
+
+ // token count
+ hash += tokenCount;
+
+ dos.writeInt(hash);
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java
new file mode 100644
index 0000000..4a87793
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class HashedUTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ public HashedUTF8NGramTokenFactory() {
+ super();
+ }
+
+ public HashedUTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public IToken createToken() {
+ return new HashedUTF8NGramToken(tokenTypeTag, countTypeTag);
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java
new file mode 100644
index 0000000..55237ce
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java
@@ -0,0 +1,88 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class HashedUTF8WordToken extends UTF8WordToken {
+
+ private int hash = 0;
+
+ public HashedUTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (o == null) {
+ return false;
+ }
+ if (!(o instanceof IToken)) {
+ return false;
+ }
+ IToken t = (IToken) o;
+ if (t.getTokenLength() != tokenLength) {
+ return false;
+ }
+ int offset = 0;
+ for (int i = 0; i < tokenLength; i++) {
+ if (StringUtils.charAt(t.getData(), t.getStart() + offset) != StringUtils
+ .charAt(data, start + offset)) {
+ return false;
+ }
+ offset += StringUtils.charSize(data, start + offset);
+ }
+ return true;
+ }
+
+ @Override
+ public int hashCode() {
+ return hash;
+ }
+
+ @Override
+ public void reset(byte[] data, int start, int length, int tokenLength,
+ int tokenCount) {
+ super.reset(data, start, length, tokenLength, tokenCount);
+
+ // pre-compute hash value using JAQL-like string hashing
+ int pos = start;
+ hash = GOLDEN_RATIO_32;
+ for (int i = 0; i < tokenLength; i++) {
+ hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+ hash *= GOLDEN_RATIO_32;
+ pos += StringUtils.charSize(data, pos);
+ }
+ hash += tokenCount;
+ }
+
+ @Override
+ public void serializeToken(DataOutput dos) throws IOException {
+ if (tokenTypeTag > 0) {
+ dos.write(tokenTypeTag);
+ }
+
+ // serialize hash value
+ dos.writeInt(hash);
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java
new file mode 100644
index 0000000..318f041
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class HashedUTF8WordTokenFactory extends AbstractUTF8TokenFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ public HashedUTF8WordTokenFactory() {
+ super();
+ }
+
+ public HashedUTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public IToken createToken() {
+ return new HashedUTF8WordToken(tokenTypeTag, countTypeTag);
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizer.java
new file mode 100644
index 0000000..05c6d0b
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizer.java
@@ -0,0 +1,30 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public interface IBinaryTokenizer {
+ public IToken getToken();
+
+ public boolean hasNext();
+
+ public void next();
+
+ public void reset(byte[] data, int start, int length);
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizerFactory.java
new file mode 100644
index 0000000..bfe78ee
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizerFactory.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.Serializable;
+
+public interface IBinaryTokenizerFactory extends Serializable {
+ public IBinaryTokenizer createTokenizer();
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/INGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/INGramToken.java
new file mode 100644
index 0000000..befc6d2
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/INGramToken.java
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public interface INGramToken {
+ public int getNumPostChars();
+
+ public int getNumPreChars();
+
+ public void setNumPrePostChars(int numPreChars, int numPostChars);
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IToken.java
new file mode 100644
index 0000000..c1840d7
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IToken.java
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public interface IToken {
+ public byte[] getData();
+
+ public int getLength();
+
+ public int getStart();
+
+ public int getTokenLength();
+
+ public void reset(byte[] data, int start, int length, int tokenLength,
+ int tokenCount);
+
+ public void serializeToken(DataOutput dos) throws IOException;
+
+ public void serializeTokenCount(DataOutput dos) throws IOException;
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/ITokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/ITokenFactory.java
new file mode 100644
index 0000000..8b5d71d
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/ITokenFactory.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.Serializable;
+
+public interface ITokenFactory extends Serializable {
+ public IToken createToken();
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IntArray.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IntArray.java
new file mode 100644
index 0000000..2eb9ff4
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IntArray.java
@@ -0,0 +1,80 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.util.Arrays;
+
+public class IntArray {
+ private static final int SIZE = 128;
+
+ private int[] data;
+ private int length;
+
+ public IntArray() {
+ data = new int[SIZE];
+ length = 0;
+ }
+
+ public void add(int d) {
+ if (length == data.length) {
+ data = Arrays.copyOf(data, data.length << 1);
+ }
+ data[length++] = d;
+ }
+
+ public int[] get() {
+ return data;
+ }
+
+ public int get(int i) {
+ return data[i];
+ }
+
+ public int length() {
+ return length;
+ }
+
+ public void reset() {
+ length = 0;
+ }
+
+ public void sort() {
+ sort(0, length);
+ }
+
+ public void sort(int start, int end) {
+ Arrays.sort(data, start, end);
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder out = new StringBuilder();
+ out.append('[');
+ for (int i = 0; i < length; ++i) {
+ out.append(data[i]);
+ if (i < length - 1) {
+ out.append(',');
+ out.append(' ');
+ }
+ }
+ out.append(']');
+ return out.toString();
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..2a13f83
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
@@ -0,0 +1,123 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class NGramUTF8StringBinaryTokenizer extends
+ AbstractUTF8StringBinaryTokenizer {
+
+ private int gramLength;
+ private boolean usePrePost;
+
+ private int gramNum;
+ private int totalGrams;
+
+ private final INGramToken concreteToken;
+
+ public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost,
+ boolean ignoreTokenCount, boolean sourceHasTypeTag,
+ ITokenFactory tokenFactory) {
+ super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+ this.gramLength = gramLength;
+ this.usePrePost = usePrePost;
+ concreteToken = (INGramToken) token;
+ }
+
+ @Override
+ public boolean hasNext() {
+ if (gramNum < totalGrams) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ @Override
+ public void next() {
+ int currentTokenStart = index;
+ int tokenCount = 1;
+ int numPreChars = 0;
+ int numPostChars = 0;
+ if (usePrePost) {
+ numPreChars = Math.max(gramLength - gramNum - 1, 0);
+ numPostChars = (gramNum > totalGrams - gramLength) ? gramLength
+ - totalGrams + gramNum : 0;
+ }
+ gramNum++;
+
+ concreteToken.setNumPrePostChars(numPreChars, numPostChars);
+ if (numPreChars == 0) {
+ index += StringUtils.charSize(data, index);
+ }
+
+ // compute token count
+ // ignore pre and post grams for duplicate detection
+ if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
+ int tmpIndex = start;
+ while (tmpIndex < currentTokenStart) {
+ tokenCount++; // assume found
+ int offset = 0;
+ for (int j = 0; j < gramLength; j++) {
+ if (StringUtils.toLowerCase(StringUtils.charAt(data,
+ currentTokenStart + offset)) != StringUtils
+ .toLowerCase(StringUtils.charAt(data, tmpIndex
+ + offset))) {
+ tokenCount--;
+ break;
+ }
+ offset += StringUtils.charSize(data, tmpIndex + offset);
+ }
+ tmpIndex += StringUtils.charSize(data, tmpIndex);
+ }
+ }
+
+ // set token
+ token.reset(data, currentTokenStart, length, gramLength, tokenCount);
+ }
+
+ @Override
+ public void reset(byte[] data, int start, int length) {
+ super.reset(data, start, length);
+ gramNum = 0;
+
+ int numChars = 0;
+ int pos = index;
+ int end = pos + utf8Length;
+ while (pos < end) {
+ numChars++;
+ pos += StringUtils.charSize(data, pos);
+ }
+
+ if (usePrePost) {
+ totalGrams = numChars + gramLength - 1;
+ } else {
+ totalGrams = numChars - gramLength + 1;
+ }
+ }
+
+ public void setGramlength(int gramLength) {
+ this.gramLength = gramLength;
+ }
+
+ public void setPrePost(boolean usePrePost) {
+ this.usePrePost = usePrePost;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java
new file mode 100644
index 0000000..6b6406f
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java
@@ -0,0 +1,86 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class UTF8NGramToken extends AbstractUTF8Token implements INGramToken {
+
+ public final static char PRECHAR = '#';
+
+ public final static char POSTCHAR = '$';
+
+ protected int numPreChars;
+ protected int numPostChars;
+
+ public UTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public int getNumPostChars() {
+ return numPreChars;
+ }
+
+ @Override
+ public int getNumPreChars() {
+ return numPostChars;
+ }
+
+ @Override
+ public void serializeToken(DataOutput dos) throws IOException {
+ handleTokenTypeTag(dos);
+
+ // regular chars
+ int numRegChars = tokenLength - numPreChars - numPostChars;
+
+ // assuming pre and post char need 1-byte each in utf8
+ int tokenUTF8Len = getLowerCaseUTF8Len(numRegChars) + numPreChars
+ + numPostChars;
+
+ // write utf8 length indicator
+ StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+
+ // pre chars
+ for (int i = 0; i < numPreChars; i++) {
+ StringUtils.writeCharAsModifiedUTF8(PRECHAR, dos);
+ }
+
+ int pos = start;
+ for (int i = 0; i < numRegChars; i++) {
+ char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+ StringUtils.writeCharAsModifiedUTF8(c, dos);
+ pos += StringUtils.charSize(data, pos);
+ }
+
+ // post chars
+ for (int i = 0; i < numPostChars; i++) {
+ StringUtils.writeCharAsModifiedUTF8(POSTCHAR, dos);
+ }
+ }
+
+ public void setNumPrePostChars(int numPreChars, int numPostChars) {
+ this.numPreChars = numPreChars;
+ this.numPostChars = numPostChars;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramTokenFactory.java
new file mode 100644
index 0000000..968d8e1
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramTokenFactory.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class UTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ public UTF8NGramTokenFactory() {
+ super();
+ }
+
+ public UTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public IToken createToken() {
+ return new UTF8NGramToken(tokenTypeTag, countTypeTag);
+ }
+
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java
new file mode 100644
index 0000000..25e0cd3
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class UTF8WordToken extends AbstractUTF8Token {
+
+ public UTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public void serializeToken(DataOutput dos) throws IOException {
+ handleTokenTypeTag(dos);
+
+ int tokenUTF8Len = getLowerCaseUTF8Len(tokenLength);
+ StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+ int pos = start;
+ for (int i = 0; i < tokenLength; i++) {
+ char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+ StringUtils.writeCharAsModifiedUTF8(c, dos);
+ pos += StringUtils.charSize(data, pos);
+ }
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordTokenFactory.java
new file mode 100644
index 0000000..4358254
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordTokenFactory.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class UTF8WordTokenFactory extends AbstractUTF8TokenFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ public UTF8WordTokenFactory() {
+ super();
+ }
+
+ public UTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public IToken createToken() {
+ return new UTF8WordToken(tokenTypeTag, countTypeTag);
+ }
+
+}