Started work on improved inverted index. Implemented bulk load procedure.
git-svn-id: https://hyracks.googlecode.com/svn/branches/hyracks_indexes@374 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-examples/hyracks-integration-tests/src/test/java/edu/uci/ics/hyracks/tests/invertedindex/InvertedIndexOperatorsTest.java b/hyracks-examples/hyracks-integration-tests/src/test/java/edu/uci/ics/hyracks/tests/invertedindex/InvertedIndexOperatorsTest.java
index 2cfe438..0e19298 100644
--- a/hyracks-examples/hyracks-integration-tests/src/test/java/edu/uci/ics/hyracks/tests/invertedindex/InvertedIndexOperatorsTest.java
+++ b/hyracks-examples/hyracks-integration-tests/src/test/java/edu/uci/ics/hyracks/tests/invertedindex/InvertedIndexOperatorsTest.java
@@ -4,6 +4,10 @@
import org.junit.Test;
+import edu.uci.ics.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizerFactory;
+import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizerFactory;
+import edu.uci.ics.fuzzyjoin.tokenizer.ITokenFactory;
+import edu.uci.ics.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
import edu.uci.ics.hyracks.api.constraints.PartitionConstraintHelper;
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
@@ -21,9 +25,7 @@
import edu.uci.ics.hyracks.dataflow.std.file.FileSplit;
import edu.uci.ics.hyracks.dataflow.std.file.IFileSplitProvider;
import edu.uci.ics.hyracks.dataflow.std.misc.PrinterOperatorDescriptor;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
import edu.uci.ics.hyracks.storage.am.invertedindex.dataflow.BinaryTokenizerOperatorDescriptor;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizerFactory;
import edu.uci.ics.hyracks.tests.integration.AbstractIntegrationTest;
public class InvertedIndexOperatorsTest extends AbstractIntegrationTest {
@@ -45,7 +47,9 @@
RecordDescriptor tokenizerRecDesc = new RecordDescriptor(new ISerializerDeserializer[] {
UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE });
- IBinaryTokenizerFactory tokenizerFactory = new DelimitedUTF8StringBinaryTokenizerFactory(' ');
+
+ ITokenFactory tokenFactory = new UTF8WordTokenFactory();
+ IBinaryTokenizerFactory tokenizerFactory = new DelimitedUTF8StringBinaryTokenizerFactory(true, false, tokenFactory);
int[] tokenFields = { 1 };
int[] projFields = { 0 };
BinaryTokenizerOperatorDescriptor binaryTokenizer = new BinaryTokenizerOperatorDescriptor(spec,
diff --git a/hyracks-storage-am-invertedindex/pom.xml b/hyracks-storage-am-invertedindex/pom.xml
index df6f6d1..9a464bf 100644
--- a/hyracks-storage-am-invertedindex/pom.xml
+++ b/hyracks-storage-am-invertedindex/pom.xml
@@ -23,7 +23,7 @@
</plugin>
</plugins>
</build>
- <dependencies>
+ <dependencies>
<dependency>
<groupId>edu.uci.ics.hyracks</groupId>
<artifactId>hyracks-storage-common</artifactId>
@@ -60,6 +60,13 @@
<scope>compile</scope>
</dependency>
<dependency>
+ <groupId>edu.uci.ics.fuzzyjoin</groupId>
+ <artifactId>fuzzyjoin-core</artifactId>
+ <version>0.0.2-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.8.1</version>
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java
deleted file mode 100644
index 40cb7da..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.api;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
-
-public interface IBinaryTokenizer {
-
- public void reset(byte[] data, int start, int length);
-
- public boolean hasNext();
-
- public void next();
-
- public int getTokenStartOff();
-
- public int getTokenLength();
-
- public void writeToken(DataOutput dos) throws IOException;
-
- public RecordDescriptor getTokenSchema();
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
deleted file mode 100644
index 7e91fd4..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.api;
-
-import java.io.Serializable;
-
-public interface IBinaryTokenizerFactory extends Serializable {
- public IBinaryTokenizer createBinaryTokenizer();
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListBuilder.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListBuilder.java
new file mode 100644
index 0000000..7cdd9ac
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListBuilder.java
@@ -0,0 +1,17 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+
+public interface IInvertedListBuilder {
+ public boolean startNewList(ITupleReference tuple, int tokenField);
+
+ // returns true if successfully appended
+ // returns false if not enough space in targetBuf
+ public boolean appendElement(ITupleReference tuple, int[] elementFields);
+
+ public void setTargetBuffer(byte[] targetBuf, int startPos);
+
+ public int getListSize();
+
+ public int getPos();
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
index 059a1e2..1533307 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
@@ -15,6 +15,7 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;
+import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizerFactory;
import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
import edu.uci.ics.hyracks.api.dataflow.IOperatorNodePushable;
import edu.uci.ics.hyracks.api.dataflow.value.IRecordDescriptorProvider;
@@ -23,7 +24,6 @@
import edu.uci.ics.hyracks.api.job.IOperatorEnvironment;
import edu.uci.ics.hyracks.api.job.JobSpecification;
import edu.uci.ics.hyracks.dataflow.std.base.AbstractSingleActivityOperatorDescriptor;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
public class BinaryTokenizerOperatorDescriptor extends AbstractSingleActivityOperatorDescriptor {
@@ -50,6 +50,6 @@
public IOperatorNodePushable createPushRuntime(IHyracksStageletContext ctx, IOperatorEnvironment env,
IRecordDescriptorProvider recordDescProvider, int partition, int nPartitions) throws HyracksDataException {
return new BinaryTokenizerOperatorNodePushable(ctx, recordDescProvider.getInputRecordDescriptor(odId, 0),
- recordDescriptors[0], tokenizerFactory.createBinaryTokenizer(), tokenFields, projFields);
+ recordDescriptors[0], tokenizerFactory.createTokenizer(), tokenFields, projFields);
}
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
index 9dac535..d470513 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
@@ -19,6 +19,8 @@
import java.io.IOException;
import java.nio.ByteBuffer;
+import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
+import edu.uci.ics.fuzzyjoin.tokenizer.IToken;
import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
@@ -27,7 +29,6 @@
import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils;
import edu.uci.ics.hyracks.dataflow.std.base.AbstractUnaryInputUnaryOutputOperatorNodePushable;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
public class BinaryTokenizerOperatorNodePushable extends AbstractUnaryInputUnaryOutputOperatorNodePushable {
@@ -84,7 +85,8 @@
builder.reset();
try {
- tokenizer.writeToken(builderDos);
+ IToken token = tokenizer.getToken();
+ token.serializeToken(builderDos);
builder.addFieldEndOffset();
} catch (IOException e) {
throw new HyracksDataException(e.getMessage());
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListBuilder.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListBuilder.java
new file mode 100644
index 0000000..b733103
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListBuilder.java
@@ -0,0 +1,55 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
+
+public class FixedSizeElementInvertedListBuilder implements IInvertedListBuilder {
+ private final int listElementSize;
+ private int listSize = 0;
+
+ private byte[] targetBuf;
+ private int pos;
+
+ public FixedSizeElementInvertedListBuilder(int listElementSize) {
+ this.listElementSize = listElementSize;
+ }
+
+ @Override
+ public boolean startNewList(ITupleReference tuple, int tokenField) {
+ if(pos + listElementSize >= targetBuf.length) return false;
+ else {
+ listSize = 0;
+ return true;
+ }
+ }
+
+ @Override
+ public boolean appendElement(ITupleReference tuple, int[] elementFields) {
+ if(pos + listElementSize >= targetBuf.length) return false;
+
+ for(int i = 0; i < elementFields.length; i++) {
+ int field = elementFields[i];
+ System.arraycopy(tuple.getFieldData(field), tuple.getFieldStart(field), targetBuf, pos, tuple.getFieldLength(field));
+ }
+
+ listSize++;
+
+ return true;
+ }
+
+ @Override
+ public void setTargetBuffer(byte[] targetBuf, int startPos) {
+ this.pos = startPos;
+ this.targetBuf = targetBuf;
+ }
+
+ @Override
+ public int getListSize() {
+ return listSize;
+ }
+
+ @Override
+ public int getPos() {
+ return pos;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java
new file mode 100644
index 0000000..34a2825
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java
@@ -0,0 +1,146 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ByteArrayAccessibleOutputStream;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
+import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
+import edu.uci.ics.hyracks.storage.common.buffercache.ICachedPage;
+import edu.uci.ics.hyracks.storage.common.file.BufferedFileHandle;
+
+public class InvertedIndex {
+ private int rootPageId = 0;
+ private IBufferCache bufferCache;
+ private int fileId;
+
+ public BulkLoadContext beginBulkLoad(IInvertedListBuilder invListBuilder, IBinaryComparator tokenCmp) throws HyracksDataException {
+ BulkLoadContext ctx = new BulkLoadContext(invListBuilder, tokenCmp);
+ ctx.init(rootPageId, fileId);
+ return ctx;
+ }
+
+ public void bulkLoadAddTuple(BulkLoadContext ctx, ITupleReference tuple, int tokenField, int[] listElementFields) throws HyracksDataException {
+
+ // first inverted list, copy token to baaos and start new list
+ if(ctx.currentInvListTokenBaaos.size() == 0) {
+ ctx.currentInvListStartPageId = ctx.currentPageId;
+ ctx.currentInvListStartOffset = ctx.invListBuilder.getPos();
+
+ ctx.currentInvListTokenBaaos.reset();
+ ctx.currentInvListTokenBaaos.write(tuple.getFieldData(tokenField), tuple.getFieldStart(tokenField), tuple.getFieldLength(tokenField));
+
+ if(!ctx.invListBuilder.startNewList(tuple, tokenField)) {
+ ctx.pinNextPage();
+ ctx.invListBuilder.setTargetBuffer(ctx.currentPage.getBuffer().array(), 0);
+ if(!ctx.invListBuilder.startNewList(tuple, tokenField)) {
+ throw new IllegalStateException("Failed to create first inverted list.");
+ }
+ }
+ }
+
+ // create new inverted list?
+ if(ctx.tokenCmp.compare(tuple.getFieldData(tokenField),
+ tuple.getFieldStart(tokenField),
+ tuple.getFieldLength(tokenField),
+ ctx.currentInvListTokenBaaos.getByteArray(),
+ 0,
+ ctx.currentInvListTokenBaaos.size()) != 0) {
+
+ ctx.lastInvListStartPageId = ctx.currentInvListStartPageId;
+ ctx.lastInvListStartOffset = ctx.currentInvListStartOffset;
+
+ ctx.lastInvListTokenBaaos.reset();
+ ctx.lastInvListTokenBaaos.write(ctx.currentInvListTokenBaaos.getByteArray(), 0, ctx.currentInvListTokenBaaos.size());
+
+ ctx.currentInvListTokenBaaos.reset();
+ ctx.currentInvListTokenBaaos.write(tuple.getFieldData(tokenField), tuple.getFieldStart(tokenField), tuple.getFieldLength(tokenField));
+
+ ctx.lastInvListSize = ctx.invListBuilder.getListSize();
+ if(!ctx.invListBuilder.startNewList(tuple, tokenField)) {
+ ctx.pinNextPage();
+ ctx.invListBuilder.setTargetBuffer(ctx.currentPage.getBuffer().array(), 0);
+ if(!ctx.invListBuilder.startNewList(tuple, tokenField)) {
+ throw new IllegalStateException("Failed to start new inverted list after switching to a new page.");
+ }
+ }
+
+ ctx.currentInvListStartPageId = ctx.currentPageId;
+ ctx.currentInvListStartOffset = ctx.invListBuilder.getPos();
+ }
+
+ // append to current inverted list
+ if(!ctx.invListBuilder.appendElement(tuple, listElementFields)) {
+ ctx.pinNextPage();
+ ctx.invListBuilder.setTargetBuffer(ctx.currentPage.getBuffer().array(), 0);
+ if(!ctx.invListBuilder.appendElement(tuple, listElementFields)) {
+ throw new IllegalStateException("Failed to append element to inverted list after switching to a new page.");
+ }
+ }
+ }
+
+ // returns size of last inverted list
+ public int endBulkLoad(BulkLoadContext ctx) throws HyracksDataException {
+ ctx.lastInvListStartPageId = ctx.currentInvListStartPageId;
+ ctx.lastInvListStartOffset = ctx.currentInvListStartOffset;
+
+ ctx.lastInvListTokenBaaos.reset();
+ ctx.lastInvListTokenBaaos.write(ctx.currentInvListTokenBaaos.getByteArray(), 0, ctx.currentInvListTokenBaaos.size());
+
+ ctx.deinit();
+ return ctx.invListBuilder.getListSize();
+ }
+
+ public final class BulkLoadContext {
+ private int lastInvListSize;
+ private int lastInvListStartPageId;
+ private int lastInvListStartOffset;
+ private final ByteArrayAccessibleOutputStream lastInvListTokenBaaos = new ByteArrayAccessibleOutputStream();
+
+ private int currentInvListStartPageId;
+ private int currentInvListStartOffset;
+ private final ByteArrayAccessibleOutputStream currentInvListTokenBaaos = new ByteArrayAccessibleOutputStream();
+
+ private int currentPageId;
+ private ICachedPage currentPage;
+ private final IInvertedListBuilder invListBuilder;
+ private final IBinaryComparator tokenCmp;
+
+ public BulkLoadContext(IInvertedListBuilder invListBuilder, IBinaryComparator tokenCmp) {
+ this.invListBuilder = invListBuilder;
+ this.tokenCmp = tokenCmp;
+ }
+
+ public void init(int startPageId, int fileId) throws HyracksDataException {
+ currentPageId = startPageId;
+ currentPage = bufferCache.pin(BufferedFileHandle.getDiskPageId(fileId, currentPageId), true);
+ invListBuilder.setTargetBuffer(currentPage.getBuffer().array(), 0);
+ }
+
+ public void deinit() throws HyracksDataException {
+ if(currentPage != null) bufferCache.unpin(currentPage);
+ }
+
+ public void pinNextPage() throws HyracksDataException {
+ bufferCache.unpin(currentPage);
+ currentPageId++;
+ currentPage = bufferCache.pin(BufferedFileHandle.getDiskPageId(fileId, currentPageId), true);
+ }
+
+ public ByteArrayAccessibleOutputStream getLastInvListTokenBaaos() {
+ return lastInvListTokenBaaos;
+ }
+
+ public int getLastInvListStartPageId() {
+ return lastInvListStartPageId;
+ }
+
+ public int getLastInvListStartOffset() {
+ return lastInvListStartOffset;
+ }
+
+ public int getLastInvListSize() {
+ return lastInvListSize;
+ }
+ };
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java
index dc5bd0c..47b4310 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java
@@ -21,6 +21,8 @@
import java.util.ArrayList;
import java.util.List;
+import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
+import edu.uci.ics.fuzzyjoin.tokenizer.IToken;
import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
@@ -41,7 +43,6 @@
import edu.uci.ics.hyracks.storage.am.btree.impls.RangeSearchCursor;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.TreeIndexOp;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearcher;
@@ -135,7 +136,8 @@
queryTokenBuilder.reset();
try {
- queryTokenizer.writeToken(queryTokenDos);
+ IToken token = queryTokenizer.getToken();
+ token.serializeToken(queryTokenDos);
queryTokenBuilder.addFieldEndOffset();
} catch (IOException e) {
throw new HyracksDataException(e);
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
deleted file mode 100644
index 73635f9..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
-import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
-import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-
-public class DelimitedUTF8StringBinaryTokenizer implements IBinaryTokenizer {
-
- private static final RecordDescriptor tokenSchema = new RecordDescriptor(
- new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE });
-
- private final char delimiter;
- private byte[] data;
- private int start;
- private int length;
-
- private int tokenLength;
- private int tokenStart;
- private int pos;
-
- public DelimitedUTF8StringBinaryTokenizer(char delimiter) {
- this.delimiter = delimiter;
- }
-
- @Override
- public int getTokenLength() {
- return tokenLength;
- }
-
- @Override
- public int getTokenStartOff() {
- return tokenStart;
- }
-
- @Override
- public boolean hasNext() {
- if (pos >= start + length)
- return false;
- else
- return true;
- }
-
- @Override
- public void next() {
- tokenLength = 0;
- tokenStart = pos;
- while (pos < start + length) {
- int len = StringUtils.charSize(data, pos);
- char ch = StringUtils.charAt(data, pos);
- pos += len;
- if (ch == delimiter) {
- break;
- }
- tokenLength += len;
- }
- }
-
- @Override
- public void reset(byte[] data, int start, int length) {
- this.data = data;
- this.start = start;
- this.pos = start;
- this.length = length;
- this.tokenLength = 0;
- this.tokenStart = 0;
- pos += 2; // UTF-8 specific
- }
-
- @Override
- public void writeToken(DataOutput dos) throws IOException {
- // WARNING: 2-byte length indicator is specific to UTF-8
- dos.writeShort((short) tokenLength);
- dos.write(data, tokenStart, tokenLength);
- }
-
- @Override
- public RecordDescriptor getTokenSchema() {
- return tokenSchema;
- }
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
deleted file mode 100644
index e3e0be3..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
-
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
-
-public class DelimitedUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
-
- private static final long serialVersionUID = 1L;
- private final char delimiter;
-
- public DelimitedUTF8StringBinaryTokenizerFactory(char delimiter) {
- this.delimiter = delimiter;
- }
-
- @Override
- public IBinaryTokenizer createBinaryTokenizer() {
- return new DelimitedUTF8StringBinaryTokenizer(delimiter);
- }
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java
deleted file mode 100644
index 54fc371..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
-import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
-import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-
-public class HashedQGramUTF8StringBinaryTokenizer implements IBinaryTokenizer {
-
- private static final RecordDescriptor tokenSchema = new RecordDescriptor(
- new ISerializerDeserializer[] { IntegerSerializerDeserializer.INSTANCE });
-
- private final boolean prePost;
- private final int q;
- private byte[] data;
- private int start;
- private int length;
- private int gramNum;
- private int utflen;
-
- private final char PRECHAR = '#';
- private final char POSTCHAR = '$';
-
- private int charPos;
- private int pos;
- private int hashedGram;
-
- HashedQGramUTF8StringBinaryTokenizer(int q, boolean prePost) {
- this.prePost = prePost;
- this.q = q;
- }
-
- @Override
- public int getTokenLength() {
- // the produced token (hashed q-gram) is derived from data
- // but not contained in it
- // therefore this call does not make sense
- return -1;
- }
-
- @Override
- public int getTokenStartOff() {
- // the produced token (hashed q-gram) is derived from data
- // but not contained in it
- // therefore this call does not make sense
- return -1;
- }
-
- @Override
- public boolean hasNext() {
- if ((prePost && pos >= start + length) || (!prePost && pos >= start + length - q))
- return false;
- else
- return true;
- }
-
- @Override
- public void next() {
- hashedGram = 0;
- if (prePost) {
- if (gramNum < q) {
- for (int i = 0; i < q - gramNum; i++) {
- hashedGram = 31 * hashedGram + PRECHAR;
- }
-
- int tmpPos = pos;
- for (int i = 0; i < gramNum; i++) {
- hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
- tmpPos += StringUtils.charSize(data, tmpPos);
- }
- } else {
- int stopStr = Math.min(charPos + q, utflen);
- int tmpPos = pos;
- for (int i = charPos; i < stopStr; i++) {
- hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
- tmpPos += StringUtils.charSize(data, tmpPos);
- }
-
- int stopPost = (charPos + q) - (utflen);
- for (int i = 0; i < stopPost; i++) {
- hashedGram = 31 * hashedGram + POSTCHAR;
- }
- pos += StringUtils.charSize(data, pos);
- charPos++;
- }
- gramNum++;
- } else {
- int tmpPos = pos;
- for (int i = charPos; i < charPos + q; i++) {
- hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
- tmpPos += StringUtils.charSize(data, tmpPos);
- }
- pos += StringUtils.charSize(data, pos);
- charPos++;
- }
- }
-
- @Override
- public void reset(byte[] data, int start, int length) {
- this.data = data;
- this.start = start;
- this.length = length;
- this.utflen = StringUtils.getUTFLen(data, start);
- this.pos = start + 2; // UTF-8 specific
- this.gramNum = 1;
- this.charPos = 0;
- }
-
- @Override
- public void writeToken(DataOutput dos) throws IOException {
- dos.writeInt(hashedGram);
- }
-
- public char getPreChar() {
- return PRECHAR;
- }
-
- public char getPostChar() {
- return POSTCHAR;
- }
-
- @Override
- public RecordDescriptor getTokenSchema() {
- return tokenSchema;
- }
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java
deleted file mode 100644
index a11fe8a..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
-
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
-
-public class HashedQGramUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
-
- private static final long serialVersionUID = 1L;
- private final int q;
- private final boolean prePost;
-
- public HashedQGramUTF8StringBinaryTokenizerFactory(int q, boolean prePost) {
- this.q = q;
- this.prePost = prePost;
- }
-
- @Override
- public IBinaryTokenizer createBinaryTokenizer() {
- return new HashedQGramUTF8StringBinaryTokenizer(q, prePost);
- }
-}
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java
index cba3c79..373e3b6 100644
--- a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java
@@ -27,6 +27,10 @@
import org.junit.Test;
+import edu.uci.ics.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizer;
+import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
+import edu.uci.ics.fuzzyjoin.tokenizer.ITokenFactory;
+import edu.uci.ics.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
@@ -59,10 +63,8 @@
import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.TreeIndexOp;
import edu.uci.ics.hyracks.storage.am.common.tuples.TypeAwareTupleWriterFactory;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
import edu.uci.ics.hyracks.storage.am.invertedindex.impls.SimpleConjunctiveSearcher;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer;
import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
import edu.uci.ics.hyracks.storage.common.buffercache.ICacheMemoryAllocator;
import edu.uci.ics.hyracks.storage.common.file.IFileMapProvider;
@@ -208,8 +210,9 @@
FrameTupleReference queryTuple = new FrameTupleReference();
String query = "computer hyracks fast";
- char queryDelimiter = ' ';
- IBinaryTokenizer queryTokenizer = new DelimitedUTF8StringBinaryTokenizer(queryDelimiter);
+
+ ITokenFactory tokenFactory = new UTF8WordTokenFactory();
+ IBinaryTokenizer queryTokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
queryTb.reset();
UTF8StringSerializerDeserializer.INSTANCE.serialize(query, queryDos);
@@ -255,26 +258,26 @@
}
}
- /*
- * IBinaryComparator[] searchCmps = new IBinaryComparator[1];
- * searchCmps[0] =
- * UTF8StringBinaryComparatorFactory.INSTANCE.createBinaryComparator();
- * MultiComparator searchCmp = new MultiComparator(typeTraits,
- * searchCmps);
- *
- * // ordered scan IBTreeCursor scanCursor = new
- * RangeSearchCursor(leafFrame); RangePredicate nullPred = new
- * RangePredicate(true, null, null, true, true, null); BTreeOpContext
- * searchOpCtx = btree.createOpContext(BTreeOp.BTO_SEARCH, leafFrame,
- * interiorFrame, metaFrame); btree.search(scanCursor, nullPred,
- * searchOpCtx);
- *
- * try { while (scanCursor.hasNext()) { scanCursor.next();
- * ITupleReference frameTuple = scanCursor.getTuple(); String rec =
- * cmp.printTuple(frameTuple, btreeSerde); System.out.println(rec); } }
- * catch (Exception e) { e.printStackTrace(); } finally {
- * scanCursor.close(); }
- */
+//
+// IBinaryComparator[] searchCmps = new IBinaryComparator[1];
+// searchCmps[0] =
+// UTF8StringBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+// MultiComparator searchCmp = new MultiComparator(typeTraits,
+// searchCmps);
+//
+// // ordered scan IBTreeCursor scanCursor = new
+// RangeSearchCursor(leafFrame); RangePredicate nullPred = new
+// RangePredicate(true, null, null, true, true, null); BTreeOpContext
+// searchOpCtx = btree.createOpContext(BTreeOp.BTO_SEARCH, leafFrame,
+// interiorFrame, metaFrame); btree.search(scanCursor, nullPred,
+// searchOpCtx);
+//
+// try { while (scanCursor.hasNext()) { scanCursor.next();
+// ITupleReference frameTuple = scanCursor.getTuple(); String rec =
+// cmp.printTuple(frameTuple, btreeSerde); System.out.println(rec); } }
+// catch (Exception e) { e.printStackTrace(); } finally {
+// scanCursor.close(); }
+
btree.close();
bufferCache.closeFile(fileId);
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java
index 47c75cf..7181b77 100644
--- a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java
@@ -19,16 +19,16 @@
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutputStream;
-import java.util.ArrayList;
import java.util.Random;
import org.junit.Assert;
import org.junit.Test;
-import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunction;
+import edu.uci.ics.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizer;
+import edu.uci.ics.fuzzyjoin.tokenizer.IToken;
+import edu.uci.ics.fuzzyjoin.tokenizer.ITokenFactory;
+import edu.uci.ics.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
import edu.uci.ics.hyracks.dataflow.common.comm.io.ByteArrayAccessibleOutputStream;
-import edu.uci.ics.hyracks.dataflow.common.data.hash.UTF8StringBinaryHashFunctionFactory;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
public class TokenizerTest {
@@ -43,7 +43,8 @@
int maxWordLength = 50;
char delimiter = ' ';
- DelimitedUTF8StringBinaryTokenizer tok = new DelimitedUTF8StringBinaryTokenizer(delimiter);
+ ITokenFactory tokenFactory = new UTF8WordTokenFactory();
+ DelimitedUTF8StringBinaryTokenizer tok = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
// create a bunch of documents
for (int i = 0; i < numDocs; i++) {
@@ -78,7 +79,8 @@
// write token to outputstream
ByteArrayAccessibleOutputStream baaosWrite = new ByteArrayAccessibleOutputStream();
DataOutputStream dosWrite = new DataOutputStream(baaosWrite);
- tok.writeToken(dosWrite);
+ IToken token = tok.getToken();
+ token.serializeToken(dosWrite);
// deserialize token to get string object
ByteArrayInputStream inStream = new ByteArrayInputStream(baaosWrite.toByteArray());
@@ -90,7 +92,8 @@
}
}
- // testing HashedQGramUTF8StringBinaryTokenizer
+ /*
+ // testing HashedNGramUTF8StringBinaryTokenizer
@Test
public void test02() throws Exception {
Random rnd = new Random(50);
@@ -115,27 +118,16 @@
// randomly choose pre and postfixing
boolean prePost = false;
- if (Math.abs(rnd.nextInt()) % 2 == 0)
- prePost = true;
+ //if (Math.abs(rnd.nextInt()) % 2 == 0)
+ // prePost = true;
- HashedQGramUTF8StringBinaryTokenizer qgramTok = new HashedQGramUTF8StringBinaryTokenizer(q, prePost);
-
- String extendedString = str;
- if (prePost) {
- // pre and postfix string
- StringBuilder strBuilder = new StringBuilder();
- for (int j = 0; j < q - 1; j++)
- strBuilder.append(qgramTok.getPreChar());
- strBuilder.append(str);
- for (int j = 0; j < q - 1; j++)
- strBuilder.append(qgramTok.getPostChar());
- extendedString = strBuilder.toString();
- }
-
+ ITokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
+ NGramUTF8StringBinaryTokenizer qgramTok = new NGramUTF8StringBinaryTokenizer(q, prePost, true, false, tokenFactory);
+
// generate q-grams in deserialized form
ArrayList<String> javaGrams = new ArrayList<String>();
- for (int j = 0; j < extendedString.length() - q + 1; j++) {
- javaGrams.add(extendedString.substring(j, j + q));
+ for (int j = 0; j < str.length() - q + 1; j++) {
+ javaGrams.add(str.substring(j, j + q).toLowerCase());
}
// serialize string for use in binary gram tokenizer
@@ -153,7 +145,8 @@
// write token to outputstream
ByteArrayAccessibleOutputStream baaosWrite = new ByteArrayAccessibleOutputStream();
DataOutputStream dosWrite = new DataOutputStream(baaosWrite);
- qgramTok.writeToken(dosWrite);
+ IToken token = qgramTok.getToken();
+ token.serializeToken(dosWrite);
// deserialize token to get hashed gram
ByteArrayInputStream inStream = new ByteArrayInputStream(baaosWrite.toByteArray());
@@ -175,6 +168,7 @@
}
}
}
+ */
public static String randomString(int length, Random random) {
int maxAttempts = 1000;