Merged 500:541 from trunk
git-svn-id: https://hyracks.googlecode.com/svn/branches/hyracks_scheduling@542 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-storage-am-invertedindex/pom.xml b/hyracks-storage-am-invertedindex/pom.xml
index 78a053c..4578a46 100644
--- a/hyracks-storage-am-invertedindex/pom.xml
+++ b/hyracks-storage-am-invertedindex/pom.xml
@@ -2,12 +2,12 @@
<modelVersion>4.0.0</modelVersion>
<groupId>edu.uci.ics.hyracks</groupId>
<artifactId>hyracks-storage-am-invertedindex</artifactId>
- <version>0.1.7-SNAPSHOT</version>
+ <version>0.1.8-SNAPSHOT</version>
<parent>
<groupId>edu.uci.ics.hyracks</groupId>
<artifactId>hyracks</artifactId>
- <version>0.1.7-SNAPSHOT</version>
+ <version>0.1.8-SNAPSHOT</version>
</parent>
<build>
@@ -27,38 +27,38 @@
<dependency>
<groupId>edu.uci.ics.hyracks</groupId>
<artifactId>hyracks-storage-common</artifactId>
- <version>0.1.7-SNAPSHOT</version>
+ <version>0.1.8-SNAPSHOT</version>
<type>jar</type>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>edu.uci.ics.hyracks</groupId>
<artifactId>hyracks-dataflow-common</artifactId>
- <version>0.1.7-SNAPSHOT</version>
+ <version>0.1.8-SNAPSHOT</version>
<type>jar</type>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>edu.uci.ics.hyracks</groupId>
<artifactId>hyracks-dataflow-std</artifactId>
- <version>0.1.7-SNAPSHOT</version>
+ <version>0.1.8-SNAPSHOT</version>
<type>jar</type>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>edu.uci.ics.hyracks</groupId>
<artifactId>hyracks-control-nc</artifactId>
- <version>0.1.7-SNAPSHOT</version>
+ <version>0.1.8-SNAPSHOT</version>
<type>jar</type>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>edu.uci.ics.hyracks</groupId>
<artifactId>hyracks-storage-am-btree</artifactId>
- <version>0.1.7-SNAPSHOT</version>
+ <version>0.1.8-SNAPSHOT</version>
<type>jar</type>
<scope>compile</scope>
- </dependency>
+ </dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java
deleted file mode 100644
index 0d5dc8f..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.api;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
-
-public interface IBinaryTokenizer {
-
- public void reset(byte[] data, int start, int length);
-
- public boolean hasNext();
-
- public void next();
-
- public int getTokenStartOff();
-
- public int getTokenLength();
-
- public int getNumTokens();
-
- public void writeToken(DataOutput dos) throws IOException;
-
- public RecordDescriptor getTokenSchema();
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexOperatorDescriptorHelper.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexOperatorDescriptorHelper.java
new file mode 100644
index 0000000..b3afe4a
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexOperatorDescriptorHelper.java
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.dataflow.std.file.IFileSplitProvider;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.ITreeIndexOperatorDescriptorHelper;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.IIndexRegistryProvider;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.InvertedIndex;
+
+public interface IInvertedIndexOperatorDescriptorHelper extends ITreeIndexOperatorDescriptorHelper {
+ public IFileSplitProvider getInvIndexFileSplitProvider();
+
+ public IBinaryComparatorFactory[] getInvIndexComparatorFactories();
+
+ public ITypeTrait[] getInvIndexTypeTraits();
+
+ public IIndexRegistryProvider<InvertedIndex> getInvIndexRegistryProvider();
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java
index 990936e..fb73a65 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java
@@ -15,14 +15,14 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.api;
-import java.nio.ByteBuffer;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
public interface IInvertedIndexResultCursor {
public boolean hasNext();
public void next();
- public ByteBuffer getBuffer();
+ public ITupleReference getTuple();
- public void reset();
+ public void reset(IInvertedIndexSearcher invIndexSearcher);
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearchModifier.java
similarity index 76%
rename from hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
rename to hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearchModifier.java
index 7e91fd4..8f7a3e3 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearchModifier.java
@@ -15,8 +15,10 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.api;
-import java.io.Serializable;
+import java.util.List;
-public interface IBinaryTokenizerFactory extends Serializable {
- public IBinaryTokenizer createBinaryTokenizer();
+public interface IInvertedIndexSearchModifier {
+ int getOccurrenceThreshold(List<IInvertedListCursor> invListCursors);
+
+ int getPrefixLists(List<IInvertedListCursor> invListCursors);
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java
index 98dc3b9..36d64ea 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java
@@ -15,10 +15,21 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
public interface IInvertedIndexSearcher {
- public void search(ITupleReference queryTuple, int queryFieldIndex) throws Exception;
+ public void search(IInvertedIndexResultCursor resultCursor, ITupleReference queryTuple, int queryFieldIndex,
+ IInvertedIndexSearchModifier searchModifier) throws Exception;
- public IInvertedIndexResultCursor getResultCursor();
+ public IFrameTupleAccessor createResultFrameTupleAccessor();
+
+ public ITupleReference createResultTupleReference();
+
+ public List<ByteBuffer> getResultBuffers();
+
+ public int getNumValidResultBuffers();
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListBuilder.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListBuilder.java
new file mode 100644
index 0000000..aaaef56
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListBuilder.java
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+
+public interface IInvertedListBuilder {
+ public boolean startNewList(ITupleReference tuple, int numTokenFields);
+
+ // returns true if successfully appended
+ // returns false if not enough space in targetBuf
+ public boolean appendElement(ITupleReference tuple, int numTokenFields, int numElementFields);
+
+ public void setTargetBuffer(byte[] targetBuf, int startPos);
+
+ public int getListSize();
+
+ public int getPos();
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListCursor.java
new file mode 100644
index 0000000..9435f3c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedListCursor.java
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+
+public interface IInvertedListCursor extends Comparable<IInvertedListCursor> {
+ void reset(int startPageId, int endPageId, int startOff, int numElements);
+
+ void pinPagesSync() throws HyracksDataException;
+
+ void pinPagesAsync() throws HyracksDataException;
+
+ void unpinPages() throws HyracksDataException;
+
+ boolean hasNext();
+
+ void next();
+
+ ITupleReference getTuple();
+
+ // getters
+ int getNumElements();
+
+ int getStartPageId();
+
+ int getEndPageId();
+
+ int getStartOff();
+
+ // jump to a specific element
+ void positionCursor(int elementIx);
+
+ boolean containsKey(ITupleReference searchTuple, MultiComparator invListCmp);
+
+ // for debugging
+ String printInvList(ISerializerDeserializer[] serdes) throws HyracksDataException;
+
+ String printCurrentElement(ISerializerDeserializer[] serdes) throws HyracksDataException;
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/AbstractInvertedIndexOperatorDescriptor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/AbstractInvertedIndexOperatorDescriptor.java
new file mode 100644
index 0000000..0d0c5bf
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/AbstractInvertedIndexOperatorDescriptor.java
@@ -0,0 +1,151 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;
+
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.job.JobSpecification;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractSingleActivityOperatorDescriptor;
+import edu.uci.ics.hyracks.dataflow.std.file.IFileSplitProvider;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndex;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexFrameFactory;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.IIndexRegistryProvider;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.ITreeIndexOpHelperFactory;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexOperatorDescriptorHelper;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.InvertedIndex;
+import edu.uci.ics.hyracks.storage.common.IStorageManagerInterface;
+
+public abstract class AbstractInvertedIndexOperatorDescriptor extends AbstractSingleActivityOperatorDescriptor
+ implements IInvertedIndexOperatorDescriptorHelper {
+
+ private static final long serialVersionUID = 1L;
+
+ // general
+ protected final IStorageManagerInterface storageManager;
+
+ // btree
+ protected final IFileSplitProvider btreeFileSplitProvider;
+ protected final IIndexRegistryProvider<ITreeIndex> treeIndexRegistryProvider;
+ protected final ITreeIndexFrameFactory interiorFrameFactory;
+ protected final ITreeIndexFrameFactory leafFrameFactory;
+ protected final ITypeTrait[] btreeTypeTraits;
+ protected final IBinaryComparatorFactory[] btreeComparatorFactories;
+ protected final ITreeIndexOpHelperFactory opHelperFactory;
+
+ // inverted index
+ protected final IFileSplitProvider invIndexFileSplitProvider;
+ protected final IIndexRegistryProvider<InvertedIndex> invIndexRegistryProvider;
+ protected final ITypeTrait[] invIndexTypeTraits;
+ protected final IBinaryComparatorFactory[] invIndexComparatorFactories;
+
+ public AbstractInvertedIndexOperatorDescriptor(JobSpecification spec, int inputArity, int outputArity,
+ RecordDescriptor recDesc, IStorageManagerInterface storageManager,
+ IFileSplitProvider btreeFileSplitProvider, IIndexRegistryProvider<ITreeIndex> treeIndexRegistryProvider,
+ ITreeIndexFrameFactory interiorFrameFactory, ITreeIndexFrameFactory leafFrameFactory,
+ ITypeTrait[] btreeTypeTraits, IBinaryComparatorFactory[] btreeComparatorFactories, float btreeFillFactor,
+ ITreeIndexOpHelperFactory opHelperFactory, IFileSplitProvider invIndexFileSplitProvider,
+ IIndexRegistryProvider<InvertedIndex> invIndexRegistryProvider, ITypeTrait[] invIndexTypeTraits,
+ IBinaryComparatorFactory[] invIndexComparatorFactories) {
+ super(spec, inputArity, outputArity);
+
+ // general
+ this.storageManager = storageManager;
+
+ // btree
+ this.btreeFileSplitProvider = btreeFileSplitProvider;
+ this.treeIndexRegistryProvider = treeIndexRegistryProvider;
+ this.interiorFrameFactory = interiorFrameFactory;
+ this.leafFrameFactory = leafFrameFactory;
+ this.btreeTypeTraits = btreeTypeTraits;
+ this.btreeComparatorFactories = btreeComparatorFactories;
+ this.opHelperFactory = opHelperFactory;
+
+ // inverted index
+ this.invIndexFileSplitProvider = invIndexFileSplitProvider;
+ this.invIndexRegistryProvider = invIndexRegistryProvider;
+ this.invIndexTypeTraits = invIndexTypeTraits;
+ this.invIndexComparatorFactories = invIndexComparatorFactories;
+
+ if (outputArity > 0)
+ recordDescriptors[0] = recDesc;
+ }
+
+ @Override
+ public IFileSplitProvider getTreeIndexFileSplitProvider() {
+ return btreeFileSplitProvider;
+ }
+
+ @Override
+ public IBinaryComparatorFactory[] getTreeIndexComparatorFactories() {
+ return btreeComparatorFactories;
+ }
+
+ @Override
+ public ITypeTrait[] getTreeIndexTypeTraits() {
+ return btreeTypeTraits;
+ }
+
+ @Override
+ public ITreeIndexFrameFactory getTreeIndexInteriorFactory() {
+ return interiorFrameFactory;
+ }
+
+ @Override
+ public ITreeIndexFrameFactory getTreeIndexLeafFactory() {
+ return leafFrameFactory;
+ }
+
+ @Override
+ public IStorageManagerInterface getStorageManager() {
+ return storageManager;
+ }
+
+ @Override
+ public IIndexRegistryProvider<ITreeIndex> getTreeIndexRegistryProvider() {
+ return treeIndexRegistryProvider;
+ }
+
+ @Override
+ public RecordDescriptor getRecordDescriptor() {
+ return recordDescriptors[0];
+ }
+
+ @Override
+ public IIndexRegistryProvider<InvertedIndex> getInvIndexRegistryProvider() {
+ return invIndexRegistryProvider;
+ }
+
+ @Override
+ public IBinaryComparatorFactory[] getInvIndexComparatorFactories() {
+ return invIndexComparatorFactories;
+ }
+
+ @Override
+ public IFileSplitProvider getInvIndexFileSplitProvider() {
+ return invIndexFileSplitProvider;
+ }
+
+ @Override
+ public ITypeTrait[] getInvIndexTypeTraits() {
+ return invIndexTypeTraits;
+ }
+
+ @Override
+ public ITreeIndexOpHelperFactory getTreeIndexOpHelperFactory() {
+ return opHelperFactory;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
index 6e7e0d3..b5b1393 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
@@ -23,7 +23,7 @@
import edu.uci.ics.hyracks.api.job.IOperatorEnvironment;
import edu.uci.ics.hyracks.api.job.JobSpecification;
import edu.uci.ics.hyracks.dataflow.std.base.AbstractSingleActivityOperatorDescriptor;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizerFactory;
public class BinaryTokenizerOperatorDescriptor extends AbstractSingleActivityOperatorDescriptor {
@@ -50,6 +50,6 @@
public IOperatorNodePushable createPushRuntime(IHyracksTaskContext ctx, IOperatorEnvironment env,
IRecordDescriptorProvider recordDescProvider, int partition, int nPartitions) throws HyracksDataException {
return new BinaryTokenizerOperatorNodePushable(ctx, recordDescProvider.getInputRecordDescriptor(odId, 0),
- recordDescriptors[0], tokenizerFactory.createBinaryTokenizer(), tokenFields, projFields);
+ recordDescriptors[0], tokenizerFactory.createTokenizer(), tokenFields, projFields);
}
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
index 37e7bd6..1a0091a 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
@@ -27,7 +27,8 @@
import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils;
import edu.uci.ics.hyracks.dataflow.std.base.AbstractUnaryInputUnaryOutputOperatorNodePushable;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
public class BinaryTokenizerOperatorNodePushable extends AbstractUnaryInputUnaryOutputOperatorNodePushable {
@@ -85,7 +86,8 @@
builder.reset();
try {
- tokenizer.writeToken(builderDos);
+ IToken token = tokenizer.getToken();
+ token.serializeToken(builderDos);
builder.addFieldEndOffset();
} catch (IOException e) {
throw new HyracksDataException(e.getMessage());
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexBulkLoadOperatorDescriptor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexBulkLoadOperatorDescriptor.java
new file mode 100644
index 0000000..1ab72fe
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexBulkLoadOperatorDescriptor.java
@@ -0,0 +1,65 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;
+
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.IOperatorNodePushable;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
+import edu.uci.ics.hyracks.api.dataflow.value.IRecordDescriptorProvider;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.api.job.IOperatorEnvironment;
+import edu.uci.ics.hyracks.api.job.JobSpecification;
+import edu.uci.ics.hyracks.dataflow.std.file.IFileSplitProvider;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndex;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexFrameFactory;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.IIndexRegistryProvider;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.ITreeIndexOpHelperFactory;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.InvertedIndex;
+import edu.uci.ics.hyracks.storage.common.IStorageManagerInterface;
+
+public class InvertedIndexBulkLoadOperatorDescriptor extends AbstractInvertedIndexOperatorDescriptor {
+
+ private static final long serialVersionUID = 1L;
+
+ private final int[] fieldPermutation;
+ private final float btreeFillFactor;
+ private final IInvertedListBuilder invListBuilder;
+
+ public InvertedIndexBulkLoadOperatorDescriptor(JobSpecification spec, IStorageManagerInterface storageManager,
+ int[] fieldPermutation, IFileSplitProvider btreeFileSplitProvider,
+ IIndexRegistryProvider<ITreeIndex> treeIndexRegistryProvider, ITreeIndexFrameFactory interiorFrameFactory,
+ ITreeIndexFrameFactory leafFrameFactory, ITypeTrait[] btreeTypeTraits,
+ IBinaryComparatorFactory[] btreeComparatorFactories, float btreeFillFactor,
+ ITreeIndexOpHelperFactory opHelperFactory, IFileSplitProvider invIndexFileSplitProvider,
+ IIndexRegistryProvider<InvertedIndex> invIndexRegistryProvider, ITypeTrait[] invIndexTypeTraits,
+ IBinaryComparatorFactory[] invIndexComparatorFactories, IInvertedListBuilder invListBuilder) {
+ super(spec, 1, 0, null, storageManager, btreeFileSplitProvider, treeIndexRegistryProvider,
+ interiorFrameFactory, leafFrameFactory, btreeTypeTraits, btreeComparatorFactories, btreeFillFactor,
+ opHelperFactory, invIndexFileSplitProvider, invIndexRegistryProvider, invIndexTypeTraits,
+ invIndexComparatorFactories);
+ this.fieldPermutation = fieldPermutation;
+ this.btreeFillFactor = btreeFillFactor;
+ this.invListBuilder = invListBuilder;
+ }
+
+ @Override
+ public IOperatorNodePushable createPushRuntime(IHyracksTaskContext ctx, IOperatorEnvironment env,
+ IRecordDescriptorProvider recordDescProvider, int partition, int nPartitions) {
+ return new InvertedIndexBulkLoadOperatorNodePushable(this, ctx, partition, fieldPermutation, btreeFillFactor,
+ invListBuilder, recordDescProvider);
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexBulkLoadOperatorNodePushable.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexBulkLoadOperatorNodePushable.java
new file mode 100644
index 0000000..d2bd395
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexBulkLoadOperatorNodePushable.java
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;
+
+import java.nio.ByteBuffer;
+
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.IRecordDescriptorProvider;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractUnaryInputSinkOperatorNodePushable;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.IndexHelperOpenMode;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.PermutingFrameTupleReference;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.TreeIndexOpHelper;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.InvertedIndex;
+
+public class InvertedIndexBulkLoadOperatorNodePushable extends AbstractUnaryInputSinkOperatorNodePushable {
+ private final TreeIndexOpHelper treeIndexOpHelper;
+ private float btreeFillFactor;
+
+ private final InvertedIndexOpHelper invIndexOpHelper;
+ protected final IInvertedListBuilder invListBuilder;
+ private InvertedIndex.BulkLoadContext bulkLoadCtx;
+
+ private final IHyracksTaskContext ctx;
+
+ private FrameTupleAccessor accessor;
+ private PermutingFrameTupleReference tuple = new PermutingFrameTupleReference();
+
+ private IRecordDescriptorProvider recordDescProvider;
+
+ public InvertedIndexBulkLoadOperatorNodePushable(AbstractInvertedIndexOperatorDescriptor opDesc,
+ IHyracksTaskContext ctx, int partition, int[] fieldPermutation, float btreeFillFactor,
+ IInvertedListBuilder invListBuilder, IRecordDescriptorProvider recordDescProvider) {
+ treeIndexOpHelper = opDesc.getTreeIndexOpHelperFactory().createTreeIndexOpHelper(opDesc, ctx, partition,
+ IndexHelperOpenMode.CREATE);
+ invIndexOpHelper = new InvertedIndexOpHelper(opDesc, ctx, partition, IndexHelperOpenMode.CREATE);
+ this.btreeFillFactor = btreeFillFactor;
+ this.recordDescProvider = recordDescProvider;
+ this.ctx = ctx;
+ this.invListBuilder = invListBuilder;
+ tuple.setFieldPermutation(fieldPermutation);
+ }
+
+ @Override
+ public void open() throws HyracksDataException {
+ AbstractInvertedIndexOperatorDescriptor opDesc = (AbstractInvertedIndexOperatorDescriptor) treeIndexOpHelper
+ .getOperatorDescriptor();
+ RecordDescriptor recDesc = recordDescProvider.getInputRecordDescriptor(opDesc.getOperatorId(), 0);
+ accessor = new FrameTupleAccessor(treeIndexOpHelper.getHyracksTaskContext().getFrameSize(), recDesc);
+
+ // btree
+ try {
+ treeIndexOpHelper.init();
+ treeIndexOpHelper.getTreeIndex().open(treeIndexOpHelper.getIndexFileId());
+ } catch (Exception e) {
+ // cleanup in case of failure
+ treeIndexOpHelper.deinit();
+ throw new HyracksDataException(e);
+ }
+
+ // inverted index
+ try {
+ invIndexOpHelper.init();
+ invIndexOpHelper.getInvIndex().open(invIndexOpHelper.getInvIndexFileId());
+ bulkLoadCtx = invIndexOpHelper.getInvIndex().beginBulkLoad(invListBuilder, ctx.getFrameSize(),
+ btreeFillFactor);
+ } catch (Exception e) {
+ // cleanup in case of failure
+ invIndexOpHelper.deinit();
+ throw new HyracksDataException(e);
+ }
+ }
+
+ @Override
+ public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
+ accessor.reset(buffer);
+ int tupleCount = accessor.getTupleCount();
+ for (int i = 0; i < tupleCount; i++) {
+ tuple.reset(accessor, i);
+ invIndexOpHelper.getInvIndex().bulkLoadAddTuple(bulkLoadCtx, tuple);
+ }
+ }
+
+ @Override
+ public void close() throws HyracksDataException {
+ try {
+ invIndexOpHelper.getInvIndex().endBulkLoad(bulkLoadCtx);
+ } finally {
+ treeIndexOpHelper.deinit();
+ }
+ }
+
+ @Override
+ public void flush() throws HyracksDataException {
+ }
+}
\ No newline at end of file
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexOpHelper.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexOpHelper.java
new file mode 100644
index 0000000..1eb6757
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/InvertedIndexOpHelper.java
@@ -0,0 +1,154 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;
+
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.api.io.FileReference;
+import edu.uci.ics.hyracks.dataflow.std.file.IFileSplitProvider;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndex;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.ITreeIndexOperatorDescriptorHelper;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.IndexHelperOpenMode;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.IndexRegistry;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexOperatorDescriptorHelper;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.InvertedIndex;
+import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
+import edu.uci.ics.hyracks.storage.common.file.IFileMapProvider;
+
+public final class InvertedIndexOpHelper {
+
+ private InvertedIndex invIndex;
+ private int invIndexFileId = -1;
+ private int partition;
+
+ private IInvertedIndexOperatorDescriptorHelper opDesc;
+ private IHyracksTaskContext ctx;
+
+ private IndexHelperOpenMode mode;
+
+ public InvertedIndexOpHelper(IInvertedIndexOperatorDescriptorHelper opDesc, final IHyracksTaskContext ctx,
+ int partition, IndexHelperOpenMode mode) {
+ this.opDesc = opDesc;
+ this.ctx = ctx;
+ this.mode = mode;
+ this.partition = partition;
+ }
+
+ public void init() throws HyracksDataException {
+ IBufferCache bufferCache = opDesc.getStorageManager().getBufferCache(ctx);
+ IFileMapProvider fileMapProvider = opDesc.getStorageManager().getFileMapProvider(ctx);
+ IFileSplitProvider fileSplitProvider = opDesc.getInvIndexFileSplitProvider();
+
+ FileReference f = fileSplitProvider.getFileSplits()[partition].getLocalFile();
+ boolean fileIsMapped = fileMapProvider.isMapped(f);
+
+ switch (mode) {
+
+ case OPEN: {
+ if (!fileIsMapped) {
+ throw new HyracksDataException("Trying to open inverted index from unmapped file " + f.toString());
+ }
+ }
+ break;
+
+ case CREATE:
+ case ENLIST: {
+ if (!fileIsMapped) {
+ bufferCache.createFile(f);
+ }
+ }
+ break;
+
+ }
+
+ int fileId = fileMapProvider.lookupFileId(f);
+ try {
+ bufferCache.openFile(fileId);
+ } catch (HyracksDataException e) {
+ // revert state of buffer cache since file failed to open
+ if (!fileIsMapped) {
+ bufferCache.deleteFile(fileId);
+ }
+ throw e;
+ }
+
+ // only set btreeFileId member when openFile() succeeds,
+ // otherwise deinit() will try to close the file that failed to open
+ invIndexFileId = fileId;
+ IndexRegistry<InvertedIndex> invIndexRegistry = opDesc.getInvIndexRegistryProvider().getRegistry(ctx);
+ invIndex = invIndexRegistry.get(invIndexFileId);
+ if (invIndex == null) {
+
+ // create new inverted index and register it
+ invIndexRegistry.lock();
+ try {
+ // check if inverted index has already been registered by
+ // another thread
+ invIndex = invIndexRegistry.get(invIndexFileId);
+ if (invIndex == null) {
+ // this thread should create and register the inverted index
+
+ IBinaryComparator[] comparators = new IBinaryComparator[opDesc.getInvIndexComparatorFactories().length];
+ for (int i = 0; i < opDesc.getInvIndexComparatorFactories().length; i++) {
+ comparators[i] = opDesc.getInvIndexComparatorFactories()[i].createBinaryComparator();
+ }
+
+ MultiComparator cmp = new MultiComparator(opDesc.getInvIndexTypeTraits(), comparators);
+
+ // assumes btree has already been registered
+ IFileSplitProvider btreeFileSplitProvider = opDesc.getTreeIndexFileSplitProvider();
+ IndexRegistry<ITreeIndex> treeIndexRegistry = opDesc.getTreeIndexRegistryProvider()
+ .getRegistry(ctx);
+ FileReference btreeFile = btreeFileSplitProvider.getFileSplits()[partition].getLocalFile();
+ boolean btreeFileIsMapped = fileMapProvider.isMapped(btreeFile);
+ if (!btreeFileIsMapped) {
+ throw new HyracksDataException(
+ "Trying to create inverted index, but associated BTree file has not been mapped");
+ }
+ int btreeFileId = fileMapProvider.lookupFileId(f);
+ BTree btree = (BTree) treeIndexRegistry.get(btreeFileId);
+
+ invIndex = new InvertedIndex(bufferCache, btree, cmp);
+ invIndex.open(invIndexFileId);
+ invIndexRegistry.register(invIndexFileId, invIndex);
+ }
+ } finally {
+ invIndexRegistry.unlock();
+ }
+ }
+ }
+
+ public void deinit() throws HyracksDataException {
+ if (invIndexFileId != -1) {
+ IBufferCache bufferCache = opDesc.getStorageManager().getBufferCache(ctx);
+ bufferCache.closeFile(invIndexFileId);
+ }
+ }
+
+ public InvertedIndex getInvIndex() {
+ return invIndex;
+ }
+
+ public ITreeIndexOperatorDescriptorHelper getOperatorDescriptor() {
+ return opDesc;
+ }
+
+ public int getInvIndexFileId() {
+ return invIndexFileId;
+ }
+}
\ No newline at end of file
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListBuilder.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListBuilder.java
new file mode 100644
index 0000000..03fc9a1
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListBuilder.java
@@ -0,0 +1,79 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
+
+public class FixedSizeElementInvertedListBuilder implements IInvertedListBuilder {
+ private final int listElementSize;
+ private int listSize = 0;
+
+ private byte[] targetBuf;
+ private int pos;
+
+ public FixedSizeElementInvertedListBuilder(ITypeTrait[] invListFields) {
+ int tmp = 0;
+ for (int i = 0; i < invListFields.length; i++) {
+ tmp += invListFields[i].getStaticallyKnownDataLength();
+ }
+ listElementSize = tmp;
+ }
+
+ @Override
+ public boolean startNewList(ITupleReference tuple, int tokenField) {
+ if (pos + listElementSize > targetBuf.length)
+ return false;
+ else {
+ listSize = 0;
+ return true;
+ }
+ }
+
+ @Override
+ public boolean appendElement(ITupleReference tuple, int numTokenFields, int numElementFields) {
+ if (pos + listElementSize > targetBuf.length)
+ return false;
+
+ for (int i = 0; i < numElementFields; i++) {
+ int field = numTokenFields + i;
+ System.arraycopy(tuple.getFieldData(field), tuple.getFieldStart(field), targetBuf, pos,
+ tuple.getFieldLength(field));
+ }
+
+ listSize++;
+ pos += listElementSize;
+
+ return true;
+ }
+
+ @Override
+ public void setTargetBuffer(byte[] targetBuf, int startPos) {
+ this.targetBuf = targetBuf;
+ this.pos = startPos;
+ }
+
+ @Override
+ public int getListSize() {
+ return listSize;
+ }
+
+ @Override
+ public int getPos() {
+ return pos;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListCursor.java
new file mode 100644
index 0000000..f7ef56e
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeElementInvertedListCursor.java
@@ -0,0 +1,280 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
+import edu.uci.ics.hyracks.storage.common.buffercache.ICachedPage;
+import edu.uci.ics.hyracks.storage.common.file.BufferedFileHandle;
+
+public class FixedSizeElementInvertedListCursor implements IInvertedListCursor {
+
+ private final IBufferCache bufferCache;
+ private final int fileId;
+ private final int elementSize;
+ private int currentElementIx;
+ private int currentOff;
+ private int currentPageIx;
+
+ private int startPageId;
+ private int endPageId;
+ private int startOff;
+ private int numElements;
+
+ private final FixedSizeTupleReference tuple;
+ private ICachedPage[] pages = new ICachedPage[10];
+ private int[] elementIndexes = new int[10];
+
+ public FixedSizeElementInvertedListCursor(IBufferCache bufferCache, int fileId, ITypeTrait[] invListFields) {
+ this.bufferCache = bufferCache;
+ this.fileId = fileId;
+ this.currentElementIx = 0;
+ this.currentPageIx = 0;
+
+ int tmp = 0;
+ for (int i = 0; i < invListFields.length; i++) {
+ tmp += invListFields[i].getStaticallyKnownDataLength();
+ }
+ elementSize = tmp;
+ this.currentOff = -elementSize;
+ this.tuple = new FixedSizeTupleReference(invListFields);
+ }
+
+ @Override
+ public boolean hasNext() {
+ if (currentElementIx < numElements)
+ return true;
+ else
+ return false;
+ }
+
+ @Override
+ public void next() {
+ if (currentOff + elementSize >= bufferCache.getPageSize()) {
+ currentPageIx++;
+ currentOff = 0;
+ } else {
+ currentOff += elementSize;
+ }
+
+ currentElementIx++;
+ tuple.reset(pages[currentPageIx].getBuffer().array(), currentOff);
+ }
+
+ @Override
+ public void pinPagesAsync() {
+ // TODO: implement
+ }
+
+ @Override
+ public void pinPagesSync() throws HyracksDataException {
+ int pix = 0;
+ for (int i = startPageId; i <= endPageId; i++) {
+ pages[pix] = bufferCache.pin(BufferedFileHandle.getDiskPageId(fileId, i), false);
+ pages[pix].acquireReadLatch();
+ pix++;
+ }
+ }
+
+ @Override
+ public void unpinPages() throws HyracksDataException {
+ int numPages = endPageId - startPageId + 1;
+ for (int i = 0; i < numPages; i++) {
+ pages[i].releaseReadLatch();
+ bufferCache.unpin(pages[i]);
+ }
+ }
+
+ @Override
+ public void positionCursor(int elementIx) {
+ int numPages = endPageId - startPageId + 1;
+
+ currentPageIx = binarySearch(elementIndexes, 0, numPages, elementIx);
+ if (currentPageIx < 0) {
+ throw new IndexOutOfBoundsException("Requested index: " + elementIx + " from array with numElements: "
+ + numElements);
+ }
+
+ if (currentPageIx == 0) {
+ currentOff = startOff + elementIx * elementSize;
+ } else {
+ int relativeElementIx = elementIx - elementIndexes[currentPageIx - 1] - 1;
+ currentOff = relativeElementIx * elementSize;
+ }
+
+ currentElementIx = elementIx;
+ tuple.reset(pages[currentPageIx].getBuffer().array(), currentOff);
+ }
+
+ @Override
+ public boolean containsKey(ITupleReference searchTuple, MultiComparator invListCmp) {
+ int mid;
+ int begin = 0;
+ int end = numElements - 1;
+
+ while (begin <= end) {
+ mid = (begin + end) / 2;
+ positionCursor(mid);
+ int cmp = invListCmp.compare(searchTuple, tuple);
+ if (cmp < 0) {
+ end = mid - 1;
+ } else if (cmp > 0) {
+ begin = mid + 1;
+ } else {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ @Override
+ public void reset(int startPageId, int endPageId, int startOff, int numElements) {
+ this.startPageId = startPageId;
+ this.endPageId = endPageId;
+ this.startOff = startOff;
+ this.numElements = numElements;
+ this.currentElementIx = 0;
+ this.currentPageIx = 0;
+ this.currentOff = startOff - elementSize;
+
+ int numPages = endPageId - startPageId + 1;
+ if (numPages > pages.length) {
+ pages = new ICachedPage[endPageId - startPageId + 1];
+ elementIndexes = new int[endPageId - startPageId + 1];
+ }
+
+ // fill elementIndexes
+ // first page
+ int cumulElements = (bufferCache.getPageSize() - startOff) / elementSize;
+ elementIndexes[0] = cumulElements - 1;
+
+ // middle, full pages
+ for (int i = 1; i < numPages - 1; i++) {
+ elementIndexes[i] = elementIndexes[i - 1] + (bufferCache.getPageSize() / elementSize);
+ }
+
+ // last page
+ elementIndexes[numPages - 1] = numElements - 1;
+ }
+
+ @Override
+ public String printInvList(ISerializerDeserializer[] serdes) throws HyracksDataException {
+ int oldCurrentOff = currentOff;
+ int oldCurrentPageId = currentPageIx;
+ int oldCurrentElementIx = currentElementIx;
+
+ currentOff = startOff - elementSize;
+ currentPageIx = 0;
+ currentElementIx = 0;
+
+ StringBuilder strBuilder = new StringBuilder();
+
+ int count = 0;
+ while (hasNext()) {
+ next();
+ count++;
+ for (int i = 0; i < tuple.getFieldCount(); i++) {
+ ByteArrayInputStream inStream = new ByteArrayInputStream(tuple.getFieldData(i), tuple.getFieldStart(i),
+ tuple.getFieldLength(i));
+ DataInput dataIn = new DataInputStream(inStream);
+ Object o = serdes[i].deserialize(dataIn);
+ strBuilder.append(o.toString());
+ if (i + 1 < tuple.getFieldCount())
+ strBuilder.append(",");
+ }
+ strBuilder.append(" ");
+ }
+
+ // reset previous state
+ currentOff = oldCurrentOff;
+ currentPageIx = oldCurrentPageId;
+ currentElementIx = oldCurrentElementIx;
+
+ return strBuilder.toString();
+ }
+
+ public String printCurrentElement(ISerializerDeserializer[] serdes) throws HyracksDataException {
+ StringBuilder strBuilder = new StringBuilder();
+ for (int i = 0; i < tuple.getFieldCount(); i++) {
+ ByteArrayInputStream inStream = new ByteArrayInputStream(tuple.getFieldData(i), tuple.getFieldStart(i),
+ tuple.getFieldLength(i));
+ DataInput dataIn = new DataInputStream(inStream);
+ Object o = serdes[i].deserialize(dataIn);
+ strBuilder.append(o.toString());
+ if (i + 1 < tuple.getFieldCount())
+ strBuilder.append(",");
+ }
+ return strBuilder.toString();
+ }
+
+ private int binarySearch(int[] arr, int arrStart, int arrLength, int key) {
+ int mid;
+ int begin = arrStart;
+ int end = arrStart + arrLength - 1;
+
+ while (begin <= end) {
+ mid = (begin + end) / 2;
+ int cmp = (key - arr[mid]);
+ if (cmp < 0) {
+ end = mid - 1;
+ } else if (cmp > 0) {
+ begin = mid + 1;
+ } else {
+ return mid;
+ }
+ }
+
+ if (begin > arr.length - 1)
+ return -1;
+ if (key < arr[begin])
+ return begin;
+ else
+ return -1;
+ }
+
+ @Override
+ public int compareTo(IInvertedListCursor invListCursor) {
+ return numElements - invListCursor.getNumElements();
+ }
+
+ @Override
+ public int getEndPageId() {
+ return endPageId;
+ }
+
+ @Override
+ public int getNumElements() {
+ return numElements;
+ }
+
+ @Override
+ public int getStartOff() {
+ return startOff;
+ }
+
+ @Override
+ public int getStartPageId() {
+ return startPageId;
+ }
+
+ public int getOffset() {
+ return currentOff;
+ }
+
+ public ICachedPage getPage() {
+ return pages[currentPageIx];
+ }
+
+ @Override
+ public ITupleReference getTuple() {
+ return tuple;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeFrameTupleAccessor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeFrameTupleAccessor.java
new file mode 100644
index 0000000..9858eb0
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeFrameTupleAccessor.java
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.nio.ByteBuffer;
+
+import edu.uci.ics.hyracks.api.comm.FrameHelper;
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+
+public class FixedSizeFrameTupleAccessor implements IFrameTupleAccessor {
+
+ private final int frameSize;
+ private ByteBuffer buffer;
+
+ private final ITypeTrait[] fields;
+ private final int[] fieldStartOffsets;
+ private final int tupleSize;
+
+ public FixedSizeFrameTupleAccessor(int frameSize, ITypeTrait[] fields) {
+ this.frameSize = frameSize;
+ this.fields = fields;
+ this.fieldStartOffsets = new int[fields.length];
+ this.fieldStartOffsets[0] = 0;
+ for (int i = 1; i < fields.length; i++) {
+ fieldStartOffsets[i] = fieldStartOffsets[i - 1] + fields[i - 1].getStaticallyKnownDataLength();
+ }
+
+ int tmp = 0;
+ for (int i = 0; i < fields.length; i++) {
+ tmp += fields[i].getStaticallyKnownDataLength();
+ }
+ tupleSize = tmp;
+ }
+
+ @Override
+ public ByteBuffer getBuffer() {
+ return buffer;
+ }
+
+ @Override
+ public int getFieldCount() {
+ return fields.length;
+ }
+
+ @Override
+ public int getFieldEndOffset(int tupleIndex, int fIdx) {
+ return getTupleStartOffset(tupleIndex) + fieldStartOffsets[fIdx] + fields[fIdx].getStaticallyKnownDataLength();
+ }
+
+ @Override
+ public int getFieldLength(int tupleIndex, int fIdx) {
+ return fields[fIdx].getStaticallyKnownDataLength();
+ }
+
+ @Override
+ public int getFieldSlotsLength() {
+ return 0;
+ }
+
+ @Override
+ public int getFieldStartOffset(int tupleIndex, int fIdx) {
+ return tupleIndex * tupleSize + fieldStartOffsets[fIdx];
+ }
+
+ @Override
+ public int getTupleCount() {
+ return buffer.getInt(FrameHelper.getTupleCountOffset(frameSize));
+ }
+
+ @Override
+ public int getTupleEndOffset(int tupleIndex) {
+ return getFieldEndOffset(tupleIndex, fields.length - 1);
+ }
+
+ @Override
+ public int getTupleStartOffset(int tupleIndex) {
+ return tupleIndex * tupleSize;
+ }
+
+ @Override
+ public void reset(ByteBuffer buffer) {
+ this.buffer = buffer;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeFrameTupleAppender.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeFrameTupleAppender.java
new file mode 100644
index 0000000..edc2304
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeFrameTupleAppender.java
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.nio.ByteBuffer;
+
+import edu.uci.ics.hyracks.api.comm.FrameHelper;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+
+public class FixedSizeFrameTupleAppender {
+
+ private static final int TUPLE_COUNT_SIZE = 4;
+ private final int frameSize;
+ private final int tupleSize;
+ private ByteBuffer buffer;
+ private int tupleCount;
+ private int tupleDataEndOffset;
+
+ public FixedSizeFrameTupleAppender(int frameSize, ITypeTrait[] fields) {
+ this.frameSize = frameSize;
+ int tmp = 0;
+ for (int i = 0; i < fields.length; i++) {
+ tmp += fields[i].getStaticallyKnownDataLength();
+ }
+ tupleSize = tmp;
+ }
+
+ public void reset(ByteBuffer buffer, boolean clear) {
+ this.buffer = buffer;
+ if (clear) {
+ buffer.putInt(FrameHelper.getTupleCountOffset(frameSize), 0);
+ tupleCount = 0;
+ tupleDataEndOffset = 0;
+ }
+ }
+
+ public boolean append(byte[] bytes, int offset) {
+ if (tupleDataEndOffset + tupleSize + TUPLE_COUNT_SIZE <= frameSize) {
+ System.arraycopy(bytes, offset, buffer.array(), tupleDataEndOffset, tupleSize);
+ tupleDataEndOffset += tupleSize;
+ tupleCount++;
+ return true;
+ }
+ return false;
+ }
+
+ public boolean append(byte[] bytes, int offset, int length) {
+ if (tupleDataEndOffset + length + TUPLE_COUNT_SIZE <= frameSize) {
+ System.arraycopy(bytes, offset, buffer.array(), tupleDataEndOffset, length);
+ tupleDataEndOffset += length;
+ return true;
+ }
+ return false;
+ }
+
+ public boolean append(int fieldValue) {
+ if (tupleDataEndOffset + 4 + TUPLE_COUNT_SIZE <= frameSize) {
+ buffer.putInt(tupleDataEndOffset, fieldValue);
+ tupleDataEndOffset += 4;
+ tupleCount++;
+ return true;
+ }
+ return false;
+ }
+
+ public boolean append(long fieldValue) {
+ if (tupleDataEndOffset + 8 + TUPLE_COUNT_SIZE <= frameSize) {
+ buffer.putLong(tupleDataEndOffset, fieldValue);
+ tupleDataEndOffset += 8;
+ tupleCount++;
+ return true;
+ }
+ return false;
+ }
+
+ public boolean append(char fieldValue) {
+ if (tupleDataEndOffset + 2 + TUPLE_COUNT_SIZE <= frameSize) {
+ buffer.putLong(tupleDataEndOffset, fieldValue);
+ tupleDataEndOffset += 2;
+ tupleCount++;
+ return true;
+ }
+ return false;
+ }
+
+ public boolean append(byte fieldValue) {
+ if (tupleDataEndOffset + 1 + TUPLE_COUNT_SIZE <= frameSize) {
+ buffer.put(tupleDataEndOffset, fieldValue);
+ tupleDataEndOffset += 1;
+ tupleCount++;
+ return true;
+ }
+ return false;
+ }
+
+ // returns true if an entire tuple fits
+ // returns false otherwise
+ public boolean hasSpace() {
+ return tupleDataEndOffset + tupleSize + TUPLE_COUNT_SIZE <= frameSize;
+ }
+
+ public void incrementTupleCount(int count) {
+ buffer.putInt(FrameHelper.getTupleCountOffset(frameSize),
+ buffer.getInt(FrameHelper.getTupleCountOffset(frameSize)) + count);
+ }
+
+ public int getTupleCount() {
+ return tupleCount;
+ }
+
+ public ByteBuffer getBuffer() {
+ return buffer;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeTupleReference.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeTupleReference.java
new file mode 100644
index 0000000..248b81e
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/FixedSizeTupleReference.java
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+
+public class FixedSizeTupleReference implements ITupleReference {
+
+ private final ITypeTrait[] typeTraits;
+ private final int[] fieldStartOffsets;
+ private byte[] data;
+ private int startOff;
+
+ public FixedSizeTupleReference(ITypeTrait[] typeTraits) {
+ this.typeTraits = typeTraits;
+ this.fieldStartOffsets = new int[typeTraits.length];
+ this.fieldStartOffsets[0] = 0;
+ for (int i = 1; i < typeTraits.length; i++) {
+ fieldStartOffsets[i] = fieldStartOffsets[i - 1] + typeTraits[i - 1].getStaticallyKnownDataLength();
+ }
+ }
+
+ public void reset(byte[] data, int startOff) {
+ this.data = data;
+ this.startOff = startOff;
+ }
+
+ @Override
+ public int getFieldCount() {
+ return typeTraits.length;
+ }
+
+ @Override
+ public byte[] getFieldData(int fIdx) {
+ return data;
+ }
+
+ @Override
+ public int getFieldLength(int fIdx) {
+ return typeTraits[fIdx].getStaticallyKnownDataLength();
+ }
+
+ @Override
+ public int getFieldStart(int fIdx) {
+ return startOff + fieldStartOffsets[fIdx];
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java
new file mode 100644
index 0000000..9eab110
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java
@@ -0,0 +1,289 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.nio.ByteBuffer;
+
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ByteArrayAccessibleOutputStream;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOpContext;
+import edu.uci.ics.hyracks.storage.am.btree.impls.RangePredicate;
+import edu.uci.ics.hyracks.storage.am.common.api.IIndexBulkLoadContext;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexCursor;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
+import edu.uci.ics.hyracks.storage.common.buffercache.ICachedPage;
+import edu.uci.ics.hyracks.storage.common.file.BufferedFileHandle;
+
+/**
+ * An inverted index consists of two files: 1. a file storing (paginated)
+ * inverted lists 2. a BTree-file mapping from tokens to inverted lists
+ *
+ * Implemented features: bulk loading and searching (based on T-Occurrence) Not
+ * implemented features: updates (insert/update/delete) Limitations: a query
+ * cannot exceed the size of a Hyracks frame
+ */
+public class InvertedIndex {
+
+ private BTree btree;
+ private int rootPageId = 0;
+ private IBufferCache bufferCache;
+ private int fileId;
+ private final MultiComparator invListCmp;
+ private final int numTokenFields;
+ private final int numInvListKeys;
+
+ public InvertedIndex(IBufferCache bufferCache, BTree btree, MultiComparator invListCmp) {
+ this.bufferCache = bufferCache;
+ this.btree = btree;
+ this.invListCmp = invListCmp;
+ this.numTokenFields = btree.getMultiComparator().getKeyFieldCount();
+ this.numInvListKeys = invListCmp.getKeyFieldCount();
+ }
+
+ public void open(int fileId) {
+ this.fileId = fileId;
+ }
+
+ public void close() {
+ this.fileId = -1;
+ }
+
+ public BulkLoadContext beginBulkLoad(IInvertedListBuilder invListBuilder, int hyracksFrameSize,
+ float btreeFillFactor) throws HyracksDataException {
+ BulkLoadContext ctx = new BulkLoadContext(invListBuilder, hyracksFrameSize, btreeFillFactor);
+ ctx.init(rootPageId, fileId);
+ return ctx;
+ }
+
+ // ASSUMPTIONS:
+ // the first btree.getMultiComparator().getKeyFieldCount() fields in tuple
+ // are btree keys (e.g., a string token)
+ // the next invListCmp.getKeyFieldCount() fields in tuple are keys of the
+ // inverted list (e.g., primary key)
+ // key fields of inverted list are fixed size
+ public void bulkLoadAddTuple(BulkLoadContext ctx, ITupleReference tuple) throws HyracksDataException {
+
+ // first inverted list, copy token to baaos and start new list
+ if (ctx.currentInvListTokenBaaos.size() == 0) {
+ ctx.currentInvListStartPageId = ctx.currentPageId;
+ ctx.currentInvListStartOffset = ctx.invListBuilder.getPos();
+
+ // remember current token
+ ctx.currentInvListTokenBaaos.reset();
+ for (int i = 0; i < numTokenFields; i++) {
+ ctx.currentInvListTokenBaaos.write(tuple.getFieldData(i), tuple.getFieldStart(i),
+ tuple.getFieldLength(i));
+ }
+
+ if (!ctx.invListBuilder.startNewList(tuple, numTokenFields)) {
+ ctx.pinNextPage();
+ ctx.invListBuilder.setTargetBuffer(ctx.currentPage.getBuffer().array(), 0);
+ if (!ctx.invListBuilder.startNewList(tuple, numTokenFields)) {
+ throw new IllegalStateException("Failed to create first inverted list.");
+ }
+ }
+ }
+
+ // create new inverted list?
+ ctx.currentInvListToken.reset(ctx.currentInvListTokenBaaos.getByteArray(), 0);
+ if (ctx.tokenCmp.compare(tuple, ctx.currentInvListToken) != 0) {
+
+ // create entry in btree for last inverted list
+ createAndInsertBTreeTuple(ctx);
+
+ // remember new token
+ ctx.currentInvListTokenBaaos.reset();
+ for (int i = 0; i < numTokenFields; i++) {
+ ctx.currentInvListTokenBaaos.write(tuple.getFieldData(i), tuple.getFieldStart(i),
+ tuple.getFieldLength(i));
+ }
+
+ // start new list
+ if (!ctx.invListBuilder.startNewList(tuple, numTokenFields)) {
+ ctx.pinNextPage();
+ ctx.invListBuilder.setTargetBuffer(ctx.currentPage.getBuffer().array(), 0);
+ if (!ctx.invListBuilder.startNewList(tuple, numTokenFields)) {
+ throw new IllegalStateException("Failed to start new inverted list after switching to a new page.");
+ }
+ }
+
+ ctx.currentInvListStartPageId = ctx.currentPageId;
+ ctx.currentInvListStartOffset = ctx.invListBuilder.getPos();
+ }
+
+ // append to current inverted list
+ if (!ctx.invListBuilder.appendElement(tuple, numTokenFields, numInvListKeys)) {
+ ctx.pinNextPage();
+ ctx.invListBuilder.setTargetBuffer(ctx.currentPage.getBuffer().array(), 0);
+ if (!ctx.invListBuilder.appendElement(tuple, numTokenFields, numInvListKeys)) {
+ throw new IllegalStateException(
+ "Failed to append element to inverted list after switching to a new page.");
+ }
+ }
+ }
+
+ public boolean openCursor(ITreeIndexCursor btreeCursor, RangePredicate btreePred, BTreeOpContext btreeOpCtx,
+ IInvertedListCursor invListCursor) throws Exception {
+ btree.search(btreeCursor, btreePred, btreeOpCtx);
+
+ boolean ret = false;
+ if (btreeCursor.hasNext()) {
+
+ btreeCursor.next();
+ ITupleReference frameTuple = btreeCursor.getTuple();
+
+ // hardcoded mapping of btree fields
+ int startPageId = IntegerSerializerDeserializer.getInt(frameTuple.getFieldData(1),
+ frameTuple.getFieldStart(1));
+ int endPageId = IntegerSerializerDeserializer.getInt(frameTuple.getFieldData(2),
+ frameTuple.getFieldStart(2));
+ int startOff = IntegerSerializerDeserializer
+ .getInt(frameTuple.getFieldData(3), frameTuple.getFieldStart(3));
+ int numElements = IntegerSerializerDeserializer.getInt(frameTuple.getFieldData(4),
+ frameTuple.getFieldStart(4));
+
+ invListCursor.reset(startPageId, endPageId, startOff, numElements);
+ ret = true;
+ } else {
+ invListCursor.reset(0, 0, 0, 0);
+ }
+
+ btreeCursor.close();
+ btreeCursor.reset();
+
+ return ret;
+ }
+
+ public void createAndInsertBTreeTuple(BulkLoadContext ctx) throws HyracksDataException {
+ // build tuple
+ ctx.btreeTupleBuilder.reset();
+ ctx.btreeTupleBuilder.addField(ctx.currentInvListTokenBaaos.getByteArray(), 0,
+ ctx.currentInvListTokenBaaos.size());
+ ctx.btreeTupleBuilder.addField(IntegerSerializerDeserializer.INSTANCE, ctx.currentInvListStartPageId);
+ ctx.btreeTupleBuilder.addField(IntegerSerializerDeserializer.INSTANCE, ctx.currentPageId);
+ ctx.btreeTupleBuilder.addField(IntegerSerializerDeserializer.INSTANCE, ctx.currentInvListStartOffset);
+ ctx.btreeTupleBuilder.addField(IntegerSerializerDeserializer.INSTANCE, ctx.invListBuilder.getListSize());
+
+ // append to buffer
+ ctx.btreeTupleAppender.reset(ctx.btreeTupleBuffer, true);
+ ctx.btreeTupleAppender.append(ctx.btreeTupleBuilder.getFieldEndOffsets(), ctx.btreeTupleBuilder.getByteArray(),
+ 0, ctx.btreeTupleBuilder.getSize());
+
+ // reset tuple reference
+ ctx.btreeFrameTupleReference.reset(ctx.btreeFrameTupleAccessor, 0);
+
+ btree.bulkLoadAddTuple(ctx.btreeBulkLoadCtx, ctx.btreeFrameTupleReference);
+ }
+
+ public void endBulkLoad(BulkLoadContext ctx) throws HyracksDataException {
+ // create entry in btree for last inverted list
+ createAndInsertBTreeTuple(ctx);
+ btree.endBulkLoad(ctx.btreeBulkLoadCtx);
+ ctx.deinit();
+ }
+
+ public IBufferCache getBufferCache() {
+ return bufferCache;
+ }
+
+ public int getInvListsFileId() {
+ return fileId;
+ }
+
+ public MultiComparator getInvListElementCmp() {
+ return invListCmp;
+ }
+
+ public BTree getBTree() {
+ return btree;
+ }
+
+ public final class BulkLoadContext {
+ private final ByteBuffer btreeTupleBuffer;
+ private final ArrayTupleBuilder btreeTupleBuilder;
+ private final FrameTupleAppender btreeTupleAppender;
+ private final FrameTupleAccessor btreeFrameTupleAccessor;
+ private final FrameTupleReference btreeFrameTupleReference = new FrameTupleReference();
+ private final float btreeFillFactor;
+ private IIndexBulkLoadContext btreeBulkLoadCtx;
+
+ private int currentInvListStartPageId;
+ private int currentInvListStartOffset;
+ private final ByteArrayAccessibleOutputStream currentInvListTokenBaaos = new ByteArrayAccessibleOutputStream();
+ private final FixedSizeTupleReference currentInvListToken = new FixedSizeTupleReference(
+ invListCmp.getTypeTraits());
+
+ private int currentPageId;
+ private ICachedPage currentPage;
+ private final IInvertedListBuilder invListBuilder;
+ private final MultiComparator tokenCmp;
+
+ public BulkLoadContext(IInvertedListBuilder invListBuilder, int hyracksFrameSize, float btreeFillFactor) {
+ this.invListBuilder = invListBuilder;
+ this.tokenCmp = btree.getMultiComparator();
+ this.btreeTupleBuffer = ByteBuffer.allocate(hyracksFrameSize);
+ this.btreeTupleBuilder = new ArrayTupleBuilder(tokenCmp.getFieldCount());
+ this.btreeTupleAppender = new FrameTupleAppender(hyracksFrameSize);
+ // TODO: serde never used, only need correct number of fields
+ // tuple contains (token, start page, end page, start offset, num
+ // elements)
+ RecordDescriptor recDesc = new RecordDescriptor(new ISerializerDeserializer[] {
+ IntegerSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE,
+ IntegerSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE,
+ IntegerSerializerDeserializer.INSTANCE });
+ this.btreeFrameTupleAccessor = new FrameTupleAccessor(hyracksFrameSize, recDesc);
+ this.btreeFillFactor = btreeFillFactor;
+ }
+
+ public void init(int startPageId, int fileId) throws HyracksDataException {
+ btreeBulkLoadCtx = btree.beginBulkLoad(BTree.DEFAULT_FILL_FACTOR,
+ btree.getLeafFrameFactory().createFrame(), btree.getInteriorFrameFactory().createFrame(), btree
+ .getFreePageManager().getMetaDataFrameFactory().createFrame());
+ currentPageId = startPageId;
+ currentPage = bufferCache.pin(BufferedFileHandle.getDiskPageId(fileId, currentPageId), true);
+ currentPage.acquireWriteLatch();
+ invListBuilder.setTargetBuffer(currentPage.getBuffer().array(), 0);
+ btreeFrameTupleAccessor.reset(btreeTupleBuffer);
+ }
+
+ public void deinit() throws HyracksDataException {
+ if (currentPage != null) {
+ currentPage.releaseWriteLatch();
+ bufferCache.unpin(currentPage);
+ }
+ }
+
+ public void pinNextPage() throws HyracksDataException {
+ currentPage.releaseWriteLatch();
+ bufferCache.unpin(currentPage);
+ currentPageId++;
+ currentPage = bufferCache.pin(BufferedFileHandle.getDiskPageId(fileId, currentPageId), true);
+ currentPage.acquireWriteLatch();
+ }
+ };
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndexException.java
similarity index 72%
copy from hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
copy to hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndexException.java
index 7e91fd4..494ac0c 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndexException.java
@@ -13,10 +13,12 @@
* limitations under the License.
*/
-package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
-import java.io.Serializable;
+public class InvertedIndexException extends Exception {
+ private static final long serialVersionUID = 1L;
-public interface IBinaryTokenizerFactory extends Serializable {
- public IBinaryTokenizer createBinaryTokenizer();
+ public InvertedIndexException(String msg) {
+ super(msg);
+ }
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/ListResultCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/ListResultCursor.java
deleted file mode 100644
index adc5221..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/ListResultCursor.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
-
-import java.nio.ByteBuffer;
-import java.util.List;
-
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
-
-public class ListResultCursor implements IInvertedIndexResultCursor {
-
- private List<ByteBuffer> resultBuffers;
- private int numResultBuffers;
- private int currentPos = 0;
-
- public void setResults(List<ByteBuffer> resultBuffers, int numResultBuffers) {
- this.resultBuffers = resultBuffers;
- this.numResultBuffers = numResultBuffers;
- reset();
- }
-
- @Override
- public boolean hasNext() {
- if (currentPos + 1 < numResultBuffers)
- return true;
- else
- return false;
- }
-
- @Override
- public void next() {
- currentPos++;
- }
-
- @Override
- public ByteBuffer getBuffer() {
- return resultBuffers.get(currentPos);
- }
-
- @Override
- public void reset() {
- currentPos = -1;
- }
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/OccurrenceThresholdPanicException.java
similarity index 69%
copy from hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
copy to hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/OccurrenceThresholdPanicException.java
index 7e91fd4..b0e737c 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/OccurrenceThresholdPanicException.java
@@ -13,10 +13,12 @@
* limitations under the License.
*/
-package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
-import java.io.Serializable;
+public class OccurrenceThresholdPanicException extends InvertedIndexException {
+ private static final long serialVersionUID = 1L;
-public interface IBinaryTokenizerFactory extends Serializable {
- public IBinaryTokenizer createBinaryTokenizer();
+ public OccurrenceThresholdPanicException(String msg) {
+ super(msg);
+ }
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SearchResultCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SearchResultCursor.java
new file mode 100644
index 0000000..5a7230c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SearchResultCursor.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearcher;
+
+public class SearchResultCursor implements IInvertedIndexResultCursor {
+
+ private List<ByteBuffer> resultBuffers;
+ private IFrameTupleAccessor fta;
+ private FixedSizeTupleReference resultTuple;
+ private int numResultBuffers;
+ private int currentBufferIndex = 0;
+ private int tupleIndex = 0;
+
+ public SearchResultCursor(IFrameTupleAccessor fta, ITupleReference resultTuple) {
+ this.fta = fta;
+ this.resultTuple = (FixedSizeTupleReference) resultTuple;
+ }
+
+ @Override
+ public boolean hasNext() {
+ if (currentBufferIndex < numResultBuffers && tupleIndex < fta.getTupleCount())
+ return true;
+ else
+ return false;
+ }
+
+ @Override
+ public void next() {
+ resultTuple.reset(fta.getBuffer().array(), fta.getTupleStartOffset(tupleIndex));
+ tupleIndex++;
+ if (tupleIndex >= fta.getTupleCount()) {
+ if (currentBufferIndex + 1 < numResultBuffers) {
+ currentBufferIndex++;
+ fta.reset(resultBuffers.get(currentBufferIndex));
+ tupleIndex = 0;
+ }
+ }
+ }
+
+ @Override
+ public ITupleReference getTuple() {
+ return resultTuple;
+ }
+
+ @Override
+ public void reset(IInvertedIndexSearcher invIndexSearcher) {
+ currentBufferIndex = 0;
+ tupleIndex = 0;
+ resultBuffers = invIndexSearcher.getResultBuffers();
+ numResultBuffers = invIndexSearcher.getNumValidResultBuffers();
+ if (numResultBuffers > 0) {
+ fta.reset(resultBuffers.get(0));
+ }
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java
deleted file mode 100644
index f88439c..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java
+++ /dev/null
@@ -1,294 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
-
-import java.io.DataOutput;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-
-import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
-import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
-import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
-import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
-import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
-import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
-import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
-import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeCursor;
-import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrame;
-import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrame;
-import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
-import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOp;
-import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOpContext;
-import edu.uci.ics.hyracks.storage.am.btree.impls.MultiComparator;
-import edu.uci.ics.hyracks.storage.am.btree.impls.RangePredicate;
-import edu.uci.ics.hyracks.storage.am.btree.impls.RangeSearchCursor;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearcher;
-
-public class SimpleConjunctiveSearcher implements IInvertedIndexSearcher {
-
- private final int numKeyFields;
- private final int numValueFields;
-
- private final IBinaryComparator[] keyCmps;
- private final IBinaryComparator[] valueCmps;
-
- private final BTree btree;
- private final IHyracksTaskContext ctx;
- private final ArrayTupleBuilder resultTupleBuilder;
- private final FrameTupleAppender resultTupleAppender;
- private final FrameTupleAccessor resultFrameAccessor;
-
- private List<ByteBuffer> newResultBuffers = new ArrayList<ByteBuffer>();
- private List<ByteBuffer> prevResultBuffers = new ArrayList<ByteBuffer>();
- private List<ByteBuffer> swap = null;
- private final ListResultCursor resultCursor = new ListResultCursor();
- private int maxResultBufIdx = 0;
-
- private final IBTreeLeafFrame leafFrame;
- private final IBTreeInteriorFrame interiorFrame;
- private final IBTreeCursor btreeCursor;
- private final FrameTupleReference searchKey = new FrameTupleReference();
- private final RangePredicate pred = new RangePredicate(true, null, null, true, true, null, null);
-
- private final IBinaryTokenizer queryTokenizer;
-
- public SimpleConjunctiveSearcher(IHyracksTaskContext ctx, BTree btree, RecordDescriptor btreeRecDesc,
- IBinaryTokenizer queryTokenizer, int numKeyFields, int numValueFields) {
- this.ctx = ctx;
- this.btree = btree;
- this.queryTokenizer = queryTokenizer;
- this.numKeyFields = numKeyFields;
- this.numValueFields = numValueFields;
-
- leafFrame = btree.getLeafFrameFactory().getFrame();
- interiorFrame = btree.getInteriorFrameFactory().getFrame();
- btreeCursor = new RangeSearchCursor(leafFrame);
- resultTupleAppender = new FrameTupleAppender(ctx.getFrameSize());
- resultTupleBuilder = new ArrayTupleBuilder(numValueFields);
- newResultBuffers.add(ctx.allocateFrame());
- prevResultBuffers.add(ctx.allocateFrame());
-
- MultiComparator btreeCmp = btree.getMultiComparator();
-
- keyCmps = new IBinaryComparator[numKeyFields];
- for (int i = 0; i < numKeyFields; i++) {
- keyCmps[i] = btreeCmp.getComparators()[i];
- }
-
- valueCmps = new IBinaryComparator[numValueFields];
- for (int i = 0; i < numValueFields; i++) {
- valueCmps[i] = btreeCmp.getComparators()[numKeyFields + i];
- }
-
- MultiComparator searchCmp = new MultiComparator(btreeCmp.getTypeTraits(), keyCmps);
- pred.setLowKeyComparator(searchCmp);
- pred.setHighKeyComparator(searchCmp);
- pred.setLowKey(searchKey, true);
- pred.setHighKey(searchKey, true);
-
- ISerializerDeserializer[] valueSerde = new ISerializerDeserializer[numValueFields];
- for (int i = 0; i < numValueFields; i++) {
- valueSerde[i] = btreeRecDesc.getFields()[numKeyFields + i];
-
- }
- RecordDescriptor valueRecDesc = new RecordDescriptor(valueSerde);
- resultFrameAccessor = new FrameTupleAccessor(ctx.getFrameSize(), valueRecDesc);
- }
-
- public void search(ITupleReference queryTuple, int queryFieldIndex) throws Exception {
-
- // parse query, TODO: this parsing is too simple
- RecordDescriptor queryTokenRecDesc = new RecordDescriptor(
- new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE });
-
- ArrayTupleBuilder queryTokenBuilder = new ArrayTupleBuilder(queryTokenRecDesc.getFields().length);
- DataOutput queryTokenDos = queryTokenBuilder.getDataOutput();
- FrameTupleAppender queryTokenAppender = new FrameTupleAppender(ctx.getFrameSize());
- ByteBuffer queryTokenFrame = ctx.allocateFrame();
- queryTokenAppender.reset(queryTokenFrame, true);
-
- queryTokenizer.reset(queryTuple.getFieldData(queryFieldIndex), queryTuple.getFieldStart(queryFieldIndex),
- queryTuple.getFieldLength(queryFieldIndex));
- while (queryTokenizer.hasNext()) {
- queryTokenizer.next();
-
- queryTokenBuilder.reset();
- try {
- queryTokenizer.writeToken(queryTokenDos);
- queryTokenBuilder.addFieldEndOffset();
- } catch (IOException e) {
- throw new HyracksDataException(e);
- }
-
- // WARNING: assuming one frame is enough to hold all tokens
- queryTokenAppender.append(queryTokenBuilder.getFieldEndOffsets(), queryTokenBuilder.getByteArray(), 0,
- queryTokenBuilder.getSize());
- }
-
- FrameTupleAccessor queryTokenAccessor = new FrameTupleAccessor(ctx.getFrameSize(), queryTokenRecDesc);
- queryTokenAccessor.reset(queryTokenFrame);
- int numQueryTokens = queryTokenAccessor.getTupleCount();
-
- maxResultBufIdx = 0;
-
- BTreeOpContext opCtx = btree.createOpContext(BTreeOp.BTO_SEARCH, leafFrame, interiorFrame, null);
-
- resultTupleAppender.reset(newResultBuffers.get(0), true);
- try {
- // append first inverted list to temporary results
- searchKey.reset(queryTokenAccessor, 0);
- btree.search(btreeCursor, pred, opCtx);
- while (btreeCursor.hasNext()) {
- btreeCursor.next();
- maxResultBufIdx = appendTupleToNewResults(btreeCursor, maxResultBufIdx);
- }
- btreeCursor.close();
- btreeCursor.reset();
- } catch (Exception e) {
- throw new HyracksDataException(e);
- }
-
- resultFrameAccessor.reset(newResultBuffers.get(0));
-
- // intersect temporary results with remaining inverted lists
- for (int i = 1; i < numQueryTokens; i++) {
- swap = prevResultBuffers;
- prevResultBuffers = newResultBuffers;
- newResultBuffers = swap;
- try {
- searchKey.reset(queryTokenAccessor, i);
- btree.search(btreeCursor, pred, opCtx);
- maxResultBufIdx = intersectList(btreeCursor, prevResultBuffers, maxResultBufIdx, newResultBuffers);
- } catch (Exception e) {
- throw new HyracksDataException(e);
- }
- btreeCursor.close();
- btreeCursor.reset();
- }
- }
-
- private int appendTupleToNewResults(IBTreeCursor btreeCursor, int newBufIdx) throws IOException {
- ByteBuffer newCurrentBuffer = newResultBuffers.get(newBufIdx);
-
- ITupleReference tuple = btreeCursor.getTuple();
- resultTupleBuilder.reset();
- DataOutput dos = resultTupleBuilder.getDataOutput();
- for (int i = 0; i < numValueFields; i++) {
- int fIdx = numKeyFields + i;
- dos.write(tuple.getFieldData(fIdx), tuple.getFieldStart(fIdx), tuple.getFieldLength(fIdx));
- resultTupleBuilder.addFieldEndOffset();
- }
-
- if (!resultTupleAppender.append(resultTupleBuilder.getFieldEndOffsets(), resultTupleBuilder.getByteArray(), 0,
- resultTupleBuilder.getSize())) {
- newBufIdx++;
- if (newBufIdx >= newResultBuffers.size()) {
- newResultBuffers.add(ctx.allocateFrame());
- }
- newCurrentBuffer = newResultBuffers.get(newBufIdx);
- resultTupleAppender.reset(newCurrentBuffer, true);
- if (!resultTupleAppender.append(resultTupleBuilder.getFieldEndOffsets(), resultTupleBuilder.getByteArray(),
- 0, resultTupleBuilder.getSize())) {
- throw new IllegalStateException();
- }
- }
-
- return newBufIdx;
- }
-
- private int intersectList(IBTreeCursor btreeCursor, List<ByteBuffer> prevResultBuffers, int maxPrevBufIdx,
- List<ByteBuffer> newResultBuffers) throws IOException, Exception {
-
- int newBufIdx = 0;
- ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
-
- int prevBufIdx = 0;
- ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
-
- resultTupleBuilder.reset();
- resultTupleAppender.reset(newCurrentBuffer, true);
- resultFrameAccessor.reset(prevCurrentBuffer);
-
- // WARNING: not very efficient but good enough for the first cut
- boolean advanceCursor = true;
- boolean advancePrevResult = false;
- int resultTidx = 0;
-
- while ((!advanceCursor || btreeCursor.hasNext()) && prevBufIdx <= maxPrevBufIdx
- && resultTidx < resultFrameAccessor.getTupleCount()) {
-
- if (advanceCursor)
- btreeCursor.next();
- ITupleReference tuple = btreeCursor.getTuple();
-
- int cmp = 0;
- for (int i = 0; i < valueCmps.length; i++) {
- int tupleFidx = numKeyFields + i;
- cmp = valueCmps[i].compare(tuple.getFieldData(tupleFidx), tuple.getFieldStart(tupleFidx),
- tuple.getFieldLength(tupleFidx), resultFrameAccessor.getBuffer().array(),
- resultFrameAccessor.getTupleStartOffset(resultTidx) + resultFrameAccessor.getFieldSlotsLength()
- + resultFrameAccessor.getFieldStartOffset(resultTidx, i),
- resultFrameAccessor.getFieldLength(resultTidx, i));
- if (cmp != 0)
- break;
- }
-
- // match found
- if (cmp == 0) {
- newBufIdx = appendTupleToNewResults(btreeCursor, newBufIdx);
-
- advanceCursor = true;
- advancePrevResult = true;
- } else {
- if (cmp < 0) {
- advanceCursor = true;
- advancePrevResult = false;
- } else {
- advanceCursor = false;
- advancePrevResult = true;
- }
- }
-
- if (advancePrevResult) {
- resultTidx++;
- if (resultTidx >= resultFrameAccessor.getTupleCount()) {
- prevBufIdx++;
- if (prevBufIdx <= maxPrevBufIdx) {
- prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
- resultFrameAccessor.reset(prevCurrentBuffer);
- resultTidx = 0;
- }
- }
- }
- }
-
- return newBufIdx;
- }
-
- @Override
- public IInvertedIndexResultCursor getResultCursor() {
- resultCursor.setResults(newResultBuffers, maxResultBufIdx + 1);
- return resultCursor;
- }
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
new file mode 100644
index 0000000..7505ae0
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
@@ -0,0 +1,543 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.dataflow.value.TypeTrait;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrame;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOpContext;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeRangeSearchCursor;
+import edu.uci.ics.hyracks.storage.am.btree.impls.RangePredicate;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexCursor;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexFrame;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.IndexOp;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearcher;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
+
+public class TOccurrenceSearcher implements IInvertedIndexSearcher {
+
+ protected final IHyracksTaskContext ctx;
+ protected final FixedSizeFrameTupleAppender resultFrameTupleApp;
+ protected final FixedSizeFrameTupleAccessor resultFrameTupleAcc;
+ protected final FixedSizeTupleReference resultTuple;
+ protected final int invListKeyLength;
+ protected int currentNumResults;
+
+ protected List<ByteBuffer> newResultBuffers = new ArrayList<ByteBuffer>();
+ protected List<ByteBuffer> prevResultBuffers = new ArrayList<ByteBuffer>();
+ protected List<ByteBuffer> swap = null;
+ protected int maxResultBufIdx = 0;
+
+ protected final ITreeIndexFrame leafFrame;
+ protected final ITreeIndexFrame interiorFrame;
+ protected final ITreeIndexCursor btreeCursor;
+ protected final FrameTupleReference searchKey = new FrameTupleReference();
+ protected final RangePredicate btreePred = new RangePredicate(true, null, null, true, true, null, null);
+ protected final BTreeOpContext btreeOpCtx;
+
+ protected RecordDescriptor queryTokenRecDesc = new RecordDescriptor(
+ new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE });
+ protected ArrayTupleBuilder queryTokenBuilder = new ArrayTupleBuilder(queryTokenRecDesc.getFields().length);
+ protected DataOutput queryTokenDos = queryTokenBuilder.getDataOutput();
+ protected FrameTupleAppender queryTokenAppender;
+ protected ByteBuffer queryTokenFrame;
+
+ protected final InvertedIndex invIndex;
+ protected final IBinaryTokenizer queryTokenizer;
+ protected final ITypeTrait[] invListFieldsWithCount;
+ protected int occurrenceThreshold;
+
+ protected final int cursorCacheSize = 10;
+ protected List<IInvertedListCursor> invListCursorCache = new ArrayList<IInvertedListCursor>(cursorCacheSize);
+ protected List<IInvertedListCursor> invListCursors = new ArrayList<IInvertedListCursor>(cursorCacheSize);
+
+ public TOccurrenceSearcher(IHyracksTaskContext ctx, InvertedIndex invIndex, IBinaryTokenizer queryTokenizer) {
+ this.ctx = ctx;
+ this.invIndex = invIndex;
+ this.queryTokenizer = queryTokenizer;
+
+ leafFrame = invIndex.getBTree().getLeafFrameFactory().createFrame();
+ interiorFrame = invIndex.getBTree().getInteriorFrameFactory().createFrame();
+
+ btreeCursor = new BTreeRangeSearchCursor((IBTreeLeafFrame) leafFrame);
+ ITypeTrait[] invListFields = invIndex.getInvListElementCmp().getTypeTraits();
+ invListFieldsWithCount = new TypeTrait[invListFields.length + 1];
+ int tmp = 0;
+ for (int i = 0; i < invListFields.length; i++) {
+ invListFieldsWithCount[i] = invListFields[i];
+ tmp += invListFields[i].getStaticallyKnownDataLength();
+ }
+ // using an integer for counting occurrences
+ invListFieldsWithCount[invListFields.length] = new TypeTrait(4);
+ invListKeyLength = tmp;
+
+ btreeOpCtx = invIndex.getBTree().createOpContext(IndexOp.SEARCH, leafFrame, interiorFrame, null);
+
+ resultFrameTupleApp = new FixedSizeFrameTupleAppender(ctx.getFrameSize(), invListFieldsWithCount);
+ resultFrameTupleAcc = new FixedSizeFrameTupleAccessor(ctx.getFrameSize(), invListFieldsWithCount);
+ resultTuple = new FixedSizeTupleReference(invListFieldsWithCount);
+ newResultBuffers.add(ctx.allocateFrame());
+ prevResultBuffers.add(ctx.allocateFrame());
+
+ MultiComparator searchCmp = invIndex.getBTree().getMultiComparator();
+ btreePred.setLowKeyComparator(searchCmp);
+ btreePred.setHighKeyComparator(searchCmp);
+ btreePred.setLowKey(searchKey, true);
+ btreePred.setHighKey(searchKey, true);
+
+ // pre-create cursor objects
+ for (int i = 0; i < cursorCacheSize; i++) {
+ invListCursorCache.add(new FixedSizeElementInvertedListCursor(invIndex.getBufferCache(), invIndex
+ .getInvListsFileId(), invIndex.getInvListElementCmp().getTypeTraits()));
+ }
+
+ queryTokenAppender = new FrameTupleAppender(ctx.getFrameSize());
+ queryTokenFrame = ctx.allocateFrame();
+
+ currentNumResults = 0;
+ }
+
+ public void reset() {
+ for (ByteBuffer b : newResultBuffers) {
+ resultFrameTupleApp.reset(b, true);
+ }
+ for (ByteBuffer b : prevResultBuffers) {
+ resultFrameTupleApp.reset(b, true);
+ }
+ currentNumResults = 0;
+ }
+
+ public void search(IInvertedIndexResultCursor resultCursor, ITupleReference queryTuple, int queryFieldIndex,
+ IInvertedIndexSearchModifier searchModifier) throws Exception {
+
+ queryTokenAppender.reset(queryTokenFrame, true);
+ queryTokenizer.reset(queryTuple.getFieldData(queryFieldIndex), queryTuple.getFieldStart(queryFieldIndex),
+ queryTuple.getFieldLength(queryFieldIndex));
+
+ while (queryTokenizer.hasNext()) {
+ queryTokenizer.next();
+ queryTokenBuilder.reset();
+ try {
+ IToken token = queryTokenizer.getToken();
+ token.serializeToken(queryTokenDos);
+ queryTokenBuilder.addFieldEndOffset();
+ // WARNING: assuming one frame is big enough to hold all tokens
+ queryTokenAppender.append(queryTokenBuilder.getFieldEndOffsets(), queryTokenBuilder.getByteArray(), 0,
+ queryTokenBuilder.getSize());
+ } catch (IOException e) {
+ throw new HyracksDataException(e);
+ }
+ }
+
+ FrameTupleAccessor queryTokenAccessor = new FrameTupleAccessor(ctx.getFrameSize(), queryTokenRecDesc);
+ queryTokenAccessor.reset(queryTokenFrame);
+ int numQueryTokens = queryTokenAccessor.getTupleCount();
+
+ // expand cursor cache if necessary
+ if (numQueryTokens > invListCursorCache.size()) {
+ int diff = numQueryTokens - invListCursorCache.size();
+ for (int i = 0; i < diff; i++) {
+ invListCursorCache.add(new FixedSizeElementInvertedListCursor(invIndex.getBufferCache(), invIndex
+ .getInvListsFileId(), invIndex.getInvListElementCmp().getTypeTraits()));
+ }
+ }
+
+ invListCursors.clear();
+ for (int i = 0; i < numQueryTokens; i++) {
+ searchKey.reset(queryTokenAccessor, i);
+ invIndex.openCursor(btreeCursor, btreePred, btreeOpCtx, invListCursorCache.get(i));
+ invListCursors.add(invListCursorCache.get(i));
+ }
+
+ occurrenceThreshold = searchModifier.getOccurrenceThreshold(invListCursors);
+
+ // TODO: deal with panic cases properly
+ if (occurrenceThreshold <= 0) {
+ throw new OccurrenceThresholdPanicException("Merge Threshold is <= 0. Failing Search.");
+ }
+
+ int numPrefixLists = searchModifier.getPrefixLists(invListCursors);
+ maxResultBufIdx = mergePrefixLists(numPrefixLists, numQueryTokens);
+ maxResultBufIdx = mergeSuffixLists(numPrefixLists, numQueryTokens, maxResultBufIdx);
+
+ resultCursor.reset(this);
+ }
+
+ protected int mergePrefixLists(int numPrefixTokens, int numQueryTokens) throws IOException {
+ int maxPrevBufIdx = 0;
+ for (int i = 0; i < numPrefixTokens; i++) {
+ swap = prevResultBuffers;
+ prevResultBuffers = newResultBuffers;
+ newResultBuffers = swap;
+ currentNumResults = 0;
+
+ invListCursors.get(i).pinPagesSync();
+ maxPrevBufIdx = mergePrefixList(invListCursors.get(i), prevResultBuffers, maxPrevBufIdx, newResultBuffers);
+ invListCursors.get(i).unpinPages();
+ }
+ return maxPrevBufIdx;
+ }
+
+ protected int mergeSuffixLists(int numPrefixTokens, int numQueryTokens, int maxPrevBufIdx) throws IOException {
+ for (int i = numPrefixTokens; i < numQueryTokens; i++) {
+ swap = prevResultBuffers;
+ prevResultBuffers = newResultBuffers;
+ newResultBuffers = swap;
+
+ invListCursors.get(i).pinPagesSync();
+ int numInvListElements = invListCursors.get(i).getNumElements();
+ // should we binary search the next list or should we sort-merge it?
+ if (currentNumResults * Math.log(numInvListElements) < currentNumResults + numInvListElements) {
+ maxPrevBufIdx = mergeSuffixListProbe(invListCursors.get(i), prevResultBuffers, maxPrevBufIdx,
+ newResultBuffers, i, numQueryTokens);
+ } else {
+ maxPrevBufIdx = mergeSuffixListScan(invListCursors.get(i), prevResultBuffers, maxPrevBufIdx,
+ newResultBuffers, i, numQueryTokens);
+ }
+ invListCursors.get(i).unpinPages();
+ }
+ return maxPrevBufIdx;
+ }
+
+ protected int mergeSuffixListProbe(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers,
+ int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws IOException {
+
+ int newBufIdx = 0;
+ ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
+
+ int prevBufIdx = 0;
+ ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
+
+ int resultTidx = 0;
+
+ currentNumResults = 0;
+
+ MultiComparator invListCmp = invIndex.getInvListElementCmp();
+
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultFrameTupleApp.reset(newCurrentBuffer, true);
+
+ while (resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+ resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+ resultTuple.getFieldStart(resultTuple.getFieldCount() - 1));
+
+ if (invListCursor.containsKey(resultTuple, invListCmp)) {
+ count++;
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ } else {
+ if (count + numQueryTokens - invListIx > occurrenceThreshold) {
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ }
+ }
+
+ resultTidx++;
+ if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+ prevBufIdx++;
+ if (prevBufIdx <= maxPrevBufIdx) {
+ prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultTidx = 0;
+ }
+ }
+ }
+
+ return newBufIdx;
+ }
+
+ protected int mergeSuffixListScan(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers,
+ int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws IOException {
+ int newBufIdx = 0;
+ ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
+
+ int prevBufIdx = 0;
+ ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
+
+ boolean advanceCursor = true;
+ boolean advancePrevResult = false;
+ int resultTidx = 0;
+
+ MultiComparator invListCmp = invIndex.getInvListElementCmp();
+
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultFrameTupleApp.reset(newCurrentBuffer, true);
+
+ int invListTidx = 0;
+ int invListNumTuples = invListCursor.getNumElements();
+
+ if (invListCursor.hasNext())
+ invListCursor.next();
+
+ while (invListTidx < invListNumTuples && resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+ ITupleReference invListTuple = invListCursor.getTuple();
+
+ resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+
+ int cmp = invListCmp.compare(invListTuple, resultTuple);
+ if (cmp == 0) {
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+ resultTuple.getFieldStart(resultTuple.getFieldCount() - 1)) + 1;
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ advanceCursor = true;
+ advancePrevResult = true;
+ } else {
+ if (cmp < 0) {
+ advanceCursor = true;
+ advancePrevResult = false;
+ } else {
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+ resultTuple.getFieldStart(resultTuple.getFieldCount() - 1));
+ if (count + numQueryTokens - invListIx > occurrenceThreshold) {
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ }
+ advanceCursor = false;
+ advancePrevResult = true;
+ }
+ }
+
+ if (advancePrevResult) {
+ resultTidx++;
+ if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+ prevBufIdx++;
+ if (prevBufIdx <= maxPrevBufIdx) {
+ prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultTidx = 0;
+ }
+ }
+ }
+
+ if (advanceCursor) {
+ invListTidx++;
+ invListCursor.next();
+ }
+ }
+
+ // append remaining elements from previous result set
+ while (resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+ resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+ resultTuple.getFieldStart(resultTuple.getFieldCount() - 1));
+ if (count + numQueryTokens - invListIx > occurrenceThreshold) {
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ }
+
+ resultTidx++;
+ if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+ prevBufIdx++;
+ if (prevBufIdx <= maxPrevBufIdx) {
+ prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultTidx = 0;
+ }
+ }
+ }
+
+ return newBufIdx;
+ }
+
+ protected int mergePrefixList(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers,
+ int maxPrevBufIdx, List<ByteBuffer> newResultBuffers) throws IOException {
+ int newBufIdx = 0;
+ ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
+
+ int prevBufIdx = 0;
+ ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
+
+ boolean advanceCursor = true;
+ boolean advancePrevResult = false;
+ int resultTidx = 0;
+
+ MultiComparator invListCmp = invIndex.getInvListElementCmp();
+
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultFrameTupleApp.reset(newCurrentBuffer, true);
+
+ int invListTidx = 0;
+ int invListNumTuples = invListCursor.getNumElements();
+
+ if (invListCursor.hasNext())
+ invListCursor.next();
+
+ while (invListTidx < invListNumTuples && resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+ ITupleReference invListTuple = invListCursor.getTuple();
+ resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+
+ int cmp = invListCmp.compare(invListTuple, resultTuple);
+ if (cmp == 0) {
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+ resultTuple.getFieldStart(resultTuple.getFieldCount() - 1)) + 1;
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ advanceCursor = true;
+ advancePrevResult = true;
+ } else {
+ if (cmp < 0) {
+ int count = 1;
+ newBufIdx = appendTupleToNewResults(invListTuple, count, newBufIdx);
+ advanceCursor = true;
+ advancePrevResult = false;
+ } else {
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+ resultTuple.getFieldStart(resultTuple.getFieldCount() - 1));
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ advanceCursor = false;
+ advancePrevResult = true;
+ }
+ }
+
+ if (advancePrevResult) {
+ resultTidx++;
+ if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+ prevBufIdx++;
+ if (prevBufIdx <= maxPrevBufIdx) {
+ prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultTidx = 0;
+ }
+ }
+ }
+
+ if (advanceCursor) {
+ invListTidx++;
+ invListCursor.next();
+ }
+ }
+
+ // append remaining new elements from inverted list
+ while (invListTidx < invListNumTuples) {
+ ITupleReference invListTuple = invListCursor.getTuple();
+ newBufIdx = appendTupleToNewResults(invListTuple, 1, newBufIdx);
+ invListTidx++;
+ invListCursor.next();
+ }
+
+ // append remaining elements from previous result set
+ while (resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+ resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+ resultTuple.getFieldStart(resultTuple.getFieldCount() - 1));
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+
+ resultTidx++;
+ if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+ prevBufIdx++;
+ if (prevBufIdx <= maxPrevBufIdx) {
+ prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultTidx = 0;
+ }
+ }
+ }
+
+ return newBufIdx;
+ }
+
+ protected int appendTupleToNewResults(ITupleReference tuple, int newCount, int newBufIdx) throws IOException {
+ ByteBuffer newCurrentBuffer = newResultBuffers.get(newBufIdx);
+
+ if (!resultFrameTupleApp.hasSpace()) {
+ newBufIdx++;
+ if (newBufIdx >= newResultBuffers.size()) {
+ newResultBuffers.add(ctx.allocateFrame());
+ }
+ newCurrentBuffer = newResultBuffers.get(newBufIdx);
+ resultFrameTupleApp.reset(newCurrentBuffer, true);
+ }
+
+ // append key
+ if (!resultFrameTupleApp.append(tuple.getFieldData(0), tuple.getFieldStart(0), invListKeyLength)) {
+ throw new IllegalStateException();
+ }
+
+ // append new count
+ if (!resultFrameTupleApp.append(newCount)) {
+ throw new IllegalStateException();
+ }
+
+ resultFrameTupleApp.incrementTupleCount(1);
+
+ currentNumResults++;
+
+ return newBufIdx;
+ }
+
+ public IFrameTupleAccessor createResultFrameTupleAccessor() {
+ return new FixedSizeFrameTupleAccessor(ctx.getFrameSize(), invListFieldsWithCount);
+ }
+
+ public ITupleReference createResultTupleReference() {
+ return new FixedSizeTupleReference(invListFieldsWithCount);
+ }
+
+ @Override
+ public List<ByteBuffer> getResultBuffers() {
+ return newResultBuffers;
+ }
+
+ @Override
+ public int getNumValidResultBuffers() {
+ return maxResultBufIdx + 1;
+ }
+
+ public int getOccurrenceThreshold() {
+ return occurrenceThreshold;
+ }
+
+ public void printNewResults(int maxResultBufIdx) {
+ StringBuffer strBuffer = new StringBuffer();
+ for (int i = 0; i <= maxResultBufIdx; i++) {
+ ByteBuffer testBuf = newResultBuffers.get(i);
+ resultFrameTupleAcc.reset(testBuf);
+ for (int j = 0; j < resultFrameTupleAcc.getTupleCount(); j++) {
+ strBuffer.append(IntegerSerializerDeserializer.getInt(resultFrameTupleAcc.getBuffer().array(),
+ resultFrameTupleAcc.getFieldStartOffset(j, 0)) + ",");
+ strBuffer.append(IntegerSerializerDeserializer.getInt(resultFrameTupleAcc.getBuffer().array(),
+ resultFrameTupleAcc.getFieldStartOffset(j, 1)) + " ");
+ }
+ }
+ System.out.println(strBuffer.toString());
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java
new file mode 100644
index 0000000..0f5439b
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
+
+public class TOccurrenceSearcherSuffixProbeOnly extends TOccurrenceSearcher {
+
+ public TOccurrenceSearcherSuffixProbeOnly(IHyracksTaskContext ctx, InvertedIndex invIndex,
+ IBinaryTokenizer queryTokenizer) {
+ super(ctx, invIndex, queryTokenizer);
+ }
+
+ protected int mergeSuffixLists(int numPrefixTokens, int numQueryTokens, int maxPrevBufIdx) throws IOException {
+ for (int i = numPrefixTokens; i < numQueryTokens; i++) {
+ swap = prevResultBuffers;
+ prevResultBuffers = newResultBuffers;
+ newResultBuffers = swap;
+ currentNumResults = 0;
+
+ invListCursors.get(i).pinPagesSync();
+ maxPrevBufIdx = mergeSuffixListProbe(invListCursors.get(i), prevResultBuffers, maxPrevBufIdx,
+ newResultBuffers, i, numQueryTokens);
+ invListCursors.get(i).unpinPages();
+ }
+ return maxPrevBufIdx;
+ }
+
+ protected int mergeSuffixListProbe(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers,
+ int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws IOException {
+
+ int newBufIdx = 0;
+ ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
+
+ int prevBufIdx = 0;
+ ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
+
+ int resultTidx = 0;
+
+ MultiComparator invListCmp = invIndex.getInvListElementCmp();
+
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultFrameTupleApp.reset(newCurrentBuffer, true);
+
+ while (resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+ resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+ resultTuple.getFieldStart(resultTuple.getFieldCount() - 1));
+
+ if (invListCursor.containsKey(resultTuple, invListCmp)) {
+ count++;
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ } else {
+ if (count + numQueryTokens - invListIx > occurrenceThreshold) {
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ }
+ }
+
+ resultTidx++;
+ if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+ prevBufIdx++;
+ if (prevBufIdx <= maxPrevBufIdx) {
+ prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultTidx = 0;
+ }
+ }
+ }
+
+ return newBufIdx;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java
new file mode 100644
index 0000000..b997dc9
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java
@@ -0,0 +1,133 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
+
+public class TOccurrenceSearcherSuffixScanOnly extends TOccurrenceSearcher {
+
+ public TOccurrenceSearcherSuffixScanOnly(IHyracksTaskContext ctx, InvertedIndex invIndex,
+ IBinaryTokenizer queryTokenizer) {
+ super(ctx, invIndex, queryTokenizer);
+ }
+
+ protected int mergeSuffixLists(int numPrefixTokens, int numQueryTokens, int maxPrevBufIdx) throws IOException {
+ for (int i = numPrefixTokens; i < numQueryTokens; i++) {
+ swap = prevResultBuffers;
+ prevResultBuffers = newResultBuffers;
+ newResultBuffers = swap;
+ currentNumResults = 0;
+
+ invListCursors.get(i).pinPagesSync();
+ maxPrevBufIdx = mergeSuffixListScan(invListCursors.get(i), prevResultBuffers, maxPrevBufIdx,
+ newResultBuffers, i, numQueryTokens);
+ invListCursors.get(i).unpinPages();
+ }
+ return maxPrevBufIdx;
+ }
+
+ protected int mergeSuffixListScan(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers,
+ int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws IOException {
+
+ int newBufIdx = 0;
+ ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
+
+ int prevBufIdx = 0;
+ ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
+
+ boolean advanceCursor = true;
+ boolean advancePrevResult = false;
+ int resultTidx = 0;
+
+ MultiComparator invListCmp = invIndex.getInvListElementCmp();
+
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultFrameTupleApp.reset(newCurrentBuffer, true);
+
+ while (invListCursor.hasNext() && resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+ if (advanceCursor)
+ invListCursor.next();
+
+ ITupleReference invListTuple = invListCursor.getTuple();
+
+ resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+
+ int cmp = invListCmp.compare(invListTuple, resultTuple);
+ if (cmp == 0) {
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+ resultTuple.getFieldStart(resultTuple.getFieldCount() - 1)) + 1;
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ advanceCursor = true;
+ advancePrevResult = true;
+ } else {
+ if (cmp < 0) {
+ advanceCursor = true;
+ advancePrevResult = false;
+ } else {
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+ resultTuple.getFieldStart(resultTuple.getFieldCount() - 1));
+ if (count + numQueryTokens - invListIx > occurrenceThreshold) {
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ }
+ advanceCursor = false;
+ advancePrevResult = true;
+ }
+ }
+
+ if (advancePrevResult) {
+ resultTidx++;
+ if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+ prevBufIdx++;
+ if (prevBufIdx <= maxPrevBufIdx) {
+ prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultTidx = 0;
+ }
+ }
+ }
+ }
+
+ // append remaining elements from previous result set
+ while (resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+ resultTuple.getFieldStart(resultTuple.getFieldCount() - 1));
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+
+ resultTidx++;
+ if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+ prevBufIdx++;
+ if (prevBufIdx <= maxPrevBufIdx) {
+ prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultTidx = 0;
+ }
+ }
+ }
+
+ return newBufIdx;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/ConjunctiveSearchModifier.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/ConjunctiveSearchModifier.java
new file mode 100644
index 0000000..55945be
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/ConjunctiveSearchModifier.java
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers;
+
+import java.util.Collections;
+import java.util.List;
+
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
+
+public class ConjunctiveSearchModifier implements IInvertedIndexSearchModifier {
+
+ @Override
+ public int getOccurrenceThreshold(List<IInvertedListCursor> invListCursors) {
+ return invListCursors.size();
+ }
+
+ @Override
+ public int getPrefixLists(List<IInvertedListCursor> invListCursors) {
+ Collections.sort(invListCursors);
+ return 1;
+ }
+
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/EditDistanceSearchModifier.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/EditDistanceSearchModifier.java
new file mode 100644
index 0000000..ac109b6
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/EditDistanceSearchModifier.java
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers;
+
+import java.util.Collections;
+import java.util.List;
+
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
+
+public class EditDistanceSearchModifier implements IInvertedIndexSearchModifier {
+
+ private int gramLength;
+ private int edThresh;
+
+ public EditDistanceSearchModifier(int gramLength, int edThresh) {
+ this.gramLength = gramLength;
+ this.edThresh = edThresh;
+ }
+
+ @Override
+ public int getOccurrenceThreshold(List<IInvertedListCursor> invListCursors) {
+ return invListCursors.size() - edThresh * gramLength;
+ }
+
+ @Override
+ public int getPrefixLists(List<IInvertedListCursor> invListCursors) {
+ Collections.sort(invListCursors);
+ return invListCursors.size() - getOccurrenceThreshold(invListCursors) + 1;
+ }
+
+ public int getGramLength() {
+ return gramLength;
+ }
+
+ public void setGramLength(int gramLength) {
+ this.gramLength = gramLength;
+ }
+
+ public int getEdThresh() {
+ return edThresh;
+ }
+
+ public void setEdThresh(int edThresh) {
+ this.edThresh = edThresh;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/JaccardSearchModifier.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/JaccardSearchModifier.java
new file mode 100644
index 0000000..f42841c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/JaccardSearchModifier.java
@@ -0,0 +1,53 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers;
+
+import java.util.Collections;
+import java.util.List;
+
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
+
+public class JaccardSearchModifier implements IInvertedIndexSearchModifier {
+
+ private float jaccThresh;
+
+ public JaccardSearchModifier(float jaccThresh) {
+ this.jaccThresh = jaccThresh;
+ }
+
+ @Override
+ public int getOccurrenceThreshold(List<IInvertedListCursor> invListCursors) {
+ return (int) Math.floor((float) invListCursors.size() * jaccThresh);
+ }
+
+ @Override
+ public int getPrefixLists(List<IInvertedListCursor> invListCursors) {
+ Collections.sort(invListCursors);
+ if (invListCursors.size() == 0) {
+ return 0;
+ }
+ return invListCursors.size() - getOccurrenceThreshold(invListCursors) + 1;
+ }
+
+ public float getJaccThresh() {
+ return jaccThresh;
+ }
+
+ public void setJaccThresh(float jaccThresh) {
+ this.jaccThresh = jaccThresh;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..bbb32d6
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8StringBinaryTokenizer.java
@@ -0,0 +1,78 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public abstract class AbstractUTF8StringBinaryTokenizer implements
+ IBinaryTokenizer {
+
+ protected byte[] data;
+ protected int start;
+ protected int length;
+ protected int tokenLength;
+ protected int index;
+ protected int utf8Length;
+
+ protected final IntArray tokensStart;
+ protected final IntArray tokensLength;
+ protected final IToken token;
+
+ protected final boolean ignoreTokenCount;
+ protected final boolean sourceHasTypeTag;
+
+ public AbstractUTF8StringBinaryTokenizer(boolean ignoreTokenCount,
+ boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+ this.ignoreTokenCount = ignoreTokenCount;
+ this.sourceHasTypeTag = sourceHasTypeTag;
+ if (!ignoreTokenCount) {
+ tokensStart = new IntArray();
+ tokensLength = new IntArray();
+ } else {
+ tokensStart = null;
+ tokensLength = null;
+ }
+ token = tokenFactory.createToken();
+ }
+
+ @Override
+ public IToken getToken() {
+ return token;
+ }
+
+ @Override
+ public void reset(byte[] data, int start, int length) {
+ this.start = start;
+ index = this.start;
+ if (sourceHasTypeTag) {
+ index++; // skip type tag
+ }
+ utf8Length = StringUtils.getUTFLen(data, index);
+ index += 2; // skip utf8 length indicator
+ this.data = data;
+ this.length = length + start;
+
+ tokenLength = 0;
+ if (!ignoreTokenCount) {
+ tokensStart.reset();
+ tokensLength.reset();
+ }
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java
new file mode 100644
index 0000000..92d6ac2
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8Token.java
@@ -0,0 +1,106 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public abstract class AbstractUTF8Token implements IToken {
+ public static final int GOLDEN_RATIO_32 = 0x09e3779b9;
+
+ protected int length;
+ protected int tokenLength;
+ protected int start;
+ protected int tokenCount;
+ protected byte[] data;
+ protected final byte tokenTypeTag;
+ protected final byte countTypeTag;
+
+ public AbstractUTF8Token() {
+ tokenTypeTag = -1;
+ countTypeTag = -1;
+ }
+
+ public AbstractUTF8Token(byte tokenTypeTag, byte countTypeTag) {
+ this.tokenTypeTag = tokenTypeTag;
+ this.countTypeTag = countTypeTag;
+ }
+
+ @Override
+ public byte[] getData() {
+ return data;
+ }
+
+ @Override
+ public int getLength() {
+ return length;
+ }
+
+ public int getLowerCaseUTF8Len(int size) {
+ int lowerCaseUTF8Len = 0;
+ int pos = start;
+ for (int i = 0; i < size; i++) {
+ char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+ lowerCaseUTF8Len += StringUtils.getModifiedUTF8Len(c);
+ pos += StringUtils.charSize(data, pos);
+ }
+ return lowerCaseUTF8Len;
+ }
+
+ @Override
+ public int getStart() {
+ return start;
+ }
+
+ @Override
+ public int getTokenLength() {
+ return tokenLength;
+ }
+
+ public void handleCountTypeTag(DataOutput dos) throws IOException {
+ if (countTypeTag > 0) {
+ dos.write(countTypeTag);
+ }
+ }
+
+ public void handleTokenTypeTag(DataOutput dos) throws IOException {
+ if (tokenTypeTag > 0) {
+ dos.write(tokenTypeTag);
+ }
+ }
+
+ @Override
+ public void reset(byte[] data, int start, int length, int tokenLength,
+ int tokenCount) {
+ this.data = data;
+ this.start = start;
+ this.length = length;
+ this.tokenLength = tokenLength;
+ this.tokenCount = tokenCount;
+ }
+
+ @Override
+ public void serializeTokenCount(DataOutput dos) throws IOException {
+ handleCountTypeTag(dos);
+ dos.writeInt(tokenCount);
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8TokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8TokenFactory.java
new file mode 100644
index 0000000..3b0b82d
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/AbstractUTF8TokenFactory.java
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public abstract class AbstractUTF8TokenFactory implements ITokenFactory {
+ private static final long serialVersionUID = 1L;
+ protected final byte tokenTypeTag;
+ protected final byte countTypeTag;
+
+ public AbstractUTF8TokenFactory() {
+ tokenTypeTag = -1;
+ countTypeTag = -1;
+ }
+
+ public AbstractUTF8TokenFactory(byte tokenTypeTag, byte countTypeTag) {
+ this.tokenTypeTag = tokenTypeTag;
+ this.countTypeTag = countTypeTag;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
index 425436b..de9ad2c 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
@@ -1,116 +1,87 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
-import java.io.DataOutput;
-import java.io.IOException;
-
-import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
-import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-public class DelimitedUTF8StringBinaryTokenizer implements IBinaryTokenizer {
+public class DelimitedUTF8StringBinaryTokenizer extends
+ AbstractUTF8StringBinaryTokenizer {
- private static final RecordDescriptor tokenSchema = new RecordDescriptor(
- new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE });
+ public DelimitedUTF8StringBinaryTokenizer(boolean ignoreTokenCount,
+ boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+ super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+ }
- private final char delimiter;
- private final byte typeTag;
- private byte[] data;
- private int start;
- private int length;
+ @Override
+ public boolean hasNext() {
+ // skip delimiters
+ while (index < length && isSeparator(StringUtils.charAt(data, index))) {
+ index += StringUtils.charSize(data, index);
+ }
+ return index < length;
+ }
- private int tokenLength;
- private int tokenStart;
- private int pos;
+ private boolean isSeparator(char c) {
+ return !(Character.isLetterOrDigit(c)
+ || Character.getType(c) == Character.OTHER_LETTER || Character
+ .getType(c) == Character.OTHER_NUMBER);
+ }
- public DelimitedUTF8StringBinaryTokenizer(char delimiter, byte typeTag) {
- this.delimiter = delimiter;
- this.typeTag = typeTag;
- }
+ @Override
+ public void next() {
+ tokenLength = 0;
+ int currentTokenStart = index;
+ while (index < length && !isSeparator(StringUtils.charAt(data, index))) {
+ index += StringUtils.charSize(data, index);
+ tokenLength++;
+ }
+ int tokenCount = 1;
+ if (tokenLength > 0 && !ignoreTokenCount) {
+ // search if we got the same token before
+ for (int i = 0; i < tokensStart.length(); ++i) {
+ if (tokenLength == tokensLength.get(i)) {
+ int tokenStart = tokensStart.get(i);
+ tokenCount++; // assume we found it
+ int offset = 0;
+ int currLength = 0;
+ while (currLength < tokenLength) {
+ // case insensitive comparison
+ if (StringUtils.toLowerCase(StringUtils.charAt(data,
+ currentTokenStart + offset)) != StringUtils
+ .toLowerCase(StringUtils.charAt(data,
+ tokenStart + offset))) {
+ tokenCount--;
+ break;
+ }
+ offset += StringUtils.charSize(data, currentTokenStart
+ + offset);
+ currLength++;
+ }
+ }
+ }
+ // add the new token to the list of seen tokens
+ tokensStart.add(currentTokenStart);
+ tokensLength.add(tokenLength);
+ }
- public DelimitedUTF8StringBinaryTokenizer(char delimiter) {
- this.delimiter = delimiter;
- this.typeTag = -1;
- }
-
- @Override
- public int getTokenLength() {
- return tokenLength;
- }
-
- @Override
- public int getTokenStartOff() {
- return tokenStart;
- }
-
- @Override
- public boolean hasNext() {
- if (pos >= start + length)
- return false;
- else
- return true;
- }
-
- @Override
- public void next() {
- tokenLength = 0;
- tokenStart = pos;
- while (pos < start + length) {
- int len = StringUtils.charSize(data, pos);
- char ch = StringUtils.charAt(data, pos);
- pos += len;
- if (ch == delimiter) {
- break;
- }
- tokenLength += len;
- }
- }
-
- @Override
- public void reset(byte[] data, int start, int length) {
- this.data = data;
- this.start = start;
- this.pos = start;
- this.length = length;
- this.tokenLength = 0;
- this.tokenStart = 0;
- pos += 2; // UTF-8 specific
- }
-
- @Override
- public void writeToken(DataOutput dos) throws IOException {
- if (typeTag > 0)
- dos.write(typeTag);
-
- // WARNING: 2-byte length indicator is specific to UTF-8
- dos.writeShort((short) tokenLength);
- dos.write(data, tokenStart, tokenLength);
- }
-
- @Override
- public RecordDescriptor getTokenSchema() {
- return tokenSchema;
- }
-
- // cannot be implemented for this tokenizer
- @Override
- public int getNumTokens() {
- return -1;
- }
+ // set token
+ token.reset(data, currentTokenStart, index, tokenLength, tokenCount);
+ }
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
index 2e85db5..4a350b3 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
@@ -1,41 +1,42 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
+ * You may obtain a copy of the License at
+ *
* http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
*/
package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
+public class DelimitedUTF8StringBinaryTokenizerFactory implements
+ IBinaryTokenizerFactory {
-public class DelimitedUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
+ private static final long serialVersionUID = 1L;
+ private final boolean ignoreTokenCount;
+ private final boolean sourceHasTypeTag;
+ private final ITokenFactory tokenFactory;
- private static final long serialVersionUID = 1L;
- private final char delimiter;
- private final byte typeTag;
+ public DelimitedUTF8StringBinaryTokenizerFactory(boolean ignoreTokenCount,
+ boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+ this.ignoreTokenCount = ignoreTokenCount;
+ this.sourceHasTypeTag = sourceHasTypeTag;
+ this.tokenFactory = tokenFactory;
+ }
- public DelimitedUTF8StringBinaryTokenizerFactory(char delimiter, byte typeTag) {
- this.delimiter = delimiter;
- this.typeTag = typeTag;
- }
-
- public DelimitedUTF8StringBinaryTokenizerFactory(char delimiter) {
- this.delimiter = delimiter;
- this.typeTag = -1;
- }
-
- @Override
- public IBinaryTokenizer createBinaryTokenizer() {
- return new DelimitedUTF8StringBinaryTokenizer(delimiter, typeTag);
- }
+ @Override
+ public IBinaryTokenizer createTokenizer() {
+ return new DelimitedUTF8StringBinaryTokenizer(ignoreTokenCount,
+ sourceHasTypeTag, tokenFactory);
+ }
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java
deleted file mode 100644
index 2cc0b6c..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
-
-import java.io.DataOutput;
-import java.io.IOException;
-
-import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
-import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
-import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-
-public class HashedQGramUTF8StringBinaryTokenizer implements IBinaryTokenizer {
-
- private static final RecordDescriptor tokenSchema = new RecordDescriptor(
- new ISerializerDeserializer[] { IntegerSerializerDeserializer.INSTANCE });
-
- private final boolean prePost;
- private final int q;
- private byte[] data;
- private int start;
- private int length;
- private int gramNum;
- private int utflen;
-
- private final char PRECHAR = '#';
- private final char POSTCHAR = '$';
-
- private int charPos;
- private int pos;
- private int hashedGram;
-
- HashedQGramUTF8StringBinaryTokenizer(int q, boolean prePost) {
- this.prePost = prePost;
- this.q = q;
- }
-
- @Override
- public int getTokenLength() {
- // the produced token (hashed q-gram) is derived from data
- // but not contained in it
- // therefore this call does not make sense
- return -1;
- }
-
- @Override
- public int getTokenStartOff() {
- // the produced token (hashed q-gram) is derived from data
- // but not contained in it
- // therefore this call does not make sense
- return -1;
- }
-
- @Override
- public boolean hasNext() {
- if ((prePost && pos >= start + length) || (!prePost && pos >= start + length - q))
- return false;
- else
- return true;
- }
-
- @Override
- public void next() {
- hashedGram = 0;
- if (prePost) {
- if (gramNum < q) {
- for (int i = 0; i < q - gramNum; i++) {
- hashedGram = 31 * hashedGram + PRECHAR;
- }
-
- int tmpPos = pos;
- for (int i = 0; i < gramNum; i++) {
- hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
- tmpPos += StringUtils.charSize(data, tmpPos);
- }
- } else {
- int stopStr = Math.min(charPos + q, utflen);
- int tmpPos = pos;
- for (int i = charPos; i < stopStr; i++) {
- hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
- tmpPos += StringUtils.charSize(data, tmpPos);
- }
-
- int stopPost = (charPos + q) - (utflen);
- for (int i = 0; i < stopPost; i++) {
- hashedGram = 31 * hashedGram + POSTCHAR;
- }
- pos += StringUtils.charSize(data, pos);
- charPos++;
- }
- gramNum++;
- } else {
- int tmpPos = pos;
- for (int i = charPos; i < charPos + q; i++) {
- hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
- tmpPos += StringUtils.charSize(data, tmpPos);
- }
- pos += StringUtils.charSize(data, pos);
- charPos++;
- }
- }
-
- @Override
- public void reset(byte[] data, int start, int length) {
- this.data = data;
- this.start = start;
- this.length = length;
- this.utflen = StringUtils.getUTFLen(data, start);
- this.pos = start + 2; // UTF-8 specific
- this.gramNum = 1;
- this.charPos = 0;
- }
-
- @Override
- public void writeToken(DataOutput dos) throws IOException {
- dos.writeInt(hashedGram);
- }
-
- public char getPreChar() {
- return PRECHAR;
- }
-
- public char getPostChar() {
- return POSTCHAR;
- }
-
- @Override
- public RecordDescriptor getTokenSchema() {
- return tokenSchema;
- }
-
- @Override
- public int getNumTokens() {
- return 0;
- }
-}
\ No newline at end of file
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java
deleted file mode 100644
index a11fe8a..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
-
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
-
-public class HashedQGramUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
-
- private static final long serialVersionUID = 1L;
- private final int q;
- private final boolean prePost;
-
- public HashedQGramUTF8StringBinaryTokenizerFactory(int q, boolean prePost) {
- this.q = q;
- this.prePost = prePost;
- }
-
- @Override
- public IBinaryTokenizer createBinaryTokenizer() {
- return new HashedQGramUTF8StringBinaryTokenizer(q, prePost);
- }
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java
new file mode 100644
index 0000000..25d1a2c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramToken.java
@@ -0,0 +1,64 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class HashedUTF8NGramToken extends UTF8NGramToken {
+ public HashedUTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public void serializeToken(DataOutput dos) throws IOException {
+ handleTokenTypeTag(dos);
+
+ int hash = GOLDEN_RATIO_32;
+
+ // pre chars
+ for (int i = 0; i < numPreChars; i++) {
+ hash ^= PRECHAR;
+ hash *= GOLDEN_RATIO_32;
+ }
+
+ // regular chars
+ int numRegGrams = tokenLength - numPreChars - numPostChars;
+ int pos = start;
+ for (int i = 0; i < numRegGrams; i++) {
+ hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+ hash *= GOLDEN_RATIO_32;
+ pos += StringUtils.charSize(data, pos);
+ }
+
+ // post chars
+ for (int i = 0; i < numPostChars; i++) {
+ hash ^= POSTCHAR;
+ hash *= GOLDEN_RATIO_32;
+ }
+
+ // token count
+ hash += tokenCount;
+
+ dos.writeInt(hash);
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java
new file mode 100644
index 0000000..4a87793
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class HashedUTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ public HashedUTF8NGramTokenFactory() {
+ super();
+ }
+
+ public HashedUTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public IToken createToken() {
+ return new HashedUTF8NGramToken(tokenTypeTag, countTypeTag);
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java
new file mode 100644
index 0000000..55237ce
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordToken.java
@@ -0,0 +1,88 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class HashedUTF8WordToken extends UTF8WordToken {
+
+ private int hash = 0;
+
+ public HashedUTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (o == null) {
+ return false;
+ }
+ if (!(o instanceof IToken)) {
+ return false;
+ }
+ IToken t = (IToken) o;
+ if (t.getTokenLength() != tokenLength) {
+ return false;
+ }
+ int offset = 0;
+ for (int i = 0; i < tokenLength; i++) {
+ if (StringUtils.charAt(t.getData(), t.getStart() + offset) != StringUtils
+ .charAt(data, start + offset)) {
+ return false;
+ }
+ offset += StringUtils.charSize(data, start + offset);
+ }
+ return true;
+ }
+
+ @Override
+ public int hashCode() {
+ return hash;
+ }
+
+ @Override
+ public void reset(byte[] data, int start, int length, int tokenLength,
+ int tokenCount) {
+ super.reset(data, start, length, tokenLength, tokenCount);
+
+ // pre-compute hash value using JAQL-like string hashing
+ int pos = start;
+ hash = GOLDEN_RATIO_32;
+ for (int i = 0; i < tokenLength; i++) {
+ hash ^= StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+ hash *= GOLDEN_RATIO_32;
+ pos += StringUtils.charSize(data, pos);
+ }
+ hash += tokenCount;
+ }
+
+ @Override
+ public void serializeToken(DataOutput dos) throws IOException {
+ if (tokenTypeTag > 0) {
+ dos.write(tokenTypeTag);
+ }
+
+ // serialize hash value
+ dos.writeInt(hash);
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java
new file mode 100644
index 0000000..318f041
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedUTF8WordTokenFactory.java
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class HashedUTF8WordTokenFactory extends AbstractUTF8TokenFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ public HashedUTF8WordTokenFactory() {
+ super();
+ }
+
+ public HashedUTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public IToken createToken() {
+ return new HashedUTF8WordToken(tokenTypeTag, countTypeTag);
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizer.java
new file mode 100644
index 0000000..05c6d0b
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizer.java
@@ -0,0 +1,30 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public interface IBinaryTokenizer {
+ public IToken getToken();
+
+ public boolean hasNext();
+
+ public void next();
+
+ public void reset(byte[] data, int start, int length);
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizerFactory.java
new file mode 100644
index 0000000..bfe78ee
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IBinaryTokenizerFactory.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.Serializable;
+
+public interface IBinaryTokenizerFactory extends Serializable {
+ public IBinaryTokenizer createTokenizer();
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/INGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/INGramToken.java
new file mode 100644
index 0000000..befc6d2
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/INGramToken.java
@@ -0,0 +1,28 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public interface INGramToken {
+ public int getNumPostChars();
+
+ public int getNumPreChars();
+
+ public void setNumPrePostChars(int numPreChars, int numPostChars);
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IToken.java
new file mode 100644
index 0000000..c1840d7
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IToken.java
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+public interface IToken {
+ public byte[] getData();
+
+ public int getLength();
+
+ public int getStart();
+
+ public int getTokenLength();
+
+ public void reset(byte[] data, int start, int length, int tokenLength,
+ int tokenCount);
+
+ public void serializeToken(DataOutput dos) throws IOException;
+
+ public void serializeTokenCount(DataOutput dos) throws IOException;
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/ITokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/ITokenFactory.java
new file mode 100644
index 0000000..8b5d71d
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/ITokenFactory.java
@@ -0,0 +1,26 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.Serializable;
+
+public interface ITokenFactory extends Serializable {
+ public IToken createToken();
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IntArray.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IntArray.java
new file mode 100644
index 0000000..2eb9ff4
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/IntArray.java
@@ -0,0 +1,80 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Rares Vernica <rares (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.util.Arrays;
+
+public class IntArray {
+ private static final int SIZE = 128;
+
+ private int[] data;
+ private int length;
+
+ public IntArray() {
+ data = new int[SIZE];
+ length = 0;
+ }
+
+ public void add(int d) {
+ if (length == data.length) {
+ data = Arrays.copyOf(data, data.length << 1);
+ }
+ data[length++] = d;
+ }
+
+ public int[] get() {
+ return data;
+ }
+
+ public int get(int i) {
+ return data[i];
+ }
+
+ public int length() {
+ return length;
+ }
+
+ public void reset() {
+ length = 0;
+ }
+
+ public void sort() {
+ sort(0, length);
+ }
+
+ public void sort(int start, int end) {
+ Arrays.sort(data, start, end);
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder out = new StringBuilder();
+ out.append('[');
+ for (int i = 0; i < length; ++i) {
+ out.append(data[i]);
+ if (i < length - 1) {
+ out.append(',');
+ out.append(' ');
+ }
+ }
+ out.append(']');
+ return out.toString();
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..2a13f83
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
@@ -0,0 +1,123 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class NGramUTF8StringBinaryTokenizer extends
+ AbstractUTF8StringBinaryTokenizer {
+
+ private int gramLength;
+ private boolean usePrePost;
+
+ private int gramNum;
+ private int totalGrams;
+
+ private final INGramToken concreteToken;
+
+ public NGramUTF8StringBinaryTokenizer(int gramLength, boolean usePrePost,
+ boolean ignoreTokenCount, boolean sourceHasTypeTag,
+ ITokenFactory tokenFactory) {
+ super(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+ this.gramLength = gramLength;
+ this.usePrePost = usePrePost;
+ concreteToken = (INGramToken) token;
+ }
+
+ @Override
+ public boolean hasNext() {
+ if (gramNum < totalGrams) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ @Override
+ public void next() {
+ int currentTokenStart = index;
+ int tokenCount = 1;
+ int numPreChars = 0;
+ int numPostChars = 0;
+ if (usePrePost) {
+ numPreChars = Math.max(gramLength - gramNum - 1, 0);
+ numPostChars = (gramNum > totalGrams - gramLength) ? gramLength
+ - totalGrams + gramNum : 0;
+ }
+ gramNum++;
+
+ concreteToken.setNumPrePostChars(numPreChars, numPostChars);
+ if (numPreChars == 0) {
+ index += StringUtils.charSize(data, index);
+ }
+
+ // compute token count
+ // ignore pre and post grams for duplicate detection
+ if (!ignoreTokenCount && numPreChars == 0 && numPostChars == 0) {
+ int tmpIndex = start;
+ while (tmpIndex < currentTokenStart) {
+ tokenCount++; // assume found
+ int offset = 0;
+ for (int j = 0; j < gramLength; j++) {
+ if (StringUtils.toLowerCase(StringUtils.charAt(data,
+ currentTokenStart + offset)) != StringUtils
+ .toLowerCase(StringUtils.charAt(data, tmpIndex
+ + offset))) {
+ tokenCount--;
+ break;
+ }
+ offset += StringUtils.charSize(data, tmpIndex + offset);
+ }
+ tmpIndex += StringUtils.charSize(data, tmpIndex);
+ }
+ }
+
+ // set token
+ token.reset(data, currentTokenStart, length, gramLength, tokenCount);
+ }
+
+ @Override
+ public void reset(byte[] data, int start, int length) {
+ super.reset(data, start, length);
+ gramNum = 0;
+
+ int numChars = 0;
+ int pos = index;
+ int end = pos + utf8Length;
+ while (pos < end) {
+ numChars++;
+ pos += StringUtils.charSize(data, pos);
+ }
+
+ if (usePrePost) {
+ totalGrams = numChars + gramLength - 1;
+ } else {
+ totalGrams = numChars - gramLength + 1;
+ }
+ }
+
+ public void setGramlength(int gramLength) {
+ this.gramLength = gramLength;
+ }
+
+ public void setPrePost(boolean usePrePost) {
+ this.usePrePost = usePrePost;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java
new file mode 100644
index 0000000..6b6406f
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramToken.java
@@ -0,0 +1,86 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class UTF8NGramToken extends AbstractUTF8Token implements INGramToken {
+
+ public final static char PRECHAR = '#';
+
+ public final static char POSTCHAR = '$';
+
+ protected int numPreChars;
+ protected int numPostChars;
+
+ public UTF8NGramToken(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public int getNumPostChars() {
+ return numPreChars;
+ }
+
+ @Override
+ public int getNumPreChars() {
+ return numPostChars;
+ }
+
+ @Override
+ public void serializeToken(DataOutput dos) throws IOException {
+ handleTokenTypeTag(dos);
+
+ // regular chars
+ int numRegChars = tokenLength - numPreChars - numPostChars;
+
+ // assuming pre and post char need 1-byte each in utf8
+ int tokenUTF8Len = getLowerCaseUTF8Len(numRegChars) + numPreChars
+ + numPostChars;
+
+ // write utf8 length indicator
+ StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+
+ // pre chars
+ for (int i = 0; i < numPreChars; i++) {
+ StringUtils.writeCharAsModifiedUTF8(PRECHAR, dos);
+ }
+
+ int pos = start;
+ for (int i = 0; i < numRegChars; i++) {
+ char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+ StringUtils.writeCharAsModifiedUTF8(c, dos);
+ pos += StringUtils.charSize(data, pos);
+ }
+
+ // post chars
+ for (int i = 0; i < numPostChars; i++) {
+ StringUtils.writeCharAsModifiedUTF8(POSTCHAR, dos);
+ }
+ }
+
+ public void setNumPrePostChars(int numPreChars, int numPostChars) {
+ this.numPreChars = numPreChars;
+ this.numPostChars = numPostChars;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramTokenFactory.java
new file mode 100644
index 0000000..968d8e1
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8NGramTokenFactory.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class UTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ public UTF8NGramTokenFactory() {
+ super();
+ }
+
+ public UTF8NGramTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public IToken createToken() {
+ return new UTF8NGramToken(tokenTypeTag, countTypeTag);
+ }
+
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java
new file mode 100644
index 0000000..25e0cd3
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordToken.java
@@ -0,0 +1,46 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+
+public class UTF8WordToken extends AbstractUTF8Token {
+
+ public UTF8WordToken(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public void serializeToken(DataOutput dos) throws IOException {
+ handleTokenTypeTag(dos);
+
+ int tokenUTF8Len = getLowerCaseUTF8Len(tokenLength);
+ StringUtils.writeUTF8Len(tokenUTF8Len, dos);
+ int pos = start;
+ for (int i = 0; i < tokenLength; i++) {
+ char c = StringUtils.toLowerCase(StringUtils.charAt(data, pos));
+ StringUtils.writeCharAsModifiedUTF8(c, dos);
+ pos += StringUtils.charSize(data, pos);
+ }
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordTokenFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordTokenFactory.java
new file mode 100644
index 0000000..4358254
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/UTF8WordTokenFactory.java
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2010-2011 The Regents of the University of California
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on
+ * an "AS IS"; BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under
+ * the License.
+ *
+ * Author: Alexander Behm <abehm (at) ics.uci.edu>
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+public class UTF8WordTokenFactory extends AbstractUTF8TokenFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ public UTF8WordTokenFactory() {
+ super();
+ }
+
+ public UTF8WordTokenFactory(byte tokenTypeTag, byte countTypeTag) {
+ super(tokenTypeTag, countTypeTag);
+ }
+
+ @Override
+ public IToken createToken() {
+ return new UTF8WordToken(tokenTypeTag, countTypeTag);
+ }
+
+}