Added a simple inverted index based on a B-tree; conjunctive (AND) queries are supported.
git-svn-id: https://hyracks.googlecode.com/svn/trunk/hyracks@205 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-storage-am-invertedindex/pom.xml b/hyracks-storage-am-invertedindex/pom.xml
new file mode 100644
index 0000000..5df3bb5
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/pom.xml
@@ -0,0 +1,70 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-storage-am-invertedindex</artifactId>
+ <version>0.1.3-SNAPSHOT</version>
+
+ <parent>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks</artifactId>
+ <version>0.1.3-SNAPSHOT</version>
+ </parent>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>2.0.2</version>
+ <configuration>
+ <source>1.6</source>
+ <target>1.6</target>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+ <dependencies>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-storage-common</artifactId>
+ <version>0.1.3-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-dataflow-common</artifactId>
+ <version>0.1.3-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-dataflow-std</artifactId>
+ <version>0.1.3-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-control-nc</artifactId>
+ <version>0.1.3-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-storage-am-btree</artifactId>
+ <version>0.1.3-SNAPSHOT</version>
+ <type>jar</type>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.8.1</version>
+ <type>jar</type>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+</project>
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java
new file mode 100644
index 0000000..b01025b
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizer.java
@@ -0,0 +1,20 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+
/**
 * Iterates over the tokens of a field stored in serialized (binary) form,
 * e.g., to generate the entries of an inverted index.
 *
 * Protocol: reset() positions the tokenizer on a field's bytes, then tokens
 * are consumed with hasNext()/next(), and the current token is serialized
 * with writeToken().
 */
public interface IBinaryTokenizer {

    // Positions the tokenizer on the bytes [start, start + length) of data.
    public void reset(byte[] data, int start, int length);
    // True if next() can produce at least one more token.
    public boolean hasNext();
    // Advances to the next token (call only after hasNext() returned true).
    public void next();

    // Byte offset of the current token within the array passed to reset();
    // implementations producing derived tokens may return -1 (see
    // HashedQGramUTF8StringBinaryTokenizer).
    public int getTokenStartOff();
    // Byte length of the current token; may be -1, see getTokenStartOff().
    public int getTokenLength();

    // Serializes the current token to dos in the format described by
    // getTokenSchema().
    public void writeToken(DataOutput dos) throws IOException;

    // Single-field record descriptor describing the serialized token format.
    public RecordDescriptor getTokenSchema();
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
new file mode 100644
index 0000000..e1072ae
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IBinaryTokenizerFactory.java
@@ -0,0 +1,7 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+
+import java.io.Serializable;
+
/**
 * Serializable factory for IBinaryTokenizer instances, so a tokenizer
 * configuration can be shipped inside a (serialized) operator descriptor and
 * instantiated on each node.
 */
public interface IBinaryTokenizerFactory extends Serializable {
    // Creates a fresh tokenizer instance.
    public IBinaryTokenizer createBinaryTokenizer();
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java
new file mode 100644
index 0000000..378c925a
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java
@@ -0,0 +1,10 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+
+import java.nio.ByteBuffer;
+
/**
 * Cursor over the result of an inverted-index search. Results are exposed as
 * a sequence of frames (ByteBuffers), each holding serialized result tuples.
 */
public interface IInvertedIndexResultCursor {
    // True if another result buffer can be reached via next().
    public boolean hasNext();
    // Advances the cursor to the next result buffer.
    public void next();
    // Returns the result buffer at the current cursor position.
    public ByteBuffer getBuffer();
    // Rewinds the cursor to before the first buffer.
    public void reset();
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java
new file mode 100644
index 0000000..fc2dfd1
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java
@@ -0,0 +1,9 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+
/**
 * Executes searches against an inverted index; results of the most recent
 * search are retrieved through the result cursor.
 */
public interface IInvertedIndexSearcher {
    // Runs a search for the query contained in field queryFieldIndex of queryTuple.
    public void search(ITupleReference queryTuple, int queryFieldIndex) throws HyracksDataException;
    // Cursor over the result buffers produced by the last search() call.
    public IInvertedIndexResultCursor getResultCursor();
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
new file mode 100644
index 0000000..0d5dbc4
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
@@ -0,0 +1,42 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;
+
+import edu.uci.ics.hyracks.api.context.IHyracksContext;
+import edu.uci.ics.hyracks.api.dataflow.IOperatorNodePushable;
+import edu.uci.ics.hyracks.api.dataflow.value.IRecordDescriptorProvider;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.api.job.IOperatorEnvironment;
+import edu.uci.ics.hyracks.api.job.JobSpecification;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractSingleActivityOperatorDescriptor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
+
+public class BinaryTokenizerOperatorDescriptor extends AbstractSingleActivityOperatorDescriptor {
+
+ private static final long serialVersionUID = 1L;
+
+ private final IBinaryTokenizerFactory tokenizerFactory;
+ // fields that will be tokenized
+ private final int[] tokenFields;
+ // operator will emit these projected fields for each token, e.g., as payload for an inverted list
+ // WARNING: too many projected fields can cause significant data blowup
+ private final int[] projFields;
+
+ public BinaryTokenizerOperatorDescriptor(JobSpecification spec, RecordDescriptor recDesc, IBinaryTokenizerFactory tokenizerFactory, int[] tokenFields, int[] projFields) {
+ super(spec, 1, 1);
+ this.tokenizerFactory = tokenizerFactory;
+ this.tokenFields = tokenFields;
+ this.projFields = projFields;
+ recordDescriptors[0] = recDesc;
+ }
+
+ @Override
+ public IOperatorNodePushable createPushRuntime(IHyracksContext ctx,
+ IOperatorEnvironment env,
+ IRecordDescriptorProvider recordDescProvider, int partition,
+ int nPartitions) throws HyracksDataException {
+ return new BinaryTokenizerOperatorNodePushable(ctx,
+ recordDescProvider.getInputRecordDescriptor(odId, 0),
+ recordDescriptors[0], tokenizerFactory.createBinaryTokenizer(),
+ tokenFields, projFields);
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
new file mode 100644
index 0000000..869b0be
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
@@ -0,0 +1,103 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.dataflow;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import edu.uci.ics.hyracks.api.context.IHyracksContext;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.comm.util.FrameUtils;
+import edu.uci.ics.hyracks.dataflow.std.base.AbstractUnaryInputUnaryOutputOperatorNodePushable;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
+
+public class BinaryTokenizerOperatorNodePushable extends AbstractUnaryInputUnaryOutputOperatorNodePushable {
+
+ private final IHyracksContext ctx;
+ private final IBinaryTokenizer tokenizer;
+ private final int[] tokenFields;
+ private final int[] projFields;
+ private final RecordDescriptor inputRecDesc;
+ private final RecordDescriptor outputRecDesc;
+
+ private FrameTupleAccessor accessor;
+ private ArrayTupleBuilder builder;
+ private DataOutput builderDos;
+ private FrameTupleAppender appender;
+ private ByteBuffer writeBuffer;
+
+ public BinaryTokenizerOperatorNodePushable(IHyracksContext ctx, RecordDescriptor inputRecDesc, RecordDescriptor outputRecDesc, IBinaryTokenizer tokenizer, int[] tokenFields, int[] projFields) {
+ this.ctx = ctx;
+ this.tokenizer = tokenizer;
+ this.tokenFields = tokenFields;
+ this.projFields = projFields;
+ this.inputRecDesc = inputRecDesc;
+ this.outputRecDesc = outputRecDesc;
+ }
+
+ @Override
+ public void open() throws HyracksDataException {
+ accessor = new FrameTupleAccessor(ctx, inputRecDesc);
+ writeBuffer = ctx.getResourceManager().allocateFrame();
+ builder = new ArrayTupleBuilder(outputRecDesc.getFields().length);
+ builderDos = builder.getDataOutput();
+ appender = new FrameTupleAppender(ctx);
+ appender.reset(writeBuffer, true);
+ }
+
+ @Override
+ public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
+ accessor.reset(buffer);
+
+ int tupleCount = accessor.getTupleCount();
+ for (int i = 0; i < tupleCount; i++) {
+
+ for(int j = 0; j < tokenFields.length; j++) {
+
+ tokenizer.reset(accessor.getBuffer().array(),
+ accessor.getTupleStartOffset(i) + accessor.getFieldSlotsLength() + accessor.getFieldStartOffset(i, tokenFields[j]),
+ accessor.getFieldLength(i, tokenFields[j]));
+
+ while(tokenizer.hasNext()) {
+ tokenizer.next();
+
+ builder.reset();
+ try {
+ tokenizer.writeToken(builderDos);
+ builder.addFieldEndOffset();
+ } catch (IOException e) {
+ throw new HyracksDataException(e.getMessage());
+ }
+
+ for(int k = 0; k < projFields.length; k++) {
+ builder.addField(accessor, i, projFields[k]);
+ }
+
+ if (!appender.append(builder.getFieldEndOffsets(), builder.getByteArray(), 0, builder.getSize())) {
+ FrameUtils.flushFrame(writeBuffer, writer);
+ appender.reset(writeBuffer, true);
+ if (!appender.append(builder.getFieldEndOffsets(), builder.getByteArray(), 0, builder.getSize())) {
+ throw new IllegalStateException();
+ }
+ }
+ }
+ }
+ }
+
+ if (appender.getTupleCount() > 0) {
+ FrameUtils.flushFrame(writeBuffer, writer);
+ }
+ }
+
+ @Override
+ public void close() throws HyracksDataException {
+ writer.close();
+ }
+
+ @Override
+ public void flush() throws HyracksDataException {
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/ListResultCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/ListResultCursor.java
new file mode 100644
index 0000000..44bf9f5
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/ListResultCursor.java
@@ -0,0 +1,40 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
+
+public class ListResultCursor implements IInvertedIndexResultCursor {
+
+ private List<ByteBuffer> resultBuffers;
+ private int numResultBuffers;
+ private int currentPos = -1;
+
+ public void setResults(List<ByteBuffer> resultBuffers, int numResultBuffers) {
+ this.resultBuffers = resultBuffers;
+ this.numResultBuffers = numResultBuffers;
+ reset();
+ }
+
+ @Override
+ public boolean hasNext() {
+ if(currentPos < numResultBuffers) return true;
+ else return false;
+ }
+
+ @Override
+ public void next() {
+ currentPos++;
+ }
+
+ @Override
+ public ByteBuffer getBuffer() {
+ return resultBuffers.get(currentPos);
+ }
+
+ @Override
+ public void reset() {
+ currentPos = -1;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java
new file mode 100644
index 0000000..ec45deb
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java
@@ -0,0 +1,265 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.context.IHyracksContext;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeCursor;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrame;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrame;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
+import edu.uci.ics.hyracks.storage.am.btree.impls.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.btree.impls.RangePredicate;
+import edu.uci.ics.hyracks.storage.am.btree.impls.RangeSearchCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearcher;
+
/**
 * Answers conjunctive (AND) keyword queries against an inverted index stored
 * in a BTree whose tuples are [key (token) fields..., value (list item)
 * fields...]: the query is tokenized, each token's inverted list is fetched
 * through a BTree point search on the key fields, and the lists are
 * intersected pairwise on the value fields. Results are buffered in frames
 * and exposed through a ListResultCursor.
 */
public class SimpleConjunctiveSearcher implements IInvertedIndexSearcher {

    // number of leading BTree fields forming the token (search key)
    private final int numKeyFields;
    // number of trailing BTree fields forming the inverted-list entry
    private final int numValueFields;

    // comparators split out of the BTree's MultiComparator
    private final IBinaryComparator[] keyCmps;
    private final IBinaryComparator[] valueCmps;

    private final BTree btree;
    private final IHyracksContext ctx;
    private final ArrayTupleBuilder resultTupleBuilder;
    private final FrameTupleAppender resultTupleAppender;
    private final FrameTupleAccessor resultFrameAccessor;

    // double-buffered intermediate results; swapped before each intersection
    private List<ByteBuffer> newResultBuffers = new ArrayList<ByteBuffer>();
    private List<ByteBuffer> prevResultBuffers = new ArrayList<ByteBuffer>();
    private List<ByteBuffer> swap = null;
    private final ListResultCursor resultCursor = new ListResultCursor();
    // index of the last newResultBuffers frame holding valid results
    private int maxResultBufIdx = 0;

    private final IBTreeLeafFrame leafFrame;
    private final IBTreeInteriorFrame interiorFrame;
    private final IBTreeCursor btreeCursor;
    private final FrameTupleReference searchKey = new FrameTupleReference();
    // point query: searchKey is used as both low and high key
    private final RangePredicate pred = new RangePredicate(true, null, null, null);

    private final IBinaryTokenizer queryTokenizer;

    /**
     * @param ctx            context used for frame allocation
     * @param btree          BTree holding the inverted index
     * @param btreeRecDesc   record descriptor of the BTree tuples
     * @param queryTokenizer tokenizer applied to the query string; must
     *                       produce tokens in the same format as indexed ones
     * @param numKeyFields   number of token (key) fields
     * @param numValueFields number of inverted-list (value) fields
     */
    public SimpleConjunctiveSearcher(IHyracksContext ctx, BTree btree, RecordDescriptor btreeRecDesc, IBinaryTokenizer queryTokenizer, int numKeyFields, int numValueFields) {
        this.ctx = ctx;
        this.btree = btree;
        this.queryTokenizer = queryTokenizer;
        this.numKeyFields = numKeyFields;
        this.numValueFields = numValueFields;

        leafFrame = btree.getLeafFrameFactory().getFrame();
        interiorFrame = btree.getInteriorFrameFactory().getFrame();
        btreeCursor = new RangeSearchCursor(leafFrame);
        resultTupleAppender = new FrameTupleAppender(ctx);
        resultTupleBuilder = new ArrayTupleBuilder(numValueFields);
        newResultBuffers.add(ctx.getResourceManager().allocateFrame());
        prevResultBuffers.add(ctx.getResourceManager().allocateFrame());

        MultiComparator btreeCmp = btree.getMultiComparator();

        keyCmps = new IBinaryComparator[numKeyFields];
        for(int i = 0; i < numKeyFields; i++) {
            keyCmps[i] = btreeCmp.getComparators()[i];
        }

        valueCmps = new IBinaryComparator[numValueFields];
        for(int i = 0; i < numValueFields; i++) {
            valueCmps[i] = btreeCmp.getComparators()[numKeyFields + i];
        }

        // the BTree is probed on the key (token) fields only
        MultiComparator searchCmp = new MultiComparator(btreeCmp.getTypeTraits(), keyCmps);
        pred.setComparator(searchCmp);
        pred.setLowKey(searchKey);
        pred.setHighKey(searchKey);

        // result tuples consist of the value fields only
        ISerializerDeserializer[] valueSerde = new ISerializerDeserializer[numValueFields];
        for(int i = 0; i < numValueFields; i++) {
            valueSerde[i] = btreeRecDesc.getFields()[numKeyFields + i];

        }
        RecordDescriptor valueRecDesc = new RecordDescriptor(valueSerde);
        resultFrameAccessor = new FrameTupleAccessor(ctx, valueRecDesc);
    }

    /**
     * Tokenizes the query field, then intersects the tokens' inverted lists;
     * afterwards the result is available through getResultCursor().
     *
     * NOTE(review): a query producing zero tokens leaves the result buffers
     * holding the previous search's contents — confirm callers guard against
     * empty queries.
     */
    public void search(ITupleReference queryTuple, int queryFieldIndex) throws HyracksDataException {

        // parse query, TODO: this parsing is too simple
        RecordDescriptor queryTokenRecDesc = new RecordDescriptor(new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE } );

        ArrayTupleBuilder queryTokenBuilder = new ArrayTupleBuilder(queryTokenRecDesc.getFields().length);
        DataOutput queryTokenDos = queryTokenBuilder.getDataOutput();
        FrameTupleAppender queryTokenAppender = new FrameTupleAppender(ctx);
        ByteBuffer queryTokenFrame = ctx.getResourceManager().allocateFrame();
        queryTokenAppender.reset(queryTokenFrame, true);

        // materialize the query tokens as one-field tuples in a frame
        queryTokenizer.reset(queryTuple.getFieldData(queryFieldIndex), queryTuple.getFieldStart(queryFieldIndex), queryTuple.getFieldLength(queryFieldIndex));
        while(queryTokenizer.hasNext()) {
            queryTokenizer.next();

            queryTokenBuilder.reset();
            try {
                queryTokenizer.writeToken(queryTokenDos);
                queryTokenBuilder.addFieldEndOffset();
            } catch (IOException e) {
                throw new HyracksDataException(e);
            }

            // WARNING: assuming one frame is enough to hold all tokens
            queryTokenAppender.append(queryTokenBuilder.getFieldEndOffsets(), queryTokenBuilder.getByteArray(), 0, queryTokenBuilder.getSize());
        }

        FrameTupleAccessor queryTokenAccessor = new FrameTupleAccessor(ctx, queryTokenRecDesc);
        queryTokenAccessor.reset(queryTokenFrame);
        int numQueryTokens = queryTokenAccessor.getTupleCount();

        maxResultBufIdx = 0;

        resultTupleAppender.reset(newResultBuffers.get(0), true);
        try {
            // append first inverted list to temporary results
            searchKey.reset(queryTokenAccessor, 0);
            btree.search(btreeCursor, pred, leafFrame, interiorFrame);
            while(btreeCursor.hasNext()) {
                btreeCursor.next();
                maxResultBufIdx = appendTupleToNewResults(btreeCursor, maxResultBufIdx);
            }
            btreeCursor.reset();
        } catch (Exception e) {
            throw new HyracksDataException(e);
        }

        resultFrameAccessor.reset(newResultBuffers.get(0));

        // intersect temporary results with remaining inverted lists
        for(int i = 1; i < numQueryTokens; i++) {
            // previous round's output becomes this round's input
            swap = prevResultBuffers;
            prevResultBuffers = newResultBuffers;
            newResultBuffers = swap;
            try {
                searchKey.reset(queryTokenAccessor, i);
                btree.search(btreeCursor, pred, leafFrame, interiorFrame);
                maxResultBufIdx = intersectList(btreeCursor, prevResultBuffers, maxResultBufIdx, newResultBuffers);
            } catch (Exception e) {
                throw new HyracksDataException(e);
            }
            btreeCursor.reset();
        }
    }

    /**
     * Copies the value fields of the cursor's current tuple into the
     * newResultBuffers frame at newBufIdx, allocating/advancing to a fresh
     * frame when the current one is full.
     *
     * @return index of the frame the tuple was written to
     */
    private int appendTupleToNewResults(IBTreeCursor btreeCursor, int newBufIdx) throws IOException {
        ByteBuffer newCurrentBuffer = newResultBuffers.get(newBufIdx);

        ITupleReference tuple = btreeCursor.getTuple();
        resultTupleBuilder.reset();
        DataOutput dos = resultTupleBuilder.getDataOutput();
        for(int i = 0; i < numValueFields; i++) {
            int fIdx = numKeyFields + i;
            dos.write(tuple.getFieldData(fIdx), tuple.getFieldStart(fIdx), tuple.getFieldLength(fIdx));
            resultTupleBuilder.addFieldEndOffset();
        }

        if (!resultTupleAppender.append(resultTupleBuilder.getFieldEndOffsets(), resultTupleBuilder.getByteArray(), 0, resultTupleBuilder.getSize())) {
            // current frame is full; grow the buffer list on demand
            newBufIdx++;
            if(newBufIdx >= newResultBuffers.size()) {
                newResultBuffers.add(ctx.getResourceManager().allocateFrame());
            }
            newCurrentBuffer = newResultBuffers.get(newBufIdx);
            resultTupleAppender.reset(newCurrentBuffer, true);
            if (!resultTupleAppender.append(resultTupleBuilder.getFieldEndOffsets(), resultTupleBuilder.getByteArray(), 0, resultTupleBuilder.getSize())) {
                // a single tuple must fit into an empty frame
                throw new IllegalStateException();
            }
        }

        return newBufIdx;
    }

    /**
     * Merge-intersects the inverted list under the BTree cursor with the
     * previous intermediate result (both assumed sorted on the value fields),
     * writing matches to newResultBuffers.
     *
     * @return index of the last newResultBuffers frame written
     */
    private int intersectList(IBTreeCursor btreeCursor, List<ByteBuffer> prevResultBuffers, int maxPrevBufIdx, List<ByteBuffer> newResultBuffers) throws IOException, Exception {

        int newBufIdx = 0;
        ByteBuffer newCurrentBuffer = newResultBuffers.get(0);

        int prevBufIdx = 0;
        ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);

        resultTupleBuilder.reset();
        resultTupleAppender.reset(newCurrentBuffer, true);
        resultFrameAccessor.reset(prevCurrentBuffer);

        // WARNING: not very efficient but good enough for the first cut
        boolean advanceCursor = true;
        boolean advancePrevResult = false;
        int resultTidx = 0;

        // classic sort-merge: advance whichever side holds the smaller value
        while( (!advanceCursor || btreeCursor.hasNext()) && prevBufIdx <= maxPrevBufIdx && resultTidx < resultFrameAccessor.getTupleCount()) {

            if(advanceCursor) btreeCursor.next();
            ITupleReference tuple = btreeCursor.getTuple();

            // compare the value fields of the BTree tuple against the current
            // tuple of the previous intermediate result
            int cmp = 0;
            for(int i = 0; i < valueCmps.length; i++) {
                int tupleFidx = numKeyFields + i;
                cmp = valueCmps[i].compare(tuple.getFieldData(tupleFidx),
                    tuple.getFieldStart(tupleFidx),
                    tuple.getFieldLength(tupleFidx),
                    resultFrameAccessor.getBuffer().array(),
                    resultFrameAccessor.getTupleStartOffset(resultTidx) + resultFrameAccessor.getFieldSlotsLength() + resultFrameAccessor.getFieldStartOffset(resultTidx, i),
                    resultFrameAccessor.getFieldLength(resultTidx, i));
                if(cmp != 0) break;
            }

            // match found
            if(cmp == 0) {
                newBufIdx = appendTupleToNewResults(btreeCursor, newBufIdx);

                advanceCursor = true;
                advancePrevResult = true;
            }
            else {
                if(cmp < 0) {
                    advanceCursor = true;
                    advancePrevResult = false;
                }
                else {
                    advanceCursor = false;
                    advancePrevResult = true;
                }
            }

            if(advancePrevResult) {
                resultTidx++;
                // previous-result frame exhausted: move to its next frame
                if(resultTidx >= resultFrameAccessor.getTupleCount()) {
                    prevBufIdx++;
                    if(prevBufIdx <= maxPrevBufIdx) {
                        prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
                        resultFrameAccessor.reset(prevCurrentBuffer);
                        resultTidx = 0;
                    }
                }
            }
        }

        return newBufIdx;
    }

    /**
     * @return cursor over the frames produced by the most recent search()
     */
    @Override
    public IInvertedIndexResultCursor getResultCursor() {
        resultCursor.setResults(newResultBuffers, maxResultBufIdx + 1);
        return resultCursor;
    }
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..9a47280
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
@@ -0,0 +1,83 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
+
+public class DelimitedUTF8StringBinaryTokenizer implements IBinaryTokenizer {
+
+ private static final RecordDescriptor tokenSchema =
+ new RecordDescriptor(new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE } );
+
+ private final char delimiter;
+ private byte[] data;
+ private int start;
+ private int length;
+
+ private int tokenLength;
+ private int tokenStart;
+ private int pos;
+
+ public DelimitedUTF8StringBinaryTokenizer(char delimiter) {
+ this.delimiter = delimiter;
+ }
+
+ @Override
+ public int getTokenLength() {
+ return tokenLength;
+ }
+
+ @Override
+ public int getTokenStartOff() {
+ return tokenStart;
+ }
+
+ @Override
+ public boolean hasNext() {
+ if(pos >= start + length) return false;
+ else return true;
+ }
+
+ @Override
+ public void next() {
+ tokenLength = 0;
+ tokenStart = pos;
+ while(pos < start + length) {
+ int len = StringUtils.charSize(data, pos);
+ char ch = StringUtils.charAt(data, pos);
+ pos += len;
+ if(ch == delimiter) {
+ break;
+ }
+ tokenLength += len;
+ }
+ }
+
+ @Override
+ public void reset(byte[] data, int start, int length) {
+ this.data = data;
+ this.start = start;
+ this.pos = start;
+ this.length = length;
+ this.tokenLength = 0;
+ this.tokenStart = 0;
+ pos += 2; // UTF-8 specific
+ }
+
+ @Override
+ public void writeToken(DataOutput dos) throws IOException {
+ // WARNING: 2-byte length indicator is specific to UTF-8
+ dos.writeShort((short)tokenLength);
+ dos.write(data, tokenStart, tokenLength);
+ }
+
+ @Override
+ public RecordDescriptor getTokenSchema() {
+ return tokenSchema;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
new file mode 100644
index 0000000..6432c4a
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
@@ -0,0 +1,19 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
+
/**
 * Serializable factory for DelimitedUTF8StringBinaryTokenizer; captures the
 * delimiter character so tokenizers can be created node-side from a
 * serialized job specification.
 */
public class DelimitedUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {

    private static final long serialVersionUID = 1L;
    // character separating consecutive tokens
    private final char delimiter;

    public DelimitedUTF8StringBinaryTokenizerFactory(char delimiter) {
        this.delimiter = delimiter;
    }

    @Override
    public IBinaryTokenizer createBinaryTokenizer() {
        return new DelimitedUTF8StringBinaryTokenizer(delimiter);
    }
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java
new file mode 100644
index 0000000..68fd502
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizer.java
@@ -0,0 +1,130 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.util.StringUtils;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
+
/**
 * Tokenizes a serialized UTF-8 string field into hashed q-grams: each q-gram
 * (window of q consecutive characters) is reduced to an int via a 31-based
 * rolling hash, and that int is the emitted token. When prePost is true the
 * string is treated as if padded with leading PRECHARs and trailing
 * POSTCHARs, so boundary characters also participate in q grams.
 */
public class HashedQGramUTF8StringBinaryTokenizer implements IBinaryTokenizer {

    // a token is a single int (the q-gram's hash)
    private static final RecordDescriptor tokenSchema =
        new RecordDescriptor(new ISerializerDeserializer[] { IntegerSerializerDeserializer.INSTANCE } );

    // whether to emit the PRECHAR/POSTCHAR-padded boundary grams
    private final boolean prePost;
    // gram size, counted in characters
    private final int q;
    private byte[] data;
    private int start;
    private int length;
    // 1-based index of the gram produced next (used only when prePost == true)
    private int gramNum;
    // string length read from the field's UTF length header
    // NOTE(review): utflen comes from getUTFLen and is compared against
    // character positions (charPos) below — confirm whether it is a byte or
    // character count; they differ for multi-byte characters.
    private int utflen;

    // synthetic padding characters for boundary grams
    private final char PRECHAR = '#';
    private final char POSTCHAR = '$';

    // current character index and byte offset of the gram window's start
    private int charPos;
    private int pos;
    // hash of the gram most recently produced by next()
    private int hashedGram;

    // package-private: instances are created via the factory
    HashedQGramUTF8StringBinaryTokenizer(int q, boolean prePost) {
        this.prePost = prePost;
        this.q = q;
    }

    @Override
    public int getTokenLength() {
        // the produced token (hashed q-gram) is derived from data
        // but not contained in it
        // therefore this call does not make sense
        return -1;
    }

    @Override
    public int getTokenStartOff() {
        // the produced token (hashed q-gram) is derived from data
        // but not contained in it
        // therefore this call does not make sense
        return -1;
    }

    @Override
    public boolean hasNext() {
        // NOTE(review): 'length' is a byte count while q counts characters,
        // so the 'length - q' bound is exact only for single-byte characters
        // — confirm for multi-byte input.
        if((prePost && pos >= start + length) || (!prePost && pos >= start + length - q)) return false;
        else return true;
    }

    @Override
    public void next() {
        hashedGram = 0;
        if(prePost) {
            if(gramNum < q) {
                // gram still overlaps the left boundary: pad with PRECHARs,
                // then hash the first gramNum real characters
                for(int i = 0; i < q - gramNum; i++) {
                    hashedGram = 31 * hashedGram + PRECHAR;
                }

                int tmpPos = pos;
                for(int i = 0; i < gramNum; i++) {
                    hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
                    tmpPos += StringUtils.charSize(data, tmpPos);
                }
            }
            else {
                // interior gram; may run past the end, then pad with POSTCHARs
                int stopStr = Math.min(charPos + q, utflen);
                int tmpPos = pos;
                for(int i = charPos; i < stopStr; i++) {
                    hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
                    tmpPos += StringUtils.charSize(data, tmpPos);
                }

                int stopPost = (charPos + q) - (utflen);
                for(int i = 0; i < stopPost; i++) {
                    hashedGram = 31 * hashedGram + POSTCHAR;
                }
                // slide the window forward by one character
                pos += StringUtils.charSize(data, pos);
                charPos++;
            }
            gramNum++;
        }
        else {
            // no padding: hash exactly q characters starting at the window
            int tmpPos = pos;
            for(int i = charPos; i < charPos + q; i++) {
                hashedGram = 31 * hashedGram + StringUtils.charAt(data, tmpPos);
                tmpPos += StringUtils.charSize(data, tmpPos);
            }
            // slide the window forward by one character
            pos += StringUtils.charSize(data, pos);
            charPos++;
        }
    }

    @Override
    public void reset(byte[] data, int start, int length) {
        this.data = data;
        this.start = start;
        this.length = length;
        this.utflen = StringUtils.getUTFLen(data, start);
        this.pos = start + 2; // UTF-8 specific: skip the 2-byte length header
        this.gramNum = 1;
        this.charPos = 0;
    }

    @Override
    public void writeToken(DataOutput dos) throws IOException {
        // the token is just the gram's 4-byte hash
        dos.writeInt(hashedGram);
    }

    public char getPreChar() {
        return PRECHAR;
    }

    public char getPostChar() {
        return POSTCHAR;
    }

    @Override
    public RecordDescriptor getTokenSchema() {
        return tokenSchema;
    }
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java
new file mode 100644
index 0000000..98fd9ad
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/HashedQGramUTF8StringBinaryTokenizerFactory.java
@@ -0,0 +1,21 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizerFactory;
+
+/**
+ * Serializable factory that creates {@link HashedQGramUTF8StringBinaryTokenizer}
+ * instances configured with a fixed gram length and pre/post-fixing mode.
+ */
+public class HashedQGramUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ // gram length and whether strings are padded with pre/post characters
+ private final int gramLength;
+ private final boolean usePrePost;
+
+ public HashedQGramUTF8StringBinaryTokenizerFactory(int q, boolean prePost) {
+ this.gramLength = q;
+ this.usePrePost = prePost;
+ }
+
+ @Override
+ public IBinaryTokenizer createBinaryTokenizer() {
+ return new HashedQGramUTF8StringBinaryTokenizer(gramLength, usePrePost);
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java b/hyracks-storage-am-invertedindex/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java
new file mode 100644
index 0000000..3f00e4c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchers/SimpleConjunctiveSearcherTest.java
@@ -0,0 +1,262 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.searchers;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.File;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+import org.junit.Test;
+
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.context.IHyracksContext;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.dataflow.value.TypeTrait;
+import edu.uci.ics.hyracks.control.nc.runtime.RootHyracksContext;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.comparators.IntegerBinaryComparatorFactory;
+import edu.uci.ics.hyracks.dataflow.common.data.comparators.UTF8StringBinaryComparatorFactory;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrame;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrame;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeMetaDataFrame;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeMetaDataFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.frames.MetaDataFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.frames.NSMInteriorFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.frames.NSMLeafFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
+import edu.uci.ics.hyracks.storage.am.btree.impls.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.btree.tuples.SimpleTupleWriterFactory;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.SimpleConjunctiveSearcher;
+import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.common.buffercache.BufferCache;
+import edu.uci.ics.hyracks.storage.common.buffercache.ClockPageReplacementStrategy;
+import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
+import edu.uci.ics.hyracks.storage.common.buffercache.ICacheMemoryAllocator;
+import edu.uci.ics.hyracks.storage.common.buffercache.IPageReplacementStrategy;
+import edu.uci.ics.hyracks.storage.common.file.FileInfo;
+import edu.uci.ics.hyracks.storage.common.file.FileManager;
+
+/**
+ * End-to-end test of SimpleConjunctiveSearcher: bulk-inserts (token, id)
+ * pairs into a BTree-backed inverted index, then runs a conjunctive
+ * (AND-semantics) query over three tokens and prints the intersected ids.
+ */
+public class SimpleConjunctiveSearcherTest {
+
+ // testing params
+ private static final int PAGE_SIZE = 256;
+ private static final int NUM_PAGES = 10;
+ private static final int HYRACKS_FRAME_SIZE = 256;
+
+ // realistic params
+ //private static final int PAGE_SIZE = 32768;
+ //private static final int NUM_PAGES = 1000;
+ //private static final int HYRACKS_FRAME_SIZE = 32768;
+
+ private String tmpDir = System.getProperty("java.io.tmpdir");
+
+ // Simple heap-backed page allocator for the buffer cache.
+ public class BufferAllocator implements ICacheMemoryAllocator {
+ @Override
+ public ByteBuffer[] allocate(int pageSize, int numPages) {
+ ByteBuffer[] buffers = new ByteBuffer[numPages];
+ for (int i = 0; i < numPages; ++i) {
+ buffers[i] = ByteBuffer.allocate(pageSize);
+ }
+ return buffers;
+ }
+ }
+
+ @Test
+ public void test01() throws Exception {
+
+ FileManager fileManager = new FileManager();
+ ICacheMemoryAllocator allocator = new BufferAllocator();
+ IPageReplacementStrategy prs = new ClockPageReplacementStrategy();
+ IBufferCache bufferCache = new BufferCache(allocator, prs, fileManager, PAGE_SIZE, NUM_PAGES);
+
+ // NOTE(review): file is created in tmpdir but never deleted afterwards;
+ // consider cleaning up in a finally block or @After method.
+ File f = new File(tmpDir + "/" + "btreetest.bin");
+ RandomAccessFile raf = new RandomAccessFile(f, "rw");
+ int fileId = 0;
+ FileInfo fi = new FileInfo(fileId, raf);
+ fileManager.registerFile(fi);
+
+ // declare fields: (token, document-id) pairs form the inverted lists
+ int fieldCount = 2;
+ ITypeTrait[] typeTraits = new ITypeTrait[fieldCount];
+ typeTraits[0] = new TypeTrait(ITypeTrait.VARIABLE_LENGTH);
+ typeTraits[1] = new TypeTrait(4);
+
+ // declare keys: both fields are part of the BTree key so each
+ // (token, id) pair is a distinct entry
+ int keyFieldCount = 2;
+ IBinaryComparator[] cmps = new IBinaryComparator[keyFieldCount];
+ cmps[0] = UTF8StringBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+ cmps[1] = IntegerBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+
+ MultiComparator cmp = new MultiComparator(typeTraits, cmps);
+
+ //TypeAwareTupleWriterFactory tupleWriterFactory = new TypeAwareTupleWriterFactory(typeTraits);
+ SimpleTupleWriterFactory tupleWriterFactory = new SimpleTupleWriterFactory();
+ IBTreeLeafFrameFactory leafFrameFactory = new NSMLeafFrameFactory(tupleWriterFactory);
+ IBTreeInteriorFrameFactory interiorFrameFactory = new NSMInteriorFrameFactory(tupleWriterFactory);
+ IBTreeMetaDataFrameFactory metaFrameFactory = new MetaDataFrameFactory();
+
+ IBTreeLeafFrame leafFrame = leafFrameFactory.getFrame();
+ IBTreeInteriorFrame interiorFrame = interiorFrameFactory.getFrame();
+ IBTreeMetaDataFrame metaFrame = metaFrameFactory.getFrame();
+
+ BTree btree = new BTree(bufferCache, interiorFrameFactory, leafFrameFactory, cmp);
+ btree.create(fileId, leafFrame, metaFrame);
+ btree.open(fileId);
+
+ // fixed seed so the generated index is reproducible across runs
+ Random rnd = new Random();
+ rnd.setSeed(50);
+
+ IHyracksContext ctx = new RootHyracksContext(HYRACKS_FRAME_SIZE);
+ ByteBuffer frame = ctx.getResourceManager().allocateFrame();
+ FrameTupleAppender appender = new FrameTupleAppender(ctx);
+ ArrayTupleBuilder tb = new ArrayTupleBuilder(cmp.getFieldCount());
+ DataOutput dos = tb.getDataOutput();
+
+ ISerializerDeserializer[] btreeSerde = { UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
+ RecordDescriptor btreeRecDesc = new RecordDescriptor(btreeSerde);
+ IFrameTupleAccessor accessor = new FrameTupleAccessor(ctx, btreeRecDesc);
+ accessor.reset(frame);
+ FrameTupleReference tuple = new FrameTupleReference();
+
+ List<String> tokens = new ArrayList<String>();
+ tokens.add("computer");
+ tokens.add("hyracks");
+ tokens.add("fast");
+ tokens.add("university");
+ tokens.add("science");
+ tokens.add("major");
+
+ int maxId = 1000;
+ int addProb = 0;
+ int addProbStep = 2;
+
+ // For each token, insert a random subset of ids in [0, maxId).
+ // addProb grows per token, so later tokens get sparser inverted lists
+ // (each id is added with probability 1/addProb).
+ for (int i = 0; i < tokens.size(); i++) {
+
+ addProb += addProbStep;
+ for(int j = 0; j < maxId; j++) {
+ if((Math.abs(rnd.nextInt()) % addProb) == 0) {
+ tb.reset();
+ UTF8StringSerializerDeserializer.INSTANCE.serialize(tokens.get(i), dos);
+ tb.addFieldEndOffset();
+ IntegerSerializerDeserializer.INSTANCE.serialize(j, dos);
+ tb.addFieldEndOffset();
+
+ appender.reset(frame, true);
+ appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize());
+
+ tuple.reset(accessor, 0);
+
+ // NOTE(review): insert failures are swallowed and only printed;
+ // a failed insert would silently corrupt the expected result set.
+ try {
+ btree.insert(tuple, leafFrame, interiorFrame, metaFrame);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+
+ // build query as tuple reference
+ ISerializerDeserializer[] querySerde = { UTF8StringSerializerDeserializer.INSTANCE };
+ RecordDescriptor queryRecDesc = new RecordDescriptor(querySerde);
+
+ FrameTupleAppender queryAppender = new FrameTupleAppender(ctx);
+ ArrayTupleBuilder queryTb = new ArrayTupleBuilder(querySerde.length);
+ DataOutput queryDos = queryTb.getDataOutput();
+
+ IFrameTupleAccessor queryAccessor = new FrameTupleAccessor(ctx, queryRecDesc);
+ queryAccessor.reset(frame);
+ FrameTupleReference queryTuple = new FrameTupleReference();
+
+ String query = "computer hyracks fast";
+ char queryDelimiter = ' ';
+ IBinaryTokenizer queryTokenizer = new DelimitedUTF8StringBinaryTokenizer(queryDelimiter);
+
+ queryTb.reset();
+ UTF8StringSerializerDeserializer.INSTANCE.serialize(query, queryDos);
+ queryTb.addFieldEndOffset();
+
+ queryAppender.reset(frame, true);
+ queryAppender.append(queryTb.getFieldEndOffsets(), queryTb.getByteArray(), 0, queryTb.getSize());
+ queryTuple.reset(queryAccessor, 0);
+
+ // result tuples carry only the value fields (the document ids)
+ int numKeyFields = 1;
+ int numValueFields = 1;
+ ISerializerDeserializer[] resultSerde = new ISerializerDeserializer[numValueFields];
+ for(int i = 0; i < numValueFields; i++) {
+ resultSerde[i] = btreeSerde[numKeyFields + i];
+ }
+ RecordDescriptor resultRecDesc = new RecordDescriptor(resultSerde);
+ FrameTupleAccessor resultAccessor = new FrameTupleAccessor(ctx, resultRecDesc);
+ FrameTupleReference resultTuple = new FrameTupleReference();
+
+ SimpleConjunctiveSearcher searcher = new SimpleConjunctiveSearcher(ctx, btree, btreeRecDesc, queryTokenizer, numKeyFields, numValueFields);
+
+ long timeStart = System.currentTimeMillis();
+ searcher.search(queryTuple, 0);
+ long timeEnd = System.currentTimeMillis();
+ System.out.println("SEARCH TIME: " + (timeEnd - timeStart) + "ms");
+
+ // NOTE(review): results are only printed, not asserted — this test
+ // cannot fail on wrong search output; consider asserting against a
+ // Java-side reference intersection.
+ System.out.println("INTERSECTION RESULTS");
+ IInvertedIndexResultCursor resultCursor = searcher.getResultCursor();
+ while(resultCursor.hasNext()) {
+ resultCursor.next();
+ resultAccessor.reset(resultCursor.getBuffer());
+ for(int i = 0; i < resultAccessor.getTupleCount(); i++) {
+ resultTuple.reset(resultAccessor, i);
+ for(int j = 0; j < resultTuple.getFieldCount(); j++) {
+ ByteArrayInputStream inStream = new ByteArrayInputStream(resultTuple.getFieldData(j), resultTuple.getFieldStart(j), resultTuple.getFieldLength(j));
+ DataInput dataIn = new DataInputStream(inStream);
+ Object o = resultSerde[j].deserialize(dataIn);
+ System.out.print(o + " ");
+ }
+ System.out.println();
+ }
+ }
+
+ // NOTE(review): dead commented-out scan code below; consider deleting.
+ /*
+ IBinaryComparator[] searchCmps = new IBinaryComparator[1];
+ searchCmps[0] = UTF8StringBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+ MultiComparator searchCmp = new MultiComparator(typeTraits, searchCmps);
+
+ // ordered scan
+ IBTreeCursor scanCursor = new RangeSearchCursor(leafFrame);
+ RangePredicate nullPred = new RangePredicate(true, null, null, null);
+ btree.search(scanCursor, nullPred, leafFrame, interiorFrame);
+
+ try {
+ while (scanCursor.hasNext()) {
+ scanCursor.next();
+ ITupleReference frameTuple = scanCursor.getTuple();
+ String rec = cmp.printTuple(frameTuple, recDescSers);
+ System.out.println(rec);
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ } finally {
+ scanCursor.close();
+ }
+ */
+
+ btree.close();
+
+ bufferCache.close();
+ fileManager.close();
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java b/hyracks-storage-am-invertedindex/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java
new file mode 100644
index 0000000..1c1f1f9
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java
@@ -0,0 +1,173 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.util.ArrayList;
+import java.util.Random;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryHashFunction;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ByteArrayAccessibleOutputStream;
+import edu.uci.ics.hyracks.dataflow.common.data.hash.UTF8StringBinaryHashFunctionFactory;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+
+/**
+ * Randomized tests for the binary tokenizers: the delimiter-based word
+ * tokenizer is checked against String.split, and the hashed q-gram
+ * tokenizer is checked against hashes of Java-side substring grams.
+ */
+public class TokenizerTest {
+
+ // testing DelimitedUTF8StringBinaryTokenizer
+ @Test
+ public void test01() throws Exception {
+ Random rnd = new Random(50);
+
+ int numDocs = 100;
+ int maxWords = 1000;
+ int maxWordLength = 50;
+ char delimiter = ' ';
+
+ DelimitedUTF8StringBinaryTokenizer tok = new DelimitedUTF8StringBinaryTokenizer(delimiter);
+
+ // create a bunch of documents
+ for(int i = 0; i < numDocs; i++) {
+
+ // create a single document with a bunch of words
+ int words = (Math.abs(rnd.nextInt()) % maxWords) + 1;
+ StringBuilder strBuilder = new StringBuilder();
+ for(int j = 0; j < words; j++) {
+ int len = (Math.abs(rnd.nextInt()) % maxWordLength) + 1;
+ String s = randomString(len, rnd);
+ strBuilder.append(s);
+ if(j < words-1) strBuilder.append(delimiter);
+ }
+
+ String doc = strBuilder.toString();
+
+ // serialize document into baaos
+ ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
+ DataOutputStream dos = new DataOutputStream(baaos);
+ UTF8StringSerializerDeserializer.INSTANCE.serialize(doc, dos);
+ byte[] data = baaos.toByteArray();
+
+ // use binary tokenizer and compare with Java tokenizer
+ String[] cmpTokens = doc.split(new String(new char[] { delimiter }));
+ int cmpCounter = 0;
+
+ tok.reset(data, 0, data.length);
+ while(tok.hasNext()) {
+ tok.next();
+
+ // write token to outputstream
+ ByteArrayAccessibleOutputStream baaosWrite = new ByteArrayAccessibleOutputStream();
+ DataOutputStream dosWrite = new DataOutputStream(baaosWrite);
+ tok.writeToken(dosWrite);
+
+ // deserialize token to get string object
+ ByteArrayInputStream inStream = new ByteArrayInputStream(baaosWrite.toByteArray());
+ DataInput dataIn = new DataInputStream(inStream);
+ String s = UTF8StringSerializerDeserializer.INSTANCE.deserialize(dataIn);
+
+ Assert.assertEquals(s, cmpTokens[cmpCounter++]);
+ }
+ }
+ }
+
+ // testing HashedQGramUTF8StringBinaryTokenizer
+ @Test
+ public void test02() throws Exception {
+ Random rnd = new Random(50);
+
+ int numStrings = 1000;
+ int maxStrLen = 100;
+ int minQ = 2;
+ int maxQ = 10;
+
+ // we test the correctness of HashedQGramUTF8StringBinaryTokenizer as follows:
+ // 1.1. tokenize the string into q-gram strings
+ // 1.2. serialize q-gram strings into bytes
+ // 1.3. compute hashed gram with UTF8StringBinaryHashFunctionFactory
+ // 2.1. serialize string into bytes
+ // 2.2. tokenize serialized string into hashed q-grams
+ // 2.3. test whether hashed grams from 1.3. and 2.3. are equal
+ for(int i = 0; i < numStrings; i++) {
+ int q = (Math.abs(rnd.nextInt()) % (maxQ - minQ)) + minQ;
+ int strLen = (Math.abs(rnd.nextInt()) % (maxStrLen - q)) + q;
+ String str = randomString(strLen, rnd);
+
+ // randomly choose pre and postfixing
+ boolean prePost = false;
+ if(Math.abs(rnd.nextInt()) % 2 == 0) prePost = true;
+
+ HashedQGramUTF8StringBinaryTokenizer qgramTok = new HashedQGramUTF8StringBinaryTokenizer(q, prePost);
+
+ String extendedString = str;
+ if(prePost) {
+ // pre and postfix string
+ StringBuilder strBuilder = new StringBuilder();
+ for(int j = 0; j < q - 1; j++) strBuilder.append(qgramTok.getPreChar());
+ strBuilder.append(str);
+ for(int j = 0; j < q - 1; j++) strBuilder.append(qgramTok.getPostChar());
+ extendedString = strBuilder.toString();
+ }
+
+ // generate q-grams in deserialized form
+ ArrayList<String> javaGrams = new ArrayList<String>();
+ for(int j = 0; j < extendedString.length() - q + 1; j++) {
+ javaGrams.add(extendedString.substring(j, j + q));
+ }
+
+ // serialize string for use in binary gram tokenizer
+ ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
+ DataOutputStream dos = new DataOutputStream(baaos);
+ UTF8StringSerializerDeserializer.INSTANCE.serialize(str, dos);
+ byte[] data = baaos.toByteArray();
+
+ qgramTok.reset(data, 0, data.length);
+
+ int counter = 0;
+ while(qgramTok.hasNext()) {
+ qgramTok.next();
+
+ // write token to outputstream
+ ByteArrayAccessibleOutputStream baaosWrite = new ByteArrayAccessibleOutputStream();
+ DataOutputStream dosWrite = new DataOutputStream(baaosWrite);
+ qgramTok.writeToken(dosWrite);
+
+ // deserialize token to get hashed gram
+ ByteArrayInputStream inStream = new ByteArrayInputStream(baaosWrite.toByteArray());
+ DataInput dataIn = new DataInputStream(inStream);
+ Integer binHashedGram = IntegerSerializerDeserializer.INSTANCE.deserialize(dataIn);
+
+ // create hashed gram to test against
+ ByteArrayAccessibleOutputStream baaosCmp = new ByteArrayAccessibleOutputStream();
+ DataOutputStream dosCmp = new DataOutputStream(baaosCmp);
+ UTF8StringSerializerDeserializer.INSTANCE.serialize(javaGrams.get(counter), dosCmp);
+
+ IBinaryHashFunction strHasher = UTF8StringBinaryHashFunctionFactory.INSTANCE.createBinaryHashFunction();
+ byte[] cmpData = baaosCmp.toByteArray();
+ int cmpHash = strHasher.hash(cmpData, 0, cmpData.length);
+
+ Assert.assertEquals(binHashedGram.intValue(), cmpHash);
+
+ counter++;
+ }
+ }
+ }
+
+ // Returns a random alphanumeric-hex string.
+ // NOTE(review): the loop is bounded by s.length() as well as length, and s
+ // is the hex form of one random double (~16 chars) — so the result is at
+ // most min(length, ~16) chars, shorter than the requested length for
+ // length > 16. Harmless for current callers (q <= 10), but the name
+ // suggests exactly `length` chars; confirm intent.
+ public static String randomString(int length, Random random) {
+ int maxAttempts = 1000;
+ int count = 0;
+ while(count < maxAttempts) {
+ String s = Long.toHexString(Double.doubleToLongBits(random.nextDouble()));
+ StringBuilder strBuilder = new StringBuilder();
+ for (int i = 0; i < s.length() && i < length; i++) {
+ strBuilder.append(s.charAt(Math.abs(random.nextInt()) % s.length()));
+ }
+ if(strBuilder.length() > 0) return strBuilder.toString();
+ count++;
+ }
+ return "abc";
+ }
+}