1. Added the following search modifiers for the inverted index: Conjunctive, Jaccard, Edit Distance.
2. Added tests for bulk loading and searching the inverted index.
3. Still some bugs in search test. Will fix next.
git-svn-id: https://hyracks.googlecode.com/svn/branches/hyracks_indexes@451 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java
index 990936e..fb73a65 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexResultCursor.java
@@ -15,14 +15,14 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.api;
-import java.nio.ByteBuffer;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
public interface IInvertedIndexResultCursor {
public boolean hasNext();
public void next();
- public ByteBuffer getBuffer();
+ public ITupleReference getTuple();
- public void reset();
+ public void reset(IInvertedIndexSearcher invIndexSearcher);
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearchModifier.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearchModifier.java
new file mode 100644
index 0000000..b3819a2
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearchModifier.java
@@ -0,0 +1,8 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+
+import java.util.List;
+
+public interface IInvertedIndexSearchModifier {
+ int getOccurrenceThreshold(List<IInvertedListCursor> invListCursors);
+ int getPrefixLists(List<IInvertedListCursor> invListCursors);
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java
index 98dc3b9..db031e5 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/api/IInvertedIndexSearcher.java
@@ -15,10 +15,16 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.api;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
public interface IInvertedIndexSearcher {
- public void search(ITupleReference queryTuple, int queryFieldIndex) throws Exception;
-
- public IInvertedIndexResultCursor getResultCursor();
+ public void search(IInvertedIndexResultCursor resultCursor, ITupleReference queryTuple, int queryFieldIndex, IInvertedIndexSearchModifier searchModifier) throws Exception;
+ public IFrameTupleAccessor createResultFrameTupleAccessor();
+ public ITupleReference createResultTupleReference();
+ public List<ByteBuffer> getResultBuffers();
+ public int getNumValidResultBuffers();
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java
index d181191..d29ebf0 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java
@@ -1,8 +1,5 @@
package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
-import java.io.ByteArrayInputStream;
-import java.io.DataInput;
-import java.io.DataInputStream;
import java.nio.ByteBuffer;
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
@@ -15,7 +12,6 @@
import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeCursor;
import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOpContext;
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/ListResultCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/ListResultCursor.java
deleted file mode 100644
index adc5221..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/ListResultCursor.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
-
-import java.nio.ByteBuffer;
-import java.util.List;
-
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
-
-public class ListResultCursor implements IInvertedIndexResultCursor {
-
- private List<ByteBuffer> resultBuffers;
- private int numResultBuffers;
- private int currentPos = 0;
-
- public void setResults(List<ByteBuffer> resultBuffers, int numResultBuffers) {
- this.resultBuffers = resultBuffers;
- this.numResultBuffers = numResultBuffers;
- reset();
- }
-
- @Override
- public boolean hasNext() {
- if (currentPos + 1 < numResultBuffers)
- return true;
- else
- return false;
- }
-
- @Override
- public void next() {
- currentPos++;
- }
-
- @Override
- public ByteBuffer getBuffer() {
- return resultBuffers.get(currentPos);
- }
-
- @Override
- public void reset() {
- currentPos = -1;
- }
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SearchResultCursor.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SearchResultCursor.java
new file mode 100644
index 0000000..f7236bd
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SearchResultCursor.java
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearcher;
+
+public class SearchResultCursor implements IInvertedIndexResultCursor {
+
+ private List<ByteBuffer> resultBuffers;
+ private IFrameTupleAccessor fta;
+ private FixedSizeTupleReference resultTuple;
+ private int numResultBuffers;
+ private int currentBufferIndex = 0;
+ private int tupleIndex = 0;
+
+ public SearchResultCursor(IFrameTupleAccessor fta, ITupleReference resultTuple) {
+ this.fta = fta;
+ this.resultTuple = (FixedSizeTupleReference)resultTuple;
+ }
+
+ @Override
+ public boolean hasNext() {
+ if (currentBufferIndex < numResultBuffers && tupleIndex < fta.getTupleCount())
+ return true;
+ else
+ return false;
+ }
+
+ @Override
+ public void next() {
+ resultTuple.reset(fta.getBuffer().array(), fta.getTupleStartOffset(tupleIndex));
+ tupleIndex++;
+ if(tupleIndex >= fta.getTupleCount()) {
+ if(currentBufferIndex + 1 < numResultBuffers) {
+ currentBufferIndex++;
+ fta.reset(resultBuffers.get(currentBufferIndex));
+ resultTuple.reset(fta.getBuffer().array(), fta.getTupleStartOffset(0));
+ tupleIndex = 1;
+ }
+ }
+ }
+
+ @Override
+ public ITupleReference getTuple() {
+ return resultTuple;
+ }
+
+ @Override
+ public void reset(IInvertedIndexSearcher invIndexSearcher) {
+ currentBufferIndex = 0;
+ tupleIndex = 0;
+ resultBuffers = invIndexSearcher.getResultBuffers();
+ numResultBuffers = invIndexSearcher.getNumValidResultBuffers();
+ if(numResultBuffers > 0) {
+ fta.reset(resultBuffers.get(0));
+ }
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java
deleted file mode 100644
index 1692ee7..0000000
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/SimpleConjunctiveSearcher.java
+++ /dev/null
@@ -1,296 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
-
-import java.io.DataOutput;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
-import edu.uci.ics.fuzzyjoin.tokenizer.IToken;
-import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
-import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
-import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
-import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
-import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
-import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
-import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
-import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeCursor;
-import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrame;
-import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrame;
-import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
-import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOpContext;
-import edu.uci.ics.hyracks.storage.am.btree.impls.RangePredicate;
-import edu.uci.ics.hyracks.storage.am.btree.impls.RangeSearchCursor;
-import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
-import edu.uci.ics.hyracks.storage.am.common.ophelpers.TreeIndexOp;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearcher;
-
-public class SimpleConjunctiveSearcher implements IInvertedIndexSearcher {
-
- private final int numKeyFields;
- private final int numValueFields;
-
- private final IBinaryComparator[] keyCmps;
- private final IBinaryComparator[] valueCmps;
-
- private final BTree btree;
- private final IHyracksStageletContext ctx;
- private final ArrayTupleBuilder resultTupleBuilder;
- private final FrameTupleAppender resultTupleAppender;
- private final FrameTupleAccessor resultFrameAccessor;
-
- private List<ByteBuffer> newResultBuffers = new ArrayList<ByteBuffer>();
- private List<ByteBuffer> prevResultBuffers = new ArrayList<ByteBuffer>();
- private List<ByteBuffer> swap = null;
- private final ListResultCursor resultCursor = new ListResultCursor();
- private int maxResultBufIdx = 0;
-
- private final IBTreeLeafFrame leafFrame;
- private final IBTreeInteriorFrame interiorFrame;
- private final IBTreeCursor btreeCursor;
- private final FrameTupleReference searchKey = new FrameTupleReference();
- private final RangePredicate pred = new RangePredicate(true, null, null, true, true, null, null);
-
- private final IBinaryTokenizer queryTokenizer;
-
- public SimpleConjunctiveSearcher(IHyracksStageletContext ctx, BTree btree, RecordDescriptor btreeRecDesc,
- IBinaryTokenizer queryTokenizer, int numKeyFields, int numValueFields) {
- this.ctx = ctx;
- this.btree = btree;
- this.queryTokenizer = queryTokenizer;
- this.numKeyFields = numKeyFields;
- this.numValueFields = numValueFields;
-
- leafFrame = btree.getLeafFrameFactory().getFrame();
- interiorFrame = btree.getInteriorFrameFactory().getFrame();
- btreeCursor = new RangeSearchCursor(leafFrame);
- resultTupleAppender = new FrameTupleAppender(ctx.getFrameSize());
- resultTupleBuilder = new ArrayTupleBuilder(numValueFields);
- newResultBuffers.add(ctx.allocateFrame());
- prevResultBuffers.add(ctx.allocateFrame());
-
- MultiComparator btreeCmp = btree.getMultiComparator();
-
- keyCmps = new IBinaryComparator[numKeyFields];
- for (int i = 0; i < numKeyFields; i++) {
- keyCmps[i] = btreeCmp.getComparators()[i];
- }
-
- valueCmps = new IBinaryComparator[numValueFields];
- for (int i = 0; i < numValueFields; i++) {
- valueCmps[i] = btreeCmp.getComparators()[numKeyFields + i];
- }
-
- MultiComparator searchCmp = new MultiComparator(btreeCmp.getTypeTraits(), keyCmps);
- pred.setLowKeyComparator(searchCmp);
- pred.setHighKeyComparator(searchCmp);
- pred.setLowKey(searchKey, true);
- pred.setHighKey(searchKey, true);
-
- ISerializerDeserializer[] valueSerde = new ISerializerDeserializer[numValueFields];
- for (int i = 0; i < numValueFields; i++) {
- valueSerde[i] = btreeRecDesc.getFields()[numKeyFields + i];
-
- }
- RecordDescriptor valueRecDesc = new RecordDescriptor(valueSerde);
- resultFrameAccessor = new FrameTupleAccessor(ctx.getFrameSize(), valueRecDesc);
- }
-
- public void search(ITupleReference queryTuple, int queryFieldIndex) throws Exception {
-
- // parse query, TODO: this parsing is too simple
- RecordDescriptor queryTokenRecDesc = new RecordDescriptor(
- new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE });
-
- ArrayTupleBuilder queryTokenBuilder = new ArrayTupleBuilder(queryTokenRecDesc.getFields().length);
- DataOutput queryTokenDos = queryTokenBuilder.getDataOutput();
- FrameTupleAppender queryTokenAppender = new FrameTupleAppender(ctx.getFrameSize());
- ByteBuffer queryTokenFrame = ctx.allocateFrame();
- queryTokenAppender.reset(queryTokenFrame, true);
-
- queryTokenizer.reset(queryTuple.getFieldData(queryFieldIndex), queryTuple.getFieldStart(queryFieldIndex),
- queryTuple.getFieldLength(queryFieldIndex));
- while (queryTokenizer.hasNext()) {
- queryTokenizer.next();
-
- queryTokenBuilder.reset();
- try {
- IToken token = queryTokenizer.getToken();
- token.serializeToken(queryTokenDos);
- queryTokenBuilder.addFieldEndOffset();
- } catch (IOException e) {
- throw new HyracksDataException(e);
- }
-
- // WARNING: assuming one frame is big enough to hold all tokens
- queryTokenAppender.append(queryTokenBuilder.getFieldEndOffsets(), queryTokenBuilder.getByteArray(), 0,
- queryTokenBuilder.getSize());
- }
-
- FrameTupleAccessor queryTokenAccessor = new FrameTupleAccessor(ctx.getFrameSize(), queryTokenRecDesc);
- queryTokenAccessor.reset(queryTokenFrame);
- int numQueryTokens = queryTokenAccessor.getTupleCount();
-
- maxResultBufIdx = 0;
-
- BTreeOpContext opCtx = btree.createOpContext(TreeIndexOp.TI_SEARCH, leafFrame, interiorFrame, null);
-
- resultTupleAppender.reset(newResultBuffers.get(0), true);
- try {
- // append first inverted list to temporary results
- searchKey.reset(queryTokenAccessor, 0);
- btree.search(btreeCursor, pred, opCtx);
- while (btreeCursor.hasNext()) {
- btreeCursor.next();
- maxResultBufIdx = appendTupleToNewResults(btreeCursor, maxResultBufIdx);
- }
- btreeCursor.close();
- btreeCursor.reset();
- } catch (Exception e) {
- throw new HyracksDataException(e);
- }
-
- resultFrameAccessor.reset(newResultBuffers.get(0));
-
- // intersect temporary results with remaining inverted lists
- for (int i = 1; i < numQueryTokens; i++) {
- swap = prevResultBuffers;
- prevResultBuffers = newResultBuffers;
- newResultBuffers = swap;
- try {
- searchKey.reset(queryTokenAccessor, i);
- btree.search(btreeCursor, pred, opCtx);
- maxResultBufIdx = intersectList(btreeCursor, prevResultBuffers, maxResultBufIdx, newResultBuffers);
- } catch (Exception e) {
- throw new HyracksDataException(e);
- }
- btreeCursor.close();
- btreeCursor.reset();
- }
- }
-
- private int appendTupleToNewResults(IBTreeCursor btreeCursor, int newBufIdx) throws IOException {
- ByteBuffer newCurrentBuffer = newResultBuffers.get(newBufIdx);
-
- ITupleReference tuple = btreeCursor.getTuple();
- resultTupleBuilder.reset();
- DataOutput dos = resultTupleBuilder.getDataOutput();
- for (int i = 0; i < numValueFields; i++) {
- int fIdx = numKeyFields + i;
- dos.write(tuple.getFieldData(fIdx), tuple.getFieldStart(fIdx), tuple.getFieldLength(fIdx));
- resultTupleBuilder.addFieldEndOffset();
- }
-
- if (!resultTupleAppender.append(resultTupleBuilder.getFieldEndOffsets(), resultTupleBuilder.getByteArray(), 0,
- resultTupleBuilder.getSize())) {
- newBufIdx++;
- if (newBufIdx >= newResultBuffers.size()) {
- newResultBuffers.add(ctx.allocateFrame());
- }
- newCurrentBuffer = newResultBuffers.get(newBufIdx);
- resultTupleAppender.reset(newCurrentBuffer, true);
- if (!resultTupleAppender.append(resultTupleBuilder.getFieldEndOffsets(), resultTupleBuilder.getByteArray(),
- 0, resultTupleBuilder.getSize())) {
- throw new IllegalStateException();
- }
- }
-
- return newBufIdx;
- }
-
- private int intersectList(IBTreeCursor btreeCursor, List<ByteBuffer> prevResultBuffers, int maxPrevBufIdx,
- List<ByteBuffer> newResultBuffers) throws IOException, Exception {
-
- int newBufIdx = 0;
- ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
-
- int prevBufIdx = 0;
- ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
-
- resultTupleBuilder.reset();
- resultTupleAppender.reset(newCurrentBuffer, true);
- resultFrameAccessor.reset(prevCurrentBuffer);
-
- // WARNING: not very efficient but good enough for the first cut
- boolean advanceCursor = true;
- boolean advancePrevResult = false;
- int resultTidx = 0;
-
- while ((!advanceCursor || btreeCursor.hasNext()) && prevBufIdx <= maxPrevBufIdx
- && resultTidx < resultFrameAccessor.getTupleCount()) {
-
- if (advanceCursor)
- btreeCursor.next();
- ITupleReference tuple = btreeCursor.getTuple();
-
- int cmp = 0;
- for (int i = 0; i < valueCmps.length; i++) {
- int tupleFidx = numKeyFields + i;
- cmp = valueCmps[i].compare(tuple.getFieldData(tupleFidx), tuple.getFieldStart(tupleFidx),
- tuple.getFieldLength(tupleFidx), resultFrameAccessor.getBuffer().array(),
- resultFrameAccessor.getTupleStartOffset(resultTidx) + resultFrameAccessor.getFieldSlotsLength()
- + resultFrameAccessor.getFieldStartOffset(resultTidx, i),
- resultFrameAccessor.getFieldLength(resultTidx, i));
- if (cmp != 0)
- break;
- }
-
- // match found
- if (cmp == 0) {
- newBufIdx = appendTupleToNewResults(btreeCursor, newBufIdx);
-
- advanceCursor = true;
- advancePrevResult = true;
- } else {
- if (cmp < 0) {
- advanceCursor = true;
- advancePrevResult = false;
- } else {
- advanceCursor = false;
- advancePrevResult = true;
- }
- }
-
- if (advancePrevResult) {
- resultTidx++;
- if (resultTidx >= resultFrameAccessor.getTupleCount()) {
- prevBufIdx++;
- if (prevBufIdx <= maxPrevBufIdx) {
- prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
- resultFrameAccessor.reset(prevCurrentBuffer);
- resultTidx = 0;
- }
- }
- }
- }
-
- return newBufIdx;
- }
-
- @Override
- public IInvertedIndexResultCursor getResultCursor() {
- resultCursor.setResults(newResultBuffers, maxResultBufIdx + 1);
- return resultCursor;
- }
-}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
index fe67635..1b03a5a 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
@@ -19,11 +19,11 @@
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
-import java.util.Collections;
import java.util.List;
import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
import edu.uci.ics.fuzzyjoin.tokenizer.IToken;
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
@@ -45,9 +45,12 @@
import edu.uci.ics.hyracks.storage.am.btree.impls.RangeSearchCursor;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.TreeIndexOp;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearcher;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
-public class TOccurrenceSearcher {
+public class TOccurrenceSearcher implements IInvertedIndexSearcher {
protected final IHyracksStageletContext ctx;
protected final FixedSizeFrameTupleAppender resultFrameTupleApp;
@@ -59,7 +62,6 @@
protected List<ByteBuffer> newResultBuffers = new ArrayList<ByteBuffer>();
protected List<ByteBuffer> prevResultBuffers = new ArrayList<ByteBuffer>();
protected List<ByteBuffer> swap = null;
- protected final ListResultCursor resultCursor = new ListResultCursor();
protected int maxResultBufIdx = 0;
protected final IBTreeLeafFrame leafFrame;
@@ -67,10 +69,19 @@
protected final IBTreeCursor btreeCursor;
protected final FrameTupleReference searchKey = new FrameTupleReference();
protected final RangePredicate btreePred = new RangePredicate(true, null, null, true, true, null, null);
-
+ protected final BTreeOpContext btreeOpCtx;
+
+ protected RecordDescriptor queryTokenRecDesc = new RecordDescriptor(
+ new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE });
+ protected ArrayTupleBuilder queryTokenBuilder = new ArrayTupleBuilder(queryTokenRecDesc.getFields().length);
+ protected DataOutput queryTokenDos = queryTokenBuilder.getDataOutput();
+ protected FrameTupleAppender queryTokenAppender;
+ protected ByteBuffer queryTokenFrame;
+
protected final InvertedIndex invIndex;
protected final IBinaryTokenizer queryTokenizer;
- protected int occurrenceThreshold;
+ protected final ITypeTrait[] invListFieldsWithCount;
+ protected int occurrenceThreshold;
protected final int cursorCacheSize = 10;
protected List<IInvertedListCursor> invListCursorCache = new ArrayList<IInvertedListCursor>(cursorCacheSize);
@@ -86,7 +97,7 @@
btreeCursor = new RangeSearchCursor(leafFrame);
ITypeTrait[] invListFields = invIndex.getInvListElementCmp().getTypeTraits();
- ITypeTrait[] invListFieldsWithCount = new TypeTrait[invListFields.length + 1];
+ invListFieldsWithCount = new TypeTrait[invListFields.length + 1];
int tmp = 0;
for(int i = 0; i < invListFields.length; i++) {
invListFieldsWithCount[i] = invListFields[i];
@@ -96,6 +107,9 @@
invListFieldsWithCount[invListFields.length] = new TypeTrait(4);
invListKeyLength = tmp;
+ btreeOpCtx = invIndex.getBTree().createOpContext(TreeIndexOp.TI_SEARCH, leafFrame,
+ interiorFrame, null);
+
resultFrameTupleApp = new FixedSizeFrameTupleAppender(ctx.getFrameSize(), invListFieldsWithCount);
resultFrameTupleAcc = new FixedSizeFrameTupleAccessor(ctx.getFrameSize(), invListFieldsWithCount);
resultTuple = new FixedSizeTupleReference(invListFieldsWithCount);
@@ -112,7 +126,10 @@
for (int i = 0; i < cursorCacheSize; i++) {
invListCursorCache.add(new FixedSizeElementInvertedListCursor(invIndex.getBufferCache(), invIndex
.getInvListsFileId(), invIndex.getInvListElementCmp().getTypeTraits()));
- }
+ }
+
+ queryTokenAppender = new FrameTupleAppender(ctx.getFrameSize());
+ queryTokenFrame = ctx.allocateFrame();
currentNumResults = 0;
}
@@ -127,17 +144,9 @@
currentNumResults = 0;
}
- public void search(ITupleReference queryTuple, int queryFieldIndex) throws Exception {
+ public void search(IInvertedIndexResultCursor resultCursor, ITupleReference queryTuple, int queryFieldIndex, IInvertedIndexSearchModifier searchModifier) throws Exception {
- RecordDescriptor queryTokenRecDesc = new RecordDescriptor(
- new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE });
-
- ArrayTupleBuilder queryTokenBuilder = new ArrayTupleBuilder(queryTokenRecDesc.getFields().length);
- DataOutput queryTokenDos = queryTokenBuilder.getDataOutput();
- FrameTupleAppender queryTokenAppender = new FrameTupleAppender(ctx.getFrameSize());
- ByteBuffer queryTokenFrame = ctx.allocateFrame();
queryTokenAppender.reset(queryTokenFrame, true);
-
queryTokenizer.reset(queryTuple.getFieldData(queryFieldIndex), queryTuple.getFieldStart(queryFieldIndex),
queryTuple.getFieldLength(queryFieldIndex));
while (queryTokenizer.hasNext()) {
@@ -159,9 +168,7 @@
FrameTupleAccessor queryTokenAccessor = new FrameTupleAccessor(ctx.getFrameSize(), queryTokenRecDesc);
queryTokenAccessor.reset(queryTokenFrame);
int numQueryTokens = queryTokenAccessor.getTupleCount();
-
- maxResultBufIdx = 0;
-
+
// expand cursor cache if necessary
if (numQueryTokens > invListCursorCache.size()) {
int diff = numQueryTokens - invListCursorCache.size();
@@ -169,18 +176,14 @@
invListCursorCache.add(new FixedSizeElementInvertedListCursor(invIndex.getBufferCache(), invIndex
.getInvListsFileId(), invIndex.getInvListElementCmp().getTypeTraits()));
}
- }
+ }
- BTreeOpContext btreeOpCtx = invIndex.getBTree().createOpContext(TreeIndexOp.TI_SEARCH, leafFrame,
- interiorFrame, null);
-
- invListCursors.clear();
+ invListCursors.clear();
for (int i = 0; i < numQueryTokens; i++) {
searchKey.reset(queryTokenAccessor, i);
invIndex.openCursor(btreeCursor, btreePred, btreeOpCtx, invListCursorCache.get(i));
invListCursors.add(invListCursorCache.get(i));
}
- Collections.sort(invListCursors);
/*
for(int i = 0; i < numQueryTokens; i++) {
@@ -188,16 +191,19 @@
}
*/
- occurrenceThreshold = numQueryTokens;
+ occurrenceThreshold = searchModifier.getOccurrenceThreshold(invListCursors);
+
+ int numPrefixLists = searchModifier.getPrefixLists(invListCursors);
+ maxResultBufIdx = mergePrefixLists(numPrefixLists, numQueryTokens);
+ maxResultBufIdx = mergeSuffixLists(numPrefixLists, numQueryTokens, maxResultBufIdx);
- int numPrefixTokens = numQueryTokens - occurrenceThreshold + 1;
+ resultCursor.reset(this);
- int maxPrevBufIdx = mergePrefixLists(numPrefixTokens, numQueryTokens);
- maxPrevBufIdx = mergeSuffixLists(numPrefixTokens, numQueryTokens, maxPrevBufIdx);
+ //System.out.println("NUMBER RESULTS: " + currentNumResults);
/*
StringBuffer strBuffer = new StringBuffer();
- for(int i = 0; i <= maxPrevBufIdx; i++) {
+ for(int i = 0; i <= maxResultBufIdx; i++) {
ByteBuffer testBuf = newResultBuffers.get(i);
resultFrameTupleAcc.reset(testBuf);
for(int j = 0; j < resultFrameTupleAcc.getTupleCount(); j++) {
@@ -206,7 +212,7 @@
}
}
System.out.println(strBuffer.toString());
- */
+ */
}
@@ -276,6 +282,7 @@
}
else {
if(count + numQueryTokens - invListIx > occurrenceThreshold) {
+ //System.out.println("C: " + count);
newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
}
}
@@ -370,8 +377,8 @@
resultTidx = 0;
}
}
- }
-
+ }
+
return newBufIdx;
}
@@ -434,10 +441,10 @@
// append remaining new elements from inverted list
//if(invListCursor.hasNext()) System.out.println("APPENDING FROM INV LIST");
- while(invListCursor.hasNext()) {
+ while(invListCursor.hasNext()) {
invListCursor.next();
ITupleReference invListTuple = invListCursor.getTuple();
- newBufIdx = appendTupleToNewResults(invListTuple, 1, newBufIdx);
+ newBufIdx = appendTupleToNewResults(invListTuple, 1, newBufIdx);
}
// append remaining elements from previous result set
@@ -489,146 +496,22 @@
return newBufIdx;
}
+
+ public IFrameTupleAccessor createResultFrameTupleAccessor() {
+ return new FixedSizeFrameTupleAccessor(ctx.getFrameSize(), invListFieldsWithCount);
+ }
-
- /*
- private int appendTupleToNewResults(IBTreeCursor btreeCursor, int newBufIdx) throws IOException {
- ByteBuffer newCurrentBuffer = newResultBuffers.get(newBufIdx);
-
- ITupleReference tuple = btreeCursor.getTuple();
- resultTupleBuilder.reset();
- DataOutput dos = resultTupleBuilder.getDataOutput();
- for (int i = 0; i < numValueFields; i++) {
- int fIdx = numKeyFields + i;
- dos.write(tuple.getFieldData(fIdx), tuple.getFieldStart(fIdx), tuple.getFieldLength(fIdx));
- resultTupleBuilder.addFieldEndOffset();
- }
-
- if (!resultTupleAppender.append(resultTupleBuilder.getFieldEndOffsets(), resultTupleBuilder.getByteArray(), 0,
- resultTupleBuilder.getSize())) {
- newBufIdx++;
- if (newBufIdx >= newResultBuffers.size()) {
- newResultBuffers.add(ctx.allocateFrame());
- }
- newCurrentBuffer = newResultBuffers.get(newBufIdx);
- resultTupleAppender.reset(newCurrentBuffer, true);
- if (!resultTupleAppender.append(resultTupleBuilder.getFieldEndOffsets(), resultTupleBuilder.getByteArray(),
- 0, resultTupleBuilder.getSize())) {
- throw new IllegalStateException();
- }
- }
-
- return newBufIdx;
+ public ITupleReference createResultTupleReference() {
+ return new FixedSizeTupleReference(invListFieldsWithCount);
}
- */
-
- /*
- private int mergeInvList(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers, int maxPrevBufIdx, List<ByteBuffer> newResultBuffers) throws IOException, Exception {
-
- int newBufIdx = 0;
- ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
-
- int prevBufIdx = 0;
- ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
-
- resultTupleBuilder.reset();
- resultTupleAppender.reset(newCurrentBuffer, true);
- resultFrameAccessor.reset(prevCurrentBuffer);
-
- // WARNING: not very efficient but good enough for the first cut
- boolean advanceCursor = true;
- boolean advancePrevResult = false;
- int resultTidx = 0;
-
- while ((!advanceCursor || invListCursor.hasNext()) && prevBufIdx <= maxPrevBufIdx
- && resultTidx < resultFrameAccessor.getTupleCount()) {
-
- if (advanceCursor)
- invListCursor.next();
-
- ICachedPage invListPage = invListCursor.getPage();
- int invListOff = invListCursor.getOffset();
-
- int cmp = 0;
- int valueFields = 1;
- for (int i = 0; i < valueFields; i++) {
-
- int tupleFidx = numKeyFields + i;
- cmp = valueCmps[i].compare(tuple.getFieldData(tupleFidx), tuple.getFieldStart(tupleFidx),
- tuple.getFieldLength(tupleFidx), resultFrameAccessor.getBuffer().array(),
- resultFrameAccessor.getTupleStartOffset(resultTidx) + resultFrameAccessor.getFieldSlotsLength()
- + resultFrameAccessor.getFieldStartOffset(resultTidx, i),
- resultFrameAccessor.getFieldLength(resultTidx, i));
- if (cmp != 0)
- break;
- }
-
- // match found
- if (cmp == 0) {
- newBufIdx = appendTupleToNewResults(btreeCursor, newBufIdx);
-
- advanceCursor = true;
- advancePrevResult = true;
- } else {
- if (cmp < 0) {
- advanceCursor = true;
- advancePrevResult = false;
- } else {
- advanceCursor = false;
- advancePrevResult = true;
- }
- }
-
- if (advancePrevResult) {
- resultTidx++;
- if (resultTidx >= resultFrameAccessor.getTupleCount()) {
- prevBufIdx++;
- if (prevBufIdx <= maxPrevBufIdx) {
- prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
- resultFrameAccessor.reset(prevCurrentBuffer);
- resultTidx = 0;
- }
- }
- }
- }
-
- return newBufIdx;
+ @Override
+ public List<ByteBuffer> getResultBuffers() {
+ return newResultBuffers;
}
-
- private int appendTupleToNewResults(IBTreeCursor btreeCursor, int newBufIdx) throws IOException {
- ByteBuffer newCurrentBuffer = newResultBuffers.get(newBufIdx);
- ITupleReference tuple = btreeCursor.getTuple();
- resultTupleBuilder.reset();
- DataOutput dos = resultTupleBuilder.getDataOutput();
- for (int i = 0; i < numValueFields; i++) {
- int fIdx = numKeyFields + i;
- dos.write(tuple.getFieldData(fIdx), tuple.getFieldStart(fIdx), tuple.getFieldLength(fIdx));
- resultTupleBuilder.addFieldEndOffset();
- }
-
- if (!resultTupleAppender.append(resultTupleBuilder.getFieldEndOffsets(), resultTupleBuilder.getByteArray(), 0,
- resultTupleBuilder.getSize())) {
- newBufIdx++;
- if (newBufIdx >= newResultBuffers.size()) {
- newResultBuffers.add(ctx.allocateFrame());
- }
- newCurrentBuffer = newResultBuffers.get(newBufIdx);
- resultTupleAppender.reset(newCurrentBuffer, true);
- if (!resultTupleAppender.append(resultTupleBuilder.getFieldEndOffsets(), resultTupleBuilder.getByteArray(),
- 0, resultTupleBuilder.getSize())) {
- throw new IllegalStateException();
- }
- }
-
- return newBufIdx;
- }
- */
-
- /*
- * @Override public IInvertedIndexResultCursor getResultCursor() {
- * resultCursor.setResults(newResultBuffers, maxResultBufIdx + 1); return
- * resultCursor; }
- */
+ @Override
+ public int getNumValidResultBuffers() {
+ return maxResultBufIdx + 1;
+ }
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/ConjunctiveSearchModifier.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/ConjunctiveSearchModifier.java
new file mode 100644
index 0000000..1f1d32c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/ConjunctiveSearchModifier.java
@@ -0,0 +1,22 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers;
+
+import java.util.Collections;
+import java.util.List;
+
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
+
+public class ConjunctiveSearchModifier implements IInvertedIndexSearchModifier {
+
+ @Override
+ public int getOccurrenceThreshold(List<IInvertedListCursor> invListCursors) {
+ return invListCursors.size();
+ }
+
+ @Override
+ public int getPrefixLists(List<IInvertedListCursor> invListCursors) {
+ Collections.sort(invListCursors);
+ return 1;
+ }
+
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/EditDistanceSearchModifier.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/EditDistanceSearchModifier.java
new file mode 100644
index 0000000..19481af
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/EditDistanceSearchModifier.java
@@ -0,0 +1,45 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers;
+
+import java.util.Collections;
+import java.util.List;
+
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
+
+public class EditDistanceSearchModifier implements IInvertedIndexSearchModifier {
+
+ private int gramLength;
+ private int edThresh;
+
+ public EditDistanceSearchModifier(int gramLength, int edThresh) {
+ this.gramLength = gramLength;
+ this.edThresh = edThresh;
+ }
+
+ @Override
+ public int getOccurrenceThreshold(List<IInvertedListCursor> invListCursors) {
+ return invListCursors.size() - edThresh * gramLength;
+ }
+
+ @Override
+ public int getPrefixLists(List<IInvertedListCursor> invListCursors) {
+ Collections.sort(invListCursors);
+ return invListCursors.size() - getOccurrenceThreshold(invListCursors) + 1;
+ }
+
+ public int getGramLength() {
+ return gramLength;
+ }
+
+ public void setGramLength(int gramLength) {
+ this.gramLength = gramLength;
+ }
+
+ public int getEdThresh() {
+ return edThresh;
+ }
+
+ public void setEdThresh(int edThresh) {
+ this.edThresh = edThresh;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/JaccardSearchModifier.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/JaccardSearchModifier.java
new file mode 100644
index 0000000..dbd2196
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/searchmodifiers/JaccardSearchModifier.java
@@ -0,0 +1,38 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers;
+
+import java.util.Collections;
+import java.util.List;
+
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
+
+public class JaccardSearchModifier implements IInvertedIndexSearchModifier {
+
+ private float jaccThresh;
+
+ public JaccardSearchModifier(float jaccThresh) {
+ this.jaccThresh = jaccThresh;
+ }
+
+ @Override
+ public int getOccurrenceThreshold(List<IInvertedListCursor> invListCursors) {
+ return (int) Math.floor((float) invListCursors.size() * jaccThresh);
+ }
+
+ @Override
+ public int getPrefixLists(List<IInvertedListCursor> invListCursors) {
+ Collections.sort(invListCursors);
+ if (invListCursors.size() == 0) {
+ return 0;
+ }
+ return invListCursors.size() - (int) Math.ceil(jaccThresh * invListCursors.size()) + 1;
+ }
+
+ public float getJaccThresh() {
+ return jaccThresh;
+ }
+
+ public void setJaccThresh(float jaccThresh) {
+ this.jaccThresh = jaccThresh;
+ }
+}
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/BulkLoadTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/BulkLoadTest.java
index 868aa3a..62a0879 100644
--- a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/BulkLoadTest.java
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/BulkLoadTest.java
@@ -1,8 +1,5 @@
package edu.uci.ics.hyracks.storage.am.invertedindex;
-import java.io.ByteArrayInputStream;
-import java.io.DataInput;
-import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.File;
import java.nio.ByteBuffer;
@@ -10,12 +7,10 @@
import java.util.List;
import java.util.Random;
+import junit.framework.Assert;
+
import org.junit.Test;
-import edu.uci.ics.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizer;
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
-import edu.uci.ics.fuzzyjoin.tokenizer.ITokenFactory;
-import edu.uci.ics.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
@@ -28,10 +23,12 @@
import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
import edu.uci.ics.hyracks.dataflow.common.data.comparators.IntegerBinaryComparatorFactory;
import edu.uci.ics.hyracks.dataflow.common.data.comparators.UTF8StringBinaryComparatorFactory;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeCursor;
import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrame;
import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrameFactory;
import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrame;
@@ -39,29 +36,29 @@
import edu.uci.ics.hyracks.storage.am.btree.frames.NSMInteriorFrameFactory;
import edu.uci.ics.hyracks.storage.am.btree.frames.NSMLeafFrameFactory;
import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOpContext;
+import edu.uci.ics.hyracks.storage.am.btree.impls.RangePredicate;
+import edu.uci.ics.hyracks.storage.am.btree.impls.RangeSearchCursor;
import edu.uci.ics.hyracks.storage.am.common.api.IFreePageManager;
import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexMetaDataFrame;
import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexMetaDataFrameFactory;
import edu.uci.ics.hyracks.storage.am.common.frames.LIFOMetaDataFrameFactory;
import edu.uci.ics.hyracks.storage.am.common.freepage.LinkedListFreePageManager;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.TreeIndexOp;
import edu.uci.ics.hyracks.storage.am.common.tuples.TypeAwareTupleWriterFactory;
import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
import edu.uci.ics.hyracks.storage.am.invertedindex.impls.FixedSizeElementInvertedListBuilder;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.FixedSizeElementInvertedListCursor;
import edu.uci.ics.hyracks.storage.am.invertedindex.impls.InvertedIndex;
-import edu.uci.ics.hyracks.storage.am.invertedindex.impls.TOccurrenceSearcher;
import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
import edu.uci.ics.hyracks.storage.common.file.IFileMapProvider;
import edu.uci.ics.hyracks.test.support.TestStorageManagerComponentHolder;
import edu.uci.ics.hyracks.test.support.TestUtils;
public class BulkLoadTest extends AbstractInvIndexTest {
- // testing params
- //private static final int PAGE_SIZE = 256;
- //private static final int NUM_PAGES = 100;
- //private static final int HYRACKS_FRAME_SIZE = 256;
-
- // realistic params
+
// private static final int PAGE_SIZE = 65536;
private static final int PAGE_SIZE = 32768;
private static final int NUM_PAGES = 100;
@@ -69,20 +66,18 @@
private IHyracksStageletContext stageletCtx = TestUtils.create(HYRACKS_FRAME_SIZE);
@Test
- public void test01() throws Exception {
+ public void singleFieldPayloadTest() throws Exception {
TestStorageManagerComponentHolder.init(PAGE_SIZE, NUM_PAGES);
IBufferCache bufferCache = TestStorageManagerComponentHolder.getBufferCache(stageletCtx);
IFileMapProvider fmp = TestStorageManagerComponentHolder.getFileMapProvider(stageletCtx);
// create file refs
- System.out.println(btreeFileName);
FileReference btreeFile = new FileReference(new File(btreeFileName));
bufferCache.createFile(btreeFile);
int btreeFileId = fmp.lookupFileId(btreeFile);
bufferCache.openFile(btreeFileId);
-
- System.out.println(invListsFileName);
+
FileReference invListsFile = new FileReference(new File(invListsFileName));
bufferCache.createFile(invListsFile);
int invListsFileId = fmp.lookupFileId(invListsFile);
@@ -91,10 +86,15 @@
// declare btree fields
int fieldCount = 5;
ITypeTrait[] typeTraits = new ITypeTrait[fieldCount];
+ // token (key)
typeTraits[0] = new TypeTrait(ITypeTrait.VARIABLE_LENGTH);
+ // startPageId
typeTraits[1] = new TypeTrait(4);
+ // endPageId
typeTraits[2] = new TypeTrait(4);
+ // startOff
typeTraits[3] = new TypeTrait(4);
+ // numElements
typeTraits[4] = new TypeTrait(4);
// declare btree keys
@@ -102,12 +102,10 @@
IBinaryComparator[] cmps = new IBinaryComparator[keyFieldCount];
cmps[0] = UTF8StringBinaryComparatorFactory.INSTANCE.createBinaryComparator();
- MultiComparator cmp = new MultiComparator(typeTraits, cmps);
+ MultiComparator btreeCmp = new MultiComparator(typeTraits, cmps);
TypeAwareTupleWriterFactory tupleWriterFactory = new TypeAwareTupleWriterFactory(typeTraits);
- IBTreeLeafFrameFactory leafFrameFactory = new NSMLeafFrameFactory(tupleWriterFactory);
- // IBTreeLeafFrameFactory leafFrameFactory = new
- // FieldPrefixNSMLeafFrameFactory(tupleWriterFactory);
+ IBTreeLeafFrameFactory leafFrameFactory = new NSMLeafFrameFactory(tupleWriterFactory);
IBTreeInteriorFrameFactory interiorFrameFactory = new NSMInteriorFrameFactory(tupleWriterFactory);
ITreeIndexMetaDataFrameFactory metaFrameFactory = new LIFOMetaDataFrameFactory();
@@ -117,7 +115,7 @@
IFreePageManager freePageManager = new LinkedListFreePageManager(bufferCache, btreeFileId, 0, metaFrameFactory);
- BTree btree = new BTree(bufferCache, freePageManager, interiorFrameFactory, leafFrameFactory, cmp);
+ BTree btree = new BTree(bufferCache, freePageManager, interiorFrameFactory, leafFrameFactory, btreeCmp);
btree.create(btreeFileId, leafFrame, metaFrame);
btree.open(btreeFileId);
@@ -160,6 +158,11 @@
tokens.add("systems");
tokens.add("university");
+ ArrayList<ArrayList<Integer>> checkListElements = new ArrayList<ArrayList<Integer>>();
+ for(int i = 0; i < tokens.size(); i++) {
+ checkListElements.add(new ArrayList<Integer>());
+ }
+
int maxId = 1000000;
int addProb = 0;
int addProbStep = 10;
@@ -168,12 +171,9 @@
InvertedIndex.BulkLoadContext ctx = invIndex.beginBulkLoad(invListBuilder, HYRACKS_FRAME_SIZE);
int totalElements = 0;
- int tokenField = 0;
- int[] elementFields = { 1 };
for (int i = 0; i < tokens.size(); i++) {
addProb += addProbStep * (i+1);
- StringBuilder strBuilder = new StringBuilder();
for (int j = 0; j < maxId; j++) {
if ((Math.abs(rnd.nextInt()) % addProb) == 0) {
@@ -185,7 +185,7 @@
IntegerSerializerDeserializer.INSTANCE.serialize(j, dos);
tb.addFieldEndOffset();
- //strBuilder.append(j + " ");
+ checkListElements.get(i).add(j);
appender.reset(frame, true);
appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize());
@@ -198,128 +198,81 @@
e.printStackTrace();
}
}
- }
- System.out.println(tokens.get(i));
- System.out.println(strBuilder.toString());
+ }
}
invIndex.endBulkLoad(ctx);
+
+ // ------- START VERIFICATION -----------
+
+ IBTreeCursor btreeCursor = new RangeSearchCursor(leafFrame);
+ FrameTupleReference searchKey = new FrameTupleReference();
+ RangePredicate btreePred = new RangePredicate(true, searchKey, searchKey, true, true, btreeCmp, btreeCmp);
- int numPages = btree.getFreePageManager().getMaxPage(metaFrame);
- System.out.println("NUMPAGES: " + numPages);
- System.out.println("TOTAL ELEMENTS: " + totalElements);
+ IInvertedListCursor invListCursor = new FixedSizeElementInvertedListCursor(bufferCache, invListsFileId, invListTypeTraits);
- // --------------------------- TEST A
- // build query as tuple reference
- ISerializerDeserializer[] querySerde = { UTF8StringSerializerDeserializer.INSTANCE };
- RecordDescriptor queryRecDesc = new RecordDescriptor(querySerde);
-
- FrameTupleAppender queryAppender = new FrameTupleAppender(stageletCtx.getFrameSize());
- ArrayTupleBuilder queryTb = new ArrayTupleBuilder(querySerde.length);
- DataOutput queryDos = queryTb.getDataOutput();
-
- IFrameTupleAccessor queryAccessor = new FrameTupleAccessor(stageletCtx.getFrameSize(), queryRecDesc);
- queryAccessor.reset(frame);
- FrameTupleReference queryTuple = new FrameTupleReference();
-
- //String query = "computer hyracks fast";
- //String query = "compilers fast university hyracks";
- String query = "compilers fast";
+ ISerializerDeserializer[] tokenSerde = { UTF8StringSerializerDeserializer.INSTANCE};
+ RecordDescriptor tokenRecDesc = new RecordDescriptor(tokenSerde);
+ FrameTupleAppender tokenAppender = new FrameTupleAppender(stageletCtx.getFrameSize());
+ ArrayTupleBuilder tokenTupleBuilder = new ArrayTupleBuilder(1);
+ DataOutput tokenDos = tokenTupleBuilder.getDataOutput();
+ IFrameTupleAccessor tokenAccessor = new FrameTupleAccessor(stageletCtx.getFrameSize(), tokenRecDesc);
+ tokenAccessor.reset(frame);
- ITokenFactory tokenFactory = new UTF8WordTokenFactory();
- IBinaryTokenizer queryTokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
-
- queryTb.reset();
- UTF8StringSerializerDeserializer.INSTANCE.serialize(query, queryDos);
- queryTb.addFieldEndOffset();
-
- queryAppender.reset(frame, true);
- queryAppender.append(queryTb.getFieldEndOffsets(), queryTb.getByteArray(), 0, queryTb.getSize());
- queryTuple.reset(queryAccessor, 0);
+ BTreeOpContext btreeOpCtx = invIndex.getBTree().createOpContext(TreeIndexOp.TI_SEARCH, leafFrame,
+ interiorFrame, null);
- UTF8StringSerializerDeserializer serde = UTF8StringSerializerDeserializer.INSTANCE;
- ByteArrayInputStream inStream = new ByteArrayInputStream(queryTuple.getFieldData(0), queryTuple.getFieldStart(0), queryTuple.getFieldLength(0));
- DataInput dataIn = new DataInputStream(inStream);
- Object o = serde.deserialize(dataIn);
- System.out.println(o.toString());
+ // verify created inverted lists one-by-one
+ for(int i = 0; i < tokens.size(); i++) {
+
+ tokenTupleBuilder.reset();
+ UTF8StringSerializerDeserializer.INSTANCE.serialize(tokens.get(i), tokenDos);
+ tokenTupleBuilder.addFieldEndOffset();
+
+ tokenAppender.reset(frame, true);
+ tokenAppender.append(tokenTupleBuilder.getFieldEndOffsets(), tokenTupleBuilder.getByteArray(), 0, tokenTupleBuilder.getSize());
+
+ searchKey.reset(tokenAccessor, 0);
+
+ invIndex.openCursor(btreeCursor, btreePred, btreeOpCtx, invListCursor);
+
+ invListCursor.pinPagesSync();
+ int checkIndex = 0;
+ while(invListCursor.hasNext()) {
+ invListCursor.next();
+ ITupleReference invListTuple = invListCursor.getTuple();
+ int invListElement = IntegerSerializerDeserializer.getInt(invListTuple.getFieldData(0), invListTuple.getFieldStart(0));
+ int checkInvListElement = checkListElements.get(i).get(checkIndex).intValue();
+ Assert.assertEquals(invListElement, checkInvListElement);
+ checkIndex++;
+ }
+ invListCursor.unpinPages();
+ Assert.assertEquals(checkIndex, checkListElements.get(i).size());
+ }
- TOccurrenceSearcher searcher = new TOccurrenceSearcher(stageletCtx, invIndex, queryTokenizer);
- //TOccurrenceSearcherSuffixProbeOnly searcher = new TOccurrenceSearcherSuffixProbeSingle(stageletCtx, invIndex, queryTokenizer);
- //TOccurrenceSearcherSuffixScanOnly searcher = new TOccurrenceSearcherSuffixScan(stageletCtx, invIndex, queryTokenizer);
-
- int repeats = 100;
- double totalTime = 0;
- for(int i = 0; i < repeats; i++) {
- long timeStart = System.currentTimeMillis();
- searcher.reset();
- searcher.search(queryTuple, 0);
- long timeEnd = System.currentTimeMillis();
- //System.out.println("SEARCH TIME: " + (timeEnd - timeStart) + "ms");
- totalTime += timeEnd - timeStart;
- }
- double avgTime = totalTime / (double)repeats;
- System.out.println("AVG TIME: " + avgTime + "ms");
+ // check that non-existing tokens have an empty inverted list
+ List<String> nonExistingTokens = new ArrayList<String>();
+ nonExistingTokens.add("watermelon");
+ nonExistingTokens.add("avocado");
+ nonExistingTokens.add("lemon");
- /*
- // ------------------------- TEST B
- IInvertedListCursor invListCursor = new FixedSizeElementInvertedListCursor(bufferCache, invListsFileId, invListTypeTraits);
+ for(int i = 0; i < nonExistingTokens.size(); i++) {
+
+ tokenTupleBuilder.reset();
+ UTF8StringSerializerDeserializer.INSTANCE.serialize(nonExistingTokens.get(i), tokenDos);
+ tokenTupleBuilder.addFieldEndOffset();
+
+ tokenAppender.reset(frame, true);
+ tokenAppender.append(tokenTupleBuilder.getFieldEndOffsets(), tokenTupleBuilder.getByteArray(), 0, tokenTupleBuilder.getSize());
+
+ searchKey.reset(tokenAccessor, 0);
+
+ invIndex.openCursor(btreeCursor, btreePred, btreeOpCtx, invListCursor);
+
+ invListCursor.pinPagesSync();
+ Assert.assertEquals(invListCursor.hasNext(), false);
+ invListCursor.unpinPages();
+ }
- ISerializerDeserializer[] btreeSerde = { UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
- ISerializerDeserializer[] invListSerdes = { IntegerSerializerDeserializer.INSTANCE };
-
- System.out.println("ORDERED SCAN:");
- IBTreeCursor scanCursor = new RangeSearchCursor(leafFrame);
- RangePredicate nullPred = new RangePredicate(true, null, null, true,
- true, null, null);
- BTreeOpContext searchOpCtx = btree.createOpContext(TreeIndexOp.TI_SEARCH,
- leafFrame, interiorFrame, null);
- btree.search(scanCursor, nullPred, searchOpCtx);
- try {
- while (scanCursor.hasNext()) {
- scanCursor.next();
- ITupleReference frameTuple = scanCursor.getTuple();
- String rec = cmp.printTuple(frameTuple, btreeSerde);
- System.out.println(rec);
-
- int startPageId = IntegerSerializerDeserializer.getInt(frameTuple.getFieldData(1), frameTuple.getFieldStart(1));
- int endPageId = IntegerSerializerDeserializer.getInt(frameTuple.getFieldData(2), frameTuple.getFieldStart(2));
- int startOff = IntegerSerializerDeserializer.getInt(frameTuple.getFieldData(3), frameTuple.getFieldStart(3));
- int numElements = IntegerSerializerDeserializer.getInt(frameTuple.getFieldData(4), frameTuple.getFieldStart(4));
-
- invListCursor.reset(startPageId, endPageId, startOff, numElements);
- invListCursor.pinPagesSync();
- try {
- String invList = invListCursor.printInvList(invListSerdes);
- System.out.println(invList);
-
- for(int i = 0; i < numElements; i++) {
- invListCursor.positionCursor(i);
- String curr = invListCursor.printCurrentElement(invListSerdes);
- System.out.print(curr + " ");
- }
- System.out.println();
-
- ByteBuffer buf = ByteBuffer.allocate(4);
- IBinaryComparator intCmp = IntegerBinaryComparatorFactory.INSTANCE.createBinaryComparator();
- for(int i = 0; i < maxId; i++) {
- buf.putInt(0, i);
- invListCursor.reset(startPageId, endPageId, startOff, numElements);
- if(invListCursor.containsKey(buf.array(), 0, 4, intCmp)) {
- System.out.print(i + " ");
- }
- }
- System.out.println();
-
- } finally {
- invListCursor.unpinPages();
- }
- }
- } catch (Exception e) {
- e.printStackTrace();
- } finally {
- scanCursor.close();
- }
- */
-
btree.close();
bufferCache.closeFile(btreeFileId);
bufferCache.closeFile(invListsFileId);
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/FixedSizeFrameTupleTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/FixedSizeFrameTupleTest.java
index 3d295e1..d48f181 100644
--- a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/FixedSizeFrameTupleTest.java
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/FixedSizeFrameTupleTest.java
@@ -44,12 +44,10 @@
}
}
- ftacc.reset(buffer);
- System.out.println("TUPLECOUNT: " + ftacc.getTupleCount());
- System.out.println("CHECKCOUNT: " + check.size());
+ ftacc.reset(buffer);
for(int i = 0; i < ftacc.getTupleCount(); i++) {
- int val = IntegerSerializerDeserializer.INSTANCE.getInt(ftacc.getBuffer().array(), ftacc.getTupleStartOffset(i));
- Assert.assertEquals(check.get(i).intValue(), val);
+ int val = IntegerSerializerDeserializer.getInt(ftacc.getBuffer().array(), ftacc.getTupleStartOffset(i));
+ Assert.assertEquals(check.get(i).intValue(), val);
}
}
}
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SearchTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SearchTest.java
new file mode 100644
index 0000000..18c938f
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SearchTest.java
@@ -0,0 +1,297 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex;
+
+import java.io.DataOutput;
+import java.io.File;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import java.util.TreeSet;
+
+import org.junit.Test;
+
+import edu.uci.ics.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizer;
+import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
+import edu.uci.ics.fuzzyjoin.tokenizer.ITokenFactory;
+import edu.uci.ics.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
+import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
+import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
+import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
+import edu.uci.ics.hyracks.api.dataflow.value.TypeTrait;
+import edu.uci.ics.hyracks.api.io.FileReference;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.comparators.IntegerBinaryComparatorFactory;
+import edu.uci.ics.hyracks.dataflow.common.data.comparators.UTF8StringBinaryComparatorFactory;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrame;
+import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.frames.NSMInteriorFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.frames.NSMLeafFrameFactory;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
+import edu.uci.ics.hyracks.storage.am.common.api.IFreePageManager;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexMetaDataFrame;
+import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexMetaDataFrameFactory;
+import edu.uci.ics.hyracks.storage.am.common.frames.LIFOMetaDataFrameFactory;
+import edu.uci.ics.hyracks.storage.am.common.freepage.LinkedListFreePageManager;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.common.tuples.TypeAwareTupleWriterFactory;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListBuilder;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.FixedSizeElementInvertedListBuilder;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.InvertedIndex;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.SearchResultCursor;
+import edu.uci.ics.hyracks.storage.am.invertedindex.impls.TOccurrenceSearcher;
+import edu.uci.ics.hyracks.storage.am.invertedindex.searchmodifiers.ConjunctiveSearchModifier;
+import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
+import edu.uci.ics.hyracks.storage.common.file.IFileMapProvider;
+import edu.uci.ics.hyracks.test.support.TestStorageManagerComponentHolder;
+import edu.uci.ics.hyracks.test.support.TestUtils;
+
+public class SearchTest extends AbstractInvIndexTest {
+
+ //private static final int PAGE_SIZE = 256;
+ //private static final int NUM_PAGES = 100;
+
+ // private static final int PAGE_SIZE = 65536;
+ private static final int PAGE_SIZE = 32768;
+ private static final int NUM_PAGES = 100;
+ private static final int HYRACKS_FRAME_SIZE = 32768;
+ private IHyracksStageletContext stageletCtx = TestUtils.create(HYRACKS_FRAME_SIZE);
+
+ @Test
+ public void conjunctiveSearchTest() throws Exception {
+
+ TestStorageManagerComponentHolder.init(PAGE_SIZE, NUM_PAGES);
+ IBufferCache bufferCache = TestStorageManagerComponentHolder.getBufferCache(stageletCtx);
+ IFileMapProvider fmp = TestStorageManagerComponentHolder.getFileMapProvider(stageletCtx);
+
+ // create file refs
+ System.out.println(btreeFileName);
+ FileReference btreeFile = new FileReference(new File(btreeFileName));
+ bufferCache.createFile(btreeFile);
+ int btreeFileId = fmp.lookupFileId(btreeFile);
+ bufferCache.openFile(btreeFileId);
+
+ System.out.println(invListsFileName);
+ FileReference invListsFile = new FileReference(new File(invListsFileName));
+ bufferCache.createFile(invListsFile);
+ int invListsFileId = fmp.lookupFileId(invListsFile);
+ bufferCache.openFile(invListsFileId);
+
+ // declare btree fields
+ int fieldCount = 5;
+ ITypeTrait[] typeTraits = new ITypeTrait[fieldCount];
+ // token (key)
+ typeTraits[0] = new TypeTrait(ITypeTrait.VARIABLE_LENGTH);
+ // startPageId
+ typeTraits[1] = new TypeTrait(4);
+ // endPageId
+ typeTraits[2] = new TypeTrait(4);
+ // startOff
+ typeTraits[3] = new TypeTrait(4);
+ // numElements
+ typeTraits[4] = new TypeTrait(4);
+
+ // declare btree keys
+ int keyFieldCount = 1;
+ IBinaryComparator[] cmps = new IBinaryComparator[keyFieldCount];
+ cmps[0] = UTF8StringBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+
+ MultiComparator cmp = new MultiComparator(typeTraits, cmps);
+
+ TypeAwareTupleWriterFactory tupleWriterFactory = new TypeAwareTupleWriterFactory(typeTraits);
+ IBTreeLeafFrameFactory leafFrameFactory = new NSMLeafFrameFactory(tupleWriterFactory);
+ // IBTreeLeafFrameFactory leafFrameFactory = new
+ // FieldPrefixNSMLeafFrameFactory(tupleWriterFactory);
+ IBTreeInteriorFrameFactory interiorFrameFactory = new NSMInteriorFrameFactory(tupleWriterFactory);
+ ITreeIndexMetaDataFrameFactory metaFrameFactory = new LIFOMetaDataFrameFactory();
+
+ IBTreeLeafFrame leafFrame = leafFrameFactory.getFrame();
+ ITreeIndexMetaDataFrame metaFrame = metaFrameFactory.getFrame();
+
+ IFreePageManager freePageManager = new LinkedListFreePageManager(bufferCache, btreeFileId, 0, metaFrameFactory);
+
+ BTree btree = new BTree(bufferCache, freePageManager, interiorFrameFactory, leafFrameFactory, cmp);
+ btree.create(btreeFileId, leafFrame, metaFrame);
+ btree.open(btreeFileId);
+
+ int invListFields = 1;
+ ITypeTrait[] invListTypeTraits = new ITypeTrait[invListFields];
+ invListTypeTraits[0] = new TypeTrait(4);
+
+ int invListKeys = 1;
+ IBinaryComparator[] invListBinCmps = new IBinaryComparator[invListKeys];
+ invListBinCmps[0] = IntegerBinaryComparatorFactory.INSTANCE.createBinaryComparator();
+
+ MultiComparator invListCmp = new MultiComparator(invListTypeTraits, invListBinCmps);
+
+ InvertedIndex invIndex = new InvertedIndex(bufferCache, btree, invListCmp);
+ invIndex.open(invListsFileId);
+
+ Random rnd = new Random();
+ rnd.setSeed(50);
+
+ ByteBuffer frame = stageletCtx.allocateFrame();
+ FrameTupleAppender appender = new FrameTupleAppender(stageletCtx.getFrameSize());
+ ArrayTupleBuilder tb = new ArrayTupleBuilder(2);
+ DataOutput dos = tb.getDataOutput();
+
+ ISerializerDeserializer[] insertSerde = { UTF8StringSerializerDeserializer.INSTANCE,
+ IntegerSerializerDeserializer.INSTANCE };
+ RecordDescriptor insertRecDesc = new RecordDescriptor(insertSerde);
+ IFrameTupleAccessor accessor = new FrameTupleAccessor(stageletCtx.getFrameSize(), insertRecDesc);
+ accessor.reset(frame);
+ FrameTupleReference tuple = new FrameTupleReference();
+
+ List<String> tokens = new ArrayList<String>();
+ tokens.add("compilers");
+ tokens.add("computer");
+ tokens.add("databases");
+ tokens.add("fast");
+ tokens.add("hyracks");
+ tokens.add("major");
+ tokens.add("science");
+ tokens.add("systems");
+ tokens.add("university");
+
+ ArrayList<TreeSet<Integer>> checkSets = new ArrayList<TreeSet<Integer>>();
+ for(int i = 0; i < tokens.size(); i++) {
+ checkSets.add(new TreeSet<Integer>());
+ }
+
+ int maxId = 1000000;
+ int addProb = 0;
+ int addProbStep = 10;
+
+ IInvertedListBuilder invListBuilder = new FixedSizeElementInvertedListBuilder(invListTypeTraits);
+ InvertedIndex.BulkLoadContext ctx = invIndex.beginBulkLoad(invListBuilder, HYRACKS_FRAME_SIZE);
+
+ int totalElements = 0;
+ for (int i = 0; i < tokens.size(); i++) {
+
+ addProb += addProbStep * (i+1);
+ for (int j = 0; j < maxId; j++) {
+ if ((Math.abs(rnd.nextInt()) % addProb) == 0) {
+
+ totalElements++;
+
+ tb.reset();
+ UTF8StringSerializerDeserializer.INSTANCE.serialize(tokens.get(i), dos);
+ tb.addFieldEndOffset();
+ IntegerSerializerDeserializer.INSTANCE.serialize(j, dos);
+ tb.addFieldEndOffset();
+
+ checkSets.get(i).add(j);
+
+ appender.reset(frame, true);
+ appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize());
+
+ tuple.reset(accessor, 0);
+
+ try {
+ invIndex.bulkLoadAddTuple(ctx, tuple);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+ invIndex.endBulkLoad(ctx);
+
+ // --------------------------- TEST A
+ // build query as tuple reference
+ ISerializerDeserializer[] querySerde = { UTF8StringSerializerDeserializer.INSTANCE };
+ RecordDescriptor queryRecDesc = new RecordDescriptor(querySerde);
+
+ FrameTupleAppender queryAppender = new FrameTupleAppender(stageletCtx.getFrameSize());
+ ArrayTupleBuilder queryTb = new ArrayTupleBuilder(querySerde.length);
+ DataOutput queryDos = queryTb.getDataOutput();
+
+ IFrameTupleAccessor queryAccessor = new FrameTupleAccessor(stageletCtx.getFrameSize(), queryRecDesc);
+ queryAccessor.reset(frame);
+ FrameTupleReference queryTuple = new FrameTupleReference();
+
+ ITokenFactory tokenFactory = new UTF8WordTokenFactory();
+ IBinaryTokenizer queryTokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
+
+
+
+ TOccurrenceSearcher searcher = new TOccurrenceSearcher(stageletCtx, invIndex, queryTokenizer);
+ IInvertedIndexSearchModifier searchModifier = new ConjunctiveSearchModifier();
+ IInvertedIndexResultCursor resultCursor = new SearchResultCursor(searcher.createResultFrameTupleAccessor(), searcher.createResultTupleReference());
+
+ // generate random queries
+ int queries = 100;
+ int[] queryTokenIndexes = new int[tokens.size()];
+ for(int i = 0; i < queries; i++) {
+ int numQueryTokens = Math.abs(rnd.nextInt() % tokens.size()) + 1;
+ for(int j = 0; j < numQueryTokens; j++) {
+ queryTokenIndexes[j] = Math.abs(rnd.nextInt() % tokens.size());
+ }
+
+ StringBuilder strBuilder = new StringBuilder();
+ for(int j = 0; j < numQueryTokens; j++) {
+ strBuilder.append(tokens.get(queryTokenIndexes[j]));
+ if(j+1 != numQueryTokens) strBuilder.append(" ");
+ }
+
+ String queryString = strBuilder.toString();
+ //String queryString = "major";
+
+ queryTb.reset();
+ UTF8StringSerializerDeserializer.INSTANCE.serialize(queryString, queryDos);
+ queryTb.addFieldEndOffset();
+
+ queryAppender.reset(frame, true);
+ queryAppender.append(queryTb.getFieldEndOffsets(), queryTb.getByteArray(), 0, queryTb.getSize());
+ queryTuple.reset(queryAccessor, 0);
+
+ int repeats = 1;
+ double totalTime = 0;
+ for(int j = 0; j < repeats; j++) {
+ long timeStart = System.currentTimeMillis();
+ searcher.reset();
+ searcher.search(resultCursor, queryTuple, 0, searchModifier);
+ long timeEnd = System.currentTimeMillis();
+ totalTime += timeEnd - timeStart;
+ }
+ double avgTime = totalTime / (double)repeats;
+ System.out.println("\"" + queryString + "\": " + avgTime + "ms");
+
+ // generate intersection for verification
+ TreeSet<Integer> checkResults = new TreeSet<Integer>(checkSets.get(queryTokenIndexes[0]));
+ for(int j = 1; j < numQueryTokens; j++) {
+ checkResults.retainAll(checkSets.get(queryTokenIndexes[j]));
+ }
+ Integer[] check = new Integer[checkResults.size()];
+ check = checkResults.toArray(check);
+
+ // verify results
+ int checkIndex = 0;
+ while(resultCursor.hasNext()) {
+ resultCursor.next();
+ ITupleReference resultTuple = resultCursor.getTuple();
+ int id = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0), resultTuple.getFieldStart(0));
+ //Assert.assertEquals(id, check[checkIndex].intValue());
+ checkIndex++;
+ }
+
+ //System.out.println("RESULTS: " + check.length + " " + checkIndex);
+ }
+
+ btree.close();
+ bufferCache.closeFile(btreeFileId);
+ bufferCache.closeFile(invListsFileId);
+ bufferCache.close();
+ }
+}
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SimpleConjunctiveSearcherTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SimpleConjunctiveSearcherTest.java
deleted file mode 100644
index 1ebb2f3..0000000
--- a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/SimpleConjunctiveSearcherTest.java
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex;
-
-import java.io.ByteArrayInputStream;
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.File;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Random;
-
-import org.junit.Test;
-
-import edu.uci.ics.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizer;
-import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
-import edu.uci.ics.fuzzyjoin.tokenizer.ITokenFactory;
-import edu.uci.ics.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
-import edu.uci.ics.hyracks.api.comm.IFrameTupleAccessor;
-import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
-import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
-import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
-import edu.uci.ics.hyracks.api.dataflow.value.ITypeTrait;
-import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
-import edu.uci.ics.hyracks.api.dataflow.value.TypeTrait;
-import edu.uci.ics.hyracks.api.io.FileReference;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAccessor;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.FrameTupleAppender;
-import edu.uci.ics.hyracks.dataflow.common.data.accessors.FrameTupleReference;
-import edu.uci.ics.hyracks.dataflow.common.data.comparators.IntegerBinaryComparatorFactory;
-import edu.uci.ics.hyracks.dataflow.common.data.comparators.UTF8StringBinaryComparatorFactory;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
-import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrame;
-import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeInteriorFrameFactory;
-import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrame;
-import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeLeafFrameFactory;
-import edu.uci.ics.hyracks.storage.am.btree.frames.NSMInteriorFrameFactory;
-import edu.uci.ics.hyracks.storage.am.btree.frames.NSMLeafFrameFactory;
-import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
-import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOpContext;
-import edu.uci.ics.hyracks.storage.am.common.api.IFreePageManager;
-import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexMetaDataFrame;
-import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexMetaDataFrameFactory;
-import edu.uci.ics.hyracks.storage.am.common.frames.LIFOMetaDataFrameFactory;
-import edu.uci.ics.hyracks.storage.am.common.freepage.LinkedListFreePageManager;
-import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
-import edu.uci.ics.hyracks.storage.am.common.ophelpers.TreeIndexOp;
-import edu.uci.ics.hyracks.storage.am.common.tuples.TypeAwareTupleWriterFactory;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexResultCursor;
-import edu.uci.ics.hyracks.storage.am.invertedindex.impls.SimpleConjunctiveSearcher;
-import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
-import edu.uci.ics.hyracks.storage.common.file.IFileMapProvider;
-import edu.uci.ics.hyracks.test.support.TestStorageManagerComponentHolder;
-import edu.uci.ics.hyracks.test.support.TestUtils;
-
-public class SimpleConjunctiveSearcherTest extends AbstractInvIndexTest {
-
- // testing params
- // private static final int PAGE_SIZE = 256;
- // private static final int NUM_PAGES = 10;
- // private static final int HYRACKS_FRAME_SIZE = 256;
-
- // realistic params
- // private static final int PAGE_SIZE = 65536;
- private static final int PAGE_SIZE = 32768;
- private static final int NUM_PAGES = 10;
- private static final int HYRACKS_FRAME_SIZE = 32768;
- private IHyracksStageletContext ctx = TestUtils.create(HYRACKS_FRAME_SIZE);
-
- @Test
- public void test01() throws Exception {
-
- TestStorageManagerComponentHolder.init(PAGE_SIZE, NUM_PAGES);
- IBufferCache bufferCache = TestStorageManagerComponentHolder.getBufferCache(ctx);
- IFileMapProvider fmp = TestStorageManagerComponentHolder.getFileMapProvider(ctx);
- FileReference file = new FileReference(new File(btreeFileName));
- bufferCache.createFile(file);
- int fileId = fmp.lookupFileId(file);
- bufferCache.openFile(fileId);
-
- // declare fields
- int fieldCount = 2;
- ITypeTrait[] typeTraits = new ITypeTrait[fieldCount];
- typeTraits[0] = new TypeTrait(ITypeTrait.VARIABLE_LENGTH);
- typeTraits[1] = new TypeTrait(4);
-
- // declare keys
- int keyFieldCount = 2;
- IBinaryComparator[] cmps = new IBinaryComparator[keyFieldCount];
- cmps[0] = UTF8StringBinaryComparatorFactory.INSTANCE.createBinaryComparator();
- cmps[1] = IntegerBinaryComparatorFactory.INSTANCE.createBinaryComparator();
-
- MultiComparator cmp = new MultiComparator(typeTraits, cmps);
-
- TypeAwareTupleWriterFactory tupleWriterFactory = new TypeAwareTupleWriterFactory(typeTraits);
- IBTreeLeafFrameFactory leafFrameFactory = new NSMLeafFrameFactory(tupleWriterFactory);
- // IBTreeLeafFrameFactory leafFrameFactory = new
- // FieldPrefixNSMLeafFrameFactory(tupleWriterFactory);
- IBTreeInteriorFrameFactory interiorFrameFactory = new NSMInteriorFrameFactory(tupleWriterFactory);
- ITreeIndexMetaDataFrameFactory metaFrameFactory = new LIFOMetaDataFrameFactory();
-
- IBTreeLeafFrame leafFrame = leafFrameFactory.getFrame();
- IBTreeInteriorFrame interiorFrame = interiorFrameFactory.getFrame();
- ITreeIndexMetaDataFrame metaFrame = metaFrameFactory.getFrame();
-
- IFreePageManager freePageManager = new LinkedListFreePageManager(bufferCache, fileId, 0, metaFrameFactory);
-
- BTree btree = new BTree(bufferCache, freePageManager, interiorFrameFactory, leafFrameFactory, cmp);
- btree.create(fileId, leafFrame, metaFrame);
- btree.open(fileId);
-
- Random rnd = new Random();
- rnd.setSeed(50);
-
- ByteBuffer frame = ctx.allocateFrame();
- FrameTupleAppender appender = new FrameTupleAppender(ctx.getFrameSize());
- ArrayTupleBuilder tb = new ArrayTupleBuilder(cmp.getFieldCount());
- DataOutput dos = tb.getDataOutput();
-
- ISerializerDeserializer[] btreeSerde = { UTF8StringSerializerDeserializer.INSTANCE,
- IntegerSerializerDeserializer.INSTANCE };
- RecordDescriptor btreeRecDesc = new RecordDescriptor(btreeSerde);
- IFrameTupleAccessor accessor = new FrameTupleAccessor(ctx.getFrameSize(), btreeRecDesc);
- accessor.reset(frame);
- FrameTupleReference tuple = new FrameTupleReference();
-
- List<String> tokens = new ArrayList<String>();
- tokens.add("computer");
- tokens.add("hyracks");
- tokens.add("fast");
- tokens.add("university");
- tokens.add("science");
- tokens.add("major");
-
- int maxId = 10000;
- int addProb = 0;
- int addProbStep = 2;
-
- BTreeOpContext opCtx = btree.createOpContext(TreeIndexOp.TI_INSERT, leafFrame, interiorFrame, metaFrame);
-
- for (int i = 0; i < tokens.size(); i++) {
-
- addProb += addProbStep;
- for (int j = 0; j < maxId; j++) {
- if ((Math.abs(rnd.nextInt()) % addProb) == 0) {
- tb.reset();
- UTF8StringSerializerDeserializer.INSTANCE.serialize(tokens.get(i), dos);
- tb.addFieldEndOffset();
- IntegerSerializerDeserializer.INSTANCE.serialize(j, dos);
- tb.addFieldEndOffset();
-
- appender.reset(frame, true);
- appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize());
-
- tuple.reset(accessor, 0);
-
- try {
- btree.insert(tuple, opCtx);
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- }
- }
-
- int numPages = btree.getFreePageManager().getMaxPage(metaFrame);
- System.out.println("NUMPAGES: " + numPages);
-
- // build query as tuple reference
- ISerializerDeserializer[] querySerde = { UTF8StringSerializerDeserializer.INSTANCE };
- RecordDescriptor queryRecDesc = new RecordDescriptor(querySerde);
-
- FrameTupleAppender queryAppender = new FrameTupleAppender(ctx.getFrameSize());
- ArrayTupleBuilder queryTb = new ArrayTupleBuilder(querySerde.length);
- DataOutput queryDos = queryTb.getDataOutput();
-
- IFrameTupleAccessor queryAccessor = new FrameTupleAccessor(ctx.getFrameSize(), queryRecDesc);
- queryAccessor.reset(frame);
- FrameTupleReference queryTuple = new FrameTupleReference();
-
- String query = "computer hyracks fast";
-
- ITokenFactory tokenFactory = new UTF8WordTokenFactory();
- IBinaryTokenizer queryTokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
-
- queryTb.reset();
- UTF8StringSerializerDeserializer.INSTANCE.serialize(query, queryDos);
- queryTb.addFieldEndOffset();
-
- queryAppender.reset(frame, true);
- queryAppender.append(queryTb.getFieldEndOffsets(), queryTb.getByteArray(), 0, queryTb.getSize());
- queryTuple.reset(queryAccessor, 0);
-
- int numKeyFields = 1;
- int numValueFields = 1;
- ISerializerDeserializer[] resultSerde = new ISerializerDeserializer[numValueFields];
- for (int i = 0; i < numValueFields; i++) {
- resultSerde[i] = btreeSerde[numKeyFields + i];
- }
- RecordDescriptor resultRecDesc = new RecordDescriptor(resultSerde);
- FrameTupleAccessor resultAccessor = new FrameTupleAccessor(ctx.getFrameSize(), resultRecDesc);
- FrameTupleReference resultTuple = new FrameTupleReference();
-
- SimpleConjunctiveSearcher searcher = new SimpleConjunctiveSearcher(ctx, btree, btreeRecDesc, queryTokenizer,
- numKeyFields, numValueFields);
-
- long timeStart = System.currentTimeMillis();
- searcher.search(queryTuple, 0);
- long timeEnd = System.currentTimeMillis();
- System.out.println("SEARCH TIME: " + (timeEnd - timeStart) + "ms");
-
- // System.out.println("INTERSECTION RESULTS");
- IInvertedIndexResultCursor resultCursor = searcher.getResultCursor();
- while (resultCursor.hasNext()) {
- resultCursor.next();
- resultAccessor.reset(resultCursor.getBuffer());
- for (int i = 0; i < resultAccessor.getTupleCount(); i++) {
- resultTuple.reset(resultAccessor, i);
- for (int j = 0; j < resultTuple.getFieldCount(); j++) {
- ByteArrayInputStream inStream = new ByteArrayInputStream(resultTuple.getFieldData(j), resultTuple
- .getFieldStart(j), resultTuple.getFieldLength(j));
- DataInput dataIn = new DataInputStream(inStream);
- Object o = resultSerde[j].deserialize(dataIn);
- System.out.print(o + " ");
- }
- System.out.println();
- }
- }
-
-//
-// IBinaryComparator[] searchCmps = new IBinaryComparator[1];
-// searchCmps[0] =
-// UTF8StringBinaryComparatorFactory.INSTANCE.createBinaryComparator();
-// MultiComparator searchCmp = new MultiComparator(typeTraits,
-// searchCmps);
-//
-// // ordered scan IBTreeCursor scanCursor = new
-// RangeSearchCursor(leafFrame); RangePredicate nullPred = new
-// RangePredicate(true, null, null, true, true, null); BTreeOpContext
-// searchOpCtx = btree.createOpContext(BTreeOp.BTO_SEARCH, leafFrame,
-// interiorFrame, metaFrame); btree.search(scanCursor, nullPred,
-// searchOpCtx);
-//
-// try { while (scanCursor.hasNext()) { scanCursor.next();
-// ITupleReference frameTuple = scanCursor.getTuple(); String rec =
-// cmp.printTuple(frameTuple, btreeSerde); System.out.println(rec); } }
-// catch (Exception e) { e.printStackTrace(); } finally {
-// scanCursor.close(); }
-
-
- btree.close();
- bufferCache.closeFile(fileId);
- bufferCache.close();
- }
-}
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java
deleted file mode 100644
index 7181b77..0000000
--- a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/tokenizers/TokenizerTest.java
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers;
-
-import java.io.ByteArrayInputStream;
-import java.io.DataInput;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
-import java.util.Random;
-
-import org.junit.Assert;
-import org.junit.Test;
-
-import edu.uci.ics.fuzzyjoin.tokenizer.DelimitedUTF8StringBinaryTokenizer;
-import edu.uci.ics.fuzzyjoin.tokenizer.IToken;
-import edu.uci.ics.fuzzyjoin.tokenizer.ITokenFactory;
-import edu.uci.ics.fuzzyjoin.tokenizer.UTF8WordTokenFactory;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.ByteArrayAccessibleOutputStream;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
-
-public class TokenizerTest {
-
- // testing DelimitedUTF8StringBinaryTokenizer
- @Test
- public void test01() throws Exception {
- Random rnd = new Random(50);
-
- int numDocs = 100;
- int maxWords = 1000;
- int maxWordLength = 50;
- char delimiter = ' ';
-
- ITokenFactory tokenFactory = new UTF8WordTokenFactory();
- DelimitedUTF8StringBinaryTokenizer tok = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
-
- // create a bunch of documents
- for (int i = 0; i < numDocs; i++) {
-
- // create a single document with a bunch of words
- int words = (Math.abs(rnd.nextInt()) % maxWords) + 1;
- StringBuilder strBuilder = new StringBuilder();
- for (int j = 0; j < words; j++) {
- int len = (Math.abs(rnd.nextInt()) % maxWordLength) + 1;
- String s = randomString(len, rnd);
- strBuilder.append(s);
- if (j < words - 1)
- strBuilder.append(delimiter);
- }
-
- String doc = strBuilder.toString();
-
- // serialize document into baaos
- ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
- DataOutputStream dos = new DataOutputStream(baaos);
- UTF8StringSerializerDeserializer.INSTANCE.serialize(doc, dos);
- byte[] data = baaos.toByteArray();
-
- // use binary tokenizer and compare with Java tokenizer
- String[] cmpTokens = doc.split(new String(new char[] { delimiter }));
- int cmpCounter = 0;
-
- tok.reset(data, 0, data.length);
- while (tok.hasNext()) {
- tok.next();
-
- // write token to outputstream
- ByteArrayAccessibleOutputStream baaosWrite = new ByteArrayAccessibleOutputStream();
- DataOutputStream dosWrite = new DataOutputStream(baaosWrite);
- IToken token = tok.getToken();
- token.serializeToken(dosWrite);
-
- // deserialize token to get string object
- ByteArrayInputStream inStream = new ByteArrayInputStream(baaosWrite.toByteArray());
- DataInput dataIn = new DataInputStream(inStream);
- String s = UTF8StringSerializerDeserializer.INSTANCE.deserialize(dataIn);
-
- Assert.assertEquals(s, cmpTokens[cmpCounter++]);
- }
- }
- }
-
- /*
- // testing HashedNGramUTF8StringBinaryTokenizer
- @Test
- public void test02() throws Exception {
- Random rnd = new Random(50);
-
- int numStrings = 1000;
- int maxStrLen = 100;
- int minQ = 2;
- int maxQ = 10;
-
- // we test the correctness of HashedQGramUTF8StringBinaryTokenizer as
- // follows:
- // 1.1. tokenize the string into q-gram strings
- // 1.2. serialize q-gram strings into bytes
- // 1.3. compute hashed gram with UTF8StringBinaryHashFunctionFactory
- // 2.1. serialize string into bytes
- // 2.2. tokenize serialized string into hashed q-grams
- // 2.3. test whether hashed grams from 1.3. and 2.3. are equal
- for (int i = 0; i < numStrings; i++) {
- int q = (Math.abs(rnd.nextInt()) % (maxQ - minQ)) + minQ;
- int strLen = (Math.abs(rnd.nextInt()) % (maxStrLen - q)) + q;
- String str = randomString(strLen, rnd);
-
- // randomly choose pre and postfixing
- boolean prePost = false;
- //if (Math.abs(rnd.nextInt()) % 2 == 0)
- // prePost = true;
-
- ITokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
- NGramUTF8StringBinaryTokenizer qgramTok = new NGramUTF8StringBinaryTokenizer(q, prePost, true, false, tokenFactory);
-
- // generate q-grams in deserialized form
- ArrayList<String> javaGrams = new ArrayList<String>();
- for (int j = 0; j < str.length() - q + 1; j++) {
- javaGrams.add(str.substring(j, j + q).toLowerCase());
- }
-
- // serialize string for use in binary gram tokenizer
- ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
- DataOutputStream dos = new DataOutputStream(baaos);
- UTF8StringSerializerDeserializer.INSTANCE.serialize(str, dos);
- byte[] data = baaos.toByteArray();
-
- qgramTok.reset(data, 0, data.length);
-
- int counter = 0;
- while (qgramTok.hasNext()) {
- qgramTok.next();
-
- // write token to outputstream
- ByteArrayAccessibleOutputStream baaosWrite = new ByteArrayAccessibleOutputStream();
- DataOutputStream dosWrite = new DataOutputStream(baaosWrite);
- IToken token = qgramTok.getToken();
- token.serializeToken(dosWrite);
-
- // deserialize token to get hashed gram
- ByteArrayInputStream inStream = new ByteArrayInputStream(baaosWrite.toByteArray());
- DataInput dataIn = new DataInputStream(inStream);
- Integer binHashedGram = IntegerSerializerDeserializer.INSTANCE.deserialize(dataIn);
-
- // create hashed gram to test against
- ByteArrayAccessibleOutputStream baaosCmp = new ByteArrayAccessibleOutputStream();
- DataOutputStream dosCmp = new DataOutputStream(baaosCmp);
- UTF8StringSerializerDeserializer.INSTANCE.serialize(javaGrams.get(counter), dosCmp);
-
- IBinaryHashFunction strHasher = UTF8StringBinaryHashFunctionFactory.INSTANCE.createBinaryHashFunction();
- byte[] cmpData = baaosCmp.toByteArray();
- int cmpHash = strHasher.hash(cmpData, 0, cmpData.length);
-
- Assert.assertEquals(binHashedGram.intValue(), cmpHash);
-
- counter++;
- }
- }
- }
- */
-
- public static String randomString(int length, Random random) {
- int maxAttempts = 1000;
- int count = 0;
- while (count < maxAttempts) {
- String s = Long.toHexString(Double.doubleToLongBits(random.nextDouble()));
- StringBuilder strBuilder = new StringBuilder();
- for (int i = 0; i < s.length() && i < length; i++) {
- strBuilder.append(s.charAt(Math.abs(random.nextInt()) % s.length()));
- }
- if (strBuilder.length() > 0)
- return strBuilder.toString();
- count++;
- }
- return "abc";
- }
-}