Improved inverted index search performance. Added searcher variants for performance comparison.
git-svn-id: https://hyracks.googlecode.com/svn/branches/hyracks_indexes@430 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java
index 779cfbf..d181191 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/InvertedIndex.java
@@ -5,7 +5,6 @@
import java.io.DataInputStream;
import java.nio.ByteBuffer;
-import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
import edu.uci.ics.hyracks.api.dataflow.value.RecordDescriptor;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
@@ -18,7 +17,6 @@
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
import edu.uci.ics.hyracks.storage.am.btree.api.IBTreeCursor;
-import edu.uci.ics.hyracks.storage.am.btree.dataflow.PermutingFrameTupleReference;
import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
import edu.uci.ics.hyracks.storage.am.btree.impls.BTreeOpContext;
import edu.uci.ics.hyracks.storage.am.btree.impls.RangePredicate;
@@ -132,13 +130,7 @@
}
public boolean openCursor(IBTreeCursor btreeCursor, RangePredicate btreePred, BTreeOpContext btreeOpCtx, IInvertedListCursor invListCursor) throws Exception {
- btree.search(btreeCursor, btreePred, btreeOpCtx);
-
- UTF8StringSerializerDeserializer serde = UTF8StringSerializerDeserializer.INSTANCE;
- ByteArrayInputStream inStream = new ByteArrayInputStream(btreePred.getHighKey().getFieldData(0), btreePred.getHighKey().getFieldStart(0), btreePred.getHighKey().getFieldLength(0));
- DataInput dataIn = new DataInputStream(inStream);
- Object o = serde.deserialize(dataIn);
- System.out.println(o.toString());
+ btree.search(btreeCursor, btreePred, btreeOpCtx);
boolean ret = false;
if(btreeCursor.hasNext()) {
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
index 618c61b..03fd3bc 100644
--- a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcher.java
@@ -49,32 +49,33 @@
public class TOccurrenceSearcher {
- private final IHyracksStageletContext ctx;
- private final FixedSizeFrameTupleAppender resultFrameTupleApp;
- private final FixedSizeFrameTupleAccessor resultFrameTupleAcc;
- private final FixedSizeTupleReference resultTuple;
- private final int invListKeyLength;
+ protected final IHyracksStageletContext ctx;
+ protected final FixedSizeFrameTupleAppender resultFrameTupleApp;
+ protected final FixedSizeFrameTupleAccessor resultFrameTupleAcc;
+ protected final FixedSizeTupleReference resultTuple;
+ protected final int invListKeyLength;
+ protected int currentNumResults;
- private List<ByteBuffer> newResultBuffers = new ArrayList<ByteBuffer>();
- private List<ByteBuffer> prevResultBuffers = new ArrayList<ByteBuffer>();
- private List<ByteBuffer> swap = null;
- private final ListResultCursor resultCursor = new ListResultCursor();
- private int maxResultBufIdx = 0;
+ protected List<ByteBuffer> newResultBuffers = new ArrayList<ByteBuffer>();
+ protected List<ByteBuffer> prevResultBuffers = new ArrayList<ByteBuffer>();
+ protected List<ByteBuffer> swap = null;
+ protected final ListResultCursor resultCursor = new ListResultCursor();
+ protected int maxResultBufIdx = 0;
- private final IBTreeLeafFrame leafFrame;
- private final IBTreeInteriorFrame interiorFrame;
- private final IBTreeCursor btreeCursor;
- private final FrameTupleReference searchKey = new FrameTupleReference();
- private final RangePredicate btreePred = new RangePredicate(true, null, null, true, true, null, null);
+ protected final IBTreeLeafFrame leafFrame;
+ protected final IBTreeInteriorFrame interiorFrame;
+ protected final IBTreeCursor btreeCursor;
+ protected final FrameTupleReference searchKey = new FrameTupleReference();
+ protected final RangePredicate btreePred = new RangePredicate(true, null, null, true, true, null, null);
- private final InvertedIndex invIndex;
- private final IBinaryTokenizer queryTokenizer;
- private int occurrenceThreshold;
+ protected final InvertedIndex invIndex;
+ protected final IBinaryTokenizer queryTokenizer;
+ protected int occurrenceThreshold;
- private final int cursorCacheSize = 10;
- private ArrayList<IInvertedListCursor> invListCursorCache = new ArrayList<IInvertedListCursor>(cursorCacheSize);
- private ArrayList<IInvertedListCursor> invListCursors = new ArrayList<IInvertedListCursor>(cursorCacheSize);
-
+ protected final int cursorCacheSize = 10;
+ protected ArrayList<IInvertedListCursor> invListCursorCache = new ArrayList<IInvertedListCursor>(cursorCacheSize);
+ protected ArrayList<IInvertedListCursor> invListCursors = new ArrayList<IInvertedListCursor>(cursorCacheSize);
+
public TOccurrenceSearcher(IHyracksStageletContext ctx, InvertedIndex invIndex, IBinaryTokenizer queryTokenizer) {
this.ctx = ctx;
this.invIndex = invIndex;
@@ -112,8 +113,21 @@
invListCursorCache.add(new FixedSizeElementInvertedListCursor(invIndex.getBufferCache(), invIndex
.getInvListsFileId(), invIndex.getInvListElementCmp().getTypeTraits()));
}
+
+ currentNumResults = 0;
}
+ public void reset() {
+ for(ByteBuffer b : newResultBuffers) {
+ resultFrameTupleApp.reset(b, true);
+ }
+ for(ByteBuffer b : prevResultBuffers) {
+ resultFrameTupleApp.reset(b, true);
+ }
+ currentNumResults = 0;
+ }
+
+
public void search(ITupleReference queryTuple, int queryFieldIndex) throws Exception {
// parse query, TODO: this parsing is too simple
@@ -170,9 +184,11 @@
}
Collections.sort(invListCursors);
+ /*
for(int i = 0; i < numQueryTokens; i++) {
System.out.println("SIZE: " + i + " " + invListCursors.get(i).getNumElements());
}
+ */
occurrenceThreshold = numQueryTokens;
@@ -182,26 +198,27 @@
maxPrevBufIdx = mergeSuffixLists(numPrefixTokens, numQueryTokens, maxPrevBufIdx);
/*
+ StringBuffer strBuffer = new StringBuffer();
for(int i = 0; i <= maxPrevBufIdx; i++) {
ByteBuffer testBuf = newResultBuffers.get(i);
resultFrameTupleAcc.reset(testBuf);
for(int j = 0; j < resultFrameTupleAcc.getTupleCount(); j++) {
- System.out.print(IntegerSerializerDeserializer.getInt(resultFrameTupleAcc.getBuffer().array(), resultFrameTupleAcc.getFieldStartOffset(j, 0)) + ",");
- System.out.print(IntegerSerializerDeserializer.getInt(resultFrameTupleAcc.getBuffer().array(), resultFrameTupleAcc.getFieldStartOffset(j, 1)) + " ");
+ strBuffer.append(IntegerSerializerDeserializer.getInt(resultFrameTupleAcc.getBuffer().array(), resultFrameTupleAcc.getFieldStartOffset(j, 0)) + ",");
+ strBuffer.append(IntegerSerializerDeserializer.getInt(resultFrameTupleAcc.getBuffer().array(), resultFrameTupleAcc.getFieldStartOffset(j, 1)) + " ");
}
}
- System.out.println();
- */
+ System.out.println(strBuffer.toString());
+ */
}
- private int mergePrefixLists(int numPrefixTokens, int numQueryTokens) throws IOException {
- resultFrameTupleApp.reset(newResultBuffers.get(0), true);
+ protected int mergePrefixLists(int numPrefixTokens, int numQueryTokens) throws IOException {
int maxPrevBufIdx = 0;
for(int i = 0; i < numPrefixTokens; i++) {
swap = prevResultBuffers;
prevResultBuffers = newResultBuffers;
newResultBuffers = swap;
+ currentNumResults = 0;
invListCursors.get(i).pinPagesSync();
maxPrevBufIdx = mergePrefixList(invListCursors.get(i), prevResultBuffers, maxPrevBufIdx, newResultBuffers);
@@ -211,13 +228,75 @@
return maxPrevBufIdx;
}
- private int mergeSuffixLists(int numPrefixTokens, int numQueryTokens, int maxPrevBufIdx) throws IOException {
+ protected int mergeSuffixLists(int numPrefixTokens, int numQueryTokens, int maxPrevBufIdx) throws IOException {
+ for(int i = numPrefixTokens; i < numQueryTokens; i++) {
+ swap = prevResultBuffers;
+ prevResultBuffers = newResultBuffers;
+ newResultBuffers = swap;
+
+ invListCursors.get(i).pinPagesSync();
+ int numInvListElements = invListCursors.get(i).getNumElements();
+ // should we binary search the next list or should we sort-merge it?
+ if(currentNumResults * Math.log(numInvListElements) < currentNumResults + numInvListElements) {
+ //System.out.println("PROBING LIST: " + i);
+ maxPrevBufIdx = mergeSuffixListProbe(invListCursors.get(i), prevResultBuffers, maxPrevBufIdx, newResultBuffers, i, numQueryTokens);
+ }
+ else {
+ //System.out.println("SCANNING LIST: " + i);
+ maxPrevBufIdx = mergeSuffixListScan(invListCursors.get(i), prevResultBuffers, maxPrevBufIdx, newResultBuffers, i, numQueryTokens);
+ }
+ invListCursors.get(i).unpinPages();
+ }
+ return maxPrevBufIdx;
+ }
+
+ protected int mergeSuffixListProbe(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers, int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws IOException {
- swap = prevResultBuffers;
- prevResultBuffers = newResultBuffers;
- newResultBuffers = swap;
+ int newBufIdx = 0;
+ ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
+
+ int prevBufIdx = 0;
+ ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
+
+ int resultTidx = 0;
- System.out.println("MERGING SUFFIX LISTS");
+ currentNumResults = 0;
+
+ MultiComparator invListCmp = invIndex.getInvListElementCmp();
+
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultFrameTupleApp.reset(newCurrentBuffer, true);
+
+ while(resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+ resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0), resultTuple.getFieldStart(resultTuple.getFieldCount()-1));
+
+ if(invListCursor.containsKey(resultTuple, invListCmp)) {
+ count++;
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ }
+ else {
+ if(count + numQueryTokens - invListIx > occurrenceThreshold) {
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ }
+ }
+
+ resultTidx++;
+ if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+ prevBufIdx++;
+ if (prevBufIdx <= maxPrevBufIdx) {
+ prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultTidx = 0;
+ }
+ }
+ }
+
+ return newBufIdx;
+ }
+
+ protected int mergeSuffixListScan(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers, int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws IOException {
int newBufIdx = 0;
ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
@@ -225,41 +304,46 @@
int prevBufIdx = 0;
ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
+ boolean advanceCursor = true;
+ boolean advancePrevResult = false;
int resultTidx = 0;
+ currentNumResults = 0;
+
MultiComparator invListCmp = invIndex.getInvListElementCmp();
resultFrameTupleAcc.reset(prevCurrentBuffer);
resultFrameTupleApp.reset(newCurrentBuffer, true);
- for(int i = numPrefixTokens; i < numQueryTokens; i++) {
- invListCursors.get(i).pinPagesSync();
- }
-
- try {
- while(resultTidx < resultFrameTupleAcc.getTupleCount()) {
-
- resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
- int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0), resultTuple.getFieldStart(resultTuple.getFieldCount()-1));
- for(int i = numPrefixTokens; i < numQueryTokens; i++) {
- if(invListCursors.get(i).containsKey(resultTuple, invListCmp)) {
- count++;
- if(count >= occurrenceThreshold) {
- break;
- }
+ while(invListCursor.hasNext() && resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+ if(advanceCursor) invListCursor.next();
+
+ ITupleReference invListTuple = invListCursor.getTuple();
+
+ resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+
+ int cmp = invListCmp.compare(invListTuple, resultTuple);
+ if (cmp == 0) {
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0), resultTuple.getFieldStart(resultTuple.getFieldCount()-1)) + 1;
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ advanceCursor = true;
+ advancePrevResult = true;
+ } else {
+ if (cmp < 0) {
+ advanceCursor = true;
+ advancePrevResult = false;
+ } else {
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0), resultTuple.getFieldStart(resultTuple.getFieldCount()-1));
+ if(count + numQueryTokens - invListIx > occurrenceThreshold) {
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
}
- else {
- if(count + numQueryTokens - i + 1 < occurrenceThreshold) {
- break; // early termination
- }
- }
+ advanceCursor = false;
+ advancePrevResult = true;
}
+ }
- // add to final results?
- if(count >= occurrenceThreshold) {
- newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
- }
-
+ if (advancePrevResult) {
resultTidx++;
if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
prevBufIdx++;
@@ -269,17 +353,31 @@
resultTidx = 0;
}
}
- }
- } finally {
- for(int i = numPrefixTokens; i < numQueryTokens; i++) {
- invListCursors.get(i).unpinPages();
+ }
+ }
+
+ // append remaining elements from previous result set
+ //if(resultTidx < resultFrameTupleAcc.getTupleCount()) System.out.println("APPENDING FROM RESULTS");
+ while(resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0), resultTuple.getFieldStart(resultTuple.getFieldCount()-1));
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+
+ resultTidx++;
+ if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+ prevBufIdx++;
+ if (prevBufIdx <= maxPrevBufIdx) {
+ prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultTidx = 0;
+ }
}
}
-
+
return newBufIdx;
}
- private int mergePrefixList(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers, int maxPrevBufIdx, List<ByteBuffer> newResultBuffers) throws IOException {
+ protected int mergePrefixList(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers, int maxPrevBufIdx, List<ByteBuffer> newResultBuffers) throws IOException {
int newBufIdx = 0;
ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
@@ -365,7 +463,7 @@
return newBufIdx;
}
- private int appendTupleToNewResults(ITupleReference tuple, int newCount, int newBufIdx) throws IOException {
+ protected int appendTupleToNewResults(ITupleReference tuple, int newCount, int newBufIdx) throws IOException {
ByteBuffer newCurrentBuffer = newResultBuffers.get(newBufIdx);
if (!resultFrameTupleApp.hasSpace()) {
@@ -388,6 +486,8 @@
}
resultFrameTupleApp.incrementTupleCount(1);
+
+ currentNumResults++;
return newBufIdx;
}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java
new file mode 100644
index 0000000..4a4e91c
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixProbeOnly.java
@@ -0,0 +1,77 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
+import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+
+public class TOccurrenceSearcherSuffixProbeOnly extends TOccurrenceSearcher {
+
+ public TOccurrenceSearcherSuffixProbeOnly(IHyracksStageletContext ctx, InvertedIndex invIndex,
+ IBinaryTokenizer queryTokenizer) {
+ super(ctx, invIndex, queryTokenizer);
+ }
+
+ protected int mergeSuffixLists(int numPrefixTokens, int numQueryTokens, int maxPrevBufIdx) throws IOException {
+ for(int i = numPrefixTokens; i < numQueryTokens; i++) {
+ swap = prevResultBuffers;
+ prevResultBuffers = newResultBuffers;
+ newResultBuffers = swap;
+ currentNumResults = 0;
+
+ invListCursors.get(i).pinPagesSync();
+ maxPrevBufIdx = mergeSuffixListProbe(invListCursors.get(i), prevResultBuffers, maxPrevBufIdx, newResultBuffers, i, numQueryTokens);
+ invListCursors.get(i).unpinPages();
+ }
+ return maxPrevBufIdx;
+ }
+
+ protected int mergeSuffixListProbe(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers, int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws IOException {
+
+ int newBufIdx = 0;
+ ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
+
+ int prevBufIdx = 0;
+ ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
+
+ int resultTidx = 0;
+
+ MultiComparator invListCmp = invIndex.getInvListElementCmp();
+
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultFrameTupleApp.reset(newCurrentBuffer, true);
+
+ while(resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+ resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0), resultTuple.getFieldStart(resultTuple.getFieldCount()-1));
+
+ if(invListCursor.containsKey(resultTuple, invListCmp)) {
+ count++;
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ }
+ else {
+ if(count + numQueryTokens - invListIx > occurrenceThreshold) {
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ }
+ }
+
+ resultTidx++;
+ if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+ prevBufIdx++;
+ if (prevBufIdx <= maxPrevBufIdx) {
+ prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultTidx = 0;
+ }
+ }
+ }
+
+ return newBufIdx;
+ }
+}
diff --git a/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java
new file mode 100644
index 0000000..8db1b91
--- /dev/null
+++ b/hyracks-storage-am-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/invertedindex/impls/TOccurrenceSearcherSuffixScanOnly.java
@@ -0,0 +1,113 @@
+package edu.uci.ics.hyracks.storage.am.invertedindex.impls;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import edu.uci.ics.fuzzyjoin.tokenizer.IBinaryTokenizer;
+import edu.uci.ics.hyracks.api.context.IHyracksStageletContext;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+
+public class TOccurrenceSearcherSuffixScanOnly extends TOccurrenceSearcher {
+
+ public TOccurrenceSearcherSuffixScanOnly(IHyracksStageletContext ctx, InvertedIndex invIndex,
+ IBinaryTokenizer queryTokenizer) {
+ super(ctx, invIndex, queryTokenizer);
+ }
+
+ protected int mergeSuffixLists(int numPrefixTokens, int numQueryTokens, int maxPrevBufIdx) throws IOException {
+ for(int i = numPrefixTokens; i < numQueryTokens; i++) {
+ swap = prevResultBuffers;
+ prevResultBuffers = newResultBuffers;
+ newResultBuffers = swap;
+ currentNumResults = 0;
+
+ invListCursors.get(i).pinPagesSync();
+ maxPrevBufIdx = mergeSuffixListScan(invListCursors.get(i), prevResultBuffers, maxPrevBufIdx, newResultBuffers, i, numQueryTokens);
+ invListCursors.get(i).unpinPages();
+ }
+ return maxPrevBufIdx;
+ }
+
+ protected int mergeSuffixListScan(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers, int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws IOException {
+
+ int newBufIdx = 0;
+ ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
+
+ int prevBufIdx = 0;
+ ByteBuffer prevCurrentBuffer = prevResultBuffers.get(0);
+
+ boolean advanceCursor = true;
+ boolean advancePrevResult = false;
+ int resultTidx = 0;
+
+ MultiComparator invListCmp = invIndex.getInvListElementCmp();
+
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultFrameTupleApp.reset(newCurrentBuffer, true);
+
+ while(invListCursor.hasNext() && resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+ if(advanceCursor) invListCursor.next();
+
+ ITupleReference invListTuple = invListCursor.getTuple();
+
+ resultTuple.reset(prevCurrentBuffer.array(), resultFrameTupleAcc.getTupleStartOffset(resultTidx));
+
+ int cmp = invListCmp.compare(invListTuple, resultTuple);
+ if (cmp == 0) {
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0), resultTuple.getFieldStart(resultTuple.getFieldCount()-1)) + 1;
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ advanceCursor = true;
+ advancePrevResult = true;
+ } else {
+ if (cmp < 0) {
+ advanceCursor = true;
+ advancePrevResult = false;
+ } else {
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0), resultTuple.getFieldStart(resultTuple.getFieldCount()-1));
+ if(count + numQueryTokens - invListIx > occurrenceThreshold) {
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+ }
+ advanceCursor = false;
+ advancePrevResult = true;
+ }
+ }
+
+ if (advancePrevResult) {
+ resultTidx++;
+ if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+ prevBufIdx++;
+ if (prevBufIdx <= maxPrevBufIdx) {
+ prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultTidx = 0;
+ }
+ }
+ }
+ }
+
+ // append remaining elements from previous result set
+ //if(resultTidx < resultFrameTupleAcc.getTupleCount()) System.out.println("APPENDING FROM RESULTS");
+ while(resultTidx < resultFrameTupleAcc.getTupleCount()) {
+
+ int count = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0), resultTuple.getFieldStart(resultTuple.getFieldCount()-1));
+ newBufIdx = appendTupleToNewResults(resultTuple, count, newBufIdx);
+
+ resultTidx++;
+ if (resultTidx >= resultFrameTupleAcc.getTupleCount()) {
+ prevBufIdx++;
+ if (prevBufIdx <= maxPrevBufIdx) {
+ prevCurrentBuffer = prevResultBuffers.get(prevBufIdx);
+ resultFrameTupleAcc.reset(prevCurrentBuffer);
+ resultTidx = 0;
+ }
+ }
+ }
+
+ return newBufIdx;
+ }
+}
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/BulkLoadTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/BulkLoadTest.java
index 90cbb03..228350e 100644
--- a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/BulkLoadTest.java
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/BulkLoadTest.java
@@ -160,9 +160,9 @@
tokens.add("systems");
tokens.add("university");
- int maxId = 100;
+ int maxId = 1000000;
int addProb = 0;
- int addProbStep = 2;
+ int addProbStep = 10;
IInvertedListBuilder invListBuilder = new FixedSizeElementInvertedListBuilder(invListTypeTraits);
InvertedIndex.BulkLoadContext ctx = invIndex.beginBulkLoad(invListBuilder, HYRACKS_FRAME_SIZE);
@@ -172,7 +172,7 @@
int[] elementFields = { 1 };
for (int i = 0; i < tokens.size(); i++) {
- addProb += addProbStep;
+ addProb += addProbStep * (i+1);
StringBuilder strBuilder = new StringBuilder();
for (int j = 0; j < maxId; j++) {
if ((Math.abs(rnd.nextInt()) % addProb) == 0) {
@@ -222,7 +222,8 @@
FrameTupleReference queryTuple = new FrameTupleReference();
//String query = "computer hyracks fast";
- String query = "compilers fast university hyracks";
+ //String query = "compilers fast university hyracks";
+ String query = "compilers fast";
ITokenFactory tokenFactory = new UTF8WordTokenFactory();
IBinaryTokenizer queryTokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
@@ -240,19 +241,23 @@
DataInput dataIn = new DataInputStream(inStream);
Object o = serde.deserialize(dataIn);
System.out.println(o.toString());
-
+
TOccurrenceSearcher searcher = new TOccurrenceSearcher(stageletCtx, invIndex, queryTokenizer);
+ //TOccurrenceSearcherSuffixProbeOnly searcher = new TOccurrenceSearcherSuffixProbeSingle(stageletCtx, invIndex, queryTokenizer);
+ //TOccurrenceSearcherSuffixScanOnly searcher = new TOccurrenceSearcherSuffixScan(stageletCtx, invIndex, queryTokenizer);
- int repeats = 10;
+ int repeats = 1000;
+ double totalTime = 0;
for(int i = 0; i < repeats; i++) {
long timeStart = System.currentTimeMillis();
+ searcher.reset();
searcher.search(queryTuple, 0);
long timeEnd = System.currentTimeMillis();
- System.out.println("SEARCH TIME: " + (timeEnd - timeStart) + "ms");
+ //System.out.println("SEARCH TIME: " + (timeEnd - timeStart) + "ms");
+ totalTime += timeEnd - timeStart;
}
-
-
-
+ double avgTime = totalTime / (double)repeats;
+ System.out.println("AVG TIME: " + avgTime + "ms");
/*
// ------------------------- TEST B