Added tests for gram-based indexes and hashed tokens.
git-svn-id: https://hyracks.googlecode.com/svn/branches/hyracks_inverted_index_updates_new@1868 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-storage-am-common/src/main/java/edu/uci/ics/hyracks/storage/am/common/api/IIndexCursor.java b/hyracks-storage-am-common/src/main/java/edu/uci/ics/hyracks/storage/am/common/api/IIndexCursor.java
index 5a23fc6..838be6f 100644
--- a/hyracks-storage-am-common/src/main/java/edu/uci/ics/hyracks/storage/am/common/api/IIndexCursor.java
+++ b/hyracks-storage-am-common/src/main/java/edu/uci/ics/hyracks/storage/am/common/api/IIndexCursor.java
@@ -22,7 +22,7 @@
public void open(ICursorInitialState initialState, ISearchPredicate searchPred) throws IndexException,
HyracksDataException;
- public boolean hasNext() throws HyracksDataException;
+ public boolean hasNext() throws HyracksDataException, IndexException;
public void next() throws HyracksDataException;
diff --git a/hyracks-storage-am-lsm-btree/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/btree/impls/LSMBTreeRangeSearchCursor.java b/hyracks-storage-am-lsm-btree/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/btree/impls/LSMBTreeRangeSearchCursor.java
index a734ea4..3916ac7 100644
--- a/hyracks-storage-am-lsm-btree/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/btree/impls/LSMBTreeRangeSearchCursor.java
+++ b/hyracks-storage-am-lsm-btree/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/btree/impls/LSMBTreeRangeSearchCursor.java
@@ -47,7 +47,7 @@
}
@Override
- public boolean hasNext() throws HyracksDataException {
+ public boolean hasNext() throws HyracksDataException, IndexException {
checkPriorityQueue();
PriorityQueueElement pqHead = outputPriorityQueue.peek();
if (pqHead == null) {
diff --git a/hyracks-storage-am-lsm-common/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/common/impls/LSMTreeSearchCursor.java b/hyracks-storage-am-lsm-common/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/common/impls/LSMTreeSearchCursor.java
index 722d1b5..8625b13 100644
--- a/hyracks-storage-am-lsm-common/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/common/impls/LSMTreeSearchCursor.java
+++ b/hyracks-storage-am-lsm-common/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/common/impls/LSMTreeSearchCursor.java
@@ -23,6 +23,7 @@
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
import edu.uci.ics.hyracks.storage.am.common.api.IIndexCursor;
import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexCursor;
+import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
import edu.uci.ics.hyracks.storage.am.lsm.common.api.ILSMTreeTupleReference;
import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
@@ -44,7 +45,7 @@
needPush = false;
}
- public void initPriorityQueue() throws HyracksDataException {
+ public void initPriorityQueue() throws HyracksDataException, IndexException {
int pqInitSize = (rangeCursors.length > 0) ? rangeCursors.length : 1;
outputPriorityQueue = new PriorityQueue<PriorityQueueElement>(pqInitSize, pqCmp);
for (int i = 0; i < rangeCursors.length; i++) {
@@ -79,7 +80,7 @@
}
@Override
- public boolean hasNext() throws HyracksDataException {
+ public boolean hasNext() throws HyracksDataException, IndexException {
checkPriorityQueue();
return !outputPriorityQueue.isEmpty();
}
@@ -127,7 +128,7 @@
return (ITupleReference) outputElement.getTuple();
}
- protected boolean pushIntoPriorityQueue(PriorityQueueElement e) throws HyracksDataException {
+ protected boolean pushIntoPriorityQueue(PriorityQueueElement e) throws HyracksDataException, IndexException {
int cursorIndex = e.getCursorIndex();
if (rangeCursors[cursorIndex].hasNext()) {
rangeCursors[cursorIndex].next();
@@ -143,7 +144,7 @@
return ((ILSMTreeTupleReference) checkElement.getTuple()).isAntimatter();
}
- protected void checkPriorityQueue() throws HyracksDataException {
+ protected void checkPriorityQueue() throws HyracksDataException, IndexException {
while (!outputPriorityQueue.isEmpty() || needPush == true) {
if (!outputPriorityQueue.isEmpty()) {
PriorityQueueElement checkElement = outputPriorityQueue.peek();
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/api/IInvertedListCursor.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/api/IInvertedListCursor.java
index 321eab0..489495c 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/api/IInvertedListCursor.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/api/IInvertedListCursor.java
@@ -18,6 +18,7 @@
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
public interface IInvertedListCursor extends Comparable<IInvertedListCursor> {
@@ -27,7 +28,7 @@
public void unpinPages() throws HyracksDataException;
- public boolean hasNext() throws HyracksDataException;
+ public boolean hasNext() throws HyracksDataException, IndexException;
public void next() throws HyracksDataException;
@@ -42,11 +43,11 @@
public int getStartOff();
- public boolean containsKey(ITupleReference searchTuple, MultiComparator invListCmp) throws HyracksDataException;
+ public boolean containsKey(ITupleReference searchTuple, MultiComparator invListCmp) throws HyracksDataException, IndexException;
// for debugging
@SuppressWarnings("rawtypes")
- public String printInvList(ISerializerDeserializer[] serdes) throws HyracksDataException;
+ public String printInvList(ISerializerDeserializer[] serdes) throws HyracksDataException, IndexException;
@SuppressWarnings("rawtypes")
public String printCurrentElement(ISerializerDeserializer[] serdes) throws HyracksDataException;
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/impls/LSMInvertedIndexSearchCursor.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/impls/LSMInvertedIndexSearchCursor.java
index 6c2de44..aafe950 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/impls/LSMInvertedIndexSearchCursor.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/impls/LSMInvertedIndexSearchCursor.java
@@ -27,6 +27,7 @@
import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
import edu.uci.ics.hyracks.storage.am.lsm.common.impls.LSMHarness;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.exceptions.OccurrenceThresholdPanicException;
/**
* Searches the components one-by-one, completely consuming a cursor before moving on to the next one.
@@ -86,7 +87,7 @@
}
// Move to the next tuple that has not been deleted.
- private boolean nextValidTuple() throws HyracksDataException {
+ private boolean nextValidTuple() throws HyracksDataException, IndexException {
while (currentCursor.hasNext()) {
currentCursor.next();
if (!isDeleted(currentCursor.getTuple())) {
@@ -98,7 +99,7 @@
}
@Override
- public boolean hasNext() throws HyracksDataException {
+ public boolean hasNext() throws HyracksDataException, IndexException {
if (!tupleConsumed) {
return true;
}
@@ -115,6 +116,8 @@
currentCursor = currentAccessor.createSearchCursor();
try {
currentAccessor.search(currentCursor, searchPred);
+ } catch (OccurrenceThresholdPanicException e) {
+ throw e;
} catch (IndexException e) {
throw new HyracksDataException(e);
}
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedListCursor.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedListCursor.java
index c182c87..68de656 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedListCursor.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedListCursor.java
@@ -108,7 +108,7 @@
}
@Override
- public boolean hasNext() throws HyracksDataException {
+ public boolean hasNext() throws HyracksDataException, IndexException {
return btreeCursor.hasNext();
}
@@ -170,7 +170,7 @@
}
@Override
- public boolean containsKey(ITupleReference searchTuple, MultiComparator invListCmp) throws HyracksDataException {
+ public boolean containsKey(ITupleReference searchTuple, MultiComparator invListCmp) throws HyracksDataException, IndexException {
btreeSearchTuple.addTuple(searchTuple);
btreePred.setLowKeyComparator(btreeCmp);
btreePred.setHighKeyComparator(btreeCmp);
@@ -195,7 +195,7 @@
@SuppressWarnings("rawtypes")
@Override
- public String printInvList(ISerializerDeserializer[] serdes) throws HyracksDataException {
+ public String printInvList(ISerializerDeserializer[] serdes) throws HyracksDataException, IndexException {
StringBuilder strBuilder = new StringBuilder();
try {
while (btreeCursor.hasNext()) {
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndexRangeSearchCursor.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndexRangeSearchCursor.java
index 875421b..540fb9f 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndexRangeSearchCursor.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndexRangeSearchCursor.java
@@ -78,7 +78,7 @@
}
@Override
- public boolean hasNext() throws HyracksDataException {
+ public boolean hasNext() throws HyracksDataException, IndexException {
if (invListCursor.hasNext()) {
return true;
}
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/TOccurrenceSearcher.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/TOccurrenceSearcher.java
index 1613a0b..2caa740 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/TOccurrenceSearcher.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/TOccurrenceSearcher.java
@@ -186,7 +186,7 @@
resultCursor.open(null, searchPred);
}
- protected int mergePrefixLists(int numPrefixTokens, int numQueryTokens) throws HyracksDataException {
+ protected int mergePrefixLists(int numPrefixTokens, int numQueryTokens) throws HyracksDataException, IndexException {
int maxPrevBufIdx = 0;
for (int i = 0; i < numPrefixTokens; i++) {
swap = prevResultBuffers;
@@ -202,7 +202,7 @@
}
protected int mergeSuffixLists(int numPrefixTokens, int numQueryTokens, int maxPrevBufIdx)
- throws HyracksDataException {
+ throws HyracksDataException, IndexException {
for (int i = numPrefixTokens; i < numQueryTokens; i++) {
swap = prevResultBuffers;
prevResultBuffers = newResultBuffers;
@@ -224,7 +224,7 @@
}
protected int mergeSuffixListProbe(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers,
- int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws HyracksDataException {
+ int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws HyracksDataException, IndexException {
int newBufIdx = 0;
ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
@@ -270,7 +270,7 @@
protected int mergeSuffixListScan(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers,
int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens)
- throws HyracksDataException {
+ throws HyracksDataException, IndexException {
int newBufIdx = 0;
ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
@@ -365,7 +365,7 @@
}
protected int mergePrefixList(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers,
- int maxPrevBufIdx, List<ByteBuffer> newResultBuffers) throws HyracksDataException {
+ int maxPrevBufIdx, List<ByteBuffer> newResultBuffers) throws HyracksDataException, IndexException {
int newBufIdx = 0;
ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/TOccurrenceSearcherSuffixProbeOnly.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/TOccurrenceSearcherSuffixProbeOnly.java
index d4ec2cf..630b810 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/TOccurrenceSearcherSuffixProbeOnly.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/TOccurrenceSearcherSuffixProbeOnly.java
@@ -21,6 +21,7 @@
import edu.uci.ics.hyracks.api.context.IHyracksTaskContext;
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedListCursor;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.ondisk.OnDiskInvertedIndex;
@@ -34,7 +35,7 @@
this.invListCmp = MultiComparator.create(invIndex.getInvListCmpFactories());
}
- protected int mergeSuffixLists(int numPrefixTokens, int numQueryTokens, int maxPrevBufIdx) throws HyracksDataException {
+ protected int mergeSuffixLists(int numPrefixTokens, int numQueryTokens, int maxPrevBufIdx) throws HyracksDataException, IndexException {
for (int i = numPrefixTokens; i < numQueryTokens; i++) {
swap = prevResultBuffers;
prevResultBuffers = newResultBuffers;
@@ -50,7 +51,7 @@
}
protected int mergeSuffixListProbe(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers,
- int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws HyracksDataException {
+ int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws HyracksDataException, IndexException {
int newBufIdx = 0;
ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/TOccurrenceSearcherSuffixScanOnly.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/TOccurrenceSearcherSuffixScanOnly.java
index 8c6f2c7..3640511 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/TOccurrenceSearcherSuffixScanOnly.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/TOccurrenceSearcherSuffixScanOnly.java
@@ -22,6 +22,7 @@
import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedListCursor;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.ondisk.OnDiskInvertedIndex;
@@ -35,7 +36,7 @@
this.invListCmp = MultiComparator.create(invIndex.getInvListCmpFactories());
}
- protected int mergeSuffixLists(int numPrefixTokens, int numQueryTokens, int maxPrevBufIdx) throws HyracksDataException {
+ protected int mergeSuffixLists(int numPrefixTokens, int numQueryTokens, int maxPrevBufIdx) throws HyracksDataException, IndexException {
for (int i = numPrefixTokens; i < numQueryTokens; i++) {
swap = prevResultBuffers;
prevResultBuffers = newResultBuffers;
@@ -51,7 +52,7 @@
}
protected int mergeSuffixListScan(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers,
- int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws HyracksDataException {
+ int maxPrevBufIdx, List<ByteBuffer> newResultBuffers, int invListIx, int numQueryTokens) throws HyracksDataException, IndexException {
int newBufIdx = 0;
ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
index 9813154..986d938 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizerFactory.java
@@ -15,25 +15,22 @@
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
+public class DelimitedUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
-public class DelimitedUTF8StringBinaryTokenizerFactory implements
- IBinaryTokenizerFactory {
+ private static final long serialVersionUID = 1L;
+ private final boolean ignoreTokenCount;
+ private final boolean sourceHasTypeTag;
+ private final ITokenFactory tokenFactory;
- private static final long serialVersionUID = 1L;
- private final boolean ignoreTokenCount;
- private final boolean sourceHasTypeTag;
- private final ITokenFactory tokenFactory;
+ public DelimitedUTF8StringBinaryTokenizerFactory(boolean ignoreTokenCount, boolean sourceHasTypeTag,
+ ITokenFactory tokenFactory) {
+ this.ignoreTokenCount = ignoreTokenCount;
+ this.sourceHasTypeTag = sourceHasTypeTag;
+ this.tokenFactory = tokenFactory;
+ }
- public DelimitedUTF8StringBinaryTokenizerFactory(boolean ignoreTokenCount,
- boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
- this.ignoreTokenCount = ignoreTokenCount;
- this.sourceHasTypeTag = sourceHasTypeTag;
- this.tokenFactory = tokenFactory;
- }
-
- @Override
- public IBinaryTokenizer createTokenizer() {
- return new DelimitedUTF8StringBinaryTokenizer(ignoreTokenCount,
- sourceHasTypeTag, tokenFactory);
- }
+ @Override
+ public IBinaryTokenizer createTokenizer() {
+ return new DelimitedUTF8StringBinaryTokenizer(ignoreTokenCount, sourceHasTypeTag, tokenFactory);
+ }
}
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java
index 866b17c..22efc92 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/HashedUTF8NGramTokenFactory.java
@@ -15,7 +15,6 @@
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
-
public class HashedUTF8NGramTokenFactory extends AbstractUTF8TokenFactory {
private static final long serialVersionUID = 1L;
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizerFactory.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizerFactory.java
new file mode 100644
index 0000000..da3d411
--- /dev/null
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizerFactory.java
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers;
+
+public class NGramUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
+
+ private static final long serialVersionUID = 1L;
+ private final int gramLength;
+ private final boolean usePrePost;
+ private final boolean ignoreTokenCount;
+ private final boolean sourceHasTypeTag;
+ private final ITokenFactory tokenFactory;
+
+ public NGramUTF8StringBinaryTokenizerFactory(int gramLength, boolean usePrePost, boolean ignoreTokenCount,
+ boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+ this.gramLength = gramLength;
+ this.usePrePost = usePrePost;
+ this.ignoreTokenCount = ignoreTokenCount;
+ this.sourceHasTypeTag = sourceHasTypeTag;
+ this.tokenFactory = tokenFactory;
+ }
+
+ @Override
+ public IBinaryTokenizer createTokenizer() {
+ return new NGramUTF8StringBinaryTokenizer(gramLength, usePrePost, ignoreTokenCount, sourceHasTypeTag,
+ tokenFactory);
+ }
+
+}
diff --git a/hyracks-storage-am-lsm-rtree/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/rtree/impls/AbstractLSMRTree.java b/hyracks-storage-am-lsm-rtree/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/rtree/impls/AbstractLSMRTree.java
index 65d52d5..b39d337 100644
--- a/hyracks-storage-am-lsm-rtree/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/rtree/impls/AbstractLSMRTree.java
+++ b/hyracks-storage-am-lsm-rtree/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/rtree/impls/AbstractLSMRTree.java
@@ -33,16 +33,16 @@
import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndex;
import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexCursor;
import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexFrameFactory;
+import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
import edu.uci.ics.hyracks.storage.am.common.api.IndexType;
-import edu.uci.ics.hyracks.storage.am.common.api.TreeIndexException;
import edu.uci.ics.hyracks.storage.am.common.impls.NoOpOperationCallback;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.IndexOp;
import edu.uci.ics.hyracks.storage.am.lsm.common.api.ILSMComponentFinalizer;
-import edu.uci.ics.hyracks.storage.am.lsm.common.api.ILSMIndexFileManager;
import edu.uci.ics.hyracks.storage.am.lsm.common.api.ILSMFlushController;
import edu.uci.ics.hyracks.storage.am.lsm.common.api.ILSMIOOperationScheduler;
import edu.uci.ics.hyracks.storage.am.lsm.common.api.ILSMIndex;
import edu.uci.ics.hyracks.storage.am.lsm.common.api.ILSMIndexAccessor;
+import edu.uci.ics.hyracks.storage.am.lsm.common.api.ILSMIndexFileManager;
import edu.uci.ics.hyracks.storage.am.lsm.common.api.ILSMMergePolicy;
import edu.uci.ics.hyracks.storage.am.lsm.common.api.ILSMOperationTracker;
import edu.uci.ics.hyracks.storage.am.lsm.common.freepage.InMemoryBufferCache;
@@ -270,7 +270,7 @@
@Override
public boolean insertUpdateOrDelete(ITupleReference tuple, IIndexOpContext ictx) throws HyracksDataException,
- TreeIndexException {
+ IndexException {
LSMRTreeOpContext ctx = (LSMRTreeOpContext) ictx;
if (ctx.getIndexOp() == IndexOp.PHYSICALDELETE) {
throw new UnsupportedOperationException("Physical delete not yet supported in LSM R-tree");
diff --git a/hyracks-storage-am-lsm-rtree/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/rtree/impls/LSMRTreeWithAntiMatterTuplesSearchCursor.java b/hyracks-storage-am-lsm-rtree/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/rtree/impls/LSMRTreeWithAntiMatterTuplesSearchCursor.java
index 63ad633..60b0370 100644
--- a/hyracks-storage-am-lsm-rtree/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/rtree/impls/LSMRTreeWithAntiMatterTuplesSearchCursor.java
+++ b/hyracks-storage-am-lsm-rtree/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/rtree/impls/LSMRTreeWithAntiMatterTuplesSearchCursor.java
@@ -43,7 +43,7 @@
private int[] comparatorFields;
private MultiComparator btreeCmp;
- public void initPriorityQueue() throws HyracksDataException {
+ public void initPriorityQueue() throws HyracksDataException, IndexException {
int pqInitSize = (rangeCursors.length > 0) ? rangeCursors.length : 1;
outputPriorityQueue = new PriorityQueue<PriorityQueueElement>(pqInitSize, pqCmp);
for (int i = 0; i < rangeCursors.length; i++) {
@@ -80,7 +80,7 @@
}
@Override
- public boolean hasNext() throws HyracksDataException {
+ public boolean hasNext() throws HyracksDataException, IndexException {
if (includeMemComponent) {
if (foundNext) {
return true;
diff --git a/hyracks-test-support/src/main/java/edu/uci/ics/hyracks/storage/am/common/AbstractTreeIndexTestWorker.java b/hyracks-test-support/src/main/java/edu/uci/ics/hyracks/storage/am/common/AbstractTreeIndexTestWorker.java
index ca162ed..2108260 100644
--- a/hyracks-test-support/src/main/java/edu/uci/ics/hyracks/storage/am/common/AbstractTreeIndexTestWorker.java
+++ b/hyracks-test-support/src/main/java/edu/uci/ics/hyracks/storage/am/common/AbstractTreeIndexTestWorker.java
@@ -23,6 +23,7 @@
import edu.uci.ics.hyracks.storage.am.common.api.IIndexAccessor;
import edu.uci.ics.hyracks.storage.am.common.api.IIndexCursor;
import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndex;
+import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
import edu.uci.ics.hyracks.storage.am.common.datagen.DataGenThread;
import edu.uci.ics.hyracks.storage.am.common.datagen.TupleBatch;
@@ -60,7 +61,7 @@
}
}
- protected void consumeCursorTuples(IIndexCursor cursor) throws HyracksDataException {
+ protected void consumeCursorTuples(IIndexCursor cursor) throws HyracksDataException, IndexException {
try {
while (cursor.hasNext()) {
cursor.next();
diff --git a/hyracks-test-support/src/main/java/edu/uci/ics/hyracks/storage/am/config/AccessMethodTestsConfig.java b/hyracks-test-support/src/main/java/edu/uci/ics/hyracks/storage/am/config/AccessMethodTestsConfig.java
index d8ebb1d..a70a0fe 100644
--- a/hyracks-test-support/src/main/java/edu/uci/ics/hyracks/storage/am/config/AccessMethodTestsConfig.java
+++ b/hyracks-test-support/src/main/java/edu/uci/ics/hyracks/storage/am/config/AccessMethodTestsConfig.java
@@ -74,7 +74,7 @@
public static final int LSM_INVINDEX_MAX_OPEN_FILES = 10;
public static final int LSM_INVINDEX_HYRACKS_FRAME_SIZE = 32768;
- public static final int LSM_INVINDEX_NUM_DOCS_TO_INSERT = 10000;
+ public static final int LSM_INVINDEX_NUM_DOCS_TO_INSERT = 1000;
public static final int LSM_INVINDEX_NUM_BULKLOAD_ROUNDS = 5;
public static final int LSM_INVINDEX_MAX_TREES_TO_MERGE = 5;
public static final int LSM_INVINDEX_NUM_INSERT_ROUNDS = 3;
diff --git a/hyracks-tests/hyracks-storage-am-btree-test/src/test/java/edu/uci/ics/hyracks/storage/am/btree/multithread/BTreeTestWorker.java b/hyracks-tests/hyracks-storage-am-btree-test/src/test/java/edu/uci/ics/hyracks/storage/am/btree/multithread/BTreeTestWorker.java
index 9be5a75..7fd4f5f 100644
--- a/hyracks-tests/hyracks-storage-am-btree-test/src/test/java/edu/uci/ics/hyracks/storage/am/btree/multithread/BTreeTestWorker.java
+++ b/hyracks-tests/hyracks-storage-am-btree-test/src/test/java/edu/uci/ics/hyracks/storage/am/btree/multithread/BTreeTestWorker.java
@@ -29,7 +29,7 @@
import edu.uci.ics.hyracks.storage.am.common.TestOperationSelector.TestOperation;
import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndex;
import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexCursor;
-import edu.uci.ics.hyracks.storage.am.common.api.TreeIndexException;
+import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
import edu.uci.ics.hyracks.storage.am.common.datagen.DataGenThread;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
@@ -48,7 +48,7 @@
}
@Override
- public void performOp(ITupleReference tuple, TestOperation op) throws HyracksDataException, TreeIndexException {
+ public void performOp(ITupleReference tuple, TestOperation op) throws HyracksDataException, IndexException {
BTree.BTreeAccessor accessor = (BTree.BTreeAccessor) indexAccessor;
ITreeIndexCursor searchCursor = accessor.createSearchCursor();
ITreeIndexCursor diskOrderScanCursor = accessor.createDiskOrderScanCursor();
@@ -121,7 +121,7 @@
}
}
- private void consumeCursorTuples(ITreeIndexCursor cursor) throws HyracksDataException {
+ private void consumeCursorTuples(ITreeIndexCursor cursor) throws HyracksDataException, IndexException {
try {
while (cursor.hasNext()) {
cursor.next();
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexDeleteTest.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexDeleteTest.java
index ad47653..db2bc8e 100644
--- a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexDeleteTest.java
+++ b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexDeleteTest.java
@@ -74,4 +74,11 @@
TupleGenerator tupleGen = InvertedIndexTestUtils.createStringDocumentTupleGen(harness.getRandom());
runTest(testCtx, tupleGen);
}
+
+ @Test
+ public void hashedWordTokensInvIndexTest() throws IOException, IndexException {
+ InvertedIndexTestContext testCtx = InvertedIndexTestUtils.createHashedWordInvIndexTestContext(harness, invIndexType);
+ TupleGenerator tupleGen = InvertedIndexTestUtils.createStringDocumentTupleGen(harness.getRandom());
+ runTest(testCtx, tupleGen);
+ }
}
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexLoadTest.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexLoadTest.java
index bb8c851..0c3d53e 100644
--- a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexLoadTest.java
+++ b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexLoadTest.java
@@ -63,4 +63,11 @@
TupleGenerator tupleGen = InvertedIndexTestUtils.createStringDocumentTupleGen(harness.getRandom());
runTest(testCtx, tupleGen);
}
+
+ @Test
+ public void hashedWordTokensInvIndexTest() throws IOException, IndexException {
+ InvertedIndexTestContext testCtx = InvertedIndexTestUtils.createHashedWordInvIndexTestContext(harness, invIndexType);
+ TupleGenerator tupleGen = InvertedIndexTestUtils.createStringDocumentTupleGen(harness.getRandom());
+ runTest(testCtx, tupleGen);
+ }
}
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexSearchTest.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexSearchTest.java
index d777b85..ce6e829 100644
--- a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexSearchTest.java
+++ b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexSearchTest.java
@@ -16,6 +16,10 @@
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.common;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Level;
+import java.util.logging.Logger;
import org.junit.Test;
@@ -24,15 +28,18 @@
import edu.uci.ics.hyracks.storage.am.common.datagen.TupleGenerator;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearchModifier;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.ConjunctiveSearchModifier;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.EditDistanceSearchModifier;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.JaccardSearchModifier;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestContext;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestContext.InvertedIndexType;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestUtils;
public abstract class AbstractInvertedIndexSearchTest extends AbstractInvertedIndexTest {
+    // One logger per class rather than per test instance: Logger.getLogger() returns the same named
+    // logger anyway, so the field is a class-level constant and is declared static to match its name.
+    protected static final Logger LOGGER = Logger.getLogger(AbstractInvertedIndexSearchTest.class.getName());
+
protected int NUM_DOC_QUERIES = 8000;
protected int NUM_RANDOM_QUERIES = 2000;
- protected int[] scanCountArray = new int[NUM_DOCS_TO_INSERT];
protected final boolean bulkLoad;
public AbstractInvertedIndexSearchTest(InvertedIndexType invIndexType, boolean bulkLoad) {
@@ -41,7 +48,7 @@
}
protected void runTest(InvertedIndexTestContext testCtx, TupleGenerator tupleGen,
- IInvertedIndexSearchModifier searchModifier) throws IOException, IndexException {
+ List<IInvertedIndexSearchModifier> searchModifiers) throws IOException, IndexException {
IIndex invIndex = testCtx.getIndex();
invIndex.create();
invIndex.activate();
@@ -53,19 +60,70 @@
}
invIndex.validate();
- InvertedIndexTestUtils.testIndexSearch(testCtx, tupleGen, harness.getRandom(), NUM_DOC_QUERIES,
- NUM_RANDOM_QUERIES, searchModifier, scanCountArray);
-
+ for (IInvertedIndexSearchModifier searchModifier : searchModifiers) {
+ if (LOGGER.isLoggable(Level.INFO)) {
+ LOGGER.info("Running searches with: " + searchModifier.toString());
+ }
+ InvertedIndexTestUtils.testIndexSearch(testCtx, tupleGen, harness.getRandom(), NUM_DOC_QUERIES,
+ NUM_RANDOM_QUERIES, searchModifier, SCAN_COUNT_ARRAY);
+ }
+
invIndex.deactivate();
invIndex.destroy();
}
+    /**
+     * Runs the search test for a word-token index: string documents are generated and
+     * {@code runTest} is given a conjunctive modifier followed by Jaccard modifiers with
+     * thresholds from 1.0 down to 0.5.
+     *
+     * @param testCtx
+     *            test context wrapping the index under test
+     */
+    private void testWordInvIndexIndex(InvertedIndexTestContext testCtx) throws IOException, IndexException {
+        TupleGenerator docTupleGen = InvertedIndexTestUtils.createStringDocumentTupleGen(harness.getRandom());
+        List<IInvertedIndexSearchModifier> modifiers = new ArrayList<IInvertedIndexSearchModifier>();
+        modifiers.add(new ConjunctiveSearchModifier());
+        // Jaccard thresholds, strictest first; the order determines the order of the search runs.
+        float[] jaccardThresholds = { 1.0f, 0.9f, 0.8f, 0.7f, 0.6f, 0.5f };
+        for (float threshold : jaccardThresholds) {
+            modifiers.add(new JaccardSearchModifier(threshold));
+        }
+        runTest(testCtx, docTupleGen, modifiers);
+    }
+
+    /**
+     * Runs the search test for an n-gram index: person names are generated and {@code runTest}
+     * is given a conjunctive modifier, Jaccard modifiers with thresholds from 1.0 down to 0.5,
+     * and edit-distance modifiers for distances 0 through 3.
+     *
+     * @param testCtx
+     *            test context wrapping the index under test
+     */
+    private void testNGramInvIndexIndex(InvertedIndexTestContext testCtx) throws IOException, IndexException {
+        TupleGenerator nameTupleGen = InvertedIndexTestUtils.createPersonNamesTupleGen(harness.getRandom());
+        List<IInvertedIndexSearchModifier> modifiers = new ArrayList<IInvertedIndexSearchModifier>();
+        modifiers.add(new ConjunctiveSearchModifier());
+        // Jaccard thresholds, strictest first; the order determines the order of the search runs.
+        float[] jaccardThresholds = { 1.0f, 0.9f, 0.8f, 0.7f, 0.6f, 0.5f };
+        for (float threshold : jaccardThresholds) {
+            modifiers.add(new JaccardSearchModifier(threshold));
+        }
+        // Edit distances 0..3, all using the gram length the test tokenizers are built with.
+        for (int editDistance = 0; editDistance <= 3; editDistance++) {
+            modifiers.add(new EditDistanceSearchModifier(InvertedIndexTestUtils.TEST_GRAM_LENGTH, editDistance));
+        }
+        runTest(testCtx, nameTupleGen, modifiers);
+    }
+
@Test
public void wordTokensInvIndexTest() throws IOException, IndexException {
InvertedIndexTestContext testCtx = InvertedIndexTestUtils.createWordInvIndexTestContext(harness, invIndexType);
- TupleGenerator tupleGen = InvertedIndexTestUtils.createStringDocumentTupleGen(harness.getRandom());
- IInvertedIndexSearchModifier searchModifier = new ConjunctiveSearchModifier();
- runTest(testCtx, tupleGen, searchModifier);
+ testWordInvIndexIndex(testCtx);
+ }
+
+    /** Runs the word-index search test against a context created for hashed word tokens. */
+    @Test
+    public void hashedWordTokensInvIndexTest() throws IOException, IndexException {
+        testWordInvIndexIndex(InvertedIndexTestUtils.createHashedWordInvIndexTestContext(harness, invIndexType));
+    }
+
+    /** Runs the n-gram search test against a context created for plain n-gram tokens. */
+    @Test
+    public void ngramTokensInvIndexTest() throws IOException, IndexException {
+        testNGramInvIndexIndex(InvertedIndexTestUtils.createNGramInvIndexTestContext(harness, invIndexType));
+    }
+
+    /** Runs the n-gram search test against a context created for hashed n-gram tokens. */
+    @Test
+    public void hashedNGramTokensInvIndexTest() throws IOException, IndexException {
+        testNGramInvIndexIndex(InvertedIndexTestUtils.createHashedNGramInvIndexTestContext(harness, invIndexType));
}
}
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTestUtils.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTestUtils.java
index bcbd0fa..9551921 100644
--- a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTestUtils.java
+++ b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTestUtils.java
@@ -49,6 +49,7 @@
import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
import edu.uci.ics.hyracks.storage.am.common.datagen.DocumentStringFieldValueGenerator;
import edu.uci.ics.hyracks.storage.am.common.datagen.IFieldValueGenerator;
+import edu.uci.ics.hyracks.storage.am.common.datagen.PersonNameFieldValueGenerator;
import edu.uci.ics.hyracks.storage.am.common.datagen.SortedIntegerFieldValueGenerator;
import edu.uci.ics.hyracks.storage.am.common.datagen.TupleGenerator;
import edu.uci.ics.hyracks.storage.am.common.impls.NoOpOperationCallback;
@@ -62,19 +63,22 @@
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.exceptions.OccurrenceThresholdPanicException;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.InvertedIndexSearchPredicate;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizerFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.HashedUTF8NGramTokenFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.HashedUTF8WordTokenFactory;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizerFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8NGramTokenFactory;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8WordTokenFactory;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestContext.InvertedIndexType;
@SuppressWarnings("rawtypes")
public class InvertedIndexTestUtils {
- // Probability that a randomly generated query is used, instead of a document from the corpus.
- protected static final float RQNDOM_QUERY_PROB = 0.9f;
-
+ public static final int TEST_GRAM_LENGTH = 3;
+
public static TupleGenerator createStringDocumentTupleGen(Random rnd) throws IOException {
IFieldValueGenerator[] fieldGens = new IFieldValueGenerator[2];
fieldGens[0] = new DocumentStringFieldValueGenerator(2, 10, 10000, rnd);
@@ -85,6 +89,16 @@
return tupleGen;
}
+    /**
+     * Builds a tuple generator producing two fields: field 0 comes from a
+     * {@link PersonNameFieldValueGenerator} and is serialized as a UTF-8 string, field 1 comes
+     * from a {@link SortedIntegerFieldValueGenerator} and is serialized as an integer.
+     *
+     * @param rnd
+     *            random source handed to the person-name generator
+     * @return the configured tuple generator
+     */
+    public static TupleGenerator createPersonNamesTupleGen(Random rnd) throws IOException {
+        IFieldValueGenerator[] generators = new IFieldValueGenerator[] {
+                new PersonNameFieldValueGenerator(rnd, 0.5f), new SortedIntegerFieldValueGenerator(0) };
+        ISerializerDeserializer[] serdes = new ISerializerDeserializer[] {
+                UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
+        return new TupleGenerator(generators, serdes, 0);
+    }
+
public static InvertedIndexTestContext createWordInvIndexTestContext(LSMInvertedIndexTestHarness harness,
InvertedIndexType invIndexType) throws IOException, IndexException {
ISerializerDeserializer[] fieldSerdes = new ISerializerDeserializer[] {
@@ -97,6 +111,42 @@
return testCtx;
}
+    /**
+     * Creates a test context whose tokenizer is a delimited UTF-8 string tokenizer backed by a
+     * {@link HashedUTF8WordTokenFactory}; both fields use the integer serializer.
+     *
+     * @param harness
+     *            test harness passed through to the context
+     * @param invIndexType
+     *            kind of inverted index the context should wrap
+     * @return the newly created test context
+     */
+    public static InvertedIndexTestContext createHashedWordInvIndexTestContext(LSMInvertedIndexTestHarness harness,
+            InvertedIndexType invIndexType) throws IOException, IndexException {
+        ISerializerDeserializer[] serdes = new ISerializerDeserializer[] { IntegerSerializerDeserializer.INSTANCE,
+                IntegerSerializerDeserializer.INSTANCE };
+        ITokenFactory hashedWordTokens = new HashedUTF8WordTokenFactory();
+        IBinaryTokenizerFactory wordTokenizer = new DelimitedUTF8StringBinaryTokenizerFactory(true, false,
+                hashedWordTokens);
+        return InvertedIndexTestContext.create(harness, serdes, 1, wordTokenizer, invIndexType);
+    }
+
+    /**
+     * Creates a test context whose tokenizer is an n-gram UTF-8 string tokenizer of length
+     * {@link #TEST_GRAM_LENGTH} backed by a {@link UTF8NGramTokenFactory}; field 0 uses the
+     * UTF-8 string serializer and field 1 the integer serializer.
+     *
+     * @param harness
+     *            test harness passed through to the context
+     * @param invIndexType
+     *            kind of inverted index the context should wrap
+     * @return the newly created test context
+     */
+    public static InvertedIndexTestContext createNGramInvIndexTestContext(LSMInvertedIndexTestHarness harness,
+            InvertedIndexType invIndexType) throws IOException, IndexException {
+        ISerializerDeserializer[] serdes = new ISerializerDeserializer[] {
+                UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
+        ITokenFactory ngramTokens = new UTF8NGramTokenFactory();
+        IBinaryTokenizerFactory ngramTokenizer = new NGramUTF8StringBinaryTokenizerFactory(TEST_GRAM_LENGTH, true,
+                true, false, ngramTokens);
+        return InvertedIndexTestContext.create(harness, serdes, 1, ngramTokenizer, invIndexType);
+    }
+
+    /**
+     * Creates a test context whose tokenizer is an n-gram UTF-8 string tokenizer of length
+     * {@link #TEST_GRAM_LENGTH} backed by a {@link HashedUTF8NGramTokenFactory}; both fields
+     * use the integer serializer.
+     *
+     * @param harness
+     *            test harness passed through to the context
+     * @param invIndexType
+     *            kind of inverted index the context should wrap
+     * @return the newly created test context
+     */
+    public static InvertedIndexTestContext createHashedNGramInvIndexTestContext(LSMInvertedIndexTestHarness harness,
+            InvertedIndexType invIndexType) throws IOException, IndexException {
+        ISerializerDeserializer[] serdes = new ISerializerDeserializer[] { IntegerSerializerDeserializer.INSTANCE,
+                IntegerSerializerDeserializer.INSTANCE };
+        ITokenFactory hashedNGramTokens = new HashedUTF8NGramTokenFactory();
+        IBinaryTokenizerFactory ngramTokenizer = new NGramUTF8StringBinaryTokenizerFactory(TEST_GRAM_LENGTH, true,
+                true, false, hashedNGramTokens);
+        return InvertedIndexTestContext.create(harness, serdes, 1, ngramTokenizer, invIndexType);
+    }
+
public static void bulkLoadInvIndex(InvertedIndexTestContext testCtx, TupleGenerator tupleGen, int numDocs)
throws IndexException, IOException {
SortedSet<CheckTuple> tmpMemIndex = new TreeSet<CheckTuple>();;
@@ -147,7 +197,7 @@
documentCorpus.remove(size - 1);
}
}
-
+
/**
* Compares actual and expected indexes using the rangeSearch() method of the inverted-index accessor.
*/
@@ -241,7 +291,7 @@
checkLowKey.appendField(token);
CheckTuple checkHighKey = new CheckTuple(tokenFieldCount, tokenFieldCount);
checkHighKey.appendField(token);
-
+
SortedSet<CheckTuple> expectedInvList = OrderedIndexTestUtils.getPrefixExpectedSubset(
testCtx.getCheckTuples(), checkLowKey, checkHighKey);
Iterator<CheckTuple> expectedInvListIter = expectedInvList.iterator();
@@ -329,8 +379,8 @@
}
public static void testIndexSearch(InvertedIndexTestContext testCtx, TupleGenerator tupleGen, Random rnd,
- int numDocQueries, int numRandomQueries, IInvertedIndexSearchModifier searchModifier, int[] scanCountArray) throws IOException,
- IndexException {
+ int numDocQueries, int numRandomQueries, IInvertedIndexSearchModifier searchModifier, int[] scanCountArray)
+ throws IOException, IndexException {
IInvertedIndex invIndex = testCtx.invIndex;
IInvertedIndexAccessor accessor = (IInvertedIndexAccessor) invIndex.createAccessor(
NoOpOperationCallback.INSTANCE, NoOpOperationCallback.INSTANCE);
@@ -364,7 +414,7 @@
try {
accessor.search(resultCursor, searchPred);
} catch (OccurrenceThresholdPanicException e) {
- // ignore panic queries
+ // ignore panic queries.
panic = true;
}
@@ -372,12 +422,17 @@
if (!panic) {
// Consume cursor and deserialize results so we can sort them. Some search cursors may not deliver the result sorted (e.g., LSM search cursor).
ArrayList<Integer> actualResults = new ArrayList<Integer>();
- while (resultCursor.hasNext()) {
- resultCursor.next();
- ITupleReference resultTuple = resultCursor.getTuple();
- int actual = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
- resultTuple.getFieldStart(0));
- actualResults.add(Integer.valueOf(actual));
+ try {
+ while (resultCursor.hasNext()) {
+ resultCursor.next();
+ ITupleReference resultTuple = resultCursor.getTuple();
+ int actual = IntegerSerializerDeserializer.getInt(resultTuple.getFieldData(0),
+ resultTuple.getFieldStart(0));
+ actualResults.add(Integer.valueOf(actual));
+ }
+ } catch (OccurrenceThresholdPanicException e) {
+ // Ignore panic queries.
+ continue;
}
Collections.sort(actualResults);
@@ -407,43 +462,4 @@
}
}
}
-
-
-
- /*
- public static OnDiskInvertedIndex createTestInvertedIndex(LSMInvertedIndexTestHarness harness, IBinaryTokenizer tokenizer)
- throws HyracksDataException {
- ITreeIndexMetaDataFrameFactory metaFrameFactory = new LIFOMetaDataFrameFactory();
- ITypeTraits[] btreeTypeTraits = new ITypeTraits[] { UTF8StringPointable.TYPE_TRAITS,
- IntegerPointable.TYPE_TRAITS, IntegerPointable.TYPE_TRAITS, IntegerPointable.TYPE_TRAITS,
- IntegerPointable.TYPE_TRAITS };
- ITreeIndexTupleWriterFactory tupleWriterFactory = new TypeAwareTupleWriterFactory(btreeTypeTraits);
- ITreeIndexFrameFactory leafFrameFactory = new BTreeNSMLeafFrameFactory(tupleWriterFactory);
- ITreeIndexFrameFactory interiorFrameFactory = new BTreeNSMInteriorFrameFactory(tupleWriterFactory);
- IFreePageManager freePageManager = new LinkedListFreePageManager(harness.getDiskBufferCache(), 0,
- metaFrameFactory);
- IBinaryComparatorFactory[] btreeCmpFactories = new IBinaryComparatorFactory[] { PointableBinaryComparatorFactory
- .of(UTF8StringPointable.FACTORY) };
- return InvertedIndexUtils.createInvertedIndex(harness.getDiskBufferCache(),
- harness.getInvListTypeTraits(), harness.getInvListCmpFactories(), tokenizer);
- }
-
- public static InMemoryInvertedIndex createInMemoryInvertedIndex(LSMInvertedIndexTestHarness harness,
- IBinaryTokenizer tokenizer) {
- return InvertedIndexUtils.createInMemoryBTreeInvertedindex(harness.getMemBufferCache(),
- harness.getMemFreePageManager(), harness.getTokenTypeTraits(), harness.getInvListTypeTraits(),
- harness.getTokenCmpFactories(), harness.getInvListCmpFactories(),
- tokenizer);
- }
-
- public static LSMInvertedIndex createLSMInvertedIndex(LSMInvertedIndexTestHarness harness,
- IBinaryTokenizer tokenizer) {
- return InvertedIndexUtils.createLSMInvertedIndex(harness.getMemBufferCache(),
- harness.getMemFreePageManager(), harness.getTokenTypeTraits(), harness.getInvListTypeTraits(),
- harness.getTokenCmpFactories(), harness.getInvListCmpFactories(),
- tokenizer, harness.getDiskBufferCache(),
- new LinkedListFreePageManagerFactory(harness.getDiskBufferCache(), new LIFOMetaDataFrameFactory()),
- harness.getIOManager(), harness.getOnDiskDir(), harness.getDiskFileMapProvider());
- }
- */
}
diff --git a/hyracks-tests/hyracks-storage-am-lsm-rtree-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/rtree/multithread/AbstractLSMRTreeTestWorker.java b/hyracks-tests/hyracks-storage-am-lsm-rtree-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/rtree/multithread/AbstractLSMRTreeTestWorker.java
index a89dca1..c54c948 100644
--- a/hyracks-tests/hyracks-storage-am-lsm-rtree-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/rtree/multithread/AbstractLSMRTreeTestWorker.java
+++ b/hyracks-tests/hyracks-storage-am-lsm-rtree-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/rtree/multithread/AbstractLSMRTreeTestWorker.java
@@ -23,6 +23,7 @@
import edu.uci.ics.hyracks.storage.am.common.TestOperationSelector;
import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndex;
import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexCursor;
+import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
import edu.uci.ics.hyracks.storage.am.common.datagen.DataGenThread;
import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
@@ -72,7 +73,7 @@
rearrangedTuple.reset(rearrangedTb.getFieldEndOffsets(), rearrangedTb.getByteArray());
}
- protected void consumeCursorTuples(ITreeIndexCursor cursor) throws HyracksDataException {
+ protected void consumeCursorTuples(ITreeIndexCursor cursor) throws HyracksDataException, IndexException {
try {
while (cursor.hasNext()) {
cursor.next();
diff --git a/hyracks-tests/hyracks-storage-am-lsm-rtree-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/rtree/multithread/LSMRTreeTestWorker.java b/hyracks-tests/hyracks-storage-am-lsm-rtree-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/rtree/multithread/LSMRTreeTestWorker.java
index 59a7728..3a3bfe6 100644
--- a/hyracks-tests/hyracks-storage-am-lsm-rtree-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/rtree/multithread/LSMRTreeTestWorker.java
+++ b/hyracks-tests/hyracks-storage-am-lsm-rtree-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/rtree/multithread/LSMRTreeTestWorker.java
@@ -122,7 +122,7 @@
rearrangedTuple.reset(rearrangedTb.getFieldEndOffsets(), rearrangedTb.getByteArray());
}
- private void consumeCursorTuples(ITreeIndexCursor cursor) throws HyracksDataException {
+ private void consumeCursorTuples(ITreeIndexCursor cursor) throws HyracksDataException, IndexException {
try {
while (cursor.hasNext()) {
cursor.next();