Added first inverted-index search test based on new testing framework. Found and fixed a few bugs.
git-svn-id: https://hyracks.googlecode.com/svn/branches/hyracks_inverted_index_updates_new@1823 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-storage-am-common/src/main/java/edu/uci/ics/hyracks/storage/am/common/datagen/PersonNameFieldValueGenerator.java b/hyracks-storage-am-common/src/main/java/edu/uci/ics/hyracks/storage/am/common/datagen/PersonNameFieldValueGenerator.java
new file mode 100644
index 0000000..7c94f18
--- /dev/null
+++ b/hyracks-storage-am-common/src/main/java/edu/uci/ics/hyracks/storage/am/common/datagen/PersonNameFieldValueGenerator.java
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.common.datagen;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+
+public class PersonNameFieldValueGenerator implements IFieldValueGenerator<String> {
+ private final String FIRST_NAMES_FILE = "dist.all.first.cleaned";
+ private final String LAST_NAMES_FILE = "dist.all.last.cleaned";
+
+ private final Random rnd;
+ private final double middleInitialProb;
+ private final String letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+ private List<String> firstNames = new ArrayList<String>();
+ private List<String> lastNames = new ArrayList<String>();
+
+ public PersonNameFieldValueGenerator(Random rnd, double middleInitialProb)
+ throws IOException {
+ this.rnd = rnd;
+ this.middleInitialProb = middleInitialProb;
+ initNames();
+ }
+
+ private void initNames() throws IOException {
+ String line;
+
+ // Read first names from data file.
+ InputStream firstNamesIn = this.getClass().getClassLoader().getResourceAsStream(FIRST_NAMES_FILE);
+ BufferedReader firstNamesReader = new BufferedReader(new InputStreamReader(firstNamesIn));
+ try {
+ while ((line = firstNamesReader.readLine()) != null) {
+ firstNames.add(line.trim());
+ }
+ } finally {
+ firstNamesReader.close();
+ }
+
+ // Read last names from data file.
+ InputStream lastNamesIn = this.getClass().getClassLoader().getResourceAsStream(LAST_NAMES_FILE);
+ BufferedReader lastNamesReader = new BufferedReader(new InputStreamReader(lastNamesIn));
+ try {
+ while ((line = lastNamesReader.readLine()) != null) {
+ lastNames.add(line.trim());
+ }
+ } finally {
+ lastNamesReader.close();
+ }
+ }
+
+ @Override
+ public String next() {
+ StringBuilder strBuilder = new StringBuilder();
+
+ // First name.
+ int fix = Math.abs(rnd.nextInt()) % firstNames.size();
+ strBuilder.append(firstNames.get(fix));
+ strBuilder.append(" ");
+
+ // Optional middle initial.
+ double d = Math.abs(rnd.nextDouble());
+ if (d <= middleInitialProb) {
+ int mix = Math.abs(rnd.nextInt()) % letters.length();
+ strBuilder.append(letters.charAt(mix));
+ strBuilder.append(". ");
+ }
+
+ // Last name.
+ int lix = Math.abs(rnd.nextInt()) % lastNames.size();
+ strBuilder.append(lastNames.get(lix));
+
+ return strBuilder.toString();
+ }
+}
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/api/IInvertedIndexAccessor.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/api/IInvertedIndexAccessor.java
index d5266ff..48f147f 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/api/IInvertedIndexAccessor.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/api/IInvertedIndexAccessor.java
@@ -25,4 +25,6 @@
public void openInvertedListCursor(IInvertedListCursor listCursor, ITupleReference searchKey)
throws HyracksDataException, IndexException;
+
+ public IInvertedIndexSearcher getSearcher();
}
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/FixedSizeElementInvertedListBuilder.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/FixedSizeElementInvertedListBuilder.java
index 5cda730..fd12792 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/FixedSizeElementInvertedListBuilder.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/FixedSizeElementInvertedListBuilder.java
@@ -36,9 +36,9 @@
@Override
public boolean startNewList(ITupleReference tuple, int tokenField) {
- if (pos + listElementSize >= targetBuf.length)
+ if (pos + listElementSize > targetBuf.length) {
return false;
- else {
+ } else {
listSize = 0;
return true;
}
@@ -46,8 +46,9 @@
@Override
public boolean appendElement(ITupleReference tuple, int numTokenFields, int numElementFields) {
- if (pos + listElementSize >= targetBuf.length)
+ if (pos + listElementSize > targetBuf.length) {
return false;
+ }
for (int i = 0; i < numElementFields; i++) {
int field = numTokenFields + i;
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/FixedSizeElementInvertedListCursor.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/FixedSizeElementInvertedListCursor.java
index ed3838a..d32c3c4 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/FixedSizeElementInvertedListCursor.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/FixedSizeElementInvertedListCursor.java
@@ -72,7 +72,7 @@
@Override
public void next() {
- if (currentOff + 2 * elementSize >= bufferCache.getPageSize()) {
+ if (currentOff + 2 * elementSize > bufferCache.getPageSize()) {
currentPageIx++;
currentOff = 0;
} else {
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndex.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndex.java
index 2e72894..adc0f80 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndex.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndex.java
@@ -441,6 +441,7 @@
searcher.search((OnDiskInvertedIndexSearchCursor) cursor, (InvertedIndexSearchPredicate) searchPred, opCtx);
}
+ @Override
public IInvertedIndexSearcher getSearcher() {
return searcher;
}
@@ -539,6 +540,7 @@
NoOpOperationCallback.INSTANCE, NoOpOperationCallback.INSTANCE);
IInvertedListCursor invListCursor = invIndexAccessor.createInvertedListCursor();
MultiComparator invListCmp = MultiComparator.create(invListCmpFactories);
+
try {
// Search key for finding an inverted-list in the actual index.
ArrayTupleBuilder prevBuilder = new ArrayTupleBuilder(invListTypeTraits.length);
diff --git a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/TOccurrenceSearcher.java b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/TOccurrenceSearcher.java
index a78287e..396c06e 100644
--- a/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/TOccurrenceSearcher.java
+++ b/hyracks-storage-am-lsm-invertedindex/src/main/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/search/TOccurrenceSearcher.java
@@ -364,6 +364,7 @@
protected int mergePrefixList(IInvertedListCursor invListCursor, List<ByteBuffer> prevResultBuffers,
int maxPrevBufIdx, List<ByteBuffer> newResultBuffers) throws HyracksDataException {
+
int newBufIdx = 0;
ByteBuffer newCurrentBuffer = newResultBuffers.get(0);
@@ -514,10 +515,10 @@
return occurrenceThreshold;
}
- public void printNewResults(int maxResultBufIdx) {
+ public void printNewResults(int maxResultBufIdx, List<ByteBuffer> buffer) {
StringBuffer strBuffer = new StringBuffer();
for (int i = 0; i <= maxResultBufIdx; i++) {
- ByteBuffer testBuf = newResultBuffers.get(i);
+ ByteBuffer testBuf = buffer.get(i);
resultFrameTupleAcc.reset(testBuf);
for (int j = 0; j < resultFrameTupleAcc.getTupleCount(); j++) {
strBuffer.append(IntegerSerializerDeserializer.getInt(resultFrameTupleAcc.getBuffer().array(),
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexLoadTest.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexLoadTest.java
new file mode 100644
index 0000000..3eaeb96
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexLoadTest.java
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.common;
+
+import java.io.IOException;
+
+import org.junit.Test;
+
+import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.IIndex;
+import edu.uci.ics.hyracks.storage.am.common.datagen.TupleGenerator;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestContext;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestContext.InvertedIndexType;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestUtils;
+
+public abstract class AbstractInvertedIndexLoadTest extends AbstractInvertedIndexTest {
+
+ protected final boolean bulkLoad;
+
+ public AbstractInvertedIndexLoadTest(InvertedIndexType invIndexType, boolean bulkLoad) {
+ super(invIndexType);
+ this.bulkLoad = bulkLoad;
+ }
+
+ protected void runTest(InvertedIndexTestContext testCtx, TupleGenerator tupleGen) throws IOException,
+ IndexException {
+ IIndex invIndex = testCtx.getIndex();
+ invIndex.create();
+ invIndex.activate();
+
+ if (bulkLoad) {
+ InvertedIndexTestUtils.bulkLoadInvIndex(testCtx, tupleGen, NUM_DOCS_TO_INSERT);
+ } else {
+ InvertedIndexTestUtils.insertIntoInvIndex(testCtx, tupleGen, NUM_DOCS_TO_INSERT);
+ }
+
+ // Validate index and compare against expected index.
+ invIndex.validate();
+ if (invIndexType == InvertedIndexType.INMEMORY || invIndexType == InvertedIndexType.ONDISK) {
+ InvertedIndexTestUtils.compareActualAndExpectedIndexes(testCtx);
+ }
+
+ invIndex.deactivate();
+ invIndex.destroy();
+ }
+
+ @Test
+ public void wordTokensInvIndexTest() throws IOException, IndexException {
+ InvertedIndexTestContext testCtx = InvertedIndexTestUtils.createWordInvIndexTestContext(harness,
+ getBufferCache(), invIndexType);
+ TupleGenerator tupleGen = InvertedIndexTestUtils.createStringDocumentTupleGen(harness.getRandom());
+ runTest(testCtx, tupleGen);
+ }
+}
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexSearchTest.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexSearchTest.java
new file mode 100644
index 0000000..ecf0e5b
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexSearchTest.java
@@ -0,0 +1,148 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.common;
+
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+
+import org.junit.Test;
+
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.common.api.IIndexCursor;
+import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
+import edu.uci.ics.hyracks.storage.am.common.dataflow.IIndex;
+import edu.uci.ics.hyracks.storage.am.common.datagen.TupleGenerator;
+import edu.uci.ics.hyracks.storage.am.common.impls.NoOpOperationCallback;
+import edu.uci.ics.hyracks.storage.am.common.tuples.PermutingTupleReference;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexAccessor;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearchModifier;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.exceptions.OccurrenceThresholdPanicException;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.ConjunctiveSearchModifier;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.InvertedIndexSearchPredicate;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.TOccurrenceSearcher;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestContext;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestContext.InvertedIndexType;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestUtils;
+
+public abstract class AbstractInvertedIndexSearchTest extends AbstractInvertedIndexTest {
+
+ protected int NUM_QUERIES = 10000;
+ protected int[] scanCountArray = new int[NUM_DOCS_TO_INSERT];
+ protected final boolean bulkLoad;
+
+ // Probability that a randomly generated query is used, instead of a document from the corpus.
+ protected final float randomQueryProb = 0.2f;
+
+ public AbstractInvertedIndexSearchTest(InvertedIndexType invIndexType, boolean bulkLoad) {
+ super(invIndexType);
+ this.bulkLoad = bulkLoad;
+ }
+
+ protected void runTest(InvertedIndexTestContext testCtx, TupleGenerator tupleGen, IInvertedIndexSearchModifier searchModifier) throws IOException,
+ IndexException {
+ IIndex invIndex = testCtx.getIndex();
+ invIndex.create();
+ invIndex.activate();
+
+ if (bulkLoad) {
+ InvertedIndexTestUtils.bulkLoadInvIndex(testCtx, tupleGen, NUM_DOCS_TO_INSERT);
+ } else {
+ InvertedIndexTestUtils.insertIntoInvIndex(testCtx, tupleGen, NUM_DOCS_TO_INSERT);
+ }
+ invIndex.validate();
+
+ IInvertedIndexAccessor accessor = (IInvertedIndexAccessor) invIndex.createAccessor(
+ NoOpOperationCallback.INSTANCE, NoOpOperationCallback.INSTANCE);
+ IBinaryTokenizer tokenizer = testCtx.getTokenizerFactory().createTokenizer();
+ InvertedIndexSearchPredicate searchPred = new InvertedIndexSearchPredicate(tokenizer, searchModifier);
+ Random rnd = harness.getRandom();
+ List<ITupleReference> documentCorpus = testCtx.getDocumentCorpus();
+ // Project away the primary-key field.
+ int[] fieldPermutation = new int[] { 0 };
+ PermutingTupleReference searchDocument = new PermutingTupleReference(fieldPermutation);
+
+ IIndexCursor resultCursor = accessor.createSearchCursor();
+ for (int i = 0; i < NUM_QUERIES; i++) {
+ if (rnd.nextFloat() <= randomQueryProb) {
+ // Generate a random query.
+ ITupleReference randomQuery = tupleGen.next();
+ searchDocument.reset(randomQuery);
+ } else {
+ // Pick a random document from the corpus to use as the search query.
+ int queryIndex = Math.abs(rnd.nextInt() % documentCorpus.size());
+ searchDocument.reset(documentCorpus.get(queryIndex));
+ }
+
+ // Set query tuple in search predicate.
+ searchPred.setQueryTuple(searchDocument);
+ searchPred.setQueryFieldIndex(0);
+
+ resultCursor.reset();
+ boolean panic = false;
+ try {
+ accessor.search(resultCursor, searchPred);
+ } catch (OccurrenceThresholdPanicException e) {
+ // ignore panic queries
+ panic = true;
+ }
+
+ if (!panic) {
+ TOccurrenceSearcher searcher = (TOccurrenceSearcher) accessor.getSearcher();
+ List<Integer> expectedResults = new ArrayList<Integer>();
+ InvertedIndexTestUtils.getExpectedResults(scanCountArray, testCtx.getCheckTuples(), searchDocument,
+ tokenizer, testCtx.getFieldSerdes()[0], searcher.getOccurrenceThreshold(), expectedResults);
+
+ Iterator<Integer> expectedIter = expectedResults.iterator();
+ while (expectedIter.hasNext() && resultCursor.hasNext()) {
+ int expected = expectedIter.next();
+ resultCursor.next();
+ ITupleReference resultTuple = resultCursor.getTuple();
+ int actual = IntegerSerializerDeserializer
+ .getInt(resultTuple.getFieldData(0), resultTuple.getFieldStart(0));
+ if (actual != expected) {
+ fail("Query results do not match. Encountered: " + actual + ". Expected: " + expected + "");
+ }
+ }
+ if (expectedIter.hasNext()) {
+ fail("Query results do not match. Actual results missing.");
+ }
+ if (resultCursor.hasNext()) {
+ fail("Query results do not match. Actual contains too many results.");
+ }
+ }
+ }
+
+ invIndex.deactivate();
+ invIndex.destroy();
+ }
+
+ @Test
+ public void wordTokensInvIndexTest() throws IOException, IndexException {
+ InvertedIndexTestContext testCtx = InvertedIndexTestUtils.createWordInvIndexTestContext(harness,
+ getBufferCache(), invIndexType);
+ TupleGenerator tupleGen = InvertedIndexTestUtils.createStringDocumentTupleGen(harness.getRandom());
+ IInvertedIndexSearchModifier searchModifier = new ConjunctiveSearchModifier();
+ runTest(testCtx, tupleGen, searchModifier);
+ }
+
+}
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexTest.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexTest.java
new file mode 100644
index 0000000..2097994
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/common/AbstractInvertedIndexTest.java
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2009-2012 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.common;
+
+import org.junit.After;
+import org.junit.Before;
+
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.api.exceptions.HyracksException;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.LSMInvertedIndexTestHarness;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestContext.InvertedIndexType;
+import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
+
+public abstract class AbstractInvertedIndexTest {
+ protected final LSMInvertedIndexTestHarness harness = new LSMInvertedIndexTestHarness();
+
+ protected int NUM_DOCS_TO_INSERT = 10000;
+
+ protected final InvertedIndexType invIndexType;
+
+ public AbstractInvertedIndexTest(InvertedIndexType invIndexType) {
+ this.invIndexType = invIndexType;
+ }
+
+ @Before
+ public void setUp() throws HyracksException {
+ harness.setUp();
+ }
+
+ @After
+ public void tearDown() throws HyracksDataException {
+ harness.tearDown();
+ }
+
+ public abstract IBufferCache getBufferCache();
+}
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/inmemory/AbstractSingleComponentInvertedIndexTest.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/inmemory/AbstractSingleComponentInvertedIndexTest.java
deleted file mode 100644
index a035276..0000000
--- a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/inmemory/AbstractSingleComponentInvertedIndexTest.java
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright 2009-2012 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.inmemory;
-
-import static org.junit.Assert.fail;
-
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.SortedSet;
-
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-
-import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
-import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
-import edu.uci.ics.hyracks.api.exceptions.HyracksException;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleReference;
-import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
-import edu.uci.ics.hyracks.storage.am.btree.OrderedIndexTestUtils;
-import edu.uci.ics.hyracks.storage.am.common.CheckTuple;
-import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
-import edu.uci.ics.hyracks.storage.am.common.dataflow.IIndex;
-import edu.uci.ics.hyracks.storage.am.common.datagen.DocumentStringFieldValueGenerator;
-import edu.uci.ics.hyracks.storage.am.common.datagen.IFieldValueGenerator;
-import edu.uci.ics.hyracks.storage.am.common.datagen.SortedIntegerFieldValueGenerator;
-import edu.uci.ics.hyracks.storage.am.common.datagen.TupleGenerator;
-import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
-import edu.uci.ics.hyracks.storage.am.common.tuples.PermutingTupleReference;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.LSMInvertedIndexTestHarness;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndex;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexAccessor;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedListCursor;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizerFactory;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8WordTokenFactory;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestContext;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestContext.InvertedIndexType;
-import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
-
-@SuppressWarnings("rawtypes")
-public abstract class AbstractSingleComponentInvertedIndexTest {
- protected final LSMInvertedIndexTestHarness harness = new LSMInvertedIndexTestHarness();
-
- protected int NUM_DOCS_TO_INSERT = 10000;
-
- protected final InvertedIndexType invIndexType;
-
- public AbstractSingleComponentInvertedIndexTest(InvertedIndexType invIndexType) {
- this.invIndexType = invIndexType;
- }
-
- @Before
- public void setUp() throws HyracksException {
- harness.setUp();
- }
-
- public abstract void loadIndex(TupleGenerator tupleGen, InvertedIndexTestContext testCtx) throws IOException,
- IndexException;
-
- public abstract IBufferCache getBufferCache();
-
- @Test
- public void stringDocumentTest() throws IOException, IndexException {
- ISerializerDeserializer[] fieldSerdes = new ISerializerDeserializer[] {
- UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
- IFieldValueGenerator[] fieldGens = new IFieldValueGenerator[2];
- fieldGens[0] = new DocumentStringFieldValueGenerator(2, 10, 10000, harness.getRandom());
- fieldGens[1] = new SortedIntegerFieldValueGenerator(0);
- TupleGenerator tupleGen = new TupleGenerator(fieldGens, fieldSerdes, 0);
-
- ITokenFactory tokenFactory = new UTF8WordTokenFactory();
- IBinaryTokenizerFactory tokenizerFactory = new DelimitedUTF8StringBinaryTokenizerFactory(true, false,
- tokenFactory);
-
- InvertedIndexTestContext testCtx = InvertedIndexTestContext.create(getBufferCache(),
- harness.getMemFreePageManager(), harness.getDiskFileMapProvider(), harness.getInvListsFileRef(),
- fieldSerdes, 1, tokenizerFactory, invIndexType);
-
- IIndex invIndex = testCtx.getIndex();
- invIndex.create();
- invIndex.activate();
-
- loadIndex(tupleGen, testCtx);
-
- // Validate index and compare against expected index.
- invIndex.validate();
- compareActualAndExpectedIndexes(testCtx);
-
- invIndex.deactivate();
- invIndex.destroy();
- }
-
- @SuppressWarnings("unchecked")
- protected void compareActualAndExpectedIndexes(InvertedIndexTestContext testCtx) throws HyracksDataException,
- IndexException {
- IInvertedIndex invIndex = (IInvertedIndex) testCtx.getIndex();
- ISerializerDeserializer[] fieldSerdes = testCtx.getFieldSerdes();
- MultiComparator invListCmp = MultiComparator.create(invIndex.getInvListCmpFactories());
- IInvertedIndexAccessor invIndexAccessor = (IInvertedIndexAccessor) testCtx.getIndexAccessor();
- int tokenFieldCount = invIndex.getTokenTypeTraits().length;
- int invListFieldCount = invIndex.getInvListTypeTraits().length;
- // All tokens that were inserted into the indexes.
- Iterator<Comparable> tokensIter = testCtx.getAllTokens().iterator();
-
- // Search key for finding an inverted-list in the actual index.
- ArrayTupleBuilder searchKeyBuilder = new ArrayTupleBuilder(tokenFieldCount);
- ArrayTupleReference searchKey = new ArrayTupleReference();
- // Cursor over inverted list from actual index.
- IInvertedListCursor actualInvListCursor = invIndexAccessor.createInvertedListCursor();
-
- // Helpers for generating a serialized inverted-list element from a CheckTuple from the expected index.
- ArrayTupleBuilder expectedBuilder = new ArrayTupleBuilder(fieldSerdes.length);
- // Includes the token fields.
- ArrayTupleReference completeExpectedTuple = new ArrayTupleReference();
- // Field permutation and permuting tuple reference to strip away token fields from completeExpectedTuple.
- int[] fieldPermutation = new int[invListFieldCount];
- for (int i = 0; i < fieldPermutation.length; i++) {
- fieldPermutation[i] = tokenFieldCount + i;
- }
- PermutingTupleReference expectedTuple = new PermutingTupleReference(fieldPermutation);
-
- // Iterate over all tokens. Find the inverted-lists in actual and expected indexes. Compare the inverted lists,
- while (tokensIter.hasNext()) {
- Comparable token = tokensIter.next();
-
- // Position inverted-list iterator on expected index.
- CheckTuple checkLowKey = new CheckTuple(tokenFieldCount, tokenFieldCount);
- checkLowKey.appendField(token);
- CheckTuple checkHighKey = new CheckTuple(tokenFieldCount, tokenFieldCount);
- checkHighKey.appendField(token);
- SortedSet<CheckTuple> expectedInvList = OrderedIndexTestUtils.getPrefixExpectedSubset(
- testCtx.getCheckTuples(), checkLowKey, checkHighKey);
- Iterator<CheckTuple> expectedInvListIter = expectedInvList.iterator();
-
- // Position inverted-list cursor in actual index.
- OrderedIndexTestUtils.createTupleFromCheckTuple(checkLowKey, searchKeyBuilder, searchKey, fieldSerdes);
- invIndexAccessor.openInvertedListCursor(actualInvListCursor, searchKey);
-
- if (actualInvListCursor.size() != expectedInvList.size()) {
- fail("Actual and expected inverted lists for token '" + token.toString()
- + "' have different sizes. Actual size: " + actualInvListCursor.size() + ". Expected size: "
- + expectedInvList.size() + ".");
- }
- // Compare inverted-list elements.
- int count = 0;
- actualInvListCursor.pinPages();
- try {
- while (actualInvListCursor.hasNext() && expectedInvListIter.hasNext()) {
- actualInvListCursor.next();
- ITupleReference actual = actualInvListCursor.getTuple();
- CheckTuple expected = expectedInvListIter.next();
- OrderedIndexTestUtils.createTupleFromCheckTuple(expected, expectedBuilder, completeExpectedTuple,
- fieldSerdes);
- expectedTuple.reset(completeExpectedTuple);
- if (invListCmp.compare(actual, expectedTuple) != 0) {
- fail("Inverted lists of token '" + token + "' differ at position " + count + ".");
- }
- count++;
- }
- } finally {
- actualInvListCursor.unpinPages();
- }
- }
- }
-
- @After
- public void tearDown() throws HyracksDataException {
- harness.tearDown();
- }
-}
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedIndexInsertTest.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedIndexInsertTest.java
index dfc4881..c1ee925 100644
--- a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedIndexInsertTest.java
+++ b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedIndexInsertTest.java
@@ -15,29 +15,14 @@
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.inmemory;
-import java.io.IOException;
-
-import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
-import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
-import edu.uci.ics.hyracks.storage.am.common.datagen.TupleGenerator;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestContext;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.common.AbstractInvertedIndexLoadTest;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestContext.InvertedIndexType;
import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
-public class InMemoryInvertedIndexInsertTest extends AbstractSingleComponentInvertedIndexTest {
+public class InMemoryInvertedIndexInsertTest extends AbstractInvertedIndexLoadTest {
public InMemoryInvertedIndexInsertTest() {
- super(InvertedIndexType.INMEMORY);
- }
-
- @Override
- public void loadIndex(TupleGenerator tupleGen, InvertedIndexTestContext testCtx) throws IOException, IndexException {
- // InMemoryInvertedIndex only supports insert.
- for (int i = 0; i < NUM_DOCS_TO_INSERT; i++) {
- ITupleReference tuple = tupleGen.next();
- testCtx.getIndexAccessor().insert(tuple);
- testCtx.insertCheckTuples(tuple);
- }
+ super(InvertedIndexType.INMEMORY, false);
}
@Override
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedIndexSearchTest.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedIndexSearchTest.java
deleted file mode 100644
index 98249ac..0000000
--- a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedIndexSearchTest.java
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright 2009-2010 by The Regents of the University of California
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * you may obtain a copy of the License from
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.inmemory;
-
-import java.util.logging.Level;
-import java.util.logging.Logger;
-
-import org.junit.Test;
-
-import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.AbstractInvertedIndexTest;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearchModifier;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.ConjunctiveSearchModifier;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.EditDistanceSearchModifier;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.JaccardSearchModifier;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8NGramTokenFactory;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestUtils;
-
-public class InMemoryInvertedIndexSearchTest extends AbstractInvertedIndexTest {
-
- /**
- * Runs 5 random conjunctive search queries to test the
- * ConjunctiveSearchModifier.
- */
- @Test
- public void conjunctiveQueryTest() throws Exception {
- insertDocuments();
- IInvertedIndexSearchModifier searchModifier = new ConjunctiveSearchModifier();
- runQueries(searchModifier, 5);
- }
-
- /**
- * Runs 5 random jaccard-based search queries with thresholds 0.9, 0.8, 0.7.
- * Tests the JaccardSearchModifier.
- */
- @Test
- public void jaccardQueryTest() throws Exception {
- insertDocuments();
- JaccardSearchModifier searchModifier = new JaccardSearchModifier(1.0f);
-
- if (LOGGER.isLoggable(Level.INFO)) {
- LOGGER.info("JACCARD: " + 0.9f);
- }
- searchModifier.setJaccThresh(0.9f);
- runQueries(searchModifier, 5);
-
- if (LOGGER.isLoggable(Level.INFO)) {
- LOGGER.info("JACCARD: " + 0.8f);
- }
- searchModifier.setJaccThresh(0.8f);
- runQueries(searchModifier, 5);
-
- if (LOGGER.isLoggable(Level.INFO)) {
- LOGGER.info("JACCARD: " + 0.7f);
- }
- searchModifier.setJaccThresh(0.7f);
- runQueries(searchModifier, 5);
- }
-
- /**
- * Runs 5 random edit-distance based search queries with thresholds 1, 2, 3.
- * Tests the EditDistanceSearchModifier.
- */
- @Test
- public void editDistanceQueryTest() throws Exception {
- insertDocuments();
- EditDistanceSearchModifier searchModifier = new EditDistanceSearchModifier(3, 0);
-
- if (LOGGER.isLoggable(Level.INFO)) {
- LOGGER.info("EDIT DISTANCE: " + 1);
- }
- searchModifier.setEdThresh(1);
- runQueries(searchModifier, 5);
-
- if (LOGGER.isLoggable(Level.INFO)) {
- LOGGER.info("EDIT DISTANCE: " + 2);
- }
- searchModifier.setEdThresh(2);
- runQueries(searchModifier, 5);
-
- if (LOGGER.isLoggable(Level.INFO)) {
- LOGGER.info("EDIT DISTANCE: " + 3);
- }
- searchModifier.setEdThresh(3);
- runQueries(searchModifier, 5);
- }
-
- @Override
- protected void setTokenizer() {
- ITokenFactory tokenFactory = new UTF8NGramTokenFactory();
- tokenizer = new NGramUTF8StringBinaryTokenizer(3, false, true, false, tokenFactory);
- }
-
- @Override
- protected void setInvertedIndex() throws HyracksDataException {
- invertedIndex = InvertedIndexTestUtils.createInMemoryInvertedIndex(harness, tokenizer);
- invertedIndex.create(harness.getFileId());
- invertedIndex.open(harness.getFileId());
- }
-
- @Override
- protected void setLogger() {
- LOGGER = Logger.getLogger(InMemoryInvertedIndexSearchTest.class.getName());
- }
-
- @Override
- protected void setRandom() {
- random = false;
- }
-
-}
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndexBulkLoadTest.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndexBulkLoadTest.java
index 1c14df0..700b70f 100644
--- a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndexBulkLoadTest.java
+++ b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndexBulkLoadTest.java
@@ -15,50 +15,14 @@
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.ondisk;
-import java.io.IOException;
-import java.util.Iterator;
-
-import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
-import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleReference;
-import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
-import edu.uci.ics.hyracks.storage.am.btree.OrderedIndexTestUtils;
-import edu.uci.ics.hyracks.storage.am.common.CheckTuple;
-import edu.uci.ics.hyracks.storage.am.common.api.IIndexBulkLoader;
-import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
-import edu.uci.ics.hyracks.storage.am.common.datagen.TupleGenerator;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.inmemory.AbstractSingleComponentInvertedIndexTest;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestContext;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.common.AbstractInvertedIndexLoadTest;
import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestContext.InvertedIndexType;
import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
-@SuppressWarnings("rawtypes")
-public class OnDiskInvertedIndexBulkLoadTest extends AbstractSingleComponentInvertedIndexTest {
+public class OnDiskInvertedIndexBulkLoadTest extends AbstractInvertedIndexLoadTest {
public OnDiskInvertedIndexBulkLoadTest() {
- super(InvertedIndexType.ONDISK);
- }
-
- @Override
- public void loadIndex(TupleGenerator tupleGen, InvertedIndexTestContext testCtx) throws IOException, IndexException {
- // First generate the expected index by inserting the documents one-by-one.
- for (int i = 0; i < NUM_DOCS_TO_INSERT; i++) {
- ITupleReference tuple = tupleGen.next();
- testCtx.insertCheckTuples(tuple);
- }
- ISerializerDeserializer[] fieldSerdes = testCtx.getFieldSerdes();
-
- // Use the expected index to bulk-load the actual index.
- IIndexBulkLoader bulkLoader = testCtx.getIndex().createBulkLoader(1.0f, false);
- ArrayTupleBuilder tupleBuilder = new ArrayTupleBuilder(testCtx.getFieldSerdes().length);
- ArrayTupleReference tuple = new ArrayTupleReference();
- Iterator<CheckTuple> checkTupleIter = testCtx.getCheckTuples().iterator();
- while (checkTupleIter.hasNext()) {
- CheckTuple checkTuple = checkTupleIter.next();
- OrderedIndexTestUtils.createTupleFromCheckTuple(checkTuple, tupleBuilder, tuple, fieldSerdes);
- bulkLoader.add(tuple);
- }
- bulkLoader.end();
+ super(InvertedIndexType.ONDISK, true);
}
@Override
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndexSearchTest.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndexSearchTest.java
index 6f93014..702dc49 100644
--- a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndexSearchTest.java
+++ b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndexSearchTest.java
@@ -1,5 +1,5 @@
/*
- * Copyright 2009-2010 by The Regents of the University of California
+ * Copyright 2009-2012 by The Regents of the University of California
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
@@ -15,281 +15,18 @@
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.ondisk;
-import java.io.DataOutputStream;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.logging.Level;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.common.AbstractInvertedIndexSearchTest;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestContext.InvertedIndexType;
+import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
-import org.junit.Before;
-import org.junit.Test;
+public class OnDiskInvertedIndexSearchTest extends AbstractInvertedIndexSearchTest {
-import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
-import edu.uci.ics.hyracks.data.std.util.ByteArrayAccessibleOutputStream;
-import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
-import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
-import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
-import edu.uci.ics.hyracks.storage.am.common.api.IIndexBulkLoader;
-import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
-import edu.uci.ics.hyracks.storage.am.common.impls.NoOpOperationCallback;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearchModifier;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.exceptions.OccurrenceThresholdPanicException;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.ondisk.OnDiskInvertedIndex.OnDiskInvertedIndexAccessor;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.ConjunctiveSearchModifier;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.EditDistanceSearchModifier;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.InvertedIndexSearchPredicate;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.JaccardSearchModifier;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8NGramTokenFactory;
-
-public class OnDiskInvertedIndexSearchTest extends AbstractInvIndexSearchTest {
-
- protected List<String> dataStrings = new ArrayList<String>();
- protected List<String> firstNames = new ArrayList<String>();
- protected List<String> lastNames = new ArrayList<String>();
-
- protected IBinaryComparator[] btreeBinCmps;
+ public OnDiskInvertedIndexSearchTest() {
+ super(InvertedIndexType.ONDISK, true);
+ }
@Override
- protected void setTokenizer() {
- tokenFactory = new UTF8NGramTokenFactory();
- tokenizer = new NGramUTF8StringBinaryTokenizer(3, false, true, false, tokenFactory);
- }
-
- @Before
- public void start() throws Exception {
- super.start();
- btreeBinCmps = new IBinaryComparator[tokenCmpFactories.length];
- for (int i = 0; i < tokenCmpFactories.length; i++) {
- btreeBinCmps[i] = tokenCmpFactories[i].createBinaryComparator();
- }
- generateDataStrings();
- loadData();
- }
-
- public void generateDataStrings() {
- firstNames.add("Kathrin");
- firstNames.add("Cathrin");
- firstNames.add("Kathryn");
- firstNames.add("Cathryn");
- firstNames.add("Kathrine");
- firstNames.add("Cathrine");
- firstNames.add("Kathryne");
- firstNames.add("Cathryne");
- firstNames.add("Katherin");
- firstNames.add("Catherin");
- firstNames.add("Katheryn");
- firstNames.add("Catheryn");
- firstNames.add("Katherine");
- firstNames.add("Catherine");
- firstNames.add("Katheryne");
- firstNames.add("Catheryne");
- firstNames.add("John");
- firstNames.add("Jack");
- firstNames.add("Jonathan");
- firstNames.add("Nathan");
-
- lastNames.add("Miller");
- lastNames.add("Myller");
- lastNames.add("Keller");
- lastNames.add("Ketler");
- lastNames.add("Muller");
- lastNames.add("Fuller");
- lastNames.add("Smith");
- lastNames.add("Smyth");
- lastNames.add("Smithe");
- lastNames.add("Smythe");
-
- // Generate all 'firstName lastName' combinations as data strings
- for (String f : firstNames) {
- for (String l : lastNames) {
- dataStrings.add(f + " " + l);
- }
- }
- }
-
- private class TokenIdPair implements Comparable<TokenIdPair> {
- public ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
- public DataOutputStream dos = new DataOutputStream(baaos);
- public int id;
-
- TokenIdPair(IToken token, int id) throws IOException {
- token.serializeToken(dos);
- this.id = id;
- }
-
- @Override
- public int compareTo(TokenIdPair o) {
- int cmp = btreeBinCmps[0].compare(baaos.getByteArray(), 0, baaos.getByteArray().length,
- o.baaos.getByteArray(), 0, o.baaos.getByteArray().length);
- if (cmp == 0) {
- return id - o.id;
- } else {
- return cmp;
- }
- }
- }
-
- public void loadData() throws IOException, IndexException {
- List<TokenIdPair> pairs = new ArrayList<TokenIdPair>();
- // Generate pairs for subsequent sorting and bulk-loading.
- int id = 0;
- for (String s : dataStrings) {
- ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
- DataOutputStream dos = new DataOutputStream(baaos);
- UTF8StringSerializerDeserializer.INSTANCE.serialize(s, dos);
- tokenizer.reset(baaos.getByteArray(), 0, baaos.size());
- while (tokenizer.hasNext()) {
- tokenizer.next();
- IToken token = tokenizer.getToken();
- pairs.add(new TokenIdPair(token, id));
- }
- ++id;
- }
- Collections.sort(pairs);
-
- // Bulk load index.
- IIndexBulkLoader bulkLoader = invIndex.createBulkLoader(BTree.DEFAULT_FILL_FACTOR, false);
-
- for (TokenIdPair t : pairs) {
- tb.reset();
- tb.addField(t.baaos.getByteArray(), 0, t.baaos.getByteArray().length);
- IntegerSerializerDeserializer.INSTANCE.serialize(t.id, tb.getDataOutput());
- tb.addFieldEndOffset();
- tuple.reset(tb.getFieldEndOffsets(), tb.getByteArray());
-
- try {
- bulkLoader.add(tuple);
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- bulkLoader.end();
- }
-
- /**
- * Runs a specified number of randomly picked strings from dataStrings as
- * queries. We run each query, measure it's time, and print it's results.
- */
- private void runQueries(IInvertedIndexSearchModifier searchModifier, int numQueries) throws Exception {
- rnd.setSeed(50);
- OnDiskInvertedIndexAccessor accessor = (OnDiskInvertedIndexAccessor) invIndex.createAccessor(
- NoOpOperationCallback.INSTANCE, NoOpOperationCallback.INSTANCE);
- InvertedIndexSearchPredicate searchPred = new InvertedIndexSearchPredicate(tokenizer, searchModifier);
- for (int i = 0; i < numQueries; i++) {
-
- int queryIndex = Math.abs(rnd.nextInt() % dataStrings.size());
- String queryString = dataStrings.get(queryIndex);
-
- // Serialize query.
- queryTb.reset();
- UTF8StringSerializerDeserializer.INSTANCE.serialize(queryString, queryTb.getDataOutput());
- queryTb.addFieldEndOffset();
- queryTuple.reset(queryTb.getFieldEndOffsets(), queryTb.getByteArray());
-
- // Set query tuple in search predicate.
- searchPred.setQueryTuple(queryTuple);
- searchPred.setQueryFieldIndex(0);
-
- resultCursor = accessor.createSearchCursor();
-
- int repeats = 1;
- double totalTime = 0;
- for (int j = 0; j < repeats; j++) {
- long timeStart = System.currentTimeMillis();
- try {
- resultCursor.reset();
- accessor.search(resultCursor, searchPred);
- } catch (OccurrenceThresholdPanicException e) {
- // ignore panic queries
- }
- long timeEnd = System.currentTimeMillis();
- totalTime += timeEnd - timeStart;
- }
- double avgTime = totalTime / (double) repeats;
- StringBuilder strBuilder = new StringBuilder();
- strBuilder.append(i + ": " + "\"" + queryString + "\": " + avgTime + "ms" + "\n");
- strBuilder.append("CANDIDATE RESULTS:\n");
- while (resultCursor.hasNext()) {
- resultCursor.next();
- ITupleReference resultTuple = resultCursor.getTuple();
- int id = IntegerSerializerDeserializer
- .getInt(resultTuple.getFieldData(0), resultTuple.getFieldStart(0));
- strBuilder.append(id + " " + dataStrings.get(id));
- strBuilder.append('\n');
- }
- // remove trailing newline
- strBuilder.deleteCharAt(strBuilder.length() - 1);
- if (LOGGER.isLoggable(Level.INFO)) {
- LOGGER.info(strBuilder.toString());
- }
- }
- }
-
- /**
- * Runs 5 random conjunctive search queries to test the
- * ConjunctiveSearchModifier.
- */
- @Test
- public void conjunctiveQueryTest() throws Exception {
- IInvertedIndexSearchModifier searchModifier = new ConjunctiveSearchModifier();
- runQueries(searchModifier, 5);
- }
-
- /**
- * Runs 5 random jaccard-based search queries with thresholds 0.9, 0.8, 0.7.
- * Tests the JaccardSearchModifier.
- */
- @Test
- public void jaccardQueryTest() throws Exception {
- JaccardSearchModifier searchModifier = new JaccardSearchModifier(1.0f);
-
- if (LOGGER.isLoggable(Level.INFO)) {
- LOGGER.info("JACCARD: " + 0.9f);
- }
- searchModifier.setJaccThresh(0.9f);
- runQueries(searchModifier, 5);
-
- if (LOGGER.isLoggable(Level.INFO)) {
- LOGGER.info("JACCARD: " + 0.8f);
- }
- searchModifier.setJaccThresh(0.8f);
- runQueries(searchModifier, 5);
-
- if (LOGGER.isLoggable(Level.INFO)) {
- LOGGER.info("JACCARD: " + 0.7f);
- }
- searchModifier.setJaccThresh(0.7f);
- runQueries(searchModifier, 5);
- }
-
- /**
- * Runs 5 random edit-distance based search queries with thresholds 1, 2, 3.
- * Tests the EditDistanceSearchModifier.
- */
- @Test
- public void editDistanceQueryTest() throws Exception {
- EditDistanceSearchModifier searchModifier = new EditDistanceSearchModifier(3, 0);
-
- if (LOGGER.isLoggable(Level.INFO)) {
- LOGGER.info("EDIT DISTANCE: " + 1);
- }
- searchModifier.setEdThresh(1);
- runQueries(searchModifier, 5);
-
- if (LOGGER.isLoggable(Level.INFO)) {
- LOGGER.info("EDIT DISTANCE: " + 2);
- }
- searchModifier.setEdThresh(2);
- runQueries(searchModifier, 5);
-
- if (LOGGER.isLoggable(Level.INFO)) {
- LOGGER.info("EDIT DISTANCE: " + 3);
- }
- searchModifier.setEdThresh(3);
- runQueries(searchModifier, 5);
+ public IBufferCache getBufferCache() {
+ return harness.getDiskBufferCache();
}
}
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndexSearchTestOld.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndexSearchTestOld.java
new file mode 100644
index 0000000..f00747a
--- /dev/null
+++ b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/ondisk/OnDiskInvertedIndexSearchTestOld.java
@@ -0,0 +1,295 @@
+/*
+ * Copyright 2009-2010 by The Regents of the University of California
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * you may obtain a copy of the License from
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.ondisk;
+
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.logging.Level;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparator;
+import edu.uci.ics.hyracks.data.std.util.ByteArrayAccessibleOutputStream;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
+import edu.uci.ics.hyracks.storage.am.common.api.IIndexBulkLoader;
+import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
+import edu.uci.ics.hyracks.storage.am.common.impls.NoOpOperationCallback;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearchModifier;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.exceptions.OccurrenceThresholdPanicException;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.ondisk.OnDiskInvertedIndex.OnDiskInvertedIndexAccessor;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.ConjunctiveSearchModifier;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.EditDistanceSearchModifier;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.InvertedIndexSearchPredicate;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.search.JaccardSearchModifier;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8NGramTokenFactory;
+
+public class OnDiskInvertedIndexSearchTestOld extends AbstractInvIndexSearchTest {
+
+ protected List<String> dataStrings = new ArrayList<String>();
+ protected List<String> firstNames = new ArrayList<String>();
+ protected List<String> lastNames = new ArrayList<String>();
+
+ protected IBinaryComparator[] btreeBinCmps;
+
+ @Override
+ protected void setTokenizer() {
+ tokenFactory = new UTF8NGramTokenFactory();
+ tokenizer = new NGramUTF8StringBinaryTokenizer(3, false, true, false, tokenFactory);
+ }
+
+ @Before
+ public void start() throws Exception {
+ super.start();
+ btreeBinCmps = new IBinaryComparator[tokenCmpFactories.length];
+ for (int i = 0; i < tokenCmpFactories.length; i++) {
+ btreeBinCmps[i] = tokenCmpFactories[i].createBinaryComparator();
+ }
+ generateDataStrings();
+ loadData();
+ }
+
+ public void generateDataStrings() {
+ firstNames.add("Kathrin");
+ firstNames.add("Cathrin");
+ firstNames.add("Kathryn");
+ firstNames.add("Cathryn");
+ firstNames.add("Kathrine");
+ firstNames.add("Cathrine");
+ firstNames.add("Kathryne");
+ firstNames.add("Cathryne");
+ firstNames.add("Katherin");
+ firstNames.add("Catherin");
+ firstNames.add("Katheryn");
+ firstNames.add("Catheryn");
+ firstNames.add("Katherine");
+ firstNames.add("Catherine");
+ firstNames.add("Katheryne");
+ firstNames.add("Catheryne");
+ firstNames.add("John");
+ firstNames.add("Jack");
+ firstNames.add("Jonathan");
+ firstNames.add("Nathan");
+
+ lastNames.add("Miller");
+ lastNames.add("Myller");
+ lastNames.add("Keller");
+ lastNames.add("Ketler");
+ lastNames.add("Muller");
+ lastNames.add("Fuller");
+ lastNames.add("Smith");
+ lastNames.add("Smyth");
+ lastNames.add("Smithe");
+ lastNames.add("Smythe");
+
+ // Generate all 'firstName lastName' combinations as data strings
+ for (String f : firstNames) {
+ for (String l : lastNames) {
+ dataStrings.add(f + " " + l);
+ }
+ }
+ }
+
+ private class TokenIdPair implements Comparable<TokenIdPair> {
+ public ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
+ public DataOutputStream dos = new DataOutputStream(baaos);
+ public int id;
+
+ TokenIdPair(IToken token, int id) throws IOException {
+ token.serializeToken(dos);
+ this.id = id;
+ }
+
+ @Override
+ public int compareTo(TokenIdPair o) {
+ int cmp = btreeBinCmps[0].compare(baaos.getByteArray(), 0, baaos.getByteArray().length,
+ o.baaos.getByteArray(), 0, o.baaos.getByteArray().length);
+ if (cmp == 0) {
+ return id - o.id;
+ } else {
+ return cmp;
+ }
+ }
+ }
+
+ public void loadData() throws IOException, IndexException {
+ List<TokenIdPair> pairs = new ArrayList<TokenIdPair>();
+ // Generate pairs for subsequent sorting and bulk-loading.
+ int id = 0;
+ for (String s : dataStrings) {
+ ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
+ DataOutputStream dos = new DataOutputStream(baaos);
+ UTF8StringSerializerDeserializer.INSTANCE.serialize(s, dos);
+ tokenizer.reset(baaos.getByteArray(), 0, baaos.size());
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
+ IToken token = tokenizer.getToken();
+ pairs.add(new TokenIdPair(token, id));
+ }
+ ++id;
+ }
+ Collections.sort(pairs);
+
+ // Bulk load index.
+ IIndexBulkLoader bulkLoader = invIndex.createBulkLoader(BTree.DEFAULT_FILL_FACTOR, false);
+
+ for (TokenIdPair t : pairs) {
+ tb.reset();
+ tb.addField(t.baaos.getByteArray(), 0, t.baaos.getByteArray().length);
+ IntegerSerializerDeserializer.INSTANCE.serialize(t.id, tb.getDataOutput());
+ tb.addFieldEndOffset();
+ tuple.reset(tb.getFieldEndOffsets(), tb.getByteArray());
+
+ try {
+ bulkLoader.add(tuple);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+ bulkLoader.end();
+ }
+
+ /**
+ * Runs a specified number of randomly picked strings from dataStrings as
+ * queries. We run each query, measure it's time, and print it's results.
+ */
+ private void runQueries(IInvertedIndexSearchModifier searchModifier, int numQueries) throws Exception {
+ rnd.setSeed(50);
+ OnDiskInvertedIndexAccessor accessor = (OnDiskInvertedIndexAccessor) invIndex.createAccessor(
+ NoOpOperationCallback.INSTANCE, NoOpOperationCallback.INSTANCE);
+ InvertedIndexSearchPredicate searchPred = new InvertedIndexSearchPredicate(tokenizer, searchModifier);
+ for (int i = 0; i < numQueries; i++) {
+
+ int queryIndex = Math.abs(rnd.nextInt() % dataStrings.size());
+ String queryString = dataStrings.get(queryIndex);
+
+ // Serialize query.
+ queryTb.reset();
+ UTF8StringSerializerDeserializer.INSTANCE.serialize(queryString, queryTb.getDataOutput());
+ queryTb.addFieldEndOffset();
+ queryTuple.reset(queryTb.getFieldEndOffsets(), queryTb.getByteArray());
+
+ // Set query tuple in search predicate.
+ searchPred.setQueryTuple(queryTuple);
+ searchPred.setQueryFieldIndex(0);
+
+ resultCursor = accessor.createSearchCursor();
+
+ int repeats = 1;
+ double totalTime = 0;
+ for (int j = 0; j < repeats; j++) {
+ long timeStart = System.currentTimeMillis();
+ try {
+ resultCursor.reset();
+ accessor.search(resultCursor, searchPred);
+ } catch (OccurrenceThresholdPanicException e) {
+ // ignore panic queries
+ }
+ long timeEnd = System.currentTimeMillis();
+ totalTime += timeEnd - timeStart;
+ }
+ double avgTime = totalTime / (double) repeats;
+ StringBuilder strBuilder = new StringBuilder();
+ strBuilder.append(i + ": " + "\"" + queryString + "\": " + avgTime + "ms" + "\n");
+ strBuilder.append("CANDIDATE RESULTS:\n");
+ while (resultCursor.hasNext()) {
+ resultCursor.next();
+ ITupleReference resultTuple = resultCursor.getTuple();
+ int id = IntegerSerializerDeserializer
+ .getInt(resultTuple.getFieldData(0), resultTuple.getFieldStart(0));
+ strBuilder.append(id + " " + dataStrings.get(id));
+ strBuilder.append('\n');
+ }
+ // remove trailing newline
+ strBuilder.deleteCharAt(strBuilder.length() - 1);
+ if (LOGGER.isLoggable(Level.INFO)) {
+ LOGGER.info(strBuilder.toString());
+ }
+ }
+ }
+
+ /**
+ * Runs 5 random conjunctive search queries to test the
+ * ConjunctiveSearchModifier.
+ */
+ @Test
+ public void conjunctiveQueryTest() throws Exception {
+ IInvertedIndexSearchModifier searchModifier = new ConjunctiveSearchModifier();
+ runQueries(searchModifier, 5);
+ }
+
+ /**
+ * Runs 5 random jaccard-based search queries with thresholds 0.9, 0.8, 0.7.
+ * Tests the JaccardSearchModifier.
+ */
+ @Test
+ public void jaccardQueryTest() throws Exception {
+ JaccardSearchModifier searchModifier = new JaccardSearchModifier(1.0f);
+
+ if (LOGGER.isLoggable(Level.INFO)) {
+ LOGGER.info("JACCARD: " + 0.9f);
+ }
+ searchModifier.setJaccThresh(0.9f);
+ runQueries(searchModifier, 5);
+
+ if (LOGGER.isLoggable(Level.INFO)) {
+ LOGGER.info("JACCARD: " + 0.8f);
+ }
+ searchModifier.setJaccThresh(0.8f);
+ runQueries(searchModifier, 5);
+
+ if (LOGGER.isLoggable(Level.INFO)) {
+ LOGGER.info("JACCARD: " + 0.7f);
+ }
+ searchModifier.setJaccThresh(0.7f);
+ runQueries(searchModifier, 5);
+ }
+
+ /**
+ * Runs 5 random edit-distance based search queries with thresholds 1, 2, 3.
+ * Tests the EditDistanceSearchModifier.
+ */
+ @Test
+ public void editDistanceQueryTest() throws Exception {
+ EditDistanceSearchModifier searchModifier = new EditDistanceSearchModifier(3, 0);
+
+ if (LOGGER.isLoggable(Level.INFO)) {
+ LOGGER.info("EDIT DISTANCE: " + 1);
+ }
+ searchModifier.setEdThresh(1);
+ runQueries(searchModifier, 5);
+
+ if (LOGGER.isLoggable(Level.INFO)) {
+ LOGGER.info("EDIT DISTANCE: " + 2);
+ }
+ searchModifier.setEdThresh(2);
+ runQueries(searchModifier, 5);
+
+ if (LOGGER.isLoggable(Level.INFO)) {
+ LOGGER.info("EDIT DISTANCE: " + 3);
+ }
+ searchModifier.setEdThresh(3);
+ runQueries(searchModifier, 5);
+ }
+}
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTestContext.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTestContext.java
index 5648d33..2fbede6 100644
--- a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTestContext.java
+++ b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTestContext.java
@@ -18,8 +18,10 @@
import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
+import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
+import java.util.List;
import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
@@ -28,6 +30,7 @@
import edu.uci.ics.hyracks.api.io.FileReference;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
import edu.uci.ics.hyracks.dataflow.common.util.SerdeUtils;
+import edu.uci.ics.hyracks.dataflow.common.util.TupleUtils;
import edu.uci.ics.hyracks.storage.am.btree.OrderedIndexTestContext;
import edu.uci.ics.hyracks.storage.am.common.CheckTuple;
import edu.uci.ics.hyracks.storage.am.common.api.IFreePageManager;
@@ -51,7 +54,8 @@
protected IBinaryComparatorFactory[] allCmpFactories;
protected IBinaryTokenizerFactory tokenizerFactory;
protected InvertedIndexInsertTupleIterator indexInsertIter;
- protected HashSet<Comparable> allTokens = new HashSet<Comparable>();
+ protected HashSet<Comparable> allTokens = new HashSet<Comparable>();
+ protected List<ITupleReference> documentCorpus = new ArrayList<ITupleReference>();
public InvertedIndexTestContext(ISerializerDeserializer[] fieldSerdes, IIndex index,
IBinaryTokenizerFactory tokenizerFactory) {
@@ -128,8 +132,9 @@
return testCtx;
}
- public void insertCheckTuples(ITupleReference tuple) throws HyracksDataException {
- indexInsertIter.reset(tuple);
+ public void insertCheckTuples(ITupleReference tuple, Collection<CheckTuple> checkTuples) throws HyracksDataException {
+ documentCorpus.add(TupleUtils.copyTuple(tuple));
+ indexInsertIter.reset(tuple);
while (indexInsertIter.hasNext()) {
indexInsertIter.next();
ITupleReference insertTuple = indexInsertIter.getTuple();
@@ -139,7 +144,7 @@
}
}
- public void deleteCheckTuples(ITupleReference tuple) throws HyracksDataException {
+ public void deleteCheckTuples(ITupleReference tuple, Collection<CheckTuple> checkTuples) throws HyracksDataException {
indexInsertIter.reset(tuple);
while (indexInsertIter.hasNext()) {
indexInsertIter.next();
@@ -169,4 +174,12 @@
public void upsertCheckTuple(CheckTuple checkTuple, Collection<CheckTuple> checkTuples) {
throw new UnsupportedOperationException("Upsert not supported by inverted index.");
}
+
+ public IBinaryTokenizerFactory getTokenizerFactory() {
+ return tokenizerFactory;
+ }
+
+ public List<ITupleReference> getDocumentCorpus() {
+ return documentCorpus;
+ }
}
diff --git a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTestUtils.java b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTestUtils.java
index 06fcb44..3b626ae 100644
--- a/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTestUtils.java
+++ b/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTestUtils.java
@@ -15,31 +15,236 @@
package edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util;
-import edu.uci.ics.hyracks.api.dataflow.value.IBinaryComparatorFactory;
-import edu.uci.ics.hyracks.api.dataflow.value.ITypeTraits;
-import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
-import edu.uci.ics.hyracks.data.std.accessors.PointableBinaryComparatorFactory;
-import edu.uci.ics.hyracks.data.std.primitive.IntegerPointable;
-import edu.uci.ics.hyracks.data.std.primitive.UTF8StringPointable;
-import edu.uci.ics.hyracks.storage.am.btree.frames.BTreeNSMInteriorFrameFactory;
-import edu.uci.ics.hyracks.storage.am.btree.frames.BTreeNSMLeafFrameFactory;
-import edu.uci.ics.hyracks.storage.am.btree.impls.BTree;
-import edu.uci.ics.hyracks.storage.am.common.api.IFreePageManager;
-import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexFrameFactory;
-import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexMetaDataFrameFactory;
-import edu.uci.ics.hyracks.storage.am.common.api.ITreeIndexTupleWriterFactory;
-import edu.uci.ics.hyracks.storage.am.common.frames.LIFOMetaDataFrameFactory;
-import edu.uci.ics.hyracks.storage.am.common.freepage.LinkedListFreePageManager;
-import edu.uci.ics.hyracks.storage.am.common.freepage.LinkedListFreePageManagerFactory;
-import edu.uci.ics.hyracks.storage.am.common.tuples.TypeAwareTupleWriterFactory;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.LSMInvertedIndexTestHarness;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.impls.LSMInvertedIndex;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.inmemory.InMemoryInvertedIndex;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.ondisk.OnDiskInvertedIndex;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexUtils;
+import static org.junit.Assert.fail;
+import java.io.ByteArrayInputStream;
+import java.io.DataInput;
+import java.io.DataInputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
+import edu.uci.ics.hyracks.api.exceptions.HyracksDataException;
+import edu.uci.ics.hyracks.data.std.util.ByteArrayAccessibleOutputStream;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
+import edu.uci.ics.hyracks.dataflow.common.comm.io.ArrayTupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.accessors.ITupleReference;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.IntegerSerializerDeserializer;
+import edu.uci.ics.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
+import edu.uci.ics.hyracks.storage.am.btree.OrderedIndexTestUtils;
+import edu.uci.ics.hyracks.storage.am.common.CheckTuple;
+import edu.uci.ics.hyracks.storage.am.common.api.IIndexBulkLoader;
+import edu.uci.ics.hyracks.storage.am.common.api.IndexException;
+import edu.uci.ics.hyracks.storage.am.common.datagen.DocumentStringFieldValueGenerator;
+import edu.uci.ics.hyracks.storage.am.common.datagen.IFieldValueGenerator;
+import edu.uci.ics.hyracks.storage.am.common.datagen.SortedIntegerFieldValueGenerator;
+import edu.uci.ics.hyracks.storage.am.common.datagen.TupleGenerator;
+import edu.uci.ics.hyracks.storage.am.common.ophelpers.MultiComparator;
+import edu.uci.ics.hyracks.storage.am.common.tuples.PermutingTupleReference;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.LSMInvertedIndexTestHarness;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndex;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexAccessor;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizerFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8WordTokenFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTestContext.InvertedIndexType;
+import edu.uci.ics.hyracks.storage.common.buffercache.IBufferCache;
+
+@SuppressWarnings("rawtypes")
public class InvertedIndexTestUtils {
+
+ public static TupleGenerator createStringDocumentTupleGen(Random rnd) throws IOException {
+ IFieldValueGenerator[] fieldGens = new IFieldValueGenerator[2];
+ fieldGens[0] = new DocumentStringFieldValueGenerator(2, 10, 10000, rnd);
+ fieldGens[1] = new SortedIntegerFieldValueGenerator(0);
+ ISerializerDeserializer[] fieldSerdes = new ISerializerDeserializer[] {
+ UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
+ TupleGenerator tupleGen = new TupleGenerator(fieldGens, fieldSerdes, 0);
+ return tupleGen;
+ }
+
+ public static InvertedIndexTestContext createWordInvIndexTestContext(LSMInvertedIndexTestHarness harness,
+ IBufferCache bufferCache, InvertedIndexType invIndexType) throws IOException, IndexException {
+ ISerializerDeserializer[] fieldSerdes = new ISerializerDeserializer[] {
+ UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
+ ITokenFactory tokenFactory = new UTF8WordTokenFactory();
+ IBinaryTokenizerFactory tokenizerFactory = new DelimitedUTF8StringBinaryTokenizerFactory(true, false,
+ tokenFactory);
+ InvertedIndexTestContext testCtx = InvertedIndexTestContext.create(bufferCache,
+ harness.getMemFreePageManager(), harness.getDiskFileMapProvider(), harness.getInvListsFileRef(),
+ fieldSerdes, 1, tokenizerFactory, invIndexType);
+ return testCtx;
+ }
+
+ public static void bulkLoadInvIndex(InvertedIndexTestContext testCtx, TupleGenerator tupleGen, int numDocs)
+ throws IndexException, IOException {
+ SortedSet<CheckTuple> tmpMemIndex = new TreeSet<CheckTuple>();;
+ // First generate the expected index by inserting the documents one-by-one.
+ for (int i = 0; i < numDocs; i++) {
+ ITupleReference tuple = tupleGen.next();
+ testCtx.insertCheckTuples(tuple, tmpMemIndex);
+ }
+ ISerializerDeserializer[] fieldSerdes = testCtx.getFieldSerdes();
+
+ // Use the expected index to bulk-load the actual index.
+ IIndexBulkLoader bulkLoader = testCtx.getIndex().createBulkLoader(1.0f, false);
+ ArrayTupleBuilder tupleBuilder = new ArrayTupleBuilder(testCtx.getFieldSerdes().length);
+ ArrayTupleReference tuple = new ArrayTupleReference();
+ Iterator<CheckTuple> checkTupleIter = testCtx.getCheckTuples().iterator();
+ while (checkTupleIter.hasNext()) {
+ CheckTuple checkTuple = checkTupleIter.next();
+ OrderedIndexTestUtils.createTupleFromCheckTuple(checkTuple, tupleBuilder, tuple, fieldSerdes);
+ bulkLoader.add(tuple);
+ }
+ bulkLoader.end();
+
+ // Add all check tuples from the temp index to the text context.
+ testCtx.getCheckTuples().addAll(tmpMemIndex);
+ }
+
+ public static void insertIntoInvIndex(InvertedIndexTestContext testCtx, TupleGenerator tupleGen, int numDocs)
+ throws IOException, IndexException {
+ // InMemoryInvertedIndex only supports insert.
+ for (int i = 0; i < numDocs; i++) {
+ ITupleReference tuple = tupleGen.next();
+ testCtx.getIndexAccessor().insert(tuple);
+ testCtx.insertCheckTuples(tuple, testCtx.getCheckTuples());
+ }
+ }
+
+ @SuppressWarnings("unchecked")
+ public static void compareActualAndExpectedIndexes(InvertedIndexTestContext testCtx) throws HyracksDataException,
+ IndexException {
+ IInvertedIndex invIndex = (IInvertedIndex) testCtx.getIndex();
+ ISerializerDeserializer[] fieldSerdes = testCtx.getFieldSerdes();
+ MultiComparator invListCmp = MultiComparator.create(invIndex.getInvListCmpFactories());
+ IInvertedIndexAccessor invIndexAccessor = (IInvertedIndexAccessor) testCtx.getIndexAccessor();
+ int tokenFieldCount = invIndex.getTokenTypeTraits().length;
+ int invListFieldCount = invIndex.getInvListTypeTraits().length;
+ // All tokens that were inserted into the indexes.
+ Iterator<Comparable> tokensIter = testCtx.getAllTokens().iterator();
+
+ // Search key for finding an inverted-list in the actual index.
+ ArrayTupleBuilder searchKeyBuilder = new ArrayTupleBuilder(tokenFieldCount);
+ ArrayTupleReference searchKey = new ArrayTupleReference();
+ // Cursor over inverted list from actual index.
+ IInvertedListCursor actualInvListCursor = invIndexAccessor.createInvertedListCursor();
+
+ // Helpers for generating a serialized inverted-list element from a CheckTuple from the expected index.
+ ArrayTupleBuilder expectedBuilder = new ArrayTupleBuilder(fieldSerdes.length);
+ // Includes the token fields.
+ ArrayTupleReference completeExpectedTuple = new ArrayTupleReference();
+ // Field permutation and permuting tuple reference to strip away token fields from completeExpectedTuple.
+ int[] fieldPermutation = new int[invListFieldCount];
+ for (int i = 0; i < fieldPermutation.length; i++) {
+ fieldPermutation[i] = tokenFieldCount + i;
+ }
+ PermutingTupleReference expectedTuple = new PermutingTupleReference(fieldPermutation);
+
+ // Iterate over all tokens. Find the inverted-lists in actual and expected indexes. Compare the inverted lists,
+ while (tokensIter.hasNext()) {
+ Comparable token = tokensIter.next();
+
+ // Position inverted-list iterator on expected index.
+ CheckTuple checkLowKey = new CheckTuple(tokenFieldCount, tokenFieldCount);
+ checkLowKey.appendField(token);
+ CheckTuple checkHighKey = new CheckTuple(tokenFieldCount, tokenFieldCount);
+ checkHighKey.appendField(token);
+ SortedSet<CheckTuple> expectedInvList = OrderedIndexTestUtils.getPrefixExpectedSubset(
+ testCtx.getCheckTuples(), checkLowKey, checkHighKey);
+ Iterator<CheckTuple> expectedInvListIter = expectedInvList.iterator();
+
+ // Position inverted-list cursor in actual index.
+ OrderedIndexTestUtils.createTupleFromCheckTuple(checkLowKey, searchKeyBuilder, searchKey, fieldSerdes);
+ invIndexAccessor.openInvertedListCursor(actualInvListCursor, searchKey);
+
+ if (actualInvListCursor.size() != expectedInvList.size()) {
+ fail("Actual and expected inverted lists for token '" + token.toString()
+ + "' have different sizes. Actual size: " + actualInvListCursor.size() + ". Expected size: "
+ + expectedInvList.size() + ".");
+ }
+ // Compare inverted-list elements.
+ int count = 0;
+ actualInvListCursor.pinPages();
+ try {
+ while (actualInvListCursor.hasNext() && expectedInvListIter.hasNext()) {
+ actualInvListCursor.next();
+ ITupleReference actual = actualInvListCursor.getTuple();
+ CheckTuple expected = expectedInvListIter.next();
+ OrderedIndexTestUtils.createTupleFromCheckTuple(expected, expectedBuilder, completeExpectedTuple,
+ fieldSerdes);
+ expectedTuple.reset(completeExpectedTuple);
+ if (invListCmp.compare(actual, expectedTuple) != 0) {
+ fail("Inverted lists of token '" + token + "' differ at position " + count + ".");
+ }
+ count++;
+ }
+ } finally {
+ actualInvListCursor.unpinPages();
+ }
+ }
+ }
+
+ /**
+ * Determine the expected results with the simple ScanCount algorithm.
+ */
+ @SuppressWarnings("unchecked")
+ public static void getExpectedResults(int[] scanCountArray, TreeSet<CheckTuple> checkTuples,
+ ITupleReference searchDocument, IBinaryTokenizer tokenizer, ISerializerDeserializer tokenSerde,
+ int occurrenceThreshold, List<Integer> expectedResults) throws IOException {
+ // Reset scan count array.
+ Arrays.fill(scanCountArray, 0);
+ expectedResults.clear();
+
+ ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
+ tokenizer.reset(searchDocument.getFieldData(0), searchDocument.getFieldStart(0),
+ searchDocument.getFieldLength(0));
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
+ IToken token = tokenizer.getToken();
+ baaos.reset();
+ DataOutput out = new DataOutputStream(baaos);
+ token.serializeToken(out);
+ ByteArrayInputStream inStream = new ByteArrayInputStream(baaos.getByteArray(), 0, baaos.size());
+ DataInput dataIn = new DataInputStream(inStream);
+ Comparable tokenObj = (Comparable) tokenSerde.deserialize(dataIn);
+ CheckTuple lowKey = new CheckTuple(1, 1);
+ lowKey.appendField(tokenObj);
+ CheckTuple highKey = new CheckTuple(1, 1);
+ highKey.appendField(tokenObj);
+
+ // Get view over check tuples containing inverted-list corresponding to token.
+ SortedSet<CheckTuple> invList = OrderedIndexTestUtils.getPrefixExpectedSubset(checkTuples, lowKey, highKey);
+ Iterator<CheckTuple> invListIter = invList.iterator();
+ // Iterate over inverted list and update scan count array.
+ while (invListIter.hasNext()) {
+ CheckTuple checkTuple = invListIter.next();
+ Integer element = (Integer) checkTuple.getField(1);
+ scanCountArray[element]++;
+ }
+ }
+
+ // Run through scan count array, and see whether elements satisfy the given occurrence threshold.
+ expectedResults.clear();
+ for (int i = 0; i < scanCountArray.length; i++) {
+ if (scanCountArray[i] >= occurrenceThreshold) {
+ expectedResults.add(i);
+ }
+ }
+ }
+
+ /*
public static OnDiskInvertedIndex createTestInvertedIndex(LSMInvertedIndexTestHarness harness, IBinaryTokenizer tokenizer)
throws HyracksDataException {
ITreeIndexMetaDataFrameFactory metaFrameFactory = new LIFOMetaDataFrameFactory();
@@ -74,4 +279,5 @@
new LinkedListFreePageManagerFactory(harness.getDiskBufferCache(), new LIFOMetaDataFrameFactory()),
harness.getIOManager(), harness.getOnDiskDir(), harness.getDiskFileMapProvider());
}
+ */
}