checkpointing LSM integration: missing LSM RTree linearizer fix and jaccard computation fixes
git-svn-id: https://asterixdb.googlecode.com/svn/branches/asterix_lsm_stabilization@721 eaa15691-b419-025a-1212-ee371bd00084
diff --git a/asterix-om/pom.xml b/asterix-om/pom.xml
index fc9acd8..406c402 100644
--- a/asterix-om/pom.xml
+++ b/asterix-om/pom.xml
@@ -1,4 +1,5 @@
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<artifactId>asterix</artifactId>
@@ -31,11 +32,6 @@
<scope>compile</scope>
</dependency>
<dependency>
- <groupId>edu.uci.ics.hyracks</groupId>
- <artifactId>hyracks-storage-am-invertedindex</artifactId>
- <version>0.2.1-SNAPSHOT</version>
- </dependency>
- <dependency>
<groupId>edu.uci.ics.asterix</groupId>
<artifactId>asterix-hyracks-glue</artifactId>
<version>0.0.4-SNAPSHOT</version>
@@ -48,7 +44,14 @@
</dependency>
<dependency>
<groupId>edu.uci.ics.hyracks</groupId>
- <artifactId>hyracks-storage-am-rtree</artifactId>
+ <artifactId>
+ hyracks-storage-am-lsm-invertedindex
+ </artifactId>
+ <version>0.2.1-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <groupId>edu.uci.ics.hyracks</groupId>
+ <artifactId>hyracks-storage-am-lsm-rtree</artifactId>
<version>0.2.1-SNAPSHOT</version>
</dependency>
</dependencies>
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AListElementToken.java b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AListElementToken.java
index cb6838e..82780bb 100644
--- a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AListElementToken.java
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AListElementToken.java
@@ -3,7 +3,7 @@
import java.io.DataOutput;
import java.io.IOException;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
public class AListElementToken implements IToken {
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AListElementTokenFactory.java b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AListElementTokenFactory.java
index fc46039..1ccf2db 100644
--- a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AListElementTokenFactory.java
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AListElementTokenFactory.java
@@ -1,7 +1,7 @@
package edu.uci.ics.asterix.dataflow.data.common;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.ITokenFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;
public class AListElementTokenFactory implements ITokenFactory {
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java
index 667bfe7..89a3ff7 100644
--- a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AOrderedListBinaryTokenizer.java
@@ -5,9 +5,9 @@
import edu.uci.ics.asterix.om.types.ATypeTag;
import edu.uci.ics.asterix.om.types.EnumDeserializer;
import edu.uci.ics.asterix.om.util.NonTaggedFormatUtil;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IToken;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.ITokenFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;
public class AOrderedListBinaryTokenizer implements IBinaryTokenizer {
@@ -16,13 +16,13 @@
protected int length;
protected int listLength;
protected int itemIndex;
-
+
protected final IToken token;
-
+
public AOrderedListBinaryTokenizer(ITokenFactory tokenFactory) {
token = tokenFactory.createToken();
}
-
+
@Override
public IToken getToken() {
return token;
@@ -58,11 +58,11 @@
this.listLength = getNumberOfItems(data, start);
this.itemIndex = 0;
}
-
+
protected int getItemOffset(byte[] data, int start, int itemIndex) throws AsterixException {
return AOrderedListSerializerDeserializer.getItemOffset(data, start, itemIndex);
}
-
+
protected int getNumberOfItems(byte[] data, int start) {
return AOrderedListSerializerDeserializer.getNumberOfItems(data, start);
}
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AOrderedListBinaryTokenizerFactory.java b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AOrderedListBinaryTokenizerFactory.java
index d7712ea..6c5077b 100644
--- a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AOrderedListBinaryTokenizerFactory.java
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AOrderedListBinaryTokenizerFactory.java
@@ -1,18 +1,18 @@
package edu.uci.ics.asterix.dataflow.data.common;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizerFactory;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.ITokenFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;
public class AOrderedListBinaryTokenizerFactory implements IBinaryTokenizerFactory {
private static final long serialVersionUID = 1L;
private final ITokenFactory tokenFactory;
-
+
public AOrderedListBinaryTokenizerFactory(ITokenFactory tokenFactory) {
this.tokenFactory = tokenFactory;
}
-
+
@Override
public IBinaryTokenizer createTokenizer() {
return new AOrderedListBinaryTokenizer(tokenFactory);
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AUnorderedListBinaryTokenizer.java b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AUnorderedListBinaryTokenizer.java
index 11ab251..7e0ac32 100644
--- a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AUnorderedListBinaryTokenizer.java
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AUnorderedListBinaryTokenizer.java
@@ -2,14 +2,14 @@
import edu.uci.ics.asterix.common.exceptions.AsterixException;
import edu.uci.ics.asterix.dataflow.data.nontagged.serde.AUnorderedListSerializerDeserializer;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.ITokenFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;
public class AUnorderedListBinaryTokenizer extends AOrderedListBinaryTokenizer {
public AUnorderedListBinaryTokenizer(ITokenFactory tokenFactory) {
super(tokenFactory);
}
-
+
@Override
protected int getItemOffset(byte[] data, int start, int itemIndex) throws AsterixException {
return AUnorderedListSerializerDeserializer.getItemOffset(data, start, itemIndex);
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AUnorderedListBinaryTokenizerFactory.java b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AUnorderedListBinaryTokenizerFactory.java
index 54b7692..fc55aa6 100644
--- a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AUnorderedListBinaryTokenizerFactory.java
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/AUnorderedListBinaryTokenizerFactory.java
@@ -1,18 +1,18 @@
package edu.uci.ics.asterix.dataflow.data.common;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizerFactory;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.ITokenFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;
public class AUnorderedListBinaryTokenizerFactory implements IBinaryTokenizerFactory {
-
+
private static final long serialVersionUID = 1L;
private final ITokenFactory tokenFactory;
-
+
public AUnorderedListBinaryTokenizerFactory(ITokenFactory tokenFactory) {
this.tokenFactory = tokenFactory;
}
-
+
@Override
public IBinaryTokenizer createTokenizer() {
return new AUnorderedListBinaryTokenizer(tokenFactory);
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/IBinaryTokenizerFactoryProvider.java b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/IBinaryTokenizerFactoryProvider.java
index cddb014..b9af9bf 100644
--- a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/IBinaryTokenizerFactoryProvider.java
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/IBinaryTokenizerFactoryProvider.java
@@ -1,9 +1,11 @@
package edu.uci.ics.asterix.dataflow.data.common;
import edu.uci.ics.asterix.om.types.ATypeTag;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizerFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
public interface IBinaryTokenizerFactoryProvider {
public IBinaryTokenizerFactory getWordTokenizerFactory(ATypeTag typeTag, boolean hashedTokens);
- public IBinaryTokenizerFactory getNGramTokenizerFactory(ATypeTag typeTag, int gramLength, boolean usePrePost, boolean hashedTokens);
+
+ public IBinaryTokenizerFactory getNGramTokenizerFactory(ATypeTag typeTag, int gramLength, boolean usePrePost,
+ boolean hashedTokens);
}
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/ListEditDistanceSearchModifier.java b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/ListEditDistanceSearchModifier.java
index 2a7fdb5..3d66912 100644
--- a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/ListEditDistanceSearchModifier.java
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/ListEditDistanceSearchModifier.java
@@ -1,10 +1,6 @@
package edu.uci.ics.asterix.dataflow.data.common;
-import java.util.Collections;
-import java.util.List;
-
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedListCursor;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearchModifier;
// TODO: Should go into hyracks.
public class ListEditDistanceSearchModifier implements IInvertedIndexSearchModifier {
@@ -15,17 +11,6 @@
this.edThresh = edThresh;
}
- @Override
- public int getOccurrenceThreshold(List<IInvertedListCursor> invListCursors) {
- return invListCursors.size() - edThresh;
- }
-
- @Override
- public int getPrefixLists(List<IInvertedListCursor> invListCursors) {
- Collections.sort(invListCursors);
- return invListCursors.size() - getOccurrenceThreshold(invListCursors) + 1;
- }
-
public int getEdThresh() {
return edThresh;
}
@@ -33,4 +18,14 @@
public void setEdThresh(int edThresh) {
this.edThresh = edThresh;
}
+
+ @Override
+ public int getOccurrenceThreshold(int numQueryTokens) {
+ return numQueryTokens - edThresh;
+ }
+
+ @Override
+ public int getNumPrefixLists(int numQueryTokens) {
+ return numQueryTokens - getOccurrenceThreshold(numQueryTokens) + 1;
+ }
}
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/ListEditDistanceSearchModifierFactory.java b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/ListEditDistanceSearchModifierFactory.java
index b117a36..4148c85 100644
--- a/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/ListEditDistanceSearchModifierFactory.java
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/dataflow/data/common/ListEditDistanceSearchModifierFactory.java
@@ -1,7 +1,7 @@
package edu.uci.ics.asterix.dataflow.data.common;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifier;
-import edu.uci.ics.hyracks.storage.am.invertedindex.api.IInvertedIndexSearchModifierFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearchModifier;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearchModifierFactory;
// TODO: Should go into hyracks.
public class ListEditDistanceSearchModifierFactory implements IInvertedIndexSearchModifierFactory {
@@ -9,11 +9,11 @@
private static final long serialVersionUID = 1L;
private final int edThresh;
-
+
public ListEditDistanceSearchModifierFactory(int edThresh) {
this.edThresh = edThresh;
}
-
+
@Override
public IInvertedIndexSearchModifier createSearchModifier() {
return new ListEditDistanceSearchModifier(edThresh);
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/formats/nontagged/AqlBinaryTokenizerFactoryProvider.java b/asterix-om/src/main/java/edu/uci/ics/asterix/formats/nontagged/AqlBinaryTokenizerFactoryProvider.java
index 859b0f2..41a921e 100644
--- a/asterix-om/src/main/java/edu/uci/ics/asterix/formats/nontagged/AqlBinaryTokenizerFactoryProvider.java
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/formats/nontagged/AqlBinaryTokenizerFactoryProvider.java
@@ -1,15 +1,15 @@
package edu.uci.ics.asterix.formats.nontagged;
-import edu.uci.ics.asterix.dataflow.data.common.IBinaryTokenizerFactoryProvider;
import edu.uci.ics.asterix.dataflow.data.common.AListElementTokenFactory;
import edu.uci.ics.asterix.dataflow.data.common.AOrderedListBinaryTokenizerFactory;
import edu.uci.ics.asterix.dataflow.data.common.AUnorderedListBinaryTokenizerFactory;
+import edu.uci.ics.asterix.dataflow.data.common.IBinaryTokenizerFactoryProvider;
import edu.uci.ics.asterix.om.types.ATypeTag;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizerFactory;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.HashedUTF8WordTokenFactory;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizerFactory;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.UTF8NGramTokenFactory;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.UTF8WordTokenFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizerFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.HashedUTF8WordTokenFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8NGramTokenFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.UTF8WordTokenFactory;
public class AqlBinaryTokenizerFactoryProvider implements IBinaryTokenizerFactoryProvider {
@@ -17,14 +17,16 @@
private static final IBinaryTokenizerFactory aqlStringTokenizer = new DelimitedUTF8StringBinaryTokenizerFactory(
true, true, new UTF8WordTokenFactory(ATypeTag.STRING.serialize(), ATypeTag.INT32.serialize()));
-
+
private static final IBinaryTokenizerFactory aqlHashingStringTokenizer = new DelimitedUTF8StringBinaryTokenizerFactory(
true, true, new HashedUTF8WordTokenFactory(ATypeTag.INT32.serialize(), ATypeTag.INT32.serialize()));
- private static final IBinaryTokenizerFactory orderedListTokenizer = new AOrderedListBinaryTokenizerFactory(new AListElementTokenFactory());
-
- private static final IBinaryTokenizerFactory unorderedListTokenizer = new AUnorderedListBinaryTokenizerFactory(new AListElementTokenFactory());
-
+ private static final IBinaryTokenizerFactory orderedListTokenizer = new AOrderedListBinaryTokenizerFactory(
+ new AListElementTokenFactory());
+
+ private static final IBinaryTokenizerFactory unorderedListTokenizer = new AUnorderedListBinaryTokenizerFactory(
+ new AListElementTokenFactory());
+
@Override
public IBinaryTokenizerFactory getWordTokenizerFactory(ATypeTag typeTag, boolean hashedTokens) {
switch (typeTag) {
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/formats/nontagged/NGramUTF8StringBinaryTokenizerFactory.java b/asterix-om/src/main/java/edu/uci/ics/asterix/formats/nontagged/NGramUTF8StringBinaryTokenizerFactory.java
index 895dac4..48dde10 100644
--- a/asterix-om/src/main/java/edu/uci/ics/asterix/formats/nontagged/NGramUTF8StringBinaryTokenizerFactory.java
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/formats/nontagged/NGramUTF8StringBinaryTokenizerFactory.java
@@ -19,30 +19,29 @@
package edu.uci.ics.asterix.formats.nontagged;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizer;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.IBinaryTokenizerFactory;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.ITokenFactory;
-import edu.uci.ics.hyracks.storage.am.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.ITokenFactory;
+import edu.uci.ics.hyracks.storage.am.lsm.invertedindex.tokenizers.NGramUTF8StringBinaryTokenizer;
// TODO: Should be moved into Hyracks with the rest of the tokenizer code.
-public class NGramUTF8StringBinaryTokenizerFactory implements
- IBinaryTokenizerFactory {
+public class NGramUTF8StringBinaryTokenizerFactory implements IBinaryTokenizerFactory {
- private static final long serialVersionUID = 1L;
- private final int gramLength;
- private final boolean usePrePost;
- private final boolean ignoreTokenCount;
- private final boolean sourceHasTypeTag;
- private final ITokenFactory tokenFactory;
+ private static final long serialVersionUID = 1L;
+ private final int gramLength;
+ private final boolean usePrePost;
+ private final boolean ignoreTokenCount;
+ private final boolean sourceHasTypeTag;
+ private final ITokenFactory tokenFactory;
- public NGramUTF8StringBinaryTokenizerFactory(int gramLength, boolean usePrePost, boolean ignoreTokenCount,
- boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
- this.gramLength = gramLength;
- this.usePrePost = usePrePost;
- this.ignoreTokenCount = ignoreTokenCount;
- this.sourceHasTypeTag = sourceHasTypeTag;
- this.tokenFactory = tokenFactory;
- }
+ public NGramUTF8StringBinaryTokenizerFactory(int gramLength, boolean usePrePost, boolean ignoreTokenCount,
+ boolean sourceHasTypeTag, ITokenFactory tokenFactory) {
+ this.gramLength = gramLength;
+ this.usePrePost = usePrePost;
+ this.ignoreTokenCount = ignoreTokenCount;
+ this.sourceHasTypeTag = sourceHasTypeTag;
+ this.tokenFactory = tokenFactory;
+ }
@Override
public IBinaryTokenizer createTokenizer() {