[NO ISSUE] Add full-text stopwords filter
Support a stopwords filter when processing full-text search queries.
Users can configure full-text configurations and filters via SQLPP, and
create a full-text search index with a specific full-text configuration.
Later, users can run an ftcontains() search query and specify a full-text
configuration.
The full-text index will be utilized only if the index
full-text configuration is the same as the query full-text configuration.
If none of the indexes meets the requirement, then a full scan will run for
the ftcontains() query.
Change-Id: I6773a0d6c63693981aa1e29e9134e29391affe9a
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/8183
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Dmitry Lychagin <dmitry.lychagin@couchbase.com>
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/pom.xml b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/pom.xml
index 366c2a7..23d81dc 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/pom.xml
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/pom.xml
@@ -96,5 +96,14 @@
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-lang3</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ </dependency>
+
</dependencies>
</project>
\ No newline at end of file
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
index 3ed47e6..4477e65 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorDescriptor.java
@@ -27,6 +27,9 @@
import org.apache.hyracks.api.exceptions.HyracksDataException;
import org.apache.hyracks.api.job.IOperatorDescriptorRegistry;
import org.apache.hyracks.dataflow.std.base.AbstractSingleActivityOperatorDescriptor;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluator;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluatorFactory;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
public class BinaryTokenizerOperatorDescriptor extends AbstractSingleActivityOperatorDescriptor {
@@ -34,6 +37,7 @@
private static final long serialVersionUID = 1L;
private final IBinaryTokenizerFactory tokenizerFactory;
+ private final IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory;
// Field that will be tokenized.
private final int docField;
// operator will append these key fields to each token, e.g., as
@@ -53,10 +57,12 @@
private final IMissingWriterFactory missingWriterFactory;
public BinaryTokenizerOperatorDescriptor(IOperatorDescriptorRegistry spec, RecordDescriptor recDesc,
- IBinaryTokenizerFactory tokenizerFactory, int docField, int[] keyFields, boolean addNumTokensKey,
- boolean writeKeyFieldsFirst, boolean writeMissing, IMissingWriterFactory missingWriterFactory) {
+ IBinaryTokenizerFactory tokenizerFactory, IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory,
+ int docField, int[] keyFields, boolean addNumTokensKey, boolean writeKeyFieldsFirst, boolean writeMissing,
+ IMissingWriterFactory missingWriterFactory) {
super(spec, 1, 1);
this.tokenizerFactory = tokenizerFactory;
+ this.fullTextConfigEvaluatorFactory = fullTextConfigEvaluatorFactory;
this.docField = docField;
this.keyFields = keyFields;
this.addNumTokensKey = addNumTokensKey;
@@ -69,9 +75,14 @@
@Override
public IOperatorNodePushable createPushRuntime(IHyracksTaskContext ctx,
IRecordDescriptorProvider recordDescProvider, int partition, int nPartitions) throws HyracksDataException {
+ IBinaryTokenizer tokenizer = tokenizerFactory.createTokenizer();
+ IFullTextConfigEvaluator fullTextConfigEvaluator =
+ fullTextConfigEvaluatorFactory.createFullTextConfigEvaluator();
+
return new BinaryTokenizerOperatorNodePushable(ctx,
- recordDescProvider.getInputRecordDescriptor(getActivityId(), 0), outRecDescs[0],
- tokenizerFactory.createTokenizer(), docField, keyFields, addNumTokensKey, writeKeyFieldsFirst,
- writeMissing, missingWriterFactory);
+ recordDescProvider.getInputRecordDescriptor(getActivityId(), 0), outRecDescs[0], tokenizer,
+ fullTextConfigEvaluator, docField, keyFields, addNumTokensKey, writeKeyFieldsFirst, writeMissing,
+ missingWriterFactory);
}
+
}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
index 3df185a..b9c8c8a 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/BinaryTokenizerOperatorNodePushable.java
@@ -36,13 +36,14 @@
import org.apache.hyracks.dataflow.common.data.accessors.FrameTupleReference;
import org.apache.hyracks.dataflow.common.data.accessors.ITupleReference;
import org.apache.hyracks.dataflow.std.base.AbstractUnaryInputUnaryOutputOperatorNodePushable;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluator;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
public class BinaryTokenizerOperatorNodePushable extends AbstractUnaryInputUnaryOutputOperatorNodePushable {
private final IHyracksTaskContext ctx;
- private final IBinaryTokenizer tokenizer;
+ private final IFullTextConfigEvaluator fullTextEvaluator;
private final int docField;
private final int[] keyFields;
private final boolean addNumTokensKey;
@@ -59,11 +60,24 @@
private FrameTupleAppender appender;
public BinaryTokenizerOperatorNodePushable(IHyracksTaskContext ctx, RecordDescriptor inputRecDesc,
- RecordDescriptor outputRecDesc, IBinaryTokenizer tokenizer, int docField, int[] keyFields,
- boolean addNumTokensKey, boolean writeKeyFieldsFirst, boolean writeMissing,
+ RecordDescriptor outputRecDesc, IBinaryTokenizer tokenizer, IFullTextConfigEvaluator fullTextEvaluator,
+ int docField, int[] keyFields, boolean addNumTokensKey, boolean writeKeyFieldsFirst, boolean writeMissing,
IMissingWriterFactory missingWriterFactory) {
this.ctx = ctx;
- this.tokenizer = tokenizer;
+ this.fullTextEvaluator = fullTextEvaluator;
+ // Need to use the tokenizer created in the upper-layer when:
+ // 1. The tokenizer is of TokenizerCategory.NGram rather than Word
+ // 2. If the tokenizer is a TokenizerCategory.Word one, then its parameters
+ // (e.g. boolean ignoreTokenCount, boolean sourceHasTypeTag) may be different
+ // from the tokenizer in the default full-text config.
+ //
+ // Note that those parameters might be call-site specific, for example, one string byte array may contain
+ // the ATypeTag.String in it while another doesn't. Even though the tokenizers are both Word tokenizers,
+ // we still need to set a different tokenizer here.
+ // The different tokenizers are defined in BinaryTokenizerFactoryProvider.
+ // The big plan is to remove the tokenizer from this interface and use fullTextEvaluator only.
+ this.fullTextEvaluator.setTokenizer(tokenizer);
+
this.docField = docField;
this.keyFields = keyFields;
this.addNumTokensKey = addNumTokensKey;
@@ -91,20 +105,20 @@
for (int i = 0; i < tupleCount; i++) {
tuple.reset(accessor, i);
- short numTokens = 0;
+ int numTokens = 0;
if (!isDocFieldMissing(tuple)) {
- tokenizer.reset(tuple.getFieldData(docField), tuple.getFieldStart(docField),
+ fullTextEvaluator.reset(tuple.getFieldData(docField), tuple.getFieldStart(docField),
tuple.getFieldLength(docField));
if (addNumTokensKey) {
// Get the total number of tokens.
- numTokens = tokenizer.getTokensCount();
+ numTokens = fullTextEvaluator.getTokensCount();
}
// Write token and data into frame by following the order specified
// in the writeKeyFieldsFirst field.
- while (tokenizer.hasNext()) {
- tokenizer.next();
- IToken token = tokenizer.getToken();
+ while (fullTextEvaluator.hasNext()) {
+ fullTextEvaluator.next();
+ IToken token = fullTextEvaluator.getToken();
writeTuple(token, numTokens, i);
}
} else if (writeMissing) {
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/LSMInvertedIndexLocalResource.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/LSMInvertedIndexLocalResource.java
index 0904c4a..0af1f3f 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/LSMInvertedIndexLocalResource.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/LSMInvertedIndexLocalResource.java
@@ -42,6 +42,8 @@
import org.apache.hyracks.storage.am.lsm.common.api.IVirtualBufferCache;
import org.apache.hyracks.storage.am.lsm.common.api.IVirtualBufferCacheProvider;
import org.apache.hyracks.storage.am.lsm.common.dataflow.LsmResource;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.FullTextConfigEvaluatorFactory;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluatorFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexUtils;
import org.apache.hyracks.storage.common.IStorageManager;
@@ -54,33 +56,50 @@
public class LSMInvertedIndexLocalResource extends LsmResource {
private static final long serialVersionUID = 1L;
+ public static final String TOKEN_TYPE_TRAITS = "tokenTypeTraits";
+ public static final String TOKEN_CMP_FACTORIES = "tokenCmpFactories";
+ public static final String IS_PARTITIONED = "isPartitioned";
+ public static final String FULL_TEXT_CONFIG_EVALUATOR_FACTORY = "fullTextConfigEvaluatorFactory";
private final ITypeTraits[] tokenTypeTraits;
private final IBinaryComparatorFactory[] tokenCmpFactories;
+ // ToDo: totally replace tokenizer with fullTextConfig
+ // Currently, the configuration of the tokenizer is complicated and depends on parameters
+ // (e.g. whether the string contains an ATypeTag.AString at the beginning), so
+ // as a first step, we set the tokenizer of the full-text config on-the-fly.
+ // See comments in FullTextConfigEvaluator
private final IBinaryTokenizerFactory tokenizerFactory;
+ private final IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory;
private final boolean isPartitioned;
private final int[] invertedIndexFields;
private final int[] filterFieldsForNonBulkLoadOps;
private final int[] invertedIndexFieldsForNonBulkLoadOps;
private final double bloomFilterFalsePositiveRate;
- public LSMInvertedIndexLocalResource(String path, IStorageManager storageManager, ITypeTraits[] typeTraits,
+ public LSMInvertedIndexLocalResource(
+ // inherited fields
+ String path, IStorageManager storageManager, ITypeTraits[] typeTraits,
IBinaryComparatorFactory[] cmpFactories, ITypeTraits[] filterTypeTraits,
IBinaryComparatorFactory[] filterCmpFactories, int[] filterFields,
ILSMOperationTrackerFactory opTrackerProvider, ILSMIOOperationCallbackFactory ioOpCallbackFactory,
ILSMPageWriteCallbackFactory pageWriteCallbackFactory,
IMetadataPageManagerFactory metadataPageManagerFactory, IVirtualBufferCacheProvider vbcProvider,
ILSMIOOperationSchedulerProvider ioSchedulerProvider, ILSMMergePolicyFactory mergePolicyFactory,
- Map<String, String> mergePolicyProperties, boolean durable, ITypeTraits[] tokenTypeTraits,
- IBinaryComparatorFactory[] tokenCmpFactories, IBinaryTokenizerFactory tokenizerFactory,
+ Map<String, String> mergePolicyProperties, boolean durable,
+ // new fields
+ ITypeTraits[] tokenTypeTraits, IBinaryComparatorFactory[] tokenCmpFactories,
+ IBinaryTokenizerFactory tokenizerFactory, IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory,
boolean isPartitioned, int[] invertedIndexFields, int[] filterFieldsForNonBulkLoadOps,
int[] invertedIndexFieldsForNonBulkLoadOps, double bloomFilterFalsePositiveRate) {
+
super(path, storageManager, typeTraits, cmpFactories, filterTypeTraits, filterCmpFactories, filterFields,
opTrackerProvider, ioOpCallbackFactory, pageWriteCallbackFactory, metadataPageManagerFactory,
vbcProvider, ioSchedulerProvider, mergePolicyFactory, mergePolicyProperties, durable);
+
this.tokenTypeTraits = tokenTypeTraits;
this.tokenCmpFactories = tokenCmpFactories;
this.tokenizerFactory = tokenizerFactory;
+ this.fullTextConfigEvaluatorFactory = fullTextConfigEvaluatorFactory;
this.isPartitioned = isPartitioned;
this.invertedIndexFields = invertedIndexFields;
this.filterFieldsForNonBulkLoadOps = filterFieldsForNonBulkLoadOps;
@@ -90,13 +109,15 @@
private LSMInvertedIndexLocalResource(IPersistedResourceRegistry registry, JsonNode json,
ITypeTraits[] tokenTypeTraits, IBinaryComparatorFactory[] tokenCmpFactories,
- IBinaryTokenizerFactory tokenizerFactory, boolean isPartitioned, int[] invertedIndexFields,
- int[] filterFieldsForNonBulkLoadOps, int[] invertedIndexFieldsForNonBulkLoadOps,
- double bloomFilterFalsePositiveRate) throws HyracksDataException {
+ IBinaryTokenizerFactory tokenizerFactory, IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory,
+ boolean isPartitioned, int[] invertedIndexFields, int[] filterFieldsForNonBulkLoadOps,
+ int[] invertedIndexFieldsForNonBulkLoadOps, double bloomFilterFalsePositiveRate)
+ throws HyracksDataException {
super(registry, json);
this.tokenTypeTraits = tokenTypeTraits;
this.tokenCmpFactories = tokenCmpFactories;
this.tokenizerFactory = tokenizerFactory;
+ this.fullTextConfigEvaluatorFactory = fullTextConfigEvaluatorFactory;
this.isPartitioned = isPartitioned;
this.invertedIndexFields = invertedIndexFields;
this.filterFieldsForNonBulkLoadOps = filterFieldsForNonBulkLoadOps;
@@ -116,19 +137,20 @@
pageWriteCallbackFactory.initialize(serviceCtx, this);
if (isPartitioned) {
return InvertedIndexUtils.createPartitionedLSMInvertedIndex(ioManager, virtualBufferCaches, typeTraits,
- cmpFactories, tokenTypeTraits, tokenCmpFactories, tokenizerFactory, bufferCache,
- file.getAbsolutePath(), bloomFilterFalsePositiveRate, mergePolicy,
+ cmpFactories, tokenTypeTraits, tokenCmpFactories, tokenizerFactory, fullTextConfigEvaluatorFactory,
+ bufferCache, file.getAbsolutePath(), bloomFilterFalsePositiveRate, mergePolicy,
opTrackerProvider.getOperationTracker(serviceCtx, this), ioScheduler, ioOpCallbackFactory,
pageWriteCallbackFactory, invertedIndexFields, filterTypeTraits, filterCmpFactories, filterFields,
filterFieldsForNonBulkLoadOps, invertedIndexFieldsForNonBulkLoadOps, durable,
metadataPageManagerFactory, serviceCtx.getTracer());
} else {
return InvertedIndexUtils.createLSMInvertedIndex(ioManager, virtualBufferCaches, typeTraits, cmpFactories,
- tokenTypeTraits, tokenCmpFactories, tokenizerFactory, bufferCache, file.getAbsolutePath(),
- bloomFilterFalsePositiveRate, mergePolicy, opTrackerProvider.getOperationTracker(serviceCtx, this),
- ioScheduler, ioOpCallbackFactory, pageWriteCallbackFactory, invertedIndexFields, filterTypeTraits,
- filterCmpFactories, filterFields, filterFieldsForNonBulkLoadOps,
- invertedIndexFieldsForNonBulkLoadOps, durable, metadataPageManagerFactory, serviceCtx.getTracer());
+ tokenTypeTraits, tokenCmpFactories, tokenizerFactory, fullTextConfigEvaluatorFactory, bufferCache,
+ file.getAbsolutePath(), bloomFilterFalsePositiveRate, mergePolicy,
+ opTrackerProvider.getOperationTracker(serviceCtx, this), ioScheduler, ioOpCallbackFactory,
+ pageWriteCallbackFactory, invertedIndexFields, filterTypeTraits, filterCmpFactories, filterFields,
+ filterFieldsForNonBulkLoadOps, invertedIndexFieldsForNonBulkLoadOps, durable,
+ metadataPageManagerFactory, serviceCtx.getTracer());
}
}
@@ -140,14 +162,15 @@
for (ITypeTraits tt : tokenTypeTraits) {
tokenTypeTraitsArray.add(tt.toJson(registry));
}
- jsonObject.set("tokenTypeTraits", tokenTypeTraitsArray);
+ jsonObject.set(TOKEN_TYPE_TRAITS, tokenTypeTraitsArray);
final ArrayNode tokenCmpFactoriesArray = OBJECT_MAPPER.createArrayNode();
for (IBinaryComparatorFactory factory : tokenCmpFactories) {
tokenCmpFactoriesArray.add(factory.toJson(registry));
}
- jsonObject.set("tokenCmpFactories", tokenCmpFactoriesArray);
+ jsonObject.set(TOKEN_CMP_FACTORIES, tokenCmpFactoriesArray);
jsonObject.set("tokenizerFactory", tokenizerFactory.toJson(registry));
- jsonObject.put("isPartitioned", isPartitioned);
+ jsonObject.set(FULL_TEXT_CONFIG_EVALUATOR_FACTORY, fullTextConfigEvaluatorFactory.toJson(registry));
+ jsonObject.put(IS_PARTITIONED, isPartitioned);
jsonObject.putPOJO("invertedIndexFields", invertedIndexFields);
jsonObject.putPOJO("filterFieldsForNonBulkLoadOps", filterFieldsForNonBulkLoadOps);
jsonObject.putPOJO("invertedIndexFieldsForNonBulkLoadOps", invertedIndexFieldsForNonBulkLoadOps);
@@ -173,7 +196,17 @@
tokenCmpFactoriesList.toArray(new IBinaryComparatorFactory[0]);
final IBinaryTokenizerFactory tokenizerFactory =
(IBinaryTokenizerFactory) registry.deserialize(json.get("tokenizerFactory"));
- final boolean isPartitioned = json.get("isPartitioned").asBoolean();
+
+ final IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory;
+ // Backward compatibility: a local resource written by an older version of AsterixDB may not contain the newly added fullTextConfigEvaluatorFactory
+ if (json.has(FULL_TEXT_CONFIG_EVALUATOR_FACTORY)) {
+ fullTextConfigEvaluatorFactory = (IFullTextConfigEvaluatorFactory) registry
+ .deserialize(json.get(FULL_TEXT_CONFIG_EVALUATOR_FACTORY));
+ } else {
+ fullTextConfigEvaluatorFactory = FullTextConfigEvaluatorFactory.getDefaultFactory();
+ }
+
+ final boolean isPartitioned = json.get(IS_PARTITIONED).asBoolean();
final int[] invertedIndexFields = OBJECT_MAPPER.convertValue(json.get("invertedIndexFields"), int[].class);
final int[] filterFieldsForNonBulkLoadOps =
OBJECT_MAPPER.convertValue(json.get("filterFieldsForNonBulkLoadOps"), int[].class);
@@ -181,7 +214,7 @@
OBJECT_MAPPER.convertValue(json.get("invertedIndexFieldsForNonBulkLoadOps"), int[].class);
final double bloomFilterFalsePositiveRate = json.get("bloomFilterFalsePositiveRate").asDouble();
return new LSMInvertedIndexLocalResource(registry, json, tokenTypeTraits, tokenCmpFactories, tokenizerFactory,
- isPartitioned, invertedIndexFields, filterFieldsForNonBulkLoadOps, invertedIndexFieldsForNonBulkLoadOps,
- bloomFilterFalsePositiveRate);
+ fullTextConfigEvaluatorFactory, isPartitioned, invertedIndexFields, filterFieldsForNonBulkLoadOps,
+ invertedIndexFieldsForNonBulkLoadOps, bloomFilterFalsePositiveRate);
}
}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/LSMInvertedIndexLocalResourceFactory.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/LSMInvertedIndexLocalResourceFactory.java
index c36e8d3..09c9699 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/LSMInvertedIndexLocalResourceFactory.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/LSMInvertedIndexLocalResourceFactory.java
@@ -31,6 +31,7 @@
import org.apache.hyracks.storage.am.lsm.common.api.ILSMPageWriteCallbackFactory;
import org.apache.hyracks.storage.am.lsm.common.api.IVirtualBufferCacheProvider;
import org.apache.hyracks.storage.am.lsm.common.dataflow.LsmResourceFactory;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluatorFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
import org.apache.hyracks.storage.common.IResource;
import org.apache.hyracks.storage.common.IStorageManager;
@@ -46,16 +47,20 @@
private final int[] filterFieldsForNonBulkLoadOps;
private final int[] invertedIndexFieldsForNonBulkLoadOps;
private final double bloomFilterFalsePositiveRate;
+ private final IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory;
public LSMInvertedIndexLocalResourceFactory(IStorageManager storageManager, ITypeTraits[] typeTraits,
+ // Inherited fields
IBinaryComparatorFactory[] cmpFactories, ITypeTraits[] filterTypeTraits,
IBinaryComparatorFactory[] filterCmpFactories, int[] filterFields,
ILSMOperationTrackerFactory opTrackerFactory, ILSMIOOperationCallbackFactory ioOpCallbackFactory,
ILSMPageWriteCallbackFactory pageWriteCallbackFactory,
IMetadataPageManagerFactory metadataPageManagerFactory, IVirtualBufferCacheProvider vbcProvider,
ILSMIOOperationSchedulerProvider ioSchedulerProvider, ILSMMergePolicyFactory mergePolicyFactory,
- Map<String, String> mergePolicyProperties, boolean durable, ITypeTraits[] tokenTypeTraits,
- IBinaryComparatorFactory[] tokenCmpFactories, IBinaryTokenizerFactory tokenizerFactory,
+ Map<String, String> mergePolicyProperties, boolean durable,
+ // New fields
+ ITypeTraits[] tokenTypeTraits, IBinaryComparatorFactory[] tokenCmpFactories,
+ IBinaryTokenizerFactory tokenizerFactory, IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory,
boolean isPartitioned, int[] invertedIndexFields, int[] filterFieldsForNonBulkLoadOps,
int[] invertedIndexFieldsForNonBulkLoadOps, double bloomFilterFalsePositiveRate) {
super(storageManager, typeTraits, cmpFactories, filterTypeTraits, filterCmpFactories, filterFields,
@@ -63,7 +68,9 @@
vbcProvider, ioSchedulerProvider, mergePolicyFactory, mergePolicyProperties, durable);
this.tokenTypeTraits = tokenTypeTraits;
this.tokenCmpFactories = tokenCmpFactories;
+ // ToDo: totally replace tokenizerFactory with full-text config
this.tokenizerFactory = tokenizerFactory;
+ this.fullTextConfigEvaluatorFactory = fullTextConfigEvaluatorFactory;
this.isPartitioned = isPartitioned;
this.invertedIndexFields = invertedIndexFields;
this.filterFieldsForNonBulkLoadOps = filterFieldsForNonBulkLoadOps;
@@ -77,8 +84,8 @@
filterTypeTraits, filterCmpFactories, filterFields, opTrackerProvider, ioOpCallbackFactory,
pageWriteCallbackFactory, metadataPageManagerFactory, vbcProvider, ioSchedulerProvider,
mergePolicyFactory, mergePolicyProperties, durable, tokenTypeTraits, tokenCmpFactories,
- tokenizerFactory, isPartitioned, invertedIndexFields, filterFieldsForNonBulkLoadOps,
- invertedIndexFieldsForNonBulkLoadOps, bloomFilterFalsePositiveRate);
+ tokenizerFactory, fullTextConfigEvaluatorFactory, isPartitioned, invertedIndexFields,
+ filterFieldsForNonBulkLoadOps, invertedIndexFieldsForNonBulkLoadOps, bloomFilterFalsePositiveRate);
}
}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/LSMInvertedIndexSearchOperatorDescriptor.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/LSMInvertedIndexSearchOperatorDescriptor.java
index 13a649d..2cee54d 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/LSMInvertedIndexSearchOperatorDescriptor.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/LSMInvertedIndexSearchOperatorDescriptor.java
@@ -31,6 +31,7 @@
import org.apache.hyracks.storage.am.common.dataflow.IIndexDataflowHelperFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearchModifier;
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearchModifierFactory;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluatorFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
public class LSMInvertedIndexSearchOperatorDescriptor extends AbstractSingleActivityOperatorDescriptor {
@@ -44,6 +45,7 @@
private final boolean isFullTextSearchQuery;
private final IIndexDataflowHelperFactory indexHelperFactory;
private final IBinaryTokenizerFactory queryTokenizerFactory;
+ private final IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory;
private final boolean retainInput;
private final boolean retainMissing;
private final IMissingWriterFactory missingWriterFactory;
@@ -54,14 +56,16 @@
public LSMInvertedIndexSearchOperatorDescriptor(IOperatorDescriptorRegistry spec, RecordDescriptor outRecDesc,
int queryField, IIndexDataflowHelperFactory indexHelperFactory,
- IBinaryTokenizerFactory queryTokenizerFactory, IInvertedIndexSearchModifierFactory searchModifierFactory,
- boolean retainInput, boolean retainMissing, IMissingWriterFactory missingWriterFactory,
- ISearchOperationCallbackFactory searchCallbackFactory, int[] minFilterFieldIndexes,
- int[] maxFilterFieldIndexes, boolean isFullTextSearchQuery, int numOfFields, boolean appendIndexFilter,
- int frameLimit) {
+ IBinaryTokenizerFactory queryTokenizerFactory,
+ IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory,
+ IInvertedIndexSearchModifierFactory searchModifierFactory, boolean retainInput, boolean retainMissing,
+ IMissingWriterFactory missingWriterFactory, ISearchOperationCallbackFactory searchCallbackFactory,
+ int[] minFilterFieldIndexes, int[] maxFilterFieldIndexes, boolean isFullTextSearchQuery, int numOfFields,
+ boolean appendIndexFilter, int frameLimit) {
super(spec, 1, 1);
this.indexHelperFactory = indexHelperFactory;
this.queryTokenizerFactory = queryTokenizerFactory;
+ this.fullTextConfigEvaluatorFactory = fullTextConfigEvaluatorFactory;
this.queryField = queryField;
this.retainInput = retainInput;
this.retainMissing = retainMissing;
@@ -84,7 +88,7 @@
return new LSMInvertedIndexSearchOperatorNodePushable(ctx,
recordDescProvider.getInputRecordDescriptor(getActivityId(), 0), partition, minFilterFieldIndexes,
maxFilterFieldIndexes, indexHelperFactory, retainInput, retainMissing, missingWriterFactory,
- searchCallbackFactory, searchModifier, queryTokenizerFactory, queryField, isFullTextSearchQuery,
- numOfFields, appendIndexFilter, frameLimit);
+ searchCallbackFactory, searchModifier, queryTokenizerFactory, fullTextConfigEvaluatorFactory,
+ queryField, isFullTextSearchQuery, numOfFields, appendIndexFilter, frameLimit);
}
}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/LSMInvertedIndexSearchOperatorNodePushable.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/LSMInvertedIndexSearchOperatorNodePushable.java
index a27dea7..e4dbb43 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/LSMInvertedIndexSearchOperatorNodePushable.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/dataflow/LSMInvertedIndexSearchOperatorNodePushable.java
@@ -34,7 +34,10 @@
import org.apache.hyracks.storage.am.common.dataflow.IIndexDataflowHelperFactory;
import org.apache.hyracks.storage.am.common.dataflow.IndexSearchOperatorNodePushable;
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearchModifier;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluator;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluatorFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.search.InvertedIndexSearchPredicate;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
import org.apache.hyracks.storage.common.IIndexAccessParameters;
import org.apache.hyracks.storage.common.ISearchPredicate;
@@ -43,6 +46,7 @@
protected final IInvertedIndexSearchModifier searchModifier;
protected final IBinaryTokenizerFactory binaryTokenizerFactory;
+ protected final IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory;
protected final int queryFieldIndex;
protected final int numOfFields;
// Keeps the information whether the given query is a full-text search or not.
@@ -57,12 +61,14 @@
IIndexDataflowHelperFactory indexHelperFactory, boolean retainInput, boolean retainMissing,
IMissingWriterFactory missingWriterFactory, ISearchOperationCallbackFactory searchCallbackFactory,
IInvertedIndexSearchModifier searchModifier, IBinaryTokenizerFactory binaryTokenizerFactory,
- int queryFieldIndex, boolean isFullTextSearchQuery, int numOfFields, boolean appendIndexFilter,
- int frameLimit) throws HyracksDataException {
+ IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory, int queryFieldIndex,
+ boolean isFullTextSearchQuery, int numOfFields, boolean appendIndexFilter, int frameLimit)
+ throws HyracksDataException {
super(ctx, inputRecDesc, partition, minFilterFieldIndexes, maxFilterFieldIndexes, indexHelperFactory,
retainInput, retainMissing, missingWriterFactory, searchCallbackFactory, appendIndexFilter);
this.searchModifier = searchModifier;
this.binaryTokenizerFactory = binaryTokenizerFactory;
+ this.fullTextConfigEvaluatorFactory = fullTextConfigEvaluatorFactory;
this.queryFieldIndex = queryFieldIndex;
this.isFullTextSearchQuery = isFullTextSearchQuery;
// If retainInput is true, the frameTuple is created in IndexSearchOperatorNodePushable.open().
@@ -79,7 +85,11 @@
@Override
protected ISearchPredicate createSearchPredicate() {
- return new InvertedIndexSearchPredicate(binaryTokenizerFactory.createTokenizer(), searchModifier, minFilterKey,
+ IBinaryTokenizer tokenizer = binaryTokenizerFactory.createTokenizer();
+ IFullTextConfigEvaluator fullTextConfigEvaluator =
+ fullTextConfigEvaluatorFactory.createFullTextConfigEvaluator();
+
+ return new InvertedIndexSearchPredicate(tokenizer, fullTextConfigEvaluator, searchModifier, minFilterKey,
maxFilterKey, isFullTextSearchQuery);
}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/AbstractFullTextFilterEvaluator.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/AbstractFullTextFilterEvaluator.java
new file mode 100644
index 0000000..6225101
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/AbstractFullTextFilterEvaluator.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.storage.am.lsm.invertedindex.fulltext;
+
+public abstract class AbstractFullTextFilterEvaluator implements IFullTextFilterEvaluator {
+ // Base class for filter evaluators: stores the filter's name and type and exposes them through getters.
+ protected final String name;
+ protected final FullTextFilterType type;
+ // Records the filter name and the filter type supplied by the concrete evaluator.
+ public AbstractFullTextFilterEvaluator(String name, FullTextFilterType type) {
+ this.name = name;
+ this.type = type;
+ }
+ // Returns the name given at construction.
+ public String getName() {
+ return name;
+ }
+ // Returns the filter type given at construction.
+ public FullTextFilterType getFilterType() {
+ return type;
+ }
+}
\ No newline at end of file
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/AbstractFullTextFilterEvaluatorFactory.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/AbstractFullTextFilterEvaluatorFactory.java
new file mode 100644
index 0000000..d883829
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/AbstractFullTextFilterEvaluatorFactory.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.storage.am.lsm.invertedindex.fulltext;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+public abstract class AbstractFullTextFilterEvaluatorFactory implements IFullTextFilterEvaluatorFactory {
+ protected final String name; // filter name, fixed at construction
+ protected final FullTextFilterType type; // filter type, fixed at construction
+ protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); // shared Jackson mapper available to subclasses
+ // Records the filter name and type for use by concrete factories.
+ protected AbstractFullTextFilterEvaluatorFactory(String name, FullTextFilterType type) {
+ this.name = name;
+ this.type = type;
+ }
+}
\ No newline at end of file
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/FullTextConfigEvaluator.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/FullTextConfigEvaluator.java
new file mode 100644
index 0000000..6805c43
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/FullTextConfigEvaluator.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.storage.am.lsm.invertedindex.fulltext;
+
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
+
+import com.google.common.collect.ImmutableList;
+
+// FullTextConfigEvaluator is the run-time evaluator, while IFullTextConfigDescriptor is the compile-time descriptor.
+//
+// The descriptor is responsible for serialization (i.e. being distributed from the CC to the NCs)
+// and for metadata translation (i.e. being written to the metadata catalog),
+// whereas the evaluator processes the tokens on each NC at run-time.
+public class FullTextConfigEvaluator implements IFullTextConfigEvaluator {
+
+ // name is not necessary for run-time token processing, however, let's keep this field for debugging purposes.
+ // The name of the evaluator is supposed to have the dataverse name and the name of the full-text config descriptor
+ private final String name;
+ // Due to a limitation of existing code (e.g. some call sites assume the input has an Asterix type tag while some don't),
+ // we may need to replace the tokenizer on-the-fly in certain call sites.
+ // So this field is not final
+ private IBinaryTokenizer tokenizer;
+ private final ImmutableList<IFullTextFilterEvaluator> filters;
+ // currentToken is what getToken() returns; nextToken is the look-ahead buffered by hasNext() until next() is called.
+ private IToken currentToken;
+ private IToken nextToken;
+ // Only the WORD category is accepted; any other category (including NGRAM) raises IllegalArgumentException.
+ public FullTextConfigEvaluator(String name, TokenizerCategory tokenizerCategory,
+ ImmutableList<IFullTextFilterEvaluator> filters) {
+ this.name = name;
+ this.filters = filters;
+
+ switch (tokenizerCategory) {
+ case WORD:
+ // Currently, the tokenizer is set later (via setTokenizer), after the evaluator has been created.
+ // This is because the tokenizer logic is complex,
+ // and we are already using a dedicated tokenizer factory to create the tokenizer.
+ // One tricky part of the tokenizer is that it can be call-site specific, e.g. the string in some call sites
+ // has the ATypeTag.String at the beginning of its byte array, and in some it doesn't,
+ // so if we only know the category of the tokenizer, e.g. a WORD tokenizer,
+ // we still cannot create a suitable tokenizer here the way the tokenizer factory does.
+ //
+ // Eventually we should get rid of the dedicated tokenizer factory and put its related logic
+ // in the full-text descriptor and evaluator
+ this.tokenizer = null;
+ break;
+ case NGRAM:
+ default:
+ throw new IllegalArgumentException("Unsupported tokenizer category: " + tokenizerCategory);
+ }
+ }
+ // Returns the tokenizer last passed to setTokenizer(), or null if none has been set yet.
+ @Override
+ public IBinaryTokenizer getTokenizer() {
+ return tokenizer;
+ }
+ // Delegates to the current tokenizer.
+ @Override
+ public TokenizerCategory getTokenizerCategory() {
+ return tokenizer.getTokenizerCategory(); // NOTE(review): tokenizer is null until setTokenizer() is called - confirm callers set it first
+ }
+ // Installs (or replaces) the tokenizer used by reset(), hasNext() and getTokensCount().
+ @Override
+ public void setTokenizer(IBinaryTokenizer tokenizer) {
+ this.tokenizer = tokenizer;
+ }
+
+ @Override
+ public String getName() {
+ return name;
+ }
+ // Clears the buffered tokens and points the tokenizer at the given byte range.
+ @Override
+ public void reset(byte[] data, int start, int length) {
+ currentToken = null;
+ nextToken = null;
+ tokenizer.reset(data, start, length);
+ }
+ // Returns the token selected by the most recent next() call (null before the first next() after a reset).
+ @Override
+ public IToken getToken() {
+ return currentToken;
+ }
+ // Advances the tokenizer until a token survives every filter, buffers it in nextToken, and reports whether one exists.
+ @Override
+ public boolean hasNext() {
+ if (nextToken != null) {
+ return true;
+ }
+ // Pull tokens from the tokenizer, skipping those that a filter removes.
+ while (tokenizer.hasNext()) {
+ tokenizer.next();
+ IToken candidateToken = tokenizer.getToken();
+ for (IFullTextFilterEvaluator filter : filters) {
+ // ToDo: Tokenizer of TokenizerType.List would return strings starting with the length,
+ // e.g. 8database where 8 is the length
+ // Should we let TokenizerType.List return the same thing as TokenizerType.String to make things easier?
+ // If so, we need to remove the length in the input string in all the call sites of the tokenizer
+ // Otherwise, filters need tokenizer.getTokenizerType to decide if they need to remove the length themselves
+ candidateToken = filter.processToken(tokenizer.getTokenizerType(), candidateToken);
+ // null means the token is removed, i.e. it is a stopword
+ if (candidateToken == null) {
+ break;
+ }
+ }
+ // A non-null candidate passed all filters: buffer it and stop scanning.
+ if (candidateToken != null) {
+ nextToken = candidateToken;
+ break;
+ }
+ }
+
+ return nextToken != null;
+ }
+ // Promotes the buffered look-ahead token to the current token and clears the buffer.
+ @Override
+ public void next() {
+ currentToken = nextToken;
+ nextToken = null;
+ }
+ // Delegates to the underlying tokenizer; the filters are not consulted.
+ @Override
+ public int getTokensCount() {
+ return tokenizer.getTokensCount();
+ }
+
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/FullTextConfigEvaluatorFactory.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/FullTextConfigEvaluatorFactory.java
new file mode 100644
index 0000000..cb7c86a
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/FullTextConfigEvaluatorFactory.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.storage.am.lsm.invertedindex.fulltext;
+
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.api.io.IJsonSerializable;
+import org.apache.hyracks.api.io.IPersistedResourceRegistry;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import com.google.common.collect.ImmutableList;
+
+public class FullTextConfigEvaluatorFactory implements IFullTextConfigEvaluatorFactory {
+ private static final long serialVersionUID = 1L;
+ // Configuration captured at construction: config name, tokenizer category, and the filter factories in order.
+ private final String name;
+ private final TokenizerCategory tokenizerCategory;
+ private final ImmutableList<IFullTextFilterEvaluatorFactory> filters;
+
+ public FullTextConfigEvaluatorFactory(String name, TokenizerCategory tokenizerCategory,
+ ImmutableList<IFullTextFilterEvaluatorFactory> filters) {
+ this.name = name;
+ this.tokenizerCategory = tokenizerCategory;
+ this.filters = filters;
+ }
+ // Builds one filter evaluator per filter factory (in list order) and wraps them in a new FullTextConfigEvaluator.
+ @Override
+ public IFullTextConfigEvaluator createFullTextConfigEvaluator() {
+ ImmutableList.Builder<IFullTextFilterEvaluator> filterEvaluatorsBuilder = ImmutableList.builder();
+ for (IFullTextFilterEvaluatorFactory factory : filters) {
+ filterEvaluatorsBuilder.add(factory.createFullTextFilterEvaluator());
+ }
+ return new FullTextConfigEvaluator(name, tokenizerCategory, filterEvaluatorsBuilder.build());
+ }
+ // Returns a factory for a WORD-tokenizer configuration with no filters.
+ public static IFullTextConfigEvaluatorFactory getDefaultFactory() {
+ return new FullTextConfigEvaluatorFactory("default_config_evaluator_factory", TokenizerCategory.WORD,
+ ImmutableList.of());
+ }
+ // JSON field names used by toJson/fromJson, plus the mapper used to create the filters array node.
+ private static final String FIELD_NAME = "name";
+ private static final String FIELD_TOKENIZER_CATEGORY = "tokenizerCategory";
+ private static final String FIELD_FILTERS = "filters";
+ protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+ // Serializes the name, the tokenizer category (via toString()) and each filter factory's JSON into one object.
+ @Override
+ public JsonNode toJson(IPersistedResourceRegistry registry) throws HyracksDataException {
+ final ObjectNode json = registry.getClassIdentifier(getClass(), serialVersionUID);
+ json.put(FIELD_NAME, name);
+ json.put(FIELD_TOKENIZER_CATEGORY, tokenizerCategory.toString());
+
+ final ArrayNode filterArray = OBJECT_MAPPER.createArrayNode();
+ for (IFullTextFilterEvaluatorFactory filter : filters) {
+ filterArray.add(filter.toJson(registry));
+ }
+ json.set(FIELD_FILTERS, filterArray);
+
+ return json;
+ }
+ // Rebuilds a factory from the fields written by toJson; filter entries are deserialized through the registry.
+ public static IJsonSerializable fromJson(IPersistedResourceRegistry registry, JsonNode json)
+ throws HyracksDataException {
+ final String name = json.get(FIELD_NAME).asText();
+ final String tokenizerCategoryStr = json.get(FIELD_TOKENIZER_CATEGORY).asText();
+ TokenizerCategory tc = TokenizerCategory.getEnumIgnoreCase(tokenizerCategoryStr); // NOTE(review): may be null for an unrecognized string - confirm
+
+ ArrayNode filtersJsonNode = (ArrayNode) json.get(FIELD_FILTERS);
+ ImmutableList.Builder<IFullTextFilterEvaluatorFactory> filtersBuilder = ImmutableList.builder();
+ for (int i = 0; i < filtersJsonNode.size(); i++) {
+ filtersBuilder.add((IFullTextFilterEvaluatorFactory) registry.deserialize(filtersJsonNode.get(i)));
+ }
+ return new FullTextConfigEvaluatorFactory(name, tc, filtersBuilder.build());
+ }
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/FullTextFilterType.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/FullTextFilterType.java
new file mode 100644
index 0000000..abfd753
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/FullTextFilterType.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hyracks.storage.am.lsm.invertedindex.fulltext;
+
+import org.apache.commons.lang3.EnumUtils;
+
+public enum FullTextFilterType {
+ STOPWORDS("Stopwords"),
+ SYNONYM("Synonym"),
+ STEMMER("Stemmer");
+ // String value associated with each constant.
+ private final String value;
+
+ FullTextFilterType(String value) {
+ this.value = value;
+ }
+
+ public String getValue() {
+ return value;
+ }
+ // Case-insensitive lookup through EnumUtils; throws IllegalArgumentException instead of returning null on no match.
+ public static FullTextFilterType getEnumIgnoreCase(String str) {
+ FullTextFilterType type = EnumUtils.getEnumIgnoreCase(FullTextFilterType.class, str);
+
+ if (type == null) {
+ throw new IllegalArgumentException("Cannot convert string " + str + " to FullTextFilterType!");
+ }
+ return type;
+ }
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/IFullTextConfigEvaluator.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/IFullTextConfigEvaluator.java
new file mode 100644
index 0000000..94ee693
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/IFullTextConfigEvaluator.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.storage.am.lsm.invertedindex.fulltext;
+
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
+
+public interface IFullTextConfigEvaluator {
+ String getName();
+
+ TokenizerCategory getTokenizerCategory();
+
+ void setTokenizer(IBinaryTokenizer tokenizer);
+
+ void reset(byte[] data, int start, int length);
+
+ // ToDo: wrap the tokenizer and filters into a dedicated Java class
+ // so that at runtime the operators (evaluators) don't touch the usedByIndices field.
+ // That means the usedByIndices field should be modified via MetadataManager only at compile time
+ IBinaryTokenizer getTokenizer();
+ // Token iteration (as implemented by FullTextConfigEvaluator): hasNext(), then next(), then getToken().
+ IToken getToken();
+
+ boolean hasNext();
+
+ void next();
+
+ // Get the total number of tokens.
+ // Currently, it returns the number of tokens in the original text, which means stopwords are still counted
+ int getTokensCount();
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/IFullTextConfigEvaluatorFactory.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/IFullTextConfigEvaluatorFactory.java
new file mode 100644
index 0000000..9469b66
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/IFullTextConfigEvaluatorFactory.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.storage.am.lsm.invertedindex.fulltext;
+
+import java.io.Serializable;
+
+import org.apache.hyracks.api.io.IJsonSerializable;
+
+public interface IFullTextConfigEvaluatorFactory extends Serializable, IJsonSerializable {
+ IFullTextConfigEvaluator createFullTextConfigEvaluator(); // creates a run-time evaluator for this configuration
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/IFullTextFilterEvaluator.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/IFullTextFilterEvaluator.java
new file mode 100644
index 0000000..9c36ea3
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/IFullTextFilterEvaluator.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.storage.am.lsm.invertedindex.fulltext;
+
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.TokenizerInfo;
+
+// Full-text filter evaluator that processes tokens.
+// Such an evaluator is created via IFullTextFilterEvaluatorFactory,
+// and the run-time evaluator factory is created from IFullTextFilterDescriptor, which is a compile-time concept.
+public interface IFullTextFilterEvaluator {
+ IToken processToken(TokenizerInfo.TokenizerType tokenizerType, IToken token); // a null result means the token was removed
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/IFullTextFilterEvaluatorFactory.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/IFullTextFilterEvaluatorFactory.java
new file mode 100644
index 0000000..7f77e0f
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/IFullTextFilterEvaluatorFactory.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.storage.am.lsm.invertedindex.fulltext;
+
+import java.io.Serializable;
+
+import org.apache.hyracks.api.io.IJsonSerializable;
+
+// This full-text filter evaluator factory is stored in the index local resource,
+// so it needs to be IJsonSerializable.
+// Also, it is distributed from the CC (compile-time) to the NCs (run-time), so it needs to be Serializable.
+//
+// An IFullTextFilterEvaluatorFactory should always be wrapped in an IFullTextConfigEvaluatorFactory
+// because a filter cannot live without a config: a full-text config is responsible for tokenizing strings
+// and then feeding the tokens into the filters.
+public interface IFullTextFilterEvaluatorFactory extends IJsonSerializable, Serializable {
+ IFullTextFilterEvaluator createFullTextFilterEvaluator();
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/StopwordsFullTextFilterEvaluator.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/StopwordsFullTextFilterEvaluator.java
new file mode 100644
index 0000000..da5fbb0
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/StopwordsFullTextFilterEvaluator.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.storage.am.lsm.invertedindex.fulltext;
+
+import java.util.List;
+
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
+import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.TokenizerInfo;
+import org.apache.hyracks.util.string.UTF8StringUtil;
+
+import com.google.common.collect.ImmutableList;
+
+public class StopwordsFullTextFilterEvaluator extends AbstractFullTextFilterEvaluator {
+ // Filter evaluator that removes tokens whose text appears in the configured stopword list.
+ private final ImmutableList<String> stopwordList;
+
+ public StopwordsFullTextFilterEvaluator(String name, ImmutableList<String> stopwordList) {
+ super(name, FullTextFilterType.STOPWORDS);
+ this.stopwordList = stopwordList;
+ }
+
+ public List<String> getStopwordList() {
+ return stopwordList;
+ }
+ // Returns null when the token text equals an entry of the stopword list; otherwise returns the token unchanged.
+ @Override
+ public IToken processToken(TokenizerInfo.TokenizerType tokenizerType, IToken token) {
+ int start = token.getStartOffset();
+ int length = token.getTokenLength();
+
+ // The List tokenizer returns a token starting with the token length,
+ // e.g. 8database where the byte of value 8 means the token has a length of 8.
+ // We need to skip the length to fetch the pure string (e.g. "database" without 8)
+ if (tokenizerType == TokenizerInfo.TokenizerType.LIST) {
+ int numBytesToStoreLength = UTF8StringUtil
+ .getNumBytesToStoreLength(UTF8StringUtil.getUTFLength(token.getData(), token.getStartOffset()));
+ start += numBytesToStoreLength;
+ length -= numBytesToStoreLength;
+ }
+
+ String str = UTF8StringUtil.getUTF8StringInArray(token.getData(), start, length);
+ if (stopwordList.contains(str)) { // NOTE(review): list lookup per token; a set may be preferable for long stopword lists
+ return null;
+ }
+
+ return token;
+ }
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/StopwordsFullTextFilterEvaluatorFactory.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/StopwordsFullTextFilterEvaluatorFactory.java
new file mode 100644
index 0000000..b31aa17
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/StopwordsFullTextFilterEvaluatorFactory.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.storage.am.lsm.invertedindex.fulltext;
+
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.api.io.IJsonSerializable;
+import org.apache.hyracks.api.io.IPersistedResourceRegistry;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import com.google.common.collect.ImmutableList;
+
+public class StopwordsFullTextFilterEvaluatorFactory extends AbstractFullTextFilterEvaluatorFactory {
+ private static final long serialVersionUID = 1L;
+ // The stopwords handed to every evaluator this factory creates, and the JSON field names used for persistence.
+ private final ImmutableList<String> stopwordList;
+ private static final String STOPWORDS_FILTER_NAME = "stopwordsFilterName";
+ private static final String STOPWORDS_LIST = "stopwordsList";
+
+ public StopwordsFullTextFilterEvaluatorFactory(String name, ImmutableList<String> stopwordList) {
+ super(name, FullTextFilterType.STOPWORDS);
+ this.stopwordList = stopwordList;
+ }
+ // Creates a stopwords evaluator carrying this factory's name and stopword list.
+ @Override
+ public IFullTextFilterEvaluator createFullTextFilterEvaluator() {
+ return new StopwordsFullTextFilterEvaluator(name, stopwordList);
+ }
+
+ // ToDo: extract the common logic to a dedicated helper or utility class after more filters are implemented
+ @Override
+ public JsonNode toJson(IPersistedResourceRegistry registry) throws HyracksDataException {
+ final ObjectNode json = registry.getClassIdentifier(getClass(), serialVersionUID);
+ json.put(STOPWORDS_FILTER_NAME, name);
+ // Write the stopwords as a JSON array of strings, in list order.
+ ArrayNode stopwordsArrayNode = OBJECT_MAPPER.createArrayNode();
+ for (String s : stopwordList) {
+ stopwordsArrayNode.add(s);
+ }
+ json.set(STOPWORDS_LIST, stopwordsArrayNode);
+
+ return json;
+ }
+ // Rebuilds a factory from the name and stopword array written by toJson; the registry argument is not used here.
+ public static IJsonSerializable fromJson(IPersistedResourceRegistry registry, JsonNode json)
+ throws HyracksDataException {
+ final String name = json.get(STOPWORDS_FILTER_NAME).asText();
+
+ // ToDo: create a new function to extract a list from json
+ ImmutableList.Builder<String> stopwordsBuilder = ImmutableList.<String> builder();
+ JsonNode stopwordsArrayNode = json.get(STOPWORDS_LIST);
+ for (int i = 0; i < stopwordsArrayNode.size(); i++) {
+ stopwordsBuilder.add(stopwordsArrayNode.get(i).asText());
+ }
+ ImmutableList<String> stopwords = stopwordsBuilder.build();
+
+ return new StopwordsFullTextFilterEvaluatorFactory(name, stopwords);
+ }
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/TokenizerCategory.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/TokenizerCategory.java
new file mode 100644
index 0000000..9c9b1a1
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/fulltext/TokenizerCategory.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hyracks.storage.am.lsm.invertedindex.fulltext;
+
+import org.apache.commons.lang3.EnumUtils;
+
+/**
+ * Category of a binary tokenizer: n-gram based ({@link #NGRAM}) or word based ({@link #WORD}).
+ */
+public enum TokenizerCategory {
+ NGRAM,
+ WORD;
+
+ /**
+ * Looks up a category by name, ignoring case, by delegating to
+ * {@link EnumUtils#getEnumIgnoreCase(Class, String)}.
+ * NOTE(review): per the commons-lang3 documentation the lookup yields {@code null} (no exception) when
+ * no constant matches; confirm against the commons-lang3 version the project pins.
+ *
+ * @param str the category name to look up
+ * @return the matching category
+ */
+ public static TokenizerCategory getEnumIgnoreCase(String str) {
+ return EnumUtils.getEnumIgnoreCase(TokenizerCategory.class, str);
+ }
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/impls/LSMInvertedIndex.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/impls/LSMInvertedIndex.java
index 068df9a..1469304 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/impls/LSMInvertedIndex.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/impls/LSMInvertedIndex.java
@@ -60,6 +60,7 @@
import org.apache.hyracks.storage.am.lsm.common.impls.LSMComponentFileReferences;
import org.apache.hyracks.storage.am.lsm.common.impls.LSMComponentFilterManager;
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndex;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluatorFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.inmemory.InMemoryInvertedIndex;
import org.apache.hyracks.storage.am.lsm.invertedindex.inmemory.InMemoryInvertedIndexAccessor;
import org.apache.hyracks.storage.am.lsm.invertedindex.search.InvertedIndexSearchPredicate;
@@ -82,6 +83,7 @@
private static final Logger LOGGER = LogManager.getLogger();
protected final IBinaryTokenizerFactory tokenizerFactory;
+ protected final IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory;
// Type traits and comparators for tokens and inverted-list elements.
protected final ITypeTraits[] invListTypeTraits;
@@ -97,16 +99,17 @@
double bloomFilterFalsePositiveRate, IBufferCache diskBufferCache, ILSMIndexFileManager fileManager,
ITypeTraits[] invListTypeTraits, IBinaryComparatorFactory[] invListCmpFactories,
ITypeTraits[] tokenTypeTraits, IBinaryComparatorFactory[] tokenCmpFactories,
- IBinaryTokenizerFactory tokenizerFactory, ILSMMergePolicy mergePolicy, ILSMOperationTracker opTracker,
- ILSMIOOperationScheduler ioScheduler, ILSMIOOperationCallbackFactory ioOpCallbackFactory,
- ILSMPageWriteCallbackFactory pageWriteCallbackFactory, int[] invertedIndexFields, int[] filterFields,
- int[] filterFieldsForNonBulkLoadOps, int[] invertedIndexFieldsForNonBulkLoadOps, boolean durable,
- ITracer tracer) throws HyracksDataException {
+ IBinaryTokenizerFactory tokenizerFactory, IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory,
+ ILSMMergePolicy mergePolicy, ILSMOperationTracker opTracker, ILSMIOOperationScheduler ioScheduler,
+ ILSMIOOperationCallbackFactory ioOpCallbackFactory, ILSMPageWriteCallbackFactory pageWriteCallbackFactory,
+ int[] invertedIndexFields, int[] filterFields, int[] filterFieldsForNonBulkLoadOps,
+ int[] invertedIndexFieldsForNonBulkLoadOps, boolean durable, ITracer tracer) throws HyracksDataException {
super(ioManager, virtualBufferCaches, diskBufferCache, fileManager, bloomFilterFalsePositiveRate, mergePolicy,
opTracker, ioScheduler, ioOpCallbackFactory, pageWriteCallbackFactory, componentFactory,
componentFactory, filterFrameFactory, filterManager, filterFields, durable, filterHelper,
invertedIndexFields, tracer);
this.tokenizerFactory = tokenizerFactory;
+ this.fullTextConfigEvaluatorFactory = fullTextConfigEvaluatorFactory;
this.invListTypeTraits = invListTypeTraits;
this.invListCmpFactories = invListCmpFactories;
this.tokenTypeTraits = tokenTypeTraits;
@@ -406,6 +409,7 @@
VirtualFreePageManager virtualFreePageManager, int id) throws HyracksDataException {
return InvertedIndexUtils.createInMemoryBTreeInvertedindex(virtualBufferCache, virtualFreePageManager,
invListTypeTraits, invListCmpFactories, tokenTypeTraits, tokenCmpFactories, tokenizerFactory,
+ fullTextConfigEvaluatorFactory,
ioManager.resolveAbsolutePath(fileManager.getBaseDir() + "_virtual_vocab_" + id));
}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/impls/PartitionedLSMInvertedIndex.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/impls/PartitionedLSMInvertedIndex.java
index 23d6221..de1fc49 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/impls/PartitionedLSMInvertedIndex.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/impls/PartitionedLSMInvertedIndex.java
@@ -37,6 +37,7 @@
import org.apache.hyracks.storage.am.lsm.common.api.IVirtualBufferCache;
import org.apache.hyracks.storage.am.lsm.common.freepage.VirtualFreePageManager;
import org.apache.hyracks.storage.am.lsm.common.impls.LSMComponentFilterManager;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluatorFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.inmemory.InMemoryInvertedIndex;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexUtils;
@@ -51,16 +52,16 @@
double bloomFilterFalsePositiveRate, IBufferCache diskBufferCache, ILSMIndexFileManager fileManager,
ITypeTraits[] invListTypeTraits, IBinaryComparatorFactory[] invListCmpFactories,
ITypeTraits[] tokenTypeTraits, IBinaryComparatorFactory[] tokenCmpFactories,
- IBinaryTokenizerFactory tokenizerFactory, ILSMMergePolicy mergePolicy, ILSMOperationTracker opTracker,
- ILSMIOOperationScheduler ioScheduler, ILSMIOOperationCallbackFactory ioOpCallbackFactory,
- ILSMPageWriteCallbackFactory pageWriteCallbackFactory, int[] invertedIndexFields, int[] filterFields,
- int[] filterFieldsForNonBulkLoadOps, int[] invertedIndexFieldsForNonBulkLoadOps, boolean durable,
- ITracer tracer) throws HyracksDataException {
+ IBinaryTokenizerFactory tokenizerFactory, IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory,
+ ILSMMergePolicy mergePolicy, ILSMOperationTracker opTracker, ILSMIOOperationScheduler ioScheduler,
+ ILSMIOOperationCallbackFactory ioOpCallbackFactory, ILSMPageWriteCallbackFactory pageWriteCallbackFactory,
+ int[] invertedIndexFields, int[] filterFields, int[] filterFieldsForNonBulkLoadOps,
+ int[] invertedIndexFieldsForNonBulkLoadOps, boolean durable, ITracer tracer) throws HyracksDataException {
super(ioManager, virtualBufferCaches, componentFactory, filterHelper, filterFrameFactory, filterManager,
bloomFilterFalsePositiveRate, diskBufferCache, fileManager, invListTypeTraits, invListCmpFactories,
- tokenTypeTraits, tokenCmpFactories, tokenizerFactory, mergePolicy, opTracker, ioScheduler,
- ioOpCallbackFactory, pageWriteCallbackFactory, invertedIndexFields, filterFields,
- filterFieldsForNonBulkLoadOps, invertedIndexFieldsForNonBulkLoadOps, durable, tracer);
+ tokenTypeTraits, tokenCmpFactories, tokenizerFactory, fullTextConfigEvaluatorFactory, mergePolicy,
+ opTracker, ioScheduler, ioOpCallbackFactory, pageWriteCallbackFactory, invertedIndexFields,
+ filterFields, filterFieldsForNonBulkLoadOps, invertedIndexFieldsForNonBulkLoadOps, durable, tracer);
}
@Override
@@ -68,7 +69,8 @@
VirtualFreePageManager virtualFreePageManager, int id) throws HyracksDataException {
return InvertedIndexUtils.createPartitionedInMemoryBTreeInvertedindex(virtualBufferCache,
virtualFreePageManager, invListTypeTraits, invListCmpFactories, tokenTypeTraits, tokenCmpFactories,
- tokenizerFactory, ioManager.resolveAbsolutePath(fileManager.getBaseDir() + "_virtual_vocab_" + id));
+ tokenizerFactory, fullTextConfigEvaluatorFactory,
+ ioManager.resolveAbsolutePath(fileManager.getBaseDir() + "_virtual_vocab_" + id));
}
}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedIndex.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedIndex.java
index 914a469..5d74529 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedIndex.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedIndex.java
@@ -35,6 +35,7 @@
import org.apache.hyracks.storage.am.common.ophelpers.IndexOperation;
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInPlaceInvertedIndex;
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedListCursor;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluatorFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
import org.apache.hyracks.storage.common.IIndexAccessParameters;
import org.apache.hyracks.storage.common.IIndexBulkLoader;
@@ -50,6 +51,7 @@
protected final ITypeTraits[] invListTypeTraits;
protected final IBinaryComparatorFactory[] invListCmpFactories;
protected final IBinaryTokenizerFactory tokenizerFactory;
+ protected final IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory;
protected final ITypeTraits[] btreeTypeTraits;
protected final IBinaryComparatorFactory[] btreeCmpFactories;
@@ -57,12 +59,14 @@
public InMemoryInvertedIndex(IBufferCache virtualBufferCache, IPageManager virtualFreePageManager,
ITypeTraits[] invListTypeTraits, IBinaryComparatorFactory[] invListCmpFactories,
ITypeTraits[] tokenTypeTraits, IBinaryComparatorFactory[] tokenCmpFactories,
- IBinaryTokenizerFactory tokenizerFactory, FileReference btreeFileRef) throws HyracksDataException {
+ IBinaryTokenizerFactory tokenizerFactory, IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory,
+ FileReference btreeFileRef) throws HyracksDataException {
this.tokenTypeTraits = tokenTypeTraits;
this.tokenCmpFactories = tokenCmpFactories;
this.invListTypeTraits = invListTypeTraits;
this.invListCmpFactories = invListCmpFactories;
this.tokenizerFactory = tokenizerFactory;
+ this.fullTextConfigEvaluatorFactory = fullTextConfigEvaluatorFactory;
// BTree tuples: <tokens, inverted-list elements>.
int numBTreeFields = tokenTypeTraits.length + invListTypeTraits.length;
btreeTypeTraits = new ITypeTraits[numBTreeFields];
@@ -174,7 +178,8 @@
@Override
public InMemoryInvertedIndexAccessor createAccessor(IIndexAccessParameters iap) throws HyracksDataException {
return new InMemoryInvertedIndexAccessor(this,
- new InMemoryInvertedIndexOpContext(btree, tokenCmpFactories, tokenizerFactory),
+ new InMemoryInvertedIndexOpContext(btree, tokenCmpFactories, tokenizerFactory,
+ fullTextConfigEvaluatorFactory),
(IHyracksTaskContext) iap.getParameters().get(HyracksConstants.HYRACKS_TASK_CONTEXT));
}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedIndexOpContext.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedIndexOpContext.java
index 0457b46..1a52d2e 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedIndexOpContext.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/inmemory/InMemoryInvertedIndexOpContext.java
@@ -27,6 +27,8 @@
import org.apache.hyracks.storage.am.common.api.IIndexOperationContext;
import org.apache.hyracks.storage.am.common.impls.NoOpIndexAccessParameters;
import org.apache.hyracks.storage.am.common.ophelpers.IndexOperation;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluator;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluatorFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexTokenizingTupleIterator;
@@ -46,14 +48,16 @@
// To generate in-memory BTree tuples for insertions.
private final IBinaryTokenizerFactory tokenizerFactory;
+ private final IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory;
private InvertedIndexTokenizingTupleIterator tupleIter;
private boolean destroyed = false;
InMemoryInvertedIndexOpContext(BTree btree, IBinaryComparatorFactory[] tokenCmpFactories,
- IBinaryTokenizerFactory tokenizerFactory) {
+ IBinaryTokenizerFactory tokenizerFactory, IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory) {
this.btree = btree;
this.tokenCmpFactories = tokenCmpFactories;
this.tokenizerFactory = tokenizerFactory;
+ this.fullTextConfigEvaluatorFactory = fullTextConfigEvaluatorFactory;
}
@Override
@@ -93,9 +97,11 @@
}
protected void setTokenizingTupleIterator() {
- IBinaryTokenizer tokenizer = getTokenizerFactory().createTokenizer();
+ IBinaryTokenizer tokenizer = tokenizerFactory.createTokenizer();
+ IFullTextConfigEvaluator fullTextConfigEvaluator =
+ fullTextConfigEvaluatorFactory.createFullTextConfigEvaluator();
tupleIter = new InvertedIndexTokenizingTupleIterator(tokenCmpFactories.length,
- btree.getFieldCount() - tokenCmpFactories.length, tokenizer);
+ btree.getFieldCount() - tokenCmpFactories.length, tokenizer, fullTextConfigEvaluator);
}
public InvertedIndexTokenizingTupleIterator getTupleIter() {
@@ -122,6 +128,10 @@
return tokenizerFactory;
}
+ /**
+ * Returns the full-text config evaluator factory this context was constructed with.
+ * The backing field is private, so subclasses obtain the factory through this accessor.
+ */
+ public IFullTextConfigEvaluatorFactory getFullTextConfigEvaluatorFactory() {
+ return fullTextConfigEvaluatorFactory;
+ }
+
public void setTupleIter(InvertedIndexTokenizingTupleIterator tupleIter) {
this.tupleIter = tupleIter;
}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/inmemory/PartitionedInMemoryInvertedIndex.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/inmemory/PartitionedInMemoryInvertedIndex.java
index 806bbf5..bd9ce60 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/inmemory/PartitionedInMemoryInvertedIndex.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/inmemory/PartitionedInMemoryInvertedIndex.java
@@ -31,6 +31,7 @@
import org.apache.hyracks.storage.am.common.ophelpers.IndexOperation;
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearcher;
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IPartitionedInvertedIndex;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluatorFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.search.InvertedListPartitions;
import org.apache.hyracks.storage.am.lsm.invertedindex.search.PartitionedTOccurrenceSearcher;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
@@ -47,9 +48,10 @@
public PartitionedInMemoryInvertedIndex(IBufferCache memBufferCache, IPageManager memFreePageManager,
ITypeTraits[] invListTypeTraits, IBinaryComparatorFactory[] invListCmpFactories,
ITypeTraits[] tokenTypeTraits, IBinaryComparatorFactory[] tokenCmpFactories,
- IBinaryTokenizerFactory tokenizerFactory, FileReference btreeFileRef) throws HyracksDataException {
+ IBinaryTokenizerFactory tokenizerFactory, IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory,
+ FileReference btreeFileRef) throws HyracksDataException {
super(memBufferCache, memFreePageManager, invListTypeTraits, invListCmpFactories, tokenTypeTraits,
- tokenCmpFactories, tokenizerFactory, btreeFileRef);
+ tokenCmpFactories, tokenizerFactory, fullTextConfigEvaluatorFactory, btreeFileRef);
}
@Override
@@ -86,8 +88,8 @@
@Override
public PartitionedInMemoryInvertedIndexAccessor createAccessor(IIndexAccessParameters iap)
throws HyracksDataException {
- return new PartitionedInMemoryInvertedIndexAccessor(this,
- new PartitionedInMemoryInvertedIndexOpContext(btree, tokenCmpFactories, tokenizerFactory), iap);
+ return new PartitionedInMemoryInvertedIndexAccessor(this, new PartitionedInMemoryInvertedIndexOpContext(btree,
+ tokenCmpFactories, tokenizerFactory, fullTextConfigEvaluatorFactory), iap);
}
@Override
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/inmemory/PartitionedInMemoryInvertedIndexOpContext.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/inmemory/PartitionedInMemoryInvertedIndexOpContext.java
index 9cfaf7a..6b6401a 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/inmemory/PartitionedInMemoryInvertedIndexOpContext.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/inmemory/PartitionedInMemoryInvertedIndexOpContext.java
@@ -21,6 +21,8 @@
import org.apache.hyracks.api.dataflow.value.IBinaryComparatorFactory;
import org.apache.hyracks.storage.am.btree.impls.BTree;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluator;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluatorFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.util.PartitionedInvertedIndexTokenizingTupleIterator;
@@ -28,13 +30,15 @@
public class PartitionedInMemoryInvertedIndexOpContext extends InMemoryInvertedIndexOpContext {
public PartitionedInMemoryInvertedIndexOpContext(BTree btree, IBinaryComparatorFactory[] tokenCmpFactories,
- IBinaryTokenizerFactory tokenizerFactory) {
- super(btree, tokenCmpFactories, tokenizerFactory);
+ IBinaryTokenizerFactory tokenizerFactory, IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory) {
+ super(btree, tokenCmpFactories, tokenizerFactory, fullTextConfigEvaluatorFactory);
}
protected void setTokenizingTupleIterator() {
IBinaryTokenizer tokenizer = getTokenizerFactory().createTokenizer();
+ IFullTextConfigEvaluator fullTextConfigEvaluator =
+ getFullTextConfigEvaluatorFactory().createFullTextConfigEvaluator();
setTupleIter(new PartitionedInvertedIndexTokenizingTupleIterator(tokenCmpFactories.length,
- btree.getFieldCount() - tokenCmpFactories.length, tokenizer));
+ btree.getFieldCount() - tokenCmpFactories.length, tokenizer, fullTextConfigEvaluator));
}
}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/search/AbstractTOccurrenceSearcher.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/search/AbstractTOccurrenceSearcher.java
index 1a08fd0..2fb620c 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/search/AbstractTOccurrenceSearcher.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/search/AbstractTOccurrenceSearcher.java
@@ -45,8 +45,8 @@
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedListCursor;
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedListTupleReference;
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IObjectFactory;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluator;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.DelimitedUTF8StringBinaryTokenizer;
-import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.TokenizerInfo.TokenizerType;
import org.apache.hyracks.storage.am.lsm.invertedindex.util.InvertedIndexUtils;
@@ -126,25 +126,27 @@
protected void tokenizeQuery(InvertedIndexSearchPredicate searchPred) throws HyracksDataException {
ITupleReference queryTuple = searchPred.getQueryTuple();
int queryFieldIndex = searchPred.getQueryFieldIndex();
- IBinaryTokenizer queryTokenizer = searchPred.getQueryTokenizer();
+ IFullTextConfigEvaluator fullTextAnalyzer = searchPred.getFullTextConfigEvaluator();
+ fullTextAnalyzer.setTokenizer(searchPred.getQueryTokenizer());
+
// Is this a full-text query?
// Then, the last argument is conjunctive or disjunctive search option, not a query text.
// Thus, we need to remove the last argument.
boolean isFullTextSearchQuery = searchPred.getIsFullTextSearchQuery();
// Get the type of query tokenizer.
- TokenizerType queryTokenizerType = queryTokenizer.getTokenizerType();
+ TokenizerType queryTokenizerType = fullTextAnalyzer.getTokenizer().getTokenizerType();
int tokenCountInOneField = 0;
queryTokenAppender.reset(queryTokenFrame, true);
- queryTokenizer.reset(queryTuple.getFieldData(queryFieldIndex), queryTuple.getFieldStart(queryFieldIndex),
+ fullTextAnalyzer.reset(queryTuple.getFieldData(queryFieldIndex), queryTuple.getFieldStart(queryFieldIndex),
queryTuple.getFieldLength(queryFieldIndex));
- while (queryTokenizer.hasNext()) {
- queryTokenizer.next();
+ while (fullTextAnalyzer.hasNext()) {
+ fullTextAnalyzer.next();
queryTokenBuilder.reset();
tokenCountInOneField++;
try {
- IToken token = queryTokenizer.getToken();
+ IToken token = fullTextAnalyzer.getToken();
// For the full-text search, we don't support a phrase search yet.
// So, each field should have only one token.
// If it's a list, it can have multiple keywords in it. But, each keyword should not be a phrase.
@@ -161,6 +163,7 @@
}
}
+ // Includes the length of the string, e.g. 8database where 8 (of type byte instead of char) is the length of "database"
token.serializeToken(queryTokenBuilder.getFieldData());
queryTokenBuilder.addFieldEndOffset();
// WARNING: assuming one frame is big enough to hold all tokens
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/search/InvertedIndexSearchPredicate.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/search/InvertedIndexSearchPredicate.java
index 4ee685b..3cf9066 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/search/InvertedIndexSearchPredicate.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/search/InvertedIndexSearchPredicate.java
@@ -22,6 +22,7 @@
import org.apache.hyracks.dataflow.common.data.accessors.ITupleReference;
import org.apache.hyracks.storage.am.common.impls.AbstractSearchPredicate;
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearchModifier;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluator;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
import org.apache.hyracks.storage.common.MultiComparator;
@@ -31,21 +32,27 @@
private ITupleReference queryTuple;
private int queryFieldIndex;
private final IBinaryTokenizer queryTokenizer;
+ private final IFullTextConfigEvaluator fullTextConfigEvaluator;
private final IInvertedIndexSearchModifier searchModifier;
// Keeps the information whether the given query is a full-text search or not.
// We need to have this information to stop the search process since we don't allow a phrase search yet.
private boolean isFullTextSearchQuery;
- public InvertedIndexSearchPredicate(IBinaryTokenizer queryTokenizer, IInvertedIndexSearchModifier searchModifier) {
+ // Used for test only
+ public InvertedIndexSearchPredicate(IBinaryTokenizer queryTokenizer,
+ IFullTextConfigEvaluator fullTextConfigEvaluator, IInvertedIndexSearchModifier searchModifier) {
this.queryTokenizer = queryTokenizer;
+ this.fullTextConfigEvaluator = fullTextConfigEvaluator;
this.searchModifier = searchModifier;
this.isFullTextSearchQuery = false;
}
- public InvertedIndexSearchPredicate(IBinaryTokenizer queryTokenizer, IInvertedIndexSearchModifier searchModifier,
+ public InvertedIndexSearchPredicate(IBinaryTokenizer queryTokenizer,
+ IFullTextConfigEvaluator fullTextConfigEvaluator, IInvertedIndexSearchModifier searchModifier,
ITupleReference minFilterTuple, ITupleReference maxFilterTuple, boolean isFullTextSearchQuery) {
super(minFilterTuple, maxFilterTuple);
this.queryTokenizer = queryTokenizer;
+ this.fullTextConfigEvaluator = fullTextConfigEvaluator;
this.searchModifier = searchModifier;
this.isFullTextSearchQuery = isFullTextSearchQuery;
}
@@ -82,6 +89,10 @@
return queryTokenizer;
}
+ /**
+ * Returns the full-text config evaluator that was supplied when this predicate was constructed.
+ */
+ public IFullTextConfigEvaluator getFullTextConfigEvaluator() {
+ return fullTextConfigEvaluator;
+ }
+
@Override
public MultiComparator getLowKeyComparator() {
// TODO: This doesn't make sense for an inverted index. Change ISearchPredicate interface.
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
index 3a5224c..df06525 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/DelimitedUTF8StringBinaryTokenizer.java
@@ -19,6 +19,7 @@
package org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.TokenizerCategory;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.TokenizerInfo.TokenizerType;
import org.apache.hyracks.util.string.UTF8StringUtil;
@@ -51,6 +52,8 @@
return byteIndex < sentenceEndOffset;
}
+ // ToDo: the current approach (an allow-list of non-separator chars) does not work for characters that are
+ // encoded as surrogate pairs: each surrogate char is always judged to be a separator
public static boolean isSeparator(char c) {
return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER
|| Character.getType(c) == Character.OTHER_NUMBER);
@@ -119,4 +122,9 @@
public TokenizerType getTokenizerType() {
return TokenizerType.STRING;
}
+
+ /**
+ * Reports this tokenizer's category, which is always {@link TokenizerCategory#WORD}.
+ */
+ @Override
+ public TokenizerCategory getTokenizerCategory() {
+ return TokenizerCategory.WORD;
+ }
}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
index 6a7da02..0e7f30d 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/IBinaryTokenizer.java
@@ -19,20 +19,24 @@
package org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.TokenizerCategory;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.TokenizerInfo.TokenizerType;
public interface IBinaryTokenizer {
- public IToken getToken();
+ IToken getToken();
- public boolean hasNext();
+ boolean hasNext();
- public void next();
+ void next();
- public void reset(byte[] data, int start, int length);
+ void reset(byte[] data, int start, int length);
// Get the total number of tokens
- public short getTokensCount();
+ short getTokensCount();
- // Get the tokenizer types
- public TokenizerType getTokenizerType();
+ // Get the tokenizer types: String or List
+ TokenizerType getTokenizerType();
+
+ // WORD or NGRAM tokenizer
+ TokenizerCategory getTokenizerCategory();
}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
index 711a82f..0ab1662 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramUTF8StringBinaryTokenizer.java
@@ -19,6 +19,7 @@
package org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.TokenizerCategory;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.TokenizerInfo.TokenizerType;
import org.apache.hyracks.util.string.UTF8StringUtil;
@@ -136,4 +137,9 @@
public TokenizerType getTokenizerType() {
return TokenizerType.STRING;
}
+
+ /**
+ * Reports this tokenizer's category, which is always {@link TokenizerCategory#NGRAM}.
+ */
+ @Override
+ public TokenizerCategory getTokenizerCategory() {
+ return TokenizerCategory.NGRAM;
+ }
}
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTokenizingTupleIterator.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTokenizingTupleIterator.java
index 4cf0c55..19d6f11 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTokenizingTupleIterator.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexTokenizingTupleIterator.java
@@ -25,6 +25,7 @@
import org.apache.hyracks.dataflow.common.comm.io.ArrayTupleBuilder;
import org.apache.hyracks.dataflow.common.comm.io.ArrayTupleReference;
import org.apache.hyracks.dataflow.common.data.accessors.ITupleReference;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluator;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
@@ -36,38 +37,43 @@
protected final int invListFieldCount;
protected final ArrayTupleBuilder tupleBuilder;
protected final ArrayTupleReference tupleReference;
- protected final IBinaryTokenizer tokenizer;
+ protected final IFullTextConfigEvaluator fullTextConfigEvaluator;
protected ITupleReference inputTuple;
- public InvertedIndexTokenizingTupleIterator(int tokensFieldCount, int invListFieldCount,
- IBinaryTokenizer tokenizer) {
+ public InvertedIndexTokenizingTupleIterator(int tokensFieldCount, int invListFieldCount, IBinaryTokenizer tokenizer,
+ IFullTextConfigEvaluator fullTextConfigEvaluator) {
this.invListFieldCount = invListFieldCount;
this.tupleBuilder = new ArrayTupleBuilder(tokensFieldCount + invListFieldCount);
this.tupleReference = new ArrayTupleReference();
- this.tokenizer = tokenizer;
+ this.fullTextConfigEvaluator = fullTextConfigEvaluator;
+
+ // ToDo: check the upper-layer code to see if we can drop the tokenizer and use fullTextConfig instead
+ this.fullTextConfigEvaluator.setTokenizer(tokenizer);
}
public void reset(ITupleReference inputTuple) {
this.inputTuple = inputTuple;
- tokenizer.reset(inputTuple.getFieldData(DOC_FIELD_INDEX), inputTuple.getFieldStart(DOC_FIELD_INDEX),
- inputTuple.getFieldLength(DOC_FIELD_INDEX));
+ fullTextConfigEvaluator.reset(inputTuple.getFieldData(DOC_FIELD_INDEX),
+ inputTuple.getFieldStart(DOC_FIELD_INDEX), inputTuple.getFieldLength(DOC_FIELD_INDEX));
}
public boolean hasNext() {
- return tokenizer.hasNext();
+ return fullTextConfigEvaluator.hasNext();
}
public void next() throws HyracksDataException {
- tokenizer.next();
- IToken token = tokenizer.getToken();
+ fullTextConfigEvaluator.next();
+ IToken token = fullTextConfigEvaluator.getToken();
+
tupleBuilder.reset();
- // Add token field.
try {
+ // Add token field.
token.serializeToken(tupleBuilder.getFieldData());
} catch (IOException e) {
throw HyracksDataException.create(e);
}
tupleBuilder.addFieldEndOffset();
+
// Add inverted-list element fields.
for (int i = 0; i < invListFieldCount; i++) {
tupleBuilder.addField(inputTuple.getFieldData(i + 1), inputTuple.getFieldStart(i + 1),
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexUtils.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexUtils.java
index a9a01e9..dcde832 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexUtils.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/InvertedIndexUtils.java
@@ -52,6 +52,7 @@
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedListBuilder;
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedListBuilderFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedListTupleReference;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluatorFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.impls.LSMInvertedIndex;
import org.apache.hyracks.storage.am.lsm.invertedindex.impls.LSMInvertedIndexDiskComponentFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.impls.LSMInvertedIndexFileManager;
@@ -83,18 +84,21 @@
IPageManager virtualFreePageManager, ITypeTraits[] invListTypeTraits,
IBinaryComparatorFactory[] invListCmpFactories, ITypeTraits[] tokenTypeTraits,
IBinaryComparatorFactory[] tokenCmpFactories, IBinaryTokenizerFactory tokenizerFactory,
- FileReference btreeFileRef) throws HyracksDataException {
+ IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory, FileReference btreeFileRef)
+ throws HyracksDataException {
return new InMemoryInvertedIndex(memBufferCache, virtualFreePageManager, invListTypeTraits, invListCmpFactories,
- tokenTypeTraits, tokenCmpFactories, tokenizerFactory, btreeFileRef);
+ tokenTypeTraits, tokenCmpFactories, tokenizerFactory, fullTextConfigEvaluatorFactory, btreeFileRef);
}
public static InMemoryInvertedIndex createPartitionedInMemoryBTreeInvertedindex(IBufferCache memBufferCache,
IPageManager virtualFreePageManager, ITypeTraits[] invListTypeTraits,
IBinaryComparatorFactory[] invListCmpFactories, ITypeTraits[] tokenTypeTraits,
IBinaryComparatorFactory[] tokenCmpFactories, IBinaryTokenizerFactory tokenizerFactory,
- FileReference btreeFileRef) throws HyracksDataException {
+ IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory, FileReference btreeFileRef)
+ throws HyracksDataException {
return new PartitionedInMemoryInvertedIndex(memBufferCache, virtualFreePageManager, invListTypeTraits,
- invListCmpFactories, tokenTypeTraits, tokenCmpFactories, tokenizerFactory, btreeFileRef);
+ invListCmpFactories, tokenTypeTraits, tokenCmpFactories, tokenizerFactory,
+ fullTextConfigEvaluatorFactory, btreeFileRef);
}
public static OnDiskInvertedIndex createOnDiskInvertedIndex(IIOManager ioManager, IBufferCache bufferCache,
@@ -138,8 +142,9 @@
List<IVirtualBufferCache> virtualBufferCaches, ITypeTraits[] invListTypeTraits,
IBinaryComparatorFactory[] invListCmpFactories, ITypeTraits[] tokenTypeTraits,
IBinaryComparatorFactory[] tokenCmpFactories, IBinaryTokenizerFactory tokenizerFactory,
- IBufferCache diskBufferCache, String absoluteOnDiskDir, double bloomFilterFalsePositiveRate,
- ILSMMergePolicy mergePolicy, ILSMOperationTracker opTracker, ILSMIOOperationScheduler ioScheduler,
+ IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory, IBufferCache diskBufferCache,
+ String absoluteOnDiskDir, double bloomFilterFalsePositiveRate, ILSMMergePolicy mergePolicy,
+ ILSMOperationTracker opTracker, ILSMIOOperationScheduler ioScheduler,
ILSMIOOperationCallbackFactory ioOpCallbackFactory, ILSMPageWriteCallbackFactory pageWriteCallbackFactory,
int[] invertedIndexFields, ITypeTraits[] filterTypeTraits, IBinaryComparatorFactory[] filterCmpFactories,
int[] filterFields, int[] filterFieldsForNonBulkLoadOps, int[] invertedIndexFieldsForNonBulkLoadOps,
@@ -179,17 +184,19 @@
return new LSMInvertedIndex(ioManager, virtualBufferCaches, componentFactory, filterHelper, filterFrameFactory,
filterManager, bloomFilterFalsePositiveRate, diskBufferCache, fileManager, invListTypeTraits,
- invListCmpFactories, tokenTypeTraits, tokenCmpFactories, tokenizerFactory, mergePolicy, opTracker,
- ioScheduler, ioOpCallbackFactory, pageWriteCallbackFactory, invertedIndexFields, filterFields,
- filterFieldsForNonBulkLoadOps, invertedIndexFieldsForNonBulkLoadOps, durable, tracer);
+ invListCmpFactories, tokenTypeTraits, tokenCmpFactories, tokenizerFactory,
+ fullTextConfigEvaluatorFactory, mergePolicy, opTracker, ioScheduler, ioOpCallbackFactory,
+ pageWriteCallbackFactory, invertedIndexFields, filterFields, filterFieldsForNonBulkLoadOps,
+ invertedIndexFieldsForNonBulkLoadOps, durable, tracer);
}
public static PartitionedLSMInvertedIndex createPartitionedLSMInvertedIndex(IIOManager ioManager,
List<IVirtualBufferCache> virtualBufferCaches, ITypeTraits[] invListTypeTraits,
IBinaryComparatorFactory[] invListCmpFactories, ITypeTraits[] tokenTypeTraits,
IBinaryComparatorFactory[] tokenCmpFactories, IBinaryTokenizerFactory tokenizerFactory,
- IBufferCache diskBufferCache, String absoluteOnDiskDir, double bloomFilterFalsePositiveRate,
- ILSMMergePolicy mergePolicy, ILSMOperationTracker opTracker, ILSMIOOperationScheduler ioScheduler,
+ IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory, IBufferCache diskBufferCache,
+ String absoluteOnDiskDir, double bloomFilterFalsePositiveRate, ILSMMergePolicy mergePolicy,
+ ILSMOperationTracker opTracker, ILSMIOOperationScheduler ioScheduler,
ILSMIOOperationCallbackFactory ioOpCallbackFactory, ILSMPageWriteCallbackFactory pageWriteCallbackFactory,
int[] invertedIndexFields, ITypeTraits[] filterTypeTraits, IBinaryComparatorFactory[] filterCmpFactories,
int[] filterFields, int[] filterFieldsForNonBulkLoadOps, int[] invertedIndexFieldsForNonBulkLoadOps,
@@ -229,8 +236,9 @@
return new PartitionedLSMInvertedIndex(ioManager, virtualBufferCaches, componentFactory, filterHelper,
filterFrameFactory, filterManager, bloomFilterFalsePositiveRate, diskBufferCache, fileManager,
invListTypeTraits, invListCmpFactories, tokenTypeTraits, tokenCmpFactories, tokenizerFactory,
- mergePolicy, opTracker, ioScheduler, ioOpCallbackFactory, pageWriteCallbackFactory, invertedIndexFields,
- filterFields, filterFieldsForNonBulkLoadOps, invertedIndexFieldsForNonBulkLoadOps, durable, tracer);
+ fullTextConfigEvaluatorFactory, mergePolicy, opTracker, ioScheduler, ioOpCallbackFactory,
+ pageWriteCallbackFactory, invertedIndexFields, filterFields, filterFieldsForNonBulkLoadOps,
+ invertedIndexFieldsForNonBulkLoadOps, durable, tracer);
}
public static boolean checkTypeTraitsAllFixed(ITypeTraits[] typeTraits) {
diff --git a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/PartitionedInvertedIndexTokenizingTupleIterator.java b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/PartitionedInvertedIndexTokenizingTupleIterator.java
index c44dffb..783afc6 100644
--- a/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/PartitionedInvertedIndexTokenizingTupleIterator.java
+++ b/hyracks-fullstack/hyracks/hyracks-storage-am-lsm-invertedindex/src/main/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/PartitionedInvertedIndexTokenizingTupleIterator.java
@@ -23,6 +23,7 @@
import org.apache.hyracks.api.exceptions.HyracksDataException;
import org.apache.hyracks.dataflow.common.data.accessors.ITupleReference;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluator;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizer;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IToken;
@@ -32,28 +33,33 @@
protected short numTokens = 0;
public PartitionedInvertedIndexTokenizingTupleIterator(int tokensFieldCount, int invListFieldCount,
- IBinaryTokenizer tokenizer) {
- super(tokensFieldCount, invListFieldCount, tokenizer);
+ IBinaryTokenizer tokenizer, IFullTextConfigEvaluator fullTextConfigEvaluator) {
+ super(tokensFieldCount, invListFieldCount, tokenizer, fullTextConfigEvaluator);
}
+ @Override
public void reset(ITupleReference inputTuple) {
super.reset(inputTuple);
// Run through the tokenizer once to get the total number of tokens.
numTokens = 0;
- while (tokenizer.hasNext()) {
- tokenizer.next();
+ while (fullTextConfigEvaluator.hasNext()) {
+ fullTextConfigEvaluator.next();
numTokens++;
}
super.reset(inputTuple);
}
+ @Override
public void next() throws HyracksDataException {
- tokenizer.next();
- IToken token = tokenizer.getToken();
+ fullTextConfigEvaluator.next();
+ IToken token = fullTextConfigEvaluator.getToken();
+
tupleBuilder.reset();
try {
// Add token field.
token.serializeToken(tupleBuilder.getFieldData());
+
+ // Unlike super.next(), this variant also writes a numTokens field after the token field
tupleBuilder.addFieldEndOffset();
// Add field with number of tokens.
tupleBuilder.getDataOutput().writeShort(numTokens);
@@ -61,6 +67,7 @@
} catch (IOException e) {
throw HyracksDataException.create(e);
}
+
// Add inverted-list element fields.
for (int i = 0; i < invListFieldCount; i++) {
tupleBuilder.addField(inputTuple.getFieldData(i + 1), inputTuple.getFieldStart(i + 1),
diff --git a/hyracks-fullstack/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/pom.xml b/hyracks-fullstack/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/pom.xml
index 4934660..aac8b50 100644
--- a/hyracks-fullstack/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/pom.xml
+++ b/hyracks-fullstack/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/pom.xml
@@ -116,6 +116,10 @@
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-api</artifactId>
</dependency>
+ <dependency>
+ <groupId>com.google.guava</groupId>
+ <artifactId>guava</artifactId>
+ </dependency>
</dependencies>
</project>
diff --git a/hyracks-fullstack/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/multithread/LSMInvertedIndexTestWorker.java b/hyracks-fullstack/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/multithread/LSMInvertedIndexTestWorker.java
index 2fcd96f..9818147 100644
--- a/hyracks-fullstack/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/multithread/LSMInvertedIndexTestWorker.java
+++ b/hyracks-fullstack/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/multithread/LSMInvertedIndexTestWorker.java
@@ -43,6 +43,7 @@
import org.apache.hyracks.storage.am.common.impls.IndexAccessParameters;
import org.apache.hyracks.storage.am.config.AccessMethodTestsConfig;
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearchModifier;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.FullTextConfigEvaluatorFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.impls.LSMInvertedIndex;
import org.apache.hyracks.storage.am.lsm.invertedindex.impls.LSMInvertedIndexAccessor;
import org.apache.hyracks.storage.am.lsm.invertedindex.search.ConjunctiveSearchModifier;
@@ -90,6 +91,7 @@
IBinaryTokenizerFactory tokenizerFactory = invIndex.getTokenizerFactory();
int searchModifierIndex = Math.abs(rnd.nextInt()) % TEST_SEARCH_MODIFIERS.length;
InvertedIndexSearchPredicate searchPred = new InvertedIndexSearchPredicate(tokenizerFactory.createTokenizer(),
+ FullTextConfigEvaluatorFactory.getDefaultFactory().createFullTextConfigEvaluator(),
TEST_SEARCH_MODIFIERS[searchModifierIndex]);
switch (op) {
diff --git a/hyracks-fullstack/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestContext.java b/hyracks-fullstack/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestContext.java
index 524904f..3e8db34 100644
--- a/hyracks-fullstack/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestContext.java
+++ b/hyracks-fullstack/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestContext.java
@@ -51,6 +51,7 @@
import org.apache.hyracks.storage.am.lsm.common.freepage.VirtualFreePageManager;
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndex;
import org.apache.hyracks.storage.am.lsm.invertedindex.common.LSMInvertedIndexTestHarness;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluatorFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers.IBinaryTokenizerFactory;
import org.apache.hyracks.storage.am.lsm.invertedindex.util.LSMInvertedIndexTestUtils.HyracksTaskTestContext;
import org.apache.hyracks.storage.common.IIndex;
@@ -127,9 +128,10 @@
public static LSMInvertedIndexTestContext create(LSMInvertedIndexTestHarness harness,
ISerializerDeserializer[] fieldSerdes, int tokenFieldCount, IBinaryTokenizerFactory tokenizerFactory,
- InvertedIndexType invIndexType, int[] invertedIndexFields, ITypeTraits[] filterTypeTraits,
- IBinaryComparatorFactory[] filterCmpFactories, int[] filterFields, int[] filterFieldsForNonBulkLoadOps,
- int[] invertedIndexFieldsForNonBulkLoadOps) throws HyracksDataException {
+ IFullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory, InvertedIndexType invIndexType,
+ int[] invertedIndexFields, ITypeTraits[] filterTypeTraits, IBinaryComparatorFactory[] filterCmpFactories,
+ int[] filterFields, int[] filterFieldsForNonBulkLoadOps, int[] invertedIndexFieldsForNonBulkLoadOps)
+ throws HyracksDataException {
ITypeTraits[] allTypeTraits = SerdeUtils.serdesToTypeTraits(fieldSerdes);
IOManager ioManager = harness.getIOManager();
IBinaryComparatorFactory[] allCmpFactories =
@@ -157,7 +159,7 @@
invIndex = InvertedIndexUtils.createInMemoryBTreeInvertedindex(harness.getVirtualBufferCaches().get(0),
new VirtualFreePageManager(harness.getVirtualBufferCaches().get(0)), invListTypeTraits,
invListCmpFactories, tokenTypeTraits, tokenCmpFactories, tokenizerFactory,
- ioManager.resolveAbsolutePath(harness.getOnDiskDir()));
+ fullTextConfigEvaluatorFactory, ioManager.resolveAbsolutePath(harness.getOnDiskDir()));
break;
}
case PARTITIONED_INMEMORY: {
@@ -165,7 +167,7 @@
harness.getVirtualBufferCaches().get(0),
new VirtualFreePageManager(harness.getVirtualBufferCaches().get(0)), invListTypeTraits,
invListCmpFactories, tokenTypeTraits, tokenCmpFactories, tokenizerFactory,
- ioManager.resolveAbsolutePath(harness.getOnDiskDir()));
+ fullTextConfigEvaluatorFactory, ioManager.resolveAbsolutePath(harness.getOnDiskDir()));
break;
}
case ONDISK: {
@@ -183,8 +185,9 @@
case LSM: {
invIndex = InvertedIndexUtils.createLSMInvertedIndex(ioManager, harness.getVirtualBufferCaches(),
invListTypeTraits, invListCmpFactories, tokenTypeTraits, tokenCmpFactories, tokenizerFactory,
- harness.getDiskBufferCache(), harness.getOnDiskDir(), harness.getBoomFilterFalsePositiveRate(),
- harness.getMergePolicy(), harness.getOperationTracker(), harness.getIOScheduler(),
+ fullTextConfigEvaluatorFactory, harness.getDiskBufferCache(), harness.getOnDiskDir(),
+ harness.getBoomFilterFalsePositiveRate(), harness.getMergePolicy(),
+ harness.getOperationTracker(), harness.getIOScheduler(),
harness.getIOOperationCallbackFactory(), harness.getPageWriteCallbackFactory(),
invertedIndexFields, filterTypeTraits, filterCmpFactories, filterFields,
filterFieldsForNonBulkLoadOps, invertedIndexFieldsForNonBulkLoadOps, true,
@@ -196,9 +199,9 @@
case PARTITIONED_LSM: {
invIndex = InvertedIndexUtils.createPartitionedLSMInvertedIndex(ioManager,
harness.getVirtualBufferCaches(), invListTypeTraits, invListCmpFactories, tokenTypeTraits,
- tokenCmpFactories, tokenizerFactory, harness.getDiskBufferCache(), harness.getOnDiskDir(),
- harness.getBoomFilterFalsePositiveRate(), harness.getMergePolicy(),
- harness.getOperationTracker(), harness.getIOScheduler(),
+ tokenCmpFactories, tokenizerFactory, fullTextConfigEvaluatorFactory,
+ harness.getDiskBufferCache(), harness.getOnDiskDir(), harness.getBoomFilterFalsePositiveRate(),
+ harness.getMergePolicy(), harness.getOperationTracker(), harness.getIOScheduler(),
harness.getIOOperationCallbackFactory(), harness.getPageWriteCallbackFactory(),
invertedIndexFields, filterTypeTraits, filterCmpFactories, filterFields,
filterFieldsForNonBulkLoadOps, invertedIndexFieldsForNonBulkLoadOps, true,
@@ -217,7 +220,8 @@
case ONDISK:
case LSM: {
indexTupleIter = new InvertedIndexTokenizingTupleIterator(invIndex.getTokenTypeTraits().length,
- invIndex.getInvListTypeTraits().length, tokenizerFactory.createTokenizer());
+ invIndex.getInvListTypeTraits().length, tokenizerFactory.createTokenizer(),
+ fullTextConfigEvaluatorFactory.createFullTextConfigEvaluator());
break;
}
case PARTITIONED_INMEMORY:
@@ -225,7 +229,8 @@
case PARTITIONED_LSM: {
indexTupleIter =
new PartitionedInvertedIndexTokenizingTupleIterator(invIndex.getTokenTypeTraits().length,
- invIndex.getInvListTypeTraits().length, tokenizerFactory.createTokenizer());
+ invIndex.getInvListTypeTraits().length, tokenizerFactory.createTokenizer(),
+ fullTextConfigEvaluatorFactory.createFullTextConfigEvaluator());
break;
}
default: {
diff --git a/hyracks-fullstack/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java b/hyracks-fullstack/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java
index a5e47b5..3cc0913 100644
--- a/hyracks-fullstack/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java
+++ b/hyracks-fullstack/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java
@@ -88,6 +88,9 @@
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedIndexSearchModifier;
import org.apache.hyracks.storage.am.lsm.invertedindex.api.IInvertedListCursor;
import org.apache.hyracks.storage.am.lsm.invertedindex.common.LSMInvertedIndexTestHarness;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.FullTextConfigEvaluatorFactory;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.IFullTextConfigEvaluator;
+import org.apache.hyracks.storage.am.lsm.invertedindex.fulltext.TokenizerCategory;
import org.apache.hyracks.storage.am.lsm.invertedindex.impls.LSMInvertedIndexAccessor;
import org.apache.hyracks.storage.am.lsm.invertedindex.impls.LSMInvertedIndexMergeCursor;
import org.apache.hyracks.storage.am.lsm.invertedindex.search.InvertedIndexSearchPredicate;
@@ -113,11 +116,16 @@
import org.apache.hyracks.util.IThreadStatsCollector;
import org.apache.hyracks.util.ThreadStats;
+import com.google.common.collect.ImmutableList;
+
@SuppressWarnings("rawtypes")
public class LSMInvertedIndexTestUtils {
public static final int TEST_GRAM_LENGTH = 3;
+ public static FullTextConfigEvaluatorFactory fullTextConfigEvaluatorFactory =
+ new FullTextConfigEvaluatorFactory("test_config", TokenizerCategory.WORD, ImmutableList.of());
+
public static TupleGenerator createStringDocumentTupleGen(Random rnd) throws IOException {
IFieldValueGenerator[] fieldGens = new IFieldValueGenerator[2];
fieldGens[0] = new DocumentStringFieldValueGenerator(2, 10, 10000, rnd);
@@ -196,8 +204,9 @@
ITokenFactory tokenFactory = new UTF8WordTokenFactory();
IBinaryTokenizerFactory tokenizerFactory =
new DelimitedUTF8StringBinaryTokenizerFactory(true, false, tokenFactory);
- LSMInvertedIndexTestContext testCtx = LSMInvertedIndexTestContext.create(harness, fieldSerdes,
- fieldSerdes.length - 1, tokenizerFactory, invIndexType, null, null, null, null, null, null);
+ LSMInvertedIndexTestContext testCtx =
+ LSMInvertedIndexTestContext.create(harness, fieldSerdes, fieldSerdes.length - 1, tokenizerFactory,
+ fullTextConfigEvaluatorFactory, invIndexType, null, null, null, null, null, null);
return testCtx;
}
@@ -207,8 +216,9 @@
ITokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
IBinaryTokenizerFactory tokenizerFactory =
new DelimitedUTF8StringBinaryTokenizerFactory(true, false, tokenFactory);
- LSMInvertedIndexTestContext testCtx = LSMInvertedIndexTestContext.create(harness, fieldSerdes,
- fieldSerdes.length - 1, tokenizerFactory, invIndexType, null, null, null, null, null, null);
+ LSMInvertedIndexTestContext testCtx =
+ LSMInvertedIndexTestContext.create(harness, fieldSerdes, fieldSerdes.length - 1, tokenizerFactory,
+ fullTextConfigEvaluatorFactory, invIndexType, null, null, null, null, null, null);
return testCtx;
}
@@ -218,8 +228,9 @@
ITokenFactory tokenFactory = new UTF8NGramTokenFactory();
IBinaryTokenizerFactory tokenizerFactory =
new NGramUTF8StringBinaryTokenizerFactory(TEST_GRAM_LENGTH, true, true, false, tokenFactory);
- LSMInvertedIndexTestContext testCtx = LSMInvertedIndexTestContext.create(harness, fieldSerdes,
- fieldSerdes.length - 1, tokenizerFactory, invIndexType, null, null, null, null, null, null);
+ LSMInvertedIndexTestContext testCtx =
+ LSMInvertedIndexTestContext.create(harness, fieldSerdes, fieldSerdes.length - 1, tokenizerFactory,
+ fullTextConfigEvaluatorFactory, invIndexType, null, null, null, null, null, null);
return testCtx;
}
@@ -229,8 +240,9 @@
ITokenFactory tokenFactory = new HashedUTF8NGramTokenFactory();
IBinaryTokenizerFactory tokenizerFactory =
new NGramUTF8StringBinaryTokenizerFactory(TEST_GRAM_LENGTH, true, true, false, tokenFactory);
- LSMInvertedIndexTestContext testCtx = LSMInvertedIndexTestContext.create(harness, fieldSerdes,
- fieldSerdes.length - 1, tokenizerFactory, invIndexType, null, null, null, null, null, null);
+ LSMInvertedIndexTestContext testCtx =
+ LSMInvertedIndexTestContext.create(harness, fieldSerdes, fieldSerdes.length - 1, tokenizerFactory,
+ fullTextConfigEvaluatorFactory, invIndexType, null, null, null, null, null, null);
return testCtx;
}
@@ -561,7 +573,10 @@
iap.getParameters().put(HyracksConstants.HYRACKS_TASK_CONTEXT, ctx);
IInvertedIndexAccessor accessor = (IInvertedIndexAccessor) invIndex.createAccessor(iap);
IBinaryTokenizer tokenizer = testCtx.getTokenizerFactory().createTokenizer();
- InvertedIndexSearchPredicate searchPred = new InvertedIndexSearchPredicate(tokenizer, searchModifier);
+ IFullTextConfigEvaluator fullTextConfigEvaluator =
+ fullTextConfigEvaluatorFactory.createFullTextConfigEvaluator();
+ InvertedIndexSearchPredicate searchPred =
+ new InvertedIndexSearchPredicate(tokenizer, fullTextConfigEvaluator, searchModifier);
List<ITupleReference> documentCorpus = testCtx.getDocumentCorpus();
// Project away the primary-key field.
int[] fieldPermutation = new int[] { 0 };
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index f96ed72..e83aedb 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -618,6 +618,10 @@
int count = 0;
for (int i = 0; i < strlen; i++) {
+ // ToDo: we shouldn't use str.charAt(i) to convert a raw byte array to UTF-8 chars.
+ // One UTF-8 char has at most four bytes, while one Java char obtained via str.charAt(i) has 2 bytes.
+ // In this case, a UTF-8 char may consist of 2 Java chars, and 1 Java char can be converted into 3 UTF-8 bytes.
+ // For an emoji, it can be 6 bytes after being encoded to UTF-8.
c = str.charAt(i);
utflen += UTF8StringUtil.getModifiedUTF8Len(c);
}