Merge branch 'master' into raman/master_issue376
diff --git a/asterix-algebra/src/main/java/edu/uci/ics/asterix/algebra/operators/physical/InvertedIndexPOperator.java b/asterix-algebra/src/main/java/edu/uci/ics/asterix/algebra/operators/physical/InvertedIndexPOperator.java
index 48f9e36..6edcc39 100644
--- a/asterix-algebra/src/main/java/edu/uci/ics/asterix/algebra/operators/physical/InvertedIndexPOperator.java
+++ b/asterix-algebra/src/main/java/edu/uci/ics/asterix/algebra/operators/physical/InvertedIndexPOperator.java
@@ -205,14 +205,16 @@
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER,
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER,
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER,
- storageProperties.getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages());
+ storageProperties.getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate());
} else {
dataflowHelperFactory = new PartitionedLSMInvertedIndexDataflowHelperFactory(
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER,
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER,
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER,
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER,
- storageProperties.getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages());
+ storageProperties.getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate());
}
LSMInvertedIndexSearchOperatorDescriptor invIndexSearchOp = new LSMInvertedIndexSearchOperatorDescriptor(
jobSpec, queryField, appContext.getStorageManagerInterface(), secondarySplitsAndConstraint.first,
diff --git a/asterix-app/src/main/java/edu/uci/ics/asterix/file/DatasetOperations.java b/asterix-app/src/main/java/edu/uci/ics/asterix/file/DatasetOperations.java
index 25a2551..3600bea 100644
--- a/asterix-app/src/main/java/edu/uci/ics/asterix/file/DatasetOperations.java
+++ b/asterix-app/src/main/java/edu/uci/ics/asterix/file/DatasetOperations.java
@@ -128,7 +128,8 @@
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
- storageProperties.getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages()));
+ storageProperties.getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate()));
AlgebricksPartitionConstraintHelper.setPartitionConstraintInJobSpec(specPrimary, primaryBtreeDrop,
splitsAndConstraint.second);
@@ -181,8 +182,9 @@
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER, storageProperties
- .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages()),
- localResourceFactoryProvider, NoOpOperationCallbackFactory.INSTANCE);
+ .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate()), localResourceFactoryProvider,
+ NoOpOperationCallbackFactory.INSTANCE);
AlgebricksPartitionConstraintHelper.setPartitionConstraintInJobSpec(spec, indexCreateOp,
splitsAndConstraint.second);
spec.addRoot(indexCreateOp);
@@ -266,13 +268,13 @@
TreeIndexBulkLoadOperatorDescriptor btreeBulkLoad = new TreeIndexBulkLoadOperatorDescriptor(spec,
AsterixRuntimeComponentsProvider.NOINDEX_PROVIDER, AsterixRuntimeComponentsProvider.NOINDEX_PROVIDER,
splitsAndConstraint.first, typeTraits, comparatorFactories, blooFilterKeyFields, fieldPermutation,
- GlobalConfig.DEFAULT_BTREE_FILL_FACTOR, false, numElementsHint,
- new LSMBTreeDataflowHelperFactory(AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
+ GlobalConfig.DEFAULT_BTREE_FILL_FACTOR, false, numElementsHint, new LSMBTreeDataflowHelperFactory(
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
- AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER, storageProperties
- .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages()),
- NoOpOperationCallbackFactory.INSTANCE);
+ AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
+ AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
+ storageProperties.getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate()), NoOpOperationCallbackFactory.INSTANCE);
AlgebricksPartitionConstraintHelper.setPartitionConstraintInJobSpec(spec, btreeBulkLoad,
splitsAndConstraint.second);
diff --git a/asterix-app/src/main/java/edu/uci/ics/asterix/file/IndexOperations.java b/asterix-app/src/main/java/edu/uci/ics/asterix/file/IndexOperations.java
index cec97ad..67bdabb 100644
--- a/asterix-app/src/main/java/edu/uci/ics/asterix/file/IndexOperations.java
+++ b/asterix-app/src/main/java/edu/uci/ics/asterix/file/IndexOperations.java
@@ -57,7 +57,8 @@
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
- storageProperties.getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages()));
+ storageProperties.getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate()));
AlgebricksPartitionConstraintHelper
.setPartitionConstraintInJobSpec(spec, btreeDrop, splitsAndConstraint.second);
spec.addRoot(btreeDrop);
diff --git a/asterix-app/src/main/java/edu/uci/ics/asterix/file/SecondaryBTreeCreator.java b/asterix-app/src/main/java/edu/uci/ics/asterix/file/SecondaryBTreeCreator.java
index 958f8d1..c3a2c01 100644
--- a/asterix-app/src/main/java/edu/uci/ics/asterix/file/SecondaryBTreeCreator.java
+++ b/asterix-app/src/main/java/edu/uci/ics/asterix/file/SecondaryBTreeCreator.java
@@ -47,13 +47,14 @@
TreeIndexCreateOperatorDescriptor secondaryIndexCreateOp = new TreeIndexCreateOperatorDescriptor(spec,
AsterixRuntimeComponentsProvider.NOINDEX_PROVIDER, AsterixRuntimeComponentsProvider.NOINDEX_PROVIDER,
secondaryFileSplitProvider, secondaryRecDesc.getTypeTraits(), secondaryComparatorFactories,
- secondaryBloomFilterKeyFields,
- new LSMBTreeDataflowHelperFactory(AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
+ secondaryBloomFilterKeyFields, new LSMBTreeDataflowHelperFactory(
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
- AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER, storageProperties
- .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages()),
- localResourceFactoryProvider, NoOpOperationCallbackFactory.INSTANCE);
+ AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
+ AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
+ storageProperties.getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate()), localResourceFactoryProvider,
+ NoOpOperationCallbackFactory.INSTANCE);
AlgebricksPartitionConstraintHelper.setPartitionConstraintInJobSpec(spec, secondaryIndexCreateOp,
secondaryPartitionConstraint);
spec.addRoot(secondaryIndexCreateOp);
@@ -92,8 +93,8 @@
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER, storageProperties
- .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages()),
- BTree.DEFAULT_FILL_FACTOR);
+ .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate()), BTree.DEFAULT_FILL_FACTOR);
// Connect the operators.
spec.connect(new OneToOneConnectorDescriptor(spec), keyProviderOp, 0, primaryScanOp, 0);
diff --git a/asterix-app/src/main/java/edu/uci/ics/asterix/file/SecondaryIndexCreator.java b/asterix-app/src/main/java/edu/uci/ics/asterix/file/SecondaryIndexCreator.java
index 83c9393..253df4b 100644
--- a/asterix-app/src/main/java/edu/uci/ics/asterix/file/SecondaryIndexCreator.java
+++ b/asterix-app/src/main/java/edu/uci/ics/asterix/file/SecondaryIndexCreator.java
@@ -282,7 +282,8 @@
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER, storageProperties
- .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages()), false,
+ .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate()), false,
NoOpOperationCallbackFactory.INSTANCE);
AlgebricksPartitionConstraintHelper.setPartitionConstraintInJobSpec(spec, primarySearchOp,
primaryPartitionConstraint);
diff --git a/asterix-app/src/main/java/edu/uci/ics/asterix/file/SecondaryInvertedIndexCreator.java b/asterix-app/src/main/java/edu/uci/ics/asterix/file/SecondaryInvertedIndexCreator.java
index 7b53a28..366e247 100644
--- a/asterix-app/src/main/java/edu/uci/ics/asterix/file/SecondaryInvertedIndexCreator.java
+++ b/asterix-app/src/main/java/edu/uci/ics/asterix/file/SecondaryInvertedIndexCreator.java
@@ -270,14 +270,16 @@
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER,
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER,
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER,
- storageProperties.getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages());
+ storageProperties.getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate());
} else {
return new PartitionedLSMInvertedIndexDataflowHelperFactory(
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER,
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER,
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER,
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER,
- storageProperties.getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages());
+ storageProperties.getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate());
}
}
}
diff --git a/asterix-app/src/main/java/edu/uci/ics/asterix/file/SecondaryRTreeCreator.java b/asterix-app/src/main/java/edu/uci/ics/asterix/file/SecondaryRTreeCreator.java
index 1a5e0da..89a59e8 100644
--- a/asterix-app/src/main/java/edu/uci/ics/asterix/file/SecondaryRTreeCreator.java
+++ b/asterix-app/src/main/java/edu/uci/ics/asterix/file/SecondaryRTreeCreator.java
@@ -78,8 +78,9 @@
AsterixRuntimeComponentsProvider.LSMRTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMRTREE_PROVIDER, AqlMetadataProvider.proposeLinearizer(
keyType, secondaryComparatorFactories.length), storageProperties
- .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages()),
- localResourceFactoryProvider, NoOpOperationCallbackFactory.INSTANCE);
+ .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate()), localResourceFactoryProvider,
+ NoOpOperationCallbackFactory.INSTANCE);
AlgebricksPartitionConstraintHelper.setPartitionConstraintInJobSpec(spec, secondaryIndexCreateOp,
secondaryPartitionConstraint);
spec.addRoot(secondaryIndexCreateOp);
@@ -163,8 +164,8 @@
AsterixRuntimeComponentsProvider.LSMRTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMRTREE_PROVIDER, AqlMetadataProvider.proposeLinearizer(
keyType, secondaryComparatorFactories.length), storageProperties
- .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages()),
- BTree.DEFAULT_FILL_FACTOR);
+ .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate()), BTree.DEFAULT_FILL_FACTOR);
// Connect the operators.
spec.connect(new OneToOneConnectorDescriptor(spec), keyProviderOp, 0, primaryScanOp, 0);
diff --git a/asterix-app/src/test/resources/AQLTS/queries/createInternalDataSet.aql b/asterix-app/src/test/resources/AQLTS/queries/createInternalDataSet.aql
new file mode 100644
index 0000000..f141e45
--- /dev/null
+++ b/asterix-app/src/test/resources/AQLTS/queries/createInternalDataSet.aql
@@ -0,0 +1,3 @@
+create dataset ds1(someType) primary key id;
+create internal dataset ds2(someType) primary key id;
+
diff --git a/asterix-app/src/test/resources/AQLTS/queries/utf-8.aql b/asterix-app/src/test/resources/AQLTS/queries/utf-8.aql
new file mode 100644
index 0000000..b9c58ea
--- /dev/null
+++ b/asterix-app/src/test/resources/AQLTS/queries/utf-8.aql
@@ -0,0 +1,3 @@
+string-to-codepoint("äöß");
+string-to-codepoint("迎");
+/* currently fails (issue 277) string-to-codepoint("欢") */
diff --git a/asterix-app/src/test/resources/AQLTS/queries/variables.aql b/asterix-app/src/test/resources/AQLTS/queries/variables.aql
new file mode 100644
index 0000000..57bfa6b
--- /dev/null
+++ b/asterix-app/src/test/resources/AQLTS/queries/variables.aql
@@ -0,0 +1,4 @@
+let $a:=1
+let $b:=1
+return
+ $b-$a
diff --git a/asterix-app/src/test/resources/runtimets/queries/user-defined-functions/udf30/udf30.1.query.aql b/asterix-app/src/test/resources/runtimets/queries/user-defined-functions/udf30/udf30.1.query.aql
new file mode 100644
index 0000000..ffe6981
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/user-defined-functions/udf30/udf30.1.query.aql
@@ -0,0 +1,14 @@
+/*
+ * Description : Declare a UDF and try to use the function parameter outside
+ * of the function.
+ * Expected Res : Failure
+ * Date : Apr 10th 2013
+ */
+
+declare function abc($y) {
+ let $x:=3
+ return $x
+};
+
+let $z:=$y
+return $z
diff --git a/asterix-app/src/test/resources/runtimets/testsuite.xml b/asterix-app/src/test/resources/runtimets/testsuite.xml
index 37df2c8..a13db44 100644
--- a/asterix-app/src/test/resources/runtimets/testsuite.xml
+++ b/asterix-app/src/test/resources/runtimets/testsuite.xml
@@ -4160,6 +4160,12 @@
</compilation-unit>
</test-case>
<test-case FilePath="user-defined-functions">
+ <compilation-unit name="udf30">
+ <output-dir compare="Text">udf30</output-dir>
+ <expected-error>edu.uci.ics.asterix.common.exceptions.AsterixException</expected-error>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="user-defined-functions">
<compilation-unit name="f01">
<output-dir compare="Text">f01</output-dir>
<expected-error>edu.uci.ics.asterix.common.exceptions.AsterixException</expected-error>
diff --git a/asterix-aql/src/main/javacc/AQL.jj b/asterix-aql/src/main/javacc/AQL.jj
index dc01648..2a5f534 100644
--- a/asterix-aql/src/main/javacc/AQL.jj
+++ b/asterix-aql/src/main/javacc/AQL.jj
@@ -301,7 +301,7 @@
fdd,
ifNotExists);
}
- | <DATASET> nameComponents = QualifiedName()
+ | ("internal")? <DATASET> nameComponents = QualifiedName()
<LEFTPAREN> typeName = Identifier() <RIGHTPAREN>
ifNotExists = IfNotExists()
primaryKeyFields = PrimaryKey() ("on" nodeGroupName = Identifier() )?
@@ -420,7 +420,6 @@
boolean ifNotExists = false;
List<VarIdentifier> paramList = new ArrayList<VarIdentifier>();
String functionBody;
- VarIdentifier var = null;
Expression functionBodyExpr;
Token beginPos;
Token endPos;
@@ -431,6 +430,29 @@
{
"function" nameComponents = FunctionOrTypeName()
ifNotExists = IfNotExists()
+ paramList = ParameterList()
+ "{"
+ {
+ beginPos = token;
+ }
+ functionBodyExpr = Expression() "}"
+ {
+ endPos = token;
+ functionBody = extractFragment(beginPos.beginLine, beginPos.beginColumn, endPos.beginLine, endPos.beginColumn);
+ String dataverse = nameComponents.first.getValue();
+ String functionName = nameComponents.second.getValue();
+ signature = new FunctionSignature(dataverse, functionName, paramList.size());
+ getCurrentScope().addFunctionDescriptor(signature, false);
+ return new CreateFunctionStatement(signature, paramList, functionBody, ifNotExists);
+ }
+}
+
+List<VarIdentifier> ParameterList() throws ParseException:
+{
+ List<VarIdentifier> paramList = new ArrayList<VarIdentifier>();
+ VarIdentifier var = null;
+}
+{
<LEFTPAREN> (<VARIABLE>
{
var = new VarIdentifier();
@@ -445,19 +467,9 @@
paramList.add(var);
getCurrentScope().addNewVarSymbolToScope(var);
}
- )*)? <RIGHTPAREN> "{"
+ )*)? <RIGHTPAREN>
{
- beginPos = token;
- }
- functionBodyExpr = Expression() "}"
- {
- endPos = token;
- functionBody = extractFragment(beginPos.beginLine, beginPos.beginColumn, endPos.beginLine, endPos.beginColumn);
- String dataverse = nameComponents.first.getValue();
- String functionName = nameComponents.second.getValue();
- signature = new FunctionSignature(dataverse, functionName, paramList.size());
- getCurrentScope().addFunctionDescriptor(signature, false);
- return new CreateFunctionStatement(signature, paramList, functionBody, ifNotExists);
+ return paramList;
}
}
@@ -1093,34 +1105,19 @@
FunctionDecl funcDecl;
FunctionSignature signature;
String functionName;
- int arity = 0;
List<VarIdentifier> paramList = new ArrayList<VarIdentifier>();
Expression funcBody;
- VarIdentifier var = null;
createNewScope();
}
{
- "declare" "function" functionName = Identifier() <LEFTPAREN> (<VARIABLE>
+ "declare" "function" functionName = Identifier()
+ paramList = ParameterList()
+ "{" funcBody = Expression() "}"
{
- var = new VarIdentifier();
- var.setValue(token.image);
- paramList.add(var);
- getCurrentScope().addNewVarSymbolToScope(var);
- arity++;
- }
- ("," <VARIABLE>
- {
- var = new VarIdentifier();
- var.setValue(token.image);
- paramList.add(var);
- getCurrentScope().addNewVarSymbolToScope(var);
- arity++;
- }
- )*)? <RIGHTPAREN> "{" funcBody = Expression() "}"
- {
- signature = new FunctionSignature(defaultDataverse, functionName, arity);
+ signature = new FunctionSignature(defaultDataverse, functionName, paramList.size());
getCurrentScope().addFunctionDescriptor(signature, false);
funcDecl = new FunctionDecl(signature, paramList, funcBody);
+ removeCurrentScope();
return funcDecl;
}
}
@@ -1539,7 +1536,7 @@
VarIdentifier var = new VarIdentifier();
}
{
- <VARIABLE>
+ <VARIABLE>
{
String varName = token.image;
Identifier ident = lookupSymbol(varName);
@@ -1564,7 +1561,7 @@
VarIdentifier var = new VarIdentifier();
}
{
- <VARIABLE>
+ <VARIABLE>
{
Identifier ident = lookupSymbol(token.image);
if(ident != null) { // exist such ident
@@ -2204,7 +2201,7 @@
<DEFAULT>
TOKEN :
{
- <VARIABLE : "$" <IDENTIFIER> >
+ <VARIABLE : "$" (<LETTER>)+ (<LETTER> | <DIGIT> | "_")* >
}
SKIP:
diff --git a/asterix-common/src/main/java/edu/uci/ics/asterix/common/context/AsterixAppRuntimeContext.java b/asterix-common/src/main/java/edu/uci/ics/asterix/common/context/AsterixAppRuntimeContext.java
index 03e84ef..8fe3341 100644
--- a/asterix-common/src/main/java/edu/uci/ics/asterix/common/context/AsterixAppRuntimeContext.java
+++ b/asterix-common/src/main/java/edu/uci/ics/asterix/common/context/AsterixAppRuntimeContext.java
@@ -149,6 +149,10 @@
return mergePolicy;
}
+ public double getBloomFilterFalsePositiveRate() {
+ return storageProperties.getBloomFilterFalsePositiveRate();
+ }
+
public ILSMOperationTrackerFactory getLSMBTreeOperationTrackerFactory() {
return lsmBTreeOpTrackerFactory;
}
diff --git a/asterix-common/src/main/java/edu/uci/ics/asterix/common/context/AsterixAppRuntimeContextProviderForRecovery.java b/asterix-common/src/main/java/edu/uci/ics/asterix/common/context/AsterixAppRuntimeContextProviderForRecovery.java
index 8f2b96f..10660cc 100644
--- a/asterix-common/src/main/java/edu/uci/ics/asterix/common/context/AsterixAppRuntimeContextProviderForRecovery.java
+++ b/asterix-common/src/main/java/edu/uci/ics/asterix/common/context/AsterixAppRuntimeContextProviderForRecovery.java
@@ -42,6 +42,11 @@
}
@Override
+ public double getBloomFilterFalsePositiveRate() {
+ return asterixAppRuntimeContext.getBloomFilterFalsePositiveRate();
+ }
+
+ @Override
public ILSMMergePolicy getLSMMergePolicy() {
return asterixAppRuntimeContext.getLSMMergePolicy();
}
@@ -100,5 +105,4 @@
public ILSMIOOperationCallbackProvider getNoOpIOOperationCallbackProvider() {
return asterixAppRuntimeContext.getNoOpIOOperationCallbackProvider();
}
-
}
diff --git a/asterix-doc/pom.xml b/asterix-doc/pom.xml
index f44c06d..d987e5f 100644
--- a/asterix-doc/pom.xml
+++ b/asterix-doc/pom.xml
@@ -12,6 +12,9 @@
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId>
<version>3.3</version>
+ <configuration>
+ <generateReports>false</generateReports>
+ </configuration>
</plugin>
</plugins>
</build>
diff --git a/asterix-doc/src/site/markdown/AccessingExternalDataInAsterixDB.md b/asterix-doc/src/site/markdown/AccessingExternalDataInAsterixDB.md
new file mode 100644
index 0000000..7e49a0f
--- /dev/null
+++ b/asterix-doc/src/site/markdown/AccessingExternalDataInAsterixDB.md
@@ -0,0 +1,199 @@
+# Accessing External Data in AsterixDB #
+
+## Introduction ##
+Data that needs to be processed by ASTERIX could be residing outside ASTERIX storage. Examples include data files on a distributed file system such as HDFS or on the local file system of a machine that is part of an ASTERIX cluster. For ASTERIX to process such data, an end user may create a regular dataset in ASTERIX (a.k.a. an internal dataset) and load the dataset with the data. ASTERIX also supports *external datasets*, so that it is not necessary to “load” all data prior to using it. This also avoids creating multiple copies of data and the need to keep the copies in sync.
+
+### Adapter for an External Dataset ###
+External data is accessed using wrappers (adapters in ASTERIX) that abstract away the mechanism of connecting with an external service, receiving data and transforming the data into ADM records that are understood by ASTERIX. ASTERIX comes with built-in adapters for common storage systems such as HDFS or the local file system.
+
+### Creating an External Dataset ###
+
+As an example we consider the Lineitem dataset from [TPCH schema](http://www.openlinksw.com/dataspace/doc/dav/wiki/Main/VOSTPCHLinkedData/tpch.sql).
+
+We assume that you have successfully created an ASTERIX instance following the instructions at [Installing Asterix Using Managix](InstallingAsterixUsingManagix.html).
+_For constructing an example, we assume a single machine setup._
+
+Similar to a regular dataset, an external dataset has an associated datatype. We shall first create the datatype associated with each record in Lineitem data.
+Paste the following in the query textbox on the webpage at http://127.0.0.1 and hit 'Execute'.
+
+
+ create dataverse ExternalFileDemo;
+ use dataverse ExternalFileDemo;
+
+ create type LineitemType as closed {
+ l_orderkey:int32,
+ l_partkey: int32,
+ l_suppkey: int32,
+ l_linenumber: int32,
+ l_quantity: double,
+ l_extendedprice: double,
+ l_discount: double,
+ l_tax: double,
+ l_returnflag: string,
+ l_linestatus: string,
+ l_shipdate: string,
+ l_commitdate: string,
+ l_receiptdate: string,
+ l_shipinstruct: string,
+ l_shipmode: string,
+ l_comment: string}
+
+
+We describe here two scenarios.
+
+#### 1) Data file resides on the local file system of a host ####
+Prerequisite: The host is a part of the ASTERIX cluster.
+
+Earlier, we assumed a single machine ASTERIX setup. To satisfy the prerequisite, log-in to the machine running ASTERIX.
+
+ * Download the [data file](https://code.google.com/p/asterixdb/downloads/detail?name=lineitem.tbl&can=2&q=) to an appropriate location. We denote this location by SOURCE_PATH.
+
+ASTERIX provides a built-in adapter for data residing on the local file system. The adapter is referred to by its alias, 'localfs'. We create an external dataset named Lineitem and use the 'localfs' adapter.
+
+
+ create external dataset Lineitem(LineitemType)
+ using localfs
+
+Above, the definition is not complete as we need to provide a set of parameters that are specific to the source file.
+
+<table>
+<tr>
+ <td> Parameter </td>
+ <td> Description </td>
+</tr>
+<tr>
+ <td> path </td>
+ <td> A fully qualified path of the form <tt>host://&lt;absolute path&gt;</tt>.
+ Use a comma-separated list if there are multiple files.
+ E.g. <tt>host1://&lt;absolute path&gt;</tt>, <tt>host2://&lt;absolute path&gt;</tt> and so forth. </td>
+</tr>
+<tr>
+ <td> format </td>
+ <td> The format for the content. Use 'adm' for data in ADM (ASTERIX Data Model) or <a href="http://www.json.org/">JSON</a> format. Use 'delimited-text' if fields are separated by a delimiting character. </td></tr>
+<tr><td>delimiter</td><td>The delimiting character in the source file if format is 'delimited-text'</td></tr>
+</table>
+
+As we are using a single-machine ASTERIX instance, we use 127.0.0.1 as host in the path parameter.
+We *complete the create dataset statement* as follows.
+
+
+ use dataverse ExternalFileDemo;
+
+ create external dataset Lineitem(LineitemType)
+ using localfs
+ (("path"="127.0.0.1://SOURCE_PATH"),
+ ("format"="delimited-text"),
+ ("delimiter"="|"));
+
+
+Please substitute SOURCE_PATH with the absolute path to the source file on the local file system.
+
+#### Common source of error ####
+
+An incorrect value for the path parameter will give the following exception message when the dataset is used in a query.
+
+ edu.uci.ics.hyracks.algebricks.common.exceptions.AlgebricksException: edu.uci.ics.hyracks.api.exceptions.HyracksDataException: edu.uci.ics.hyracks.api.exceptions.HyracksDataException: Job failed.
+
+
+Verify the correctness of the path parameter provided to the localfs adapter. Note that the path parameter must be an absolute path to the data file. For example, if you saved your file in your home directory (assume it to be /home/joe), then the path value should be
+
+ 127.0.0.1:///home/joe/lineitem.tbl.
+
+
+In your web-browser, navigate to 127.0.0.1 and paste the above to the query text box. Finally hit 'Execute'.
+
+Next we move over to the section [Writing Queries against an External Dataset](#Writing_Queries_against_an_External_Dataset) and try a sample query against the external dataset.
+
+#### 2) Data file resides on an HDFS instance ####
+Prerequisite: It is required that the Namenode and at least one of the HDFS Datanodes are reachable from the hosts that form the ASTERIX cluster. ASTERIX provides a built-in adapter for data residing on HDFS. The HDFS adapter is referred to (in AQL) by its alias, 'hdfs'. We create an external dataset named Lineitem and associate the HDFS adapter with it.
+
+
+ create external dataset Lineitem(LineitemType)
+ using hdfs
+
+
+The above statement is *not complete* as we need to provide a set of parameters specific to the HDFS instance and the source file.
+These parameters are described below.
+
+<table>
+<tr>
+ <td> Parameter </td>
+ <td> Description </td>
+</tr>
+<tr>
+ <td> hdfs </td>
+ <td> The HDFS URL </td>
+</tr>
+<tr>
+ <td> path </td>
+ <td> The absolute path to the source HDFS file. Use a comma separated list if there are multiple files. </td></tr>
+<tr>
+ <td> input-format </td>
+ <td> The associated input format. Use 'text-input-format' for textual data or 'sequence-input-format' for binary data (sequence files). </td>
+</tr>
+<tr>
+ <td> format </td>
+ <td> The format for the content. Use 'adm' for data in ADM (ASTERIX Data Model) or
+ <a href="http://www.json.org/">JSON</a> format and use 'delimited-text' for delimited data
+ that has fields separated by a delimiting character. </td>
+</tr>
+<tr>
+ <td> delimiter </td>
+ <td> The delimiting character in the source file if format is 'delimited-text' </td>
+</tr>
+</table>
+
+*Difference between 'input-format' and 'format'*
+
+*input-format*: Files stored under HDFS have an associated storage format. For example, TextInputFormat represents plain text files. SequenceFileInputFormat indicates binary compressed files. The parameter 'input-format' is used to distinguish between these two kinds of files.
+
+*format*:
+The parameter 'format' refers to the type of the data contained in the file. For example, data contained in a file could be in JSON or ADM format, or could be delimited-text with fields separated by a delimiting character.
+
+As an example, consider the [data file](https://code.google.com/p/asterixdb/downloads/detail?name=lineitem.tbl&can=2&q=). The file is a text file with each line representing a record. The fields in each record are separated by the '|' character.
+
+We assume the HDFS URL to be hdfs://host:port. We further assume that the example data file is copied to the HDFS at a path denoted by HDFS_PATH.
+
+The complete set of parameters for our example file is as follows. (("hdfs"="HDFS_URL"),("path"="HDFS_PATH"),("input-format"="text-input-format"),("format"="delimited-text"),("delimiter"="|"))
+
+We modify the create external dataset statement as follows.
+
+
+ create external dataset Lineitem(LineitemType)
+ using hdfs
+ (("hdfs"="HDFS_URL"),("path"="HDFS_PATH"),("input-format"="text-input-format"),("format"="delimited-text"),("delimiter"="|"));
+
+
+Once you have copied the source data file to your HDFS instance, substitute the values of HDFS_URL and HDFS_PATH in the above statement. In your web-browser, navigate to http://127.0.0.1:19001 and execute the above statement with substituted values.
+
+You may now run the sample query in the next section.
+
+## Writing Queries against an External Dataset ##
+You may write AQL queries against an external dataset. Following is an example AQL query that applies a filter and returns an ordered result.
+
+
+ use dataverse ExternalFileDemo;
+
+ for $c in dataset('Lineitem')
+ where $c.l_orderkey <= 3
+ order by $c.l_orderkey, $c.l_linenumber
+ return $c
+
+
+The expected result is:
+
+
+ { "l_orderkey": 1, "l_partkey": 156, "l_suppkey": 4, "l_linenumber": 1, "l_quantity": 17, "l_extendedprice": 17954.55d, "l_discount": 0.04d, "l_tax": 0.02d, "l_returnflag": "N", "l_linestatus": "O", "l_shipdate": "1996-03-13", "l_commitdate": "1996-02-12", "l_receiptdate": "1996-03-22", "l_shipinstruct": "DELIVER IN PERSON", "l_shipmode": "TRUCK", "l_comment": "egular courts above the" }
+ { "l_orderkey": 1, "l_partkey": 68, "l_suppkey": 9, "l_linenumber": 2, "l_quantity": 36, "l_extendedprice": 34850.16d, "l_discount": 0.09d, "l_tax": 0.06d, "l_returnflag": "N", "l_linestatus": "O", "l_shipdate": "1996-04-12", "l_commitdate": "1996-02-28", "l_receiptdate": "1996-04-20", "l_shipinstruct": "TAKE BACK RETURN", "l_shipmode": "MAIL", "l_comment": "ly final dependencies: slyly bold " }
+ { "l_orderkey": 1, "l_partkey": 64, "l_suppkey": 5, "l_linenumber": 3, "l_quantity": 8, "l_extendedprice": 7712.48d, "l_discount": 0.1d, "l_tax": 0.02d, "l_returnflag": "N", "l_linestatus": "O", "l_shipdate": "1996-01-29", "l_commitdate": "1996-03-05", "l_receiptdate": "1996-01-31", "l_shipinstruct": "TAKE BACK RETURN", "l_shipmode": "REG AIR", "l_comment": "riously. regular, express dep" }
+ { "l_orderkey": 1, "l_partkey": 3, "l_suppkey": 6, "l_linenumber": 4, "l_quantity": 28, "l_extendedprice": 25284.0d, "l_discount": 0.09d, "l_tax": 0.06d, "l_returnflag": "N", "l_linestatus": "O", "l_shipdate": "1996-04-21", "l_commitdate": "1996-03-30", "l_receiptdate": "1996-05-16", "l_shipinstruct": "NONE", "l_shipmode": "AIR", "l_comment": "lites. fluffily even de" }
+ { "l_orderkey": 1, "l_partkey": 25, "l_suppkey": 8, "l_linenumber": 5, "l_quantity": 24, "l_extendedprice": 22200.48d, "l_discount": 0.1d, "l_tax": 0.04d, "l_returnflag": "N", "l_linestatus": "O", "l_shipdate": "1996-03-30", "l_commitdate": "1996-03-14", "l_receiptdate": "1996-04-01", "l_shipinstruct": "NONE", "l_shipmode": "FOB", "l_comment": " pending foxes. slyly re" }
+ { "l_orderkey": 1, "l_partkey": 16, "l_suppkey": 3, "l_linenumber": 6, "l_quantity": 32, "l_extendedprice": 29312.32d, "l_discount": 0.07d, "l_tax": 0.02d, "l_returnflag": "N", "l_linestatus": "O", "l_shipdate": "1996-01-30", "l_commitdate": "1996-02-07", "l_receiptdate": "1996-02-03", "l_shipinstruct": "DELIVER IN PERSON", "l_shipmode": "MAIL", "l_comment": "arefully slyly ex" }
+ { "l_orderkey": 2, "l_partkey": 107, "l_suppkey": 2, "l_linenumber": 1, "l_quantity": 38, "l_extendedprice": 38269.8d, "l_discount": 0.0d, "l_tax": 0.05d, "l_returnflag": "N", "l_linestatus": "O", "l_shipdate": "1997-01-28", "l_commitdate": "1997-01-14", "l_receiptdate": "1997-02-02", "l_shipinstruct": "TAKE BACK RETURN", "l_shipmode": "RAIL", "l_comment": "ven requests. deposits breach a" }
+ { "l_orderkey": 3, "l_partkey": 5, "l_suppkey": 2, "l_linenumber": 1, "l_quantity": 45, "l_extendedprice": 40725.0d, "l_discount": 0.06d, "l_tax": 0.0d, "l_returnflag": "R", "l_linestatus": "F", "l_shipdate": "1994-02-02", "l_commitdate": "1994-01-04", "l_receiptdate": "1994-02-23", "l_shipinstruct": "NONE", "l_shipmode": "AIR", "l_comment": "ongside of the furiously brave acco" }
+ { "l_orderkey": 3, "l_partkey": 20, "l_suppkey": 10, "l_linenumber": 2, "l_quantity": 49, "l_extendedprice": 45080.98d, "l_discount": 0.1d, "l_tax": 0.0d, "l_returnflag": "R", "l_linestatus": "F", "l_shipdate": "1993-11-09", "l_commitdate": "1993-12-20", "l_receiptdate": "1993-11-24", "l_shipinstruct": "TAKE BACK RETURN", "l_shipmode": "RAIL", "l_comment": " unusual accounts. eve" }
+ { "l_orderkey": 3, "l_partkey": 129, "l_suppkey": 8, "l_linenumber": 3, "l_quantity": 27, "l_extendedprice": 27786.24d, "l_discount": 0.06d, "l_tax": 0.07d, "l_returnflag": "A", "l_linestatus": "F", "l_shipdate": "1994-01-16", "l_commitdate": "1993-11-22", "l_receiptdate": "1994-01-23", "l_shipinstruct": "DELIVER IN PERSON", "l_shipmode": "SHIP", "l_comment": "nal foxes wake. " }
+ { "l_orderkey": 3, "l_partkey": 30, "l_suppkey": 5, "l_linenumber": 4, "l_quantity": 2, "l_extendedprice": 1860.06d, "l_discount": 0.01d, "l_tax": 0.06d, "l_returnflag": "A", "l_linestatus": "F", "l_shipdate": "1993-12-04", "l_commitdate": "1994-01-07", "l_receiptdate": "1994-01-01", "l_shipinstruct": "NONE", "l_shipmode": "TRUCK", "l_comment": "y. fluffily pending d" }
+ { "l_orderkey": 3, "l_partkey": 184, "l_suppkey": 5, "l_linenumber": 5, "l_quantity": 28, "l_extendedprice": 30357.04d, "l_discount": 0.04d, "l_tax": 0.0d, "l_returnflag": "R", "l_linestatus": "F", "l_shipdate": "1993-12-14", "l_commitdate": "1994-01-10", "l_receiptdate": "1994-01-01", "l_shipinstruct": "TAKE BACK RETURN", "l_shipmode": "FOB", "l_comment": "ages nag slyly pending" }
+ { "l_orderkey": 3, "l_partkey": 63, "l_suppkey": 8, "l_linenumber": 6, "l_quantity": 26, "l_extendedprice": 25039.56d, "l_discount": 0.1d, "l_tax": 0.02d, "l_returnflag": "A", "l_linestatus": "F", "l_shipdate": "1993-10-29", "l_commitdate": "1993-12-18", "l_receiptdate": "1993-11-04", "l_shipinstruct": "TAKE BACK RETURN", "l_shipmode": "RAIL", "l_comment": "ges sleep after the caref" }
+
diff --git a/asterix-doc/src/site/markdown/AdmAql101.md b/asterix-doc/src/site/markdown/AdmAql101.md
new file mode 100644
index 0000000..ed4736e
--- /dev/null
+++ b/asterix-doc/src/site/markdown/AdmAql101.md
@@ -0,0 +1,897 @@
+# AsterixDB 101: An ADM and AQL Primer #
+
+## Welcome to AsterixDB! ##
+This document introduces the main features of AsterixDB's data model (ADM) and query language (AQL) by example.
+The example is a simple scenario involving (synthetic) sample data modeled after data from the social domain.
+This document describes a set of sample ADM datasets, together with a set of illustrative AQL queries,
+to introduce you to the "AsterixDB user experience".
+The complete set of steps required to create and load a handful of sample datasets, along with runnable queries
+and the expected results for each query, are included.
+
+This document assumes that you are at least vaguely familiar with AsterixDB and why you might want to use it.
+Most importantly, it assumes you already have a running instance of AsterixDB and that you know how to query
+it using AsterixDB's basic web interface.
+For more information on these topics, you should go through the steps in
+[Installing Asterix Using Managix](InstallingAsterixUsingManagix.html)
+before reading this document and make sure that you have a running AsterixDB instance ready to go.
+To get your feet wet, you should probably start with a simple local installation of AsterixDB on your favorite
+machine, accepting all of the default settings that Managix offers.
+Later you can graduate to trying AsterixDB on a cluster, its real intended home (since it targets Big Data).
+(Note: With the exception of specifying the correct locations where you put the source data for this example,
+there should be no changes needed in your ADM or AQL statements to run the examples locally and/or to run them
+on a cluster when you are ready to take that step.)
+
+As you read through this document, you should try each step for yourself on your own AsterixDB instance.
+Once you have reached the end, you will be fully armed and dangerous, with all the basic AsterixDB knowledge
+that you'll need to start down the path of modeling, storing, and querying your own semistructured data.
+
+----
+## ADM: Modeling Semistructured Data in AsterixDB ##
+In this section you will learn all about modeling Big Data using
+ADM, the data model of the AsterixDB BDMS.
+
+### Dataverses, Datatypes, and Datasets ###
+The top-level organizing concept in the AsterixDB world is the _dataverse_.
+A dataverse---short for "data universe"---is a place (similar to a database in a relational DBMS) in which
+to create and manage the types, datasets, functions, and other artifacts for a given AsterixDB application.
+When you start using an AsterixDB instance for the first time, it starts out "empty"; it contains no data
+other than the AsterixDB system catalogs (which live in a special dataverse called the Metadata dataverse).
+To store your data in AsterixDB, you will first create a dataverse and then use it to hold the _datatypes_
+and _datasets_ for managing your own data.
+A datatype tells AsterixDB what you know (or more accurately, what you want it to know) a priori about one
+of the kinds of data instances that you want AsterixDB to hold for you.
+A dataset is a collection of data instances of a datatype,
+and AsterixDB makes sure that the data instances that you put in it conform to its specified type.
+Since AsterixDB targets semistructured data, you can use _open_ datatypes and tell it as little or as
+much as you wish about your data up front; the more you tell it up front, the less information it will
+have to store repeatedly in the individual data instances that you give it.
+Instances of open datatypes are permitted to have additional content, beyond what the datatype says,
+as long as they at least contain the information prescribed by the datatype definition.
+Open typing allows data to vary from one instance to another and it leaves wiggle room for application
+evolution in terms of what might need to be stored in the future.
+If you want to restrict data instances in a dataset to have only what the datatype says, and nothing extra,
+you can define a _closed_ datatype for that dataset and AsterixDB will keep users from storing objects
+that have extra data in them.
+Datatypes are open by default unless you tell AsterixDB otherwise.
+Let's put these concepts to work.
+
+Our little sample scenario involves hypothetical information about users of two popular social networks,
+Facebook and Twitter, and their messages.
+We'll start by defining a dataverse called "TinySocial" to hold our datatypes and datasets.
+The AsterixDB data model (ADM) is essentially a superset of JSON---it's what you get by extending
+JSON with more data types and additional data modeling constructs borrowed from object databases.
+The following is how we can create the TinySocial dataverse plus a set of ADM types for modeling
+Twitter users, their Tweets, Facebook users, their users' employment information, and their messages.
+(Note: Keep in mind that this is just a tiny and somewhat silly example intended for illustrating
+some of the key features of AsterixDB. :-))
+
+
+ drop dataverse TinySocial if exists;
+ create dataverse TinySocial;
+ use dataverse TinySocial;
+
+ create type TwitterUserType as open {
+ screen-name: string,
+ lang: string,
+ friends_count: int32,
+ statuses_count: int32,
+ name: string,
+ followers_count: int32
+ }
+
+ create type TweetMessageType as closed {
+ tweetid: string,
+ user: TwitterUserType,
+ sender-location: point?,
+ send-time: datetime,
+ referred-topics: {{ string }},
+ message-text: string
+ }
+
+ create type EmploymentType as open {
+ organization-name: string,
+ start-date: date,
+ end-date: date?
+ }
+
+ create type FacebookUserType as closed {
+ id: int32,
+ alias: string,
+ name: string,
+ user-since: datetime,
+ friend-ids: {{ int32 }},
+ employment: [EmploymentType]
+ }
+
+ create type FacebookMessageType as closed {
+ message-id: int32,
+ author-id: int32,
+ in-response-to: int32?,
+ sender-location: point?,
+ message: string
+ }
+
+
+
+The first three lines above tell AsterixDB to drop the old TinySocial dataverse, if one already
+exists, and then to create a brand new one and make it the focus of the statements that follow.
+The first type creation statement creates a datatype for holding information about Twitter users.
+It is a record type with a mix of integer and string data, very much like a (flat) relational tuple.
+The indicated fields are all mandatory, but because the type is open, additional fields are welcome.
+The second statement creates a datatype for Twitter messages; this shows how to specify a closed type.
+Interestingly (based on one of Twitter's APIs), each Twitter message actually embeds an instance of the
+sending user's information (current as of when the message was sent), so this is an example of a nested
+record in ADM.
+Twitter messages can optionally contain the sender's location, which is modeled via the sender-location
+field of spatial type _point_; the question mark following the field type indicates its optionality.
+An optional field is like a nullable field in SQL---it may be present or missing, but when it's present,
+its data type will conform to the datatype's specification.
+The send-time field illustrates the use of a temporal primitive type, _datetime_.
+Lastly, the referred-topics field illustrates another way that ADM is richer than the relational model;
+this field holds a bag (a.k.a. an unordered list) of strings.
+Since the overall datatype definition for Twitter messages says "closed", the fields that it lists are
+the only fields that instances of this type will be allowed to contain.
+The next two create type statements create a record type for holding information about one component of
+the employment history of a Facebook user and then a record type for holding the user information itself.
+The Facebook user type highlights a few additional ADM data model features.
+Its friend-ids field is a bag of integers, presumably the Facebook user ids for this user's friends,
+and its employment field is an ordered list of employment records.
+The final create type statement defines a type for handling the content of a Facebook message in our
+hypothetical social data storage scenario.
+
+Before going on, we need to once again emphasize the idea that AsterixDB is aimed at storing
+and querying not just Big Data, but Big _Semistructured_ Data.
+This means that most of the fields listed in the create type statements above could have been
+omitted without changing anything other than the resulting size of stored data instances on disk.
+AsterixDB stores its information about the fields defined a priori as separate metadata, whereas
+the information about other fields that are "just there" in instances of open datatypes is stored
+with each instance---making for more bits on disk and longer times for operations affected by
+data size (e.g., dataset scans).
+The only fields that _must_ be specified a priori are the primary key and any fields that you
+would like to build indexes on.
+(AsterixDB does not yet support auto-generated keys or indexes on the unspecified "open" fields
+of its data instances).
+
+### Creating Datasets and Indexes ###
+
+Now that we have defined our datatypes, we can move on and create datasets to store the actual data.
+(If we wanted to, we could even have several named datasets based on any one of these datatypes.)
+We can do this as follows, utilizing the DDL capabilities of AsterixDB.
+
+
+
+ use dataverse TinySocial;
+
+ create dataset FacebookUsers(FacebookUserType)
+ primary key id;
+
+ create dataset FacebookMessages(FacebookMessageType)
+ primary key message-id;
+
+ create dataset TwitterUsers(TwitterUserType)
+ primary key screen-name;
+
+ create dataset TweetMessages(TweetMessageType)
+ primary key tweetid
+ hints(cardinality=100);
+
+ create index fbUserSinceIdx on FacebookUsers(user-since);
+ create index fbAuthorIdx on FacebookMessages(author-id) type btree;
+ create index fbSenderLocIndex on FacebookMessages(sender-location) type rtree;
+ create index fbMessageIdx on FacebookMessages(message) type keyword;
+
+ for $ds in dataset Metadata.Dataset return $ds;
+ for $ix in dataset Metadata.Index return $ix;
+
+
+
+The ADM DDL statements above create four datasets for holding our social data in the TinySocial
+dataverse: FacebookUsers, FacebookMessages, TwitterUsers, and TweetMessages.
+The first statement creates the FacebookUsers dataset.
+It specifies that this dataset will store data instances conforming to FacebookUserType and that
+it has a primary key which is the id field of each instance.
+The primary key information is used by AsterixDB to uniquely identify instances for the purpose
+of later lookup and for use in secondary indexes.
+Each AsterixDB dataset is stored (and indexed) in the form of a B+ tree on primary key;
+secondary indexes point to their indexed data by primary key.
+In AsterixDB clusters, the primary key is also used to hash-partition (a.k.a. shard) the
+dataset across the nodes of the cluster.
+The next three create dataset statements are similar.
+The last one illustrates an optional clause for providing useful hints to AsterixDB.
+In this case, the hint tells AsterixDB that the dataset definer is anticipating that the
+TweetMessages dataset will contain roughly 100 objects; knowing this can help AsterixDB
+to more efficiently manage and query this dataset.
+(AsterixDB does not yet gather and maintain data statistics; it will currently, arbitrarily,
+assume a cardinality of one million objects per dataset in the absence of such an optional
+definition-time hint.)
+
+The create dataset statements above are followed by four more DDL statements, each of which
+creates a secondary index on a field of one of the datasets.
+The first one indexes the FacebookUsers dataset on its user-since field.
+This index will be a B+ tree index; its type is unspecified and _btree_ is the default type.
+The other three illustrate how you can explicitly specify the desired type of index.
+In addition to btree, _rtree_ and inverted _keyword_ indexes are supported by AsterixDB.
+Indexes can also have composite keys, and there are more advanced flavors of text indexing
+available as well (_fuzzy keyword_ and _ngram(k)_, where _k_ is the desired gram length).
+
+### Querying the Metadata Dataverse ###
+
+The last two statements above show how you can use queries in AQL to examine the AsterixDB
+system catalogs and tell what artifacts you have created.
+Just as relational DBMSs use their own tables to store their catalogs, AsterixDB uses
+its own datasets to persist descriptions of its datasets, datatypes, indexes, and so on.
+Running the first of the two queries above will list all of your newly created datasets,
+and it will also show you a full list of all the metadata datasets.
+(You can then explore from there on your own if you are curious.)
+These last two queries also illustrate one other factoid worth knowing:
+AsterixDB allows queries to span dataverses by allowing the optional use
+of fully-qualified dataset names (i.e., _dataversename.datasetname_)
+to reference datasets that live in a dataverse other than the one that
+was named in the most recently executed _use dataverse_ directive.
+
+----
+## Loading Data Into AsterixDB ##
+Okay, so far so good---AsterixDB is now ready for data, so let's give it some data to store.
+Our next task will be to load some sample data into the four datasets that we just defined.
+Here we will load a tiny set of records, defined in ADM format (a superset of JSON), into each dataset.
+In the boxes below you can see the actual data instances contained in each of the provided sample files.
+In order to load this data yourself, you should first store the four corresponding `.adm` files
+(whose URLs are indicated on top of each box below) into a filesystem directory accessible to your
+running AsterixDB instance.
+Take a few minutes to look carefully at each of the sample data sets.
+This will give you a better sense of the nature of the data that we are about to load and query.
+We should note that ADM format is a textual serialization of what AsterixDB will actually store;
+when persisted in AsterixDB, the data format will be binary and the data in the predefined fields
+of the data instances will be stored separately from their associated field name and type metadata.
+
+[Twitter Users](http://asterixdb.googlecode.com/files/twu.adm)
+
+ {"screen-name":"NathanGiesen@211","lang":"en","friends_count":18,"statuses_count":473,"name":"Nathan Giesen","followers_count":49416}
+ {"screen-name":"ColineGeyer@63","lang":"en","friends_count":121,"statuses_count":362,"name":"Coline Geyer","followers_count":17159}
+ {"screen-name":"NilaMilliron_tw","lang":"en","friends_count":445,"statuses_count":164,"name":"Nila Milliron","followers_count":22649}
+ {"screen-name":"ChangEwing_573","lang":"en","friends_count":182,"statuses_count":394,"name":"Chang Ewing","followers_count":32136}
+
+[Tweet Messages](http://asterixdb.googlecode.com/files/twm.adm)
+
+ {"tweetid":"1","user":{"screen-name":"NathanGiesen@211","lang":"en","friends_count":39339,"statuses_count":473,"name":"Nathan Giesen","followers_count":49416},"sender-location":point("47.44,80.65"),"send-time":datetime("2008-04-26T10:10:00"),"referred-topics":{{"t-mobile","customization"}},"message-text":" love t-mobile its customization is good:)"}
+ {"tweetid":"2","user":{"screen-name":"ColineGeyer@63","lang":"en","friends_count":121,"statuses_count":362,"name":"Coline Geyer","followers_count":17159},"sender-location":point("32.84,67.14"),"send-time":datetime("2010-05-13T10:10:00"),"referred-topics":{{"verizon","shortcut-menu"}},"message-text":" like verizon its shortcut-menu is awesome:)"}
+ {"tweetid":"3","user":{"screen-name":"NathanGiesen@211","lang":"en","friends_count":39339,"statuses_count":473,"name":"Nathan Giesen","followers_count":49416},"sender-location":point("29.72,75.8"),"send-time":datetime("2006-11-04T10:10:00"),"referred-topics":{{"motorola","speed"}},"message-text":" like motorola the speed is good:)"}
+ {"tweetid":"4","user":{"screen-name":"NathanGiesen@211","lang":"en","friends_count":39339,"statuses_count":473,"name":"Nathan Giesen","followers_count":49416},"sender-location":point("39.28,70.48"),"send-time":datetime("2011-12-26T10:10:00"),"referred-topics":{{"sprint","voice-command"}},"message-text":" like sprint the voice-command is mind-blowing:)"}
+ {"tweetid":"5","user":{"screen-name":"NathanGiesen@211","lang":"en","friends_count":39339,"statuses_count":473,"name":"Nathan Giesen","followers_count":49416},"sender-location":point("40.09,92.69"),"send-time":datetime("2006-08-04T10:10:00"),"referred-topics":{{"motorola","speed"}},"message-text":" can't stand motorola its speed is terrible:("}
+ {"tweetid":"6","user":{"screen-name":"ColineGeyer@63","lang":"en","friends_count":121,"statuses_count":362,"name":"Coline Geyer","followers_count":17159},"sender-location":point("47.51,83.99"),"send-time":datetime("2010-05-07T10:10:00"),"referred-topics":{{"iphone","voice-clarity"}},"message-text":" like iphone the voice-clarity is good:)"}
+ {"tweetid":"7","user":{"screen-name":"ChangEwing_573","lang":"en","friends_count":182,"statuses_count":394,"name":"Chang Ewing","followers_count":32136},"sender-location":point("36.21,72.6"),"send-time":datetime("2011-08-25T10:10:00"),"referred-topics":{{"samsung","platform"}},"message-text":" like samsung the platform is good"}
+ {"tweetid":"8","user":{"screen-name":"NathanGiesen@211","lang":"en","friends_count":39339,"statuses_count":473,"name":"Nathan Giesen","followers_count":49416},"sender-location":point("46.05,93.34"),"send-time":datetime("2005-10-14T10:10:00"),"referred-topics":{{"t-mobile","shortcut-menu"}},"message-text":" like t-mobile the shortcut-menu is awesome:)"}
+ {"tweetid":"9","user":{"screen-name":"NathanGiesen@211","lang":"en","friends_count":39339,"statuses_count":473,"name":"Nathan Giesen","followers_count":49416},"sender-location":point("36.86,74.62"),"send-time":datetime("2012-07-21T10:10:00"),"referred-topics":{{"verizon","voicemail-service"}},"message-text":" love verizon its voicemail-service is awesome"}
+ {"tweetid":"10","user":{"screen-name":"ColineGeyer@63","lang":"en","friends_count":121,"statuses_count":362,"name":"Coline Geyer","followers_count":17159},"sender-location":point("29.15,76.53"),"send-time":datetime("2008-01-26T10:10:00"),"referred-topics":{{"verizon","voice-clarity"}},"message-text":" hate verizon its voice-clarity is OMG:("}
+ {"tweetid":"11","user":{"screen-name":"NilaMilliron_tw","lang":"en","friends_count":445,"statuses_count":164,"name":"Nila Milliron","followers_count":22649},"sender-location":point("37.59,68.42"),"send-time":datetime("2008-03-09T10:10:00"),"referred-topics":{{"iphone","platform"}},"message-text":" can't stand iphone its platform is terrible"}
+ {"tweetid":"12","user":{"screen-name":"OliJackson_512","lang":"en","friends_count":445,"statuses_count":164,"name":"Oli Jackson","followers_count":22649},"sender-location":point("24.82,94.63"),"send-time":datetime("2010-02-13T10:10:00"),"referred-topics":{{"samsung","voice-command"}},"message-text":" like samsung the voice-command is amazing:)"}
+
+[Facebook Users](http://asterixdb.googlecode.com/files/fbu.adm)
+
+ {"id":1,"alias":"Margarita","name":"MargaritaStoddard","user-since":datetime("2012-08-20T10:10:00"),"friend-ids":{{2,3,6,10}},"employment":[{"organization-name":"Codetechno","start-date":date("2006-08-06")}]}
+ {"id":2,"alias":"Isbel","name":"IsbelDull","user-since":datetime("2011-01-22T10:10:00"),"friend-ids":{{1,4}},"employment":[{"organization-name":"Hexviafind","start-date":date("2010-04-27")}]}
+ {"id":3,"alias":"Emory","name":"EmoryUnk","user-since":datetime("2012-07-10T10:10:00"),"friend-ids":{{1,5,8,9}},"employment":[{"organization-name":"geomedia","start-date":date("2010-06-17"),"end-date":date("2010-01-26")}]}
+ {"id":4,"alias":"Nicholas","name":"NicholasStroh","user-since":datetime("2010-12-27T10:10:00"),"friend-ids":{{2}},"employment":[{"organization-name":"Zamcorporation","start-date":date("2010-06-08")}]}
+ {"id":5,"alias":"Von","name":"VonKemble","user-since":datetime("2010-01-05T10:10:00"),"friend-ids":{{3,6,10}},"employment":[{"organization-name":"Kongreen","start-date":date("2010-11-27")}]}
+ {"id":6,"alias":"Willis","name":"WillisWynne","user-since":datetime("2005-01-17T10:10:00"),"friend-ids":{{1,3,7}},"employment":[{"organization-name":"jaydax","start-date":date("2009-05-15")}]}
+ {"id":7,"alias":"Suzanna","name":"SuzannaTillson","user-since":datetime("2012-08-07T10:10:00"),"friend-ids":{{6}},"employment":[{"organization-name":"Labzatron","start-date":date("2011-04-19")}]}
+ {"id":8,"alias":"Nila","name":"NilaMilliron","user-since":datetime("2008-01-01T10:10:00"),"friend-ids":{{3}},"employment":[{"organization-name":"Plexlane","start-date":date("2010-02-28")}]}
+ {"id":9,"alias":"Woodrow","name":"WoodrowNehling","user-since":datetime("2005-09-20T10:10:00"),"friend-ids":{{3,10}},"employment":[{"organization-name":"Zuncan","start-date":date("2003-04-22"),"end-date":date("2009-12-13")}]}
+ {"id":10,"alias":"Bram","name":"BramHatch","user-since":datetime("2010-10-16T10:10:00"),"friend-ids":{{1,5,9}},"employment":[{"organization-name":"physcane","start-date":date("2007-06-05"),"end-date":date("2011-11-05")}]}
+
+[Facebook Messages](http://asterixdb.googlecode.com/files/fbm.adm)
+
+ {"message-id":1,"author-id":3,"in-response-to":2,"sender-location":point("47.16,77.75"),"message":" love sprint its shortcut-menu is awesome:)"}
+ {"message-id":2,"author-id":1,"in-response-to":4,"sender-location":point("41.66,80.87"),"message":" dislike iphone its touch-screen is horrible"}
+ {"message-id":3,"author-id":2,"in-response-to":4,"sender-location":point("48.09,81.01"),"message":" like samsung the plan is amazing"}
+ {"message-id":4,"author-id":1,"in-response-to":2,"sender-location":point("37.73,97.04"),"message":" can't stand at&t the network is horrible:("}
+ {"message-id":5,"author-id":6,"in-response-to":2,"sender-location":point("34.7,90.76"),"message":" love sprint the customization is mind-blowing"}
+ {"message-id":6,"author-id":2,"in-response-to":1,"sender-location":point("31.5,75.56"),"message":" like t-mobile its platform is mind-blowing"}
+ {"message-id":7,"author-id":5,"in-response-to":15,"sender-location":point("32.91,85.05"),"message":" dislike sprint the speed is horrible"}
+ {"message-id":8,"author-id":1,"in-response-to":11,"sender-location":point("40.33,80.87"),"message":" like verizon the 3G is awesome:)"}
+ {"message-id":9,"author-id":3,"in-response-to":12,"sender-location":point("34.45,96.48"),"message":" love verizon its wireless is good"}
+ {"message-id":10,"author-id":1,"in-response-to":12,"sender-location":point("42.5,70.01"),"message":" can't stand motorola the touch-screen is terrible"}
+ {"message-id":11,"author-id":1,"in-response-to":1,"sender-location":point("38.97,77.49"),"message":" can't stand at&t its plan is terrible"}
+ {"message-id":12,"author-id":10,"in-response-to":6,"sender-location":point("42.26,77.76"),"message":" can't stand t-mobile its voicemail-service is OMG:("}
+ {"message-id":13,"author-id":10,"in-response-to":4,"sender-location":point("42.77,78.92"),"message":" dislike iphone the voice-command is bad:("}
+ {"message-id":14,"author-id":9,"in-response-to":12,"sender-location":point("41.33,85.28"),"message":" love at&t its 3G is good:)"}
+ {"message-id":15,"author-id":7,"in-response-to":11,"sender-location":point("44.47,67.11"),"message":" like iphone the voicemail-service is awesome"}
+
+
+It's loading time! We can use AQL _load_ statements to populate our datasets with the sample records shown above.
+The following shows how loading can be done for data stored in `.adm` files in your local filesystem.
+*Note:* You _MUST_ replace the `<Host Name>` and `<Absolute File Path>` placeholders in each load
+statement below with valid values based on the host IP address (or host name) for the machine and
+directory that you have downloaded the provided `.adm` files to.
+As you do so, be very, very careful to retain the two slashes in the load statements, i.e.,
+do not delete the two slashes that appear in front of the absolute path to your `.adm` files.
+(This will lead to a three-slash character sequence at the start of each load statement's file
+input path specification.)
+
+
+ use dataverse TinySocial;
+
+ load dataset FacebookUsers using localfs
+ (("path"="<Host Name>://<Absolute File Path>/fbu.adm"),("format"="adm"));
+
+ load dataset FacebookMessages using localfs
+ (("path"="<Host Name>://<Absolute File Path>/fbm.adm"),("format"="adm"));
+
+ load dataset TwitterUsers using localfs
+ (("path"="<Host Name>://<Absolute File Path>/twu.adm"),("format"="adm"));
+
+ load dataset TweetMessages using localfs
+ (("path"="<Host Name>://<Absolute File Path>/twm.adm"),("format"="adm"));
+
+
+----
+## AQL: Querying Your AsterixDB Data ##
+Congratulations! You now have sample social data stored (and indexed) in AsterixDB.
+(You are part of an elite and adventurous group of individuals. :-))
+Now that you have successfully loaded the provided sample data into the datasets that we defined,
+you can start running queries against them.
+
+The query language for AsterixDB is AQL---the Asterix Query Language.
+AQL is loosely based on XQuery, the language developed and standardized in the early to mid 2000's
+by the World Wide Web Consortium (W3C) for querying semistructured data stored in their XML format.
+We have tossed all of the "XML cruft" out of their language but retained many of its core ideas.
+We did this because its design was developed over a period of years by a diverse committee of smart
+and experienced language designers, including "SQL people", "functional programming people", and
+"XML people", all of whom were focused on how to design a new query language that operates well over
+semistructured data.
+(We decided to stand on their shoulders instead of starting from scratch and revisiting many of the
+same issues.)
+Note that AQL is not SQL and not based on SQL: In other words, AsterixDB is fully "NoSQL compliant". :-)
+
+In this section we introduce AQL via a set of example queries, along with their expected results,
+based on the data above, to help you get started.
+Many of the most important features of AQL are presented in this set of representative queries.
+You can find a BNF description of the current AQL grammar at [wiki:AsterixDBGrammar], and someday
+in the not-too-distant future we will also provide a complete reference manual for the language.
+In the meantime, this will get you started down the path of using AsterixDB.
+A more complete list of the supported AsterixDB primitive types and built-in functions can be
+found at [AsterixDataTypesAndFunctions](AsterixDataTypesAndFunctions.html).
+
+AQL is an expression language.
+Even the expression 1+1 is a valid AQL query that evaluates to 2.
+(Try it for yourself!
+Okay, maybe that's _not_ the best use of a 512-node shared-nothing compute cluster.)
+Most useful AQL queries will be based on the _FLWOR_ (pronounced "flower") expression structure
+that AQL has borrowed from XQuery ([FLWOR](http://en.wikipedia.org/wiki/FLWOR)).
+The FLWOR expression syntax supports both the incremental binding (_for_) of variables to ADM data
+instances in a dataset (or in the result of any AQL expression, actually) and the full binding (_let_)
+of variables to entire intermediate results in a fashion similar to temporary views in the SQL world.
+FLWOR is an acronym that is short for _for_-_let_-_where_-_order by_-_return_,
+naming five of the most frequently used clauses from the syntax of a full AQL query.
+AQL also includes _group by_ and _limit_ clauses, as you will see shortly.
+Roughly speaking, for SQL aficionados, the _for_ clause in AQL is like the _from_ clause in SQL,
+the _return_ clause in AQL is like the _select_ clause in SQL (but appears at the end instead of
+the beginning of a query), the _let_ clause in AQL is like SQL's _with_ clause, and the _where_
+and _order by_ clauses in both languages are similar.
+
+Enough talk!
+Let's go ahead and try writing some queries and see about learning AQL by example.
+
+### Query 0-A - Exact-Match Lookup ###
+For our first query, let's find a Facebook user based on his or her user id.
+Suppose the user we want is the user whose id is 8:
+
+
+ use dataverse TinySocial;
+
+ for $user in dataset FacebookUsers
+ where $user.id = 8
+ return $user;
+
+The query's _for_ clause binds the variable `$user` incrementally to the data instances residing in
+the dataset named FacebookUsers.
+Its _where_ clause selects only those bindings having a user id of interest, filtering out the rest.
+The _return_ clause returns the (entire) data instance for each binding that satisfies the predicate.
+Since this dataset is indexed on user id (its primary key), this query will be done via a quick index lookup.
+
+The expected result for our sample data is as follows:
+
+ { "id": 8, "alias": "Nila", "name": "NilaMilliron", "user-since": datetime("2008-01-01T10:10:00.000Z"), "friend-ids": {{ 3 }}, "employment": [ { "organization-name": "Plexlane", "start-date": date("2010-02-28"), "end-date": null } ] }
+
+
+### Query 0-B - Range Scan ###
+AQL, like SQL, supports a variety of different predicates.
+For example, for our next query, let's find the Facebook users whose ids are in the range between 2 and 4:
+
+ use dataverse TinySocial;
+
+ for $user in dataset FacebookUsers
+ where $user.id >= 2 and $user.id <= 4
+ return $user;
+
+This query's expected result, also evaluable using the primary index on user id, is:
+
+ { "id": 2, "alias": "Isbel", "name": "IsbelDull", "user-since": datetime("2011-01-22T10:10:00.000Z"), "friend-ids": {{ 1, 4 }}, "employment": [ { "organization-name": "Hexviafind", "start-date": date("2010-04-27"), "end-date": null } ] }
+ { "id": 3, "alias": "Emory", "name": "EmoryUnk", "user-since": datetime("2012-07-10T10:10:00.000Z"), "friend-ids": {{ 1, 5, 8, 9 }}, "employment": [ { "organization-name": "geomedia", "start-date": date("2010-06-17"), "end-date": date("2010-01-26") } ] }
+ { "id": 4, "alias": "Nicholas", "name": "NicholasStroh", "user-since": datetime("2010-12-27T10:10:00.000Z"), "friend-ids": {{ 2 }}, "employment": [ { "organization-name": "Zamcorporation", "start-date": date("2010-06-08"), "end-date": null } ] }
+
+
+### Query 1 - Other Query Filters ###
+AQL can do range queries on any data type that supports the appropriate set of comparators.
+As an example, this next query retrieves the Facebook users who joined between July 22, 2010 and July 29, 2012:
+
+ use dataverse TinySocial;
+
+ for $user in dataset FacebookUsers
+ where $user.user-since >= datetime('2010-07-22T00:00:00')
+ and $user.user-since <= datetime('2012-07-29T23:59:59')
+ return $user;
+
+The expected result for this query, also an indexable query, is as follows:
+
+ { "id": 2, "alias": "Isbel", "name": "IsbelDull", "user-since": datetime("2011-01-22T10:10:00.000Z"), "friend-ids": {{ 1, 4 }}, "employment": [ { "organization-name": "Hexviafind", "start-date": date("2010-04-27"), "end-date": null } ] }
+ { "id": 3, "alias": "Emory", "name": "EmoryUnk", "user-since": datetime("2012-07-10T10:10:00.000Z"), "friend-ids": {{ 1, 5, 8, 9 }}, "employment": [ { "organization-name": "geomedia", "start-date": date("2010-06-17"), "end-date": date("2010-01-26") } ] }
+ { "id": 4, "alias": "Nicholas", "name": "NicholasStroh", "user-since": datetime("2010-12-27T10:10:00.000Z"), "friend-ids": {{ 2 }}, "employment": [ { "organization-name": "Zamcorporation", "start-date": date("2010-06-08"), "end-date": null } ] }
+ { "id": 10, "alias": "Bram", "name": "BramHatch", "user-since": datetime("2010-10-16T10:10:00.000Z"), "friend-ids": {{ 1, 5, 9 }}, "employment": [ { "organization-name": "physcane", "start-date": date("2007-06-05"), "end-date": date("2011-11-05") } ] }
+
+
+### Query 2-A - Equijoin ###
+In addition to simply binding variables to data instances and returning them "whole",
+an AQL query can construct new ADM instances to return based on combinations of its variable bindings.
+This gives AQL the power to do joins much like those done using multi-table _from_ clauses in SQL.
+For example, suppose we wanted a list of all Facebook users paired with their associated messages,
+with the list enumerating the author name and the message text associated with each Facebook message.
+We could do this as follows in AQL:
+
+ use dataverse TinySocial;
+
+ for $user in dataset FacebookUsers
+ for $message in dataset FacebookMessages
+ where $message.author-id = $user.id
+ return {
+ "uname": $user.name,
+ "message": $message.message
+ };
+
+The result of this query is a sequence of new ADM instances, one for each author/message pair.
+Each instance in the result will be an ADM record containing two fields, "uname" and "message",
+containing the user's name and the message text, respectively, for each author/message pair.
+(Note that "uname" and "message" are both simple AQL expressions themselves---so in the most
+general case, even the resulting field names can be computed as part of the query, making AQL
+a very powerful tool for slicing and dicing semistructured data.)
+
+The expected result of this example AQL join query for our sample data set is:
+
+ { "uname": "MargaritaStoddard", "message": " dislike iphone its touch-screen is horrible" }
+ { "uname": "MargaritaStoddard", "message": " can't stand at&t the network is horrible:(" }
+ { "uname": "MargaritaStoddard", "message": " like verizon the 3G is awesome:)" }
+ { "uname": "MargaritaStoddard", "message": " can't stand motorola the touch-screen is terrible" }
+ { "uname": "MargaritaStoddard", "message": " can't stand at&t its plan is terrible" }
+ { "uname": "IsbelDull", "message": " like samsung the plan is amazing" }
+ { "uname": "IsbelDull", "message": " like t-mobile its platform is mind-blowing" }
+ { "uname": "EmoryUnk", "message": " love sprint its shortcut-menu is awesome:)" }
+ { "uname": "EmoryUnk", "message": " love verizon its wireless is good" }
+ { "uname": "VonKemble", "message": " dislike sprint the speed is horrible" }
+ { "uname": "WillisWynne", "message": " love sprint the customization is mind-blowing" }
+ { "uname": "SuzannaTillson", "message": " like iphone the voicemail-service is awesome" }
+ { "uname": "WoodrowNehling", "message": " love at&t its 3G is good:)" }
+ { "uname": "BramHatch", "message": " can't stand t-mobile its voicemail-service is OMG:(" }
+ { "uname": "BramHatch", "message": " dislike iphone the voice-command is bad:(" }
+
+
+### Query 2-B - Index join ###
+By default, AsterixDB evaluates equijoin queries using hash-based join methods that work
+well for doing ad hoc joins of very large data sets
+([http://en.wikipedia.org/wiki/Hash_join](http://en.wikipedia.org/wiki/Hash_join)).
+On a cluster, hash partitioning is employed as AsterixDB's divide-and-conquer strategy for
+computing large parallel joins.
+AsterixDB includes other join methods, but in the absence of data statistics and selectivity
+estimates, it doesn't (yet) have the know-how to intelligently choose among its alternatives.
+We therefore asked ourselves the classic question---WWOD?---What Would Oracle Do?---and in the
+interim, AQL includes a clunky (but useful) hint-based mechanism for addressing the occasional
+need to suggest to AsterixDB which join method it should use for a particular AQL query.
+
+The following query is similar to Query 2-A but includes a suggestion to AsterixDB that it
+should consider employing an index-based nested-loop join technique to process the query:
+
+ use dataverse TinySocial;
+
+ for $user in dataset FacebookUsers
+ for $message in dataset FacebookMessages
+ where $message.author-id /*+ indexnl */ = $user.id
+ return {
+ "uname": $user.name,
+ "message": $message.message
+ };
+
+
+The expected result is (of course) the same as before, modulo the order of the instances.
+Result ordering is (intentionally) undefined in AQL in the absence of an _order by_ clause.
+The query result for our sample data in this case is:
+
+ { "uname": "EmoryUnk", "message": " love sprint its shortcut-menu is awesome:)" }
+ { "uname": "MargaritaStoddard", "message": " dislike iphone its touch-screen is horrible" }
+ { "uname": "IsbelDull", "message": " like samsung the plan is amazing" }
+ { "uname": "MargaritaStoddard", "message": " can't stand at&t the network is horrible:(" }
+ { "uname": "WillisWynne", "message": " love sprint the customization is mind-blowing" }
+ { "uname": "IsbelDull", "message": " like t-mobile its platform is mind-blowing" }
+ { "uname": "VonKemble", "message": " dislike sprint the speed is horrible" }
+ { "uname": "MargaritaStoddard", "message": " like verizon the 3G is awesome:)" }
+ { "uname": "EmoryUnk", "message": " love verizon its wireless is good" }
+ { "uname": "MargaritaStoddard", "message": " can't stand motorola the touch-screen is terrible" }
+ { "uname": "MargaritaStoddard", "message": " can't stand at&t its plan is terrible" }
+ { "uname": "BramHatch", "message": " can't stand t-mobile its voicemail-service is OMG:(" }
+ { "uname": "BramHatch", "message": " dislike iphone the voice-command is bad:(" }
+ { "uname": "WoodrowNehling", "message": " love at&t its 3G is good:)" }
+ { "uname": "SuzannaTillson", "message": " like iphone the voicemail-service is awesome" }
+
+
+(It is worth knowing, with respect to influencing AsterixDB's query evaluation, that nested _for_
+clauses---a.k.a. joins---are currently evaluated with the "outer" clause probing the data of the "inner"
+clause.)
+
+### Query 3 - Nested Outer Join ###
+In order to support joins between tables with missing/dangling join tuples, the designers of SQL ended
+up shoe-horning a subset of the relational algebra into SQL's _from_ clause syntax---and providing a
+variety of join types there for users to choose from.
+Left outer joins are particularly important in SQL, e.g., to print a summary of customers and orders,
+grouped by customer, without omitting those customers who haven't placed any orders yet.
+
+The AQL language supports nesting, both of queries and of query results, and the combination allows for
+an arguably cleaner/more natural approach to such queries.
+As an example, suppose we wanted, for each Facebook user, to produce a record that has his/her name
+plus a list of the messages written by that user.
+In SQL, this would involve a left outer join between users and messages, grouping by user, and having
+the user name repeated alongside each message.
+In AQL, this sort of use case can be handled (more naturally) as follows:
+
+ use dataverse TinySocial;
+
+ for $user in dataset FacebookUsers
+ return {
+ "uname": $user.name,
+ "messages": for $message in dataset FacebookMessages
+ where $message.author-id = $user.id
+ return $message.message
+ };
+
+This AQL query binds the variable `$user` to the data instances in FacebookUsers;
+for each user, it constructs a result record containing a "uname" field with the user's
+name and a "messages" field with a nested collection of all messages for that user.
+The nested collection for each user is specified by using a correlated subquery.
+(Note: While it looks like nested loops could be involved in computing the result,
+AsterixDB recognizes the equivalence of such a query to an outer join, and it will
+use an efficient hash-based strategy when actually computing the query's result.)
+
+Here is this example query's expected output:
+
+ { "uname": "MargaritaStoddard", "messages": [ " dislike iphone its touch-screen is horrible", " can't stand at&t the network is horrible:(", " like verizon the 3G is awesome:)", " can't stand motorola the touch-screen is terrible", " can't stand at&t its plan is terrible" ] }
+ { "uname": "IsbelDull", "messages": [ " like samsung the plan is amazing", " like t-mobile its platform is mind-blowing" ] }
+ { "uname": "EmoryUnk", "messages": [ " love sprint its shortcut-menu is awesome:)", " love verizon its wireless is good" ] }
+ { "uname": "NicholasStroh", "messages": [ ] }
+ { "uname": "VonKemble", "messages": [ " dislike sprint the speed is horrible" ] }
+ { "uname": "WillisWynne", "messages": [ " love sprint the customization is mind-blowing" ] }
+ { "uname": "SuzannaTillson", "messages": [ " like iphone the voicemail-service is awesome" ] }
+ { "uname": "NilaMilliron", "messages": [ ] }
+ { "uname": "WoodrowNehling", "messages": [ " love at&t its 3G is good:)" ] }
+ { "uname": "BramHatch", "messages": [ " dislike iphone the voice-command is bad:(", " can't stand t-mobile its voicemail-service is OMG:(" ] }
+
+
+### Query 4 - Theta Join ###
+Not all joins are expressible as equijoins and computable using equijoin-oriented algorithms.
+The join predicates for some use cases involve predicates with functions; AsterixDB supports the
+expression of such queries and will still evaluate them as best it can using nested loop based
+techniques (and broadcast joins in the parallel case).
+
+As an example of such a use case, suppose that we wanted, for each tweet T, to find all of the
+other tweets that originated from within a circle of radius 1 surrounding tweet T's location.
+In AQL, this can be specified in a manner similar to the previous query using one of the built-in
+functions on the spatial data type instead of id equality in the correlated query's _where_ clause:
+
+ use dataverse TinySocial;
+
+ for $t in dataset TweetMessages
+ return {
+ "message": $t.message-text,
+ "nearby-messages": for $t2 in dataset TweetMessages
+ where spatial-distance($t.sender-location, $t2.sender-location) <= 1
+ return { "msgtxt":$t2.message-text}
+ };
+
+Here is the expected result for this query:
+
+ { "message": " love t-mobile its customization is good:)", "nearby-messages": [ { "msgtxt": " love t-mobile its customization is good:)" } ] }
+ { "message": " hate verizon its voice-clarity is OMG:(", "nearby-messages": [ { "msgtxt": " like motorola the speed is good:)" }, { "msgtxt": " hate verizon its voice-clarity is OMG:(" } ] }
+ { "message": " can't stand iphone its platform is terrible", "nearby-messages": [ { "msgtxt": " can't stand iphone its platform is terrible" } ] }
+ { "message": " like samsung the voice-command is amazing:)", "nearby-messages": [ { "msgtxt": " like samsung the voice-command is amazing:)" } ] }
+ { "message": " like verizon its shortcut-menu is awesome:)", "nearby-messages": [ { "msgtxt": " like verizon its shortcut-menu is awesome:)" } ] }
+ { "message": " like motorola the speed is good:)", "nearby-messages": [ { "msgtxt": " hate verizon its voice-clarity is OMG:(" }, { "msgtxt": " like motorola the speed is good:)" } ] }
+ { "message": " like sprint the voice-command is mind-blowing:)", "nearby-messages": [ { "msgtxt": " like sprint the voice-command is mind-blowing:)" } ] }
+ { "message": " can't stand motorola its speed is terrible:(", "nearby-messages": [ { "msgtxt": " can't stand motorola its speed is terrible:(" } ] }
+ { "message": " like iphone the voice-clarity is good:)", "nearby-messages": [ { "msgtxt": " like iphone the voice-clarity is good:)" } ] }
+ { "message": " like samsung the platform is good", "nearby-messages": [ { "msgtxt": " like samsung the platform is good" } ] }
+ { "message": " like t-mobile the shortcut-menu is awesome:)", "nearby-messages": [ { "msgtxt": " like t-mobile the shortcut-menu is awesome:)" } ] }
+ { "message": " love verizon its voicemail-service is awesome", "nearby-messages": [ { "msgtxt": " love verizon its voicemail-service is awesome" } ] }
+
+
+### Query 5 - Fuzzy Join ###
+As another example of a non-equijoin use case, we could ask AsterixDB to find, for each Facebook user,
+all Twitter users with names "similar" to their name.
+AsterixDB supports a variety of "fuzzy match" functions for use with textual and set-based data.
+As one example, we could choose to use edit distance with a threshold of 3 as the definition of name
+similarity, in which case we could write the following query using AQL's operator-based syntax (~=)
+for testing whether or not two values are similar:
+
+ use dataverse TinySocial;
+
+ set simfunction "edit-distance";
+ set simthreshold "3";
+
+ for $fbu in dataset FacebookUsers
+ return {
+ "id": $fbu.id,
+ "name": $fbu.name,
+ "similar-users": for $t in dataset TweetMessages
+ let $tu := $t.user
+ where $tu.name ~= $fbu.name
+ return {
+ "twitter-screenname": $tu.screen-name,
+ "twitter-name": $tu.name
+ }
+ };
+
+The expected result for this query against our sample data is:
+
+ { "id": 1, "name": "MargaritaStoddard", "similar-users": [ ] }
+ { "id": 2, "name": "IsbelDull", "similar-users": [ ] }
+ { "id": 3, "name": "EmoryUnk", "similar-users": [ ] }
+ { "id": 4, "name": "NicholasStroh", "similar-users": [ ] }
+ { "id": 5, "name": "VonKemble", "similar-users": [ ] }
+ { "id": 6, "name": "WillisWynne", "similar-users": [ ] }
+ { "id": 7, "name": "SuzannaTillson", "similar-users": [ ] }
+ { "id": 8, "name": "NilaMilliron", "similar-users": [ { "twitter-screenname": "NilaMilliron_tw", "twitter-name": "Nila Milliron" } ] }
+ { "id": 9, "name": "WoodrowNehling", "similar-users": [ ] }
+ { "id": 10, "name": "BramHatch", "similar-users": [ ] }
+
+
+### Query 6 - Existential Quantification ###
+The expressive power of AQL includes support for queries involving "some" (existentially quantified)
+and "all" (universally quantified) query semantics.
+As an example of an existential AQL query, here we show a query to list the Facebook users who are currently employed.
+Such employees will have an employment history containing a record with a null end-date value, which leads us to the
+following AQL query:
+
+ use dataverse TinySocial;
+
+ for $fbu in dataset FacebookUsers
+ where (some $e in $fbu.employment satisfies is-null($e.end-date))
+ return $fbu;
+
+The expected result in this case is:
+
+ { "id": 1, "alias": "Margarita", "name": "MargaritaStoddard", "user-since": datetime("2012-08-20T10:10:00.000Z"), "friend-ids": {{ 2, 3, 6, 10 }}, "employment": [ { "organization-name": "Codetechno", "start-date": date("2006-08-06"), "end-date": null } ] }
+ { "id": 2, "alias": "Isbel", "name": "IsbelDull", "user-since": datetime("2011-01-22T10:10:00.000Z"), "friend-ids": {{ 1, 4 }}, "employment": [ { "organization-name": "Hexviafind", "start-date": date("2010-04-27"), "end-date": null } ] }
+ { "id": 4, "alias": "Nicholas", "name": "NicholasStroh", "user-since": datetime("2010-12-27T10:10:00.000Z"), "friend-ids": {{ 2 }}, "employment": [ { "organization-name": "Zamcorporation", "start-date": date("2010-06-08"), "end-date": null } ] }
+ { "id": 5, "alias": "Von", "name": "VonKemble", "user-since": datetime("2010-01-05T10:10:00.000Z"), "friend-ids": {{ 3, 6, 10 }}, "employment": [ { "organization-name": "Kongreen", "start-date": date("2010-11-27"), "end-date": null } ] }
+ { "id": 6, "alias": "Willis", "name": "WillisWynne", "user-since": datetime("2005-01-17T10:10:00.000Z"), "friend-ids": {{ 1, 3, 7 }}, "employment": [ { "organization-name": "jaydax", "start-date": date("2009-05-15"), "end-date": null } ] }
+ { "id": 7, "alias": "Suzanna", "name": "SuzannaTillson", "user-since": datetime("2012-08-07T10:10:00.000Z"), "friend-ids": {{ 6 }}, "employment": [ { "organization-name": "Labzatron", "start-date": date("2011-04-19"), "end-date": null } ] }
+ { "id": 8, "alias": "Nila", "name": "NilaMilliron", "user-since": datetime("2008-01-01T10:10:00.000Z"), "friend-ids": {{ 3 }}, "employment": [ { "organization-name": "Plexlane", "start-date": date("2010-02-28"), "end-date": null } ] }
+
+
+### Query 7 - Universal Quantification ###
+As an example of a universal AQL query, here we show a query to list the Facebook users who are currently unemployed.
+Such employees will have an employment history containing no records with null end-date values, leading us to the
+following AQL query:
+
+ use dataverse TinySocial;
+
+ for $fbu in dataset FacebookUsers
+ where (every $e in $fbu.employment satisfies not(is-null($e.end-date)))
+ return $fbu;
+
+Here is the expected result for our sample data:
+
+ { "id": 3, "alias": "Emory", "name": "EmoryUnk", "user-since": datetime("2012-07-10T10:10:00.000Z"), "friend-ids": {{ 1, 5, 8, 9 }}, "employment": [ { "organization-name": "geomedia", "start-date": date("2010-06-17"), "end-date": date("2010-01-26") } ] }
+ { "id": 9, "alias": "Woodrow", "name": "WoodrowNehling", "user-since": datetime("2005-09-20T10:10:00.000Z"), "friend-ids": {{ 3, 10 }}, "employment": [ { "organization-name": "Zuncan", "start-date": date("2003-04-22"), "end-date": date("2009-12-13") } ] }
+ { "id": 10, "alias": "Bram", "name": "BramHatch", "user-since": datetime("2010-10-16T10:10:00.000Z"), "friend-ids": {{ 1, 5, 9 }}, "employment": [ { "organization-name": "physcane", "start-date": date("2007-06-05"), "end-date": date("2011-11-05") } ] }
+
+
+### Query 8 - Simple Aggregation ###
+Like SQL, the AQL language of AsterixDB provides support for computing aggregates over large amounts of data.
+As a very simple example, the following AQL query computes the total number of Facebook users:
+
+ use dataverse TinySocial;
+
+ count(for $fbu in dataset FacebookUsers return $fbu);
+
+In AQL, aggregate functions can be applied to arbitrary subquery results; in this case, the count function
+is applied to the result of a query that enumerates the Facebook users. The expected result here is:
+
+ 10
+
+
+
+### Query 9-A - Grouping and Aggregation ###
+Also like SQL, AQL supports grouped aggregation.
+For every Twitter user, the following group-by/aggregate query counts the number of tweets sent by that user:
+
+ use dataverse TinySocial;
+
+ for $t in dataset TweetMessages
+ group by $uid := $t.user.screen-name with $t
+ return {
+ "user": $uid,
+ "count": count($t)
+ };
+
+The _for_ clause incrementally binds $t to tweets, and the _group by_ clause groups the tweets by their
+issuer's Twitter screen-name.
+Unlike SQL, where data is tabular---flat---the data model underlying AQL allows for nesting.
+Thus, following the _group by_ clause, the _return_ clause in this query sees a sequence of $t groups,
+with each such group having an associated $uid variable value (i.e., the tweeting user's screen name).
+In the context of the return clause, due to "... with $t ...", $uid is bound to the tweeter's id and $t
+is bound to the _set_ of tweets issued by that tweeter.
+The return clause constructs a result record containing the tweeter's user id and the count of the items
+in the associated tweet set.
+The query result will contain one such record per screen name.
+This query also illustrates another feature of AQL; notice that each user's screen name is accessed via a
+path syntax that traverses each tweet's nested record structure.
+
+Here is the expected result for this query over the sample data:
+
+ { "user": "ChangEwing_573", "count": 1 }
+ { "user": "ColineGeyer@63", "count": 3 }
+ { "user": "NathanGiesen@211", "count": 6 }
+ { "user": "NilaMilliron_tw", "count": 1 }
+ { "user": "OliJackson_512", "count": 1 }
+
+
+
+### Query 9-B - (Hash-Based) Grouping and Aggregation ###
+As for joins, AsterixDB has multiple evaluation strategies available for processing grouped aggregate queries.
+For grouped aggregation, the system knows how to employ both sort-based and hash-based aggregation methods,
+with sort-based methods being used by default and a hint being available to suggest that a different approach
+be used in processing a particular AQL query.
+
+The following query is similar to Query 9-A, but adds a hash-based aggregation hint:
+
+ use dataverse TinySocial;
+
+ for $t in dataset TweetMessages
+ /*+ hash*/
+ group by $uid := $t.user.screen-name with $t
+ return {
+ "user": $uid,
+ "count": count($t)
+ };
+
+Here is the expected result:
+
+ { "user": "OliJackson_512", "count": 1 }
+ { "user": "ColineGeyer@63", "count": 3 }
+ { "user": "NathanGiesen@211", "count": 6 }
+ { "user": "NilaMilliron_tw", "count": 1 }
+ { "user": "ChangEwing_573", "count": 1 }
+
+
+
+### Query 10 - Grouping and Limits ###
+In some use cases it is not necessary to compute the entire answer to a query.
+In some cases, just having the first _N_ or top _N_ results is sufficient.
+This is expressible in AQL using the _limit_ clause combined with the _order by_ clause.
+
+The following AQL query returns the top 3 Twitter users based on who has issued the most tweets:
+
+ use dataverse TinySocial;
+
+ for $t in dataset TweetMessages
+ group by $uid := $t.user.screen-name with $t
+ let $c := count($t)
+ order by $c desc
+ limit 3
+ return {
+ "user": $uid,
+ "count": $c
+ };
+
+The expected result for this query is:
+
+ { "user": "NathanGiesen@211", "count": 6 }
+ { "user": "ColineGeyer@63", "count": 3 }
+ { "user": "NilaMilliron_tw", "count": 1 }
+
+
+### Query 11 - Left Outer Fuzzy Join ###
+As a last example of AQL and its query power, the following query, for each tweet,
+finds all of the tweets that are similar based on the topics that they refer to:
+
+ use dataverse TinySocial;
+
+ set simfunction "jaccard";
+ set simthreshold "0.3";
+
+ for $t in dataset TweetMessages
+ return {
+ "tweet": $t,
+ "similar-tweets": for $t2 in dataset TweetMessages
+ where $t2.referred-topics ~= $t.referred-topics
+ and $t2.tweetid != $t.tweetid
+ return $t2.referred-topics
+ };
+
+This query illustrates several things worth knowing in order to write fuzzy queries in AQL.
+First, as mentioned earlier, AQL offers an operator-based syntax for seeing whether two values are "similar" to one another or not.
+Second, recall that the referred-topics field of records of datatype TweetMessageType is a bag of strings.
+This query sets the context for its similarity join by requesting that Jaccard-based similarity semantics
+([http://en.wikipedia.org/wiki/Jaccard_index](http://en.wikipedia.org/wiki/Jaccard_index))
+be used for the query's similarity operator and that a similarity index of 0.3 be used as its similarity threshold.
+
+The expected result for this fuzzy join query is:
+
+ { "tweet": { "tweetid": "1", "user": { "screen-name": "NathanGiesen@211", "lang": "en", "friends_count": 39339, "statuses_count": 473, "name": "Nathan Giesen", "followers_count": 49416 }, "sender-location": point("47.44,80.65"), "send-time": datetime("2008-04-26T10:10:00.000Z"), "referred-topics": {{ "t-mobile", "customization" }}, "message-text": " love t-mobile its customization is good:)" }, "similar-tweets": [ {{ "t-mobile", "shortcut-menu" }} ] }
+ { "tweet": { "tweetid": "10", "user": { "screen-name": "ColineGeyer@63", "lang": "en", "friends_count": 121, "statuses_count": 362, "name": "Coline Geyer", "followers_count": 17159 }, "sender-location": point("29.15,76.53"), "send-time": datetime("2008-01-26T10:10:00.000Z"), "referred-topics": {{ "verizon", "voice-clarity" }}, "message-text": " hate verizon its voice-clarity is OMG:(" }, "similar-tweets": [ {{ "iphone", "voice-clarity" }}, {{ "verizon", "voicemail-service" }}, {{ "verizon", "shortcut-menu" }} ] }
+ { "tweet": { "tweetid": "11", "user": { "screen-name": "NilaMilliron_tw", "lang": "en", "friends_count": 445, "statuses_count": 164, "name": "Nila Milliron", "followers_count": 22649 }, "sender-location": point("37.59,68.42"), "send-time": datetime("2008-03-09T10:10:00.000Z"), "referred-topics": {{ "iphone", "platform" }}, "message-text": " can't stand iphone its platform is terrible" }, "similar-tweets": [ {{ "iphone", "voice-clarity" }}, {{ "samsung", "platform" }} ] }
+ { "tweet": { "tweetid": "12", "user": { "screen-name": "OliJackson_512", "lang": "en", "friends_count": 445, "statuses_count": 164, "name": "Oli Jackson", "followers_count": 22649 }, "sender-location": point("24.82,94.63"), "send-time": datetime("2010-02-13T10:10:00.000Z"), "referred-topics": {{ "samsung", "voice-command" }}, "message-text": " like samsung the voice-command is amazing:)" }, "similar-tweets": [ {{ "samsung", "platform" }}, {{ "sprint", "voice-command" }} ] }
+ { "tweet": { "tweetid": "2", "user": { "screen-name": "ColineGeyer@63", "lang": "en", "friends_count": 121, "statuses_count": 362, "name": "Coline Geyer", "followers_count": 17159 }, "sender-location": point("32.84,67.14"), "send-time": datetime("2010-05-13T10:10:00.000Z"), "referred-topics": {{ "verizon", "shortcut-menu" }}, "message-text": " like verizon its shortcut-menu is awesome:)" }, "similar-tweets": [ {{ "verizon", "voicemail-service" }}, {{ "verizon", "voice-clarity" }}, {{ "t-mobile", "shortcut-menu" }} ] }
+ { "tweet": { "tweetid": "3", "user": { "screen-name": "NathanGiesen@211", "lang": "en", "friends_count": 39339, "statuses_count": 473, "name": "Nathan Giesen", "followers_count": 49416 }, "sender-location": point("29.72,75.8"), "send-time": datetime("2006-11-04T10:10:00.000Z"), "referred-topics": {{ "motorola", "speed" }}, "message-text": " like motorola the speed is good:)" }, "similar-tweets": [ {{ "motorola", "speed" }} ] }
+ { "tweet": { "tweetid": "4", "user": { "screen-name": "NathanGiesen@211", "lang": "en", "friends_count": 39339, "statuses_count": 473, "name": "Nathan Giesen", "followers_count": 49416 }, "sender-location": point("39.28,70.48"), "send-time": datetime("2011-12-26T10:10:00.000Z"), "referred-topics": {{ "sprint", "voice-command" }}, "message-text": " like sprint the voice-command is mind-blowing:)" }, "similar-tweets": [ {{ "samsung", "voice-command" }} ] }
+ { "tweet": { "tweetid": "5", "user": { "screen-name": "NathanGiesen@211", "lang": "en", "friends_count": 39339, "statuses_count": 473, "name": "Nathan Giesen", "followers_count": 49416 }, "sender-location": point("40.09,92.69"), "send-time": datetime("2006-08-04T10:10:00.000Z"), "referred-topics": {{ "motorola", "speed" }}, "message-text": " can't stand motorola its speed is terrible:(" }, "similar-tweets": [ {{ "motorola", "speed" }} ] }
+ { "tweet": { "tweetid": "6", "user": { "screen-name": "ColineGeyer@63", "lang": "en", "friends_count": 121, "statuses_count": 362, "name": "Coline Geyer", "followers_count": 17159 }, "sender-location": point("47.51,83.99"), "send-time": datetime("2010-05-07T10:10:00.000Z"), "referred-topics": {{ "iphone", "voice-clarity" }}, "message-text": " like iphone the voice-clarity is good:)" }, "similar-tweets": [ {{ "verizon", "voice-clarity" }}, {{ "iphone", "platform" }} ] }
+ { "tweet": { "tweetid": "7", "user": { "screen-name": "ChangEwing_573", "lang": "en", "friends_count": 182, "statuses_count": 394, "name": "Chang Ewing", "followers_count": 32136 }, "sender-location": point("36.21,72.6"), "send-time": datetime("2011-08-25T10:10:00.000Z"), "referred-topics": {{ "samsung", "platform" }}, "message-text": " like samsung the platform is good" }, "similar-tweets": [ {{ "iphone", "platform" }}, {{ "samsung", "voice-command" }} ] }
+ { "tweet": { "tweetid": "8", "user": { "screen-name": "NathanGiesen@211", "lang": "en", "friends_count": 39339, "statuses_count": 473, "name": "Nathan Giesen", "followers_count": 49416 }, "sender-location": point("46.05,93.34"), "send-time": datetime("2005-10-14T10:10:00.000Z"), "referred-topics": {{ "t-mobile", "shortcut-menu" }}, "message-text": " like t-mobile the shortcut-menu is awesome:)" }, "similar-tweets": [ {{ "t-mobile", "customization" }}, {{ "verizon", "shortcut-menu" }} ] }
+ { "tweet": { "tweetid": "9", "user": { "screen-name": "NathanGiesen@211", "lang": "en", "friends_count": 39339, "statuses_count": 473, "name": "Nathan Giesen", "followers_count": 49416 }, "sender-location": point("36.86,74.62"), "send-time": datetime("2012-07-21T10:10:00.000Z"), "referred-topics": {{ "verizon", "voicemail-service" }}, "message-text": " love verizon its voicemail-service is awesome" }, "similar-tweets": [ {{ "verizon", "voice-clarity" }}, {{ "verizon", "shortcut-menu" }} ] }
+
+
+### Inserting New Data ###
+In addition to loading and querying data, AsterixDB supports incremental additions to datasets via the AQL _insert_ statement.
+
+The following example adds a new tweet by user "NathanGiesen@211" to the TweetMessages dataset.
+(An astute reader may notice that this tweet was issued half an hour after his last tweet, so his counts
+have all gone up in the interim, although he appears not to have moved in the last half hour.)
+
+ use dataverse TinySocial;
+
+ insert into dataset TweetMessages
+ (
+ {"tweetid":"13",
+ "user":
+ {"screen-name":"NathanGiesen@211",
+ "lang":"en",
+ "friends_count":39345,
+ "statuses_count":479,
+ "name":"Nathan Giesen",
+ "followers_count":49420
+ },
+ "sender-location":point("47.44,80.65"),
+ "send-time":datetime("2008-04-26T10:10:35"),
+ "referred-topics":{{"tweeting"}},
+ "message-text":"tweety tweet, my fellow tweeters!"
+ }
+ );
+
+In general, the data to be inserted may be specified using any valid AQL query expression.
+The insertion of a single object instance, as in this example, is just a special case where
+the query expression happens to be a record constructor involving only constants.
+
+### Deleting Existing Data ###
+In addition to inserting new data, AsterixDB supports deletion from datasets via the AQL _delete_ statement.
+The statement supports "searched delete" semantics, and its
+_where_ clause can involve any valid AQL expression.
+
+The following example deletes the tweet that we just added from user "NathanGiesen@211". (Easy come, easy go. :-))
+
+ use dataverse TinySocial;
+
+ delete $tm from dataset TweetMessages where $tm.tweetid = "13";
+
+It should be noted that one form of data change not yet supported by AsterixDB is in-place data modification (_update_).
+Currently, only insert and delete operations are supported; update is not.
+To achieve the effect of an update, two statements are currently needed---one to delete the old record from the
+dataset where it resides, and another to insert the new replacement record (with the same primary key but with
+different field values for some of the associated data content).
+
+## Further Help ##
+That's it! You are now armed and dangerous with respect to semistructured data management using AsterixDB.
+
+AsterixDB is a powerful new BDMS---Big Data Management System---that we hope may usher in a new era of much
+more declarative Big Data management.
+AsterixDB is powerful, so use it wisely, and remember: "With great power comes great responsibility..." :-)
+
+Please e-mail the AsterixDB user group
+(asterixdb-users (at) googlegroups.com)
+if you run into any problems or simply have further questions about the AsterixDB system, its features, or their proper use.
diff --git a/asterix-doc/src/site/markdown/AsterixAlphaRelease.md b/asterix-doc/src/site/markdown/AsterixAlphaRelease.md
new file mode 100644
index 0000000..ff9efb2
--- /dev/null
+++ b/asterix-doc/src/site/markdown/AsterixAlphaRelease.md
@@ -0,0 +1,57 @@
+# AsterixDB: A Big Data Management System _(Alpha Release)_ #
+
+## What Is AsterixDB? ##
+
+Welcome to the new home of the AsterixDB Big Data Management System (BDMS).
+The AsterixDB BDMS is the result of about 3.5 years of R&D involving researchers at UC Irvine, UC Riverside, and UC San Diego.
+The AsterixDB code base now consists of roughly 250K lines of Java code that has been co-developed at UC Irvine and UC Riverside.
+
+Initiated in 2009, the NSF-sponsored ASTERIX project has been developing new technologies for ingesting, storing, managing, indexing, querying, and analyzing vast quantities of semi-structured information.
+The project has been combining ideas from three distinct areas---semi-structured data, parallel databases, and data-intensive computing (a.k.a. today's Big Data platforms)---in order to create a next-generation, open-source software platform that scales by running on large, shared-nothing commodity computing clusters.
+The ASTERIX effort has been targeting a wide range of semi-structured information, ranging from "data" use cases---where information is well-typed and highly regular---to "content" use cases---where data tends to be irregular, much of each datum may be textual, and the ultimate schema for the various data types involved may be hard to anticipate up front.
+The ASTERIX project has been addressing technical issues including highly scalable data storage and indexing, semi-structured query processing on very large clusters, and merging time-tested parallel database techniques with modern data-intensive computing techniques to support performant yet declarative solutions to the problem of storing and analyzing semi-structured information effectively.
+The first fruits of this labor have been captured in the AsterixDB system that is now being released in preliminary or "Alpha" release form.
+We are hoping that the arrival of AsterixDB will mark the beginning of the "BDMS era", and we hope that both the Big Data community and the database community will find the AsterixDB system to be interesting and useful for a much broader class of problems than can be addressed with any one of today's current Big Data platforms and related technologies (e.g., Hadoop, Pig, Hive, HBase, MongoDB, and so on). One of our project mottos has been "one size fits a bunch"---at least that has been our aim. For more information about the research effort that led to the birth of AsterixDB, please refer to our NSF project web site: [http://asterix.ics.uci.edu/](http://asterix.ics.uci.edu/).
+
+In a nutshell, AsterixDB is a full-function BDMS with a rich feature set that distinguishes it from pretty much any other Big Data platform that's out and available today. We believe that its feature set makes it well-suited to modern needs such as web data warehousing and social data storage and analysis. AsterixDB has:
+
+ * A semistructured NoSQL style data model (ADM) resulting from extending JSON with object database ideas
+ * An expressive and declarative query language (AQL) that supports a broad range of queries and analysis over semistructured data
+ * A parallel runtime query execution engine, Hyracks, that has been scale-tested on up to 1000+ cores and 500+ disks
+ * Partitioned LSM-based data storage and indexing to support efficient ingestion and management of semistructured data
+ * Support for query access to externally stored data (e.g., data in HDFS) as well as to data stored natively by AsterixDB
+ * A rich set of primitive data types, including spatial and temporal data in addition to integer, floating point, and textual data
+ * Secondary indexing options that include B+ trees, R trees, and inverted keyword (exact and fuzzy) index types
+ * Support for fuzzy and spatial queries as well as for more traditional parametric queries
+ * Basic transactional (concurrency and recovery) capabilities akin to those of a NoSQL store
+
+## Getting and Using AsterixDB ##
+
+You are most likely here because you are interested in getting your hands on AsterixDB---so you would like to know how to get it, how to set it up, and how to use it.
+Someday our plan is to have comprehensive documentation for AsterixDB and its data model (ADM) and query language (AQL) here on this wiki.
+For the Alpha release, we've got a start; for the Beta release a month or so from now, we will hopefully have much more.
+The following is a list of the wiki pages and supporting documents that we have available today:
+
+1. [InstallingAsterixUsingManagix](InstallingAsterixUsingManagix.html) :
+This is our installation guide, and it is where you should start.
+This document will tell you how to obtain, install, and manage instances of [AsterixDB](https://asterixdb.googlecode.com/files/asterix-installer-0.0.4-binary-assembly.zip), including both single-machine setup (for developers) as well as cluster installations (for deployment in its intended form).
+
+2. [AdmAql101](AdmAql101.html) :
+This is a first-timer's introduction to the user model of the AsterixDB BDMS, by which we mean the view of AsterixDB as seen from the perspective of an "average user" or Big Data application developer.
+The AsterixDB user model consists of its data modeling features (ADM) and its query capabilities (AQL).
+This document presents a tiny "social data warehousing" example and uses it as a backdrop for describing, by example, the key features of AsterixDB.
+By working through this document, you will learn how to define the artifacts needed to manage data in AsterixDB, how to load data into the system, how to use most of the basic features of its query language, and how to insert and delete data dynamically.
+
+3. [AsterixDataTypesAndFunctions](AsterixDataTypesAndFunctions.html) :
+This is a reference document that catalogs the primitive data types and built-in functions available for use in AsterixDB schemas (in ADM) and queries (in AQL).
+
+4. [AQL Reference](AsterixQueryLanguageReference.html) :
+This is the AQL language reference manual.
+
+5. [AsterixDBRestAPI](AsterixDBRestAPI.html) :
+Access to data in an AsterixDB instance is provided via a REST-based API.
+This is a short document that describes the REST API entry points and their URL syntax.
+
+To all who have now come this far: Thanks for your interest in AsterixDB, and for kicking its tires in its Alpha form!
+In addition to getting the system and trying it out, please sign up as a member of the AsterixDB user mailing list (asterixdb-users (at) googlegroups.com) so that you can contact us easily with your questions, issues, and other feedback.
+We want AsterixDB to be a "big hit" some day, and we are anxious to see what users do with it and to learn from that feedback what we should be working on most urgently in the next phase of the project.
diff --git a/asterix-doc/src/site/markdown/AsterixDBRestAPI.md b/asterix-doc/src/site/markdown/AsterixDBRestAPI.md
new file mode 100644
index 0000000..80fbb11
--- /dev/null
+++ b/asterix-doc/src/site/markdown/AsterixDBRestAPI.md
@@ -0,0 +1,285 @@
+# REST API to AsterixDB #
+
+## DDL API ##
+
+*End point for the data definition statements*
+
+Endpoint: _/ddl_
+
+Parameters:
+
+<table>
+<tr>
+ <td>Parameter</td>
+ <td>Description</td>
+ <td>Required?</td>
+</tr>
+<tr>
+ <td>ddl</td>
+ <td>String containing DDL statements to modify Metadata</td>
+ <td>Yes</td>
+</tr>
+</table>
+
+This call does not return any result. If the operations were successful, HTTP OK status code is returned.
+
+### Example ###
+
+#### DDL Statements ####
+
+
+ drop dataverse company if exists;
+ create dataverse company;
+ use dataverse company;
+
+ create type Emp as open {
+ id : int32,
+ name : string
+ };
+
+ create dataset Employee(Emp) primary key id;
+
+
+API call for the above DDL statements in the URL-encoded form.
+
+[http://localhost:19101/ddl?ddl=drop%20dataverse%20company%20if%20exists;create%20dataverse%20company;use%20dataverse%20company;create%20type%20Emp%20as%20open%20{id%20:%20int32,name%20:%20string};create%20dataset%20Employee(Emp)%20primary%20key%20id;](http://localhost:19101/ddl?ddl=drop%20dataverse%20company%20if%20exists;create%20dataverse%20company;use%20dataverse%20company;create%20type%20Emp%20as%20open%20{id%20:%20int32,name%20:%20string};create%20dataset%20Employee(Emp)%20primary%20key%20id;)
+
+#### Response ####
+*HTTP OK 200*
+`<NO PAYLOAD>`
+
+## Update API ##
+
+*End point for update statements (INSERT, DELETE and LOAD)*
+
+Endpoint: _/update_
+
+Parameters:
+
+<table>
+<tr>
+ <td>Parameter</td>
+ <td>Description</td>
+ <td>Required?</td>
+</tr>
+<tr>
+ <td>statements</td>
+ <td>String containing update (insert/delete) statements to execute</td>
+ <td>Yes</td>
+</tr>
+</table>
+
+This call does not return any result. If the operations were successful, HTTP OK status code is returned.
+
+### Example ###
+
+#### Update Statements ####
+
+
+ use dataverse company;
+
+ insert into dataset Employee({ "id":123,"name":"John Doe"});
+
+
+API call for the above update statement in the URL-encoded form.
+
+[http://localhost:19101/update?statements=use%20dataverse%20company;insert%20into%20dataset%20Employee({%20%22id%22:123,%22name%22:%22John%20Doe%22});](http://localhost:19101/update?statements=use%20dataverse%20company;insert%20into%20dataset%20Employee({%20%22id%22:123,%22name%22:%22John%20Doe%22});)
+
+#### Response ####
+*HTTP OK 200*
+`<NO PAYLOAD>`
+
+## Query API ##
+
+*End point for query statements*
+
+Endpoint: _/query_
+
+Parameters:
+
+<table>
+<tr>
+ <td>Parameter</td>
+ <td>Description</td>
+ <td>Required?</td>
+</tr>
+<tr>
+ <td>query</td>
+ <td>Query string to pass to ASTERIX for execution</td>
+ <td>Yes</td>
+</tr>
+<tr>
+ <td>mode</td>
+ <td>Indicate if call should be synchronous or asynchronous. mode = synchronous blocks the call until results are available; mode = asynchronous returns immediately with a handle that can be used later to check the query’s status and to fetch results when available</td>
+ <td>No. default mode = synchronous</td>
+</tr>
+</table>
+
+Result: The result is returned as a JSON object as follows:
+
+
+ {
+ results: <result as a string, if mode = synchronous>
+ error-code: [<code>, <message>] (if an error occurs)
+ handle: <opaque result handle, if mode = asynchronous>
+ }
+
+
+### Example ###
+
+#### Select query with synchronous result delivery ####
+
+
+ use dataverse company;
+
+ for $l in dataset('Employee') return $l;
+
+
+API call for the above query statement in the URL-encoded form.
+
+[http://localhost:19101/query?query=use%20dataverse%20company;for%20$l%20in%20dataset('Employee')%20return%20$l;](http://localhost:19101/query?query=use%20dataverse%20company;for%20$l%20in%20dataset('Employee')%20return%20$l;)
+
+#### Response ####
+*HTTP OK 200*
+Payload
+
+
+ {
+ "results": [
+ [
+ "{ "id": 123, "name": "John Doe" }"
+ ]
+ ]
+ }
+
+
+#### Same select query with asynchronous result delivery ####
+
+API call for the above query statement in the URL-encoded form with mode=asynchronous
+
+[http://localhost:19101/query?query=use%20dataverse%20company;for%20$l%20in%20dataset('Employee')%20return%20$l;&mode=asynchronous](http://localhost:19101/query?query=use%20dataverse%20company;for%20$l%20in%20dataset('Employee')%20return%20$l;&mode=asynchronous)
+
+#### Response ####
+*HTTP OK 200*
+Payload
+
+
+ {
+ "handle": [45,0]
+ }
+
+
+## Asynchronous Result API ##
+
+*End point to fetch the results of an asynchronous query*
+
+Endpoint: _/query/result_
+
+Parameters:
+
+<table>
+<tr>
+ <td>Parameter</td>
+ <td>Description</td>
+ <td>Required?</td>
+</tr>
+<tr>
+ <td>handle</td>
+ <td>Result handle that was returned by a previous /query call with mode = asynchronous</td>
+ <td>Yes</td>
+</tr>
+</table>
+
+Result: The result is returned as a JSON object as follows:
+
+
+ {
+ results: <result as a string, if mode = synchronous, or mode = asynchronous and results are available>
+ error-code: [<code>, <message>] (if an error occurs)
+ }
+
+
+If mode = asynchronous and results are not available, the returned JSON object is empty: { }
+
+### Example ###
+
+#### Fetching results for asynchronous query ####
+
+We use the handle returned by the asynchronous query to get the results for the query. The handle returned was:
+
+
+ {
+ "handle": [45,0]
+ }
+
+
+API call for reading results from the previous asynchronous query in the URL-encoded form.
+
+[http://localhost:19101/query/result?handle=%7B%22handle%22%3A+%5B45%2C+0%5D%7D](http://localhost:19101/query/result?handle=%7B%22handle%22%3A+%5B45%2C+0%5D%7D)
+
+#### Response ####
+*HTTP OK 200*
+Payload
+
+
+ {
+ "results": [
+ [
+ "{ "id": 123, "name": "John Doe" }"
+ ]
+ ]
+ }
+
+
+## Query Status API ##
+
+*End point to check the status of an asynchronous query*
+
+Endpoint: _/query/status_
+
+Parameters:
+
+<table>
+<tr>
+ <td>Parameter</td>
+ <td>Description</td>
+ <td>Required?</td>
+</tr>
+<tr>
+ <td>handle</td>
+ <td>Result handle that was returned by a previous /query call with mode = asynchronous</td>
+ <td>Yes</td>
+</tr>
+</table>
+
+Result: The result is returned as a JSON object as follows:
+
+
+ {
+ status: ("RUNNING" | "SUCCESS" | "ERROR")
+ }
+
+
+
+## Error Codes ##
+
+Table of error codes and their types:
+
+<table>
+<tr>
+ <td>Code</td>
+ <td>Type</td>
+</tr>
+<tr>
+ <td>1</td>
+ <td>Invalid statement</td>
+</tr>
+<tr>
+ <td>2</td>
+ <td>Parse failures</td>
+</tr>
+<tr>
+ <td>99</td>
+ <td>Uncategorized error</td>
+</tr>
+</table>
diff --git a/asterix-doc/src/site/markdown/AsterixDataTypes.md b/asterix-doc/src/site/markdown/AsterixDataTypes.md
new file mode 100644
index 0000000..6674006
--- /dev/null
+++ b/asterix-doc/src/site/markdown/AsterixDataTypes.md
@@ -0,0 +1,286 @@
+# Asterix Data Model (ADM) #
+
+# Basic data types #
+
+An instance of Asterix data model (ADM) can be a _primitive type_ (`Int32`, `Int64`, `String`, `Float`, `Double`, `Date`, `Time`, `Datetime`, etc. or `NULL`) or a _derived type_.
+
+## Primitive Types ##
+
+### Boolean ###
+`Boolean` data type can have one of the two values: _*true*_ or _*false*_.
+
+ * Example:
+
+ let $t := true
+ let $f := false
+ return { "true": $t, "false": $f }
+
+
+ * The expected result is:
+
+ { "true": true, "false": false }
+
+
+
+### Int8 / Int16 / Int32 / Int64 ###
+Integer types using 8, 16, 32, or 64 bits.
+
+ * Example:
+
+ let $v8 := int8("125")
+ let $v16 := int16("32765")
+ let $v32 := 294967295
+ let $v64 := int64("1700000000000000000")
+ return { "int8": $v8, "int16": $v16, "int32": $v32, "int64": $v64}
+
+
+ * The expected result is:
+
+ { "int8": 125i8, "int16": 32765i16, "int32": 294967295, "int64": 1700000000000000000i64 }
+
+
+### Float ###
+`Float` represents approximate numeric data values using 4 bytes.
+
+ * Example:
+
+ let $v1 := float("NaN")
+ let $v2 := float("INF")
+ let $v3 := float("-INF")
+ let $v4 := float("-2013.5")
+ return { "v1": $v1, "v2": $v2, "v3": $v3, "v4": $v4 }
+
+
+ * The expected result is:
+
+ { "v1": NaNf, "v2": Infinityf, "v3": -Infinityf, "v4": -2013.5f }
+
+
+### Double ###
+`Double` represents approximate numeric data values using 8 bytes.
+
+ * Example:
+
+ let $v1 := double("NaN")
+ let $v2 := double("INF")
+ let $v3 := double("-INF")
+ let $v4 := double("-2013.593823748327284")
+ return { "v1": $v1, "v2": $v2, "v3": $v3, "v4": $v4 }
+
+
+ * The expected result is:
+
+ { "v1": NaNd, "v2": Infinityd, "v3": -Infinityd, "v4": -2013.5938237483274d }
+
+
+### String ###
+`String` represents a sequence of characters.
+
+ * Example:
+
+ let $v1 := string("This is a string.")
+ let $v2 := string("\"This is a quoted string\"")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": "This is a string.", "v2": "\"This is a quoted string\"" }
+
+
+### Point ###
+`Point` is the fundamental two-dimensional building block for spatial types. It consists of two `double` coordinates x and y.
+
+ * Example:
+
+ let $v1 := point("80.10d, -10E5")
+ let $v2 := point("5.10E-10d, -10E5")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": point("80.1,-1000000.0"), "v2": point("5.1E-10,-1000000.0") }
+
+
+### Line ###
+`Line` consists of two points that represent the start and the end points of a line segment.
+
+ * Example:
+
+ let $v1 := line("10.1234,11.1e-1 +10.2E-2,-11.22")
+ let $v2 := line("0.1234,-1.00e-10 +10.5E-2,-01.02")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": line("10.1234,1.11 0.102,-11.22"), "v2": line("0.1234,-1.0E-10 0.105,-1.02") }
+
+
+### Rectangle ###
+`Rectangle` consists of two points that represent the _*bottom left*_ and _*upper right*_ corners of a rectangle.
+
+ * Example:
+
+ let $v1 := rectangle("5.1,11.8 87.6,15.6548")
+ let $v2 := rectangle("0.1234,-1.00e-10 5.5487,0.48765")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": rectangle("5.1,11.8 87.6,15.6548"), "v2": rectangle("0.1234,-1.0E-10 5.5487,0.48765") }
+
+
+### Circle ###
+`Circle` consists of one `point` that represents the center of the circle and a radius of type `double`.
+
+ * Example:
+
+ let $v1 := circle("10.1234,11.1e-1 +10.2E-2")
+ let $v2 := circle("0.1234,-1.00e-10 +10.5E-2")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": circle("10.1234,1.11 0.102"), "v2": circle("0.1234,-1.0E-10 0.105") }
+
+
+### Polygon ###
+`Polygon` consists of _*n*_ points that represent the vertices of a _*simple closed*_ polygon.
+
+ * Example:
+
+ let $v1 := polygon("-1.2,+1.3e2 -2.14E+5,2.15 -3.5e+2,03.6 -4.6E-3,+4.81")
+ let $v2 := polygon("-1.0,+10.5e2 -02.15E+50,2.5 -1.0,+3.3e3 -2.50E+05,20.15 +3.5e+2,03.6 -4.60E-3,+4.75 -2,+1.0e2 -2.00E+5,20.10 30.5,03.25 -4.33E-3,+4.75")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": polygon("-1.2,130.0 -214000.0,2.15 -350.0,3.6 -0.0046,4.81"), "v2": polygon("-1.0,1050.0 -2.15E50,2.5 -1.0,3300.0 -250000.0,20.15 350.0,3.6 -0.0046,4.75 -2.0,100.0 -200000.0,20.1 30.5,3.25 -0.00433,4.75") }
+
+
+### Date ###
+`Date` represents a time point along the Gregorian calendar system specified by the year, month and day. ASTERIX supports the date from `-9999-01-01` to `9999-12-31`.
+
+A date value can be represented in two formats, extended format and basic format.
+
+ * Extended format is represented as `[-]yyyy-mm-dd` for `year-month-day`. Each field should be zero-padded if it has fewer digits than the format specifies.
+ * Basic format is in the format of `[-]yyyymmdd`.
+
+ * Example:
+
+ let $v1 := date("2013-01-01")
+ let $v2 := date("-19700101")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": date("2013-01-01"), "v2": date("-1970-01-01") }
+
+
+### Time ###
+The `Time` type describes a time within the range of a day. It is represented by three fields: hour, minute, and second. A millisecond field is optional and is written as the fraction of the second field. Its extended format is `hh:mm:ss[.mmm]` and its basic format is `hhmmss[mmm]`. The value domain is from `00:00:00.000` to `23:59:59.999`.
+
+Timezone field is optional for a time value. Timezone is represented as `[+|-]hh:mm` for extended format or `[+|-]hhmm` for basic format. Note that the sign designators cannot be omitted. `Z` can also be used to represent the UTC local time. If no timezone information is given, it is UTC by default.
+
+ * Example:
+
+ let $v1 := time("12:12:12.039Z")
+ let $v2 := time("000000000-0800")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": time("12:12:12.039Z"), "v2": time("08:00:00.000Z") }
+
+
+### Datetime ###
+A `Datetime` value is a combination of a `Date` and a `Time`, representing a fixed time point along the Gregorian calendar system. The value is between `-9999-01-01 00:00:00.000` and `9999-12-31 23:59:59.999`.
+
+A `Datetime` value is represented as a combination of the representation of its `Date` part and `Time` part, separated by a separator `T`. Either extended or basic format can be used, and the two parts should be the same format.
+
+Millisecond field and timezone field are optional, as specified in the `Time` type.
+
+ * Example:
+
+ let $v1 := datetime("2013-01-01T12:12:12.039Z")
+ let $v2 := datetime("-19700101T000000000-0800")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": datetime("2013-01-01T12:12:12.039Z"), "v2": datetime("-1970-01-01T08:00:00.000Z") }
+
+
+### Duration ###
+`Duration` represents a duration of time. A duration value is specified by integers on at least one of the following fields: year, month, day, hour, minute, second, and millisecond.
+
+A duration value is in the format of `[-]PnYnMnDTnHnMn.mmmS`. The millisecond part (as the fraction of the second field) is optional, and when no millisecond field is used, the decimal point should also be absent.
+
+Negative durations are also supported for the arithmetic operations between time instance types (`Date`, `Time` and `Datetime`), and are used to roll the time back by the given duration. For example, `date("2012-01-01") + duration("-P3D")` will return `date("2011-12-29")`.
+
+Note that a canonical representation of the duration is always returned, regardless of whether the user's input was given in canonical form. More information about canonical representation can be found from [XPath dayTimeDuration Canonical Representation](http://www.w3.org/TR/xpath-functions/#canonical-dayTimeDuration) and [yearMonthDuration Canonical Representation](http://www.w3.org/TR/xpath-functions/#canonical-yearMonthDuration).
+
+ * Example:
+
+ let $v1 := duration("P100Y12MT12M")
+ let $v2 := duration("-PT20.943S")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": duration("P101YT12M"), "v2": duration("-PT20.943S") }
+
+
+### Interval ###
+`Interval` represents inclusive-exclusive ranges of time. It is defined by two time point values with the same temporal type (`Date`, `Time` or `Datetime`).
+
+ * Example:
+
+ let $v1 := interval-from-date(date("2013-01-01"), date("20130505"))
+ let $v2 := interval-from-time(time("00:01:01"), time("213901049+0800"))
+ let $v3 := interval-from-datetime(datetime("2013-01-01T00:01:01"), datetime("20130505T213901049+0800"))
+ return { "v1": $v1, "v2": $v2, "v3": $v3 }
+
+
+ * The expected result is:
+
+ { "v1": interval-date("2013-01-01, 2013-05-05"), "v2": interval-time("00:01:01.000Z, 13:39:01.049Z"), "v3": interval-datetime("2013-01-01T00:01:01.000Z, 2013-05-05T13:39:01.049Z") }
+
+
+## Derived Types ##
+
+### Record ###
+A `Record` contains a set of fields, where each field is described by its name and type. A record type is either open or closed. Open records can contain fields that are not part of the type definition, while closed records cannot. Syntactically, record constructors are surrounded by curly braces "{...}".
+
+An example would be
+
+
+ { "id": 213508, "name": "Alice Bob" }
+
+
+### OrderedList ###
+An `OrderedList` is a sequence of values for which the order is determined by creation or insertion. OrderedList constructors are denoted by brackets: "[...]".
+
+An example would be
+
+
+ ["alice", 123, "bob", null]
+
+
+### UnorderedList ###
+An `UnorderedList` is an unordered sequence of values, similar to bags in SQL. UnorderedList constructors are denoted by two opening flower braces followed by data and two closing flower braces, like "{{...}}".
+
+An example would be
+
+
+ {{"hello", 9328, "world", [1, 2, null]}}
+
diff --git a/asterix-doc/src/site/markdown/AsterixDataTypesAndFunctions.md b/asterix-doc/src/site/markdown/AsterixDataTypesAndFunctions.md
new file mode 100644
index 0000000..7576648
--- /dev/null
+++ b/asterix-doc/src/site/markdown/AsterixDataTypesAndFunctions.md
@@ -0,0 +1,2197 @@
+# Asterix Data Model (ADM) #
+
+# Basic data types #
+
+An instance of Asterix data model (ADM) can be a _primitive type_ (`Int32`, `Int64`, `String`, `Float`, `Double`, `Date`, `Time`, `Datetime`, etc. or `NULL`) or a _derived type_.
+
+## Primitive Types ##
+
+### Boolean ###
+`Boolean` data type can have one of the two values: _*true*_ or _*false*_.
+
+ * Example:
+
+ let $t := true
+ let $f := false
+ return { "true": $t, "false": $f }
+
+
+ * The expected result is:
+
+ { "true": true, "false": false }
+
+
+
+### Int8 / Int16 / Int32 / Int64 ###
+Integer types using 8, 16, 32, or 64 bits.
+
+ * Example:
+
+ let $v8 := int8("125")
+ let $v16 := int16("32765")
+ let $v32 := 294967295
+ let $v64 := int64("1700000000000000000")
+ return { "int8": $v8, "int16": $v16, "int32": $v32, "int64": $v64}
+
+
+ * The expected result is:
+
+ { "int8": 125i8, "int16": 32765i16, "int32": 294967295, "int64": 1700000000000000000i64 }
+
+
+### Float ###
+`Float` represents approximate numeric data values using 4 bytes.
+
+ * Example:
+
+ let $v1 := float("NaN")
+ let $v2 := float("INF")
+ let $v3 := float("-INF")
+ let $v4 := float("-2013.5")
+ return { "v1": $v1, "v2": $v2, "v3": $v3, "v4": $v4 }
+
+
+ * The expected result is:
+
+ { "v1": NaNf, "v2": Infinityf, "v3": -Infinityf, "v4": -2013.5f }
+
+
+### Double ###
+`Double` represents approximate numeric data values using 8 bytes.
+
+ * Example:
+
+ let $v1 := double("NaN")
+ let $v2 := double("INF")
+ let $v3 := double("-INF")
+ let $v4 := double("-2013.593823748327284")
+ return { "v1": $v1, "v2": $v2, "v3": $v3, "v4": $v4 }
+
+
+ * The expected result is:
+
+ { "v1": NaNd, "v2": Infinityd, "v3": -Infinityd, "v4": -2013.5938237483274d }
+
+
+### String ###
+`String` represents a sequence of characters.
+
+ * Example:
+
+ let $v1 := string("This is a string.")
+ let $v2 := string("\"This is a quoted string\"")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": "This is a string.", "v2": "\"This is a quoted string\"" }
+
+
+### Point ###
+`Point` is the fundamental two-dimensional building block for spatial types. It consists of two `double` coordinates x and y.
+
+ * Example:
+
+ let $v1 := point("80.10d, -10E5")
+ let $v2 := point("5.10E-10d, -10E5")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": point("80.1,-1000000.0"), "v2": point("5.1E-10,-1000000.0") }
+
+
+### Line ###
+`Line` consists of two points that represent the start and the end points of a line segment.
+
+ * Example:
+
+ let $v1 := line("10.1234,11.1e-1 +10.2E-2,-11.22")
+ let $v2 := line("0.1234,-1.00e-10 +10.5E-2,-01.02")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": line("10.1234,1.11 0.102,-11.22"), "v2": line("0.1234,-1.0E-10 0.105,-1.02") }
+
+
+### Rectangle ###
+`Rectangle` consists of two points that represent the _*bottom left*_ and _*upper right*_ corners of a rectangle.
+
+ * Example:
+
+ let $v1 := rectangle("5.1,11.8 87.6,15.6548")
+ let $v2 := rectangle("0.1234,-1.00e-10 5.5487,0.48765")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": rectangle("5.1,11.8 87.6,15.6548"), "v2": rectangle("0.1234,-1.0E-10 5.5487,0.48765") }
+
+
+### Circle ###
+`Circle` consists of one `point` that represents the center of the circle and a radius of type `double`.
+
+ * Example:
+
+ let $v1 := circle("10.1234,11.1e-1 +10.2E-2")
+ let $v2 := circle("0.1234,-1.00e-10 +10.5E-2")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": circle("10.1234,1.11 0.102"), "v2": circle("0.1234,-1.0E-10 0.105") }
+
+
+### Polygon ###
+`Polygon` consists of _*n*_ points that represent the vertices of a _*simple closed*_ polygon.
+
+ * Example:
+
+ let $v1 := polygon("-1.2,+1.3e2 -2.14E+5,2.15 -3.5e+2,03.6 -4.6E-3,+4.81")
+ let $v2 := polygon("-1.0,+10.5e2 -02.15E+50,2.5 -1.0,+3.3e3 -2.50E+05,20.15 +3.5e+2,03.6 -4.60E-3,+4.75 -2,+1.0e2 -2.00E+5,20.10 30.5,03.25 -4.33E-3,+4.75")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": polygon("-1.2,130.0 -214000.0,2.15 -350.0,3.6 -0.0046,4.81"), "v2": polygon("-1.0,1050.0 -2.15E50,2.5 -1.0,3300.0 -250000.0,20.15 350.0,3.6 -0.0046,4.75 -2.0,100.0 -200000.0,20.1 30.5,3.25 -0.00433,4.75") }
+
+
+### Date ###
+`Date` represents a time point along the Gregorian calendar system specified by the year, month and day. ASTERIX supports the date from `-9999-01-01` to `9999-12-31`.
+
+A date value can be represented in two formats, extended format and basic format.
+
+ * Extended format is represented as `[-]yyyy-mm-dd` for `year-month-day`. Each field should be zero-padded if it has fewer digits than the format specifies.
+ * Basic format is in the format of `[-]yyyymmdd`.
+
+ * Example:
+
+ let $v1 := date("2013-01-01")
+ let $v2 := date("-19700101")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": date("2013-01-01"), "v2": date("-1970-01-01") }
+
+
+### Time ###
+The `Time` type describes a time within the range of a day. It is represented by three fields: hour, minute, and second. A millisecond field is optional and is written as the fraction of the second field. Its extended format is `hh:mm:ss[.mmm]` and its basic format is `hhmmss[mmm]`. The value domain is from `00:00:00.000` to `23:59:59.999`.
+
+Timezone field is optional for a time value. Timezone is represented as `[+|-]hh:mm` for extended format or `[+|-]hhmm` for basic format. Note that the sign designators cannot be omitted. `Z` can also be used to represent the UTC local time. If no timezone information is given, it is UTC by default.
+
+ * Example:
+
+ let $v1 := time("12:12:12.039Z")
+ let $v2 := time("000000000-0800")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": time("12:12:12.039Z"), "v2": time("08:00:00.000Z") }
+
+
+### Datetime ###
+A `Datetime` value is a combination of a `Date` and a `Time`, representing a fixed time point along the Gregorian calendar system. The value is between `-9999-01-01 00:00:00.000` and `9999-12-31 23:59:59.999`.
+
+A `Datetime` value is represented as a combination of the representation of its `Date` part and `Time` part, separated by a separator `T`. Either extended or basic format can be used, and the two parts should be the same format.
+
+Millisecond field and timezone field are optional, as specified in the `Time` type.
+
+ * Example:
+
+ let $v1 := datetime("2013-01-01T12:12:12.039Z")
+ let $v2 := datetime("-19700101T000000000-0800")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": datetime("2013-01-01T12:12:12.039Z"), "v2": datetime("-1970-01-01T08:00:00.000Z") }
+
+
+### Duration ###
+`Duration` represents a duration of time. A duration value is specified by integers on at least one of the following fields: year, month, day, hour, minute, second, and millisecond.
+
+A duration value is in the format of `[-]PnYnMnDTnHnMn.mmmS`. The millisecond part (as the fraction of the second field) is optional, and when no millisecond field is used, the decimal point should also be absent.
+
+Negative durations are also supported for the arithmetic operations between time instance types (`Date`, `Time` and `Datetime`), and are used to roll the time back by the given duration. For example, `date("2012-01-01") + duration("-P3D")` will return `date("2011-12-29")`.
+
+Note that a canonical representation of the duration is always returned, regardless of whether the user's input was given in canonical form. More information about canonical representation can be found from [XPath dayTimeDuration Canonical Representation](http://www.w3.org/TR/xpath-functions/#canonical-dayTimeDuration) and [yearMonthDuration Canonical Representation](http://www.w3.org/TR/xpath-functions/#canonical-yearMonthDuration).
+
+ * Example:
+
+ let $v1 := duration("P100Y12MT12M")
+ let $v2 := duration("-PT20.943S")
+ return { "v1": $v1, "v2": $v2 }
+
+
+ * The expected result is:
+
+ { "v1": duration("P101YT12M"), "v2": duration("-PT20.943S") }
+
+
+### Interval ###
+`Interval` represents inclusive-exclusive ranges of time. It is defined by two time point values with the same temporal type (`Date`, `Time` or `Datetime`).
+
+ * Example:
+
+ let $v1 := interval-from-date(date("2013-01-01"), date("20130505"))
+ let $v2 := interval-from-time(time("00:01:01"), time("213901049+0800"))
+ let $v3 := interval-from-datetime(datetime("2013-01-01T00:01:01"), datetime("20130505T213901049+0800"))
+ return { "v1": $v1, "v2": $v2, "v3": $v3 }
+
+
+ * The expected result is:
+
+ { "v1": interval-date("2013-01-01, 2013-05-05"), "v2": interval-time("00:01:01.000Z, 13:39:01.049Z"), "v3": interval-datetime("2013-01-01T00:01:01.000Z, 2013-05-05T13:39:01.049Z") }
+
+
+## Derived Types ##
+
+### Record ###
+A `Record` contains a set of fields, where each field is described by its name and type. A record type is either open or closed. Open records can contain fields that are not part of the type definition, while closed records cannot. Syntactically, record constructors are surrounded by curly braces "{...}".
+
+An example would be
+
+
+ { "id": 213508, "name": "Alice Bob" }
+
+
+### OrderedList ###
+An `OrderedList` is a sequence of values for which the order is determined by creation or insertion. OrderedList constructors are denoted by brackets: "[...]".
+
+An example would be
+
+
+ ["alice", 123, "bob", null]
+
+
+### UnorderedList ###
+An `UnorderedList` is an unordered sequence of values, similar to bags in SQL. UnorderedList constructors are denoted by two opening curly braces, followed by the data, followed by two closing curly braces: "{{...}}".
+
+An example would be
+
+
+ {{"hello", 9328, "world", [1, 2, null]}}
+
+
+# Asterix: Using Functions #
+Asterix provides rich support of various classes of functions to support operations on string, spatial, and temporal data. This document explains how to use these functions.
+
+## String Functions ##
+### string-to-codepoint ###
+ * Syntax:
+
+ string-to-codepoint(string_expression)
+
+ * Converts the string `string_expression` to its code-based representation.
+ * Arguments:
+ * `string_expression` : A `String` that will be converted.
+ * Return Value:
+ * An `OrderedList` of the code points for the string `string_expression`.
+
+### codepoint-to-string ###
+ * Syntax:
+
+ codepoint-to-string(list_expression)
+
+ * Converts the ordered code-based representation `list_expression` to the corresponding string.
+ * Arguments:
+ * `list_expression` : An `OrderedList` of code-points.
+ * Return Value:
+ * A `String` representation of `list_expression`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $s := "Hello ASTERIX!"
+ let $l := string-to-codepoint($s)
+ let $ss := codepoint-to-string($l)
+ return {"codes": $l, "string": $ss}
+
+
+ * The expected result is:
+
+ { "codes": [ 72, 101, 108, 108, 111, 32, 65, 83, 84, 69, 82, 73, 88, 33 ], "string": "Hello ASTERIX!" }
+
+
+### contains ###
+ * Syntax:
+
+ contains(string_expression, string_pattern)
+
+ * Checks whether the string `string_expression` contains the string `string_pattern`
+ * Arguments:
+ * `string_expression` : A `String` that might contain the pattern.
+ * `string_pattern` : A target `String` that might be contained.
+ * Return Value:
+ * A `Boolean`, returns `true` if `string_expression` contains `string_pattern`, otherwise returns `false`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('FacebookMessages')
+ where contains($i.message, "phone")
+ return {"mid": $i.message-id, "message": $i.message}
+
+
+ * The expected result is:
+
+ { "mid": 2, "message": " dislike iphone its touch-screen is horrible" }
+ { "mid": 13, "message": " dislike iphone the voice-command is bad:(" }
+ { "mid": 15, "message": " like iphone the voicemail-service is awesome" }
+
+
+### len ###
+ * Syntax:
+
+ len(list_expression)
+
+ * Returns the length of the list `list_expression`.
+ * Arguments:
+ * `list_expression` : An `OrderedList`, `UnorderedList` or `NULL`, representing the list whose length is to be returned.
+ * Return Value:
+ * An `Int32` that represents the length of `list_expression`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $l := ["ASTERIX", "Hyracks"]
+ return len($l)
+
+
+ * The expected result is:
+
+ 2
+
+
+### like ###
+ * Syntax:
+
+ like(string_expression, string_pattern)
+
+ * Checks whether the string `string_expression` matches the string pattern `string_pattern`.
+ * Arguments:
+ * `string_expression` : A `String` that might contain the pattern or `NULL`.
+ * `string_pattern` : A pattern `String` that might be contained or `NULL`.
+ * Return Value:
+ * A `Boolean`, returns `true` if `string_expression` contains the pattern `string_pattern`, otherwise returns `false`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('FacebookMessages')
+ where like($i.message, "%at&t%")
+ return $i.message
+
+
+ * The expected result is:
+
+ " can't stand at&t the network is horrible:("
+ " can't stand at&t its plan is terrible"
+ " love at&t its 3G is good:)"
+
+
+### starts-with ###
+ * Syntax:
+
+ starts-with(string_expression, string_pattern)
+
+ * Checks whether the string `string_expression` starts with the string `string_pattern`.
+ * Arguments:
+ * `string_expression` : A `String` that might start with the given string.
+ * `string_pattern` : A `String` that might be contained as the starting substring.
+ * Return Value:
+ * A `Boolean`, returns `true` if `string_expression` starts with the string `string_pattern`, otherwise returns `false`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('FacebookMessages')
+ where starts-with($i.message, " like")
+ return $i.message
+
+
+ * The expected result is:
+
+ " like samsung the plan is amazing"
+ " like t-mobile its platform is mind-blowing"
+ " like verizon the 3G is awesome:)"
+ " like iphone the voicemail-service is awesome"
+
+
+### ends-with ###
+ * Syntax:
+
+ ends-with(string_expression, string_pattern)
+
+ * Checks whether the string `string_expression` ends with the string `string_pattern`.
+ * Arguments:
+ * `string_expression` : A `String` that might end with the given string.
+ * `string_pattern` : A `String` that might be contained as the ending substring.
+ * Return Value:
+ * A `Boolean`, returns `true` if `string_expression` ends with the string `string_pattern`, otherwise returns `false`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('FacebookMessages')
+ where ends-with($i.message, ":)")
+ return $i.message
+
+
+ * The expected result is:
+
+ " love sprint its shortcut-menu is awesome:)"
+ " like verizon the 3G is awesome:)"
+ " love at&t its 3G is good:)"
+
+
+### string-concat ###
+ * Syntax:
+
+ string-concat(list_expression)
+
+ * Concatenates a list of strings `list_expression` into a single string.
+ * Arguments:
+ * `list_expression` : An `OrderedList` or `UnorderedList` of `String`s (could be `NULL`) to be concatenated.
+ * Return Value:
+ * Returns the concatenated `String` value.
+
+ * Example:
+
+ let $i := "ASTERIX"
+ let $j := " "
+ let $k := "ROCKS!"
+ return string-concat([$i, $j, $k])
+
+
+ * The expected result is:
+
+ "ASTERIX ROCKS!"
+
+
+### string-equal ###
+ * Syntax:
+
+ string-equal(string_expression1, string_expression2)
+
+ * Checks whether the strings `string_expression1` and `string_expression2` are equal.
+ * Arguments:
+ * `string_expression1` : A `String` to be compared.
+ * `string_expression2` : A `String` to be compared with.
+ * Return Value:
+ * A `Boolean`, returns `true` if `string_expression1` and `string_expression2` are equal, otherwise returns `false`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $i := "Android"
+ return {"Equal": string-equal($i, "Android"), "NotEqual": string-equal($i, "iphone")}
+
+
+ * The expected result is:
+
+ { "Equal": true, "NotEqual": false }
+
+
+### string-join ###
+ * Syntax:
+
+ string-join(list_expression, string_expression)
+
+ * Joins a list of strings `list_expression` with the given separator `string_expression` into a single string.
+ * Arguments:
+ * `list_expression` : An `OrderedList` or `UnorderedList` of `String`s (could be `NULL`) to be joined.
+ * `string_expression` : A separator `String` value.
+ * Return Value:
+ * Returns the joined `String`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $i := ["ASTERIX", "ROCKS~"]
+ return string-join($i, "!! ")
+
+
+ * The expected result is:
+
+ "ASTERIX!! ROCKS~"
+
+
+### lowercase ###
+ * Syntax:
+
+ lowercase(string_expression)
+
+ * Returns the lowercase of a given string `string_expression`.
+ * Arguments:
+ * `string_expression` : A `String` to be lowercased.
+ * Return Value:
+ * Returns the lowercased `String`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $i := "ASTERIX"
+ return lowercase($i)
+
+
+ * The expected result is:
+
+ "asterix"
+
+
+### matches ###
+ * Syntax:
+
+ matches(string_expression, string_pattern)
+
+ * Checks whether the string `string_expression` matches the given pattern `string_pattern`.
+ * Arguments:
+ * `string_expression` : A `String` that might contain the pattern.
+ * `string_pattern` : A pattern `String` to be matched.
+ * Return Value:
+ * A `Boolean`, returns `true` if `string_expression` matches the pattern `string_pattern`, otherwise returns `false`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('FacebookMessages')
+ where matches($i.message, "dislike iphone")
+ return $i.message
+
+
+ * The expected result is:
+
+ " dislike iphone its touch-screen is horrible"
+ " dislike iphone the voice-command is bad:("
+
+
+### replace ###
+ * Syntax:
+
+ replace(string_expression, string_pattern, string_replacement)
+
+ * Checks whether the string `string_expression` matches the given pattern `string_pattern`, and replaces the matched pattern `string_pattern` with the new pattern `string_replacement`.
+ * Arguments:
+ * `string_expression` : A `String` that might contain the pattern.
+ * `string_pattern` : A pattern `String` to be matched.
+ * `string_replacement` : A pattern `String` to be used as the replacement.
+ * Return Value:
+ * Returns a `String` that is obtained after the replacements.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('FacebookMessages')
+ where matches($i.message, " like iphone")
+ return replace($i.message, " like iphone", "like android")
+
+
+ * The expected result is:
+
+ "like android the voicemail-service is awesome"
+
+
+### string-length ###
+ * Syntax:
+
+ string-length(string_expression)
+
+ * Returns the length of the string `string_expression`.
+ * Arguments:
+ * `string_expression` : A `String` or `NULL`, represents the string to be checked.
+ * Return Value:
+ * An `Int32` that represents the length of `string_expression`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('FacebookMessages')
+ return {"mid": $i.message-id, "message-len": string-length($i.message)}
+
+
+ * The expected result is:
+
+ { "mid": 1, "message-len": 43 }
+ { "mid": 2, "message-len": 44 }
+ { "mid": 3, "message-len": 33 }
+ { "mid": 4, "message-len": 43 }
+ { "mid": 5, "message-len": 46 }
+ { "mid": 6, "message-len": 43 }
+ { "mid": 7, "message-len": 37 }
+ { "mid": 8, "message-len": 33 }
+ { "mid": 9, "message-len": 34 }
+ { "mid": 10, "message-len": 50 }
+ { "mid": 11, "message-len": 38 }
+ { "mid": 12, "message-len": 52 }
+ { "mid": 13, "message-len": 42 }
+ { "mid": 14, "message-len": 27 }
+ { "mid": 15, "message-len": 45 }
+
+
+### substring ###
+ * Syntax:
+
+ substring(string_expression, offset, length)
+
+ * Returns the substring from the given string `string_expression` based on the given start offset `offset` and the optional length `length`.
+ * Arguments:
+ * `string_expression` : A `String` as the string to be extracted.
+ * `offset` : An `Int32` as the starting offset of the substring in `string_expression`.
+ * `length` : (Optional) An `Int32` as the length of the substring.
+ * Return Value:
+ * A `String` that represents the substring.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('FacebookMessages')
+ where string-length($i.message) > 50
+ return substring($i.message, 50)
+
+
+ * The expected result is:
+
+ "G:("
+
+
+### substring-before ###
+ * Syntax:
+
+ substring-before(string_expression, string_pattern)
+
+ * Returns the substring from the given string `string_expression` before the given pattern `string_pattern`.
+ * Arguments:
+ * `string_expression` : A `String` as the string to be extracted.
+ * `string_pattern` : A `String` as the string pattern to be searched.
+ * Return Value:
+ * A `String` that represents the substring.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('FacebookMessages')
+ where contains($i.message, "iphone")
+ return substring-before($i.message, "iphone")
+
+
+ * The expected result is:
+
+ " dislike "
+ " dislike "
+ " like "
+
+
+### substring-after ###
+ * Syntax:
+
+ substring-after(string_expression, string_pattern)
+
+ * Returns the substring from the given string `string_expression` after the given pattern `string_pattern`.
+ * Arguments:
+ * `string_expression` : A `String` as the string to be extracted.
+ * `string_pattern` : A `String` as the string pattern to be searched.
+ * Return Value:
+ * A `String` that represents the substring.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('FacebookMessages')
+ where contains($i.message, "iphone")
+ return substring-after($i.message, "iphone")
+
+
+ * The expected result is:
+
+ " its touch-screen is horrible"
+ " the voice-command is bad:("
+ " the voicemail-service is awesome"
+
+
+## Spatial Functions ##
+### create-point ###
+ * Syntax:
+
+ create-point(latitude, longitude)
+
+ * Creates the primitive type `Point` using `latitude` and `longitude`.
+ * Arguments:
+ * `latitude` : A `Double` that represents the latitude.
+ * `longitude` : A `Double` that represents the longitude.
+ * Return Value:
+ * A `Point`, represents a spatial point created using the latitude and longitude provided in `latitude` and `longitude`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $c := create-point(30.0,70.0)
+ return {"point": $c}
+
+
+ * The expected result is:
+
+ { "point": point("30.0,70.0") }
+
+
+### create-line ###
+ * Syntax:
+
+ create-line(point_expression1, point_expression2)
+
+ * Creates the primitive type `Line` using `point_expression1` and `point_expression2`.
+ * Arguments:
+ * `point_expression1` : A `Point` that represents the start point of the line.
+ * `point_expression2` : A `Point` that represents the end point of the line.
+ * Return Value:
+ * A `Line`, represents a spatial line created using the points provided in `point_expression1` and `point_expression2`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $c := create-line(create-point(30.0,70.0), create-point(50.0,90.0))
+ return {"line": $c}
+
+
+ * The expected result is:
+
+ { "line": line("30.0,70.0 50.0,90.0") }
+
+
+### create-rectangle ###
+ * Syntax:
+
+ create-rectangle(point_expression1, point_expression2)
+
+ * Creates the primitive type `Rectangle` using `point_expression1` and `point_expression2`.
+ * Arguments:
+ * `point_expression1` : A `Point` that represents the lower-left point of the rectangle.
+ * `point_expression2` : A `Point` that represents the upper-right point of the rectangle.
+ * Return Value:
+ * A `Rectangle`, represents a spatial rectangle created using the points provided in `point_expression1` and `point_expression2`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $c := create-rectangle(create-point(30.0,70.0), create-point(50.0,90.0))
+ return {"rectangle": $c}
+
+
+ * The expected result is:
+
+ { "rectangle": rectangle("30.0,70.0 50.0,90.0") }
+
+
+### create-circle ###
+ * Syntax:
+
+ create-circle(point_expression, radius)
+
+ * Creates the primitive type `Circle` using `point_expression` and `radius`.
+ * Arguments:
+ * `point_expression` : A `Point` that represents the center of the circle.
+ * `radius` : A `Double` that represents the radius of the circle.
+ * Return Value:
+ * A `Circle`, represents a spatial circle created using the center point and the radius provided in `point_expression` and `radius`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $c := create-circle(create-point(30.0,70.0), 5.0)
+ return {"circle": $c}
+
+
+ * The expected result is:
+
+ { "circle": circle("30.0,70.0 5.0") }
+
+
+### create-polygon ###
+ * Syntax:
+
+ create-polygon(point_expression1, point_expression2, ..., point_expressionn)
+
+ * Creates the primitive type `Polygon` using an arbitrary number of arguments `point_expression1`, `point_expression2`, ..., `point_expressionn`.
+ * Arguments:
+ * `point_expression1`/.../`point_expressionn` : A `Point` that represents a vertex of the polygon.
+ * Return Value:
+ * A `Polygon`, represents a spatial simple polygon created using the points provided in `point_expression1`, `point_expression2`, ..., `point_expressionn`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $c := create-polygon(create-point(1.0,1.0), create-point(2.0,2.0), create-point(3.0,3.0), create-point(4.0,4.0))
+ return {"polygon": $c}
+
+
+ * The expected result is:
+
+ { "polygon": polygon("1.0,1.0 2.0,2.0 3.0,3.0 4.0,4.0") }
+
+
+### point ###
+ * Syntax:
+
+ point(string_expression)
+
+ * Constructor function for `Point` type by parsing a point string `string_expression`
+ * Arguments:
+ * `string_expression` : The `String` value representing a point value.
+ * Return Value:
+ * A `Point` value represented by the given string.
+
+ * Example:
+
+
+ use dataverse TinySocial;
+
+ let $c := point("55.05,-138.04")
+ return {"point": $c}
+
+
+ * The expected result is:
+
+ { "point": point("55.05,-138.04") }
+
+
+### line ###
+ * Syntax:
+
+ line(string_expression)
+
+ * Constructor function for `Line` type by parsing a line string `string_expression`
+ * Arguments:
+ * `string_expression` : The `String` value representing a line value.
+ * Return Value:
+ * A `Line` value represented by the given string.
+
+ * Example:
+
+
+ use dataverse TinySocial;
+
+ let $c := line("55.05,-138.04 13.54,-138.04")
+ return {"line": $c}
+
+
+ * The expected result is:
+
+ { "line": line("55.05,-138.04 13.54,-138.04") }
+
+
+### rectangle ###
+ * Syntax:
+
+ rectangle(string_expression)
+
+ * Constructor function for `Rectangle` type by parsing a rectangle string `string_expression`
+ * Arguments:
+ * `string_expression` : The `String` value representing a rectangle value.
+ * Return Value:
+ * A `Rectangle` value represented by the given string.
+
+ * Example:
+
+
+ use dataverse TinySocial;
+
+ let $c := rectangle("20.05,-125.0 40.67,-100.87")
+ return {"rectangle": $c}
+
+
+ * The expected result is:
+
+ { "rectangle": rectangle("20.05,-125.0 40.67,-100.87") }
+
+
+### circle ###
+ * Syntax:
+
+ circle(string_expression)
+
+ * Constructor function for `Circle` type by parsing a circle string `string_expression`
+ * Arguments:
+ * `string_expression` : The `String` value representing a circle value.
+ * Return Value:
+ * A `Circle` value represented by the given string.
+
+ * Example:
+
+
+ use dataverse TinySocial;
+
+ let $c := circle("55.05,-138.04 10.0")
+ return {"circle": $c}
+
+
+ * The expected result is:
+
+ { "circle": circle("55.05,-138.04 10.0") }
+
+
+### polygon ###
+ * Syntax:
+
+ polygon(string_expression)
+
+ * Constructor function for `Polygon` type by parsing a polygon string `string_expression`
+ * Arguments:
+ * `string_expression` : The `String` value representing a polygon value.
+ * Return Value:
+ * A `Polygon` value represented by the given string.
+
+ * Example:
+
+
+ use dataverse TinySocial;
+
+ let $c := polygon("55.05,-138.04 13.54,-138.04 13.54,-53.31 55.05,-53.31")
+ return {"polygon": $c}
+
+
+ * The expected result is:
+
+ { "polygon": polygon("55.05,-138.04 13.54,-138.04 13.54,-53.31 55.05,-53.31") }
+
+
+### get-x/get-y ###
+ * Syntax:
+
+ get-x(point_expression) or get-y(point_expression)
+
+ * Returns the x or y coordinates of a point `point_expression`.
+ * Arguments:
+ * `point_expression` : A `Point`.
+ * Return Value:
+ * A `Double`, represents the x or y coordinates of the point `point_expression`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $point := create-point(2.3,5.0)
+ return {"x-coordinate": get-x($point), "y-coordinate": get-y($point)}
+
+
+ * The expected result is:
+
+ { "x-coordinate": 2.3d, "y-coordinate": 5.0d }
+
+
+### get-points ###
+ * Syntax:
+
+ get-points(spatial_expression)
+
+ * Returns an ordered list of the points forming the spatial object `spatial_expression`.
+ * Arguments:
+ * `spatial_expression` : A `Point`, `Line`, `Rectangle`, `Circle`, or `Polygon`.
+ * Return Value:
+ * An `OrderedList` of the points forming the spatial object `spatial_expression`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $line := create-line(create-point(100.6,99.4), create-point(-72.0,-76.9))
+ let $rectangle := create-rectangle(create-point(9.2,49.0), create-point(77.8,111.1))
+ let $polygon := create-polygon(create-point(1.0,1.0), create-point(2.0,2.0), create-point(3.0,3.0), create-point(4.0,4.0))
+ let $line_list := get-points($line)
+ let $rectangle_list := get-points($rectangle)
+ let $polygon_list := get-points($polygon)
+ return {"line-first-point": $line_list[0], "line-second-point": $line_list[1], "rectangle-left-bottom-point": $rectangle_list[0], "rectangle-top-upper-point": $rectangle_list[1], "polygon-first-point": $polygon_list[0], "polygon-second-point": $polygon_list[1], "polygon-third-point": $polygon_list[2], "polygon-forth-point": $polygon_list[3]}
+
+
+ * The expected result is:
+
+ { "line-first-point": point("100.6,99.4"), "line-second-point": point("-72.0,-76.9"), "rectangle-left-bottom-point": point("9.2,49.0"), "rectangle-top-upper-point": point("77.8,111.1"), "polygon-first-point": point("1.0,1.0"), "polygon-second-point": point("2.0,2.0"), "polygon-third-point": point("3.0,3.0"), "polygon-forth-point": point("4.0,4.0") }
+
+
+### get-center/get-radius ###
+ * Syntax:
+
+ get-center(circle_expression) or get-radius(circle_expression)
+
+ * Returns the center or the radius of a circle `circle_expression`.
+ * Arguments:
+ * `circle_expression` : A `Circle`.
+ * Return Value:
+ * A `Point` or `Double`, represents the center or radius of the circle `circle_expression`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $circle := create-circle(create-point(6.0,3.0), 1.0)
+ return {"circle-radius": get-radius($circle), "circle-center": get-center($circle)}
+
+
+
+ * The expected result is:
+
+ { "circle-radius": 1.0d, "circle-center": point("6.0,3.0") }
+
+
+
+### spatial-distance ###
+ * Syntax:
+
+ spatial-distance(point_expression1, point_expression2)
+
+ * Returns the euclidean distance between `point_expression1` and `point_expression2`.
+ * Arguments:
+ * `point_expression1` : A `Point`.
+ * `point_expression2` : A `Point`.
+ * Return Value:
+ * A `Double`, represents the euclidean distance between `point_expression1` and `point_expression2`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $t in dataset('TweetMessages')
+ let $d := spatial-distance($t.sender-location, create-point(30.0,70.0))
+ return {"point": $t.sender-location, "distance": $d}
+
+
+
+ * The expected result is:
+
+ { "point": point("47.44,80.65"), "distance": 20.434678857275934d }
+ { "point": point("29.15,76.53"), "distance": 6.585089217315132d }
+ { "point": point("37.59,68.42"), "distance": 7.752709203884797d }
+ { "point": point("24.82,94.63"), "distance": 25.168816023007512d }
+ { "point": point("32.84,67.14"), "distance": 4.030533463451212d }
+ { "point": point("29.72,75.8"), "distance": 5.806754687430835d }
+ { "point": point("39.28,70.48"), "distance": 9.292405501268227d }
+ { "point": point("40.09,92.69"), "distance": 24.832321679617472d }
+ { "point": point("47.51,83.99"), "distance": 22.41250097601782d }
+ { "point": point("36.21,72.6"), "distance": 6.73231758015024d }
+ { "point": point("46.05,93.34"), "distance": 28.325926286707734d }
+ { "point": point("36.86,74.62"), "distance": 8.270671073135482d }
+
+
+### spatial-area ###
+ * Syntax:
+
+ spatial-area(spatial_2d_expression)
+
+ * Returns the spatial area of `spatial_2d_expression`.
+ * Arguments:
+ * `spatial_2d_expression` : A `Rectangle`, `Circle`, or `Polygon`.
+ * Return Value:
+ * A `Double`, represents the area of `spatial_2d_expression`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $circleArea := spatial-area(create-circle(create-point(0.0,0.0), 5.0))
+ return {"Area":$circleArea}
+
+
+
+ * The expected result is:
+
+ { "Area": 78.53981625d }
+
+
+### spatial-intersect ###
+ * Syntax:
+
+ spatial-intersect(spatial_expression1, spatial_expression2)
+
+ * Checks whether `spatial_expression1` and `spatial_expression2` spatially intersect each other.
+ * Arguments:
+ * `spatial_expression1` : A `Point`, `Line`, `Rectangle`, `Circle`, or `Polygon`.
+ * `spatial_expression2` : A `Point`, `Line`, `Rectangle`, `Circle`, or `Polygon`.
+ * Return Value:
+ * A `Boolean`, represents whether `spatial_expression1` and `spatial_expression2` spatially intersect each other.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $t in dataset('TweetMessages')
+ where spatial-intersect($t.sender-location, create-rectangle(create-point(30.0,70.0), create-point(40.0,80.0)))
+ return $t
+
+
+ * The expected result is:
+
+ { "tweetid": "4", "user": { "screen-name": "NathanGiesen@211", "lang": "en", "friends_count": 39339, "statuses_count": 473, "name": "Nathan Giesen", "followers_count": 49416 }, "sender-location": point("39.28,70.48"), "send-time": datetime("2011-12-26T10:10:00.000Z"), "referred-topics": {{ "sprint", "voice-command" }}, "message-text": " like sprint the voice-command is mind-blowing:)" }
+ { "tweetid": "7", "user": { "screen-name": "ChangEwing_573", "lang": "en", "friends_count": 182, "statuses_count": 394, "name": "Chang Ewing", "followers_count": 32136 }, "sender-location": point("36.21,72.6"), "send-time": datetime("2011-08-25T10:10:00.000Z"), "referred-topics": {{ "samsung", "platform" }}, "message-text": " like samsung the platform is good" }
+ { "tweetid": "9", "user": { "screen-name": "NathanGiesen@211", "lang": "en", "friends_count": 39339, "statuses_count": 473, "name": "Nathan Giesen", "followers_count": 49416 }, "sender-location": point("36.86,74.62"), "send-time": datetime("2012-07-21T10:10:00.000Z"), "referred-topics": {{ "verizon", "voicemail-service" }}, "message-text": " love verizon its voicemail-service is awesome" }
+
+
+### spatial-cell ###
+ * Syntax:
+
+ spatial-cell(point_expression1, point_expression2, x_increment, y_increment)
+
+ * Returns the grid cell that `point_expression1` belongs to.
+ * Arguments:
+ * `point_expression1` : A `Point`, represents the point of interest whose grid cell will be returned.
+ * `point_expression2` : A `Point`, represents the origin of the grid.
+ * `x_increment` : A `Double`, represents X increments.
+ * `y_increment` : A `Double`, represents Y increments.
+ * Return Value:
+ * A `Rectangle`, represents the grid cell that `point_expression1` belongs to.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $t in dataset('TweetMessages')
+ group by $c := spatial-cell($t.sender-location, create-point(20.0,50.0), 5.5, 6.0) with $t
+ let $num := count($t)
+ return { "cell": $c, "count": $num}
+
+
+ * The expected result is:
+
+ { "cell": rectangle("20.0,92.0 25.5,98.0"), "count": 1 }
+ { "cell": rectangle("25.5,74.0 31.0,80.0"), "count": 2 }
+ { "cell": rectangle("31.0,62.0 36.5,68.0"), "count": 1 }
+ { "cell": rectangle("31.0,68.0 36.5,74.0"), "count": 1 }
+ { "cell": rectangle("36.5,68.0 42.0,74.0"), "count": 2 }
+ { "cell": rectangle("36.5,74.0 42.0,80.0"), "count": 1 }
+ { "cell": rectangle("36.5,92.0 42.0,98.0"), "count": 1 }
+ { "cell": rectangle("42.0,80.0 47.5,86.0"), "count": 1 }
+ { "cell": rectangle("42.0,92.0 47.5,98.0"), "count": 1 }
+ { "cell": rectangle("47.5,80.0 53.0,86.0"), "count": 1 }
+
+
+
+
+## Similarity Functions ##
+
+AsterixDB supports queries with different similarity functions, including edit distance and Jaccard.
+
+### edit-distance ###
+ * Syntax:
+
+ edit-distance(expression1, expression2)
+
+ * Returns the [edit distance](http://en.wikipedia.org/wiki/Levenshtein_distance) of `expression1` and `expression2`.
+ * Arguments:
+ * `expression1` : A `String` or a homogeneous `OrderedList` of a comparable item type.
+ * `expression2` : The same type as `expression1`.
+ * Return Value:
+ * An `Int32` that represents the edit-distance similarity of `expression1` and `expression2`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $user in dataset('FacebookUsers')
+ let $ed := edit-distance($user.name, "Suzanna Tilson")
+ where $ed <= 2
+ return $user
+
+
+ * The expected result is:
+
+ {
+ "id": 7, "alias": "Suzanna", "name": "SuzannaTillson", "user-since": datetime("2012-08-07T10:10:00.000Z"), "friend-ids": {{ 6 }},
+ "employment": [ { "organization-name": "Labzatron", "start-date": date("2011-04-19"), "end-date": null } ]
+ }
+
+
+### edit-distance-check ###
+ * Syntax:
+
+ edit-distance-check(expression1, expression2, threshold)
+
+ * Checks whether `expression1` and `expression2` have an [edit distance](http://en.wikipedia.org/wiki/Levenshtein_distance) `<= threshold`. The "check" version of edit distance is faster than the "non-check" version because the former can detect whether two items satisfy a given similarity threshold using early-termination techniques, as opposed to computing their real distance. Although possible, it is not necessary for the user to write queries using the "check" versions explicitly, since a rewrite rule can perform an appropriate transformation from a "non-check" version to a "check" version.
+
+ * Arguments:
+ * `expression1` : A `String` or a homogeneous `OrderedList` of a comparable item type.
+ * `expression2` : The same type as `expression1`.
+ * `threshold` : An `Int32` that represents the distance threshold.
+ * Return Value:
+ * An `OrderedList` with two items:
+ * The first item contains a `Boolean` value representing whether `expression1` and `expression2` are similar.
+ * The second item contains an `Int32` that represents the edit distance of `expression1` and `expression2` if it is `<= threshold`, or 0 otherwise.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $user in dataset('FacebookUsers')
+ let $ed := edit-distance-check($user.name, "Suzanna Tilson", 2)
+ where $ed[0]
+ return $ed[1]
+
+
+ * The expected result is:
+
+ 2
+
+
+### similarity-jaccard ###
+ * Syntax:
+
+ similarity-jaccard(list_expression1, list_expression2)
+
+ * Returns the [Jaccard similarity](http://en.wikipedia.org/wiki/Jaccard_index) of `list_expression1` and `list_expression2`.
+ * Arguments:
+ * `list_expression1` : An `UnorderedList` or `OrderedList`.
+ * `list_expression2` : An `UnorderedList` or `OrderedList`.
+ * Return Value:
+ * A `Float` that represents the Jaccard similarity of `list_expression1` and `list_expression2`.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $user in dataset('FacebookUsers')
+ let $sim := similarity-jaccard($user.friend-ids, [1,5,9])
+ where $sim >= 0.6f
+ return $user
+
+
+ * The expected result is:
+
+ {
+ "id": 3, "alias": "Emory", "name": "EmoryUnk", "user-since": datetime("2012-07-10T10:10:00.000Z"), "friend-ids": {{ 1, 5, 8, 9 }},
+ "employment": [ { "organization-name": "geomedia", "start-date": date("2010-06-17"), "end-date": date("2010-01-26") } ]
+ }
+ {
+ "id": 10, "alias": "Bram", "name": "BramHatch", "user-since": datetime("2010-10-16T10:10:00.000Z"), "friend-ids": {{ 1, 5, 9 }},
+ "employment": [ { "organization-name": "physcane", "start-date": date("2007-06-05"), "end-date": date("2011-11-05") } ]
+ }
+
+
+### similarity-jaccard-check ###
+ * Syntax:
+
+ similarity-jaccard-check(list_expression1, list_expression2, threshold)
+
+ * Checks whether `list_expression1` and `list_expression2` have a [Jaccard similarity](http://en.wikipedia.org/wiki/Jaccard_index) `>= threshold`. Again, the “check” version of Jaccard is faster than the "non-check" version.
+
+ * Arguments:
+ * `list_expression1` : An `UnorderedList` or `OrderedList`.
+ * `list_expression2` : An `UnorderedList` or `OrderedList`.
+ * `threshold` : A `Float` that represents the similarity threshold.
+ * Return Value:
+ * An `OrderedList` with two items:
+ * The first item contains a `Boolean` value representing whether `list_expression1` and `list_expression2` are similar.
+ * The second item contains a `Float` that represents the Jaccard similarity of `list_expression1` and `list_expression2` if it is `>= threshold`, or 0 otherwise.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $user in dataset('FacebookUsers')
+ let $sim := similarity-jaccard-check($user.friend-ids, [1,5,9], 0.6f)
+ where $sim[0]
+ return $sim[1]
+
+
+ * The expected result is:
+
+ 0.75f
+ 1.0f
+
+
+### Similarity Operator ~= ###
+ * "`~=`" is syntactic sugar for expressing a similarity condition with a given similarity threshold.
+ * The similarity function and threshold for "`~=`" are controlled via "set" directives.
+ * The "`~=`" operator returns a Boolean that represents whether the operands are similar.
+
+ * Example for Jaccard similarity:
+
+ use dataverse TinySocial;
+
+ set simfunction "jaccard";
+ set simthreshold "0.6f";
+
+ for $user in dataset('FacebookUsers')
+ where $user.friend-ids ~= [1,5,9]
+ return $user
+
+
+ * The expected result is:
+
+ {
+ "id": 3, "alias": "Emory", "name": "EmoryUnk", "user-since": datetime("2012-07-10T10:10:00.000Z"), "friend-ids": {{ 1, 5, 8, 9 }},
+ "employment": [ { "organization-name": "geomedia", "start-date": date("2010-06-17"), "end-date": date("2010-01-26") } ]
+ }
+ {
+ "id": 10, "alias": "Bram", "name": "BramHatch", "user-since": datetime("2010-10-16T10:10:00.000Z"), "friend-ids": {{ 1, 5, 9 }},
+ "employment": [ { "organization-name": "physcane", "start-date": date("2007-06-05"), "end-date": date("2011-11-05") } ]
+ }
+
+
+ * Example for edit-distance similarity:
+
+ use dataverse TinySocial;
+
+ set simfunction "edit-distance";
+ set simthreshold "2";
+
+ for $user in dataset('FacebookUsers')
+ where $user.name ~= "Suzanna Tilson"
+ return $user
+
+
+ * The expected output is:
+
+ {
+ "id": 7, "alias": "Suzanna", "name": "SuzannaTillson", "user-since": datetime("2012-08-07T10:10:00.000Z"), "friend-ids": {{ 6 }},
+ "employment": [ { "organization-name": "Labzatron", "start-date": date("2011-04-19"), "end-date": null } ]
+ }
+
+
+## Tokenizing Functions ##
+### word-tokens ###
+ * Syntax:
+
+ word-tokens(string_expression)
+
+ * Returns a list of word tokens of `string_expression`.
+ * Arguments:
+ * `string_expression` : A `String` that will be tokenized.
+ * Return Value:
+ * An `OrderedList` of `String` word tokens.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $t in dataset('TweetMessages')
+ let $tokens := word-tokens($t.message-text)
+ where $t.send-time >= datetime('2012-01-01T00:00:00')
+ return {
+ "tweetid": $t.tweetid,
+ "word-tokens": $tokens
+ }
+
+
+ * The expected result is:
+
+ { "tweetid": "9", "word-tokens": [ "love", "verizon", "its", "voicemail", "service", "is", "awesome" ] }
+
+
+### hashed-word-tokens ###
+ * Syntax:
+
+ hashed-word-tokens(string_expression)
+
+ * Returns a list of hashed word tokens of `string_expression`.
+ * Arguments:
+ * `string_expression` : A `String` that will be tokenized.
+ * Return Value:
+ * An `OrderedList` of `Int32` hashed tokens.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $t in dataset('TweetMessages')
+ let $tokens := hashed-word-tokens($t.message-text)
+ where $t.send-time >= datetime('2012-01-01T00:00:00')
+ return {
+ "tweetid": $t.tweetid,
+ "hashed-word-tokens": $tokens
+ }
+
+
+ * The expected result is:
+
+ { "tweetid": "9", "hashed-word-tokens": [ -1217719622, -447857469, -1884722688, -325178649, 210976949, 285049676, 1916743959 ] }
+
+
+### counthashed-word-tokens ###
+ * Syntax:
+
+ counthashed-word-tokens(string_expression)
+
+ * Returns a list of hashed word tokens of `string_expression`. The hashing mechanism gives duplicate tokens different hash values, based on the occurrence count of that token.
+ * Arguments:
+ * `string_expression` : A `String` that will be tokenized.
+ * Return Value:
+ * An `OrderedList` of `Int32` hashed tokens.
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $t in dataset('TweetMessages')
+ let $tokens := counthashed-word-tokens($t.message-text)
+ where $t.send-time >= datetime('2012-01-01T00:00:00')
+ return {
+ "tweetid": $t.tweetid,
+ "counthashed-word-tokens": $tokens
+ }
+
+
+ * The expected result is:
+
+ { "tweetid": "9", "counthashed-word-tokens": [ -1217719622, -447857469, -1884722688, -325178649, 210976949, 285049676, 1916743959 ] }
+
+
+### gram-tokens ###
+ * Syntax:
+
+ gram-tokens(string_expression, gram_length, boolean_expression)
+
+ * Returns a list of gram tokens of `string_expression`, which can be obtained by scanning the characters using a sliding window of a fixed length.
+ * Arguments:
+ * `string_expression` : A `String` that will be tokenized.
+ * `gram_length` : An `Int32` as the length of grams.
+ * `boolean_expression` : A `Boolean` value to indicate whether to generate additional grams by pre- and postfixing `string_expression` with special characters.
+ * Return Value:
+ * An `OrderedList` of `String` gram tokens.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $t in dataset('TweetMessages')
+ let $tokens := gram-tokens($t.message-text, 3, true)
+ where $t.send-time >= datetime('2012-01-01T00:00:00')
+ return {
+ "tweetid": $t.tweetid,
+ "gram-tokens": $tokens
+ }
+
+
+ * The expected result is:
+
+ {
+ "tweetid": "9",
+ "gram-tokens": [ "## ", "# l", " lo", "lov", "ove", "ve ", "e v", " ve", "ver", "eri", "riz", "izo", "zon", "on ", "n i", " it", "its", "ts ", "s v", " vo", "voi", "oic", "ice",
+ "cem", "ema", "mai", "ail", "il-", "l-s", "-se", "ser", "erv", "rvi", "vic", "ice", "ce ", "e i", " is", "is ", "s a", " aw", "awe", "wes", "eso", "som", "ome", "me$", "e$$" ]
+ }
+
+
+### hashed-gram-tokens ###
+ * Syntax:
+
+ hashed-gram-tokens(string_expression, gram_length, boolean_expression)
+
+ * Returns a list of hashed gram tokens of `string_expression`.
+ * Arguments:
+ * `string_expression` : A `String` that will be tokenized.
+ * `gram_length` : An `Int32` as the length of grams.
+ * `boolean_expression` : A `Boolean` to indicate whether to generate additional grams by pre- and postfixing `string_expression` with special characters.
+ * Return Value:
+ * An `OrderedList` of `Int32` hashed gram tokens.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $t in dataset('TweetMessages')
+ let $tokens := hashed-gram-tokens($t.message-text, 3, true)
+ where $t.send-time >= datetime('2012-01-01T00:00:00')
+ return {
+ "tweetid": $t.tweetid,
+ "hashed-gram-tokens": $tokens
+ }
+
+
+ * The expected result is:
+
+ {
+ "tweetid": "9",
+ "hashed-gram-tokens": [ 40557178, -2002241593, 161665899, -856104603, -500544946, 693410611, 395674299, -1015235909, 1115608337, 1187999872, -31006095, -219180466, -1676061637,
+ 1040194153, -1339307841, -1527110163, -1884722688, -179148713, -431014627, -1789789823, -1209719926, 684519765, -486734513, 1734740619, -1971673751, -932421915, -2064668066,
+ -937135958, -790946468, -69070309, 1561601454, 26169001, -160734571, 1330043462, -486734513, -18796768, -470303314, 113421364, 1615760212, 1688217556, 1223719184, 536568131,
+ 1682609873, 2935161, -414769471, -1027490137, 1602276102, 1050490461 ]
+ }
+
+
+### counthashed-gram-tokens ###
+ * Syntax:
+
+ counthashed-gram-tokens(string_expression, gram_length, boolean_expression)
+
+ * Returns a list of hashed gram tokens of `string_expression`. The hashing mechanism gives duplicate tokens different hash values, based on the occurrence count of that token.
+ * Arguments:
+ * `string_expression` : A `String` that will be tokenized.
+ * `gram_length` : An `Int32`, length of grams to generate.
+ * `boolean_expression` : A `Boolean`, whether to generate additional grams by pre- and postfixing `string_expression` with special characters.
+ * Return Value:
+ * An `OrderedList` of `Int32` hashed gram tokens.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $t in dataset('TweetMessages')
+ let $tokens := counthashed-gram-tokens($t.message-text, 3, true)
+ where $t.send-time >= datetime('2012-01-01T00:00:00')
+ return {
+ "tweetid": $t.tweetid,
+ "counthashed-gram-tokens": $tokens
+ }
+
+
+ * The expected result is:
+
+ {
+ "tweetid": "9",
+ "counthashed-gram-tokens": [ 40557178, -2002241593, 161665899, -856104603, -500544946, 693410611, 395674299, -1015235909, 1115608337, 1187999872, -31006095, -219180466, -1676061637,
+ 1040194153, -1339307841, -1527110163, -1884722688, -179148713, -431014627, -1789789823, -1209719926, 684519765, -486734513, 1734740619, -1971673751, -932421915, -2064668066, -937135958,
+ -790946468, -69070309, 1561601454, 26169001, -160734571, 1330043462, -486734512, -18796768, -470303314, 113421364, 1615760212, 1688217556, 1223719184, 536568131, 1682609873, 2935161,
+ -414769471, -1027490137, 1602276102, 1050490461 ]
+ }
+
+
+## Temporal Functions ##
+
+### date ###
+ * Syntax:
+
+ date(string_expression)
+
+ * Constructor function for `Date` type by parsing a date string `string_expression`
+ * Arguments:
+ * `string_expression` : The `String` value representing a date value.
+ * Return Value:
+ * A `Date` value represented by the given string.
+
+ * Example:
+
+ {
+ "date-extended": date("2013-04-01"),
+ "date-basic": date("20130401")
+ }
+
+
+ * The expected result is:
+
+ {
+ "date-extended": date("2013-04-01"),
+ "date-basic": date("2013-04-01")
+ }
+
+
+### time ###
+ * Syntax:
+
+ time(string_expression)
+
+ * Constructor function for `Time` type by parsing a time string `string_expression`
+ * Arguments:
+ * `string_expression` : The `String` value representing a time value.
+ * Return Value:
+ * A `Time` value represented by the given string.
+
+ * Example:
+
+ {
+ "time-extended": time("12:30:45.678+08:00"),
+ "time-basic": time("123045678+0800")
+ }
+
+
+ * The expected result is:
+
+ {
+ "time-extended": time("04:30:45.678Z"),
+ "time-basic": time("04:30:45.678Z")
+ }
+
+
+### datetime ###
+ * Syntax:
+
+ datetime(string_expression)
+
+ * Constructor function for `Datetime` type by parsing a datetime string `string_expression`
+ * Arguments:
+ * `string_expression` : The `String` value representing a datetime value.
+ * Return Value:
+ * A `Datetime` value represented by the given string.
+
+ * Example:
+
+ {
+ "datetime-extended": datetime("2013-04-01T12:30:45.678+08:00"),
+ "datetime-basic": datetime("20130401T123045678+0800")
+ }
+
+
+ * The expected result is:
+
+ {
+ "datetime-extended": datetime("2013-04-01T04:30:45.678Z"),
+ "datetime-basic": datetime("2013-04-01T04:30:45.678Z")
+ }
+
+
+### interval-from-date ###
+ * Syntax:
+
+ interval-from-date(string_expression1, string_expression2)
+
+ * Constructor function for `Interval` type by parsing two date strings.
+ * Arguments:
+ * `string_expression1` : The `String` value representing the starting date.
+ * `string_expression2` : The `String` value representing the ending date.
+ * Return Value:
+ * An `Interval` value between the two dates.
+
+ * Example:
+
+ {"date-interval": interval-from-date("2012-01-01", "2013-04-01")}
+
+
+ * The expected result is:
+
+ { "date-interval": interval-date("2012-01-01, 2013-04-01") }
+
+
+### interval-from-time ###
+ * Syntax:
+
+ interval-from-time(string_expression1, string_expression2)
+
+ * Constructor function for `Interval` type by parsing two time strings.
+ * Arguments:
+ * `string_expression1` : The `String` value representing the starting time.
+ * `string_expression2` : The `String` value representing the ending time.
+ * Return Value:
+ * An `Interval` value between the two times.
+
+ * Example:
+
+ {"time-interval": interval-from-time("12:23:34.456Z", "233445567+0800")}
+
+
+ * The expected result is:
+
+ { "time-interval": interval-time("12:23:34.456Z, 15:34:45.567Z") }
+
+
+### interval-from-datetime ###
+ * Syntax:
+
+ interval-from-datetime(string_expression1, string_expression2)
+
+ * Constructor function for `Interval` type by parsing two datetime strings.
+ * Arguments:
+ * `string_expression1` : The `String` value representing the starting datetime.
+ * `string_expression2` : The `String` value representing the ending datetime.
+ * Return Value:
+ * An `Interval` value between the two datetimes.
+
+ * Example:
+
+ {"datetime-interval": interval-from-datetime("2012-01-01T12:23:34.456+08:00", "20130401T153445567Z")}
+
+
+ * The expected result is:
+
+ { "datetime-interval": interval-datetime("2012-01-01T04:23:34.456Z, 2013-04-01T15:34:45.567Z") }
+
+
+### year/month/day/hour/minute/second/millisecond ###
+ * Syntax:
+
+ year/month/day/hour/minute/second/millisecond(temporal_expression)
+
+ * Accessors for accessing fields in a temporal value
+ * Arguments:
+ * `temporal_expression` : a temporal value represented as one of the following types: `Date`, `Datetime`, `Time`, `Duration`.
+ * Return Value:
+ * An `Int32` value representing the field to be extracted.
+
+ * Example:
+
+ let $c1 := date("2010-10-30")
+ let $c2 := datetime("1987-11-19T23:49:23.938")
+ let $c3 := time("12:23:34.930+07:00")
+ let $c4 := duration("P3Y73M632DT49H743M3948.94S")
+
+ return {"year": year($c1), "month": month($c2), "day": day($c1), "hour": hour($c3), "min": minute($c4), "second": second($c2), "ms": millisecond($c4)}
+
+
+ * The expected result is:
+
+ { "year": 2010, "month": 11, "day": 30, "hour": 5, "min": 28, "second": 23, "ms": 94 }
+
+
+
+### add-date-duration ###
+ * Syntax:
+
+ add-date-duration(date_expression, duration_expression)
+
+ * Create a new date by adding the duration `duration_expression` to the given date `date_expression`.
+ * Arguments:
+ * `date_expression` : The `Date` value to be added onto.
+ * `duration_expression` : The `Duration` to be added.
+ * Return Value:
+ * A `Date` value that represents the new date after being adjusted by the duration.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $startdate := date('2011-03-01')
+ for $i in dataset('TweetMessage')
+ where date-from-datetime($i.send-time) > $startdate
+ and date-from-datetime($i.send-time) < add-date-duration($startdate, duration('P2Y'))
+ return {"send-time": $i.send-time, "message": $i.message-text}
+
+
+ * The expected result is:
+
+ { "send-time": datetime("2011-12-26T10:10:00.000Z"), "message": " like sprint the voice-command is mind-blowing:)" }
+ { "send-time": datetime("2011-08-25T10:10:00.000Z"), "message": " like samsung the platform is good" }
+ { "send-time": datetime("2012-07-21T10:10:00.000Z"), "message": " love verizon its voicemail-service is awesome" }
+
+
+### add-datetime-duration ###
+ * Syntax:
+
+ add-datetime-duration(datetime_expression, duration_expression)
+
+ * Create a new datetime by adding the duration `duration_expression` to the given datetime `datetime_expression`.
+ * Arguments:
+ * `datetime_expression` : The `Datetime` value to be added onto.
+ * `duration_expression` : The `Duration` to be added.
+ * Return Value:
+ * A `Datetime` value that represents the new datetime after being adjusted by the duration.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $startdt := datetime('2011-03-01T00:00:00')
+ for $i in dataset('TweetMessage')
+ where $i.send-time > $startdt and $i.send-time < add-datetime-duration($startdt, duration('P2Y'))
+ return {"send-time": $i.send-time, "message": $i.message-text}
+
+
+ * The expected result is:
+
+ { "send-time": datetime("2011-12-26T10:10:00.000Z"), "message": " like sprint the voice-command is mind-blowing:)" }
+ { "send-time": datetime("2011-08-25T10:10:00.000Z"), "message": " like samsung the platform is good" }
+ { "send-time": datetime("2012-07-21T10:10:00.000Z"), "message": " love verizon its voicemail-service is awesome" }
+
+
+### add-time-duration ###
+ * Syntax:
+
+ add-time-duration(time_expression, duration_expression)
+
+ * Create a new time by adding the duration `duration_expression` to the given time `time_expression`.
+ * Arguments:
+ * `time_expression` : The `Time` value to be added onto.
+ * `duration_expression` : The `Duration` to be added.
+ * Return Value:
+ * A `Time` value that represents the new time after being adjusted by the duration.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $starttime := time('08:00:00')
+ for $i in dataset('TweetMessage')
+ where time-from-datetime($i.send-time) > $starttime and time-from-datetime($i.send-time) < add-time-duration($starttime, duration('PT5H'))
+ return {"send-time": $i.send-time, "message": $i.message-text}
+
+
+ * The expected result is:
+
+ { "send-time": datetime("2008-04-26T10:10:00.000Z"), "message": " love t-mobile its customization is good:)" }
+ { "send-time": datetime("2010-05-13T10:10:00.000Z"), "message": " like verizon its shortcut-menu is awesome:)" }
+ { "send-time": datetime("2006-11-04T10:10:00.000Z"), "message": " like motorola the speed is good:)" }
+ { "send-time": datetime("2011-12-26T10:10:00.000Z"), "message": " like sprint the voice-command is mind-blowing:)" }
+ { "send-time": datetime("2006-08-04T10:10:00.000Z"), "message": " can't stand motorola its speed is terrible:(" }
+ { "send-time": datetime("2010-05-07T10:10:00.000Z"), "message": " like iphone the voice-clarity is good:)" }
+ { "send-time": datetime("2011-08-25T10:10:00.000Z"), "message": " like samsung the platform is good" }
+ { "send-time": datetime("2005-10-14T10:10:00.000Z"), "message": " like t-mobile the shortcut-menu is awesome:)" }
+ { "send-time": datetime("2012-07-21T10:10:00.000Z"), "message": " love verizon its voicemail-service is awesome" }
+ { "send-time": datetime("2008-01-26T10:10:00.000Z"), "message": " hate verizon its voice-clarity is OMG:(" }
+ { "send-time": datetime("2008-03-09T10:10:00.000Z"), "message": " can't stand iphone its platform is terrible" }
+ { "send-time": datetime("2010-02-13T10:10:00.000Z"), "message": " like samsung the voice-command is amazing:)" }
+
+
+### adjust-datetime-for-timezone ###
+ * Syntax:
+
+ adjust-datetime-for-timezone(datetime_expression, string_expression)
+
+ * Adjust the given datetime `datetime_expression` by applying the timezone information `string_expression`
+ * Arguments:
+ * `datetime_expression` : A `Datetime` value to be adjusted.
+ * `string_expression` : A `String` representing the timezone information.
+ * Return Value:
+ * A `String` value that represents the new datetime after being adjusted by the timezone information.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('TweetMessage')
+ return {"adjusted-send-time": adjust-datetime-for-timezone($i.send-time, "+08:00"), "message": $i.message-text}
+
+
+ * The expected result is:
+
+ { "adjusted-send-time": "2008-04-26T18:10:00.000+08:00", "message": " love t-mobile its customization is good:)" }
+ { "adjusted-send-time": "2010-05-13T18:10:00.000+08:00", "message": " like verizon its shortcut-menu is awesome:)" }
+ { "adjusted-send-time": "2006-11-04T18:10:00.000+08:00", "message": " like motorola the speed is good:)" }
+ { "adjusted-send-time": "2011-12-26T18:10:00.000+08:00", "message": " like sprint the voice-command is mind-blowing:)" }
+ { "adjusted-send-time": "2006-08-04T18:10:00.000+08:00", "message": " can't stand motorola its speed is terrible:(" }
+ { "adjusted-send-time": "2010-05-07T18:10:00.000+08:00", "message": " like iphone the voice-clarity is good:)" }
+ { "adjusted-send-time": "2011-08-25T18:10:00.000+08:00", "message": " like samsung the platform is good" }
+ { "adjusted-send-time": "2005-10-14T18:10:00.000+08:00", "message": " like t-mobile the shortcut-menu is awesome:)" }
+ { "adjusted-send-time": "2012-07-21T18:10:00.000+08:00", "message": " love verizon its voicemail-service is awesome" }
+ { "adjusted-send-time": "2008-01-26T18:10:00.000+08:00", "message": " hate verizon its voice-clarity is OMG:(" }
+ { "adjusted-send-time": "2008-03-09T18:10:00.000+08:00", "message": " can't stand iphone its platform is terrible" }
+ { "adjusted-send-time": "2010-02-13T18:10:00.000+08:00", "message": " like samsung the voice-command is amazing:)" }
+
+
+### adjust-time-for-timezone ###
+ * Syntax:
+
+ adjust-time-for-timezone(time_expression, string_expression)
+
+ * Adjust the given time `time_expression` by applying the timezone information `string_expression`
+ * Arguments:
+ * `time_expression` : A `Time` value to be adjusted.
+ * `string_expression` : A `String` representing the timezone information.
+ * Return Value:
+ * A `String` value that represents the new time after being adjusted by the timezone information.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('TweetMessage')
+ return {"adjusted-send-time": adjust-time-for-timezone(time-from-datetime($i.send-time), "+08:00"), "message": $i.message-text}
+
+
+ * The expected result is:
+
+ { "adjusted-send-time": "18:10:00.000+08:00", "message": " love t-mobile its customization is good:)" }
+ { "adjusted-send-time": "18:10:00.000+08:00", "message": " like verizon its shortcut-menu is awesome:)" }
+ { "adjusted-send-time": "18:10:00.000+08:00", "message": " like motorola the speed is good:)" }
+ { "adjusted-send-time": "18:10:00.000+08:00", "message": " like sprint the voice-command is mind-blowing:)" }
+ { "adjusted-send-time": "18:10:00.000+08:00", "message": " can't stand motorola its speed is terrible:(" }
+ { "adjusted-send-time": "18:10:00.000+08:00", "message": " like iphone the voice-clarity is good:)" }
+ { "adjusted-send-time": "18:10:00.000+08:00", "message": " like samsung the platform is good" }
+ { "adjusted-send-time": "18:10:00.000+08:00", "message": " like t-mobile the shortcut-menu is awesome:)" }
+ { "adjusted-send-time": "18:10:00.000+08:00", "message": " love verizon its voicemail-service is awesome" }
+ { "adjusted-send-time": "18:10:00.000+08:00", "message": " hate verizon its voice-clarity is OMG:(" }
+ { "adjusted-send-time": "18:10:00.000+08:00", "message": " can't stand iphone its platform is terrible" }
+ { "adjusted-send-time": "18:10:00.000+08:00", "message": " like samsung the voice-command is amazing:)" }
+
+
+### calendar-duration-from-datetime ###
+ * Syntax:
+
+ calendar-duration-from-datetime(datetime_expression, duration_expression)
+
+ * Get a user-friendly representation of the duration `duration_expression` based on the given datetime `datetime_expression`
+ * Arguments:
+ * `datetime_expression` : A `Datetime` value to be used as the reference time point.
+ * `duration_expression` : A `Duration` value to be converted
+ * Return Value:
+ * A `Duration` value with the duration as `duration_expression` but with a user-friendly representation.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('TweetMessage')
+ where $i.send-time > datetime("2011-01-01T00:00:00")
+ return {"since-2011": subtract-datetime($i.send-time, datetime("2011-01-01T00:00:00")), "since-2011-user-friendly": calendar-duration-from-datetime($i.send-time, subtract-datetime($i.send-time, datetime("2011-01-01T00:00:00")))}
+
+
+ * The expected result is:
+
+ { "since-2011": duration("P359DT10H10M"), "since-2011-user-friendly": duration("P11M23DT10H10M") }
+ { "since-2011": duration("P236DT10H10M"), "since-2011-user-friendly": duration("P7M23DT10H10M") }
+ { "since-2011": duration("P567DT10H10M"), "since-2011-user-friendly": duration("P1Y6M18DT10H10M") }
+
+
+### calendar-duration-from-date ###
+ * Syntax:
+
+ calendar-duration-from-date(date_expression, duration_expression)
+
+ * Get a user-friendly representation of the duration `duration_expression` based on the given date `date_expression`
+ * Arguments:
+ * `date_expression` : A `Date` value to be used as the reference time point.
+ * `duration_expression` : A `Duration` value to be converted
+ * Return Value:
+ * A `Duration` value with the duration as `duration_expression` but with a user-friendly representation.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('TweetMessage')
+ where $i.send-time > datetime("2011-01-01T00:00:00")
+ return {"since-2011": subtract-datetime($i.send-time, datetime("2011-01-01T00:00:00")),
+ "since-2011-user-friendly": calendar-duration-from-date(date-from-datetime($i.send-time), subtract-datetime($i.send-time, datetime("2011-01-01T00:00:00")))}
+
+
+ * The expected result is:
+
+ { "since-2011": duration("P359DT10H10M"), "since-2011-user-friendly": duration("P11M23DT10H10M") }
+ { "since-2011": duration("P236DT10H10M"), "since-2011-user-friendly": duration("P7M23DT10H10M") }
+ { "since-2011": duration("P567DT10H10M"), "since-2011-user-friendly": duration("P1Y6M18DT10H10M") }
+
+
+### current-date ###
+ * Syntax:
+
+ current-date()
+
+ * Get the current date
+ * Arguments: None
+ * Return Value:
+ * A `Date` value of the date when the function is called.
+
+### current-time ###
+ * Syntax:
+
+ current-time()
+
+ * Get the current time
+ * Arguments: None
+ * Return Value:
+ * A `Time` value of the time when the function is called.
+
+### current-datetime ###
+ * Syntax:
+
+ current-datetime()
+
+ * Get the current datetime
+ * Arguments: None
+ * Return Value:
+ * A `Datetime` value of the datetime when the function is called.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ {"current-date": current-date(),
+ "current-time": current-time(),
+ "current-datetime": current-datetime()}
+
+
+ * The expected result is:
+
+ { "current-date": date("2013-04-06"),
+ "current-time": time("00:48:44.093Z"),
+ "current-datetime": datetime("2013-04-06T00:48:44.093Z") }
+
+
+### date-from-datetime ###
+ * Syntax:
+
+ date-from-datetime(datetime_expression)
+
+ * Get the date value from the given datetime value `datetime_expression`
+ * Arguments:
+ * `datetime_expression`: A `Datetime` value to be extracted from
+ * Return Value:
+ * A `Date` value from the datetime.
+
+### time-from-datetime ###
+ * Syntax:
+
+ time-from-datetime(datetime_expression)
+
+ * Get the time value from the given datetime value `datetime_expression`
+ * Arguments:
+ * `datetime_expression`: A `Datetime` value to be extracted from
+ * Return Value:
+ * A `Time` value from the datetime.
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('TweetMessage')
+ where $i.send-time > datetime("2011-01-01T00:00:00")
+ return {"send-date": date-from-datetime($i.send-time), "send-time": time-from-datetime($i.send-time)}
+
+
+ * The expected result is:
+
+ { "send-date": date("2011-12-26"), "send-time": time("10:10:00.000Z") }
+ { "send-date": date("2011-08-25"), "send-time": time("10:10:00.000Z") }
+ { "send-date": date("2012-07-21"), "send-time": time("10:10:00.000Z") }
+
+
+### date-from-unix-time-in-days ###
+ * Syntax:
+
+ date-from-unix-time-in-days(numeric_expression)
+
+ * Get date representing the time after `numeric_expression` days since 1970-01-01
+ * Arguments:
+ * `numeric_expression`: An `Int8`/`Int16`/`Int32` value representing the number of days
+ * Return Value:
+ * A `Date` value as the time after `numeric_expression` days since 1970-01-01
+
+### datetime-from-unix-time-in-ms ###
+ * Syntax:
+
+ datetime-from-unix-time-in-ms(numeric_expression)
+
+ * Get datetime representing the time after `numeric_expression` milliseconds since 1970-01-01T00:00:00Z
+ * Arguments:
+ * `numeric_expression`: An `Int8`/`Int16`/`Int32`/`Int64` value representing the number of milliseconds
+ * Return Value:
+ * A `Datetime` value as the time after `numeric_expression` milliseconds since 1970-01-01T00:00:00Z
+
+### time-from-unix-time-in-ms ###
+ * Syntax:
+
+ time-from-unix-time-in-ms(numeric_expression)
+
+ * Get time representing the time after `numeric_expression` milliseconds since 00:00:00.000Z
+ * Arguments:
+ * `numeric_expression`: An `Int8`/`Int16`/`Int32` value representing the number of milliseconds
+ * Return Value:
+ * A `Time` value as the time after `numeric_expression` milliseconds since 00:00:00.000Z
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ let $d := date-from-unix-time-in-days(15800)
+ let $dt := datetime-from-unix-time-in-ms(1365139700000)
+ let $t := time-from-unix-time-in-ms(3748)
+ return {"date": $d, "datetime": $dt, "time": $t}
+
+
+ * The expected result is:
+
+ { "date": date("2013-04-05"), "datetime": datetime("2013-04-05T05:28:20.000Z"), "time": time("00:00:03.748Z") }
+
+
+### subtract-date ###
+ * Syntax:
+
+ subtract-date(date_start, date_end)
+
+ * Get the duration between two dates `date_start` and `date_end`
+ * Arguments:
+ * `date_start`: the starting `Date`
+ * `date_end`: the ending `Date`
+ * Return Value:
+ * A `Duration` value between `date_start` and `date_end`
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('FacebookUser')
+ for $j in dataset('FacebookUser')
+ where $i.user-since < $j.user-since and $i.user-since > datetime("2012-01-01T00:00:00")
+ return {"id1": $i.id, "id2": $j.id, "diff": subtract-date(date-from-datetime($j.user-since), date-from-datetime($i.user-since))}
+
+
+ * The expected result is:
+
+ { "id1": 3, "id2": 1, "diff": duration("P41D") }
+ { "id1": 3, "id2": 7, "diff": duration("P28D") }
+ { "id1": 7, "id2": 1, "diff": duration("P13D") }
+
+
+### subtract-time ###
+ * Syntax:
+
+ subtract-time(time_start, time_end)
+
+ * Get the duration between two times `time_start` and `time_end`
+ * Arguments:
+ * `time_start`: the starting `Time`
+ * `time_end`: the ending `Time`
+ * Return Value:
+ * A `Duration` value between `time_start` and `time_end`
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('FacebookUser')
+ for $j in dataset('FacebookUser')
+ where $i.user-since < $j.user-since and $i.user-since > datetime("2012-01-01T00:00:00")
+ return {"id1": $i.id, "id2": $j.id, "diff": subtract-time(time-from-datetime($j.user-since), time("02:50:48.938"))}
+
+
+ * The expected result is:
+
+ { "id1": 3, "id2": 1, "diff": duration("PT7H19M11.62S") }
+ { "id1": 3, "id2": 7, "diff": duration("PT7H19M11.62S") }
+ { "id1": 7, "id2": 1, "diff": duration("PT7H19M11.62S") }
+
+
+### subtract-datetime ###
+ * Syntax:
+
+ subtract-datetime(datetime_start, datetime_end)
+
+ * Get the duration between two datetimes `datetime_start` and `datetime_end`
+ * Arguments:
+ * `datetime_start`: the starting `Datetime`
+ * `datetime_end`: the ending `Datetime`
+ * Return Value:
+ * A `Duration` value between `datetime_start` and `datetime_end`
+
+ * Example:
+
+ use dataverse TinySocial;
+
+ for $i in dataset('FacebookUser')
+ for $j in dataset('FacebookUser')
+ where $i.user-since < $j.user-since and $i.user-since > datetime("2011-01-01T00:00:00")
+ return {"id1": $i.id, "id2": $j.id, "diff": subtract-datetime($j.user-since, $i.user-since)}
+
+
+ * The expected result is:
+
+ { "id1": 2, "id2": 1, "diff": duration("P576D") }
+ { "id1": 2, "id2": 3, "diff": duration("P535D") }
+ { "id1": 2, "id2": 7, "diff": duration("P563D") }
+ { "id1": 3, "id2": 1, "diff": duration("P41D") }
+ { "id1": 3, "id2": 7, "diff": duration("P28D") }
+ { "id1": 7, "id2": 1, "diff": duration("P13D") }
+
diff --git a/asterix-doc/src/site/markdown/AsterixQueryLanguage.md b/asterix-doc/src/site/markdown/AsterixQueryLanguage.md
new file mode 100644
index 0000000..af25cda
--- /dev/null
+++ b/asterix-doc/src/site/markdown/AsterixQueryLanguage.md
@@ -0,0 +1,195 @@
+`<wiki:toc max_depth="2" />`
+
+# The Asterix Query Language, Version 1.0 #
+
+# Introduction #
+
+This wiki page provides an overview of the Asterix Query language and the Asterix Data model.
+
+*WARNING:* _THIS IS AN INCOMPLETE SUSPENDED WORK IN PROGRESS..._
+It will hopefully be resumed shortly in order to produce a legit AQL spec to go out with the Beta Release of AsterixDB. What's here is very likely inconsistent with what's in the system as of today, as this was from an older snapshot of the world.
+
+# Asterix Data Model #
+
+Data in Asterix is represented using the Asterix Data Model (ADM). The ADM derives inspiration from prior standards such as JSON, XQuery, and the Object Data Model from ODMG.
+
+## Asterix Types ##
+
+### Primitive Types ###
+
+|| *Primitive Type* || *Description* ||
+|| int8 || Signed 8-bit integer. Valid range -128 thru 127 ||
+|| int16 || Signed 16-bit integer. Valid range -32768 thru 32767 ||
+|| int32 || Signed 32-bit integer. Valid range -2147483648 thru 2147483647 ||
+|| int64 || Signed 64-bit integer. Valid range -9223372036854775808 thru 9223372036854775807 ||
+|| uint8 || Unsigned 8-bit integer. Valid range 0 thru 255 ||
+|| uint16 || Unsigned 16-bit integer. Valid range 0 thru 65535 ||
+|| uint32 || Unsigned 32-bit integer. Valid range 0 thru 4294967295 ||
+|| uint64 || Unsigned 64-bit integer. Valid range 0 thru 18446744073709551615 ||
+|| string || String of characters ||
+|| null || null type (Type of the null value) ||
+|| date || Date ||
+|| time || Time of day ||
+|| boolean || Boolean ||
+|| datetime || Date and time ||
+|| point2d || A point in 2-D space ||
+|| point3d || A point in 3-D space ||
+|| binary || Binary data ||
+|| yminterval || Year-Month interval ||
+|| dtinterval || Day-Time interval ||
+|| interval || Year-Month and Day-Time interval ||
+
+### Collection Types ###
+
+|| *Collection Type* || *Description* ||
+|| Record || A record type describes the record data item. A record contains a set of fields which can have values of any ADM type. Fields of a record must be unique. ||
+|| Union || A union type is an abstract type (A value never has a union type) that describes a set of type choices. ||
+|| Ordered List || An orderedlist instance represents a sequence of values where the order of the instances is determined by creation/insertion ||
+|| UnorderedList || An unorderedlist instance represents a collection of values where the order of the instances is irrelevant ||
+|| Enumeration || An enumeration type represents a choice of string values ||
+
+# AQL Expressions #
+
+## Primary Expressions ##
+
+Primary expressions are the basic expressions that form the core of AQL.
+
+### Literals ###
+
+A Literal is a syntactic representation of a constant value. The various literals allowed in AQL are described in the table below.
+
+|| *Literal type* || *Syntax* ||
+|| StringLiteral || ` STRING_LITERAL : ("\"" ("\\\"" | ~["\""])* "\"") | ("\'"("\\\'" | ~["\'"])* "\'") ` ||
+|| IntegerLiteral || ` INTEGER_LITERAL : (["0" - "9"])+ ` ||
+|| FloatLiteral || ` FLOAT_LITERAL: ((["0" - "9"])* "." (["0" - "9"])+ ("f" | "F")) ` ||
+|| DoubleLiteral || ` DOUBLE_LITERAL: ((["0" - "9"])* "." (["0" - "9"])+) ` ||
+|| NullLiteral || ` NULL_LITERAL: "null" ` ||
+|| BooleanLiteral || ` BOOLEAN_LITERAL: "true" | "false" ` ||
+
+### Function Call ###
+
+Function Calls in AQL can be used to invoke builtin functions as well as user defined functions.
+Function Calls have the following syntax.
+
+
+ IDENTIFIER "(" ( Expression ( "," Expression )* )? ")"
+
+
+### Variable Reference ###
+
+Variables in AQL are used to bind to values. Variables can be bound to values by the For, Let, Group by clauses of the FLWOR expressions. Variables can also be bound by
+the Quantified Expressions.
+
+### Ordered List Constructor ###
+
+Constructs an ordered list. An ordered list represents a collection of values. The order of values is relevant. The collection may contain duplicate values.
+
+### Unordered List Constructor ###
+
+Constructs an unordered list. An unordered list represents a collection of values. The order of values is not relevant. The collection may contain duplicate values.
+
+### Record Constructor ###
+
+Constructs an AQL Record. A record contains fields. Each field has a name and a value. The name of the field is of type string. The value of a field may be any legal ADM data type. A record may not contain duplicate fields.
+
+## Arithmetic Expressions ##
+
+AQL allows all the standard arithmetic operators on numeric data types. The specific operators allowed are:
+
+|| *Operator* || *Description* ||
+|| + || Add ||
+|| - || Subtract ||
+|| * || Multiply ||
+|| / || Divide ||
+|| mod || Modulo ||
+
+## Comparison Expressions ##
+
+AQL provides the six standard comparison expressions listed below. In addition, AQL supports fuzzy comparisons.
+
+|| *Operator* || *Description* ||
+|| = || Equal ||
+|| != || Not Equal ||
+|| `<` || Less Than ||
+|| `<=` || Less Than or Equal ||
+|| `>` || Greater Than ||
+|| `>=` || Greater Than or Equal ||
+|| `>=` || Greater Than or Equal ||
+|| ~= || Fuzzy Equals ||
+
+## Logical Expressions ##
+
+AQL provides two logical connectors:
+
+|| *Operator* || *Description* ||
+|| and || Logical AND ||
+|| or || Logical OR ||
+
+## Field Access Expressions ##
+
+The "." operator is used to access fields of a record. For example,
+
+
+ $x.name
+
+
+accesses the name field of the record bound to $x.
+
+## Indexed Expressions ##
+
+Indexed expressions are used to access values in an ordered list. For example,
+
+
+ $x[5]
+
+
+accesses the 6th item in the list bound to $x. Indexes start at 0.
+
+## FLWOR Expression ##
+
+The FLWOR expression is the most elaborate expression in AQL. It is made up of two parts -- Clauses and the Return Expression.
+
+The syntax of the FLWOR expression is:
+
+
+
+ ( ForClause | LetClause )
+ ( ForClause | LetClause | WhereClause | OrderClause | GroupClause | LimitClause | DistinctClause )*
+ "return" ReturnExpression
+
+
+
+* For Clause
+
+ "for" Variable "in" Expression
+
+* Let Clause
+
+ "let" Variable ":=" Expression
+
+* Where Clause
+
+ "where" Expression
+
+* Order Clause
+
+ "order" "by" Expression ("asc" | "desc") ("," Expression ("asc" | "desc"))*
+
+* Group Clause
+
+ "group" "by" ((Variable ":=")? Expression) ("," ((Variable ":=")? Expression))* "with" Variable
+
+* Limit Clause
+
+ "limit" Expression ("," Expression)?
+
+
+## If Then Else Expressions ##
+
+ "if" "(" Expression ")" "then" Expression "else" Expression
+
+
+## Quantified Expressions ##
+
+ ("some" | "every") Variable "in" Expression "satisfies" Expression
+
diff --git a/asterix-doc/src/site/markdown/aql.md b/asterix-doc/src/site/markdown/AsterixQueryLanguageReference.md
similarity index 95%
rename from asterix-doc/src/site/markdown/aql.md
rename to asterix-doc/src/site/markdown/AsterixQueryLanguageReference.md
index c03d7e5..c9ed3e2 100644
--- a/asterix-doc/src/site/markdown/aql.md
+++ b/asterix-doc/src/site/markdown/AsterixQueryLanguageReference.md
@@ -110,7 +110,8 @@
DataverseDeclaration ::= "use" "dataverse" Identifier
SetStatement ::= "set" Identifier StringLiteral
- FunctionDeclaration ::= "declare" "function" Identifier <LEFTPAREN> ( <VARIABLE> ( "," <VARIABLE> )* )? <RIGHTPAREN> "{" Expression "}"
+ FunctionDeclaration ::= "declare" "function" Identifier ParameterList "{" Expression "}"
+ ParameterList ::= <LEFTPAREN> ( <VARIABLE> ( "," <VARIABLE> )* )? <RIGHTPAREN>
### Lifecycle Management Statements
@@ -143,7 +144,7 @@
"using" AdapterName Configuration ( "hints" Properties )?
| "feed" <DATASET> QualifiedName <LEFTPAREN> Identifier <RIGHTPAREN> IfNotExists
"using" AdapterName Configuration ( ApplyFunction )? PrimaryKey ( "on" Identifier )? ( "hints" Properties )?
- | <DATASET> QualifiedName <LEFTPAREN> Identifier <RIGHTPAREN> IfNotExists
+ | "internal"? <DATASET> QualifiedName <LEFTPAREN> Identifier <RIGHTPAREN> IfNotExists
PrimaryKey ( "on" Identifier )? ( "hints" Properties )?
AdapterName ::= Identifier
Configuration ::= <LEFTPAREN> ( KeyValuePair ( "," KeyValuePair )* )? <RIGHTPAREN>
@@ -165,8 +166,7 @@
#### Functions
- FunctionSpecification ::= "function" FunctionOrTypeName IfNotExists <LEFTPAREN> ( <VARIABLE> ( "," <VARIABLE> )* )? <RIGHTPAREN> "{" Expression "}"
-
+ FunctionSpecification ::= "function" FunctionOrTypeName IfNotExists ParameterList "{" Expression "}"
### Import/Export Statements
diff --git a/asterix-doc/src/site/markdown/AsterixSimilarityQueries.md b/asterix-doc/src/site/markdown/AsterixSimilarityQueries.md
new file mode 100644
index 0000000..4f22fef
--- /dev/null
+++ b/asterix-doc/src/site/markdown/AsterixSimilarityQueries.md
@@ -0,0 +1,83 @@
+# AsterixDB Support of Similarity Queries #
+
+## Motivation ##
+
+Similarity queries are widely used in applications where users need to find records that satisfy a similarity predicate, while exact matching is not sufficient. These queries are especially important for social and Web applications, where errors, abbreviations, and inconsistencies are common. As an example, we may want to find all the movies starring Schwarzenegger, while we don't know the exact spelling of his last name (despite his popularity in both the movie industry and politics :-)). As another example, we want to find all the Facebook users who have similar friends. To meet these kinds of needs, AsterixDB supports similarity queries using efficient indexes and algorithms.
+
+## Data Types and Similarity Functions ##
+
+AsterixDB supports various similarity functions, including [edit distance](http://en.wikipedia.org/wiki/Levenshtein_distance) (on strings) and [Jaccard](http://en.wikipedia.org/wiki/Jaccard_index) (on sets). For instance, in our [TinySocial](AdmAql101.html#ADM:_Modeling_Semistructed_Data_in_AsterixDB) example, the `friend-ids` of a Facebook user forms a set of friends, and we can define a similarity between two sets. We can also convert a string to a set of "q-grams" and define the Jaccard similarity between the two sets of two strings. The "q-grams" of a string are its substrings of length "q". For instance, the 3-grams of the string `schwarzenegger` are `sch`, `chw`, `hwa`, ..., `ger`.
+
+AsterixDB provides [tokenization functions](AsterixDataTypesAndFunctions.html#Tokenizing_Functions) to convert strings to sets, and the [similarity functions](AsterixDataTypesAndFunctions.html#Similarity_Functions).
+
+## Selection Queries ##
+
+The following [query](AsterixDataTypesAndFunctions.html#edit-distance) asks for all the Facebook users whose name is similar to `Suzanna Tilson`, i.e., their edit distance is at most 2.
+
+
+ use dataverse TinySocial;
+
+ for $user in dataset('FacebookUsers')
+ let $ed := edit-distance($user.name, "Suzanna Tilson")
+ where $ed <= 2
+ return $user
+
+
+The following [query](AsterixDataTypesAndFunctions.html#similarity-jaccard) asks for all the Facebook users whose set of friend ids is similar to `[1,5,9]`, i.e., their Jaccard similarity is at least 0.6.
+
+
+ use dataverse TinySocial;
+
+ for $user in dataset('FacebookUsers')
+ let $sim := similarity-jaccard($user.friend-ids, [1,5,9])
+ where $sim >= 0.6f
+ return $user
+
+
+AsterixDB allows a user to use a similarity operator `~=` to express a similarity condition by defining the similarity function and threshold using "set" statements earlier. For instance, the above query can be equivalently written as:
+
+
+ use dataverse TinySocial;
+
+ set simfunction "jaccard";
+ set simthreshold "0.6f";
+
+ for $user in dataset('FacebookUsers')
+ where $user.friend-ids ~= [1,5,9]
+ return $user
+
+
+
+## Fuzzy Join Queries ##
+
+AsterixDB supports fuzzy joins between two data sets. The following [query](AdmAql101.html#Query_5_-_Fuzzy_Join) finds, for each Facebook user, all Twitter users with names "similar" to their name based on the edit distance.
+
+
+ use dataverse TinySocial;
+
+ set simfunction "edit-distance";
+ set simthreshold "3";
+
+ for $fbu in dataset FacebookUsers
+ return {
+ "id": $fbu.id,
+ "name": $fbu.name,
+ "similar-users": for $t in dataset TweetMessages
+ let $tu := $t.user
+ where $tu.name ~= $fbu.name
+ return {
+ "twitter-screenname": $tu.screen-name,
+ "twitter-name": $tu.name
+ }
+ };
+
+
+## Using Indexes ##
+
+AsterixDB uses inverted indexes to support similarity queries efficiently. For instance, the following statement creates such an index on the `FacebookUsers.name` attribute using an inverted index of 3-grams. After the index is created, similarity queries with an edit distance condition on this attribute can be answered more efficiently.
+
+
+ use dataverse TinySocial;
+
+ create index fbUserFuzzyIdx on FacebookUsers(name) type ngram(3);
+
diff --git a/asterix-doc/src/site/markdown/InstallingAsterixUsingManagix.md b/asterix-doc/src/site/markdown/InstallingAsterixUsingManagix.md
new file mode 100644
index 0000000..65aebdf
--- /dev/null
+++ b/asterix-doc/src/site/markdown/InstallingAsterixUsingManagix.md
@@ -0,0 +1,808 @@
+# Introduction #
+This is a quickstart guide for getting ASTERIX running in a distributed environment. This guide also introduces the ASTERIX installer (nicknamed _*Managix*_) and describes how it can be used to create/manage an ASTERIX instance. By following the simple steps described in this guide, you will get a running instance of ASTERIX. You shall be able to use ASTERIX from its Web interface and manage its lifecycle using Managix. This document assumes that you are running some version of _*Linux*_ or _*MacOS X*_.
+
+## Prerequisites for Installing ASTERIX ##
+Prerequisite:
+
+ * [JDK7](http://www.oracle.com/technetwork/java/javase/downloads/index.html) (Otherwise known as JDK 1.7).
+
+To know the version of Java installed on your system, execute the following:
+
+ $ java -version
+
+If your version is 1.7.0_x, similar to the output shown below, you are good to proceed.
+
+
+ java version "1.7.0_13"
+ Java(TM) SE Runtime Environment (build 1.7.0_13-b20)
+ Java HotSpot(TM) 64-Bit Server VM (build 23.7-b01, mixed mode)
+
+If you need to upgrade or install java, please follow the instructions below.
+
+ * For Linux: [JDK 7 Linux Install](http://docs.oracle.com/javase/7/docs/webnotes/install/linux/linux-jdk.html)
+JDK would be installed at a path under /usr/lib/jvm/jdk-version .
+
+ * For Mac: [JDK 7 Mac Install](http://docs.oracle.com/javase/7/docs/webnotes/install/mac/mac-jdk.html)
+JDK would be installed at /Library/Java/JavaVirtualMachines/jdk-version/Contents/Home .
+
+The Java installation directory is referred to as JAVA_HOME. Since we upgraded/installed Java, we need to ensure JAVA_HOME points to the installation directory of JDK 7. Modify your ~/.bash_profile (or ~/.bashrc) and define JAVA_HOME accordingly. After modifying, execute the following:
+
+
+ $ java -version
+
+If the version information you obtain does not show 1.7, you need to update the PATH variable. To do so, execute the following:
+
+
+ $ echo "PATH=$JAVA_HOME/bin:$PATH" >> ~/.bash_profile (or ~/.bashrc)
+ $ source ~/.bash_profile (or ~/.bashrc)
+
+We also need to ensure that $JAVA_HOME/bin is in the PATH. To check whether $JAVA_HOME/bin is already in the PATH, simply execute the following:
+
+
+ $ java
+
+If you get the following message, you need to alter the PATH variable in your ~/.bash_profile or ~/.bashrc (whichever you use).
+
+
+ -bash: java: command not found
+
+## Section 1: Single-Machine ASTERIX installation ##
+We assume a user Joe with a home directory as /home/joe. Please note that on Mac, the home directory for user Joe would be /Users/joe.
+
+### Configuring Environment ###
+Ensure that the JAVA_HOME variable is defined and points to the Java installation directory on your machine. To verify, execute the following.
+
+
+ $ echo $JAVA_HOME
+
+If you do not see any output, JAVA_HOME is not defined. You need to add the following line to your profile located at /home/joe/.bash_profile or /home/joe/.bashrc, whichever you are using. If you do not have any of these files, create a ~/.bash_profile.
+
+
+ export JAVA_HOME=<Path to Java installation directory>
+
+After you have edited ~/.bash_profile (or ~/.bashrc), execute the following to make the changes effective in current shell.
+
+
+ $ source /home/joe/.bash_profile (or /home/joe/.bashrc)
+
+Before proceeding, verify that JAVA_HOME is defined by executing the following.
+
+
+ $ echo $JAVA_HOME
+
+### Configuring SSH ###
+If SSH is not enabled on your system, please follow the instruction below to enable/install it or else skip to the section [Configuring Password-less SSH](#Configuring_Password-less_SSH).
+
+#### Enabling SSH on Mac ####
+The Apple Mac OS X operating system has SSH installed by default but the SSH daemon is not enabled. This means you can’t login remotely or do remote copies until you enable it. To enable it, go to ‘System Preferences’. Under ‘Internet & Networking’ there is a ‘Sharing’ icon. Run that. In the list that appears, check the ‘Remote Login’ option. Also check the "All users" radio button for "Allow access for". This starts the SSH daemon immediately and you can remotely login using your username. The ‘Sharing’ window shows at the bottom the name and IP address to use. You can also find this out using ‘whoami’ and ‘ifconfig’ from the Terminal application.
+
+#### Enabling SSH on Linux ####
+
+ sudo apt-get install openssh-server
+
+Assuming that you have enabled SSH on your system, let us proceed.
+
+#### Configuring Password-less SSH ####
+
+For our single-machine setup of ASTERIX, we need to configure password-less SSH access to localhost. We assume that you are on the machine where you want to install ASTERIX. To verify if you already have password-less SSH configured, execute the following.
+
+
+ $ ssh 127.0.0.1
+
+If you get an output similar to one shown below, type "yes" and press enter.
+
+
+ The authenticity of host '127.0.0.1 (127.0.0.1)' can't be established.
+ RSA key fingerprint is aa:7b:51:90:74:39:c4:f6:28:a2:9d:47:c2:8d:33:31.
+ Are you sure you want to continue connecting (yes/no)?
+
+If you are not prompted for a password, that is if you get an output similar to one shown below, skip to the next section [Configuring Managix](#Configuring_Managix).
+
+
+ $ ssh 127.0.0.1
+ Last login: Sat Mar 23 22:52:49 2013
+
+You are here because you were prompted for a password. You need to configure password-less SSH. Follow the instructions below.
+
+
+ $ ssh-keygen -t rsa -P ""
+ Generating public/private rsa key pair.
+ Enter file in which to save the key (/home/joe/.ssh/id_rsa): [We shall use the default value, so simply press enter]
+
+If a key already exists, you should get an output similar to what is shown below. Press 'y' to overwrite the existing key.
+
+
+ /home/joe/.ssh/id_rsa already exists.
+ Overwrite (y/n)?
+
+You should see an output similar to one shown below.
+
+
+ The key fingerprint is:
+ 4d:b0:30:14:45:cc:99:86:15:48:17:0b:39:a0:05:ca joe@joe-machine
+ The key's randomart image is:
+ +--[ RSA 2048]----+
+ | ..o+B@O= |
+ |.. o ==*+ |
+ |.E. oo . |
+ | o |
+ | S . |
+ | |
+ | |
+ | |
+ | |
+ +-----------------+
+
+Note: for Linux users, you may not get an image representation of the key, but this is not an error. Next, execute the following:
+
+
+ $ cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys
+
+We shall now retry SSH without password.
+
+
+ $ ssh 127.0.0.1
+
+You may see an output similar to one shown below.
+
+
+ The authenticity of host '127.0.0.1 (127.0.0.1)' can't be established.
+ RSA key fingerprint is aa:7b:51:90:74:39:c4:f6:28:a2:9d:47:c2:8d:33:31.
+ Are you sure you want to continue connecting (yes/no)?
+
+Type 'yes' and press the enter key. You should see an output similar to one shown below.
+
+
+ Warning: Permanently added '127.0.0.1' (RSA) to the list of known hosts.
+ Last login: Thu Mar 28 12:27:10 2013
+
+You should now be able to log in without being prompted for a password or a response.
+
+
+ ssh 127.0.0.1
+ Last login: Sat Mar 23 22:54:40 2013
+
+Execute 'exit' to close the session.
+
+
+ $ exit
+ logout
+ Connection to 127.0.0.1 closed.
+
+### Configuring Managix ###
+You will need the ASTERIX installer (a.k.a Managix). Download Managix from [here](https://asterixdb.googlecode.com/files/asterix-installer-0.0.5-binary-assembly.zip); this includes the bits for Managix as well as ASTERIX.
+
+Unzip the Managix zip bundle to an appropriate location. You may create a sub-directory: asterix-mgmt (short for asterix-management) under your home directory. We shall refer to this location as MANAGIX_HOME.
+
+
+ $ cd ~
+ /home/joe> $ mkdir asterix-mgmt
+ /home/joe> $ cd asterix-mgmt
+ /home/joe/asterix-mgmt> $ unzip <path to the Managix zip bundle>
+ /home/joe/asterix-mgmt> $ export MANAGIX_HOME=`pwd`
+ /home/joe/asterix-mgmt> $ export PATH=$PATH:$MANAGIX_HOME/bin
+
+It is recommended that you add $MANAGIX_HOME/bin to your PATH variable in your bash profile. This can be done by executing the following.
+
+
+ currentDir=`pwd`
+ echo "export MANAGIX_HOME=$currentDir" >> ~/.bash_profile
+ echo "export PATH=$PATH:$MANAGIX_HOME/bin" >> ~/.bash_profile
+
+Above, use ~/.bashrc instead of ~/.bash_profile if you are using ~/.bashrc .
+
+To be able to create an ASTERIX instance and manage its lifecycle, the Managix requires you to configure a set of configuration files namely:
+
+ * `conf/managix-conf.xml`: A configuration XML file that contains configuration settings for Managix.
+ * A configuration XML file that describes the nodes in the cluster, e.g., `$MANAGIX_HOME/clusters/local/local.xml`.
+
+Since we intend to run ASTERIX on a single node, Managix can auto-configure itself and populate the above mentioned configuration files. To auto-configure Managix, execute the following in the MANAGIX_HOME directory:
+
+
+ /home/joe/asterix-mgmt> $ managix configure
+
+Let us do a sample run to validate the set of configuration files auto-generated by Managix.
+
+
+ /home/joe/asterix-mgmt> $ managix validate
+ INFO: Environment [OK]
+ INFO: Managix Configuration [OK]
+
+
+ /home/joe/asterix-mgmt> $ managix validate -c $MANAGIX_HOME/clusters/local/local.xml
+ INFO: Environment [OK]
+ INFO: Cluster configuration [OK]
+
+### Creating an ASTERIX instance ###
+Now that we have configured Managix, we shall next create an ASTERIX instance. An ASTERIX instance is identified by a unique name and is created using the `create` command. The usage description for the `create` command can be obtained by executing the following.
+
+
+ $ managix help -cmd create
+ Creates an ASTERIX instance with a specified name. Post creation, the instance is in ACTIVE state,
+ indicating its availability for executing statements/queries.
+ Usage arguments/options:
+ -n Name of the ASTERIX instance.
+ -c Path to the cluster configuration file
+
+We shall now use the create command to create an ASTERIX instance by the name "my_asterix". In doing so, we shall use the cluster configuration file that was auto-generated by managix.
+
+
+ $ managix create -n my_asterix -c $MANAGIX_HOME/clusters/local/local.xml
+
+A sample output of the above command is shown below:
+
+
+ INFO: Name:my_asterix
+ Created:Thu Mar 07 11:14:13 PST 2013
+ Web-Url:http://127.0.0.1:19001
+ State:ACTIVE
+
+The third line above shows the web-url http://127.0.0.1:19001 for ASTERIX's web-interface. The ASTERIX instance is in the 'ACTIVE' state indicating that you may access the web-interface by navigating to the web-url.
+
+Type in the following "Hello World" query in the box:
+
+
+ let $message := 'Hello World!'
+ return $message
+
+Press the "Execute" button. If the query result shows on the output box, then Congratulations! You have successfully created an ASTERIX instance!
+
+## Section 2: Single-Machine ASTERIX installation (Advanced) ##
+We assume that you have successfully completed the single-machine ASTERIX installation by following the instructions above in section [Single-Machine ASTERIX installation](#Section_1:_Single-Machine_ASTERIX_installation). In this section, we shall cover advanced topics related to ASTERIX configuration. Before we proceed, it is imperative to go through some preliminary concepts related to ASTERIX runtime.
+
+### ASTERIX Runtime ###
+An ASTERIX runtime comprises a _master node_ and a set of _worker nodes_, each identified by a unique id. The master node runs a _Cluster Controller_ service (a.k.a. _CC_), while each worker node runs a _Node Controller_ service (a.k.a. _NC_). Please note that a node in an ASTERIX cluster is a logical concept in the sense that multiple nodes may map to a single physical machine, which is the case for a single-machine ASTERIX installation. This association or mapping between an ASTERIX node and a physical machine is captured in a cluster configuration XML file. In addition, the XML file contains properties and parameters associated with each node.
+
+#### ASTERIX Runtime Configuration ####
+As observed earlier, Managix can auto-configure itself for a single-machine setup. As part of auto-configuration, Managix generated the cluster XML file. Let us understand the components of the generated cluster XML file. If you have configured Managix (via the "configure" command), you can find a similar cluster XML file as $MANAGIX_HOME/clusters/local/local.xml. The following is a sample XML file generated on a Ubuntu (Linux) setup:
+
+
+ <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+ <cluster xmlns="cluster">
+ <name>local</name>
+ <java_home>/usr/lib/jvm/jdk1.7.0</java_home>
+ <java_opts>-Xmx1048m</java_opts>
+ <logdir>/home/joe/asterix-mgmt/clusters/local/working_dir/logs</logdir>
+ <iodevices>/home/joe/asterix-mgmt/clusters/local/working_dir</iodevices>
+ <store>storage</store>
+ <workingDir>
+ <dir>/home/joe/asterix-mgmt/clusters/local/working_dir</dir>
+ <NFS>true</NFS>
+ </workingDir>
+ <master-node>
+ <id>master</id>
+ <client-ip>127.0.0.1</client-ip>
+ <cluster-ip>127.0.0.1</cluster-ip>
+ </master-node>
+ <node>
+ <id>node1</id>
+ <cluster-ip>127.0.0.1</cluster-ip>
+ </node>
+ </cluster>
+
+We shall next explain the components of the cluster configuration XML file.
+
+#### (1) Defining nodes in ASTERIX runtime ####
+The single-machine ASTERIX instance configuration that is auto-generated by Managix (using the "configure" command) involves a master node (CC) and a worker node (NC). Each node is assigned a unique id and provided with an ip address (called _cluster-ip_) that maps a node to a physical machine. The following snippet from the above XML file captures the master/worker nodes in our ASTERIX installation.
+
+
+ <master-node>
+ <id>master</id>
+ <client-ip>127.0.0.1</client-ip>
+ <cluster-ip>127.0.0.1</cluster-ip>
+ </master-node>
+ <node>
+ <id>node1</id>
+ <cluster-ip>127.0.0.1</cluster-ip>
+ </node>
+
+
+The following is a description of the different elements in the cluster configuration xml file.
+
+<table>
+<tr>
+ <td>Property</td>
+ <td>Description</td>
+</tr>
+<tr>
+ <td>id</td>
+ <td>A unique id for a node.</td>
+</tr>
+<tr>
+ <td>cluster-ip</td>
+ <td>IP address of the machine to which a node maps. This address is used for all internal communication between the nodes.</td>
+</tr>
+<tr>
+ <td>client-ip</td>
+ <td>Provided for the master node. This IP should be reachable from clients that want to connect with ASTERIX via its web interface.</td>
+</tr>
+</table>
+
+#### (2) Properties associated with a worker node (NC) in ASTERIX ####
+The following is a list of properties associated with each worker node in an ASTERIX configuration.
+
+<table>
+<tr>
+ <td>Property</td>
+ <td>Description</td>
+</tr>
+<tr>
+ <td>java_home</td>
+ <td>Java installation directory at each node.</td>
+</tr>
+<tr>
+ <td>java_opts</td>
+ <td>JVM arguments passed on to the JVM that represents a node.</td>
+</tr>
+<tr>
+ <td>logdir</td>
+ <td>A directory where worker node may write logs.</td>
+</tr>
+<tr>
+ <td>iodevices</td>
+ <td>Comma separated list of IO Device mount points.</td>
+</tr>
+<tr>
+ <td>store</td>
+ <td>A data directory that ASTERIX uses to store data belonging to dataset(s).</td>
+</tr>
+</table>
+
+All the above properties can be defined at the global level or a local level. In the former case, these properties apply to all the nodes in an ASTERIX configuration. In the latter case, these properties apply only to the node(s) under which they are defined. A property defined at the local level overrides the definition at the global level.
+
+#### (3) Working directory of an ASTERIX instance ####
+
+Next we explain the following setting in the file $MANAGIX_HOME/clusters/local/local.xml.
+
+ <workingDir>
+ <dir>/Users/joe/asterix-mgmt/clusters/local/working_dir</dir>
+ <NFS>true</NFS>
+ </workingDir>
+
+
+Managix associates a working directory with an ASTERIX instance and uses this directory for transferring binaries to each node. If there exists a directory that is readable by each node, Managix can use it to place binaries that can be accessed and used by all the nodes in the ASTERIX set up. A network file system (NFS) provides such a functionality for a cluster of physical machines such that a path on NFS is accessible from each machine in the cluster. In the single-machine set up described above, all nodes correspond to a single physical machine. Each path on the local file system is accessible to all the nodes in the ASTERIX setup and the boolean value for NFS above is thus set to `true`.
+
+### Managix Configuration ###
+Managix allows creation and management of multiple ASTERIX instances and uses Zookeeper as its back-end database to keep track of information related to each instance. We need to provide a set of one or more hosts that Managix can use to run a Zookeeper instance. Zookeeper runs as a daemon process on each of the specified hosts. At each host, Zookeeper stores data under the Zookeeper home directory specified as part of the configuration. The following is an example configuration `$MANAGIX_HOME/conf/managix-conf.xml` that has Zookeeper running on the localhost (127.0.0.1) :
+
+
+ <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+ <configuration xmlns="installer">
+ <zookeeper>
+ <homeDir>/home/joe/asterix/.installer/zookeeper</homeDir>
+ <clientPort>2900</clientPort>
+ <servers>
+ <server>127.0.0.1</server>
+ </servers>
+ </zookeeper>
+ </configuration>
+
+It is possible to have a single host for Zookeeper. A larger number of hosts would use Zookeeper's replication and fault-tolerance feature such that a failure of a host running Zookeeper would not result in loss of information about existing ASTERIX instances.
+
+## Section 3: Installing ASTERIX on a Cluster of Multiple Machines ##
+We assume that you have read the two sections above on single-machine ASTERIX setup. Next we explain how to install ASTERIX in a cluster of multiple machines. As an example, we assume we want to setup ASTERIX on a cluster of three machines, in which we use one machine (called machine A) as the master node and two other machines (called machine B and machine C) as the worker nodes, as shown in the following diagram:
+
+![AsterixCluster](https://asterixdb.googlecode.com/files/AsterixCluster.png)
+
+Notice that each machine has a _cluster-ip_ address, which is used by these machines for their intra-cluster communication. Meanwhile, the master machine also has a _client-ip_ address, using which an end-user outside the cluster can communicate with this machine. The reason we differentiate between these two types of IP addresses is that we can have a cluster of machines using a private network. In this case they have internal ip addresses that cannot be used outside the network. In case all the machines are on a public network, the "client-ip" and "cluster-ip" of the master machine can share the same address.
+
+Next we describe how to set up ASTERIX in this cluster, assuming no Managix has been installed on these machines.
+
+### Step (1): Define the ASTERIX cluster ###
+
+We first log into the master machine as the user "joe". On this machine, download Managix from [here](https://asterixdb.googlecode.com/files/asterix-installer-0.0.5-binary-assembly.zip) (same as above), then do the following steps similar to the single-machine case described above:
+
+
+ machineA> cd ~
+ machineA> mkdir asterix-mgmt
+ machineA> cd asterix-mgmt
+ machineA> unzip <path to the Managix zip bundle>
+ machineA> export MANAGIX_HOME=`pwd`
+ machineA> export PATH=$PATH:$MANAGIX_HOME/bin
+
+
+We also need an ASTERIX configuration XML file for the cluster. We give a name to the cluster, say, "rainbow". We create a folder for the configuration of this cluster:
+
+
+ machineA> mkdir $MANAGIX_HOME/rainbow_cluster
+
+
+For this cluster we create a configuration file `$MANAGIX_HOME/rainbow_cluster/rainbow.xml`. The following is a sample file with explanation of the properties:
+
+ <cluster xmlns="cluster">
+
+ <!-- Name of the cluster -->
+ <name>rainbow</name>
+
+ <!-- username, which should be valid for all the three machines -->
+ <username>joe</username>
+
+ <!-- The working directory of Managix. It should be on a network file system (NFS) that
+ can be accessed by all the machines. Need to create it before running Managix. -->
+ <workingDir>
+ <dir>/home/joe/managix-workingDir</dir>
+ <NFS>true</NFS>
+ </workingDir>
+
+ <!-- Directory for Asterix to store log information for each machine. Needs
+ to be a local file system. Needs to create it before running Managix. -->
+ <logdir>/mnt/joe/logs</logdir>
+
+ <!-- Directory used by each worker node to store data files. Needs
+ to be a local file system. Needs to create it before running Managix. -->
+ <iodevices>/mnt/joe</iodevices>
+ <store>storage</store>
+
+ <!-- Java home for each machine with its JVM options -->
+ <java_home>/usr/lib/jvm/jdk1.7.0</java_home>
+ <java_opts>-Xmx1024m</java_opts>
+
+ <!-- IP addresses of the master machine A -->
+ <master-node>
+ <id>master</id>
+ <client-ip>128.195.52.177</client-ip>
+ <cluster-ip>192.168.100.0</cluster-ip>
+ </master-node>
+
+ <!-- IP address(es) of machine B -->
+ <node>
+ <id>nodeB</id>
+ <cluster-ip>192.168.100.1</cluster-ip>
+ </node>
+
+ <!-- IP address(es) of machine C -->
+ <node>
+ <id>nodeC</id>
+ <cluster-ip>192.168.100.2</cluster-ip>
+ </node>
+ </cluster>
+
+
+As stated before, each of the above properties can be defined at the cluster level, in which case it applies to all the nodes in the system. Each property can also be defined at a node level.
+
+Once we have formed the cluster XML file, we can validate the configuration by doing the following:
+
+ managix validate -c $MANAGIX_HOME/rainbow_cluster/rainbow.xml
+
+
+If the return message says "OK", it means that the XML configuration file is set properly.
+
+### Step (2): Configure SSH ###
+
+The next steps of setting up SSH are similar to those in the single-machine setup case. We assume we have a common user account called "joe" on each machine in the cluster.
+
+On the master machine, do the following:
+
+
+ machineA> ssh localhost
+
+
+If you are prompted for a password, execute the following
+
+
+ machineA> ssh-keygen -t rsa -P ""
+ machineA> cat $HOME/.ssh/id_rsa.pub >> $HOME/.ssh/authorized_keys
+
+
+If $HOME is not on the NFS, copy the id_rsa.pub to the directory ~/.ssh (login with the same account) on each machine, and then do the following on each machine. (Notice that this step is not needed if the folder ".ssh" is on the NFS and can be accessed by all the nodes.)
+
+
+ cd ~/.ssh
+ cat id_rsa.pub >> authorized_keys
+
+
+Then run the following step again and type "Yes" if prompted:
+
+
+ machineA> ssh localhost
+
+### Step (3): Configuring Managix ###
+
+Managix uses a configuration XML file at `$MANAGIX_HOME/conf/managix-conf.xml` to configure its own properties, such as its Zookeeper service. We can use the `configure` command to auto-generate this configuration file:
+
+
+ machineA> managix configure
+
+
+We use the validate command to validate managix configuration. To do so, execute the following.
+
+ machineA> managix validate
+ INFO: Environment [OK]
+ INFO: Managix Configuration [OK]
+
+
+Note that the `configure` command also generates a cluster configuration XML file at $MANAGIX_HOME/conf/clusters/local.xml. This file is not needed in the case of a cluster of machines.
+
+### Step (4): Creating an ASTERIX instance ###
+
+Now that we have configured Managix, we shall next create an ASTERIX instance. An ASTERIX instance is identified by a unique name and is created using the create command. The usage description for the create command can be obtained by executing the following:
+
+
+ machineA> managix help -cmd create
+
+ Creates an ASTERIX instance with a specified name. Post creation, the instance is in ACTIVE state,
+ indicating its availability for executing statements/queries.
+ Usage arguments/options:
+ -n Name of the ASTERIX instance.
+ -c Path to the cluster configuration file
+
+
+We shall now use the `create` command to create an ASTERIX instance called "rainbow_asterix". In doing so, we shall use the cluster configuration file `$MANAGIX_HOME/rainbow_cluster/rainbow.xml` that we created and validated in Step (1).
+
+
+ machineA> managix create -n rainbow_asterix -c $MANAGIX_HOME/rainbow_cluster/rainbow.xml
+
+
+If the response message does not contain any warnings, then congratulations! You have successfully installed ASTERIX on this cluster of machines!
+
+Please refer to the section [Managing the Lifecycle of an ASTERIX Instance](#Section_4:_Managing_the_Lifecycle_of_an_ASTERIX_Instance) for a detailed description on the set of available commands/operations that let you manage the lifecycle of an ASTERIX instance. Note that the output of the commands varies with the cluster definition and may not apply to the cluster specification you built above.
+
+## Section 4: Managing the Lifecycle of an ASTERIX Instance ##
+
+Now that we have an ASTERIX instance running, let us use Managix to manage the instance's lifecycle. Managix provides the following set of commands/operations:
+
+#### Managix Commands ####
+
+<table>
+<tr><td>Command</td> <td>Description</td></tr>
+<tr><td><a href="#Creating_an_ASTERIX_instance">create</a></td> <td>Creates a new asterix instance.</td></tr>
+<tr><td><a href="#Describe_Command" >describe</a></td> <td>Describes an existing asterix instance.</td></tr>
+<tr><td><a href="#Stop_Command" >stop</a></td> <td>Stops an asterix instance that is in the ACTIVE state.</td></tr>
+<tr><td><a href="#Start_Command" >start</a></td> <td>Starts an Asterix instance.</td></tr>
+<tr><td><a href="#Backup_Command" >backup</a></td> <td>Creates a backup for an existing Asterix instance.</td></tr>
+<tr><td><a href="#Restore_Command" >restore</a></td> <td>Restores an Asterix instance.</td></tr>
+<tr><td><a href="#Delete_Command" >delete</a></td> <td>Deletes an Asterix instance.</td></tr>
+<tr><td><a href="#Configuring_Managix" >validate</a></td> <td>Validates the installer/cluster configuration.</td></tr>
+<tr><td><a href="#Configuring_Managix" >configure</a></td><td>Auto generate configuration for an Asterix instance.</td></tr>
+<tr><td><a href="#Shutdown_Command" >shutdown</a></td> <td>Shutdown the installer service.</td></tr>
+</table>
+
+You may obtain the above listing by simply executing 'managix' :
+
+
+ $ managix
+
+We already talked about create and validate commands. We shall next explain the rest of the commands listed above. We also provide sample output messages of these commands assuming we are running an ASTERIX instance on a single machine.
+
+##### Describe Command #####
+The `describe` command provides information about an ASTERIX instance. The usage can be looked up by executing the following:
+
+
+ $ managix help -cmd describe
+
+ Provides information about an ASTERIX instance.
+ The following options are available:
+ [-n] Name of the ASTERIX instance.
+ [-admin] Provides a detailed description
+
+The brackets indicate optional flags.
+
+The output of the `describe` command when used without the `admin` flag contains minimal information and is similar to the output of the create command. Let us try running the describe command in "admin" mode.
+
+
+ $ managix describe -n my_asterix -admin
+ INFO: Name:my_asterix
+ Created:Thu Mar 07 19:07:00 PST 2013
+ Web-Url:http://127.0.0.1:19001
+ State:ACTIVE
+ Master node:master:127.0.0.1
+ node1:127.0.0.1
+
+ Asterix version:0.0.5
+ Asterix Configuration
+ output_dir = /tmp/asterix_output/
+ Metadata Node:node1
+ Processes
+ NC at 127.0.0.1 [ 22195 ]
+ CC at 127.0.0.1 [ 22161 ]
+
+As seen above, the instance 'my_asterix' is configured such that all processes are running on the localhost (127.0.0.1). The process id for each process (JVM) is shown next to it.
+
+##### Stop Command #####
+The `stop` command can be used for shutting down an ASTERIX instance. After that, the instance is unavailable for executing queries. The usage can be looked up by executing the following:
+
+
+ $ managix help -cmd stop
+
+ Shuts an ASTERIX instance that is in ACTIVE state. After executing the stop command, the ASTERIX instance transits
+ to the INACTIVE state, indicating that it is no longer available for executing queries.
+
+ Available arguments/options
+ -n name of the ASTERIX instance.
+
+To stop the ASTERIX instance, execute the following:
+
+
+ $ managix stop -n my_asterix
+ INFO: Stopped Asterix instance: my_asterix
+
+
+ $ managix describe -n my_asterix
+ INFO: Name: my_asterix
+ Created:Thu Mar 07 19:07:00 PST 2013
+ Web-Url:http://127.0.0.1:19001
+ State:INACTIVE (Fri Mar 08 09:49:00 PST 2013)
+
+
+##### Start Command #####
+The `start` command starts an ASTERIX instance that is in the INACTIVE state. The usage can be looked up by executing the following:
+
+
+ $ managix help -cmd start
+
+ Starts an ASTERIX instance that is in INACTIVE state. After executing the start command, the ASTERIX instance transits to the ACTIVE state, indicating that it is now available for executing statements/queries.
+
+ Available arguments/options
+ -n name of the ASTERIX instance.
+
+Let us now start the ASTERIX instance.
+
+
+ $ managix start -n my_asterix
+ INFO: Name:my_asterix
+ Created:Thu Mar 07 19:07:00 PST 2013
+ Web-Url:http://127.0.0.1:19001
+ State:ACTIVE (Fri Mar 08 09:49:00 PST 2013)
+
+
+##### Backup Command #####
+
+In the undesirable event of data loss, either due to a disk/system failure or accidental execution of a DDL statement (drop dataverse/dataset), you may need to recover the lost data. The backup command allows you to take a backup of the data stored with an ASTERIX instance. The backup can be taken on the local file system or on an HDFS instance. In either case, the snapshots are stored under a backup directory. You need to make sure the backup directory has appropriate read/write permissions. Configuration settings for backup can be found inside Managix's configuration file located at `$MANAGIX_HOME/conf/managix-conf.xml`.
+
+*Configuring backup on the local file system*
+
+We need to provide the path to a backup directory on the local file system. The backup directory can be configured by editing the Managix configuration XML, found at `$MANAGIX_HOME/conf/managix-conf.xml`.
+
+
+ <backup>
+ <backupDir>Provide path to the backup directory here</backupDir>
+ </backup>
+
+Prior to taking a backup of an ASTERIX instance, it is required for the instance to be in the INACTIVE state. We do so by using the `stop` command, as shown below:
+
+
+ $ managix stop -n my_asterix
+ INFO: Stopped Asterix instance: my_asterix
+
+We can now take the backup by executing the following:
+
+
+ $ managix backup -n my_asterix
+ INFO: my_asterix backed up 0_Fri Mar 08 16:16:34 PST 2013 (LOCAL)
+
+
+*Configuring backup on an HDFS instance*
+
+To configure backups to be taken on an HDFS instance, we need to provide the required information about the running HDFS instance. This information includes the HDFS version and the HDFS url. Simply edit the Managix configuration file and provide the required information.
+
+
+ <backup>
+ <backupDir>Provide path to the backup directory here</backupDir>
+ <hdfs>
+ <version>0.20.2</version>
+ <url></url>
+ </hdfs>
+ </backup>
+
+A sample output when a backup is taken on an HDFS is shown below:
+
+
+ $ managix backup -n my_asterix
+ INFO: my_asterix backed up 1_Fri Mar 08 17:10:38 PST 2013 (HDFS)
+
+
+Each time we take a backup, we are provided with a unique id (a monotonically increasing value starting with 0). This id is required when we need to restore from a previously taken backup. Information about all available backup snapshots can be obtained by using the `describe` command in the admin mode, as shown below:
+
+
+ $ managix describe -n my_asterix -admin
+ INFO: Name:my_asterix
+ Created:Fri Mar 08 15:11:12 PST 2013
+ Web-Url:http://127.0.0.1:19001
+ State:INACTIVE (Fri Mar 08 16:14:20 PST 2013)
+ Master node:master:127.0.0.1
+ node1:127.0.0.1
+
+ Backup:0 created at Fri Mar 08 16:16:34 PST 2013 (LOCAL)
+ Backup:1 created at Fri Mar 08 17:10:38 PST 2013 (HDFS)
+
+ Asterix version:0.0.5
+ Asterix Configuration
+ Metadata Node:node1
+ Processes
+
+
+The above output shows the available backups, each identified by its id (0 and 1). We shall next describe the method for restoring an ASTERIX instance from a backup snapshot.
+
+##### Restore Command #####
+
+The `restore` command allows you to restore an ASTERIX instance's data from a previously taken backup. The usage description can be obtained as follows:
+
+
+ $ managix help -cmd restore
+
+ Restores an ASTERIX instance's data from a previously taken backup.
+ Available arguments/options
+
+ -n name of the ASTERIX instance
+ -b id of the backup snapshot
+
+
+The following command restores our ASTERIX instance from the backup snapshot identified by the id (0). Prior to restoring an instance from a backup, it is required that the instance is in the INACTIVE state.
+
+
+ $ managix restore -n my_asterix -b 0
+ INFO: Asterix instance: my_asterix has been restored from backup
+
+You can start the ASTERIX instance by using the start command.
+
+##### Delete Command #####
+As the name suggests, the `delete` command permanently removes an ASTERIX instance by cleaning up all associated data/artifacts. The usage can be looked up by executing the following:
+
+
+ $ managix help -cmd delete
+ Permanently deletes an ASTERIX instance. The instance must be in the INACTIVE state.
+
+ Available arguments/options
+ -n name of the ASTERIX instance.
+
+
+ $ managix delete -n my_asterix
+ INFO: Asterix instance my_asterix deleted.
+
+
+##### Shutdown Command #####
+Managix uses Zookeeper service for storing all information about created ASTERIX instances. The Zookeeper service runs in the background and can be shut down using the `shutdown` command.
+
+
+ $ managix shutdown
+
+
+##### Help Command #####
+The `help` command provides a usage description of a Managix command.
+
+
+ $ managix help -cmd <command name>
+
+As an example, for looking up the help for the `configure` command, execute the following
+
+
+ $ managix help -cmd configure
+
+ Auto-generates the ASTERIX installer configruation settings and ASTERIX cluster
+ configuration settings for a single node setup.
+
+
+## Section 5: Frequently Asked Questions ##
+
+
+*Question*
+What is meant by the "UNUSABLE" state in the lifecycle of an ASTERIX instance ?
+
+
+*Answer*
+When Managix fails to start a required process (CC/NC), the instance transits to an UNUSABLE state.
+The reason for the failure needs to be looked up in the logs.
+Before we attempt to start the instance again, any processes that got launched
+as part of the failed attempt must be stopped. No operation other than "stop" is supported in the UNUSABLE state.
+
+Get rid of the started processes:-
+
+ $MANAGIX_HOME/bin/managix stop -n my_asterix
+
+
+Any processes associated with the instance are killed and the instance moves to the INACTIVE state.
+You may now delete the instance by executing the following
+
+
+ $MANAGIX_HOME/bin/managix delete -n <name of your ASTERIX instance>
+
+
+Note that the above would remove all traces of the instance, including the logs and thus the reason for the failed attempt.
+
+OR
+
+make a subsequent attempt to start the instance if you realized a mistake in the cluster configuration XML and have corrected it. To start the instance, we execute the following.
+
+
+ $MANAGIX_HOME/bin/managix start -n <name of your ASTERIX instance>
+
diff --git a/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/bootstrap/MetadataBootstrap.java b/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/bootstrap/MetadataBootstrap.java
index 8d5d13b..f3ccc9c 100644
--- a/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/bootstrap/MetadataBootstrap.java
+++ b/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/bootstrap/MetadataBootstrap.java
@@ -348,8 +348,9 @@
if (create) {
lsmBtree = LSMBTreeUtils.createLSMTree(memBufferCache, memFreePageManager, ioManager, file, bufferCache,
fileMapProvider, typeTraits, comparatorFactories, bloomFilterKeyFields,
- runtimeContext.getLSMMergePolicy(), runtimeContext.getLSMBTreeOperationTrackerFactory(),
- runtimeContext.getLSMIOScheduler(), AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER);
+ runtimeContext.getBloomFilterFalsePositiveRate(), runtimeContext.getLSMMergePolicy(),
+ runtimeContext.getLSMBTreeOperationTrackerFactory(), runtimeContext.getLSMIOScheduler(),
+ AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER);
lsmBtree.create();
resourceID = runtimeContext.getResourceIdFactory().createId();
indexLifecycleManager.register(resourceID, lsmBtree);
@@ -369,8 +370,9 @@
if (lsmBtree == null) {
lsmBtree = LSMBTreeUtils.createLSMTree(memBufferCache, memFreePageManager, ioManager, file,
bufferCache, fileMapProvider, typeTraits, comparatorFactories, bloomFilterKeyFields,
- runtimeContext.getLSMMergePolicy(), runtimeContext.getLSMBTreeOperationTrackerFactory(),
- runtimeContext.getLSMIOScheduler(), AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER);
+ runtimeContext.getBloomFilterFalsePositiveRate(), runtimeContext.getLSMMergePolicy(),
+ runtimeContext.getLSMBTreeOperationTrackerFactory(), runtimeContext.getLSMIOScheduler(),
+ AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER);
indexLifecycleManager.register(resourceID, lsmBtree);
}
}
diff --git a/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/declared/AqlMetadataProvider.java b/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/declared/AqlMetadataProvider.java
index 4773ed0..2e45802 100644
--- a/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/declared/AqlMetadataProvider.java
+++ b/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/declared/AqlMetadataProvider.java
@@ -548,7 +548,8 @@
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
storageProperties.getMemoryComponentPageSize(),
- storageProperties.getMemoryComponentNumPages()), retainInput, searchCallbackFactory);
+ storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate()), retainInput, searchCallbackFactory);
return new Pair<IOperatorDescriptor, AlgebricksPartitionConstraint>(btreeSearchOp, spPc.second);
} catch (MetadataException me) {
@@ -617,7 +618,8 @@
AsterixRuntimeComponentsProvider.LSMRTREE_PROVIDER, proposeLinearizer(
nestedKeyType.getTypeTag(), comparatorFactories.length),
storageProperties.getMemoryComponentPageSize(),
- storageProperties.getMemoryComponentNumPages()), retainInput, searchCallbackFactory);
+ storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate()), retainInput, searchCallbackFactory);
return new Pair<IOperatorDescriptor, AlgebricksPartitionConstraint>(rtreeSearchOp, spPc.second);
} catch (MetadataException me) {
@@ -777,7 +779,8 @@
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
storageProperties.getMemoryComponentPageSize(),
- storageProperties.getMemoryComponentNumPages()), NoOpOperationCallbackFactory.INSTANCE);
+ storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate()), NoOpOperationCallbackFactory.INSTANCE);
return new Pair<IOperatorDescriptor, AlgebricksPartitionConstraint>(btreeBulkLoad,
splitsAndConstraint.second);
} catch (MetadataException me) {
@@ -844,8 +847,8 @@
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER, storageProperties
- .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages()),
- null, modificationCallbackFactory);
+ .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate()), null, modificationCallbackFactory);
return new Pair<IOperatorDescriptor, AlgebricksPartitionConstraint>(btreeBulkLoad,
splitsAndConstraint.second);
@@ -1040,8 +1043,9 @@
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER,
AsterixRuntimeComponentsProvider.LSMBTREE_PROVIDER, storageProperties
- .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages()),
- filterFactory, modificationCallbackFactory);
+ .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate()), filterFactory,
+ modificationCallbackFactory);
return new Pair<IOperatorDescriptor, AlgebricksPartitionConstraint>(btreeBulkLoad,
splitsAndConstraint.second);
} catch (MetadataException e) {
@@ -1166,8 +1170,9 @@
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER,
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER,
AsterixRuntimeComponentsProvider.LSMINVERTEDINDEX_PROVIDER, storageProperties
- .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages()),
- filterFactory, modificationCallbackFactory);
+ .getMemoryComponentPageSize(), storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate()), filterFactory,
+ modificationCallbackFactory);
return new Pair<IOperatorDescriptor, AlgebricksPartitionConstraint>(insertDeleteOp,
splitsAndConstraint.second);
} catch (MetadataException e) {
@@ -1262,7 +1267,9 @@
AsterixRuntimeComponentsProvider.LSMRTREE_PROVIDER, proposeLinearizer(
nestedKeyType.getTypeTag(), comparatorFactories.length),
storageProperties.getMemoryComponentPageSize(),
- storageProperties.getMemoryComponentNumPages()), filterFactory, modificationCallbackFactory);
+ storageProperties.getMemoryComponentNumPages(),
+ storageProperties.getBloomFilterFalsePositiveRate()), filterFactory,
+ modificationCallbackFactory);
return new Pair<IOperatorDescriptor, AlgebricksPartitionConstraint>(rtreeUpdate, splitsAndConstraint.second);
} catch (MetadataException | IOException e) {
throw new AlgebricksException(e);
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/om/functions/AsterixBuiltinFunctions.java b/asterix-om/src/main/java/edu/uci/ics/asterix/om/functions/AsterixBuiltinFunctions.java
index 736944d..38af5e1 100644
--- a/asterix-om/src/main/java/edu/uci/ics/asterix/om/functions/AsterixBuiltinFunctions.java
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/om/functions/AsterixBuiltinFunctions.java
@@ -462,14 +462,10 @@
FunctionNamespace.ASTERIX_PUBLIC.name(), "duration-from-months", 1);
public final static FunctionIdentifier MONTHS_FROM_YEAR_MONTH_DURATION = new FunctionIdentifier(
FunctionNamespace.ASTERIX_PUBLIC.name(), "months-from-year-month-duration", 1);
- public final static FunctionIdentifier MONTHS_OF_YEAR_MONTH_DURATION = new FunctionIdentifier(
- FunctionNamespace.ASTERIX_PUBLIC.name(), "months-of-year-month-duration", 1);
public final static FunctionIdentifier DURATION_FROM_MILLISECONDS = new FunctionIdentifier(
FunctionNamespace.ASTERIX_PUBLIC.name(), "duration-from-ms", 1);
public final static FunctionIdentifier MILLISECONDS_FROM_DAY_TIME_DURATION = new FunctionIdentifier(
FunctionNamespace.ASTERIX_PUBLIC.name(), "ms-from-day-time-duration", 1);
- public final static FunctionIdentifier MILLISECONDS_OF_DAY_TIME_DURATION = new FunctionIdentifier(
- FunctionNamespace.ASTERIX_PUBLIC.name(), "ms-of-day-time-duration", 1);
public final static FunctionIdentifier GET_YEAR_MONTH_DURATION = new FunctionIdentifier(
FunctionNamespace.ASTERIX_PUBLIC.name(), "get-year-month-duration", 1);
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/FunctionManagerImpl.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/FunctionManagerImpl.java
index 6f834a7..127d4a6 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/FunctionManagerImpl.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/evaluators/common/FunctionManagerImpl.java
@@ -37,7 +37,11 @@
@Override
public synchronized IFunctionDescriptor lookupFunction(FunctionIdentifier fid) throws AlgebricksException {
Pair<FunctionIdentifier, Integer> key = new Pair<FunctionIdentifier, Integer>(fid, fid.getArity());
- return functions.get(key).createFunctionDescriptor();
+ IFunctionDescriptorFactory factory = functions.get(key);
+ if (factory == null) {
+ throw new AlgebricksException("Inappropriate use of function " + "'" + fid.getName() + "'");
+ }
+ return factory.createFunctionDescriptor();
}
@Override
diff --git a/asterix-transactions/src/main/java/edu/uci/ics/asterix/transaction/management/resource/LSMBTreeLocalResourceMetadata.java b/asterix-transactions/src/main/java/edu/uci/ics/asterix/transaction/management/resource/LSMBTreeLocalResourceMetadata.java
index e037e95..7ab0d10 100644
--- a/asterix-transactions/src/main/java/edu/uci/ics/asterix/transaction/management/resource/LSMBTreeLocalResourceMetadata.java
+++ b/asterix-transactions/src/main/java/edu/uci/ics/asterix/transaction/management/resource/LSMBTreeLocalResourceMetadata.java
@@ -48,7 +48,7 @@
LSMBTree lsmBTree = LSMBTreeUtils.createLSMTree(memBufferCache, memFreePageManager,
runtimeContextProvider.getIOManager(), file, runtimeContextProvider.getBufferCache(),
runtimeContextProvider.getFileMapManager(), typeTraits, cmpFactories, bloomFilterKeyFields,
- runtimeContextProvider.getLSMMergePolicy(),
+ runtimeContextProvider.getBloomFilterFalsePositiveRate(), runtimeContextProvider.getLSMMergePolicy(),
runtimeContextProvider.getLSMBTreeOperationTrackerFactory(),
runtimeContextProvider.getLSMIOScheduler(),
runtimeContextProvider.getLSMBTreeIOOperationCallbackProvider(), partition);
diff --git a/asterix-transactions/src/main/java/edu/uci/ics/asterix/transaction/management/resource/LSMInvertedIndexLocalResourceMetadata.java b/asterix-transactions/src/main/java/edu/uci/ics/asterix/transaction/management/resource/LSMInvertedIndexLocalResourceMetadata.java
index 894fc16..3bcb747 100644
--- a/asterix-transactions/src/main/java/edu/uci/ics/asterix/transaction/management/resource/LSMInvertedIndexLocalResourceMetadata.java
+++ b/asterix-transactions/src/main/java/edu/uci/ics/asterix/transaction/management/resource/LSMInvertedIndexLocalResourceMetadata.java
@@ -57,7 +57,9 @@
return InvertedIndexUtils.createPartitionedLSMInvertedIndex(memBufferCache, memFreePageManager,
runtimeContextProvider.getFileMapManager(), invListTypeTraits, invListCmpFactories,
tokenTypeTraits, tokenCmpFactories, tokenizerFactory, runtimeContextProvider.getBufferCache(),
- runtimeContextProvider.getIOManager(), filePath, runtimeContextProvider.getLSMMergePolicy(),
+ runtimeContextProvider.getIOManager(), filePath,
+ runtimeContextProvider.getBloomFilterFalsePositiveRate(),
+ runtimeContextProvider.getLSMMergePolicy(),
runtimeContextProvider.getLSMInvertedIndexOperationTrackerFactory(),
runtimeContextProvider.getLSMIOScheduler(),
runtimeContextProvider.getLSMInvertedIndexIOOperationCallbackProvider(), partition);
@@ -65,7 +67,9 @@
return InvertedIndexUtils.createLSMInvertedIndex(memBufferCache, memFreePageManager,
runtimeContextProvider.getFileMapManager(), invListTypeTraits, invListCmpFactories,
tokenTypeTraits, tokenCmpFactories, tokenizerFactory, runtimeContextProvider.getBufferCache(),
- runtimeContextProvider.getIOManager(), filePath, runtimeContextProvider.getLSMMergePolicy(),
+ runtimeContextProvider.getIOManager(), filePath,
+ runtimeContextProvider.getBloomFilterFalsePositiveRate(),
+ runtimeContextProvider.getLSMMergePolicy(),
runtimeContextProvider.getLSMInvertedIndexOperationTrackerFactory(),
runtimeContextProvider.getLSMIOScheduler(),
runtimeContextProvider.getLSMInvertedIndexIOOperationCallbackProvider(), partition);
diff --git a/asterix-transactions/src/main/java/edu/uci/ics/asterix/transaction/management/resource/LSMRTreeLocalResourceMetadata.java b/asterix-transactions/src/main/java/edu/uci/ics/asterix/transaction/management/resource/LSMRTreeLocalResourceMetadata.java
index 1705dd3..ff26c54 100644
--- a/asterix-transactions/src/main/java/edu/uci/ics/asterix/transaction/management/resource/LSMRTreeLocalResourceMetadata.java
+++ b/asterix-transactions/src/main/java/edu/uci/ics/asterix/transaction/management/resource/LSMRTreeLocalResourceMetadata.java
@@ -62,7 +62,8 @@
return LSMRTreeUtils.createLSMTree(memBufferCache, memFreePageManager,
runtimeContextProvider.getIOManager(), file, runtimeContextProvider.getBufferCache(),
runtimeContextProvider.getFileMapManager(), typeTraits, rtreeCmpFactories, btreeCmpFactories,
- valueProviderFactories, rtreePolicyType, runtimeContextProvider.getLSMMergePolicy(),
+ valueProviderFactories, rtreePolicyType, runtimeContextProvider.getBloomFilterFalsePositiveRate(),
+ runtimeContextProvider.getLSMMergePolicy(),
runtimeContextProvider.getLSMRTreeOperationTrackerFactory(),
runtimeContextProvider.getLSMIOScheduler(),
runtimeContextProvider.getLSMRTreeIOOperationCallbackProvider(), linearizeCmpFactory, partition);
diff --git a/asterix-transactions/src/main/java/edu/uci/ics/asterix/transaction/management/service/recovery/IAsterixAppRuntimeContextProvider.java b/asterix-transactions/src/main/java/edu/uci/ics/asterix/transaction/management/service/recovery/IAsterixAppRuntimeContextProvider.java
index 3ae7da8..d96d5c2 100644
--- a/asterix-transactions/src/main/java/edu/uci/ics/asterix/transaction/management/service/recovery/IAsterixAppRuntimeContextProvider.java
+++ b/asterix-transactions/src/main/java/edu/uci/ics/asterix/transaction/management/service/recovery/IAsterixAppRuntimeContextProvider.java
@@ -22,27 +22,29 @@
public IIndexLifecycleManager getIndexLifecycleManager();
+ public double getBloomFilterFalsePositiveRate();
+
public ILSMMergePolicy getLSMMergePolicy();
public ILSMOperationTrackerFactory getLSMBTreeOperationTrackerFactory();
-
+
public ILSMOperationTrackerFactory getLSMRTreeOperationTrackerFactory();
-
+
public ILSMOperationTrackerFactory getLSMInvertedIndexOperationTrackerFactory();
-
+
public ILSMIOOperationCallbackProvider getLSMBTreeIOOperationCallbackProvider();
-
+
public ILSMIOOperationCallbackProvider getLSMRTreeIOOperationCallbackProvider();
-
+
public ILSMIOOperationCallbackProvider getLSMInvertedIndexIOOperationCallbackProvider();
-
+
public ILSMIOOperationCallbackProvider getNoOpIOOperationCallbackProvider();
-
+
public ILSMIOOperationScheduler getLSMIOScheduler();
public ILocalResourceRepository getLocalResourceRepository();
public ResourceIdFactory getResourceIdFactory();
-
+
public IIOManager getIOManager();
}