Issue 548: Doc for CSV
Change-Id: Ia343a7a795876835607be024bb7b8ad3fec33933
Reviewed-on: http://fulliautomatix.ics.uci.edu:8443/194
Reviewed-by: Till Westmann <westmann@gmail.com>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
diff --git a/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/IntroduceDynamicTypeCastRule.java b/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/IntroduceDynamicTypeCastRule.java
index 97f26f8..e7cf79a 100644
--- a/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/IntroduceDynamicTypeCastRule.java
+++ b/asterix-algebra/src/main/java/edu/uci/ics/asterix/optimizer/rules/IntroduceDynamicTypeCastRule.java
@@ -120,8 +120,8 @@
break;
}
case DISTRIBUTE_RESULT: {
- // First, see if there was an outputRecordType specified
- requiredRecordType = (ARecordType) op1.getAnnotations().get("outputRecordType");
+ // First, see if there was an output-record-type specified
+ requiredRecordType = (ARecordType) op1.getAnnotations().get("output-record-type");
if (requiredRecordType == null) {
return false;
}
@@ -133,13 +133,13 @@
// of the singular input operator of the DISTRIBUTE_RESULT
if (op.getInputs().size() > 1) {
// Hopefully not possible?
- throw new AlgebricksException("outputRecordType defined for expression with multiple input operators");
+ throw new AlgebricksException("output-record-type defined for expression with multiple input operators");
}
AbstractLogicalOperator input = (AbstractLogicalOperator) op.getInputs().get(0).getValue();
List<LogicalVariable> liveVars = new ArrayList<LogicalVariable>();
VariableUtilities.getLiveVariables(input, liveVars);
if (liveVars.size() > 1) {
- throw new AlgebricksException("Expression with multiple fields cannot be cast to outputRecordType!");
+ throw new AlgebricksException("Expression with multiple fields cannot be cast to output-record-type!");
}
recordVar = liveVars.get(0);
diff --git a/asterix-algebra/src/main/java/edu/uci/ics/asterix/translator/AqlExpressionToPlanTranslator.java b/asterix-algebra/src/main/java/edu/uci/ics/asterix/translator/AqlExpressionToPlanTranslator.java
index be27c2e2..45db881 100644
--- a/asterix-algebra/src/main/java/edu/uci/ics/asterix/translator/AqlExpressionToPlanTranslator.java
+++ b/asterix-algebra/src/main/java/edu/uci/ics/asterix/translator/AqlExpressionToPlanTranslator.java
@@ -296,7 +296,7 @@
// the DistributeResultOperator
IAType outputRecordType = metadataProvider.findOutputRecordType();
if (outputRecordType != null) {
- topOp.getAnnotations().put("outputRecordType", outputRecordType);
+ topOp.getAnnotations().put("output-record-type", outputRecordType);
}
} else {
/** add the collection-to-sequence right before the final project, because dataset only accept non-collection records */
diff --git a/asterix-app/src/test/resources/runtimets/queries/csv/basic-types/basic-types.2.query.aql b/asterix-app/src/test/resources/runtimets/queries/csv/basic-types/basic-types.2.query.aql
index 6ef20f2..c452c38 100644
--- a/asterix-app/src/test/resources/runtimets/queries/csv/basic-types/basic-types.2.query.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/csv/basic-types/basic-types.2.query.aql
@@ -1,3 +1,3 @@
use dataverse "test";
-set outputRecordType "foo";
+set output-record-type "foo";
{ "money": float("18.25"), "id": 12345, "name": "Chris"}
diff --git a/asterix-doc/src/site/markdown/aql/externaldata.md b/asterix-doc/src/site/markdown/aql/externaldata.md
index ca350b9..4ddc89f 100644
--- a/asterix-doc/src/site/markdown/aql/externaldata.md
+++ b/asterix-doc/src/site/markdown/aql/externaldata.md
@@ -76,7 +76,7 @@
</tr>
<tr>
<td> format </td>
- <td> The format for the content. Use 'adm' for data in ADM (ASTERIX Data Model) or <a href="http://www.json.org/">JSON</a> format. Use 'delimited-text' if fields are separted by . </td></tr>
+ <td> The format for the content. Use 'adm' for data in ADM (ASTERIX Data Model) or <a href="http://www.json.org/">JSON</a> format. Use 'delimited-text' if fields are separated by a delimiting character (e.g., CSV). </td></tr>
<tr><td>delimiter</td><td>The delimiting character in the source file if format is 'delimited text'</td></tr>
</table>
diff --git a/asterix-doc/src/site/markdown/aql/manual.md b/asterix-doc/src/site/markdown/aql/manual.md
index a892fb9..0b34dc1 100644
--- a/asterix-doc/src/site/markdown/aql/manual.md
+++ b/asterix-doc/src/site/markdown/aql/manual.md
@@ -765,7 +765,7 @@
##### Example
- insert into dataset UsersCopy (for $user in dataset FacebookUsers return $user
+ insert into dataset UsersCopy (for $user in dataset FacebookUsers return $user)
#### Delete
diff --git a/asterix-doc/src/site/markdown/csv.md b/asterix-doc/src/site/markdown/csv.md
new file mode 100644
index 0000000..53627b1
--- /dev/null
+++ b/asterix-doc/src/site/markdown/csv.md
@@ -0,0 +1,208 @@
+# CSV Support in AsterixDB
+
+## Introduction - Defining a datatype for CSV
+
+AsterixDB supports the CSV format for both data input and query result
+output. In both cases, the structure of the CSV data must be defined
+using a named ADM record datatype. The CSV format, limitations, and
+MIME type are defined by [RFC
+4180](https://tools.ietf.org/html/rfc4180).
+
+CSV is not as expressive as the full Asterix Data Model: not all data
+that can be represented in ADM can also be represented as CSV. The
+form of this datatype is therefore limited. First, it may not contain
+any nested records or lists, as CSV has no way to represent nested
+data structures; all fields in the record type must be primitive.
+Second, the set of supported primitive types is limited to numerics
+(`int8`, `int16`, `int32`, `int64`, `float`, `double`) and `string`.
+On output, a few additional primitive types (`boolean`, datetime
+types) are supported and will be represented as strings.
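+
+For example, the following type could not be used for CSV, because it
+contains a list-valued field, and there is no way to render a list's
+values in a single flat row (a hypothetical illustration, not one of
+the definitions used below):
+
+    create type "bad_csv_type" as closed {
+        "id": int32,
+        "phones": [string]
+    };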
+
+For the purposes of this document, we will use the following dataverse
+and datatype definitions:
+
+ drop dataverse csv if exists;
+ create dataverse csv;
+ use dataverse csv;
+
+ create type "csv_type" as closed {
+ "id": int32,
+ "money": float,
+ "name": string
+ };
+
+ create dataset "csv_set" ("csv_type") primary key "id";
+
+Note: There is no explicit restriction against using an open datatype
+for CSV purposes, and you may have optional fields in the datatype
+(e.g., `id: int32?`). However, the CSV format itself is rigid, so
+using either of these datatype features introduces possible failure
+modes on output, which are discussed below.
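+
+A sketch of such a definition (hypothetical, and not used in the
+examples that follow) might look like:
+
+    create type "csv_open_type" as open {
+        "id": int32,
+        "money": float?,
+        "name": string
+    };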
+
+## CSV Input
+
+CSV data may be loaded into a dataset using the normal "load dataset"
+mechanisms, utilizing the built-in "delimited-text" format. See
+[Accessing External Data](aql/externaldata.html) for more
+details. Note that comma is the default value for the "delimiter"
+parameter, so it does not need to be explicitly specified.
+
+In this case, the datatype used to interpret the CSV data is the
+datatype associated with the dataset being loaded. So, to load a file
+that we have stored locally on the NC into our example dataset:
+
+ use dataverse csv;
+
+ load dataset "csv_set" using localfs
+ (("path"="127.0.0.1:///tmp/my_sample.csv"),
+ ("format"="delimited-text"));
+
+**Note:** Currently the CSV input parser only supports CSV data
+without headers.
+
+So, if the file `/tmp/my_sample.csv` contained
+
+ 1,18.50,"Peter Krabnitz"
+ 2,74.50,"Jesse Stevens"
+
+then the preceding query would load it into the dataset `csv_set`.
+
+CSV data may also be loaded from HDFS; see
+[Accessing External Data](aql/externaldata.html) for details.
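+
+A sketch of such a load, assuming the same file has been copied into
+HDFS under `/data/my_sample.csv` with a NameNode at
+`localhost:54310` (the adapter parameters are those documented on
+that page):
+
+    use dataverse csv;
+
+    load dataset "csv_set" using hdfs
+    (("hdfs"="hdfs://localhost:54310"),
+     ("path"="/data/my_sample.csv"),
+     ("input-format"="text-input-format"),
+     ("format"="delimited-text"));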
+
+## CSV Output
+
+Any query may be rendered as CSV when using AsterixDB's HTTP
+interface. Two steps are required: specify the record type that
+defines the schema of your CSV, and request that Asterix use the CSV
+output format.
+
+#### Output Record Type
+
+Background: The result of any AQL query is an unordered list of
+_instances_, where each _instance_ is an instance of an AQL
+datatype. When requesting CSV output, there are some restrictions on
+the legal datatypes in this unordered list due to the limited
+expressiveness of CSV:
+
+1. Each instance must be of a record type.
+2. Each instance must be of the _same_ record type.
+3. The record type must conform to the content and type restrictions
+mentioned in the introduction.
+
+While it would be possible to structure your query to cast all result
+instances to a given type, it is not necessary. AQL offers a built-in
+feature which will automatically cast all top-level instances in the
+result to a specified named ADM record type. To enable this feature,
+use a `set` statement prior to the query to set the parameter
+`output-record-type` to the name of an ADM type. This type must have
+already been defined in the current dataverse.
+
+For example, the following request will ensure that all result
+instances are cast to the `csv_type` type declared earlier:
+
+ use dataverse csv;
+ set output-record-type "csv_type";
+
+ for $n in dataset "csv_set" return $n;
+
+In this case the casting is redundant, since by definition every value
+in `csv_set` is already of type `csv_type`. But consider a more
+complex query whose result values are constructed by joining fields
+from different underlying datasets; there, the automatic cast ensures
+that each result instance conforms to the declared type.
+
+Two notes about `output-record-type`:
+
+1. This feature is not strictly related to CSV; it may be used with
+any output format (in which case any record datatype may be
+specified, not subject to the limitations given in the
+introduction of this page); see the example after this list.
+2. When the CSV output format is requested, `output-record-type` is in
+fact required, not optional. This is because the type is used to
+determine the field names for the CSV header and to ensure that the
+ordering of fields in the output is consistent (which is obviously
+vital for the CSV to make any sense).
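+
+To illustrate the first note, the same cast may be combined with the
+default JSON output format; a sketch using the command-line utility
+"curl" (the same query appears in the CSV example below):
+
+    curl -G -H "Accept: application/json" "http://localhost:19002/query" --data-urlencode '
+    query=use dataverse csv;
+    set output-record-type "csv_type";
+    for $n in dataset csv_set return $n;'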
+
+#### Request the CSV Output Format
+
+When sending requests to the Asterix HTTP API, Asterix decides what
+format to use for rendering the results based on the `Accept` HTTP
+header. By default, Asterix will produce JSON output, and this can be
+requested explicitly by specifying the MIME type `application/json`.
+To select CSV output, set the `Accept` header on your request to the
+MIME type `text/csv`. The details of how to accomplish this will of
+course depend on what tools you are using to contact the HTTP API.
+Here is an example from a Unix shell prompt using the command-line
+utility "curl":
+
+ curl -G -H "Accept: text/csv" "http://localhost:19002/query" --data-urlencode '
+ query=use dataverse csv;
+ set output-record-type "csv_type";
+ for $n in dataset csv_set return $n;'
+
+Similarly, a trivial Java program to execute the above sample query
+would be:
+
+    import java.net.HttpURLConnection;
+    import java.net.URL;
+    import java.net.URLEncoder;
+    import java.io.BufferedReader;
+    import java.io.InputStreamReader;
+
+    public class AsterixExample {
+        public static void main(String[] args) throws Exception {
+            // The AQL query, including the required "output-record-type" setting
+            String query = "use dataverse csv; " +
+                "set output-record-type \"csv_type\";" +
+                "for $n in dataset csv_set return $n";
+            URL asterix = new URL("http://localhost:19002/query?query=" +
+                                  URLEncoder.encode(query, "UTF-8"));
+            HttpURLConnection conn = (HttpURLConnection) asterix.openConnection();
+            // Request the CSV output format via the Accept header
+            conn.setRequestProperty("Accept", "text/csv");
+            // Read and print the CSV result line by line
+            BufferedReader result = new BufferedReader
+                (new InputStreamReader(conn.getInputStream()));
+            String line;
+            while ((line = result.readLine()) != null) {
+                System.out.println(line);
+            }
+            result.close();
+        }
+    }
+
+For either of the above examples, the output would be:
+
+ "id","money","name"
+ 1,18.5,"Peter Krabnitz"
+ 2,74.5,"Jesse Stevens"
+
+assuming you had already run the previous examples to create the
+dataverse and populate the dataset.
+
+#### Issues with open datatypes and optional fields
+
+As mentioned earlier, CSV is a rigid format. It cannot express records
+with different numbers of fields, which ADM allows through both open
+datatypes and optional fields.
+
+If your output record type contains optional fields, this will not
+itself cause any errors. When a result instance has no value for an
+optional field, that field will be represented in the CSV output as
+`null`.
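+
+For example, if `money` were declared optional and the second record
+had no value for it, the output of the earlier query might look like
+this (a hypothetical illustration):
+
+    "id","money","name"
+    1,18.5,"Peter Krabnitz"
+    2,null,"Jesse Stevens"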
+
+If your output record type is open, this will also not result in any
+errors. If the output data of a query contains any open fields, the
+corresponding rows in the resulting CSV will contain more
+comma-separated values than the others. On each such row, the data
+from the closed fields in the type will be output first in the normal
+order, followed by the data from the open fields in an arbitrary
+order.
+
+According to RFC 4180 this is not strictly valid CSV (Section 2, rule
+4: "Each line _should_ contain the same number of fields throughout
+the file"). Hence it will likely not be handled consistently by all
+CSV processors, and some may throw a parsing error. If you attempt to
+load this data back into AsterixDB later using `load dataset`, the
+extra fields will be silently ignored. For this reason it is
+recommended that you use only closed datatypes as output record
+types; AsterixDB permits an open record type here only to support
+cases where the type already exists for other parts of your
+application.
diff --git a/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/declared/AqlMetadataProvider.java b/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/declared/AqlMetadataProvider.java
index c02b23a..53b99ae 100644
--- a/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/declared/AqlMetadataProvider.java
+++ b/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/declared/AqlMetadataProvider.java
@@ -279,16 +279,16 @@
}
/**
- * Retrieve the Output RecordType, as defined by "set outputRecordType".
+ * Retrieve the Output RecordType, as defined by "set output-record-type".
*/
public ARecordType findOutputRecordType() throws AlgebricksException {
- String outputRecordType = getPropertyValue("outputRecordType");
+ String outputRecordType = getPropertyValue("output-record-type");
if (outputRecordType == null) {
return null;
}
String dataverse = getDefaultDataverseName();
if (dataverse == null) {
- throw new AlgebricksException("Cannot declare outputRecordType with no dataverse!");
+ throw new AlgebricksException("Cannot declare output-record-type with no dataverse!");
}
IAType type = findType(dataverse, outputRecordType);
if (!(type instanceof ARecordType)) {