Add test case and documentation for CSV parsing with headers.
Also fix a minor CSV parsing bug: lineCount is now incremented when an unquoted newline ends a record.
Change-Id: Ib875d60aa2465d4a50ee50c5c0e9356185f35c73
Reviewed-on: http://fulliautomatix.ics.uci.edu:8443/228
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Taewoo Kim <wangsaeu@gmail.com>
Reviewed-by: Ian Maxon <imaxon@uci.edu>
diff --git a/asterix-app/data/csv/sample_08_header.csv b/asterix-app/data/csv/sample_08_header.csv
new file mode 100644
index 0000000..7444e77
--- /dev/null
+++ b/asterix-app/data/csv/sample_08_header.csv
@@ -0,0 +1,9 @@
+id,float,"double","date",time,datetime
+1,0.899682764,5.6256,2013-08-07,07:22:35,1979-02-25T23:48:27.034
+2,0.669052398,,-1923-03-29,19:33:34,-1979-02-25T23:48:27.002
+3,0.572733058,192674,-1923-03-28,19:33:34,-1979-02-25T23:48:27.001
+4,,192674,-1923-03-27,19:33:34,-1979-02-25T23:48:27.001
+5,0.572733058,192674,,19:33:34,-1979-02-25T23:48:27.001
+6,0.572733058,192674,-1923-03-25,,-1979-02-25T23:48:27.001
+7,0.572733058,192674,-1923-03-24,19:33:34,
+8,,,,,
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_08/csv_08.1.ddl.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_08/csv_08.1.ddl.aql
new file mode 100644
index 0000000..e890942
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_08/csv_08.1.ddl.aql
@@ -0,0 +1,22 @@
+/**
+ * CSV file loading test (csv_08): recreates dataverse temp with closed type test
+ * (all non-key fields optional; date/time/datetime kept as strings) and dataset testds.
+ * Expected result: success
+ * NOTE(review): "create dataverse temp" below has no trailing ';' - confirm intended.
+ */
+
+drop dataverse temp if exists;
+create dataverse temp
+use dataverse temp;
+
+create type test as closed {
+ id: int32,
+ float: float?,
+ double: double?,
+ date: string?,
+ time: string?,
+ datetime: string?
+};
+
+create dataset testds (test)
+primary key id;
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_08/csv_08.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_08/csv_08.2.update.aql
new file mode 100644
index 0000000..e8eecf1
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_08/csv_08.2.update.aql
@@ -0,0 +1,12 @@
+/**
+ * CSV file loading test (csv_08): loads testds from sample_08_header.csv as
+ * delimited text with "header"="true", so the first line is ignored as a header.
+ * Expected result: success
+ *
+ */
+
+use dataverse temp;
+
+load dataset testds
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/csv/sample_08_header.csv"),("format"="delimited-text"),("header"="true"));
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_08/csv_08.3.query.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_08/csv_08.3.query.aql
new file mode 100644
index 0000000..efa6dbc
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_08/csv_08.3.query.aql
@@ -0,0 +1,18 @@
+/**
+ * CSV file loading test (csv_08): returns every testds record ordered by id,
+ * pairing each date/time/datetime string with its date()/time()/datetime() result.
+ * Expected result: success
+ *
+ */
+
+use dataverse temp;
+
+for $i in dataset testds
+order by $i.id
+return { "id": $i.id,
+ "float": $i.float,
+ "double": $i.double,
+ "date-before": $i.date, "date-after": date($i.date),
+ "time-before": $i.time, "time-after": time($i.time),
+ "datetime-before": $i.datetime, "datetime-after": datetime($i.datetime)
+ }
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/results/load/csv_08/csv_08.1.adm b/asterix-app/src/test/resources/runtimets/results/load/csv_08/csv_08.1.adm
new file mode 100644
index 0000000..6aaf019
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/results/load/csv_08/csv_08.1.adm
@@ -0,0 +1,9 @@
+[ { "id": 1, "float": 0.89968276f, "double": 5.6256d, "date-before": "2013-08-07", "date-after": date("2013-08-07"), "time-before": "07:22:35", "time-after": time("07:22:35.000Z"), "datetime-before": "1979-02-25T23:48:27.034", "datetime-after": datetime("1979-02-25T23:48:27.034Z") }
+, { "id": 2, "float": 0.6690524f, "double": null, "date-before": "-1923-03-29", "date-after": date("-1923-03-29"), "time-before": "19:33:34", "time-after": time("19:33:34.000Z"), "datetime-before": "-1979-02-25T23:48:27.002", "datetime-after": datetime("-1979-02-25T23:48:27.002Z") }
+, { "id": 3, "float": 0.57273304f, "double": 192674.0d, "date-before": "-1923-03-28", "date-after": date("-1923-03-28"), "time-before": "19:33:34", "time-after": time("19:33:34.000Z"), "datetime-before": "-1979-02-25T23:48:27.001", "datetime-after": datetime("-1979-02-25T23:48:27.001Z") }
+, { "id": 4, "float": null, "double": 192674.0d, "date-before": "-1923-03-27", "date-after": date("-1923-03-27"), "time-before": "19:33:34", "time-after": time("19:33:34.000Z"), "datetime-before": "-1979-02-25T23:48:27.001", "datetime-after": datetime("-1979-02-25T23:48:27.001Z") }
+, { "id": 5, "float": 0.57273304f, "double": 192674.0d, "date-before": null, "date-after": null, "time-before": "19:33:34", "time-after": time("19:33:34.000Z"), "datetime-before": "-1979-02-25T23:48:27.001", "datetime-after": datetime("-1979-02-25T23:48:27.001Z") }
+, { "id": 6, "float": 0.57273304f, "double": 192674.0d, "date-before": "-1923-03-25", "date-after": date("-1923-03-25"), "time-before": null, "time-after": null, "datetime-before": "-1979-02-25T23:48:27.001", "datetime-after": datetime("-1979-02-25T23:48:27.001Z") }
+, { "id": 7, "float": 0.57273304f, "double": 192674.0d, "date-before": "-1923-03-24", "date-after": date("-1923-03-24"), "time-before": "19:33:34", "time-after": time("19:33:34.000Z"), "datetime-before": null, "datetime-after": null }
+, { "id": 8, "float": null, "double": null, "date-before": null, "date-after": null, "time-before": null, "time-after": null, "datetime-before": null, "datetime-after": null }
+ ]
diff --git a/asterix-app/src/test/resources/runtimets/testsuite.xml b/asterix-app/src/test/resources/runtimets/testsuite.xml
index e5ea4d3..60f8e83 100644
--- a/asterix-app/src/test/resources/runtimets/testsuite.xml
+++ b/asterix-app/src/test/resources/runtimets/testsuite.xml
@@ -5098,6 +5098,11 @@
</compilation-unit>
</test-case>
<test-case FilePath="load">
+ <compilation-unit name="csv_08">
+ <output-dir compare="Text">csv_08</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="load">
<compilation-unit name="issue14_query">
<output-dir compare="Text">none</output-dir>
<expected-error>edu.uci.ics.asterix.common.exceptions.AsterixException</expected-error>
diff --git a/asterix-doc/src/site/markdown/csv.md b/asterix-doc/src/site/markdown/csv.md
index 53627b1..8e056b4 100644
--- a/asterix-doc/src/site/markdown/csv.md
+++ b/asterix-doc/src/site/markdown/csv.md
@@ -57,9 +57,6 @@
(("path"="127.0.0.1:///tmp/my_sample.csv"),
("format"="delimited-text"));
-**Note:** Currently the CSV input parser only supports CSV data
-without headers.
-
So, if the file `/tmp/my_sample.csv` contained
1,18.50,"Peter Krabnitz"
@@ -67,6 +64,19 @@
then the preceding query would load it into the dataset `csv_set`.
+If your CSV file has a header (that is, the first line contains a set
+of field names, rather than actual data), you can instruct AsterixDB to
+ignore this header by adding the parameter `"header"="true"`, e.g.:
+
+ load dataset "csv_set" using localfs
+ (("path"="127.0.0.1:///tmp/my_header_sample.csv"),
+ ("format"="delimited-text"),
+ ("header"="true"));
+
+This is useful when the CSV file was produced from an earlier
+AsterixDB operation, as AsterixDB's CSV output always has a header
+line.
+
CSV data may also be loaded from HDFS; see
[Accessing External Data](aql/externaldata.html) for details.
diff --git a/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/feeds/ExternalDataScanOperatorDescriptor.java b/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/feeds/ExternalDataScanOperatorDescriptor.java
index 52d6abb..953adcf 100644
--- a/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/feeds/ExternalDataScanOperatorDescriptor.java
+++ b/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/feeds/ExternalDataScanOperatorDescriptor.java
@@ -44,7 +44,7 @@
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
- IRecordDescriptorProvider recordDescProvider, final int partition, int nPartitions)
+ IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
throws HyracksDataException {
return new AbstractUnaryInputUnaryOutputOperatorNodePushable() {
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/operators/file/DelimitedDataParser.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/operators/file/DelimitedDataParser.java
index e160586..c3ddde6 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/operators/file/DelimitedDataParser.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/operators/file/DelimitedDataParser.java
@@ -327,6 +327,7 @@
} else if (ch == '\n' && !startedQuote) {
start = p + 1;
state = State.EOR;
+ lineCount++;
lastDelimiterPosition = p;
break;
} else if (ch == '\r' && !startedQuote) {
@@ -350,6 +351,7 @@
if (ch == '\n' && !startedQuote) {
++start;
state = State.EOR;
+ lineCount++;
} else {
state = State.IN_RECORD;
return true;