[ASTERIXDB-2713][EXT] Add CSV & TSV support for external dataset
- user model changes: no
- storage format changes: no
- interface changes: no
Details:
Add CSV support for external dataset.
- support S3
- add boolean parser to Hyracks
Change-Id: Id1790fa73461e9f4a5fb443c51c1905ac588cee6
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/5743
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Murtadha Hubail <mhubail@apache.org>
diff --git a/asterixdb/asterix-app/data/csv/01.csv b/asterixdb/asterix-app/data/csv/01.csv
new file mode 100644
index 0000000..6957e76
--- /dev/null
+++ b/asterixdb/asterix-app/data/csv/01.csv
@@ -0,0 +1,3 @@
+1,,"good","recommend"
+2,,"bad","not recommend"
+3,,"good",
\ No newline at end of file
diff --git a/asterixdb/asterix-app/data/csv/02.csv b/asterixdb/asterix-app/data/csv/02.csv
new file mode 100644
index 0000000..630843f
--- /dev/null
+++ b/asterixdb/asterix-app/data/csv/02.csv
@@ -0,0 +1,3 @@
+4,2018,"good","recommend"
+5,2018,,"not recommend"
+6,2018,"good",
\ No newline at end of file
diff --git a/asterixdb/asterix-app/data/csv/sample_09.csv b/asterixdb/asterix-app/data/csv/sample_09.csv
new file mode 100644
index 0000000..b14219d
--- /dev/null
+++ b/asterixdb/asterix-app/data/csv/sample_09.csv
@@ -0,0 +1,17 @@
+a,b,c,d,e
+0,", boo", 1,2,3
+1,"","",❤,
+2,3,4,\n,
+3,"quoted ""f"" field",,,
+4,4,,,
+5,"{""vehicle"": ""car"", ""location"": [2.0, 0.1]}",,,
+6,2,3,,
+7,8,9,,
+8,2,3,,
+9,8,9,,
+10,"field
+""f""
+with multiple lines",,,
+11,4,,,
+12,5,ʤ,,
+John,Green,111 downtown st.,"city, state",99999
\ No newline at end of file
diff --git a/asterixdb/asterix-app/data/csv/sample_10.csv b/asterixdb/asterix-app/data/csv/sample_10.csv
new file mode 100644
index 0000000..3beee08
--- /dev/null
+++ b/asterixdb/asterix-app/data/csv/sample_10.csv
@@ -0,0 +1,39 @@
+1,"?/ Text ending with a backslash / \",2000-09-03 07:12:22
+2,non quoted text!yes......,2003-08-09 22:34:19
+3,Text with more sentences. Another sentence.,2003-09-12 05:29:12
+4,"Quoted text.. yes.",2003-09-13 17:21:49
+5,Another text,2003-01-21 23:31:41
+6,Text with' quotes.,2003-09-14 20:15:50
+7,Text with quote's,2003-09-14 18:34:03
+8,"Text with quotes '",2003-01-28 20:32:13
+9,"Text with quotes """,2003-01-18 11:44:15
+10,Text with question marks!?!?,2003-09-18 06:25:56
+11,""" Text that starts with quotes",2003-09-12 00:31:24
+12,"Text with \"" backslash and quotes",2003-09-13 20:30:06
+13,"Text with \"" backslash and quotes\""",2003-09-14 16:20:36
+14,"Text that has comma ,",2003-09-12 08:21:18
+15,"Text that has "","" quoted comma",2003-09-12 08:21:18
+16,",Text that has ",2003-09-12 08:21:18
+17,","",Text that has ",2003-09-12 08:21:18
+18,"Text with commas,inside it., yes",2003-09-13 23:42:14
+19,"Text that has \n inside ",2003-09-12 08:21:18
+20,"Text that has \\\n inside ",2003-09-12 08:21:18
+21,text with :),2003-09-05 19:15:34
+22,"Text that has \\\"" inside \\",2003-09-12 08:21:18
+23,"Text that has \\\"" inside \\""",2003-09-12 08:21:18
+24,"""text that spans multiple
+Lines and more
+Lines ane more and more
+Lines ...
+And yet more lines
+And more""",2011-09-19 01:09:09
+25,"Text ""
+more lines",2011-09-19 01:09:09
+26,"""
+",2011-09-19 01:09:09
+27,"Text",""
+28,"Text","2011-09-19 01:09:09"
+29,"Text\.","2011-09-19 01:09:09"
+30,Text\.,"2011-09-19 01:09:09"
+31,"\.Text","2011-09-19 01:09:09"
+32,\.Text,"2011-09-19 01:09:09"
\ No newline at end of file
diff --git a/asterixdb/asterix-app/data/csv/sample_11.csv b/asterixdb/asterix-app/data/csv/sample_11.csv
new file mode 100644
index 0000000..b9a9571
--- /dev/null
+++ b/asterixdb/asterix-app/data/csv/sample_11.csv
@@ -0,0 +1,4 @@
+1,","", b", 3,4,5
+","", b",4, 3,4,5
+,,,,
+"dd",,,,
\ No newline at end of file
diff --git a/asterixdb/asterix-app/data/csv/sample_12.csv b/asterixdb/asterix-app/data/csv/sample_12.csv
new file mode 100644
index 0000000..2ab7c6d
--- /dev/null
+++ b/asterixdb/asterix-app/data/csv/sample_12.csv
@@ -0,0 +1,15 @@
+1,true,"text"
+2,false,"text"
+3,true,"text"
+4,true,""
+5,false,
+6,true,"text""
+more lines"
+7,false,"""
+"
+8,true,""
+9,false,"text"""
+10,false,text\.
+11,true,"text\."
+,false,\.text
+13,true,"\.text"
\ No newline at end of file
diff --git a/asterixdb/asterix-app/data/tsv/01.tsv b/asterixdb/asterix-app/data/tsv/01.tsv
new file mode 100644
index 0000000..98876c7
--- /dev/null
+++ b/asterixdb/asterix-app/data/tsv/01.tsv
@@ -0,0 +1,3 @@
+1 "good" "recommend"
+2 "bad" "not recommend"
+3 "good" "recommend"
\ No newline at end of file
diff --git a/asterixdb/asterix-app/data/tsv/02.tsv b/asterixdb/asterix-app/data/tsv/02.tsv
new file mode 100644
index 0000000..c01ce7c
--- /dev/null
+++ b/asterixdb/asterix-app/data/tsv/02.tsv
@@ -0,0 +1,3 @@
+4 2018 "good" "recommend"
+5 2018 "not recommend"
+6 2018 "good" "recommend"
\ No newline at end of file
diff --git a/asterixdb/asterix-app/data/tsv/sample_01.tsv b/asterixdb/asterix-app/data/tsv/sample_01.tsv
new file mode 100644
index 0000000..aab289a
--- /dev/null
+++ b/asterixdb/asterix-app/data/tsv/sample_01.tsv
@@ -0,0 +1,28 @@
+11 55 text field wih , charrrrrrrrrrr true 90 0.666666667
+12 55 text field with " charrrrrrrrrr false 90 0.666666667
+14 55 text field with ' charrrrrrrrrr false 90 0.666666667
+15 55 text field with \ charrrrrrrrrr false 90 0.666666667
+16 55 text field wih \, char true 90 0.666666667
+17 55 text field with \" charrrrrrrrr false 90 0.666666667
+18 55 text field with \' charrrrrrrrr false 90 0.666666667
+19 55 text field with \\ charrrrrrrrr false 90 0.666666667
+20 55 text field ending with charr , false 90 0.666666667
+21 55 text field ending with charr " false 90 0.666666667
+22 55 text field ending with charr ' false 90 0.666666667
+23 55 text field ending with charr \ false 90 0.666666667
+24 55 text field ending with charr \, false 90 0.666666667
+25 55 text field ending with charr \" false 90 0.666666667
+26 55 text field ending with charr \' false 90 0.666666667
+27 55 text field ending with charr \\ false 90 0.666666667
+28 55 ,text field starting with charr false 90 0.666666667
+29 55 "text field starting with charr false 90 0.666666667
+30 55 'text field starting with charr false 90 0.666666667
+31 55 \text field starting with charr false 90 0.666666667
+32 55 \,text field starting with char false 90 0.666666667
+33 55 \"text field starting with char false 90 0.666666667
+34 55 \'text field starting with char false 90 0.666666667
+35 55 \\text field starting with char false 90 0.666666667
+36 55 "text field inside with char" false 90 0.666666667
+37 55 text field with charrrrrrrrr false 90 0.666666667
+38 55 text field with "" charrrrrrrrr false 90 0.666666667
+39 55 text field "with" charrrrrrrrrr false 90 0.666666667
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/translator/QueryTranslator.java b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/translator/QueryTranslator.java
index c1eea7c..e211531 100644
--- a/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/translator/QueryTranslator.java
+++ b/asterixdb/asterix-app/src/main/java/org/apache/asterix/app/translator/QueryTranslator.java
@@ -85,6 +85,7 @@
import org.apache.asterix.external.indexing.IndexingConstants;
import org.apache.asterix.external.operators.FeedIntakeOperatorNodePushable;
import org.apache.asterix.external.util.ExternalDataConstants;
+import org.apache.asterix.external.util.ExternalDataUtils;
import org.apache.asterix.formats.nontagged.TypeTraitProvider;
import org.apache.asterix.lang.common.base.IReturningStatement;
import org.apache.asterix.lang.common.base.IRewriterFactory;
@@ -727,6 +728,7 @@
case EXTERNAL:
ExternalDetailsDecl externalDetails = (ExternalDetailsDecl) dd.getDatasetDetailsDecl();
Map<String, String> properties = createExternalDatasetProperties(dd, metadataProvider, mdTxnCtx);
+ ExternalDataUtils.defaultConfiguration(properties);
datasetDetails = new ExternalDatasetDetails(externalDetails.getAdapter(), properties, new Date(),
TransactionState.COMMIT);
break;
diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java
index 3b4cdf8..d2158ba 100644
--- a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java
+++ b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/aws/AwsS3ExternalDatasetTest.java
@@ -18,9 +18,12 @@
*/
package org.apache.asterix.test.external_dataset.aws;
+import static org.apache.hyracks.util.file.FileUtil.joinPath;
+
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.URI;
+import java.nio.file.Paths;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
@@ -66,9 +69,13 @@
private static S3Client client;
private static final String S3_MOCK_SERVER_BUCKET = "playground";
private static final String S3_MOCK_SERVER_BUCKET_DEFINITION = "json-data/reviews/"; // data resides here
+ private static final String S3_MOCK_SERVER_BUCKET_CSV_DEFINITION = "csv-data/reviews/"; // data resides here
+ private static final String S3_MOCK_SERVER_BUCKET_TSV_DEFINITION = "tsv-data/reviews/"; // data resides here
private static final String S3_MOCK_SERVER_REGION = "us-west-2";
private static final int S3_MOCK_SERVER_PORT = 8001;
private static final String S3_MOCK_SERVER_HOSTNAME = "http://localhost:" + S3_MOCK_SERVER_PORT;
+ private static final String CSV_DATA_PATH = joinPath("data", "csv");
+ private static final String TSV_DATA_PATH = joinPath("data", "tsv");
@BeforeClass
public static void setUp() throws Exception {
@@ -210,6 +217,26 @@
PutObjectRequest.builder().bucket(S3_MOCK_SERVER_BUCKET)
.key(S3_MOCK_SERVER_BUCKET_DEFINITION + "2019/q2/2.json").build(),
RequestBody.fromString("{\"id\": 14, \"year\": 2019, \"quarter\": 2, \"review\": \"bad\"}"));
+
+ LOGGER.info("Adding CSV files to the bucket");
+ client.putObject(
+ PutObjectRequest.builder().bucket(S3_MOCK_SERVER_BUCKET)
+ .key(S3_MOCK_SERVER_BUCKET_CSV_DEFINITION + "01.csv").build(),
+ RequestBody.fromFile(Paths.get(CSV_DATA_PATH, "01.csv")));
+ client.putObject(
+ PutObjectRequest.builder().bucket(S3_MOCK_SERVER_BUCKET)
+ .key(S3_MOCK_SERVER_BUCKET_CSV_DEFINITION + "2018/01.csv").build(),
+ RequestBody.fromFile(Paths.get(CSV_DATA_PATH, "02.csv")));
+
+ LOGGER.info("Adding TSV files to the bucket");
+ client.putObject(
+ PutObjectRequest.builder().bucket(S3_MOCK_SERVER_BUCKET)
+ .key(S3_MOCK_SERVER_BUCKET_TSV_DEFINITION + "01.tsv").build(),
+ RequestBody.fromFile(Paths.get(TSV_DATA_PATH, "01.tsv")));
+ client.putObject(
+ PutObjectRequest.builder().bucket(S3_MOCK_SERVER_BUCKET)
+ .key(S3_MOCK_SERVER_BUCKET_TSV_DEFINITION + "2018/01.tsv").build(),
+ RequestBody.fromFile(Paths.get(TSV_DATA_PATH, "02.tsv")));
LOGGER.info("Files added successfully");
}
}
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp
new file mode 100644
index 0000000..5728e78
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+DROP DATAVERSE test IF EXISTS;
+CREATE DATAVERSE test;
+
+USE test;
+
+CREATE TYPE t1 AS {f1: string, f2: string, f3: string, f4: string, f5: string};
+CREATE TYPE t2 AS {f1: string, f2: string, f3: string};
+CREATE TYPE t3 AS {f1: int?, f2: boolean, f3: string?};
+
+CREATE EXTERNAL DATASET ds1(t1) USING localfs(("path"="asterix_nc1://data/csv/sample_09.csv"), ("format"="csv"));
+CREATE EXTERNAL DATASET ds2(t2) USING localfs(("path"="asterix_nc1://data/csv/sample_10.csv"), ("format"="csv"));
+CREATE EXTERNAL DATASET ds3(t1) USING localfs(("path"="asterix_nc1://data/csv/sample_11.csv"), ("format"="csv"));
+CREATE EXTERNAL DATASET ds4(t3) USING localfs(("path"="asterix_nc1://data/csv/sample_12.csv"), ("format"="csv"));
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.2.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.2.query.sqlpp
new file mode 100644
index 0000000..d870372
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.2.query.sqlpp
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+FROM ds1 v SELECT VALUE v;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.3.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.3.query.sqlpp
new file mode 100644
index 0000000..64a2f8a
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.3.query.sqlpp
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+FROM ds2 v SELECT VALUE v;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.4.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.4.query.sqlpp
new file mode 100644
index 0000000..313198c
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.4.query.sqlpp
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+FROM ds3 v SELECT VALUE v;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.5.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.5.query.sqlpp
new file mode 100644
index 0000000..065de4e
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.5.query.sqlpp
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+FROM ds4 v SELECT VALUE v;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.6.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.6.ddl.sqlpp
new file mode 100644
index 0000000..86a1b59
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.6.ddl.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+DROP DATAVERSE test;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.1.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.1.ddl.sqlpp
new file mode 100644
index 0000000..c0faf16
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.1.ddl.sqlpp
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+DROP DATAVERSE test IF EXISTS;
+CREATE DATAVERSE test;
+
+USE test;
+
+CREATE TYPE t1 AS {f1: int, f2: int, f3: string, f4: boolean, f5: bigint, f6: double};
+
+CREATE EXTERNAL DATASET ds1(t1) USING localfs(("path"="asterix_nc1://data/tsv/sample_01.tsv"), ("format"="tsv"))
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.2.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.2.query.sqlpp
new file mode 100644
index 0000000..d870372
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.2.query.sqlpp
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+FROM ds1 v SELECT VALUE v;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.3.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.3.ddl.sqlpp
new file mode 100644
index 0000000..86a1b59
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/tsv-parser-001/tsv-parser-002.3.ddl.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+DROP DATAVERSE test;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.000.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.000.ddl.sqlpp
new file mode 100644
index 0000000..b906039
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.000.ddl.sqlpp
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+DROP DATAVERSE test IF EXISTS;
+CREATE DATAVERSE test;
+USE test;
+
+DROP TYPE test IF EXISTS;
+CREATE TYPE test AS {id: int, year: int?, review: string, details: string?};
+
+DROP DATASET test IF EXISTS;
+CREATE EXTERNAL DATASET test(test) USING S3 (
+("accessKey"="dummyAccessKey"),
+("secretKey"="dummySecretKey"),
+("region"="us-west-2"),
+("serviceEndpoint"="http://localhost:8001"),
+("container"="playground"),
+("definition"="csv-data/reviews"),
+("format"="csv")
+);
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.002.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.002.query.sqlpp
new file mode 100644
index 0000000..6e31eb3
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.002.query.sqlpp
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+FROM test SELECT VALUE test ORDER BY id ASC;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.003.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.003.ddl.sqlpp
new file mode 100644
index 0000000..0ff713d
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/001/query-dataset.003.ddl.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+DROP DATASET test IF EXISTS;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.000.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.000.ddl.sqlpp
new file mode 100644
index 0000000..d385bee
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.000.ddl.sqlpp
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+DROP DATAVERSE test IF EXISTS;
+CREATE DATAVERSE test;
+USE test;
+
+DROP TYPE test IF EXISTS;
+CREATE TYPE test AS {id: int, year: int?, review: string, details: string?};
+
+DROP DATASET test IF EXISTS;
+CREATE EXTERNAL DATASET test(test) USING S3 (
+("accessKey"="dummyAccessKey"),
+("secretKey"="dummySecretKey"),
+("region"="us-west-2"),
+("serviceEndpoint"="http://localhost:8001"),
+("container"="playground"),
+("definition"="tsv-data/reviews"),
+("format"="tsv")
+);
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.002.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.002.query.sqlpp
new file mode 100644
index 0000000..6e31eb3
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.002.query.sqlpp
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+FROM test SELECT VALUE test ORDER BY id ASC;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.003.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.003.ddl.sqlpp
new file mode 100644
index 0000000..0ff713d
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/002/query-dataset.003.ddl.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+DROP DATASET test IF EXISTS;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.2.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.2.adm
new file mode 100644
index 0000000..5c84fb8
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.2.adm
@@ -0,0 +1,15 @@
+{ "f1": "a", "f2": "b", "f3": "c", "f4": "d", "f5": "e" }
+{ "f1": "0", "f2": ", boo", "f3": " 1", "f4": "2", "f5": "3" }
+{ "f1": "1", "f2": "", "f3": "", "f4": "❤", "f5": "" }
+{ "f1": "2", "f2": "3", "f3": "4", "f4": "\\n", "f5": "" }
+{ "f1": "3", "f2": "quoted \"f\" field", "f3": "", "f4": "", "f5": "" }
+{ "f1": "4", "f2": "4", "f3": "", "f4": "", "f5": "" }
+{ "f1": "5", "f2": "{\"vehicle\": \"car\", \"location\": [2.0, 0.1]}", "f3": "", "f4": "", "f5": "" }
+{ "f1": "6", "f2": "2", "f3": "3", "f4": "", "f5": "" }
+{ "f1": "7", "f2": "8", "f3": "9", "f4": "", "f5": "" }
+{ "f1": "8", "f2": "2", "f3": "3", "f4": "", "f5": "" }
+{ "f1": "9", "f2": "8", "f3": "9", "f4": "", "f5": "" }
+{ "f1": "10", "f2": "field\n\"f\"\nwith multiple lines", "f3": "", "f4": "", "f5": "" }
+{ "f1": "11", "f2": "4", "f3": "", "f4": "", "f5": "" }
+{ "f1": "12", "f2": "5", "f3": "ʤ", "f4": "", "f5": "" }
+{ "f1": "John", "f2": "Green", "f3": "111 downtown st.", "f4": "city, state", "f5": "99999" }
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.3.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.3.adm
new file mode 100644
index 0000000..80f5fb7
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.3.adm
@@ -0,0 +1,32 @@
+{ "f1": "1", "f2": "?/ Text ending with a backslash / \\", "f3": "2000-09-03 07:12:22" }
+{ "f1": "2", "f2": "non quoted text!yes......", "f3": "2003-08-09 22:34:19" }
+{ "f1": "3", "f2": "Text with more sentences. Another sentence.", "f3": "2003-09-12 05:29:12" }
+{ "f1": "4", "f2": "Quoted text.. yes.", "f3": "2003-09-13 17:21:49" }
+{ "f1": "5", "f2": "Another text", "f3": "2003-01-21 23:31:41" }
+{ "f1": "6", "f2": "Text with' quotes.", "f3": "2003-09-14 20:15:50" }
+{ "f1": "7", "f2": "Text with quote's", "f3": "2003-09-14 18:34:03" }
+{ "f1": "8", "f2": "Text with quotes '", "f3": "2003-01-28 20:32:13" }
+{ "f1": "9", "f2": "Text with quotes \"", "f3": "2003-01-18 11:44:15" }
+{ "f1": "10", "f2": "Text with question marks!?!?", "f3": "2003-09-18 06:25:56" }
+{ "f1": "11", "f2": "\" Text that starts with quotes", "f3": "2003-09-12 00:31:24" }
+{ "f1": "12", "f2": "Text with \\\" backslash and quotes", "f3": "2003-09-13 20:30:06" }
+{ "f1": "13", "f2": "Text with \\\" backslash and quotes\\\"", "f3": "2003-09-14 16:20:36" }
+{ "f1": "14", "f2": "Text that has comma ,", "f3": "2003-09-12 08:21:18" }
+{ "f1": "15", "f2": "Text that has \",\" quoted comma", "f3": "2003-09-12 08:21:18" }
+{ "f1": "16", "f2": ",Text that has ", "f3": "2003-09-12 08:21:18" }
+{ "f1": "17", "f2": ",\",Text that has ", "f3": "2003-09-12 08:21:18" }
+{ "f1": "18", "f2": "Text with commas,inside it., yes", "f3": "2003-09-13 23:42:14" }
+{ "f1": "19", "f2": "Text that has \\n inside ", "f3": "2003-09-12 08:21:18" }
+{ "f1": "20", "f2": "Text that has \\\\\\n inside ", "f3": "2003-09-12 08:21:18" }
+{ "f1": "21", "f2": "text with :)", "f3": "2003-09-05 19:15:34" }
+{ "f1": "22", "f2": "Text that has \\\\\\\" inside \\\\", "f3": "2003-09-12 08:21:18" }
+{ "f1": "23", "f2": "Text that has \\\\\\\" inside \\\\\"", "f3": "2003-09-12 08:21:18" }
+{ "f1": "24", "f2": "\"text that spans multiple\nLines and more\nLines ane more and more\nLines ...\nAnd yet more lines\nAnd more\"", "f3": "2011-09-19 01:09:09" }
+{ "f1": "25", "f2": "Text \"\nmore lines", "f3": "2011-09-19 01:09:09" }
+{ "f1": "26", "f2": "\"\n", "f3": "2011-09-19 01:09:09" }
+{ "f1": "27", "f2": "Text", "f3": "" }
+{ "f1": "28", "f2": "Text", "f3": "2011-09-19 01:09:09" }
+{ "f1": "29", "f2": "Text\\.", "f3": "2011-09-19 01:09:09" }
+{ "f1": "30", "f2": "Text\\.", "f3": "2011-09-19 01:09:09" }
+{ "f1": "31", "f2": "\\.Text", "f3": "2011-09-19 01:09:09" }
+{ "f1": "32", "f2": "\\.Text", "f3": "2011-09-19 01:09:09" }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.4.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.4.adm
new file mode 100644
index 0000000..5c61b4a
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.4.adm
@@ -0,0 +1,4 @@
+{ "f1": "1", "f2": ",\", b", "f3": " 3", "f4": "4", "f5": "5" }
+{ "f1": ",\", b", "f2": "4", "f3": " 3", "f4": "4", "f5": "5" }
+{ "f1": "", "f2": "", "f3": "", "f4": "", "f5": "" }
+{ "f1": "dd", "f2": "", "f3": "", "f4": "", "f5": "" }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.5.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.5.adm
new file mode 100644
index 0000000..4b80e26
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.5.adm
@@ -0,0 +1,13 @@
+{ "f1": 1, "f2": true, "f3": "text" }
+{ "f1": 2, "f2": false, "f3": "text" }
+{ "f1": 3, "f2": true, "f3": "text" }
+{ "f1": 4, "f2": true, "f3": null }
+{ "f1": 5, "f2": false, "f3": null }
+{ "f1": 6, "f2": true, "f3": "text\"\nmore lines" }
+{ "f1": 7, "f2": false, "f3": "\"\n" }
+{ "f1": 8, "f2": true, "f3": null }
+{ "f1": 9, "f2": false, "f3": "text\"" }
+{ "f1": 10, "f2": false, "f3": "text\\." }
+{ "f1": 11, "f2": true, "f3": "text\\." }
+{ "f1": null, "f2": false, "f3": "\\.text" }
+{ "f1": 13, "f2": true, "f3": "\\.text" }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/tsv-parser-001/tsv-parser-001.2.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/tsv-parser-001/tsv-parser-001.2.adm
new file mode 100644
index 0000000..fbe287b
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/tsv-parser-001/tsv-parser-001.2.adm
@@ -0,0 +1,28 @@
+{ "f1": 11, "f2": 55, "f3": "text field wih , charrrrrrrrrrr", "f4": true, "f5": 90, "f6": 0.666666667 }
+{ "f1": 12, "f2": 55, "f3": "text field with \" charrrrrrrrrr", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 14, "f2": 55, "f3": "text field with ' charrrrrrrrrr", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 15, "f2": 55, "f3": "text field with \\ charrrrrrrrrr", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 16, "f2": 55, "f3": "text field wih \\, char ", "f4": true, "f5": 90, "f6": 0.666666667 }
+{ "f1": 17, "f2": 55, "f3": "text field with \\\" charrrrrrrrr", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 18, "f2": 55, "f3": "text field with \\' charrrrrrrrr", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 19, "f2": 55, "f3": "text field with \\\\ charrrrrrrrr", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 20, "f2": 55, "f3": "text field ending with charr ,", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 21, "f2": 55, "f3": "text field ending with charr \"", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 22, "f2": 55, "f3": "text field ending with charr '", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 23, "f2": 55, "f3": "text field ending with charr \\", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 24, "f2": 55, "f3": "text field ending with charr \\,", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 25, "f2": 55, "f3": "text field ending with charr \\\"", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 26, "f2": 55, "f3": "text field ending with charr \\'", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 27, "f2": 55, "f3": "text field ending with charr \\\\", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 28, "f2": 55, "f3": ",text field starting with charr", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 29, "f2": 55, "f3": "\"text field starting with charr", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 30, "f2": 55, "f3": "'text field starting with charr", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 31, "f2": 55, "f3": "\\text field starting with charr", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 32, "f2": 55, "f3": "\\,text field starting with char", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 33, "f2": 55, "f3": "\\\"text field starting with char", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 34, "f2": 55, "f3": "\\'text field starting with char", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 35, "f2": 55, "f3": "\\\\text field starting with char", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 36, "f2": 55, "f3": "\"text field inside with char\"", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 37, "f2": 55, "f3": " text field with charrrrrrrrr ", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 38, "f2": 55, "f3": "text field with \"\" charrrrrrrrr", "f4": false, "f5": 90, "f6": 0.666666667 }
+{ "f1": 39, "f2": 55, "f3": "text field \"with\" charrrrrrrrrr", "f4": false, "f5": 90, "f6": 0.666666667 }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/001/external_dataset.001.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/001/external_dataset.001.adm
new file mode 100644
index 0000000..93d1b57
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/001/external_dataset.001.adm
@@ -0,0 +1,6 @@
+{ "id": 1, "year": null, "review": "good", "details": "recommend" }
+{ "id": 2, "year": null, "review": "bad", "details": "not recommend" }
+{ "id": 3, "year": null, "review": "good", "details": null }
+{ "id": 4, "year": 2018, "review": "good", "details": "recommend" }
+{ "id": 5, "year": 2018, "review": "", "details": "not recommend" }
+{ "id": 6, "year": 2018, "review": "good", "details": null }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/002/external_dataset.001.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/002/external_dataset.001.adm
new file mode 100644
index 0000000..1954b05
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/002/external_dataset.001.adm
@@ -0,0 +1,6 @@
+{ "id": 1, "year": null, "review": "\"good\"", "details": "\"recommend\"" }
+{ "id": 2, "year": null, "review": "\"bad\"", "details": "\"not recommend\"" }
+{ "id": 3, "year": null, "review": "\"good\"", "details": "\"recommend\"" }
+{ "id": 4, "year": 2018, "review": "\"good\"", "details": "\"recommend\"" }
+{ "id": 5, "year": 2018, "review": "", "details": "\"not recommend\"" }
+{ "id": 6, "year": 2018, "review": "\"good\"", "details": "\"recommend\"" }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml
index cd1fb12..9948209 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml
@@ -24,5 +24,15 @@
<output-dir compare="Text">aws/s3/000</output-dir>
</compilation-unit>
</test-case>
+ <test-case FilePath="external-dataset">
+ <compilation-unit name="aws/s3/001">
+ <output-dir compare="Text">aws/s3/001</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="external-dataset">
+ <compilation-unit name="aws/s3/002">
+ <output-dir compare="Text">aws/s3/002</output-dir>
+ </compilation-unit>
+ </test-case>
</test-group>
</test-suite>
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
index 63db153..a578690 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
@@ -12249,6 +12249,18 @@
</compilation-unit>
</test-case>
</test-group>
+ <test-group name="csv-tsv-parser">
+ <test-case FilePath="csv-tsv-parser">
+ <compilation-unit name="csv-parser-001">
+ <output-dir compare="Text">csv-parser-001</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="csv-tsv-parser">
+ <compilation-unit name="tsv-parser-001">
+ <output-dir compare="Text">tsv-parser-001</output-dir>
+ </compilation-unit>
+ </test-case>
+ </test-group>
<test-group name="binary">
<test-case FilePath="binary">
<compilation-unit name="parse">
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java
index 8255ebb..5c8f219 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java
@@ -41,7 +41,7 @@
this.cursor = new FieldCursorForDelimitedDataParser(null, delimiter, ExternalDataConstants.QUOTE);
this.record = new CharArrayRecord();
this.valueIndex = valueIndex;
- this.recordWithMetadata = new RecordWithMetadataAndPK<char[]>(record, metaType.getFieldTypes(), recordType,
+ this.recordWithMetadata = new RecordWithMetadataAndPK<>(record, metaType.getFieldTypes(), recordType,
keyIndicator, keyIndexes, keyTypes);
}
@@ -53,16 +53,15 @@
int i = 0;
int j = 0;
while (cursor.nextField()) {
- if (cursor.isDoubleQuoteIncludedInThisField) {
- cursor.eliminateDoubleQuote(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart);
- cursor.fEnd -= cursor.doubleQuoteCount;
- cursor.isDoubleQuoteIncludedInThisField = false;
+ if (cursor.fieldHasDoubleQuote()) {
+ cursor.eliminateDoubleQuote();
}
if (i == valueIndex) {
- record.setValue(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart);
+ record.setValue(cursor.getBuffer(), cursor.getFieldStart(), cursor.getFieldLength());
record.endRecord();
} else {
- recordWithMetadata.setRawMetadata(j, cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart);
+ recordWithMetadata.setRawMetadata(j, cursor.getBuffer(), cursor.getFieldStart(),
+ cursor.getFieldLength());
j++;
}
i++;
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java
index a9f7898..6b8bb59 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java
@@ -27,6 +27,7 @@
import java.util.Map;
import org.apache.asterix.common.dataflow.ICcApplicationContext;
+import org.apache.asterix.common.exceptions.AsterixException;
import org.apache.asterix.common.exceptions.ErrorCode;
import org.apache.asterix.external.api.AsterixInputStream;
import org.apache.asterix.external.api.IInputStreamFactory;
@@ -35,7 +36,6 @@
import org.apache.hyracks.algebricks.common.exceptions.AlgebricksException;
import org.apache.hyracks.api.application.IServiceContext;
import org.apache.hyracks.api.context.IHyracksTaskContext;
-import org.apache.hyracks.api.exceptions.HyracksDataException;
import software.amazon.awssdk.auth.credentials.AwsBasicCredentials;
import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider;
@@ -77,8 +77,7 @@
}
@Override
- public void configure(IServiceContext ctx, Map<String, String> configuration)
- throws AlgebricksException, HyracksDataException {
+ public void configure(IServiceContext ctx, Map<String, String> configuration) throws AlgebricksException {
this.configuration = configuration;
ICcApplicationContext ccApplicationContext = (ICcApplicationContext) ctx.getApplicationContext();
@@ -115,13 +114,13 @@
*
* @return A list of string paths that point to files only
*
- * @throws HyracksDataException HyracksDataException
+ * @throws AsterixException AsterixException
*/
- private List<S3Object> getFilesOnly(List<S3Object> s3Objects, String fileFormat) throws HyracksDataException {
+ private List<S3Object> getFilesOnly(List<S3Object> s3Objects, String fileFormat) throws AsterixException {
List<S3Object> filesOnly = new ArrayList<>();
String fileExtension = getFileExtension(fileFormat);
if (fileExtension == null) {
- throw HyracksDataException.create(ErrorCode.INVALID_FORMAT);
+ throw AsterixException.create(ErrorCode.PROVIDER_STREAM_RECORD_READER_UNKNOWN_FORMAT, fileFormat);
}
s3Objects.stream().filter(object -> object.key().endsWith(fileExtension)).forEach(filesOnly::add);
@@ -214,8 +213,12 @@
*/
private String getFileExtension(String format) {
switch (format.toLowerCase()) {
- case "json":
+ case ExternalDataConstants.FORMAT_JSON_LOWER_CASE:
return ".json";
+ case ExternalDataConstants.FORMAT_CSV:
+ return ".csv";
+ case ExternalDataConstants.FORMAT_TSV:
+ return ".tsv";
default:
return null;
}
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/LineRecordReader.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/LineRecordReader.java
index 0b41d4b..be600ed 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/LineRecordReader.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/LineRecordReader.java
@@ -36,8 +36,9 @@
protected int newlineLength;
protected int recordNumber = 0;
protected boolean nextIsHeader = false;
- private static final List<String> recordReaderFormats = Collections.unmodifiableList(
- Arrays.asList(ExternalDataConstants.FORMAT_DELIMITED_TEXT, ExternalDataConstants.FORMAT_CSV));
+ private static final List<String> recordReaderFormats =
+ Collections.unmodifiableList(Arrays.asList(ExternalDataConstants.FORMAT_DELIMITED_TEXT,
+ ExternalDataConstants.FORMAT_CSV, ExternalDataConstants.FORMAT_TSV));
private static final String REQUIRED_CONFIGS = "";
@Override
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
index 4c4128a..1fd328b 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
@@ -32,11 +32,10 @@
public class QuotedLineRecordReader extends LineRecordReader {
private char quote;
- private boolean prevCharEscape;
- private boolean inQuote;
+ private char quoteEscape;
private static final List<String> recordReaderFormats = Collections.unmodifiableList(
Arrays.asList(ExternalDataConstants.FORMAT_DELIMITED_TEXT, ExternalDataConstants.FORMAT_CSV));
- private static final String REQUIRED_CONFIGS = "quote";
+ private static final String REQUIRED_CONFIGS = ExternalDataConstants.KEY_QUOTE;
@Override
public void configure(AsterixInputStream inputStream, Map<String, String> config) throws HyracksDataException {
@@ -47,6 +46,17 @@
ExternalDataConstants.PARAMETER_OF_SIZE_ONE, quoteString));
}
this.quote = quoteString.charAt(0);
+ String escapeString = config.get(ExternalDataConstants.KEY_QUOTE_ESCAPE);
+ if (escapeString == null) {
+ quoteEscape = ExternalDataConstants.ESCAPE;
+ } else {
+ if (escapeString.length() != 1) {
+ throw new HyracksDataException(
+ ExceptionUtils.incorrectParameterMessage(ExternalDataConstants.KEY_QUOTE_ESCAPE,
+ ExternalDataConstants.PARAMETER_OF_SIZE_ONE, escapeString));
+ }
+ quoteEscape = escapeString.charAt(0);
+ }
}
@Override
@@ -67,31 +77,35 @@
}
newlineLength = 0;
prevCharCR = false;
- prevCharEscape = false;
+ boolean prevCharEscape = false;
record.reset();
int readLength = 0;
- inQuote = false;
+ boolean inQuote = false;
do {
int startPosn = bufferPosn;
if (bufferPosn >= bufferLength) {
startPosn = bufferPosn = 0;
bufferLength = reader.read(inputBuffer);
if (bufferLength <= 0) {
- {
- if (readLength > 0) {
- if (inQuote) {
- throw new IOException("malformed input record ended inside quote");
- }
- record.endRecord();
- recordNumber++;
- return true;
+ if (readLength > 0) {
+ if (inQuote) {
+ throw new IOException("malformed input record ended inside quote");
}
- close();
- return false;
+ record.endRecord();
+ recordNumber++;
+ return true;
}
+ close();
+ return false;
}
}
+ boolean maybeInQuote = false;
for (; bufferPosn < bufferLength; ++bufferPosn) {
+ if (inputBuffer[bufferPosn] == quote && quoteEscape == quote) {
+ inQuote |= maybeInQuote;
+ prevCharEscape |= maybeInQuote;
+ }
+ maybeInQuote = false;
if (!inQuote) {
if (inputBuffer[bufferPosn] == ExternalDataConstants.LF) {
newlineLength = (prevCharCR) ? 2 : 1;
@@ -103,24 +117,25 @@
break;
}
prevCharCR = (inputBuffer[bufferPosn] == ExternalDataConstants.CR);
- if (inputBuffer[bufferPosn] == quote) {
- if (!prevCharEscape) {
- inQuote = true;
- }
+ if (inputBuffer[bufferPosn] == quote && !prevCharEscape) {
+ // this is an opening quote
+ inQuote = true;
}
if (prevCharEscape) {
prevCharEscape = false;
} else {
- prevCharEscape = inputBuffer[bufferPosn] == ExternalDataConstants.ESCAPE;
+ // the quoteEscape != quote is for making an opening quote not an escape
+ prevCharEscape = inputBuffer[bufferPosn] == quoteEscape && quoteEscape != quote;
}
} else {
- // only look for next quote
- if (inputBuffer[bufferPosn] == quote) {
- if (!prevCharEscape) {
- inQuote = false;
- }
+ // if quote == quoteEscape and current char is quote, then it could be closing or escaping
+ if (inputBuffer[bufferPosn] == quote && !prevCharEscape) {
+ // this is most likely a closing quote. the outcome depends on the next char
+ inQuote = false;
+ maybeInQuote = true;
}
- prevCharEscape = inputBuffer[bufferPosn] == ExternalDataConstants.ESCAPE;
+ prevCharEscape =
+ inputBuffer[bufferPosn] == quoteEscape && !prevCharEscape && quoteEscape != quote;
}
}
readLength = bufferPosn - startPosn;
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java
index 4e371c8..8facce6 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java
@@ -57,9 +57,9 @@
private ArrayBackedValueStorage[] nameBuffers;
private boolean areAllNullFields;
- public DelimitedDataParser(IValueParserFactory[] valueParserFactories, char fieldDelimter, char quote,
+ public DelimitedDataParser(IValueParserFactory[] valueParserFactories, char fieldDelimiter, char quote,
boolean hasHeader, ARecordType recordType, boolean isStreamParser) throws HyracksDataException {
- this.fieldDelimiter = fieldDelimter;
+ this.fieldDelimiter = fieldDelimiter;
this.quote = quote;
this.hasHeader = hasHeader;
this.recordType = recordType;
@@ -98,7 +98,7 @@
}
}
if (!isStreamParser) {
- cursor = new FieldCursorForDelimitedDataParser(null, fieldDelimiter, quote);
+ cursor = new FieldCursorForDelimitedDataParser(null, this.fieldDelimiter, quote);
}
}
@@ -134,25 +134,23 @@
fieldValueBuffer.reset();
try {
- if (cursor.fStart == cursor.fEnd && recordType.getFieldTypes()[i].getTypeTag() != ATypeTag.STRING
+ if (cursor.isFieldEmpty() && recordType.getFieldTypes()[i].getTypeTag() != ATypeTag.STRING
&& recordType.getFieldTypes()[i].getTypeTag() != ATypeTag.NULL) {
// if the field is empty and the type is optional, insert
// NULL. Note that string type can also process empty field as an
// empty string
if (!NonTaggedFormatUtil.isOptional(recordType.getFieldTypes()[i])) {
- throw new RuntimeDataException(ErrorCode.PARSER_DELIMITED_NONOPTIONAL_NULL, cursor.recordCount,
- cursor.fieldCount);
+ throw new RuntimeDataException(ErrorCode.PARSER_DELIMITED_NONOPTIONAL_NULL,
+ cursor.getRecordCount(), cursor.getFieldCount());
}
fieldValueBufferOutput.writeByte(ATypeTag.SERIALIZED_NULL_TYPE_TAG);
} else {
fieldValueBufferOutput.writeByte(fieldTypeTags[i]);
- // Eliminate doule quotes in the field that we are going to parse
- if (cursor.isDoubleQuoteIncludedInThisField) {
- cursor.eliminateDoubleQuote(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart);
- cursor.fEnd -= cursor.doubleQuoteCount;
- cursor.isDoubleQuoteIncludedInThisField = false;
+ // Eliminate double quotes in the field that we are going to parse
+ if (cursor.fieldHasDoubleQuote()) {
+ cursor.eliminateDoubleQuote();
}
- valueParsers[i].parse(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart,
+ valueParsers[i].parse(cursor.getBuffer(), cursor.getFieldStart(), cursor.getFieldLength(),
fieldValueBufferOutput);
areAllNullFields = false;
}
@@ -165,15 +163,14 @@
throw HyracksDataException.create(e);
}
}
+ if (valueParsers.length != cursor.getFieldCount()) {
+ throw new HyracksDataException("Record #" + cursor.getRecordCount() + " is missing some fields");
+ }
}
@Override
public void parse(IRawRecord<? extends char[]> record, DataOutput out) throws HyracksDataException {
- try {
- cursor.nextRecord(record.get(), record.size());
- } catch (IOException e) {
- throw HyracksDataException.create(e);
- }
+ cursor.nextRecord(record.get(), record.size());
parseRecord();
if (!areAllNullFields) {
recBuilder.write(out, true);
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java
index f406729..1fee49f 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java
@@ -21,10 +21,7 @@
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
-import java.util.Map;
-import org.apache.asterix.common.exceptions.ErrorCode;
-import org.apache.asterix.common.exceptions.RuntimeDataException;
import org.apache.asterix.external.api.IExternalDataSourceFactory.DataSourceType;
import org.apache.asterix.external.api.IRecordDataParser;
import org.apache.asterix.external.api.IStreamDataParser;
@@ -40,7 +37,8 @@
private static final long serialVersionUID = 1L;
private static final List<String> parserFormats =
- Collections.unmodifiableList(Arrays.asList("csv", "delimited-text"));
+ Collections.unmodifiableList(Arrays.asList(ExternalDataConstants.FORMAT_CSV,
+ ExternalDataConstants.FORMAT_DELIMITED_TEXT, ExternalDataConstants.FORMAT_TSV));
@Override
public IRecordDataParser<char[]> createRecordParser(IHyracksTaskContext ctx) throws HyracksDataException {
@@ -49,8 +47,8 @@
private DelimitedDataParser createParser() throws HyracksDataException {
IValueParserFactory[] valueParserFactories = ExternalDataUtils.getValueParserFactories(recordType);
- Character delimiter = DelimitedDataParserFactory.getDelimiter(configuration);
- char quote = DelimitedDataParserFactory.getQuote(configuration, delimiter);
+ char delimiter = ExternalDataUtils.getDelimiter(configuration);
+ char quote = ExternalDataUtils.getQuote(configuration, delimiter);
boolean hasHeader = ExternalDataUtils.hasHeader(configuration);
return new DelimitedDataParser(valueParserFactories, delimiter, quote, hasHeader, recordType,
ExternalDataUtils.getDataSourceType(configuration).equals(DataSourceType.STREAM));
@@ -67,40 +65,6 @@
return createParser();
}
- // Get a delimiter from the given configuration
- public static char getDelimiter(Map<String, String> configuration) throws HyracksDataException {
- String delimiterValue = configuration.get(ExternalDataConstants.KEY_DELIMITER);
- if (delimiterValue == null) {
- delimiterValue = ExternalDataConstants.DEFAULT_DELIMITER;
- } else if (delimiterValue.length() != 1) {
- throw new RuntimeDataException(ErrorCode.PARSER_FACTORY_DELIMITED_DATA_PARSER_FACTORY_NOT_VALID_DELIMITER,
- delimiterValue);
- }
- return delimiterValue.charAt(0);
- }
-
- // Get a quote from the given configuration when the delimiter is given
- // Need to pass delimiter to check whether they share the same character
- public static char getQuote(Map<String, String> configuration, char delimiter) throws HyracksDataException {
- String quoteValue = configuration.get(ExternalDataConstants.KEY_QUOTE);
- if (quoteValue == null) {
- quoteValue = ExternalDataConstants.DEFAULT_QUOTE;
- } else if (quoteValue.length() != 1) {
- throw new RuntimeDataException(ErrorCode.PARSER_FACTORY_DELIMITED_DATA_PARSER_FACTORY_NOT_VALID_QUOTE,
- quoteValue);
- }
-
- // Since delimiter (char type value) can't be null,
- // we only check whether delimiter and quote use the same character
- if (quoteValue.charAt(0) == delimiter) {
- throw new RuntimeDataException(
- ErrorCode.PARSER_FACTORY_DELIMITED_DATA_PARSER_FACTORY_QUOTE_DELIMITER_MISMATCH, quoteValue,
- delimiter);
- }
-
- return quoteValue.charAt(0);
- }
-
@Override
public void setMetaType(ARecordType metaType) {
}
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/provider/AdapterFactoryProvider.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/provider/AdapterFactoryProvider.java
index 414c460..27ac10e 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/provider/AdapterFactoryProvider.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/provider/AdapterFactoryProvider.java
@@ -27,7 +27,7 @@
import org.apache.asterix.external.api.IIndexingAdapterFactory;
import org.apache.asterix.external.api.ITypedAdapterFactory;
import org.apache.asterix.external.indexing.ExternalFile;
-import org.apache.asterix.external.util.ExternalDataCompatibilityUtils;
+import org.apache.asterix.external.util.ExternalDataUtils;
import org.apache.asterix.om.types.ARecordType;
import org.apache.hyracks.algebricks.common.exceptions.AlgebricksException;
import org.apache.hyracks.api.application.IServiceContext;
@@ -39,11 +39,15 @@
*/
public class AdapterFactoryProvider {
- // Adapters
+ private AdapterFactoryProvider() {
+ }
+
+ // get adapter factory. this method has the side effect of modifying the configuration as necessary
public static ITypedAdapterFactory getAdapterFactory(IServiceContext serviceCtx, String adapterName,
Map<String, String> configuration, ARecordType itemType, ARecordType metaType)
throws HyracksDataException, AlgebricksException {
- ExternalDataCompatibilityUtils.prepare(adapterName, configuration);
+ ExternalDataUtils.defaultConfiguration(configuration);
+ ExternalDataUtils.prepare(adapterName, configuration);
ICcApplicationContext context = (ICcApplicationContext) serviceCtx.getApplicationContext();
ITypedAdapterFactory adapterFactory =
(ITypedAdapterFactory) context.getAdapterFactoryService().createAdapterFactory();
@@ -53,11 +57,12 @@
return adapterFactory;
}
- // Indexing Adapters
+ // get indexing adapter factory. this method has the side effect of modifying the configuration as necessary
public static IIndexingAdapterFactory getIndexingAdapterFactory(IServiceContext serviceCtx, String adapterName,
Map<String, String> configuration, ARecordType itemType, List<ExternalFile> snapshot, boolean indexingOp,
ARecordType metaType) throws HyracksDataException, AlgebricksException {
- ExternalDataCompatibilityUtils.prepare(adapterName, configuration);
+ ExternalDataUtils.defaultConfiguration(configuration);
+ ExternalDataUtils.prepare(adapterName, configuration);
GenericAdapterFactory adapterFactory = new GenericAdapterFactory();
adapterFactory.setOutputType(itemType);
adapterFactory.setMetaType(metaType);
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataCompatibilityUtils.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataCompatibilityUtils.java
index e222e99..8181262 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataCompatibilityUtils.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataCompatibilityUtils.java
@@ -18,8 +18,6 @@
*/
package org.apache.asterix.external.util;
-import java.util.Map;
-
import org.apache.asterix.common.exceptions.AsterixException;
import org.apache.asterix.external.api.IDataParserFactory;
import org.apache.asterix.external.api.IExternalDataSourceFactory;
@@ -30,6 +28,9 @@
public class ExternalDataCompatibilityUtils {
+ private ExternalDataCompatibilityUtils() {
+ }
+
public static void validateCompatibility(IExternalDataSourceFactory dataSourceFactory,
IDataParserFactory dataParserFactory) throws AsterixException {
if (dataSourceFactory.getDataSourceType() != dataParserFactory.getDataSourceType()) {
@@ -58,16 +59,4 @@
+ recordParserFactory.getRecordClass());
}
}
-
- public static void prepare(String adapterName, Map<String, String> configuration) {
- if (!configuration.containsKey(ExternalDataConstants.KEY_READER)) {
- configuration.put(ExternalDataConstants.KEY_READER, adapterName);
- }
- if (!configuration.containsKey(ExternalDataConstants.KEY_PARSER)) {
- if (configuration.containsKey(ExternalDataConstants.KEY_FORMAT)) {
- configuration.put(ExternalDataConstants.KEY_PARSER,
- configuration.get(ExternalDataConstants.KEY_FORMAT));
- }
- }
- }
}
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
index 26f5402..5427bc5 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
@@ -19,6 +19,10 @@
package org.apache.asterix.external.util;
public class ExternalDataConstants {
+
+ private ExternalDataConstants() {
+ }
+
// TODO: Remove unused variables.
/**
* Keys
@@ -62,6 +66,7 @@
public static final String KEY_LOCAL_SOCKET_PATH = "local-socket-path";
public static final String KEY_FORMAT = "format";
public static final String KEY_QUOTE = "quote";
+ public static final String KEY_QUOTE_ESCAPE = "quote-escape";
public static final String KEY_PARSER = "parser";
public static final String KEY_DATASET_RECORD = "dataset-record";
public static final String KEY_HIVE_SERDE = "hive-serde";
@@ -188,6 +193,8 @@
*/
public static final String TRUE = "true";
public static final String FALSE = "false";
+ public static final String TAB_STR = "\t";
+ public static final String NULL_STR = "\0";
/**
* Constant characters
@@ -228,6 +235,7 @@
public static final String KEY_READER_FACTORY = "reader-factory";
public static final String READER_RSS = "rss_feed";
public static final String FORMAT_CSV = "csv";
+ public static final String FORMAT_TSV = "tsv";
public static final String ERROR_PARSE_RECORD = "Parser failed to parse record";
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
index a418cbf..443aa7e 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
@@ -34,6 +34,7 @@
import org.apache.asterix.om.types.AUnionType;
import org.apache.hyracks.algebricks.common.exceptions.NotImplementedException;
import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.apache.hyracks.dataflow.common.data.parsers.BooleanParserFactory;
import org.apache.hyracks.dataflow.common.data.parsers.DoubleParserFactory;
import org.apache.hyracks.dataflow.common.data.parsers.FloatParserFactory;
import org.apache.hyracks.dataflow.common.data.parsers.IValueParserFactory;
@@ -43,48 +44,43 @@
public class ExternalDataUtils {
+ private ExternalDataUtils() {
+ }
+
// Get a delimiter from the given configuration
- public static char getDelimiter(Map<String, String> configuration) throws AsterixException {
+ public static char getDelimiter(Map<String, String> configuration) throws HyracksDataException {
String delimiterValue = configuration.get(ExternalDataConstants.KEY_DELIMITER);
if (delimiterValue == null) {
delimiterValue = ExternalDataConstants.DEFAULT_DELIMITER;
} else if (delimiterValue.length() != 1) {
- throw new AsterixException(
- "'" + delimiterValue + "' is not a valid delimiter. The length of a delimiter should be 1.");
+ throw new RuntimeDataException(ErrorCode.PARSER_FACTORY_DELIMITED_DATA_PARSER_FACTORY_NOT_VALID_DELIMITER,
+ delimiterValue);
}
return delimiterValue.charAt(0);
}
// Get a quote from the given configuration when the delimiter is given
// Need to pass delimiter to check whether they share the same character
- public static char getQuote(Map<String, String> configuration, char delimiter) throws AsterixException {
+ public static char getQuote(Map<String, String> configuration, char delimiter) throws HyracksDataException {
String quoteValue = configuration.get(ExternalDataConstants.KEY_QUOTE);
if (quoteValue == null) {
quoteValue = ExternalDataConstants.DEFAULT_QUOTE;
} else if (quoteValue.length() != 1) {
- throw new AsterixException("'" + quoteValue + "' is not a valid quote. The length of a quote should be 1.");
+ throw new RuntimeDataException(ErrorCode.PARSER_FACTORY_DELIMITED_DATA_PARSER_FACTORY_NOT_VALID_QUOTE,
+ quoteValue);
}
// Since delimiter (char type value) can't be null,
// we only check whether delimiter and quote use the same character
if (quoteValue.charAt(0) == delimiter) {
- throw new AsterixException(
- "Quote '" + quoteValue + "' cannot be used with the delimiter '" + delimiter + "'. ");
+ throw new RuntimeDataException(
+ ErrorCode.PARSER_FACTORY_DELIMITED_DATA_PARSER_FACTORY_QUOTE_DELIMITER_MISMATCH, quoteValue,
+ delimiter);
}
return quoteValue.charAt(0);
}
- // Get the header flag
- public static boolean getHasHeader(Map<String, String> configuration) {
- return Boolean.parseBoolean(configuration.get(ExternalDataConstants.KEY_HEADER));
- }
-
- public static void validateParameters(Map<String, String> configuration) throws AsterixException {
- validateDataSourceParameters(configuration);
- validateDataParserParameters(configuration);
- }
-
public static void validateDataParserParameters(Map<String, String> configuration) throws AsterixException {
String parser = configuration.get(ExternalDataConstants.KEY_FORMAT);
if (parser == null) {
@@ -150,15 +146,6 @@
return parserFormat != null ? parserFormat : configuration.get(ExternalDataConstants.KEY_FORMAT);
}
- public static void setRecordFormat(Map<String, String> configuration, String format) {
- if (!configuration.containsKey(ExternalDataConstants.KEY_DATA_PARSER)) {
- configuration.put(ExternalDataConstants.KEY_DATA_PARSER, format);
- }
- if (!configuration.containsKey(ExternalDataConstants.KEY_FORMAT)) {
- configuration.put(ExternalDataConstants.KEY_FORMAT, format);
- }
- }
-
private static Map<ATypeTag, IValueParserFactory> valueParserFactoryMap = initializeValueParserFactoryMap();
private static Map<ATypeTag, IValueParserFactory> initializeValueParserFactoryMap() {
@@ -168,6 +155,7 @@
m.put(ATypeTag.DOUBLE, DoubleParserFactory.INSTANCE);
m.put(ATypeTag.BIGINT, LongParserFactory.INSTANCE);
m.put(ATypeTag.STRING, UTF8StringParserFactory.INSTANCE);
+ m.put(ATypeTag.BOOLEAN, BooleanParserFactory.INSTANCE);
return m;
}
@@ -201,10 +189,6 @@
return vpf;
}
- public static String getRecordReaderStreamName(Map<String, String> configuration) {
- return configuration.get(ExternalDataConstants.KEY_READER_STREAM);
- }
-
public static boolean hasHeader(Map<String, String> configuration) {
String value = configuration.get(ExternalDataConstants.KEY_HEADER);
if (value != null) {
@@ -281,12 +265,6 @@
return configuration.get(ExternalDataConstants.KEY_FEED_NAME);
}
- public static int getQueueSize(Map<String, String> configuration) {
- return configuration.containsKey(ExternalDataConstants.KEY_QUEUE_SIZE)
- ? Integer.parseInt(configuration.get(ExternalDataConstants.KEY_QUEUE_SIZE))
- : ExternalDataConstants.DEFAULT_QUEUE_SIZE;
- }
-
public static boolean isRecordWithMeta(Map<String, String> configuration) {
return configuration.containsKey(ExternalDataConstants.KEY_META_TYPE_NAME);
}
@@ -339,4 +317,42 @@
}
return intIndicators;
}
+
+ /**
+ * Fills the configuration of the external dataset and its adapter with default values if not provided by user.
+ *
+ * @param configuration external data configuration
+ */
+ public static void defaultConfiguration(Map<String, String> configuration) {
+ String format = configuration.get(ExternalDataConstants.KEY_FORMAT);
+ if (format != null) {
+ // default quote, escape character for quote and fields delimiter for csv and tsv format
+ if (format.equals(ExternalDataConstants.FORMAT_CSV)) {
+ configuration.putIfAbsent(ExternalDataConstants.KEY_DELIMITER, ExternalDataConstants.DEFAULT_DELIMITER);
+ configuration.putIfAbsent(ExternalDataConstants.KEY_QUOTE, ExternalDataConstants.DEFAULT_QUOTE);
+ configuration.putIfAbsent(ExternalDataConstants.KEY_QUOTE_ESCAPE, ExternalDataConstants.DEFAULT_QUOTE);
+ } else if (format.equals(ExternalDataConstants.FORMAT_TSV)) {
+ configuration.putIfAbsent(ExternalDataConstants.KEY_DELIMITER, ExternalDataConstants.TAB_STR);
+ configuration.putIfAbsent(ExternalDataConstants.KEY_QUOTE, ExternalDataConstants.NULL_STR);
+ configuration.putIfAbsent(ExternalDataConstants.KEY_QUOTE_ESCAPE, ExternalDataConstants.NULL_STR);
+ }
+ }
+ }
+
+ /**
+ * Prepares the configuration of the external dataset and its adapter by filling the information required by
+ * adapters and parsers.
+ *
+ * @param adapterName adapter name
+ * @param configuration external data configuration
+ */
+ public static void prepare(String adapterName, Map<String, String> configuration) {
+ if (!configuration.containsKey(ExternalDataConstants.KEY_READER)) {
+ configuration.put(ExternalDataConstants.KEY_READER, adapterName);
+ }
+ if (!configuration.containsKey(ExternalDataConstants.KEY_PARSER)
+ && configuration.containsKey(ExternalDataConstants.KEY_FORMAT)) {
+ configuration.put(ExternalDataConstants.KEY_PARSER, configuration.get(ExternalDataConstants.KEY_FORMAT));
+ }
+ }
}
diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-common/src/main/java/org/apache/hyracks/dataflow/common/data/parsers/BooleanParserFactory.java b/hyracks-fullstack/hyracks/hyracks-dataflow-common/src/main/java/org/apache/hyracks/dataflow/common/data/parsers/BooleanParserFactory.java
new file mode 100644
index 0000000..488be04
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-dataflow-common/src/main/java/org/apache/hyracks/dataflow/common/data/parsers/BooleanParserFactory.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hyracks.dataflow.common.data.parsers;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+
+public class BooleanParserFactory implements IValueParserFactory {
+
+ private static final long serialVersionUID = 1L;
+
+ public static final IValueParserFactory INSTANCE = new BooleanParserFactory();
+
+ private BooleanParserFactory() {
+ }
+
+ @Override
+ public IValueParser createValueParser() {
+ return BooleanParserFactory::parse;
+ }
+
+ public static void parse(char[] buffer, int start, int length, DataOutput out) throws HyracksDataException {
+ try {
+ if (length == 4 && (buffer[start] == 't' || buffer[start] == 'T')
+ && (buffer[start + 1] == 'r' || buffer[start + 1] == 'R')
+ && (buffer[start + 2] == 'u' || buffer[start + 2] == 'U')
+ && (buffer[start + 3] == 'e' || buffer[start + 3] == 'E')) {
+ out.writeBoolean(true);
+ return;
+ } else if (length == 5 && (buffer[start] == 'f' || buffer[start] == 'F')
+ && (buffer[start + 1] == 'a' || buffer[start + 1] == 'A')
+ && (buffer[start + 2] == 'l' || buffer[start + 2] == 'L')
+ && (buffer[start + 3] == 's' || buffer[start + 3] == 'S')
+ && (buffer[start + 4] == 'e' || buffer[start + 4] == 'E')) {
+ out.writeBoolean(false);
+ return;
+ }
+ } catch (IOException e) {
+ throw HyracksDataException.create(e);
+ }
+
+ throw new HyracksDataException("Invalid input data");
+ }
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
index 9ddb4c2..2eb882a 100644
--- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
+++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
@@ -76,12 +76,11 @@
break;
}
// Eliminate double quotes in the field that we are going to parse
- if (cursor.isDoubleQuoteIncludedInThisField) {
- cursor.eliminateDoubleQuote(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart);
- cursor.fEnd -= cursor.doubleQuoteCount;
- cursor.isDoubleQuoteIncludedInThisField = false;
+ if (cursor.fieldHasDoubleQuote()) {
+ cursor.eliminateDoubleQuote();
}
- valueParsers[i].parse(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart, dos);
+ valueParsers[i].parse(cursor.getBuffer(), cursor.getFieldStart(), cursor.getFieldLength(),
+ dos);
tb.addFieldEndOffset();
}
FrameUtils.appendToWriter(writer, appender, tb.getFieldEndOffsets(), tb.getByteArray(), 0,
diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
index 7e5ee2c..fd3e4c3 100644
--- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
+++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
@@ -32,18 +32,18 @@
EOF //end of stream reached
}
- public char[] buffer; //buffer to holds the input coming form the underlying input stream
- public int fStart; //start position for field
- public int fEnd; //end position for field
- public int recordCount; //count of records
- public int fieldCount; //count of fields in current record
- public int doubleQuoteCount; //count of double quotes
- public boolean isDoubleQuoteIncludedInThisField; //does current field include double quotes
+ private char[] buffer; //buffer to holds the input coming form the underlying input stream
+ private int fStart; //start position for field
+ private int fEnd; //end position for field
+ private int recordCount; //count of records
+ private int fieldCount; //count of fields in current record
+ private int doubleQuoteCount; //count of double quotes
+ private boolean isDoubleQuoteIncludedInThisField; //does current field include double quotes
private static final int INITIAL_BUFFER_SIZE = 4096;//initial buffer size
private static final int INCREMENT = 4096; //increment size
- private Reader in; //the underlying buffer
+ private final Reader in; //the underlying buffer
private int start; //start of valid buffer area
private int end; //end of valid buffer area
@@ -55,8 +55,8 @@
private int quoteCount; //count of single quotes
private boolean startedQuote; //whether a quote has been started
- private char quote; //the quote character
- private char fieldDelimiter; //the delimiter
+ private final char quote; //the quote character
+ private final char fieldDelimiter; //the delimiter
public FieldCursorForDelimitedDataParser(Reader in, char fieldDelimiter, char quote) {
this.in = in;
@@ -70,9 +70,9 @@
state = State.INIT;
this.quote = quote;
this.fieldDelimiter = fieldDelimiter;
- lastDelimiterPosition = -99;
- lastQuotePosition = -99;
- lastDoubleQuotePosition = -99;
+ lastDelimiterPosition = -1;
+ lastQuotePosition = -1;
+ lastDoubleQuotePosition = -1;
quoteCount = 0;
doubleQuoteCount = 0;
startedQuote = false;
@@ -81,9 +81,44 @@
fieldCount = 0;
}
- public void nextRecord(char[] buffer, int recordLength) throws IOException {
+ public char[] getBuffer() {
+ return buffer;
+ }
+
+ public int getFieldStart() {
+ return fStart;
+ }
+
+ public int getFieldLength() {
+ return fEnd - fStart;
+ }
+
+ public boolean isFieldEmpty() {
+ return fStart == fEnd;
+ }
+
+ public boolean fieldHasDoubleQuote() {
+ return isDoubleQuoteIncludedInThisField;
+ }
+
+ public int getFieldCount() {
+ return fieldCount;
+ }
+
+ public int getRecordCount() {
+ return recordCount;
+ }
+
+ public void nextRecord(char[] buffer, int recordLength) {
recordCount++;
fieldCount = 0;
+ lastDelimiterPosition = -1;
+ lastQuotePosition = -1;
+ lastDoubleQuotePosition = -1;
+ quoteCount = 0;
+ doubleQuoteCount = 0;
+ startedQuote = false;
+ isDoubleQuoteIncludedInThisField = false;
start = 0;
end = recordLength;
state = State.IN_RECORD;
@@ -187,7 +222,6 @@
}
public boolean nextField() throws IOException {
- fieldCount++;
switch (state) {
case INIT:
case EOR:
@@ -196,12 +230,12 @@
return false;
case IN_RECORD:
- boolean eof;
+ fieldCount++;
// reset quote related values
startedQuote = false;
isDoubleQuoteIncludedInThisField = false;
- lastQuotePosition = -99;
- lastDoubleQuotePosition = -99;
+ lastQuotePosition = -1;
+ lastDoubleQuotePosition = -1;
quoteCount = 0;
doubleQuoteCount = 0;
@@ -209,21 +243,26 @@
while (true) {
if (p >= end) {
int s = start;
- eof = !readMore();
+ boolean eof = !readMore();
p -= (s - start);
- lastQuotePosition -= (s - start);
- lastDoubleQuotePosition -= (s - start);
- lastDelimiterPosition -= (s - start);
+ lastQuotePosition -= (lastQuotePosition > -1) ? (s - start) : 0;
+ lastDoubleQuotePosition -= (lastDoubleQuotePosition > -1) ? (s - start) : 0;
+ lastDelimiterPosition -= (lastDelimiterPosition > -1) ? (s - start) : 0;
if (eof) {
state = State.EOF;
- if (startedQuote && lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
- && quoteCount == doubleQuoteCount * 2 + 2) {
- // set the position of fStart to +1, fEnd to -1 to remove quote character
- fStart = start + 1;
- fEnd = p - 1;
- } else {
+ if (!startedQuote) {
fStart = start;
fEnd = p;
+ } else {
+ if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
+ && quoteCount == doubleQuoteCount * 2 + 2) {
+ // set the position of fStart to +1, fEnd to -1 to remove quote character
+ fStart = start + 1;
+ fEnd = p - 1;
+ } else {
+ throw new IOException("At record: " + recordCount + ", field#: " + fieldCount
+ + " - missing a closing quote");
+ }
}
return true;
}
@@ -232,12 +271,12 @@
if (ch == quote) {
// If this is first quote in the field, then it needs to be placed in the beginning.
if (!startedQuote) {
- if (lastDelimiterPosition == p - 1 || lastDelimiterPosition == -99) {
+ if (p == start) {
startedQuote = true;
} else {
// In this case, we don't have a quote in the beginning of a field.
throw new IOException("At record: " + recordCount + ", field#: " + fieldCount
- + " - a quote enclosing a field needs to be placed in the beginning of that field.");
+ + " - a quote enclosing a field needs to be placed in the beginning of that field");
}
}
// Check double quotes - "". We check [start != p-2]
@@ -245,8 +284,8 @@
// since it looks like a double quote. However, it's not a double quote.
// (e.g. if field2 has no value:
// field1,"",field3 ... )
- if (lastQuotePosition == p - 1 && lastDelimiterPosition != p - 2
- && lastDoubleQuotePosition != p - 1) {
+ if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
+ && lastQuotePosition != start) {
isDoubleQuoteIncludedInThisField = true;
doubleQuoteCount++;
lastDoubleQuotePosition = p;
@@ -262,64 +301,46 @@
start = p + 1;
lastDelimiterPosition = p;
return true;
- } else if (startedQuote) {
- if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1) {
- // There is a quote right before the delimiter (e.g. ",) and it is not two quote,
- // then the field contains a valid string.
- // We set the position of fStart to +1, fEnd to -1 to remove quote character
- fStart = start + 1;
- fEnd = p - 1;
- start = p + 1;
- lastDelimiterPosition = p;
- startedQuote = false;
- return true;
- } else if (lastQuotePosition < p - 1 && lastQuotePosition != lastDoubleQuotePosition
- && quoteCount == doubleQuoteCount * 2 + 2) {
- // There is a quote before the delimiter, however it is not directly placed before the delimiter.
- // In this case, we throw an exception.
- // quoteCount == doubleQuoteCount * 2 + 2 : only true when we have two quotes except double-quotes.
- throw new IOException("At record: " + recordCount + ", field#: " + fieldCount
- + " - A quote enclosing a field needs to be followed by the delimiter.");
- }
+ }
+
+ if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
+ && lastQuotePosition != start) {
+ // There is a quote right before the delimiter (e.g. ",) and it is not two quote,
+ // then the field contains a valid string.
+ // We set the position of fStart to +1, fEnd to -1 to remove quote character
+ fStart = start + 1;
+ fEnd = p - 1;
+ start = p + 1;
+ lastDelimiterPosition = p;
+ startedQuote = false;
+ return true;
+ } else if (lastQuotePosition < p - 1 && lastQuotePosition != lastDoubleQuotePosition
+ && quoteCount == doubleQuoteCount * 2 + 2) {
+ // There is a quote before the delimiter, however it is not directly placed before the delimiter.
+ // In this case, we throw an exception.
+ // quoteCount == doubleQuoteCount * 2 + 2 : only true when we have two quotes except double-quotes.
+ throw new IOException("At record: " + recordCount + ", field#: " + fieldCount
+ + " - A quote enclosing a field needs to be followed by the delimiter.");
}
// If the control flow reaches here: we have a delimiter in this field and
// there should be a quote in the beginning and the end of
// this field. So, just continue reading next character
- } else if (ch == '\n') {
+ } else if (ch == '\n' || ch == '\r') {
if (!startedQuote) {
fStart = start;
fEnd = p;
start = p + 1;
- state = State.EOR;
+ state = ch == '\n' ? State.EOR : State.CR;
lastDelimiterPosition = p;
return true;
- } else if (startedQuote && lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
+ } else if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
&& quoteCount == doubleQuoteCount * 2 + 2) {
// set the position of fStart to +1, fEnd to -1 to remove quote character
fStart = start + 1;
fEnd = p - 1;
lastDelimiterPosition = p;
start = p + 1;
- state = State.EOR;
- startedQuote = false;
- return true;
- }
- } else if (ch == '\r') {
- if (!startedQuote) {
- fStart = start;
- fEnd = p;
- start = p + 1;
- state = State.CR;
- lastDelimiterPosition = p;
- return true;
- } else if (startedQuote && lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
- && quoteCount == doubleQuoteCount * 2 + 2) {
- // set the position of fStart to +1, fEnd to -1 to remove quote character
- fStart = start + 1;
- fEnd = p - 1;
- lastDelimiterPosition = p;
- start = p + 1;
- state = State.CR;
+ state = ch == '\n' ? State.EOR : State.CR;
startedQuote = false;
return true;
}
@@ -330,7 +351,10 @@
throw new IllegalStateException();
}
- protected boolean readMore() throws IOException {
+ private boolean readMore() throws IOException {
+ if (in == null) {
+ return false;
+ }
if (start > 0) {
System.arraycopy(buffer, start, buffer, 0, end - start);
}
@@ -350,10 +374,11 @@
}
// Eliminate escaped double quotes("") in a field
- public void eliminateDoubleQuote(char[] buffer, int start, int length) {
- int lastDoubleQuotePosition = -99;
- int writepos = start;
- int readpos = start;
+ public void eliminateDoubleQuote() {
+ int lastDoubleQuotePosition = -1;
+ int writepos = fStart;
+ int readpos = fStart;
+ int length = fEnd - fStart;
// Find positions where double quotes appear
for (int i = 0; i < length; i++) {
// Skip double quotes
@@ -369,5 +394,7 @@
readpos++;
}
}
+ fEnd -= doubleQuoteCount;
+ isDoubleQuoteIncludedInThisField = false;
}
}
diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java
index e663179..8edcafc 100644
--- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java
@@ -51,9 +51,8 @@
while (cursor.nextRecord()) {
int fieldNumber = 0;
while (cursor.nextField()) {
- if (cursor.isDoubleQuoteIncludedInThisField) {
- cursor.eliminateDoubleQuote(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart);
- cursor.fEnd -= cursor.doubleQuoteCount;
+ if (cursor.fieldHasDoubleQuote()) {
+ cursor.eliminateDoubleQuote();
}
fieldNumber++;
}