Fixed CSV parser to recognize quote and delimiter characters inside a quoted string
Change-Id: Iaaabc23e86df4f9bbee9f06b7976d7fbdcbb0f3f
Reviewed-on: http://fulliautomatix.ics.uci.edu:8443/135
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Till Westmann <westmann@gmail.com>
diff --git a/asterix-app/data/csv/sample_02.csv b/asterix-app/data/csv/sample_02.csv
new file mode 100644
index 0000000..a4c5b3d
--- /dev/null
+++ b/asterix-app/data/csv/sample_02.csv
@@ -0,0 +1,4 @@
+1|0.1|0.1|0.1|0.1|abc|abc
+2|0.2||0.2||""|""
+3|0.3||0.3|||
+4|0.4||0.4||||extra||
diff --git a/asterix-app/data/csv/sample_03.csv b/asterix-app/data/csv/sample_03.csv
new file mode 100644
index 0000000..eed0699
--- /dev/null
+++ b/asterix-app/data/csv/sample_03.csv
@@ -0,0 +1,5 @@
+1,0.1,"test"",1a","test""1b"
+2,0.2,test2a,test2b
+3,0.3,"test,3a,3a,3a","""""test"""""
+4,0.4,"test""4a"",4a"," test with
+line break "
diff --git a/asterix-app/data/csv/sample_04_quote_error.csv b/asterix-app/data/csv/sample_04_quote_error.csv
new file mode 100644
index 0000000..27b6b7c
--- /dev/null
+++ b/asterix-app/data/csv/sample_04_quote_error.csv
@@ -0,0 +1,5 @@
+1,0.1,"test",1a","test""1b"
+2,0.2,test2a,test2b
+3,0.3,"test,3a,3a,3a","""""test"""""
+4,0.4,"test""4a"",4a"," test with
+line break "
diff --git a/asterix-app/data/csv/sample_05_space_error_1.csv b/asterix-app/data/csv/sample_05_space_error_1.csv
new file mode 100644
index 0000000..92a9862
--- /dev/null
+++ b/asterix-app/data/csv/sample_05_space_error_1.csv
@@ -0,0 +1,5 @@
+1,0.1, "test"",1a","test""1b"
+2,0.2,test2a,test2b
+3,0.3,"test,3a,3a,3a","""""test"""""
+4,0.4,"test""4a"",4a"," test with
+line break "
diff --git a/asterix-app/data/csv/sample_06_space_error_2.csv b/asterix-app/data/csv/sample_06_space_error_2.csv
new file mode 100644
index 0000000..d1e7711
--- /dev/null
+++ b/asterix-app/data/csv/sample_06_space_error_2.csv
@@ -0,0 +1,5 @@
+1,0.1,"test"",1a" ,"test""1b"
+2,0.2,test2a,test2b
+3,0.3,"test,3a,3a,3a","""""test"""""
+4,0.4,"test""4a"",4a"," test with
+line break "
diff --git a/asterix-app/src/test/resources/fuzzyjoin/amerix/10-load-csx-small.aql b/asterix-app/src/test/resources/fuzzyjoin/amerix/10-load-csx-small.aql
index 5e819bd..01c29de 100644
--- a/asterix-app/src/test/resources/fuzzyjoin/amerix/10-load-csx-small.aql
+++ b/asterix-app/src/test/resources/fuzzyjoin/amerix/10-load-csx-small.aql
@@ -1,7 +1,7 @@
use dataverse fuzzy1;
declare type CSXType as open {
- id: int32,
+ id: int32,
csxid: string,
title: string,
authors: string,
@@ -10,11 +10,11 @@
declare nodegroup group1 on nc1, nc2;
-declare dataset CSXSmall(CSXType)
+declare dataset CSXSmall(CSXType)
primary key id on group1;
-load dataset CSXSmall
+load dataset CSXSmall
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/fuzzyjoin/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+(("path"="nc1://data/fuzzyjoin/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000")) pre-sorted;
diff --git a/asterix-app/src/test/resources/fuzzyjoin/pub/040-load-csx.aql b/asterix-app/src/test/resources/fuzzyjoin/pub/040-load-csx.aql
index a87398d..c1eb120 100644
--- a/asterix-app/src/test/resources/fuzzyjoin/pub/040-load-csx.aql
+++ b/asterix-app/src/test/resources/fuzzyjoin/pub/040-load-csx.aql
@@ -1,7 +1,7 @@
use dataverse fuzzy1;
declare type CSXType as open {
- id: int32,
+ id: int32,
csxid: string,
title: string,
authors: string,
@@ -10,11 +10,11 @@
declare nodegroup group1 on nc1, nc2;
-declare dataset CSX(CSXType)
+declare dataset CSX(CSXType)
primary key id on group1;
// load dataset CSX from nc1:'/asterix/asterix-app/data/pub-small/csx-small-id.txt'
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1:///asterix/asterix-app/data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
+(("path"="nc1:///asterix/asterix-app/data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000")) pre-sorted;
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_1/dblp-csx-2_1.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_1/dblp-csx-2_1.2.update.aql
index 47d8420..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_1/dblp-csx-2_1.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_1/dblp-csx-2_1.2.update.aql
@@ -1,10 +1,10 @@
use dataverse fuzzyjoin;
-load dataset DBLP
+load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
-load dataset CSX
+load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_2/dblp-csx-2_2.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_2/dblp-csx-2_2.2.update.aql
index 47d8420..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_2/dblp-csx-2_2.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_2/dblp-csx-2_2.2.update.aql
@@ -1,10 +1,10 @@
use dataverse fuzzyjoin;
-load dataset DBLP
+load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
-load dataset CSX
+load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_3/dblp-csx-2_3.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_3/dblp-csx-2_3.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_3/dblp-csx-2_3.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_3/dblp-csx-2_3.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_4/dblp-csx-2_4.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_4/dblp-csx-2_4.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_4/dblp-csx-2_4.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_4/dblp-csx-2_4.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.1/dblp-csx-2_5.1.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.1/dblp-csx-2_5.1.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.1/dblp-csx-2_5.1.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.1/dblp-csx-2_5.1.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2/dblp-csx-2_5.2.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2/dblp-csx-2_5.2.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2/dblp-csx-2_5.2.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.2/dblp-csx-2_5.2.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.3.1/dblp-csx-2_5.3.1.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.3.1/dblp-csx-2_5.3.1.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.3.1/dblp-csx-2_5.3.1.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.3.1/dblp-csx-2_5.3.1.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.3/dblp-csx-2_5.3.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.3/dblp-csx-2_5.3.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.3/dblp-csx-2_5.3.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5.3/dblp-csx-2_5.3.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5/dblp-csx-2_5.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5/dblp-csx-2_5.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5/dblp-csx-2_5.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-2_5/dblp-csx-2_5.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_1/dblp-csx-3_1.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_1/dblp-csx-3_1.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_1/dblp-csx-3_1.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_1/dblp-csx-3_1.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_2/dblp-csx-3_2.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_2/dblp-csx-3_2.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_2/dblp-csx-3_2.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_2/dblp-csx-3_2.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_3/dblp-csx-3_3.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_3/dblp-csx-3_3.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_3/dblp-csx-3_3.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_3/dblp-csx-3_3.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_4/dblp-csx-3_4.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_4/dblp-csx-3_4.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_4/dblp-csx-3_4.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_4/dblp-csx-3_4.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.1/dblp-csx-3_5.1.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.1/dblp-csx-3_5.1.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.1/dblp-csx-3_5.1.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.1/dblp-csx-3_5.1.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.2/dblp-csx-3_5.2.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.2/dblp-csx-3_5.2.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.2/dblp-csx-3_5.2.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.2/dblp-csx-3_5.2.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.3.1/dblp-csx-3_5.3.1.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.3.1/dblp-csx-3_5.3.1.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.3.1/dblp-csx-3_5.3.1.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.3.1/dblp-csx-3_5.3.1.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.3/dblp-csx-3_5.3.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.3/dblp-csx-3_5.3.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.3/dblp-csx-3_5.3.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.3/dblp-csx-3_5.3.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.4.1/dblp-csx-3_5.4.1.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.4.1/dblp-csx-3_5.4.1.2.update.aql
index 50972b4..a452348 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.4.1/dblp-csx-3_5.4.1.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.4.1/dblp-csx-3_5.4.1.2.update.aql
@@ -6,6 +6,6 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.4/dblp-csx-3_5.4.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.4/dblp-csx-3_5.4.2.update.aql
index 50972b4..a452348 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.4/dblp-csx-3_5.4.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5.4/dblp-csx-3_5.4.2.update.aql
@@ -6,6 +6,6 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5/dblp-csx-3_5.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5/dblp-csx-3_5.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5/dblp-csx-3_5.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-3_5/dblp-csx-3_5.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_1/dblp-csx-aqlplus_1.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_1/dblp-csx-aqlplus_1.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_1/dblp-csx-aqlplus_1.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_1/dblp-csx-aqlplus_1.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_2/dblp-csx-aqlplus_2.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_2/dblp-csx-aqlplus_2.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_2/dblp-csx-aqlplus_2.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_2/dblp-csx-aqlplus_2.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_3/dblp-csx-aqlplus_3.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_3/dblp-csx-aqlplus_3.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_3/dblp-csx-aqlplus_3.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-aqlplus_3/dblp-csx-aqlplus_3.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-dblp-aqlplus_1/dblp-csx-dblp-aqlplus_1.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-dblp-aqlplus_1/dblp-csx-dblp-aqlplus_1.2.update.aql
index 916bd56..e0e46d8 100644
--- a/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-dblp-aqlplus_1/dblp-csx-dblp-aqlplus_1.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/fuzzyjoin/dblp-csx-dblp-aqlplus_1/dblp-csx-dblp-aqlplus_1.2.update.aql
@@ -6,5 +6,5 @@
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/index-join/btree-secondary-equi-join/btree-secondary-equi-join.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/index-join/btree-secondary-equi-join/btree-secondary-equi-join.2.update.aql
index 78a8e47..2eeb41e 100644
--- a/asterix-app/src/test/resources/runtimets/queries/index-join/btree-secondary-equi-join/btree-secondary-equi-join.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/index-join/btree-secondary-equi-join/btree-secondary-equi-join.2.update.aql
@@ -1,17 +1,17 @@
/*
* Description : Equi joins two datasets, DBLP and CSX, based on their title.
- * DBLP has a secondary btree index on title, and given the 'indexnl' hint
+ * DBLP has a secondary btree index on title, and given the 'indexnl' hint
* we expect the join to be transformed into an indexed nested-loop join.
* Success : Yes
*/
use dataverse test;
-load dataset DBLP
+load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/inverted-index-join-noeqjoin/ngram-jaccard-inline/ngram-jaccard-inline.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/inverted-index-join-noeqjoin/ngram-jaccard-inline/ngram-jaccard-inline.2.update.aql
index 9388959..8c1fd38 100644
--- a/asterix-app/src/test/resources/runtimets/queries/inverted-index-join-noeqjoin/ngram-jaccard-inline/ngram-jaccard-inline.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/inverted-index-join-noeqjoin/ngram-jaccard-inline/ngram-jaccard-inline.2.update.aql
@@ -8,11 +8,11 @@
use dataverse test;
-load dataset DBLP
+load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/inverted-index-join-noeqjoin/ngram-jaccard/ngram-jaccard.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/inverted-index-join-noeqjoin/ngram-jaccard/ngram-jaccard.2.update.aql
index 319c927..d683bff 100644
--- a/asterix-app/src/test/resources/runtimets/queries/inverted-index-join-noeqjoin/ngram-jaccard/ngram-jaccard.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/inverted-index-join-noeqjoin/ngram-jaccard/ngram-jaccard.2.update.aql
@@ -7,11 +7,11 @@
use dataverse test;
-load dataset DBLP
+load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/inverted-index-join-noeqjoin/word-jaccard-inline/word-jaccard-inline.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/inverted-index-join-noeqjoin/word-jaccard-inline/word-jaccard-inline.2.update.aql
index 0a2a629..7faf85f 100644
--- a/asterix-app/src/test/resources/runtimets/queries/inverted-index-join-noeqjoin/word-jaccard-inline/word-jaccard-inline.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/inverted-index-join-noeqjoin/word-jaccard-inline/word-jaccard-inline.2.update.aql
@@ -8,11 +8,11 @@
use dataverse test;
-load dataset DBLP
+load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/inverted-index-join-noeqjoin/word-jaccard/word-jaccard.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/inverted-index-join-noeqjoin/word-jaccard/word-jaccard.2.update.aql
index a166535..22aa5b7 100644
--- a/asterix-app/src/test/resources/runtimets/queries/inverted-index-join-noeqjoin/word-jaccard/word-jaccard.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/inverted-index-join-noeqjoin/word-jaccard/word-jaccard.2.update.aql
@@ -7,11 +7,11 @@
use dataverse test;
-load dataset DBLP
+load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/inverted-index-join/ngram-jaccard-inline/ngram-jaccard-inline.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/inverted-index-join/ngram-jaccard-inline/ngram-jaccard-inline.2.update.aql
index 8fbfdb9..68f309f 100644
--- a/asterix-app/src/test/resources/runtimets/queries/inverted-index-join/ngram-jaccard-inline/ngram-jaccard-inline.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/inverted-index-join/ngram-jaccard-inline/ngram-jaccard-inline.2.update.aql
@@ -7,11 +7,11 @@
use dataverse test;
-load dataset DBLP
+load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/inverted-index-join/ngram-jaccard/ngram-jaccard.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/inverted-index-join/ngram-jaccard/ngram-jaccard.2.update.aql
index 93233c1..a267f03 100644
--- a/asterix-app/src/test/resources/runtimets/queries/inverted-index-join/ngram-jaccard/ngram-jaccard.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/inverted-index-join/ngram-jaccard/ngram-jaccard.2.update.aql
@@ -6,11 +6,11 @@
use dataverse test;
-load dataset DBLP
+load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/inverted-index-join/word-jaccard-inline/word-jaccard-inline.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/inverted-index-join/word-jaccard-inline/word-jaccard-inline.2.update.aql
index 3064796..a90d3f4 100644
--- a/asterix-app/src/test/resources/runtimets/queries/inverted-index-join/word-jaccard-inline/word-jaccard-inline.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/inverted-index-join/word-jaccard-inline/word-jaccard-inline.2.update.aql
@@ -7,11 +7,11 @@
use dataverse test;
-load dataset DBLP
+load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/inverted-index-join/word-jaccard/word-jaccard.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/inverted-index-join/word-jaccard/word-jaccard.2.update.aql
index 5a307f6..e80da53 100644
--- a/asterix-app/src/test/resources/runtimets/queries/inverted-index-join/word-jaccard/word-jaccard.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/inverted-index-join/word-jaccard/word-jaccard.2.update.aql
@@ -6,11 +6,11 @@
use dataverse test;
-load dataset DBLP
+load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/dblp-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":")) pre-sorted;
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/leftouterjoin/query_issue285-2/query_issue285-2.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/leftouterjoin/query_issue285-2/query_issue285-2.2.update.aql
index 923f6e4..54290df 100644
--- a/asterix-app/src/test/resources/runtimets/queries/leftouterjoin/query_issue285-2/query_issue285-2.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/leftouterjoin/query_issue285-2/query_issue285-2.2.update.aql
@@ -1,17 +1,17 @@
/*
* Description : Left-outer joins two datasets, DBLP and CSX, based on their title.
- * DBLP has a secondary btree index on title, and given the 'indexnl' hint
+ * DBLP has a secondary btree index on title, and given the 'indexnl' hint
* we expect the join to be transformed into an indexed nested-loop join.
* Success : Yes
*/
use dataverse test;
-load dataset DBLP
+load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/leftouterjoin/query_issue285/query_issue285.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/leftouterjoin/query_issue285/query_issue285.2.update.aql
index 923f6e4..54290df 100644
--- a/asterix-app/src/test/resources/runtimets/queries/leftouterjoin/query_issue285/query_issue285.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/leftouterjoin/query_issue285/query_issue285.2.update.aql
@@ -1,17 +1,17 @@
/*
* Description : Left-outer joins two datasets, DBLP and CSX, based on their title.
- * DBLP has a secondary btree index on title, and given the 'indexnl' hint
+ * DBLP has a secondary btree index on title, and given the 'indexnl' hint
* we expect the join to be transformed into an indexed nested-loop join.
* Success : Yes
*/
use dataverse test;
-load dataset DBLP
+load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/leftouterjoin/query_issue658/query_issue658.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/leftouterjoin/query_issue658/query_issue658.2.update.aql
index 4e2123a..1cc3923 100644
--- a/asterix-app/src/test/resources/runtimets/queries/leftouterjoin/query_issue658/query_issue658.2.update.aql
+++ b/asterix-app/src/test/resources/runtimets/queries/leftouterjoin/query_issue658/query_issue658.2.update.aql
@@ -5,11 +5,11 @@
use dataverse test;
-load dataset DBLP
+load dataset DBLP
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
(("path"="nc1://data/pub-small/dblp-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
load dataset CSX
using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
-(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"));
+(("path"="nc1://data/pub-small/csx-small-id.txt"),("format"="delimited-text"),("delimiter"=":"),("quote"="\u0000"));
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_03/csv_03.1.ddl.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_03/csv_03.1.ddl.aql
new file mode 100644
index 0000000..0b7c16f
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_03/csv_03.1.ddl.aql
@@ -0,0 +1,23 @@
+/**
+ *
+ * CSV file loading test
+ * Expected result: success
+ *
+ */
+
+drop dataverse temp if exists;
+create dataverse temp
+use dataverse temp;
+
+create type test as closed {
+ id: int32,
+ float: float,
+ floatq: float?,
+ double: double,
+ doubleq: double?,
+ string: string,
+ stringq: string?
+};
+
+create dataset testds (test)
+primary key id;
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_03/csv_03.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_03/csv_03.2.update.aql
new file mode 100644
index 0000000..e0e04ed
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_03/csv_03.2.update.aql
@@ -0,0 +1,12 @@
+/**
+ *
+ * CSV file loading test
+ * Expected result: success
+ *
+ */
+
+use dataverse temp;
+
+load dataset testds
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/csv/sample_02.csv"),("format"="delimited-text"),("delimiter"="|"));
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_03/csv_03.3.query.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_03/csv_03.3.query.aql
new file mode 100644
index 0000000..e7d5f60
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_03/csv_03.3.query.aql
@@ -0,0 +1,20 @@
+/**
+ *
+ * CSV file loading test
+ * Expected result: success
+ *
+ */
+
+use dataverse temp;
+
+for $i in dataset testds
+order by $i.id
+return {
+ "id": $i.id,
+ "float": $i.float,
+ "floatq": $i.floatq,
+ "double": $i.double,
+ "doubleq": $i.doubleq,
+ "string": $i.string,
+ "stringq": $i.stringq
+}
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_04/csv_04.1.ddl.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_04/csv_04.1.ddl.aql
new file mode 100644
index 0000000..b51d617
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_04/csv_04.1.ddl.aql
@@ -0,0 +1,22 @@
+/**
+ *
+ * CSV file loading test
+ * Expected result: success
+ * In this test, we check quote
+ * and delimiter in a field
+ *
+ */
+
+drop dataverse temp if exists;
+create dataverse temp
+use dataverse temp;
+
+create type test as closed {
+ id: int32,
+ float: float,
+ stringa: string,
+ stringb: string?
+};
+
+create dataset testds (test)
+primary key id;
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_04/csv_04.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_04/csv_04.2.update.aql
new file mode 100644
index 0000000..df24a65
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_04/csv_04.2.update.aql
@@ -0,0 +1,12 @@
+/**
+ *
+ * CSV file loading test
+ * Expected result: success
+ *
+ */
+
+use dataverse temp;
+
+load dataset testds
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/csv/sample_03.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\""));
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_04/csv_04.3.query.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_04/csv_04.3.query.aql
new file mode 100644
index 0000000..ee72474
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_04/csv_04.3.query.aql
@@ -0,0 +1,17 @@
+/**
+ *
+ * CSV file loading test
+ * Expected result: success
+ *
+ */
+
+use dataverse temp;
+
+for $i in dataset testds
+order by $i.id
+return {
+ "id": $i.id,
+ "float": $i.float,
+ "stringa": $i.stringa,
+ "stringb": $i.stringb
+}
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_05/csv_05.1.ddl.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_05/csv_05.1.ddl.aql
new file mode 100644
index 0000000..cba880c
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_05/csv_05.1.ddl.aql
@@ -0,0 +1,20 @@
+/**
+ *
+ * CSV file loading test
+ * Expected result: fail - a field is not enclosed in two quotes properly. It misses one quote.
+ *
+ */
+
+drop dataverse temp if exists;
+create dataverse temp
+use dataverse temp;
+
+create type test as closed {
+ id: int32,
+ float: float,
+ stringa: string,
+ stringb: string?
+};
+
+create dataset testds (test)
+primary key id;
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_05/csv_05.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_05/csv_05.2.update.aql
new file mode 100644
index 0000000..b851701
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_05/csv_05.2.update.aql
@@ -0,0 +1,12 @@
+/**
+ *
+ * CSV file loading test
+ * Expected result: fail - a field is not enclosed in two quotes properly. It misses one quote.
+ *
+ */
+
+use dataverse temp;
+
+load dataset testds
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/csv/sample_04_quote_error.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\""));
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_05/csv_05.3.query.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_05/csv_05.3.query.aql
new file mode 100644
index 0000000..ba85528
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_05/csv_05.3.query.aql
@@ -0,0 +1,17 @@
+/**
+ *
+ * CSV file loading test
+ * Expected result: fail - a field is not enclosed in two quotes properly. It misses one quote.
+ *
+ */
+
+use dataverse temp;
+
+for $i in dataset testds
+order by $i.id
+return {
+ "id": $i.id,
+ "float": $i.float,
+ "stringa": $i.stringa,
+ "stringb": $i.stringb
+}
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_06/csv_06.1.ddl.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_06/csv_06.1.ddl.aql
new file mode 100644
index 0000000..e2b5d8f
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_06/csv_06.1.ddl.aql
@@ -0,0 +1,21 @@
+/**
+ *
+ * CSV file loading test
+ * Expected result: fail - a whitespace is placed after the delimiter, and there is a quote after that.
+ * According to RFC (http://tools.ietf.org/html/rfc4180), this is not allowed.
+ *
+ */
+
+drop dataverse temp if exists;
+create dataverse temp
+use dataverse temp;
+
+create type test as closed {
+ id: int32,
+ float: float,
+ stringa: string,
+ stringb: string?
+};
+
+create dataset testds (test)
+primary key id;
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_06/csv_06.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_06/csv_06.2.update.aql
new file mode 100644
index 0000000..468d7cb
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_06/csv_06.2.update.aql
@@ -0,0 +1,13 @@
+/**
+ *
+ * CSV file loading test
+ * Expected result: fail - a whitespace is placed after the delimiter, and there is a quote after that.
+ * According to RFC (http://tools.ietf.org/html/rfc4180), this is not allowed.
+ *
+ */
+
+use dataverse temp;
+
+load dataset testds
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/csv/sample_05_space_error_1.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\""));
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_06/csv_06.3.query.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_06/csv_06.3.query.aql
new file mode 100644
index 0000000..da3a4fa
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_06/csv_06.3.query.aql
@@ -0,0 +1,18 @@
+/**
+ *
+ * CSV file loading test
+ * Expected result: fail - a whitespace is placed after the delimiter, and there is a quote after that.
+ * According to RFC (http://tools.ietf.org/html/rfc4180), this is not allowed.
+ *
+ */
+
+use dataverse temp;
+
+for $i in dataset testds
+order by $i.id
+return {
+ "id": $i.id,
+ "float": $i.float,
+ "stringa": $i.stringa,
+ "stringb": $i.stringb
+}
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_07/csv_07.1.ddl.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_07/csv_07.1.ddl.aql
new file mode 100644
index 0000000..41a66ef
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_07/csv_07.1.ddl.aql
@@ -0,0 +1,21 @@
+/**
+ *
+ * CSV file loading test
+ * Expected result: fail - a whitespace is placed after a quote, and there is a delimiter after that space.
+ * According to RFC (http://tools.ietf.org/html/rfc4180), this is not allowed.
+ *
+ */
+
+drop dataverse temp if exists;
+create dataverse temp
+use dataverse temp;
+
+create type test as closed {
+ id: int32,
+ float: float,
+ stringa: string,
+ stringb: string?
+};
+
+create dataset testds (test)
+primary key id;
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_07/csv_07.2.update.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_07/csv_07.2.update.aql
new file mode 100644
index 0000000..c5fbc69
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_07/csv_07.2.update.aql
@@ -0,0 +1,12 @@
+/**
+ *
+ * CSV file loading test
+ * Expected result: fail - a whitespace is placed after a quote, and there is a delimiter after that space.
+ *
+ */
+
+use dataverse temp;
+
+load dataset testds
+using "edu.uci.ics.asterix.external.dataset.adapter.NCFileSystemAdapter"
+(("path"="nc1://data/csv/sample_06_space_error_2.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\""));
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/queries/load/csv_07/csv_07.3.query.aql b/asterix-app/src/test/resources/runtimets/queries/load/csv_07/csv_07.3.query.aql
new file mode 100644
index 0000000..ee72474
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/queries/load/csv_07/csv_07.3.query.aql
@@ -0,0 +1,17 @@
+/**
+ *
+ * CSV file loading test
+ * Expected result: fail - a whitespace is placed after a quote, and there is a delimiter after that space.
+ *
+ */
+
+use dataverse temp;
+
+for $i in dataset testds
+order by $i.id
+return {
+ "id": $i.id,
+ "float": $i.float,
+ "stringa": $i.stringa,
+ "stringb": $i.stringb
+}
\ No newline at end of file
diff --git a/asterix-app/src/test/resources/runtimets/results/load/csv_03/csv_03.1.adm b/asterix-app/src/test/resources/runtimets/results/load/csv_03/csv_03.1.adm
new file mode 100644
index 0000000..712d6fd
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/results/load/csv_03/csv_03.1.adm
@@ -0,0 +1,4 @@
+{ "id": 1, "float": 0.1f, "floatq": 0.1f, "double": 0.1d, "doubleq": 0.1d, "string": "abc", "stringq": "abc" }
+{ "id": 2, "float": 0.2f, "floatq": null, "double": 0.2d, "doubleq": null, "string": "", "stringq": null }
+{ "id": 3, "float": 0.3f, "floatq": null, "double": 0.3d, "doubleq": null, "string": "", "stringq": null }
+{ "id": 4, "float": 0.4f, "floatq": null, "double": 0.4d, "doubleq": null, "string": "", "stringq": null }
diff --git a/asterix-app/src/test/resources/runtimets/results/load/csv_04/csv_04.1.adm b/asterix-app/src/test/resources/runtimets/results/load/csv_04/csv_04.1.adm
new file mode 100644
index 0000000..292a507
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/results/load/csv_04/csv_04.1.adm
@@ -0,0 +1,4 @@
+{ "id": 1, "float": 0.1f, "stringa": "test\",1a", "stringb": "test\"1b" }
+{ "id": 2, "float": 0.2f, "stringa": "test2a", "stringb": "test2b" }
+{ "id": 3, "float": 0.3f, "stringa": "test,3a,3a,3a", "stringb": "\"\"test\"\"" }
+{ "id": 4, "float": 0.4f, "stringa": "test\"4a\",4a", "stringb": " test with\nline break " }
diff --git a/asterix-app/src/test/resources/runtimets/results/load/csv_05/csv_05.1.adm b/asterix-app/src/test/resources/runtimets/results/load/csv_05/csv_05.1.adm
new file mode 100644
index 0000000..292a507
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/results/load/csv_05/csv_05.1.adm
@@ -0,0 +1,4 @@
+{ "id": 1, "float": 0.1f, "stringa": "test\",1a", "stringb": "test\"1b" }
+{ "id": 2, "float": 0.2f, "stringa": "test2a", "stringb": "test2b" }
+{ "id": 3, "float": 0.3f, "stringa": "test,3a,3a,3a", "stringb": "\"\"test\"\"" }
+{ "id": 4, "float": 0.4f, "stringa": "test\"4a\",4a", "stringb": " test with\nline break " }
diff --git a/asterix-app/src/test/resources/runtimets/results/load/csv_06/csv_06.1.adm b/asterix-app/src/test/resources/runtimets/results/load/csv_06/csv_06.1.adm
new file mode 100644
index 0000000..292a507
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/results/load/csv_06/csv_06.1.adm
@@ -0,0 +1,4 @@
+{ "id": 1, "float": 0.1f, "stringa": "test\",1a", "stringb": "test\"1b" }
+{ "id": 2, "float": 0.2f, "stringa": "test2a", "stringb": "test2b" }
+{ "id": 3, "float": 0.3f, "stringa": "test,3a,3a,3a", "stringb": "\"\"test\"\"" }
+{ "id": 4, "float": 0.4f, "stringa": "test\"4a\",4a", "stringb": " test with\nline break " }
diff --git a/asterix-app/src/test/resources/runtimets/results/load/csv_07/csv_07.1.adm b/asterix-app/src/test/resources/runtimets/results/load/csv_07/csv_07.1.adm
new file mode 100644
index 0000000..292a507
--- /dev/null
+++ b/asterix-app/src/test/resources/runtimets/results/load/csv_07/csv_07.1.adm
@@ -0,0 +1,4 @@
+{ "id": 1, "float": 0.1f, "stringa": "test\",1a", "stringb": "test\"1b" }
+{ "id": 2, "float": 0.2f, "stringa": "test2a", "stringb": "test2b" }
+{ "id": 3, "float": 0.3f, "stringa": "test,3a,3a,3a", "stringb": "\"\"test\"\"" }
+{ "id": 4, "float": 0.4f, "stringa": "test\"4a\",4a", "stringb": " test with\nline break " }
diff --git a/asterix-app/src/test/resources/runtimets/testsuite.xml b/asterix-app/src/test/resources/runtimets/testsuite.xml
index 3464a29..2dd0f18 100644
--- a/asterix-app/src/test/resources/runtimets/testsuite.xml
+++ b/asterix-app/src/test/resources/runtimets/testsuite.xml
@@ -4814,6 +4814,34 @@
</compilation-unit>
</test-case>
<test-case FilePath="load">
+ <compilation-unit name="csv_03">
+ <output-dir compare="Text">csv_03</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="load">
+ <compilation-unit name="csv_04">
+ <output-dir compare="Text">csv_04</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="load">
+ <compilation-unit name="csv_05">
+ <output-dir compare="Text">csv_05</output-dir>
+ <expected-error>edu.uci.ics.hyracks.api.exceptions.HyracksDataException</expected-error>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="load">
+ <compilation-unit name="csv_06">
+ <output-dir compare="Text">csv_06</output-dir>
+ <expected-error>edu.uci.ics.hyracks.api.exceptions.HyracksDataException</expected-error>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="load">
+ <compilation-unit name="csv_07">
+ <output-dir compare="Text">csv_07</output-dir>
+ <expected-error>edu.uci.ics.hyracks.api.exceptions.HyracksDataException</expected-error>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="load">
<compilation-unit name="issue14_query">
<output-dir compare="Text">none</output-dir>
<expected-error>edu.uci.ics.asterix.common.exceptions.AsterixException</expected-error>
diff --git a/asterix-common/src/main/java/edu/uci/ics/asterix/common/parse/IParseFileSplitsDecl.java b/asterix-common/src/main/java/edu/uci/ics/asterix/common/parse/IParseFileSplitsDecl.java
index 98f2848..4a9dad5 100644
--- a/asterix-common/src/main/java/edu/uci/ics/asterix/common/parse/IParseFileSplitsDecl.java
+++ b/asterix-common/src/main/java/edu/uci/ics/asterix/common/parse/IParseFileSplitsDecl.java
@@ -3,9 +3,9 @@
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -19,7 +19,9 @@
public interface IParseFileSplitsDecl {
public boolean isDelimitedFileFormat();
- public Character getDelimChar();
+ public char getDelimChar();
+
+ public char getQuote();
public FileSplit[] getSplits();
}
diff --git a/asterix-external-data/src/main/java/edu/uci/ics/asterix/external/adapter/factory/HDFSIndexingAdapterFactory.java b/asterix-external-data/src/main/java/edu/uci/ics/asterix/external/adapter/factory/HDFSIndexingAdapterFactory.java
index 5e573cb..040f506 100644
--- a/asterix-external-data/src/main/java/edu/uci/ics/asterix/external/adapter/factory/HDFSIndexingAdapterFactory.java
+++ b/asterix-external-data/src/main/java/edu/uci/ics/asterix/external/adapter/factory/HDFSIndexingAdapterFactory.java
@@ -22,6 +22,7 @@
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
+import edu.uci.ics.asterix.external.adapter.factory.StreamBasedAdapterFactory;
import edu.uci.ics.asterix.external.dataset.adapter.HDFSIndexingAdapter;
import edu.uci.ics.asterix.external.indexing.dataflow.HDFSIndexingParserFactory;
import edu.uci.ics.asterix.external.indexing.dataflow.IndexingScheduler;
@@ -104,13 +105,14 @@
((HDFSIndexingParserFactory) parserFactory).setJobConf(conf);
((HDFSIndexingParserFactory) parserFactory).setArguments(configuration);
HDFSIndexingAdapter hdfsIndexingAdapter = new HDFSIndexingAdapter(atype, readSchedule, executed, inputSplits,
- conf, clusterLocations, files, parserFactory, ctx, nodeName, (String) configuration.get(HDFSAdapterFactory.KEY_INPUT_FORMAT),
- (String) configuration.get(KEY_FORMAT));
+ conf, clusterLocations, files, parserFactory, ctx, nodeName,
+ (String) configuration.get(HDFSAdapterFactory.KEY_INPUT_FORMAT), (String) configuration.get(KEY_FORMAT));
return hdfsIndexingAdapter;
}
@Override
- public void configure(Map<String, String> configuration, ARecordType outputType, boolean isPKAutoGenerated, List<String> primaryKeys) throws Exception {
+ public void configure(Map<String, String> configuration, ARecordType outputType, boolean isPKAutoGenerated,
+ List<String> primaryKeys) throws Exception {
if (!initialized) {
hdfsScheduler = initializeHDFSScheduler();
initialized = true;
@@ -130,22 +132,28 @@
configureFormat(atype);
}
-
protected void configureFormat(IAType sourceDatatype) throws Exception {
- parserFactory = new HDFSIndexingParserFactory((ARecordType)atype,
+
+ char delimiter = StreamBasedAdapterFactory.getDelimiter(configuration);
+ char quote = StreamBasedAdapterFactory.getQuote(configuration, delimiter);
+
+ parserFactory = new HDFSIndexingParserFactory((ARecordType) atype,
(String) configuration.get(HDFSAdapterFactory.KEY_INPUT_FORMAT),
- (String) configuration.get(KEY_FORMAT),
- (String) configuration.get(KEY_DELIMITER),
+ (String) configuration.get(KEY_FORMAT), delimiter, quote,
(String) configuration.get(HDFSAdapterFactory.KEY_PARSER));
}
/**
* A static function that creates and return delimited text data parser
- * @param recordType (the record type to be parsed)
- * @param delimiter (the dilimiter value)
+ *
+ * @param recordType
+ * (the record type to be parsed)
+ * @param delimiter
+ * (the delimiter value)
* @return
*/
- public static DelimitedDataParser getDilimitedDataParser(ARecordType recordType, Character delimiter){
+ @SuppressWarnings("null")
+ public static DelimitedDataParser getDilimitedDataParser(ARecordType recordType, char delimiter, char quote) {
int n = recordType.getFieldTypes().length;
IValueParserFactory[] fieldParserFactories = new IValueParserFactory[n];
for (int i = 0; i < n; i++) {
@@ -168,7 +176,7 @@
}
fieldParserFactories[i] = vpf;
}
- return new DelimitedDataParser(recordType, fieldParserFactories, delimiter, false, -1, null);
+ return new DelimitedDataParser(recordType, fieldParserFactories, delimiter, quote, false, -1, null);
}
public static AlgebricksPartitionConstraint getClusterLocations() {
diff --git a/asterix-external-data/src/main/java/edu/uci/ics/asterix/external/adapter/factory/StreamBasedAdapterFactory.java b/asterix-external-data/src/main/java/edu/uci/ics/asterix/external/adapter/factory/StreamBasedAdapterFactory.java
index b424b96..6fed446 100644
--- a/asterix-external-data/src/main/java/edu/uci/ics/asterix/external/adapter/factory/StreamBasedAdapterFactory.java
+++ b/asterix-external-data/src/main/java/edu/uci/ics/asterix/external/adapter/factory/StreamBasedAdapterFactory.java
@@ -51,8 +51,14 @@
public static final String KEY_FORMAT = "format";
public static final String KEY_PARSER_FACTORY = "parser";
public static final String KEY_DELIMITER = "delimiter";
+ public static final String KEY_QUOTE = "quote";
public static final String KEY_PATH = "path";
public static final String KEY_SOURCE_DATATYPE = "output-type-name";
+ // The length of a delimiter should be 1.
+ public static final String DEFAULT_DELIMITER = ",";
+ // A quote is used to enclose a string if it includes delimiter(s) in it.
+ // The length of a quote should be 1.
+ public static final String DEFAULT_QUOTE = "\"";
public static final String FORMAT_DELIMITED_TEXT = "delimited-text";
public static final String FORMAT_ADM = "adm";
public static final String NODE_RESOLVER_FACTORY_PROPERTY = "node.Resolver";
@@ -99,16 +105,13 @@
}
fieldParserFactories[i] = vpf;
}
- String delimiterValue = (String) configuration.get(KEY_DELIMITER);
- if (delimiterValue != null && delimiterValue.length() > 1) {
- throw new AsterixException("improper delimiter");
- }
- Character delimiter = delimiterValue.charAt(0);
+ char delimiter = getDelimiter(configuration);
+ char quote = getQuote(configuration, delimiter);
return conditionalPush ? new ConditionalPushTupleParserFactory(recordType, fieldParserFactories, delimiter,
- configuration) : new NtDelimitedDataTupleParserFactory(recordType, fieldParserFactories, delimiter,
- isPKAutoGenerated, primaryKeyPosition, origSourceDataTypeForAutoGeneratedPK);
+ quote, configuration) : new NtDelimitedDataTupleParserFactory(recordType, fieldParserFactories,
+ delimiter, quote, isPKAutoGenerated, primaryKeyPosition, origSourceDataTypeForAutoGeneratedPK);
}
protected ITupleParserFactory getADMDataTupleParserFactory(ARecordType recordType, boolean conditionalPush,
@@ -116,16 +119,16 @@
throws AsterixException {
try {
return conditionalPush ? new ConditionalPushTupleParserFactory(recordType, configuration)
- : new AdmSchemafullRecordParserFactory(recordType, isPKAutoGenerated,
- primaryKeyPosition, origSourceDataTypeForAutoGeneratedPK);
+ : new AdmSchemafullRecordParserFactory(recordType, isPKAutoGenerated, primaryKeyPosition,
+ origSourceDataTypeForAutoGeneratedPK);
} catch (Exception e) {
throw new AsterixException(e);
}
}
- protected void configureFormat(IAType sourceDatatype, boolean isPKAutoGenerated,
- int primaryKeyPosition, IAType origSourceDataTypeForAutoGeneratedPK) throws Exception {
+ protected void configureFormat(IAType sourceDatatype, boolean isPKAutoGenerated, int primaryKeyPosition,
+ IAType origSourceDataTypeForAutoGeneratedPK) throws Exception {
String propValue = (String) configuration.get(BATCH_SIZE);
int batchSize = propValue != null ? Integer.parseInt(propValue) : -1;
propValue = (String) configuration.get(BATCH_INTERVAL);
@@ -152,6 +155,36 @@
}
+ // Returns the delimiter configured under KEY_DELIMITER, or DEFAULT_DELIMITER when the key is absent; throws AsterixException unless a configured value is exactly one character long.
+ public static char getDelimiter(Map<String, String> configuration) throws AsterixException {
+ String delimiterValue = (String) configuration.get(KEY_DELIMITER);
+ if (delimiterValue == null) {
+ delimiterValue = DEFAULT_DELIMITER;
+ } else if (delimiterValue.length() != 1) {
+ throw new AsterixException("'" + delimiterValue
+ + "' is not a valid delimiter. The length of a delimiter should be 1.");
+ }
+ return delimiterValue.charAt(0);
+ }
+ // Returns the quote configured under KEY_QUOTE, or DEFAULT_QUOTE when the key is absent; throws AsterixException unless a configured value is exactly one character long.
+ // The already-resolved delimiter is passed in so that a quote equal to the delimiter can be rejected.
+ public static char getQuote(Map<String, String> configuration, char delimiter) throws AsterixException {
+ String quoteValue = (String) configuration.get(KEY_QUOTE);
+ if (quoteValue == null) {
+ quoteValue = DEFAULT_QUOTE;
+ } else if (quoteValue.length() != 1) {
+ throw new AsterixException("'" + quoteValue + "' is not a valid quote. The length of a quote should be 1.");
+ }
+
+ // The delimiter is a primitive char and therefore can never be null, so the only
+ // remaining check is that quote and delimiter differ; this also applies to the default quote.
+ if (quoteValue.charAt(0) == delimiter) {
+ throw new AsterixException("Quote '" + quoteValue + "' cannot be used with the delimiter '" + delimiter
+ + "'. ");
+ }
+
+ return quoteValue.charAt(0);
+ }
}
diff --git a/asterix-external-data/src/main/java/edu/uci/ics/asterix/external/indexing/dataflow/HDFSIndexingParserFactory.java b/asterix-external-data/src/main/java/edu/uci/ics/asterix/external/indexing/dataflow/HDFSIndexingParserFactory.java
index da37399..122f15a 100644
--- a/asterix-external-data/src/main/java/edu/uci/ics/asterix/external/indexing/dataflow/HDFSIndexingParserFactory.java
+++ b/asterix-external-data/src/main/java/edu/uci/ics/asterix/external/indexing/dataflow/HDFSIndexingParserFactory.java
@@ -18,6 +18,7 @@
import org.apache.hadoop.mapred.JobConf;
+import edu.uci.ics.asterix.common.exceptions.AsterixException;
import edu.uci.ics.asterix.external.adapter.factory.HDFSAdapterFactory;
import edu.uci.ics.asterix.external.adapter.factory.HDFSIndexingAdapterFactory;
import edu.uci.ics.asterix.external.adapter.factory.StreamBasedAdapterFactory;
@@ -41,7 +42,9 @@
// content format <adm, delimited-text, binary>
private String format;
// delimiter in case of delimited text
- private String delimiter;
+ private char delimiter;
+ // quote in case of delimited text
+ private char quote;
// parser class name in case of binary format
private String parserClassName;
// the expected data type
@@ -49,14 +52,15 @@
// the hadoop job conf
private transient JobConf jobConf;
// adapter arguments
- private Map<String,String> arguments;
+ private Map<String, String> arguments;
- public HDFSIndexingParserFactory(ARecordType atype, String inputFormat, String format, String delimiter,
- String parserClassName) {
+ public HDFSIndexingParserFactory(ARecordType atype, String inputFormat, String format, char delimiter,
+ char quote, String parserClassName) {
this.inputFormat = inputFormat;
this.format = format;
this.parserClassName = parserClassName;
this.delimiter = delimiter;
+ this.quote = quote;
this.atype = atype;
}
@@ -68,7 +72,8 @@
if (inputFormat == null) {
throw new IllegalArgumentException("Unspecified data format");
}
- if (!inputFormat.equalsIgnoreCase(HDFSAdapterFactory.INPUT_FORMAT_RC) && !inputFormat.equalsIgnoreCase(HDFSAdapterFactory.INPUT_FORMAT_TEXT)
+ if (!inputFormat.equalsIgnoreCase(HDFSAdapterFactory.INPUT_FORMAT_RC)
+ && !inputFormat.equalsIgnoreCase(HDFSAdapterFactory.INPUT_FORMAT_TEXT)
&& !inputFormat.equalsIgnoreCase(HDFSAdapterFactory.INPUT_FORMAT_SEQUENCE)) {
throw new IllegalArgumentException("External Indexing not supportd for format " + inputFormat);
}
@@ -85,7 +90,8 @@
return new AdmOrDelimitedIndexingTupleParser(ctx, atype, dataParser);
} else if (format.equalsIgnoreCase(StreamBasedAdapterFactory.FORMAT_DELIMITED_TEXT)) {
// choice 3 with delimited data parser
- DelimitedDataParser dataParser = HDFSIndexingAdapterFactory.getDilimitedDataParser(atype, delimiter.charAt(0));
+ DelimitedDataParser dataParser = HDFSIndexingAdapterFactory.getDilimitedDataParser(atype,
+ delimiter, quote);
return new AdmOrDelimitedIndexingTupleParser(ctx, atype, dataParser);
}
@@ -105,11 +111,11 @@
} catch (Exception e) {
throw new HyracksDataException("Unable to initialize object parser", e);
}
-
- if(inputFormat.equalsIgnoreCase(HDFSAdapterFactory.INPUT_FORMAT_RC)){
+
+ if (inputFormat.equalsIgnoreCase(HDFSAdapterFactory.INPUT_FORMAT_RC)) {
// Case 2
return new RCFileIndexingTupleParser(ctx, atype, objectParser);
- } else{
+ } else {
// Case 1
return new TextOrSeqIndexingTupleParser(ctx, atype, objectParser);
}
@@ -123,11 +129,11 @@
this.jobConf = jobConf;
}
- public Map<String,String> getArguments() {
+ public Map<String, String> getArguments() {
return arguments;
}
- public void setArguments(Map<String,String> arguments) {
+ public void setArguments(Map<String, String> arguments) {
this.arguments = arguments;
}
diff --git a/asterix-external-data/src/main/java/edu/uci/ics/asterix/external/indexing/dataflow/HDFSLookupAdapter.java b/asterix-external-data/src/main/java/edu/uci/ics/asterix/external/indexing/dataflow/HDFSLookupAdapter.java
index 90bde5a..4554228 100644
--- a/asterix-external-data/src/main/java/edu/uci/ics/asterix/external/indexing/dataflow/HDFSLookupAdapter.java
+++ b/asterix-external-data/src/main/java/edu/uci/ics/asterix/external/indexing/dataflow/HDFSLookupAdapter.java
@@ -24,6 +24,7 @@
import edu.uci.ics.asterix.external.adapter.factory.HDFSAdapterFactory;
import edu.uci.ics.asterix.external.adapter.factory.HDFSIndexingAdapterFactory;
+import edu.uci.ics.asterix.external.adapter.factory.StreamBasedAdapterFactory;
import edu.uci.ics.asterix.external.indexing.input.RCFileLookupReader;
import edu.uci.ics.asterix.external.indexing.input.SequenceFileLookupInputStream;
import edu.uci.ics.asterix.external.indexing.input.SequenceFileLookupReader;
@@ -97,8 +98,11 @@
}
} else if (configuration.get(HDFSAdapterFactory.KEY_FORMAT).equals(HDFSAdapterFactory.FORMAT_DELIMITED_TEXT)) {
// create a delimited text parser
+ char delimiter = StreamBasedAdapterFactory.getDelimiter(configuration);
+ char quote = StreamBasedAdapterFactory.getQuote(configuration, delimiter);
+
DelimitedDataParser dataParser = HDFSIndexingAdapterFactory.getDilimitedDataParser((ARecordType) atype,
- (configuration.get(HDFSAdapterFactory.KEY_DELIMITER)).charAt(0));
+ delimiter, quote);
if (configuration.get(HDFSAdapterFactory.KEY_INPUT_FORMAT).equals(HDFSAdapterFactory.INPUT_FORMAT_TEXT)) {
// Text input format
TextFileLookupInputStream in = new TextFileLookupInputStream(fileIndexAccessor, jobConf);
@@ -144,7 +148,8 @@
// Do nothing
}
- private void configureRCFile(Configuration jobConf, INullWriterFactory iNullWriterFactory) throws IOException, Exception {
+ private void configureRCFile(Configuration jobConf, INullWriterFactory iNullWriterFactory) throws IOException,
+ Exception {
// RCFileLookupReader
RCFileLookupReader reader = new RCFileLookupReader(fileIndexAccessor,
HDFSAdapterFactory.configureJobConf(configuration));
diff --git a/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/feeds/ConditionalPushTupleParserFactory.java b/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/feeds/ConditionalPushTupleParserFactory.java
index f834a24..3cf8fe6 100644
--- a/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/feeds/ConditionalPushTupleParserFactory.java
+++ b/asterix-metadata/src/main/java/edu/uci/ics/asterix/metadata/feeds/ConditionalPushTupleParserFactory.java
@@ -47,7 +47,7 @@
dataParser = new ADMDataParser();
break;
case DELIMITED_DATA:
- dataParser = new DelimitedDataParser(recordType, valueParserFactories, delimiter, false, -1, null);
+ dataParser = new DelimitedDataParser(recordType, valueParserFactories, delimiter, quote, false, -1, null);
break;
}
return new ConditionalPushTupleParser(ctx, recordType, dataParser, configuration);
@@ -57,6 +57,7 @@
private final Map<String, String> configuration;
private IValueParserFactory[] valueParserFactories;
private char delimiter;
+ private char quote;
private final ParserType parserType;
public enum ParserType {
@@ -65,10 +66,11 @@
}
public ConditionalPushTupleParserFactory(ARecordType recordType, IValueParserFactory[] valueParserFactories,
- char fieldDelimiter, Map<String, String> configuration) {
+ char fieldDelimiter, char quote, Map<String, String> configuration) {
this.recordType = recordType;
this.valueParserFactories = valueParserFactories;
this.delimiter = fieldDelimiter;
+ this.quote = quote;
this.configuration = configuration;
this.parserType = ParserType.DELIMITED_DATA;
diff --git a/asterix-om/src/main/java/edu/uci/ics/asterix/formats/base/IDataFormat.java b/asterix-om/src/main/java/edu/uci/ics/asterix/formats/base/IDataFormat.java
index bb602a4..f2af980 100644
--- a/asterix-om/src/main/java/edu/uci/ics/asterix/formats/base/IDataFormat.java
+++ b/asterix-om/src/main/java/edu/uci/ics/asterix/formats/base/IDataFormat.java
@@ -3,9 +3,9 @@
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may obtain a copy of the License from
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -68,7 +68,7 @@
public ITupleParserFactory createTupleParser(ARecordType recType, IParseFileSplitsDecl decl);
- public ITupleParserFactory createTupleParser(ARecordType recType, boolean isDelimited, Character delimiter);
+ public ITupleParserFactory createTupleParser(ARecordType recType, boolean isDelimited, char delimiter, char quote);
public IFunctionDescriptor resolveFunction(ILogicalExpression expr, IVariableTypeEnvironment typeEnvironment)
throws AlgebricksException;
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/formats/NonTaggedDataFormat.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/formats/NonTaggedDataFormat.java
index b477545..4704ae1 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/formats/NonTaggedDataFormat.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/formats/NonTaggedDataFormat.java
@@ -930,11 +930,11 @@
@Override
public ITupleParserFactory createTupleParser(ARecordType recType, IParseFileSplitsDecl decl) {
- return createTupleParser(recType, decl.isDelimitedFileFormat(), decl.getDelimChar());
+ return createTupleParser(recType, decl.isDelimitedFileFormat(), decl.getDelimChar(), decl.getQuote());
}
@Override
- public ITupleParserFactory createTupleParser(ARecordType recType, boolean delimitedFormat, Character delimiter) {
+ public ITupleParserFactory createTupleParser(ARecordType recType, boolean delimitedFormat, char delimiter, char quote) {
if (delimitedFormat) {
int n = recType.getFieldTypes().length;
IValueParserFactory[] fieldParserFactories = new IValueParserFactory[n];
@@ -946,7 +946,7 @@
}
fieldParserFactories[i] = vpf;
}
- return new NtDelimitedDataTupleParserFactory(recType, fieldParserFactories, delimiter, false, -1, null);
+ return new NtDelimitedDataTupleParserFactory(recType, fieldParserFactories, delimiter, quote, false, -1, null);
} else {
return new AdmSchemafullRecordParserFactory(recType, false, -1, null);
}
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/operators/file/DelimitedDataParser.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/operators/file/DelimitedDataParser.java
index f516075..18f9553 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/operators/file/DelimitedDataParser.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/operators/file/DelimitedDataParser.java
@@ -41,6 +41,7 @@
protected final IValueParserFactory[] valueParserFactories;
protected final char fieldDelimiter;
+ protected final char quote;
protected final ARecordType recordType;
private IARecordBuilder recBuilder;
@@ -59,12 +60,23 @@
private final ARecordType origRecordTypeForAutoGeneratedPK;
private boolean areAllNullFields;
+ private boolean isDoubleQuoteIncludedInThisField;
+ private int doubleQuoteCount;
+
+ private int lineCount;
+ private int fieldCount;
public DelimitedDataParser(ARecordType recordType, IValueParserFactory[] valueParserFactories, char fieldDelimter,
- boolean isPKAutoGenerated, int primaryKeyPosition, ARecordType origRecordTypeForAutoGeneratedPK) {
+ char quote) {
+ this(recordType, valueParserFactories, fieldDelimter, quote, false, -1, null);
+ }
+
+ public DelimitedDataParser(ARecordType recordType, IValueParserFactory[] valueParserFactories, char fieldDelimter,
+ char quote, boolean isPKAutoGenerated, int primaryKeyPosition, ARecordType origRecordTypeForAutoGeneratedPK) {
this.recordType = recordType;
this.valueParserFactories = valueParserFactories;
this.fieldDelimiter = fieldDelimter;
+ this.quote = quote;
this.isPKAutoGenerated = isPKAutoGenerated;
this.primaryKeyPosition = primaryKeyPosition;
this.origRecordTypeForAutoGeneratedPK = origRecordTypeForAutoGeneratedPK;
@@ -80,17 +92,20 @@
else
recordTypeToApply = recordType;
+ lineCount = 1;
+
valueParsers = new IValueParser[valueParserFactories.length];
for (int i = 0; i < valueParserFactories.length; ++i) {
valueParsers[i] = valueParserFactories[i].createValueParser();
}
+ isDoubleQuoteIncludedInThisField = false;
+
fieldValueBuffer = new ArrayBackedValueStorage();
fieldValueBufferOutput = fieldValueBuffer.getDataOutput();
// If PK is auto-generated, then we need to use the recordType that
- // includes PK,
- // since recordType variable does not include PK field.
+ // includes PK, since recordType variable does not include PK field.
recBuilder = new RecordBuilder();
recBuilder.reset(recordTypeToApply);
recBuilder.init();
@@ -125,6 +140,8 @@
@Override
public boolean parse(DataOutput out) throws AsterixException, IOException {
while (cursor.nextRecord()) {
+ // If PK is auto-generated, then we need to use the recordType that
+ // includes PK, since recordType variable does not include PK field.
if (isPKAutoGenerated)
recBuilder.reset(origRecordTypeForAutoGeneratedPK);
else
@@ -133,7 +150,7 @@
recBuilder.init();
areAllNullFields = true;
- int fieldCount = 0;
+ fieldCount = 0;
for (int i = 0; i < valueParsers.length; ++i) {
if (!cursor.nextField()) {
@@ -144,18 +161,23 @@
if (cursor.fStart == cursor.fEnd && recordType.getFieldTypes()[i].getTypeTag() != ATypeTag.STRING
&& recordType.getFieldTypes()[i].getTypeTag() != ATypeTag.NULL) {
// if the field is empty and the type is optional, insert
- // NULL
- // note that string type can also process empty field as an
+ // NULL. Note that string type can also process empty field as an
// empty string
if (recordType.getFieldTypes()[i].getTypeTag() != ATypeTag.UNION
|| !NonTaggedFormatUtil.isOptionalField((AUnionType) recordType.getFieldTypes()[i])) {
- throw new AsterixException("Field " + i
+ throw new AsterixException("At line: " + lineCount + " - Field " + i
+ " is not an optional type so it cannot accept null value. ");
}
fieldValueBufferOutput.writeByte(ATypeTag.NULL.serialize());
ANullSerializerDeserializer.INSTANCE.serialize(ANull.NULL, out);
} else {
fieldValueBufferOutput.writeByte(fieldTypeTags[i]);
+ // Eliminate escaped (doubled) quotes in the field that we are going to parse
+ if (isDoubleQuoteIncludedInThisField) {
+ eliminateDoubleQuote(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart);
+ cursor.fEnd -= doubleQuoteCount;
+ isDoubleQuoteIncludedInThisField = false;
+ }
valueParsers[i].parse(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart,
fieldValueBufferOutput);
areAllNullFields = false;
@@ -170,13 +192,9 @@
}
- // Should not have any more fields now
- if (cursor.nextField()) {
- fieldCount++;
- }
-
- // Parsed all fields except an auto-generated PK at this point
- // Create a new UUID and assign it as a PK.
+ // Should not have any more fields now.
+ // We parsed all fields except an auto-generated PK at this point.
+ // We now create a new UUID and assign it as a PK.
if (isPKAutoGenerated && fieldCount == origRecordTypeForAutoGeneratedPK.getFieldTypes().length - 1) {
fieldValueBuffer.reset();
aUUID.nextUUID();
@@ -187,12 +205,11 @@
fieldValueBufferOutput);
recBuilder.addField(primaryKeyPosition, fieldValueBuffer);
areAllNullFields = false;
- }
- // If we have all fields in the file including auto-generated PK,
- // throw an exception
- else if (isPKAutoGenerated && fieldCount >= origRecordTypeForAutoGeneratedPK.getFieldTypes().length) {
- throw new AsterixException(
- "Check number of fields. Auto-generated PK field should not exist in the input data.");
+ } else if (isPKAutoGenerated && fieldCount >= origRecordTypeForAutoGeneratedPK.getFieldTypes().length) {
+ // If we have all fields in the file including auto-generated PK,
+ // throw an exception
+ throw new AsterixException("At line: " + lineCount
+ + " - check number of fields. Auto-generated PK field should not exist in the input data.");
}
if (!areAllNullFields) {
@@ -237,12 +254,23 @@
private int fStart;
private int fEnd;
+ private int lastQuotePosition;
+ private int lastDoubleQuotePosition;
+ private int lastDelimiterPosition;
+ private int quoteCount;
+ private boolean startedQuote;
+
public FieldCursor(Reader in) {
this.in = in;
buffer = new char[INITIAL_BUFFER_SIZE];
start = 0;
end = 0;
state = State.INIT;
+ lastDelimiterPosition = -99;
+ lastQuotePosition = -99;
+ lastDoubleQuotePosition = -99;
+ quoteCount = 0;
+ startedQuote = false;
}
public boolean nextRecord() throws IOException {
@@ -269,13 +297,32 @@
return start < end;
}
p -= (s - start);
+ lastQuotePosition -= (s - start);
+ lastDoubleQuotePosition -= (s - start);
+ lastDelimiterPosition -= (s - start);
}
char ch = buffer[p];
- if (ch == '\n') {
+ // We perform rough format correctness (delimiter, quote) check here
+ // to set the starting position of a record.
+ // In the field level, more checking will be conducted.
+ if (ch == quote) {
+ startedQuote = true;
+ // check two quotes in a row - "". This is an escaped quote
+ if (lastQuotePosition == p - 1 && start != p - 1 && lastDoubleQuotePosition != p - 1) {
+ lastDoubleQuotePosition = p;
+ }
+ lastQuotePosition = p;
+ } else if (ch == fieldDelimiter) {
+ if (startedQuote && lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1) {
+ startedQuote = false;
+ lastDelimiterPosition = p;
+ }
+ } else if (ch == '\n' && !startedQuote) {
start = p + 1;
state = State.EOR;
+ lastDelimiterPosition = p;
break;
- } else if (ch == '\r') {
+ } else if (ch == '\r' && !startedQuote) {
start = p + 1;
state = State.CR;
break;
@@ -293,7 +340,7 @@
}
}
char ch = buffer[start];
- if (ch == '\n') {
+ if (ch == '\n' && !startedQuote) {
++start;
state = State.EOR;
} else {
@@ -310,6 +357,7 @@
}
}
state = State.IN_RECORD;
+ lastDelimiterPosition = start;
return start < end;
case EOF:
@@ -328,37 +376,135 @@
case IN_RECORD:
boolean eof;
+ // reset quote related values
+ startedQuote = false;
+ isDoubleQuoteIncludedInThisField = false;
+ lastQuotePosition = -99;
+ lastDoubleQuotePosition = -99;
+ quoteCount = 0;
+ doubleQuoteCount = 0;
+
int p = start;
while (true) {
if (p >= end) {
int s = start;
eof = !readMore();
p -= (s - start);
+ lastQuotePosition -= (s - start);
+ lastDoubleQuotePosition -= (s - start);
+ lastDelimiterPosition -= (s - start);
if (eof) {
state = State.EOF;
- fStart = start;
- fEnd = p;
+ if (startedQuote && lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
+ && quoteCount == doubleQuoteCount * 2 + 2) {
+ // set the position of fStart to +1, fEnd to -1 to remove quote character
+ fStart = start + 1;
+ fEnd = p - 1;
+ } else {
+ fStart = start;
+ fEnd = p;
+ }
return true;
}
}
char ch = buffer[p];
- if (ch == fieldDelimiter) {
- fStart = start;
- fEnd = p;
- start = p + 1;
- return true;
+ if (ch == quote) {
+ // If this is first quote in the field, then it needs to be placed in the beginning.
+ if (!startedQuote) {
+ if (lastDelimiterPosition == p - 1 || lastDelimiterPosition == -99) {
+ startedQuote = true;
+ } else {
+ // In this case, we don't have a quote in the beginning of a field.
+ throw new IOException(
+ "At line: "
+ + lineCount
+ + ", field#: "
+ + (fieldCount+1)
+ + " - a quote enclosing a field needs to be placed in the beginning of that field.");
+ }
+ }
+ // Check for an escaped quote - "". We check [lastDelimiterPosition != p - 2]
+ // to avoid a false positive when a quoted field is empty: its opening and
+ // closing quotes are adjacent and look like an escaped quote, but they are not.
+ // (e.g. if field2 has no value:
+ // field1,"",field3 ... )
+ if (lastQuotePosition == p - 1 && lastDelimiterPosition != p - 2
+ && lastDoubleQuotePosition != p - 1) {
+ isDoubleQuoteIncludedInThisField = true;
+ doubleQuoteCount++;
+ lastDoubleQuotePosition = p;
+ }
+ lastQuotePosition = p;
+ quoteCount++;
+ } else if (ch == fieldDelimiter) {
+ // If there was no quote in the field,
+ // then we assume that the field contains a valid string.
+ if (!startedQuote) {
+ fStart = start;
+ fEnd = p;
+ start = p + 1;
+ lastDelimiterPosition = p;
+ return true;
+ } else if (startedQuote) {
+ if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1) {
+ // If there is a quote right before the delimiter (e.g. ",) and it is not the second
+ // half of an escaped quote (""), then the field contains a valid string.
+ // We set fStart to start + 1 and fEnd to p - 1 to drop the enclosing quote characters.
+ fStart = start + 1;
+ fEnd = p - 1;
+ start = p + 1;
+ lastDelimiterPosition = p;
+ return true;
+ } else if (lastQuotePosition < p - 1 && lastQuotePosition != lastDoubleQuotePosition
+ && quoteCount == doubleQuoteCount * 2 + 2) {
+ // There is a quote before the delimiter, but it is not placed directly before the delimiter.
+ // In this case, we throw an exception.
+ // quoteCount == doubleQuoteCount * 2 + 2 : true only when exactly two quotes remain once escaped quotes ("" pairs) are discounted.
+ throw new IOException("At line: " + lineCount + ", field#: " + (fieldCount+1)
+ + " - A quote enclosing a field needs to be followed by the delimiter.");
+ }
+ }
+ // If the control flow reaches here: we have a delimiter in this field and
+ // there should be a quote in the beginning and the end of
+ // this field. So, just continue reading next character
} else if (ch == '\n') {
- fStart = start;
- fEnd = p;
- start = p + 1;
- state = State.EOR;
- return true;
+ if (!startedQuote) {
+ fStart = start;
+ fEnd = p;
+ start = p + 1;
+ state = State.EOR;
+ lineCount++;
+ lastDelimiterPosition = p;
+ return true;
+ } else if (startedQuote && lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
+ && quoteCount == doubleQuoteCount * 2 + 2) {
+ // set the position of fStart to +1, fEnd to -1 to remove quote character
+ fStart = start + 1;
+ fEnd = p - 1;
+ lastDelimiterPosition = p;
+ start = p + 1;
+ state = State.EOR;
+ lineCount++;
+ return true;
+ }
} else if (ch == '\r') {
- fStart = start;
- fEnd = p;
- start = p + 1;
- state = State.CR;
- return true;
+ if (!startedQuote) {
+ fStart = start;
+ fEnd = p;
+ start = p + 1;
+ state = State.CR;
+ lastDelimiterPosition = p;
+ return true;
+ } else if (startedQuote && lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
+ && quoteCount == doubleQuoteCount * 2 + 2) {
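+ // Set fStart to start + 1 and fEnd to p - 1 to remove the enclosing quote characters. NOTE(review): unlike the '\n' branches, lineCount is not incremented on this '\r' path - confirm reported line numbers stay correct for CR/CRLF input.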
+ fStart = start + 1;
+ fEnd = p - 1;
+ lastDelimiterPosition = p;
+ start = p + 1;
+ state = State.CR;
+ return true;
+ }
}
++p;
}
@@ -385,25 +531,27 @@
return true;
}
- public int getfStart() {
- return fStart;
- }
-
- public void setfStart(int fStart) {
- this.fStart = fStart;
- }
-
- public int getfEnd() {
- return fEnd;
- }
-
- public void setfEnd(int fEnd) {
- this.fEnd = fEnd;
- }
-
- public char[] getBuffer() {
- return buffer;
- }
}
+ // Collapses each escaped quote pair ("") inside buffer[start, start + length) to a single quote, compacting the kept characters leftward in place. Returns nothing: the caller must shrink the field end by the number of dropped quotes itself.
+ protected void eliminateDoubleQuote(char[] buffer, int start, int length) {
+ int lastDoubleQuotePosition = -99;
+ int writepos = start;
+ int readpos = start;
+ // Single pass over the field: readpos advances on every character, writepos only when a character is kept (-99 above means "no quote dropped yet")
+ for (int i = 0; i < length; i++) {
+ // Drop this quote unless the character just before it was itself a dropped quote, so exactly one quote of each "" pair survives
+ if (buffer[readpos] == quote && lastDoubleQuotePosition != readpos - 1) {
+ lastDoubleQuotePosition = readpos;
+ readpos++;
+ } else {
+ // Keep this character, shifting it left over the gap left by dropped quotes (no copy is needed before the first drop)
+ if (writepos != readpos) {
+ buffer[writepos] = buffer[readpos];
+ }
+ writepos++;
+ readpos++;
+ }
+ }
+ }
}
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/operators/file/DelimitedDataTupleParser.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/operators/file/DelimitedDataTupleParser.java
index be6c42a..8846110 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/operators/file/DelimitedDataTupleParser.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/operators/file/DelimitedDataTupleParser.java
@@ -28,10 +28,10 @@
private final DelimitedDataParser dataParser;
public DelimitedDataTupleParser(IHyracksTaskContext ctx, ARecordType recType,
- IValueParserFactory[] valueParserFactories, char fieldDelimter, boolean isPKAutoGenerated,
+ IValueParserFactory[] valueParserFactories, char fieldDelimter, char quote, boolean isPKAutoGenerated,
int primaryKeyPosition, ARecordType origRecordTypeForAutoGeneratedPK) throws HyracksDataException {
super(ctx, recType, isPKAutoGenerated, primaryKeyPosition, origRecordTypeForAutoGeneratedPK);
- dataParser = new DelimitedDataParser(recType, valueParserFactories, fieldDelimter, isPKAutoGenerated,
+ dataParser = new DelimitedDataParser(recType, valueParserFactories, fieldDelimter, quote, isPKAutoGenerated,
primaryKeyPosition, origRecordTypeForAutoGeneratedPK);
}
diff --git a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/operators/file/NtDelimitedDataTupleParserFactory.java b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/operators/file/NtDelimitedDataTupleParserFactory.java
index f88d39a..a301c7d 100644
--- a/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/operators/file/NtDelimitedDataTupleParserFactory.java
+++ b/asterix-runtime/src/main/java/edu/uci/ics/asterix/runtime/operators/file/NtDelimitedDataTupleParserFactory.java
@@ -30,17 +30,25 @@
protected ARecordType recordType;
protected IValueParserFactory[] valueParserFactories;
protected char fieldDelimiter;
+ // quote is used to enclose a string if it includes delimiter(s) in it.
+ protected char quote;
// To deal with an auto-generated PK
protected boolean isPKAutoGenerated;
protected int primaryKeyPosition;
protected ARecordType origRecordTypeForAutoGeneratedPK;
public NtDelimitedDataTupleParserFactory(ARecordType recordType, IValueParserFactory[] valueParserFactories,
- char fieldDelimiter, boolean isPKAutoGenerated,
- int primaryKeyposition, ARecordType origRecordTypeForAutoGeneratedPK) {
+ char fieldDelimiter, char quote) {
+ this(recordType, valueParserFactories, fieldDelimiter, quote, false, -1, null);
+ }
+
+ public NtDelimitedDataTupleParserFactory(ARecordType recordType, IValueParserFactory[] valueParserFactories,
+ char fieldDelimiter, char quote, boolean isPKAutoGenerated, int primaryKeyposition,
+ ARecordType origRecordTypeForAutoGeneratedPK) {
this.recordType = recordType;
this.valueParserFactories = valueParserFactories;
this.fieldDelimiter = fieldDelimiter;
+ this.quote = quote;
this.isPKAutoGenerated = isPKAutoGenerated;
this.primaryKeyPosition = primaryKeyposition;
this.origRecordTypeForAutoGeneratedPK = origRecordTypeForAutoGeneratedPK;
@@ -48,8 +56,7 @@
@Override
public ITupleParser createTupleParser(final IHyracksTaskContext ctx) throws HyracksDataException {
- return new DelimitedDataTupleParser(ctx, recordType, valueParserFactories, fieldDelimiter,
+ return new DelimitedDataTupleParser(ctx, recordType, valueParserFactories, fieldDelimiter, quote,
isPKAutoGenerated, primaryKeyPosition, origRecordTypeForAutoGeneratedPK);
}
-
}
diff --git a/asterix-tools/src/main/java/edu/uci/ics/asterix/tools/external/data/RateControlledFileSystemBasedAdapterFactory.java b/asterix-tools/src/main/java/edu/uci/ics/asterix/tools/external/data/RateControlledFileSystemBasedAdapterFactory.java
index 0306a80..5f30465 100644
--- a/asterix-tools/src/main/java/edu/uci/ics/asterix/tools/external/data/RateControlledFileSystemBasedAdapterFactory.java
+++ b/asterix-tools/src/main/java/edu/uci/ics/asterix/tools/external/data/RateControlledFileSystemBasedAdapterFactory.java
@@ -102,7 +102,8 @@
}
@Override
- public void configure(Map<String, String> configuration, ARecordType recordType, boolean isPKAutoGenerated, List<String> primaryKeys) throws Exception {
+ public void configure(Map<String, String> configuration, ARecordType recordType, boolean isPKAutoGenerated,
+ List<String> primaryKeys) throws Exception {
this.configuration = configuration;
checkRequiredArgs(configuration);
String fileSystem = (String) configuration.get(KEY_FILE_SYSTEM);
@@ -134,13 +135,11 @@
break;
case FORMAT_DELIMITED_TEXT:
- String delimiterValue = (String) configuration.get(KEY_DELIMITER);
- if (delimiterValue != null && delimiterValue.length() > 1) {
- throw new AsterixException("improper delimiter");
- }
+ char delimiter = StreamBasedAdapterFactory.getDelimiter(configuration);
+ char quote = StreamBasedAdapterFactory.getQuote(configuration, delimiter);
IValueParserFactory[] valueParserFactories = getValueParserFactories(atype);
- parserFactory = new RateControlledTupleParserFactory(atype, valueParserFactories,
- delimiterValue.charAt(0), configuration);
+ parserFactory = new RateControlledTupleParserFactory(atype, valueParserFactories, delimiter, quote,
+ configuration);
break;
}
}
@@ -161,7 +160,7 @@
}
@Override
- public void setFiles(List<ExternalFile> files) throws AlgebricksException{
+ public void setFiles(List<ExternalFile> files) throws AlgebricksException {
throw new AlgebricksException("can't set files for this Adapter");
}
@@ -175,6 +174,7 @@
private final Map<String, String> configuration;
private IValueParserFactory[] valueParserFactories;
private char delimiter;
+ private char quote;
private final ParserType parserType;
public enum ParserType {
@@ -183,10 +183,17 @@
}
public RateControlledTupleParserFactory(ARecordType recordType, IValueParserFactory[] valueParserFactories,
- char fieldDelimiter, Map<String, String> configuration) {
+ char fieldDelimiter, Map<String, String> configuration) throws AsterixException {
+ this(recordType, valueParserFactories, fieldDelimiter, StreamBasedAdapterFactory.getQuote(configuration,
+ fieldDelimiter), configuration);
+ }
+
+ public RateControlledTupleParserFactory(ARecordType recordType, IValueParserFactory[] valueParserFactories,
+ char fieldDelimiter, char quote, Map<String, String> configuration) {
this.recordType = recordType;
this.valueParserFactories = valueParserFactories;
this.delimiter = fieldDelimiter;
+ this.quote = quote;
this.configuration = configuration;
this.parserType = ParserType.DELIMITED_DATA;
}
@@ -205,7 +212,8 @@
dataParser = new ADMDataParser();
break;
case DELIMITED_DATA:
- dataParser = new DelimitedDataParser(recordType, valueParserFactories, delimiter, false, -1, null);
+ dataParser = new DelimitedDataParser(recordType, valueParserFactories, delimiter, quote, false, -1,
+ null);
break;
}
return new RateControlledTupleParser(ctx, recordType, dataParser, configuration);