[ASTERIXDB-3429][EXT] Configurable CSV escape char
- user model changes: no
- storage format changes: no
- interface changes: yes
Details:
- Allow the CSV/delimited text parser to use escapes
for quotes within quoted fields other than quote
itself
Change-Id: I50bebc4b8b683889855cb5dd048ab27d7c93af76
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/18373
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Ian Maxon <imaxon@apache.org>
Reviewed-by: Ali Alsuliman <ali.al.solaiman@gmail.com>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
diff --git a/asterixdb/asterix-app/data/csv/nonstandard_escape.csv b/asterixdb/asterix-app/data/csv/nonstandard_escape.csv
new file mode 100644
index 0000000..b46a9f0
--- /dev/null
+++ b/asterixdb/asterix-app/data/csv/nonstandard_escape.csv
@@ -0,0 +1 @@
+1,"It says \"The quick, fox jumped over the lazy dog\""
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp
index 65816f6..6b8738c 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.1.ddl.sqlpp
@@ -26,10 +26,12 @@
CREATE TYPE t2 AS {f1: string, f2: string, f3: string};
CREATE TYPE t3 AS {f1: int?, f2: boolean, f3: string?};
CREATE TYPE t4 AS {f1: string, f2: string, f3: string, f4: string};
+CREATE TYPE t5 AS {f1: int, f2: string};
CREATE EXTERNAL DATASET ds1(t1) USING localfs(("path"="asterix_nc1://data/csv/sample_09.csv"), ("format"="CSV"), ("header"="FALSE"));
CREATE EXTERNAL DATASET ds2(t2) USING localfs(("path"="asterix_nc1://data/csv/sample_10.csv"), ("format"="Csv"), ("header"="False"));
CREATE EXTERNAL DATASET ds3(t1) USING localfs(("path"="asterix_nc1://data/csv/sample_11.csv"), ("format"="csv"), ("header"="FALSE"));
CREATE EXTERNAL DATASET ds4(t3) USING localfs(("path"="asterix_nc1://data/csv/sample_12.csv"), ("format"="csv"), ("header"="True"), ("null"=""));
CREATE EXTERNAL DATASET ds5(t4) USING localfs(("path"="asterix_nc1://data/csv/sample_13.csv"), ("format"="csv"), ("header"="True"));
-CREATE EXTERNAL DATASET ds6(t4) USING localfs(("path"="asterix_nc1://data/csv/empty_lines.csv"), ("format"="csv"), ("header"="false"));
\ No newline at end of file
+CREATE EXTERNAL DATASET ds6(t4) USING localfs(("path"="asterix_nc1://data/csv/empty_lines.csv"), ("format"="csv"), ("header"="false"));
+CREATE EXTERNAL DATASET ds7(t5) USING localfs(("path"="asterix_nc1://data/csv/nonstandard_escape.csv"), ("format"="csv"), ("header"="false"),("escape"="\\"));
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.query.sqlpp
new file mode 100644
index 0000000..f3a3606
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.query.sqlpp
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+FROM ds7 v SELECT VALUE v ORDER BY v.f1;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.99.ddl.sqlpp
similarity index 100%
rename from asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.8.ddl.sqlpp
rename to asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/csv-tsv-parser/csv-parser-001/csv-parser-001.99.ddl.sqlpp
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp
index 32df960..b79ae18 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_04/csv_04.2.update.sqlpp
@@ -26,5 +26,5 @@
use temp;
-load dataset testds using localfs (("path"="asterix_nc1://data/csv/sample_03.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\""));
+load dataset testds using localfs (("path"="asterix_nc1://data/csv/sample_03.csv"),("format"="csv"),("header"="false"));
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_05/csv_05.2.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_05/csv_05.2.update.sqlpp
index 71a79c7..156bbf2 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_05/csv_05.2.update.sqlpp
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_05/csv_05.2.update.sqlpp
@@ -26,5 +26,5 @@
use temp;
-load dataset testds using localfs (("path"="asterix_nc1://data/csv/sample_04_quote_error.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\""));
+load dataset testds using localfs (("path"="asterix_nc1://data/csv/sample_04_quote_error.csv"),("format"="csv"),("header"="false"));
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_06/csv_06.2.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_06/csv_06.2.update.sqlpp
index e7b19f5..68b4f4a 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_06/csv_06.2.update.sqlpp
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_06/csv_06.2.update.sqlpp
@@ -27,5 +27,5 @@
use temp;
-load dataset testds using localfs (("path"="asterix_nc1://data/csv/sample_05_space_error_1.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\""));
+load dataset testds using localfs (("path"="asterix_nc1://data/csv/sample_05_space_error_1.csv"),("format"="csv"),("header"="false"));
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_07/csv_07.2.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_07/csv_07.2.update.sqlpp
index 32988b6..103fe3a 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_07/csv_07.2.update.sqlpp
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_07/csv_07.2.update.sqlpp
@@ -26,5 +26,5 @@
use temp;
-load dataset testds using localfs (("path"="asterix_nc1://data/csv/sample_06_space_error_2.csv"),("format"="delimited-text"),("delimiter"=","),("quote"="\""));
+load dataset testds using localfs (("path"="asterix_nc1://data/csv/sample_06_space_error_2.csv"),("format"="csv"),("header"="false"));
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.1.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.1.ddl.sqlpp
new file mode 100644
index 0000000..191a8ac
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.1.ddl.sqlpp
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/**
+ *
+ * CSV file loading test
+ * Expected result: success
+ *
+ */
+
+drop dataverse temp if exists;
+create dataverse temp;
+
+use temp;
+
+CREATE TYPE temp.test AS {f1: int, f2: string};
+
+create dataset testds(test) primary key id;
+
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.2.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.2.update.sqlpp
new file mode 100644
index 0000000..b5633a9
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.2.update.sqlpp
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/**
+ *
+ * CSV file loading test
+ * Expected result: success
+ *
+ */
+
+use temp;
+
+
+load dataset testds using localfs ((`path`=`asterix_nc1://data/csv/nonstandard_escape.csv`),(`format`=`csv`),(`header`=`false`),(`null`=``),(`escape`=`\\`);
+
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.3.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.3.query.sqlpp
new file mode 100644
index 0000000..7a9bdc6
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/load/csv_nonstandard/csv_nonstandard.3.query.sqlpp
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/**
+ *
+ * CSV file loading test
+ * Expected result: success
+ *
+ */
+
+use temp;
+
+FROM testds v SELECT VALUE v ORDER BY v.f1;
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.8.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.8.adm
new file mode 100644
index 0000000..73d07f3
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/csv-tsv-parser/csv-parser-001/csv-parser-001.8.adm
@@ -0,0 +1 @@
+{ "f1": 1, "f2": "It says \"The quick, fox jumped over the lazy dog\"" }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/load/csv_nonstandard/csv_nonstandard.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/load/csv_nonstandard/csv_nonstandard.1.adm
new file mode 100644
index 0000000..73d07f3
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/load/csv_nonstandard/csv_nonstandard.1.adm
@@ -0,0 +1 @@
+{ "f1": 1, "f2": "It says \"The quick, fox jumped over the lazy dog\"" }
\ No newline at end of file
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java
index ad59f75..84d1541 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/converter/CSVToRecordWithMetadataAndPKConverter.java
@@ -47,7 +47,7 @@
IExternalDataRuntimeContext context) {
IWarningCollector warningCollector = context.getTaskContext().getWarningCollector();
this.cursor = new FieldCursorForDelimitedDataParser(null, delimiter, ExternalDataConstants.QUOTE,
- warningCollector, ExternalDataConstants.EMPTY_STRING);
+ ExternalDataConstants.QUOTE, warningCollector, ExternalDataConstants.EMPTY_STRING);
this.record = new CharArrayRecord();
this.valueIndex = valueIndex;
this.recordWithMetadata = new RecordWithMetadataAndPK<>(record, metaType.getFieldTypes(), recordType,
@@ -64,8 +64,8 @@
int j = 0;
FieldCursorForDelimitedDataParser.Result lastResult;
while ((lastResult = cursor.nextField()) == FieldCursorForDelimitedDataParser.Result.OK) {
- if (cursor.fieldHasDoubleQuote()) {
- cursor.eliminateDoubleQuote();
+ if (cursor.fieldHasEscapedQuote()) {
+ cursor.eliminateEscapeChar();
}
if (i == valueIndex) {
record.setValue(cursor.getBuffer(), cursor.getFieldStart(), cursor.getFieldLength());
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
index 4433b49..2035a8e 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/stream/QuotedLineRecordReader.java
@@ -55,7 +55,7 @@
String quoteString = config.get(ExternalDataConstants.KEY_QUOTE);
ExternalDataUtils.validateChar(quoteString, ExternalDataConstants.KEY_QUOTE);
this.quote = quoteString.charAt(0);
- this.escape = ExternalDataUtils.validateGetEscape(config);
+ this.escape = ExternalDataUtils.validateGetEscape(config, config.get(ExternalDataConstants.KEY_FORMAT));
}
@Override
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java
index b8a7480..7f890e1 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/DelimitedDataParser.java
@@ -61,6 +61,7 @@
private final IWarningCollector warnings;
private final char fieldDelimiter;
private final char quote;
+ private final char escape;
private final boolean hasHeader;
private final ARecordType recordType;
private final IARecordBuilder recBuilder;
@@ -79,14 +80,15 @@
private FieldCursorForDelimitedDataParser cursor;
public DelimitedDataParser(IExternalDataRuntimeContext context, IValueParserFactory[] valueParserFactories,
- char fieldDelimiter, char quote, boolean hasHeader, ARecordType recordType, boolean isStreamParser,
- String nullString) throws HyracksDataException {
+ char fieldDelimiter, char quote, char escape, boolean hasHeader, ARecordType recordType,
+ boolean isStreamParser, String nullString) throws HyracksDataException {
this.dataSourceName = context.getDatasourceNameSupplier();
this.lineNumber = context.getLineNumberSupplier();
this.warnings = context.getTaskContext().getWarningCollector();
this.valueEmbedder = context.getValueEmbedder();
this.fieldDelimiter = fieldDelimiter;
this.quote = quote;
+ this.escape = escape;
this.hasHeader = hasHeader;
this.recordType = recordType;
valueParsers = new IValueParser[valueParserFactories.length];
@@ -127,7 +129,7 @@
fieldNames[i] = name;
}
if (!isStreamParser) {
- cursor = new FieldCursorForDelimitedDataParser(null, this.fieldDelimiter, quote, warnings,
+ cursor = new FieldCursorForDelimitedDataParser(null, this.fieldDelimiter, quote, escape, warnings,
this::getDataSourceName);
}
this.nullChars = nullString != null ? nullString.toCharArray() : null;
@@ -186,8 +188,8 @@
}
fieldValueBufferOutput.writeByte(fieldTypeTags[i]);
// Eliminate double quotes in the field that we are going to parse
- if (cursor.fieldHasDoubleQuote()) {
- cursor.eliminateDoubleQuote();
+ if (cursor.fieldHasEscapedQuote()) {
+ cursor.eliminateEscapeChar();
}
boolean success = valueParsers[i].parse(cursor.getBuffer(), cursor.getFieldStart(),
cursor.getFieldLength(), fieldValueBufferOutput);
@@ -232,8 +234,8 @@
@Override
public void setInputStream(InputStream in) throws IOException {
// TODO(ali): revisit this in regards to stream
- cursor = new FieldCursorForDelimitedDataParser(new InputStreamReader(in), fieldDelimiter, quote, warnings,
- this::getDataSourceName);
+ cursor = new FieldCursorForDelimitedDataParser(new InputStreamReader(in), fieldDelimiter, quote, escape,
+ warnings, this::getDataSourceName);
if (hasHeader) {
cursor.nextRecord();
FieldCursorForDelimitedDataParser.Result result;
@@ -249,8 +251,8 @@
@Override
public boolean reset(InputStream in) throws IOException {
// TODO(ali): revisit this in regards to stream
- cursor = new FieldCursorForDelimitedDataParser(new InputStreamReader(in), fieldDelimiter, quote, warnings,
- this::getDataSourceName);
+ cursor = new FieldCursorForDelimitedDataParser(new InputStreamReader(in), fieldDelimiter, quote, escape,
+ warnings, this::getDataSourceName);
return true;
}
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java
index 8b88a69..bad742e 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/parser/factory/DelimitedDataParserFactory.java
@@ -50,9 +50,11 @@
IValueParserFactory[] valueParserFactories = ExternalDataUtils.getValueParserFactories(recordType);
char delimiter = ExternalDataUtils.validateGetDelimiter(configuration);
char quote = ExternalDataUtils.validateGetQuote(configuration, delimiter);
+ char escape =
+ ExternalDataUtils.validateGetEscape(configuration, configuration.get(ExternalDataConstants.KEY_FORMAT));
boolean hasHeader = ExternalDataUtils.hasHeader(configuration);
String nullString = configuration.get(ExternalDataConstants.KEY_NULL_STR);
- return new DelimitedDataParser(context, valueParserFactories, delimiter, quote, hasHeader, recordType,
+ return new DelimitedDataParser(context, valueParserFactories, delimiter, quote, escape, hasHeader, recordType,
ExternalDataUtils.getDataSourceType(configuration).equals(DataSourceType.STREAM), nullString);
}
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
index 5dfa803..3139be7 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataConstants.java
@@ -246,6 +246,8 @@
* Constant characters
*/
public static final char ESCAPE = '\\';
+
+ public static final char CSV_ESCAPE = '\"';
public static final char QUOTE = '"';
public static final char SPACE = ' ';
public static final char TAB = '\t';
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
index 3298759..e5b2c9e 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
@@ -143,7 +143,10 @@
return quote;
}
- public static char validateGetEscape(Map<String, String> configuration) throws HyracksDataException {
+ public static char validateGetEscape(Map<String, String> configuration, String format) throws HyracksDataException {
+ if (ExternalDataConstants.FORMAT_CSV.equals(format)) {
+ return validateCharOrDefault(configuration, KEY_ESCAPE, ExternalDataConstants.CSV_ESCAPE);
+ }
return validateCharOrDefault(configuration, KEY_ESCAPE, ExternalDataConstants.ESCAPE);
}
@@ -578,7 +581,7 @@
}
char delimiter = validateGetDelimiter(configuration);
validateGetQuote(configuration, delimiter);
- validateGetEscape(configuration);
+ validateGetEscape(configuration, format);
String value = configuration.get(ExternalDataConstants.KEY_REDACT_WARNINGS);
if (value != null && !isBoolean(value)) {
throw new RuntimeDataException(ErrorCode.INVALID_REQ_PARAM_VAL, ExternalDataConstants.KEY_REDACT_WARNINGS,
diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
index b8b2ba8..34d17c9 100644
--- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
+++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
@@ -43,16 +43,18 @@
private IValueParserFactory[] valueParserFactories;
private char fieldDelimiter;
private char quote;
+ private char escape;
public DelimitedDataTupleParserFactory(IValueParserFactory[] fieldParserFactories, char fieldDelimiter) {
- this(fieldParserFactories, fieldDelimiter, '\"');
+ this(fieldParserFactories, fieldDelimiter, '\"', '\"');
}
- public DelimitedDataTupleParserFactory(IValueParserFactory[] fieldParserFactories, char fieldDelimiter,
- char quote) {
+ public DelimitedDataTupleParserFactory(IValueParserFactory[] fieldParserFactories, char fieldDelimiter, char quote,
+ char escape) {
this.valueParserFactories = fieldParserFactories;
this.fieldDelimiter = fieldDelimiter;
this.quote = quote;
+ this.escape = escape;
}
@Override
@@ -74,7 +76,7 @@
DataOutput dos = tb.getDataOutput();
FieldCursorForDelimitedDataParser cursor = new FieldCursorForDelimitedDataParser(
- new InputStreamReader(in), fieldDelimiter, quote, warningCollector, () -> "");
+ new InputStreamReader(in), fieldDelimiter, quote, escape, warningCollector, () -> "");
while (cursor.nextRecord()) {
tb.reset();
for (int i = 0; i < valueParsers.length; ++i) {
@@ -88,9 +90,9 @@
default:
throw new IllegalStateException();
}
- // Eliminate double quotes in the field that we are going to parse
- if (cursor.fieldHasDoubleQuote()) {
- cursor.eliminateDoubleQuote();
+ // Eliminate escaped quotes in the field that we are going to parse
+ if (cursor.fieldHasEscapedQuote()) {
+ cursor.eliminateEscapeChar();
}
if (!valueParsers[i].parse(cursor.getBuffer(), cursor.getFieldStart(),
cursor.getFieldLength(), dos)) {
diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
index ffc87cd..7cba88c 100644
--- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
+++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/main/java/org/apache/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
@@ -53,8 +53,8 @@
private int fEnd; //end position for field
private long lineCount; //count of lines
private int fieldCount; //count of fields in current record
- private int doubleQuoteCount; //count of double quotes
- private boolean isDoubleQuoteIncludedInThisField; //does current field include double quotes
+ private int escapedQuoteCount; //count of escaped quotes
+ private boolean containsEscapedQuotes; //does current field contain escaped quotes
private static final int INITIAL_BUFFER_SIZE = 4096;//initial buffer size
private static final int INCREMENT = 4096; //increment size
@@ -66,15 +66,18 @@
private State state; //state (see states above)
private int lastQuotePosition; //position of last quote
- private int lastDoubleQuotePosition; //position of last double quote
+ private int lastEscapedQuotePosition; //position of last escaped quote
private int lastDelimiterPosition; //position of last delimiter
+ private int lastEscapePosition; //position of last escape
private int quoteCount; //count of single quotes
private boolean startedQuote; //whether a quote has been started
private final char quote; //the quote character
private final char fieldDelimiter; //the delimiter
- public FieldCursorForDelimitedDataParser(Reader in, char fieldDelimiter, char quote,
+ private final char escape;
+
+ public FieldCursorForDelimitedDataParser(Reader in, char fieldDelimiter, char quote, char escape,
IWarningCollector warningCollector, Supplier<String> dataSourceName) {
this.warnings = warningCollector;
this.dataSourceName = dataSourceName;
@@ -89,13 +92,15 @@
state = State.INIT;
this.quote = quote;
this.fieldDelimiter = fieldDelimiter;
+ this.escape = escape;
lastDelimiterPosition = -1;
lastQuotePosition = -1;
- lastDoubleQuotePosition = -1;
+ lastEscapedQuotePosition = -1;
+ lastEscapePosition = -1;
quoteCount = 0;
- doubleQuoteCount = 0;
+ escapedQuoteCount = 0;
startedQuote = false;
- isDoubleQuoteIncludedInThisField = false;
+ containsEscapedQuotes = false;
lineCount = 1;
fieldCount = 0;
}
@@ -116,8 +121,8 @@
return fStart == fEnd;
}
- public boolean fieldHasDoubleQuote() {
- return isDoubleQuoteIncludedInThisField;
+ public boolean fieldHasEscapedQuote() {
+ return containsEscapedQuotes;
}
public int getFieldCount() {
@@ -133,11 +138,12 @@
fieldCount = 0;
lastDelimiterPosition = -1;
lastQuotePosition = -1;
- lastDoubleQuotePosition = -1;
+ lastEscapedQuotePosition = -1;
+ lastEscapePosition = -1;
quoteCount = 0;
- doubleQuoteCount = 0;
+ escapedQuoteCount = 0;
startedQuote = false;
- isDoubleQuoteIncludedInThisField = false;
+ containsEscapedQuotes = false;
start = 0;
end = recordLength;
state = State.IN_RECORD;
@@ -171,22 +177,34 @@
}
p -= (s - start);
lastQuotePosition -= (s - start);
- lastDoubleQuotePosition -= (s - start);
+ lastEscapedQuotePosition -= (s - start);
lastDelimiterPosition -= (s - start);
}
char ch = buffer[p];
// We perform rough format correctness (delimiter, quote) check here
// to set the starting position of a record.
// In the field level, more checking will be conducted.
+ if (ch == escape) {
+ // this may or may not be an escape. the next character must be a quote for it to be.
+ lastEscapePosition = p;
+ }
if (ch == quote) {
- startedQuote = true;
- // check two quotes in a row - "". This is an escaped quote
- if (lastQuotePosition == p - 1 && start != p - 1 && lastDoubleQuotePosition != p - 1) {
- lastDoubleQuotePosition = p;
+ boolean couldBeEscapedQuote =
+ lastEscapePosition == p - 1 && lastEscapedQuotePosition != p - 1;
+ if (quote == escape) {
+ startedQuote = true;
+ // check two quotes in a row that aren't at the start of a field if quote is escape, e.g. ""
+ if (couldBeEscapedQuote && start != p - 1) {
+ lastEscapedQuotePosition = p;
+ }
+ } else {
+ if (couldBeEscapedQuote) {
+ lastEscapedQuotePosition = p;
+ }
}
lastQuotePosition = p;
} else if (ch == fieldDelimiter) {
- if (startedQuote && lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1) {
+ if (startedQuote && lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1) {
startedQuote = false;
lastDelimiterPosition = p;
}
@@ -266,11 +284,12 @@
fieldCount++;
// reset quote related values
startedQuote = false;
- isDoubleQuoteIncludedInThisField = false;
+ containsEscapedQuotes = false;
lastQuotePosition = -1;
- lastDoubleQuotePosition = -1;
+ lastEscapedQuotePosition = -1;
+ lastEscapePosition = -1;
quoteCount = 0;
- doubleQuoteCount = 0;
+ escapedQuoteCount = 0;
char lastChar = '\0';
int p = start;
@@ -280,7 +299,7 @@
boolean eof = !readMore();
p -= (s - start);
lastQuotePosition -= (lastQuotePosition > -1) ? (s - start) : 0;
- lastDoubleQuotePosition -= (lastDoubleQuotePosition > -1) ? (s - start) : 0;
+ lastEscapedQuotePosition -= (lastEscapedQuotePosition > -1) ? (s - start) : 0;
lastDelimiterPosition -= (lastDelimiterPosition > -1) ? (s - start) : 0;
if (eof) {
state = State.EOF;
@@ -288,8 +307,8 @@
fStart = start;
fEnd = p;
} else {
- if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
- && quoteCount == doubleQuoteCount * 2 + 2) {
+ if (lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1
+ && quoteCount == escapedQuoteCount * (escape == quote ? 2 : 1) + 2) {
// set the position of fStart to +1, fEnd to -1 to remove quote character
fStart = start + 1;
fEnd = p - 1;
@@ -319,16 +338,18 @@
return Result.ERROR;
}
}
- // Check double quotes - "". We check [start != p-2]
+ // Check escaped quotes - \ESC". We check [start != p-2] if escape is quote
// to avoid false positive where there is no value in a field,
- // since it looks like a double quote. However, it's not a double quote.
+ // since it looks like an escaped quote. However, it's not an escaped quote.
// (e.g. if field2 has no value:
// field1,"",field3 ... )
- if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
- && lastQuotePosition != start) {
- isDoubleQuoteIncludedInThisField = true;
- doubleQuoteCount++;
- lastDoubleQuotePosition = p;
+ boolean couldBeEscaped = lastEscapePosition == p - 1 && lastEscapedQuotePosition != p - 1;
+ boolean isEscapedQuote =
+ quote == escape ? couldBeEscaped && lastQuotePosition != start : couldBeEscaped;
+ if (isEscapedQuote) {
+ containsEscapedQuotes = true;
+ escapedQuoteCount++;
+ lastEscapedQuotePosition = p;
}
lastQuotePosition = p;
quoteCount++;
@@ -343,9 +364,9 @@
return Result.OK;
}
- if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
+ if (lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1
&& lastQuotePosition != start) {
- // There is a quote right before the delimiter (e.g. ",) and it is not two quote,
+ // There is a quote right before the delimiter (e.g. ",) and it is not an escaped quote,
// then the field contains a valid string.
// We set the position of fStart to +1, fEnd to -1 to remove quote character
fStart = start + 1;
@@ -354,8 +375,8 @@
lastDelimiterPosition = p;
startedQuote = false;
return Result.OK;
- } else if (lastQuotePosition < p - 1 && lastQuotePosition != lastDoubleQuotePosition
- && quoteCount == doubleQuoteCount * 2 + 2) {
+ } else if (lastQuotePosition < p - 1 && lastQuotePosition != lastEscapedQuotePosition
+ && quoteCount == escapedQuoteCount * (escape == quote ? 2 : 1) + 2) {
// There is a quote before the delimiter, however it is not directly placed before the delimiter.
// In this case, we throw an exception.
// quoteCount == doubleQuoteCount * 2 + 2 : only true when we have two quotes except double-quotes.
@@ -376,8 +397,8 @@
state = ch == '\n' ? State.EOR : State.CR;
lastDelimiterPosition = p;
return Result.OK;
- } else if (lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
- && quoteCount == doubleQuoteCount * 2 + 2) {
+ } else if (lastQuotePosition == p - 1 && lastEscapedQuotePosition != p - 1
+ && quoteCount == escapedQuoteCount * (escape == quote ? 2 : 1) + 2) {
// set the position of fStart to +1, fEnd to -1 to remove quote character
fStart = start + 1;
fEnd = p - 1;
@@ -388,6 +409,12 @@
return Result.OK;
}
}
+ if (ch == escape) {
+ //RFC4180 defines the escape character for quotes as quotes. however CSV is not a well-defined
+ //format, and so frequently nonstandard escaping such as C-style \ escaping is used.
+ //Therefore, we need to track potential escapes separately to support these cases.
+ lastEscapePosition = p;
+ }
// count lines inside quotes
if (ch == '\r' || (ch == '\n' && lastChar != '\r')) {
lineCount++;
@@ -421,17 +448,17 @@
return true;
}
- // Eliminate escaped double quotes("") in a field
- public void eliminateDoubleQuote() {
- int lastDoubleQuotePosition = -1;
+ // Eliminate escaped quotes("" by default) in a field
+ public void eliminateEscapeChar() {
+ int lastEsc = -1;
int writepos = fStart;
int readpos = fStart;
int length = fEnd - fStart;
// Find positions where double quotes appear
for (int i = 0; i < length; i++) {
// Skip double quotes
- if (buffer[readpos] == quote && lastDoubleQuotePosition != readpos - 1) {
- lastDoubleQuotePosition = readpos;
+ if (buffer[readpos] == escape && lastEsc != readpos - 1) {
+ lastEsc = readpos;
readpos++;
} else {
// Moving characters except double quote to the front
@@ -442,8 +469,8 @@
readpos++;
}
}
- fEnd -= doubleQuoteCount;
- isDoubleQuoteIncludedInThisField = false;
+ fEnd -= escapedQuoteCount;
+ containsEscapedQuotes = false;
}
private void warn(String message) {
diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java
index 5561ad1..10b3cd3 100644
--- a/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-dataflow-std/src/test/java/org/apache/hyracks/dataflow/std/file/CursorTest.java
@@ -40,7 +40,7 @@
reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
// skip header
final FieldCursorForDelimitedDataParser cursor =
- new FieldCursorForDelimitedDataParser(reader, ',', '"', null, () -> "");
+ new FieldCursorForDelimitedDataParser(reader, ',', '"', '"', null, () -> "");
// get number of fields from header (first record is header)
cursor.nextRecord();
int numOfFields = 0;
@@ -55,8 +55,8 @@
while (cursor.nextRecord()) {
int fieldNumber = 0;
while ((lastResult = cursor.nextField()) == FieldCursorForDelimitedDataParser.Result.OK) {
- if (cursor.fieldHasDoubleQuote()) {
- cursor.eliminateDoubleQuote();
+ if (cursor.fieldHasEscapedQuote()) {
+ cursor.eliminateEscapeChar();
}
fieldNumber++;
}