Issue 867: Handle delimited files using CR-only line separators
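Also simplify record- and field-counting logic: the cursor now tracks
recordCount and fieldCount itself, nextField() no longer takes a field
index, and parse errors report the record number instead of the line number.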
Change-Id: Ie28abda93fc9e5996008fac8b60aaf906df49cb7
Reviewed-on: https://asterix-gerrit.ics.uci.edu/246
Reviewed-by: Ian Maxon <imaxon@uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Preston Carman <ecarm002@ucr.edu>
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
index 6fd38d2..5be1eab 100644
--- a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/DelimitedDataTupleParserFactory.java
@@ -37,8 +37,6 @@
private char fieldDelimiter;
private char quote;
- private int fieldCount;
-
public DelimitedDataTupleParserFactory(IValueParserFactory[] fieldParserFactories, char fieldDelimiter) {
this(fieldParserFactories, fieldDelimiter, '\"');
}
@@ -47,7 +45,6 @@
this.valueParserFactories = fieldParserFactories;
this.fieldDelimiter = fieldDelimiter;
this.quote = quote;
- this.fieldCount = 0;
}
@Override
@@ -71,7 +68,7 @@
while (cursor.nextRecord()) {
tb.reset();
for (int i = 0; i < valueParsers.length; ++i) {
- if (!cursor.nextField(fieldCount)) {
+ if (!cursor.nextField()) {
break;
}
// Eliminate double quotes in the field that we are going to parse
@@ -82,7 +79,6 @@
}
valueParsers[i].parse(cursor.buffer, cursor.fStart, cursor.fEnd - cursor.fStart, dos);
tb.addFieldEndOffset();
- fieldCount++;
}
if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
FrameUtils.flushFrame(frame, writer);
diff --git a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
index 69ea0b1..780574c 100644
--- a/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
+++ b/hyracks/hyracks-dataflow-std/src/main/java/edu/uci/ics/hyracks/dataflow/std/file/FieldCursorForDelimitedDataParser.java
@@ -32,7 +32,8 @@
public char[] buffer;
public int fStart;
public int fEnd;
- public int lineCount;
+ public int recordCount;
+ public int fieldCount;
public int doubleQuoteCount;
public boolean isDoubleQuoteIncludedInThisField;
@@ -69,10 +70,13 @@
doubleQuoteCount = 0;
startedQuote = false;
isDoubleQuoteIncludedInThisField = false;
- lineCount = 1;
+ recordCount = 0;
+ fieldCount = 0;
}
public boolean nextRecord() throws IOException {
+ recordCount++;
+ fieldCount = 0;
while (true) {
switch (state) {
case INIT:
@@ -119,12 +123,12 @@
} else if (ch == '\n' && !startedQuote) {
start = p + 1;
state = State.EOR;
- lineCount++;
lastDelimiterPosition = p;
break;
} else if (ch == '\r' && !startedQuote) {
start = p + 1;
state = State.CR;
+ lastDelimiterPosition = p;
break;
}
++p;
@@ -143,7 +147,6 @@
if (ch == '\n' && !startedQuote) {
++start;
state = State.EOR;
- lineCount++;
} else {
state = State.IN_RECORD;
return true;
@@ -167,7 +170,8 @@
}
}
- public boolean nextField(int fieldCount) throws IOException {
+ public boolean nextField() throws IOException {
+ fieldCount++;
switch (state) {
case INIT:
case EOR:
@@ -217,10 +221,10 @@
} else {
// In this case, we don't have a quote in the beginning of a field.
throw new IOException(
- "At line: "
- + lineCount
+ "At record: "
+ + recordCount
+ ", field#: "
- + (fieldCount + 1)
+ + fieldCount
+ " - a quote enclosing a field needs to be placed in the beginning of that field.");
}
}
@@ -262,7 +266,7 @@
// There is a quote before the delimiter, however it is not directly placed before the delimiter.
// In this case, we throw an exception.
// quoteCount == doubleQuoteCount * 2 + 2 : only true when we have two quotes except double-quotes.
- throw new IOException("At line: " + lineCount + ", field#: " + (fieldCount + 1)
+ throw new IOException("At record: " + recordCount + ", field#: " + fieldCount
+ " - A quote enclosing a field needs to be followed by the delimiter.");
}
}
@@ -275,7 +279,6 @@
fEnd = p;
start = p + 1;
state = State.EOR;
- lineCount++;
lastDelimiterPosition = p;
return true;
} else if (startedQuote && lastQuotePosition == p - 1 && lastDoubleQuotePosition != p - 1
@@ -286,7 +289,6 @@
lastDelimiterPosition = p;
start = p + 1;
state = State.EOR;
- lineCount++;
startedQuote = false;
return true;
}