[ASTERIXDB-3321][FUN]: Return null and warn when string functions hit an invalid Unicode sequence
Change-Id: I67c04de2144f740fd63e85ecbd4efded544db62c
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17986
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Murtadha Hubail <mhubail@apache.org>
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp
new file mode 100644
index 0000000..a533d7e
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// param max-warnings:json=1000
+
+[ // each input contains U+DEAD, a low surrogate with no preceding high surrogate; every call should return null
+ string_length("\uDEAD x \uDEAD"),
+ string_to_codepoint("\uDEAD x \uDEAD"),
+ trim("\uDEAD x \uDEAD"),
+ ltrim("\uDEAD x \uDEAD"),
+ rtrim("\uDEAD x \uDEAD"),
+ trim("\uDEAD x \uDEAD", "x"),
+ ltrim("\uDEAD x \uDEAD", "x"),
+ rtrim("\uDEAD x \uDEAD", "x"),
+ reverse("\uDEAD x \uDEAD"),
+ position("\uDEAD x \uDEAD", "x"),
+ position1("\uDEAD x \uDEAD", "x")
+];
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm
new file mode 100644
index 0000000..a9fc274
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm
@@ -0,0 +1 @@
+[ null, null, null, null, null, null, null, null, null, null, null ]
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
index dc31f7e..9a3abae 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
@@ -11341,6 +11341,22 @@
<output-dir compare="Text">substring-after-5</output-dir>
</compilation-unit>
</test-case>
+ <test-case FilePath="string" check-warnings="true">
+ <compilation-unit name="invalid-unicode"> <!-- 11 expected warnings, one per call in test.000.query.sqlpp; trim, ltrim and rtrim are each called twice (1- and 2-argument forms) -->
+ <output-dir compare="Text">invalid-unicode</output-dir>
+ <expected-warn>ASX0060: Function 'string-length' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+ <expected-warn>ASX0060: Function 'string-to-codepoint' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+ <expected-warn>ASX0060: Function 'trim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+ <expected-warn>ASX0060: Function 'trim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+ <expected-warn>ASX0060: Function 'rtrim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+ <expected-warn>ASX0060: Function 'rtrim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+ <expected-warn>ASX0060: Function 'ltrim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+ <expected-warn>ASX0060: Function 'ltrim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+ <expected-warn>ASX0060: Function 'reverse' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+ <expected-warn>ASX0060: Function 'position' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+ <expected-warn>ASX0060: Function 'position1' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+ </compilation-unit>
+ </test-case>
</test-group>
<test-group name="subquery">
<test-case FilePath="subquery">
diff --git a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
index b0826e8..4910343 100644
--- a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
+++ b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
@@ -87,6 +87,7 @@
PARQUET_CONTAINS_OVERFLOWED_BIGINT(57),
UNEXPECTED_ERROR_ENCOUNTERED(58),
INVALID_PARQUET_FILE(59),
+ FUNCTION_EVALUATION_FAILED(60),
UNSUPPORTED_JRE(100),
diff --git a/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties b/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties
index 0bf523a..074245c 100644
--- a/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties
+++ b/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties
@@ -94,6 +94,7 @@
57 = Parquet file(s) contain unsigned integer that is larger than the '%1$s' range
58 = Error encountered: %1$s
59 = Invalid Parquet file: %1$s. Reason: %2$s
+60 = Function '%1$s' failed to evaluate because: %2$s
100 = Unsupported JRE: %1$s
diff --git a/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java b/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java
index ccb3a8d..928a6b5 100644
--- a/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java
+++ b/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java
@@ -143,6 +143,14 @@
warnInvalidValue(ctx, srcLoc, fid, argIdx, argValue, ErrorCode.NEGATIVE_VALUE);
}
+ public static void warnStringFunctionFailed(IEvaluatorContext ctx, SourceLocation srcLoc, FunctionIdentifier fid,
+ String errMsg) { // emits FUNCTION_EVALUATION_FAILED (code 60) for fid, with errMsg as the reason
+ if (ctx.getWarningCollector().shouldWarn()) { // nothing is reported unless shouldWarn() is true
+ ctx.getWarningCollector()
+ .warn(Warning.of(srcLoc, ErrorCode.FUNCTION_EVALUATION_FAILED, fid.getName(), errMsg));
+ }
+ }
+
private static void warnInvalidValue(IEvaluatorContext ctx, SourceLocation srcLoc, FunctionIdentifier fid,
int argIdx, double argValue, ErrorCode errorCode) {
IWarningCollector warningCollector = ctx.getWarningCollector();
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java
index 2fc8654..9de704b 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java
@@ -35,6 +35,7 @@
import org.apache.hyracks.data.std.primitive.VoidPointable;
import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import org.apache.hyracks.util.exceptions.UTF8EncodingException;
public abstract class AbstractBinaryStringEval implements IScalarEvaluator {
@@ -106,6 +107,9 @@
// The actual processing.
try {
process(leftStringPointable, rightStringPointable, resultPointable);
+ } catch (UTF8EncodingException ex) {
+ PointableHelper.setNull(resultPointable);
+ ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, funcID, ex.getMessage());
} catch (IOException e) {
throw HyracksDataException.create(e);
}
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java
index 5efe529..7a60aae 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java
@@ -37,6 +37,7 @@
import org.apache.hyracks.data.std.util.GrowableArray;
import org.apache.hyracks.data.std.util.UTF8StringBuilder;
import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import org.apache.hyracks.util.exceptions.UTF8EncodingException;
abstract class AbstractUnaryStringStringEval implements IScalarEvaluator {
@@ -84,6 +85,9 @@
try {
process(stringPtr, resultPointable);
writeResult(resultPointable);
+ } catch (UTF8EncodingException ex) {
+ PointableHelper.setNull(resultPointable);
+ ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, funcID, ex.getMessage());
} catch (IOException e) {
throw HyracksDataException.create(e);
}
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
index 47caf14..d9c9ecb 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
@@ -41,6 +41,7 @@
import org.apache.hyracks.data.std.primitive.VoidPointable;
import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import org.apache.hyracks.util.exceptions.UTF8EncodingException;
import org.apache.hyracks.util.string.UTF8StringUtil;
@MissingNullInOutFunction
@@ -89,6 +90,9 @@
result.setValue(len);
int64Serde.serialize(result, out);
resultPointable.set(resultStorage);
+ } catch (UTF8EncodingException ex) {
+ PointableHelper.setNull(resultPointable);
+ ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, getIdentifier(), ex.getMessage());
} catch (IOException e1) {
throw HyracksDataException.create(e1);
}
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java
index 2f6a223..d4f5368 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java
@@ -43,6 +43,7 @@
import org.apache.hyracks.data.std.primitive.VoidPointable;
import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import org.apache.hyracks.util.exceptions.UTF8EncodingException;
import org.apache.hyracks.util.string.UTF8StringUtil;
@MissingNullInOutFunction
@@ -109,6 +110,9 @@
}
listBuilder.write(out, true);
result.set(resultStorage);
+ } catch (UTF8EncodingException ex) {
+ PointableHelper.setNull(result);
+ ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, getIdentifier(), ex.getMessage());
} catch (IOException e1) {
throw HyracksDataException.create(e1);
}
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
index 49f6221..4acc823 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
@@ -20,6 +20,7 @@
import static org.apache.hyracks.util.string.UTF8StringUtil.HIGH_SURROGATE_WITHOUT_LOW_SURROGATE;
import static org.apache.hyracks.util.string.UTF8StringUtil.LOW_SURROGATE_WITHOUT_HIGH_SURROGATE;
+import static org.apache.hyracks.util.string.UTF8StringUtil.MALFORMED_BYTES;
import java.io.IOException;
import java.nio.charset.Charset;
@@ -35,6 +36,7 @@
import org.apache.hyracks.data.std.api.IPointableFactory;
import org.apache.hyracks.data.std.util.GrowableArray;
import org.apache.hyracks.data.std.util.UTF8StringBuilder;
+import org.apache.hyracks.util.exceptions.UTF8EncodingException;
import org.apache.hyracks.util.string.UTF8StringUtil;
import com.fasterxml.jackson.databind.JsonNode;
@@ -136,7 +138,7 @@
}
if (byteIdx != utf8Length) {
- throw new IllegalArgumentException("Decoding error: malformed bytes");
+ throw new UTF8EncodingException(MALFORMED_BYTES);
}
}
@@ -317,7 +319,7 @@
return startMatchPos;
} else {
if (prevHighSurrogate) {
- throw new IllegalArgumentException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
+ throw new UTF8EncodingException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
}
return codePointCount;
}
@@ -333,7 +335,7 @@
codePointCount++;
prevHighSurrogate = false;
} else {
- throw new IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+ throw new UTF8EncodingException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
}
} else {
codePointCount++;
@@ -678,9 +680,9 @@
endIndex = startIndex;
int cursorIndex = startIndex;
while (cursorIndex < srcUtfLen) {
- int codePioint = srcPtr.codePointAt(srcStart + cursorIndex);
+ int codePoint = srcPtr.codePointAt(srcStart + cursorIndex);
cursorIndex += srcPtr.codePointSize(srcStart + cursorIndex);
- if (!codePointSet.contains(codePioint)) {
+ if (!codePointSet.contains(codePoint)) {
endIndex = cursorIndex;
}
}
@@ -739,9 +741,8 @@
cursorIndex--;
if (UTF8StringUtil.isCharStart(srcPtr.bytes, cursorIndex)) {
ch = UTF8StringUtil.charAt(srcPtr.bytes, cursorIndex);
- if (Character.isHighSurrogate(ch) == false) {
- throw new IllegalArgumentException(
- "Decoding Error: no corresponding high surrogate found for the following low surrogate");
+ if (!Character.isHighSurrogate(ch)) {
+ throw new UTF8EncodingException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
}
charSize += UTF8StringUtil.charSize(srcPtr.bytes, cursorIndex);
@@ -749,7 +750,7 @@
}
}
} else if (Character.isHighSurrogate(ch)) {
- throw new IllegalArgumentException("Decoding Error: get a high surrogate without low surrogate");
+ throw new UTF8EncodingException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
}
builder.appendUtf8StringPointable(srcPtr, cursorIndex, charSize);
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/exceptions/UTF8EncodingException.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/exceptions/UTF8EncodingException.java
new file mode 100644
index 0000000..3853a1f
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/exceptions/UTF8EncodingException.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hyracks.util.exceptions;
+
+public class UTF8EncodingException extends IllegalArgumentException { // unchecked; signals undecodable UTF-8
+ public UTF8EncodingException(String s) { // s is the decoding-error description used as the exception message
+ super(s);
+ }
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index 3eb8687..15638f9 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -28,6 +28,7 @@
import java.lang.ref.SoftReference;
import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder;
+import org.apache.hyracks.util.exceptions.UTF8EncodingException;
/**
* A helper package to operate the UTF8String in Hyracks.
@@ -35,6 +36,7 @@
*/
public class UTF8StringUtil {
+ public static final String MALFORMED_BYTES = "Decoding error: malformed bytes";
public static final String LOW_SURROGATE_WITHOUT_HIGH_SURROGATE =
"Decoding error: got a low surrogate without a leading high surrogate";
public static final String HIGH_SURROGATE_WITHOUT_LOW_SURROGATE =
@@ -101,7 +103,7 @@
if (Character.isLowSurrogate(c1)) {
// In this case, the index s doesn't point to a correct position
- throw new IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+ throw new UTF8EncodingException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
}
if (Character.isHighSurrogate(c1)) {
@@ -112,7 +114,7 @@
if (Character.isLowSurrogate(c2)) {
return Character.toCodePoint(c1, c2);
} else {
- throw new IllegalArgumentException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
+ throw new UTF8EncodingException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
}
}
@@ -124,7 +126,7 @@
int size1 = charSize(b, s);
if (Character.isLowSurrogate(c1)) {
- throw new IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+ throw new UTF8EncodingException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
}
if (Character.isHighSurrogate(c1)) {