[ASTERIXDB-3321][FUN]: Return null and warn when string functions encounter an invalid Unicode sequence

Change-Id: I67c04de2144f740fd63e85ecbd4efded544db62c
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17986
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Murtadha Hubail <mhubail@apache.org>
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp
new file mode 100644
index 0000000..a533d7e
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/invalid-unicode/test.000.query.sqlpp
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// param max-warnings:json=1000
+// U+DEAD is an unpaired low surrogate; every call below is expected to yield null plus a warning.
+[
+ string_length("\uDEAD x \uDEAD"),
+ string_to_codepoint("\uDEAD x \uDEAD"),
+ trim("\uDEAD x \uDEAD"),
+ ltrim("\uDEAD x \uDEAD"),
+ rtrim("\uDEAD x \uDEAD"),
+ trim("\uDEAD x \uDEAD", "x"),
+ ltrim("\uDEAD x \uDEAD", "x"),
+ rtrim("\uDEAD x \uDEAD", "x"),
+ reverse("\uDEAD x \uDEAD"),
+ position("\uDEAD x \uDEAD", "x"),
+ position1("\uDEAD x \uDEAD", "x")
+];
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm
new file mode 100644
index 0000000..a9fc274
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/invalid-unicode/result.000.adm
@@ -0,0 +1 @@
+[ null, null, null, null, null, null, null, null, null, null, null ]
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
index dc31f7e..9a3abae 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
@@ -11341,6 +11341,22 @@
         <output-dir compare="Text">substring-after-5</output-dir>
       </compilation-unit>
     </test-case>
+    <test-case FilePath="string" check-warnings="true">
+      <compilation-unit name="invalid-unicode">
+        <output-dir compare="Text">invalid-unicode</output-dir>
+        <expected-warn>ASX0060: Function 'string-length' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'string-to-codepoint' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'trim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'trim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'rtrim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'rtrim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'ltrim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'ltrim' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'reverse' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'position' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+        <expected-warn>ASX0060: Function 'position1' failed to evaluate because: Decoding error: got a low surrogate without a leading high surrogate</expected-warn>
+      </compilation-unit>
+    </test-case>
   </test-group>
   <test-group name="subquery">
     <test-case FilePath="subquery">
diff --git a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
index b0826e8..4910343 100644
--- a/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
+++ b/asterixdb/asterix-common/src/main/java/org/apache/asterix/common/exceptions/ErrorCode.java
@@ -87,6 +87,7 @@
     PARQUET_CONTAINS_OVERFLOWED_BIGINT(57),
     UNEXPECTED_ERROR_ENCOUNTERED(58),
     INVALID_PARQUET_FILE(59),
+    FUNCTION_EVALUATION_FAILED(60),
 
     UNSUPPORTED_JRE(100),
 
diff --git a/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties b/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties
index 0bf523a..074245c 100644
--- a/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties
+++ b/asterixdb/asterix-common/src/main/resources/asx_errormsg/en.properties
@@ -94,6 +94,7 @@
 57 = Parquet file(s) contain unsigned integer that is larger than the '%1$s' range
 58 = Error encountered: %1$s
 59 = Invalid Parquet file: %1$s. Reason: %2$s
+60 = Function '%1$s' failed to evaluate because: %2$s
 
 100 = Unsupported JRE: %1$s
 
diff --git a/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java b/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java
index ccb3a8d..928a6b5 100644
--- a/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java
+++ b/asterixdb/asterix-om/src/main/java/org/apache/asterix/om/exceptions/ExceptionUtil.java
@@ -143,6 +143,14 @@
         warnInvalidValue(ctx, srcLoc, fid, argIdx, argValue, ErrorCode.NEGATIVE_VALUE);
     }
 
+    public static void warnStringFunctionFailed(IEvaluatorContext ctx, SourceLocation srcLoc, FunctionIdentifier fid,
+            String errMsg) {
+        IWarningCollector warningCollector = ctx.getWarningCollector();
+        if (warningCollector.shouldWarn()) { // construct the Warning only when shouldWarn() permits it
+            warningCollector.warn(Warning.of(srcLoc, ErrorCode.FUNCTION_EVALUATION_FAILED, fid.getName(), errMsg));
+        }
+    }
+
     private static void warnInvalidValue(IEvaluatorContext ctx, SourceLocation srcLoc, FunctionIdentifier fid,
             int argIdx, double argValue, ErrorCode errorCode) {
         IWarningCollector warningCollector = ctx.getWarningCollector();
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java
index 2fc8654..9de704b 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractBinaryStringEval.java
@@ -35,6 +35,7 @@
 import org.apache.hyracks.data.std.primitive.VoidPointable;
 import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
 import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import org.apache.hyracks.util.exceptions.UTF8EncodingException;
 
 public abstract class AbstractBinaryStringEval implements IScalarEvaluator {
 
@@ -106,6 +107,9 @@
         // The actual processing.
         try {
             process(leftStringPointable, rightStringPointable, resultPointable);
+        } catch (UTF8EncodingException ex) {
+            PointableHelper.setNull(resultPointable);
+            ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, funcID, ex.getMessage());
         } catch (IOException e) {
             throw HyracksDataException.create(e);
         }
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java
index 5efe529..7a60aae 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/AbstractUnaryStringStringEval.java
@@ -37,6 +37,7 @@
 import org.apache.hyracks.data.std.util.GrowableArray;
 import org.apache.hyracks.data.std.util.UTF8StringBuilder;
 import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import org.apache.hyracks.util.exceptions.UTF8EncodingException;
 
 abstract class AbstractUnaryStringStringEval implements IScalarEvaluator {
 
@@ -84,6 +85,9 @@
         try {
             process(stringPtr, resultPointable);
             writeResult(resultPointable);
+        } catch (UTF8EncodingException ex) {
+            PointableHelper.setNull(resultPointable);
+            ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, funcID, ex.getMessage());
         } catch (IOException e) {
             throw HyracksDataException.create(e);
         }
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
index 47caf14..d9c9ecb 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLengthDescriptor.java
@@ -41,6 +41,7 @@
 import org.apache.hyracks.data.std.primitive.VoidPointable;
 import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
 import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import org.apache.hyracks.util.exceptions.UTF8EncodingException;
 import org.apache.hyracks.util.string.UTF8StringUtil;
 
 @MissingNullInOutFunction
@@ -89,6 +90,9 @@
                             result.setValue(len);
                             int64Serde.serialize(result, out);
                             resultPointable.set(resultStorage);
+                        } catch (UTF8EncodingException ex) {
+                            PointableHelper.setNull(resultPointable);
+                            ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, getIdentifier(), ex.getMessage());
                         } catch (IOException e1) {
                             throw HyracksDataException.create(e1);
                         }
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java
index 2f6a223..d4f5368 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringToCodePointDescriptor.java
@@ -43,6 +43,7 @@
 import org.apache.hyracks.data.std.primitive.VoidPointable;
 import org.apache.hyracks.data.std.util.ArrayBackedValueStorage;
 import org.apache.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
+import org.apache.hyracks.util.exceptions.UTF8EncodingException;
 import org.apache.hyracks.util.string.UTF8StringUtil;
 
 @MissingNullInOutFunction
@@ -109,6 +110,9 @@
                             }
                             listBuilder.write(out, true);
                             result.set(resultStorage);
+                        } catch (UTF8EncodingException ex) {
+                            PointableHelper.setNull(result);
+                            ExceptionUtil.warnStringFunctionFailed(ctx, sourceLoc, getIdentifier(), ex.getMessage());
                         } catch (IOException e1) {
                             throw HyracksDataException.create(e1);
                         }
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
index 49f6221..4acc823 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
@@ -20,6 +20,7 @@
 
 import static org.apache.hyracks.util.string.UTF8StringUtil.HIGH_SURROGATE_WITHOUT_LOW_SURROGATE;
 import static org.apache.hyracks.util.string.UTF8StringUtil.LOW_SURROGATE_WITHOUT_HIGH_SURROGATE;
+import static org.apache.hyracks.util.string.UTF8StringUtil.MALFORMED_BYTES;
 
 import java.io.IOException;
 import java.nio.charset.Charset;
@@ -35,6 +36,7 @@
 import org.apache.hyracks.data.std.api.IPointableFactory;
 import org.apache.hyracks.data.std.util.GrowableArray;
 import org.apache.hyracks.data.std.util.UTF8StringBuilder;
+import org.apache.hyracks.util.exceptions.UTF8EncodingException;
 import org.apache.hyracks.util.string.UTF8StringUtil;
 
 import com.fasterxml.jackson.databind.JsonNode;
@@ -136,7 +138,7 @@
         }
 
         if (byteIdx != utf8Length) {
-            throw new IllegalArgumentException("Decoding error: malformed bytes");
+            throw new UTF8EncodingException(MALFORMED_BYTES);
         }
     }
 
@@ -317,7 +319,7 @@
                     return startMatchPos;
                 } else {
                     if (prevHighSurrogate) {
-                        throw new IllegalArgumentException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
+                        throw new UTF8EncodingException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
                     }
                     return codePointCount;
                 }
@@ -333,7 +335,7 @@
                         codePointCount++;
                         prevHighSurrogate = false;
                     } else {
-                        throw new IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+                        throw new UTF8EncodingException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
                     }
                 } else {
                     codePointCount++;
@@ -678,9 +680,9 @@
             endIndex = startIndex;
             int cursorIndex = startIndex;
             while (cursorIndex < srcUtfLen) {
-                int codePioint = srcPtr.codePointAt(srcStart + cursorIndex);
+                int codePoint = srcPtr.codePointAt(srcStart + cursorIndex);
                 cursorIndex += srcPtr.codePointSize(srcStart + cursorIndex);
-                if (!codePointSet.contains(codePioint)) {
+                if (!codePointSet.contains(codePoint)) {
                     endIndex = cursorIndex;
                 }
             }
@@ -739,9 +741,8 @@
                         cursorIndex--;
                         if (UTF8StringUtil.isCharStart(srcPtr.bytes, cursorIndex)) {
                             ch = UTF8StringUtil.charAt(srcPtr.bytes, cursorIndex);
-                            if (Character.isHighSurrogate(ch) == false) {
-                                throw new IllegalArgumentException(
-                                        "Decoding Error: no corresponding high surrogate found for the following low surrogate");
+                            if (!Character.isHighSurrogate(ch)) {
+                                throw new UTF8EncodingException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
                             }
 
                             charSize += UTF8StringUtil.charSize(srcPtr.bytes, cursorIndex);
@@ -749,7 +750,7 @@
                         }
                     }
                 } else if (Character.isHighSurrogate(ch)) {
-                    throw new IllegalArgumentException("Decoding Error: get a high surrogate without low surrogate");
+                    throw new UTF8EncodingException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
                 }
 
                 builder.appendUtf8StringPointable(srcPtr, cursorIndex, charSize);
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/exceptions/UTF8EncodingException.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/exceptions/UTF8EncodingException.java
new file mode 100644
index 0000000..3853a1f
--- /dev/null
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/exceptions/UTF8EncodingException.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hyracks.util.exceptions;
+
+/**
+ * Thrown when string data cannot be decoded as well-formed UTF-8/UTF-16, e.g. malformed bytes or an
+ * unpaired surrogate. It is a subclass of {@link IllegalArgumentException}, so code that already catches
+ * that type keeps catching this one.
+ */
+public class UTF8EncodingException extends IllegalArgumentException {
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * @param s the detail message describing the decoding failure
+     */
+    public UTF8EncodingException(String s) {
+        super(s);
+    }
+}
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index 3eb8687..15638f9 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -28,6 +28,7 @@
 import java.lang.ref.SoftReference;
 
 import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder;
+import org.apache.hyracks.util.exceptions.UTF8EncodingException;
 
 /**
  * A helper package to operate the UTF8String in Hyracks.
@@ -35,6 +36,7 @@
  */
 public class UTF8StringUtil {
 
+    public static final String MALFORMED_BYTES = "Decoding error: malformed bytes";
     public static final String LOW_SURROGATE_WITHOUT_HIGH_SURROGATE =
             "Decoding error: got a low surrogate without a leading high surrogate";
     public static final String HIGH_SURROGATE_WITHOUT_LOW_SURROGATE =
@@ -101,7 +103,7 @@
 
         if (Character.isLowSurrogate(c1)) {
             // In this case, the index s doesn't point to a correct position
-            throw new IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+            throw new UTF8EncodingException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
         }
 
         if (Character.isHighSurrogate(c1)) {
@@ -112,7 +114,7 @@
             if (Character.isLowSurrogate(c2)) {
                 return Character.toCodePoint(c1, c2);
             } else {
-                throw new IllegalArgumentException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
+                throw new UTF8EncodingException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
             }
         }
 
@@ -124,7 +126,7 @@
         int size1 = charSize(b, s);
 
         if (Character.isLowSurrogate(c1)) {
-            throw new IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
+            throw new UTF8EncodingException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
         }
 
         if (Character.isHighSurrogate(c1)) {