[ASTERIXDB-2757] Properly handle include/exclude pattern translation
- user model changes: no
- storage format changes: no
- interface changes: no
Details:
- Handles properly handling the translation of a wildcard
pattern to a regex to avoid potential failure in the query.
- Added and updated some test cases.
Change-Id: I661c1fad9b2c2692fa231e48f00eb5cf9d79ad5e
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/7143
Tested-by: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Reviewed-by: Ali Alsuliman <ali.al.solaiman@gmail.com>
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.000.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.000.ddl.sqlpp
new file mode 100644
index 0000000..e4cf69b
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.000.ddl.sqlpp
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+drop dataverse test if exists;
+create dataverse test;
+use test;
+
+drop type test if exists;
+create type test as open {
+};
+
+drop dataset test if exists;
+create external dataset test(test) using S3 (
+("accessKeyId"="dummyAccessKey"),
+("secretAccessKey"="dummySecretKey"),
+("region"="us-west-2"),
+("serviceEndpoint"="http://localhost:8001"),
+("container"="include-exclude"),
+("definition"="data/mixed/"),
+("format"="csv"),
+("header"=false),
+("include"="*.[a-c][a-z][a-z**||\\\\&&--~~]")
+);
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.001.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.001.query.sqlpp
new file mode 100644
index 0000000..3306d5c
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.001.query.sqlpp
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use test;
+select count(*) as `count` from test;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.099.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.099.ddl.sqlpp
new file mode 100644
index 0000000..548e632
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-11/test.099.ddl.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+drop dataverse test if exists;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.000.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.000.ddl.sqlpp
new file mode 100644
index 0000000..47fbaef
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.000.ddl.sqlpp
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+ // This test case matches nothing for "include", but has extreme cases and complicated pattern, expected is to not fail
+
+drop dataverse test if exists;
+create dataverse test;
+use test;
+
+drop type test if exists;
+create type test as open {
+};
+
+drop dataset test if exists;
+create external dataset test(test) using S3 (
+("accessKeyId"="dummyAccessKey"),
+("secretAccessKey"="dummySecretKey"),
+("region"="us-west-2"),
+("serviceEndpoint"="http://localhost:8001"),
+("container"="include-exclude"),
+("definition"="data/mixed/"),
+("format"="csv"),
+("header"=false),
+("include"="[][!][^]]]]*[![*a-zA--&&^$||0-9B$\\*&&]*&&[^a-b||0--9][[[")
+);
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.001.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.001.query.sqlpp
new file mode 100644
index 0000000..3306d5c
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.001.query.sqlpp
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use test;
+select count(*) as `count` from test;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.099.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.099.ddl.sqlpp
new file mode 100644
index 0000000..548e632
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/aws/s3/include-exclude/include-12/test.099.ddl.sqlpp
@@ -0,0 +1,20 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+drop dataverse test if exists;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-10/result.001.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-10/result.001.adm
new file mode 100644
index 0000000..c1a0ea2
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-10/result.001.adm
@@ -0,0 +1 @@
+{ "count": 0 }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-11/result.001.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-11/result.001.adm
new file mode 100644
index 0000000..95204aa
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-11/result.001.adm
@@ -0,0 +1 @@
+{ "count": 6 }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-12/result.001.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-12/result.001.adm
new file mode 100644
index 0000000..c1a0ea2
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/aws/s3/include-exclude/include-12/result.001.adm
@@ -0,0 +1 @@
+{ "count": 0 }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml
index ad17afe..3a4f03e 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset.xml
@@ -232,7 +232,16 @@
<test-case FilePath="external-dataset">
<compilation-unit name="aws/s3/include-exclude/include-10">
<output-dir compare="Text">aws/s3/include-exclude/include-10</output-dir>
- <expected-error>Invalid pattern *[abc][.*</expected-error>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="external-dataset">
+ <compilation-unit name="aws/s3/include-exclude/include-11">
+ <output-dir compare="Text">aws/s3/include-exclude/include-11</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="external-dataset">
+ <compilation-unit name="aws/s3/include-exclude/include-12">
+ <output-dir compare="Text">aws/s3/include-exclude/include-12</output-dir>
</compilation-unit>
</test-case>
</test-group>
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java
index 67a7c93..f3a36ff 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/input/record/reader/aws/AwsS3InputStreamFactory.java
@@ -103,10 +103,10 @@
for (Map.Entry<String, String> entry : configuration.entrySet()) {
if (entry.getKey().startsWith(KEY_INCLUDE)) {
pattern = entry.getValue();
- includeMatchers.add(Pattern.compile(ExternalDataUtils.wildcardToRegex(pattern)).matcher(""));
+ includeMatchers.add(Pattern.compile(ExternalDataUtils.patternToRegex(pattern)).matcher(""));
} else if (entry.getKey().startsWith(KEY_EXCLUDE)) {
pattern = entry.getValue();
- excludeMatchers.add(Pattern.compile(ExternalDataUtils.wildcardToRegex(pattern)).matcher(""));
+ excludeMatchers.add(Pattern.compile(ExternalDataUtils.patternToRegex(pattern)).matcher(""));
}
}
} catch (PatternSyntaxException ex) {
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
index 1300ac3..fc31286 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/util/ExternalDataUtils.java
@@ -23,10 +23,12 @@
import static org.apache.asterix.external.util.ExternalDataConstants.KEY_QUOTE;
import static org.apache.asterix.external.util.ExternalDataConstants.KEY_RECORD_END;
import static org.apache.asterix.external.util.ExternalDataConstants.KEY_RECORD_START;
+import static org.apache.asterix.runtime.evaluators.functions.StringEvaluatorUtils.RESERVED_REGEX_CHARS;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
@@ -498,76 +500,91 @@
/**
* Converts the wildcard to proper regex
*
- * @param wildcard wildcard pattern to convert
+ * @param pattern wildcard pattern to convert
*
* @return regex expression
*/
- public static String wildcardToRegex(String wildcard) {
- StringBuilder builder = new StringBuilder(wildcard.length());
- builder.append('^');
+ public static String patternToRegex(String pattern) {
+ int charPosition = 0;
+ int patternLength = pattern.length();
+ StringBuilder stuffBuilder = new StringBuilder();
+ StringBuilder result = new StringBuilder();
- // This keeps an eye on the presence inside or outside a sequence, everything inside a sequence is a literal
- // e.g ("*" ===> ".*" while "[*]" ===> "[\*]"
- boolean outsideBracketSequence = true;
+ while (charPosition < patternLength) {
+ char c = pattern.charAt(charPosition);
+ charPosition++;
- for (int i = 0; i < wildcard.length(); i++) {
- char c = wildcard.charAt(i);
switch (c) {
case '*':
- builder.append(outsideBracketSequence ? "." : "\\").append(c);
+ result.append(".*");
break;
case '?':
- builder.append(outsideBracketSequence ? "." : "\\?");
+ result.append(".");
break;
case '[':
- if (outsideBracketSequence) {
- outsideBracketSequence = false;
- builder.append(c);
- if (i + 1 < wildcard.length()) {
- if (wildcard.charAt(i + 1) == '!') {
- i++;
- builder.append('^');
- }
+ int closingBracketPosition = charPosition;
+ if (closingBracketPosition < patternLength && pattern.charAt(closingBracketPosition) == '!') {
+ closingBracketPosition++;
+ }
+
+ // 2 cases can happen here:
+ // 1- Empty character class [] which is invalid for java, so treat ] as literal and find another
+ // closing bracket, if no closing bracket is found, the whole thing is a literal
+ // 2- Negated empty class [!] converted to [^] which is invalid for java, so treat ] as literal and
+ // find another closing bracket, if no closing bracket is found, the whole thing is a literal
+ if (closingBracketPosition < patternLength && pattern.charAt(closingBracketPosition) == ']') {
+ closingBracketPosition++;
+ }
+
+ // No [] and [!] cases, search for the closing bracket
+ while (closingBracketPosition < patternLength && pattern.charAt(closingBracketPosition) != ']') {
+ closingBracketPosition++;
+ }
+
+ // No closing bracket found (or [] or [!]), escape the opening bracket, treat it as literals
+ if (closingBracketPosition >= patternLength) {
+ result.append("\\[");
+ } else {
+ // Found closing bracket, get the stuff in between the found the character class ("[" and "]")
+ String stuff = pattern.substring(charPosition, closingBracketPosition);
+
+ stuffBuilder.setLength(0);
+ int stuffCharPos = 0;
+
+ // If first character in the character class is "!" then convert it to "^"
+ if (stuff.charAt(0) == '!') {
+ stuffBuilder.append('^');
+ stuffCharPos++; // ignore first character when escaping metacharacters next step
}
- } else {
- // escape the open bracket "[" if we are already inside a bracket sequence
- builder.append("\\").append(c);
+
+ for (; stuffCharPos < stuff.length(); stuffCharPos++) {
+ char stuffChar = stuff.charAt(stuffCharPos);
+ if (stuffChar != '-' && Arrays.binarySearch(RESERVED_REGEX_CHARS, stuffChar) >= 0) {
+ stuffBuilder.append("\\");
+ }
+ stuffBuilder.append(stuffChar);
+ }
+
+ String stuffEscaped = stuffBuilder.toString();
+
+ // Escape the set operations
+ stuffEscaped = stuffEscaped.replace("&&", "\\&\\&").replace("~~", "\\~\\~")
+ .replace("||", "\\|\\|").replace("--", "\\-\\-");
+
+ result.append("[").append(stuffEscaped).append("]");
+ charPosition = closingBracketPosition + 1;
}
break;
- case ']':
- if (outsideBracketSequence) {
- // escape if we are outside bracket sequence
- builder.append("\\").append(c);
- } else {
- // Inside bracket, close it and mark as outside bracket
- outsideBracketSequence = true;
- builder.append(c);
- }
- break;
- // escape special regexp-characters
- case '(':
- case ')':
- case '$':
- case '^':
- case '.':
- case '{':
- case '}':
- case '|':
- case '+':
- case '=':
- case '<':
- case '>':
- case '!':
- case '\\':
- builder.append("\\").append(c);
- break;
default:
- builder.append(c);
+ if (Arrays.binarySearch(RESERVED_REGEX_CHARS, c) >= 0) {
+ result.append("\\");
+ }
+ result.append(c);
break;
}
}
- builder.append('$');
- return builder.toString();
+
+ return result.toString();
}
public static class AwsS3 {
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringEvaluatorUtils.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringEvaluatorUtils.java
index 81ce832..492d64c 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringEvaluatorUtils.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringEvaluatorUtils.java
@@ -60,8 +60,8 @@
return destString;
}
- static final char[] RESERVED_REGEX_CHARS = new char[] { '\\', '(', ')', '[', ']', '{', '}', '.', '^', '$', '*', '|',
- '+', '?', '<', '>', '-', '=', '!' };
+ public static final char[] RESERVED_REGEX_CHARS = new char[] { '\\', '(', ')', '[', ']', '{', '}', '.', '^', '$',
+ '*', '|', '+', '?', '<', '>', '-', '=', '!' };
static {
Arrays.sort(RESERVED_REGEX_CHARS);