[ASTERIXDB-3540][COMP] Fixed calculation of expected schema for pushdown
- user model changes: no
- storage format changes: no
- interface changes: no
Details:
If the getField expression contained a function that needs to be
evaluated at runtime, the pushdown computer was not evaluating
those expressions, leading to an incorrectly computed expected schema.
e.g.:
1. `field-access-by-name`(t.r.p, x.y.age_field)
2. `field-access-by-name`(t.r.p, substring(x.y.age_field, 0, 4))
Ext-ref: MB-64730
Change-Id: Iac55527af143c292557158ca8e47e92538e93970
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/19288
Reviewed-by: Murtadha Hubail <mhubail@apache.org>
Tested-by: Murtadha Hubail <mhubail@apache.org>
Integration-Tests: Murtadha Hubail <mhubail@apache.org>
diff --git a/asterixdb/NOTICE b/asterixdb/NOTICE
index 06d538d..5118782 100644
--- a/asterixdb/NOTICE
+++ b/asterixdb/NOTICE
@@ -1,5 +1,5 @@
Apache AsterixDB
-Copyright 2015-2024 The Apache Software Foundation
+Copyright 2015-2025 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
diff --git a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/pushdown/ExpectedSchemaBuilder.java b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/pushdown/ExpectedSchemaBuilder.java
index b7632db..a9937d1 100644
--- a/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/pushdown/ExpectedSchemaBuilder.java
+++ b/asterixdb/asterix-algebra/src/main/java/org/apache/asterix/optimizer/rules/pushdown/ExpectedSchemaBuilder.java
@@ -22,6 +22,7 @@
import static org.apache.asterix.optimizer.rules.pushdown.ExpressionValueAccessPushdownVisitor.SUPPORTED_FUNCTIONS;
import java.util.HashMap;
+import java.util.List;
import java.util.Map;
import org.apache.asterix.om.functions.BuiltinFunctions;
@@ -37,6 +38,7 @@
import org.apache.asterix.optimizer.rules.pushdown.schema.UnionExpectedSchemaNode;
import org.apache.asterix.runtime.projection.DataProjectionInfo;
import org.apache.asterix.runtime.projection.FunctionCallInformation;
+import org.apache.commons.lang3.mutable.Mutable;
import org.apache.hyracks.algebricks.core.algebra.base.ILogicalExpression;
import org.apache.hyracks.algebricks.core.algebra.base.LogicalExpressionTag;
import org.apache.hyracks.algebricks.core.algebra.base.LogicalVariable;
@@ -72,6 +74,10 @@
}
public boolean setSchemaFromExpression(AbstractFunctionCallExpression expr, LogicalVariable producedVar) {
+ return buildExpectedSchemaNodes(expr, producedVar);
+ }
+
+ public boolean setSchemaFromCalculatedExpression(AbstractFunctionCallExpression expr, LogicalVariable producedVar) {
//Parent always nested
AbstractComplexExpectedSchemaNode parent = (AbstractComplexExpectedSchemaNode) buildNestedNode(expr);
if (parent != null) {
@@ -111,6 +117,67 @@
return !varToNode.isEmpty();
}
+ private boolean buildExpectedSchemaNodes(ILogicalExpression expr, LogicalVariable producedVar) {
+ return buildNestedNodes(expr, producedVar);
+ }
+
+ private boolean buildNestedNodes(ILogicalExpression expr, LogicalVariable producedVar) {
+ // Anything other than a function call contributes nothing and reports "no change"
+ boolean changed = false;
+ if (expr.getExpressionTag() != LogicalExpressionTag.FUNCTION_CALL) {
+ return false;
+ }
+ AbstractFunctionCallExpression myExpr = (AbstractFunctionCallExpression) expr;
+ if (!SUPPORTED_FUNCTIONS.contains(myExpr.getFunctionIdentifier()) || noArgsOrFirstArgIsConstant(myExpr)) {
+ // Unsupported function, no arguments, or a constant first argument: only recurse into the arguments
+ for (Mutable<ILogicalExpression> arg : myExpr.getArguments()) {
+ changed |= buildNestedNodes(arg.getValue(), producedVar);
+ }
+ return changed;
+ }
+ // From here on, myExpr is a supported function whose first argument is not a constant.
+ if (BuiltinFunctions.ARRAY_STAR.equals(myExpr.getFunctionIdentifier())
+ || BuiltinFunctions.SCAN_COLLECTION.equals(myExpr.getFunctionIdentifier())) {
+ // these supported functions won't have a second child
+ IExpectedSchemaNode expectedSchemaNode = buildNestedNode(expr);
+ if (expectedSchemaNode != null) {
+ changed |= setSchemaFromCalculatedExpression((AbstractFunctionCallExpression) expr, producedVar);
+ }
+ } else {
+ ILogicalExpression childExpr = myExpr.getArguments().get(1).getValue();
+ if (childExpr.getExpressionTag() != LogicalExpressionTag.FUNCTION_CALL) {
+ // the second argument must be a variable or a constant
+ IExpectedSchemaNode expectedSchemaNode = buildNestedNode(expr);
+ if (expectedSchemaNode != null) {
+ changed |= setSchemaFromCalculatedExpression((AbstractFunctionCallExpression) expr, producedVar);
+ }
+ } else {
+ // The childExpr is a function call.
+ // If the function could have been evaluated at compile time, it would already
+ // have been evaluated by this stage of compilation.
+ // e.g.: field-access(t.r.p, substring("name",2,4))
+ // would have been evaluated to field-access(t.r.p, "me") at compile time.
+ // Since execution reached this branch, the childExpr
+ // needs to be evaluated at runtime, hence the childExpr must also be checked
+ // for possible pushdown.
+ // e.g.: field-access(t.r.p, substring(x.y.age_field, 0, 4))
+ ILogicalExpression parentExpr = myExpr.getArguments().get(0).getValue();
+ IExpectedSchemaNode parentExpectedNode = buildNestedNode(parentExpr);
+ if (parentExpectedNode != null) {
+ changed |=
+ setSchemaFromCalculatedExpression((AbstractFunctionCallExpression) parentExpr, producedVar);
+ }
+ changed |= buildNestedNodes(childExpr, producedVar);
+ }
+ }
+ return changed;
+ }
+
+ private boolean noArgsOrFirstArgIsConstant(AbstractFunctionCallExpression myExpr) {
+ List<Mutable<ILogicalExpression>> args = myExpr.getArguments();
+ return args.isEmpty() || args.get(0).getValue().getExpressionTag() == LogicalExpressionTag.CONSTANT;
+ }
+
private IExpectedSchemaNode buildNestedNode(ILogicalExpression expr) {
//The current node expression
AbstractFunctionCallExpression myExpr = (AbstractFunctionCallExpression) expr;
diff --git a/asterixdb/asterix-app/data/hdfs/parquet/friends.json b/asterixdb/asterix-app/data/hdfs/parquet/friends.json
new file mode 100644
index 0000000..d708ad9
--- /dev/null
+++ b/asterixdb/asterix-app/data/hdfs/parquet/friends.json
@@ -0,0 +1 @@
+{ "id": "1", "name": "Monica", "x": { "y": { "age_field": "age" } }, "t": { "r": { "p": { "age": "26" } } } }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java
index 316d261..7963132 100644
--- a/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java
+++ b/asterixdb/asterix-app/src/test/java/org/apache/asterix/test/external_dataset/ExternalDatasetTestUtils.java
@@ -272,6 +272,7 @@
loadData(generatedDataBasePath, "", "heterogeneous_1.parquet", definition, definitionSegment, false, false);
loadData(generatedDataBasePath, "", "heterogeneous_2.parquet", definition, definitionSegment, false, false);
loadData(generatedDataBasePath, "", "parquetTypes.parquet", definition, definitionSegment, false, false);
+ loadData(generatedDataBasePath, "", "friends.parquet", definition, definitionSegment, false, false);
}
private static void loadData(String fileBasePath, String filePathSegment, String filename, String definition,
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.01.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.01.ddl.sqlpp
new file mode 100644
index 0000000..a601a8d
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.01.ddl.sqlpp
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/*
+* Description : Field access pushdown
+* Expected Res : Success
+* Date : June 22nd 2020
+*/
+
+DROP DATAVERSE test IF EXISTS;
+CREATE DATAVERSE test;
+
+USE test;
+
+
+CREATE TYPE ParquetType as {
+};
+
+CREATE EXTERNAL DATASET ParquetDataset(ParquetType) USING %adapter%
+(
+ %template%,
+ ("container"="playground"),
+ ("definition"="parquet-data/reviews"),
+ ("include"="*friends.parquet"),
+ ("format" = "parquet")
+);
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.02.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.02.query.sqlpp
new file mode 100644
index 0000000..e72d412
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.02.query.sqlpp
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+SET `compiler.external.field.pushdown` "true";
+
+EXPLAIN
+SELECT t.r.g, `field-access-by-name`(t.r.p, x.y.age_field)
+FROM ParquetDataset;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.03.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.03.query.sqlpp
new file mode 100644
index 0000000..d15ba8d
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.03.query.sqlpp
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+SET `compiler.external.field.pushdown` "true";
+
+SELECT t.r.g, `field-access-by-name`(t.r.p, x.y.age_field)
+FROM ParquetDataset;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.02.plan b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.02.plan
new file mode 100644
index 0000000..4806a28
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.02.plan
@@ -0,0 +1 @@
+"distribute result [$$24] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]\n-- DISTRIBUTE_RESULT |PARTITIONED|\n exchange [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]\n -- ONE_TO_ONE_EXCHANGE |PARTITIONED|\n project ([$$24]) [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]\n -- STREAM_PROJECT |PARTITIONED|\n assign [$$24] <- [{\"g\": $$25.getField(\"g\"), \"$1\": $$25.getField(\"p\").getField(\"$$ParquetDataset.getField(\"x\").getField(\"y\").getField(\"age_field\")\")}] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]\n -- ASSIGN |PARTITIONED|\n assign [$$25] <- [$$ParquetDataset.getField(\"t\").getField(\"r\")] [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]\n -- ASSIGN |PARTITIONED|\n exchange [cardinality: 1000000.0, op-cost: 0.0, total-cost: 1000000.0]\n -- ONE_TO_ONE_EXCHANGE |PARTITIONED|\n data-scan []<-[$$ParquetDataset] <- test.ParquetDataset project ({t:{r:{p:any,g:any}},x:{y:{age_field:any}}}) [cardinality: 1000000.0, op-cost: 1000000.0, total-cost: 1000000.0]\n -- DATASOURCE_SCAN |PARTITIONED|\n exchange [cardinality: 0.0, op-cost: 0.0, total-cost: 0.0]\n -- ONE_TO_ONE_EXCHANGE |PARTITIONED|\n empty-tuple-source [cardinality: 0.0, op-cost: 0.0, total-cost: 0.0]\n -- EMPTY_TUPLE_SOURCE |PARTITIONED|\n"
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.03.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.03.adm
new file mode 100644
index 0000000..2246335
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/external-dataset/common/parquet/ASTERIXDB-3540/ASTERIXDB-3540.03.adm
@@ -0,0 +1 @@
+{ "$1": "26" }
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml
index 7242984..723c118 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml
@@ -102,6 +102,12 @@
</compilation-unit>
</test-case>
<test-case FilePath="external-dataset">
+ <compilation-unit name="common/parquet/ASTERIXDB-3540">
+ <placeholder name="adapter" value="S3" />
+ <output-dir compare="Clean-JSON">common/parquet/ASTERIXDB-3540</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="external-dataset">
<compilation-unit name="common/parquet/array-access-pushdown">
<placeholder name="adapter" value="S3" />
<output-dir compare="Text">common/parquet/array-access-pushdown</output-dir>
diff --git a/hyracks-fullstack/NOTICE b/hyracks-fullstack/NOTICE
index e9bb9a4..722db88 100644
--- a/hyracks-fullstack/NOTICE
+++ b/hyracks-fullstack/NOTICE
@@ -1,5 +1,5 @@
Apache Hyracks and Algebricks
-Copyright 2015-2024 The Apache Software Foundation
+Copyright 2015-2025 The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).