[ASTERIXDB-3392] Add type hierarchy in schema inference for copy to parquet
Details:
Implemented type hierarchy: double > float > int64 > int32. Schema inference will now assign the largest type encountered for a given field.
Ext-ref: MB-65895
Change-Id: Iabc5e2d8e68bf17f4c080ddaf6cc41141a0c3345
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/19559
Integration-Tests: Jenkins <jenkins@fulliautomatix.ics.uci.edu>
Tested-by: Ali Alsuliman <ali.al.solaiman@gmail.com>
Reviewed-by: Ali Alsuliman <ali.al.solaiman@gmail.com>
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.01.ddl.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.01.ddl.sqlpp
new file mode 100644
index 0000000..6ab6331
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.01.ddl.sqlpp
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+DROP DATAVERSE test if exists;
+CREATE DATAVERSE test;
+USE test;
+
+CREATE TYPE ColumnType1 AS {
+ id: int
+ };
+
+
+CREATE COLLECTION TestCollection(ColumnType1) PRIMARY KEY id;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.02.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.02.update.sqlpp
new file mode 100644
index 0000000..9a6f3c4
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.02.update.sqlpp
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+use test;
+
+
+insert into TestCollection({"id":2});
+insert into TestCollection({"id":5,"rating" :1 , "ratings" : [] });
+insert into TestCollection({"id":8,"rating" :2 , "ratings" : [ 1 ] });
+insert into TestCollection({"id":10,"rating" :3.0 , "ratings" : [ 1, 2, 3] });
+insert into TestCollection({"id":12,"rating" :4.3 , "ratings" : [1,2,3.0,4,5]});
+insert into TestCollection({"id":15,"rating" :4.7 , "ratings" : [1.0,2.0,3.0,4.0,5.0]});
+insert into TestCollection({"id":17,"rating" :4.22222 , "ratings" : [1.1111,2.222222,3.3333,4.44444,5.555555]});
+insert into TestCollection({"id":20,"rating" :5.455555555 , "ratings" : [1,2,3,4,5]});
+insert into TestCollection({"id":21,"rating" :1 , "ratings" : [0.0,6.7]});
+insert into TestCollection({"id":27,"rating" :8 , "ratings" : [1]});
+insert into TestCollection({"id":28,"rating" :3 , "ratings" : []});
+
+
+
+
+
+
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.03.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.03.update.sqlpp
new file mode 100644
index 0000000..b4a7b18
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.03.update.sqlpp
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+COPY (
+select c.* from TestCollection c
+ ) toWriter
+TO %adapter%
+PATH (%pathprefix% "copy-to-result", "parquet-type-hierarchy")
+WITH {
+ %template_colons%,
+ %additionalProperties%
+ "format":"parquet",
+ "max-schemas" : "1"
+ };
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.04.update.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.04.update.sqlpp
new file mode 100644
index 0000000..dad1090
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.04.update.sqlpp
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+
+CREATE TYPE ColumnType2 AS {
+ };
+
+
+
+CREATE EXTERNAL DATASET TestDataset(ColumnType2) USING %adapter%
+(
+ %template%,
+ %additional_Properties%,
+ ("definition"="%path_prefix%copy-to-result/parquet-type-hierarchy/"),
+ ("include"="*.parquet"),
+ ("requireVersionChangeDetection"="false"),
+ ("format" = "parquet")
+);
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.query.sqlpp
new file mode 100644
index 0000000..3d26c18
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.query.sqlpp
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+USE test;
+
+
+SELECT c.*
+FROM TestDataset c
+ORDER BY c.id;
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.adm
new file mode 100644
index 0000000..5c4c334
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/copy-to/parquet-type-hierarchy/parquet-type-hierarchy.05.adm
@@ -0,0 +1,11 @@
+{ "id": 2 }
+{ "ratings": [ ], "rating": 1.0, "id": 5 }
+{ "ratings": [ 1 ], "rating": 2, "id": 8 }
+{ "ratings": [ 1, 2, 3 ], "rating": 3, "id": 10 }
+{ "ratings": [ 1.0, 2.0, 3.0, 4.0, 5.0 ], "rating": 4.3, "id": 12 }
+{ "ratings": [ 1.0, 2.0, 3.0, 4.0, 5.0 ], "rating": 4.7, "id": 15 }
+{ "ratings": [ 1.1111, 2.222222, 3.3333, 4.44444, 5.555555 ], "rating": 4.22222, "id": 17 }
+{ "ratings": [ 1.0, 2.0, 3.0, 4.0, 5.0 ], "rating": 5.455555555, "id": 20 }
+{ "ratings": [ 0.0, 6.7 ], "rating": 1.0, "id": 21 }
+{ "ratings": [ 1.0 ], "rating": 8.0, "id": 27 }
+{ "ratings": [ ], "rating": 3, "id": 28 }
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml
index e925dc6..970ccdd 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_external_dataset_s3.xml
@@ -145,6 +145,16 @@
</compilation-unit>
</test-case>
<test-case FilePath="copy-to">
+ <compilation-unit name="parquet-type-hierarchy">
+ <placeholder name="adapter" value="S3" />
+ <placeholder name="pathprefix" value="" />
+ <placeholder name="path_prefix" value="" />
+ <placeholder name="additionalProperties" value='"container":"playground",' />
+ <placeholder name="additional_Properties" value='("container"="playground")' />
+ <output-dir compare="Text">parquet-type-hierarchy</output-dir>
+ </compilation-unit>
+ </test-case>
+ <test-case FilePath="copy-to">
<compilation-unit name="empty-path">
<output-dir compare="Text">empty-path</output-dir>
</compilation-unit>
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp_hdfs.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp_hdfs.xml
index 36087a5..c163fd2 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp_hdfs.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp_hdfs.xml
@@ -259,6 +259,10 @@
<expected-error>HYR0133: Schema could not be inferred, empty types found in the result</expected-error>
<expected-error>HYR0134: Schema Limit exceeded, maximum number of heterogeneous schemas allowed : '2'</expected-error>
<expected-error>ASX1204: 'rectangle' type not supported in parquet format</expected-error>
+ <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error>
+ <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error>
+ <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error>
+ <expected-error>ASX0072: Parquet does not support arrays containing mixed data types</expected-error>
<source-location>false</source-location>
</compilation-unit>
</test-case>
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/AsterixParquetTypeMap.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/AsterixParquetTypeMap.java
index 1eb8e84..6bdc586 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/AsterixParquetTypeMap.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/AsterixParquetTypeMap.java
@@ -26,6 +26,14 @@
public class AsterixParquetTypeMap {
+ public static final Map<ATypeTag, Integer> HIERARCHIAL_TYPES = Map.ofEntries(Map.entry(ATypeTag.TINYINT, 1),
+ Map.entry(ATypeTag.SMALLINT, 1), Map.entry(ATypeTag.INTEGER, 1), Map.entry(ATypeTag.BIGINT, 2),
+ Map.entry(ATypeTag.FLOAT, 3), Map.entry(ATypeTag.DOUBLE, 4));
+
+ public static ATypeTag maxHierarchicalType(ATypeTag a, ATypeTag b) {
+ return HIERARCHIAL_TYPES.get(a) > HIERARCHIAL_TYPES.get(b) ? a : b;
+ }
+
public static final Map<ATypeTag, PrimitiveType.PrimitiveTypeName> PRIMITIVE_TYPE_NAME_MAP =
Map.ofEntries(Map.entry(ATypeTag.BOOLEAN, PrimitiveType.PrimitiveTypeName.BOOLEAN),
Map.entry(ATypeTag.STRING, PrimitiveType.PrimitiveTypeName.BINARY),
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java
index 9ea6d77..bd869e9 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaLazyVisitor.java
@@ -110,21 +110,20 @@
if (!AsterixParquetTypeMap.PRIMITIVE_TYPE_NAME_MAP.containsKey(pointable.getTypeTag())) {
throw RuntimeDataException.create(TYPE_UNSUPPORTED_PARQUET_WRITE, pointable.getTypeTag());
}
- schemaNode.setType(new ParquetSchemaTree.FlatType(
- AsterixParquetTypeMap.PRIMITIVE_TYPE_NAME_MAP.get(pointable.getTypeTag()),
- AsterixParquetTypeMap.LOGICAL_TYPE_ANNOTATION_MAP.get(pointable.getTypeTag())));
+ schemaNode.setType(new ParquetSchemaTree.FlatType(pointable.getTypeTag()));
return null;
}
if (!(schemaNode.getType() instanceof ParquetSchemaTree.FlatType)) {
throw RuntimeDataException.create(PARQUET_UNSUPPORTED_MIXED_TYPE_ARRAY);
}
ParquetSchemaTree.FlatType flatType = (ParquetSchemaTree.FlatType) schemaNode.getType();
- if (!(flatType.getPrimitiveTypeName() == AsterixParquetTypeMap.PRIMITIVE_TYPE_NAME_MAP
- .get(pointable.getTypeTag()))
- || !(flatType.getLogicalTypeAnnotation() == AsterixParquetTypeMap.LOGICAL_TYPE_ANNOTATION_MAP
- .get(pointable.getTypeTag()))) {
+
+ if (!flatType.isCompatibleWith(pointable.getTypeTag())) {
throw RuntimeDataException.create(PARQUET_UNSUPPORTED_MIXED_TYPE_ARRAY);
}
+
+ flatType.coalesce(pointable.getTypeTag());
+
return null;
}
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java
index ff512df..2ca086f 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/ParquetSchemaTree.java
@@ -25,6 +25,7 @@
import java.util.HashMap;
import java.util.Map;
+import org.apache.asterix.om.types.ATypeTag;
import org.apache.hyracks.api.exceptions.ErrorCode;
import org.apache.hyracks.api.exceptions.HyracksDataException;
import org.apache.parquet.schema.LogicalTypeAnnotation;
@@ -73,21 +74,35 @@
}
static class FlatType extends AbstractType {
- private final PrimitiveType.PrimitiveTypeName primitiveTypeName;
- private final LogicalTypeAnnotation logicalTypeAnnotation;
+ private ATypeTag typeTag;
+ private boolean isHierarchical;
- public FlatType(PrimitiveType.PrimitiveTypeName primitiveTypeName,
- LogicalTypeAnnotation logicalTypeAnnotation) {
- this.primitiveTypeName = primitiveTypeName;
- this.logicalTypeAnnotation = logicalTypeAnnotation;
+ public FlatType(ATypeTag typeTag) {
+ this.typeTag = typeTag;
+ isHierarchical = AsterixParquetTypeMap.HIERARCHIAL_TYPES.containsKey(typeTag);
+ }
+
+ public boolean isCompatibleWith(ATypeTag typeTag) {
+ if (isHierarchical) {
+ return AsterixParquetTypeMap.HIERARCHIAL_TYPES.containsKey(typeTag);
+ } else {
+ return this.typeTag == typeTag;
+ }
+ }
+
+ public void coalesce(ATypeTag typeTag) {
+ if (!isCompatibleWith(typeTag) || !isHierarchical) {
+ return;
+ }
+ this.typeTag = AsterixParquetTypeMap.maxHierarchicalType(this.typeTag, typeTag);
}
public LogicalTypeAnnotation getLogicalTypeAnnotation() {
- return logicalTypeAnnotation;
+ return AsterixParquetTypeMap.LOGICAL_TYPE_ANNOTATION_MAP.get(typeTag);
}
public PrimitiveType.PrimitiveTypeName getPrimitiveTypeName() {
- return primitiveTypeName;
+ return AsterixParquetTypeMap.PRIMITIVE_TYPE_NAME_MAP.get(typeTag);
}
}
diff --git a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java
index fc43c89..28d4247 100644
--- a/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java
+++ b/asterixdb/asterix-external-data/src/main/java/org/apache/asterix/external/writer/printer/parquet/SchemaCheckerLazyVisitor.java
@@ -115,9 +115,7 @@
ParquetSchemaTree.FlatType flatType = (ParquetSchemaTree.FlatType) schemaNode.getType();
- if (flatType.getPrimitiveTypeName() != AsterixParquetTypeMap.PRIMITIVE_TYPE_NAME_MAP.get(pointable.getTypeTag())
- || !(flatType.getLogicalTypeAnnotation() == AsterixParquetTypeMap.LOGICAL_TYPE_ANNOTATION_MAP
- .get(pointable.getTypeTag()))) {
+ if (!flatType.isCompatibleWith(pointable.getTypeTag())) {
return ISchemaChecker.SchemaComparisonType.CONFLICTING;
}