diff --git a/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/DefaultMetadataPolicy.java b/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/DefaultMetadataPolicy.java new file mode 100644 index 000000000000..c5abfc1bef28 --- /dev/null +++ b/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/DefaultMetadataPolicy.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.iceberg.hive; + +/** + * Controls which Iceberg schema default metadata is embedded when converting Hive types. + * + *

ORC tables use {@link #WRITE_DEFAULT} because initial-default is applied via + * {@code updateColumnDefault} at the schema-update layer. Other file formats use + * {@link #WRITE_AND_INITIAL_DEFAULT} for V3 column defaults. + */ +public enum DefaultMetadataPolicy { + WRITE_DEFAULT, + WRITE_AND_INITIAL_DEFAULT +} diff --git a/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java b/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java index 1b743343163e..716563caccba 100644 --- a/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java +++ b/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaConverter.java @@ -48,21 +48,33 @@ class HiveSchemaConverter { private int id; private final boolean autoConvert; + private final DefaultMetadataPolicy defaultPolicy; - private HiveSchemaConverter(boolean autoConvert) { + private HiveSchemaConverter(boolean autoConvert, DefaultMetadataPolicy defaultPolicy) { this.autoConvert = autoConvert; + this.defaultPolicy = defaultPolicy; // Iceberg starts field id assignment from 1. this.id = 1; } static Schema convert(List names, List typeInfos, List comments, boolean autoConvert, Map defaultValues) { - HiveSchemaConverter converter = new HiveSchemaConverter(autoConvert); + return convert(names, typeInfos, comments, autoConvert, defaultValues, DefaultMetadataPolicy.WRITE_DEFAULT); + } + + static Schema convert(List names, List typeInfos, List comments, boolean autoConvert, + Map defaultValues, DefaultMetadataPolicy defaultPolicy) { + HiveSchemaConverter converter = new HiveSchemaConverter(autoConvert, defaultPolicy); return new Schema(converter.convertInternal(names, typeInfos, defaultValues, comments)); } public static Type convert(TypeInfo typeInfo, boolean autoConvert, String defaultValue) { - HiveSchemaConverter converter = new HiveSchemaConverter(autoConvert); + return convert(typeInfo, autoConvert, defaultValue, DefaultMetadataPolicy.WRITE_DEFAULT); + } + + public static Type convert( + TypeInfo typeInfo, boolean autoConvert, String defaultValue, DefaultMetadataPolicy defaultPolicy) { + HiveSchemaConverter converter = new HiveSchemaConverter(autoConvert, defaultPolicy); return converter.convertType(typeInfo, defaultValue); } @@ -86,7 +98,7 @@ List convertInternal(List names, List typeI if (type.isPrimitiveType()) { Object icebergDefaultValue = HiveSchemaUtil.getDefaultValue(defaultValues.get(columnName), type); if (icebergDefaultValue != null) { - fieldBuilder.withWriteDefault(Expressions.lit(icebergDefaultValue)); + applyDefaultMetadata(fieldBuilder, icebergDefaultValue); } } else if (!type.isStructType()) { throw new UnsupportedOperationException( @@ -99,6 +111,14 @@ List convertInternal(List names, List typeI return result; } + private void applyDefaultMetadata(Types.NestedField.Builder fieldBuilder, Object icebergDefaultValue) { + org.apache.iceberg.expressions.Literal literal = Expressions.lit(icebergDefaultValue); + fieldBuilder.withWriteDefault(literal); + if (defaultPolicy == DefaultMetadataPolicy.WRITE_AND_INITIAL_DEFAULT) { + fieldBuilder.withInitialDefault(literal); + } + } + Type convertType(TypeInfo typeInfo, String defaultValue) { switch (typeInfo.getCategory()) { case PRIMITIVE: diff --git a/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java b/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java index b1563040bcb9..443b5804852c 100644 --- a/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java +++ b/iceberg/iceberg-catalog/src/main/java/org/apache/iceberg/hive/HiveSchemaUtil.java @@ -154,6 +154,10 @@ public static Type convert(TypeInfo typeInfo, String defaultValue) { return HiveSchemaConverter.convert(typeInfo, false, defaultValue); } + public static Type convert(TypeInfo typeInfo, String defaultValue, DefaultMetadataPolicy defaultPolicy) { + return HiveSchemaConverter.convert(typeInfo, false, defaultValue, defaultPolicy); + } + /** * Returns a SchemaDifference containing those fields which are present in only one of the collections, as well as * those fields which are present in both (in terms of the name) but their type or comment has changed. @@ -429,6 +433,24 @@ public static void setDefaultValues(Record record, List missi } } + /** + * Backfills struct column that is null on read using nested {@code initialDefault} metadata. + * This applies to rows written before {@code ADD COLUMNS} added the struct. + * Spec allows struct defaults as {@code {}} (see https://iceberg.apache.org/spec/#default-values), but + * {@code UpdateSchema} add column only supports primitives today; + * if empty structs are allowed, this backfill can be removed. + */ + public static void backfillStructInitialDefaults(Record record, List columns) { + for (Types.NestedField field : columns) { + if (field.type().isStructType() && record.getField(field.name()) == null) { + Record nestedRecord = buildStructWithInitialDefaults(field.type().asStructType()); + if (nestedRecord != null) { + record.setField(field.name(), nestedRecord); + } + } + } + } + /** * Recursively builds a struct populated with write defaults. * * @return A populated Record, or null if no nested fields have defaults. @@ -458,6 +480,45 @@ private static Record buildStructWithDefaults(Types.StructType structType) { return hasAnyDefault ? nestedRecord : null; } + private static Record buildStructWithInitialDefaults(Types.StructType structType) { + Record nestedRecord = GenericRecord.create(structType); + boolean hasAnyDefault = false; + + for (Types.NestedField field : structType.fields()) { + if (field.initialDefault() != null) { + Object defaultValue = convertToWriteType(field.initialDefault(), field.type()); + nestedRecord.setField(field.name(), defaultValue); + hasAnyDefault = true; + } else if (field.type().isStructType()) { + Record deeperRecord = buildStructWithInitialDefaults(field.type().asStructType()); + if (deeperRecord != null) { + nestedRecord.setField(field.name(), deeperRecord); + hasAnyDefault = true; + } + } + } + + return hasAnyDefault ? nestedRecord : null; + } + + /** + * Builds a map of nested field names to their initial-default values for a struct column. + * + * @return field-name to initial-default map, or empty if no nested field has an initial-default + */ + public static Map getStructInitialDefaults(Types.StructType structType) { + Map result = Maps.newHashMap(); + for (Types.NestedField field : structType.fields()) { + if (field.initialDefault() != null) { + result.put(field.name(), field.initialDefault()); + } else if (field.type().isStructType()) { + Map nested = getStructInitialDefaults(field.type().asStructType()); + result.put(field.name(), nested); + } + } + return result; + } + /** * Sets a value into a {@link Record} using a struct-only field path (top-level column or nested * through structs). Intermediate struct records are created as needed. @@ -496,21 +557,6 @@ private static Record getOrCreateStructRecord( return record; } - // Special method for nested structs that always applies defaults to null fields - private static void setDefaultValuesForNestedStruct(Record record, List fields) { - for (Types.NestedField field : fields) { - Object fieldValue = record.getField(field.name()); - - if (field.writeDefault() != null) { - Object defaultValue = convertToWriteType(field.writeDefault(), field.type()); - record.setField(field.name(), defaultValue); - } else if (field.type().isStructType()) { - // Recursively process nested structs - setDefaultValuesForNestedStruct((Record) fieldValue, field.type().asStructType().fields()); - } - } - } - public static Object convertToWriteType(Object value, Type type) { if (value == null) { return null; diff --git a/iceberg/iceberg-catalog/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java b/iceberg/iceberg-catalog/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java index 6daf3aeca5d7..e0373374033e 100644 --- a/iceberg/iceberg-catalog/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java +++ b/iceberg/iceberg-catalog/src/test/java/org/apache/iceberg/hive/TestHiveSchemaUtil.java @@ -28,6 +28,9 @@ import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; import org.apache.iceberg.Schema; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.expressions.Expressions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.types.Type; @@ -240,4 +243,37 @@ private void assertEquals(Type expected, Type actual) { } } } + + @Test + public void testBackfillStructInitialDefaults() { + Schema schema = new Schema( + optional(1, "id", Types.IntegerType.get()), + optional(2, "point", Types.StructType.of( + Types.NestedField.builder() + .withId(3) + .withName("x") + .asOptional() + .ofType(Types.IntegerType.get()) + .withInitialDefault(Expressions.lit(100)) + .build(), + Types.NestedField.builder() + .withId(4) + .withName("y") + .asOptional() + .ofType(Types.IntegerType.get()) + .withInitialDefault(Expressions.lit(99)) + .build() + )) + ); + + Record record = GenericRecord.create(schema); + record.setField("id", 1); + + HiveSchemaUtil.backfillStructInitialDefaults(record, schema.columns()); + + Record point = (Record) record.getField("point"); + assertThat(point).isNotNull(); + assertThat(point.getField("x")).isEqualTo(100); + assertThat(point.getField("y")).isEqualTo(99); + } } diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/BaseHiveIcebergMetaHook.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/BaseHiveIcebergMetaHook.java index 917829ac304d..3b0742a16931 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/BaseHiveIcebergMetaHook.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/BaseHiveIcebergMetaHook.java @@ -60,6 +60,7 @@ import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.exceptions.NoSuchTableException; import org.apache.iceberg.exceptions.NotFoundException; +import org.apache.iceberg.hive.DefaultMetadataPolicy; import org.apache.iceberg.hive.HMSTablePropertyHelper; import org.apache.iceberg.hive.HiveOperationsBase; import org.apache.iceberg.hive.HiveSchemaUtil; @@ -490,6 +491,13 @@ static boolean isOrcFileFormat(org.apache.hadoop.hive.metastore.api.Table hmsTab .equalsIgnoreCase(hmsTable.getParameters().get(TableProperties.DEFAULT_FILE_FORMAT)); } + protected static DefaultMetadataPolicy defaultMetadataPolicy( + org.apache.hadoop.hive.metastore.api.Table hmsTable) { + return isOrcFileFormat(hmsTable) ? + DefaultMetadataPolicy.WRITE_DEFAULT : + DefaultMetadataPolicy.WRITE_AND_INITIAL_DEFAULT; + } + protected void setWriteModeDefaults(Table icebergTbl, Map newProps, EnvironmentContext context) { if ((icebergTbl == null || TableUtil.formatVersion(icebergTbl) == 1) && IcebergTableUtil.isV2TableOrAbove(newProps)) { diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java index 5a2bbbf70219..b3c931d37503 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergMetaHook.java @@ -114,6 +114,7 @@ import org.apache.iceberg.expressions.UnboundPredicate; import org.apache.iceberg.expressions.UnboundTerm; import org.apache.iceberg.hive.CachedClientPool; +import org.apache.iceberg.hive.DefaultMetadataPolicy; import org.apache.iceberg.hive.HiveLock; import org.apache.iceberg.hive.HiveSchemaUtil; import org.apache.iceberg.hive.HiveTableOperations; @@ -741,16 +742,21 @@ private void handleAddColumns(org.apache.hadoop.hive.metastore.api.Table hmsTabl (List) SessionStateUtil.getResource(conf, SessionStateUtil.COLUMN_DEFAULTS).orElse(null); Map defaultValues = Stream.ofNullable(sqlDefaultConstraints).flatMap(Collection::stream) .collect(Collectors.toMap(SQLDefaultConstraint::getColumn_name, SQLDefaultConstraint::getDefault_value)); - boolean isORc = isOrcFileFormat(hmsTable); + DefaultMetadataPolicy defaultPolicy = defaultMetadataPolicy(hmsTable); for (FieldSchema addedCol : addedCols) { String defaultValue = defaultValues.get(addedCol.getName()); - Type type = HiveSchemaUtil.convert(TypeInfoUtils.getTypeInfoFromTypeString(addedCol.getType()), defaultValue); + Type type = + HiveSchemaUtil.convert( + TypeInfoUtils.getTypeInfoFromTypeString(addedCol.getType()), defaultValue, defaultPolicy); Literal defaultVal = Optional.ofNullable(defaultValue).filter(v -> !type.isStructType()) .map(v -> Expressions.lit(HiveSchemaUtil.getDefaultValue(v, type))).orElse(null); - // ORC doesn't have support for initialDefault from iceberg layer, we only need to set default for writeDefault. - updateSchema.addColumn(addedCol.getName(), type, addedCol.getComment(), isORc ? null : defaultVal); - if (isORc && defaultVal != null) { + updateSchema.addColumn( + addedCol.getName(), + type, + addedCol.getComment(), + defaultPolicy == DefaultMetadataPolicy.WRITE_AND_INITIAL_DEFAULT ? defaultVal : null); + if (defaultPolicy == DefaultMetadataPolicy.WRITE_DEFAULT && defaultVal != null) { updateSchema.updateColumnDefault(addedCol.getName(), defaultVal); } } diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java index 834d762062da..23457c8676cf 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/vector/HiveVectorizedReader.java @@ -52,6 +52,7 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.Table; import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.hive.HiveSchemaUtil; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.mr.InputFormatConfig; @@ -187,6 +188,10 @@ static Map getInitialColumnDefaults(List colu for (Types.NestedField column : columns) { if (column.initialDefault() != null) { columnDefaults.put(column.name(), column.initialDefault()); + } else if (column.type().isStructType()) { + Map structDefaults = + HiveSchemaUtil.getStructInitialDefaults(column.type().asStructType()); + columnDefaults.put(column.name(), structDefaults); } } return columnDefaults; diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergRecordReader.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergRecordReader.java index e3d03a6bee66..26c715883340 100644 --- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergRecordReader.java +++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergRecordReader.java @@ -53,6 +53,7 @@ import org.apache.iceberg.data.parquet.GenericParquetReaders; import org.apache.iceberg.encryption.EncryptedFiles; import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.hive.HiveSchemaUtil; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.io.InputFile; @@ -173,7 +174,16 @@ private CloseableIterable openGeneric(FileScanTask task, Schema readSchema) { default -> throw new UnsupportedOperationException( String.format("Cannot read %s file: %s", file.format().name(), file.location())); }; - return applyResidualFiltering(iterable, residual, readSchema); + return applyResidualFiltering(withStructInitialDefaultBackfill(iterable, readSchema), residual, readSchema); + } + + private CloseableIterable withStructInitialDefaultBackfill(CloseableIterable iterable, Schema readSchema) { + return CloseableIterable.transform(iterable, record -> { + if (record instanceof Record) { + HiveSchemaUtil.backfillStructInitialDefaults((Record) record, readSchema.columns()); + } + return record; + }); } private CloseableIterable newAvroIterable( diff --git a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_initial_default.q b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_initial_default.q index c0c058bcd5dc..c7007114e5be 100644 --- a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_initial_default.q +++ b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_initial_default.q @@ -13,7 +13,14 @@ ALTER TABLE ice_parq ADD COLUMNS (point STRUCT DEFAULT '{"x":100," created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general'); + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}'); INSERT INTO ice_parq (id) VALUES (2); @@ -45,7 +52,14 @@ ALTER TABLE ice_avro ADD COLUMNS (point STRUCT DEFAULT '{"x":100," created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general'); + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}'); INSERT INTO ice_avro (id) VALUES (2); @@ -77,7 +91,14 @@ ALTER TABLE ice_orc ADD COLUMNS (point STRUCT DEFAULT '{"x":100,"y created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general'); + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}'); INSERT INTO ice_orc (id) VALUES (2); diff --git a/iceberg/iceberg-handler/src/test/results/positive/iceberg_initial_default.q.out b/iceberg/iceberg-handler/src/test/results/positive/iceberg_initial_default.q.out index da857354d811..d15997e7d81f 100644 --- a/iceberg/iceberg-handler/src/test/results/positive/iceberg_initial_default.q.out +++ b/iceberg/iceberg-handler/src/test/results/positive/iceberg_initial_default.q.out @@ -28,7 +28,14 @@ PREHOOK: query: ALTER TABLE ice_parq ADD COLUMNS (point STRUCT DEF created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general') + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}') PREHOOK: type: ALTERTABLE_ADDCOLS PREHOOK: Input: default@ice_parq PREHOOK: Output: default@ice_parq @@ -40,7 +47,14 @@ POSTHOOK: query: ALTER TABLE ice_parq ADD COLUMNS (point STRUCT DE created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general') + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}') POSTHOOK: type: ALTERTABLE_ADDCOLS POSTHOOK: Input: default@ice_parq POSTHOOK: Output: default@ice_parq @@ -60,8 +74,8 @@ POSTHOOK: query: SELECT * FROM ice_parq ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_parq POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} PREHOOK: query: ALTER TABLE ice_parq CHANGE COLUMN point point STRUCT DEFAULT '{"x":100,"y":88}' PREHOOK: type: ALTERTABLE_RENAMECOL PREHOOK: Input: default@ice_parq @@ -94,9 +108,9 @@ POSTHOOK: query: SELECT * FROM ice_parq ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_parq POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} PREHOOK: query: CREATE TABLE ice_avro ( id INT) STORED BY ICEBERG stored as avro @@ -127,7 +141,14 @@ PREHOOK: query: ALTER TABLE ice_avro ADD COLUMNS (point STRUCT DEF created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general') + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}') PREHOOK: type: ALTERTABLE_ADDCOLS PREHOOK: Input: default@ice_avro PREHOOK: Output: default@ice_avro @@ -139,7 +160,14 @@ POSTHOOK: query: ALTER TABLE ice_avro ADD COLUMNS (point STRUCT DE created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general') + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}') POSTHOOK: type: ALTERTABLE_ADDCOLS POSTHOOK: Input: default@ice_avro POSTHOOK: Output: default@ice_avro @@ -159,8 +187,8 @@ POSTHOOK: query: SELECT * FROM ice_avro ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_avro POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} PREHOOK: query: ALTER TABLE ice_avro CHANGE COLUMN point point STRUCT DEFAULT '{"x":100,"y":88}' PREHOOK: type: ALTERTABLE_RENAMECOL PREHOOK: Input: default@ice_avro @@ -193,9 +221,9 @@ POSTHOOK: query: SELECT * FROM ice_avro ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_avro POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} PREHOOK: query: CREATE TABLE ice_orc ( id INT) STORED BY ICEBERG stored as orc @@ -226,7 +254,14 @@ PREHOOK: query: ALTER TABLE ice_orc ADD COLUMNS (point STRUCT DEFA created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general') + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}') PREHOOK: type: ALTERTABLE_ADDCOLS PREHOOK: Input: default@ice_orc PREHOOK: Output: default@ice_orc @@ -238,7 +273,14 @@ POSTHOOK: query: ALTER TABLE ice_orc ADD COLUMNS (point STRUCT DEF created_date DATE DEFAULT '2024-01-01', created_ts TIMESTAMP DEFAULT '2024-01-01T10:00:00', score DECIMAL(5,2) DEFAULT 100.00, - category STRING DEFAULT 'general') + category STRING DEFAULT 'general', + person STRUCT< + name: STRING, + address: STRUCT< + street: STRING, + city: STRING + > + > DEFAULT '{"name":"John","address":{"street":"Main St","city":"New York"}}') POSTHOOK: type: ALTERTABLE_ADDCOLS POSTHOOK: Input: default@ice_orc POSTHOOK: Output: default@ice_orc @@ -258,8 +300,8 @@ POSTHOOK: query: SELECT * FROM ice_orc ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_orc POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL NULL NULL NULL NULL NULL NULL NULL NULL -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} PREHOOK: query: ALTER TABLE ice_orc CHANGE COLUMN point point STRUCT DEFAULT '{"x":100,"y":88}' PREHOOK: type: ALTERTABLE_RENAMECOL PREHOOK: Input: default@ice_orc @@ -292,9 +334,9 @@ POSTHOOK: query: SELECT * FROM ice_orc ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_orc POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL NULL NULL NULL NULL NULL NULL NULL NULL -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} PREHOOK: query: SELECT * FROM ice_parq ORDER BY id PREHOOK: type: QUERY PREHOOK: Input: default@ice_parq @@ -303,9 +345,9 @@ POSTHOOK: query: SELECT * FROM ice_parq ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_parq POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} PREHOOK: query: SELECT * FROM ice_avro ORDER BY id PREHOOK: type: QUERY PREHOOK: Input: default@ice_avro @@ -314,9 +356,9 @@ POSTHOOK: query: SELECT * FROM ice_avro ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_avro POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} PREHOOK: query: SELECT * FROM ice_orc ORDER BY id PREHOOK: type: QUERY PREHOOK: Input: default@ice_orc @@ -325,6 +367,6 @@ POSTHOOK: query: SELECT * FROM ice_orc ORDER BY id POSTHOOK: type: QUERY POSTHOOK: Input: default@ice_orc POSTHOOK: Output: hdfs://### HDFS PATH ### -1 NULL NULL NULL NULL NULL NULL NULL NULL NULL -2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general -3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general +1 NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL +2 {"x":100,"y":99} unknown 25 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} +3 {"x":100,"y":88} unknown 21 50000.0 true 2024-01-01 2024-01-01 10:00:00 100.00 general {"name":"John","address":{"street":"Main St","city":"New York"}} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedDummyColumnReader.java b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedDummyColumnReader.java index e8d95ccd1587..a31b0e69858e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedDummyColumnReader.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/parquet/vector/VectorizedDummyColumnReader.java @@ -24,14 +24,18 @@ import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector; import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector; import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector; +import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector; import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.Arrays; +import java.util.List; +import java.util.Map; /** * A dummy vectorized parquet reader used for schema evolution. @@ -64,6 +68,41 @@ public void readBatch(int total, ColumnVector col, TypeInfo typeInfo) throws IOE if (typeInfo.getCategory() == ObjectInspector.Category.PRIMITIVE) { fillPrimitive(col, (PrimitiveTypeInfo) typeInfo, defaultValue); + } else if (typeInfo.getCategory() == ObjectInspector.Category.STRUCT) { + fillStruct(col, (StructTypeInfo) typeInfo, defaultValue); + } else { + throw new IOException("Unsupported type category in DummyColumnReader: " + typeInfo.getCategory()); + } + } + + private void fillStruct(ColumnVector col, StructTypeInfo structTypeInfo, Object defaultValue) throws IOException { + StructColumnVector structCol = (StructColumnVector) col; + List fieldNames = structTypeInfo.getAllStructFieldNames(); + List fieldTypes = structTypeInfo.getAllStructFieldTypeInfos(); + Map fieldDefaults = defaultValue instanceof Map ? (Map) defaultValue : null; + + for (int i = 0; i < fieldNames.size(); i++) { + Object fieldDefault = fieldDefaults != null ? fieldDefaults.get(fieldNames.get(i)) : null; + fillStructField(structCol.fields[i], fieldTypes.get(i), fieldDefault); + } + } + + private void fillStructField(ColumnVector col, TypeInfo typeInfo, Object fieldDefault) throws IOException { + if (fieldDefault == null) { + col.isRepeating = true; + Arrays.fill(col.isNull, true); + col.noNulls = false; + return; + } + + col.isRepeating = true; + col.noNulls = true; + col.isNull[0] = false; + + if (typeInfo.getCategory() == ObjectInspector.Category.PRIMITIVE) { + fillPrimitive(col, (PrimitiveTypeInfo) typeInfo, fieldDefault); + } else if (typeInfo.getCategory() == ObjectInspector.Category.STRUCT) { + fillStruct(col, (StructTypeInfo) typeInfo, fieldDefault); } else { throw new IOException("Unsupported type category in DummyColumnReader: " + typeInfo.getCategory()); }