Skip to content

Commit 5c1a99a

Browse files
authored
HIVE-26877: Parquet CTAS with JOIN on decimals with different precision/scale fail (#6274)
1 parent 6f6ab84 commit 5c1a99a

3 files changed

Lines changed: 275 additions & 4 deletions

File tree

ql/src/java/org/apache/hadoop/hive/ql/io/parquet/write/DataWritableWriter.java

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,12 @@
4545
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
4646
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
4747
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
48+
import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils;
4849
import org.apache.parquet.io.api.Binary;
4950
import org.apache.parquet.io.api.RecordConsumer;
5051
import org.apache.parquet.schema.GroupType;
5152
import org.apache.parquet.schema.LogicalTypeAnnotation;
53+
import org.apache.parquet.schema.LogicalTypeAnnotation.DecimalLogicalTypeAnnotation;
5254
import org.apache.parquet.schema.LogicalTypeAnnotation.ListLogicalTypeAnnotation;
5355
import org.apache.parquet.schema.LogicalTypeAnnotation.MapLogicalTypeAnnotation;
5456
import org.apache.parquet.schema.Type;
@@ -157,7 +159,8 @@ private DataWriter createWriter(ObjectInspector inspector, Type type) {
157159
case TIMESTAMP:
158160
return new TimestampDataWriter((TimestampObjectInspector)inspector);
159161
case DECIMAL:
160-
return new DecimalDataWriter((HiveDecimalObjectInspector)inspector);
162+
return new DecimalDataWriter((HiveDecimalObjectInspector) inspector,
163+
getSchemaDecimalTypeInfo(type, (HiveDecimalObjectInspector) inspector));
161164
case DATE:
162165
return new DateDataWriter((DateObjectInspector)inspector);
163166
default:
@@ -180,6 +183,22 @@ private DataWriter createWriter(ObjectInspector inspector, Type type) {
180183
}
181184
}
182185

186+
/**
187+
* Return the decimal type information defined by the Parquet schema. This ensures the writer
188+
* uses the declared precision/scale.
189+
* @param type Type that contains information about the type schema.
190+
* @param inspector The object inspector used to get the value type.
191+
* @return DecimalTypeInfo The decimal type info object with proper precision and scale.
192+
*/
193+
private DecimalTypeInfo getSchemaDecimalTypeInfo(Type type, HiveDecimalObjectInspector inspector) {
194+
LogicalTypeAnnotation logicalType = type.getLogicalTypeAnnotation();
195+
if (logicalType instanceof DecimalLogicalTypeAnnotation decimal) {
196+
return new DecimalTypeInfo(decimal.getPrecision(), decimal.getScale());
197+
}
198+
// Fallback to the inspector's type info if the schema does not carry the logical annotation.
199+
return (DecimalTypeInfo) inspector.getTypeInfo();
200+
}
201+
183202
/**
184203
* Checks that an inspector matches the category indicated as a parameter.
185204
* @param inspector The object inspector to check
@@ -559,16 +578,23 @@ boolean isValidTimestamp(Object fieldValue) {
559578

560579
private class DecimalDataWriter implements DataWriter {
561580
// Inspector used to extract HiveDecimal values from incoming rows.
private HiveDecimalObjectInspector inspector;
581+
// Decimal type declared by the Parquet schema; values are coerced to this precision/scale
// before being written, so the binary width matches what the schema promises.
private final DecimalTypeInfo schemaDecimalTypeInfo;
562582

563-
public DecimalDataWriter(HiveDecimalObjectInspector inspector) {
583+
// NOTE(review): schemaDecimalTypeInfo is expected to come from getSchemaDecimalTypeInfo(),
// i.e. the Parquet schema's declared decimal type (falling back to the inspector's type info).
public DecimalDataWriter(HiveDecimalObjectInspector inspector, DecimalTypeInfo schemaDecimalTypeInfo) {
564584
this.inspector = inspector;
585+
this.schemaDecimalTypeInfo = schemaDecimalTypeInfo;
565586
}
566587

567588
@Override
568589
public void write(Object value) {
569590
HiveDecimal vDecimal = inspector.getPrimitiveJavaObject(value);
570-
DecimalTypeInfo decTypeInfo = (DecimalTypeInfo)inspector.getTypeInfo();
571-
recordConsumer.addBinary(decimalToBinary(vDecimal, decTypeInfo));
591+
// Enforce the Parquet schema precision/scale before converting to binary to avoid size mismatches.
592+
HiveDecimal enforcedDecimal = HiveDecimalUtils.enforcePrecisionScale(vDecimal, schemaDecimalTypeInfo);
593+
if (enforcedDecimal == null) {
594+
throw new RuntimeException(
595+
"Decimal value " + vDecimal + " does not fit in declared type " + schemaDecimalTypeInfo.getQualifiedName());
596+
}
597+
recordConsumer.addBinary(decimalToBinary(vDecimal, schemaDecimalTypeInfo));
572598
}
573599

574600
private Binary decimalToBinary(final HiveDecimal hiveDecimal, final DecimalTypeInfo decimalTypeInfo) {
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
-- HIVE-26877: Parquet CTAS over a JOIN of decimals with different precision/scale.
-- The join key is widened to decimal(17,7); the CTAS target column stays decimal(12,7),
-- which previously made the Parquet writer fail on the binary size.
CREATE TABLE table_a (col_dec_a decimal(12,7));
2+
CREATE TABLE table_b (col_dec_b decimal(15,5));
3+
INSERT INTO table_a VALUES (12345.6789101);
4+
INSERT INTO table_b VALUES (1234567891.01112);
5+
6+
-- Force Parquet so the CTAS below exercises DataWritableWriter's decimal path.
set hive.default.fileformat=parquet;
7+
8+
explain create table target as
9+
select table_a.col_dec_a target_col
10+
from table_a
11+
left outer join table_b on
12+
table_a.col_dec_a = table_b.col_dec_b;
13+
14+
create table target as
15+
select table_a.col_dec_a target_col
16+
from table_a
17+
left outer join table_b on
18+
table_a.col_dec_a = table_b.col_dec_b;
19+
20+
-- Verify the target keeps decimal(12,7) and the value round-trips unchanged.
desc target;
21+
select * from target;
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
PREHOOK: query: CREATE TABLE table_a (col_dec_a decimal(12,7))
2+
PREHOOK: type: CREATETABLE
3+
PREHOOK: Output: database:default
4+
PREHOOK: Output: default@table_a
5+
POSTHOOK: query: CREATE TABLE table_a (col_dec_a decimal(12,7))
6+
POSTHOOK: type: CREATETABLE
7+
POSTHOOK: Output: database:default
8+
POSTHOOK: Output: default@table_a
9+
PREHOOK: query: CREATE TABLE table_b (col_dec_b decimal(15,5))
10+
PREHOOK: type: CREATETABLE
11+
PREHOOK: Output: database:default
12+
PREHOOK: Output: default@table_b
13+
POSTHOOK: query: CREATE TABLE table_b (col_dec_b decimal(15,5))
14+
POSTHOOK: type: CREATETABLE
15+
POSTHOOK: Output: database:default
16+
POSTHOOK: Output: default@table_b
17+
PREHOOK: query: INSERT INTO table_a VALUES (12345.6789101)
18+
PREHOOK: type: QUERY
19+
PREHOOK: Input: _dummy_database@_dummy_table
20+
PREHOOK: Output: default@table_a
21+
POSTHOOK: query: INSERT INTO table_a VALUES (12345.6789101)
22+
POSTHOOK: type: QUERY
23+
POSTHOOK: Input: _dummy_database@_dummy_table
24+
POSTHOOK: Output: default@table_a
25+
POSTHOOK: Lineage: table_a.col_dec_a SCRIPT []
26+
PREHOOK: query: INSERT INTO table_b VALUES (1234567891.01112)
27+
PREHOOK: type: QUERY
28+
PREHOOK: Input: _dummy_database@_dummy_table
29+
PREHOOK: Output: default@table_b
30+
POSTHOOK: query: INSERT INTO table_b VALUES (1234567891.01112)
31+
POSTHOOK: type: QUERY
32+
POSTHOOK: Input: _dummy_database@_dummy_table
33+
POSTHOOK: Output: default@table_b
34+
POSTHOOK: Lineage: table_b.col_dec_b SCRIPT []
35+
PREHOOK: query: explain create table target as
36+
select table_a.col_dec_a target_col
37+
from table_a
38+
left outer join table_b on
39+
table_a.col_dec_a = table_b.col_dec_b
40+
PREHOOK: type: CREATETABLE_AS_SELECT
41+
PREHOOK: Input: default@table_a
42+
PREHOOK: Input: default@table_b
43+
PREHOOK: Output: database:default
44+
PREHOOK: Output: default@target
45+
POSTHOOK: query: explain create table target as
46+
select table_a.col_dec_a target_col
47+
from table_a
48+
left outer join table_b on
49+
table_a.col_dec_a = table_b.col_dec_b
50+
POSTHOOK: type: CREATETABLE_AS_SELECT
51+
POSTHOOK: Input: default@table_a
52+
POSTHOOK: Input: default@table_b
53+
POSTHOOK: Output: database:default
54+
POSTHOOK: Output: default@target
55+
STAGE DEPENDENCIES:
56+
Stage-1 is a root stage
57+
Stage-2 depends on stages: Stage-1
58+
Stage-4 depends on stages: Stage-0, Stage-2
59+
Stage-3 depends on stages: Stage-4
60+
Stage-0 depends on stages: Stage-1
61+
62+
STAGE PLANS:
63+
Stage: Stage-1
64+
Tez
65+
#### A masked pattern was here ####
66+
Edges:
67+
Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE)
68+
Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
69+
#### A masked pattern was here ####
70+
Vertices:
71+
Map 1
72+
Map Operator Tree:
73+
TableScan
74+
alias: table_a
75+
Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
76+
Select Operator
77+
expressions: col_dec_a (type: decimal(12,7))
78+
outputColumnNames: _col0
79+
Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
80+
Reduce Output Operator
81+
key expressions: _col0 (type: decimal(17,7))
82+
null sort order: z
83+
sort order: +
84+
Map-reduce partition columns: _col0 (type: decimal(17,7))
85+
Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
86+
Execution mode: vectorized, llap
87+
LLAP IO: all inputs
88+
Map 4
89+
Map Operator Tree:
90+
TableScan
91+
alias: table_b
92+
filterExpr: col_dec_b is not null (type: boolean)
93+
Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
94+
Filter Operator
95+
predicate: col_dec_b is not null (type: boolean)
96+
Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
97+
Select Operator
98+
expressions: col_dec_b (type: decimal(15,5))
99+
outputColumnNames: _col0
100+
Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
101+
Reduce Output Operator
102+
key expressions: _col0 (type: decimal(17,7))
103+
null sort order: z
104+
sort order: +
105+
Map-reduce partition columns: _col0 (type: decimal(17,7))
106+
Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
107+
Execution mode: vectorized, llap
108+
LLAP IO: all inputs
109+
Reducer 2
110+
Execution mode: llap
111+
Reduce Operator Tree:
112+
Merge Join Operator
113+
condition map:
114+
Left Outer Join 0 to 1
115+
keys:
116+
0 _col0 (type: decimal(17,7))
117+
1 _col0 (type: decimal(17,7))
118+
outputColumnNames: _col0
119+
Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
120+
File Output Operator
121+
compressed: false
122+
Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
123+
table:
124+
input format: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
125+
output format: org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat
126+
serde: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
127+
name: default.target
128+
Select Operator
129+
expressions: _col0 (type: decimal(12,7))
130+
outputColumnNames: col1
131+
Statistics: Num rows: 1 Data size: 112 Basic stats: COMPLETE Column stats: COMPLETE
132+
Group By Operator
133+
aggregations: min(col1), max(col1), count(1), count(col1), compute_bit_vector_hll(col1)
134+
minReductionHashAggr: 0.4
135+
mode: hash
136+
outputColumnNames: _col0, _col1, _col2, _col3, _col4
137+
Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
138+
Reduce Output Operator
139+
null sort order:
140+
sort order:
141+
Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
142+
value expressions: _col0 (type: decimal(12,7)), _col1 (type: decimal(12,7)), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary)
143+
Reducer 3
144+
Execution mode: vectorized, llap
145+
Reduce Operator Tree:
146+
Group By Operator
147+
aggregations: min(VALUE._col0), max(VALUE._col1), count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4)
148+
mode: mergepartial
149+
outputColumnNames: _col0, _col1, _col2, _col3, _col4
150+
Statistics: Num rows: 1 Data size: 384 Basic stats: COMPLETE Column stats: COMPLETE
151+
Select Operator
152+
expressions: 'DECIMAL' (type: string), _col0 (type: decimal(12,7)), _col1 (type: decimal(12,7)), (_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary)
153+
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
154+
Statistics: Num rows: 1 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
155+
File Output Operator
156+
compressed: false
157+
Statistics: Num rows: 1 Data size: 475 Basic stats: COMPLETE Column stats: COMPLETE
158+
table:
159+
input format: org.apache.hadoop.mapred.SequenceFileInputFormat
160+
output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
161+
serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
162+
163+
Stage: Stage-2
164+
Dependency Collection
165+
166+
Stage: Stage-4
167+
Create Table
168+
columns: target_col decimal(12,7)
169+
name: default.target
170+
input format: org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat
171+
output format: org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat
172+
serde name: org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
173+
174+
Stage: Stage-3
175+
Stats Work
176+
Basic Stats Work:
177+
Column Stats Desc:
178+
Columns: target_col
179+
Column Types: decimal(12,7)
180+
Table: default.target
181+
182+
Stage: Stage-0
183+
Move Operator
184+
files:
185+
hdfs directory: true
186+
#### A masked pattern was here ####
187+
188+
PREHOOK: query: create table target as
189+
select table_a.col_dec_a target_col
190+
from table_a
191+
left outer join table_b on
192+
table_a.col_dec_a = table_b.col_dec_b
193+
PREHOOK: type: CREATETABLE_AS_SELECT
194+
PREHOOK: Input: default@table_a
195+
PREHOOK: Input: default@table_b
196+
PREHOOK: Output: database:default
197+
PREHOOK: Output: default@target
198+
POSTHOOK: query: create table target as
199+
select table_a.col_dec_a target_col
200+
from table_a
201+
left outer join table_b on
202+
table_a.col_dec_a = table_b.col_dec_b
203+
POSTHOOK: type: CREATETABLE_AS_SELECT
204+
POSTHOOK: Input: default@table_a
205+
POSTHOOK: Input: default@table_b
206+
POSTHOOK: Output: database:default
207+
POSTHOOK: Output: default@target
208+
POSTHOOK: Lineage: target.target_col SIMPLE [(table_a)table_a.FieldSchema(name:col_dec_a, type:decimal(12,7), comment:null), ]
209+
PREHOOK: query: desc target
210+
PREHOOK: type: DESCTABLE
211+
PREHOOK: Input: default@target
212+
POSTHOOK: query: desc target
213+
POSTHOOK: type: DESCTABLE
214+
POSTHOOK: Input: default@target
215+
target_col decimal(12,7)
216+
PREHOOK: query: select * from target
217+
PREHOOK: type: QUERY
218+
PREHOOK: Input: default@target
219+
#### A masked pattern was here ####
220+
POSTHOOK: query: select * from target
221+
POSTHOOK: type: QUERY
222+
POSTHOOK: Input: default@target
223+
#### A masked pattern was here ####
224+
12345.6789101

0 commit comments

Comments
 (0)