If you run the following code, you will encounter AnalysisException with data type mismatch.
spark.sql("create table test_table(id int, data struct<name: string, age:int>)")
spark.sql("insert overwrite table test_table select 1, struct('joe', 15)")
val d = spark.read.table("test_table")
case class UserData(name: String, age: Int)
case class User(id: Int, data: UserData)
val u = Seq(User(1, UserData("joe", 15)), User(2, UserData("mary", 10))).toDF
val missing = {
u.join(
d,
u("id") === d("id")
&& u("data")("name") === d("data")("name")
&& u("data")("age") === d("data")("age"),
"outer"
)
.where( u("id").isNull || d("id").isNull)
}
// show this result
// +---+---------+----+----+
// |id |data |id |data|
// +---+---------+----+----+
// |2 |[mary,10]|null|null|
// +---+---------+----+----+
missing.show(false)
// Throws this error: org.apache.spark.sql.AnalysisException: cannot resolve '(`data` = test_table.`data`)'
// due to data type mismatch: differing types in '(`data` = test_table.`data`)'
// (struct<name:string,age:int> and struct<name:string,age:int>).;
val missing_2 = {
u.join(d,
u("id") === d("id") && u("data") === d("data"),
"outer")
.where(u("id").isNull || d("id").isNull)
}
Don’t be fooled by
(struct<name:string,age:int> and struct<name:string,age:int>)
. The problem is caused by nullable, which is not shown in the error message. The DataFrame created from case classes has nullable=false for id
and age
because Scala Int cannot be null, while the SQL creates nullable fields. And if you compare a field with complex type (struct, array), Spark just thinks they are different as shown in missing_2
. But if you compare field by field, there is no problem as shown in missing
.scala> u.schema.treeString
res4: String =
"root
|-- id: integer (nullable = false)
|-- data: struct (nullable = true)
| |-- name: string (nullable = true)
| |-- age: integer (nullable = false)
"
scala> d.schema.treeString
res5: String =
"root
|-- id: integer (nullable = true)
|-- data: struct (nullable = true)
| |-- name: string (nullable = true)
| |-- age: integer (nullable = true)
"
No comments:
Post a Comment