admin管理员组

文章数量:1310420

I have the Spark DataFrame below, which I am using to create another DataFrame with a defined schema.

+----------+-----------------------+----------------------+-----------+------+-------------+------------+
|TECHNOLOGY|KPI_NAME               |FUNCTIONS             |DESCRIPTION|ACTION|FORMULA_VALID|VALIDITY_LOG|
+----------+-----------------------+----------------------+-----------+------+-------------+------------+
|GSM       |Cell_Availability_test3|{SUM, SUM, NULL, NULL}|NULL       |ADD   |true         |[]          |
+----------+-----------------------+----------------------+-----------+------+-------------+------------+

It has the following schema:

root
|-- TECHNOLOGY: string (nullable = true)
|-- KPI_NAME: string (nullable = true)
|-- FUNCTIONS: struct (nullable = true)
|    |-- fun_temporal: string (nullable = true)
|    |-- fun_regional: string (nullable = true)
|    |-- fun_temporal_unit: map (nullable = true)
|    |    |-- key: string
|    |    |-- value: string (valueContainsNull = true)
|    |-- fun_regional_unit: map (nullable = true)
|    |    |-- key: string
|    |    |-- value: string (valueContainsNull = true)
|-- DESCRIPTION: string (nullable = true)
|-- ACTION: string (nullable = true)
|-- FORMULA_VALID: boolean (nullable = false)
|-- VALIDITY_LOG: array (nullable = true)
|    |-- element: struct (containsNull = true)
|    |    |-- key: string (nullable = true)
|    |    |-- value: string (nullable = true)




// Target schema that is later handed to spark.createDataFrame together with the source RDD.
// In StructField(name, dataType, nullable) the third argument is the nullable flag, and in
// ArrayType(elementType, containsNull) the second argument is the containsNull flag.
val outputTypeTest: StructType = StructType(Seq(
    StructField("TECHNOLOGY", StringType, true),
    StructField("KPI_NAME", StringType, true),
        StructField("FUNCTIONS", StructType(Seq(
          StructField("fun_temporal", StringType, true),
          StructField("fun_regional", StringType, true),
          // Declared as a NON-nullable array of key/value structs (trailing `false`), whereas the
          // printed source schema shows this field as a nullable map and the sample row holds NULL.
          StructField("fun_temporal_unit", ArrayType(StructType(Seq(
            StructField("key", StringType, true),
            StructField("value", StringType, true))), false), false),
          // Same declaration as fun_temporal_unit: non-nullable array, NULL in the sample row.
          StructField("fun_regional_unit", ArrayType(StructType(Seq(
            StructField("key", StringType, true),
            StructField("value", StringType, true))), false), false))), true),
    StructField("DESCRIPTION", StringType, true),
    StructField("ACTION", StringType, true),
    StructField("FORMULA_VALID", BooleanType, true),
    // Also non-nullable; the sample row holds an empty array ([]) here rather than NULL.
    StructField("VALIDITY_LOG", ArrayType(StructType(Seq(
      StructField("key", StringType, true),
      StructField("value", StringType, true))), false), false)))
 
// Keep only the seven columns that the target schema describes, in the same order as the schema.
val formulaMappingOutputNotTypedTest= formulaMappingOutputNotTyped.select("TECHNOLOGY","KPI_NAME","FUNCTIONS","DESCRIPTION","ACTION","FORMULA_VALID","VALIDITY_LOG")
// Print the selected rows without truncating long cell values.
formulaMappingOutputNotTypedTest.show(truncate = false)
// Re-apply outputTypeTest to the underlying RDD of rows. The question reports the
// "cannot be null" RuntimeException from this conversion.
val formulaMappingOutput = spark.createDataFrame(formulaMappingOutputNotTypedTest.rdd, outputTypeTest)

  

Caused by: java.lang.RuntimeException: The 2th field 'fun_temporal_unit' of input row cannot be null.

I have the Spark DataFrame below, which I am using to create another DataFrame with a defined schema.

+----------+-----------------------+----------------------+-----------+------+-------------+------------+
|TECHNOLOGY|KPI_NAME               |FUNCTIONS             |DESCRIPTION|ACTION|FORMULA_VALID|VALIDITY_LOG|
+----------+-----------------------+----------------------+-----------+------+-------------+------------+
|GSM       |Cell_Availability_test3|{SUM, SUM, NULL, NULL}|NULL       |ADD   |true         |[]          |
+----------+-----------------------+----------------------+-----------+------+-------------+------------+

It has the following schema:

root
|-- TECHNOLOGY: string (nullable = true)
|-- KPI_NAME: string (nullable = true)
|-- FUNCTIONS: struct (nullable = true)
|    |-- fun_temporal: string (nullable = true)
|    |-- fun_regional: string (nullable = true)
|    |-- fun_temporal_unit: map (nullable = true)
|    |    |-- key: string
|    |    |-- value: string (valueContainsNull = true)
|    |-- fun_regional_unit: map (nullable = true)
|    |    |-- key: string
|    |    |-- value: string (valueContainsNull = true)
|-- DESCRIPTION: string (nullable = true)
|-- ACTION: string (nullable = true)
|-- FORMULA_VALID: boolean (nullable = false)
|-- VALIDITY_LOG: array (nullable = true)
|    |-- element: struct (containsNull = true)
|    |    |-- key: string (nullable = true)
|    |    |-- value: string (nullable = true)




// Target schema that is later handed to spark.createDataFrame together with the source RDD.
// In StructField(name, dataType, nullable) the third argument is the nullable flag, and in
// ArrayType(elementType, containsNull) the second argument is the containsNull flag.
val outputTypeTest: StructType = StructType(Seq(
    StructField("TECHNOLOGY", StringType, true),
    StructField("KPI_NAME", StringType, true),
        StructField("FUNCTIONS", StructType(Seq(
          StructField("fun_temporal", StringType, true),
          StructField("fun_regional", StringType, true),
          // Declared as a NON-nullable array of key/value structs (trailing `false`), whereas the
          // printed source schema shows this field as a nullable map and the sample row holds NULL.
          StructField("fun_temporal_unit", ArrayType(StructType(Seq(
            StructField("key", StringType, true),
            StructField("value", StringType, true))), false), false),
          // Same declaration as fun_temporal_unit: non-nullable array, NULL in the sample row.
          StructField("fun_regional_unit", ArrayType(StructType(Seq(
            StructField("key", StringType, true),
            StructField("value", StringType, true))), false), false))), true),
    StructField("DESCRIPTION", StringType, true),
    StructField("ACTION", StringType, true),
    StructField("FORMULA_VALID", BooleanType, true),
    // Also non-nullable; the sample row holds an empty array ([]) here rather than NULL.
    StructField("VALIDITY_LOG", ArrayType(StructType(Seq(
      StructField("key", StringType, true),
      StructField("value", StringType, true))), false), false)))
 
// Keep only the seven columns that the target schema describes, in the same order as the schema.
val formulaMappingOutputNotTypedTest= formulaMappingOutputNotTyped.select("TECHNOLOGY","KPI_NAME","FUNCTIONS","DESCRIPTION","ACTION","FORMULA_VALID","VALIDITY_LOG")
// Print the selected rows without truncating long cell values.
formulaMappingOutputNotTypedTest.show(truncate = false)
// Re-apply outputTypeTest to the underlying RDD of rows. The question reports the
// "cannot be null" RuntimeException from this conversion.
val formulaMappingOutput = spark.createDataFrame(formulaMappingOutputNotTypedTest.rdd, outputTypeTest)

  

Caused by: java.lang.RuntimeException: The 2th field 'fun_temporal_unit' of input row cannot be null.

Share Improve this question edited Feb 3 at 9:25 Ged 18.1k8 gold badges47 silver badges103 bronze badges asked Feb 3 at 7:17 Suhani BhatiaSuhani Bhatia 971 silver badge5 bronze badges 1
  • Does it have anything to do with key being nullable=undefined in this struct? – mazaneicha Commented Feb 3 at 15:49
Add a comment  | 

1 Answer 1

Reset to default 1

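It looks like your fun_temporal_unit field is defined as non-nullable in the target schema, but the value in your original DataFrame is null. I'd suggest either replacing the null values with an empty array or defining the field as nullable, whichever is more appropriate for you. The same applies to fun_regional_unit, which is declared the same way and is also null in the sample row.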

Something like (not tested, and possibly not the most elegant)

// Selects the seven output columns and rebuilds FUNCTIONS so that neither unit field is NULL
// and both have the array-of-(key, value) shape that the target schema declares.
val formulaMappingOutputNotTypedTest = {
  // Imported locally so the snippet is self-contained; Scala allows imports inside a block.
  import org.apache.spark.sql.functions.{coalesce, col, map_entries, struct, typedLit}

  // The source unit fields are map<string,string>, while the target schema expects
  // array<struct<key,value>>. A NULL map is first replaced by a typed empty map (a plain
  // lit(Array.empty[Row]) is not a supported literal and would not match the map type),
  // then map_entries turns the map into an array of key/value structs.
  // NOTE: the Scala map_entries function needs Spark 3.0 or later.
  def unitEntries(field: String) =
    map_entries(
      coalesce(col(s"FUNCTIONS.$field"), typedLit(Map.empty[String, String]))
    ).as(field) // keep the original field name inside the rebuilt struct

  formulaMappingOutputNotTyped
    .select(
      "TECHNOLOGY", "KPI_NAME", "FUNCTIONS", "DESCRIPTION",
      "ACTION", "FORMULA_VALID", "VALIDITY_LOG"
    )
    // replace old field FUNCTIONS with a new one:
    .withColumn(
      "FUNCTIONS",
      struct(
        col("FUNCTIONS.fun_temporal"),
        col("FUNCTIONS.fun_regional"),
        // both unit fields are non-nullable in the target schema, so both get the same treatment:
        unitEntries("fun_temporal_unit"),
        unitEntries("fun_regional_unit")
      )
    )
}


or

          // Alternative: mark the field as nullable in the target schema (last argument of the
          // outer StructField). fun_regional_unit is declared the same way in the question's
          // schema and is also NULL in the sample row, so it presumably needs the same change.
          StructField("fun_temporal_unit", ArrayType(StructType(Seq(
            StructField("key", StringType, true),
            StructField("value", StringType, true))), false), true),
//                                                            ^ here

本文标签: