Thursday, June 15, 2017

Spark UDFs

  • toUnixTimestamp
    • Convert a date string to a Unix timestamp while preserving milliseconds.
    • Option 1
      import org.apache.spark.sql.functions.udf
      import java.text.SimpleDateFormat
      import java.util.TimeZone
       
      // Returns a Spark UDF that parses a date string into epoch milliseconds,
      // preserving the sub-second part (Spark's built-in unix_timestamp truncates
      // to whole seconds).
      //
      // format   - a java.text.SimpleDateFormat pattern, e.g. "yyyy-MM-dd HH:mm:ss.SSS"
      // timeZone - an id accepted by TimeZone.getTimeZone, e.g. "UTC"
      //
      // The SimpleDateFormat is created inside the closure because it is not
      // thread-safe and executor tasks may evaluate the UDF concurrently.
      def toUnixTimestamp(format: String, timeZone: String) = udf(
        (date: String) => {
          if (date == null) {
            // Propagate SQL NULL instead of failing the task with an NPE.
            null.asInstanceOf[java.lang.Long]
          } else {
            val dateFormat = new SimpleDateFormat(format)
            dateFormat.setTimeZone(TimeZone.getTimeZone(timeZone))
            java.lang.Long.valueOf(dateFormat.parse(date).getTime)
          }
        }
      )
       
      val test = spark.sql("select charge_datetime from temp.data_sql_server limit 3")
      test.select(test("charge_datetime"), toUnixTimestamp("yyyy-MM-dd HH:mm:ss.SSS","UTC")(test("charge_datetime")).alias("i_regdatetime")).show(false)
    • Option 2
      import java.text.SimpleDateFormat
      import java.util.TimeZone
       
      // Registers a SQL-callable UDF that parses a date string into epoch
      // milliseconds, preserving the sub-second part.
      // Returns NULL for a NULL date instead of failing the task with an NPE.
      spark.sqlContext.udf.register("toUnixTimestamp", (date: String, format: String, timeZone: String) => {
        if (date == null) {
          null.asInstanceOf[java.lang.Long]
        } else {
          // SimpleDateFormat is not thread-safe, so build a fresh instance per call.
          val dateFormat = new SimpleDateFormat(format)
          dateFormat.setTimeZone(TimeZone.getTimeZone(timeZone))
          java.lang.Long.valueOf(dateFormat.parse(date).getTime)
        }
      })
       
      spark.sql("select toUnixTimestamp('2017-03-13 15:40:53.147', 'yyyy-MM-dd HH:mm:ss.SSS', 'UTC')").show(false)
  • fromUnixTimestamp
    • Convert a Unix timestamp to a date string while preserving milliseconds.
      import java.text.SimpleDateFormat
      import java.util.TimeZone
      // Registers a SQL-callable UDF that formats epoch milliseconds back into a
      // date string, preserving the sub-second part.
      //
      // The timestamp parameter stays Double for backward compatibility with
      // existing callers (Spark implicitly casts BIGINT literals to DOUBLE here);
      // millisecond values up to 2^53 are represented exactly in a Double.
      spark.sqlContext.udf.register("fromUnixTimestamp", (timestamp: Double, format: String, timeZone: String) => {
        // SimpleDateFormat is not thread-safe, so build a fresh instance per call.
        val dateFormat = new SimpleDateFormat(format)
        dateFormat.setTimeZone(TimeZone.getTimeZone(timeZone))
        // Convert explicitly to a Date: DateFormat.format(Object) on a raw Double
        // only works through the Number.longValue() fallback, which is easy to
        // misread and throws IllegalArgumentException for non-Number arguments.
        dateFormat.format(new java.util.Date(timestamp.toLong))
      })
       
      spark.sql("select fromUnixTimestamp(1489419653147, 'yyyy-MM-dd HH:mm:ss.SSS', 'UTC')").show(false)

No comments:

Post a Comment

Note: Only a member of this blog may post a comment.