Melt

Trabajar con Spark usando Scala implica renunciar a ese paraíso que son las funciones melt y (d)cast de reshape2.

¿O no?

import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.types.StringType;
import org.apache.spark.sql.types.DoubleType;
import org.apache.spark.sql.Row;

/** Create some data **/

val nrows = 20
val origDF = sc.parallelize(1.to(nrows).map(x => (x, math.pow(x,2), math.pow(x,3)))).toDF("id", "cuadrado", "cubo")

/** Melt **/

val ids  = Map("id" -> 0)
val cols = Map("cuadrado" -> 1, "cubo" -> 2)

def melt(x:Row, ids:Map[String, Int] , cols:Map[String, Int]) = {
        var tmp = ids.mapValues(y => x(y))
        for((k,v) <- cols) yield tmp + ("var" -> k, "value" -> x(v))
}

val df = origDF.flatMap(x => melt(x, ids, cols))

val newStructure = StructType( ids.values.map(x => origDF.schema(x)).toList ::: List(StructField("var", StringType), StructField("value", DoubleType)) )
val meltDF = sqlContext.applySchema(df.map(x => Row.fromSeq(x.values.toList)), newStructure)

/** cast **/

val castDF = meltDF.groupBy("id").pivot("var").sum("value")

En ocasiones uno recibe datos no muy distintos de

aragon <- read.table("http://datanalytics.com/uploads/pob_aragon",
                        header = T, sep = "\t")
aragon

# Provincias Periodo Hombres Mujeres
# 1     Huesca    2014  113840  111069
# 2     Huesca    2004  107961  104940
# 3     Teruel    2014   71449   68916
# 4     Teruel    2004   71073   68260
# 5   Zaragoza    2014  471675  488436
# 6   Zaragoza    2004  441840  455510

Los mismos datos en formato largo son:

library(reshape2)

aragon.largo <- melt(aragon, id.vars = c("Provincias", "Periodo"))
aragon.largo
# Provincias Periodo variable  value
# 1      Huesca    2014  Hombres 113840
# 2      Huesca    2004  Hombres 107961
# 3      Teruel    2014  Hombres  71449
# 4      Teruel    2004  Hombres  71073
# 5    Zaragoza    2014  Hombres 471675
# 6    Zaragoza    2004  Hombres 441840
# 7      Huesca    2014  Mujeres 111069
# 8      Huesca    2004  Mujeres 104940
# 9      Teruel    2014  Mujeres  68916
# 10     Teruel    2004  Mujeres  68260
# 11   Zaragoza    2014  Mujeres 488436
# 12   Zaragoza    2004  Mujeres 455510

Si eso de datos largos (o en formato largo) no te suena, pierde un momento en:

Melt y cast en Spark con scala

Datos en formato largo y melt