I am trying to scale every column of a DataFrame. First I convert each column into a vector, and then I apply the ml MinMaxScaler. Is there a better/more elegant way to apply the same function to each column than simply repeating it?
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.udf
import spark.implicits._ // for .toDF and the 'A column syntax (assumes a SparkSession named spark; pre-imported in spark-shell)

// wrap a Double in a one-element vector, since MinMaxScaler expects a Vector column
val toVector = udf((vct: Double) => Vectors.dense(Array(vct)))
val df = Seq((1, 5, 3), (4, 2, 9), (7, 8, 6)).toDF("A", "B", "C")
val dfVec = df.withColumn("AVec", toVector('A))
  .withColumn("BVec", toVector('B))
  .withColumn("CVec", toVector('C))
// fit a MinMaxScaler on inputCol and append the result as inputCol + "Scaled"
def scaler(df: DataFrame, inputCol: String): DataFrame = new MinMaxScaler()
  .setInputCol(inputCol)
  .setOutputCol(inputCol + "Scaled")
  .setMax(1)
  .setMin(0)
  .fit(df) // computes this column's min and max
  .transform(df) // appends the scaled column
scaler(scaler(scaler(dfVec, "AVec"), "BVec"), "CVec").show()
+---+---+---+-----+-----+-----+----------+----------+----------+
| A| B| C| AVec| BVec| CVec|AVecScaled|BVecScaled|CVecScaled|
+---+---+---+-----+-----+-----+----------+----------+----------+
| 1| 5| 3|[1.0]|[5.0]|[3.0]| [0.0]| [0.5]| [0.0]|
| 4| 2| 9|[4.0]|[2.0]|[9.0]| [0.5]| [0.0]| [1.0]|
| 7| 8| 6|[7.0]|[8.0]|[6.0]| [1.0]| [1.0]| [0.5]|
+---+---+---+-----+-----+-----+----------+----------+----------+
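Since scaler already has the shape (DataFrame, String) => DataFrame, the nested calls can be replaced by folding it over the vector column names; a minimal sketch using the dfVec and scaler defined above:

val scaled = Seq("AVec", "BVec", "CVec").foldLeft(dfVec)(scaler)
scaled.show()

Each fold step still fits its own MinMaxScaler, so the per-column min/max behavior is identical to the nested version, and it extends to any number of columns.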