I have read in the Parquet documentation that only the columns I query should be read and processed, i.e. the scan should touch only the data of those columns. But when I look at the Spark UI, I see that the complete file is read.
Following is the code that writes a Parquet file and then reads it with Spark SQL.
import java.io.FileInputStream

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericRecord}
import org.apache.hadoop.fs.Path
import org.apache.parquet.avro.AvroParquetWriter // parquet.avro.AvroParquetWriter in parquet-avro < 1.8
import org.apache.spark.{SparkConf, SparkContext}

object ParquetFileCreator_simple {
  def datagenerate(schema: Schema, ind: Long): GenericRecord = {
    val data: GenericRecord = new GenericData.Record(schema)
    data.put("first", "Pg20 " + ind)
    // appType is an enum in the schema, so write one of its declared symbols
    val appTypes = schema.getField("appType").schema()
    data.put("appType", new GenericData.EnumSymbol(appTypes, Seq("WAP", "WEB", "APP")((ind % 3).toInt)))
    data
  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").set("spark.app.name", "merger").set("spark.eventLog.enabled", "true")
    val sc = new SparkContext(conf)
    val sqlc = new org.apache.spark.sql.SQLContext(sc)

    // Write 50M records into a single Parquet file
    val schemaPath = "/home/a.avsc"
    val schema = new Schema.Parser().parse(new FileInputStream(schemaPath))
    val outFile = "/home/parquet_simple.parquet"
    val outPath = new Path(outFile)
    val writer = new AvroParquetWriter[GenericRecord](outPath, schema)
    for (ind <- 1 to 50000000) {
      writer.write(datagenerate(schema, ind))
    }
    writer.close()

    // Read it back and query a single column
    val df = sqlc.read.parquet(outFile)
    df.registerTempTable("nestedread")
    //var results = df.select("address.pincode")
    val results = sqlc.sql("SELECT first FROM nestedread")
    results.count()
    //results.foreach(x => println(x))
    Thread.sleep(60000) // keep the application alive so the Spark UI can be inspected
  }
}
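To check whether the pruning is actually planned, I also printed the query plan. This is a minimal sketch against the same SQLContext (`explain(true)` is standard Spark API; the exact wording of the Parquet scan node varies across Spark versions, but it should list only the requested columns):

// The Parquet scan node in the physical plan should mention only `first`
val pruned = sqlc.sql("SELECT first FROM nestedread")
pruned.explain(true)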
My Avro schema (a.avsc) is:
{
  "type": "record",
  "name": "FullName",
  "namespace": "com.snapdeal.event.avro",
  "fields": [{
    "name": "first",
    "type": ["string", "null"]
  }, {
    "name": "appType",
    "type": {
      "name": "app_types",
      "type": "enum",
      "symbols": ["WAP", "WEB", "APP"]
    }
  }]
}
I have run this locally. I first created a 1.7 GB file, and when I query a single column the Spark UI still shows the full 1.7 GB as input read.
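One way to see how much of the file a single column should account for is to read the Parquet footer and sum the column-chunk sizes per column. The sketch below uses the parquet-hadoop ParquetFileReader API (assuming parquet 1.8+, where the classes live under org.apache.parquet; readFooter is deprecated in newer releases but still works):

import scala.collection.JavaConverters._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.hadoop.ParquetFileReader

// Sum the on-disk (compressed) size of every column chunk, grouped by column,
// to see what fraction of the 1.7 GB the `first` column alone occupies.
val footer = ParquetFileReader.readFooter(new Configuration(), new Path("/home/parquet_simple.parquet"))
footer.getBlocks.asScala
  .flatMap(_.getColumns.asScala)
  .groupBy(_.getPath.toDotString)
  .foreach { case (col, chunks) =>
    println(col + ": " + chunks.map(_.getTotalSize).sum + " bytes")
  }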