I have a large HBase table named UserAction with three column families (song, album, singer). I need to fetch all of the data from the 'song' column family as a JavaRDD. I tried the code below, but it is not efficient. Is there a better way to do this?
static SparkConf sparkConf = new SparkConf().setAppName("test").setMaster("local[4]");
static JavaSparkContext jsc = new JavaSparkContext(sparkConf);

static void getRatings() {
    Configuration conf = HBaseConfiguration.create();
    conf.set(TableInputFormat.INPUT_TABLE, "UserAction");
    conf.set(TableInputFormat.SCAN_COLUMN_FAMILY, "song");

    // Scan the table; each record is (row key, Result) for one HBase row.
    JavaPairRDD<ImmutableBytesWritable, Result> hBaseRDD = jsc.newAPIHadoopRDD(
            conf,
            TableInputFormat.class,
            org.apache.hadoop.hbase.io.ImmutableBytesWritable.class,
            org.apache.hadoop.hbase.client.Result.class);

    JavaRDD<Rating> count = hBaseRDD
            // Turn every HBase row into its own small RDD of Ratings.
            .map(new Function<Tuple2<ImmutableBytesWritable, Result>, JavaRDD<Rating>>() {
                @Override
                public JavaRDD<Rating> call(Tuple2<ImmutableBytesWritable, Result> t)
                        throws Exception {
                    Result r = t._2;
                    int user = Integer.parseInt(Bytes.toString(r.getRow()));
                    ArrayList<Rating> ra = new ArrayList<>();
                    for (Cell c : r.rawCells()) {
                        int product = Integer.parseInt(
                                Bytes.toString(CellUtil.cloneQualifier(c)));
                        double rating = Double.parseDouble(
                                Bytes.toString(CellUtil.cloneValue(c)));
                        ra.add(new Rating(user, product, rating));
                    }
                    return jsc.parallelize(ra);
                }
            })
            // Union all of the per-row RDDs back into a single RDD.
            .reduce(new Function2<JavaRDD<Rating>, JavaRDD<Rating>, JavaRDD<Rating>>() {
                @Override
                public JavaRDD<Rating> call(JavaRDD<Rating> r1, JavaRDD<Rating> r2)
                        throws Exception {
                    return r1.union(r2);
                }
            });

    jsc.stop();
}
The schema of the 'song' column family is:
RowKey = userID, column qualifier = songID, value = rating.
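For reference, this is the flatMap-based direction I was considering instead, which emits the Ratings directly rather than building an RDD per row and unioning them all. This is only an untested sketch: it assumes the Spark 1.x Java API (where FlatMapFunction.call returns an Iterable, imported from org.apache.spark.api.java.function.FlatMapFunction, plus java.util.List) and reuses hBaseRDD from above. Is something like this the right approach, or is there a more efficient way?

    JavaRDD<Rating> ratings = hBaseRDD
            .flatMap(new FlatMapFunction<Tuple2<ImmutableBytesWritable, Result>, Rating>() {
                @Override
                public Iterable<Rating> call(Tuple2<ImmutableBytesWritable, Result> t)
                        throws Exception {
                    Result r = t._2;
                    // Per the schema: row key = userID.
                    int user = Integer.parseInt(Bytes.toString(r.getRow()));
                    List<Rating> ra = new ArrayList<>();
                    for (Cell c : r.rawCells()) {
                        // qualifier = songID, cell value = rating
                        int product = Integer.parseInt(
                                Bytes.toString(CellUtil.cloneQualifier(c)));
                        double rating = Double.parseDouble(
                                Bytes.toString(CellUtil.cloneValue(c)));
                        ra.add(new Rating(user, product, rating));
                    }
                    // Returning the list lets Spark flatten all rows into one JavaRDD<Rating>,
                    // with no RDD created inside a transformation.
                    return ra;
                }
            });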