In Java Avro, how do I parse data1, data2 and data3 below into a GenericRecord?
//Schema
{
"type": "record", "name": "user",
"fields": [
{"name": "name", "type": "string"},
{"name": "colour", "type": "string", "default": "green"},
{"name": "mass", "type": "int", "default": 100}
]
}
//data 1
{"name":"Sean"}
//data 2
{"name":"Sean", "colour":"red"}
//data 3
{"name":"Sean", "colour":"blue", "mass":200}
I've seen some discussion on schema evolution etc., and the ability to pass a writer's schema and a reader's schema to GenericDatumReader and ResolvingDecoder, but I only have one schema. In general I don't know what exact schema the writer used (if any).
I could "infer" a "base" schema by parsing the schema and removing all fields with default values. However, if there are multiple fields with default values some may/may not be present so I will not be able to infer the schema that would be compliant with the data.
For example
- If I try GenericDatumReader using the given schema to read data3 then parsing is successful.
- If I try GenericDatumReader using the inferred schema to read data1 then parsing is successful.
- If I try GenericDatumReader with a ResolvingDecoder using both the inferred schema and the given schema to read data1 then parsing is successful.
- All other options fail to parse data1 and data3 into GenericRecords containing all values from the JSON string and appropriate defaults for missing fields.
- And it doesn't seem to be possible to correctly parse data2 at all!
Has anyone any advice?
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.JsonDecoder;
import org.apache.avro.io.ResolvingDecoder;
/**
 * Demonstration of how Avro's JSON decoding behaves when the JSON input omits
 * fields that carry default values in the schema.
 *
 * <p>Three JSON documents are read in three ways: with a reduced "inferred"
 * schema that contains only the field without a default, with a
 * {@link ResolvingDecoder} that uses the inferred schema as writer and the full
 * schema as reader, and with the full schema directly. The comments next to
 * each read record the outcome the author observed.
 */
public class DefaultAvroTest2 {

    /** Full schema: {@code name} has no default; {@code colour} and {@code mass} declare defaults. */
    private static final String DEFINED_SCHEMA_JSON = "{" +
            " \"type\": \"record\"," +
            " \"name\": \"user\"," +
            " \"fields\": [" +
            " {\"name\": \"name\", \"type\": \"string\"}," +
            " {\"name\": \"colour\", \"type\": \"string\", \"default\": \"green\"}," +
            " {\"name\": \"mass\", \"type\": \"int\", \"default\": 100}" +
            " ]" +
            " }";

    /** The full schema with every defaulted field removed, leaving only {@code name}. */
    private static final String INFERRED_BASE_SCHEMA_JSON = "{" +
            " \"type\": \"record\"," +
            " \"name\": \"user\"," +
            " \"fields\": [" +
            " {\"name\": \"name\", \"type\": \"string\"}" +
            " ]" +
            " }";

    /** JSON input carrying only {@code name}. */
    private static final String DATA_1 = "{\"name\":\"Sean\"}";

    /** JSON input carrying {@code name} and {@code colour}. */
    private static final String DATA_2 = "{\"name\":\"Sean\", \"colour\":\"red\"}";

    /** JSON input carrying all three fields. */
    private static final String DATA_3 = "{\"name\":\"Sean\", \"colour\":\"blue\", \"mass\":200}";

    /**
     * Prints the inputs and both schemas, performs every read, then prints the
     * resulting records.
     *
     * @param args unused
     * @throws IOException if one of the Avro reads fails
     */
    public static void main(String[] args) throws IOException {
        System.out.println("\nObject 1 :\n" + DATA_1);
        System.out.println("\nObject 2 :\n" + DATA_2);
        System.out.println("\nObject 3 :\n" + DATA_3);

        Schema inferredBaseSchema = new Schema.Parser().parse(INFERRED_BASE_SCHEMA_JSON);
        Schema definedSchema = new Schema.Parser().parse(DEFINED_SCHEMA_JSON);
        System.out.println("\nProper schema :\n" + definedSchema.toString(true));
        System.out.println("\nA base schema that could be inferred from the proper schema :\n"
                + inferredBaseSchema.toString(true));

        // All reads happen before any result is printed, so a failing read
        // aborts the run before partial results appear.

        // Inferred schema only.
        // Correct
        GenericRecord object1Inferred = readJson(inferredBaseSchema, DATA_1);
        // Incorrect: colour is missing
        GenericRecord object2Inferred = readJson(inferredBaseSchema, DATA_2);
        // Incorrect: colour and mass are missing
        GenericRecord object3Inferred = readJson(inferredBaseSchema, DATA_3);

        // Resolving decoder: inferred schema as writer, full schema as reader.
        // Correct
        GenericRecord object1Resolved = readResolved(inferredBaseSchema, definedSchema, DATA_1);
        // Incorrect: colour is default(green) not red
        GenericRecord object2Resolved = readResolved(inferredBaseSchema, definedSchema, DATA_2);
        // Incorrect: colour is default(green) not blue, and mass is default(100) not 200
        GenericRecord object3Resolved = readResolved(inferredBaseSchema, definedSchema, DATA_3);

        // Full schema only. The reads of DATA_1 and DATA_2 are deliberately not
        // executed because they were observed to throw:
        //   DATA_1 -> org.apache.avro.AvroTypeException: Expected string. Got END_OBJECT
        //   DATA_2 -> org.apache.avro.AvroTypeException: Expected int. Got END_OBJECT
        // Correct
        GenericRecord object3Defined = readJson(definedSchema, DATA_3);

        // Correct
        System.out.println("\nObject 1 read with inferred schema:\n" + object1Inferred);
        // Incorrect: colour is missing
        System.out.println("\nObject 2 read with inferred schema:\n" + object2Inferred);
        // Incorrect: colour and mass are missing
        System.out.println("\nObject 3 read with inferred schema:\n" + object3Inferred);
        // Correct
        System.out.println("\nObject 1 read with resolving decoder:\n" + object1Resolved);
        // Incorrect: colour is default(green) not red
        System.out.println("\nObject 2 read with resolving decoder:\n" + object2Resolved);
        // Incorrect: colour is default(green) not blue, and mass is default(100) not 200
        System.out.println("\nObject 3 read with resolving decoder:\n" + object3Resolved);
        // Correct
        System.out.println("\nObject 3 read with defined schema:\n" + object3Defined);
    }

    /**
     * Reads one JSON document using the same schema for the JSON decoder and
     * the datum reader.
     *
     * @param schema schema handed to both the {@link JsonDecoder} and the reader
     * @param json   JSON text to decode
     * @return the decoded record
     * @throws IOException if decoding fails
     */
    private static GenericRecord readJson(Schema schema, String json) throws IOException {
        JsonDecoder decoder = DecoderFactory.get().jsonDecoder(schema, json);
        return new GenericDatumReader<GenericRecord>(schema).read(null, decoder);
    }

    /**
     * Reads one JSON document through a {@link ResolvingDecoder}.
     *
     * <p>The wrapped JSON decoder is built on the reader schema, not the writer
     * schema; this mirrors the experiment as originally written.
     *
     * @param writer schema passed as the writer schema of the resolving decoder
     * @param reader schema passed as the reader schema, and used for the JSON
     *               decoder and the datum reader
     * @param json   JSON text to decode
     * @return the decoded record
     * @throws IOException if decoding fails
     */
    private static GenericRecord readResolved(Schema writer, Schema reader, String json)
            throws IOException {
        ResolvingDecoder decoder = DecoderFactory.get().resolvingDecoder(
                writer, reader, DecoderFactory.get().jsonDecoder(reader, json));
        return new GenericDatumReader<GenericRecord>(reader).read(null, decoder);
    }
}
Output:
Object 1 :
{"name":"Sean"}
Object 2 :
{"name":"Sean", "colour":"red"}
Object 3 :
{"name":"Sean", "colour":"blue", "mass":200}
Proper schema :
{
"type" : "record",
"name" : "user",
"fields" : [ {
"name" : "name",
"type" : "string"
}, {
"name" : "colour",
"type" : "string",
"default" : "green"
}, {
"name" : "mass",
"type" : "int",
"default" : 100
} ]
}
A base schema that could be inferred from the proper schema :
{
"type" : "record",
"name" : "user",
"fields" : [ {
"name" : "name",
"type" : "string"
} ]
}
Object 1 read with inferred schema:
{"name": "Sean"}
Object 2 read with inferred schema:
{"name": "Sean"}
Object 3 read with inferred schema:
{"name": "Sean"}
Object 1 read with resolving decoder:
{"name": "Sean", "colour": "green", "mass": 100}
Object 2 read with resolving decoder:
{"name": "Sean", "colour": "green", "mass": 100}
Object 3 read with resolving decoder:
{"name": "Sean", "colour": "green", "mass": 100}
Object 3 read with defined schema:
{"name": "Sean", "colour": "blue", "mass": 200}