I store the following JSON objects in a Hive table:
{
  "main_id": "qwert",
  "features": [
    {
      "scope": "scope1",
      "name": "foo",
      "value": "ab12345",
      "age": 50,
      "somelist": ["abcde","fghij"]
    },
    {
      "scope": "scope2",
      "name": "bar",
      "value": "cd67890"
    },
    {
      "scope": "scope3",
      "name": "baz",
      "value": [
        "A",
        "B",
        "C"
      ]
    }
  ]
}
"features" is an array of varying length, i.e. all objects are optional. The objects have arbitrary elements, but all of them contain "scope", "name" and "value".
This is the Hive table I created:
CREATE TABLE tbl(
  main_id STRING,
  features array<struct<scope:STRING,name:STRING,value:array<STRING>,age:INT,somelist:array<STRING>>>
);
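(For reference: the DDL above only declares the schema; to actually read JSON text files into such a table, a JSON SerDe is typically added. A minimal sketch, assuming the hive-hcatalog JsonSerDe is available; the jar and data paths are placeholders:)

ADD JAR /path/to/hive-hcatalog-core.jar;   -- placeholder path to the SerDe jar

CREATE EXTERNAL TABLE tbl (
  main_id STRING,
  features array<struct<scope:STRING,name:STRING,value:array<STRING>,age:INT,somelist:array<STRING>>>
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE
LOCATION '/placeholder/path/to/json';   -- placeholder HDFS directory with one JSON document per line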
I need a Hive query that returns the main_id and the value of the struct with the name "baz", i.e.,
main_id baz_value
qwert ["A","B","C"]
My problem is that the Hive UDF "get_json_object" supports only a limited version of JSONPath. It does not support a path like get_json_object(features, '$.features[?(@.name='baz')]').
How can I query the wanted result with Hive? Or is it perhaps easier with another Hive table structure?
I found a solution for this:
Use the Hive explode UDTF to explode the struct array, i.e., create a second (temporary) table with one record for each struct in the array "features".
CREATE TABLE tbl_exploded as
select main_id,
f.name as f_name,
f.value as f_value
from tbl
LATERAL VIEW explode(features) exploded_table as f
-- optionally filter here instead of in 2nd query:
-- where f.name = 'baz';
The result of this is:
qwert, foo, ab12345
qwert, bar, cd67890
qwert, baz, ["A","B","C"]
Now you can select the main_id and value like this:
select main_id, f_value from tbl_exploded where f_name = 'baz';
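If you do not need the intermediate table, the same result can be produced in a single query by filtering directly on the exploded view (a sketch using the table and aliases from above):

SELECT main_id,
       f.value AS baz_value
FROM tbl
LATERAL VIEW explode(features) exploded_table AS f
WHERE f.name = 'baz';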
Another option that could work for you is the ParseJsonWithPath UDF:
ADD JAR your-path/ParseJsonWithPath.jar;
CREATE TEMPORARY FUNCTION parseJsonWithPath AS 'com.ntc.hive.udf.ParseJsonWithPath';
SELECT parseJsonWithPath(jsonStr, xpath) FROM ....
The field to be parsed can be a JSON string (jsonStr); given the path expression (xpath), you can extract what you want.
For example
jsonStr
{ "book": [
{
"category": "reference",
"author": "Nigel Rees",
"title": "Sayings of the Century",
"price": 8.95
},
{
"category": "fiction",
"author": "Evelyn Waugh",
"title": "Sword of Honour",
"price": 12.99
}
}
xpath
"$.book"
returns the inner JSON string [....]
"$.book[?(@.price < 10)]"
returns [8.95]
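Applied to the question, and assuming the raw JSON is also available as a plain string column (called json_str below; the column, the table raw_tbl and the exact filter syntax accepted by the UDF are assumptions), the call could look roughly like this:

ADD JAR your-path/ParseJsonWithPath.jar;
CREATE TEMPORARY FUNCTION parseJsonWithPath AS 'com.ntc.hive.udf.ParseJsonWithPath';

SELECT get_json_object(json_str, '$.main_id') AS main_id,
       parseJsonWithPath(json_str, '$.features[?(@.name == "baz")].value') AS baz_value
FROM raw_tbl;   -- hypothetical table holding the raw JSON strings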
The UDF pasted below is, I think, close to your needs. It takes an array<struct>, a string, and an integer. The string is the field name, in your case "name", and the third argument is the value to match on. Currently it expects an integer, but it should be relatively easy to change this to string/text for your purpose.
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantIntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantStringObjectInspector;

@Description(name = "extract_value",
    value = "_FUNC_( array<struct>, field_name, match_value ) - Return the first struct in the array whose field_name field equals match_value",
    extended = "Example:\n  SELECT _FUNC_(features, 'id', 50)")
public class StructFromArrayStructDynamicInt extends GenericUDF {

    private ListObjectInspector listOI;
    private StructObjectInspector structOI;
    private ObjectInspector indOI;
    private ObjectInspector arg1OI;
    private ObjectInspector arg2OI;
    private String indexName;
    WritableConstantStringObjectInspector element1OI;
    WritableConstantIntObjectInspector element2OI;

    @Override
    public ObjectInspector initialize(ObjectInspector[] args) throws UDFArgumentException {
        if (args.length != 3) {
            throw new UDFArgumentLengthException("The function extract_value() requires exactly three arguments.");
        }
        if (args[0].getCategory() != Category.LIST) {
            throw new UDFArgumentTypeException(0, "Type array<struct> is expected to be the argument for extract_value but " + args[0].getTypeName() + " is found instead");
        }
        if (args[1].getCategory() != Category.PRIMITIVE) {
            throw new UDFArgumentTypeException(1, "Second argument is expected to be primitive but " + args[1].getTypeName() + " is found instead");
        }
        if (args[2].getCategory() != Category.PRIMITIVE) {
            throw new UDFArgumentTypeException(2, "Third argument is expected to be primitive but " + args[2].getTypeName() + " is found instead");
        }

        listOI = (ListObjectInspector) args[0];
        structOI = (StructObjectInspector) listOI.getListElementObjectInspector();
        arg1OI = (StringObjectInspector) args[1];
        arg2OI = args[2];

        // The second argument must be a constant string naming the struct field to match on,
        // and the third argument must be a constant int holding the value to match.
        this.element1OI = (WritableConstantStringObjectInspector) arg1OI;
        this.element2OI = (WritableConstantIntObjectInspector) arg2OI;
        indexName = element1OI.getWritableConstantValue().toString();

        // Make sure the match field exists in the struct.
        StructField indexField = structOI.getStructFieldRef(indexName);
        if (indexField == null) {
            throw new UDFArgumentTypeException(0, "Index field not in input structure");
        }

        // Store the object inspector of the match field for use in evaluate().
        indOI = indexField.getFieldObjectInspector();

        // Check that the match field and the constant arguments have the expected primitive types.
        if (indOI.getCategory() != Category.PRIMITIVE) {
            throw new UDFArgumentTypeException(0, "index field must be of primitive type");
        }
        if (((PrimitiveObjectInspector) indOI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.LONG) {
            throw new UDFArgumentTypeException(0, "index field must be of long type");
        }
        if (((PrimitiveObjectInspector) arg1OI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
            throw new UDFArgumentTypeException(1, "second arg must be of string type");
        }
        if (((PrimitiveObjectInspector) arg2OI).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.INT) {
            throw new UDFArgumentTypeException(2, "third arg must be of int type");
        }

        // The return type is a single struct, i.e. the element type of the input array.
        return listOI.getListElementObjectInspector();
    }

    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        if (arguments.length != 3) {
            return null;
        }
        if (arguments[0].get() == null) {
            return null;
        }

        int numElements = listOI.getListLength(arguments[0].get());
        long matchValue = element2OI.get(arguments[2].get());

        // Scan the array and return the first struct whose match field equals the given value.
        for (int i = 0; i < numElements; i++) {
            long indValue = (Long) structOI.getStructFieldData(listOI.getListElement(arguments[0].get(), i), structOI.getStructFieldRef(indexName));
            if (indValue == matchValue) {
                return listOI.getListElement(arguments[0].get(), i);
            }
        }
        return null;
    }

    @Override
    public String getDisplayString(String[] strings) {
        assert (strings.length > 0);
        StringBuilder sb = new StringBuilder();
        sb.append("extract_value(");
        sb.append(strings[0]);
        sb.append(")");
        return sb.toString();
    }
}
Here is the code for this and a couple of other working UDFs that do things with array<struct>.
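To round this off, a sketch of how the UDF could be registered and called (the jar path, the function name extract_struct, the table tbl2 and its BIGINT struct field id are all assumptions; as written, the UDF requires the matched struct field to be a bigint and the third argument to be an int constant):

ADD JAR /path/to/your-udfs.jar;   -- placeholder
CREATE TEMPORARY FUNCTION extract_struct AS 'StructFromArrayStructDynamicInt';

-- extract_struct returns the first struct whose "id" field equals 50;
-- the outer query then picks that struct's "value" field.
SELECT main_id,
       matched.value AS matched_value
FROM (
  SELECT main_id,
         extract_struct(features, 'id', 50) AS matched
  FROM tbl2
) t;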