Boost::Spirit : Optimizing an expression parser

2019-05-21 04:22发布

I'm trying to write a program to parse and evaluate mathematical, litteral and boolean expressions, for example :

  • "(9/3) == 3+3*2" would be parsed as "(9/3) == (3+(3*2))" and evaluted as "false"
  • "1+2/3" would be parsed as "1+(2/3)" and evaluated as "6".
  • "this + is + a + test" would be parsed as "this+is+a+test" and evaluated as "thisisatest"

The program correctly parses and solves what I'm giving it, but as soon as I put parenthesis in the expressions, the parsing takes a ridiculous amount of time.

I've based my work on Sehe's impressively exhaustive answer on how to write a boolean grammar parser. I've added new operators (+, -, /, *, ==, !=) following that example.

Here is the parser

    // DEFINING TYPES
struct op_not {};
struct op_or {};
struct op_and {};
struct op_xor {};
struct op_equal {};
struct op_unequal {};
struct op_sum {};
struct op_difference {};
struct op_factor {};
struct op_division {};

typedef ustring var;
template <typename tag> struct binop;
template <typename tag> struct unop;

typedef boost::variant<var,
    boost::recursive_wrapper<unop <op_not> >,
    boost::recursive_wrapper<binop<op_equal> >,
    boost::recursive_wrapper<binop<op_unequal> >,
    boost::recursive_wrapper<binop<op_and> >,
    boost::recursive_wrapper<binop<op_xor> >,
    boost::recursive_wrapper<binop<op_or> >,
    boost::recursive_wrapper<binop<op_difference> >,
    boost::recursive_wrapper<binop<op_sum> >,
    boost::recursive_wrapper<binop<op_factor> >,
    boost::recursive_wrapper<binop<op_division> >
> expressionContainer;

template <typename tag> struct binop
{
    explicit binop(const expressionContainer& l
        , const expressionContainer& r)
        : oper1(l), oper2(r) { }
    expressionContainer oper1, oper2;
};

template <typename tag> struct unop
{
    explicit unop(const expressionContainer& o) : oper1(o) { }
    expressionContainer oper1;
};


    // EXPRESSION PARSER
template <typename It, typename Skipper = boost::spirit::standard_wide::space_type>
struct parserExpression : qi::grammar<It, expressionContainer(), Skipper>
{
    parserExpression() : parserExpression::base_type(expr_)
    {
        using namespace qi;

        expr_ = or_.alias();

        // Logical Operators
        or_ = (xor_ >> orOperator_ >> or_) [_val = boost::phoenix::construct<Expression::binop<op_or >>(_1, _3)]    | xor_[_val = _1];
        xor_ = (and_ >> xorOperator_ >> xor_) [_val = boost::phoenix::construct<Expression::binop<op_xor>>(_1, _3)]     | and_[_val = _1];
        and_ = (equal_ >> andOperator_ >> and_) [_val = boost::phoenix::construct<Expression::binop<op_and>>(_1, _3)]   | equal_[_val = _1];
        equal_ = (unequal_ >> equalOperator_ >> equal_) [_val = boost::phoenix::construct<Expression::binop<op_equal>>(_1, _3)] | unequal_[_val = _1];
        unequal_ = (factor_ >> unequalOperator_ >> unequal_) [_val = boost::phoenix::construct<Expression::binop<op_unequal>>(_1, _3)] | factor_[_val = _1];

        // Numerical Operators
        factor_ = (division_ >> factorOperator_ >> factor_) [_val = boost::phoenix::construct<Expression::binop<op_factor>>(_1, _3)] | division_[_val = _1];
        division_ = (sum_ >> divisionOperator_ >> division_) [_val = boost::phoenix::construct<Expression::binop<op_division>>(_1, _3)] | sum_[_val = _1];
        sum_ = (difference_ >> sumOperator_ >> sum_) [_val = boost::phoenix::construct<Expression::binop<op_sum>>(_1, _3)] | difference_[_val = _1];
        difference_ = (not_ >> differenceOperator_ >> difference_) [_val = boost::phoenix::construct<Expression::binop<op_difference>>(_1, _3)] | not_[_val = _1];

        // UNARY OPERATIONS
        not_ = (notOperator_ > simple) [_val = boost::phoenix::construct<Expression::unop <op_not>>(_2)] | simple[_val = _1];

        simple = (('(' > expr_ > ')') | var_);
        var_ = qi::lexeme[+alnum];

        notOperator_        = qi::char_('!');
        andOperator_        = qi::string("&&");
        orOperator_         = qi::string("||");
        xorOperator_        = qi::char_("^");
        equalOperator_      = qi::string("==");
        unequalOperator_    = qi::string("!=");
        sumOperator_        = qi::char_("+");
        differenceOperator_ = qi::char_("-");
        factorOperator_     = qi::char_("*");
        divisionOperator_   = qi::char_("/");

        /*BOOST_SPIRIT_DEBUG_NODE(expr_);
        BOOST_SPIRIT_DEBUG_NODE(or_);
        BOOST_SPIRIT_DEBUG_NODE(xor_);
        BOOST_SPIRIT_DEBUG_NODE(and_);
        BOOST_SPIRIT_DEBUG_NODE(not_);
        BOOST_SPIRIT_DEBUG_NODE(simple);
        BOOST_SPIRIT_DEBUG_NODE(var_);
        BOOST_SPIRIT_DEBUG_NODE(notOperator_);
        BOOST_SPIRIT_DEBUG_NODE(andOperator_);
        BOOST_SPIRIT_DEBUG_NODE(orOperator_);
        BOOST_SPIRIT_DEBUG_NODE(xorOperator_);
        BOOST_SPIRIT_DEBUG_NODE(sumOperator_);
        BOOST_SPIRIT_DEBUG_NODE(differenceOperator_);
        BOOST_SPIRIT_DEBUG_NODE(factorOperator_);
        BOOST_SPIRIT_DEBUG_NODE(divisionOperator_);*/
    }

private:
    qi::rule<It, var(), Skipper> var_;
    qi::rule<It, expressionContainer(), Skipper> not_
        , and_
        , xor_
        , or_
        , equal_
        , unequal_
        , sum_
        , difference_
        , factor_
        , division_
        , simple
        , expr_;

    qi::rule<It, ustring(), Skipper> notOperator_
        , andOperator_
        , orOperator_
        , xorOperator_
        , equalOperator_
        , unequalOperator_
        , sumOperator_
        , differenceOperator_
        , factorOperator_
        , divisionOperator_;
};

With the above code, on my computer (running an Intel I5 CPU) :

  • parsing "1 + 2 - 3 * 4 / 5 == 6 != 7 && 8 || 9 ^ 8 * 7 / 6 ^ 5 && 4 || 3 != 2 == 1" is instantaneous.
  • parsing "(1)" takes approximately 200ms

Spirit's performance having been proved before, I'm left with the obvious question : what could I improve?

1条回答
对你真心纯属浪费
2楼-- · 2019-05-21 05:19

Your problem is that (1) is the worst possible case for backtracking with that grammar. Let's study a simplified example:

or_ = (and_ >> '|' >> or_) | and_;
and_ = (not_ >> '&' >> and_) | not_;
not_ = ('!' >> simple_) | simple_;
simple_ = ('(' >> or_ >> ')') | var_;

And here is a step by step walkthrough:

  • We try or_
    • We try and_
      • We try not_
        • We try '!', '!' >> simple_ fails
        • We try simple
          • We try '(', it matches
          • We try or_
            • We try and_
              • We try not_
                • We try '!', '!' >> simple_ fails
                • We try simple
                  • We try '(', '(' >> or_ >> ')' fails
                  • We try var_, it matches
                • simple_ suceeds
              • not_ suceeds
              • We try '&', not_ >> '&' >> and_ fails (the previous simple_ and not_ matches are discarded)
              • We try not_ (the one that is alone)
                • everything as before
              • not_ succeeds
            • and_ succeeds
            • We try '|', and_ >> '|' >> or_ fails (and_, not_ and simple_ matches discarded)
            • We try and_(the one alone)
              • everything as before
            • and_ succeeds
          • or_ succeeds
          • We try ')', '(' >> or_ >> ')' succeeds
        • simple_ succeeds
      • not_ succeeds
      • We try '&', not_ >> '&' >> and_ fails (everything is discarded)
      • We try not_ (the one alone)
        • everything as before
      • not_ suceeds
    • and_ suceeds
    • We try '|', and_ >> '|' >> or_ fails (everything is discarded)
    • We try and_ (the one alone)
      • everything as before
    • and_ succeeds
  • or_ succeeds

And that is with only two binary rules, your case is much worse.

You could possibly use something like:

or_ = and_[_val=_1] >> -( '|' >> or_ )[_val=construct<binop<op_or> >(_val,_1)]; 

and, although even uglier than before, it will not discard any of the matches.

One problem I don't know you if you have noticed is that the result of the parse is right-associative (meaning 3-2-1 => 3-(2-1)). I think that something like:

or_ = and_[_val=_1] >> *( '|' >> and_)[_val=construct<binop<op_or> >(_val,_1)]; //note the `and_` instead of `or_` after '|'

could solve the problem, but I haven't tested it.

Also because of the way you have arranged your rules you have given + and - a higher priority than * and /.

Trying to solve these problems (and to remove the semantic actions) I have come up with a custom directive that seems to work, you would use it like this:

or_ = fold<binop<op_or> >(xor_.alias())['|' >> xor_]; //sadly the `.alias()` is required

The directive parses the initial parser (xor_.alias()) and tries the subject many times. If the subject never succeeds the final attribute is the attribute of the initial parser. If the subject succeeds the final attribute will be binop<op_or>(initial_attr,subject_attr)/binop<op_or>(binop<op_or>(initial_attr,subject_attr1),subject_attr2)/...

Full Sample (Running on WandBox)

custom_fold_directive.hpp

namespace custom
{
    namespace tag
    {
        struct fold { BOOST_SPIRIT_IS_TAG() };
    }

    template <typename Exposed, typename Expr>
    boost::spirit::stateful_tag_type<Expr, tag::fold, Exposed>
    fold(Expr const& expr)
    {
        return boost::spirit::stateful_tag_type<Expr, tag::fold, Exposed>(expr);
    }

}

namespace boost { namespace spirit 
{
    template <typename Expr, typename Exposed>
    struct use_directive<qi::domain
          , tag::stateful_tag<Expr, custom::tag::fold, Exposed> >
      : mpl::true_ {};
}}

namespace custom
{
    template <typename Exposed, typename InitialParser, typename RepeatingParser>
    struct fold_directive
    {
        fold_directive(InitialParser const& initial, RepeatingParser const& repeating):initial(initial),repeating(repeating){}

        template <typename Context, typename Iterator>
        struct attribute
        {
            typedef typename boost::spirit::traits::attribute_of<InitialParser,Context,Iterator>::type type;//This works in this case but is not generic
        };

        template <typename Iterator, typename Context
          , typename Skipper, typename Attribute>
        bool parse(Iterator& first, Iterator const& last
          , Context& context, Skipper const& skipper, Attribute& attr_) const
        {
            Iterator start = first;

            typename boost::spirit::traits::attribute_of<InitialParser,Context,Iterator>::type initial_attr;


            if (!initial.parse(first, last, context, skipper, initial_attr))
            {
                first=start;
                return false;
            }

            typename boost::spirit::traits::attribute_of<RepeatingParser,Context,Iterator>::type repeating_attr;

            if(!repeating.parse(first, last, context, skipper, repeating_attr))
            {
                boost::spirit::traits::assign_to(initial_attr, attr_);
                return true;
            }
            Exposed current_attr(initial_attr,repeating_attr);

            while(repeating.parse(first, last, context, skipper, repeating_attr))
            {
                boost::spirit::traits::assign_to(Exposed(current_attr,repeating_attr),current_attr);
            }
            boost::spirit::traits::assign_to(current_attr,attr_);
            return true;
        }

        template <typename Context>
        boost::spirit::info what(Context& context) const
        {
            return boost::spirit::info("fold");
        }

        InitialParser initial;
        RepeatingParser repeating;
    };
}

namespace boost { namespace spirit { namespace qi
{
    template <typename Expr, typename Exposed, typename Subject, typename Modifiers>
    struct make_directive<
        tag::stateful_tag<Expr, custom::tag::fold, Exposed>, Subject, Modifiers>
    {
        typedef custom::fold_directive<Exposed, Expr, Subject> result_type;

        template <typename Terminal>
        result_type operator()(Terminal const& term, Subject const& subject, Modifiers const&) const
        {
            typedef tag::stateful_tag<
                Expr, custom::tag::fold, Exposed> tag_type;
            using spirit::detail::get_stateful_data;

            return result_type(get_stateful_data<tag_type>::call(term),subject);
        }
    };
}}}

main.cpp

//#define BOOST_SPIRIT_DEBUG
#include <iostream>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include "custom_fold_directive.hpp"

namespace qi = boost::spirit::qi;


// DEFINING TYPES
struct op_not {};
struct op_or {};
struct op_and {};
struct op_xor {};
struct op_equal {};
struct op_unequal {};
struct op_sum {};
struct op_difference {};
struct op_factor {};
struct op_division {};

namespace Expression{

typedef std::string var;
template <typename tag> struct binop;
template <typename tag> struct unop;

typedef boost::variant<var,
    boost::recursive_wrapper<unop <op_not> >,
    boost::recursive_wrapper<binop<op_equal> >,
    boost::recursive_wrapper<binop<op_unequal> >,
    boost::recursive_wrapper<binop<op_and> >,
    boost::recursive_wrapper<binop<op_xor> >,
    boost::recursive_wrapper<binop<op_or> >,
    boost::recursive_wrapper<binop<op_difference> >,
    boost::recursive_wrapper<binop<op_sum> >,
    boost::recursive_wrapper<binop<op_factor> >,
    boost::recursive_wrapper<binop<op_division> >
> expressionContainer;


template <typename tag> struct binop
{
    explicit binop(const expressionContainer& l
        , const expressionContainer& r)
        : oper1(l), oper2(r) { }
    expressionContainer oper1, oper2;

    friend std::ostream& operator<<(std::ostream& os, const binop& val)
    {
        os << "(" << typeid(tag).name() << " " << val.oper1 << ", "<< val.oper2 << ")";
        return os;
    }
};

template <typename tag> struct unop
{
    explicit unop(const expressionContainer& o) : oper1(o) { }
    expressionContainer oper1;

    friend std::ostream& operator<<(std::ostream& os, const unop& val)
    {
        os << "(" << typeid(tag).name() << " " << val.oper1 << ")";
        return os;
    }
};

}

    // EXPRESSION PARSER
template <typename It, typename Skipper = boost::spirit::standard_wide::space_type>
struct parserExpression : qi::grammar<It, Expression::expressionContainer(), Skipper>
{
    parserExpression() : parserExpression::base_type(expr_)
    {
        using namespace qi;
        using namespace Expression;
        using custom::fold;

        expr_ = or_.alias();

        // Logical Operators
        or_ = fold<binop<op_or> >(xor_.alias())[orOperator_ >> xor_];
        xor_ = fold<binop<op_xor> >(and_.alias())[xorOperator_ >> and_];
        and_ = fold<binop<op_and> >(equal_.alias())[andOperator_ >> equal_];
        equal_ = fold<binop<op_equal> >(unequal_.alias())[equalOperator_ >> unequal_]; 
        unequal_ = fold<binop<op_unequal> >(sum_.alias())[unequalOperator_ >> sum_];

        // Numerical Operators
        sum_ = fold<binop<op_sum> >(difference_.alias())[sumOperator_ >> difference_];
        difference_ = fold<binop<op_difference> >(factor_.alias())[differenceOperator_ >> factor_];
        factor_ = fold<binop<op_factor> >(division_.alias())[factorOperator_ >> division_]; 
        division_ = fold<binop<op_division> >(not_.alias())[divisionOperator_ >> not_];

        // UNARY OPERATIONS
        not_ = (notOperator_ > simple) [_val = boost::phoenix::construct<Expression::unop <op_not>>(_1)] | simple[_val = _1];

        simple = (('(' > expr_ > ')') | var_);
        var_ = qi::lexeme[+alnum];

        notOperator_        = qi::char_('!');
        andOperator_        = qi::string("&&");
        orOperator_         = qi::string("||");
        xorOperator_        = qi::char_("^");
        equalOperator_      = qi::string("==");
        unequalOperator_    = qi::string("!=");
        sumOperator_        = qi::char_("+");
        differenceOperator_ = qi::char_("-");
        factorOperator_     = qi::char_("*");
        divisionOperator_   = qi::char_("/");

        BOOST_SPIRIT_DEBUG_NODES((expr_)(or_)(xor_)(and_)(equal_)(unequal_)(sum_)(difference_)(factor_)(division_)(simple)(notOperator_)
                                 (andOperator_)(orOperator_)(xorOperator_)(equalOperator_)(unequalOperator_)(sumOperator_)(differenceOperator_)(factorOperator_)(divisionOperator_));

    }

private:
    qi::rule<It, Expression::var(), Skipper> var_;
    qi::rule<It, Expression::expressionContainer(), Skipper> not_
        , and_
        , xor_
        , or_
        , equal_
        , unequal_
        , sum_
        , difference_
        , factor_
        , division_
        , simple
        , expr_;

    qi::rule<It, Skipper> notOperator_
        , andOperator_
        , orOperator_
        , xorOperator_
        , equalOperator_
        , unequalOperator_
        , sumOperator_
        , differenceOperator_
        , factorOperator_
        , divisionOperator_;
};

void parse(const std::string& str)
{
    std::string::const_iterator iter = str.begin(), end = str.end();

    parserExpression<std::string::const_iterator,qi::space_type> parser;
    Expression::expressionContainer expr;

    bool result = qi::phrase_parse(iter,end,parser,qi::space, expr);

    if(result && iter==end)
    {
        std::cout << "Success." << std::endl;
        std::cout << str << " => " << expr << std::endl;
    }
    else
    {
        std::cout << "Failure." << std::endl;
    }
}

int main()
{
    parse("(1)");
    parse("3-2-1");
    parse("a+b*c");
    parse("a*b+c");
    parse("(a+b)*c");
    parse("a*b+c*(d+e)&&true||false");
}
查看更多
登录 后发表回答