Boost.Spirit 将表达式转换为 AST

Vla*_*ylo 2 c++ boost abstract-syntax-tree boost-spirit

使用 Boost.Spirit 将某些表达式转换为 AST 的正确方法是什么?

我试图构建它,但我认为它很混乱并且可以简化很多。

https://godbolt.org/z/VXHXLY

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>

namespace ast {
  struct unary_operator;
  struct binary_operator;

  struct expression {
    typedef boost::variant<
      double,
      std::string,
      boost::recursive_wrapper<unary_operator>,
      boost::recursive_wrapper<binary_operator>,
      boost::recursive_wrapper<expression>
    > type;

    expression() {

    }

    template<typename Expr>
    expression(const Expr &expr)
      : expr(expr) {

    }

    expression &operator+=(expression rhs);
    expression &operator-=(expression rhs);

    expression &operator*=(expression rhs);
    expression &operator/=(expression rhs);

    expression &and_(expression rhs);
    expression &or_(expression rhs);

    expression &equals(expression rhs);
    expression &not_equals(expression rhs);

    expression &less_than(expression rhs);
    expression &less_equals(expression rhs);
    expression &greater_than(expression rhs);
    expression &greater_equals(expression rhs);

    expression &factor(expression rhs);

    expression &dot(expression rhs);

    type expr;
  };

  struct unary_operator {
    std::string op;
    expression rhs;

    unary_operator() {}

    unary_operator(std::string op, expression rhs)
      : op(std::move(op)), rhs(std::move(rhs)) {
    }
  };

  struct binary_operator {
    std::string op;
    expression lhs;
    expression rhs;

    binary_operator() {}

    binary_operator(std::string op, expression lhs, expression rhs)
      : op(std::move(op)), lhs(std::move(lhs)), rhs(std::move(rhs)) {
    }
  };

  expression &expression::operator+=(expression rhs) {
    expr = binary_operator("+", std::move(expr), std::move(rhs));
    return *this;
  }

  expression &expression::operator-=(expression rhs) {
    expr = binary_operator("-", std::move(expr), std::move(rhs));
    return *this;
  }

  expression &expression::operator*=(expression rhs) {
    expr = binary_operator("*", std::move(expr), std::move(rhs));
    return *this;
  }

  expression &expression::operator/=(expression rhs) {
    expr = binary_operator("/", std::move(expr), std::move(rhs));
    return *this;
  }

  expression &expression::and_(expression rhs) {
    expr = binary_operator("&&", std::move(expr), std::move(rhs));
    return *this;
  }

  expression &expression::or_(expression rhs) {
    expr = binary_operator("||", std::move(expr), std::move(rhs));
    return *this;
  }


  expression &expression::equals(expression rhs) {
    expr = binary_operator("==", std::move(expr), std::move(rhs));
    return *this;
  }

  expression &expression::not_equals(expression rhs) {
    expr = binary_operator("!=", std::move(expr), std::move(rhs));
    return *this;
  }


  expression &expression::less_than(expression rhs) {
    expr = binary_operator("<", std::move(expr), std::move(rhs));
    return *this;
  }

  expression &expression::less_equals(expression rhs) {
    expr = binary_operator("<=", std::move(expr), std::move(rhs));
    return *this;
  }

  expression &expression::greater_than(expression rhs) {
    expr = binary_operator(">", std::move(expr), std::move(rhs));
    return *this;
  }

  expression &expression::greater_equals(expression rhs) {
    expr = binary_operator(">=", std::move(expr), std::move(rhs));
    return *this;
  }


  expression &expression::factor(expression rhs) {
    expr = binary_operator("**", std::move(expr), std::move(rhs));
    return *this;
  }


  expression &expression::dot(expression rhs) {
    expr = binary_operator(".", std::move(expr), std::move(rhs));
    return *this;
  }

  struct printer {
    void operator()(const double n) const {
      std::cout << n;
    }

    void operator()(const std::string &s) const {
      std::cout << s;
    }

    void operator()(const expression &ast) const {
      boost::apply_visitor(*this, ast.expr);
    }

    void operator()(const binary_operator &expr) const {
      std::cout << "op:" << expr.op << "(";
      boost::apply_visitor(*this, expr.lhs.expr);
      std::cout << ", ";
      boost::apply_visitor(*this, expr.rhs.expr);
      std::cout << ')';
    }

    void operator()(const unary_operator &expr) const {
      std::cout << "op:" << expr.op << "(";
      boost::apply_visitor(*this, expr.rhs.expr);
      std::cout << ')';
    }
  };

  struct operators {
    struct and_ {
    };
    struct or_ {
    };

    struct equals {
    };
    struct not_equals {
    };

    struct less_than {
    };
    struct less_equals {
    };
    struct greater_than {
    };
    struct greater_equals {
    };

    struct factor {
    };

    struct dot {
    };

    expression &operator()(expression &lhs, expression rhs, and_) const {
      return lhs.and_(std::move(rhs));
    }

    expression &operator()(expression &lhs, expression rhs, or_) const {
      return lhs.or_(std::move(rhs));
    }


    expression &operator()(expression &lhs, expression rhs, equals) const {
      return lhs.equals(std::move(rhs));
    }

    expression &operator()(expression &lhs, expression rhs, not_equals) const {
      return lhs.not_equals(std::move(rhs));
    }


    expression &operator()(expression &lhs, expression rhs, less_than) const {
      return lhs.less_than(std::move(rhs));
    }

    expression &operator()(expression &lhs, expression rhs, less_equals) const {
      return lhs.less_equals(std::move(rhs));
    }

    expression &operator()(expression &lhs, expression rhs, greater_than) const {
      return lhs.greater_than(std::move(rhs));
    }

    expression &operator()(expression &lhs, expression rhs, greater_equals) const {
      return lhs.greater_equals(std::move(rhs));
    }

    expression &operator()(expression &lhs, expression rhs, factor) const {
      return lhs.factor(std::move(rhs));
    }


    expression &operator()(expression &lhs, expression rhs, dot) const {
      return lhs.dot(std::move(rhs));
    }
  };
}

namespace qi = boost::spirit::qi;

struct expectation_handler {
  template<typename Iterator>
  void operator()(Iterator first, Iterator last, const boost::spirit::info &info) const {
    std::stringstream msg;
    msg << "Expected " << info << " at \"" << std::string(first, last) << "\"";
    throw std::runtime_error(msg.str());
  }
};

template<typename Iterator>
struct grammar : qi::grammar<Iterator, ast::expression(), qi::ascii::space_type> {
  grammar()
    : grammar::base_type(expression) {

    variable = qi::lexeme[qi::alpha >> *(qi::alnum | '_')];

    expression = logical.alias() > qi::eoi;

    logical = equality[qi::_val = qi::_1]
      >> *(
        ((qi::lit("&&") > equality[op(qi::_val, qi::_1, ast::operators::and_{})]) |
         (qi::lit("||") > equality[op(qi::_val, qi::_1, ast::operators::or_{})]))
      );

    equality = relational[qi::_val = qi::_1]
      >> *(
        ((qi::lit("==") > relational[op(qi::_val, qi::_1, ast::operators::equals{})]) |
         (qi::lit("!=") > relational[op(qi::_val, qi::_1, ast::operators::not_equals{})]))
      );

    relational = additive[qi::_val = qi::_1]
      >> *(
        ((qi::lit("<") > relational[op(qi::_val, qi::_1, ast::operators::less_than{})]) |
         (qi::lit("<=") > relational[op(qi::_val, qi::_1, ast::operators::less_equals{})]) |
         (qi::lit(">") > relational[op(qi::_val, qi::_1, ast::operators::greater_than{})]) |
         (qi::lit(">=") > relational[op(qi::_val, qi::_1, ast::operators::greater_equals{})]))
      );

    additive = multiplicative[qi::_val = qi::_1]
      >> *(
        ((qi::lit("+") > multiplicative[qi::_val += qi::_1]) |
         (qi::lit("-") > multiplicative[qi::_val -= qi::_1]))
      );

    multiplicative = factor[qi::_val = qi::_1]
      >> *(
        ((qi::lit("*") > factor[qi::_val *= qi::_1]) |
         (qi::lit("/") > factor[qi::_val /= qi::_1]))
      );

    factor = primary[qi::_val = qi::_1]
      >> *((qi::lit("**")) > primary[op(qi::_val, qi::_1, ast::operators::factor{})]);

    primary =
      qi::double_[qi::_val = qi::_1]
      | ('(' > expression[qi::_val = qi::_1] > ')')
        >> *(qi::char_('.') > variable[qi::_val = op(qi::_val, qi::_1, ast::operators::dot{})])
      | variable[qi::_val = qi::_1]
        >> *(qi::char_('.') > variable[qi::_val = op(qi::_val, qi::_1, ast::operators::dot{})]);

    qi::on_error<qi::fail>(
      expression,
      boost::phoenix::bind(boost::phoenix::ref(err_handler), qi::_3, qi::_2, qi::_4));
  }

  qi::rule<Iterator, ast::expression(), qi::ascii::space_type> expression, logical, equality, relational, additive, multiplicative, factor, unary, binary, primary;
  qi::rule<Iterator, std::string()> variable;
  boost::phoenix::function<ast::operators> op;
  expectation_handler err_handler;
};

int main(int argc, const char *argv[]) {
  std::string input("2 + 5 + t.a");
  auto it_begin(input.begin()), it_end(input.end());

  grammar<decltype(it_begin)> parser;

  ast::expression expression;
  qi::phrase_parse(it_begin, it_end, parser, qi::ascii::space, expression);

  ast::printer printer;
  printer(expression);

  return 0;
}
Run Code Online (Sandbox Code Playgroud)

印刷

op:+(op:+(2, 5), op:.(t, a))
Run Code Online (Sandbox Code Playgroud)

seh*_*ehe 5

我会按照我“发现”你的代码的顺序来叙述这个。然后我将介绍一些我认为最后最重要的调整。

我喜欢你所做的很多事情。

  1. 可以(应该?)改进一些名称。例如,ast::operators没有任何暗示其目的。它是二元运算符表达式的惰性工厂。

    所以,命名它make_binary或类似的。

    phoenix::function<>包装它的包装器相同。op在语义动作中并没有很好地表达那里发生的事情。

  2. 与其让op(别名make_binary)actor 对 _val 参数产生副作用,不如考虑让它返回一个不同的值。然后一切都可以变得不可变,语义动作更好地表达意图:

    rule = expr [ _val = foo(_val, _1, _2, _3) ];
    
    Run Code Online (Sandbox Code Playgroud)

    表示 _val 已更新为根据给定参数创建的内容。

  3. 在语法层面,事情看起来并不“整洁”。很多都可以通过简单地改进using namespace qi::labels,并摆脱多余的qi::lit()包装器,这会发生变化,例如

    logical = equality[qi::_val = qi::_1]
      >> *(
        ((qi::lit("&&") > equality[op(qi::_val, qi::_1, ast::operators::and_{})]) |
         (qi::lit("||") > equality[op(qi::_val, qi::_1, ast::operators::or_{})]))
      );
    
    Run Code Online (Sandbox Code Playgroud)

    进入

    using ast::operators;
    using namespace qi::labels;
    
    logical = equality[_val = _1]
      >> *(
        (("&&" > equality[op(_val, _1, operators::and_{})]) |
         ("||" > equality[op(_val, _1, operators::or_{})]))
      );
    
    Run Code Online (Sandbox Code Playgroud)
  4. 你检查eoi你的语法(对你有好处!)。然而,它被放在一个递归规则中:

    expression = logical.alias() > qi::eoi;
    
    Run Code Online (Sandbox Code Playgroud)

    这意味着(a+b)*3永远不会解析,因为)eoi需要的地方找到。通过放在eoi顶层来修复它。

  5. 您在语法级别有一个船长,这意味着人们必须传递正确的船长。如果他们不这样做,他们可能会破坏语法。相反,将船长设置为内部,以便您控制它,并且界面更易于使用(正确):

    start = qi::skip(qi::ascii::space) [ expression ];
    
    Run Code Online (Sandbox Code Playgroud)

    用法:

    if (qi::parse(it_begin, it_end, parser, expression)) {
    
    Run Code Online (Sandbox Code Playgroud)

    也许:

    if (qi::parse(it_begin, it_end, parser > qi::eoi, expression)) {
    
    Run Code Online (Sandbox Code Playgroud)
  6. 我意识到驱动程序代码 ( main) 可能超出了您的审查范围,但我将向您展示缺少的错误处理,因为它可能是非常微妙的部分解析:

    int main() {
        ast::printer printer;
        grammar<std::string::const_iterator> parser;
    
        for (std::string const input : {
                "2 + 5 + t.a",
                "(2 + 5) + t.a", // note the removed eoi constraint
                "2 + 5 * t.a",
                "2 * 5 - t.a",
                "partial match",
                "uhoh *",
            })
        try {
            std::cout << "----- " << std::quoted(input) << " ---- \n";
            auto it_begin(input.begin()), it_end(input.end());
    
            ast::expression expression;
            if (qi::parse(it_begin, it_end, parser, expression)) {
                printer(expression);
                std::cout << std::endl;
            } else {
                std::cout << "Not matched\n";
            }
    
            if (it_begin != it_end) {
                std::string tail(it_begin, it_end);
                std::cout << "Remaining unparsed input: " << std::quoted(tail) << "\n";
            }
        } catch(std::exception const& e) {
            std::cout << "Exception: " << std::quoted(e.what()) << "\n";
        }
    }
    
    Run Code Online (Sandbox Code Playgroud)
  7. 请注意,除非您命名规则,否则期望不会提供有用的消息。

    Exception: Expected <unnamed-rule> at ""
    
    Run Code Online (Sandbox Code Playgroud)

    命名它们的惯用方法是使用 DEBUG 宏:

    BOOST_SPIRIT_DEBUG_NODES(
            (start)
            (expression)(logical)(equality)
            (relational)(additive)(multiplicative)
            (factor)(unary)(binary)(primary)
            (variable)
        )
    
    Run Code Online (Sandbox Code Playgroud)

    现在:

    Exception: Expected <factor> at ""
    
    Run Code Online (Sandbox Code Playgroud)

    中场休息:这里的表面变化:Live On Coliru

  8. 在打印机中有很多重复 ( apply_visitor(*this...) 并且由于operator(). 我的偏好是中继到 acallapply函数

  9. 同样在打印机中,不要对输出流进行硬编码。有一天(TM)你会想要格式化为一个字符串。或std::cerr, 或文件

    在打印机上结合这些注意事项:Live On Coliru

    struct printer {
        std::ostream& _os;
    
        template <typename T> std::ostream& operator()(T const& v) const
            { return call(v); }
    
      private:
        std::ostream& call(expression const& ast) const {
            return boost::apply_visitor(*this, ast.expr);
        }
    
        std::ostream& call(binary_operator const& expr) const {
            _os << "op:" << expr.op << "(";
            call(expr.lhs) << ", ";
            return call(expr.rhs) << ')';
        }
    
        std::ostream& call(unary_operator const& expr) const {
            _os << "op:" << expr.op << "(";
            return call(expr.rhs) << ')';
        }
    
        template <typename Lit>
        std::ostream& call(Lit const& v) const { return _os << v; }
    };
    
    Run Code Online (Sandbox Code Playgroud)
  10. 其逻辑扩展是使其成为实际的输出操纵器

        std::cout << "Parsed: " << fmt_expr{expression} << std::endl;
    
    Run Code Online (Sandbox Code Playgroud)

    同样,Live On Coliru,也printer再次简化了访问者:

    std::ostream& call(binary_operator const& expr) const {
        return _os
            << "op:" << expr.op
            << "("   << fmt_expr{expr.lhs}
            << ", "  << fmt_expr{expr.rhs} << ')';
    }
    
    Run Code Online (Sandbox Code Playgroud)
  11. 在 AST 中,您将实际操作符动态存储为字符串。在我看来,仅针对所有 ast 构建重载(ast::operator::operator()以及 的所有委托成员ast::expr)对运算符进行静态编码并没有太多价值。相反,每次只传递一个字符串?

    现在 builder 命名空间可以消失,不对称的工厂成员,并且整个 phoenix 函数是语法本地的:

    struct make_binary_f {
        ast::binary_operator operator()(ast::expression lhs, ast::expression rhs, std::string op) const {
            return { op, lhs, rhs };
        }
    };
    boost::phoenix::function<make_binary_f> make;
    
    Run Code Online (Sandbox Code Playgroud)

    另一个中间站Live On Coliru

    成就解锁

    编写 113 行代码(现在是 218 行而不是 331 行代码)

  12. 随机地点:

    variable = qi::lexeme[qi::alpha >> *(qi::alnum | '_')];
    
    Run Code Online (Sandbox Code Playgroud)

    '_'等价于qi::lit('_'), 不是qi::char_('_')这样,这将删除所有下划线。要么使用 char_,要么使用raw[]直接从源迭代器构造参数。

  13. 现在我们进入细节:[_val=_1]我们可以使用自动属性传播来代替(参见Boost Spirit: “Semantic actions are evil”?operator %=rule init)。

  14. 分解出更常见的子表达式。与之前的项目符号一起:

    primary
        = qi::double_[_val = _1]
        | ('(' > expression[_val = _1] > ')')
          >> *("." > variable[_val = make(_val, _1, ".")])
        | variable[_val = _1]
          >> *("." > variable[_val = make(_val, _1, ".")]);
    
    Run Code Online (Sandbox Code Playgroud)

    变成:

    primary %= qi::double_
        | (('(' > expression > ')') | variable)
            >> *("." > variable[_val = make(_val, _1, ".")])
        ;
    
    Run Code Online (Sandbox Code Playgroud)
  15. 将变体类型提升到外部,expression以便您可以在expression. 另外,请考虑expression从变体 ( LSK )派生。在您的情况下,实际上不需要嵌套表达式,因为一元/二元节点已经强加了顺序。所以你的整个 AST 可以是:

    struct unary_operator;
    struct binary_operator;
    
    typedef boost::variant<
        double,
        std::string,
        boost::recursive_wrapper<unary_operator>,
        boost::recursive_wrapper<binary_operator>
    > expr_variant;
    
    struct expression : expr_variant {
        using expr_variant::expr_variant;
        using expr_variant::operator=;
    };
    
    struct unary_operator  { expression rhs;                 std::string op; } ;
    struct binary_operator { expression lhs; expression rhs; std::string op; } ;
    
    Run Code Online (Sandbox Code Playgroud)
  16. 移动expectation_handler语法类(这是没有用的别的)内,并选择将其与凤凰::功能的现代化?无论如何,由于函子是无状态的,所以不需要ref(当然也不是ref代替cref):

    qi::on_error<qi::fail>(
        expression,
        boost::phoenix::bind(expectation_handler{}, _3, _2, _4));
    
    Run Code Online (Sandbox Code Playgroud)

    其实只要做好

    auto handler = [](Iterator first, Iterator last, const boost::spirit::info &info) {
        std::stringstream msg;
        msg << "Expected " << info << " at \"" << std::string(first, last) << "\"";
        throw std::runtime_error(msg.str());
    };
    
    qi::on_error<qi::fail>(
        expression,
        boost::phoenix::bind(handler, _3, _2, _4));
    
    Run Code Online (Sandbox Code Playgroud)
  17. 小问题:使用std::quoted而不是“假”引用:)

  18. 后期脑电波,你可以提取大部分语义动作:

    auto make_bin =
        _val = px::bind(make_<ast::binary_expr>{}, _val, _2, _1);
    
    Run Code Online (Sandbox Code Playgroud)

    只要所有的肢体都是无状态的/按值,这不是问题(尽管与将解析器分配给自动变量相反!)。现在只需让运算符公开属性:

    expression %= equality
      >> *(
          (qi::string("&&") > equality)[make_bin] |
          (qi::string("||") > equality)[make_bin]
      );
    
    equality %= relational
      >> *(
          (qi::string("==") > relational)[make_bin] |
          (qi::string("!=") > relational)[make_bin]
      );
    
    relational %= additive
      >> *(
          (qi::string("<")  > relational)[make_bin]  |
          (qi::string("<=") > relational)[make_bin] |
          (qi::string(">")  > relational)[make_bin]  |
          (qi::string(">=") > relational)[make_bin]
      );
    
    additive %= multiplicative
      >> *(
          (qi::string("+") > multiplicative)[make_bin] |
          (qi::string("-") > multiplicative)[make_bin]
      );
    
    multiplicative %= factor
      >> *(
          (qi::string("*") > factor)[make_bin] |
          (qi::string("/") > factor)[make_bin]
      );
    
    factor %= primary
      >> *(
          (qi::string("**") > primary)[make_bin]
      );
    
    primary %= qi::double_
        | (('(' > expression > ')') | variable)
            >> *(qi::string(".") > variable)[make_bin]
        ;
    
    Run Code Online (Sandbox Code Playgroud)
  19. 实际上,刚刚检查并phoenix::construct可以进行聚合:

    auto make_bin =
        _val = boost::phoenix::construct<ast::binary_expr>(_1, _val, _2);
    
    Run Code Online (Sandbox Code Playgroud)
  20. 还删除了未使用的unary_*机器,将 IO 操纵器移入io命名空间(而不是ast)并重新引入eoi检查main驱动程序...

  21. 哎呀,使用一些 c++17,您可以组合每个产品的分支:

    auto op = [](auto... sym) { return qi::copy((qi::string(sym) | ...)); };
    
    expression     %= equality       >> *(op("&&","||") > equality)[make_bin];
    equality       %= relational     >> *(op("==","!=") > relational)[make_bin];
    relational     %= additive       >> *(op("<","<=",">",">=") > relational)[make_bin];
    additive       %= multiplicative >> *(op("+","-")   > multiplicative)[make_bin];
    multiplicative %= factor         >> *(op("*","/")   > factor)[make_bin];
    factor         %= primary        >> *(op("**")      > primary)[make_bin];
    
    Run Code Online (Sandbox Code Playgroud)

完整演示,103 行代码

只是没有设法将其降低到 100 LoC 以下,但我在此过程中添加了更多测试用例。

rule = expr [ _val = foo(_val, _1, _2, _3) ];
Run Code Online (Sandbox Code Playgroud)

印刷

"2 + 5 + t.a" -> op:+(op:+(2, 5), op:.(t, a))
"(2 + 5) + t.a" -> op:+(op:+(2, 5), op:.(t, a))
"2 + 5 * t.a" -> op:+(2, op:*(5, op:.(t, a)))
"2 * 5 - t.a" -> op:-(op:*(2, 5), op:.(t, a))
Exception: Expected <eoi> at " match"
Exception: Expected <factor> at ""
"under_scores" -> under_scores
Run Code Online (Sandbox Code Playgroud)

超越范围

我将考虑的超出范围的事情与您的语法/ast 语义有关。

  1. 运算符优先级有点嘈杂。您想要的是一些元数据,允许您“组合”二进制操作数并显示正确的优先级,如下所示:

    expression %= primary
      >> *(
          (binop > expression) [_val = make_bin(_1, _val, _2)]
      );
    
    Run Code Online (Sandbox Code Playgroud)

    我已经在这个答案扩展聊天中应用了这个策略,结果代码在 github 上:https : //github.com/sehe/qi-extended-parser-evaluator

  2. 如果您有 C++14 支持,请考虑使用 X3。编译时间会少很多。