How to get Abstract Syntax Tree (AST) out of JISON

2019-01-31 19:40发布

So I have generated a parser via JISON:

// mygenerator.js
var Parser = require("jison").Parser;

// a grammar in JSON
var grammar = {
    "lex": {
        "rules": [
           ["\\s+", "/* skip whitespace */"],
           ["[a-f0-9]+", "return 'HEX';"]
        ]
    },

    "bnf": {
        "hex_strings" :[ "hex_strings HEX",
                         "HEX" ]
    }
};

// `grammar` can also be a string that uses jison's grammar format
var parser = new Parser(grammar);

// generate source, ready to be written to disk
var parserSource = parser.generate();

// you can also use the parser directly from memory

// returns true
parser.parse("adfe34bc e82a");

// throws lexical error
parser.parse("adfe34bc zxg");

My question is, how do I retrieve the AST now? I can see that I can run the parser against input, but it just returns true if it works or fails if not.

For the record, I am using JISON: http://zaach.github.com/jison/docs/

2条回答
你好瞎i
2楼-- · 2019-01-31 19:57

I discovered an easier and cleaner way than the one in the other answer.

This post is divided into 2 parts:

  • General way: Read how to implement my way.
  • Actual answer: An implementation of the previously described way specific to OP's request.

General way

  1. Add a return statement to your start rule.

    Example:

    start
        : xyz EOF
            {return $1;}
        ;
    

    xyz is another production rule. $1 accesses the value of the first symbol (either terminal or non-terminal) of the associated production rule. In the above code $1 contains the result from xyz.

  2. Add $$ = ... statements to all other rules.

    Warning: Use $$ = ..., don't return! return will immediately abort further execution by returning the specified value, as the name indicates.

    Example:

    multiplication
        : variable '*' variable
            {$$ = {
                type: 'multiplication',
                arguments: [
                  $1,
                  $3
                ]
              };
            }
        ;
    

    The above production rule will pass the object $$ to the higher level (i.e. the production rule which used this rule).

    Let's complement the multiplication rule in order to achieve a runnable example:

    /* lexical grammar */
    %lex
    %%
    
    \s+                   /* skip whitespace */
    [0-9]+("."[0-9]+)?\b  return 'NUMBER'
    [a-zA-Z]+             return 'CHARACTER'
    "*"                   return '*'
    <<EOF>>               return 'EOF'
    .                     return 'INVALID'
    
    /lex
    
    %start start
    %% /* language grammar */
    
    start
        : multiplication EOF
            {return $1;}
        ;
    
    multiplication
        : variable '*' variable
            {$$ = {
                type: 'multiplication',
                arguments: [
                  $1,
                  $3
                ]
              };
            }
        ;
    
    variable
        : 'NUMBER'
            {$$ = {
                  type: 'number',
                  arguments: [$1]
                };
             }
        | 'CHARACTER'
            {$$ = {
                  type: 'character',
                  arguments: [$1]
                };
             }
        ;
    

    You can try it online: http://zaach.github.io/jison/try/. At the time of this edit (12.02.2017), the online generator sadly throws an error - independently of the Jison file you feed in. See the addendum after step 3 for hints on how to generate the parser on your local machine.

    If you input for example a*3, you get the object structure below:

    {
      "type": "multiplication",
      "arguments": [
        {
          "type": "character",
          "arguments": ["a"]
        },
        {
          "type": "number",
          "arguments": ["3"]
        }
      ]
    }
    
  3. Clean the code and generated AST by injecting custom objects

    When using the Jison-generated parser, you can inject arbitrary objects into the scope of the 'code blocks' in the syntax file:

    const MyParser = require('./my-parser.js');
    MyParser.parser.yy = {
       MultiplicationTerm
       /*, AdditionTerm, NegationTerm etc. */
    };
    
    let calculation = MyParser.parse("3*4");
    // Using the modification below, calculation will now be an object of type MultiplicationTerm
    

    If MultiplicationTerm had a constructor accepting both factors, the new part for multiplication would look like this:

    multiplication
        : variable '*' variable
            {$$ = new yy.MultiplicationTerm($1, $3);}
        ;
    

Addendum on how to create the Jison parser:

Download the Jison NPM module. Then you can create the Jison-parser either by using Jison's command-line or running new jison.Generator(fileContents).generate() in your build file and write the returned string to your preferred file, e.g. my-parser.js.

Actual answer

Applying the rules above leads to the Jison file below.
The Jison file format and the JavaScript API (as stated in the question) are interchangeable as far as I know.

Also note that this Jison file only produces a flat tree (i.e. a list) since the input format is only a list as well (or how would you nest concatenated hex strings in a logical way?).

/* lexical grammar */
%lex
%%

\s+                   /* skip whitespace */
[a-f0-9]+             return 'HEX'
<<EOF>>               return 'EOF'
.                     return 'INVALID'

/lex

%start start
%% /* language grammar */

start
    :  hex_strings EOF
        {return $1;}
    ;

hex_strings
    : hex_strings HEX
        {$$ = $1.concat([$2]);}
    | HEX
        {$$ = [$1];}
    ;
查看更多
【Aperson】
3楼-- · 2019-01-31 20:08

I'm not too familiar with Jison's inner workings, so I don't know any method that would do it.

But in case you're interested in a little bruteforce to solve this problem, try this:

First, create an object to hold the AST

function jisonAST(name, x) { this.name = name; this.x = x; }

// return the indented AST
jisonAST.prototype.get = function(indent){
  // create an indentation for level l
  function indentString(l) { var r=""; for(var i=0;i<l;i++){r+="  "}; return r }

  var r = indentString(indent) + "["+this.name+": ";
  var rem = this.x;
  if( rem.length == 1 && !(rem[0] instanceof jisonAST) ) r += "'"+rem[0]+"'"; 
  else for( i in rem ){ 
      if( rem[i] instanceof jisonAST ) r += "\n" + rem[i].get(indent+1);
      else { r += "\n" + indentString(indent+1); r += "'"+rem[i]+"'"; }
    }
  return r + "]";
}

Add a little helper function for Jison's BNF

function o( s ){
    r = "$$ = new yy.jisonAST('"+s+"',[";
    for( i = 1; i <= s.split(" ").length; i++ ){ r += "$"+i+"," }
    r = r.slice(0,-1) + "]);";
    return [s,r];
}

With this, continue to the example code (slight modification):

var Parser = require("jison").Parser;

// a grammar in JSON
var grammar = {
    "lex": {
        "rules": [
           ["\\s+", "/* skip whitespace */"],
           ["[a-f0-9]+", "return 'HEX';"]
        ]
    },
    "bnf": {
        // had to add a start/end, see below
        "start" : [ [ "hex_strings", "return $1" ] ],
        "hex_strings" :[ 
            o("hex_strings HEX"), 
            o("HEX") 
        ]
    }
};

var parser = new Parser(grammar);
// expose the AST object to Jison
parser.yy.jisonAST = jisonAST

Now you can try parsing:

console.log( parser.parse("adfe34bc e82a 43af").get(0) );

This will give you:

[hex_strings HEX: 
  [hex_strings HEX: 
    [HEX: 'adfe34bc']  
    'e82a']  
  '43af']

Small note: I had to add a "start" rule, in order to only have one statement that returns the result. It is not clean (since the BNF works fine without it). Set it as an entry point to be sure...

查看更多
登录 后发表回答