ANTLR PCRE Grammar to JS Target

2019-02-20 17:40发布

问题:

I'm trying to build Bart Kiers' ANTLR PCRE grammar (see: http://big-o.nl/apps/pcreparser/pcre/PCREParser.html) to a JS target. The only way I get get it to build is with global backtracking and memoization and it's generating code that is invalid here is the grammar:

grammar PCRE;

options {
    language=JavaScript;
    backtrack=true;
    memoize=true;
}

parse
   :  regexAtom* EOF
   ;

 ... and the rest of the grammar as seen: http://big-o.nl/apps/pcreparser/pcre/PCREParser.html

The lexer is generating code that looks like:

//$ANTLR 3.4 PCRE.g 2011-11-19 23:22:35

var PCRELexer = function(input, state) {
// alternate constructor @todo
// public PCRELexer(CharStream input)
// public PCRELexer(CharStream input, RecognizerSharedState state) {
    if (!state) {
        state = new org.antlr.runtime.RecognizerSharedState();
    }

    (function(){
    }).call(this);

    PCRELexer.superclass.constructor.call(this, input, state);

    };


org.antlr.lang.augmentObject(PCRELexer, {
     : ,
     : ,
     : ,
     : ,
     : ,
  ... and more of this empty object.

And when I try to use this generated code I'm getting JS errors on the arguementObject line above. Can someone provide direction of how I can get this to build correctly for a JS target. I'm will ultimately build a walker to generate similar output to the output shown on the same page.

回答1:


Update: the grammar that generates JS lexer- and parser files can be found here: https://github.com/bkiers/PCREParser/tree/js


I haven't updated that page in a while (I will do so soon, if I find the time). Here's the modified grammar that doesn't use global backtracking, and works (as far as I can see) with JavaScript (tested with ANTLR v3.3):

grammar PCRE;

options {
  output=AST;
  language=JavaScript;
}

tokens {
  REGEX;
  ATOM;
  DOT;
  OR;
  CHAR_CLASS;
  NEG_CHAR_CLASS;
  RANGE;
  QUOTATION;
  INT;
  QUANTIFIER;
  GREEDY;
  RELUCTANT;
  POSSESSIVE;
  BACK_REFERENCE;
  CAPTURE_GROUP;
  FLAG_GROUP;
  ATOMIC_GROUP;
  NON_CAPTURE_GROUP;
  POSITIVE_LOOK_AHEAD;
  NEGATIVE_LOOK_AHEAD;
  POSITIVE_LOOK_BEHIND;
  NEGATIVE_LOOK_BEHIND;
  FLAGS;
  ENABLE;
  DISABLE;
  DOT;
  ATOM;
  ATOMS;
}

// parser rules

parse
  :  regexAtoms EOF -> ^(REGEX regexAtoms)
  ;

regexAtoms
  :  atoms (Or^ atoms)*
  ;

atoms
  :  regexAtom* -> ^(ATOMS regexAtom*)
  ;

regexAtom
  :  unit quantifier? -> ^(ATOM unit quantifier?)
  ;

unit
  :  charClass
  |  singleChar
  |  boundaryMatch
  |  quotation
  |  backReference
  |  group
  |  ShorthandCharacterClass
  |  PosixCharacterClass
  |  Dot
  ;

quantifier
  :  (greedy -> ^(GREEDY greedy))
     ('+'    -> ^(POSSESSIVE greedy)
     |'?'    -> ^(RELUCTANT greedy)
     )?
  ;

greedy
  :  '+'            -> INT["1"] INT["2147483647"]
  |  '*'            -> INT["0"] INT["2147483647"]
  |  '?'            -> INT["0"] INT["1"]
  |  '{' (a=integer -> INT[$a.text] INT[$a.text]) 
     (
       (','         -> INT[$a.text] INT["2147483647"])
       (b=integer   -> INT[$a.text] INT[$b.text])?
     )? 
     '}'
  ;

charClass
  :  '[' (('^')=> '^' charClassAtom+ ']' -> ^(NEG_CHAR_CLASS charClassAtom+)
         |        charClassAtom+ ']'     -> ^(CHAR_CLASS charClassAtom+)
         )
  ;

charClassAtom
  :  (charClassSingleChar '-' charClassSingleChar)=> 
     charClassSingleChar '-' charClassSingleChar -> ^(RANGE charClassSingleChar charClassSingleChar)
  |  quotation
  |  ShorthandCharacterClass
  |  BoundaryMatch
  |  PosixCharacterClass
  |  charClassSingleChar
  ;

charClassSingleChar
  :  charClassEscape
  |  EscapeSequence
  |  OctalNumber
  |  SmallHexNumber
  |  UnicodeChar
  |  Or
  |  Caret
  |  Hyphen
  |  Colon
  |  Dollar
  |  SquareBracketStart
  |  RoundBracketStart
  |  RoundBracketEnd
  |  CurlyBracketStart
  |  CurlyBracketEnd
  |  Equals
  |  LessThan
  |  GreaterThan
  |  ExclamationMark
  |  Comma
  |  Plus
  |  Star
  |  QuestionMark
  |  Dot
  |  Digit
  |  OtherChar
  ;

charClassEscape
  :  '\\' ('\\' | '^' | ']' | '-')
  ;

singleChar
  :  regexEscape
  |  EscapeSequence
  |  OctalNumber
  |  SmallHexNumber
  |  UnicodeChar
  |  Hyphen
  |  Colon
  |  SquareBracketEnd
  |  CurlyBracketEnd
  |  Equals
  |  LessThan
  |  GreaterThan
  |  ExclamationMark
  |  Comma
  |  Digit
  |  OtherChar
  ;

regexEscape
  :  '\\' ('\\' | '|' | '^' | '$' | '[' | '(' | ')' | '{' | '}' | '+' | '*' | '?' | '.')
  ;

boundaryMatch
  :  Caret
  |  Dollar
  |  BoundaryMatch
  ;

backReference
  :  '\\' integer -> ^(BACK_REFERENCE integer)
  ;

group
  :  '(' 
     ( '?' ( (flags                -> ^(FLAG_GROUP flags)
             ) 
             (':' regexAtoms       -> ^(NON_CAPTURE_GROUP flags regexAtoms)
             )?
           | '>' regexAtoms        -> ^(ATOMIC_GROUP regexAtoms)
           | '!' regexAtoms        -> ^(NEGATIVE_LOOK_AHEAD regexAtoms)
           | '=' regexAtoms        -> ^(POSITIVE_LOOK_AHEAD regexAtoms)
           | '<' ( '!' regexAtoms  -> ^(NEGATIVE_LOOK_BEHIND regexAtoms)
                 | '=' regexAtoms  -> ^(POSITIVE_LOOK_BEHIND regexAtoms)
                 )
           )
     | regexAtoms                  -> ^(CAPTURE_GROUP regexAtoms)
     )
     ')'
  ;

flags
  :  (a=singleFlags     -> ^(FLAGS ^(ENABLE $a))) 
     ('-' b=singleFlags -> ^(FLAGS ^(ENABLE $a) ^(DISABLE $b))
     )?
  ;

singleFlags
  :  OtherChar*
  ;

quotation
  :  QuotationStart innerQuotation QuotationEnd -> ^(QUOTATION innerQuotation)
  ;

innerQuotation
  :  (~QuotationEnd)*
  ;

integer
  :  (options{greedy=true;}: Digit)+
  ;

// lexer rules

QuotationStart
  :  '\\Q'
  ;

QuotationEnd
  :  '\\E'
  ;

PosixCharacterClass
  :  '\\p{' ('Lower' | 'Upper' | 'ASCII' | 'Alpha' | 'Digit' | 'Alnum' | 'Punct' | 'Graph' | 'Print' | 'Blank' | 'Cntrl' | 'XDigit' | 'Space') '}'
  ;

ShorthandCharacterClass
  :  Escape ('d' | 'D' | 's' | 'S' | 'w' | 'W')
  ;

BoundaryMatch
  :  Escape ('b' | 'B' | 'A' | 'Z' | 'z' | 'G')
  ;

OctalNumber
  :  Escape '0' ( OctDigit? OctDigit 
                | '0'..'3' OctDigit OctDigit
                )
  ;

SmallHexNumber
  :  Escape 'x' HexDigit HexDigit
  ;

U    nicodeChar
  :  Escape 'u' HexDigit HexDigit HexDigit HexDigit
  ;

EscapeSequence
  :  Escape ('t' | 'n' | 'r' | 'f' | 'a' | 'e' | ~('a'..'z' | 'A'..'Z' | '0'..'9'))
  ;

Escape             : '\\';
Or                 : '|';
Hyphen             : '-';
Caret              : '^';
Colon              : ':';
Dollar             : '$';
SquareBracketStart : '[';
SquareBracketEnd   : ']';
RoundBracketStart  : '(';
RoundBracketEnd    : ')';
CurlyBracketStart  : '{';
CurlyBracketEnd    : '}';
Equals             : '=';
LessThan           : '<';
GreaterThan        : '>';
ExclamationMark    : '!';
Comma              : ',';
Plus               : '+';
Star               : '*';
QuestionMark       : '?';
Dot                : '.';
Digit              : '0'..'9';
OtherChar          :  . ;

// fragments
fragment OctDigit : '0'..'7';
fragment HexDigit : ('0'..'9' | 'a'..'f' | 'A'..'F');

It contains next to no target specific code. The only thing I did is use a couple of string literals to rewrite AST's (see the quantifier) and a few .text calls, but almost all ANTLR targets accept double quoted string literals and .text, so you should be okay with Java, Python, C and JavaScript. For C#, I guess you'd need to change the .text invocations to .Text.

You can test it with the following HTML file:

<html>
  <head>
    <script type="text/javascript" src="antlr3-all-min.js"></script>
    <script type="text/javascript" src="PCRELexer.js"></script>
    <script type="text/javascript" src="PCREParser.js"></script>

    <style type="text/css">
      #tree {
        padding: 20px;
        font-family: Monospace;
      }
      .leaf {
        font-weight: bold; 
        font-size: 130%;
      }
    </style>

    <script type="text/javascript">

    function init() {
      document.getElementById("parse").onclick = parseRegex;
    }

    function parseRegex() {
      document.getElementById("tree").innerHTML = "";
      var regex = document.getElementById("regex").value;
      if(regex) {
        var lexer = new PCRELexer(new org.antlr.runtime.ANTLRStringStream(regex));
        var parser = new PCREParser(new org.antlr.runtime.CommonTokenStream(lexer));
        var root = parser.parse().getTree();
        printTree(root, 0);
      }
      else {
        document.getElementById("regex").value = "enter a regex here first"; 
      }
    }

    function printTree(root, indent) {
      if(!root) return;
      for(var i = 0; i < indent; i++) {
        document.getElementById("tree").innerHTML += ".&nbsp;";
      }
      var n = root.getChildCount();
      if(n == 0) {
        document.getElementById("tree").innerHTML += "<span class=\"leaf\">" + root + "</span><br />";
      }
      else {
        document.getElementById("tree").innerHTML += root + "<br />";
      }
      for(i = 0; i < n; i++) {
        printTree(root.getChild(i), indent + 1);  
      }
    }

    </script>
  </head>
<body onload="init()">
  <input id="regex" type="text" size="50" />
  <button id="parse">parse</button>
  <div id="tree"></div>
</body>
</html>

(I rarely use JavaScript, so don't mind the mess posted above!)

If you now parse the regex:

[^-234-7]|(?=[ab\]@]++$).|^$|\1\.\(

with the help of the HTML file above, you will see the following being printed to the screen:

REGEX
. |
. . |
. . . |
. . . . ATOMS
. . . . . ATOM
. . . . . . NEG_CHAR_CLASS
. . . . . . . -
. . . . . . . 2
. . . . . . . 3
. . . . . . . RANGE
. . . . . . . . 4
. . . . . . . . 7
. . . . ATOMS
. . . . . ATOM
. . . . . . POSITIVE_LOOK_AHEAD
. . . . . . . ATOMS
. . . . . . . . ATOM
. . . . . . . . . CHAR_CLASS
. . . . . . . . . . a
. . . . . . . . . . b
. . . . . . . . . . \]
. . . . . . . . . . @
. . . . . . . . . POSSESSIVE
. . . . . . . . . . 1
. . . . . . . . . . 2147483647
. . . . . . . . ATOM
. . . . . . . . . $
. . . . . ATOM
. . . . . . .
. . . ATOMS
. . . . ATOM
. . . . . ^
. . . . ATOM
. . . . . $
. . ATOMS
. . . ATOM
. . . . BACK_REFERENCE
. . . . . 1
. . . ATOM
. . . . \.
. . . ATOM
. . . . \(

Be careful, I have not tested the grammar properly! I'd appreciate it if you let me know if you find any bugs in it.

EDIT

If you un-comment the line language=JavaScript;, regenerate a lexer and parser and run the following class:

import org.antlr.runtime.*;
import org.antlr.runtime.tree.*;
import org.antlr.stringtemplate.*;

public class Main {
  public static void main(String[] args) throws Exception {
    String src = "[^-234-7]|(?=[ab\\]@]++$).|^$|\\1\\.\\(|\\Q*+[\\E";
    PCRELexer lexer = new PCRELexer(new ANTLRStringStream(src));
    PCREParser parser = new PCREParser(new CommonTokenStream(lexer));
    CommonTree tree = (CommonTree)parser.parse().getTree();
    DOTTreeGenerator gen = new DOTTreeGenerator();
    StringTemplate st = gen.toDOT(tree);
    System.out.println(st);
  }
}

you'll see DOT-output that corresponds to the following AST: