I know that we can define start conditions in lex and attach them to rules, for example:
1.<DIRECTIVE>{STRING} {printf("Matching the DIRECTIVE state!");}
2.<REFERENCE>{INTEGER} {printf("Matching the REFERNCE state!");}
3.[\n] {printf("Matching the INITIAL state?");}
4.<*>{DOBULE} {printf("Matching all state include INITIAL? Seem not!");}
How do I use these states correctly? What is the difference between the conditions on lines 3 and 4?
Below is the whole .l file, trimmed down by me so that it now only implements a reference. When I run it, it works, but at the end it always reports "line:4: error: syntax error", and I don't know why! My test.vm contains only 2 lines of text.
%{
/*
 * Scanner prologue for the template lexer.
 * Author's note: all tokens are supported; escape sequences and
 * multi-line comments are not.
 */
#include<stdio.h>
#include<string.h>
#include "context.h"
#include "bool.h"
#include "vtl4.tab.h"
extern int yylex();
/* Bracket flag. */
int bracket_flag = 0;
/* Reference flag: incremented when "$"/"$!" is scanned, decremented when the
   reference name is scanned. */
int ref_flag = 0;
/* Multi-line comment marker. */
int mul_comment_flag = 0;
%}
%option stack
%option noyywrap yylineno
/* Exclusive start conditions: while one of them is active, only rules tagged
   with it (or with <*>) can match. */
%x REF
/* DIRECTIVE is declared because a rule in the rules section is tagged
   <DIRECTIVE>; every start condition used by a rule must be declared here. */
%x DIRECTIVE
VAR_NAME ([_a-zA-Z]+[a-zA-Z0-9_\-]*)
%%
/* $, # and \n are excluded here so that this rule does not swallow text that belongs to the REF and DIRECTIVE states. */
[^$\n#]*? {
    /* NOTE(review): yylval.string points into yytext, which the scanner
       reuses for the next token; copy the text if the parser keeps it. */
    printf("text:%s\n",yytext);
    yylval.string = yytext;
    return CONTENT;
}
/* Newlines are returned as their own CONTENT token (author's note: the reason is not yet clear). */
\n {
    printf("newLine:%s\n",yytext);
    yylval.string = yytext;
    return CONTENT;
}
/* The class excludes $ so that not only the last variable is matched.
For example, in <p class="$b">$a $b</p> only the last $b was matched.
*/
[^#$]*?/"$" {
    /* Text immediately followed by "$": emit it and switch to REF so the
       reference itself is scanned by the <REF> rules. */
    BEGIN REF;
    printf("begin ref text:%s\n",yytext);
    yylval.string = yytext;
    return CONTENT;
}
<REF>"$"|"$!"/"{"?{VAR_NAME} {
    ref_flag++;
    printf("$:%s\n",yytext);
    return DOLLAR;
}
<REF>"{" {
    printf("{:%s\n",yytext);
    return BRACE;
}
<DIRECTIVE>"}" {
    /* NOTE(review): nothing in this section ever does BEGIN DIRECTIVE, and
       the condition has to be declared in the definitions section. */
    printf("}:%s\n",yytext);
    return BRACE_CLOSE;
}
<REF>{VAR_NAME}/[^0-9A-Za-z_\-] {
    /* A name followed by a character that cannot continue it. */
    printf("ref name:%s\n",yytext);
    ref_flag--;
    yylval.sym = find_symbol(yytext);
    return ID;
}
<REF>[ \t"="\n] {
    /* A blank, tab, double quote, '=' or newline ends the reference.
       The tab was previously written "/t", which made the class match the
       two characters '/' and 't' instead of a tab. The matched character is
       consumed without producing a token. */
    BEGIN INITIAL;
    printf("ref end:%s\n",yytext);
}
<REF>[}] {
    printf("}:%s\n",yytext);
    return BRACE_CLOSE;
}
<<EOF>> {
    printf("lex end:%s\n",yytext);
    return LEX_EOF;
}
%%
The whole .y file:
%{
/* Parser prologue: C declarations needed by the grammar actions and main(). */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "bool.h"
#include "parser.h"
#include "context.h"
#include "vtl4.tab.h"

/* Scanner streams; main() points yyin at the template file. */
extern FILE * yyin;
extern FILE * yyout;

/* Scanner entry point called by the generated parser. */
int yylex();

#define YYDEBUG 1
%}

/* Semantic value type shared by tokens and nonterminals. */
%union {
    struct simpleNode *ast;   /* nonterminals declared with %type <ast> */
    double d;                 /* DOUBLE_LITERAL */
    int i;
    bool b;                   /* BOOL */
    char *string;             /* CONTENT, STRING_LITERAL and comment tokens */
    struct symbol *sym;       /* ID */
}

/* Operators and punctuation (no semantic value). */
%token ADD SUB MUL DIV MOD EQUAL
%token PARENTHESIS CLOSE_PARENTHESIS BRACE BRACE_CLOSE
%token LOGICAL_AND LOGICAL_OR LOGIC_EQUAL
%token LOGICAL_LT LOGICAL_LE LOGICAL_GT LOGICAL_GE LOGICAL_NOT_EQUALS

/* Explicit end-of-file token returned by the scanner. */
%token LEX_EOF

/* Text, literals and references. */
%token <string> CONTENT STRING_LITERAL SINGLE_LINE_COMMENT MULTI_LINE_COMMENT
%token <b> BOOL
%token INTEGER_LITERAL
%token <d> DOUBLE_LITERAL
%token DOLLAR
%token <sym> ID

/* Directive keywords. */
%token HASH SET PARSE IF ELSE ELSEIF FOREACH IN END

%type <ast> root statements statement reference content
%start root
%%
/* JJTPROCESS */

/*
 * root: the whole template followed by the scanner's explicit LEX_EOF token.
 *
 * YYACCEPT ends the parse as soon as this rule is reduced. Without it the
 * parser asks the scanner for one more token, expecting its own end-of-input
 * marker; the scanner's <<EOF>> rule answers with LEX_EOF again, and
 * yyparse() reports "syntax error" after an otherwise successful parse.
 */
root
    : statements LEX_EOF { printf("yacc root\n"); $$ = process($1); YYACCEPT; }
    ;

/* statements: one or more statements, chained with add_ybrother(). */
statements
    : statement { printf("yacc statements:statement\n"); $$ = $1; }
    | statements statement { printf("yacc statements:statements statement\n"); $$ = add_ybrother($1,$2); }
    ;

/* statement: either a reference or a piece of plain content. */
statement
    : reference { printf("yacc statement:ref\n"); $$ = $1; }
    | content   /* default action: $$ = $1 */
    ;

/* reference: "$" followed by an identifier. */
reference
    : DOLLAR ID { printf("yacc ref\n"); $$ = reference($2); }
    ;

/* content: plain text.
 * NOTE(review): the scanned text in $1 is not passed to text() -- confirm
 * that dropping it is intended. */
content
    : CONTENT { $$ = text(NULL); }
    ;
%%
/*
 * Entry point: parses the template file "test.vm" from the current directory.
 *
 * Returns the result of yyparse(), or EXIT_FAILURE when the file cannot be
 * opened.
 */
int main(void){
    printf("BEGIN:\n");

    FILE *src = fopen("test.vm","r");
    if (src == NULL) {
        /* Without this check yyin would be NULL and fclose(NULL) below
           would be undefined behavior. */
        perror("test.vm");
        return EXIT_FAILURE;
    }

    yyin = src;
    int result = yyparse();
    fclose(src);
    return result;
}
Firstly, start conditions are not a feature of Lex. They are a feature of GNU Flex.
The syntax `<*>` is indeed documented as meaning "this rule will fire unconditionally in any state".
As you probably know, conditions can be exclusive (defined with `%x`) or inclusive (defined with `%s`).
If you write a rule without any conditions, then it is active in the `INITIAL` state, and also in any states which are inclusive: inclusive means "rules which specify no condition will be active in this state".
Rules without any conditions are not active if the current state is an exclusive one: it excludes rules that have no conditions.
The `<*>` syntax means that the rule will activate in all states, including the exclusive ones.
If a rule has no `<...>` condition syntax, it means that the rule will activate only in the `INITIAL` state, or in states that are inclusive.
Plenty can still go wrong in your parser. Remember that the longest matching rule is the one that is triggered at any point in the input, and that can be misinterpreted as start conditions not working. If the input is `catalog` and some eligible rule wants to match `cat`, it doesn't matter that the rule which matches `c` is attributed with `<*>`. It's not the longest match.