CFG Reader(common-left-factor)
A. Second Edition
This is second edition of my CFG Reader which stands for Context-Free-Grammar Reader. In this edition, I made
an adjustment of structure of CFGRead class to make it a pure controller of a class Grammar which store and
represent CFG. What's more important is that I made the Grammar class capable to find and solve the common left
factor of grammar rules. For example: A -> test a b | test a | test a b a
In this rule, it is rather complicated to combine the common-left-factor which is "test a" and it is across 3
independent rules. Do you see the difficulty? The only easy way to solve it is to do it step-by-step. That is
to change grammar to be: A -> test A0 A0 -> a b | a
And the third rule remains: A -> test a b a
Now re-do the checking will find out "test" is still the common-left-factor and will be factored again!
Now it is something like: A -> test A1 A1 -> A0 | a b a
The "A0" remains same this time: A0 -> a b | a
My intention is to write a simple grammar reader that is to automatically update CFG. There are several jobs to
be considered:
1) How to store "tokens"?
2) How to connect "tokens" with their "grammar rules"?
3) How to change grammar to make it LL(1) grammar?
a) Combine common-left-factors.
b) Remove left-recursions.
c) Calculates First and Follow sets.
I finished the 1,2,3-a. The rest will be finished in next edition.
program -> stmt-sequence
stmt-sequence -> stmt-sequence ; statement | statement
statement -> if-stmt | repeat-stmt | assign-stmt | read-stmt | write-stmt
if-stmt -> if boolean-exp then stmt-sequence end
| if boolean-exp then stmt-sequence else stmt-sequence end
repeat-stmt -> repeat stmt-sequence until boolean-exp
assign-stmt -> identifier := integer-exp
read-stmt -> read identifier
write-stmt -> write integer-exp
boolean-exp -> integer-exp comparison-op integer-exp
comparison-op -> < | =
integer-exp -> integer-exp addop term | term
addop -> + | -
term -> term mulop factor | factor
mulop -> * | /
factor -> ( integer-exp ) | number | identifier
E.Further improvement
F.File listing
1. CFGReader.h
2. CFGReader.cpp
3. Grammar.h
4. Grammar.cpp
5. main.cpp (main)
file name: CFGReader.h
#include "Grammar.h" class CFGReader { private: char buffer[BufferLength]; FILE* stream; void newRule(const char* str); void readRule(); void addRule(const char* str); int addToken(const char* str, bool isTerminal=true); void printRule(int index); public: void readFromFile(const char* fileName); void optimize(); void print(); };
file name: CFGReader.cpp
#include <iostream> #include "CFGReader.h" using namespace std; Grammar grammar; const char* deliStr=" |\n"; //this would only remove the immediate left recursion //and should I expand it to remove all kinds of left-recursion? void CFGReader::readFromFile(const char* fileName) { if ((stream=fopen(fileName, "r"))==NULL) { cout<<"cannot open file "<<fileName<<endl; } else { readRule(); } } void CFGReader::readRule() { char* ptr=buffer; char* hold; int count=0; while (!feof(stream)) { count=0; fgets(buffer, BufferLength-1, stream); ptr=buffer; while ((ptr=strtok(ptr, " \n"))!=NULL) { if (strcmp(ptr, "|")!=0) { //test the first two token to see if old rule continues if (count==0) { hold=ptr; } if (count==1) { if (strcmp("->", ptr)==0) { /* curIndex=addToken(hold, false);//the index of cur token grammar[curIndex]->terminal=false; newRule(); */ newRule(hold); } else { addRule(hold); addRule(ptr); } } if (count>=2)//always add { addRule(ptr); } count++; } else { //this is an alternative production rule newRule(NULL); } ptr=NULL; } } } void CFGReader::newRule(const char* str) { grammar.newRule(str); } void CFGReader::optimize() { grammar.optimize(); } void CFGReader::addRule(const char* str) { grammar.addRule(str); } int CFGReader::addToken(const char* str, bool isTerminal) { return grammar.addToken(str, isTerminal); } void CFGReader::printRule(int index) { grammar.printRule(index); } void CFGReader::print() { grammar.print(); }
file name: Grammar.h
const int BufferLength=256; const int MaxTokenCount=50; const int MaxRHSCount=8; const int MaxRuleCount=50; const int MaxTokenLength=10; const int MaxProduction=10; struct Token { bool terminal; char* name; int production[MaxProduction];//pointer to the production it gives int count; int firstCount; int followCount; int follow[MaxTokenCount]; int first[MaxTokenCount]; }; class Grammar { private: Token* token[MaxTokenCount]; int tokenCount; int curIndex;//the index of current token int curPos;//the position at production rule //MaxRuleCount>=MaxVariableCount!!! int production[MaxRuleCount][MaxRHSCount]; int prodIndex;//pointing to current production, initialized to -1 int checkRecursion(int tokenIndex, int ruleIndex); void replaceRule(int tokenIndex, int recurIndex[], int count); void initialize(); void commonLeftFactor(); int findCommonFactor(int tokenIndex, int ruleIndex); void removeLeftRecursion(); void doCommonFactor(int tokenIndex, int ruleIndex, int index); int forgeToken(int index);//create a new variable void epsilonRule(int ruleIndex); public: Grammar(); void optimize(); void printRule(int index); void print(); void addRule(const char* str); void newRule(const char* str);//this is an internal method, not suitable for outside Token* operator[](int index); int addToken(const char* str, bool isTerminal=true); Token* createToken(const char* theName, bool isTerminal); int tokenIndex(const char* str); };
file name: Grammar.cpp
#include <iostream> #include "Grammar.h" using namespace std; const char* emptyStr="empty"; int Grammar::forgeToken(int index) { char str[MaxTokenLength+2], ch; int len=strlen(token[index]->name); int temp=0, i=0; strcpy(str, token[index]->name); ch=str[len-1];//get last char of name if (ch>='0'&&ch<'9') { str[len-1]=ch+i+1; } else { str[len]='0'+i; str[len+1]='\0'; } //I won't bother to check repetitation of name while (tokenIndex(str)!=-1) { i++; if (ch>='0'&&ch<'9') { str[len-1]=ch+i+1; } else { str[len]='0'+i; str[len+1]='\0'; } } return addToken(str, false);//it is non-terminal for sure } void Grammar::addRule(const char* str) { int index; index=addToken(str); production[prodIndex][curPos++]=index; production[prodIndex][curPos]=-1;//set end } //if the token is already there, it return the index //otherwise, it add new node in token array int Grammar::addToken(const char* str, bool isTerminal) { int index; index=tokenIndex(str); if (index==-1) { index=tokenCount; } else { return index; } token[index]=createToken(str, isTerminal); tokenCount++; return index; } void Grammar::newRule(const char* str) { if (str!=NULL) { curIndex=addToken(str, false); } //add one pointer token[curIndex]->production[token[curIndex]->count++]=++prodIndex; token[curIndex]->terminal=false; curPos=0;//reset to restart; } Token* Grammar::createToken(const char* theName, bool isTerminal) { Token* ptr=new Token; ptr->name=new char[strlen(theName)+1]; strcpy(ptr->name, theName); ptr->terminal=isTerminal; ptr->count=ptr->firstCount=ptr->followCount=0; return ptr; } int Grammar::tokenIndex(const char* str) { for (int i=0; i<tokenCount; i++) { if (strcmp(str, token[i]->name)==0) { return i; } } return -1; } int Grammar::checkRecursion(int tokenIndex, int ruleIndex) { for (int i=ruleIndex; i<token[tokenIndex]->count; i++) { //token[tokenIndex]->production[i]=ruleIndex //production[ruleIndex][0] is the first left-recursion one if (production[token[tokenIndex]->production[i]][0]==tokenIndex) { return i; } } return -1; } void Grammar::printRule(int index) { int nodeIndex=0; while (production[index][nodeIndex]!=-1) { cout<<token[production[index][nodeIndex]]->name<<" "; nodeIndex++; } } void Grammar::initialize() { tokenCount=curIndex=curPos=0; prodIndex=-1;//in order to ++ blindly } Grammar::Grammar() { initialize(); } void Grammar::removeLeftRecursion() { int index, count; int recurIndex[MaxTokenCount]; for (int i=0; i<tokenCount; i++) { index=0; count=0; do { index =checkRecursion(i, index); if (index!=-1) { recurIndex[count++]=index; } }while (index!=-1); replaceRule(i, recurIndex, count); } } void Grammar::replaceRule(int tokenIndex, int recurIndex[], int count) { // int newTokenIndex=forgeToken(tokenIndex); } //optimize sequence is first common-factor then remove left recursion //therefore I don't have to check if for same variable if there will be //more than one left-recursion void Grammar::optimize() { commonLeftFactor(); // removeLeftRecursion(); } int Grammar::findCommonFactor(int tokenIndex, int ruleIndex) { for (int i=ruleIndex+1; i<token[tokenIndex]->count; i++) { //if the two rule has the same first token if (production[token[tokenIndex]->production[ruleIndex]][0]== production[token[tokenIndex]->production[i]][0]) { return i; } } return -1; } void Grammar::epsilonRule(int ruleIndex) { production[ruleIndex][0]=addToken(emptyStr); production[ruleIndex][1]=-1; } //sample: x -> Aa // x -> Ab //changed to: x -> Ax' //this is to change the old rule // x' -> b //this is to change the old rule // x' -> a //this is the new-added rule void Grammar::doCommonFactor(int tokenIndex, int ruleIndex, int index) { int newTokenIndex=forgeToken(tokenIndex);//create a new variable //move the second and after part to the new rule of new variable //doing: x' -> a curPos=0; //prepare to add one new rule token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex; token[newTokenIndex]->terminal=false; do { //do copying production[prodIndex][curPos]= production[token[tokenIndex]->production[ruleIndex]][curPos+1]; curPos++; //even the -1 at end is copied }while (production[token[tokenIndex]->production[ruleIndex]][curPos]!=-1); //in order to show an empty string, in case the string is "epsilon" if (curPos==1) { epsilonRule(prodIndex); } //replace x -> Aa with x -> Ax' production[token[tokenIndex]->production[ruleIndex]][1]=newTokenIndex; production[token[tokenIndex]->production[ruleIndex]][2]=-1;//end //doing: x' -> b curPos=0; //prepare to add one new rule //pointing new token to where old rule lies token[newTokenIndex]->production[token[newTokenIndex]->count++]= token[tokenIndex]->production[index]; do { //left-shift productions production[token[tokenIndex]->production[index]][curPos]= production[token[tokenIndex]->production[index]][curPos+1]; curPos++; }while (production[token[tokenIndex]->production[index]][curPos]!=-1); //add epsilon if (curPos==1) { epsilonRule(token[tokenIndex]->production[index]); } //remove from token's production array the rule-index----"index" //shrink the array by one token[tokenIndex]->count--; for (int i=index; i<token[tokenIndex]->count; i++) { token[tokenIndex]->production[i]=token[tokenIndex]->production[i+1]; } } void Grammar::commonLeftFactor() { int index=-1, tokenIndex=0, ruleIndex=0; bool flag; //whenever a newrule is done, restart! while (tokenIndex<tokenCount) { ruleIndex=0; flag=false; while (ruleIndex<token[tokenIndex]->count) { index=findCommonFactor(tokenIndex, ruleIndex); if (index!=-1) { doCommonFactor(tokenIndex, ruleIndex, index); //restart flag=true; break; } else { ruleIndex++; } } if (flag) { tokenIndex=0; } else { tokenIndex++; } } } Token* Grammar::operator[](int index) { if (index>=0&&index<tokenCount) { return token[index]; } else { return NULL; } } void Grammar::print() { for (int i=0; i<tokenCount; i++) { if (!token[i]->terminal) { cout<<token[i]->name<<" ==> "; for (int j=0; j<token[i]->count; j++) { printRule(token[i]->production[j]); if (j!=token[i]->count-1) { cout<<" | "; } } cout<<"\n"; } } }
file name: main.cpp (main)
#include <iostream> #include "CFGReader.h" using namespace std; int main() { CFGReader R; R.readFromFile("c:\\ruleTest.txt"); R.optimize(); R.print(); return 0; }
The input is something like following:
Please note that all tokens must be separated by space, including "|" and "->".
A -> test a b | test a | test a b a
B -> test A | test B
Here is the result:
A ==> test A1 B ==> test B0 A0 ==> a A2 A1 ==> A0 | a b a B0 ==> A | B A2 ==> b | empty Press any key to continue