CFGReader

index.0 M ==> no.1program no.2i no.3; no.4Dl no.5B
index.4 Dl ==> no.6Dv no.7Ml
index.5 B ==> no.29begin no.30Sl no.31end no.3;
index.6 Dv ==> no.14variables no.12Vl | no.9e
index.7 Ml ==> no.69Ml0
index.8 Mo ==> no.10module no.2i no.11( no.12Vl no.13) no.6Dv no.5B
index.12 Vl ==> no.15V no.70Vl0
index.15 V ==> no.16Il no.17: no.18T no.3;
index.16 Il ==> no.2i no.71Il0
index.18 T ==> no.19integer no.20Ad | no.21char no.20Ad
index.20 Ad ==> no.9e | no.25array no.26[ no.27n no.28]
index.23 L ==> no.2i no.24Ar
index.24 Ar ==> no.9e | no.26[ no.34E no.28]
index.30 Sl ==> no.32S no.72Sl0
index.32 S ==> no.2i no.73S0 | no.35if no.36C no.37then no.32S no.38else no.32S | no.39loop no.30
Sl no.31end no.3; | no.40exit no.3; | no.29begin no.30Sl no.31end no.3; | no.42read no.43Ln no.3;
| no.44write no.45Lo no.3; | no.9e no.3;
index.34 E ==> no.50F no.65M0
index.36 C ==> no.50F no.65M0 no.58Or no.34E
index.41 Lp ==> no.9e | no.43Ln
index.43 Ln ==> no.2i no.24Ar no.66M1
index.45 Lo ==> no.48Lr no.67M2
index.48 Lr ==> no.2i no.24Ar | no.27n | no.49c
index.50 F ==> no.54R no.68M3
index.51 Oa ==> no.52+ | no.53-
index.54 R ==> no.2i no.24Ar | no.27n | no.11( no.34E no.13) | no.49c
index.55 Om ==> no.56* | no.57/
index.58 Or ==> no.59= | no.60< | no.61> | no.62<= | no.63>= | no.64!=
index.65 M0 ==> no.52+ no.50F no.65M0 | no.9e | no.53- no.50F no.65M0
index.66 M1 ==> no.22, no.23L no.66M1 | no.9e
index.67 M2 ==> no.22, no.48Lr no.67M2 | no.9e
index.68 M3 ==> no.56* no.54R no.68M3 | no.9e | no.57/ no.54R no.68M3
index.69 Ml0 ==> no.10module no.2i no.11( no.12Vl no.13) no.6Dv no.5B no.69Ml0 | no.9e
index.70 Vl0 ==> no.2i no.71Il0 no.17: no.18T no.3; no.70Vl0 | no.9e
index.71 Il0 ==> no.22, no.2i no.71Il0 | no.9e
index.72 Sl0 ==> no.9e | no.30Sl
index.73 S0 ==> no.24Ar no.33:= no.34E no.3; | no.11( no.41Lp no.13) no.3;
index.74 START ==> no.0M no.75$
Press any key to continue

　

The following gives the index of each rule:

M ==> rule.0 program i ; Dl B
Dl ==> rule.1 Dv Ml
B ==> rule.17 begin Sl end ;
Dv ==> rule.5 variables Vl | rule.6 e
Ml ==> rule.3 Ml0
Mo ==> rule.4 module i ( Vl ) Dv B
Vl ==> rule.8 V Vl0
V ==> rule.9 Il : T ;
Il ==> rule.12 i Il0
T ==> rule.10 integer Ad | rule.11 char Ad
Ad ==> rule.15 e | rule.16 array [ n ]
L ==> rule.14 i Ar
Ar ==> rule.36 e | rule.37 [ E ]
Sl ==> rule.18 S Sl0
S ==> rule.20 i S0 | rule.21 if C then S else S | rule.22 loop Sl end ; | rule.23 exit ; | rule.
25 begin Sl end ; | rule.26 read Ln ; | rule.27 write Lo ; | rule.28 e ;
E ==> rule.38 F M0
C ==> rule.48 F M0 Or E
Lp ==> rule.29 e | rule.30 Ln
Ln ==> rule.31 i Ar M1
Lo ==> rule.32 Lr M2
Lr ==> rule.33 i Ar | rule.34 n | rule.35 c
F ==> rule.41 R M3
Oa ==> rule.39 + | rule.40 -
R ==> rule.44 i Ar | rule.45 n | rule.46 ( E ) | rule.47 c
Om ==> rule.42 * | rule.43 /
Or ==> rule.49 = | rule.50 < | rule.51 > | rule.52 <= | rule.53 >= | rule.54 !=
M0 ==> rule.55 + F M0 | rule.56 e | rule.66 - F M0
M1 ==> rule.57 , L M1 | rule.58 e
M2 ==> rule.59 , Lr M2 | rule.60 e
M3 ==> rule.61 * R M3 | rule.62 e | rule.67 / R M3
Ml0 ==> rule.2 module i ( Vl ) Dv B Ml0 | rule.63 e
Vl0 ==> rule.7 i Il0 : T ; Vl0 | rule.64 e
Il0 ==> rule.13 , i Il0 | rule.65 e
Sl0 ==> rule.68 e | rule.19 Sl
S0 ==> rule.69 Ar := E ; | rule.24 ( Lp ) ;
START ==> rule.70 M $
Press any key to continue

9. errorno.h

10. errorNo.cpp

11. initialize.cpp

12. hash.h

13. hash.cpp

12. main.cpp (main)

#include "Grammar.h" class CFGReader { private: char buffer[BufferLength]; FILE* stream; void newRule(const char* str); void readRule(); void addRule(const char* str); int addToken(const char* str, bool isTerminal=true); void printRule(int index); public: void readFromFile(const char* fileName); void optimize(); void calculateLookAhead(); void print(); };

#include <iostream> #include "CFGReader.h" using namespace std; Grammar grammar; const char* deliStr=" |\n"; //this would only remove the immediate left recursion //and should I expand it to remove all kinds of left-recursion? void CFGReader::readFromFile(const char* fileName) { if ((stream=fopen(fileName, "r"))==NULL) { cout<<"cannot open file "<<fileName<<endl; } else { readRule(); } } void CFGReader::readRule() { char* ptr=buffer; char* hold; int count=0; while (!feof(stream)) { count=0; fgets(buffer, BufferLength-1, stream); ptr=buffer; while ((ptr=strtok(ptr, " \n"))!=NULL) { if (strcmp(ptr, "|")!=0) { //test the first two token to see if old rule continues if (count==0) { hold=ptr; } if (count==1) { if (strcmp("->", ptr)==0) { /* curIndex=addToken(hold, false);//the index of cur token grammar[curIndex]->terminal=false; newRule(); */ newRule(hold); } else { addRule(hold); addRule(ptr); } } if (count>=2)//always add { addRule(ptr); } count++; } else { //this is an alternative production rule newRule(NULL); } ptr=NULL; } } } void CFGReader::newRule(const char* str) { grammar.newRule(str); } void CFGReader::optimize(bool forLL) { grammar.optimize(forLL); } void CFGReader::calculateLookAhead() { grammar.calculateLookAhead(); } void CFGReader::addRule(const char* str) { grammar.addRule(str); } int CFGReader::addToken(const char* str, bool isTerminal) { return grammar.addToken(str, isTerminal); } void CFGReader::printRule(int index) { grammar.printRule(index); } void CFGReader::print() { grammar.print(); //grammar.printToken(); //grammar.buildTable(); //grammar.printTable(); //grammar.printAllRules(); }

#ifndef GRAMMAR_H #define GRAMMAR_H #include <bitset> #include <iostream> using namespace std; const int BufferLength=256; const int MaxTokenCount=100; const int MaxRHSCount=40; const int MaxRuleCount=200; const int MaxGrammarTokenLength=10; const int MaxProduction=10; const int TokenTypeCount=38; const int MaxStateCount=200; const int MaxItemCount=500; //a macro to ease the job //#define BitSet (bitset<MaxItemCount>) struct GrammarToken { //bool isMeta; bool terminal; bool isNull; char* name; int production[MaxProduction];//pointer to the production it gives int count; int firstCount; int followCount; int follow[MaxTokenCount]; int first[MaxTokenCount]; }; struct Item { int varIndex; int rulePos; int dotPos; }; bool operator == (Item& first, Item& second); /* Item& operator= (Item& first, Item& second) { first.dotPos=second.dotPos; first.rulePos=second.rulePos; first.varIndex=second.varIndex; return first; } */ class BitSet: public bitset<MaxItemCount> { private: int current; public: BitSet& operator=(const BitSet& theSet); int next(); BitSet(); void restart(){current=-1;} }; class Grammar { //to allow Parser to access Grammar friend class Parser; friend class LRParser; private: int LRTable[MaxStateCount][MaxTokenCount]; int terminalCount; int nonTermCount; GrammarToken* token[MaxTokenCount]; int tokenCount; int curIndex;//the index of current token int curPos;//the position at production rule //MaxRuleCount>=MaxVariableCount!!! //the last one in each rule is reserved for the length int production[MaxRuleCount][MaxRHSCount+1]; int prodIndex;//pointing to current production, initialized to -1 void removeEpsilon(int ruleIndex); int checkRecursion(int tIndex, int curToken); void grammarError(int theVar, int theToken, int rule1, int rule2); void addFirstIntoTable(int theVariable, int theFirst, int theRule); void addFollowIntoTable(int theVariable, int theRule); void addBeginEnd(); void addEpsilonForToken(int tIndex); bool addIntoFirst(int target, int source); bool addFirst(int target, int source); void shiftRule(int ruleIndex, bool Left2Right, int offset); void addAtEnd(int ruleIndex, int toAdd); void addRuleForToken(int tIndex, int ruleIndex); int copyRule(int source, int target);//return the position of -1 void replaceRule(int curToken, int ruleIndex); int findMetaSymbol(int& begin, int& end); void initialize(); void doReplaceMetaSymbol(int ruleIndex, int begin, int end); void removeRuleFromToken(int tIndex, int ruleIndex); void checkEpsilon(int ruleIndex); void removeImmediate(int tIndex, int ruleIndex); void doAddRule(int tIndex); void commonLeftFactor(); int findCommonFactor(int tokenIndex, int ruleIndex); void removeLeftRecursion(); void doCommonFactor(int tokenIndex, int ruleIndex, int index); int forgeToken(int index);//create a new variable void epsilonRule(int ruleIndex); bool isRedundant(int tIndex); void replaceMetaSymbol(); void calculateFirst(); void calculateFollow(); void calculateNull(); bool Null(int tIndex); bool addFollow(int target, int source); bool addIntoFollow(int target, int source); bool addFollowIntoFollow(int target, int source); void removeRedundant(); void removeToken(int tIndex); void LRoptimize(); void LLoptimize(); void buildLLTable(); void buildLRTable(); int ruleLen(int ruleIndex); void calculateProperty(); int item2Rule(int itemIndex); int item2Var(int itemIndex); int shift2Mask(int input); int reduce2Mask(int input); void printDFA(); bool* expandFinished; int mask2State(int input); bool isReduce(int input); bool isShift(int input); //items-related methods void initializeDFA(); void uninitializeDFA(); void constructDFA(); //void addItems(int itemIndex, bitset<MaxStateCount>* theSet); int addItem(Item& theItem); int itemCount; int stateCount; Item**items; //int addState(bitset void addSonItem(int tIndex, BitSet& theSet); int createState(const BitSet& theSet); BitSet** sets; int createItem(int theVar, int theNo, int position); void addClosure(int itemIndex, BitSet& theSet); void expandState(int stateIndex); int findNextTokenOfItem(int itemIndex); void doExpandState(int itemIndex, int tIndex, BitSet& theSet); void add2Table(int stateIndex, int targetState, int tIndex);//this is shift void add2Table(int stateIndex, int ruleIndex);//this is reduce public: Grammar(); void buildTable(bool forLLGrammar=true); void optimize(bool forLLGrammar=true); void printRule(int index); void printRule(int tIndex, int rulePos, int dotPos); void print(); void printTable(bool isLL=true); void printAllRules(); void printToken(bool onlyVar=false); int varCount(); int termCount(); void addRule(const char* str); void newRule(const char* str);//this is an internal method, not suitable for outside GrammarToken* operator[](int index); int addToken(const char* str, bool isTerminal=true); GrammarToken* createToken(const char* theName, bool isTerminal); int tokenIndex(const char* str); void calculateLookAhead(); }; #endif

#include <iostream> #include <iomanip> #include "errorNo.h" #include "Grammar.h" using namespace std; /* #define REDUCEMASK 0xd10000000 #define SHIFTMASK 0xd20000000 #define STATEMASK 0xd01111111 #define SHIFTREDUCEMASK 0xd30000000 */ const int REDUCEMASK = 268435456; const int SHIFTMASK = 536870912; const int STATEMASK = 268435455; const int SHIFTREDUCEMASK = 805306368; const int ACCEPTANCE = SHIFTREDUCEMASK; const char* emptyStr="e"; const char* optionBegin="{"; const char* optionEnd="}"; const char* startStr="START"; const char* endStr="$"; int startSymbolIndex=-1; int stackBottomIndex=-1; int beginIndex=-1; int endIndex=-1; int emptyIndex=-1; int epsilonIndex=-1; int table[MaxTokenCount][MaxTokenCount]; char* terminalStr[TokenTypeCount]= { //GENERAL TYPE 5 "i", "n", "c", "COMMENT", "ERROR", //THE FOLLOWING ARE SYMBOL TYPE 18 "(", ")", ";", "+", "-", "*", "/", ":=", "<", ">", "=", "<=", ">=", "!=", "[", "]", ",", ":", //THE FOLLOWING ARE RESERVED TYPE 15 "begin", "end", "program", "variables","integer", "array", "char", "module", "if", "then", "else", "loop", "exit", "read", "write" }; //these are "hash-table-like" tables //this is exactly the opposite of below int token2type[MaxTokenCount]; //this is to translate tokenType in scanner to token-index of grammar int type2token[MaxTokenCount]; int matchTokens(const char* str) { for (int i=0; i<TokenTypeCount; i++) { if (strcmp(terminalStr[i], str)==0) { return i; } } return -1; } void Grammar::calculateProperty() { terminalCount=nonTermCount=0; for (int i=0; i<tokenCount; i++) { if (token[i]->terminal) { terminalCount++; continue; } for (int j=0; j<token[i]->count; j++) { int len; len=ruleLen(token[i]->production[j]); production[token[i]->production[j]][MaxRHSCount]=len; } nonTermCount++; } } void Grammar::addEpsilonForToken(int tIndex) { if (epsilonIndex==-1) { epsilonRule(++prodIndex); addRuleForToken(tIndex, prodIndex); } else { addRuleForToken(tIndex, epsilonIndex); } } void Grammar::printTable(bool isLL) { /* cout<<" "; for (int i=0; i<tokenCount; i++) { cout<<"| "<<i; } cout<<endl; */ if (isLL) { for (int r=0; r<tokenCount; r++) { if (token[r]->terminal) { continue; } cout<<token[r]->name<<":"; for (int c=0; c<tokenCount; c++) { if (table[r][c]!=-1) { cout<<token[c]->name<<"="<<table[r][c]<<","; } } cout<<endl; } } else { cout<<"the following is LR table\n"; cout<<" "; for (int col=0; col<tokenCount; col++) { cout<<token[col]->name<<","; } cout<<endl; for (int r=0; r<stateCount; r++) { cout<<r<<"| "; for (int c=0; c<tokenCount; c++) { int result=LRTable[r][c]; if (isReduce(result)) { cout<<"r"; cout<<mask2State(result); } else { if (isShift(result)) { cout<<"s"; cout<<mask2State(result); } else { cout<<result; } } cout<<","; } cout<<endl; } } } void Grammar::buildLRTable() { constructDFA(); printDFA(); cout<<"\n"; uninitializeDFA(); } //initialized at "initialize()" void Grammar::buildLLTable() { int typeIndex; for (int r=0; r<tokenCount; r++) { if (token[r]->terminal) { typeIndex=matchTokens(token[r]->name); if (typeIndex!=-1) { type2token[typeIndex]=r; token2type[r]=typeIndex; } } //initialize /* for (int c=0; c<MaxTokenCount; c++) { table[r][c]=-1; } */ } for (int i=0; i<tokenCount; i++) { if (token[i]->terminal) { continue; } for (int j=0; j<token[i]->count; j++) { int k=0, theRule=token[i]->production[j]; int theToken=production[theRule][k]; while (theToken!=-1) { addFirstIntoTable(i, theToken, theRule); if (!token[theToken]->isNull) { break; } k++; theToken=production[theRule][k]; } if (theToken==-1) { addFollowIntoTable(i, theRule); } } } } void Grammar::buildTable(bool forLLGrammar) { if (forLLGrammar) { buildLLTable(); } else { buildLRTable(); } } /* nCount=tCount=0; for (int r=0; r<MaxTokenCount; r++) { if (r<tokenCount) { if (token[r]->terminal) { typeIndex=matchTokens(token[r]->name); if (typeIndex!=-1) { type2token[typeIndex]=r; token2type[r]=typeIndex; } //I am making a invertable table!!! tArray[r]=tCount++; } else { //it is an invertable table!!! nArray[r]=nCount++; } } for (int c=0; c<MaxTokenCount; c++) { table[r][c]=-1; } } for (int i=0; i<tCount; i++) { for (int j=0; j<token[i]->count; j++) { int k=0, theRule=token[i]->production[j]; int theToken=production[theRule][k]; while (theToken!=-1) { addFirstIntoTable(theToken, j); if (!token[theToken]->isNull) { break; } k++; theToken=production[theRule][k]; } if (theToken==-1) { addFollowIntoTable(j); } } } } */ void Grammar::grammarError(int theVar, int theToken, int rule1, int rule2) { cout<<"error! the conflict of grammar at "; cout<<token[theVar]->name<<" for token of " <<token[theToken]->name <<" between rules of \n"; printRule(rule1); cout<<" and "; printRule(rule2); cout<<endl; } void Grammar::addFollowIntoTable(int theVariable, int theRule) { int temp; for (int i=0; i<token[theVariable]->followCount; i++) { temp=token[theVariable]->follow[i]; if (table[theVariable][temp]!=theRule) { if (table[theVariable][temp]!=-1) { grammarError(theVariable, temp, theRule, table[theVariable][temp]); } table[theVariable][temp]=theRule; } } } void Grammar::addFirstIntoTable(int theVariable, int theFirst, int theRule) { for (int i=0; i<token[theFirst]->firstCount; i++) { if (table[theVariable][token[theFirst]->first[i]]!=theRule) { if (table[theVariable][token[theFirst]->first[i]]!=-1) { grammarError(theVariable, token[theFirst]->first[i], theRule, table[theVariable][token[theFirst]->first[i]]); } table[theVariable][token[theFirst]->first[i]]=theRule; } } } int Grammar::varCount() { int result=0; for (int i=0; i<tokenCount; i++) { if (!token[i]->terminal) { result++; } } return result; } int Grammar::termCount() { int result=0; for (int i=0; i<tokenCount; i++) { if (token[i]->terminal) { result++; } } return result; } //leave the rule unremoved!!! as I have no way to do it!!! void Grammar::removeToken(int tIndex) { delete[]token[tIndex]->name; delete token[tIndex]; tokenCount--; for (int i=tIndex; i<tokenCount; i++) { token[i]=token[i+1]; } } bool Grammar::isRedundant(int tIndex) { int k, theRule, theToken; for (int i=0; i<tokenCount; i++) { for (int j=0; j<token[i]->count; j++) { k=0; theRule=token[i]->production[j]; theToken=production[theRule][k]; while (theToken!=-1) { if (theToken==tIndex) { return false; } k++; theToken=production[theRule][k]; } } } return true; } //what is redundant? except the start variable //the non-terminal never appears in other rules! void Grammar::removeRedundant() { int tIndex=1; bool findNew=false; while (tIndex<tokenCount) { findNew=false; if (!token[tIndex]->terminal) { if (isRedundant(tIndex)) { removeToken(tIndex); findNew=true; } } if (findNew) { tIndex=1; } else { tIndex++; } } } void Grammar::printAllRules() { /* cout<<"\nnow print all rules\n"; for (int i=0; i<=prodIndex; i++) { cout<<"rule index "<<i<<": "; printRule(i); cout<<"\n"; } */ int sum=0; for (int i=0; i<tokenCount; i++) { if (!token[i]->terminal) { sum+=token[i]->count; } } cout<<" total rules is:"<<prodIndex+1; cout<<"\n and the sum is "<<sum<<endl; } void Grammar::addBeginEnd() { int begin=addToken(startStr, false); int end=addToken(endStr, true); prodIndex++; production[prodIndex][0]=0; production[prodIndex][1]=end; production[prodIndex][2]=-1; addRuleForToken(begin, prodIndex); startSymbolIndex=begin; stackBottomIndex=end; } /* void Grammar::calculateProperty() { calculateNull(); calculateFirst(); calculateFollow(); } */ void Grammar::printToken(bool onlyVar) { for (int i=0; i<tokenCount; i++) { if (onlyVar) { if (token[i]->terminal) { continue; } } cout<<"name: "<<token[i]->name<<endl; cout<<" isNull: "<<(token[i]->isNull?"true":"false")<<endl; cout<<" isTerminal: "<<(token[i]->terminal?"true":"false")<<endl; cout<<" first: "; for (int j=0; j<token[i]->firstCount; j++) { if (j!=0) { cout<<","; } cout<<"["<<j<<"]="<<token[token[i]->first[j]]->name; } cout<<"\n follow: "; for (j=0; j<token[i]->followCount; j++) { if (j!=0) { cout<<","; } cout<<"["<<j<<"]="<<token[token[i]->follow[j]]->name; } cout<<endl; } } //must use while loop to discover new set!!!!! void Grammar::calculateFirst() { int i; bool addNew; do { i=0; addNew=false; while (i<tokenCount) { //the terminal contains itself if (token[i]->terminal) { //addFirst don't judge if it is nullable!! addFirst(i, i); //token[i]->first[token[i]->firstCount++]=i; } else { //for all its rules for (int j=0; j<token[i]->count; j++) { int theToken, k=0; theToken=production[token[i]->production[j]][k]; //for each token in each rule do { //add non epsilon set if (addIntoFirst(i, theToken)) { addNew=true; } //if it is not null, means it is end if (!token[theToken]->isNull) { break; } //if it is nullable, continue k++; theToken=production[token[i]->production[j]][k]; }while (theToken!=-1); //it means all token in this rule is nullable, so if (theToken==-1) { //it is nullable //addEpsilonIntoFirst(i); addFirst(i, emptyIndex); } } } i++; } }while (addNew); } bool Grammar::addFollowIntoFollow(int target, int source) { bool addNew=false; for (int i=0; i<token[source]->followCount; i++) { if (addFollow(target, token[source]->follow[i])) { addNew=true; } } return addNew; } bool Grammar::addIntoFollow(int target, int source) { bool addNew=false; if (source==-1) { return false; } for (int i=0; i<token[source]->firstCount; i++) { //add non-epsilon if (!token[token[source]->first[i]]->isNull) { if (addFollow(target, token[source]->first[i])) { addNew=true; } } } return addNew; } void Grammar::calculateFollow() { int i; bool addNew, started; //token[startSymbolIndex]->follow[0]=stackBottomIndex; //token[startSymbolIndex]->followCount=1; do { i=0; addNew=false; while (i<tokenCount) { //the terminal contains itself if (!token[i]->terminal) { for (int tIndex=0; tIndex<tokenCount; tIndex++) { //for all its rules if (token[tIndex]->terminal) { continue; } //for each its rule for (int j=0; j<token[tIndex]->count; j++) { int theToken, k=0, theRule=token[tIndex]->production[j]; started=false; theToken=production[theRule][k]; //for each token in each rule do { //the token appears here if (started) { if (addIntoFollow(i, theToken)) { addNew=true; } if (!token[theToken]->isNull) { break; } } if (theToken==i) { started=true; //add non epsilon set, including -1 situation!!! } //if it is not null, means it is end //if it is nullable, continue k++; theToken=production[theRule][k]; }while (theToken!=-1); //it means all token in this rule is nullable, so if (started&&theToken==-1) { //it is nullable //add current variable Follow(j) into Follow(i); if (addFollowIntoFollow(i, tIndex)) { addNew=true; } } } } } i++; } }while (addNew); } void Grammar::addRuleForToken(int tIndex, int ruleIndex) { token[tIndex]->production[token[tIndex]->count++]=ruleIndex; } void Grammar::shiftRule(int ruleIndex, bool left2Right, int offset) { int end=0; /* while (production[ruleIndex][end]!=-1) { end++; } */ end=ruleLen(ruleIndex); if (left2Right) { for (int i=end; i>=0; i--) { production[ruleIndex][i+offset]=production[ruleIndex][i]; } } else { for (int i=0; i<=end-offset; i++) { production[ruleIndex][i]=production[ruleIndex][i+offset]; } checkEpsilon(ruleIndex); } } void Grammar::checkEpsilon(int ruleIndex) { if (production[ruleIndex][0]==-1) { epsilonRule(ruleIndex); } } bool Grammar::addFollow(int target, int source) { //check if it is already there for (int i=0; i<token[target]->followCount; i++) { if (token[target]->follow[i]==source) { return false; } } //add at end token[target]->follow[token[target]->followCount++]=source; return true; } bool Grammar::addFirst(int target, int source) { //check if it is already there for (int i=0; i<token[target]->firstCount; i++) { if (token[target]->first[i]==source) { return false; } } //add at end token[target]->first[token[target]->firstCount++]=source; return true; } //add non epsilon into it. bool Grammar::addIntoFirst(int target, int source) { bool addNew=false; if (token[source]->terminal) { if (!token[source]->isNull) { return addFirst(target, source); } else { return false; } } for (int i=0; i<token[source]->firstCount; i++) { //add non epsilon if (!token[token[source]->first[i]]->isNull) { if (addFirst(target, token[source]->first[i])) { addNew=true; } } } return addNew; } bool Grammar::Null(int tIndex) { if (token[tIndex]->terminal) { return token[tIndex]->isNull; } for (int i=0; i<token[tIndex]->count; i++) { int j=0, theToken; theToken=production[token[tIndex]->production[i]][j]; do { if (theToken==tIndex||!Null(theToken)) { break; } j++; theToken=production[token[tIndex]->production[i]][j]; }while (theToken!=-1); if (theToken==-1) { return true; } } return false; } void Grammar::calculateNull() { for (int i=0; i<tokenCount; i++) { token[i]->isNull=Null(i); } } int Grammar::findMetaSymbol(int& begin, int& end) { int theRule, theToken, k; begin=end=-1; for (int i=0; i<tokenCount; i++) { for (int j=0; j<token[i]->count; j++) { k=0; theRule=token[i]->production[j]; theToken=production[theRule][k]; while (theToken!=-1) { if (theToken==beginIndex) { begin=k; } if (theToken==endIndex) { end=k; return theRule; } k++; theToken=production[theRule][k]; } } } return -1; } void Grammar::doReplaceMetaSymbol(int ruleIndex, int begin, int end) { int newTokenIndex=forgeToken(0), i=0; addRuleForToken(newTokenIndex, ++prodIndex); //token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex; //token[newTokenIndex]->terminal=false; copyRule(ruleIndex, prodIndex); //shrink while (production[ruleIndex][i+end+1]!=-1) { production[ruleIndex][begin+i]=production[ruleIndex][i+end+1]; i++; } production[ruleIndex][begin+i]=-1; addAtEnd(ruleIndex, newTokenIndex); /* production[ruleIndex][begin]=newTokenIndex; production[ruleIndex][begin+1]=-1; */ shiftRule(prodIndex, false, begin+1); production[prodIndex][end-begin-1]=newTokenIndex; production[prodIndex][end-begin]=-1; addEpsilonForToken(newTokenIndex); /* if (epsilonIndex==-1) { eRuleIndex=++prodIndex; epsilonRule(eRuleIndex); } else { eRuleIndex=epsilonIndex; } addRuleForToken(newTokenIndex, eRuleIndex); */ } void Grammar::replaceMetaSymbol() { int begin, end, ruleIndex; while ((ruleIndex=findMetaSymbol(begin, end))!=-1) { doReplaceMetaSymbol(ruleIndex, begin, end); } /* for (int i=0; i<tokenCount; i++) { //if (token[i]->isMeta) if (i==beginIndex||i==endIndex) { removeToken(i); } } */ } void Grammar::removeEpsilon(int ruleIndex) { int i=0; while (production[ruleIndex][i]!=-1) { if (production[ruleIndex][i]==emptyIndex) { do { production[ruleIndex][i]=production[ruleIndex][i+1]; i++; }while (production[ruleIndex][i]!=-1); return; } i++; } } void Grammar::addAtEnd(int ruleIndex, int toAdd) { int end; end=ruleLen(ruleIndex); production[ruleIndex][end++]=toAdd; production[ruleIndex][end]=-1; } //left-recursion: A -> A a | b | c //change to: A -> b A' | c A' // A' -> a A' | empty void Grammar::removeImmediate(int tIndex, int ruleIndex) { int newIndex=forgeToken(tIndex); int holdRuleIndex=token[tIndex]->production[ruleIndex]; //sequence matters! //change to: A -> b A' for (int i=0; i<token[tIndex]->count; i++) { if (i!=ruleIndex) { addAtEnd(token[tIndex]->production[i], newIndex); removeEpsilon(token[tIndex]->production[i]); } } //shift removeRuleFromToken(tIndex, ruleIndex); addRuleForToken(newIndex, holdRuleIndex); //token[newIndex]->production[token[newIndex]->count++]=holdRuleIndex; shiftRule(holdRuleIndex, false, 1); addAtEnd(holdRuleIndex, newIndex); //add epsilon rule for new variable addEpsilonForToken(newIndex); /* epsilonRule(++prodIndex); addRuleForToken(newIndex, prodIndex); */ } int Grammar::forgeToken(int index) { char str[MaxGrammarTokenLength+2], ch; int len=strlen(token[index]->name); int temp=0, i=0; strcpy(str, token[index]->name); ch=str[len-1];//get last char of name if (ch>='0'&&ch<'9') { str[len-1]=ch+i+1; } else { str[len]='0'+i; str[len+1]='\0'; } //I won't bother to check repetitation of name while (tokenIndex(str)!=-1) { i++; if (ch>='0'&&ch<'9') { str[len-1]=ch+i+1; } else { str[len]='0'+i; str[len+1]='\0'; } } return addToken(str, false);//it is non-terminal for sure } int Grammar::copyRule(int source, int target) { int i=0; while (production[source][i]!=-1) { production[target][i]=production[source][i]; i++; } production[target][i]=-1; return i; } void Grammar::doAddRule(int tIndex) { token[tIndex]->production[token[tIndex]->count++]=++prodIndex; } void Grammar::addRule(const char* str) { int index; index=addToken(str); production[prodIndex][curPos++]=index; production[prodIndex][curPos]=-1;//set end } //if the token is already there, it return the index //otherwise, it add new node in token array int Grammar::addToken(const char* str, bool isTerminal) { int index; index=tokenIndex(str); if (index==-1) { index=tokenCount; } else { return index; } token[index]=createToken(str, isTerminal); if (strcmp(str, optionBegin)==0) { beginIndex=index; } if (strcmp(str, optionEnd)==0) { endIndex=index; } tokenCount++; if (strcmp(str, emptyStr)==0) { token[index]->isNull=true; emptyIndex=index; } return index; } void Grammar::newRule(const char* str) { if (str!=NULL) { curIndex=addToken(str, false); } //add one pointer token[curIndex]->production[token[curIndex]->count++]=++prodIndex; token[curIndex]->terminal=false; curPos=0;//reset to restart; } GrammarToken* Grammar::createToken(const char* theName, bool isTerminal) { GrammarToken* ptr=new GrammarToken; ptr->name=new char[strlen(theName)+1]; strcpy(ptr->name, theName); ptr->terminal=isTerminal; ptr->count=ptr->firstCount=ptr->followCount=0; ptr->isNull=false; return ptr; } int Grammar::tokenIndex(const char* str) { for (int i=0; i<tokenCount; i++) { if (strcmp(str, token[i]->name)==0) { return i; } } return -1; } int Grammar::checkRecursion(int tIndex, int curToken) { for (int i=0; i<token[curToken]->count; i++) { //token[tIndex]->production[i]=ruleIndex //production[ruleIndex][0] is the first left-recursion one if (production[token[curToken]->production[i]][0]<=curToken) { return i; } } return -1; } void Grammar::printRule(int index) { int nodeIndex=0; //cout<<" ~"<<index<<"~ "; while (production[index][nodeIndex]!=-1) { //cout<<production[index][nodeIndex]<<" "<< //this is old line cout<<token[production[index][nodeIndex]]->name<<" "; //for debug: //cout<<"no."<<production[index][nodeIndex]<< // token[production[index][nodeIndex]]->name<<" "; nodeIndex++; } } void Grammar::printRule(int tIndex, int rulePos, int dotPos) { int nodeIndex=0; cout<<token[tIndex]->name<<" ==> "; //cout<<" ~"<<index<<"~ "; while (production[token[tIndex]->production[rulePos]][nodeIndex]!=-1) { if (nodeIndex==dotPos) { cout<<" . "; } cout<<token[production[token[tIndex]->production[rulePos]][nodeIndex]]->name<<" "; nodeIndex++; } if (nodeIndex==dotPos) { cout<<" . "; } //printRule(token[tIndex]->production[rulePos]); } void Grammar::initialize() { tokenCount=curIndex=curPos=0; prodIndex=-1;//in order to ++ blindly for (int i=0; i<MaxStateCount; i++) { for (int j=0; j<MaxTokenCount; j++) { LRTable[i][j]=-1; } } for (i=0; i<MaxTokenCount; i++) { for (int j=0; j<MaxTokenCount; j++) { table[i][j]=-1; } } } Grammar::Grammar() { initialize(); } void Grammar::removeLeftRecursion() { int tIndex=0, curToken=0; bool newChange=false; while (tIndex<tokenCount) { if (!token[tIndex]->terminal) { for (int i=0; i<token[tIndex]->count; i++) { curToken=production[token[tIndex]->production[i]][0]; if (curToken<=tIndex&&!token[curToken]->terminal) { if (curToken!=tIndex) { replaceRule(tIndex, i); } else { removeImmediate(tIndex, i); } newChange=true; } } } //whenever there is some new findings, restart if (!newChange) { tIndex++; } else { tIndex=0; newChange=false; } } } void Grammar::replaceRule(int tIndex, int ruleIndex) { int pos, j, targetEnd, sourceEnd, curRule; curRule=token[tIndex]->production[ruleIndex]; int curToken=production[curRule][0]; for (int i=token[curToken]->count-1; i>=0; i--) { if (i>0) { doAddRule(tIndex); pos=copyRule(token[curToken]->production[i], prodIndex); j=0; while (production[curRule][j+1]!=-1) { production[prodIndex][pos+j]=production[curRule][j+1]; j++; } production[prodIndex][pos+j]=-1; //addRuleForToken(curToken, prodIndex); } else { targetEnd=sourceEnd=0; //curRule=token[tIndex]->production[ruleIndex]; while (true) { if (production[token[curToken]->production[0]][sourceEnd]==-1&& production[curRule][targetEnd]==-1) { break; } if (production[token[curToken]->production[0]][sourceEnd]!=-1) { sourceEnd++; } if (production[curRule][targetEnd]!=-1) { targetEnd++; } } j=targetEnd+sourceEnd-1; while (j>=0) { if (j>=sourceEnd) { production[curRule][j]=production[curRule][j-sourceEnd+1]; } else { production[curRule][j]=production[token[curToken]->production[0]][j]; } j--; } } } } void Grammar::calculateLookAhead() { replaceMetaSymbol(); calculateNull(); cout<<setw(20)<<"The Rule "<<"Look-Ahead "<<endl; for (int i=0; i<tokenCount; i++) { if (token[i]->terminal) { continue; } for (int j=0; j<token[i]->count; j++) { int theRule, theToken, k=0; theRule=token[i]->production[j]; cout<<setw(5); cout<<token[i]->name<<" -> "; cout<<setiosflags(ios::left); printRule(theRule); cout<<" "; theToken=production[theRule][k]; while (theToken!=-1) { if (k!=0) { cout<<"+"; } cout<<"First("<<token[theToken]->name<<")"; if (!token[theToken]->isNull) { break; } k++; theToken=production[theRule][k]; } if (theToken==-1) { if (k!=0) { cout<<"+"; } cout<<"Follow("<<token[i]->name<<")"; } cout<<"\n"; } } } void Grammar::LLoptimize() { removeLeftRecursion(); commonLeftFactor(); } void Grammar::LRoptimize() { calculateProperty(); } //the return value is the count, not the index of rule int Grammar::ruleLen(int ruleIndex) { int end=0; while (production[ruleIndex][end]!=-1) { end++; } return end; } //optimize sequence is first common-factor then remove left recursion //therefore I don't have to check if for same variable if there will be //more than one left-recursion void Grammar::optimize(bool forLLGrammar) { replaceMetaSymbol(); if (forLLGrammar) { LLoptimize(); addBeginEnd(); } else { addBeginEnd(); LRoptimize(); } //removeRedundant(); calculateNull(); calculateFirst(); calculateFollow(); buildTable(forLLGrammar); } int Grammar::findCommonFactor(int tIndex, int ruleIndex) { for (int i=ruleIndex+1; i<token[tIndex]->count; i++) { //if the two rule has the same first token if (production[token[tIndex]->production[ruleIndex]][0]== production[token[tIndex]->production[i]][0]) { /* //calculate if there is epsilon if (emptyIndex==-1) { emptyIndex=tokenIndex(emptyStr); } //if it is not epsilon if (production[token[tIndex]->production[i]][0]!=emptyIndex) { return i; } */ return i; } } return -1; } void Grammar::epsilonRule(int ruleIndex) { production[ruleIndex][0]=addToken(emptyStr); production[ruleIndex][1]=-1; } //sample: x -> Aa // x -> Ab //changed to: x -> Ax' //this is to change the old rule // x' -> b //this is to change the old rule // x' -> a //this is the new-added rule void Grammar::doCommonFactor(int tIndex, int ruleIndex, int index) { int newTokenIndex=forgeToken(tIndex);//create a new variable //move the second and after part to the new rule of new variable //doing: x' -> a //curPos=0; //prepare to add one new rule addRuleForToken(newTokenIndex, ++prodIndex); //token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex; token[newTokenIndex]->terminal=false; copyRule(token[tIndex]->production[ruleIndex], prodIndex); shiftRule(prodIndex, false, 1); /* do { //do copying production[prodIndex][curPos]= production[token[tIndex]->production[ruleIndex]][curPos+1]; curPos++; //even the -1 at end is copied }while (production[token[tIndex]->production[ruleIndex]][curPos]!=-1); */ //in order to show an empty string, in case the string is "epsilon" /* if (curPos==1) { epsilonRule(prodIndex); } */ //replace x -> Aa with x -> Ax' production[token[tIndex]->production[ruleIndex]][1]=newTokenIndex; production[token[tIndex]->production[ruleIndex]][2]=-1;//end //doing: x' -> b //curPos=0; //prepare to add one new rule //pointing new token to where old rule lies addRuleForToken(newTokenIndex, token[tIndex]->production[index]); /* token[newTokenIndex]->production[token[newTokenIndex]->count++]= token[tIndex]->production[index]; */ shiftRule(token[tIndex]->production[index], false, 1); removeRuleFromToken(tIndex, index); } void Grammar::removeRuleFromToken(int tIndex, int ruleIndex) { token[tIndex]->count--; for (int i=ruleIndex; i<token[tIndex]->count; i++) { token[tIndex]->production[i]=token[tIndex]->production[i+1]; } } void Grammar::commonLeftFactor() { int index=-1, tIndex=0, ruleIndex=0; bool flag; //whenever a newrule is done, restart! while (tIndex<tokenCount) { ruleIndex=0; flag=false; while (ruleIndex<token[tIndex]->count) { index=findCommonFactor(tIndex, ruleIndex); if (index!=-1) { doCommonFactor(tIndex, ruleIndex, index); //restart flag=true; break; } else { ruleIndex++; } } if (flag) { tIndex=0; } else { tIndex++; } } } GrammarToken* Grammar::operator[](int index) { if (index>=0&&index<tokenCount) { return token[index]; } else { return NULL; } } void Grammar::print() { for (int i=0; i<tokenCount; i++) { if (!token[i]->terminal) { //to do: for test //cout<<"index."<<i<<" "<<token[i]->name<<" ==> "; cout<<token[i]->name<<" ==> "; for (int j=0; j<token[i]->count; j++) { //rule no cout<<"rule."<<token[i]->production[j]<<" "; printRule(token[i]->production[j]); if (j!=token[i]->count-1) { cout<<" | "; } } cout<<"\n"; } } } void Grammar::uninitializeDFA() { for (int i=0; i<itemCount; i++) { delete items[i]; } delete [] items; delete [] expandFinished; for (i=0; i<stateCount; i++) { delete sets[i]; } delete []sets; } void Grammar::initializeDFA() { items=new Item*[MaxItemCount]; for (int i=0; i<MaxItemCount; i++) { items[i]=new Item; items[i]->rulePos=items[i]->dotPos=items[i]->varIndex=-1; } sets= new BitSet*[MaxStateCount]; expandFinished=new bool[MaxStateCount]; for (i=0; i<MaxStateCount; i++) { sets[i]=new BitSet; sets[i]->reset(); expandFinished[i]=false; } itemCount=stateCount=0; //initialize LRTable for (i=0; i<MaxStateCount; i++) { for (int j=0; j<tokenCount; j++) { LRTable[i][j]=-1; } } } int Grammar::addItem(Item& theItem) { for (int i=0; i<itemCount; i++) { if (*(items[i])== theItem) { return i; } } items[i]=new Item; *(items[i])=theItem; itemCount++; return i; } void Grammar::addSonItem(int tIndex, BitSet& theSet) { if (!token[tIndex]->terminal) { for (int i=0; i<token[tIndex]->count; i++) { Item temp; int itemIndex;//easy reading temp.varIndex=tIndex; temp.rulePos=i; temp.dotPos=0; itemIndex=addItem(temp); theSet.set(itemIndex); //recursive addSonItem(production[token[tIndex]->production[i]][0], theSet); } } } int Grammar::createItem(int theVar, int theNo, int position) { Item temp; temp.varIndex=theVar; temp.rulePos=theNo; temp.dotPos=position; return addItem(temp); } void Grammar::constructDFA() { int startIndex, itemIndex, stateIndex; BitSet theSet; //should initialize everything!!! initializeDFA(); startIndex=createItem(startSymbolIndex, 0, 0); //theSet.set(startIndex); addClosure(startIndex, theSet); stateIndex=itemIndex=0; stateIndex= createState(theSet); expandState(stateIndex);//recursion inside //for all choices, do createState(startStart, tansition token) ( } int Grammar::item2Var(int itemIndex) { int tIndex, rulePos, dotPos, theVar, theRule; tIndex=items[itemIndex]->varIndex; rulePos=items[itemIndex]->rulePos; dotPos=items[itemIndex]->dotPos; theRule=token[tIndex]->production[rulePos]; //so, get the var theVar=production[theRule][dotPos]; return theVar; } void Grammar::addClosure(int itemIndex, BitSet& theSet) { int tIndex, rulePos, dotPos, theVar, theRule; if (!theSet.test(itemIndex)) { theSet.set(itemIndex);//add itself first!!! } else { //prevent looping return; } tIndex=items[itemIndex]->varIndex; rulePos=items[itemIndex]->rulePos; dotPos=items[itemIndex]->dotPos; theRule=token[tIndex]->production[rulePos]; //so, get the var theVar=production[theRule][dotPos]; //to skip the epsilon if (theVar==emptyIndex) { theVar=production[theRule][dotPos+1]; } //theVar=item2Var(itemIndex); if (theVar!=-1) { if (!token[theVar]->terminal) { for (int i=0; i<token[theVar]->count; i++) { Item temp; int newItemIndex;//easy reading temp.varIndex=theVar; temp.rulePos=i; temp.dotPos=0; newItemIndex=addItem(temp); //theSet.set(itemIndex); //recursive addClosure(newItemIndex, theSet); } } } } int Grammar::createState(const BitSet& theSet) { for (int i=0; i<stateCount; i++) { if (*(sets[i])==theSet) { return i; } } sets[i]=new BitSet; *sets[i] = theSet; stateCount++; return i; } //indicating if it is reduceable by returning true void Grammar::doExpandState(int itemIndex, int tIndex, BitSet& theSet) { int theNewItemIndex; //int theNextVar=findNextTokenOfItem(itemIndex); theNewItemIndex=createItem(items[itemIndex]->varIndex, items[itemIndex]->rulePos, items[itemIndex]->dotPos+1); /* if (tIndex!=stackBottomIndex) { acceptedItemIndex=theNewItemIndex; } */ addClosure(theNewItemIndex, theSet); // return findNextTokenOfItem(theNewItemIndex)==-1;//true=reduceable } int Grammar::item2Rule(int itemIndex) { int ruleIndex, rulePos, varIndex; //only for easy reading varIndex=items[itemIndex]->varIndex; rulePos=items[itemIndex]->rulePos; ruleIndex =token[varIndex]->production[rulePos]; return ruleIndex; } void Grammar::expandState(int stateIndex) { int itemIndex, targetState, tIndex, reduceIndex=-1, shiftIndex; BitSet theSet;//a new temp bool findShift=false, findReduce=false, isAccepted=false; //this will prevent looping expanding!!!??? expandFinished[stateIndex]=true; //to see if it is to reduce sets[stateIndex]->restart(); while ((itemIndex=sets[stateIndex]->next())!=-1) { tIndex=findNextTokenOfItem(itemIndex); if (tIndex==-1) { findReduce=true; /* int ruleIndex, rulePos, varIndex; //only for easy reading varIndex=items[itemIndex]->varIndex; rulePos=items[itemIndex]->rulePos; ruleIndex =token[varIndex]->production[rulePos]; */ int ruleIndex=item2Rule(itemIndex); add2Table(stateIndex, ruleIndex); if (reduceIndex!=-1) { errorHandle(ReduceReduceConflict, (void*)(items[itemIndex]), (void*)(items[reduceIndex])); } else { reduceIndex=itemIndex; } } else { shiftIndex=itemIndex; findShift=true; } } //how to do in LR(1)???? //I am currently working on LR(0). I try not to think about it if (findReduce&&findShift) { //in LR(1), this is not necessarily a conflict, because you need to //check the first and follow set? I forgot it now. :) errorHandle(ShiftReduceConflict, (void*)(items[shiftIndex]), (void*)(items[reduceIndex])); //temparily int temp1, temp2; temp1=items[shiftIndex]->varIndex; temp2=items[shiftIndex]->rulePos; cout<<"\nconflict at shift:\n"; printRule(temp1, temp2, items[shiftIndex]->dotPos); temp1=items[reduceIndex]->varIndex; temp2=items[reduceIndex]->rulePos; cout<<"\nconflict with reduce:\n"; printRule(temp1, temp2, items[reduceIndex]->dotPos); cout<<endl; return;//currently ignore LR(1) and should be improved when LR(1) } for (int i=0; i<tokenCount; i++) { if (i==emptyIndex||i==stackBottomIndex) { continue; } sets[stateIndex]->restart(); //for all items, search if there is token to match to advance the dot while ((itemIndex=sets[stateIndex]->next())!=-1) { if (findNextTokenOfItem(itemIndex)== i)// { //do I need to add acceptance here? if (i==stackBottomIndex) { } doExpandState(itemIndex, i, theSet);//should finish addClosure } } //only expandState after you find one "transition" if (theSet.count()>0)//if you do find some { targetState=createState(theSet); //here to write the LRTable add2Table(stateIndex, targetState, i);//3 params means to reduce findShift=true; theSet.reset();//prepare for next token //prevent infinite loop because DFA is cyclic graph if (!expandFinished[targetState]) { expandState(targetState); } }//no one is set } } //this is for reduce of LR(0) void Grammar::add2Table(int stateIndex, int ruleIndex) { int result= reduce2Mask(ruleIndex); //currently this is for LR(0) for (int i=0; i<tokenCount; i++) { if (token[i]->terminal) { if (LRTable[stateIndex][i]!=-1) { errorHandle(OverWritingLRTable); } LRTable[stateIndex][i]=result; } } } //this is for shift void Grammar::add2Table(int stateIndex, int targetIndex, int tIndex) { int result=targetIndex; if (token[tIndex]->terminal) { result= shift2Mask(targetIndex); } if (LRTable[stateIndex][tIndex]!=-1) { errorHandle(OverWritingLRTable); } LRTable[stateIndex][tIndex]=result; } int Grammar::findNextTokenOfItem(int itemIndex) { int tIndex, theNo, thePos, theVar; tIndex=items[itemIndex]->varIndex;//easy for reading theNo=items[itemIndex]->rulePos; thePos=items[itemIndex]->dotPos; //maybe I should write small function to wrap this kind of indexing!!!! theVar=production[token[tIndex]->production[theNo]][thePos]; return theVar; } bool Grammar::isReduce(int input) { return (SHIFTREDUCEMASK&input)==REDUCEMASK; } bool Grammar::isShift(int input) { return (SHIFTREDUCEMASK&input)==SHIFTMASK; } int Grammar::mask2State(int input) { return STATEMASK&input; } //internal test purpose void Grammar::printDFA() { for (int i=0; i<stateCount; i++) { cout<<"\nstate no."<<i<<":\n"; sets[i]->restart(); int j=0; while ((j=sets[i]->next())!=-1) { printRule(items[j]->varIndex, items[j]->rulePos, items[j]->dotPos); cout<<endl; } } } //these are trivial functions and will later be changed to be inline int Grammar::reduce2Mask(int input) { return REDUCEMASK|input; } int Grammar::shift2Mask(int input) { return SHIFTMASK|input; } BitSet::BitSet() { current=-1; reset(); } BitSet& BitSet::operator =(const BitSet& theSet) { reset(); this->operator |=(theSet); return *this; } int BitSet::next() { for (int i=current+1; i<bitset_size; i++) { if (at(i)) { current=i; return i; } } current=-1; return -1; } bool operator == (Item& first, Item& second) { return (first.dotPos==second.dotPos)&&(first.rulePos==second.rulePos)&& (first.varIndex==second.varIndex); }

#include "grammar.h" const int MaxStackLength=50; class Parser { private: bool isLLParser; int stack[MaxStackLength]; bool push(int num); bool pop(int& num); int top; void initialize(); bool pushToken(int tIndex, int theToken); void prepare(); void pushRule(int theRule); void LLParse(); void LRParse(); public: Parser(bool forLL=true); void parseFile(const char* fileName); };

#include <iostream> #include "parser.h" #include "scanner.h" #include "errorNo.h" using namespace std; Scanner scanner; extern Grammar grammar; const char* defaultListFile="nickListFile.txt"; extern int startSymbolIndex; extern int stackBottomIndex; extern int table[MaxTokenCount][MaxTokenCount]; extern int token2type[MaxTokenCount]; extern int type2token[MaxTokenCount]; extern int emptyIndex; bool Parser::pushToken(int tIndex, int theToken) { if (table[tIndex][theToken]!=-1) { cout<<grammar.token[tIndex]->name<<" => "; grammar.printRule(table[tIndex][theToken]); cout<<endl; pushRule(table[tIndex][theToken]); return true; } return false; } void Parser::pushRule(int theRule) { int len=0; while (grammar.production[theRule][len]!=-1) { len++; } for (int i=len-1; i>=0; i--) { if (!push(grammar.production[theRule][i])) { errorHandle(StackOverFlow); } } } void Parser::LLParse() { int theToken, theVar; bool canPop=true; while (scanner.nextToken()) { if (!canPop) { errorHandle(UnexpectedEmptyStack); } if (scanner.token.type==COMMENTTYPE) { continue; } theToken=type2token[scanner.token.type]; while ((canPop=pop(theVar))==true) { if (grammar.token[theVar]->terminal) { if (theToken==theVar)//match { break; } else { if (theVar==emptyIndex) { continue; } errorHandle(IllegalGrammarToken); } } else { if (!pushToken(theVar, theToken)) { errorHandle(IllegalGrammarToken); } } } } if (pop(theVar)) { if (theVar!=stackBottomIndex) { errorHandle(NotEmptyStack); } /* if (theVar!=stackBottomIndex) { errorHandle(NotEmptyStack); } */ } } void Parser::LRParse() { /* int theToken, theVar; bool canPop=true; while (scanner.nextToken()) { if (!canPop) { errorHandle(UnexpectedEmptyStack); } if (scanner.token.type==COMMENTTYPE) { continue; } theToken=type2token[scanner.token.type]; //if (grammar.isShift(LRTable[ } /* while ((canPop=pop(theVar))==true) { if (grammar.token[theVar]->terminal) { if (theToken==theVar)//match { break; } else { if (theVar==emptyIndex) { continue; } errorHandle(IllegalGrammarToken); } } else { if (!pushToken(theVar, theToken)) { errorHandle(IllegalGrammarToken); } } } } if (pop(theVar)) { if (theVar!=stackBottomIndex) { errorHandle(NotEmptyStack); } /* if (theVar!=stackBottomIndex) { errorHandle(NotEmptyStack); } } */ } void Parser::parseFile(const char*fileName) { scanner.readFromFile(fileName, defaultListFile); prepare(); if (isLLParser) { LLParse(); } else { LRParse(); } /* while (scanner.nextToken()) { if (!canPop) { errorHandle(UnexpectedEmptyStack); } if (scanner.token.type==COMMENTTYPE) { continue; } theToken=type2token[scanner.token.type]; while ((canPop=pop(theVar))==true) { if (grammar.token[theVar]->terminal) { if (theToken==theVar)//match { break; } else { if (theVar==emptyIndex) { continue; } errorHandle(IllegalGrammarToken); } } else { if (!pushToken(theVar, theToken)) { errorHandle(IllegalGrammarToken); } } } } if (pop(theVar)) { if (theVar!=stackBottomIndex) { errorHandle(NotEmptyStack); } /* if (theVar!=stackBottomIndex) { errorHandle(NotEmptyStack); } } */ } Parser::Parser(bool forLL) { isLLParser=forLL; initialize(); } void Parser::initialize() { top=0; } bool Parser::pop(int& num) { if (top==0) { return false; } num=stack[--top]; return true; } bool Parser::push(int num) { if (top==MaxStackLength-1) { return false; } stack[top++]=num; return true; } void Parser::prepare() { top=0; if (isLLParser) { pushRule(grammar.token[startSymbolIndex]->production[0]); } else { push(0); } }

/////////////////////////////////////////////////////////////////////////////////////////// //Program: SLang Scanner //Author: Qingzhe Huang //Date: Jan. 18, 2004 //FileName: scanner.h //Features: // 1. I want to improve efficiency of scanning, so I used table-driven method. // 2. I used enum to represent character of all ASCII---CharType---where "space, tab, // end of line, end of file are all considered to be White Space. // 3. All legal token is represented by enum TokenType. // 4. I defined a huge amount of TokenState which is basically the state of a DFA. As // I don't want to search reserved keyword with linear search or whatever, I have // many states for the reserved words. // 5. I deliberately make the sequence of first 38 TokenState elements exactly same as // all that of TokenType, so that each final state of DFA has a 1-1 correspondence with // type of token. // 6. I defined a struct of Token which may be used in future parser. // 7. I defined an errorNo variable to represent various errors. And a series error string // for displaying information. // 8. When class Scanner is created, it will initialize the big "state-charType" table. // 9. When readFromFile is called, it will first read one char in advance. // 10. When an error is encountered, the caller of Scanner should understand that no further // char is read in. So, stop calling "nextToken()". This is a bit controvercial, and I // plan to change it in next version. //////////////////////////////////////////////////////////////////////////////////////////// /*//////////////////////////////////////////////////////////////////////////// Program: SLang Scanner Author: Qingzhe Huang Date: Jan. 21, 2004 FileName: scanner.h Features: 1. I restructured the struct Token, to make it a union field in order to store int value for number. 2. I restructured the function "nextToken()" in order to give out correct line no. when error occurs. *//////////////////////////////////////////////////////////////////////////////// #ifndef SCANNER_H #define SCANNER_H #include <iostream> using namespace std; extern enum ErrorCode; const int TokenStateCount=138; const int CharTypeCount=72; const int MaxTokenLength=255; const int MaxNumberLength=12; enum CharType { //all small letters 26 SMALLA,SMALLB,SMALLC,SMALLD,SMALLE,SMALLF,SMALLG,SMALLH,SMALLI,SMALLJ,SMALLK,SMALLL, SMALLM,SMALLN,SMALLO,SMALLP,SMALLQ,SMALLR,SMALLS,SMALLT,SMALLU,SMALLV,SMALLW,SMALLX, SMALLY,SMALLZ, //all big letters 26 BIGA,BIGB,BIGC,BIGD,BIGE,BIGF,BIGG,BIGH,BIGI,BIGJ,BIGK,BIGL,BIGM,BIGN,BIGO,BIGP,BIGQ, BIGR,BIGS,BIGT,BIGU,BIGV,BIGW,BIGX,BIGY,BIGZ, //all digit 1 DIGIT, //all symbols 16 QUOTE, OPENPAR, CLOSEPAR, SEMICOLON,PLUS, MINUS, TIMES, SLASH, COLON, EQUAL,SMALLER,GREATER,EXCLAIM,OPENBRACKET, CLOSEBRACKET,COMMA, //space, tab, end of line are regarded as whitespace, 1 WHITESPACE, //UNDERSCORE IS A SPECIAL SYMBOL 1 UNDERSCORE, //all other ASCII is regarded as illegal 1 ILLEGAL }; //TOTAL 38, JUST 1-1 WITH THE FIRST 38 OF TOKENSTATE enum TokenType { //GENERAL TYPE 5 IDTYPE, NUMBERTYPE, CHARCONSTTYPE, COMMENTTYPE, ERRORTYPE, //THE FOLLOWING ARE SYMBOL TYPE 18 OPENPARTYPE, CLOSEPARTYPE, SEMICOLONTYPE, PLUSTYPE, MINUSTYPE, TIMESTYPE, SLASHTYPE, ASSIGNMENTTYPE, SMALLERTYPE, GREATERTYPE, EQUALTYPE, SMALLEREQUALTYPE, GREATEREQUALTYPE, NOTEQUALTYPE, OPENBRACKETTYPE, CLOSEBRACKETTYPE, COMMATYPE, COLONTYPE, //THE FOLLOWING ARE RESERVED TYPE 15 BEGINTYPE, ENDTYPE, PROGRAMTYPE, VARIABLESTYPE,INTEGERTYPE, ARRAYTYPE, CHARTYPE, MODULETYPE, IFTYPE, THENTYPE, ELSETYPE, LOOPTYPE, EXITTYPE, READTYPE, WRITETYPE }; //total 138 states enum TokenState { //THE FINAL STATE 38, in order to easy initialize "finalState", I put them in beginning //5 generals IDEND, NUMBEREND, CONSTCHAREND, COMMENTEND, ERROR, //18 symbols OPENPAREND, CLOSEPAREND, SEMICOLONEND, PLUSEND, MINUSEND, TIMESEND, SLASHEND, ASSIGNMENTEND, SMALLEREND, GREATEREND, EQUALEND, SMALLEREQUALEND, GREATEREQUALEND, NOTEQUALEND, OPENBRACKETEND, CLOSEBRACKETEND, COMMAEND, COLONEND, //15 reserved BEGINEND, ENDEND, PROGRAMEND, VARIABLESEND, INTEGEREND, ARRAYEND, CHAREND, MODULEEND, IFEND, THENEND, ELSEEND, LOOPEND, EXITEND, READEND, WRITEEND, //THE FOLLOWING ARE ALL NON-FINAL STATES //THE very FIRST CHAR 1 READY, //THE FOLLOWING ARE ALL RESERVED STATE //the first char 12 ARRAY1, BEGIN1, CHAR1, E1, I1, LOOP1, MODULE1, PROGRAM1, READ1, THEN1, VARIABLES1, WRITE1, //THE SECOND CHAR 15 ARRAY2, BEGIN2, CHAR2, ELSE2, END2, EXIT2, IF2, INTEGER2, LOOP2, MODULE2, PROGRAM2, READ2, THEN2, VARIABLES2, WRITE2, //THE THIRD CHAR 14 ARRAY3, BEGIN3, CHAR3, ELSE3, END3, EXIT3, INTEGER3, LOOP3, MODULE3, PROGRAM3, READ3, THEN3, VARIABLES3, WRITE3, //THE FOURTH CHAR 13 ARRAY4, BEGIN4, CHAR4, ELSE4, EXIT4, INTEGER4, LOOP4, MODULE4, PROGRAM4, READ4, THEN4, VARIABLES4, WRITE4, //THE FIFTH CHAR 7 ARRAY5, BEGIN5, INTEGER5, MODULE5, PROGRAM5, VARIABLES5, WRITE5, //THE SIXTH CHAR 4 INTEGER6, MODULE6, PROGRAM6, VARIABLES6, //THE SEVENTH CHAR 3 INTEGER7, PROGRAM7, VARIABLES7, //THE EIGHTH CHAR 1 VARIABLES8, //THE NINETH CHAR 1 VARIABLES9, //THESE ARE NON-RESERVED //THESE ARE GENERAL 9 IDBEGIN, IDUNDERSCORE, NUMBERBEGIN, CONSTCHARQUOTEBEGIN, CONSTCHARBEGIN, COMMENTSTARBEGIN, COMMENTBEGIN, COMMENTSTAREND, COMMENTSLASHBEGIN, //the SINGLE symbols 16 QUOTEBEGIN, OPENPARBEGIN, CLOSEPARBEGIN, SEMICOLONBEGIN, PLUSBEGIN, MINUSBEGIN, TIMESBEGIN, SLASHBEGIN, COLONBEGIN, SMALLERBEGIN, GREATERBEGIN, EQUALBEGIN, EXCLAIMBEGIN, OPENBRACKETBEGIN, CLOSEBRACKETBEGIN, COMMABEGIN, //MULTI SYMBOL 4 ASSIGNMENTBEGIN, SMALLEREQUALBEGIN, GREATEREQUALBEGIN, NOTEQUALBEGIN }; //extern ErrorCode errorNo; extern void errorHandle(ErrorCode errorNo); struct Token { TokenType type; union { char name[MaxTokenLength+1]; int number; }; }; class Scanner { private: int tokenCount; unsigned char ch; void printLineNo(); FILE* stream; bool nextChar(); void initialize(); bool resume(); public: Scanner(); static Token token; bool readFromFile(const char* fileName, const char* listFileName); const char* getToken(){return token.name;} bool nextToken(); void report(); }; void initialTokenState(); #endif

/*//////////////////////////////////////////////////////////////////////////// Program: SLang Scanner Author: Qingzhe Huang Date: Jan. 21, 2004 FileName: scanner.cpp Features: 1. As Dr. Optrany said, the number should be stored as int or double whatever. 2. I restructured the function "nextToken()" in order to give out correct line no. when error occurs. *//////////////////////////////////////////////////////////////////////////////// #include <iostream> #include <fstream> #include "scanner.h" #include "errorNo.h" using namespace std; ofstream fList; //this will determine how many errors of maximum the scanner will tolerant const int MaxErrortolerant=10; //as integer usually have max 12 digit roughly int errorCount=0; int lineCount=1; //static memeber Token Scanner::token; const int TokenTypeCount=38; //extern void errorHandle(int errorNo); extern char* errorStr[ErrorCount]; //this is purely for displaying purpose char* tokenTypeStr[TokenTypeCount]= { //GENERAL TYPE 5 "ID", "NUMBER", "CHARACTER CONSTANT", "COMMENT", "ERROR", //THE FOLLOWING ARE SYMBOL TYPE 18 "(", ")", ";", "+", "-", "*", "/", ":=", "<", ">", "=", "<=", ">=", "!=", "[", "]", ",", ":", //THE FOLLOWING ARE RESERVED TYPE 15 "begin", "end", "program", "variables","integer", "array", "char", "module", "if", "then", "else", "loop", "exit", "read", "write" }; CharType charType[256]; TokenState tokenState[TokenStateCount][CharTypeCount]; //when error occurs, no message is immediately output, it is //postponed to next time, because when '\n' is read in, //lineCount is not incremented until token is decided. //Therefore, when a token is ended with '\n', the line no is not updated //until next round. So, we can keep the correct line no. for each token bool Scanner::nextToken() { TokenState state=READY; int digitCount=0; int value=0; int count=0;//to count the length of token char* ptr=token.name; bool isComment=false; do { //map ch to CharType reducing 256 ASCII to 73 CharTypes //the table for "state" and "CharType is 138x73, each entry is a //index for state. state=tokenState[state][charType[ch]]; if (state==NUMBERBEGIN) { digitCount++; if (digitCount>=MaxNumberLength) { errorHandle(ExceedNumberLimit); return resume(); } //to accumulate the value value*=10; value+=ch-'0'; } //because I put all final state in the first 38 positions if (state<38) { //This is a dirty trick! Because I make the "TokenType" 1-1 with //TokenState for the 38 finals. *ptr='\0'; token.type=(TokenType)(state); if (state==ERROR) { errorHandle(IllegalToken); //printLineNo(); return resume(); } tokenCount++; if (state==NUMBEREND) { token.number=value; } //printLineNo(); return true; } if (state==COMMENTBEGIN) { isComment=true; } if (count>=MaxTokenLength) { errorHandle(TokenTooLong); token.type=ERRORTYPE; //printLineNo(); return false; } //cout<<ch; if (!isComment&&state!=READY) { *ptr=ch; ptr++; count++; } //it is only at end to update line no. printLineNo(); }while (nextChar()); state=tokenState[state][charType[ch]]; //at this point, it is either in ready state, or error state if (state==ERROR) { token.type=(TokenType)(state); errorHandle(UnexpectedReachEOF); } //but in all case it means end of file, so return false return false; } bool Scanner::resume() { //the scanner will try to continue if error number is within 10 if (errorCount==MaxErrortolerant) { return false; } return nextChar(); } void Scanner::report() { fList<<"\ntotal number of tokens is "<<tokenCount; fList<<"\ntotal number of errors is "<<errorCount; } void Scanner::printLineNo() { if (ch=='\n') { fList<<++lineCount<<" "; } } Scanner::Scanner() { initialize(); } void Scanner::initialize() { errorCount=0; lineCount=1; tokenCount=0; initialTokenState(); } bool Scanner::readFromFile(const char* fileName, const char* listFileName) { if ((stream=fopen(fileName, "r"))==NULL) { errorHandle(CannotOpenFile); return false; } else { fList.open(listFileName); fList<<lineCount<<" "; //this is to prevent the empty file situation in which //you cannot even read one single char because my scanner need to read //one char ahead if (!nextChar()) { errorHandle(FileEmptyError); return false; } } return true; } bool Scanner::nextChar() { ch=fgetc(stream); fList<<ch; return ch!=255; }

#ifndef ERRORNO_H #define ERRORNO_H extern char* errorStr[]; void errorHandle(int errorNo, void* param1=NULL, void* param2=NULL); /* class ErrorInfo { private: public: ErrorInfo(int errorNo); void addInt(int num); }; */ const int ScannerErrorCount=6; const int ParserErrorCount=4; const int LRParserErrorCount=3; const int SymTabErrorCount=3; const int ErrorCount=ScannerErrorCount+ParserErrorCount+LRParserErrorCount+SymTabErrorCount; //errors of scanner = 6 #define IllegalToken 0 #define TokenTooLong 1 #define UnexpectedReachEOF 2 #define FileEmptyError 3 #define CannotOpenFile 4 #define ExceedNumberLimit 5 //error of parser =4 #define UnexpectedEmptyStack 6 #define IllegalGrammarToken 7 #define NotEmptyStack 8 #define StackOverFlow 9 //error of LR(0) = 3 #define ShiftReduceConflict 10 #define ReduceReduceConflict 11 #define OverWritingLRTable 12 //error of symbol table #define TooManyIdentifier 13 #define VariableRedeclared 14 #define VariableUndeclared 15 #endif

#include <iostream> #include <fstream> #include "scanner.h" #include "errorNo.h" #include "grammar.h" using namespace std; extern Grammar grammar; char* errorStr[ErrorCount]= { //these are scanner errors: "IllegalToken", "TokenTooLong", "UnexpectedReachEOF", "FileEmptyError", "CannotOpenFile", "ExceedNumberLimit", //these are parser errors "UnexpectedEmptyStack", "IllegalGrammarToken", "NotEmptyStack", "StackOverFlow", //LR parser error "ShiftReduceConflict", "ReduceReduceConflict", "OverWritingLRTable", //error of symtab "TooManyIdentifier", "VariableRedeclared", "VariableUndeclared" }; extern int errorCount; extern int lineCount; extern ofstream fList; //this is going to be improved in future as parser need to //call it, too. so, more parameter should be added? //No! the error no. itself specifies the error and it is //error handler to try to find necessary info to display. void errorHandle(int errorNo, void* param1, void* param2) { void printItem(Item* ptr); //void printItem(Item* ptr) //{ int total=ScannerErrorCount; Item* ptr; if (errorNo<total) { errorCount++; //the illegal token may be for various reason and I only suggest //a possible nearby place to spot the error occurs. fList<<"\nerror of "<<errorStr[errorNo]<<" occurred at line " <<lineCount<<" near token "<<Scanner::token.name<<endl; } else { total+=ParserErrorCount; if (errorNo<total) { errorCount++; fList<<"\nerror of "<<errorStr[errorNo]<<" occurred at line " <<lineCount<<" near token "<<Scanner::token.name<<endl; //if (param1!=NULL) //for debug { cout<<errorStr[errorNo]<<" occured at line "<<lineCount <<" near token "<<Scanner::token.name<<endl; } exit(errorNo); } else { total+=LRParserErrorCount; if (errorNo<total) { errorCount++; //temparorily cout<<errorStr[errorNo]<<endl; switch (errorNo) { case ShiftReduceConflict: ptr=((Item*)(param1)); cout<<"\nconflict at shift:\n"; printItem(ptr); ptr=((Item*)(param2)); cout<<"\nconflict with reduce:\n"; printItem(ptr); cout<<endl; break; case ReduceReduceConflict: ptr=((Item*)(param1)); cout<<"\nconflict at reduce:\n"; printItem(ptr); ptr=((Item*)(param2)); cout<<"\nconflict with reduce:\n"; printItem(ptr); cout<<endl; break; } } else { total+=SymTabErrorCount; if (errorNo<total) { cout<<"error of "<<errorStr[errorNo]<<" at line "<<lineCount<<" "; if (errorNo==VariableRedeclared||errorNo==VariableUndeclared) { cout<<" "<<(char*)param1; } cout<<"\n"; } } } } } void printItem(Item* ptr) { int temp1, temp2; temp1=ptr->varIndex; temp2=ptr->rulePos; grammar.printRule(temp1, temp2, ptr->dotPos); }

/*//////////////////////////////////////////////////////////////////////////// Program: SLang Scanner Author: Qingzhe Huang Date: Jan. 18, 2004 FileName: initialize.cpp Features: 1. This is purely mechnical job, you know to initialize a huge state table: 138x72 is really a boring, routine job. 2. For EOF, I want "ch" to be able to be an index in "CharType" array, so, it cannot be -1, but 255 for "unsigned char" which is declared in class Scanner. *//////////////////////////////////////////////////////////////////////////////// #include "scanner.h" extern enum CharType; extern enum TokenState; extern CharType charType[256]; extern TokenState tokenState[TokenStateCount][CharTypeCount]; void finalSymbolToken(TokenState state, TokenState endState); void finalReservedToken(TokenState state, TokenState endState); void initialCharType(); void setFinalTokenState(); void initialReserved(TokenState state); void setRange(TokenState state, CharType start, CharType end, TokenState target); void setState(TokenState state, TokenState targetState); void setDefaultState(); void setDefaultState() { //all states are by default error for (int i=0; i<TokenStateCount; i++) { setState((TokenState)i, ERROR); } //the default for all letters are IDBEGIN setRange(READY, SMALLA, BIGZ, IDBEGIN); //THIS IS another dirty trick, since I put all reserved states together //so you can initialize them together. for (i=ARRAY1; i<=VARIABLES9; i++) { initialReserved((TokenState)i); } setFinalTokenState(); } void setFinalTokenState() { //FOR ID finalReservedToken(IDBEGIN, IDEND); //for number finalReservedToken(NUMBERBEGIN, NUMBEREND); //THESE FOR RESERVED WORDS finalReservedToken(ARRAY5, ARRAYEND); finalReservedToken(BEGIN5, BEGINEND); finalReservedToken(CHAR4, CHAREND); finalReservedToken(ELSE4, ELSEEND); finalReservedToken(END3, ENDEND); finalReservedToken(EXIT4, EXITEND); finalReservedToken(IF2, IFEND); finalReservedToken(INTEGER7, INTEGEREND); finalReservedToken(LOOP4, LOOPEND); finalReservedToken(MODULE6, MODULEEND); finalReservedToken(PROGRAM7, PROGRAMEND); finalReservedToken(READ4, READEND); finalReservedToken(THEN4, THENEND); finalReservedToken(VARIABLES9, VARIABLESEND); finalReservedToken(WRITE5, WRITEEND); //THESE FOR SYMBOLS finalSymbolToken(OPENPARBEGIN, OPENPAREND); finalSymbolToken(CLOSEPARBEGIN, CLOSEPAREND); finalSymbolToken(SEMICOLONBEGIN, SEMICOLONEND); finalSymbolToken(PLUSBEGIN, PLUSEND); finalSymbolToken(MINUSBEGIN, MINUSEND); finalSymbolToken(TIMESBEGIN, TIMESEND); finalSymbolToken(SLASHBEGIN, SLASHEND); finalSymbolToken(ASSIGNMENTBEGIN, ASSIGNMENTEND); finalSymbolToken(SMALLERBEGIN, SMALLEREND); finalSymbolToken(GREATERBEGIN, GREATEREND); finalSymbolToken(EQUALBEGIN, EQUALEND); finalSymbolToken(SMALLEREQUALBEGIN, SMALLEREQUALEND); finalSymbolToken(GREATEREQUALBEGIN, GREATEREQUALEND); finalSymbolToken(NOTEQUALBEGIN, NOTEQUALEND); finalSymbolToken(OPENBRACKETBEGIN, OPENBRACKETEND); finalSymbolToken(CLOSEBRACKETBEGIN, CLOSEBRACKETEND); finalSymbolToken(COMMABEGIN, COMMAEND); finalSymbolToken(COLONBEGIN, COLONEND); //COMMENT finalSymbolToken(COMMENTSLASHBEGIN, COMMENTEND); //CONSTCHAR finalSymbolToken(CONSTCHARQUOTEBEGIN, CONSTCHAREND); } void initialTokenState() { //initialize all charType initialCharType(); //default is always error setDefaultState(); //loop tokenState[READY][WHITESPACE]=READY; //number tokenState[READY][DIGIT]=NUMBERBEGIN; tokenState[NUMBERBEGIN][DIGIT]=NUMBERBEGIN;//HOW LONG SHOULD NUMBER BE? //ID //setRange(READY, SMALLA, BIGZ, IDBEGIN); THIS IS IN DEFAULT setRange(IDBEGIN, SMALLA, DIGIT, IDBEGIN); tokenState[IDBEGIN][UNDERSCORE]=IDUNDERSCORE; setRange(IDUNDERSCORE, SMALLA, DIGIT, IDBEGIN); //reserved words //ARRAY1, BEGIN1, CHAR1, E1, I1, LOOP1, MODULE1, PROGRAM1, READ1, THEN1, WRITE1, //VARIABLES1, tokenState[READY][SMALLA]=ARRAY1; tokenState[READY][SMALLB]=BEGIN1; tokenState[READY][SMALLC]=CHAR1; tokenState[READY][SMALLE]=E1; tokenState[READY][SMALLI]=I1; tokenState[READY][SMALLL]=LOOP1; tokenState[READY][SMALLM]=MODULE1; tokenState[READY][SMALLP]=PROGRAM1; tokenState[READY][SMALLR]=READ1; tokenState[READY][SMALLT]=THEN1; tokenState[READY][SMALLV]=VARIABLES1; tokenState[READY][SMALLW]=WRITE1; /* RESERVED WORDS ARRAY2 */ tokenState[ARRAY1][SMALLR]=ARRAY2; //BEGIN2 tokenState[BEGIN1][SMALLE]=BEGIN2; //CHAR2 tokenState[CHAR1][SMALLH]=CHAR2; //ELSE2, tokenState[E1][SMALLL]=ELSE2; //EXIT2 tokenState[E1][SMALLX]=EXIT2; //END2 tokenState[E1][SMALLN]=END2; //IF2 tokenState[I1][SMALLF]=IF2; //INTEGER2 tokenState[I1][SMALLN]=INTEGER2; //LOOP2 tokenState[LOOP1][SMALLO]=LOOP2; //MODULE2 tokenState[MODULE1][SMALLO]=MODULE2; //PROGRAM2 tokenState[PROGRAM1][SMALLR]=PROGRAM2; //READ2 tokenState[READ1][SMALLE]=READ2; //THEN2 tokenState[THEN1][SMALLH]=THEN2; //VARIABLES2 tokenState[VARIABLES1][SMALLA]=VARIABLES2; //WRITE2 tokenState[WRITE1][SMALLR]=WRITE2; /* RESERVED WORDS ARRAY3 */ tokenState[ARRAY2][SMALLR]=ARRAY3; //BEGIN2 tokenState[BEGIN2][SMALLG]=BEGIN3; //CHAR2 tokenState[CHAR2][SMALLA]=CHAR3; //ELSE2, tokenState[ELSE2][SMALLS]=ELSE3; //END2 tokenState[END2][SMALLD]=END3; //EXIT2 tokenState[EXIT2][SMALLI]=EXIT3; //INTEGER2 tokenState[INTEGER2][SMALLT]=INTEGER3; //LOOP2 tokenState[LOOP2][SMALLO]=LOOP3; //MODULE2 tokenState[MODULE2][SMALLD]=MODULE3; //PROGRAM2 tokenState[PROGRAM2][SMALLO]=PROGRAM3; //READ2 tokenState[READ2][SMALLA]=READ3; //THEN2 tokenState[THEN2][SMALLE]=THEN3; //VARIABLES2 tokenState[VARIABLES2][SMALLR]=VARIABLES3; //WRITE2 tokenState[WRITE2][SMALLI]=WRITE3; /* RESERVED WORDS ARRAY3 */ tokenState[ARRAY3][SMALLA]=ARRAY4; //BEGIN2 tokenState[BEGIN3][SMALLI]=BEGIN4; //CHAR2 tokenState[CHAR3][SMALLR]=CHAR4; //ELSE2, tokenState[ELSE3][SMALLE]=ELSE4; //EXIT2 tokenState[EXIT3][SMALLT]=EXIT4; //INTEGER2 tokenState[INTEGER3][SMALLE]=INTEGER4; //LOOP2 tokenState[LOOP3][SMALLP]=LOOP4; //MODULE2 tokenState[MODULE3][SMALLU]=MODULE4; //PROGRAM2 tokenState[PROGRAM3][SMALLG]=PROGRAM4; //READ2 tokenState[READ3][SMALLD]=READ4; //THEN2 tokenState[THEN3][SMALLN]=THEN4; //VARIABLES2 tokenState[VARIABLES3][SMALLI]=VARIABLES4; //WRITE2 tokenState[WRITE3][SMALLT]=WRITE4; /* RESERVED WORDS ARRAY */ tokenState[ARRAY4][SMALLY]=ARRAY5; //BEGIN2 tokenState[BEGIN4][SMALLN]=BEGIN5; //INTEGER2 tokenState[INTEGER4][SMALLG]=INTEGER5; //MODULE2 tokenState[MODULE4][SMALLL]=MODULE5; //PROGRAM2 tokenState[PROGRAM4][SMALLR]=PROGRAM5; //VARIABLES2 tokenState[VARIABLES4][SMALLA]=VARIABLES5; //WRITE2 tokenState[WRITE4][SMALLE]=WRITE5; // RESERVED WORDS*/ //INTEGER2 tokenState[INTEGER5][SMALLE]=INTEGER6; //MODULE2 tokenState[MODULE5][SMALLE]=MODULE6; //PROGRAM2 tokenState[PROGRAM5][SMALLA]=PROGRAM6; //VARIABLES2 tokenState[VARIABLES5][SMALLB]=VARIABLES6; // RESERVED WORDS*/ //INTEGER2 tokenState[INTEGER6][SMALLR]=INTEGER7; //PROGRAM2 tokenState[PROGRAM6][SMALLM]=PROGRAM7; //VARIABLES2 tokenState[VARIABLES6][SMALLL]=VARIABLES7; // RESERVED WORDS*/ //VARIABLES2 tokenState[VARIABLES7][SMALLE]=VARIABLES8; //VARIABLES2 tokenState[VARIABLES8][SMALLS]=VARIABLES9; /* CONSTCHAR, UNDERSCOREBEGIN, ASSIGNMENTBEGIN, SMALLEREQUALBEGIN, GREATEREQUALBEGIN, NOTEQUAL, COMMENTBEGIN, IDUNDERSCORE,*/ //now is the symbols //QUOTEBEGIN, OPENPARBEGIN, CLOSEPARBEGIN, SEMICOLONBEGIN, //PLUSBEGIN, MINUSBEGIN, TIMESBEGIN, SLASHBEGIN, COLONBEGIN, SMALLERBEGIN, GREATERBEGIN, //EQUALBEGIN, EXCLAIMBEGIN, OPENBRACKETBEGIN, CLOSEBRACKETBEGIN, COMMABEGIN, //' tokenState[READY][QUOTE]=QUOTEBEGIN; //( tokenState[READY][OPENPAR]=OPENPARBEGIN; //) tokenState[READY][CLOSEPAR]=CLOSEPARBEGIN; //; tokenState[READY][SEMICOLON]=SEMICOLONBEGIN; //+ tokenState[READY][PLUS]=PLUSBEGIN; //- tokenState[READY][MINUS]=MINUSBEGIN; //* tokenState[READY][TIMES]=TIMESBEGIN; /// tokenState[READY][SLASH]=SLASHBEGIN; //: tokenState[READY][COLON]=COLONBEGIN; //< tokenState[READY][SMALLER]=SMALLERBEGIN; //> tokenState[READY][GREATER]=GREATERBEGIN; //= tokenState[READY][EQUAL]=EQUALBEGIN; //! tokenState[READY][EXCLAIM]=EXCLAIMBEGIN; //[ tokenState[READY][OPENBRACKET]=OPENBRACKETBEGIN; //] tokenState[READY][CLOSEBRACKET]=CLOSEBRACKETBEGIN; //, tokenState[READY][COMMA]=COMMABEGIN; //AFTER QUOTE IT CAN BE ANY CHARACTER, INCLUDING ILLEGAL CHAR setRange(QUOTEBEGIN, SMALLA, ILLEGAL, CONSTCHARBEGIN); //ANY OTHER STATE IS BY DEFAULT ERROR tokenState[CONSTCHARBEGIN][QUOTE]=CONSTCHARQUOTEBEGIN; //FOR /, DEFAULT IS SLASHEND, EXCEPT * WHICH IS COMMENTSTARBEGIN tokenState[SLASHBEGIN][TIMES]= COMMENTSTARBEGIN; //FOR :, DEFAULT IS COLONEND, EXCEPT FOR = WHICH IS ASSIGNMENTBEGIN tokenState[COLONBEGIN][EQUAL]= ASSIGNMENTBEGIN; //FOR <, DEFAULT IS SMALLEREND, EXCEPT FOR= WHICH IS SMALLEREQAULBEGIN tokenState[SMALLERBEGIN][EQUAL]=SMALLEREQUALBEGIN; //FOR >, DEFAULT IS GREATEREND, EXCEPT FOR= WHICH IS GREATEREQAULBEGIN tokenState[GREATERBEGIN][EQUAL]= GREATEREQUALBEGIN; tokenState[EXCLAIMBEGIN][EQUAL]= NOTEQUALBEGIN; //WITHIN COMMENT IT IS A LOOP, EXCEPT FOR * WHICH IS POSSIBLE FOR END OF COMMENT setRange(COMMENTSTARBEGIN, SMALLA, ILLEGAL, COMMENTBEGIN); tokenState[COMMENTSTARBEGIN][TIMES]=COMMENTSTAREND; setRange(COMMENTBEGIN, SMALLA, ILLEGAL, COMMENTBEGIN); tokenState[COMMENTBEGIN][TIMES]=COMMENTSTAREND; //FROM COMMENTSTARBEGIN, ALL IS BACK TO COMMENTBEGIN, EXCEPT / WHICH IS END OF COMMENT setRange(COMMENTSTAREND, SMALLA, ILLEGAL, COMMENTBEGIN); tokenState[COMMENTSTAREND][SLASH]=COMMENTSLASHBEGIN; // } void initialReserved(TokenState state) { setRange(state, SMALLA, DIGIT, IDBEGIN); finalReservedToken(state, IDEND); tokenState[state][UNDERSCORE]=IDUNDERSCORE;//a_ } void finalSymbolToken(TokenState state, TokenState endState) { for (int i=SMALLA; i<=WHITESPACE; i++) { tokenState[state][(CharType)i]=endState; } } void finalReservedToken(TokenState state, TokenState endState) { //all non-letter, non-digit is regarded to be delimeter for (int i=QUOTE; i<=WHITESPACE; i++) { tokenState[state][(CharType)i]=endState; } } //the default charType is ILLEGAL void initialCharType() { int chType; //the default charType is ILLEGAL for (int i=0; i<256; i++) { charType[i]=ILLEGAL; } //chType is SMALLA chType=SMALLA; for (i='a'; i<='z'; i++) { charType[i]=(CharType)(chType); chType++; } //chType is now BIGA chType=BIGA;//I don't want to rely on the trick. for (i='A'; i<='Z'; i++) { charType[i]=(CharType)(chType); chType++; } chType=DIGIT; for (i='0'; i<='9'; i++) { charType[i]=(CharType)(chType); } /* UNDERSCORE, QUOTE, OPENPAR, CLOSEPAR, SEMICOLON,PLUS, MINUS, TIMES, SLASH, COLON, EQUAL,SMALLER,GREATER,EXCLAIM,OPENBRACKET, CLOSEBRACKET,COMMA, SPACE,TAB, ENDLINE, ILLEGAL */ charType['_']=UNDERSCORE; charType['\'']=QUOTE; charType['(']=OPENPAR; charType[')']=CLOSEPAR; charType[';']=SEMICOLON; charType['+']=PLUS; charType['-']=MINUS; charType['*']=TIMES; charType['/']=SLASH; charType[':']=COLON; charType['=']=EQUAL; charType['<']=SMALLER; charType['>']=GREATER; charType['!']=EXCLAIM; charType['[']=OPENBRACKET; charType[']']=CLOSEBRACKET; charType[',']=COMMA; charType[' ']=WHITESPACE; charType['\t']=WHITESPACE; charType[10]=WHITESPACE; charType[13]=WHITESPACE; //pls note, since I changed the type of "ch" to be "unsigned char" //the EOF now is not -1, but 255 charType[255]=WHITESPACE;//IT IS A KIND OF DELIMETER } void setRange(TokenState state, CharType start, CharType end, TokenState target) { for (int i=start; i<=end; i++) { tokenState[state][i]=target; } } void setState(TokenState state, TokenState targetState) { for (int i=0; i<CharTypeCount; i++) { tokenState[state][i]=targetState; } }

#include <iostream> using namespace std; const int SHIFT=8; const int TableLength=211; const int MaxParamNo=10; const int CharArrayLength=4096; const int MaxSymbolIDNumber=600; //const int MaxLineNumber=600; /* struct Lines { int line; Lines* next; }; */ enum StructureType { SIMPLE, ARRAY, MODULE, PROGRAM }; struct Node { Node* next; char* name; bool valid;//if not properly declared, then it is invalide;default true int type; //int 0, char 1 bool declared;//default to be true int address; //offset int structure;//0 simple, 1 array, 2 module, 3 program int size; //if it is array int paramNo; Node* paramType[10]; //Lines* linesList; }; ostream& operator<<(ostream& out, Node*ptr); class Hash { protected: static Node nodes[MaxSymbolIDNumber]; static char charArray[CharArrayLength]; static int nodeCount; static char* current; //int nameCount=0; Node* table[TableLength]; int hashFun(char* in); Node* createNode(char* str); public: Hash(); bool search(char* in, Node*& out); bool insert(char* in, Node*& out); void purge(); void print(); }; file name: hash.cpp

#include "hash.h" #include "errorNo.h" char Hash::charArray[CharArrayLength]; char* Hash::current=charArray; Node Hash::nodes[MaxSymbolIDNumber]; int Hash::nodeCount=0; //Lines lines[MaxLineNumber]; //int linesCount=0;//this is for the struct Lines list not the //int nameCount=0; //Node* HashFun::empty=NULL; char* structStr[4]={"SIMPLE", "ARRAY", "MODULE", "PROGRAM"}; ostream& operator<<(ostream& out, Node* ptr) { Node* temp=ptr; while (temp!=NULL) { out<<"name:"<<temp->name<<","; out<<"declared is "<<temp->declared<<","; out<<"structure is "<<structStr[temp->structure]<<","; out<<"type is:"<<(temp->type==0?"integer":"char")<<","; if (temp->structure==ARRAY) { out<<"array["<<temp->size<<"],"; } temp=temp->next; } return out; } Node* Hash::createNode(char* str) { Node* ptr=nodes+nodeCount; nodeCount++; ptr->name=current; //ptr->data=new char[strlen(str)+1]; //if there is no enough space for new name if (current+strlen(str)+1>charArray+CharArrayLength) { errorHandle(TooManyIdentifier); } strcpy(ptr->name, str); //the following are initialization of Node ptr->address=-1; ptr->declared=true; ptr->next=NULL; ptr->paramNo=0; ptr->size=0; ptr->structure=0; ptr->type=0; ptr->valid=true; while (*current!='\0') { current++; } current++;//one more step for new position if (current==charArray+CharArrayLength) { errorHandle(TooManyIdentifier); } return ptr; } Hash::Hash() { //count=0; for (int i=0; i<TableLength; i++) { table[i]=NULL; } } int Hash::hashFun(char* in) { char* ptr=in; int result=0; while (*ptr!='\0') { result = ((result<<SHIFT) + *ptr)%TableLength; //result+= *ptr; ptr++; } return result; } bool Hash::search(char* in, Node*& out) { int index=hashFun(in); if (table[index]!=NULL) { out=table[index]; if (strcmp(out->name, in)==0) { return true; } else { while (out->next!=NULL) { if (strcmp(out->name, in)==0) { return true; } out=out->next; } } } return false; } bool Hash::insert(char* in, Node*& out) { int index=hashFun(in); if (table[index]!=NULL) { out=table[index]; if (strcmp(out->name, in)==0) { return false;//already there } else { while (out->next!=NULL) { if (strcmp(out->name, in)==0) { return false; } out=out->next; } out->next=createNode(in); out=out->next; //same return as else } } else { table[index]=createNode(in); out=table[index]; } return true; } void Hash::purge() { for (int i=0; i<TableLength; i++) { table[i]=NULL; } } void Hash::print() { for (int i=0; i<TableLength; i++) { if (table[i]!=NULL) { cout<<"ID is:"<<table[i]<<endl; } } }

#include <iostream> #include "CFGReader.h" #include "Parser.h" using namespace std; int main( int argc, char *argv[ ]) { if (!(argc==1||argc==3)) { cout<<"usage: CRGReader grammarSourceFileName sourceFileToBeParsed"; exit(1); } CFGReader R; Parser P; if (argc==1) { R.readFromFile("c:\\ruleTest.txt"); } else { R.readFromFile(argv[1]); } R.optimize(); //R.print(); //R.calculateLookAhead(); //R.print(); cout<<"\n now begin parsing...\n"; if (argc==1) { P.parseFile("c:\\simpleSource.txt"); } else { P.parseFile(argv[2]); } return 0; }

program average; variables n: integer; ave, total: integer; module sum(n :integer;) variables i: integer; begin i:=i; end; module gogo( x: integer;) begin sum(x); end; begin read n; end;

now begin parsing... M => program i ; Dl B Dl => Dv Ml Dv => variables Vl Vl => V Vl0 V => Il : T ; Il => i Il0 Il0 => e T => integer Ad Ad => e Vl0 => i Il0 : T ; Vl0 Il0 => , i Il0 Il0 => e T => integer Ad Ad => e Vl0 => e Ml => Ml0 Ml0 => module i ( Vl ) Dv B Ml0 Vl => V Vl0 V => Il : T ; Il => i Il0 Il0 => e T => integer Ad Ad => e Vl0 => e Dv => variables Vl Vl => V Vl0 V => Il : T ; Il => i Il0 Il0 => e T => integer Ad Ad => e Vl0 => e B => begin Sl end ; Sl => S Sl0 S => i S0 S0 => Ar := E ; Ar => e E => F M0 F => R M3 R => i Ar Ar => e M3 => e M0 => e Sl0 => e print module sum ID is:name:i,declared is 1,structure is SIMPLE,type is:integer, ID is:name:n,declared is 1,structure is SIMPLE,type is:integer, Ml0 => module i ( Vl ) Dv B Ml0 Vl => V Vl0 V => Il : T ; Il => i Il0 Il0 => e T => integer Ad Ad => e Vl0 => e Dv => e B => begin Sl end ; Sl => S Sl0 S => i S0 S0 => ( Lp ) ; Lp => Ln Ln => i Ar M1 Ar => e M1 => e Sl0 => e print module gogo ID is:name:x,declared is 1,structure is SIMPLE,type is:integer, Ml0 => e B => begin Sl end ; Sl => S Sl0 S => read Ln ; Ln => i Ar M1 Ar => e M1 => e Sl0 => e print module gogo ID is:name:x,declared is 1,structure is SIMPLE,type is:integer, Press any key to continue