CFGReader

name: M
   isNull: false
   isTerminal: false
   first: [0]=program
   follow: [0]=$
name: Dl
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=variables,[2]=module
   follow: [0]=begin
name: B
   isNull: false
   isTerminal: false
   first: [0]=begin
   follow: [0]=$,[1]=module,[2]=begin
name: Dv
   isNull: true
   isTerminal: false
   first: [0]=variables,[1]=e
   follow: [0]=module,[1]=begin
name: Ml
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=module
   follow: [0]=begin
name: Mo
   isNull: false
   isTerminal: false
   first: [0]=module
   follow:
name: Vl
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=module,[1]=begin,[2]=)
name: V
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=i,[1]=module,[2]=begin,[3]=)
name: Il
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=:
name: T
   isNull: false
   isTerminal: false
   first: [0]=integer,[1]=char
   follow: [0]=;
name: Ad
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=array
   follow: [0]=;
name: L
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=,,[1]=;,[2]=)
name: Ar
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=[
   follow: [0]=,,[1]=:=,[2]=;,[3]=),[4]=*,[5]=/,[6]=+,[7]=-,[8]=],[9]==,[10]=<,[11]=>,[12]=<=,[13]=>
=,[14]=!=,[15]=then
name: Sl
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=;
   follow: [0]=end
name: S
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=;
   follow: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=;,[8]=end,[9]=else
name: E
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=(,[3]=c
   follow: [0]=],[1]=),[2]=;,[3]=then
name: C
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=(,[3]=c
   follow: [0]=then
name: Lp
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=i
   follow: [0]=)
name: Ln
   isNull: false
   isTerminal: false
   first: [0]=i
   follow: [0]=;,[1]=)
name: Lo
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=c
   follow: [0]=;
name: Lr
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=c
   follow: [0]=,,[1]=;
name: F
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=(,[3]=c
   follow: [0]=+,[1]=-,[2]=],[3]=),[4]=;,[5]==,[6]=<,[7]=>,[8]=<=,[9]=>=,[10]=!=,[11]=then
name: Oa
   isNull: false
   isTerminal: false
   first: [0]=+,[1]=-
   follow:
name: R
   isNull: false
   isTerminal: false
   first: [0]=i,[1]=n,[2]=(,[3]=c
   follow: [0]=*,[1]=/,[2]=+,[3]=-,[4]=],[5]=),[6]=;,[7]==,[8]=<,[9]=>,[10]=<=,[11]=>=,[12]=!=,[13]=
then
name: Om
   isNull: false
   isTerminal: false
   first: [0]=*,[1]=/
   follow:
name: Or
   isNull: false
   isTerminal: false
   first: [0]==,[1]=<,[2]=>,[3]=<=,[4]=>=,[5]=!=
   follow: [0]=i,[1]=n,[2]=(,[3]=c
name: M0
   isNull: true
   isTerminal: false
   first: [0]=+,[1]=e,[2]=-
   follow: [0]=],[1]=),[2]=;,[3]==,[4]=<,[5]=>,[6]=<=,[7]=>=,[8]=!=,[9]=then
name: M1
   isNull: true
   isTerminal: false
   first: [0]=,,[1]=e
   follow: [0]=;,[1]=)
name: M2
   isNull: true
   isTerminal: false
   first: [0]=,,[1]=e
   follow: [0]=;
name: M3
   isNull: true
   isTerminal: false
   first: [0]=*,[1]=e,[2]=/
   follow: [0]=+,[1]=-,[2]=],[3]=),[4]=;,[5]==,[6]=<,[7]=>,[8]=<=,[9]=>=,[10]=!=,[11]=then
name: Ml0
   isNull: true
   isTerminal: false
   first: [0]=module,[1]=e
   follow: [0]=begin
name: Vl0
   isNull: true
   isTerminal: false
   first: [0]=i,[1]=e
   follow: [0]=module,[1]=begin,[2]=)
name: Il0
   isNull: true
   isTerminal: false
   first: [0]=,,[1]=e
   follow: [0]=:
name: Sl0
   isNull: true
   isTerminal: false
   first: [0]=e,[1]=i,[2]=if,[3]=loop,[4]=exit,[5]=begin,[6]=read,[7]=write,[8]=;
   follow: [0]=end
name: S0
   isNull: false
   isTerminal: false
   first: [0]=[,[1]=:=,[2]=(
   follow: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=;,[8]=end,[9]=else
name: START
   isNull: false
   isTerminal: false
   first: [0]=program
   follow:
Press any key to continue

#include "Grammar.h" class CFGReader { private: char buffer[BufferLength]; FILE* stream; void newRule(const char* str); void readRule(); void addRule(const char* str); int addToken(const char* str, bool isTerminal=true); void printRule(int index); public: void readFromFile(const char* fileName); void optimize(); void calculateLookAhead(); void print(); };

#include <iostream> #include "CFGReader.h" using namespace std; Grammar grammar; const char* deliStr=" |\n"; //this would only remove the immediate left recursion //and should I expand it to remove all kinds of left-recursion? void CFGReader::readFromFile(const char* fileName) { if ((stream=fopen(fileName, "r"))==NULL) { cout<<"cannot open file "<<fileName<<endl; } else { readRule(); } } void CFGReader::readRule() { char* ptr=buffer; char* hold; int count=0; while (!feof(stream)) { count=0; fgets(buffer, BufferLength-1, stream); ptr=buffer; while ((ptr=strtok(ptr, " \n"))!=NULL) { if (strcmp(ptr, "|")!=0) { //test the first two token to see if old rule continues if (count==0) { hold=ptr; } if (count==1) { if (strcmp("->", ptr)==0) { /* curIndex=addToken(hold, false);//the index of cur token grammar[curIndex]->terminal=false; newRule(); */ newRule(hold); } else { addRule(hold); addRule(ptr); } } if (count>=2)//always add { addRule(ptr); } count++; } else { //this is an alternative production rule newRule(NULL); } ptr=NULL; } } } void CFGReader::newRule(const char* str) { grammar.newRule(str); } void CFGReader::optimize() { grammar.optimize(); } void CFGReader::calculateLookAhead() { grammar.calculateLookAhead(); } void CFGReader::addRule(const char* str) { grammar.addRule(str); } int CFGReader::addToken(const char* str, bool isTerminal) { return grammar.addToken(str, isTerminal); } void CFGReader::printRule(int index) { grammar.printRule(index); } void CFGReader::print() { grammar.print(); //grammar.printToken(true); grammar.buildTable(false); grammar.printTable(false); //grammar.printAllRules(); }

#ifndef GRAMMAR_H #define GRAMMAR_H #include <bitset> #include <iostream> using namespace std; const int BufferLength=256; const int MaxTokenCount=100; const int MaxRHSCount=40; const int MaxRuleCount=200; const int MaxGrammarTokenLength=10; const int MaxProduction=10; const int TokenTypeCount=38; const int MaxStateCount=200; const int MaxItemCount=500; //a macro to ease the job //#define BitSet (bitset<MaxItemCount>) struct GrammarToken { //bool isMeta; bool terminal; bool isNull; char* name; int production[MaxProduction];//pointer to the production it gives int count; int firstCount; int followCount; int follow[MaxTokenCount]; int first[MaxTokenCount]; }; struct Item { int varIndex; int rulePos; int dotPos; }; bool operator == (Item& first, Item& second); /* Item& operator= (Item& first, Item& second) { first.dotPos=second.dotPos; first.rulePos=second.rulePos; first.varIndex=second.varIndex; return first; } */ class BitSet: public bitset<MaxItemCount> { private: int current; public: BitSet& operator=(const BitSet& theSet); int next(); BitSet(); void restart(){current=-1;} }; class Grammar { //to allow Parser to access Grammar friend class Parser; private: int LRTable[MaxStateCount][MaxTokenCount]; int terminalCount; int nonTermCount; GrammarToken* token[MaxTokenCount]; int tokenCount; int curIndex;//the index of current token int curPos;//the position at production rule //MaxRuleCount>=MaxVariableCount!!! //the last one in each rule is reserved for the length int production[MaxRuleCount][MaxRHSCount+1]; int prodIndex;//pointing to current production, initialized to -1 int checkRecursion(int tIndex, int curToken); void grammarError(int theVar, int theToken, int rule1, int rule2); void addFirstIntoTable(int theVariable, int theFirst, int theRule); void addFollowIntoTable(int theVariable, int theRule); void addBeginEnd(); void addEpsilonForToken(int tIndex); bool addIntoFirst(int target, int source); bool addFirst(int target, int source); void shiftRule(int ruleIndex, bool Left2Right, int offset); void addAtEnd(int ruleIndex, int toAdd); void addRuleForToken(int tIndex, int ruleIndex); int copyRule(int source, int target);//return the position of -1 void replaceRule(int curToken, int ruleIndex); int findMetaSymbol(int& begin, int& end); void initialize(); void doReplaceMetaSymbol(int ruleIndex, int begin, int end); void removeRuleFromToken(int tIndex, int ruleIndex); void checkEpsilon(int ruleIndex); void removeImmediate(int tIndex, int ruleIndex); void doAddRule(int tIndex); void commonLeftFactor(); int findCommonFactor(int tokenIndex, int ruleIndex); void removeLeftRecursion(); void doCommonFactor(int tokenIndex, int ruleIndex, int index); int forgeToken(int index);//create a new variable void epsilonRule(int ruleIndex); bool isRedundant(int tIndex); void replaceMetaSymbol(); void calculateFirst(); void calculateFollow(); void calculateNull(); bool Null(int tIndex); bool addFollow(int target, int source); bool addIntoFollow(int target, int source); bool addFollowIntoFollow(int target, int source); void removeRedundant(); void removeToken(int tIndex); void LRoptimize(); void LLoptimize(); void buildLLTable(); void buildLRTable(); int ruleLen(int ruleIndex); void calculateProperty(); int shift2Mask(int input); int reduce2Mask(int input); bool* expandFinished; int mask2State(int input); bool isReduce(int input); bool isShift(int input); //items-related methods void initializeDFA(); void uninitializeDFA(); void constructDFA(); //void addItems(int itemIndex, bitset<MaxStateCount>* theSet); int addItem(Item& theItem); int itemCount; int stateCount; Item**items; //int addState(bitset void addSonItem(int tIndex, BitSet& theSet); int createState(const BitSet& theSet); BitSet** sets; int createItem(int theVar, int theNo, int position); void addClosure(int itemIndex, BitSet& theSet); void expandState(int stateIndex); int findNextTokenOfItem(int itemIndex); void doExpandState(int itemIndex, int tIndex, BitSet& theSet); void add2Table(int stateIndex, int targetState, int tIndex);//this is shift void add2Table(int stateIndex, int ruleIndex);//this is reduce public: Grammar(); void buildTable(bool forLLGrammar=true); void optimize(bool forLLGrammar=true); void printRule(int index); void printRule(int tIndex, int rulePos); void print(); void printTable(bool isLL=true); void printAllRules(); void printToken(bool onlyVar=false); int varCount(); int termCount(); void addRule(const char* str); void newRule(const char* str);//this is an internal method, not suitable for outside GrammarToken* operator[](int index); int addToken(const char* str, bool isTerminal=true); GrammarToken* createToken(const char* theName, bool isTerminal); int tokenIndex(const char* str); void calculateLookAhead(); }; #endif

#include <iostream> #include <iomanip> #include "errorNo.h" #include "Grammar.h" using namespace std; /* #define REDUCEMASK 0xd10000000 #define SHIFTMASK 0xd20000000 #define STATEMASK 0xd01111111 #define SHIFTREDUCEMASK 0xd30000000 */ const int REDUCEMASK = 0xd10000000; const int SHIFTMASK = 0xd20000000; const int STATEMASK = 0xd01111111; const int SHIFTREDUCEMASK = 0xd30000000; const char* emptyStr="e"; const char* optionBegin="{"; const char* optionEnd="}"; const char* startStr="START"; const char* endStr="$"; int startSymbolIndex=-1; int stackBottomIndex=-1; int beginIndex=-1; int endIndex=-1; int emptyIndex=-1; int epsilonIndex=-1; int table[MaxTokenCount][MaxTokenCount]; char* terminalStr[TokenTypeCount]= { //GENERAL TYPE 5 "i", "n", "c", "COMMENT", "ERROR", //THE FOLLOWING ARE SYMBOL TYPE 18 "(", ")", ";", "+", "-", "*", "/", ":=", "<", ">", "=", "<=", ">=", "!=", "[", "]", ",", ":", //THE FOLLOWING ARE RESERVED TYPE 15 "begin", "end", "program", "variables","integer", "array", "char", "module", "if", "then", "else", "loop", "exit", "read", "write" }; //these are "hash-table-like" tables //this is exactly the opposite of below int token2type[MaxTokenCount]; //this is to translate tokenType in scanner to token-index of grammar int type2token[MaxTokenCount]; int matchTokens(const char* str) { for (int i=0; i<TokenTypeCount; i++) { if (strcmp(terminalStr[i], str)==0) { return i; } } return -1; } void Grammar::calculateProperty() { terminalCount=nonTermCount=0; for (int i=0; i<tokenCount; i++) { if (token[i]->terminal) { terminalCount++; continue; } for (int j=0; j<token[i]->count; j++) { int len; len=ruleLen(token[i]->production[j]); production[token[i]->production[j]][MaxRHSCount]=len; } nonTermCount++; } } void Grammar::addEpsilonForToken(int tIndex) { if (epsilonIndex==-1) { epsilonRule(++prodIndex); addRuleForToken(tIndex, prodIndex); } else { addRuleForToken(tIndex, epsilonIndex); } } void Grammar::printTable(bool isLL) { /* cout<<" "; for (int i=0; i<tokenCount; i++) { cout<<"| "<<i; } cout<<endl; */ if (isLL) { for (int r=0; r<tokenCount; r++) { if (token[r]->terminal) { continue; } cout<<token[r]->name<<":"; for (int c=0; c<tokenCount; c++) { if (table[r][c]!=-1) { cout<<token[c]->name<<"="<<table[r][c]<<","; } } cout<<endl; } } else { cout<<"the following is LR table<<\n"; for (int col=0; col<tokenCount; col++) { cout<<token[col]->name<<","; } cout<<endl; for (int r=0; r<stateCount; r++) { for (int c=0; c<tokenCount; c++) { int result=LRTable[r][c]; if (isReduce(result)) { cout<<"r"; cout<<mask2State(result); } else { if (isShift(result)) { cout<<"s"; cout<<mask2State(result); } else { cout<<result; } } cout<<","; } cout<<endl; } } } void Grammar::buildLRTable() { constructDFA(); } //initialized at "initialize()" void Grammar::buildLLTable() { int typeIndex; for (int r=0; r<tokenCount; r++) { if (token[r]->terminal) { typeIndex=matchTokens(token[r]->name); if (typeIndex!=-1) { type2token[typeIndex]=r; token2type[r]=typeIndex; } } //initialize /* for (int c=0; c<MaxTokenCount; c++) { table[r][c]=-1; } */ } for (int i=0; i<tokenCount; i++) { if (token[i]->terminal) { continue; } for (int j=0; j<token[i]->count; j++) { int k=0, theRule=token[i]->production[j]; int theToken=production[theRule][k]; while (theToken!=-1) { addFirstIntoTable(i, theToken, theRule); if (!token[theToken]->isNull) { break; } k++; theToken=production[theRule][k]; } if (theToken==-1) { addFollowIntoTable(i, theRule); } } } } void Grammar::buildTable(bool forLLGrammar) { if (forLLGrammar) { buildLLTable(); } else { buildLRTable(); } } /* nCount=tCount=0; for (int r=0; r<MaxTokenCount; r++) { if (r<tokenCount) { if (token[r]->terminal) { typeIndex=matchTokens(token[r]->name); if (typeIndex!=-1) { type2token[typeIndex]=r; token2type[r]=typeIndex; } //I am making a invertable table!!! tArray[r]=tCount++; } else { //it is an invertable table!!! nArray[r]=nCount++; } } for (int c=0; c<MaxTokenCount; c++) { table[r][c]=-1; } } for (int i=0; i<tCount; i++) { for (int j=0; j<token[i]->count; j++) { int k=0, theRule=token[i]->production[j]; int theToken=production[theRule][k]; while (theToken!=-1) { addFirstIntoTable(theToken, j); if (!token[theToken]->isNull) { break; } k++; theToken=production[theRule][k]; } if (theToken==-1) { addFollowIntoTable(j); } } } } */ void Grammar::grammarError(int theVar, int theToken, int rule1, int rule2) { cout<<"error! the conflict of grammar at "; cout<<token[theVar]->name<<" for token of " <<token[theToken]->name <<" between rules of \n"; printRule(rule1); cout<<" and "; printRule(rule2); cout<<endl; } void Grammar::addFollowIntoTable(int theVariable, int theRule) { int temp; for (int i=0; i<token[theVariable]->followCount; i++) { temp=token[theVariable]->follow[i]; if (table[theVariable][temp]!=theRule) { if (table[theVariable][temp]!=-1) { grammarError(theVariable, temp, theRule, table[theVariable][temp]); } table[theVariable][temp]=theRule; } } } void Grammar::addFirstIntoTable(int theVariable, int theFirst, int theRule) { for (int i=0; i<token[theFirst]->firstCount; i++) { if (table[theVariable][token[theFirst]->first[i]]!=theRule) { if (table[theVariable][token[theFirst]->first[i]]!=-1) { grammarError(theVariable, token[theFirst]->first[i], theRule, table[theVariable][token[theFirst]->first[i]]); } table[theVariable][token[theFirst]->first[i]]=theRule; } } } int Grammar::varCount() { int result=0; for (int i=0; i<tokenCount; i++) { if (!token[i]->terminal) { result++; } } return result; } int Grammar::termCount() { int result=0; for (int i=0; i<tokenCount; i++) { if (token[i]->terminal) { result++; } } return result; } //leave the rule unremoved!!! as I have no way to do it!!! void Grammar::removeToken(int tIndex) { delete[]token[tIndex]->name; delete token[tIndex]; tokenCount--; for (int i=tIndex; i<tokenCount; i++) { token[i]=token[i+1]; } } bool Grammar::isRedundant(int tIndex) { int k, theRule, theToken; for (int i=0; i<tokenCount; i++) { for (int j=0; j<token[i]->count; j++) { k=0; theRule=token[i]->production[j]; theToken=production[theRule][k]; while (theToken!=-1) { if (theToken==tIndex) { return false; } k++; theToken=production[theRule][k]; } } } return true; } //what is redundant? except the start variable //the non-terminal never appears in other rules! void Grammar::removeRedundant() { int tIndex=1; bool findNew=false; while (tIndex<tokenCount) { findNew=false; if (!token[tIndex]->terminal) { if (isRedundant(tIndex)) { removeToken(tIndex); findNew=true; } } if (findNew) { tIndex=1; } else { tIndex++; } } } void Grammar::printAllRules() { /* cout<<"\nnow print all rules\n"; for (int i=0; i<=prodIndex; i++) { cout<<"rule index "<<i<<": "; printRule(i); cout<<"\n"; } */ int sum=0; for (int i=0; i<tokenCount; i++) { if (!token[i]->terminal) { sum+=token[i]->count; } } cout<<" total rules is:"<<prodIndex+1; cout<<"\n and the sum is "<<sum<<endl; } void Grammar::addBeginEnd() { int begin=addToken(startStr, false); int end=addToken(endStr, true); prodIndex++; production[prodIndex][0]=0; production[prodIndex][1]=end; production[prodIndex][2]=-1; addRuleForToken(begin, prodIndex); startSymbolIndex=begin; stackBottomIndex=end; } /* void Grammar::calculateProperty() { calculateNull(); calculateFirst(); calculateFollow(); } */ void Grammar::printToken(bool onlyVar) { for (int i=0; i<tokenCount; i++) { if (onlyVar) { if (token[i]->terminal) { continue; } } cout<<"name: "<<token[i]->name<<endl; cout<<" isNull: "<<(token[i]->isNull?"true":"false")<<endl; cout<<" isTerminal: "<<(token[i]->terminal?"true":"false")<<endl; cout<<" first: "; for (int j=0; j<token[i]->firstCount; j++) { if (j!=0) { cout<<","; } cout<<"["<<j<<"]="<<token[token[i]->first[j]]->name; } cout<<"\n follow: "; for (j=0; j<token[i]->followCount; j++) { if (j!=0) { cout<<","; } cout<<"["<<j<<"]="<<token[token[i]->follow[j]]->name; } cout<<endl; } } //must use while loop to discover new set!!!!! void Grammar::calculateFirst() { int i; bool addNew; do { i=0; addNew=false; while (i<tokenCount) { //the terminal contains itself if (token[i]->terminal) { //addFirst don't judge if it is nullable!! addFirst(i, i); //token[i]->first[token[i]->firstCount++]=i; } else { //for all its rules for (int j=0; j<token[i]->count; j++) { int theToken, k=0; theToken=production[token[i]->production[j]][k]; //for each token in each rule do { //add non epsilon set if (addIntoFirst(i, theToken)) { addNew=true; } //if it is not null, means it is end if (!token[theToken]->isNull) { break; } //if it is nullable, continue k++; theToken=production[token[i]->production[j]][k]; }while (theToken!=-1); //it means all token in this rule is nullable, so if (theToken==-1) { //it is nullable //addEpsilonIntoFirst(i); addFirst(i, emptyIndex); } } } i++; } }while (addNew); } bool Grammar::addFollowIntoFollow(int target, int source) { bool addNew=false; for (int i=0; i<token[source]->followCount; i++) { if (addFollow(target, token[source]->follow[i])) { addNew=true; } } return addNew; } bool Grammar::addIntoFollow(int target, int source) { bool addNew=false; if (source==-1) { return false; } for (int i=0; i<token[source]->firstCount; i++) { //add non-epsilon if (!token[token[source]->first[i]]->isNull) { if (addFollow(target, token[source]->first[i])) { addNew=true; } } } return addNew; } void Grammar::calculateFollow() { int i; bool addNew, started; //token[startSymbolIndex]->follow[0]=stackBottomIndex; //token[startSymbolIndex]->followCount=1; do { i=0; addNew=false; while (i<tokenCount) { //the terminal contains itself if (!token[i]->terminal) { for (int tIndex=0; tIndex<tokenCount; tIndex++) { //for all its rules if (token[tIndex]->terminal) { continue; } //for each its rule for (int j=0; j<token[tIndex]->count; j++) { int theToken, k=0, theRule=token[tIndex]->production[j]; started=false; theToken=production[theRule][k]; //for each token in each rule do { //the token appears here if (started) { if (addIntoFollow(i, theToken)) { addNew=true; } if (!token[theToken]->isNull) { break; } } if (theToken==i) { started=true; //add non epsilon set, including -1 situation!!! } //if it is not null, means it is end //if it is nullable, continue k++; theToken=production[theRule][k]; }while (theToken!=-1); //it means all token in this rule is nullable, so if (started&&theToken==-1) { //it is nullable //add current variable Follow(j) into Follow(i); if (addFollowIntoFollow(i, tIndex)) { addNew=true; } } } } } i++; } }while (addNew); } void Grammar::addRuleForToken(int tIndex, int ruleIndex) { token[tIndex]->production[token[tIndex]->count++]=ruleIndex; } void Grammar::shiftRule(int ruleIndex, bool left2Right, int offset) { int end=0; /* while (production[ruleIndex][end]!=-1) { end++; } */ end=ruleLen(ruleIndex); if (left2Right) { for (int i=end; i>=0; i--) { production[ruleIndex][i+offset]=production[ruleIndex][i]; } } else { for (int i=0; i<=end-offset; i++) { production[ruleIndex][i]=production[ruleIndex][i+offset]; } checkEpsilon(ruleIndex); } } void Grammar::checkEpsilon(int ruleIndex) { if (production[ruleIndex][0]==-1) { epsilonRule(ruleIndex); } } bool Grammar::addFollow(int target, int source) { //check if it is already there for (int i=0; i<token[target]->followCount; i++) { if (token[target]->follow[i]==source) { return false; } } //add at end token[target]->follow[token[target]->followCount++]=source; return true; } bool Grammar::addFirst(int target, int source) { //check if it is already there for (int i=0; i<token[target]->firstCount; i++) { if (token[target]->first[i]==source) { return false; } } //add at end token[target]->first[token[target]->firstCount++]=source; return true; } //add non epsilon into it. bool Grammar::addIntoFirst(int target, int source) { bool addNew=false; if (token[source]->terminal) { if (!token[source]->isNull) { return addFirst(target, source); } else { return false; } } for (int i=0; i<token[source]->firstCount; i++) { //add non epsilon if (!token[token[source]->first[i]]->isNull) { if (addFirst(target, token[source]->first[i])) { addNew=true; } } } return addNew; } bool Grammar::Null(int tIndex) { if (token[tIndex]->terminal) { return token[tIndex]->isNull; } for (int i=0; i<token[tIndex]->count; i++) { int j=0, theToken; theToken=production[token[tIndex]->production[i]][j]; do { if (theToken==tIndex||!Null(theToken)) { break; } j++; theToken=production[token[tIndex]->production[i]][j]; }while (theToken!=-1); if (theToken==-1) { return true; } } return false; } void Grammar::calculateNull() { for (int i=0; i<tokenCount; i++) { token[i]->isNull=Null(i); } } int Grammar::findMetaSymbol(int& begin, int& end) { int theRule, theToken, k; begin=end=-1; for (int i=0; i<tokenCount; i++) { for (int j=0; j<token[i]->count; j++) { k=0; theRule=token[i]->production[j]; theToken=production[theRule][k]; while (theToken!=-1) { if (theToken==beginIndex) { begin=k; } if (theToken==endIndex) { end=k; return theRule; } k++; theToken=production[theRule][k]; } } } return -1; } void Grammar::doReplaceMetaSymbol(int ruleIndex, int begin, int end) { int newTokenIndex=forgeToken(0), i=0; addRuleForToken(newTokenIndex, ++prodIndex); //token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex; //token[newTokenIndex]->terminal=false; copyRule(ruleIndex, prodIndex); //shrink while (production[ruleIndex][i+end+1]!=-1) { production[ruleIndex][begin+i]=production[ruleIndex][i+end+1]; i++; } production[ruleIndex][begin+i]=-1; addAtEnd(ruleIndex, newTokenIndex); /* production[ruleIndex][begin]=newTokenIndex; production[ruleIndex][begin+1]=-1; */ shiftRule(prodIndex, false, begin+1); production[prodIndex][end-begin-1]=newTokenIndex; production[prodIndex][end-begin]=-1; addEpsilonForToken(newTokenIndex); /* if (epsilonIndex==-1) { eRuleIndex=++prodIndex; epsilonRule(eRuleIndex); } else { eRuleIndex=epsilonIndex; } addRuleForToken(newTokenIndex, eRuleIndex); */ } void Grammar::replaceMetaSymbol() { int begin, end, ruleIndex; while ((ruleIndex=findMetaSymbol(begin, end))!=-1) { doReplaceMetaSymbol(ruleIndex, begin, end); } /* for (int i=0; i<tokenCount; i++) { //if (token[i]->isMeta) if (i==beginIndex||i==endIndex) { removeToken(i); } } */ } void Grammar::addAtEnd(int ruleIndex, int toAdd) { int end; end=ruleLen(ruleIndex); production[ruleIndex][end++]=toAdd; production[ruleIndex][end]=-1; } //left-recursion: A -> A a | b | c //change to: A -> b A' | c A' // A' -> a A' | empty void Grammar::removeImmediate(int tIndex, int ruleIndex) { int newIndex=forgeToken(tIndex); int holdRuleIndex=token[tIndex]->production[ruleIndex]; //sequence matters! //change to: A -> b A' for (int i=0; i<token[tIndex]->count; i++) { if (i!=ruleIndex) { addAtEnd(token[tIndex]->production[i], newIndex); } } //shift removeRuleFromToken(tIndex, ruleIndex); addRuleForToken(newIndex, holdRuleIndex); //token[newIndex]->production[token[newIndex]->count++]=holdRuleIndex; shiftRule(holdRuleIndex, false, 1); addAtEnd(holdRuleIndex, newIndex); //add epsilon rule for new variable addEpsilonForToken(newIndex); /* epsilonRule(++prodIndex); addRuleForToken(newIndex, prodIndex); */ } int Grammar::forgeToken(int index) { char str[MaxGrammarTokenLength+2], ch; int len=strlen(token[index]->name); int temp=0, i=0; strcpy(str, token[index]->name); ch=str[len-1];//get last char of name if (ch>='0'&&ch<'9') { str[len-1]=ch+i+1; } else { str[len]='0'+i; str[len+1]='\0'; } //I won't bother to check repetitation of name while (tokenIndex(str)!=-1) { i++; if (ch>='0'&&ch<'9') { str[len-1]=ch+i+1; } else { str[len]='0'+i; str[len+1]='\0'; } } return addToken(str, false);//it is non-terminal for sure } int Grammar::copyRule(int source, int target) { int i=0; while (production[source][i]!=-1) { production[target][i]=production[source][i]; i++; } production[target][i]=-1; return i; } void Grammar::doAddRule(int tIndex) { token[tIndex]->production[token[tIndex]->count++]=++prodIndex; } void Grammar::addRule(const char* str) { int index; index=addToken(str); production[prodIndex][curPos++]=index; production[prodIndex][curPos]=-1;//set end } //if the token is already there, it return the index //otherwise, it add new node in token array int Grammar::addToken(const char* str, bool isTerminal) { int index; index=tokenIndex(str); if (index==-1) { index=tokenCount; } else { return index; } token[index]=createToken(str, isTerminal); if (strcmp(str, optionBegin)==0) { beginIndex=index; } if (strcmp(str, optionEnd)==0) { endIndex=index; } tokenCount++; if (strcmp(str, emptyStr)==0) { token[index]->isNull=true; emptyIndex=index; } return index; } void Grammar::newRule(const char* str) { if (str!=NULL) { curIndex=addToken(str, false); } //add one pointer token[curIndex]->production[token[curIndex]->count++]=++prodIndex; token[curIndex]->terminal=false; curPos=0;//reset to restart; } GrammarToken* Grammar::createToken(const char* theName, bool isTerminal) { GrammarToken* ptr=new GrammarToken; ptr->name=new char[strlen(theName)+1]; strcpy(ptr->name, theName); ptr->terminal=isTerminal; ptr->count=ptr->firstCount=ptr->followCount=0; ptr->isNull=false; return ptr; } int Grammar::tokenIndex(const char* str) { for (int i=0; i<tokenCount; i++) { if (strcmp(str, token[i]->name)==0) { return i; } } return -1; } int Grammar::checkRecursion(int tIndex, int curToken) { for (int i=0; i<token[curToken]->count; i++) { //token[tIndex]->production[i]=ruleIndex //production[ruleIndex][0] is the first left-recursion one if (production[token[curToken]->production[i]][0]<=curToken) { return i; } } return -1; } void Grammar::printRule(int index) { int nodeIndex=0; //cout<<" ~"<<index<<"~ "; while (production[index][nodeIndex]!=-1) { cout<<token[production[index][nodeIndex]]->name<<" "; nodeIndex++; } } void Grammar::printRule(int tIndex, int rulePos) { cout<<token[tIndex]->name<<" ==> "; printRule(token[tIndex]->production[rulePos]); } void Grammar::initialize() { tokenCount=curIndex=curPos=0; prodIndex=-1;//in order to ++ blindly for (int i=0; i<MaxStateCount; i++) { for (int j=0; j<MaxTokenCount; j++) { LRTable[i][j]=-1; } } for (i=0; i<MaxTokenCount; i++) { for (int j=0; j<MaxTokenCount; j++) { table[i][j]=-1; } } } Grammar::Grammar() { initialize(); } void Grammar::removeLeftRecursion() { int tIndex=0, curToken=0; bool newChange=false; while (tIndex<tokenCount) { if (!token[tIndex]->terminal) { for (int i=0; i<token[tIndex]->count; i++) { curToken=production[token[tIndex]->production[i]][0]; if (curToken<=tIndex&&!token[curToken]->terminal) { if (curToken!=tIndex) { replaceRule(tIndex, i); } else { removeImmediate(tIndex, i); } newChange=true; } } } //whenever there is some new findings, restart if (!newChange) { tIndex++; } else { tIndex=0; newChange=false; } } } void Grammar::replaceRule(int tIndex, int ruleIndex) { int pos, j, targetEnd, sourceEnd, curRule; curRule=token[tIndex]->production[ruleIndex]; int curToken=production[curRule][0]; for (int i=token[curToken]->count-1; i>=0; i--) { if (i>0) { doAddRule(tIndex); pos=copyRule(token[curToken]->production[i], prodIndex); j=0; while (production[curRule][j+1]!=-1) { production[prodIndex][pos+j]=production[curRule][j+1]; j++; } production[prodIndex][pos+j]=-1; //addRuleForToken(curToken, prodIndex); } else { targetEnd=sourceEnd=0; //curRule=token[tIndex]->production[ruleIndex]; while (true) { if (production[token[curToken]->production[0]][sourceEnd]==-1&& production[curRule][targetEnd]==-1) { break; } if (production[token[curToken]->production[0]][sourceEnd]!=-1) { sourceEnd++; } if (production[curRule][targetEnd]!=-1) { targetEnd++; } } j=targetEnd+sourceEnd-1; while (j>=0) { if (j>=sourceEnd) { production[curRule][j]=production[curRule][j-sourceEnd+1]; } else { production[curRule][j]=production[token[curToken]->production[0]][j]; } j--; } } } } void Grammar::calculateLookAhead() { replaceMetaSymbol(); calculateNull(); cout<<setw(20)<<"The Rule "<<"Look-Ahead "<<endl; for (int i=0; i<tokenCount; i++) { if (token[i]->terminal) { continue; } for (int j=0; j<token[i]->count; j++) { int theRule, theToken, k=0; theRule=token[i]->production[j]; cout<<setw(5); cout<<token[i]->name<<" -> "; cout<<setiosflags(ios::left); printRule(theRule); cout<<" "; theToken=production[theRule][k]; while (theToken!=-1) { if (k!=0) { cout<<"+"; } cout<<"First("<<token[theToken]->name<<")"; if (!token[theToken]->isNull) { break; } k++; theToken=production[theRule][k]; } if (theToken==-1) { if (k!=0) { cout<<"+"; } cout<<"Follow("<<token[i]->name<<")"; } cout<<"\n"; } } } void Grammar::LLoptimize() { removeLeftRecursion(); commonLeftFactor(); } void Grammar::LRoptimize() { calculateProperty(); } //the return value is the count, not the index of rule int Grammar::ruleLen(int ruleIndex) { int end=0; while (production[ruleIndex][end]!=-1) { end++; } return end; } //optimize sequence is first common-factor then remove left recursion //therefore I don't have to check if for same variable if there will be //more than one left-recursion void Grammar::optimize(bool forLLGrammar) { replaceMetaSymbol(); addBeginEnd(); if (forLLGrammar) { LLoptimize(); } else { LRoptimize(); } //removeRedundant(); calculateNull(); calculateFirst(); calculateFollow(); buildTable(forLLGrammar); } int Grammar::findCommonFactor(int tIndex, int ruleIndex) { for (int i=ruleIndex+1; i<token[tIndex]->count; i++) { //if the two rule has the same first token if (production[token[tIndex]->production[ruleIndex]][0]== production[token[tIndex]->production[i]][0]) { /* //calculate if there is epsilon if (emptyIndex==-1) { emptyIndex=tokenIndex(emptyStr); } //if it is not epsilon if (production[token[tIndex]->production[i]][0]!=emptyIndex) { return i; } */ return i; } } return -1; } void Grammar::epsilonRule(int ruleIndex) { production[ruleIndex][0]=addToken(emptyStr); production[ruleIndex][1]=-1; } //sample: x -> Aa // x -> Ab //changed to: x -> Ax' //this is to change the old rule // x' -> b //this is to change the old rule // x' -> a //this is the new-added rule void Grammar::doCommonFactor(int tIndex, int ruleIndex, int index) { int newTokenIndex=forgeToken(tIndex);//create a new variable //move the second and after part to the new rule of new variable //doing: x' -> a //curPos=0; //prepare to add one new rule addRuleForToken(newTokenIndex, ++prodIndex); //token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex; token[newTokenIndex]->terminal=false; copyRule(token[tIndex]->production[ruleIndex], prodIndex); shiftRule(prodIndex, false, 1); /* do { //do copying production[prodIndex][curPos]= production[token[tIndex]->production[ruleIndex]][curPos+1]; curPos++; //even the -1 at end is copied }while (production[token[tIndex]->production[ruleIndex]][curPos]!=-1); */ //in order to show an empty string, in case the string is "epsilon" /* if (curPos==1) { epsilonRule(prodIndex); } */ //replace x -> Aa with x -> Ax' production[token[tIndex]->production[ruleIndex]][1]=newTokenIndex; production[token[tIndex]->production[ruleIndex]][2]=-1;//end //doing: x' -> b //curPos=0; //prepare to add one new rule //pointing new token to where old rule lies addRuleForToken(newTokenIndex, token[tIndex]->production[index]); /* token[newTokenIndex]->production[token[newTokenIndex]->count++]= token[tIndex]->production[index]; */ shiftRule(token[tIndex]->production[index], false, 1); removeRuleFromToken(tIndex, index); } void Grammar::removeRuleFromToken(int tIndex, int ruleIndex) { token[tIndex]->count--; for (int i=ruleIndex; i<token[tIndex]->count; i++) { token[tIndex]->production[i]=token[tIndex]->production[i+1]; } } void Grammar::commonLeftFactor() { int index=-1, tIndex=0, ruleIndex=0; bool flag; //whenever a newrule is done, restart! while (tIndex<tokenCount) { ruleIndex=0; flag=false; while (ruleIndex<token[tIndex]->count) { index=findCommonFactor(tIndex, ruleIndex); if (index!=-1) { doCommonFactor(tIndex, ruleIndex, index); //restart flag=true; break; } else { ruleIndex++; } } if (flag) { tIndex=0; } else { tIndex++; } } } GrammarToken* Grammar::operator[](int index) { if (index>=0&&index<tokenCount) { return token[index]; } else { return NULL; } } void Grammar::print() { for (int i=0; i<tokenCount; i++) { if (!token[i]->terminal) { cout<<token[i]->name<<" ==> "; for (int j=0; j<token[i]->count; j++) { printRule(token[i]->production[j]); if (j!=token[i]->count-1) { cout<<" | "; } } cout<<"\n"; } } } void Grammar::initializeDFA() { items=new Item*[MaxItemCount]; for (int i=0; i<MaxItemCount; i++) { items[i]=new Item; items[i]->rulePos=items[i]->dotPos=items[i]->varIndex=-1; } sets= new BitSet*[MaxStateCount]; expandFinished=new bool[MaxStateCount]; for (i=0; i<MaxStateCount; i++) { sets[i]=new BitSet; sets[i]->reset(); expandFinished[i]=false; } itemCount=stateCount=0; //initialize LRTable for (i=0; i<MaxStateCount; i++) { for (int j=0; j<tokenCount; j++) { LRTable[i][j]=-1; } } } int Grammar::addItem(Item& theItem) { for (int i=0; i<itemCount; i++) { if (*(items[i])== theItem) { return i; } } items[i]=new Item; *(items[i])=theItem; itemCount++; return i; } void Grammar::addSonItem(int tIndex, BitSet& theSet) { if (!token[tIndex]->terminal) { for (int i=0; i<token[tIndex]->count; i++) { Item temp; int itemIndex;//easy reading temp.varIndex=tIndex; temp.rulePos=i; temp.dotPos=0; itemIndex=addItem(temp); theSet.set(itemIndex); //recursive addSonItem(production[token[tIndex]->production[i]][0], theSet); } } } int Grammar::createItem(int theVar, int theNo, int position) { Item temp; temp.varIndex=theVar; temp.rulePos=theNo; temp.dotPos=position; return addItem(temp); } void Grammar::constructDFA() { int startIndex, itemIndex, stateIndex; BitSet theSet; //should initialize everything!!! initializeDFA(); startIndex=createItem(startSymbolIndex, 0, 0); theSet.set(startIndex); addClosure(startIndex, theSet); stateIndex=itemIndex=0; stateIndex= createState(theSet); expandState(stateIndex);//recursion inside //for all choices, do createState(startStart, tansition token) ( } void Grammar::addClosure(int itemIndex, BitSet& theSet) { int theNextVar=findNextTokenOfItem(itemIndex); theSet.set(itemIndex);//add itself first!!! /* theSet.set(itemIndex); tIndex=items[itemIndex]->varIndex;//easy for reading theNo=items[itemIndex]->rulePos; thePos=items[itemIndex]->dotPos; //maybe I should write small function to wrap this kind of indexing!!!! theVar=production[token[tIndex]->production[theNo]][thePos]; */ if (theNextVar!=-1) { addSonItem(theNextVar, theSet);//add son!!! } } int Grammar::createState(const BitSet& theSet) { for (int i=0; i<stateCount; i++) { if (*(sets[i])==theSet) { return i; } } sets[i]=new BitSet; *sets[i] = theSet; stateCount++; return i; } //indicating if it is reduceable by returning true void Grammar::doExpandState(int itemIndex, int tIndex, BitSet& theSet) { int theNewItemIndex; //int theNextVar=findNextTokenOfItem(itemIndex); theNewItemIndex=createItem(items[itemIndex]->varIndex, items[itemIndex]->rulePos, items[itemIndex]->dotPos+1); addClosure(theNewItemIndex, theSet); // return findNextTokenOfItem(theNewItemIndex)==-1;//true=reduceable } void Grammar::expandState(int stateIndex) { int itemIndex, targetState, tIndex, reduceIndex, shiftIndex; BitSet theSet;//a new temp bool findShift=false, findReduce=false; //this will prevent looping expanding!!!??? expandFinished[stateIndex]=true; //to see if it is to reduce sets[stateIndex]->restart(); while ((itemIndex=sets[stateIndex]->next())!=-1) { if ((tIndex=findNextTokenOfItem(itemIndex))==-1) { findReduce=true; int ruleIndex, rulePos, varIndex; //only for easy reading varIndex=items[itemIndex]->varIndex; rulePos=items[itemIndex]->rulePos; ruleIndex =token[varIndex]->production[rulePos]; add2Table(stateIndex, ruleIndex); reduceIndex=itemIndex; } else { shiftIndex=itemIndex; findShift=true; } } //how to do in LR(1)???? //I am currently working on LR(0). I try not to think about it if (findReduce&&findShift) { //in LR(1), this is not necessarily a conflict, because you need to //check the first and follow set? I forgot it now. :) errorHandle(ShiftReduceConflict); //temparily int temp1, temp2; temp1=items[shiftIndex]->varIndex; temp2=items[shiftIndex]->rulePos; cout<<"\nconflict at shift:\n"; printRule(temp1, temp2); temp1=items[reduceIndex]->varIndex; temp2=items[reduceIndex]->rulePos; cout<<"\nconflict with reduce:\n"; printRule(temp1, temp2); cout<<endl; return;//currently ignore LR(1) and should be improved when LR(1) } for (int i=0; i<tokenCount; i++) { sets[stateIndex]->restart(); //for all items, search if there is token to match to advance the dot while ((itemIndex=sets[stateIndex]->next())!=-1) { if (findNextTokenOfItem(itemIndex)== i)// { doExpandState(itemIndex, i, theSet);//should finish addClosure } } //only expandState after you find one "transition" if (theSet.count()>0)//if you do find some { targetState=createState(theSet); //here to write the LRTable add2Table(stateIndex, targetState, i);//3 params means to reduce findShift=true; theSet.reset();//prepare for next token if (!expandFinished[targetState]) { expandState(targetState); } }//no one is set } /* sets[stateIndex]->restart(); while ((itemIndex=sets[stateIndex]->next())!=-1) { if ((tIndex=findNextTokenOfItem(itemIndex))==-1) { findReduce=true; int ruleIndex, rulePos, varIndex; //only for easy reading varIndex=items[itemIndex]->varIndex; rulePos=items[itemIndex]->rulePos; ruleIndex =token[varIndex]->production[rulePos]; add2Table(stateIndex, ruleIndex); } } //conflict if (findReduce&&findShift) { errorHandle(ShiftReduceConflict); } */ } //this is for reduce void Grammar::add2Table(int stateIndex, int ruleIndex) { int result= reduce2Mask(ruleIndex); //currently this is for LR(0) for (int i=0; i<tokenCount; i++) { if (token[i]->terminal) { LRTable[stateIndex][i]=result; } } } //this is for shift void Grammar::add2Table(int stateIndex, int targetIndex, int tIndex) { int result = shift2Mask(targetIndex); LRTable[stateIndex][tIndex]=result; } int Grammar::findNextTokenOfItem(int itemIndex) { int tIndex, theNo, thePos, theVar; tIndex=items[itemIndex]->varIndex;//easy for reading theNo=items[itemIndex]->rulePos; thePos=items[itemIndex]->dotPos; //maybe I should write small function to wrap this kind of indexing!!!! theVar=production[token[tIndex]->production[theNo]][thePos]; return theVar; } bool Grammar::isReduce(int input) { return (SHIFTREDUCEMASK&input)==REDUCEMASK; } bool Grammar::isShift(int input) { return (SHIFTREDUCEMASK&input)==SHIFTMASK; } int Grammar::mask2State(int input) { return STATEMASK&input; } //these are trivial functions and will later be changed to be inline int Grammar::reduce2Mask(int input) { return REDUCEMASK|input; } int Grammar::shift2Mask(int input) { return SHIFTMASK|input; } BitSet::BitSet() { current=-1; reset(); } BitSet& BitSet::operator =(const BitSet& theSet) { reset(); this->operator |=(theSet); return *this; } int BitSet::next() { for (int i=current+1; i<bitset_size; i++) { if (at(i)) { current=i; return i; } } current=-1; return -1; } bool operator == (Item& first, Item& second) { return (first.dotPos==second.dotPos)&&(first.rulePos==second.rulePos)&& (first.varIndex==second.varIndex); }

#include "grammar.h" const int MaxStackLength=50; class Parser { private: int stack[MaxStackLength]; bool push(int num); bool pop(int& num); int top; void initialize(); bool pushToken(int tIndex, int theToken); void prepare(); void pushRule(int theRule); public: Parser(); void parseFile(const char* fileName); };

#include <iostream> #include "parser.h" #include "scanner.h" #include "errorNo.h" using namespace std; Scanner scanner; extern Grammar grammar; const char* defaultListFile="nickListFile.txt"; extern int startSymbolIndex; extern int stackBottomIndex; extern int table[MaxTokenCount][MaxTokenCount]; extern int token2type[MaxTokenCount]; extern int type2token[MaxTokenCount]; extern int emptyIndex; bool Parser::pushToken(int tIndex, int theToken) { if (table[tIndex][theToken]!=-1) { cout<<grammar.token[tIndex]->name<<" => "; grammar.printRule(table[tIndex][theToken]); cout<<endl; pushRule(table[tIndex][theToken]); return true; } return false; } void Parser::pushRule(int theRule) { int len=0; while (grammar.production[theRule][len]!=-1) { len++; } for (int i=len-1; i>=0; i--) { if (!push(grammar.production[theRule][i])) { errorHandle(StackOverFlow); } } } void Parser::parseFile(const char*fileName) { int theToken, theVar; bool canPop=true; scanner.readFromFile(fileName, defaultListFile); prepare(); while (scanner.nextToken()) { if (!canPop) { errorHandle(UnexpectedEmptyStack); } if (scanner.token.type==COMMENTTYPE) { continue; } theToken=type2token[scanner.token.type]; while ((canPop=pop(theVar))==true) { if (grammar.token[theVar]->terminal) { if (theToken==theVar)//match { break; } else { if (theVar==emptyIndex) { continue; } errorHandle(IllegalGrammarToken); } } else { if (!pushToken(theVar, theToken)) { errorHandle(IllegalGrammarToken); } } } } if (pop(theVar)) { if (theVar!=stackBottomIndex) { errorHandle(NotEmptyStack); } /* if (theVar!=stackBottomIndex) { errorHandle(NotEmptyStack); } */ } } Parser::Parser() { initialize(); } void Parser::initialize() { top=0; } bool Parser::pop(int& num) { if (top==0) { return false; } num=stack[--top]; return true; } bool Parser::push(int num) { if (top==MaxStackLength-1) { return false; } stack[top++]=num; return true; } void Parser::prepare() { top=0; pushRule(grammar.token[startSymbolIndex]->production[0]); }

/////////////////////////////////////////////////////////////////////////////////////////// //Program: SLang Scanner //Author: Qingzhe Huang //Date: Jan. 18, 2004 //FileName: scanner.h //Features: // 1. I want to improve efficiency of scanning, so I used table-driven method. // 2. I used enum to represent character of all ASCII---CharType---where "space, tab, // end of line, end of file are all considered to be White Space. // 3. All legal token is represented by enum TokenType. // 4. I defined a huge amount of TokenState which is basically the state of a DFA. As // I don't want to search reserved keyword with linear search or whatever, I have // many states for the reserved words. // 5. I deliberately make the sequence of first 38 TokenState elements exactly same as // all that of TokenType, so that each final state of DFA has a 1-1 correspondence with // type of token. // 6. I defined a struct of Token which may be used in future parser. // 7. I defined an errorNo variable to represent various errors. And a series error string // for displaying information. // 8. When class Scanner is created, it will initialize the big "state-charType" table. // 9. When readFromFile is called, it will first read one char in advance. // 10. When an error is encountered, the caller of Scanner should understand that no further // char is read in. So, stop calling "nextToken()". This is a bit controvercial, and I // plan to change it in next version. //////////////////////////////////////////////////////////////////////////////////////////// /*//////////////////////////////////////////////////////////////////////////// Program: SLang Scanner Author: Qingzhe Huang Date: Jan. 21, 2004 FileName: scanner.h Features: 1. I restructured the struct Token, to make it a union field in order to store int value for number. 2. I restructured the function "nextToken()" in order to give out correct line no. when error occurs. *//////////////////////////////////////////////////////////////////////////////// #ifndef SCANNER_H #define SCANNER_H #include <iostream> using namespace std; extern enum ErrorCode; const int TokenStateCount=138; const int CharTypeCount=72; const int MaxTokenLength=255; const int MaxNumberLength=12; enum CharType { //all small letters 26 SMALLA,SMALLB,SMALLC,SMALLD,SMALLE,SMALLF,SMALLG,SMALLH,SMALLI,SMALLJ,SMALLK,SMALLL, SMALLM,SMALLN,SMALLO,SMALLP,SMALLQ,SMALLR,SMALLS,SMALLT,SMALLU,SMALLV,SMALLW,SMALLX, SMALLY,SMALLZ, //all big letters 26 BIGA,BIGB,BIGC,BIGD,BIGE,BIGF,BIGG,BIGH,BIGI,BIGJ,BIGK,BIGL,BIGM,BIGN,BIGO,BIGP,BIGQ, BIGR,BIGS,BIGT,BIGU,BIGV,BIGW,BIGX,BIGY,BIGZ, //all digit 1 DIGIT, //all symbols 16 QUOTE, OPENPAR, CLOSEPAR, SEMICOLON,PLUS, MINUS, TIMES, SLASH, COLON, EQUAL,SMALLER,GREATER,EXCLAIM,OPENBRACKET, CLOSEBRACKET,COMMA, //space, tab, end of line are regarded as whitespace, 1 WHITESPACE, //UNDERSCORE IS A SPECIAL SYMBOL 1 UNDERSCORE, //all other ASCII is regarded as illegal 1 ILLEGAL }; //TOTAL 38, JUST 1-1 WITH THE FIRST 38 OF TOKENSTATE enum TokenType { //GENERAL TYPE 5 IDTYPE, NUMBERTYPE, CHARCONSTTYPE, COMMENTTYPE, ERRORTYPE, //THE FOLLOWING ARE SYMBOL TYPE 18 OPENPARTYPE, CLOSEPARTYPE, SEMICOLONTYPE, PLUSTYPE, MINUSTYPE, TIMESTYPE, SLASHTYPE, ASSIGNMENTTYPE, SMALLERTYPE, GREATERTYPE, EQUALTYPE, SMALLEREQUALTYPE, GREATEREQUALTYPE, NOTEQUALTYPE, OPENBRACKETTYPE, CLOSEBRACKETTYPE, COMMATYPE, COLONTYPE, //THE FOLLOWING ARE RESERVED TYPE 15 BEGINTYPE, ENDTYPE, PROGRAMTYPE, VARIABLESTYPE,INTEGERTYPE, ARRAYTYPE, CHARTYPE, MODULETYPE, IFTYPE, THENTYPE, ELSETYPE, LOOPTYPE, EXITTYPE, READTYPE, WRITETYPE }; //total 138 states enum TokenState { //THE FINAL STATE 38, in order to easy initialize "finalState", I put them in beginning //5 generals IDEND, NUMBEREND, CONSTCHAREND, COMMENTEND, ERROR, //18 symbols OPENPAREND, CLOSEPAREND, SEMICOLONEND, PLUSEND, MINUSEND, TIMESEND, SLASHEND, ASSIGNMENTEND, SMALLEREND, GREATEREND, EQUALEND, SMALLEREQUALEND, GREATEREQUALEND, NOTEQUALEND, OPENBRACKETEND, CLOSEBRACKETEND, COMMAEND, COLONEND, //15 reserved BEGINEND, ENDEND, PROGRAMEND, VARIABLESEND, INTEGEREND, ARRAYEND, CHAREND, MODULEEND, IFEND, THENEND, ELSEEND, LOOPEND, EXITEND, READEND, WRITEEND, //THE FOLLOWING ARE ALL NON-FINAL STATES //THE very FIRST CHAR 1 READY, //THE FOLLOWING ARE ALL RESERVED STATE //the first char 12 ARRAY1, BEGIN1, CHAR1, E1, I1, LOOP1, MODULE1, PROGRAM1, READ1, THEN1, VARIABLES1, WRITE1, //THE SECOND CHAR 15 ARRAY2, BEGIN2, CHAR2, ELSE2, END2, EXIT2, IF2, INTEGER2, LOOP2, MODULE2, PROGRAM2, READ2, THEN2, VARIABLES2, WRITE2, //THE THIRD CHAR 14 ARRAY3, BEGIN3, CHAR3, ELSE3, END3, EXIT3, INTEGER3, LOOP3, MODULE3, PROGRAM3, READ3, THEN3, VARIABLES3, WRITE3, //THE FOURTH CHAR 13 ARRAY4, BEGIN4, CHAR4, ELSE4, EXIT4, INTEGER4, LOOP4, MODULE4, PROGRAM4, READ4, THEN4, VARIABLES4, WRITE4, //THE FIFTH CHAR 7 ARRAY5, BEGIN5, INTEGER5, MODULE5, PROGRAM5, VARIABLES5, WRITE5, //THE SIXTH CHAR 4 INTEGER6, MODULE6, PROGRAM6, VARIABLES6, //THE SEVENTH CHAR 3 INTEGER7, PROGRAM7, VARIABLES7, //THE EIGHTH CHAR 1 VARIABLES8, //THE NINETH CHAR 1 VARIABLES9, //THESE ARE NON-RESERVED //THESE ARE GENERAL 9 IDBEGIN, IDUNDERSCORE, NUMBERBEGIN, CONSTCHARQUOTEBEGIN, CONSTCHARBEGIN, COMMENTSTARBEGIN, COMMENTBEGIN, COMMENTSTAREND, COMMENTSLASHBEGIN, //the SINGLE symbols 16 QUOTEBEGIN, OPENPARBEGIN, CLOSEPARBEGIN, SEMICOLONBEGIN, PLUSBEGIN, MINUSBEGIN, TIMESBEGIN, SLASHBEGIN, COLONBEGIN, SMALLERBEGIN, GREATERBEGIN, EQUALBEGIN, EXCLAIMBEGIN, OPENBRACKETBEGIN, CLOSEBRACKETBEGIN, COMMABEGIN, //MULTI SYMBOL 4 ASSIGNMENTBEGIN, SMALLEREQUALBEGIN, GREATEREQUALBEGIN, NOTEQUALBEGIN }; //extern ErrorCode errorNo; extern void errorHandle(ErrorCode errorNo); struct Token { TokenType type; union { char name[MaxTokenLength+1]; int number; }; }; class Scanner { private: int tokenCount; unsigned char ch; void printLineNo(); FILE* stream; bool nextChar(); void initialize(); bool resume(); public: Scanner(); static Token token; bool readFromFile(const char* fileName, const char* listFileName); const char* getToken(){return token.name;} bool nextToken(); void report(); }; void initialTokenState(); #endif

/*//////////////////////////////////////////////////////////////////////////// Program: SLang Scanner Author: Qingzhe Huang Date: Jan. 21, 2004 FileName: scanner.cpp Features: 1. As Dr. Optrany said, the number should be stored as int or double whatever. 2. I restructured the function "nextToken()" in order to give out correct line no. when error occurs. *//////////////////////////////////////////////////////////////////////////////// #include <iostream> #include <fstream> #include "scanner.h" #include "errorNo.h" using namespace std; ofstream fList; //this will determine how many errors of maximum the scanner will tolerant const int MaxErrortolerant=10; //as integer usually have max 12 digit roughly int errorCount=0; int lineCount=1; //static memeber Token Scanner::token; const int TokenTypeCount=38; //extern void errorHandle(int errorNo); extern char* errorStr[ErrorCount]; //this is purely for displaying purpose char* tokenTypeStr[TokenTypeCount]= { //GENERAL TYPE 5 "ID", "NUMBER", "CHARACTER CONSTANT", "COMMENT", "ERROR", //THE FOLLOWING ARE SYMBOL TYPE 18 "(", ")", ";", "+", "-", "*", "/", ":=", "<", ">", "=", "<=", ">=", "!=", "[", "]", ",", ":", //THE FOLLOWING ARE RESERVED TYPE 15 "begin", "end", "program", "variables","integer", "array", "char", "module", "if", "then", "else", "loop", "exit", "read", "write" }; CharType charType[256]; TokenState tokenState[TokenStateCount][CharTypeCount]; //when error occurs, no message is immediately output, it is //postponed to next time, because when '\n' is read in, //lineCount is not incremented until token is decided. //Therefore, when a token is ended with '\n', the line no is not updated //until next round. So, we can keep the correct line no. for each token bool Scanner::nextToken() { TokenState state=READY; int digitCount=0; int value=0; int count=0;//to count the length of token char* ptr=token.name; bool isComment=false; do { //map ch to CharType reducing 256 ASCII to 73 CharTypes //the table for "state" and "CharType is 138x73, each entry is a //index for state. state=tokenState[state][charType[ch]]; if (state==NUMBERBEGIN) { digitCount++; if (digitCount>=MaxNumberLength) { errorHandle(ExceedNumberLimit); return resume(); } //to accumulate the value value*=10; value+=ch-'0'; } //because I put all final state in the first 38 positions if (state<38) { //This is a dirty trick! Because I make the "TokenType" 1-1 with //TokenState for the 38 finals. *ptr='\0'; token.type=(TokenType)(state); if (state==ERROR) { errorHandle(IllegalToken); //printLineNo(); return resume(); } tokenCount++; if (state==NUMBEREND) { token.number=value; } //printLineNo(); return true; } if (state==COMMENTBEGIN) { isComment=true; } if (count>=MaxTokenLength) { errorHandle(TokenTooLong); token.type=ERRORTYPE; //printLineNo(); return false; } //cout<<ch; if (!isComment&&state!=READY) { *ptr=ch; ptr++; count++; } //it is only at end to update line no. printLineNo(); }while (nextChar()); state=tokenState[state][charType[ch]]; //at this point, it is either in ready state, or error state if (state==ERROR) { token.type=(TokenType)(state); errorHandle(UnexpectedReachEOF); } //but in all case it means end of file, so return false return false; } bool Scanner::resume() { //the scanner will try to continue if error number is within 10 if (errorCount==MaxErrortolerant) { return false; } return nextChar(); } void Scanner::report() { fList<<"\ntotal number of tokens is "<<tokenCount; fList<<"\ntotal number of errors is "<<errorCount; } void Scanner::printLineNo() { if (ch=='\n') { fList<<++lineCount<<" "; } } Scanner::Scanner() { initialize(); } void Scanner::initialize() { errorCount=0; lineCount=1; tokenCount=0; initialTokenState(); } bool Scanner::readFromFile(const char* fileName, const char* listFileName) { if ((stream=fopen(fileName, "r"))==NULL) { errorHandle(CannotOpenFile); return false; } else { fList.open(listFileName); fList<<lineCount<<" "; //this is to prevent the empty file situation in which //you cannot even read one single char because my scanner need to read //one char ahead if (!nextChar()) { errorHandle(FileEmptyError); return false; } } return true; } bool Scanner::nextChar() { ch=fgetc(stream); fList<<ch; return ch!=255; }

#ifndef ERRORNO_H #define ERRORNO_H extern char* errorStr[]; void errorHandle(int errorNo); const int ErrorCount=10; const int ScannerErrorCount=6; const int ParserErrorCount=4; //errors of scanner = 6 #define IllegalToken 0 #define TokenTooLong 1 #define UnexpectedReachEOF 2 #define FileEmptyError 3 #define CannotOpenFile 4 #define ExceedNumberLimit 5 //error of parser =4 #define UnexpectedEmptyStack 6 #define IllegalGrammarToken 7 #define NotEmptyStack 8 #define StackOverFlow 9 //error of LR(0) #define ShiftReduceConflict 10 #endif

#include <iostream> #include <fstream> #include "scanner.h" #include "errorNo.h" using namespace std; char* errorStr[ErrorCount]= { //these are scanner errors: "IllegalToken", "TokenTooLong", "UnexpectedReachEOF", "FileEmptyError", "CannotOpenFile", "ExceedNumberLimit", //these are parser errors "UnexpectedEmptyStack", "IllegalGrammarToken", "NotEmptyStack", "StackOverFlow" }; extern int errorCount; extern int lineCount; extern ofstream fList; //this is going to be improved in future as parser need to //call it, too. so, more parameter should be added? //No! the error no. itself specifies the error and it is //error handler to try to find necessary info to display. void errorHandle(int errorNo) { if (errorNo<ScannerErrorCount) { errorCount++; //the illegal token may be for various reason and I only suggest //a possible nearby place to spot the error occurs. fList<<"\nerror of "<<errorStr[errorNo]<<" occurred at line " <<lineCount<<" near token "<<Scanner::token.name<<endl; } else { if (errorNo<ScannerErrorCount+ParserErrorCount) { errorCount++; fList<<"\nerror of "<<errorStr[errorNo]<<" occurred at line " <<lineCount<<" near token "<<Scanner::token.name<<endl; exit(errorNo); } } }

/*//////////////////////////////////////////////////////////////////////////// Program: SLang Scanner Author: Qingzhe Huang Date: Jan. 18, 2004 FileName: initialize.cpp Features: 1. This is purely mechnical job, you know to initialize a huge state table: 138x72 is really a boring, routine job. 2. For EOF, I want "ch" to be able to be an index in "CharType" array, so, it cannot be -1, but 255 for "unsigned char" which is declared in class Scanner. *//////////////////////////////////////////////////////////////////////////////// #include "scanner.h" extern enum CharType; extern enum TokenState; extern CharType charType[256]; extern TokenState tokenState[TokenStateCount][CharTypeCount]; void finalSymbolToken(TokenState state, TokenState endState); void finalReservedToken(TokenState state, TokenState endState); void initialCharType(); void setFinalTokenState(); void initialReserved(TokenState state); void setRange(TokenState state, CharType start, CharType end, TokenState target); void setState(TokenState state, TokenState targetState); void setDefaultState(); void setDefaultState() { //all states are by default error for (int i=0; i<TokenStateCount; i++) { setState((TokenState)i, ERROR); } //the default for all letters are IDBEGIN setRange(READY, SMALLA, BIGZ, IDBEGIN); //THIS IS another dirty trick, since I put all reserved states together //so you can initialize them together. for (i=ARRAY1; i<=VARIABLES9; i++) { initialReserved((TokenState)i); } setFinalTokenState(); } void setFinalTokenState() { //FOR ID finalReservedToken(IDBEGIN, IDEND); //for number finalReservedToken(NUMBERBEGIN, NUMBEREND); //THESE FOR RESERVED WORDS finalReservedToken(ARRAY5, ARRAYEND); finalReservedToken(BEGIN5, BEGINEND); finalReservedToken(CHAR4, CHAREND); finalReservedToken(ELSE4, ELSEEND); finalReservedToken(END3, ENDEND); finalReservedToken(EXIT4, EXITEND); finalReservedToken(IF2, IFEND); finalReservedToken(INTEGER7, INTEGEREND); finalReservedToken(LOOP4, LOOPEND); finalReservedToken(MODULE6, MODULEEND); finalReservedToken(PROGRAM7, PROGRAMEND); finalReservedToken(READ4, READEND); finalReservedToken(THEN4, THENEND); finalReservedToken(VARIABLES9, VARIABLESEND); finalReservedToken(WRITE5, WRITEEND); //THESE FOR SYMBOLS finalSymbolToken(OPENPARBEGIN, OPENPAREND); finalSymbolToken(CLOSEPARBEGIN, CLOSEPAREND); finalSymbolToken(SEMICOLONBEGIN, SEMICOLONEND); finalSymbolToken(PLUSBEGIN, PLUSEND); finalSymbolToken(MINUSBEGIN, MINUSEND); finalSymbolToken(TIMESBEGIN, TIMESEND); finalSymbolToken(SLASHBEGIN, SLASHEND); finalSymbolToken(ASSIGNMENTBEGIN, ASSIGNMENTEND); finalSymbolToken(SMALLERBEGIN, SMALLEREND); finalSymbolToken(GREATERBEGIN, GREATEREND); finalSymbolToken(EQUALBEGIN, EQUALEND); finalSymbolToken(SMALLEREQUALBEGIN, SMALLEREQUALEND); finalSymbolToken(GREATEREQUALBEGIN, GREATEREQUALEND); finalSymbolToken(NOTEQUALBEGIN, NOTEQUALEND); finalSymbolToken(OPENBRACKETBEGIN, OPENBRACKETEND); finalSymbolToken(CLOSEBRACKETBEGIN, CLOSEBRACKETEND); finalSymbolToken(COMMABEGIN, COMMAEND); finalSymbolToken(COLONBEGIN, COLONEND); //COMMENT finalSymbolToken(COMMENTSLASHBEGIN, COMMENTEND); //CONSTCHAR finalSymbolToken(CONSTCHARQUOTEBEGIN, CONSTCHAREND); } void initialTokenState() { //initialize all charType initialCharType(); //default is always error setDefaultState(); //loop tokenState[READY][WHITESPACE]=READY; //number tokenState[READY][DIGIT]=NUMBERBEGIN; tokenState[NUMBERBEGIN][DIGIT]=NUMBERBEGIN;//HOW LONG SHOULD NUMBER BE? //ID //setRange(READY, SMALLA, BIGZ, IDBEGIN); THIS IS IN DEFAULT setRange(IDBEGIN, SMALLA, DIGIT, IDBEGIN); tokenState[IDBEGIN][UNDERSCORE]=IDUNDERSCORE; setRange(IDUNDERSCORE, SMALLA, DIGIT, IDBEGIN); //reserved words //ARRAY1, BEGIN1, CHAR1, E1, I1, LOOP1, MODULE1, PROGRAM1, READ1, THEN1, WRITE1, //VARIABLES1, tokenState[READY][SMALLA]=ARRAY1; tokenState[READY][SMALLB]=BEGIN1; tokenState[READY][SMALLC]=CHAR1; tokenState[READY][SMALLE]=E1; tokenState[READY][SMALLI]=I1; tokenState[READY][SMALLL]=LOOP1; tokenState[READY][SMALLM]=MODULE1; tokenState[READY][SMALLP]=PROGRAM1; tokenState[READY][SMALLR]=READ1; tokenState[READY][SMALLT]=THEN1; tokenState[READY][SMALLV]=VARIABLES1; tokenState[READY][SMALLW]=WRITE1; /* RESERVED WORDS ARRAY2 */ tokenState[ARRAY1][SMALLR]=ARRAY2; //BEGIN2 tokenState[BEGIN1][SMALLE]=BEGIN2; //CHAR2 tokenState[CHAR1][SMALLH]=CHAR2; //ELSE2, tokenState[E1][SMALLL]=ELSE2; //EXIT2 tokenState[E1][SMALLX]=EXIT2; //END2 tokenState[E1][SMALLN]=END2; //IF2 tokenState[I1][SMALLF]=IF2; //INTEGER2 tokenState[I1][SMALLN]=INTEGER2; //LOOP2 tokenState[LOOP1][SMALLO]=LOOP2; //MODULE2 tokenState[MODULE1][SMALLO]=MODULE2; //PROGRAM2 tokenState[PROGRAM1][SMALLR]=PROGRAM2; //READ2 tokenState[READ1][SMALLE]=READ2; //THEN2 tokenState[THEN1][SMALLH]=THEN2; //VARIABLES2 tokenState[VARIABLES1][SMALLA]=VARIABLES2; //WRITE2 tokenState[WRITE1][SMALLR]=WRITE2; /* RESERVED WORDS ARRAY3 */ tokenState[ARRAY2][SMALLR]=ARRAY3; //BEGIN2 tokenState[BEGIN2][SMALLG]=BEGIN3; //CHAR2 tokenState[CHAR2][SMALLA]=CHAR3; //ELSE2, tokenState[ELSE2][SMALLS]=ELSE3; //END2 tokenState[END2][SMALLD]=END3; //EXIT2 tokenState[EXIT2][SMALLI]=EXIT3; //INTEGER2 tokenState[INTEGER2][SMALLT]=INTEGER3; //LOOP2 tokenState[LOOP2][SMALLO]=LOOP3; //MODULE2 tokenState[MODULE2][SMALLD]=MODULE3; //PROGRAM2 tokenState[PROGRAM2][SMALLO]=PROGRAM3; //READ2 tokenState[READ2][SMALLA]=READ3; //THEN2 tokenState[THEN2][SMALLE]=THEN3; //VARIABLES2 tokenState[VARIABLES2][SMALLR]=VARIABLES3; //WRITE2 tokenState[WRITE2][SMALLI]=WRITE3; /* RESERVED WORDS ARRAY3 */ tokenState[ARRAY3][SMALLA]=ARRAY4; //BEGIN2 tokenState[BEGIN3][SMALLI]=BEGIN4; //CHAR2 tokenState[CHAR3][SMALLR]=CHAR4; //ELSE2, tokenState[ELSE3][SMALLE]=ELSE4; //EXIT2 tokenState[EXIT3][SMALLT]=EXIT4; //INTEGER2 tokenState[INTEGER3][SMALLE]=INTEGER4; //LOOP2 tokenState[LOOP3][SMALLP]=LOOP4; //MODULE2 tokenState[MODULE3][SMALLU]=MODULE4; //PROGRAM2 tokenState[PROGRAM3][SMALLG]=PROGRAM4; //READ2 tokenState[READ3][SMALLD]=READ4; //THEN2 tokenState[THEN3][SMALLN]=THEN4; //VARIABLES2 tokenState[VARIABLES3][SMALLI]=VARIABLES4; //WRITE2 tokenState[WRITE3][SMALLT]=WRITE4; /* RESERVED WORDS ARRAY */ tokenState[ARRAY4][SMALLY]=ARRAY5; //BEGIN2 tokenState[BEGIN4][SMALLN]=BEGIN5; //INTEGER2 tokenState[INTEGER4][SMALLG]=INTEGER5; //MODULE2 tokenState[MODULE4][SMALLL]=MODULE5; //PROGRAM2 tokenState[PROGRAM4][SMALLR]=PROGRAM5; //VARIABLES2 tokenState[VARIABLES4][SMALLA]=VARIABLES5; //WRITE2 tokenState[WRITE4][SMALLE]=WRITE5; // RESERVED WORDS*/ //INTEGER2 tokenState[INTEGER5][SMALLE]=INTEGER6; //MODULE2 tokenState[MODULE5][SMALLE]=MODULE6; //PROGRAM2 tokenState[PROGRAM5][SMALLA]=PROGRAM6; //VARIABLES2 tokenState[VARIABLES5][SMALLB]=VARIABLES6; // RESERVED WORDS*/ //INTEGER2 tokenState[INTEGER6][SMALLR]=INTEGER7; //PROGRAM2 tokenState[PROGRAM6][SMALLM]=PROGRAM7; //VARIABLES2 tokenState[VARIABLES6][SMALLL]=VARIABLES7; // RESERVED WORDS*/ //VARIABLES2 tokenState[VARIABLES7][SMALLE]=VARIABLES8; //VARIABLES2 tokenState[VARIABLES8][SMALLS]=VARIABLES9; /* CONSTCHAR, UNDERSCOREBEGIN, ASSIGNMENTBEGIN, SMALLEREQUALBEGIN, GREATEREQUALBEGIN, NOTEQUAL, COMMENTBEGIN, IDUNDERSCORE,*/ //now is the symbols //QUOTEBEGIN, OPENPARBEGIN, CLOSEPARBEGIN, SEMICOLONBEGIN, //PLUSBEGIN, MINUSBEGIN, TIMESBEGIN, SLASHBEGIN, COLONBEGIN, SMALLERBEGIN, GREATERBEGIN, //EQUALBEGIN, EXCLAIMBEGIN, OPENBRACKETBEGIN, CLOSEBRACKETBEGIN, COMMABEGIN, //' tokenState[READY][QUOTE]=QUOTEBEGIN; //( tokenState[READY][OPENPAR]=OPENPARBEGIN; //) tokenState[READY][CLOSEPAR]=CLOSEPARBEGIN; //; tokenState[READY][SEMICOLON]=SEMICOLONBEGIN; //+ tokenState[READY][PLUS]=PLUSBEGIN; //- tokenState[READY][MINUS]=MINUSBEGIN; //* tokenState[READY][TIMES]=TIMESBEGIN; /// tokenState[READY][SLASH]=SLASHBEGIN; //: tokenState[READY][COLON]=COLONBEGIN; //< tokenState[READY][SMALLER]=SMALLERBEGIN; //> tokenState[READY][GREATER]=GREATERBEGIN; //= tokenState[READY][EQUAL]=EQUALBEGIN; //! tokenState[READY][EXCLAIM]=EXCLAIMBEGIN; //[ tokenState[READY][OPENBRACKET]=OPENBRACKETBEGIN; //] tokenState[READY][CLOSEBRACKET]=CLOSEBRACKETBEGIN; //, tokenState[READY][COMMA]=COMMABEGIN; //AFTER QUOTE IT CAN BE ANY CHARACTER, INCLUDING ILLEGAL CHAR setRange(QUOTEBEGIN, SMALLA, ILLEGAL, CONSTCHARBEGIN); //ANY OTHER STATE IS BY DEFAULT ERROR tokenState[CONSTCHARBEGIN][QUOTE]=CONSTCHARQUOTEBEGIN; //FOR /, DEFAULT IS SLASHEND, EXCEPT * WHICH IS COMMENTSTARBEGIN tokenState[SLASHBEGIN][TIMES]= COMMENTSTARBEGIN; //FOR :, DEFAULT IS COLONEND, EXCEPT FOR = WHICH IS ASSIGNMENTBEGIN tokenState[COLONBEGIN][EQUAL]= ASSIGNMENTBEGIN; //FOR <, DEFAULT IS SMALLEREND, EXCEPT FOR= WHICH IS SMALLEREQAULBEGIN tokenState[SMALLERBEGIN][EQUAL]=SMALLEREQUALBEGIN; //FOR >, DEFAULT IS GREATEREND, EXCEPT FOR= WHICH IS GREATEREQAULBEGIN tokenState[GREATERBEGIN][EQUAL]= GREATEREQUALBEGIN; tokenState[EXCLAIMBEGIN][EQUAL]= NOTEQUALBEGIN; //WITHIN COMMENT IT IS A LOOP, EXCEPT FOR * WHICH IS POSSIBLE FOR END OF COMMENT setRange(COMMENTSTARBEGIN, SMALLA, ILLEGAL, COMMENTBEGIN); tokenState[COMMENTSTARBEGIN][TIMES]=COMMENTSTAREND; setRange(COMMENTBEGIN, SMALLA, ILLEGAL, COMMENTBEGIN); tokenState[COMMENTBEGIN][TIMES]=COMMENTSTAREND; //FROM COMMENTSTARBEGIN, ALL IS BACK TO COMMENTBEGIN, EXCEPT / WHICH IS END OF COMMENT setRange(COMMENTSTAREND, SMALLA, ILLEGAL, COMMENTBEGIN); tokenState[COMMENTSTAREND][SLASH]=COMMENTSLASHBEGIN; // } void initialReserved(TokenState state) { setRange(state, SMALLA, DIGIT, IDBEGIN); finalReservedToken(state, IDEND); tokenState[state][UNDERSCORE]=IDUNDERSCORE;//a_ } void finalSymbolToken(TokenState state, TokenState endState) { for (int i=SMALLA; i<=WHITESPACE; i++) { tokenState[state][(CharType)i]=endState; } } void finalReservedToken(TokenState state, TokenState endState) { //all non-letter, non-digit is regarded to be delimeter for (int i=QUOTE; i<=WHITESPACE; i++) { tokenState[state][(CharType)i]=endState; } } //the default charType is ILLEGAL void initialCharType() { int chType; //the default charType is ILLEGAL for (int i=0; i<256; i++) { charType[i]=ILLEGAL; } //chType is SMALLA chType=SMALLA; for (i='a'; i<='z'; i++) { charType[i]=(CharType)(chType); chType++; } //chType is now BIGA chType=BIGA;//I don't want to rely on the trick. for (i='A'; i<='Z'; i++) { charType[i]=(CharType)(chType); chType++; } chType=DIGIT; for (i='0'; i<='9'; i++) { charType[i]=(CharType)(chType); } /* UNDERSCORE, QUOTE, OPENPAR, CLOSEPAR, SEMICOLON,PLUS, MINUS, TIMES, SLASH, COLON, EQUAL,SMALLER,GREATER,EXCLAIM,OPENBRACKET, CLOSEBRACKET,COMMA, SPACE,TAB, ENDLINE, ILLEGAL */ charType['_']=UNDERSCORE; charType['\'']=QUOTE; charType['(']=OPENPAR; charType[')']=CLOSEPAR; charType[';']=SEMICOLON; charType['+']=PLUS; charType['-']=MINUS; charType['*']=TIMES; charType['/']=SLASH; charType[':']=COLON; charType['=']=EQUAL; charType['<']=SMALLER; charType['>']=GREATER; charType['!']=EXCLAIM; charType['[']=OPENBRACKET; charType[']']=CLOSEBRACKET; charType[',']=COMMA; charType[' ']=WHITESPACE; charType['\t']=WHITESPACE; charType[10]=WHITESPACE; charType[13]=WHITESPACE; //pls note, since I changed the type of "ch" to be "unsigned char" //the EOF now is not -1, but 255 charType[255]=WHITESPACE;//IT IS A KIND OF DELIMETER } void setRange(TokenState state, CharType start, CharType end, TokenState target) { for (int i=start; i<=end; i++) { tokenState[state][i]=target; } } void setState(TokenState state, TokenState targetState) { for (int i=0; i<CharTypeCount; i++) { tokenState[state][i]=targetState; } }

#include <iostream> #include "CFGReader.h" #include "Parser.h" using namespace std; int main( int argc, char *argv[ ]) { if (!(argc==1||argc==3)) { cout<<"usage: CRGReader grammarSourceFileName sourceFileToBeParsed"; exit(1); } CFGReader R; Parser P; if (argc==1) { R.readFromFile("c:\\ruleTest.txt"); } else { R.readFromFile(argv[1]); } R.optimize(); //R.calculateLookAhead(); //R.print(); cout<<"\n now begin parsing...\n"; if (argc==1) { P.parseFile("c:\\sourceTest.txt"); } else { P.parseFile(argv[2]); } return 0; }

M -> program i ; Dl B Dl -> Dv Ml Ml -> Ml Mo | e Mo -> module i ( Vl ) Dv B Dv -> variables Vl | e Vl -> Vl V | V V -> Il : T ; T -> integer Ad | char Ad Il -> i | Il , i L -> i Ar Ad -> e | array [ n ] B -> begin Sl end ; Sl -> S | S Sl S -> L := E ; | if C then S else S | loop Sl end ; | exit ; | i ( Lp ) ; | B | read Ln ; | write Lo ; | e ; Lp -> e | Ln Ln -> L { , L } Lo -> Lr { , Lr } Lr -> i Ar | n | c Ar -> e | [ E ] E -> F { Oa F } Oa -> + | - F -> R { Om R } Om -> * | / R -> L | n | ( E ) | c C -> E Or E Or -> = | < | > | <= | >= | !=

program average; /* a sample program. It calculates the average of integers, */ /* the first value gives the number of integers to read*/ variables n: integer; ave, total: integer; module sum(n, res :integer;) /* sums up n integers from input, puts the result into res*/ variables i: integer; val: integer array[100]; begin i:=i; res :=0; loop read val[i]; res:= res+ val[i]; i := i+1; if i>n then exit; else ; end; /* loop */ end;/* module sum*/ begin /* main program*/ read n; sum(n, total); ave:=(((total/n))); write 'a', 'v','e','r','a','g','e','='; write ave; end; /* of program*/

M ==> program i ; Dl B Dl ==> Dv Ml B ==> begin Sl end ; Dv ==> variables Vl | e Ml ==> e Ml0 Mo ==> module i ( Vl ) Dv B Vl ==> V Vl0 V ==> Il : T ; Il ==> i Il0 T ==> integer Ad | char Ad Ad ==> e | array [ n ] L ==> i Ar Ar ==> e | [ E ] Sl ==> S Sl0 S ==> i S0 | if C then S else S | loop Sl end ; | exit ; | begin Sl end ; | read Ln ; | write Lo ; | e ; E ==> F M0 C ==> F M0 Or E Lp ==> e | Ln Ln ==> i Ar M1 Lo ==> Lr M2 Lr ==> i Ar | n | c F ==> R M3 Oa ==> + | - R ==> i Ar | n | ( E ) | c Om ==> * | / Or ==> = | < | > | <= | >= | != M0 ==> + F M0 | e | - F M0 M1 ==> , L M1 | e M2 ==> , Lr M2 | e M3 ==> * R M3 | e | / R M3 Ml0 ==> module i ( Vl ) Dv B Ml0 | e Vl0 ==> i Il0 : T ; Vl0 | e Il0 ==> , i Il0 | e Sl0 ==> e | Sl S0 ==> Ar := E ; | ( Lp ) ; START ==> M $ name: M isNull: false isTerminal: false first: [0]=program follow: [0]=$ name: Dl isNull: true isTerminal: false first: [0]=e,[1]=variables,[2]=module follow: [0]=begin name: B isNull: false isTerminal: false first: [0]=begin follow: [0]=$,[1]=module,[2]=begin name: Dv isNull: true isTerminal: false first: [0]=variables,[1]=e follow: [0]=module,[1]=begin name: Ml isNull: true isTerminal: false first: [0]=e,[1]=module follow: [0]=begin name: Mo isNull: false isTerminal: false first: [0]=module follow: name: Vl isNull: false isTerminal: false first: [0]=i follow: [0]=module,[1]=begin,[2]=) name: V isNull: false isTerminal: false first: [0]=i follow: [0]=i,[1]=module,[2]=begin,[3]=) name: Il isNull: false isTerminal: false first: [0]=i follow: [0]=: name: T isNull: false isTerminal: false first: [0]=integer,[1]=char follow: [0]=; name: Ad isNull: true isTerminal: false first: [0]=e,[1]=array follow: [0]=; name: L isNull: false isTerminal: false first: [0]=i follow: [0]=,,[1]=;,[2]=) name: Ar isNull: true isTerminal: false first: [0]=e,[1]=[ follow: [0]=,,[1]=:=,[2]=;,[3]=),[4]=*,[5]=/,[6]=+,[7]=-,[8]=],[9]==,[10]=<,[11]=>,[12]=<=,[13]=> =,[14]=!=,[15]=then name: Sl isNull: false isTerminal: false first: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=; follow: [0]=end name: S isNull: false isTerminal: false first: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=; follow: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=;,[8]=end,[9]=else name: E isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=(,[3]=c follow: [0]=],[1]=),[2]=;,[3]=then name: C isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=(,[3]=c follow: [0]=then name: Lp isNull: true isTerminal: false first: [0]=e,[1]=i follow: [0]=) name: Ln isNull: false isTerminal: false first: [0]=i follow: [0]=;,[1]=) name: Lo isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=c follow: [0]=; name: Lr isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=c follow: [0]=,,[1]=; name: F isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=(,[3]=c follow: [0]=+,[1]=-,[2]=],[3]=),[4]=;,[5]==,[6]=<,[7]=>,[8]=<=,[9]=>=,[10]=!=,[11]=then name: Oa isNull: false isTerminal: false first: [0]=+,[1]=- follow: name: R isNull: false isTerminal: false first: [0]=i,[1]=n,[2]=(,[3]=c follow: [0]=*,[1]=/,[2]=+,[3]=-,[4]=],[5]=),[6]=;,[7]==,[8]=<,[9]=>,[10]=<=,[11]=>=,[12]=!=,[13]= then name: Om isNull: false isTerminal: false first: [0]=*,[1]=/ follow: name: Or isNull: false isTerminal: false first: [0]==,[1]=<,[2]=>,[3]=<=,[4]=>=,[5]=!= follow: [0]=i,[1]=n,[2]=(,[3]=c name: M0 isNull: true isTerminal: false first: [0]=+,[1]=e,[2]=- follow: [0]=],[1]=),[2]=;,[3]==,[4]=<,[5]=>,[6]=<=,[7]=>=,[8]=!=,[9]=then name: M1 isNull: true isTerminal: false first: [0]=,,[1]=e follow: [0]=;,[1]=) name: M2 isNull: true isTerminal: false first: [0]=,,[1]=e follow: [0]=; name: M3 isNull: true isTerminal: false first: [0]=*,[1]=e,[2]=/ follow: [0]=+,[1]=-,[2]=],[3]=),[4]=;,[5]==,[6]=<,[7]=>,[8]=<=,[9]=>=,[10]=!=,[11]=then name: Ml0 isNull: true isTerminal: false first: [0]=module,[1]=e follow: [0]=begin name: Vl0 isNull: true isTerminal: false first: [0]=i,[1]=e follow: [0]=module,[1]=begin,[2]=) name: Il0 isNull: true isTerminal: false first: [0]=,,[1]=e follow: [0]=: name: Sl0 isNull: true isTerminal: false first: [0]=e,[1]=i,[2]=if,[3]=loop,[4]=exit,[5]=begin,[6]=read,[7]=write,[8]=; follow: [0]=end name: S0 isNull: false isTerminal: false first: [0]=[,[1]=:=,[2]=( follow: [0]=i,[1]=if,[2]=loop,[3]=exit,[4]=begin,[5]=read,[6]=write,[7]=;,[8]=end,[9]=else name: START isNull: false isTerminal: false first: [0]=program follow: Press any key to continue