CFGReader

This is fourth edition of my CFG Reader which stands for Context-Free-Grammar Reader. In this edition, I finished

the second part of removal-left-recursion.

Here you see the variable A which appears the left-most of rule and this will produce infinite loop for Recursive-

Descent parsing. For table-driven parsing, it is the same problem. So, how to remove?

program -> stmt-sequence
stmt-sequence -> stmt-sequence ; statement | statement
statement -> if-stmt | repeat-stmt | assign-stmt | read-stmt | write-stmt
if-stmt -> if boolean-exp then stmt-sequence end
| if boolean-exp then stmt-sequence else stmt-sequence end
repeat-stmt -> repeat stmt-sequence until boolean-exp
assign-stmt -> identifier := integer-exp
read-stmt -> read identifier
write-stmt -> write integer-exp
boolean-exp -> integer-exp comparison-op integer-exp
comparison-op -> < | =
integer-exp -> integer-exp addop term | term
addop -> + | -
term -> term mulop factor | factor
mulop -> * | /
factor -> ( integer-exp ) | number | identifier

Now we are in a position to remove the immediate-left-recursion! Taking the above example, the new grammar would

So, generally there are three steps:

1) Modify current rule to such that it removes the first repeating variable and shift-to-left by 1 and at end

2) Change all other rules to rules belonging to new variable and at end of each rule add the new variable.

3) Add an epsilon rule for new variable.

5. main.cpp (main)

#include "Grammar.h" class CFGReader { private: char buffer[BufferLength]; FILE* stream; void newRule(const char* str); void readRule(); void addRule(const char* str); int addToken(const char* str, bool isTerminal=true); void printRule(int index); public: void readFromFile(const char* fileName); void optimize(); void print(); };

#include <iostream> #include "CFGReader.h" using namespace std; Grammar grammar; const char* deliStr=" |\n"; //this would only remove the immediate left recursion //and should I expand it to remove all kinds of left-recursion? void CFGReader::readFromFile(const char* fileName) { if ((stream=fopen(fileName, "r"))==NULL) { cout<<"cannot open file "<<fileName<<endl; } else { readRule(); } } void CFGReader::readRule() { char* ptr=buffer; char* hold; int count=0; while (!feof(stream)) { count=0; fgets(buffer, BufferLength-1, stream); ptr=buffer; while ((ptr=strtok(ptr, " \n"))!=NULL) { if (strcmp(ptr, "|")!=0) { //test the first two token to see if old rule continues if (count==0) { hold=ptr; } if (count==1) { if (strcmp("->", ptr)==0) { /* curIndex=addToken(hold, false);//the index of cur token grammar[curIndex]->terminal=false; newRule(); */ newRule(hold); } else { addRule(hold); addRule(ptr); } } if (count>=2)//always add { addRule(ptr); } count++; } else { //this is an alternative production rule newRule(NULL); } ptr=NULL; } } } void CFGReader::newRule(const char* str) { grammar.newRule(str); } void CFGReader::optimize() { grammar.optimize(); } void CFGReader::addRule(const char* str) { grammar.addRule(str); } int CFGReader::addToken(const char* str, bool isTerminal) { return grammar.addToken(str, isTerminal); } void CFGReader::printRule(int index) { grammar.printRule(index); } void CFGReader::print() { grammar.print(); }

const int BufferLength=256; const int MaxTokenCount=50; const int MaxRHSCount=8; const int MaxRuleCount=50; const int MaxTokenLength=10; const int MaxProduction=10; struct Token { bool terminal; bool isNull; char* name; int production[MaxProduction];//pointer to the production it gives int count; int firstCount; int followCount; int follow[MaxTokenCount]; int first[MaxTokenCount]; }; class Grammar { private: Token* token[MaxTokenCount]; int tokenCount; int curIndex;//the index of current token int curPos;//the position at production rule //MaxRuleCount>=MaxVariableCount!!! int production[MaxRuleCount][MaxRHSCount]; int prodIndex;//pointing to current production, initialized to -1 int checkRecursion(int tIndex, int curToken); bool addIntoFirst(int target, int source, int ruleIndex); void shiftRule(int ruleIndex, bool Left2Right, int offset); void addAtEnd(int ruleIndex, int toAdd); void addRuleForToken(int tIndex, int ruleIndex); int copyRule(int source, int target);//return the position of -1 void replaceRule(int curToken, int ruleIndex); void initialize(); void removeRuleFromToken(int tIndex, int ruleIndex); void checkEpsilon(int ruleIndex); void removeImmediate(int tIndex, int ruleIndex); void doAddRule(int tIndex); void commonLeftFactor(); int findCommonFactor(int tokenIndex, int ruleIndex); void removeLeftRecursion(); void doCommonFactor(int tokenIndex, int ruleIndex, int index); int forgeToken(int index);//create a new variable void epsilonRule(int ruleIndex); //void calculateNull(); void calculateFirst(); void calculateFollow(); public: Grammar(); void optimize(); void printRule(int index); void print(); void addRule(const char* str); void newRule(const char* str);//this is an internal method, not suitable for outside Token* operator[](int index); int addToken(const char* str, bool isTerminal=true); Token* createToken(const char* theName, bool isTerminal); int tokenIndex(const char* str); };

#include <iostream> #include "Grammar.h" using namespace std; const char* emptyStr="empty"; int emptyIndex=-1; void Grammar::calculateFirst() { int start=0; while (start<tokenCount) { if (!token[start]->terminal) { } } } void Grammar::addRuleForToken(int tIndex, int ruleIndex) { token[tIndex]->production[token[tIndex]->count++]=ruleIndex; } void Grammar::shiftRule(int ruleIndex, bool left2Right, int offset) { int end=0; while (production[ruleIndex][end]!=-1) { end++; } if (left2Right) { for (int i=end; i>=0; i--) { production[ruleIndex][i+offset]=production[ruleIndex][i]; } } else { for (int i=0; i<=end-offset; i++) { production[ruleIndex][i]=production[ruleIndex][i+offset]; } checkEpsilon(ruleIndex); } } void Grammar::checkEpsilon(int ruleIndex) { if (production[ruleIndex][0]==-1) { epsilonRule(ruleIndex); } } bool Grammar::addIntoFirst(int target, int source, int ruleIndex) { /* int theFirst; bool added; //if the token epsilon is added, surely the token will be nullable if (token[source]->terminal) { token[target]->first[token[target]->firstCount++]=source; if (token[source]->isNull) { token[target]->isNull=true; } } else { added=false; for (int i=0; i<token[source]->count; i++) { theFirst=production[token[source]->production[i]][0]; if ( addIntoFirst(target, production[token[source]->production[0]][0]); } } */ return true; } void Grammar::addAtEnd(int ruleIndex, int toAdd) { int end=0; while (production[ruleIndex][end]!=-1) { end++; } production[ruleIndex][end++]=toAdd; production[ruleIndex][end]=-1; } //left-recursion: A -> A a | b | c //change to: A -> b A' | c A' // A' -> a A' | empty void Grammar::removeImmediate(int tIndex, int ruleIndex) { int newIndex=forgeToken(tIndex); int holdRuleIndex=token[tIndex]->production[ruleIndex]; //sequence matters! //change to: A -> b A' for (int i=0; i<token[tIndex]->count; i++) { if (i!=ruleIndex) { addAtEnd(token[tIndex]->production[i], newIndex); } } //shift /* token[tIndex]->count--; for (i=ruleIndex; i<token[tIndex]->count; i++) { token[tIndex]->production[i]=token[tIndex]->production[i+1]; } */ removeRuleFromToken(tIndex, ruleIndex); addRuleForToken(newIndex, holdRuleIndex); //token[newIndex]->production[token[newIndex]->count++]=holdRuleIndex; shiftRule(holdRuleIndex, false, 1); addAtEnd(holdRuleIndex, newIndex); //add epsilon rule for new variable epsilonRule(++prodIndex); addRuleForToken(newIndex, prodIndex); } int Grammar::forgeToken(int index) { char str[MaxTokenLength+2], ch; int len=strlen(token[index]->name); int temp=0, i=0; strcpy(str, token[index]->name); ch=str[len-1];//get last char of name if (ch>='0'&&ch<'9') { str[len-1]=ch+i+1; } else { str[len]='0'+i; str[len+1]='\0'; } //I won't bother to check repetitation of name while (tokenIndex(str)!=-1) { i++; if (ch>='0'&&ch<'9') { str[len-1]=ch+i+1; } else { str[len]='0'+i; str[len+1]='\0'; } } return addToken(str, false);//it is non-terminal for sure } int Grammar::copyRule(int source, int target) { int i=0; while (production[source][i]!=-1) { production[target][i]=production[source][i]; i++; } production[target][i]=-1; return i; } void Grammar::doAddRule(int tIndex) { token[tIndex]->production[token[tIndex]->count++]=++prodIndex; } void Grammar::addRule(const char* str) { int index; index=addToken(str); production[prodIndex][curPos++]=index; production[prodIndex][curPos]=-1;//set end } //if the token is already there, it return the index //otherwise, it add new node in token array int Grammar::addToken(const char* str, bool isTerminal) { int index; index=tokenIndex(str); if (index==-1) { index=tokenCount; } else { return index; } token[index]=createToken(str, isTerminal); tokenCount++; if (strcmp(str, emptyStr)==0) { token[index]->isNull=true; } return index; } void Grammar::newRule(const char* str) { if (str!=NULL) { curIndex=addToken(str, false); } //add one pointer token[curIndex]->production[token[curIndex]->count++]=++prodIndex; token[curIndex]->terminal=false; curPos=0;//reset to restart; } Token* Grammar::createToken(const char* theName, bool isTerminal) { Token* ptr=new Token; ptr->name=new char[strlen(theName)+1]; strcpy(ptr->name, theName); ptr->terminal=isTerminal; ptr->count=ptr->firstCount=ptr->followCount=0; ptr->isNull=false; return ptr; } int Grammar::tokenIndex(const char* str) { for (int i=0; i<tokenCount; i++) { if (strcmp(str, token[i]->name)==0) { return i; } } return -1; } int Grammar::checkRecursion(int tIndex, int curToken) { for (int i=0; i<token[curToken]->count; i++) { //token[tIndex]->production[i]=ruleIndex //production[ruleIndex][0] is the first left-recursion one if (production[token[curToken]->production[i]][0]<=curToken) { return i; } } return -1; } void Grammar::printRule(int index) { int nodeIndex=0; while (production[index][nodeIndex]!=-1) { cout<<token[production[index][nodeIndex]]->name<<" "; nodeIndex++; } } void Grammar::initialize() { tokenCount=curIndex=curPos=0; prodIndex=-1;//in order to ++ blindly } Grammar::Grammar() { initialize(); } void Grammar::removeLeftRecursion() { int tIndex=0, curToken=0; bool newChange=false; while (tIndex<tokenCount) { if (!token[tIndex]->terminal) { for (int i=0; i<token[tIndex]->count; i++) { curToken=production[token[tIndex]->production[i]][0]; if (curToken<=tIndex&&!token[curToken]->terminal) { if (curToken!=tIndex) { replaceRule(tIndex, i); } else { removeImmediate(tIndex, i); } newChange=true; } } } //whenever there is some new findings, restart if (!newChange) { tIndex++; } else { tIndex=0; newChange=false; } } } void Grammar::replaceRule(int tIndex, int ruleIndex) { int pos, j, targetEnd, sourceEnd, curRule; int curToken=production[token[tIndex]->production[ruleIndex]][0]; for (int i=token[curToken]->count-1; i>=0; i--) { if (i>0) { doAddRule(tIndex); pos=copyRule(token[curToken]->production[i], prodIndex); j=0; while (production[token[tIndex]->production[ruleIndex]][j+1]!=-1) { production[prodIndex][pos+j]= production[token[tIndex]->production[ruleIndex]][j+1]; j++; } production[prodIndex][pos+j]=-1; } else { targetEnd=sourceEnd=0; curRule=token[tIndex]->production[ruleIndex]; while (true) { if (production[token[curToken]->production[0]][sourceEnd]==-1&& production[curRule][targetEnd]==-1) { break; } if (production[token[curToken]->production[0]][sourceEnd]!=-1) { sourceEnd++; } if (production[curRule][targetEnd]!=-1) { targetEnd++; } } j=targetEnd+sourceEnd-1; while (j>=0) { if (j>=sourceEnd) { production[curRule][j]=production[curRule][j-sourceEnd+1]; } else { production[curRule][j]=production[token[curToken]->production[0]][j]; } j--; } } } } //optimize sequence is first common-factor then remove left recursion //therefore I don't have to check if for same variable if there will be //more than one left-recursion void Grammar::optimize() { removeLeftRecursion(); commonLeftFactor(); } int Grammar::findCommonFactor(int tIndex, int ruleIndex) { for (int i=ruleIndex+1; i<token[tIndex]->count; i++) { //if the two rule has the same first token if (production[token[tIndex]->production[ruleIndex]][0]== production[token[tIndex]->production[i]][0]) { /* //calculate if there is epsilon if (emptyIndex==-1) { emptyIndex=tokenIndex(emptyStr); } //if it is not epsilon if (production[token[tIndex]->production[i]][0]!=emptyIndex) { return i; } */ return i; } } return -1; } void Grammar::epsilonRule(int ruleIndex) { production[ruleIndex][0]=addToken(emptyStr); production[ruleIndex][1]=-1; } //sample: x -> Aa // x -> Ab //changed to: x -> Ax' //this is to change the old rule // x' -> b //this is to change the old rule // x' -> a //this is the new-added rule void Grammar::doCommonFactor(int tIndex, int ruleIndex, int index) { int newTokenIndex=forgeToken(tIndex);//create a new variable //move the second and after part to the new rule of new variable //doing: x' -> a //curPos=0; //prepare to add one new rule addRuleForToken(newTokenIndex, ++prodIndex); //token[newTokenIndex]->production[token[newTokenIndex]->count++]=++prodIndex; token[newTokenIndex]->terminal=false; copyRule(token[tIndex]->production[ruleIndex], prodIndex); shiftRule(prodIndex, false, 1); /* do { //do copying production[prodIndex][curPos]= production[token[tIndex]->production[ruleIndex]][curPos+1]; curPos++; //even the -1 at end is copied }while (production[token[tIndex]->production[ruleIndex]][curPos]!=-1); */ //in order to show an empty string, in case the string is "epsilon" /* if (curPos==1) { epsilonRule(prodIndex); } */ //replace x -> Aa with x -> Ax' production[token[tIndex]->production[ruleIndex]][1]=newTokenIndex; production[token[tIndex]->production[ruleIndex]][2]=-1;//end //doing: x' -> b //curPos=0; //prepare to add one new rule //pointing new token to where old rule lies addRuleForToken(newTokenIndex, token[tIndex]->production[index]); /* token[newTokenIndex]->production[token[newTokenIndex]->count++]= token[tIndex]->production[index]; */ shiftRule(token[tIndex]->production[index], false, 1); /* do { //left-shift productions production[token[tIndex]->production[index]][curPos]= production[token[tIndex]->production[index]][curPos+1]; curPos++; }while (production[token[tIndex]->production[index]][curPos]!=-1); */ //add epsilon /* if (curPos==1) { epsilonRule(token[tIndex]->production[index]); } */ //remove from token's production array the rule-index----"index" //shrink the array by one /* for (int i=index; i<token[tIndex]->count; i++) { token[tIndex]->production[i]=token[tIndex]->production[i+1]; } token[tIndex]->count--; */ removeRuleFromToken(tIndex, index); } void Grammar::removeRuleFromToken(int tIndex, int ruleIndex) { for (int i=ruleIndex; i<token[tIndex]->count; i++) { token[tIndex]->production[i]=token[tIndex]->production[i+1]; } token[tIndex]->count--; } void Grammar::commonLeftFactor() { int index=-1, tIndex=0, ruleIndex=0; bool flag; //whenever a newrule is done, restart! while (tIndex<tokenCount) { ruleIndex=0; flag=false; while (ruleIndex<token[tIndex]->count) { index=findCommonFactor(tIndex, ruleIndex); if (index!=-1) { doCommonFactor(tIndex, ruleIndex, index); //restart flag=true; break; } else { ruleIndex++; } } if (flag) { tIndex=0; } else { tIndex++; } } } Token* Grammar::operator[](int index) { if (index>=0&&index<tokenCount) { return token[index]; } else { return NULL; } } void Grammar::print() { for (int i=0; i<tokenCount; i++) { if (!token[i]->terminal) { cout<<token[i]->name<<" ==> "; for (int j=0; j<token[i]->count; j++) { printRule(token[i]->production[j]); if (j!=token[i]->count-1) { cout<<" | "; } } cout<<"\n"; } } }