www.digitalmars.com         C & C++   DMDScript  

digitalmars.D.learn - Recursive-descent parsing

reply Andrew Edwards <edwards.ac gmail.com> writes:
The authors of "The Art of Java" present, as a first coding 
example, a recursive-descent parser to demonstrate Java's ability 
to facilitate low level programming commonly performed in C and 
C++.

I took the opportunity to port the code to D. By doing this, I 
now have an understanding of how a recursive-descent parser is 
written and works. What I am now interested in, is learning about 
what makes this particular implementation a poor or strong one, 
how to improve upon it and other, possibly superior, 
implementations made possible by D.

Comments are invited and appreciated.

Andrew Edwards
If at first you don't succeed...

=========================
rdp2.d
=========================
//module rdp2;

class ParserException : Throwable
{
     this(string msg)
     {
         super(msg);
     }
}

// These are the token types.
enum
{
     NONE,
     DELIMITER,
     VARIABLE,
     NUMBER
}

// These are the types of syntax errors.
enum
{
     SYNTAX,
     UNBALPARENS,
     NOEXP,
     DIVBYZERO
}

// This token indecates end-of-expression.
enum EOE = ";";

string exp;     // refers to expression string
int ndx;     // current index into the expression
string token;   // holds current token
int tokType;    // holds token's types

// Array for variables.
double[] vars = new double[26];

// Parser entry poing.
double evaluate(string expstr)
{
     double result;
     exp = expstr;
     ndx = 0;

     getToken();
     if (token == EOE)
         handleErr(NOEXP); // no expression present

     // Parse and evaluate the expression.
     result = evalExp1();

     if (token != EOE)
         handleErr(SYNTAX); // last token must be EOE

     return result;
}

// Process an assignment.
double evalExp1()
{
     double result;
     int varNdx;
     int ttokType;
     string tmpToken;

     if (tokType == VARIABLE)
     {
         // save old token
         tmpToken = token;
         ttokType = tokType;

         // Compute the index of the variable.
         import std.ascii: toUpper;
         varNdx = token[0].toUpper - 'A';

         getToken();
         if (token != "=")
         {
             putBack(); // return current token
             // restore old token -- not an assignment
             token = tmpToken;
             tokType = ttokType;
         }
         else
         {
             getToken(); // get next part of exp
             result = evalExp2();
             vars[varNdx] = result;
             return result;
         }
     }
     return evalExp2();
}

// Add or subtract two terms
double evalExp2()
{
     char op;
     double result;
     double partialResult;

     result = evalExp3();

     while ((op = token[0]) == '+' || op == '-')
     {
         getToken();
         partialResult = evalExp3();
         final switch (op)
         {
             case '-':
             {
                 result -= partialResult;
                 break;
             }
             case '+':
             {
                 result += partialResult;
                 break;
             }
         }
     }
     return result;
}

// Multiply or divide two factors
double evalExp3()
{
     char op;
     double result;
     double partialResult;

     result = evalExp4();

     while ((op = token[0]) == '*' || op == '/' || op == '%')
     {
         getToken();
         partialResult = evalExp4();
         final switch (op)
         {
             case '*':
             {
                 result *= partialResult;
                 break;
             }
             case '/':
             {
                 if (partialResult == 0.0)
                     handleErr(DIVBYZERO);
                 result /= partialResult;
                 break;
             }
             case '%':
             {
                 if (partialResult == 0.0)
                     handleErr(DIVBYZERO);
                 result %= partialResult;
                 break;
             }
         }
     }
     return result;
}

// Process an exponent.
double evalExp4()
{
     double result;
     double partialResult;
     double ex;

     result = evalExp5();

     if (token == "^")
     {
         getToken();
         partialResult = evalExp4();
         ex = result;
         if (partialResult == 0.0)
         {
             result = 1.0;
         }
         else
         {
             for (int t = cast(int)partialResult-1; t > 0; t--)
                 result *= ex;
         }
     }
     return result;
}

// Evaluate a unary + or -
double evalExp5()
{
     double result;
     string op;

     op = null;
     if ((tokType == DELIMITER) && token == "+" || token == "-")
     {

         op = token;
         getToken();
     }
     result = evalExp6();

     //if (op == "-") result = -result;

     return (op == "-") ? -result : result;
}

// Process a parenthesized expression.
double evalExp6()
{
     double result;

     if (token == "(")
     {
         getToken();
         result = evalExp2();
         if (token != ")")
             handleErr(UNBALPARENS);
         getToken();
     }
     else
         result = atom();

     return result;
}

// Get the value of a number.
double atom()
{
     double result = 0.0;

     switch (tokType)
     {
         case NUMBER:
             try
             {
                 import std.conv: parse;
                 result = token.parse!double;
             }
             catch (Exception e)
             {
                 handleErr(SYNTAX);
             }
             getToken();
             break;
         case VARIABLE:
             result = findVar(token);
             getToken();
             break;
         default:
             handleErr(SYNTAX);
             break;
     }
     return result;
}

// Return the value of a variable.
double findVar(string vname)
{
     import std.ascii: isAlpha, toUpper;
     if (!vname[0].isAlpha)
     {
         handleErr(SYNTAX);
         return 0.0;
     }

     return vars[(vname[0] - 'A').toUpper];
}

// Return a token to the input stream.
void putBack()
{
     if (token == EOE) return;
     foreach(i; 0 .. token.length) ndx--;
}

// Handle an error.
void handleErr(int error)
{
     string[] err = [
         "Syntax Error",
         "Unbalanced Parentheses",
         "No Expression Present",
         "Division by Zero"
     ];

     throw new ParserException(err[error]);
}

// Obtain the next token.
void getToken()
{
     import std.ascii: isAlpha, isDigit, isWhite;
     tokType = NONE;
     token = null;

     // Check for end of expression
     if (ndx == exp.length)
     {
         token = EOE.dup;
         return;
     }

     // Skip over white space
     while (ndx < exp.length && exp[ndx].isWhite) ++ndx;

     // Trailing whitespace ends expression.
     if (ndx == exp.length)
     {
         token = EOE.dup;
         return;
     }

     if (exp[ndx].isDelim) // is operator
     {
         token ~= exp[ndx];
         ndx++;
         tokType = DELIMITER;
     }
     else if (exp[ndx].isAlpha) // is variable
     {
         while (!exp[ndx].isDelim)
         {
             token ~= exp[ndx];
             ndx++;
             if (ndx >= exp.length) break;
         }
         tokType = VARIABLE;
     }
     else if (exp[ndx].isDigit) // is number
     {
         while (!exp[ndx].isDelim)
         {
             token ~= exp[ndx];
             ndx++;
             if (ndx >= exp.length) break;

         }
         tokType = NUMBER;
     }
     else // unknown character terminates expression
     {
         token = EOE.dup;
         return;
     }
}

// Returns true if c is a delimiter.
bool isDelim(char c)
{
     import std.algorithm: canFind;
     return canFind(" +-/*%^=()", c);
}

=========================
demordp.d
=========================
mixin(import("rdp2.d"));

void main(string[] args)
{
     string expr;

     import std.stdio: write, writeln, readln;
     writeln("Enter an empty expression to stop.");

     for(;;)
     {
         write("Enter an expression: ");
         expr = readln();
         if (expr == "\n") break;
         try
         {
             writeln("Result: ", expr.evaluate);
         }
         catch (ParserException e)
         {
             e.msg.writeln;
         }
     }
}
Dec 24 2016
parent Stefan Koch <uplink.coder googlemail.com> writes:
On Saturday, 24 December 2016 at 12:42:31 UTC, Andrew Edwards 
wrote:
 The authors of "The Art of Java" present, as a first coding 
 example, a recursive-descent parser to demonstrate Java's 
 ability to facilitate low level programming commonly performed 
 in C and C++.

 I took the opportunity to port the code to D. By doing this, I 
 now have an understanding of how a recursive-descent parser is 
 written and works. What I am now interested in, is learning 
 about what makes this particular implementation a poor or 
 strong one, how to improve upon it and other, possibly 
 superior, implementations made possible by D.

 Comments are invited and appreciated.

 Andrew Edwards
 If at first you don't succeed...

 =========================
 rdp2.d
 =========================
 //module rdp2;

 class ParserException : Throwable
 {
     this(string msg)
     {
         super(msg);
     }
 }

 // These are the token types.
 enum
 {
     NONE,
     DELIMITER,
     VARIABLE,
     NUMBER
 }

 // These are the types of syntax errors.
 enum
 {
     SYNTAX,
     UNBALPARENS,
     NOEXP,
     DIVBYZERO
 }

 // This token indecates end-of-expression.
 enum EOE = ";";

 string exp;     // refers to expression string
 int ndx;     // current index into the expression
 string token;   // holds current token
 int tokType;    // holds token's types

 // Array for variables.
 double[] vars = new double[26];

 // Parser entry poing.
 double evaluate(string expstr)
 {
     double result;
     exp = expstr;
     ndx = 0;

     getToken();
     if (token == EOE)
         handleErr(NOEXP); // no expression present

     // Parse and evaluate the expression.
     result = evalExp1();

     if (token != EOE)
         handleErr(SYNTAX); // last token must be EOE

     return result;
 }

 // Process an assignment.
 double evalExp1()
 {
     double result;
     int varNdx;
     int ttokType;
     string tmpToken;

     if (tokType == VARIABLE)
     {
         // save old token
         tmpToken = token;
         ttokType = tokType;

         // Compute the index of the variable.
         import std.ascii: toUpper;
         varNdx = token[0].toUpper - 'A';

         getToken();
         if (token != "=")
         {
             putBack(); // return current token
             // restore old token -- not an assignment
             token = tmpToken;
             tokType = ttokType;
         }
         else
         {
             getToken(); // get next part of exp
             result = evalExp2();
             vars[varNdx] = result;
             return result;
         }
     }
     return evalExp2();
 }

 // Add or subtract two terms
 double evalExp2()
 {
     char op;
     double result;
     double partialResult;

     result = evalExp3();

     while ((op = token[0]) == '+' || op == '-')
     {
         getToken();
         partialResult = evalExp3();
         final switch (op)
         {
             case '-':
             {
                 result -= partialResult;
                 break;
             }
             case '+':
             {
                 result += partialResult;
                 break;
             }
         }
     }
     return result;
 }

 // Multiply or divide two factors
 double evalExp3()
 {
     char op;
     double result;
     double partialResult;

     result = evalExp4();

     while ((op = token[0]) == '*' || op == '/' || op == '%')
     {
         getToken();
         partialResult = evalExp4();
         final switch (op)
         {
             case '*':
             {
                 result *= partialResult;
                 break;
             }
             case '/':
             {
                 if (partialResult == 0.0)
                     handleErr(DIVBYZERO);
                 result /= partialResult;
                 break;
             }
             case '%':
             {
                 if (partialResult == 0.0)
                     handleErr(DIVBYZERO);
                 result %= partialResult;
                 break;
             }
         }
     }
     return result;
 }

 // Process an exponent.
 double evalExp4()
 {
     double result;
     double partialResult;
     double ex;

     result = evalExp5();

     if (token == "^")
     {
         getToken();
         partialResult = evalExp4();
         ex = result;
         if (partialResult == 0.0)
         {
             result = 1.0;
         }
         else
         {
             for (int t = cast(int)partialResult-1; t > 0; t--)
                 result *= ex;
         }
     }
     return result;
 }

 // Evaluate a unary + or -
 double evalExp5()
 {
     double result;
     string op;

     op = null;
     if ((tokType == DELIMITER) && token == "+" || token == "-")
     {

         op = token;
         getToken();
     }
     result = evalExp6();

     //if (op == "-") result = -result;

     return (op == "-") ? -result : result;
 }

 // Process a parenthesized expression.
 double evalExp6()
 {
     double result;

     if (token == "(")
     {
         getToken();
         result = evalExp2();
         if (token != ")")
             handleErr(UNBALPARENS);
         getToken();
     }
     else
         result = atom();

     return result;
 }

 // Get the value of a number.
 double atom()
 {
     double result = 0.0;

     switch (tokType)
     {
         case NUMBER:
             try
             {
                 import std.conv: parse;
                 result = token.parse!double;
             }
             catch (Exception e)
             {
                 handleErr(SYNTAX);
             }
             getToken();
             break;
         case VARIABLE:
             result = findVar(token);
             getToken();
             break;
         default:
             handleErr(SYNTAX);
             break;
     }
     return result;
 }

 // Return the value of a variable.
 double findVar(string vname)
 {
     import std.ascii: isAlpha, toUpper;
     if (!vname[0].isAlpha)
     {
         handleErr(SYNTAX);
         return 0.0;
     }

     return vars[(vname[0] - 'A').toUpper];
 }

 // Return a token to the input stream.
 void putBack()
 {
     if (token == EOE) return;
     foreach(i; 0 .. token.length) ndx--;
 }

 // Handle an error.
 void handleErr(int error)
 {
     string[] err = [
         "Syntax Error",
         "Unbalanced Parentheses",
         "No Expression Present",
         "Division by Zero"
     ];

     throw new ParserException(err[error]);
 }

 // Obtain the next token.
 void getToken()
 {
     import std.ascii: isAlpha, isDigit, isWhite;
     tokType = NONE;
     token = null;

     // Check for end of expression
     if (ndx == exp.length)
     {
         token = EOE.dup;
         return;
     }

     // Skip over white space
     while (ndx < exp.length && exp[ndx].isWhite) ++ndx;

     // Trailing whitespace ends expression.
     if (ndx == exp.length)
     {
         token = EOE.dup;
         return;
     }

     if (exp[ndx].isDelim) // is operator
     {
         token ~= exp[ndx];
         ndx++;
         tokType = DELIMITER;
     }
     else if (exp[ndx].isAlpha) // is variable
     {
         while (!exp[ndx].isDelim)
         {
             token ~= exp[ndx];
             ndx++;
             if (ndx >= exp.length) break;
         }
         tokType = VARIABLE;
     }
     else if (exp[ndx].isDigit) // is number
     {
         while (!exp[ndx].isDelim)
         {
             token ~= exp[ndx];
             ndx++;
             if (ndx >= exp.length) break;

         }
         tokType = NUMBER;
     }
     else // unknown character terminates expression
     {
         token = EOE.dup;
         return;
     }
 }

 // Returns true if c is a delimiter.
 bool isDelim(char c)
 {
     import std.algorithm: canFind;
     return canFind(" +-/*%^=()", c);
 }

 =========================
 demordp.d
 =========================
 mixin(import("rdp2.d"));

 void main(string[] args)
 {
     string expr;

     import std.stdio: write, writeln, readln;
     writeln("Enter an empty expression to stop.");

     for(;;)
     {
         write("Enter an expression: ");
         expr = readln();
         if (expr == "\n") break;
         try
         {
             writeln("Result: ", expr.evaluate);
         }
         catch (ParserException e)
         {
             e.msg.writeln;
         }
     }
 }
All of the performance characteristics crucially depend on the language you want to parse. In general you want to match as many tokes at as you can in a single parse-tree node. Since that will reduce the number of function calls you have to make. You want to preallocate buffers. Use the append-functionality sparingly since that involves a call into a shared library. Avoid or encapsulate global state if you can since it can get tricky to mange. Other then that there is not much general advice I can give. Try to tag your parse-nodes since then you can match patterns using a nested switch, which is one of the cleanest routes to do it.
Dec 24 2016