您的位置:首页 > 编程语言 > C语言/C++

标准纯C++实现简单的词法分析器(三)

2006-03-29 10:10 756 查看
状态机实现:
/*
 * File-local character classification helpers.
 *
 * The <cctype> functions are only defined for EOF and for values that fit in
 * an unsigned char, so every call goes through a cast. EOF is never a digit,
 * letter or blank.
 */
static bool scanIsDigit(int c)
{
    return c != EOF && ::isdigit(static_cast<unsigned char>(c)) != 0;
}

static bool scanIsIdentStart(int c)
{
    return c == '_' || (c != EOF && ::isalpha(static_cast<unsigned char>(c)) != 0);
}

static bool scanIsSpace(int c)
{
    return c != EOF && ::isspace(static_cast<unsigned char>(c)) != 0;
}

/**
 * Scanner::nextToken
 *
 * Primary scanner interface: returns the next token read from the source
 * file (list_of_source).
 *
 * If a token was pushed back (m_pushed is set) the flag is cleared and the
 * stored m_token is returned again without reading any input. Otherwise a
 * small state machine consumes characters through getNextChar() until one
 * token is complete, fills in m_token.type and m_token.str, reports lexical
 * errors through outputMsg() and, when TraceScan is set, records a trace line
 * through insert_list().
 *
 * Returns a reference to the member m_token; its contents are overwritten by
 * the next call that actually scans.
 *
 * author: lonelyforest
 * date: 2006.03.16
 */
//-----------------------------------------------------------------------------
Token& Scanner::nextToken()
{
    /* A token was pushed back: hand the stored one out again. */
    if (m_pushed)
    {
        m_pushed = false;
        return m_token;
    }

    /* Identifiers and numbers keep at most this many characters; any further
       characters of the lexeme are consumed but not stored. */
    const int maxLexemeLength = 43;
    int lexemeLength = 0;

    stateType state = s_START;
    m_token.str = "";
    m_token.type = k_NONE;

    while (state != s_DONE)
    {
        /* Held as int so that EOF stays distinguishable from real characters. */
        int c = getNextChar();
        int t = 0;          /* one character of lookahead */
        bool save = false;  /* append c to the lexeme after the switch? */

        switch (state)
        {
        case s_START:
            if (scanIsDigit(c))
            {
                save = true;
                state = s_INNUM;
            }
            else if (scanIsIdentStart(c))
            {
                save = true;
                state = s_INID;
            }
            else if (c == '=')
            {
                state = s_INASSIGN;
            }
            else if (scanIsSpace(c))
            {
                state = s_START;    /* skip blanks */
            }
            else if (c == '<')
            {
                state = s_INL;
            }
            else if (c == '>')
            {
                state = s_ING;
            }
            else
            {
                /* Single-character tokens finish here; the comment cases
                   below override the state again. */
                state = s_DONE;
                switch (c)
                {
                case EOF:
                    m_token.type = k_EOF;
                    m_token.str = "EOF";
                    break;
                case '+':
                    m_token.type = PLUS;
                    m_token.str = "+";
                    break;
                case '-':
                    m_token.type = MINUS;
                    m_token.str = "-";
                    break;
                case '*':
                    m_token.type = TIMES;
                    m_token.str = "*";
                    break;
                case '(':
                    m_token.type = LPARAN;
                    m_token.str = "(";
                    break;
                case ')':
                    m_token.type = RPARAN;
                    m_token.str = ")";
                    break;
                case '{':
                    m_token.type = LBRACE;
                    m_token.str = "{";
                    break;
                case '}':
                    m_token.type = RBRACE;
                    m_token.str = "}";
                    break;
                case ',':
                    m_token.type = COMMA;
                    m_token.str = ",";
                    break;
                case ';':
                    m_token.type = SEMI;
                    m_token.str = ";";
                    break;
                case '[':
                    m_token.type = LSQUARE;
                    m_token.str = "[";
                    break;
                case ']':
                    m_token.type = RSQUARE;
                    m_token.str = "]";
                    break;

                case '/':   /* division, or the start of a comment */
                    t = getNextChar();
                    if (t == '*')
                    {
                        /* C style comment */
                        state = s_INCOMMENT;
                    }
                    else if (t == '/')
                    {
                        /* C++ style comment: discard up to and including the
                           end of the line (or stop at end of file). */
                        c = t;
                        while (c != '\n' && c != EOF)
                        {
                            c = getNextChar();
                        }
                        state = s_START;
                    }
                    else
                    {
                        /* Not a comment: plain division, give t back. */
                        m_token.type = DIV;
                        m_token.str = "/";
                        unGetNextChar();
                    }
                    break;

                case '!':   /* only valid as part of "!=" */
                    t = getNextChar();
                    if (t == '=')
                    {
                        m_token.type = NEQ;
                        m_token.str = "!=";
                    }
                    else
                    {
                        /* NOTE(review): unlike the default branch this path
                           does not call add_err() - confirm whether that is
                           intended. */
                        m_token.type = k_ERROR;
                        m_token.str = "!";
                        unGetNextChar();
                    }
                    break;

                default:
                    add_err();
                    m_token.type = k_ERROR;
                    m_token.str = static_cast<char>(c);
                    break;
                } /* end inner switch */
            }
            break; /* end case s_START */

        case s_INCOMMENT:
            t = getNextChar();
            if (c != EOF && t != EOF)
            {
                if (c == '*' && t == '/')
                {
                    /* closing marker found */
                    state = s_START;
                }
                else
                {
                    /* t may itself begin the closing marker: re-read it. */
                    unGetNextChar();
                }
            }
            else
            {
                /* End of file reached while the comment is still open. */
                outputMsg(-1, "maybe comment end before code !");
                add_err();
                m_token.type = k_NONE;
                m_token.str = "--> comment unexpected end before code !";
                state = s_DONE;
            }
            break; /* end case s_INCOMMENT */

        case s_INASSIGN:    /* = or == */
            state = s_DONE;
            if (c == '=')
            {
                m_token.type = EQ;
                m_token.str = "==";
            }
            else
            {
                m_token.type = ASSIGN;
                m_token.str = "=";
                unGetNextChar();
            }
            break;

        case s_INL:         /* < or <= */
            state = s_DONE;
            if (c == '=')
            {
                m_token.type = NGT;
                m_token.str = "<=";
            }
            else
            {
                m_token.type = LT;
                m_token.str = "<";
                unGetNextChar();
            }
            break;

        case s_ING:         /* > or >= */
            state = s_DONE;
            if (c == '=')
            {
                m_token.type = NLT;
                m_token.str = ">=";     /* keep the lexeme in step with the type */
            }
            else
            {
                m_token.type = GT;
                m_token.str = ">";
                unGetNextChar();
            }
            break;

        case s_INNUM:       /* integer literal */
            if (scanIsDigit(c))
            {
                save = true;
            }
            else
            {
                /* c belongs to the next token: back up in the input. */
                unGetNextChar();
                state = s_DONE;
                m_token.type = k_NUM;
            }
            break;

        case s_INID:        /* identifier or reserved word */
            if (scanIsIdentStart(c) || scanIsDigit(c))
            {
                save = true;
            }
            else
            {
                /* c belongs to the next token: back up in the input. */
                unGetNextChar();
                state = s_DONE;
                m_token.type = k_ID;
            }
            break;

        case s_DONE:        /* the loop condition excludes this state */
        default:            /* should never happen */
            /* NOTE(review): msg_temp is declared outside this function; its
               capacity is not visible here, so sprintf is left as it was. */
            sprintf(msg_temp, "Scanner Bug : State = %d", state);
            outputMsg(lineno(), msg_temp);

            add_err();
            m_token.type = k_ERROR;
            m_token.str = msg_temp;

            state = s_DONE;
            break;
        }

        if (save && lexemeLength < maxLexemeLength)
        {
            ++lexemeLength;
            m_token.str += static_cast<char>(c);
        }
    }

    /* An identifier may turn out to be a reserved word. */
    if (m_token.type == k_ID)
    {
        m_token.type = reservedLookup(m_token.str);
    }

    /* Error reporting and optional scan trace. */
    if (m_token.type == k_ERROR)
    {
        string msg = "unknow or unsuported symbol ----> '";
        msg += m_token.str + "'";
        outputMsg(lineno(), msg.c_str());

        if (TraceScan)
        {
            sprintf(msg_temp, "\t%d: Error: unexpected or unsuported symbol--> '%s'\n", lineno(), m_token.str.c_str());
            insert_list(msg_temp);
        }
    }
    else if (TraceScan && (m_token.type != k_EOF))
    {
        string outmsg;

        sprintf(msg_temp, "\t%d: ", lineno());
        outmsg = msg_temp;
        switch (m_token.type)
        {
        case k_ID:
            outmsg += "ID, name = ";
            break;
        case k_NUM:
            outmsg += "NUM, val = ";
            break;
        case k_ELSE: case k_IF:
        case k_WHILE: case k_READ:
        case k_WRITE: case k_INT:
        case k_RETURN: case k_VOID:
            outmsg += "reserve word: ";
            break;
        case k_ERROR:
            outmsg += m_token.str;
            outmsg += ", Scanner Bug !";
            break;
        case k_NONE:
            outmsg += "Bug!";
            break;
        default:
            break;
        }

        outmsg += m_token.str;
        outmsg += '\n';
        Tokenizer::insert_list(outmsg.c_str());
    }

    return m_token;
}

// 注: 本人编译环境: windows XP(SP2), VC 6, VC 7.1, Intel C++ 8.0, DEV-CPP 4.9.9.2 均通过测试。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: