您的位置:首页 > 编程语言 > C语言/C++

编译原理:第七节 及词法分析器的C++和Python实现

2015-09-28 20:33 861 查看

编译原理:词法分析

概述:

词法分析是编译程序第一个阶段的工作。所谓词法分析,就是对以字符串形式输入的源程序按顺序进行扫描,识别出其中的单词(token)并作为输出。词法分析器的作用
就是完成这个阶段的工作。词法分析器是所有编译器所必需的。例如:



这是Python的架构,我们可以看到Scanner,它所做的就是进行词法分析。
举个简单的C/C++的例子,来直观的看一下词法分析器的功能:



我们看到词法分析器识别出了关键字,标识符,整数和一些特殊的符号,并以二元组的形式输出。我们使用的编译器的词法分析和这个原理上是一致的,只是复杂了很多。

词法分析器的实现:

首先,构造识别单词的状态转换图
然后,编程实现状态转换图

例如我们用DFA来表示可识别的单词:



我们把它们进行合并,如图:



利用此图我们就可以识别标识符,无符号整数,分界符,运算符。接下来我们就可以编程来实现。其实本质来说我们就是利用状态转移,我们可以规定开始时为0状态(即图中的S状态)读入一个字符,如果是字母,则状态变为1,是数字则状态变为2........出口定义为结束状态,可以用一个整数值来表示,若状态为终结状态则输出。上图部分用代码表示:
GETNEXTCHAR( ) ;
SWITCH(CHCODE)
{
CASE   1: { WHILE  (ISLETTER OR ISDIGIT) DO
{
SAVE( ); //  当前字符放入一临时字符数组;
GETNEXTCHAR( ) ;//从缓冲区取下一字符
};
UNGETCH( );//回退一字符
OUTPUT(1,标识符名字);
};BREAK;
CASE   2: { WHILE  (ISDIGIT) DO
{
SAVE( ); //  当前字符放入一临时字符数组;
GETNEXTCHAR( ) ;//从缓冲区取下一字符
};
UNGETCH( );//回退一字符
OUTPUT(2,整数);
};BREAK;
......//其余状态依此类推
}


下面给出一个词法分析器的完整代码:
#include<stdio.h>
#include<stdlib.h>
#include<ctype.h>
#include<memory.h>
#include<string.h>
#define MAXSIZE 100 // capacity of the lexeme buffer and of each report table
#define RESWORD 9   // number of reserved words stored in ResWord
int state = 0; // current DFA state; 0 is the start state
// Position in chserious of the last character stored for the current lexeme.
// NOTE(review): some C libraries (e.g. glibc) declare a legacy index() function in
// <string.h>, which this global may clash with -- confirm on the target toolchain.
int index = 0;
// Most recently read character. Declared int rather than char because fgetc()
// returns an int: a signed char cannot tell EOF apart from a 0xFF byte, and an
// unsigned char can never compare equal to EOF, which breaks every `ch == EOF` test.
int ch;
char chserious[MAXSIZE]; // lexeme buffer; its terminator comes from the zero-filled tail
long ll; // scratch file offset obtained from ftell() before stepping back
int line = 0; // incremented whenever a '\n' is read; reports store line/2
int error_count = 0; // number of unrecognised characters recorded
int annotate_count = 0; // number of comments recorded
char ResWord[][10] = {"int","if","then","else","end","repeat","until","read","write"}; // reserved words
// Input stream, opened during static initialisation; NULL when the file cannot be opened.
FILE *file = fopen("N:\\wang.txt","rt+");
// One unrecognised character found in the input.
struct Error_message
{
    int line_number;  // line value stored when the character was read
    char error_char;  // the character that matched no token rule
};
// One `//` comment found in the input.
struct Annotate_message
{
    int line_number;         // line value stored when the comment was read
    char annotate[MAXSIZE];  // comment text, without the leading "//"
};
// Reports whether `str` is one of the reserved words listed in ResWord.
//
// str: NUL-terminated lexeme to look up; it is only read, never modified.
// Returns true on an exact, case-sensitive match; false otherwise, including
// when `str` is NULL.
bool CheckRes(const char *str)
{
    if (str == NULL)
        return false;

    // Derive the entry count from the table itself so that the loop bound
    // cannot drift from the initialiser list.
    const size_t count = sizeof(ResWord) / sizeof(ResWord[0]);
    for (size_t i = 0; i < count; i++)
    {
        if (strcmp(str, ResWord[i]) == 0)
            return true;
    }
    return false;
}
// Table of unrecognised characters; main() appends to it while scanning and prints it at the end.
struct Error_message message[MAXSIZE];
// Table of `//` comments; main() appends to it while scanning and prints it at the end.
struct Annotate_message annotateMessage[MAXSIZE];
int main()
{
while(!feof(file))
{
ch = fgetc(file);
if(ch == '\n')
line++;
switch(state)
{
case 0:
index = 0;
chserious[0] = ch;
if(isalpha(ch))
state = 1;
else if(isdigit(ch))
state = 3;
else if(ch == '+')
state = 5;
else if(ch == '-')
state = 9;
else if(ch == '*')
state = 13;
else if(ch == '/')
state = 16;
else if(ch == '=')
state = 20;
else if(ch == '<')
state = 21;
else if(ch == '{')
state = 22;
else if(ch == '}')
state = 23;
else if(ch == ';')
state = 24;
else if(ch != EOF)
{
if(!isspace(ch)&&(ch!='\n'))
{
message[error_count].line_number = line/2;
message[error_count].error_char = ch;
error_count++;
}
state = 25;
}
break;

case 1:
while(isalpha(ch)||isdigit(ch))
{
index ++;
chserious[index] = ch;
ch = fgetc(file);
}
ll = ftell(file);
if(ch == '\n')
line++;
if(ch == EOF)
fseek(file,ll-1L, SEEK_SET);//回退
else
fseek(file,ll-2L, SEEK_SET);//回退
state = 2;
break;
case 2:
if(CheckRes(chserious))
printf("(关键字,%s)\n",chserious);
else
printf("(标识符,%s)\n",chserious);
state = 0;
index = 0;
memset(chserious,'\0',sizeof(chserious));
break;
case 3:
while(isdigit(ch))
{
index++;
chserious[index] = ch;
ch = fgetc(file);
}
ll = ftell(file);
if(ch == EOF)
fseek(file,ll-1L, SEEK_SET);//回退
else
fseek(file,ll-2L, SEEK_SET);//回退
state = 4;
break;
case 4:
if(ch == '\n')
line++;
printf("(数,%s)\n",chserious);
state = 0;
index = 0;
memset(chserious,'\0',sizeof(chserious));
break;
case 5:
if(ch == '+')
state = 6;
else if(ch == '=')
state = 7;
else
state = 8;

break;
case 6:
printf("(特殊符号,++)\n");
ll = ftell(file);
if(ch!=EOF)
fseek(file,ll-1L, SEEK_SET);//回退
state = 0;
break;
case 7:
printf("(特殊符号,+=)\n");
ll = ftell(file);
//printf("%ld",ll);
if(ch!=EOF)
fseek(file,ll-1L, SEEK_SET);//回退

state = 0;
break;
case 8:
printf("(特殊符号,+)\n");
ll = ftell(file);
fseek(file,ll-2L, SEEK_SET);//回退
state = 0;
break;
case 9:
if(ch == '-')
state = 10;
else if(ch == '=')
state = 11;
else
state = 12;
break;
case 10:
printf("(特殊符号,--)\n");
ll = ftell(file);
if(ch!=EOF)
fseek(file,ll-1L, SEEK_SET);//回退
state = 0;
break;
case 11:
printf("(特殊符号,-=)\n");
ll = ftell(file);
if(ch!=EOF)
fseek(file,ll-1L, SEEK_SET);//回退
state = 0;
break;
case 12:
printf("(特殊符号,-)\n");
ll = ftell(file);
fseek(file,ll-2L, SEEK_SET);//回退
state = 0;
break;
case 13:
if(ch == '=')
state = 14;
else
state = 15;
break;
case 14:
printf("(特殊符号,*=)\n");
ll = ftell(file);
if(ch!=EOF)
fseek(file,ll-1L, SEEK_SET);//回退
state = 0;
break;
case 15:
printf("(特殊符号,*)\n");
ll = ftell(file);
fseek(file,ll-2L, SEEK_SET);//回退
state = 0;
break;
case 16:
if(ch == '/')
state = 17;
else if(ch == '=')
state = 18;
else
state = 19;
break;

case 17:
{
printf("(特殊符号,//)\n");
ll = ftell(file);
fseek(file,ll-1L, SEEK_SET);//回退
state = 0;

ch = fgetc(file);
//printf("line:%d\n",line/2);
annotateMessage[annotate_count].line_number = line/2;
int j = 0;
while(ch!='\n'&&ch!=EOF)
{
//printf("---------%c******\n",ch);
annotateMessage[annotate_count].annotate[j] = ch;
ch = fgetc(file);
j++;
}
line+=2;
annotate_count++;
//exit(0);

break;
}
case 18:
printf("(特殊符号,/=)\n");
ll = ftell(file);
if(ch!=EOF)
fseek(file,ll-1L, SEEK_SET);//回退
state = 0;
break;
case 19:
printf("(特殊符号,/)\n");
ll = ftell(file);
fseek(file,ll-2L, SEEK_SET);//回退
state = 0;
break;
case 20:
printf("(特殊符号,=)\n");
if(ch != EOF)
{
ll = ftell(file);
fseek(file,ll-1L, SEEK_SET);//回退
}
state = 0;
break;
case 21:
printf("(特殊符号,<)\n");
if(ch != EOF)
{
ll = ftell(file);
fseek(file,ll-1L, SEEK_SET);//回退
}
state = 0;
break;
case 22:
printf("(特殊符号,{)\n");
if(ch != EOF)
{
ll = ftell(file);
fseek(file,ll-1L, SEEK_SET);//回退
}
state = 0;
break;
case 23:
printf("(特殊符号,})\n");
if(ch != EOF)
{
ll = ftell(file);
fseek(file,ll-1L, SEEK_SET);//回退
}
state = 0;
break;
case 24:
printf("(特殊符号,;)\n");
if(ch != EOF)
{
ll = ftell(file);
fseek(file,ll-1L, SEEK_SET);//回退
}
state = 0;
break;
case 25:
if(isspace(ch))
printf("(特殊符号,空格)\n");
//printf("****%c****\n",ch);
if(ch != EOF)
{
ll = ftell(file);
fseek(file,ll-1L, SEEK_SET);//回退
}
state = 0;
default:
break;
}
}
fclose(file);
//printf("%d\n",state);
//printf("%s\n",chserious);
printf("注释内容的个数为:%d 分别为:\n",annotate_count);
for(int j=0;j<annotate_count;j++)
{
printf("(%d,%s)\n",annotateMessage[j].line_number,annotateMessage[j].annotate);
}
printf("不可识别的字符个数为:%d 分别为:\n",error_count);
for(int i=0;i<error_count;i++)
{
printf("(%d,%c)\n",message[i].line_number,message[i].error_char);
}
return 0;
}


下面我用Python重写了这个词法分析器,更加简洁,每次读入文件一行进行处理(上面C++版本每次读入一个字符),代码如下:

# -*- coding: cp936 -*-
'''
DFA有限自动机Python实现
作者:王灿
2015-9-27于中国矿业大学
'''
class DFA:
    '''
    Line-oriented lexical analyser for a small C-like language.

    Recognised tokens: reserved words and identifiers (a letter followed by
    letters or digits), unsigned integers, the operators
    + ++ += - -- -= * *= / /= = < { } ; and "//" line comments.
    Whitespace is skipped; any other character is recorded as an error.

    Results are collected as formatted strings and read back through
    Get_char(), Get_annotate() and Get_error().
    '''
    file_object = ''  # iterable that yields the source text line by line
    line_number = 0   # 1-based number of the line currently being scanned
    state = 0         # kept for compatibility; the scanner is always back at start state 0
    ResWord = ['int', 'if', 'then', 'else', 'end', 'repeat', 'until', 'read', 'write']  # reserved words
    error_message = []     # entries of the form '(行号:<line>,<char>)'
    annotate_message = []  # entries of the form '(注释,<text>)'
    char_message = []      # entries of the form '(<category>,<lexeme>)'

    # For each operator that can start a two-character token: the characters
    # that may follow it. '//' is handled separately because it starts a comment.
    _COMPOUND_SECOND = {'+': '+=', '-': '-=', '*': '=', '/': '='}

    def __init__(self, file_name):
        '''
        file_name: despite its name, an iterable of lines (e.g. an open file
        object); it is iterated once by Start_convert().
        '''
        self.file_object = file_name
        self.state = 0
        self.line_number = 0
        # Per-instance lists, so that two analysers never share results.
        self.error_message = []
        self.annotate_message = []
        self.char_message = []

    def Start_convert(self):
        '''Scan every line of the input and collect tokens, comments and errors.'''
        for line in self.file_object:
            line = line.strip('\n')
            self.line_number += 1
            self._scan_line(line)

    def _scan_line(self, line):
        '''
        Tokenise one line that no longer carries its newline.

        Every token is recognised completely inside the line, so a token that
        ends exactly at the end of the line is emitted like any other and no
        scanner state leaks into the next line.
        '''
        length = len(line)
        i = 0
        while i < length:
            ch = line[i]
            if ch.isalpha():
                # Identifier or reserved word: letter (letter | digit)*
                end = i + 1
                while end < length and (line[end].isalpha() or line[end].isdigit()):
                    end += 1
                word = line[i:end]
                if word in self.ResWord:
                    self.char_message.append('(关键字,' + word + ')')
                else:
                    self.char_message.append('(标识符,' + word + ')')
                i = end
            elif ch.isdigit():
                # Unsigned integer: digit+
                end = i + 1
                while end < length and line[end].isdigit():
                    end += 1
                self.char_message.append('(数字,' + line[i:end] + ')')
                i = end
            elif ch in self._COMPOUND_SECOND:
                pair = line[i:i + 2]
                if pair == '//':
                    # The rest of the line is comment text and must not be
                    # tokenised again.
                    self.char_message.append('(特殊符号,//)')
                    self.annotate_message.append('(注释,' + line[i + 2:] + ')')
                    break
                if len(pair) == 2 and pair[1] in self._COMPOUND_SECOND[ch]:
                    symbol = pair
                else:
                    symbol = ch
                self.char_message.append('(特殊符号,' + symbol + ')')
                i += len(symbol)
            elif ch in '=<{};':
                self.char_message.append('(特殊符号,' + ch + ')')
                i += 1
            elif ch.isspace():
                i += 1
            else:
                self.error_message.append('(行号:' + str(self.line_number) + ',' + ch + ')')
                i += 1

    def Get_error(self):
        '''Return the list of recorded unrecognised characters.'''
        return self.error_message

    def Get_annotate(self):
        '''Return the list of recorded comments.'''
        return self.annotate_message

    def Get_char(self):
        '''Return the list of recognised tokens in source order.'''
        return self.char_message
'''
*****************测试内容************************
try:
file_object = open("N:\\wang4.txt")
dfa = DFA(file_object)
dfa.Start_convert()
content = dfa.Get_char()
for item in content:
print item
content = dfa.Get_annotate()
for item in content:
print item
content = dfa.Get_error()
for item in content:
print item
finally:
file_object.close()
'''


利用面向对象的机制,我们很容易做到数据,数据处理和GUI端的分离,便于代码复用和重构,把上面Python代码做成一个简单的GUI如下:

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: