编译原理:第七节 及词法分析器的C++和Python实现
2015-09-28 20:33
861 查看
编译原理:词法分析
概述:
词法分析是编译程序第一个阶段的工作。所谓词法分析,就是对以字符串形式输入的源程序按顺序进行扫描,识别出其中的单词符号并作为输出。词法分析器的作用就是完成这个阶段的工作。词法分析器是所有编译器所必需的。例如:
这是Python的架构,我们可以看到Scanner,它所做的就是进行词法分析。
举个简单的C/C++的例子,来直观的看一下词法分析器的功能:
我们看到词法分析器识别出了关键字、标识符、整数和一些特殊的符号,并以二元组的形式输出。我们使用的编译器的词法分析和这个在原理上是一致的,只是复杂了很多。
词法分析器的实现:
首先,构造识别单词的状态转换图;然后,编程实现状态转换图。
例如我们用DFA来表示可识别的单词:
我们把它们进行合并,如图:
利用此图我们就可以识别标识符、无符号整数、分界符和运算符,接下来就可以编程来实现。从本质上说,我们就是利用状态转移:规定开始时为0状态(即图中的S状态),读入一个字符,如果是字母,则状态变为1;如果是数字,则状态变为2……出口定义为结束状态,可以用一个整数值来表示,若状态为终结状态则输出。上图的部分内容用代码表示如下:
GETNEXTCHAR( );
SWITCH (CHCODE)
{
    CASE 1:
    {
        WHILE (ISLETTER OR ISDIGIT) DO
        {
            SAVE( );          // 当前字符放入一临时字符数组
            GETNEXTCHAR( );   // 从缓冲区取下一字符
        };
        UNGETCH;              // 回退一字符
        OUTPUT(1, 标识符名字);
    };
    BREAK;
    CASE 2:
    {
        WHILE (ISDIGIT) DO
        {
            SAVE( );          // 当前字符放入一临时字符数组
            GETNEXTCHAR( );   // 从缓冲区取下一字符
        };
        UNGETCH;              // 回退一字符
        OUTPUT(2, 整数);
    };
    BREAK;
    ……                        // 其余状态类似,此处省略
}
下面给出一个词法分析器的完整代码:
/*
 * DFA-driven lexical analyser for a tiny language.
 *
 * The program reads a source file one character at a time and prints one
 * "(category,lexeme)" pair per token, followed by a summary of the "//"
 * comments and of the unrecognised characters it met.
 *
 * Usage: lexer [input-file]      (default input: N:\wang.txt)
 *
 * How the automaton is driven: every pass of the main loop reads exactly
 * one character and then runs the code of the current state.  Accepting
 * states therefore always have one or two characters of look-ahead in hand
 * and give them back by seeking the stream backwards.
 *
 * NOTE(review): because look-ahead characters are read more than once, a
 * '\n' is normally counted twice; reported line numbers are derived as
 * line / 2 and are only approximate.
 * NOTE(review): the stream is a text stream and is repositioned with
 * computed offsets, which ISO C does not guarantee for text streams; this
 * works where text offsets are plain byte offsets.
 */
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAXSIZE 100                 /* lexeme buffer size and report table capacity */
#define RESWORD 9                   /* number of reserved words */
#define DEFAULT_INPUT "N:\\wang.txt" /* input used when no path is given */

/* One unrecognised character and the (approximate) line it was found on. */
struct Error_message
{
    int line_number;
    char error_char;
};

/* One "//" comment and the (approximate) line it was found on. */
struct Annotate_message
{
    int line_number;
    char annotate[MAXSIZE];
};

static int state = 0;            /* current DFA state */
static int line = 0;             /* number of '\n' characters read so far (re-reads included) */
static int error_count = 0;      /* unrecognised characters seen */
static int annotate_count = 0;   /* comments seen */
static char chserious[MAXSIZE];  /* lexeme being assembled, always NUL-terminated */
static const char ResWord[][10] = {"int", "if", "then", "else", "end",
                                   "repeat", "until", "read", "write"};
static struct Error_message message[MAXSIZE];
static struct Annotate_message annotateMessage[MAXSIZE];

/* Return true when str is one of the reserved words. */
bool CheckRes(const char *str)
{
    for (int i = 0; i < RESWORD; i++) {
        if (strcmp(str, ResWord[i]) == 0) {
            return true;
        }
    }
    return false;
}

/*
 * Move the read position of fp back by count characters.  This also clears
 * the end-of-file indicator.  The automaton cannot continue if the stream
 * cannot be repositioned, so a failure terminates the program.
 */
static void unread(FILE *fp, long count)
{
    long pos = ftell(fp);

    if (pos < count || fseek(fp, pos - count, SEEK_SET) != 0) {
        fprintf(stderr, "cannot step back %ld character(s) in the input\n", count);
        exit(EXIT_FAILURE);
    }
}

/* Print one special-symbol token. */
static void emit_symbol(const char *symbol)
{
    printf("(特殊符号,%s)\n", symbol);
}

/*
 * Accepting state of a token that needed no extra look-ahead: print it and
 * give back the single character read on entry to this state.
 */
static void accept_symbol(FILE *fp, const char *symbol, int ch)
{
    emit_symbol(symbol);
    if (ch != EOF) {
        unread(fp, 1L);
    }
    state = 0;
}

/*
 * Accepting state of a one-character operator that was recognised only
 * after looking at the following character: print it and give back both the
 * look-ahead character and the character read on entry to this state.  At
 * end of input only the look-ahead character exists, so only one character
 * is given back.
 */
static void accept_short_operator(FILE *fp, const char *symbol, int ch)
{
    emit_symbol(symbol);
    unread(fp, ch == EOF ? 1L : 2L);
    state = 0;
}

int main(int argc, char **argv)
{
    const char *path = (argc > 1) ? argv[1] : DEFAULT_INPUT;
    FILE *file = fopen(path, "r");
    int ch;        /* int, not char, so that EOF stays distinguishable from data */
    int last = 0;  /* index of the last character stored in chserious */

    if (file == NULL) {
        perror(path);
        return EXIT_FAILURE;
    }

    while (!feof(file)) {
        ch = fgetc(file);
        if (ch == '\n') {
            line++;
        }
        switch (state) {
        case 0: /* start state: classify the first character of a token */
            last = 0;
            chserious[0] = (char)ch;
            if (isalpha(ch)) state = 1;
            else if (isdigit(ch)) state = 3;
            else if (ch == '+') state = 5;
            else if (ch == '-') state = 9;
            else if (ch == '*') state = 13;
            else if (ch == '/') state = 16;
            else if (ch == '=') state = 20;
            else if (ch == '<') state = 21;
            else if (ch == '{') state = 22;
            else if (ch == '}') state = 23;
            else if (ch == ';') state = 24;
            else if (ch != EOF) {
                if (!isspace(ch)) {
                    /* Remember as many offenders as the table holds; count all. */
                    if (error_count < MAXSIZE) {
                        message[error_count].line_number = line / 2;
                        message[error_count].error_char = (char)ch;
                    }
                    error_count++;
                }
                state = 25;
            }
            break;

        case 1: /* inside an identifier or keyword */
            while (isalpha(ch) || isdigit(ch)) {
                /* Overlong lexemes are truncated so the terminator survives. */
                if (last < MAXSIZE - 2) {
                    last++;
                    chserious[last] = (char)ch;
                }
                ch = fgetc(file);
            }
            if (ch == '\n') {
                line++;
            }
            /* Give back the terminator and one more character for state 2 to consume. */
            unread(file, ch == EOF ? 1L : 2L);
            state = 2;
            break;

        case 2: /* identifier or keyword complete */
            if (CheckRes(chserious)) {
                printf("(关键字,%s)\n", chserious);
            } else {
                printf("(标识符,%s)\n", chserious);
            }
            state = 0;
            last = 0;
            memset(chserious, '\0', sizeof(chserious));
            break;

        case 3: /* inside an unsigned integer */
            while (isdigit(ch)) {
                if (last < MAXSIZE - 2) {
                    last++;
                    chserious[last] = (char)ch;
                }
                ch = fgetc(file);
            }
            unread(file, ch == EOF ? 1L : 2L);
            state = 4;
            break;

        case 4: /* unsigned integer complete */
            if (ch == '\n') {
                line++;
            }
            printf("(数,%s)\n", chserious);
            state = 0;
            last = 0;
            memset(chserious, '\0', sizeof(chserious));
            break;

        case 5: /* seen '+' */
            if (ch == '+') state = 6;
            else if (ch == '=') state = 7;
            else state = 8;
            break;
        case 6:
            accept_symbol(file, "++", ch);
            break;
        case 7:
            accept_symbol(file, "+=", ch);
            break;
        case 8:
            accept_short_operator(file, "+", ch);
            break;

        case 9: /* seen '-' */
            if (ch == '-') state = 10;
            else if (ch == '=') state = 11;
            else state = 12;
            break;
        case 10:
            accept_symbol(file, "--", ch);
            break;
        case 11:
            accept_symbol(file, "-=", ch);
            break;
        case 12:
            accept_short_operator(file, "-", ch);
            break;

        case 13: /* seen '*' */
            if (ch == '=') state = 14;
            else state = 15;
            break;
        case 14:
            accept_symbol(file, "*=", ch);
            break;
        case 15:
            accept_short_operator(file, "*", ch);
            break;

        case 16: /* seen '/' */
            if (ch == '/') state = 17;
            else if (ch == '=') state = 18;
            else state = 19;
            break;
        case 17: { /* seen "//": ch is already the first character of the comment */
            int j = 0;
            struct Annotate_message *note =
                (annotate_count < MAXSIZE) ? &annotateMessage[annotate_count] : NULL;

            emit_symbol("//");
            state = 0;
            if (note != NULL) {
                note->line_number = line / 2;
            }
            /* The comment runs to the end of the line; overlong text is truncated. */
            while (ch != '\n' && ch != EOF) {
                if (note != NULL && j < MAXSIZE - 1) {
                    note->annotate[j] = (char)ch;
                    j++;
                }
                ch = fgetc(file);
            }
            /* The newline is consumed here and never re-read, so count it twice. */
            line += 2;
            annotate_count++;
            break;
        }
        case 18:
            accept_symbol(file, "/=", ch);
            break;
        case 19:
            accept_short_operator(file, "/", ch);
            break;

        case 20:
            accept_symbol(file, "=", ch);
            break;
        case 21:
            accept_symbol(file, "<", ch);
            break;
        case 22:
            accept_symbol(file, "{", ch);
            break;
        case 23:
            accept_symbol(file, "}", ch);
            break;
        case 24:
            accept_symbol(file, ";", ch);
            break;

        case 25: /* after a whitespace or unrecognised character */
            /*
             * NOTE(review): ch is the character FOLLOWING the one that led
             * here, so a blank is reported only when that following
             * character is whitespace.
             */
            if (isspace(ch)) {
                printf("(特殊符号,空格)\n");
            }
            if (ch != EOF) {
                unread(file, 1L);
            }
            state = 0;
            break;

        default:
            break;
        }
    }

    /*
     * If the input ends directly after '+', '-', '*' or '/', the loop stops
     * before the accepting state has run; report that operator now.
     */
    switch (state) {
    case 8:
        emit_symbol("+");
        break;
    case 12:
        emit_symbol("-");
        break;
    case 15:
        emit_symbol("*");
        break;
    case 19:
        emit_symbol("/");
        break;
    default:
        break;
    }

    fclose(file);

    printf("注释内容的个数为:%d 分别为:\n", annotate_count);
    for (int j = 0; j < annotate_count && j < MAXSIZE; j++) {
        printf("(%d,%s)\n", annotateMessage[j].line_number, annotateMessage[j].annotate);
    }
    printf("不可识别的字符个数为:%d 分别为:\n", error_count);
    for (int i = 0; i < error_count && i < MAXSIZE; i++) {
        printf("(%d,%c)\n", message[i].line_number, message[i].error_char);
    }
    return 0;
}
下面我用Python重写了这个词法分析器,更加简洁,每次读入文件一行进行处理(上面C++版本每次读入一个字符),代码如下:
# -*- coding: utf-8 -*-
'''
Python implementation of a DFA-based lexical analyser.
Author: Wang Can, 2015-09-27, China University of Mining and Technology
'''


class DFA:
    """Line-oriented lexical analyser for a tiny language.

    The analyser walks every line of the input and recognises identifiers,
    reserved words, unsigned integers, the operators
    ``+ ++ += - -- -= * *= / /= = <``, the delimiters ``{ } ;`` and ``//``
    comments.  Results are collected as ready-to-print strings:

    * ``char_message``     -- ``'(category,lexeme)'`` for every token,
    * ``annotate_message`` -- ``'(注释,text)'`` for every ``//`` comment,
    * ``error_message``    -- ``'(行号:N,c)'`` for every unrecognised character.

    Each token is recognised by longest match with one character of
    look-ahead, and recognition always finishes inside the current line, so
    a token that ends at the end of a line is reported like any other.
    """

    # Class-level defaults; __init__ gives every instance its own values.
    file_object = ''       # iterable yielding the source lines
    line_number = 0        # number of lines processed so far
    state = 0              # kept for compatibility; 0 whenever no token is in progress
    ResWord = ['int', 'if', 'then', 'else', 'end', 'repeat', 'until', 'read', 'write']  # reserved words
    error_message = []     # unrecognised characters
    annotate_message = []  # comments
    char_message = []      # recognised tokens

    def __init__(self, file_name):
        """Create an analyser.

        file_name -- despite its name, an iterable that yields the source
                     lines, e.g. an open text file or a list of strings.
        """
        self.file_object = file_name
        self.state = 0
        self.line_number = 0
        self.error_message = []
        self.annotate_message = []
        self.char_message = []

    def Start_convert(self):
        """Tokenise the whole input, one line at a time."""
        for line in self.file_object:
            line = line.strip('\n')  # drop the line terminator
            self.line_number += 1
            self._scan_line(line)
            self.state = 0           # no token spans a line break

    def _emit(self, category, lexeme):
        """Record one recognised token."""
        self.char_message.append('(' + category + ',' + lexeme + ')')

    def _scan_line(self, line):
        """Recognise every token of one line (without its terminator)."""
        length = len(line)
        i = 0
        while i < length:
            ch = line[i]
            if ch.isalpha():
                # Identifier or reserved word: a letter followed by letters/digits.
                end = i + 1
                while end < length and (line[end].isalpha() or line[end].isdigit()):
                    end += 1
                word = line[i:end]
                if word in self.ResWord:
                    self._emit('关键字', word)
                else:
                    self._emit('标识符', word)
                i = end
            elif ch.isdigit():
                # Unsigned integer: a run of digits.
                end = i + 1
                while end < length and line[end].isdigit():
                    end += 1
                self._emit('数字', line[i:end])
                i = end
            elif ch in '+-*/':
                # One character of look-ahead decides between x, xx, x= and //.
                following = line[i + 1] if i + 1 < length else ''
                if ch == '/' and following == '/':
                    self._emit('特殊符号', '//')
                    # Everything after "//" is comment text, not tokens.
                    self.annotate_message.append('(注释,' + line[i + 2:] + ')')
                    return
                if following == '=' or (following == ch and ch in '+-'):
                    self._emit('特殊符号', ch + following)
                    i += 2
                else:
                    self._emit('特殊符号', ch)
                    i += 1
            elif ch in '=<{};':
                self._emit('特殊符号', ch)
                i += 1
            elif ch.isspace():
                i += 1  # whitespace only separates tokens
            else:
                self.error_message.append(
                    '(行号:' + str(self.line_number) + ',' + ch + ')')
                i += 1

    def Get_error(self):
        """Return the list of unrecognised-character messages."""
        return self.error_message

    def Get_annotate(self):
        """Return the list of comment messages."""
        return self.annotate_message

    def Get_char(self):
        """Return the list of recognised-token messages."""
        return self.char_message


'''
*****************Usage example************************
try:
    file_object = open("N:\\wang4.txt")
    dfa = DFA(file_object)
    dfa.Start_convert()
    content = dfa.Get_char()
    for item in content:
        print item
    content = dfa.Get_annotate()
    for item in content:
        print item
    content = dfa.Get_error()
    for item in content:
        print item
finally:
    file_object.close()
'''
利用面向对象的机制,我们很容易做到数据,数据处理和GUI端的分离,便于代码复用和重构,把上面Python代码做成一个简单的GUI如下:
相关文章推荐
- 十进制转换为二进制序列,并输出1的个数,和序列的奇偶序列
- C/C++中结构体的区别
- opencv提取视频帧,将多张图片组合成视频(C++)
- C++拷贝构造函数
- 跑马
- 水
- 用if语句将三个数按从大到小的顺序输出
- c++友元
- C++书目
- C和C++头文件的不同
- c++中char[], char*, string的用法初学
- 快速排序算法
- C++类及使用 概念及易忽略点小结
- C++类及使用 概念及易忽略点小结
- C语言第11课
- c++模版
- Winxp虚拟机(含VC++及C++书籍资料)下载说明
- 【转载,排版美化】c++中string的七个构造函数
- C++伪(pseudo)随机数生成及简单应用
- 最长公共子序列