您的位置：首页 > 编程语言 > C语言/C++

压缩C++简单程序词法分析后的文件（编译原理实验）

2011-04-13 15:22 513 查看

继续完成前面一篇“设计有穷自动机DFA实现C++简单程序的词法分析、扫描（编译原理实验）”中词法分析扫描程序剩下的功能：去除多余的空行、空格和注释，对源程序进行压缩。

按实验要求（如下），这里需要考虑下面带星号*的第（3）（5）点:

实验中用到的C++源程序如下图：

思路：

其实也就是将源程序中的多余空格、注释、换行等都删除，整理成单单一行的源代码。

每次对扫描程序获取到的Token进行判断，根据上一个Token的类型（有关键字、标识符、数值、字符串、特殊符号）决定当前Token是否能够与上一个Token紧邻，也即不加任何空格。

例如上面截图中倒数第二行中的 else 和 cout 两个关键字之间就必须有空格分开，否则代码就会出错了。针对上面这个简单的C++源程序，观察其DFA图可以得出以下特点：

1、关键字与标识符不能紧邻，例如 int i中间必须有空格

2、关键字与关键字也不能紧邻，如上所述

3、另外关键字与字符串也不要紧邻

对于以上样例输入，先进行词法分析，然后将获得的Token压缩并保存在StringBuilder对象中，再写入到一个新的文件，最后再次对压缩后的文件进行扫描，判断压缩前后的扫描结果是否一致。

程序输出结果（包括压缩后的源代码）如下：

根据上面这三个特点，代码实现如下（高亮部分是与上一篇源代码不同之处）：

package lexical_analysis;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.PrintWriter;

/**
 * DFA-based scanner for a small subset of C++ that also "compresses" the
 * scanned program: comments, blank lines and redundant whitespace are dropped
 * and the remaining tokens are concatenated onto a single line.
 *
 * <p>Typical use (see {@link #main(String[])}): scan a source file, write the
 * compressed text with {@link #printCompressedFile(String)}, then scan the
 * compressed file again to compare the token streams.
 *
 * <p>Known limitations, kept from the original design:
 * <ul>
 *   <li>The output is one single line, so preprocessor directives are not
 *       terminated by a newline.</li>
 *   <li>{@code ++} and {@code --} are scanned as two one-character tokens.</li>
 *   <li>Character literals are not treated as a unit, so whitespace inside
 *       them is dropped.</li>
 * </ul>
 */
public class Scanner_2 {

    /** States of the DFA that drives {@link #getToken()}. */
    private enum State {
        Start, Num, ID, EQ, NE, NM, NL, Coms, LineCom, MulCom1, MulCom2, Str, StrEsc, Done
    }

    /** Category of an emitted token; {@code Initial} means nothing was emitted yet. */
    private enum TokenType {
        Initial, ID, Num, Special, Str, KeyWord
    }

    /** Words reported as keywords. */
    private final String[] keyWords = {
        "include", "define", "iostream", "int", "float", "double",
        "main", "if", "else", "for", "while", "do", "goto", "switch",
        "case", "static", "cin", "cout"
    };

    /** Punctuation and comparison/stream operators reported as special symbols. */
    private final String[] special = {
        "{", "}", "[", "]", "(", ")",
        "#", ",", ".", ";", ":", "\\",
        "'", "\"", ">>", "<<", "!=", "=",
        "==", "<=", ">=", "++", "--",
        "<", ">", "!"
    };

    /** Arithmetic operators. */
    private final String[] arithmetic = {"+", "-", "*", "/", "%"};

    /** Reader over the file currently being scanned; only valid inside {@link #scanning(String)}. */
    private BufferedReader sourceFile;

    /** Category of the most recently emitted token. */
    private TokenType preType = TokenType.Initial;
    /** Compressed source text produced by the most recent scan. */
    private final StringBuilder compressedStr = new StringBuilder();

    /** Number of characters in the current line (excluding the line terminator). */
    private int bufSize = 0;
    /** Current line as returned by {@code readLine()}, or {@code null} before the first read. */
    private String eachLine;
    /** Characters of the current line. */
    private char[] lineBuf = new char[0];
    /** 1-based number of the current line. */
    private int lineNum = 0;
    /**
     * Index of the next character to hand out. Index {@code bufSize} stands for
     * the line terminator, so valid values are {@code 0 .. bufSize + 1}.
     */
    private int charPos = 0;
    /** Set once the reader is exhausted. */
    private boolean isEOF = false;

    /**
     * Resets all per-scan state so the same instance can scan several files.
     */
    private void initial() {
        bufSize = 0;
        lineNum = 0;
        charPos = 0;
        eachLine = null;
        lineBuf = new char[0];
        isEOF = false;
        preType = TokenType.Initial;
        compressedStr.setLength(0);
    }

    /**
     * Scans the given file from start to end, printing every token and
     * rebuilding the compressed text.
     *
     * @param originalFile path of the file to scan
     * @throws Exception if the file cannot be opened or read
     */
    private void scanning(String originalFile) throws Exception {
        BufferedReader reader = new BufferedReader(new FileReader(originalFile));
        try {
            this.sourceFile = reader;
            this.initial();
            while (!isEOF) {
                getToken();
            }
        } finally {
            this.sourceFile = null;
            reader.close();
        }
        System.out.println("========================> end scanning ...");
    }

    /**
     * Returns the next character of the input.
     *
     * <p>{@code readLine()} strips line terminators, so a {@code '\n'} is
     * handed out after the last character of every line (including empty
     * lines and the final line). This keeps tokens from running across line
     * boundaries and lets line comments end.
     *
     * @return the next character, or {@code '\0'} once the end of the file is
     *         reached (in which case {@code isEOF} is set)
     * @throws Exception if reading fails
     */
    private char getNextChar() throws Exception {
        if (isEOF) {
            return '\0';
        }
        if (eachLine == null || charPos > bufSize) {
            eachLine = sourceFile.readLine();
            if (eachLine == null) {
                isEOF = true;
                return '\0';
            }
            lineNum++;
            System.out.println(lineNum + ": " + eachLine);
            lineBuf = eachLine.toCharArray();
            bufSize = lineBuf.length;
            charPos = 0;
        }
        char nextChar = (charPos < bufSize) ? lineBuf[charPos] : '\n';
        charPos++;
        return nextChar;
    }

    /**
     * Pushes back the last {@code step} characters of the current line.
     * Has no effect once the end of the file has been reached.
     *
     * @param step number of characters to push back
     */
    private void unGetNextChar(int step) {
        if (!isEOF) {
            charPos = Math.max(0, charPos - step);
        }
    }

    /**
     * Scans and reports the next token, skipping whitespace and comments.
     *
     * @return the token text, or an empty string if the end of the file was
     *         reached without finding another token
     * @throws Exception if reading fails
     */
    private String getToken() throws Exception {
        StringBuilder text = new StringBuilder();
        State state = State.Start;

        while (state != State.Done) {
            char c = getNextChar();
            if (isEOF) {
                // Nothing more to read; whatever was collected is flushed below.
                break;
            }
            boolean save = true;

            switch (state) {
            case Start:
                if (isDigit(c)) {
                    state = State.Num;
                } else if (isLetter(c) || c == '_' || c == '.') {
                    // '.' may start a token so that "iostream.h" scans as "iostream" + ".h".
                    state = State.ID;
                } else if (Character.isWhitespace(c)) {
                    save = false;
                } else if (c == '!') {
                    state = State.NE;
                } else if (c == '=') {
                    state = State.EQ;
                } else if (c == '<') {
                    state = State.NM;
                } else if (c == '>') {
                    state = State.NL;
                } else if (c == '/') {
                    // Could be a comment or a division; decided by the next character.
                    state = State.Coms;
                    save = false;
                } else if (c == '"') {
                    state = State.Str;
                } else {
                    // Any other character is a one-character token.
                    state = State.Done;
                }
                break;
            case Num:
                if (!isDigit(c)) {
                    state = State.Done;
                    unGetNextChar(1);
                    save = false;
                }
                break;
            case ID:
                if (!isWordChar(c)) {
                    state = State.Done;
                    unGetNextChar(1);
                    save = false;
                }
                break;
            case NE:
            case EQ:
                // "!=" / "==" when followed by '=', otherwise the single character.
                state = State.Done;
                if (c != '=') {
                    unGetNextChar(1);
                    save = false;
                }
                break;
            case NM:
                // "<=" or "<<", otherwise a single '<'.
                state = State.Done;
                if (c != '=' && c != '<') {
                    unGetNextChar(1);
                    save = false;
                }
                break;
            case NL:
                // ">=" or ">>", otherwise a single '>'.
                state = State.Done;
                if (c != '=' && c != '>') {
                    unGetNextChar(1);
                    save = false;
                }
                break;
            case Str:
                if (c == '\\') {
                    // The next character is escaped and must not end the string.
                    state = State.StrEsc;
                } else if (c == '"') {
                    state = State.Done;
                }
                break;
            case StrEsc:
                state = State.Str;
                break;
            case Coms:
                save = false;
                if (c == '/') {
                    state = State.LineCom;
                } else if (c == '*') {
                    state = State.MulCom1;
                } else {
                    // Not a comment: the '/' read earlier is a division operator.
                    text.append('/');
                    unGetNextChar(1);
                    state = State.Done;
                }
                break;
            case LineCom:
                save = false;
                if (c == '\n') {
                    // Comment finished; keep looking for a real token.
                    state = State.Start;
                }
                break;
            case MulCom1:
                save = false;
                if (c == '*') {
                    state = State.MulCom2;
                }
                break;
            case MulCom2:
                save = false;
                if (c == '/') {
                    state = State.Start;
                } else if (c != '*') {
                    state = State.MulCom1;
                }
                break;
            default:
                throw new IllegalStateException(
                        "line " + lineNum + ": unexpected scanner state " + state);
            }

            if (save) {
                text.append(c);
            }
        }

        if (text.length() == 0) {
            return "";
        }
        String token = text.toString();
        printToken(token);
        return token;
    }

    /**
     * @param c character to test
     * @return {@code true} if {@code c} is an ASCII letter
     */
    private boolean isLetter(char c) {
        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
    }

    /**
     * @param c character to test
     * @return {@code true} if {@code c} is an ASCII digit
     */
    private boolean isDigit(char c) {
        return '0' <= c && c <= '9';
    }

    /**
     * @param c character to test
     * @return {@code true} if {@code c} can appear inside an identifier or number
     */
    private boolean isWordChar(char c) {
        return isLetter(c) || isDigit(c) || c == '_';
    }

    /**
     * @param token token text
     * @return {@code true} if the token is non-empty and consists of digits only
     */
    private boolean isNum(String token) {
        if (token.length() == 0) {
            return false;
        }
        for (int i = 0; i < token.length(); i++) {
            if (!isDigit(token.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * @param token token text
     * @return {@code true} if the token is one of the special symbols
     */
    private boolean isSpecial(String token) {
        return contains(special, token);
    }

    /**
     * @param token token text
     * @return {@code true} if the token is an arithmetic operator
     */
    private boolean isArithmetic(String token) {
        return contains(arithmetic, token);
    }

    /**
     * @param token token text
     * @return {@code true} if the token is a keyword
     */
    private boolean isKeyWord(String token) {
        return contains(keyWords, token);
    }

    /** Linear membership test used by the token classifiers. */
    private static boolean contains(String[] table, String token) {
        for (int i = 0; i < table.length; i++) {
            if (table[i].equals(token)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Decides whether a blank must be written before {@code token} so that the
     * compressed text scans back into the same tokens.
     *
     * @param token text of the token about to be appended
     * @param type  category of that token
     * @return {@code true} if a separating blank is required
     */
    private boolean needsSpaceBefore(String token, TokenType type) {
        if (compressedStr.length() == 0 || token.length() == 0) {
            return false;
        }
        char prev = compressedStr.charAt(compressedStr.length() - 1);
        char next = token.charAt(0);
        // Two word-like tokens would fuse into one ("int" + "i", "else" + "cout").
        if (isWordChar(prev) && isWordChar(next)) {
            return true;
        }
        // A keyword is kept apart from a following string literal.
        if (preType == TokenType.KeyWord && type == TokenType.Str) {
            return true;
        }
        // "/" followed by "/" or "*" would otherwise open a comment.
        return prev == '/' && (next == '/' || next == '*');
    }

    /**
     * Prints the token with its category and appends it to the compressed text.
     *
     * @param token token text
     */
    private void printToken(String token) {
        TokenType type;
        String label;
        if (isKeyWord(token)) {
            type = TokenType.KeyWord;
            label = "关键字";
        } else if (isSpecial(token)) {
            type = TokenType.Special;
            label = "特殊符号";
        } else if (isArithmetic(token)) {
            type = TokenType.Special;
            label = "算术运算符";
        } else if (isNum(token)) {
            type = TokenType.Num;
            label = "数值";
        } else if (token.startsWith("\"")) {
            type = TokenType.Str;
            label = "字符串";
        } else {
            type = TokenType.ID;
            label = "标识符";
        }
        System.out.printf("%4d: %s --- %s\n", lineNum, token, label);

        if (needsSpaceBefore(token, type)) {
            compressedStr.append(' ');
        }
        compressedStr.append(token);
        preType = type;
    }

    /**
     * Prints the compressed source produced by the most recent scan and writes
     * it to the given file.
     *
     * @param compressedFile path of the file to create or overwrite
     * @throws Exception if the file cannot be created or written
     */
    public void printCompressedFile(String compressedFile) throws Exception {
        System.out.println(this.compressedStr);
        PrintWriter writer = new PrintWriter(
                new FileOutputStream(new File(compressedFile)));
        try {
            writer.write(this.compressedStr.toString());
            // PrintWriter swallows I/O errors; checkError() flushes and reports them.
            if (writer.checkError()) {
                throw new java.io.IOException("failed to write " + compressedFile);
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Demo: scans {@code cppSrc.cpp}, writes the compressed program to
     * {@code afterCompressed.cpp} and scans that file again.
     *
     * @param args unused
     * @throws Exception if any file cannot be read or written
     */
    public static void main(String[] args) throws Exception {
        Scanner_2 scanner = new Scanner_2();

        System.out.println("扫描未压缩源代码文件 >> ");
        scanner.scanning("cppSrc.cpp");

        System.out.println("\n压缩之后的源代码 >> ");
        scanner.printCompressedFile("afterCompressed.cpp");

        System.out.println("\n扫描压缩后的源代码文件 >> ");
        scanner.scanning("afterCompressed.cpp");
    }
}

内容来自用户分享和网络整理，不保证内容的准确性，如有侵权内容，可联系管理员处理

标签： 扫描休闲压缩编译原理词法分析

相关文章推荐

新的分享

章节导航