您的位置:首页 > 其它

编译原理之词法分析和语法分析

2014-12-03 19:20 405 查看
花了一天写出的程序没有顾及很多层面,但对于理解基本的实验道理和交上实验还是有点帮助的。代码实现了基于有限自动机的词法分析,采用递归下降分析法和EBNF文法实现语法分析并生成中间代码。from sdu

lexAnalysis.h

/*
 * lexAnalysis.h
 *
 *  Created on: 2014-12-2
 *      Author: liuqiushan
 */

#ifndef LEXANALYSIS_H_
#define LEXANALYSIS_H_

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef enum _SYM
{
		_CONST,
		_VAR,
		_procedure,
		_begin,
		_end,
		_if,
		_then,
		_ID, // such as the 'a' in 'var a;'
		_INT, // such as the '10' in 'const a = 10;'
		_ASSIGN,// '='
		_PLUS,// '+'
		_SUB,// '-'
		_STAR,// '*'
		_DIV,// '/'
		_LESS,// '<'
		_MORE,// '>'
		_LESSEQ,// '<='
	    _MOREEQ,// '>='
	    _DH,// ','
	    _MD,// ':='
	    _LEFT,// '('
	    _RIGHT,// ')'
	    _JH,// '#'
	    _FH// ‘;’
}SYM;

SYM getSYM(char* str);

void appendStr(char* str, char* c, int p_str);

int isABC(char* c);

int isNumber(char* c);

#endif /* LEXANALYSIS_H_ */


LexAnalysis.c

/*
 * LexAnalysis.c
 *
 *  Created on: 2014-12-2
 *      Author: liuqiushan
 */

#include "lexAnalysis.h"

SYM getSYM(char* _str)
{
	SYM temp = _ID; // default identifier
	if(!strcasecmp(_str, "CONST"))
		temp = _CONST;
	else if(!strcasecmp(_str, "VAR"))
		temp = _VAR;
	else if(!strcasecmp(_str, "procedure"))
			temp = _procedure;
	else if(!strcasecmp(_str, "begin"))
				temp = _begin;
	else if(!strcasecmp(_str, "end"))
				temp = _end;
	else if(!strcasecmp(_str, "if"))
					temp = _if;
	else if(!strcasecmp(_str, "then"))
					temp = _then;

	return temp;
}

void appendStr(char str[], char* c, int p_str )
{
	str[p_str] = *c;
}

int isABC(char* c)
{
	char ch = *c;

	if(ch=='q'||ch=='w'||ch=='e'||ch=='r'||ch=='t'||ch=='y'||ch=='u'||ch=='i'||ch=='o'||ch=='p'||
					ch=='a'||ch=='s'||ch=='d'||ch=='f'||ch=='g'||ch=='h'||ch=='j'||ch=='k'||ch=='l'||
					ch=='z'||ch=='x'||ch=='c'||ch=='v'||ch=='b'||ch=='n'||ch=='m'||ch=='Q'||ch=='W'||
					ch=='E'||ch=='R'||ch=='T'||ch=='Y'||ch=='U'||ch=='I'||ch=='O'||ch=='P'||ch=='A'||
					ch=='S'||ch=='D'||ch=='F'||ch=='G'||ch=='H'||ch=='J'||ch=='K'||ch=='L'||ch=='Z'||
					ch=='X'||ch=='C'||ch=='V'||ch=='B'||ch=='N'||ch=='M')
				return 1;
			else
				return 0;
}

int isNumber(char* c)
{
	char ch = *c;

	if(ch=='1'||ch=='2'||ch=='3'||ch=='4'||ch=='5'||ch=='6'||ch=='7'||ch=='8'||ch=='9'||ch=='0')
				return 1;
			else
				return 0;
}


synAnalysis.h

/*
 * synAnalysis.h
 *
 *  Created on: 2014-12-2
 *      Author: liuqiushan
 */

#ifndef SYNANALYSIS_H_
#define SYNANALYSIS_H_

typedef enum _KIND
{
	PRO,//procedure
	CON,//constant
	VAR //variable
}CVPKind;

/*
 * PL/0 Object Instruction
 *
 * There are three domains:
 *
 * f is the function code
 * l is the difference between the declared level and the invoked level of the variable
 *
 * -----------------------------
 * |           |       |       |
 * |     f     |   l   |   a   |
 * |           |       |       |
 * -----------------------------
 */
typedef enum _FunctionCode
{
	LIT,//① LIT: put the constant on the top of the stack, a is the constant
	LOD,//② LOD: put the variable on the top of the stack, a is the address
	STO,//③ STO: put the content on the top of the stack into a variable. such as c := b + 10
	CAL,//④ CAL
	INT,//⑤ INT: create the data area, and a is the number of the area
	JMP,//⑥ JMP
	JPC,//⑦ JPC
	OPR//⑧ OPR: operation, and the type is delivered by a. And the operation objects are the top of the stack and the next top of the stack

}FunctionCode;

typedef enum _OPR
{
	ADD = 1, //+
	SUB,     //-
	MUL,     //*
	DIV,     // /
	GT,
	LT,
	GE,
	LE,
	UE,
	EQ,
	WRITE,
	READ,
	MINUS
}OPRTYPE;

typedef struct _TableItem
{
	char* name;
	CVPKind kind;
	int level;
	char* address;
}CVPTableItem;

typedef struct _Instruct
{
	FunctionCode f;
	int l;
	int a;
}ObjectInstruct;

#endif /* SYNANALYSIS_H_ */


SynAnalysis.c

/*
 ============================================================================
 Name        : SynAnalysis.c
 Author      : liuqiushan
 Date		 : Dec. 2, 2014 18:16
 Version     : 1.0
 Copyright   : Shandong University
 Description : Syntax Analysis
 ============================================================================
 */

/*
 * Expression EBNF
 *--------------------------------------------
 *| Expression -> Term { Addop Term }        |
 *| Addop -> "+" | "-"                       |
 *| Term -> Factor { Mulop Factor }          |
 *| Mulop -> "*" | "/"                       |
 *| Factor -> ID | NUM | "(" Expression ")"  |
 *--------------------------------------------
 */

/*
 * Pl/0 Statements to be analyzed
 *--------------------------------------------
 *| const a=10                               |
 *| var b,c                                  |
 *| procedure p                              |
 *| begin                                    |
 *|   c:=b+a                                 |
 *| end                                      |
 *--------------------------------------------
 */

#include "lexAnalysis.h"
#include "synAnalysis.h"

// Data Structure
typedef struct Statement
{
	char* _statement;
	int _length; // the length of the statement
	int _locate_pointer; // default point to the first character
}P_Statement;

// Methods
void putsStatements(void);
void initStatements(void);
void binding(P_Statement* p_s, char* statement);
void lexical_Analysis(char* a, int len_a);
void printLexResults(void);
SYM getNextSYMToken();
char* getNextID();
char* getNextNumber();
void levelProcess(int level);
void processConstant(int level);
int processVariable(int level);
void processProcedure(int level);
void strcpyCVPTable(CVPTableItem* cvp, char* name, char* address);
void printSynResults(void);
int str2int(char *str);
void GEN(FunctionCode f, int l, int a);
void Sentence(int level);
void EBNFExpression(int level);
int getIndexCVP(char* id);
void Term(int level);
void Factor(int level);

/*  Global Variables */

// Information about the statements set
#define statementNum 6
char* statements[statementNum] = {
		"const a=10",
		"var b,c",
		"procedure p",
		"begin",
		"c:=b+a",
		"end"
};

// Contain those statements to be analyzed
P_Statement statementSet[statementNum];

// For lexical analysis
#define MAX 100
SYM array_SYM[MAX]; // SYM array for lexical analysis
int p_a_SYM = 0; // SYM array location pointer

char array_Id[MAX][MAX]; // identifier array for lexical analysis
int p_a_Id = 0; // identifier array location pointer

char array_Num[MAX][MAX]; // number array for lexical analysis
int p_a_Num = 0; // number array location pointer

// Token acquired during lexical analysis
char str[MAX] = ""; // current token to be analyzed
int p_str = 0; // token inner location pointer

/* For Syntax Analysis */
// Token acquired during syntax analysis
SYM _currentSYMToken;
int currentPointer = 0;

char* _currentID;
int currentPointerID = 0;

char* _currentNumber;
int currentPointerNumber = 0;

int level = 0; //It indicates which level is under processing, and also the level is just the main level.

CVPTableItem cvptable[MAX]; // used to record every element, and we need the table to show which elements (including constant, variable and procedure) are in the program
int p_table = 0; // the appending pointer of cvptable

int addrVar = 3; // the initial address for the first variable is 3

// the following two are used for the address of procedure. In fact up to now, I do not their meaning.
int indexW = 0;
int offset = 1;

ObjectInstruct CODE[MAX]; // recording the generated intermediate code
int p_code = 0; // the appending pointer of CODE

OPRTYPE type_opr; // Expression operation type

// Main
int main(void) {

	puts("====================Initial=====================");

	putsStatements(); // print which statements need to be analyzed

	initStatements(); // initial all the statements to be analyzed with the struct P_Statement

	// start lexical analysis
	puts("================Lexical Analysis================");
	int i;
	for (i = 0; i < statementNum; i++) {
		lexical_Analysis(statementSet[i]._statement, statementSet[i]._length);
	}
	array_SYM[p_a_SYM] = -1;// end with -1

	// print the results of lexical analysis
	printLexResults();

	// Now let us start Syntax Analysis, just enjoy it
	puts("================Syntax Analysis================");
	_currentSYMToken = getNextSYMToken();

	levelProcess(level);

	//print the results of syntax analysis
	printSynResults();

	return EXIT_SUCCESS;
}

// Implementation

void putsStatements(void)
{
	puts("Analyzing the following statements: ");
	puts("");
	int i;
	for (i = 0; i < statementNum; i++)
	{
		puts(statements[i]);
	}
}

void initStatements(void)
{
	int i;
	for (i = 0; i < statementNum; i++)
		binding(&statementSet[i], statements[i]);
}

void binding(P_Statement* p_s, char* statement)
{
	// calculate the length of statement;
	int len = strlen(statement);

	p_s->_statement = malloc(len*sizeof(char));
	strcpy(p_s->_statement, statement);
	p_s->_length = len;
	p_s->_locate_pointer = 0;

	// testing
	// printf("%s\n", p_s->_statement);
	// printf("%d\n", p_s->_length);
}

// Lexical Analysis Main Program
void lexical_Analysis(char* a, int len_a)
{
	int i;
	for (i = 0; i < len_a; i++) {
		// Only get one character each time
		if (a[i]==' ')
			continue;
		// Finite Automation Method
		if (isABC(&a[i])) // If the first character is letter
		{
			while (isABC(&a[i])||isNumber(&a[i]))
			{
				appendStr(str, &a[i], p_str);
				p_str++;
				i++;
				if (i == len_a)
					break;
			}
			i--;

			SYM code = getSYM(str);

			//either identifier or keyword
			if (code == _ID)
			{
				array_SYM[p_a_SYM] = _ID;
				p_a_SYM++;
				strcpy(array_Id[p_a_Id], str);
				p_a_Id++;
			} else { // keyword or reserved word of Pl/0
				array_SYM[p_a_SYM] = code;
				p_a_SYM++;
			}

			// before next one, clean them
			memset(str, 0, sizeof(str));
			p_str = 0;

		} else if (isNumber(&a[i])) // If the first character is number
		{
			while (isNumber(&a[i]))
			{
				appendStr(str, &a[i], p_str);
				p_str++;
				i++;
				if (i == len_a)
					break;
			}
			i--;

			array_SYM[p_a_SYM] = _INT;
			p_a_SYM++;
			strcpy(array_Num[p_a_Num], str);
			p_a_Num++;

			// before next one, clean them
			memset(str, 0, sizeof(str));
			p_str = 0;

		} else if (a[i] == '=') {
			array_SYM[p_a_SYM] = _ASSIGN;
			p_a_SYM++;
		} else if (a[i] == ',') {
			array_SYM[p_a_SYM] = _DH;
			p_a_SYM++;
		} else if (a[i] == ':') {
			i++;
			if (a[i] == '=')
			{
				array_SYM[p_a_SYM] = _MD;
				p_a_SYM++;
			}
		} else if (a[i] == '+') {
			array_SYM[p_a_SYM] = _PLUS;
			p_a_SYM++;
		}

	}

	// print the results of lexical analysis
	//printLexResults();

}

void printLexResults(void)
{
	int i;
	puts("============Lexical Analysis Results================");
	printf("SYM: [ ");
	for (i = 0; i < p_a_SYM; i++)
	{
		switch(array_SYM[i])
		{
		case 0: printf("_CONST "); break;
		case 1: printf("_VAR "); break;
		case 2: printf("_procedure "); break;
		case 3: printf("_begin "); break;
		case 4: printf("_end "); break;
		case 5: printf("_if "); break;
		case 6: printf("_then "); break;
		case 7: printf("_ID "); break;
		case 8: printf("_INT "); break;
		case 9: printf("_ASSIGN "); break;
		case 10: printf("_PLUS "); break;
		case 11: printf("_SUB "); break;
		case 12: printf("_STAR "); break;
		case 13: printf("_DIV "); break;
		case 14: printf("_LESS "); break;
		case 15: printf("_MORE "); break;
		case 16: printf("_LESSEQ "); break;
		case 17: printf("_MOREEQ "); break;
		case 18: printf("_DH "); break;
		case 19: printf("_MD "); break;
		case 20: printf("_LEFT "); break;
		case 21: printf("_RIGHT "); break;
		case 22: printf("_JH "); break;
		case 23: printf("_FH "); break;
		}
	}
	printf("]\n");

	printf("ID: [ ");
	for (i = 0; i < p_a_Id; i++)
		printf("%s ", array_Id[i]);
	printf("]\n");

	printf("NUM: [ ");
	for (i = 0; i < p_a_Num; i++)
		printf("%s ", array_Num[i]);
	printf("]\n");

}

SYM getNextSYMToken()
{
	return array_SYM[currentPointer++];
}

char* getNextID()
{
	return array_Id[currentPointerID++];
}

char* getNextNumber()
{
	return array_Num[currentPointerNumber++];
}

void levelProcess(int level)
{
	int countVar = 0; // record the number of variables in one level

	if (_currentSYMToken != -1)
	{
		if (_currentSYMToken == _CONST) // Constant
		{
			processConstant(level);
			if ((_currentSYMToken=getNextSYMToken())==-1)
				return;
		}
		if (_currentSYMToken == _VAR) // variables follow constant in our example
		{
			countVar = processVariable(level);
		}
		if (_currentSYMToken == _procedure) // procedure follows variables in our example
		{
			_currentSYMToken = getNextSYMToken(); // get the identifier of the procedure
			processProcedure(level);
		}
	}

	GEN(INT, 0, 3+countVar); // Create the specified number area. Because we designed addrVar 3, so we plus 3
	Sentence(level);
	GEN(OPR, 0, 0); // quit the data area
}

void processConstant(int level)
{
	if ((_currentSYMToken=getNextSYMToken())!=-1&&_currentSYMToken==_ID) // a
		if ((_currentSYMToken=getNextSYMToken())!=-1&&_currentSYMToken==_ASSIGN) // =
			if ((_currentSYMToken=getNextSYMToken())!=-1&&_currentSYMToken==_INT) // 10
				if ((_currentID=getNextID())!=NULL) // get the identifier
					if ((_currentNumber=getNextNumber())!=NULL) // get the number
					{
						strcpyCVPTable(&cvptable[p_table], _currentID, _currentNumber);
						cvptable[p_table].kind = CON;
						cvptable[p_table].level = level;

						// Do not forget to add p_table
						p_table++;
					}
}

int processVariable(int level)
{
	int count = 0; // record the number of variables

	if ((_currentSYMToken=getNextSYMToken())!=-1&&_currentSYMToken==_ID)
		if ((_currentID=getNextID())!=NULL) // get the identifier
		{
			count++;
			char str[MAX];
			sprintf(str, "%d", addrVar);
			strcpyCVPTable(&cvptable[p_table], _currentID, str);
			cvptable[p_table].kind = VAR;
			cvptable[p_table].level = level;

			addrVar++;

			// Do not forget to add p_table
			p_table++;
		}
	// Maybe we could meet such var a,b
	while((_currentSYMToken=getNextSYMToken())!=-1&&_currentSYMToken==_DH)
	{
		if ((_currentSYMToken=getNextSYMToken())!=-1&&_currentSYMToken==_ID)
			if ((_currentID=getNextID())!=NULL) // get the identifier
			{
				count++;
				char str[MAX];
				sprintf(str, "%d", addrVar);
				strcpyCVPTable(&cvptable[p_table], _currentID, str);
				cvptable[p_table].kind = VAR;
				cvptable[p_table].level = level;

				addrVar++;

				// Do not forget to add p_table
				p_table++;
			}
	}

	return count;
}

void processProcedure(int level)
{
	if (_currentSYMToken!=-1&&_currentSYMToken==_ID)
		if ((_currentID=getNextID())!=NULL) // get the identifier of the procedure
		{
			char str[MAX];
			int temp = indexW + offset;
			sprintf(str, "%d", temp);
			strcpyCVPTable(&cvptable[p_table], _currentID, str);
			cvptable[p_table].kind = PRO;
			cvptable[p_table].level = level;

			// Do not forget to add p_table
			p_table++;
		}

	//  IMPORTANT PHASE: SUBLEVEL
	_currentSYMToken = getNextSYMToken(); // in our example it should be begin
	levelProcess(level + 1); // Recursive-descent Parsing

}

void strcpyCVPTable(CVPTableItem* cvp, char* name, char* address)
{
		int len_N = strlen(name);
		int len_A = strlen(address);

		cvp->name = (char*)malloc(len_N*sizeof(char));
		cvp->address = (char*)malloc(len_A*sizeof(char));
		strcpy(cvp->name, name);
		strcpy(cvp->address, address);
}

void printSynResults(void)
{
	int i;
	puts("============Syntax Analysis Results================");
	puts("The Table of Constant, Variable and Procedure");
	puts("Name\tKind\tLevel\tAddress\t");
	for (i = 0; i < p_table; i++)
	{
		//printf("%s\t%d\t%d\t%s\n", cvptable[i].name, cvptable[i].kind, cvptable[i].level, cvptable[i].address);
		printf("%s\t", cvptable[i].name);
		switch(cvptable[i].kind) {
		case CON: printf("CON\t"); break;
		case VAR: printf("VAR\t"); break;
		case PRO: printf("PRO\t"); break;
		}
		printf("%d\t%s\n", cvptable[i].level, cvptable[i].address);
	}

	puts("");
	puts("Intermediate Code");
	puts("No.\tFunction Code\tLevel Difference\tDisplacement");
	for (i = 0; i < p_code; i++)
	{
		printf("%d\t", i);
		switch(CODE[i].f)
		{
		case LIT: printf("LIT\t"); break;
		case INT: printf("INT\t"); break;
		case LOD: printf("LOD\t"); break;
		case STO: printf("STO\t"); break;
		case CAL: printf("CAL\t"); break;
		case JMP: printf("JMP\t"); break;
		case JPC: printf("JPC\t"); break;
		case OPR: printf("OPR\t"); break;
		}
		printf("\t%d\t\t\t%d\n", CODE[i].l, CODE[i].a);
	}

}

int str2int(char *str)
{
	int v = 0;
	do {
		v = 10*v+*str-'0';
		str++;
	} while((*str>='0')&&(*str<='9'));
	return v;
}

// To generate the object instruct
void GEN(FunctionCode f, int l, int a)
{
	CODE[p_code].f = f;
	CODE[p_code].l = l;
	CODE[p_code].a = a;

	p_code++;
}

void Sentence(int level)
{
	// Here I just implement begin and Assignment i.e. _MD(:=)
	if (_currentSYMToken != -1)
	{
		if (_currentSYMToken ==_begin)
		{
			_currentSYMToken = getNextSYMToken();
			Sentence(level);// recurse with the sub-sentence of the sentence (i.e. nest), but they are in the same level

			while(_currentSYMToken != -1)
			{
				if (_currentSYMToken == _end)
				{
					_currentSYMToken = getNextSYMToken(); // in our example, this is the end of SYM table. it should be -1
					return;
				}
			}
		} else if (_currentSYMToken == _ID) // Assignment
		{
			_currentID = getNextID(); // I omit the variables valid checking, but we sill need to find it in cvptable

			int cvpIndex = getIndexCVP(_currentID); // Just find its index in cvptable

			if((_currentSYMToken=getNextSYMToken())!=-1&&_currentSYMToken==_MD) // assignment symbol
			{
				_currentSYMToken = getNextSYMToken();
				EBNFExpression(level);
				GEN(STO, level-cvptable[cvpIndex].level, str2int(cvptable[cvpIndex].address));
				return; // assignment over
			}
		}
	}
}

void EBNFExpression(int level)
{
	Term(level); // the level + and -

	// Here we implement + and -
	while((_currentSYMToken=getNextSYMToken())!=-1)
	{
		if (_currentSYMToken == _SUB || _currentSYMToken == _PLUS)
		{
			if (_currentSYMToken == _SUB)
				type_opr = SUB;
			else
				type_opr = ADD;

			_currentSYMToken = getNextSYMToken(); // next term
			Term(level);
			GEN(OPR, 0, type_opr);
		}
	}
}

int getIndexCVP(char* id)
{
	int i;
	for (i = 0; i < p_table; i++)
	{
		if(!strcmp(cvptable[i].name, id))
			break;
	}

	return i;
}

void Term(int level)
{
	Factor(level); // the level of * and /

	// we omit * and /
}

void Factor(int level)
{
	// In fact, we omit '(' and ')', but it is easy to implement it by recursing with EBNFExpression
	if(_currentSYMToken!=-1&&_currentSYMToken==_ID)
	{
		_currentID = getNextID();
		int cvpIndex = getIndexCVP(_currentID); // Just find its index in cvptable

		if(cvptable[cvpIndex].kind == VAR) // type checking
		{
			GEN(LOD, level-cvptable[cvpIndex].level, str2int(cvptable[cvpIndex].address));
		} else if (cvptable[cvpIndex].kind == CON) {
			GEN(LIT, 0, str2int(cvptable[cvpIndex].address));
		}
	}
}


实验结果:

内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: