【编译原理】PL/0编译程序之词法分析 | 实现词法分析器 – 小飞侠

【编译原理】PL/0编译程序之词法分析 | 实现词法分析器

Post author:xfxia
Post published:2023年9月10日
Post category:其他

视频

简单的手写，理解原理 + 花时间 = 写出它

核心

遍历字符流，不需回溯
没用DFA，全程if-else，思路很好理解
识别单词的时候分为：1-标识符关键字,2-整数,3-复合运算符,4-单独字符
类别码是1,2,3…，用enum在头文件里定义了。
单独字符的类别码统一定义在ssym数组里（与csym数组中的字符按下标一一对应），关键字的类别码统一定义在wsym数组里（与word数组中的关键字按下标一一对应）。因为它们是一一对应的，用数组比较简洁。
报错做的很简陋，几乎没有
识别到token就直接输出了，万事从简（懒）…理解思想和方法之后就可以再改进。
C 库函数 int isalpha( int c )：判断字符是否是字母，int isdigit ( int c )：判断字符是否是数字。当然也可以直接用ASCII码。
C 库函数 int strcmp(const char *str1, const char *str2) 把 str1 所指向的字符串和 str2 所指向的字符串进行比较。
如果视频看不懂，那是我没讲清楚，有点口齿不清…sorry。

my.c

#include "my.h"

/*
 * error - print the diagnostic message for error code n to stdout.
 *
 * n indexes the err_msg table.  Codes outside the table are reported as
 * "Unknown error." instead of reading past the ends of the array.
 */
void error(int n)
{
	const int count = (int)(sizeof err_msg / sizeof err_msg[0]);

	if (n < 0 || n >= count)
	{
		printf("Error %3d: %s\n", n, "Unknown error.");
		return;
	}
	printf("Error %3d: %s\n", n, err_msg[n]);
}

void lexer(FILE *fp)
{
	ch = fgetc(fp);
	while (ch != EOF)
	{
		while (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n')
		{
			ch = fgetc(fp);
		}
		if (isalpha(ch)) // 当前输入为字母,则应该为关键字或标识符
		{
			char a[MAXIDLEN + 1]; // 当前读取到的单词
			int k = 0;
			for (; (isalpha(ch) || isdigit(ch)) && k < MAXIDLEN; k++)
			{
				a[k] = ch;
				ch = fgetc(fp);
			}

			a[k] = '\0'; // 字符数组和字符串的区别就是结尾少了\0，一定要加上！
			// 检查是否为关键字
			int i = 1;
			for (; i <= NRW; i++)
			{
				if (strcmp(a, word[i]) == 0)
					break;
			}
			if (i <= NRW)
			{
				sym = wsym[i]; // symbol is a reserved word
			}
			else
			{
				sym = SYM_IDENTIFIER; // symbol is an identifier
			}
			printf("(%d,%s)\n", sym, a);
		}
		else if (isdigit(ch))
		{ // symbol is a number.
			sym = SYM_NUMBER;
			int k = 0;
			int num = 0;
			while (isdigit(ch))
			{
				num = num * 10 + ch - '0';
				ch = fgetc(fp);
				k++;
			}
			if (k > MAXNUMLEN)
				error(25); // The number is too great.
			else
			{
				printf("(%d,%d)\n", sym, num);
			}
		}
		else if (ch == ':')
		{
			ch = fgetc(fp);
			if (ch == '=')
			{
				sym = SYM_BECOMES; // :=
				ch = fgetc(fp);
				printf("(%d,:=)\n", sym);
			}
			else
			{
				sym = SYM_NULL; // illegal?
			}
		}
		else if (ch == '>')
		{
			ch = fgetc(fp);
			if (ch == '=')
			{
				sym = SYM_GEQ; // >=
				ch = fgetc(fp);
				printf("(%d,>=)\n", sym);
			}
			else
			{
				sym = SYM_GTR; // >
				printf("(%d,=)\n", sym);
			}
		}
		else if (ch == '<')
		{
			ch = fgetc(fp);
			if (ch == '=')
			{
				sym = SYM_LEQ; // <=
				ch = fgetc(fp);
				printf("(%d,<=)\n", sym);
			}
			else if (ch == '>')
			{
				sym = SYM_NEQ; // <>
				ch = fgetc(fp);
			}
			else
			{
				sym = SYM_LES; // <
				printf("(%d,<)\n", sym);
			}
		}
		else if (ch == '{')
		{ //忽略注释
			int end = 1;
			while (end)
			{
				ch = fgetc(fp);
				if (ch == '}')
					end = 0;
			}
			ch = fgetc(fp);
		}
		else
		{ // other tokens : '+', '-', '*', '/', '(', ')', '=', ',', '.', ';'
			//代码和识别关键字那里类似
			int i = 1;
			for (; i <= NSYM; i++)
			{
				if (ch == csym[i])
					break;
			}
			if (i <= NSYM)
			{
				sym = ssym[i];
				printf("(%d,%c)\n", sym, ch);
				ch = fgetc(fp);
			}
			//不应该出现的字符
			else
			{
				printf("Fatal Error: Unknown character.\n");
				exit(1);
			}
		}
	}
	printf("—————文件读取结束—————");
}

/*
 * Entry point: run the lexer over "source1.txt" in the current directory.
 * Returns EXIT_FAILURE if the file cannot be opened, 0 otherwise.
 */
int main(void)
{
	FILE *fp = fopen("source1.txt", "r");

	if (fp == NULL)
	{
		/* fopen failed; report why instead of handing a NULL stream to the lexer. */
		perror("source1.txt");
		return EXIT_FAILURE;
	}

	lexer(fp);
	fclose(fp);
	return 0;
}

my.h

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/* Size limits shared by the lexer's tables and buffers. */
enum
{
	NRW       = 11, /* number of reserved words */
	MAXNUMLEN = 14, /* maximum number of digits in numbers */
	NSYM      = 10, /* maximum number of symbols in array ssym and csym */
	MAXIDLEN  = 10  /* length of identifiers */
};

/*
 * Lexer state.
 *
 * `ch` is an int rather than a char because it receives the result of
 * fgetc(), which returns an int so that EOF can be told apart from every
 * valid byte value.  Stored in a char, byte 0xFF compares equal to EOF when
 * char is signed, and nothing ever compares equal to EOF when char is
 * unsigned.
 */
int  ch;         // last character read, or EOF
int  sym;        // last symbol read
// char line[80];   // input buffer, handy for echoing the offending line in an error report

/*
 * Single-character tokens.  Entry i pairs with ssym[i], which holds its
 * symbol code.  Slot 0 is a placeholder: the lexer's search starts at 1.
 */
char csym[NSYM + 1] = {
	[0]  = ' ',
	[1]  = '+',
	[2]  = '-',
	[3]  = '*',
	[4]  = '/',
	[5]  = '(',
	[6]  = ')',
	[7]  = '=',
	[8]  = ',',
	[9]  = '.',
	[10] = ';'
};

/*
 * Reserved words.  Entry i pairs with wsym[i], which holds its symbol code.
 * Slot 0 is a placeholder: the lexer's search starts at 1.
 */
char *word[NRW + 1] = {
	[0]  = "", /* place holder */
	[1]  = "begin",
	[2]  = "call",
	[3]  = "const",
	[4]  = "do",
	[5]  = "end",
	[6]  = "if",
	[7]  = "odd",
	[8]  = "procedure",
	[9]  = "then",
	[10] = "var",
	[11] = "while"
};
/* Symbol (token class) codes, numbered consecutively from 0. */
enum symtype {
	SYM_NULL = 0,
	SYM_IDENTIFIER,
	SYM_NUMBER,

	/* arithmetic operators and the 'odd' operator */
	SYM_PLUS,
	SYM_MINUS,
	SYM_TIMES,
	SYM_SLASH,
	SYM_ODD,

	/* relational operators */
	SYM_EQU,
	SYM_NEQ,
	SYM_LES,
	SYM_LEQ,
	SYM_GTR,
	SYM_GEQ,

	/* punctuation */
	SYM_LPAREN,
	SYM_RPAREN,
	SYM_COMMA,
	SYM_SEMICOLON,
	SYM_PERIOD,
	SYM_BECOMES,

	/* reserved words */
	SYM_BEGIN,
	SYM_END,
	SYM_IF,
	SYM_THEN,
	SYM_WHILE,
	SYM_DO,
	SYM_CALL,
	SYM_CONST,
	SYM_VAR,
	SYM_PROCEDURE
};
/* Symbol codes of the reserved words; wsym[i] is the code for word[i]. */
int wsym[NRW + 1] = {
	[0]  = SYM_NULL,
	[1]  = SYM_BEGIN,
	[2]  = SYM_CALL,
	[3]  = SYM_CONST,
	[4]  = SYM_DO,
	[5]  = SYM_END,
	[6]  = SYM_IF,
	[7]  = SYM_ODD,
	[8]  = SYM_PROCEDURE,
	[9]  = SYM_THEN,
	[10] = SYM_VAR,
	[11] = SYM_WHILE
};
/* Symbol codes of the single-character tokens; ssym[i] is the code for csym[i]. */
int ssym[NSYM + 1] = {
	[0]  = SYM_NULL,
	[1]  = SYM_PLUS,
	[2]  = SYM_MINUS,
	[3]  = SYM_TIMES,
	[4]  = SYM_SLASH,
	[5]  = SYM_LPAREN,
	[6]  = SYM_RPAREN,
	[7]  = SYM_EQU,
	[8]  = SYM_COMMA,
	[9]  = SYM_PERIOD,
	[10] = SYM_SEMICOLON
};

/*
 * Diagnostic messages, indexed by error code.  Codes 0 and 26-31 are
 * empty strings; the table has 33 entries (codes 0 through 32).
 */
char *err_msg[] = {
	[0]  = "",
	[1]  = "Found ':=' when expecting '='.",
	[2]  = "There must be a number to follow '='.",
	[3]  = "There must be an '=' to follow the identifier.",
	[4]  = "There must be an identifier to follow 'const', 'var', or 'procedure'.",
	[5]  = "Missing ',' or ';'.",
	[6]  = "Incorrect procedure name.",
	[7]  = "Statement expected.",
	[8]  = "Follow the statement is an incorrect symbol.",
	[9]  = "'.' expected.",
	[10] = "';' expected.",
	[11] = "Undeclared identifier.",
	[12] = "Illegal assignment.",
	[13] = "':=' expected.",
	[14] = "There must be an identifier to follow the 'call'.",
	[15] = "A constant or variable can not be called.",
	[16] = "'then' expected.",
	[17] = "';' or 'end' expected.",
	[18] = "'do' expected.",
	[19] = "Incorrect symbol.",
	[20] = "Relative operators expected.",
	[21] = "Procedure identifier can not be in an expression.",
	[22] = "Missing ')'.",
	[23] = "The symbol can not be followed by a factor.",
	[24] = "The symbol can not be as the beginning of an expression.",
	[25] = "The number is too great.",
	[26] = "",
	[27] = "",
	[28] = "",
	[29] = "",
	[30] = "",
	[31] = "",
	[32] = "There are too many levels."
};

source1.txt

const a=10;    {常量声明}
const b=20;
var c;         {变量声明}
procedure p;   {过程声明}
     begin
          c:=b+a
     end;

begin
     call p
end.

版权声明：本文为qq_44850725原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接和本声明。

原文链接：https://blog.csdn.net/qq_44850725/article/details/113822903