This commit is contained in:
Yaossg 2024-12-07 12:14:10 +08:00
parent dd4ef1edbd
commit 2d7c2371e4
2 changed files with 288 additions and 227 deletions

View File

@ -1,6 +1,6 @@
# RVBTCC
不到 2000 行的轻量级自举编译器。
2000 行的轻量级自举编译器。
- 旨在展示如何迅速编写一个自举编译器。
- 语法类似 C，输出 RISC-V 汇编。
@ -96,24 +96,11 @@ $ sh boot.sh
### 关键字
本语言包含的关键字即为支持的标量类型的关键字和流程控制的关键字，还有 `const`、`extern`。
本语言包含的关键字即为支持的标量类型的关键字和流程控制的关键字，还有 `const`、`extern`、`enum`。
#### `const` 关键字
`const` 关键字可以在类型中使用,在大部分情况下会被直接忽略。支持它是为了更好兼容 C 程序。
但是当在出现
- 全局,标量(即不是数组)。
- 类型为 `const int` 或 `const int const`。
- 带有初始化。
- 不是 `extern` 的。
的声明时,将会被解析为整数常量。
整数常量在使用的时候会被直接替换为对应的右值,失去作为全局变量左值的性质。
使用 `int const`、`int` 形式或添加 `extern` 可以避免这样的特殊处理。
`const` 关键字可以在类型中使用,会被直接忽略。支持它是为了更好兼容 C 程序。
#### `extern` 关键字
@ -128,6 +115,12 @@ $ sh boot.sh
- 不可以初始化。
- 不可是数组。
#### `enum` 关键字
用于定义整数常量。enum 的名字必须省略,因此不能用于定义枚举类型。
整数常量可以用于数组大小、全局变量初始化等需要常量的地方。
### 支持以下运算符
| 运算符 | 含义 | 结合性 |
@ -169,13 +162,12 @@ $ sh boot.sh
编译过程中涉及的以下参数:
- 符号表总长度、字符串表总长度
- 符号数、字符串数、全局变量数、局部变量数
- 符号数、字符串数、局部变量数、虚拟寄存器数
不能超过源代码中指定的常数。如果有必要这些常数可以适度加大。
不能超过源代码中指定的常数。
目前源代码中的常数能够保证自举。
如果愿意，完全可以把程序中的各类表改为 `malloc`、`free` 动态管理，本语言是完全支持的。
- 目前源代码中的常数能够保证自举成功。如果有必要可以将它们适度加大。
- 该设计保证了没有任何的动态内存分配。如果愿意，可以将它们改为 `malloc`、`free` 动态管理，本语言是完全支持的。
## 依赖

481
boot.c
View File

@ -1,12 +1,11 @@
/*
* RVBTCC By Yaossg
* A lightweight bootstrapping compiler in less than 2000 lines.
* A lightweight bootstrapping compiler in 2000 lines.
*
* It aims to demonstrate how to write a bootstrapping compiler in no time.
* Syntax is similar to C, output is RISC-V assembly.
* Only dependent on some glibc functions for I/O.
* Purely for educational purposes. Do not use in production.
*
*/
// libc dependency
@ -25,8 +24,190 @@ void ungetchar(int ch) {
ungetc(ch, stdin);
}
// limitations
// Capacities of the compiler's fixed-size tables.
// Each value is a plain number because enum initialisers are read by
// expect_const(), which accepts only a number token or an already defined
// constant identifier (no expressions).
enum {
// bytes in string_table
STRING_TABLE_SIZE = 65536,
// entries in string_lut
STRING_LUT_SIZE = 4096,
// bytes in id_table
ID_TABLE_SIZE = 65536,
// entries in id_lut and in the tables indexed by id
// (local_table, global_kind, global_type, const_table, is_const)
ID_LUT_SIZE = 4096,
// entries in local_kind and local_type
LOCAL_SIZE = 4096,
// entries in reg_type, indirection and overflow
REG_SIZE = 4096,
};
// constants
// Token kinds; the lexer stores one of these in token_type.
// Enumerators without an initialiser take the previous value plus one, so the
// order inside each group is significant. The explicit assignments (50, 99,
// 128) start a new numeric range for the group that follows.
enum {
TOKEN_EOF,
TOKEN_SEMICOLON,
// operators and punctuation, numbered consecutively from 2
TOKEN_ADD,
TOKEN_SUB,
TOKEN_MUL,
TOKEN_DIV,
TOKEN_REM,
TOKEN_ASSIGN,
TOKEN_COMMA,
TOKEN_LSHIFT,
TOKEN_RSHIFT,
TOKEN_AND,
TOKEN_OR,
TOKEN_XOR,
TOKEN_COMPL,
TOKEN_NOT,
TOKEN_LAND,
TOKEN_LOR,
TOKEN_ELLIPSIS,
TOKEN_INC,
TOKEN_DEC,
// compound assignment operators
TOKEN_ADD_ASSIGN,
TOKEN_SUB_ASSIGN,
TOKEN_MUL_ASSIGN,
TOKEN_DIV_ASSIGN,
TOKEN_REM_ASSIGN,
TOKEN_AND_ASSIGN,
TOKEN_OR_ASSIGN,
TOKEN_XOR_ASSIGN,
TOKEN_LSHIFT_ASSIGN,
TOKEN_RSHIFT_ASSIGN,
TOKEN_QUESTION,
TOKEN_COLON,
// comparison operators
TOKEN_EQ,
TOKEN_NE,
TOKEN_LT,
TOKEN_GT,
TOKEN_LE,
TOKEN_GE,
// brackets, numbered from 50
TOKEN_PAREN_LEFT = 50,
TOKEN_PAREN_RIGHT,
TOKEN_BRACKET_LEFT,
TOKEN_BRACKET_RIGHT,
TOKEN_BRACE_LEFT,
TOKEN_BRACE_RIGHT,
// literals, identifiers and keywords, numbered from 99
TOKEN_STRING = 99,
TOKEN_NUMBER,
TOKEN_ID,
TOKEN_IF,
TOKEN_ELSE,
TOKEN_WHILE,
TOKEN_FOR,
TOKEN_DO,
TOKEN_BREAK,
TOKEN_CONTINUE,
TOKEN_RETURN,
TOKEN_ENUM,
TOKEN_EXTERN,
TOKEN_CONST,
// type keywords, numbered from 128; TYPE_TOKEN_MASK is defined as TOKEN_VOID
TOKEN_VOID = 128,
TOKEN_INT,
TOKEN_CHAR,
};
// Type codes.
// The three base types are 0..2 and each pointer type is its base type plus
// 16, so TYPE_PTR_MASK (16) is the bit that is set exactly for pointer types
// (tested as `type & TYPE_PTR_MASK`).
// TYPE_TOKEN_MASK is 128: TOKEN_VOID, TOKEN_INT and TOKEN_CHAR equal
// TYPE_TOKEN_MASK plus TYPE_VOID, TYPE_INT and TYPE_CHAR respectively.
enum {
TYPE_VOID,
TYPE_INT,
TYPE_CHAR,
TYPE_VOID_PTR = 16,
TYPE_INT_PTR,
TYPE_CHAR_PTR,
TYPE_PTR_MASK = TYPE_VOID_PTR,
TYPE_TOKEN_MASK = TOKEN_VOID,
};
// Kinds recorded for declared symbols in local_kind[] and global_kind[].
// KIND_TEMP is 0 and is the value reset_local() writes into every local slot;
// functions are declared with KIND_FUNCTION.
enum {
KIND_TEMP,
KIND_SCALAR,
KIND_ARRAY,
KIND_FUNCTION,
};
// Register numbers 0..31, in the same order as the names returned by
// reg_name(). The order is significant because the values are implicit.
// next_reg_id starts at REG_S2, and any register id above REG_T6 is treated
// as overflowed by is_overflow().
enum {
REG_ZERO,
REG_RA,
REG_SP,
REG_GP,
REG_TP,
REG_T0,
REG_T1,
REG_T2,
REG_FP,
REG_S1,
REG_A0,
REG_A1,
REG_A2,
REG_A3,
REG_A4,
REG_A5,
REG_A6,
REG_A7,
// first register id handed out by next_reg()
REG_S2,
REG_S3,
REG_S4,
REG_S5,
REG_S6,
REG_S7,
REG_S8,
REG_S9,
REG_S10,
REG_S11,
REG_T3,
REG_T4,
REG_T5,
// last register id that has a name
REG_T6,
};
// Map a register number to its RISC-V register name.
// Returns 0 (a null pointer) for any number that is not one of the 32 named
// registers, i.e. for the ids is_overflow() reports as overflowed.
const char* reg_name(int reg) {
    const char* name = 0;
    // ids below REG_S2: next_reg_id never starts lower than REG_S2
    if (reg == REG_ZERO) {
        name = "zero";
    } else if (reg == REG_RA) {
        name = "ra";
    } else if (reg == REG_SP) {
        name = "sp";
    } else if (reg == REG_GP) {
        name = "gp";
    } else if (reg == REG_TP) {
        name = "tp";
    } else if (reg == REG_T0) {
        name = "t0";
    } else if (reg == REG_T1) {
        name = "t1";
    } else if (reg == REG_T2) {
        name = "t2";
    } else if (reg == REG_FP) {
        name = "fp";
    } else if (reg == REG_S1) {
        name = "s1";
    } else if (reg == REG_A0) {
        name = "a0";
    } else if (reg == REG_A1) {
        name = "a1";
    } else if (reg == REG_A2) {
        name = "a2";
    } else if (reg == REG_A3) {
        name = "a3";
    } else if (reg == REG_A4) {
        name = "a4";
    } else if (reg == REG_A5) {
        name = "a5";
    } else if (reg == REG_A6) {
        name = "a6";
    } else if (reg == REG_A7) {
        name = "a7";
    // ids from REG_S2 to REG_T6: the range register allocation draws from
    } else if (reg == REG_S2) {
        name = "s2";
    } else if (reg == REG_S3) {
        name = "s3";
    } else if (reg == REG_S4) {
        name = "s4";
    } else if (reg == REG_S5) {
        name = "s5";
    } else if (reg == REG_S6) {
        name = "s6";
    } else if (reg == REG_S7) {
        name = "s7";
    } else if (reg == REG_S8) {
        name = "s8";
    } else if (reg == REG_S9) {
        name = "s9";
    } else if (reg == REG_S10) {
        name = "s10";
    } else if (reg == REG_S11) {
        name = "s11";
    } else if (reg == REG_T3) {
        name = "t3";
    } else if (reg == REG_T4) {
        name = "t4";
    } else if (reg == REG_T5) {
        name = "t5";
    } else if (reg == REG_T6) {
        name = "t6";
    }
    // anything else has no name and leaves the result at 0
    return name;
}
// lexer
// Compare two NUL-terminated strings.
// Returns 1 when they are identical, 0 otherwise.
int streq(const char* s1, const char* s2) {
    while (*s1 == *s2) {
        // both strings ended at the same position: every character matched
        if (*s1 == 0) {
            return 1;
        }
        s1++;
        s2++;
    }
    // first mismatch, which also covers one string being a prefix of the other
    return 0;
}
// Returns 1 when ch is one of the characters '0'..'9', otherwise 0.
int is_digit(int ch) {
    if (ch < '0') {
        return 0;
    }
    return ch <= '9';
}
@ -39,87 +220,6 @@ int is_id_cont(int ch) {
return is_id_start(ch) || is_digit(ch);
}
// Lexer output for the current token.
// NOTE(review): the role of token_state is not visible in this chunk; confirm.
int token_state;
// kind of the current token, one of the TOKEN_ values below
int token_type;
// payload of the current token (number value or identifier index)
int token_data;
// Token kinds. Operators and punctuation use 0..33.
const int TOKEN_EOF = 0;
const int TOKEN_SEMICOLON = 1;
const int TOKEN_ADD = 2;
const int TOKEN_SUB = 3;
const int TOKEN_MUL = 4;
const int TOKEN_DIV = 5;
const int TOKEN_REM = 6;
const int TOKEN_ASSIGN = 7;
const int TOKEN_COMMA = 8;
const int TOKEN_DOT = 9;
const int TOKEN_LSHIFT = 10;
const int TOKEN_RSHIFT = 11;
const int TOKEN_AND = 12;
const int TOKEN_OR = 13;
const int TOKEN_XOR = 14;
const int TOKEN_COMPL = 15;
const int TOKEN_NOT = 16;
const int TOKEN_LAND = 17;
const int TOKEN_LOR = 18;
const int TOKEN_ELLIPSIS = 19;
const int TOKEN_INC = 20;
const int TOKEN_DEC = 21;
const int TOKEN_ADD_ASSIGN = 22;
const int TOKEN_SUB_ASSIGN = 23;
const int TOKEN_MUL_ASSIGN = 24;
const int TOKEN_DIV_ASSIGN = 25;
const int TOKEN_REM_ASSIGN = 26;
const int TOKEN_AND_ASSIGN = 27;
const int TOKEN_OR_ASSIGN = 28;
const int TOKEN_XOR_ASSIGN = 29;
const int TOKEN_LSHIFT_ASSIGN = 30;
const int TOKEN_RSHIFT_ASSIGN = 31;
const int TOKEN_QUESTION = 32;
const int TOKEN_COLON = 33;
// comparison operators, numbered from 40
const int TOKEN_EQ = 40;
const int TOKEN_NE = 41;
const int TOKEN_LT = 42;
const int TOKEN_GT = 43;
const int TOKEN_LE = 44;
const int TOKEN_GE = 45;
// brackets, numbered from 50
const int TOKEN_PAREN_LEFT = 50;
const int TOKEN_PAREN_RIGHT = 51;
const int TOKEN_BRACKET_LEFT = 52;
const int TOKEN_BRACKET_RIGHT = 53;
const int TOKEN_BRACE_LEFT = 54;
const int TOKEN_BRACE_RIGHT = 55;
// literals, identifiers and control-flow keywords, numbered from 99
const int TOKEN_STRING = 99;
const int TOKEN_NUMBER = 100;
const int TOKEN_ID = 101;
const int TOKEN_IF = 102;
const int TOKEN_ELSE = 103;
const int TOKEN_WHILE = 104;
const int TOKEN_FOR = 105;
const int TOKEN_DO = 106;
const int TOKEN_BREAK = 107;
const int TOKEN_CONTINUE = 108;
const int TOKEN_RETURN = 109;
// declaration keywords, placed directly below the type keywords
const int TOKEN_EXTERN = 126;
const int TOKEN_CONST = 127;
// type keywords: 128 plus the matching TYPE_ code
const int TOKEN_VOID = 128;
const int TOKEN_INT = 129;
const int TOKEN_CHAR = 130;
// Type codes: base types are 0..2, pointer types are the base type plus 16.
const int TYPE_VOID = 0;
const int TYPE_INT = 1;
const int TYPE_CHAR = 2;
const int TYPE_VOID_PTR = 16;
const int TYPE_INT_PTR = 17;
const int TYPE_CHAR_PTR = 18;
// bit set exactly for pointer types
const int TYPE_PTR_MASK = 16;
// same value as TOKEN_VOID
const int TYPE_TOKEN_MASK = 128;
int parse_int(int ch) {
int num = ch - '0';
while (is_digit(ch = getchar())) {
@ -153,17 +253,13 @@ int get_escaped_char() {
return ch;
}
// Compare two NUL-terminated strings.
// Returns 1 when they are identical, 0 otherwise.
int streq(const char* s1, const char* s2) {
    while (*s1 == *s2) {
        // both strings ended at the same position: every character matched
        if (*s1 == 0) {
            return 1;
        }
        s1++;
        s2++;
    }
    // first mismatch, which also covers one string being a prefix of the other
    return 0;
}
// Lexer output for the current token.
// NOTE(review): the role of token_state is not visible in this chunk; confirm.
int token_state;
// kind of the current token, one of the TOKEN_ constants
int token_type;
// payload: the value for TOKEN_NUMBER, the identifier index for TOKEN_ID
int token_data;
char string_table[65536];
char string_table[STRING_TABLE_SIZE];
int string_offset;
int string_lut[4096];
int string_lut[STRING_LUT_SIZE];
int string_lut_size;
int parse_string() {
int offset = string_offset;
@ -201,9 +297,9 @@ void dedup_string() {
}
}
char id_table[65536];
char id_table[ID_TABLE_SIZE];
int id_offset;
int id_lut[4096];
int id_lut[ID_LUT_SIZE];
int id_lut_size;
int parse_id(int ch) {
int offset = id_offset;
@ -265,6 +361,8 @@ void parse_id_like(int ch) {
token_type = TOKEN_DO;
} else if (streq(id, "extern")) {
token_type = TOKEN_EXTERN;
} else if (streq(id, "enum")) {
token_type = TOKEN_ENUM;
}
if (token_type != TOKEN_ID) {
rewind_id(0);
@ -529,65 +627,45 @@ int parse_type() {
// assembly context
const int KIND_TEMP = 0;
const int KIND_SCALAR = 1;
const int KIND_ARRAY = 2;
const int KIND_FUNCTION = 3;
// use id as index
int local_table[ID_LUT_SIZE]; // id -> local id
int local_table[4096]; // id -> local id
// use local id as index
int next_local_id = 1;
int max_local_id = 1;
int local_kind[4096];
int local_type[4096];
int local_kind[LOCAL_SIZE];
int local_type[LOCAL_SIZE];
int global_kind[4096];
int global_type[4096];
// use id as index
int global_kind[ID_LUT_SIZE];
int global_type[ID_LUT_SIZE];
int next_reg_id = 18;
int max_reg_id = 18;
int reg_type[4096];
char indirection[4096];
int overflow[4096]; // reg -> local id
// use reg id as index
int next_reg_id = REG_S2;
int max_reg_id = REG_S2;
int reg_type[REG_SIZE];
char indirection[REG_SIZE];
int overflow[REG_SIZE]; // reg -> local id
int const_table[4096]; // id -> value
char is_const[4096];
// use id as index
int const_table[ID_LUT_SIZE]; // id -> value
char is_const[ID_LUT_SIZE];
// Register numbers 0..31; reg_name() returns the matching register name.
const int REG_ZERO = 0;
const int REG_RA = 1;
const int REG_SP = 2;
const int REG_GP = 3;
const int REG_TP = 4;
const int REG_T0 = 5;
const int REG_T1 = 6;
const int REG_T2 = 7;
const int REG_FP = 8;
const int REG_S1 = 9;
const int REG_A0 = 10;
const int REG_A1 = 11;
const int REG_A2 = 12;
const int REG_A3 = 13;
const int REG_A4 = 14;
const int REG_A5 = 15;
const int REG_A6 = 16;
const int REG_A7 = 17;
// reset_reg() restarts next_reg_id at REG_S2
const int REG_S2 = 18;
const int REG_S3 = 19;
const int REG_S4 = 20;
const int REG_S5 = 21;
const int REG_S6 = 22;
const int REG_S7 = 23;
const int REG_S8 = 24;
const int REG_S9 = 25;
const int REG_S10 = 26;
const int REG_S11 = 27;
const int REG_T3 = 28;
const int REG_T4 = 29;
const int REG_T5 = 30;
// ids above REG_T6 are reported as overflowed by is_overflow()
const int REG_T6 = 31;
// Read the next token and return it as an integer constant.
// Accepted forms: a number literal, or an identifier that is not currently
// bound as a local (local_table entry is 0) and is marked in is_const[].
// Anything else prints "expecting a constant" to stderr and exits with
// status 1.
int expect_const() {
    next_token();
    int value = token_data;
    if (token_type != TOKEN_NUMBER) {
        int id = token_data;
        // the token kind is checked first so that id is only used as an index
        // when the token really is an identifier
        if (token_type != TOKEN_ID || local_table[id] || !is_const[id]) {
            fprintf(stderr, "expecting a constant\n");
            exit(1);
        }
        value = const_table[id];
    }
    return value;
}
void reset_reg() {
next_reg_id = REG_S2;
for (int i = 0; i < 4096; ++i) {
for (int i = 0; i < REG_SIZE; ++i) {
reg_type[i] = TYPE_VOID;
indirection[i] = 0;
overflow[i] = 0;
@ -595,55 +673,17 @@ void reset_reg() {
reg_type[REG_ZERO] = TYPE_INT;
}
// Map a register number (0..31) to its RISC-V register name.
// Returns 0 (a null pointer) for any other number.
const char* reg_name(int reg) {
    const char* name = 0;
    // 0..8: zero, ra, sp, gp, tp, t0-t2, fp
    if (reg == 0) {
        name = "zero";
    } else if (reg == 1) {
        name = "ra";
    } else if (reg == 2) {
        name = "sp";
    } else if (reg == 3) {
        name = "gp";
    } else if (reg == 4) {
        name = "tp";
    } else if (reg == 5) {
        name = "t0";
    } else if (reg == 6) {
        name = "t1";
    } else if (reg == 7) {
        name = "t2";
    } else if (reg == 8) {
        name = "fp";
    // 9..17: s1 and the a0-a7 registers
    } else if (reg == 9) {
        name = "s1";
    } else if (reg == 10) {
        name = "a0";
    } else if (reg == 11) {
        name = "a1";
    } else if (reg == 12) {
        name = "a2";
    } else if (reg == 13) {
        name = "a3";
    } else if (reg == 14) {
        name = "a4";
    } else if (reg == 15) {
        name = "a5";
    } else if (reg == 16) {
        name = "a6";
    } else if (reg == 17) {
        name = "a7";
    // 18..31: s2-s11 and t3-t6
    } else if (reg == 18) {
        name = "s2";
    } else if (reg == 19) {
        name = "s3";
    } else if (reg == 20) {
        name = "s4";
    } else if (reg == 21) {
        name = "s5";
    } else if (reg == 22) {
        name = "s6";
    } else if (reg == 23) {
        name = "s7";
    } else if (reg == 24) {
        name = "s8";
    } else if (reg == 25) {
        name = "s9";
    } else if (reg == 26) {
        name = "s10";
    } else if (reg == 27) {
        name = "s11";
    } else if (reg == 28) {
        name = "t3";
    } else if (reg == 29) {
        name = "t4";
    } else if (reg == 30) {
        name = "t5";
    } else if (reg == 31) {
        name = "t6";
    }
    // anything else has no name and leaves the result at 0
    return name;
}
int is_overflow(int reg) {
return reg > REG_T6;
// Clear the id -> local id mapping for every identifier slot.
// A zero entry is what expect_const() treats as "not bound as a local".
void reset_local_table() {
    int id = 0;
    while (id < ID_LUT_SIZE) {
        local_table[id] = 0;
        id++;
    }
}
void reset_local() {
next_local_id = 1;
max_local_id = 1;
max_reg_id = REG_S2;
for (int i = 0; i < 4096; ++i) {
local_table[i] = 0;
for (int i = 0; i < LOCAL_SIZE; ++i) {
local_kind[i] = KIND_TEMP;
local_type[i] = TYPE_VOID;
}
@ -692,6 +732,10 @@ void declare_global(int id, int kind, int type) {
global_type[id] = type;
}
// Returns 1 when reg lies past REG_T6, the last register id that reg_name()
// can name; returns 0 otherwise.
int is_overflow(int reg) {
    if (reg <= REG_T6) {
        return 0;
    }
    return 1;
}
int next_reg(int type) {
int reg = next_reg_id++;
if (is_overflow(reg)) {
@ -1581,8 +1625,7 @@ void parse_local_variable(int type) {
fprintf(stderr, "array of pointers is not supported\n");
exit(1);
}
expect_token(TOKEN_NUMBER);
int size = token_data;
int size = expect_const();
expect_token(TOKEN_BRACKET_RIGHT);
declare_local_array(id, type, size);
return;
@ -1777,6 +1820,7 @@ void parse_function(const char* name) {
}
next_token();
if (token_type == TOKEN_SEMICOLON) {
reset_local_table();
return;
}
unget_token();
@ -1832,6 +1876,7 @@ void parse_function(const char* name) {
}
asm_addi("sp", "sp", frame_size);
printf(" ret\n");
reset_local_table();
}
void parse_global_variable(int id, const char* name, int type) {
@ -1840,15 +1885,13 @@ void parse_global_variable(int id, const char* name, int type) {
printf(".align 5\n");
printf("%s:\n", name);
if (token_type == TOKEN_ASSIGN) {
expect_token(TOKEN_NUMBER);
printf(" .dword %d\n", token_data);
printf(" .dword %d\n", expect_const());
} else if (token_type == TOKEN_BRACKET_LEFT) {
if (type & TYPE_PTR_MASK) {
fprintf(stderr, "array of pointers is not supported\n");
exit(1);
}
expect_token(TOKEN_NUMBER);
int size = token_data;
int size = expect_const();
expect_token(TOKEN_BRACKET_RIGHT);
int array_size = array_size_of(type, size);
printf(" .zero %d\n", array_size);
@ -1865,28 +1908,16 @@ void parse_global_declaration() {
external = 1;
next_token();
}
int is_const_int = 1;
if (token_type != TOKEN_CONST) {
is_const_int = 0;
}
int type = parse_type();
if (type < 0) {
fprintf(stderr, "expecting type for global declaration\n");
exit(1);
}
if (type != TYPE_INT) {
is_const_int = 0;
}
expect_token(TOKEN_ID);
int id = token_data;
char* name = id_table + id_lut[id];
next_token();
if (!external && is_const_int && token_type == TOKEN_ASSIGN) {
expect_token(TOKEN_NUMBER);
const_table[id] = token_data;
is_const[id] = 1;
expect_token(TOKEN_SEMICOLON);
} else if (token_type == TOKEN_PAREN_LEFT) {
if (token_type == TOKEN_PAREN_LEFT) {
declare_global(id, KIND_FUNCTION, type);
parse_function(name);
} else {
@ -1904,11 +1935,49 @@ void parse_global_declaration() {
}
}
// Parse the body of an anonymous enum, starting at the opening brace (the
// `enum` keyword has already been consumed by the caller), and register each
// enumerator as an integer constant in const_table[] / is_const[].
// An enumerator without `= constant` takes the previous value plus one,
// starting at 0. A trailing comma and an empty body are both accepted.
// The closing brace must be followed by a semicolon.
void parse_enum() {
    expect_token(TOKEN_BRACE_LEFT);
    int next_value = 0;
    next_token();
    while (token_type != TOKEN_BRACE_RIGHT) {
        if (token_type != TOKEN_ID) {
            fprintf(stderr, "expecting identifier in enum\n");
            exit(1);
        }
        int name = token_data;
        next_token();
        if (token_type != TOKEN_ASSIGN) {
            // no explicit value: hand the token back and keep counting
            unget_token();
        } else {
            next_value = expect_const();
        }
        const_table[name] = next_value;
        is_const[name] = 1;
        next_value++;
        next_token();
        if (token_type == TOKEN_COMMA) {
            // fetch what follows the comma: another enumerator or '}'
            next_token();
        } else if (token_type != TOKEN_BRACE_RIGHT) {
            fprintf(stderr, "expecting ',' or '}'\n");
            exit(1);
        }
    }
    expect_token(TOKEN_SEMICOLON);
}
// Parse the whole input: one top-level item per iteration until end of input.
// An item that starts with the `enum` keyword is an enum constant block and
// goes to parse_enum(); every other item goes to parse_global_declaration(),
// which expects the item's first token to be current already.
//
// Each item is dispatched exactly once. Calling parse_global_declaration()
// before the `enum` check would mis-parse enum blocks and parse every other
// item twice. A loop is used instead of self-recursion so that stack depth
// does not grow with the number of top-level items.
void parse_top_level() {
    next_token();
    while (token_type != TOKEN_EOF) {
        if (token_type == TOKEN_ENUM) {
            parse_enum();
        } else {
            parse_global_declaration();
        }
        next_token();
    }
}