This commit is contained in:
Yaossg 2024-12-06 15:06:55 +08:00
parent d1d1c88934
commit f778cf0670
6 changed files with 113 additions and 114 deletions

View File

@ -50,13 +50,13 @@ $ sh boot.sh
自举会输出六个文件,三个汇编文件和三个可执行文件:
| 源代码 | 编译器 | 汇编 | 可执行 | 代号 | 命名 |
| ----------------- | --------- | ------- | --------- | ---- | ---------------------- |
| boot.c boot-lib.c | gcc | | gcc.out | G | 自制编译器 |
| boot.c boot-lib.h | gcc.out | boot1.s | boot1.out | B1 | 自举自制编译器 |
| boot.c boot-lib.h | boot1.out | boot2.s | boot2.out | B2 | 自举自举自制编译器 |
| boot.c boot-lib.h | boot2.out | boot3.s | | B3 | 验证自举自举自制编译器 |
| ------ | --------- | ------- | --------- | ---- | ---------------------- |
| boot.c | gcc | | gcc.out | G | 自制编译器 |
| boot.c | gcc.out | boot1.s | boot1.out | B1 | 自举自制编译器 |
| boot.c | boot1.out | boot2.s | boot2.out | B2 | 自举自举自制编译器 |
| boot.c | boot2.out | boot3.s | | B3 | 验证自举自举自制编译器 |
后三次编译时,boot-lib.h 的内容被手动导入 boot.c 开头进行编译,boot-lib.c 提供的库通过链接引入。
除了第一次编译全程由 gcc 完成之外,另外三次编译从源码到汇编由本编译器完成,从汇编到可执行文件由 gcc 完成。从汇编到可执行文件时需要将 glibc 链接进去,这对于 gcc 来说是默认的行为
整个自举及其验证的过程如下图所示:
@ -95,7 +95,7 @@ $ sh boot.sh
### 关键字
本语言包含的关键字即为支持的标量类型的关键字和流程控制的关键字,还有 `const`
本语言包含的关键字即为支持的标量类型的关键字和流程控制的关键字,还有 `const` 和 `extern`。
#### `const` 关键字
@ -103,15 +103,29 @@ $ sh boot.sh
但是当出现
- 全局,标量(即不是数组)
- 类型为 `const int` 或 `const int const`
- 带有初始化
- 全局,标量(即不是数组)。
- 类型为 `const int` 或 `const int const`。
- 带有初始化。
- 不是 `extern` 的。
的声明时,将会被解析为整数常量。
整数常量在使用的时候会被直接替换为对应的右值,失去作为全局变量左值的性质。
使用 `int const` 或 `int` 可以避免这样的特殊处理。
使用 `int const` 或 `int` 形式,或者添加 `extern`,可以避免这样的特殊处理。
#### `extern` 关键字
`extern` 可以用在全局函数和全局变量声明的开头。
全局函数的声明和定义都会直接忽略这个关键字。一个全局函数是声明还是定义,取决于是否提供函数体,与该关键字无关。
全局变量如果使用了这个关键字,则有以下特性和限制:
- 变量仅被声明,而没有被定义。
- 如果需要使用这样的变量,需要稍后提供定义,或在外部已经定义。
- 不可以初始化。
- 不可是数组。
### 支持以下运算符
@ -144,7 +158,6 @@ $ sh boot.sh
- 支持全局变量和局部变量,局部变量遮挡全局变量。
- 不支持局部变量之间的遮挡,重名的局部变量为同一变量。
- 支持函数声明,可以通过函数声明来调用 C 语言库。不支持变量声明。
- 函数只支持最多八个参数。函数声明中支持可变参数,仅用于兼容 C 语言库。
- 类型检查有遗漏,若 C 编译器报错,而本语言编译通过,就可以认为是 UB。
- 例如函数调用的参数和 `return` 语句不会检查类型。
@ -165,14 +178,12 @@ $ sh boot.sh
## 依赖
直接依赖下面这些 C 语言库函数,在本语言中提供声明后调用。
直接依赖下面这些 C 语言库函数和变量,在本语言中提供声明后使用。
- `printf`
- `getchar`
- `exit`
间接依赖下面这些 C 语言库函数,在 C 语言中进行封装后调用。
- `ungetc`(理论上非必须,可以在本语言中手动模拟)
- `vfprintf` 和可变参数有关的宏(用于输出调试信息,非必须)
- `ungetc` 和 `stdin`(理论上非必须,可以在本语言中手动模拟)
- `fprintf` 和 `stderr`(理论上非必须,仅用于输出错误信息)

View File

@ -1,15 +0,0 @@
#include <stdio.h>
#include <stdarg.h>
// printf-style helper that writes the formatted text to stderr.
// Forwards the variadic arguments to vfprintf and returns its result
// unchanged (the number of characters written, or a negative value on error).
int eprintf(const char format[], ...) {
    va_list ap;
    int written;

    va_start(ap, format);
    written = vfprintf(stderr, format, ap);
    va_end(ap);

    return written;
}
// Counterpart of getchar(): pushes ch back onto stdin so that the next read
// from stdin yields it again. The result of ungetc is deliberately ignored.
void ungetchar(int ch) {
    FILE *input = stdin;
    (void)ungetc(ch, input);
}

View File

@ -1,11 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
// std
int printf(const char* format, ...);
int getchar();
void exit(int status);
// ext
void ungetchar(int ch);
int eprintf(const char* format, ...);

View File

@ -1,12 +1,10 @@
mkdir -p build && cd build &&
cat ../boot-lib.h ../boot.c | sed '/^#/d' > boot-all.c &&
gcc ../boot.c ../boot-lib.c -o gcc.out &&
./gcc.out < boot-all.c > boot1.s &&
gcc -static boot1.s ../boot-lib.c -o boot1.out &&
./boot1.out < boot-all.c > boot2.s &&
gcc -static boot2.s ../boot-lib.c -o boot2.out &&
./boot2.out < boot-all.c > boot3.s &&
gcc ../boot.c -o gcc.out &&
./gcc.out < ../boot.c > boot1.s &&
gcc -static boot1.s -o boot1.out &&
./boot1.out < ../boot.c > boot2.s &&
gcc -static boot2.s -o boot2.out &&
./boot2.out < ../boot.c > boot3.s &&
cmp --silent boot1.s boot2.s && echo "boot1.s == boot2.s" || echo "boot1.s != boot2.s"
cmp --silent boot2.s boot3.s && echo "boot2.s == boot3.s" || echo "boot2.s != boot3.s"
cmp --silent boot1.s boot3.s && echo "boot1.s == boot3.s" || echo "boot1.s != boot3.s"
rm boot-all.c

122
boot.c
View File

@ -1,4 +1,18 @@
#include "boot-lib.h"
// libc dependency
// Hand-written declarations of the C library symbols this file relies on.
// The standard streams are declared as plain void* and are only handed back
// to libc functions (ungetc, fprintf); presumably this is because the language
// compiled here has no FILE type -- TODO confirm.
extern void* stdin;
extern void* stdout;
extern void* stderr;
// printf: formatted output to stdout (used for the generated assembly text).
int printf(const char* format, ...);
// getchar: reads the next input character; the lexer treats -1 as end of input.
int getchar();
// exit: terminates the process; called with status 1 after a diagnostic.
void exit(int status);
// fprintf: formatted output to the given stream; used with stderr for diagnostics.
int fprintf(void* file, const char* format, ...);
// ungetc: pushes one character back onto the given stream.
int ungetc(int ch, void* file);
// Pushes ch back onto stdin via ungetc; the return value of ungetc is ignored.
void ungetchar(int ch) {
ungetc(ch, stdin);
}
// lexer
@ -79,6 +93,7 @@ const int TOKEN_BREAK = 107;
const int TOKEN_CONTINUE = 108;
const int TOKEN_RETURN = 109;
const int TOKEN_EXTERN = 126;
const int TOKEN_CONST = 127;
const int TOKEN_VOID = 128;
const int TOKEN_INT = 129;
@ -121,7 +136,7 @@ int get_escaped_char() {
} else if (ch == '\"') {
ch = '\"';
} else {
eprintf("unexpected escaped character: %c\n", ch);
fprintf(stderr, "unexpected escaped character: %c\n", ch);
exit(1);
}
return ch;
@ -144,7 +159,7 @@ int parse_string() {
int ch;
while ((ch = getchar()) != '"') {
if (ch == -1 || ch == '\n') {
eprintf("expecting '\"'\n");
fprintf(stderr, "expecting '\"'\n");
exit(1);
}
if (ch == '\\') {
@ -237,6 +252,8 @@ void parse_id_like(int ch) {
token_type = TOKEN_FOR;
} else if (streq(id, "do")) {
token_type = TOKEN_DO;
} else if (streq(id, "extern")) {
token_type = TOKEN_EXTERN;
}
if (token_type != TOKEN_ID) {
rewind_id(0);
@ -312,7 +329,7 @@ void next_token() {
while (1) {
ch = getchar();
if (ch == -1) {
eprintf("expecting '*/'\n");
fprintf(stderr, "expecting '*/'\n");
exit(1);
}
if (ch == '*') {
@ -429,7 +446,7 @@ void next_token() {
token_data = get_escaped_char();
}
if (getchar() != '\'') {
eprintf("expecting '\n");
fprintf(stderr, "expecting '\n");
exit(1);
}
} else if (ch == '"') {
@ -444,7 +461,7 @@ void next_token() {
}
}
if (token_type != TOKEN_ELLIPSIS) {
eprintf("expecting '...'\n");
fprintf(stderr, "expecting '...'\n");
exit(1);
}
} else if (is_digit(ch)) {
@ -453,22 +470,22 @@ void next_token() {
} else if (is_id_start(ch)) {
parse_id_like(ch);
} else {
eprintf("unexpected character: %c(%d)\n", ch, ch);
fprintf(stderr, "unexpected character: %c(%d)\n", ch, ch);
exit(1);
}
eprintf("token: %d\n", token_type);
fprintf(stderr, "token: %d\n", token_type);
if (token_type == TOKEN_ID) {
const char* name = id_table + id_lut[token_data];
eprintf(" id: %s\n", name);
fprintf(stderr, " id: %s\n", name);
} else if (token_type == TOKEN_NUMBER) {
eprintf(" number: %d\n", token_data);
fprintf(stderr, " number: %d\n", token_data);
}
}
// Advances to the next token and requires it to be of type expected_type.
// On a mismatch a diagnostic naming both token types is written to stderr
// and the process exits with status 1.
void expect_token(int expected_type) {
next_token();
if (token_type != expected_type) {
eprintf("unexpected token: %d, should be %d\n", token_type, expected_type);
fprintf(stderr, "unexpected token: %d, should be %d\n", token_type, expected_type);
exit(1);
}
}
@ -890,20 +907,20 @@ int lookup(int id) {
const char* name = id_table + id_lut[id];
if (global_marker[id]) {
if (global_marker[id] == MARKER_FUNCTION) {
eprintf("function name must not appear outside function call: %s\n", name);
fprintf(stderr, "function name must not appear outside function call: %s\n", name);
exit(1);
}
int rd = next_reg(TYPE_VOID_PTR);
_asm_i("la", rd, name, " # id: ", id);
return materialize_address(rd, global_type[id], global_marker[id]);
}
eprintf("unresolved identifier: %s\n", name);
fprintf(stderr, "unresolved identifier: %s\n", name);
exit(1);
}
int asm_r_arith(const char* op, int rs1) {
if (reg_type[rs1] & TYPE_PTR_MASK) {
eprintf("pointer cannot be arithmetically operated by %s\n", op);
fprintf(stderr, "pointer cannot be arithmetically operated by %s\n", op);
exit(1);
}
return asm_r(TYPE_INT, op, rs1);
@ -911,7 +928,7 @@ int asm_r_arith(const char* op, int rs1) {
int asm_rr_arith(const char* op, int rs1, int rs2) {
if (reg_type[rs1] & TYPE_PTR_MASK || reg_type[rs2] & TYPE_PTR_MASK) {
eprintf("pointer cannot be arithmetically operated by %s\n", op);
fprintf(stderr, "pointer cannot be arithmetically operated by %s\n", op);
exit(1);
}
return asm_rr(TYPE_INT, op, rs1, rs2);
@ -951,7 +968,7 @@ int cont_label_stack_size;
void asm_break() {
if (break_label_stack_size == 0) {
eprintf("break without loop\n");
fprintf(stderr, "break without loop\n");
exit(1);
}
asm_j(break_label_stack[break_label_stack_size - 1]);
@ -959,7 +976,7 @@ void asm_break() {
void asm_continue() {
if (cont_label_stack_size == 0) {
eprintf("continue without loop\n");
fprintf(stderr, "continue without loop\n");
exit(1);
}
asm_j(cont_label_stack[cont_label_stack_size - 1]);
@ -1004,7 +1021,7 @@ int asm_add(int lhs, int rhs) {
}
int ptr_type = reg_type[ptr];
if (ptr_type == TYPE_VOID_PTR) {
eprintf("void pointer cannot be arithmetically operated\n");
fprintf(stderr, "void pointer cannot be arithmetically operated\n");
exit(1);
}
int offset = next_reg(TYPE_INT);
@ -1012,7 +1029,7 @@ int asm_add(int lhs, int rhs) {
return asm_rr(ptr_type, "add", ptr, offset);
}
if (type1 && type2) {
eprintf("operands of addition cannot be both pointers\n");
fprintf(stderr, "operands of addition cannot be both pointers\n");
exit(1);
}
return asm_rr(TYPE_INT, "add", lhs, rhs);
@ -1025,11 +1042,11 @@ int asm_sub(int lhs, int rhs) {
int type2 = rhs_type & TYPE_PTR_MASK;
if (type1 && type2) {
if (lhs_type != rhs_type) {
eprintf("pointer type mismatch\n");
fprintf(stderr, "pointer type mismatch\n");
exit(1);
}
if (lhs_type == TYPE_VOID_PTR) {
eprintf("void pointer cannot be arithmetically operated\n");
fprintf(stderr, "void pointer cannot be arithmetically operated\n");
exit(1);
}
int diff = asm_rr(TYPE_INT, "sub", lhs, rhs);
@ -1072,7 +1089,7 @@ int parse_assign_expr();
int parse_function_call(int id) {
const char* name = id_table + id_lut[id];
if (global_marker[id] != MARKER_FUNCTION) {
eprintf("not a function name: %s\n", name);
fprintf(stderr, "not a function name: %s\n", name);
exit(1);
}
int arg = 0;
@ -1084,7 +1101,7 @@ int parse_function_call(int id) {
}
unget_token();
if (arg >= 8) {
eprintf("too many arguments\n");
fprintf(stderr, "too many arguments\n");
exit(1);
}
args[arg++] = parse_assign_expr();
@ -1094,7 +1111,7 @@ int parse_function_call(int id) {
} else if (token_type == TOKEN_PAREN_RIGHT) {
break;
} else {
eprintf("expecting ',' or ')'\n");
fprintf(stderr, "expecting ',' or ')'\n");
exit(1);
}
}
@ -1143,7 +1160,7 @@ int parse_primary_expr() {
expect_token(TOKEN_PAREN_RIGHT);
return reg;
} else {
eprintf("unexpected token in primary expression: %d\n", token_type);
fprintf(stderr, "unexpected token in primary expression: %d\n", token_type);
exit(1);
}
}
@ -1182,7 +1199,7 @@ int parse_prefix_expr() {
int reg = parse_postfix_expr();
int type = reg_type[reg];
if (type & TYPE_PTR_MASK) {
eprintf("cannot take address of a pointer\n");
fprintf(stderr, "cannot take address of a pointer\n");
exit(1);
}
return addressof(reg);
@ -1190,11 +1207,11 @@ int parse_prefix_expr() {
int reg = parse_postfix_expr();
int type = reg_type[reg];
if (!(type & TYPE_PTR_MASK)) {
eprintf("cannot dereference a non-pointer\n");
fprintf(stderr, "cannot dereference a non-pointer\n");
exit(1);
}
if (type == TYPE_VOID_PTR) {
eprintf("cannot dereference void pointer\n");
fprintf(stderr, "cannot dereference void pointer\n");
exit(1);
}
return dereference(reg);
@ -1443,7 +1460,7 @@ int parse_conditional_expr() {
asm_label(label1);
int rhs = parse_conditional_expr();
if (reg_type[lhs] != reg_type[rhs]) {
eprintf("type mismatch in conditional expression\n");
fprintf(stderr, "type mismatch in conditional expression\n");
exit(1);
}
asm_mv(result, rhs);
@ -1535,7 +1552,7 @@ int parse_expr() {
void parse_local_variable(int type) {
if (type == TYPE_VOID) {
eprintf("variable cannot be of void type\n");
fprintf(stderr, "variable cannot be of void type\n");
exit(1);
}
expect_token(TOKEN_ID);
@ -1543,7 +1560,7 @@ void parse_local_variable(int type) {
next_token();
if (token_type == TOKEN_BRACKET_LEFT) {
if (type & TYPE_PTR_MASK) {
eprintf("array of pointers is not supported\n");
fprintf(stderr, "array of pointers is not supported\n");
exit(1);
}
expect_token(TOKEN_NUMBER);
@ -1561,7 +1578,7 @@ void parse_local_variable(int type) {
expect_token(TOKEN_ASSIGN);
int reg = parse_expr();
if (type != reg_type[reg]) {
eprintf("type mismatch in assignment\n");
fprintf(stderr, "type mismatch in assignment\n");
exit(1);
}
store_into_local(reg, slot);
@ -1709,17 +1726,9 @@ void parse_function(const char* name) {
expect_token(TOKEN_PAREN_RIGHT);
break;
}
if (token_type == TOKEN_VOID) {
if (arg != 0) {
eprintf("void should be the only argument\n");
exit(1);
}
expect_token(TOKEN_PAREN_RIGHT);
break;
}
int arg_type = parse_type();
if (arg_type < 0 || arg_type == TYPE_VOID) {
eprintf("expecting a non-void argument type: %d\n", arg_type);
fprintf(stderr, "expecting a non-void argument type: %d\n", arg_type);
exit(1);
}
expect_token(TOKEN_ID);
@ -1729,13 +1738,13 @@ void parse_function(const char* name) {
expect_token(TOKEN_BRACKET_RIGHT);
next_token();
if (arg_type & TYPE_PTR_MASK) {
eprintf("array of pointers is not supported\n");
fprintf(stderr, "array of pointers is not supported\n");
exit(1);
}
arg_type |= TYPE_PTR_MASK;
}
if (arg >= 8) {
eprintf("too many arguments\n");
fprintf(stderr, "too many arguments\n");
exit(1);
}
args[arg++] = declare_local(token_data, arg_type);
@ -1744,7 +1753,7 @@ void parse_function(const char* name) {
} else if (token_type == TOKEN_PAREN_RIGHT) {
break;
} else {
eprintf("expecting ',' or ')'\n");
fprintf(stderr, "expecting ',' or ')'\n");
exit(1);
}
}
@ -1808,10 +1817,6 @@ void parse_function(const char* name) {
}
void parse_global_variable(int id, const char* name, int type) {
if (type == TYPE_VOID) {
eprintf("variable cannot be of void type\n");
exit(1);
}
printf(".data\n");
printf(".globl %s\n", name);
printf(".align 5\n");
@ -1821,7 +1826,7 @@ void parse_global_variable(int id, const char* name, int type) {
printf(" .dword %d\n", token_data);
} else if (token_type == TOKEN_BRACKET_LEFT) {
if (type & TYPE_PTR_MASK) {
eprintf("array of pointers is not supported\n");
fprintf(stderr, "array of pointers is not supported\n");
exit(1);
}
expect_token(TOKEN_NUMBER);
@ -1834,17 +1839,21 @@ void parse_global_variable(int id, const char* name, int type) {
printf(" .zero %d\n", 8);
unget_token();
}
expect_token(TOKEN_SEMICOLON);
}
void parse_global_declaration() {
int external = 0;
if (token_type == TOKEN_EXTERN) {
external = 1;
next_token();
}
int is_const_int = 1;
if (token_type != TOKEN_CONST) {
is_const_int = 0;
}
int type = parse_type();
if (type < 0) {
eprintf("expecting type for global declaration\n");
fprintf(stderr, "expecting type for global declaration\n");
exit(1);
}
if (type != TYPE_INT) {
@ -1854,7 +1863,7 @@ void parse_global_declaration() {
int id = token_data;
char* name = id_table + id_lut[id];
next_token();
if (is_const_int && token_type == TOKEN_ASSIGN) {
if (!external && is_const_int && token_type == TOKEN_ASSIGN) {
expect_token(TOKEN_NUMBER);
const_table[id] = token_data;
is_const[id] = 1;
@ -1863,9 +1872,18 @@ void parse_global_declaration() {
declare_global(id, MARKER_FUNCTION, type);
parse_function(name);
} else {
if (type == TYPE_VOID) {
fprintf(stderr, "variable cannot be of void type\n");
exit(1);
}
declare_global(id, MARKER_SCALAR, type);
if (external) {
unget_token();
} else {
parse_global_variable(id, name, type);
}
expect_token(TOKEN_SEMICOLON);
}
}
void parse_top_level() {

14
boot.sh
View File

@ -1,12 +1,10 @@
mkdir -p build && cd build &&
cat ../boot-lib.h ../boot.c | sed '/^#/d' > boot-all.c &&
gcc ../boot.c ../boot-lib.c -o gcc.out &&
./gcc.out < boot-all.c > boot1.s &&
riscv64-linux-gnu-gcc-12 -static boot1.s ../boot-lib.c -o boot1.out &&
qemu-riscv64 boot1.out < boot-all.c > boot2.s &&
riscv64-linux-gnu-gcc-12 -static boot2.s ../boot-lib.c -o boot2.out &&
qemu-riscv64 boot2.out < boot-all.c > boot3.s
gcc ../boot.c -o gcc.out &&
./gcc.out < ../boot.c > boot1.s &&
riscv64-linux-gnu-gcc-12 -static boot1.s -o boot1.out &&
qemu-riscv64 boot1.out < ../boot.c > boot2.s &&
riscv64-linux-gnu-gcc-12 -static boot2.s -o boot2.out &&
qemu-riscv64 boot2.out < ../boot.c > boot3.s
cmp --silent boot1.s boot2.s && echo "boot1.s == boot2.s" || echo "boot1.s != boot2.s"
cmp --silent boot2.s boot3.s && echo "boot2.s == boot3.s" || echo "boot2.s != boot3.s"
cmp --silent boot1.s boot3.s && echo "boot1.s == boot3.s" || echo "boot1.s != boot3.s"
rm boot-all.c