diff --git a/README.md b/README.md index f2c2324..7b37e2a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # RVBTCC -- 约 1500 行的轻量级自举编译器。 +- 约 1800 行的轻量级自举编译器。 - 编译器和自举编译器行为一致。 - 语法类似 C,输出 RISC-V 汇编。 - 依赖几个 libc 函数用于输入输出。 @@ -64,7 +64,21 @@ $ sh boot.sh 本语言包含的关键字即为支持的标量类型的关键字和流程控制的关键字,还有 `const`。 -`const` 关键字可以在类型中使用,但会被直接忽略。支持它是为了更好兼容 C 程序。 +### `const` 关键字 + +`const` 关键字可以在类型中使用,在大部分情况下会被直接忽略。支持它是为了更好兼容 C 程序。 + +但是当在出现 + +- 全局,标量(即不是数组) +- 类型为 `const int` 或 `const int const` +- 带有初始化 + +的声明时,将会被解析为整数常量。 + +整数常量在使用的时候会被直接替换为对应的右值,失去作为全局变量左值的性质。 + +使用 `int const` 或 `int` 可以避免这样的特殊处理。 ### 支持六个基本类型 @@ -75,13 +89,11 @@ $ sh boot.sh | `int` | `int*` | - 注意指针类型不是复合得来的,而是被视作整体。因此也不存在二重指针。 - - 函数和数组不是类型系统的一部分。 - 可以认为数组的类型就是其元素对应的指针类型。 - 函数的参数类型和个数不会检查,返回值会参与类型检查。 - 函数名只能被用于调用,函数调用被视为初等表达式。 - 数组只支持一维数组,且数组的元素不能是指针类型。 -- 全局变量不能是指针类型。 - 整数和字符字面量的类型是 `int`,字符串字面量的类型是 `char*` ### 支持的流程控制 @@ -116,8 +128,6 @@ $ sh boot.sh - 算术运算的结果总是被提升为 `int` 类型。布尔值用 `int` 类型表示。 - 由于空指针就是 `0`,因此指针和整数之间的比较运算没有禁止。 - 逻辑与和逻辑或支持短路求值。 -- 表达式没有左值和右值之分。可以认为右值总是存在一个临时的变量中。 -- 赋值不检查类型。强制类型转换可以用赋值给特定类型的变量实现。 ### 其它支持与不支持 diff --git a/boot.c b/boot.c index 5fbab63..0586617 100644 --- a/boot.c +++ b/boot.c @@ -145,6 +145,24 @@ int parse_string() { return string_lut_size++; } +void rewind_string(int new_data) { + string_offset = string_lut[token_data]; + token_data = new_data; + --string_lut_size; +} + +void dedup_string() { + int last_string = string_lut_size - 1; + char* latest = string_table + string_lut[last_string]; + for (int i = 0; i < last_string; i++) { + char* candidate = string_table + string_lut[i]; + if (streq(candidate, latest)) { + rewind_string(i); + return; + } + } +} + char id_table[65536]; int id_offset; int id_lut[4096]; @@ -361,18 +379,16 @@ void next_token() { } else if (ch == '"') { token_type = TOKEN_STRING; token_data = parse_string(); + dedup_string(); } else if (ch == '.') { - int ch2 = getchar(); - if (ch2 == '.') { - int ch3 = getchar(); - if (ch3 == '.') { + token_type = 0; + if (getchar() == '.') { + if (getchar() == '.') { token_type = TOKEN_ELLIPSIS; - } else { - eprintf("unexpected character: %c\n", ch3); - exit(1); } - } else { - eprintf("unexpected character: %c\n", ch2); + } + if (token_type != TOKEN_ELLIPSIS) { + eprintf("expecting '...'\n"); exit(1); } } else if (is_digit(ch)) { @@ -429,8 +445,8 @@ int parse_type() { int epilog_label; int local_table[4096]; // id -> local id -int next_local_id = 2; -int max_local_id = 2; +int next_local_id = 1; +int max_local_id = 1; const int MARKER_TEMP = 0; const int MARKER_SCALAR = 1; @@ -441,47 +457,143 @@ int local_marker[4096]; int global_marker[4096]; int local_type[4096]; int global_type[4096]; + +int reg_type[4096]; +int next_reg_id = 18; +int max_reg_id = 18; int indirection[4096]; +int overflow[4096]; + +int const_table[4096]; // id -> value +int is_const[4096]; + +const int REG_ZERO = 0; +const int REG_RA = 1; +const int REG_SP = 2; +const int REG_GP = 3; +const int REG_TP = 4; +const int REG_T0 = 5; +const int REG_T1 = 6; +const int REG_T2 = 7; +const int REG_FP = 8; +const int REG_S1 = 9; +const int REG_A0 = 10; +const int REG_A1 = 11; +const int REG_A2 = 12; +const int REG_A3 = 13; +const int REG_A4 = 14; +const int REG_A5 = 15; +const int REG_A6 = 16; +const int REG_A7 = 17; +const int REG_S2 = 18; +const int REG_S3 = 19; +const int REG_S4 = 20; +const int REG_S5 = 21; +const int REG_S6 = 22; +const int REG_S7 = 23; +const int REG_S8 = 24; +const int REG_S9 = 25; +const int REG_S10 = 26; +const int REG_S11 = 27; +const int REG_T3 = 28; +const int REG_T4 = 29; +const int REG_T5 = 30; +const int REG_T6 = 31; + +void reset_reg() { + next_reg_id = REG_S2; + for (int i = 0; i < 4096; ++i) { + reg_type[i] = TYPE_VOID; + indirection[i] = 0; + overflow[i] = 0; + } + reg_type[REG_ZERO] = TYPE_INT; +} + +const char* reg_name(int reg) { + if (reg == 0) return "zero"; + if (reg == 1) return "ra"; + if (reg == 2) return "sp"; + if (reg == 3) return "gp"; + if (reg == 4) return "tp"; + if (reg == 5) return "t0"; + if (reg == 6) return "t1"; + if (reg == 7) return "t2"; + if (reg == 8) return "fp"; + // reserved begin + if (reg == 9) return "s1"; + if (reg == 10) return "a0"; + if (reg == 11) return "a1"; + if (reg == 12) return "a2"; + if (reg == 13) return "a3"; + if (reg == 14) return "a4"; + if (reg == 15) return "a5"; + if (reg == 16) return "a6"; + if (reg == 17) return "a7"; + // allocation begin + if (reg == 18) return "s2"; + if (reg == 19) return "s3"; + if (reg == 20) return "s4"; + if (reg == 21) return "s5"; + if (reg == 22) return "s6"; + if (reg == 23) return "s7"; + if (reg == 24) return "s8"; + if (reg == 25) return "s9"; + if (reg == 26) return "s10"; + if (reg == 27) return "s11"; + if (reg == 28) return "t3"; + if (reg == 29) return "t4"; + if (reg == 30) return "t5"; + if (reg == 31) return "t6"; + // overflow begin + return 0; +} + +int is_overflow(int reg) { + return reg > REG_T6; +} void reset_local() { - next_local_id = 2; - max_local_id = 2; + next_local_id = 1; + max_local_id = 1; + max_reg_id = REG_S2; for (int i = 0; i < 4096; ++i) { local_table[i] = 0; local_marker[i] = MARKER_TEMP; local_type[i] = TYPE_VOID; - indirection[i] = 0; } + reset_reg(); } void reset_temp() { - while (next_local_id > 2 && local_marker[next_local_id - 1] == MARKER_TEMP) { + while (next_local_id > 1 && local_marker[next_local_id - 1] == MARKER_TEMP) { --next_local_id; } + reset_reg(); } -int next_reg(int type) { - int reg = next_local_id++; - local_type[reg] = type; - indirection[reg] = 0; +int next_local_slot(int type) { + int slot = next_local_id++; + local_type[slot] = type; if (next_local_id > max_local_id) { max_local_id = next_local_id; } - return reg; + return slot; } int declare_local(int id, int type) { if (local_table[id] != 0) return local_table[id]; - int reg = next_reg(type); - local_marker[reg] = MARKER_SCALAR; - return local_table[id] = reg; + int slot = next_local_slot(type); + local_marker[slot] = MARKER_SCALAR; + return local_table[id] = slot; } int declare_local_array(int id, int type, int size) { if (local_table[id] != 0) return local_table[id]; - int reg; - for (int i = 0; i < size; ++i) local_marker[reg = next_reg(type)] = MARKER_ARRAY; - return local_table[id] = reg; + int slot = next_local_slot(type); + local_marker[slot] = MARKER_ARRAY; + for (int i = 1; i < size; ++i) local_marker[next_local_slot(type)] = MARKER_ARRAY; + return local_table[id] = slot; } void declare_global(int id, int marker, int type) { @@ -489,6 +601,23 @@ void declare_global(int id, int marker, int type) { global_type[id] = type; } +int next_reg(int type) { + int reg = next_reg_id++; + if (is_overflow(reg)) { + int slot = next_local_slot(type); + local_marker[slot] = MARKER_TEMP; + overflow[reg] = slot; + } + reg_type[reg] = type; + if (next_reg_id > max_reg_id) { + max_reg_id = next_reg_id; + } + return reg; +} + + +// prolog & epilog helpers + int check_itype_immediate(int value) { return value >= -2048 && value <= 2047; } @@ -522,87 +651,215 @@ void asm_addi(const char* rd, const char* rs, int imm) { } } -void load_address(int rd, int id) { - if (id == -1) { - eprintf("void cannot be arithmetically operated\n"); - exit(1); - } - int offset = -id * 8 - 8; - if (indirection[id]) { - if (check_itype_immediate(offset)) { - printf(" ld t%d, %d(fp) # indirection\n", rd, offset); - } else { - printf(" li t%d, %d\n", rd, offset); - printf(" add t%d, fp, t%d\n", rd, rd); - printf(" ld t%d, 0(t%d) # indirection\n", rd, rd); - } +// assembly helpers + +// address loaders +// rd must be one of t0, t1, t2 +void load_local_address(int rd, int slot_id) { + int offset = slot_id * 8 - 8; + const char* rd_name = reg_name(rd); + if (check_itype_immediate(offset)) { + printf(" addi %s, sp, %d\n", rd_name, offset); } else { - if (check_itype_immediate(offset)) { - printf(" addi t%d, fp, %d\n", rd, offset); - } else { - printf(" li t%d, %d\n", rd, offset); - printf(" add t%d, fp, t%d\n", rd, rd); + printf(" li %s, %d\n", rd_name, offset); + printf(" add %s, sp, %s\n", rd_name, rd_name); + } +} + +const char* load_op_of_type(int type) { + if (type & TYPE_PTR_MASK) { + return "ld"; + } else if (type == TYPE_CHAR) { + return "lb"; + } else { // int + return "lw"; + } +} + +const char* store_op_of_type(int type) { + if (type & TYPE_PTR_MASK) { + return "sd"; + } else if (type == TYPE_CHAR) { + return "sb"; + } else { // int + return "sw"; + } +} + +// load a non-trivial register into t0, t1 or t2 +// rd must be one of t0, t1, t2 +void load(int rd, int reg) { + const char* op = load_op_of_type(reg_type[reg]); + const char* rd_name = reg_name(rd); + if (is_overflow(reg)) { + load_local_address(rd, overflow[reg]); + if (indirection[reg]) { + printf(" ld %s, 0(%s)\n", rd_name, rd_name); } + reg = rd; + } + printf(" %s %s, 0(%s) # load non-trivial register\n", op, rd_name, reg_name(reg)); +} + +// store t0 into a non-trivial register +void store_t0(int reg) { + const char* op = store_op_of_type(reg_type[reg]); + if (is_overflow(reg)) { + load_local_address(REG_T2, overflow[reg]); + if (indirection[reg]) { + printf(" ld t2, 0(t2)\n"); + } + reg = REG_T2; + } + printf(" %s t0, 0(%s) # store non-trivial register\n", op, reg_name(reg)); +} + +int is_nontrivial(int reg) { + return is_overflow(reg) || indirection[reg]; +} + +void _asm_r(const char* op, int rd, int rs1) { + const char* rd_name = reg_name(rd); + const char* rs1_name = reg_name(rs1); + if (is_nontrivial(rd)) rd_name = "t0"; + if (is_nontrivial(rs1)) { + rs1_name = "t0"; + load(REG_T0, rs1); + } + if (!(streq(op, "mv") && streq(rd_name, rs1_name))) + printf(" %s %s, %s\n", op, rd_name, rs1_name); + if (is_nontrivial(rd)) { + store_t0(rd); } } -void load(int rd, int id) { - load_address(rd, id); - int type = local_type[id]; - const char* op = "lw"; // int - if (type == TYPE_CHAR) { - op = "lb"; - } else if (type & TYPE_PTR_MASK) { - op = "ld"; +void _asm_rr(const char* op, int rd, int rs1, int rs2) { + const char* rd_name = reg_name(rd); + const char* rs1_name = reg_name(rs1); + const char* rs2_name = reg_name(rs2); + if (is_nontrivial(rd)) rd_name = "t0"; + if (is_nontrivial(rs1)) { + rs1_name = "t0"; + load(REG_T0, rs1); } - printf(" %s t%d, 0(t%d) # id: type %d\n", op, rd, rd, type); -} - -void store_t0(int id) { - load_address(1, id); - int type = local_type[id]; - const char* op = "sw"; // int - if (type == TYPE_CHAR) { - op = "sb"; - } else if (type & TYPE_PTR_MASK) { - op = "sd"; + if (is_nontrivial(rs2)) { + rs2_name = "t1"; + load(REG_T1, rs2); + } + printf(" %s %s, %s, %s\n", op, rd_name, rs1_name, rs2_name); + if (is_nontrivial(rd)) { + store_t0(rd); } - printf(" %s t0, 0(t1) # id: type %d\n", op, type); } -int materialize_t0(int type) { - int reg = next_reg(type); - store_t0(reg); - return reg; +void _asm_ri(const char* op, int rd, int rs1, int imm) { + const char* rd_name = reg_name(rd); + const char* rs1_name = reg_name(rs1); + if (is_nontrivial(rd)) rd_name = "t0"; + if (is_nontrivial(rs1)) { + rs1_name = "t0"; + load(REG_T0, rs1); + } + printf(" %s %s, %s, %d\n", op, rd_name, rs1_name, imm); + if (is_nontrivial(rd)) { + store_t0(rd); + } } -int dereference(int reg) { - local_type[reg] = local_type[reg] & ~TYPE_PTR_MASK; - indirection[reg] = 1; +void _asm_branch(const char* op, int rs1, int label) { + const char* rs1_name = reg_name(rs1); + if (is_nontrivial(rs1)) { + rs1_name = "t0"; + load(REG_T0, rs1); + } + printf(" %s %s, L%d\n", op, rs1_name, label); +} + +void _asm_i(const char* op, int rd, const char* prefix1, const char* prefix2, int imm) { + const char* rd_name = reg_name(rd); + if (is_nontrivial(rd)) rd_name = "t0"; + printf(" %s %s, %s%s%d\n", op, rd_name, prefix1, prefix2, imm); + if (is_nontrivial(rd)) { + store_t0(rd); + } +} + +int is_not_reusable(int rs1, int expected_type) { + return indirection[rs1] || reg_type[rs1] != expected_type || rs1 == REG_ZERO; +} + +int asm_r(int type, const char* op, int rs1) { + int rd = rs1; + if (is_not_reusable(rs1, type)) rd = next_reg(type); + _asm_r(op, rd, rs1); + return rd; +} + +int asm_rr(int type, const char* op, int rs1, int rs2) { + int rd = rs1; + if (is_not_reusable(rs1, type)) rd = rs2; + if (is_not_reusable(rs2, type)) rd = next_reg(type); + _asm_rr(op, rd, rs1, rs2); + return rd; +} + +void asm_mv(int rd, int rs1) { + _asm_r("mv", rd, rs1); +} + +void store_into_local(int rs1, int slot) { + const char* rs1_name = reg_name(rs1); + if (is_nontrivial(rs1)) { + rs1_name = "t0"; + load(REG_T0, rs1); + } + load_local_address(REG_T2, slot); + printf(" %s %s, 0(t2)\n", store_op_of_type(local_type[slot]), rs1_name); +} + +int materialize_address(int rd, int type, int marker) { + if (marker == MARKER_ARRAY) { + type = type | TYPE_PTR_MASK; + } + reg_type[rd] = type; + indirection[rd] = marker == MARKER_SCALAR; + return rd; +} + +int lookup_from_slot(int slot) { + int reg = next_reg(TYPE_VOID_PTR); + if (is_nontrivial(reg)) { + load_local_address(REG_T0, slot); + asm_mv(reg, REG_T0); + } else { + load_local_address(reg, slot); + } + return materialize_address(reg, local_type[slot], local_marker[slot]); +} + +int load_imm(int imm) { + if (imm == 0) return REG_ZERO; + int reg = next_reg(TYPE_INT); + _asm_i("li", reg, "", "", imm); return reg; } int lookup(int id) { - int local = local_table[id]; - if (local) { - if (local_marker[local] == MARKER_ARRAY) { - load_address(0, local); - return materialize_t0(local_type[local] | TYPE_PTR_MASK); - } - return local; + if (local_table[id]) { + return lookup_from_slot(local_table[id]); + } + if (is_const[id]) { + return load_imm(const_table[id]); } const char* name = id_table + id_lut[id]; if (global_marker[id]) { if (global_marker[id] == MARKER_FUNCTION) { eprintf("function name must not appear outside function call: %s\n", name); exit(1); - } - printf(" la t0, %s # id: %d\n", name, id); - int reg = materialize_t0(global_type[id] | TYPE_PTR_MASK); - if (global_marker[id] == MARKER_SCALAR) { - reg = dereference(reg); - } - return reg; + } + int reg = next_reg(TYPE_VOID_PTR); + _asm_i("la", reg, name, " # id: ", id); + return materialize_address(reg, global_type[id], global_marker[id]); } eprintf("unresolved identifier: %s\n", name); exit(1); @@ -619,65 +876,33 @@ int asm_label(int label) { return label; } -int is_not_reusable(int rs1, int expected_type) { - return indirection[rs1] || local_marker[rs1] != MARKER_TEMP || local_type[rs1] != expected_type; -} - -int asm_r(const char* op, int rs1) { - load(0, rs1); - printf(" %s t0, t0\n", op); - int rd = rs1; - if (is_not_reusable(rs1, TYPE_INT)) { - rd = next_reg(TYPE_INT); - } - store_t0(rd); - return rd; -} - int asm_r_arith(const char* op, int rs1) { - if (local_type[rs1] & TYPE_PTR_MASK) { + if (reg_type[rs1] & TYPE_PTR_MASK) { eprintf("pointer cannot be arithmetically operated by %s\n", op); exit(1); } - return asm_r(op, rs1); -} - -int asm_rr(const char* op, int rs1, int rs2) { - load(0, rs1); - load(1, rs2); - printf(" %s t0, t0, t1\n", op); - int rd = rs1; - if (is_not_reusable(rd, TYPE_INT)) { - rd = rs2; - if (is_not_reusable(rd, TYPE_INT)) { - rd = next_reg(TYPE_INT); - } - } - store_t0(rd); - return rd; + return asm_r(TYPE_INT, op, rs1); } int asm_rr_arith(const char* op, int rs1, int rs2) { - if (local_type[rs1] & TYPE_PTR_MASK || local_type[rs2] & TYPE_PTR_MASK) { + if (reg_type[rs1] & TYPE_PTR_MASK || reg_type[rs2] & TYPE_PTR_MASK) { eprintf("pointer cannot be arithmetically operated by %s\n", op); exit(1); } - return asm_rr(op, rs1, rs2); + return asm_rr(TYPE_INT, op, rs1, rs2); } int asm_rr_cmp(const char* op, int rs1, int rs2) { - // since NULL is virtually 0, it is considered valid example of a pointer comparing with an integer - return asm_rr(op, rs1, rs2); + // since NULL is virtually 0, it is considered a valid example of a pointer comparing with an integer + return asm_rr(TYPE_INT, op, rs1, rs2); } void asm_beqz(int rs1, int label) { - load(0, rs1); - printf(" beqz t0, L%d\n", label); + _asm_branch("beqz", rs1, label); } void asm_bnez(int rs1, int label) { - load(0, rs1); - printf(" bnez t0, L%d\n", label); + _asm_branch("bnez", rs1, label); } void asm_j(int label) { @@ -714,15 +939,9 @@ int step_of(int type) { return 1; } -void asm_shift_t0(const char* op, int type) { - if (type == TYPE_INT_PTR) { - printf(" %s t0, t0, 2\n", op); - } -} - int asm_add(int lhs, int rhs) { - int type1 = local_type[lhs] & TYPE_PTR_MASK; - int type2 = local_type[rhs] & TYPE_PTR_MASK; + int type1 = reg_type[lhs] & TYPE_PTR_MASK; + int type2 = reg_type[rhs] & TYPE_PTR_MASK; if (type1 != type2) { int ptr; int idx; @@ -733,27 +952,26 @@ int asm_add(int lhs, int rhs) { ptr = rhs; idx = lhs; } - int ptr_type = local_type[ptr]; + int ptr_type = reg_type[ptr]; if (ptr_type == TYPE_VOID_PTR) { eprintf("void pointer cannot be arithmetically operated\n"); exit(1); } - load(0, idx); - load(1, ptr); - asm_shift_t0("slli", ptr_type); - printf(" add t0, t0, t1\n"); - return materialize_t0(ptr_type); + int offset = next_reg(TYPE_INT); + int shift = 2 * (ptr_type == TYPE_INT_PTR); + _asm_ri("slli", offset, idx, shift); + return asm_rr(ptr_type, "add", ptr, offset); } if (type1 && type2) { eprintf("operands of addition cannot be both pointers\n"); exit(1); } - return asm_rr("add", lhs, rhs); + return asm_rr(TYPE_INT, "add", lhs, rhs); } int asm_sub(int lhs, int rhs) { - int lhs_type = local_type[lhs]; - int rhs_type = local_type[rhs]; + int lhs_type = reg_type[lhs]; + int rhs_type = reg_type[rhs]; int type1 = lhs_type & TYPE_PTR_MASK; int type2 = rhs_type & TYPE_PTR_MASK; if (type1 && type2) { @@ -765,17 +983,36 @@ int asm_sub(int lhs, int rhs) { eprintf("void pointer cannot be arithmetically operated\n"); exit(1); } - load(0, lhs); - load(1, rhs); - printf(" sub t0, t0, t1\n"); - asm_shift_t0("srai", lhs_type); - return materialize_t0(TYPE_INT); + int difference = asm_rr(TYPE_INT, "sub", lhs, rhs); + int shift = 2 * (lhs_type == TYPE_INT_PTR); + _asm_ri("slli", difference, difference, shift); + return difference; } if (type1) { int neg = asm_r_arith("neg", rhs); return asm_add(lhs, neg); } - return asm_rr("sub", lhs, rhs); + return asm_rr_arith("sub", lhs, rhs); +} + +int dereference(int reg) { + if (indirection[reg]) { + load(reg, reg); + } else { + indirection[reg] = 1; + } + reg_type[reg] = reg_type[reg] & ~TYPE_PTR_MASK; + return reg; +} + +int addressof(int reg) { + if (indirection[reg] && !(reg_type[reg] & TYPE_PTR_MASK)) { + reg_type[reg] = reg_type[reg] | TYPE_PTR_MASK; + indirection[reg] = 0; + } else { + printf("cannot take address of this expression"); + } + return reg; } // parser @@ -811,14 +1048,24 @@ int parse_function_call(int id) { } } for (int i = 0; i < arg; ++i) { - load(0, args[i]); - printf(" mv a%d, t0\n", i); + asm_mv(i + REG_A0, args[i]); + } + for (int i = REG_T3; i <= REG_T6; ++i) { + if (i < max_reg_id) { + asm_sd(reg_name(i), (REG_S2 - i) * 8 - 24, "fp"); + } } printf(" call %s\n", name); + for (int i = REG_T3; i <= REG_T6; ++i) { + if (i < max_reg_id) { + asm_ld(reg_name(i), (REG_S2 - i) * 8 - 24, "fp"); + } + } int type = global_type[id]; if (type != TYPE_VOID) { - printf(" mv t0, a0\n"); - return materialize_t0(type); + int rd = next_reg(type); + asm_mv(rd, REG_A0); + return rd; } return -1; } @@ -828,8 +1075,7 @@ int parse_primary_expr() { if (token_type == TOKEN_EOF) { exit(1); } else if (token_type == TOKEN_NUMBER) { - printf(" li t0, %d\n", token_data); - return materialize_t0(TYPE_INT); + return load_imm(token_data); } else if (token_type == TOKEN_ID) { next_token(); if (token_type == TOKEN_PAREN_LEFT) { @@ -838,8 +1084,9 @@ int parse_primary_expr() { unget_token(); return lookup(token_data); } else if (token_type == TOKEN_STRING) { - printf(" la t0, .LC%d\n", token_data); - return materialize_t0(TYPE_CHAR_PTR); + int reg = next_reg(TYPE_CHAR_PTR); + _asm_i("la", reg, ".LC", "", token_data); + return reg; } else if (token_type == TOKEN_PAREN_LEFT) { int reg = parse_expr(); expect_token(TOKEN_PAREN_RIGHT); @@ -855,20 +1102,16 @@ int parse_postfix_expr() { while (1) { next_token(); if (token_type == TOKEN_INC) { - int type = local_type[lhs]; + int type = reg_type[lhs]; int reg = next_reg(type); - load(0, lhs); - store_t0(reg); - printf(" addi t0, t0, %d\n", step_of(type)); - store_t0(lhs); + asm_mv(reg, lhs); + _asm_ri("addi", lhs, lhs, step_of(type)); lhs = reg; } else if (token_type == TOKEN_DEC) { - int type = local_type[lhs]; + int type = reg_type[lhs]; int reg = next_reg(type); - load(0, lhs); - store_t0(reg); - printf(" addi t0, t0, -%d\n", step_of(type)); - store_t0(lhs); + asm_mv(reg, lhs); + _asm_ri("addi", lhs, lhs, -step_of(type)); lhs = reg; } else if (token_type == TOKEN_BRACKET_LEFT) { int rhs = parse_expr(); @@ -886,16 +1129,15 @@ int parse_prefix_expr() { next_token(); if (token_type == TOKEN_AND) { int reg = parse_postfix_expr(); - int type = local_type[reg]; + int type = reg_type[reg]; if (type & TYPE_PTR_MASK) { eprintf("cannot take address of a pointer\n"); exit(1); } - load_address(0, reg); - return materialize_t0(type | TYPE_PTR_MASK); + return addressof(reg); } else if (token_type == TOKEN_STAR) { int reg = parse_postfix_expr(); - int type = local_type[reg]; + int type = reg_type[reg]; if (!(type & TYPE_PTR_MASK)) { eprintf("cannot dereference a non-pointer\n"); exit(1); @@ -904,8 +1146,7 @@ int parse_prefix_expr() { eprintf("cannot dereference void pointer\n"); exit(1); } - load(0, reg); - return dereference(materialize_t0(type)); + return dereference(reg); } else if (token_type == TOKEN_MINUS) { int reg = parse_postfix_expr(); return asm_r_arith("neg", reg); @@ -914,18 +1155,14 @@ int parse_prefix_expr() { return asm_r_arith("not", reg); } else if (token_type == TOKEN_NOT) { int reg = parse_postfix_expr(); - return asm_r("seqz", reg); + return asm_r(TYPE_INT, "seqz", reg); } else if (token_type == TOKEN_INC) { int reg = parse_postfix_expr(); - load(0, reg); - printf(" addi t0, t0, %d\n", step_of(local_type[reg])); - store_t0(reg); + _asm_ri("addi", reg, reg, step_of(reg_type[reg])); return reg; } else if (token_type == TOKEN_DEC) { int reg = parse_postfix_expr(); - load(0, reg); - printf(" addi t0, t0, -%d\n", step_of(local_type[reg])); - store_t0(reg); + _asm_ri("addi", reg, reg, -step_of(reg_type[reg])); return reg; } else { unget_token(); @@ -1003,11 +1240,11 @@ int parse_cmp_expr() { } else if (token_type == TOKEN_LE) { int rhs = parse_shift_expr(); int sgt = asm_rr_cmp("sgt", lhs, rhs); - lhs = asm_r("seqz", sgt); + lhs = asm_r(TYPE_INT, "seqz", sgt); } else if (token_type == TOKEN_GE) { int rhs = parse_shift_expr(); int slt = asm_rr_cmp("slt", lhs, rhs); - lhs = asm_r("seqz", slt); + lhs = asm_r(TYPE_INT, "seqz", slt); } else { unget_token(); break; @@ -1023,11 +1260,11 @@ int parse_eq_expr() { if (token_type == TOKEN_EQ) { int rhs = parse_cmp_expr(); int xor0 = asm_rr_cmp("xor", lhs, rhs); - lhs = asm_r("seqz", xor0); + lhs = asm_r(TYPE_INT, "seqz", xor0); } else if (token_type == TOKEN_NE) { int rhs = parse_cmp_expr(); int xor0 = asm_rr_cmp("xor", lhs, rhs); - lhs = asm_r("snez", xor0); + lhs = asm_r(TYPE_INT, "snez", xor0); } else { unget_token(); break; @@ -1084,48 +1321,58 @@ int parse_bitwise_or_expr() { int parse_logical_and_expr() { int lhs = parse_bitwise_or_expr(); - int label = next_label(); - int label_used = 0; + int logical = 0; + int label; + int result; while (1) { next_token(); if (token_type == TOKEN_LAND) { - lhs = asm_r("snez", lhs); - asm_beqz(lhs, label); + if (!logical) { + logical = 1; + label = next_label(); + result = next_reg(TYPE_INT); + _asm_r("snez", result, lhs); + } + asm_beqz(result, label); int rhs = parse_bitwise_or_expr(); - rhs = asm_r("snez", rhs); - lhs = asm_rr("and", lhs, rhs); - label_used = 1; + _asm_r("snez", result, rhs); } else { unget_token(); break; } } - if (label_used) { + if (logical) { asm_label(label); + return result; } return lhs; } int parse_logical_or_expr() { int lhs = parse_logical_and_expr(); - int label = next_label(); - int label_used = 0; + int logical = 0; + int label; + int result; while (1) { next_token(); if (token_type == TOKEN_LOR) { - lhs = asm_r("snez", lhs); - asm_bnez(lhs, label); + if (!logical) { + logical = 1; + label = next_label(); + result = next_reg(TYPE_INT); + _asm_r("snez", result, lhs); + } + asm_bnez(result, label); int rhs = parse_logical_and_expr(); - rhs = asm_r("snez", rhs); - lhs = asm_rr("or", lhs, rhs); - label_used = 1; + _asm_r("snez", result, rhs); } else { unget_token(); break; } } - if (label_used) { + if (logical) { asm_label(label); + return result; } return lhs; } @@ -1135,8 +1382,7 @@ int parse_assign_expr() { next_token(); if (token_type == TOKEN_ASSIGN) { int rhs = parse_assign_expr(); - load(0, rhs); - store_t0(lhs); + asm_mv(lhs, rhs); return lhs; } else { unget_token(); @@ -1150,7 +1396,7 @@ int parse_expr() { void parse_local_variable(int type) { if (type == TYPE_VOID) { - eprintf("local variable of void type is not supported\n"); + eprintf("variable cannot be of void type\n"); exit(1); } expect_token(TOKEN_ID); @@ -1158,17 +1404,16 @@ void parse_local_variable(int type) { next_token(); if (token_type == TOKEN_BRACKET_LEFT) { if (type & TYPE_PTR_MASK) { - eprintf("local variable of array of pointers is not supported\n"); + eprintf("array of pointers is not supported\n"); exit(1); } expect_token(TOKEN_NUMBER); int size = token_data; expect_token(TOKEN_BRACKET_RIGHT); declare_local_array(id, type, size); - next_token(); - } else { - declare_local(id, type); - } + return; + } + int slot = declare_local(id, type); if (token_type == TOKEN_SEMICOLON) { unget_token(); return; @@ -1176,8 +1421,11 @@ void parse_local_variable(int type) { unget_token(); expect_token(TOKEN_ASSIGN); int reg = parse_expr(); - load(0, reg); - store_t0(local_table[id]); + if (type != reg_type[reg]) { + eprintf("type mismatch in assignment\n"); + exit(1); + } + store_into_local(reg, slot); } void parse_stmt(); @@ -1290,9 +1538,8 @@ void parse_stmt() { return; } unget_token(); - int reg = parse_expr(); - load(0, reg); - printf(" mv a0, t0\n"); + int rs1 = parse_expr(); + asm_mv(REG_A0, rs1); asm_j(epilog_label); } else if (token_type == TOKEN_BREAK) { int label = asm_get_break_label(); @@ -1335,7 +1582,7 @@ void parse_function(const char* name) { } int arg_type = parse_type(); if (arg_type < 0 || arg_type == TYPE_VOID) { - eprintf("unexpected a non-void argument type"); + eprintf("unexpected a non-void argument type: %d\n", arg_type); exit(1); } expect_token(TOKEN_ID); @@ -1345,11 +1592,15 @@ void parse_function(const char* name) { expect_token(TOKEN_BRACKET_RIGHT); next_token(); if (arg_type & TYPE_PTR_MASK) { - eprintf("local variable of array of pointers is not supported\n"); + eprintf("array of pointers is not supported\n"); exit(1); } arg_type = arg_type | TYPE_PTR_MASK; } + if (arg >= 8) { + eprintf("too many arguments\n"); + exit(1); + } args[arg++] = declare_local(token_data, arg_type); if (token_type == TOKEN_COMMA) { // continue; @@ -1383,7 +1634,10 @@ void parse_function(const char* name) { parse_stmt(); } asm_j(epilog_label); - int frame_size = max_local_id * 8; + int reg_used = max_reg_id - REG_S2; + if (reg_used > 14) reg_used = 14; + int frame_size = (max_local_id - 1 + reg_used + 2) * 8; + if (reg_used > 10) reg_used = 10; if (frame_size % 16 != 0) { frame_size = frame_size + 8; } @@ -1392,27 +1646,30 @@ void parse_function(const char* name) { asm_addi("sp", "sp", -frame_size); asm_sd("ra", frame_size - 8, "sp"); asm_sd("fp", frame_size - 16, "sp"); + for (int i = 0; i < reg_used; ++i) { + int reg = REG_S2 + i; + asm_sd(reg_name(reg), frame_size - 24 - i * 8, "sp"); + } asm_addi("fp", "sp", frame_size); for (int i = 0; i < arg; ++i) { - printf(" mv t0, a%d\n", i); - store_t0(args[i]); + store_into_local(REG_A0 + i, args[i]); } asm_j(label); // epilog asm_label(epilog_label); - asm_ld("fp", frame_size - 16, "sp"); asm_ld("ra", frame_size - 8, "sp"); + asm_ld("fp", frame_size - 16, "sp"); + for (int i = 0; i < reg_used; ++i) { + int reg = REG_S2 + i; + asm_ld(reg_name(reg), frame_size - 24 - i * 8, "sp"); + } asm_addi("sp", "sp", frame_size); printf(" ret\n"); } void parse_global_variable(int id, const char* name, int type) { if (type == TYPE_VOID) { - eprintf("global variable of void type is not supported\n"); - exit(1); - } - if (type & TYPE_PTR_MASK) { - eprintf("global variable of pointer is not supported\n"); + eprintf("variable cannot be of void type\n"); exit(1); } printf(".data\n"); @@ -1423,6 +1680,10 @@ void parse_global_variable(int id, const char* name, int type) { expect_token(TOKEN_NUMBER); printf(" .word %d\n", token_data); } else if (token_type == TOKEN_BRACKET_LEFT) { + if (type & TYPE_PTR_MASK) { + eprintf("array of pointers is not supported\n"); + exit(1); + } expect_token(TOKEN_NUMBER); int size = token_data; expect_token(TOKEN_BRACKET_RIGHT); @@ -1440,16 +1701,28 @@ void parse_global_variable(int id, const char* name, int type) { } void parse_global_declaration() { + int is_const_int = 1; + if (token_type != TOKEN_CONST) { + is_const_int = 0; + } int type = parse_type(); if (type < 0) { eprintf("expecting type for global declaration\n"); exit(1); } + if (type != TYPE_INT) { + is_const_int = 0; + } expect_token(TOKEN_ID); int id = token_data; char* name = id_table + id_lut[id]; next_token(); - if (token_type == TOKEN_PAREN_LEFT) { + if (is_const_int && token_type == TOKEN_ASSIGN) { + expect_token(TOKEN_NUMBER); + const_table[id] = token_data; + is_const[id] = 1; + expect_token(TOKEN_SEMICOLON); + } else if (token_type == TOKEN_PAREN_LEFT) { declare_global(id, MARKER_FUNCTION, type); parse_function(name); } else { diff --git a/boot.sh b/boot.sh index 91909ec..c4a4ecb 100644 --- a/boot.sh +++ b/boot.sh @@ -5,7 +5,7 @@ gcc ../boot.c ../boot-lib.c -o gcc.out && riscv64-linux-gnu-gcc-12 -static boot1.s ../boot-lib.c -o boot1.out && qemu-riscv64 boot1.out < boot-all.c > boot2.s && riscv64-linux-gnu-gcc-12 -static boot2.s ../boot-lib.c -o boot2.out && -qemu-riscv64 boot2.out < boot-all.c > boot3.s && +qemu-riscv64 boot2.out < boot-all.c > boot3.s cmp --silent boot1.s boot2.s && echo "boot1.s == boot2.s" || echo "boot1.s != boot2.s" cmp --silent boot2.s boot3.s && echo "boot2.s == boot3.s" || echo "boot2.s != boot3.s" cmp --silent boot1.s boot3.s && echo "boot1.s == boot3.s" || echo "boot1.s != boot3.s" diff --git a/demo/add.c b/demo/add.c new file mode 100644 index 0000000..6c54b5f --- /dev/null +++ b/demo/add.c @@ -0,0 +1,17 @@ +int printf(const char format[], ...); +int scanf(const char format[], ...); +int putchar(int ch); + +int* p; +int f1() { + int a = 1; + return *(a+(a+(a+(a+(a+(a+(a+(a+(a+(a+(p))))))))))); // a[10] +} + + +int main() { + int a[15]; + p = a; + for (int i = 0; i < 15; ++i) a[i] = i; + return f1(); +} \ No newline at end of file diff --git a/demo/lut.c b/demo/lut.c new file mode 100644 index 0000000..f5b23ba --- /dev/null +++ b/demo/lut.c @@ -0,0 +1,47 @@ +int printf(const char format[], ...); +int getchar(); + +char string_table[65536]; +int string_offset; +int string_lut[4096]; +int string_lut_size; + +int parse_string() { + int offset = string_offset; + int ch; + while ((ch = getchar()) != '"') { + if (ch == -1 || ch == '\n') { + printf("expecting '\"'\n"); + return 1; + } + string_table[string_offset++] = ch; + } + string_table[string_offset++] = 0; + string_lut[string_lut_size] = offset; + return string_lut_size++; +} + + +int streq(const char* s1, const char* s2) { + while (*s1 && *s2 && *s1 == *s2) { + s1++; + s2++; + } + return *s1 == *s2; +} + +void dump_string_table() { + printf(".data\n"); + for (int i = 0; i < string_lut_size; ++i) { + char* id = string_table + string_lut[i]; + printf(".LC%d: .string \"%s\", const: %d\n", + i, id, streq(id, "const")); + } +} + +int main() { + char ch; + while ((ch = getchar()) == '"') parse_string(); + dump_string_table(); + return 0; +} \ No newline at end of file diff --git a/demo/parse.c b/demo/parse.c new file mode 100644 index 0000000..98b7e9d --- /dev/null +++ b/demo/parse.c @@ -0,0 +1,19 @@ +int getchar(); + + +int is_digit(int ch) { + return '0' <= ch && ch <= '9'; +} + +int parse_int(int ch) { + int num = ch - '0'; + while (is_digit(ch = getchar())) { + num = num * 10; + num = num + ch - '0'; + } + return num; +} + +int main() { + return parse_int(getchar()); +} \ No newline at end of file diff --git a/demo/strcmp.c b/demo/strcmp.c new file mode 100644 index 0000000..6d4793e --- /dev/null +++ b/demo/strcmp.c @@ -0,0 +1,17 @@ +int printf(const char* format, ...); + +int strcmp(const char* s1, const char* s2) { + while (*s1 && *s2 && *s1 == *s2) { + s1++; + s2++; + } + return *s1 - *s2; +} + +int main() { + const char* s1 = "helloworld"; + const char* s2 = "world"; + printf("%d\n", strcmp(s1, s2)); + printf("%d\n", strcmp(s1 + 5, s2)); + return 0; +} \ No newline at end of file