Ark Compiler#
一个简单的编译器,本科时几个人合作写的,起的名字比较好玩。
实现了 parser、lexer、AST、符号表,以及在此之上的简单类型检查、结构体、重载、指针等高阶特性,虽然比较简单但功能相对完整。不过其实应该算一个编译器前端,能够将 Pascal 语言编译成一种三地址中间码。
例:一个 gcd 算法(最大公约数),输入两个整数,输出它们的最大公约数。涉及到指针、函数、递归、重载等必要特性。
program example(input,output);
var x,y:integer;
function gcd(a,b:integer):integer;
begin
if b=0 then gcd:=a
else gcd:=gcd(b, a mod b)
end;
begin
read(x, y);
write(gcd(x, y))
end.pascal编译结果:本来是想接入 LLVM 作为后端,但是因为时间关系没有做到,暂且输出一种 C 的三地址中间码。更复杂例子请查看附录
//prelude
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
typedef char* string;
int ____nouse = 1;
void assign_string_(string* a, string b) {
if (*a) free(*a);
char* p = malloc(strlen(b) + 1);
strcpy(p, b);
*a = p;
}
string add_string_(string a, string b) {
char* c = malloc(strlen(a) + strlen(b) + 1);
strcpy(c, a);
strcpy(c + strlen(a), b);
return c;
}
string new_string_(const char* s) {
string p = malloc(strlen(s) + 1);
strcpy(p, s);
return p;
}
void read_integer_integer(int* a2,int* a1) { scanf("%d%d", a2, a1); }
void write_integer(int a1) { printf("%d", a1); }
//global_define
int t0_0=0;
int t0_1=0;
//pre_struct
//pre_array
//struct
//init_global_var
void init_global_var_() {
}
//main
int x;
int y;
int gcd_integer_integer(int a, int b)
{
int __ret;
int t2_0;
(t2_0)=(b)==(t0_0);
if (t2_0)goto L0;
{
int t3_0;
(t3_0)=(a)%(b);
int t3_1;
(t3_1)=gcd_integer_integer((b),(t3_0));
(__ret)=(t3_1);
}
goto L1;
L0: ____nouse=1;;
{
(__ret)=(a);
}
L1: ____nouse=1;;
return __ret;
}
int main()
{
init_global_var_();
int __ret;
read_integer_integer(&(x),&(y));
int t2_0;
(t2_0)=gcd_integer_integer((x),(y));
write_integer((t2_0));
return t0_1;
return __ret;
}c设计#
Lexer#
词法分析器使用了 Flex 生成的有限自动机,将源代码转为 token 流,并检查词法错误。
Parser#
虽然有考虑过用 LR(1) 文法的分析器,但是在参考 Rust 实现及 LLVM 官方教程后,考虑到 LR(1) 功能还是过于僵硬、不易调试、错误恢复困难等问题,最终权衡利弊下还是使用了手写的递归下降分析器。
Parser 会将 token 流转为我们内部的 AST,并检查语法错误。
AST#
我负责实现了 AST、符号表与后续的中间代码生成。
AST 中包含了一些节点,有
- AST:基类
- BlockAST:代码块。由于 Pascal 的代码块不返回值,所以单独对待。
- ExprAST:带有返回值的表达式。
- TypeDeclAST:类型声明表达式的基类。
- BasicTypeAST:基本类型,如 int。
- TypeDefAST:自定义类型。
- PointerTypeDeclAST:定义指针类型。
- ArrayTypeDeclAST:定义数组类型。
- NumberExprAST:数字常量。
- StringExprAST:字符串常量。
- CharExprAST:字符常量。
- VariableExprAST:变量引用。
- UnaryExprAST:一元运算符。
- BinaryExprAST:二元运算符。
- ReturnAST:返回语句。
- CallExprAST:函数调用。
- VariableDeclAST:变量声明。
- ForStatementAST:for 语句。
- WhileStatementAST:while 语句。
- IfStatementAST:if 语句。
- FunctionSignatureAST:函数签名,不包含函数体。
- FunctionAST:函数定义。
- StructDeclAST:结构体定义。
- GlobalAST:全局 AST。
符号表与描述符#
编译器实现了符号表,实现了作用域等必要功能。
// 作用域下的符号表
class _SymbolTable {
int next_slot;
std::map<std::string, SymbolDescriptor*> refType;
std::map<std::string, VariableDescriptor*> refVar;
std::map<std::pair<SymbolDescriptor*, int>, std::string> hasArrayType;
std::map<SymbolDescriptor*, std::string> hasPointerType;
// ...
};
// 用于实现作用域嵌套
class SymbolTable {
private:
static _SymbolTable* root;
static _SymbolTable* current;
// ...
};
// 地址符号表
class TagTable {
static int next_slot;
// ...
};c描述符包括
- SymbolDescriptor:基类
- TypeDescriptor:类型描述符
- PointerTypeDescriptor:指针类型描述符
- ArrayTypeDescriptor:数组类型描述符
- VariableDescriptor:变量描述符
- StructDescriptor:结构体描述符
- FunctionDescriptor:函数描述符
中间代码生成#
代码生成主要由 Dispatcher 在 AST 上遍历完成。AST 定义了 dispatcher 的遍历顺序,并由 dispatcher 负责生成代码。这种设计使得未来可以相对方便地替换 dispatcher 从而对接其他后端,比如 LLVM(虽然最后还是没时间换了)。生成过程中将检查语义错误。
dispatcher 将代码插入至 Collector,并按照中间代码的结构顺序作最后编排,编译为中间代码。后续由 gcc 编译为目标程序。
以及一点可视化#
这个纯属课程作业的附带需求。由小组另一位兄弟 ↗完成:https://gr.yiren.lu/ ↗
开源#
代码于 github 公开:https://github.com/Harmaney/ark_compiler ↗
团体名字和项目名字只是因为有趣,没有特别的意思。
不过代码质量写得没那么好,建议只编译后玩玩 files 文件下附带的 examples。
附录:更多例子#
以下附带二维数组、0-1 背包、快速排序三个复杂样例。其他更多样例可查看开源地址。
二维数组#
源代码#
program Hello;
var a:array[1..10,10..20] of integer;
begin
a[1,14]:=2;
a[2,12]:=3;
a[10,20]:=a[1,14]+a[2,12];
write(a[10,20]);
end.pascal中间代码#
//prelude
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
typedef char* string;
int ____nouse = 1;
void assign_string_(string* a, string b) {
if (*a) free(*a);
char* p = malloc(strlen(b) + 1);
strcpy(p, b);
*a = p;
}
string add_string_(string a, string b) {
char* c = malloc(strlen(a) + strlen(b) + 1);
strcpy(c, a);
strcpy(c + strlen(a), b);
return c;
}
string new_string_(const char* s) {
string p = malloc(strlen(s) + 1);
strcpy(p, s);
return p;
}
void write_integer(int a1) { printf("%d", a1); }
//global_define
int t0_2=1;
int t0_3=14;
int t0_4=2;
int t0_5=2;
int t0_6=12;
int t0_7=3;
int t0_8=10;
int t0_9=20;
int t0_10=1;
int t0_11=14;
int t0_12=2;
int t0_13=12;
int t0_14=10;
int t0_15=20;
int t0_16=0;
//pre_struct
//pre_array
typedef int t0_0[11];
typedef t0_0 t0_1[10];
//struct
//init_global_var
void init_global_var_() {
}
//main
t0_1 a;
int main()
{
init_global_var_();
int __ret;
int t2_0=(t2_0)-1;
t0_0* t2_1;
t2_1=&(a)[(t0_2)];
int t2_2=(t2_2)-10;
int* t2_3;
t2_3=&(*t2_1)[(t0_3)];
(*t2_3)=(t0_4);
int t2_4=(t2_4)-1;
t0_0* t2_5;
t2_5=&(a)[(t0_5)];
int t2_6=(t2_6)-10;
int* t2_7;
t2_7=&(*t2_5)[(t0_6)];
(*t2_7)=(t0_7);
int t2_8=(t2_8)-1;
t0_0* t2_9;
t2_9=&(a)[(t0_8)];
int t2_10=(t2_10)-10;
int* t2_11;
t2_11=&(*t2_9)[(t0_9)];
int t2_12=(t2_12)-1;
t0_0* t2_13;
t2_13=&(a)[(t0_10)];
int t2_14=(t2_14)-10;
int* t2_15;
t2_15=&(*t2_13)[(t0_11)];
int t2_16=(t2_16)-1;
t0_0* t2_17;
t2_17=&(a)[(t0_12)];
int t2_18=(t2_18)-10;
int* t2_19;
t2_19=&(*t2_17)[(t0_13)];
int t2_20;
(t2_20)=(*t2_15)+(*t2_19);
(*t2_11)=(t2_20);
int t2_21=(t2_21)-1;
t0_0* t2_22;
t2_22=&(a)[(t0_14)];
int t2_23=(t2_23)-10;
int* t2_24;
t2_24=&(*t2_22)[(t0_15)];
write_integer((*t2_24));
return t0_16;
return __ret;
}c0-1 背包#
源代码#
program backpack;
var c,n:longint;f,w,v:array[0..1005] of longint;
var i,j:longint;
function max(i,j:longint):longint;
begin
if i<j then max:=j
else max:=i;
end;
begin
read(c,n);
for i:=1 to n do
read(w[i],v[i]);
for i:=1 to n do
begin
j:=C;
while j>=w[i] do
begin
f[j]:=max(f[j],f[j-w[i]]+v[i]);
j:=j-1;
end;
end;
write(f[c]);
end.pascal中间代码#
//prelude
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
typedef char* string;
int ____nouse = 1;
void assign_string_(string* a, string b) {
if (*a) free(*a);
char* p = malloc(strlen(b) + 1);
strcpy(p, b);
*a = p;
}
string add_string_(string a, string b) {
char* c = malloc(strlen(a) + strlen(b) + 1);
strcpy(c, a);
strcpy(c + strlen(a), b);
return c;
}
string new_string_(const char* s) {
string p = malloc(strlen(s) + 1);
strcpy(p, s);
return p;
}
void read_longint_longint(int* a2,int* a1) { scanf("%d%d", a2, a1); }
void write_longint(int a1) { printf("%d", a1); }
//global_define
int t0_3=1;
int t0_4=1;
int t0_5=1;
int t0_6=0;
//pre_struct
//pre_array
typedef int t0_0[1006];
typedef int t0_1[1006];
typedef int t0_2[1006];
//struct
//init_global_var
void init_global_var_() {
}
//main
int i;
int j;
int c;
int n;
t0_0 f;
t0_1 w;
t0_2 v;
int max_longint_longint(int i, int j)
{
int __ret;
int t2_0;
(t2_0)=(i)<(j);
if (t2_0)goto L0;
{
(__ret)=(i);
}
goto L1;
L0: ____nouse=1;;
{
(__ret)=(j);
}
L1: ____nouse=1;;
return __ret;
}
int main()
{
init_global_var_();
int __ret;
read_longint_longint(&(c),&(n));
i = t0_3;
if (i>n)goto L2;
L3: ____nouse=1;;
{
int t3_0=(t3_0)-0;
int* t3_1;
t3_1=&(w)[(i)];
int t3_2=(t3_2)-0;
int* t3_3;
t3_3=&(v)[(i)];
read_longint_longint(&(*t3_1),&(*t3_3));
}
i++;
if (i<=n)goto L3;
L2: ____nouse=1;;
i = t0_4;
if (i>n)goto L4;
L5: ____nouse=1;;
{
(j)=(__ret);
int t3_0=(t3_0)-0;
int* t3_1;
t3_1=&(w)[(i)];
int t3_2;
(t3_2)=(j)>=(*t3_1);
if (!t3_2)goto L6;
L7: ____nouse=1;;
{
int t4_0=(t4_0)-0;
int* t4_1;
t4_1=&(f)[(j)];
int t4_2=(t4_2)-0;
int* t4_3;
t4_3=&(f)[(j)];
int t4_4=(t4_4)-0;
int* t4_5;
t4_5=&(w)[(i)];
int t4_6;
(t4_6)=(j)-(*t4_5);
int t4_7=(t4_7)-0;
int* t4_8;
t4_8=&(f)[(t4_6)];
int t4_9=(t4_9)-0;
int* t4_10;
t4_10=&(v)[(i)];
int t4_11;
(t4_11)=(*t4_8)+(*t4_10);
int t4_12;
(t4_12)=max_longint_longint((*t4_3),(t4_11));
(*t4_1)=(t4_12);
int t4_13;
(t4_13)=(j)-(t0_5);
(j)=(t4_13);
}
int t3_3=(t3_3)-0;
int* t3_4;
t3_4=&(w)[(i)];
int t3_5;
(t3_5)=(j)>=(*t3_4);
if (t3_5)goto L7;
L6: ____nouse=1;;
}
i++;
if (i<=n)goto L5;
L4: ____nouse=1;;
int t2_0=(t2_0)-0;
int* t2_1;
t2_1=&(f)[(c)];
write_longint((*t2_1));
return t0_6;
return __ret;
}c快速排序#
源代码#
program quicksort(input,output);
var a:array[0..200000] of int64;
var n:int64;i:longint;
procedure swap(var i,j:int64);
var t:int64;
begin
t:=i;
i:=j;
j:=t;
end;
procedure qsort(l,r:int64);
var i,j,key:int64;
var m:int64;
begin
if l>=r then exit();
i:=l;
j:=r;
m:=(l+r) div 2;
key:=a[m];
swap(a[m],a[l]);
while i<>j do
begin
while (a[j]>=key) and (i<j) do j:=j-1;
while (a[i]<=key) and (i<j) do i:=i+1;
swap(a[i],a[j]);
end;
swap(a[l],a[i]);
while (l < i) and (a[i] = a[i-1]) do i := i - 1;
while (j < r) and (a[j] = a[j+1]) do j := j + 1;
qsort(l,i-1);
qsort(j+1,r);
end;
begin
read(n);
for i:=1 to n do
read(a[i]);
qsort(1,n);
for i:=1 to n do
begin
write(a[i]);
if (i <> n) then write(' ');
end;
write('\n');
end.pascal中间代码#
//prelude
#include "stdio.h"
#include "stdlib.h"
#include "string.h"
typedef char* string;
int ____nouse = 1;
void assign_string_(string* a, string b) {
if (*a) free(*a);
char* p = malloc(strlen(b) + 1);
strcpy(p, b);
*a = p;
}
string add_string_(string a, string b) {
char* c = malloc(strlen(a) + strlen(b) + 1);
strcpy(c, a);
strcpy(c + strlen(a), b);
return c;
}
string new_string_(const char* s) {
string p = malloc(strlen(s) + 1);
strcpy(p, s);
return p;
}
void read_int64(long long* a1) { scanf("%lld", a1); }
void write_int64(long long a1) { printf("%lld", a1); }
void write_string(string a1) { printf("%s", a1); }
//global_define
int t0_1=2;
int t0_2=1;
int t0_3=1;
int t0_4=1;
int t0_5=1;
int t0_6=1;
int t0_7=1;
int t0_8=1;
int t0_9=1;
int t0_10=1;
int t0_11=1;
int t0_12=1;
int t0_13=1;
int t0_14=1;
string t0_15 = NULL;
string t0_16 = NULL;
int t0_17=0;
//pre_struct
//pre_array
typedef long long t0_0[200001];
//struct
//init_global_var
void init_global_var_() {
t0_15= new_string_(" ");
t0_16= new_string_("\n");
}
//main
long long n;
int i;
t0_0 a;
void swap_int64_int64(long long* i, long long* j)
{
long long t;
(t)=(*i);
(*i)=(*j);
(*j)=(t);
int __ret;
}
void qsort_int64_int64(long long l, long long r)
{
long long key;
long long j;
long long i;
long long m;
long long t2_0;
(t2_0)=(l)>=(r);
if (t2_0)goto L0;
{
}
goto L1;
L0: ____nouse=1;;
{
return ;
}
L1: ____nouse=1;;
(i)=(l);
(j)=(r);
long long t2_1;
(t2_1)=(l)+(r);
long long t2_2;
(t2_2)=(t2_1)/(t0_1);
(m)=(t2_2);
long long t2_3=(t2_3)-0;
long long* t2_4;
t2_4=&(a)[(m)];
(key)=(*t2_4);
long long t2_5=(t2_5)-0;
long long* t2_6;
t2_6=&(a)[(m)];
long long t2_7=(t2_7)-0;
long long* t2_8;
t2_8=&(a)[(l)];
swap_int64_int64(&(*t2_6),&(*t2_8));
long long t2_9;
(t2_9)=(i)!=(j);
if (!t2_9)goto L2;
L3: ____nouse=1;;
{
long long t3_0=(t3_0)-0;
long long* t3_1;
t3_1=&(a)[(j)];
long long t3_2;
(t3_2)=(*t3_1)>=(key);
long long t3_3;
(t3_3)=(i)<(j);
long long t3_4;
(t3_4)=(t3_2)&&(t3_3);
if (!t3_4)goto L4;
L5: ____nouse=1;;
{
long long t4_0;
(t4_0)=(j)-(t0_2);
(j)=(t4_0);
}
long long t3_5=(t3_5)-0;
long long* t3_6;
t3_6=&(a)[(j)];
long long t3_7;
(t3_7)=(*t3_6)>=(key);
long long t3_8;
(t3_8)=(i)<(j);
long long t3_9;
(t3_9)=(t3_7)&&(t3_8);
if (t3_9)goto L5;
L4: ____nouse=1;;
long long t3_10=(t3_10)-0;
long long* t3_11;
t3_11=&(a)[(i)];
long long t3_12;
(t3_12)=(*t3_11)<=(key);
long long t3_13;
(t3_13)=(i)<(j);
long long t3_14;
(t3_14)=(t3_12)&&(t3_13);
if (!t3_14)goto L6;
L7: ____nouse=1;;
{
long long t4_0;
(t4_0)=(i)+(t0_3);
(i)=(t4_0);
}
long long t3_15=(t3_15)-0;
long long* t3_16;
t3_16=&(a)[(i)];
long long t3_17;
(t3_17)=(*t3_16)<=(key);
long long t3_18;
(t3_18)=(i)<(j);
long long t3_19;
(t3_19)=(t3_17)&&(t3_18);
if (t3_19)goto L7;
L6: ____nouse=1;;
long long t3_20=(t3_20)-0;
long long* t3_21;
t3_21=&(a)[(i)];
long long t3_22=(t3_22)-0;
long long* t3_23;
t3_23=&(a)[(j)];
swap_int64_int64(&(*t3_21),&(*t3_23));
}
long long t2_10;
(t2_10)=(i)!=(j);
if (t2_10)goto L3;
L2: ____nouse=1;;
long long t2_11=(t2_11)-0;
long long* t2_12;
t2_12=&(a)[(l)];
long long t2_13=(t2_13)-0;
long long* t2_14;
t2_14=&(a)[(i)];
swap_int64_int64(&(*t2_12),&(*t2_14));
long long t2_15;
(t2_15)=(l)<(i);
long long t2_16=(t2_16)-0;
long long* t2_17;
t2_17=&(a)[(i)];
long long t2_18;
(t2_18)=(i)-(t0_4);
long long t2_19=(t2_19)-0;
long long* t2_20;
t2_20=&(a)[(t2_18)];
long long t2_21;
(t2_21)=(*t2_17)==(*t2_20);
long long t2_22;
(t2_22)=(t2_15)&&(t2_21);
if (!t2_22)goto L8;
L9: ____nouse=1;;
{
long long t3_0;
(t3_0)=(i)-(t0_5);
(i)=(t3_0);
}
long long t2_23;
(t2_23)=(l)<(i);
long long t2_24=(t2_24)-0;
long long* t2_25;
t2_25=&(a)[(i)];
long long t2_26;
(t2_26)=(i)-(t0_6);
long long t2_27=(t2_27)-0;
long long* t2_28;
t2_28=&(a)[(t2_26)];
long long t2_29;
(t2_29)=(*t2_25)==(*t2_28);
long long t2_30;
(t2_30)=(t2_23)&&(t2_29);
if (t2_30)goto L9;
L8: ____nouse=1;;
long long t2_31;
(t2_31)=(j)<(r);
long long t2_32=(t2_32)-0;
long long* t2_33;
t2_33=&(a)[(j)];
long long t2_34;
(t2_34)=(j)+(t0_7);
long long t2_35=(t2_35)-0;
long long* t2_36;
t2_36=&(a)[(t2_34)];
long long t2_37;
(t2_37)=(*t2_33)==(*t2_36);
long long t2_38;
(t2_38)=(t2_31)&&(t2_37);
if (!t2_38)goto L10;
L11: ____nouse=1;;
{
long long t3_0;
(t3_0)=(j)+(t0_8);
(j)=(t3_0);
}
long long t2_39;
(t2_39)=(j)<(r);
long long t2_40=(t2_40)-0;
long long* t2_41;
t2_41=&(a)[(j)];
long long t2_42;
(t2_42)=(j)+(t0_9);
long long t2_43=(t2_43)-0;
long long* t2_44;
t2_44=&(a)[(t2_42)];
long long t2_45;
(t2_45)=(*t2_41)==(*t2_44);
long long t2_46;
(t2_46)=(t2_39)&&(t2_45);
if (t2_46)goto L11;
L10: ____nouse=1;;
long long t2_47;
(t2_47)=(i)-(t0_10);
qsort_int64_int64((l),(t2_47));
long long t2_48;
(t2_48)=(j)+(t0_11);
qsort_int64_int64((t2_48),(r));
int __ret;
}
int main()
{
init_global_var_();
int __ret;
read_int64(&(n));
i = t0_12;
if (i>n)goto L12;
L13: ____nouse=1;;
{
int t3_0=(t3_0)-0;
long long* t3_1;
t3_1=&(a)[(i)];
read_int64(&(*t3_1));
}
i++;
if (i<=n)goto L13;
L12: ____nouse=1;;
qsort_int64_int64((t0_13),(n));
i = t0_14;
if (i>n)goto L14;
L15: ____nouse=1;;
{
int t3_0=(t3_0)-0;
long long* t3_1;
t3_1=&(a)[(i)];
write_int64((*t3_1));
int t3_2;
(t3_2)=(i)!=(n);
if (t3_2)goto L16;
{
}
goto L17;
L16: ____nouse=1;;
{
write_string((t0_15));
}
L17: ____nouse=1;;
}
i++;
if (i<=n)goto L15;
L14: ____nouse=1;;
write_string((t0_16));
return t0_17;
return __ret;
}c