R5900簡易デコンパイラを作ってみた

PS2にはR5900というCPUが搭載されている。PS2エミュレータはこのR5900の機械語をリコンパイルして実行したりするのだが、現在世界で最も有名なエミュレータの一つであるPCSX2のリコンパイラのソースの一部を読んだところ、機械語が物凄い勢いで機械語にリコンパイルされていた。時間がなくて深くは読んでいないのだが、ものすごく複雑だった。これをもう少し簡単に行うことができないかと考えてみた。
自分が考えたアイデアはR5900の機械語を一度C言語デコンパイルし、それをx86/x64バイナリを吐くことができるコンパイラで再度コンパイルするという物。上手く行けばCPUの拡張命令セットも簡単に使えるようになるはず。ということで、R5900のでコンパイラを作って実験してみた。ただし、機械語を読むのは面倒なので、R5900のアセンブラからCを吐かせるようにした。
まず用意したテストコードがこちら。

#define SIZE (2048)
#define MAX (1 << 20)
static int GRAPH[SIZE][SIZE];

inline unsigned long xor128() {
	static unsigned long x=123456789,y=362436069,z=521288629,w=88675123;
	unsigned long t;
	t=(x^(x<<11));x=y;y=z;z=w; return( w=(w^(w>>19))^(t^(t>>8)) );
}

main()
{
	int i, j, k;
	for (i = 0; i < SIZE; ++i){
		for (j = 0; j < SIZE; ++j){
			GRAPH[i][j] = xor128() % MAX;
		}
	}
	for (k = 0; k < SIZE; ++k){
		for (i = 0; i < SIZE; ++i){
			for (j = 0; j < SIZE; ++j){
				if (GRAPH[i][j] > GRAPH[i][k] + GRAPH[k][j]){
					GRAPH[i][j] = GRAPH[i][k] + GRAPH[k][j];
				}
			}
		}
	}
}

枝のコストをランダムに決定したグラフの全点間の最短距離をWarshall-Floyd法を用いて調べるというコード。特に変わったところはない。これをPS2DEV.ORG配布されているtoolchainを用いてコンパイルしてアセンブラを吐かせた結果がこちら。

	.file	1 "benchmark.c"
	.section .mdebug.eabi64
	.previous
	.sdata
	.align	3
	.type	x.0,@object
	.size	x.0,8
x.0:
	.dword	0x75bcd15
	.align	3
	.type	y.1,@object
	.size	y.1,8
y.1:
	.dword	0x159a55e5
	.align	3
	.type	z.2,@object
	.size	z.2,8
z.2:
	.dword	0x1f123bb5
	.align	3
	.type	w.3,@object
	.size	w.3,8
w.3:
	.dword	0x5491333
	.text
	.align	2
	.p2align 3,,7
	.globl	main
	.ent	main
main:
	.frame	$sp,0,$31		# vars= 0, regs= 0/0, args= 0, extra= 0
	.mask	0x00000000,0
	.fmask	0x00000000,0
	lui	$2,%hi(GRAPH) # high
	ld	$8,x.0
	ld	$10,y.1
	addiu	$15,$2,%lo(GRAPH) # low
	ld	$9,z.2
	move	$14,$0
	ld	$6,w.3
	dli	$13,0xfffff		# 1048575
	sll	$2,$14,13
$L41:
	li	$7,2047			# 0x7ff
	addu	$5,$2,$15
	.p2align 3
$L11:
	dsll	$3,$8,11
	addu	$7,$7,-1
	xor	$3,$8,$3
	move	$8,$10
	dsrl	$2,$3,8
	move	$10,$9
	xor	$3,$3,$2
	move	$9,$6
	dsrl	$2,$6,19
	move	$11,$10
	xor	$2,$6,$2
	xor	$2,$2,$3
	move	$6,$2
	and	$2,$2,$13
	dsll	$2,$2,32
	dsra	$2,$2,32
	sw	$2,0($5)
	.set	noreorder
	.set	nomacro
	bgez	$7,$L11
	addu	$5,$5,4
	.set	macro
	.set	reorder

	addu	$14,$14,1
	slt	$2,$14,2048
	.set	noreorder
	.set	nomacro
	bne	$2,$0,$L41
	sll	$2,$14,13
	.set	macro
	.set	reorder

	lui	$2,%hi(GRAPH) # high
	sd	$6,w.3
	sd	$11,y.1
	addiu	$10,$2,%lo(GRAPH) # low
	sd	$8,x.0
	sd	$9,z.2
	move	$9,$0
	sll	$2,$9,2
$L42:
	move	$14,$0
	addu	$8,$2,$10
	sll	$11,$9,13
$L27:
	sll	$2,$14,13
	addu	$6,$11,$10
	addu	$5,$2,$10
	li	$7,2047			# 0x7ff
	.p2align 3
$L26:
	lw	$4,0($6)
	addu	$7,$7,-1
	lw	$3,0($8)
	lw	$2,0($5)
	addu	$3,$3,$4
	slt	$2,$3,$2
	.set	noreorder
	.set	nomacro
	beq	$2,$0,$L23
	addu	$6,$6,4
	.set	macro
	.set	reorder

	sw	$3,0($5)
$L23:
	.set	noreorder
	.set	nomacro
	bgez	$7,$L26
	addu	$5,$5,4
	.set	macro
	.set	reorder

	addu	$14,$14,1
	slt	$2,$14,2048
	.set	noreorder
	.set	nomacro
	bne	$2,$0,$L27
	addu	$8,$8,8192
	.set	macro
	.set	reorder

	addu	$9,$9,1
	slt	$2,$9,2048
	.set	noreorder
	.set	nomacro
	bnel	$2,$0,$L42
	sll	$2,$9,2
	.set	macro
	.set	reorder

	j	$31
	.end	main
$Lfe1:
	.size	main,$Lfe1-main
	.align	2
	.p2align 3,,7
	.globl	xor128
	.ent	xor128
xor128:
	.frame	$sp,0,$31		# vars= 0, regs= 0/0, args= 0, extra= 0
	.mask	0x00000000,0
	.fmask	0x00000000,0
	ld	$2,x.0
	ld	$5,w.3
	dsll	$3,$2,11
	xor	$2,$2,$3
	dsrl	$4,$5,19
	dsrl	$3,$2,8
	xor	$4,$5,$4
	xor	$2,$2,$3
	ld	$3,y.1
	xor	$4,$4,$2
	ld	$2,z.2
	sd	$3,x.0
	sd	$2,y.1
	sd	$5,z.2
	sd	$4,w.3
	.set	noreorder
	.set	nomacro
	j	$31
	move	$2,$4
	.set	macro
	.set	reorder

	.end	xor128
$Lfe2:
	.size	xor128,$Lfe2-xor128
	.section	.bss
GRAPH:
	.align	3
	.space	16777216
	.previous
	.ident	"GCC: (GNU) 3.2.2"

普段目にしない記号ばかりなので面喰ってしまった。これをC言語デコンパイルするプログラムのソースがこちら。

#include <cstdio>

#include <string>
#include <sstream>
#include <map>
#include <iostream>

using namespace std;

#define REGIST_COMMAND(x) lineProcessors[#x]=x

typedef void (*processLine)(const string& arg0, const string& arg1, const string& arg2);
static map<string, processLine> lineProcessors;
static char slot[1024] = {0};
static char delaySlot[1024] = {0};

string replace(const string& s, char from, char to)
{
	string result = s;
	for (string::iterator it = result.begin(); it != result.end(); ++it){
		if (*it == from){
			*it = to;
		}
	}
	return result;
}

string parseArgS64(const string& s)
{
	char buffer[1024];
	if (s[0] == '$'){
		sprintf(buffer, "registers.S64[%s]", s.c_str() + 1);
	} else {
		sprintf(buffer, "%s", s.c_str());
	}
	return buffer;
}

string parseArgU32(const string& s)
{
	char buffer[1024];
	if (s[0] == '$'){
		sprintf(buffer, "registers.U32[%s][0]", s.c_str() + 1);
	} else {
		sprintf(buffer, "%s", s.c_str());
	}
	return buffer;
}

void ld(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	sprintf(slot, "\tregisters.U64[%d] = *(u64*)(&RAM[%s]);\n", r0, replace(arg1, '.', '_').c_str());
}

void move(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	const int r1 = atoi(arg1.c_str() + 1);
	sprintf(slot, "\tregisters.U64[%d] = registers.U64[%d];\n", r0, r1);
}

void dli(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	sprintf(slot, "\tregisters.U64[%d] = %sULL;\n", r0, arg1.c_str());
}

void sll(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	const int r1 = atoi(arg1.c_str() + 1);
	sprintf(slot, "\tregisters.U32[%d][0] = registers.U32[%d][0] >> %s;\n", r0, r1, arg2.c_str());
}

void li(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	sprintf(slot, "\tregisters.U64[%d] = %s;\n", r0, arg1.c_str());
}

void addu(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	const int r1 = atoi(arg1.c_str() + 1);
	const int r2 = atoi(arg2.c_str() + 1);
	sprintf(slot, "\tregisters.U32[%d][0] = %s + %s;\n", r0, parseArgU32(arg1).c_str(), parseArgU32(arg2).c_str());
}

void dsll(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	const int r1 = atoi(arg1.c_str() + 1);
	const int r2 = atoi(arg2.c_str() + 1);
	sprintf(slot, "\tregisters.U64[%d] = registers.U64[%d] << %s;\n", r0, r1, arg2.c_str());
}

void xor_(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	const int r1 = atoi(arg1.c_str() + 1);
	const int r2 = atoi(arg2.c_str() + 1);
	sprintf(slot, "\tregisters.U64[%d] = registers.U64[%d] ^ registers.U64[%d];\n", r0, r1, r2);
}

void and_(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	const int r1 = atoi(arg1.c_str() + 1);
	const int r2 = atoi(arg2.c_str() + 1);
	sprintf(slot, "\tregisters.U64[%d] = registers.U64[%d] & registers.U64[%d];\n", r0, r1, r2);
}

void dsra(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	const int r1 = atoi(arg1.c_str() + 1);
	const int r2 = atoi(arg2.c_str() + 1);
	sprintf(slot, "\tregisters.S64[%d] = registers.S64[%d] >> %s;\n", r0, r1, arg2.c_str());
}

void dsrl(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	const int r1 = atoi(arg1.c_str() + 1);
	const int r2 = atoi(arg2.c_str() + 1);
	sprintf(slot, "\tregisters.U64[%d] = registers.U64[%d] >> %s;\n", r0, r1, arg2.c_str());
}

void sw(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	istringstream iss(arg1);
	int address;
	char c;
	int r1;
	iss >> address >> c >> c >> r1;

	sprintf(slot, "\t*(u32*)(&RAM[registers.U32[%d][0] + %d]) = registers.U32[%d][0];\n", r1, address, r0);
}

void bgez(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	sprintf(delaySlot,
		"%%s"
		"\tif (registers.S32[%d][0] >= 0) {\n"
		"\t\tgoto %s;\n"
		"\t}\n", r0, arg1.c_str());
}

void slt(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	sprintf(slot, "\tregisters.U64[%d] = %s < %s ? 1 : 0;\n", r0, parseArgS64(arg1).c_str(), parseArgS64(arg2).c_str());
}

void bne(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	const int r1 = atoi(arg1.c_str() + 1);
	sprintf(delaySlot,
		"%%s"
		"\tif (registers.U64[%d] != registers.U64[%d]) {\n"
		"\t\tgoto %s;\n"
		"\t}\n", r0, r1, arg2.c_str());
}

void sd(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	sprintf(slot, "\t*(u64*)(&RAM[%s]) = registers.U64[%d];\n", replace(arg1, '.', '_').c_str(), r0);
}

void lw(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	istringstream iss(arg1);
	int address;
	char c;
	int r1;
	iss >> address >> c >> c >> r1;

	sprintf(slot, "\tregisters.S32[%d][0] = *(s32*)(&RAM[registers.U32[%d][0] + %d]);\n", r0, r1, address);
}

void beq(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	const int r1 = atoi(arg1.c_str() + 1);
	sprintf(delaySlot,
		"%%s"
		"\tif (registers.U64[%d] == registers.U64[%d]) {\n"
		"\t\tgoto %s;\n"
		"\t}\n", r0, r1, arg2.c_str());
}

void bnel(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	const int r1 = atoi(arg1.c_str() + 1);
	sprintf(delaySlot,
		"\tif (registers.U64[%d] != registers.U64[%d]) {\n"
		"\t%%s"
		"\t\tgoto %s;\n"
		"\t}\n", r0, r1, arg2.c_str());
}

void lui(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	istringstream iss(arg1);
	string val;
	char c;
	iss >> c >> c >> c >> c >> val;
	val.erase(val.size() - 1);

	sprintf(slot, "\tregisters.U32[%d][0] = (u32)%s & 0xffff0000;\n", r0, val.c_str());
}

void addiu(const string& arg0, const string& arg1, const string& arg2)
{
	const int r0 = atoi(arg0.c_str() + 1);
	const int r1 = atoi(arg1.c_str() + 1);
	istringstream iss(arg2);
	string val;
	char c;
	iss >> c >> c >> c >> c >> val;
	val.erase(val.size() - 1);

	sprintf(slot, "\tregisters.U64[%d] = registers.U64[%d] + (((u64)%s) & 0xffff);\n", r0, r1, val.c_str());
}

int main()
{
	REGIST_COMMAND(ld);
	REGIST_COMMAND(move);
	REGIST_COMMAND(dli);
	REGIST_COMMAND(sll);
	REGIST_COMMAND(li);
	REGIST_COMMAND(addu);
	REGIST_COMMAND(dsll);
	lineProcessors["xor"]=xor_;
	lineProcessors["and"]=and_;
	REGIST_COMMAND(dsra);
	REGIST_COMMAND(dsrl);
	REGIST_COMMAND(sw);
	REGIST_COMMAND(bgez);
	REGIST_COMMAND(slt);
	REGIST_COMMAND(bne);
	REGIST_COMMAND(sd);
	REGIST_COMMAND(lw);
	REGIST_COMMAND(beq);
	REGIST_COMMAND(bnel);
	REGIST_COMMAND(lui);
	REGIST_COMMAND(addiu);

	cout << "typedef unsigned char u8;" << endl;
	cout << "typedef signed char s8;" << endl;
	cout << "typedef unsigned short u16;" << endl;
	cout << "typedef signed short s16;" << endl;
	cout << "typedef unsigned long u32;" << endl;
	cout << "typedef signed long s32;" << endl;
	cout << "typedef unsigned long long u64;" << endl;
	cout << "typedef signed long long s64;" << endl;
	cout << "static u8 RAM[32 * 1024 * 1024];" << endl;
	cout << "static const int GRAPH = 0;" << endl;
	cout << "static const int x_0 = 16777216;" << endl;
	cout << "static const int y_1 = 16777216 + 8;" << endl;
	cout << "static const int z_2 = 16777216 + 8 + 8;" << endl;
	cout << "static const int w_3 = 16777216 + 8 + 8 + 8;" << endl;
	cout << "static union _registers {" << endl;
	cout << "\tu64 U64[32];" << endl;
	cout << "\ts64 S64[32];" << endl;
	cout << "\tu32 U32[32][2];" << endl;
	cout << "\ts32 S32[32][2];" << endl;
	cout << "\tu16 U16[32][4];" << endl;
	cout << "\ts16 S16[32][4];" << endl;
	cout << "\tu8  U8 [32][8];" << endl;
	cout << "\ts8  S8 [32][8];" << endl;
	cout << "} registers;" << endl;
	cout << endl;
	cout << "int main() {" << endl;
	cout << "\t*(u64*)(&RAM[x_0]) = 0x75bcd15;" << endl;
	cout << "\t*(u64*)(&RAM[y_1]) = 0x159a55e5;" << endl;
	cout << "\t*(u64*)(&RAM[z_2]) = 0x1f123bb5;" << endl;
	cout << "\t*(u64*)(&RAM[w_3]) = 0x5491333;" << endl;

	string line;
	while (getline(cin, line)){
		for (string::iterator it = line.begin(); it != line.end(); ++it){
			if (*it == ','){
				*it = ' ';
			}
		}

		string command, arg0, arg1, arg2;
		istringstream iss(line);
		iss >> command >> arg0 >> arg1 >> arg2;

		cout << "//" << line << endl;

		if (command.empty() ||  command[0] == '.'){
			continue;
		}
		if (command[command.size() - 1] == ':'){
			cout << replace(command, '.', '_') << endl;
			continue;
		}

		if (command == "j"){
			break;
		}

		if (lineProcessors.count(command)){
			lineProcessors[command](arg0, arg1, arg2);
		} else {
			cout << "NOT FOUND : " << line << endl;
		}

		if (delaySlot[0] && slot[0]){
			printf(delaySlot, slot);
			delaySlot[0] = '\0';
			slot[0] = '\0';
		} else if (slot[0]) {
			printf("%s", slot);
			slot[0] = '\0';
		}
	}

	cout << "}" << endl;
}

必要な命令しか組み込んでいないが、とりあえずは動く。これを使って先ほどのアセンブラデコンパイルした結果がこちら。

typedef unsigned char u8;
typedef signed char s8;
typedef unsigned short u16;
typedef signed short s16;
typedef unsigned long u32;
typedef signed long s32;
typedef unsigned long long u64;
typedef signed long long s64;
static u8 RAM[32 * 1024 * 1024];
static const int GRAPH = 0;
static const int x_0 = 16777216;
static const int y_1 = 16777216 + 8;
static const int z_2 = 16777216 + 8 + 8;
static const int w_3 = 16777216 + 8 + 8 + 8;
static union _registers {
	u64 U64[32];
	s64 S64[32];
	u32 U32[32][2];
	s32 S32[32][2];
	u16 U16[32][4];
	s16 S16[32][4];
	u8  U8 [32][8];
	s8  S8 [32][8];
} registers;

int main() {
	*(u64*)(&RAM[x_0]) = 0x75bcd15;
	*(u64*)(&RAM[y_1]) = 0x159a55e5;
	*(u64*)(&RAM[z_2]) = 0x1f123bb5;
	*(u64*)(&RAM[w_3]) = 0x5491333;
//	.file	1 "benchmark.c"
//	.section .mdebug.eabi64
//	.previous
//	.sdata
//	.align	3
//	.type	x.0 @object
//	.size	x.0 8
//x.0:
x_0:
//	.dword	0x75bcd15
//	.align	3
//	.type	y.1 @object
//	.size	y.1 8
//y.1:
y_1:
//	.dword	0x159a55e5
//	.align	3
//	.type	z.2 @object
//	.size	z.2 8
//z.2:
z_2:
//	.dword	0x1f123bb5
//	.align	3
//	.type	w.3 @object
//	.size	w.3 8
//w.3:
w_3:
//	.dword	0x5491333
//	.text
//	.align	2
//	.p2align 3  7
//	.globl	main
//	.ent	main
//main:
main:
//	.frame	$sp 0 $31		# vars= 0  regs= 0/0  args= 0  extra= 0
//	.mask	0x00000000 0
//	.fmask	0x00000000 0
//	lui	$2 %hi(GRAPH) # high
	registers.U32[2][0] = (u32)GRAPH & 0xffff0000;
//	ld	$8 x.0
	registers.U64[8] = *(u64*)(&RAM[x_0]);
//	ld	$10 y.1
	registers.U64[10] = *(u64*)(&RAM[y_1]);
//	addiu	$15 $2 %lo(GRAPH) # low
	registers.U64[15] = registers.U64[2] + (((u64)GRAPH) & 0xffff);
//	ld	$9 z.2
	registers.U64[9] = *(u64*)(&RAM[z_2]);
//	move	$14 $0
	registers.U64[14] = registers.U64[0];
//	ld	$6 w.3
	registers.U64[6] = *(u64*)(&RAM[w_3]);
//	dli	$13 0xfffff		# 1048575
	registers.U64[13] = 0xfffffULL;
//	sll	$2 $14 13
	registers.U32[2][0] = registers.U32[14][0] >> 13;
//$L41:
$L41:
//	li	$7 2047			# 0x7ff
	registers.U64[7] = 2047;
//	addu	$5 $2 $15
	registers.U32[5][0] = registers.U32[2][0] + registers.U32[15][0];
//	.p2align 3
//$L11:
$L11:
//	dsll	$3 $8 11
	registers.U64[3] = registers.U64[8] << 11;
//	addu	$7 $7 -1
	registers.U32[7][0] = registers.U32[7][0] + -1;
//	xor	$3 $8 $3
	registers.U64[3] = registers.U64[8] ^ registers.U64[3];
//	move	$8 $10
	registers.U64[8] = registers.U64[10];
//	dsrl	$2 $3 8
	registers.U64[2] = registers.U64[3] >> 8;
//	move	$10 $9
	registers.U64[10] = registers.U64[9];
//	xor	$3 $3 $2
	registers.U64[3] = registers.U64[3] ^ registers.U64[2];
//	move	$9 $6
	registers.U64[9] = registers.U64[6];
//	dsrl	$2 $6 19
	registers.U64[2] = registers.U64[6] >> 19;
//	move	$11 $10
	registers.U64[11] = registers.U64[10];
//	xor	$2 $6 $2
	registers.U64[2] = registers.U64[6] ^ registers.U64[2];
//	xor	$2 $2 $3
	registers.U64[2] = registers.U64[2] ^ registers.U64[3];
//	move	$6 $2
	registers.U64[6] = registers.U64[2];
//	and	$2 $2 $13
	registers.U64[2] = registers.U64[2] & registers.U64[13];
//	dsll	$2 $2 32
	registers.U64[2] = registers.U64[2] << 32;
//	dsra	$2 $2 32
	registers.S64[2] = registers.S64[2] >> 32;
//	sw	$2 0($5)
	*(u32*)(&RAM[registers.U32[5][0] + 0]) = registers.U32[2][0];
//	.set	noreorder
//	.set	nomacro
//	bgez	$7 $L11
//	addu	$5 $5 4
	registers.U32[5][0] = registers.U32[5][0] + 4;
	if (registers.S32[7][0] >= 0) {
		goto $L11;
	}
//	.set	macro
//	.set	reorder
//
//	addu	$14 $14 1
	registers.U32[14][0] = registers.U32[14][0] + 1;
//	slt	$2 $14 2048
	registers.U64[2] = registers.S64[14] < 2048 ? 1 : 0;
//	.set	noreorder
//	.set	nomacro
//	bne	$2 $0 $L41
//	sll	$2 $14 13
	registers.U32[2][0] = registers.U32[14][0] >> 13;
	if (registers.U64[2] != registers.U64[0]) {
		goto $L41;
	}
//	.set	macro
//	.set	reorder
//
//	lui	$2 %hi(GRAPH) # high
	registers.U32[2][0] = (u32)GRAPH & 0xffff0000;
//	sd	$6 w.3
	*(u64*)(&RAM[w_3]) = registers.U64[6];
//	sd	$11 y.1
	*(u64*)(&RAM[y_1]) = registers.U64[11];
//	addiu	$10 $2 %lo(GRAPH) # low
	registers.U64[10] = registers.U64[2] + (((u64)GRAPH) & 0xffff);
//	sd	$8 x.0
	*(u64*)(&RAM[x_0]) = registers.U64[8];
//	sd	$9 z.2
	*(u64*)(&RAM[z_2]) = registers.U64[9];
//	move	$9 $0
	registers.U64[9] = registers.U64[0];
//	sll	$2 $9 2
	registers.U32[2][0] = registers.U32[9][0] >> 2;
//$L42:
$L42:
//	move	$14 $0
	registers.U64[14] = registers.U64[0];
//	addu	$8 $2 $10
	registers.U32[8][0] = registers.U32[2][0] + registers.U32[10][0];
//	sll	$11 $9 13
	registers.U32[11][0] = registers.U32[9][0] >> 13;
//$L27:
$L27:
//	sll	$2 $14 13
	registers.U32[2][0] = registers.U32[14][0] >> 13;
//	addu	$6 $11 $10
	registers.U32[6][0] = registers.U32[11][0] + registers.U32[10][0];
//	addu	$5 $2 $10
	registers.U32[5][0] = registers.U32[2][0] + registers.U32[10][0];
//	li	$7 2047			# 0x7ff
	registers.U64[7] = 2047;
//	.p2align 3
//$L26:
$L26:
//	lw	$4 0($6)
	registers.S32[4][0] = *(s32*)(&RAM[registers.U32[6][0] + 0]);
//	addu	$7 $7 -1
	registers.U32[7][0] = registers.U32[7][0] + -1;
//	lw	$3 0($8)
	registers.S32[3][0] = *(s32*)(&RAM[registers.U32[8][0] + 0]);
//	lw	$2 0($5)
	registers.S32[2][0] = *(s32*)(&RAM[registers.U32[5][0] + 0]);
//	addu	$3 $3 $4
	registers.U32[3][0] = registers.U32[3][0] + registers.U32[4][0];
//	slt	$2 $3 $2
	registers.U64[2] = registers.S64[3] < registers.S64[2] ? 1 : 0;
//	.set	noreorder
//	.set	nomacro
//	beq	$2 $0 $L23
//	addu	$6 $6 4
	registers.U32[6][0] = registers.U32[6][0] + 4;
	if (registers.U64[2] == registers.U64[0]) {
		goto $L23;
	}
//	.set	macro
//	.set	reorder
//
//	sw	$3 0($5)
	*(u32*)(&RAM[registers.U32[5][0] + 0]) = registers.U32[3][0];
//$L23:
$L23:
//	.set	noreorder
//	.set	nomacro
//	bgez	$7 $L26
//	addu	$5 $5 4
	registers.U32[5][0] = registers.U32[5][0] + 4;
	if (registers.S32[7][0] >= 0) {
		goto $L26;
	}
//	.set	macro
//	.set	reorder
//
//	addu	$14 $14 1
	registers.U32[14][0] = registers.U32[14][0] + 1;
//	slt	$2 $14 2048
	registers.U64[2] = registers.S64[14] < 2048 ? 1 : 0;
//	.set	noreorder
//	.set	nomacro
//	bne	$2 $0 $L27
//	addu	$8 $8 8192
	registers.U32[8][0] = registers.U32[8][0] + 8192;
	if (registers.U64[2] != registers.U64[0]) {
		goto $L27;
	}
//	.set	macro
//	.set	reorder
//
//	addu	$9 $9 1
	registers.U32[9][0] = registers.U32[9][0] + 1;
//	slt	$2 $9 2048
	registers.U64[2] = registers.S64[9] < 2048 ? 1 : 0;
//	.set	noreorder
//	.set	nomacro
//	bnel	$2 $0 $L42
//	sll	$2 $9 2
	if (registers.U64[2] != registers.U64[0]) {
		registers.U32[2][0] = registers.U32[9][0] >> 2;
		goto $L42;
	}
//	.set	macro
//	.set	reorder
//
//	j	$31
}

値が決め打ちだったり、スタックポインタを無視していたりと、いろいろと問題のあるコードだが、一応動くことは確認できた。それぞれの動作速度はこちら。

$ gcc benchmark.c && time ./a.exe

real    0m41.403s
user    0m41.277s
sys     0m0.030s

hisayori@tigeragon ~/recompile/r5900
$ gcc benchmark.c -O1 && time ./a.exe

real    0m18.735s
user    0m18.642s
sys     0m0.015s

hisayori@tigeragon ~/recompile/r5900
$ gcc benchmark.c -O2 && time ./a.exe

real    0m12.761s
user    0m12.682s
sys     0m0.000s

$ /cygdrive/c/ps2dev/ee/bin/ee-gcc benchmark.c -O2 -S && g++ recompile.cpp -O2
&& ./a < benchmark.s > benchmark.recompiled.c && gcc benchmark.recompiled.c -O2
 && gcc benchmark.recompiled.c -O2 -S && time ./a.exe

real    1m16.081s
user    1m15.847s
sys     0m0.031s

Visual Studio 2008も使ってみた。

PGO無し
$ time ./icpc.exe

real    0m34.398s
user    0m0.015s
sys     0m0.124s

PGO有り
real    0m30.794s
user    0m0.000s
sys     0m0.125s

ということで、Visual Studio 2008のPGOまで使えばネイティブの速度の2〜3倍程度の遅さで動くことがわかった。これってもしかして使えるんじゃね?
もうすこしPCSX2のソースを読んでリコンパイラの内容を把握してから、PCSX2のフォーラムにアイデア投稿をしてみようと思う。

PS2DEV.ORG http://ps2dev.org/
PCSX2 - PS2 Emulator for Windows and Linux http://www.pcsx2.net/