| @@ -0,0 +1,53 @@ | |||||
| // C rmsbolt starter file | |||||
| // Local Variables: | |||||
| // rmsbolt-command: "/opt/riscv/bin/riscv32-unknown-elf-gcc -O0" | |||||
| // rmsbolt-disassemble: nil | |||||
| // End: | |||||
/*
 * Maps the 2D coordinate (x, y) to a flat row-major index for rows that are
 * `dim` elements wide, i.e. x + y * dim.
 *
 * The product is built by repeated addition rather than with `*`
 * (presumably because the target core has no multiplier -- TODO confirm).
 * For y <= 0 no row offset is added and the result is just x.
 */
int lookup(int x, int y, int dim){
    int index = x;
    int rowsLeft = y;
    while(rowsLeft > 0){
        index += dim;
        rowsLeft--;
    }
    return index;
}
/*
 * Computes one output pixel of a 3x3 "convolution" over a 32-wide image.
 *
 * x, y    coordinates of the centre pixel in `image`. The neighbourhood
 *         (x-1..x+1, y-1..y+1) is read, so for a 32x32 image callers must
 *         keep 1 <= x <= 30 and 1 <= y <= 30.
 * image   input pixels, row-major, 32 ints per row.
 * output  result pixels, row-major, 30 ints per row.
 * kernel  nine shift amounts in row-major order; every neighbour is shifted
 *         left by its kernel entry and the results are summed.
 */
void convolutePixel(int x, int y, int* image, int* output, int* kernel){
    int acc = 0;
    acc += image[lookup( x - 1 , y - 1 , 32)] << kernel[0];
    acc += image[lookup( x     , y - 1 , 32)] << kernel[1];
    acc += image[lookup( x + 1 , y - 1 , 32)] << kernel[2];
    acc += image[lookup( x - 1 , y     , 32)] << kernel[3];
    acc += image[lookup( x     , y     , 32)] << kernel[4];
    acc += image[lookup( x + 1 , y     , 32)] << kernel[5];
    acc += image[lookup( x - 1 , y + 1 , 32)] << kernel[6];
    acc += image[lookup( x     , y + 1 , 32)] << kernel[7];
    acc += image[lookup( x + 1 , y + 1 , 32)] << kernel[8];
    /*
     * The output has no border, so image pixel (x, y) is output pixel
     * (x - 1, y - 1). Indexing with (x, y) would write cells 31..930 of a
     * buffer that only has cells 0..899.
     */
    output[lookup(x - 1, y - 1, 30)] = acc;
}
/*
 * Convolves every interior pixel (1..30 in both coordinates) of a 32x32
 * image and returns 0.
 *
 * Memory layout, in int elements starting at address 0, back to back:
 *   image  : 32 * 32 = 1024 ints
 *   output : 30 * 30 =  900 ints
 *   kernel : 9 ints
 *
 * The buffers are placed with pointer arithmetic so the offsets are counted
 * in ints. Casting the element counts directly, as in (int*)1024, yields
 * byte addresses: the output would then start inside the 4096-byte image
 * and run over the kernel.
 *
 * NOTE(review): with 4-byte ints, output and kernel now start at byte
 * addresses 4096 and 7696. Any harness memory initialisation that targets
 * the old byte addresses 1024 / 1924 has to move with them -- confirm.
 */
int run() {
    int* image  = (int*)0;
    int* output = image + 1024;
    int* kernel = output + 900;
    int ii;
    int kk;
    for(ii = 1; ii < 31; ii++){
        for(kk = 1; kk < 31; kk++){
            convolutePixel(ii, kk, image, output, kernel);
        }
    }
    return 0;
}
/* Program entry point: performs the whole convolution run and exits with its status. */
int main(){
    return run();
}
| @@ -0,0 +1,200 @@ | |||||
| main: | |||||
| addi sp,sp,-16 | |||||
| sw ra,12(sp) | |||||
| call run | |||||
| lw ra,12(sp) | |||||
| addi sp,sp,16 | |||||
| jr ra | |||||
| rem: | |||||
| bge a0,a1,.L7 | |||||
| ret | |||||
| .L7: | |||||
| addi sp,sp,-16 | |||||
| sw ra,12(sp) | |||||
| sub a0,a0,a1 | |||||
| call rem | |||||
| lw ra,12(sp) | |||||
| addi sp,sp,16 | |||||
| jr ra | |||||
| f1: | |||||
| addi sp,sp,-16 | |||||
| sw ra,12(sp) | |||||
| sw s0,8(sp) | |||||
| sw s1,4(sp) | |||||
| sw s2,0(sp) | |||||
| li s1,0 | |||||
| li s2,241 | |||||
| j .L9 | |||||
| .L11: | |||||
| mv a0,s0 | |||||
| .L9: | |||||
| addi s0,a0,-1 | |||||
| blez a0,.L8 | |||||
| beq s0,s2,.L8 | |||||
| li a1,10 | |||||
| mv a0,s0 | |||||
| call rem | |||||
| bnez a0,.L11 | |||||
| add s1,s1,s0 | |||||
| j .L11 | |||||
| .L8: | |||||
| mv a0,s1 | |||||
| lw ra,12(sp) | |||||
| lw s0,8(sp) | |||||
| lw s1,4(sp) | |||||
| lw s2,0(sp) | |||||
| addi sp,sp,16 | |||||
| jr ra | |||||
| f2: | |||||
| addi sp,sp,-32 | |||||
| sw ra,28(sp) | |||||
| sw s0,24(sp) | |||||
| sw s1,20(sp) | |||||
| sw s2,16(sp) | |||||
| sw s3,12(sp) | |||||
| sw s4,8(sp) | |||||
| mv s3,a0 | |||||
| li s2,0 | |||||
| li s0,0 | |||||
| li s4,3 | |||||
| .L15: | |||||
| sub a0,s3,s0 | |||||
| call f1 | |||||
| mv s1,a0 | |||||
| add a0,s0,s3 | |||||
| call f1 | |||||
| add a0,s1,a0 | |||||
| add s2,s2,a0 | |||||
| addi s0,s0,1 | |||||
| bne s0,s4,.L15 | |||||
| mv a0,s2 | |||||
| lw ra,28(sp) | |||||
| lw s0,24(sp) | |||||
| lw s1,20(sp) | |||||
| lw s2,16(sp) | |||||
| lw s3,12(sp) | |||||
| lw s4,8(sp) | |||||
| addi sp,sp,32 | |||||
| jr ra | |||||
| f3: | |||||
| addi sp,sp,-16 | |||||
| sw ra,12(sp) | |||||
| sw s0,8(sp) | |||||
| sw s1,4(sp) | |||||
| mv s0,a0 | |||||
| li a1,10 | |||||
| call rem | |||||
| beqz a0,.L23 | |||||
| li a1,20 | |||||
| mv a0,s0 | |||||
| call rem | |||||
| beqz a0,.L24 | |||||
| mv a0,s0 | |||||
| call f1 | |||||
| mv s1,a0 | |||||
| mv a0,s0 | |||||
| call f2 | |||||
| add a0,s1,a0 | |||||
| .L18: | |||||
| lw ra,12(sp) | |||||
| lw s0,8(sp) | |||||
| lw s1,4(sp) | |||||
| addi sp,sp,16 | |||||
| jr ra | |||||
| .L23: | |||||
| mv a0,s0 | |||||
| call f2 | |||||
| j .L18 | |||||
| .L24: | |||||
| mv a0,s0 | |||||
| call f1 | |||||
| j .L18 | |||||
| getCall: | |||||
| addi sp,sp,-16 | |||||
| sw ra,12(sp) | |||||
| beqz a0,.L30 | |||||
| li a5,1 | |||||
| beq a0,a5,.L31 | |||||
| mv a0,a1 | |||||
| call f3 | |||||
| .L25: | |||||
| lw ra,12(sp) | |||||
| addi sp,sp,16 | |||||
| jr ra | |||||
| .L30: | |||||
| mv a0,a1 | |||||
| call f1 | |||||
| j .L25 | |||||
| .L31: | |||||
| mv a0,a1 | |||||
| call f2 | |||||
| j .L25 | |||||
| run: | |||||
| addi sp,sp,-48 | |||||
| sw ra,44(sp) | |||||
| sw s0,40(sp) | |||||
| sw s1,36(sp) | |||||
| sw s2,32(sp) | |||||
| sw s3,28(sp) | |||||
| sw s4,24(sp) | |||||
| sw s5,20(sp) | |||||
| sw s6,16(sp) | |||||
| sw s7,12(sp) | |||||
| sw s8,8(sp) | |||||
| li s1,0 | |||||
| li s0,0 | |||||
| li s3,0 | |||||
| li s7,56 | |||||
| li s6,2 | |||||
| li s5,3 | |||||
| li s4,24 | |||||
| .L35: | |||||
| sub a5,s7,s1 | |||||
| lw s8,0(a5) | |||||
| sgt a5,s0,s6 | |||||
| xori a5,a5,1 | |||||
| add s0,s0,a5 | |||||
| sub a5,s0,s5 | |||||
| snez a5,a5 | |||||
| sub a5,zero,a5 | |||||
| and s0,s0,a5 | |||||
| lw a1,0(s1) | |||||
| mv a0,s0 | |||||
| call getCall | |||||
| mv s2,a0 | |||||
| mv a1,s8 | |||||
| mv a0,s0 | |||||
| call getCall | |||||
| sub a0,s2,a0 | |||||
| add s3,s3,a0 | |||||
| addi s1,s1,4 | |||||
| bne s1,s4,.L35 | |||||
| mv a0,s3 | |||||
| lw ra,44(sp) | |||||
| lw s0,40(sp) | |||||
| lw s1,36(sp) | |||||
| lw s2,32(sp) | |||||
| lw s3,28(sp) | |||||
| lw s4,24(sp) | |||||
| lw s5,20(sp) | |||||
| lw s6,16(sp) | |||||
| lw s7,12(sp) | |||||
| lw s8,8(sp) | |||||
| addi sp,sp,48 | |||||
| jr ra | |||||
| #memset 0x0, 0x4 | |||||
| #memset 0x4, 0x7 | |||||
| #memset 0x8, 0x3 | |||||
| #memset 0xc, 0x8 | |||||
| #memset 0x10, 0x4 | |||||
| #memset 0x14, 0x22 | |||||
| #memset 0x18, 0x19 | |||||
| #memset 0x1c, 0x8 | |||||
| #memset 0x20, 0x11 | |||||
| #memset 0x24, 0x10 | |||||
| #memset 0x28, 0x9 | |||||
| #memset 0x2c, 0x8 | |||||
| #memset 0x30, 0x7 | |||||
| #memset 0x34, 0x6 | |||||
| #memset 0x38, 0x5 | |||||
| #memset 0x3c, 0x10 | |||||
| @@ -21,7 +21,7 @@ object Manifest { | |||||
| // TODO: Change back after add test succeeds | // TODO: Change back after add test succeeds | ||||
| val singleTest = "addi.s" //"forward2.s" | val singleTest = "addi.s" //"forward2.s" | ||||
| val nopPadded = true | |||||
| val nopPadded = false | |||||
| val singleTestOptions = TestOptions( | val singleTestOptions = TestOptions( | ||||
| printIfSuccessful = true, | printIfSuccessful = true, | ||||
| @@ -32,7 +32,8 @@ object Manifest { | |||||
| printMergedTrace = true, | printMergedTrace = true, | ||||
| nopPadded = nopPadded, | nopPadded = nopPadded, | ||||
| breakPoints = Nil, // not implemented | breakPoints = Nil, // not implemented | ||||
| testName = singleTest) | |||||
| testName = singleTest, | |||||
| maxSteps = 15000) | |||||
| val allTestOptions: String => TestOptions = name => TestOptions( | val allTestOptions: String => TestOptions = name => TestOptions( | ||||
| @@ -44,11 +45,32 @@ object Manifest { | |||||
| printMergedTrace = false, | printMergedTrace = false, | ||||
| nopPadded = nopPadded, | nopPadded = nopPadded, | ||||
| breakPoints = Nil, // not implemented | breakPoints = Nil, // not implemented | ||||
| testName = name) | |||||
| testName = name, | |||||
| maxSteps = 15000) | |||||
| } | } | ||||
/** Runs the branch profiler on `branchProfiling.s` with a budget of 50000 VM steps. */
class ProfileBranching extends FlatSpec with Matchers {
  it should "profile some branches" in {
    val options = Manifest.singleTestOptions.copy(testName = "branchProfiling.s", maxSteps = 50000)
    val profiled = TestRunner.profileBranching(options)
    profiled should be(true)
  }
}
/** Runs the cache profiler on `convolution.s` with a budget of 150000 VM steps. */
class ProfileCache extends FlatSpec with Matchers {
  it should "profile a cache" in {
    // Tell the user up front why this spec is slow before starting the run.
    List(
      "Warning, this test takes forever to run! 2 minutes on my machine at least.",
      "This happens due to the less than optimal way of storing the update log. Sorry I guess",
      "You probably want to debug this with a smaller program"
    ).foreach(warning => say(warning))

    val options = Manifest.singleTestOptions.copy(testName = "convolution.s", maxSteps = 150000)
    val profiled = TestRunner.profileCache(options)
    profiled should be(true)
  }
}
| class SingleTest extends FlatSpec with Matchers { | class SingleTest extends FlatSpec with Matchers { | ||||
| it should "just werk" in { | it should "just werk" in { | ||||
| TestRunner.run(Manifest.singleTestOptions) should be(true) | TestRunner.run(Manifest.singleTestOptions) should be(true) | ||||
| @@ -58,7 +80,7 @@ class SingleTest extends FlatSpec with Matchers { | |||||
| class AllTests extends FlatSpec with Matchers { | class AllTests extends FlatSpec with Matchers { | ||||
| it should "just werk" in { | it should "just werk" in { | ||||
| val werks = getAllTestNames.map{testname => | |||||
| val werks = getAllTestNames.filterNot(_ == "convolution.s").map{testname => | |||||
| say(s"testing $testname") | say(s"testing $testname") | ||||
| val opts = Manifest.allTestOptions(testname) | val opts = Manifest.allTestOptions(testname) | ||||
| (testname, TestRunner.run(opts)) | (testname, TestRunner.run(opts)) | ||||
| @@ -37,10 +37,11 @@ object Data { | |||||
| case class MemRead(addr: Addr, word: Int) extends ExecutionEvent | case class MemRead(addr: Addr, word: Int) extends ExecutionEvent | ||||
| // addr is the target address | // addr is the target address | ||||
| case class PcUpdateJALR(addr: Addr) extends ExecutionEvent | |||||
| case class PcUpdateJAL(addr: Addr) extends ExecutionEvent | |||||
| case class PcUpdateB(addr: Addr) extends ExecutionEvent | |||||
| case class PcUpdate(addr: Addr) extends ExecutionEvent | |||||
| case class PcUpdateJALR(addr: Addr) extends ExecutionEvent | |||||
| case class PcUpdateJAL(addr: Addr) extends ExecutionEvent | |||||
| case class PcUpdateBranch(addr: Addr, target: Addr) extends ExecutionEvent | |||||
| case class PcUpdateNoBranch(addr: Addr) extends ExecutionEvent | |||||
| case class PcUpdate(addr: Addr) extends ExecutionEvent | |||||
| case class ExecutionTraceEvent(pc: Addr, event: ExecutionEvent*){ override def toString(): String = s"$pc: " + event.toList.mkString(", ") } | case class ExecutionTraceEvent(pc: Addr, event: ExecutionEvent*){ override def toString(): String = s"$pc: " + event.toList.mkString(", ") } | ||||
| type ExecutionTrace[A] = Writer[List[ExecutionTraceEvent], A] | type ExecutionTrace[A] = Writer[List[ExecutionTraceEvent], A] | ||||
| @@ -168,6 +169,17 @@ object Data { | |||||
| } | } | ||||
| def log2: Int = math.ceil(math.log(i.toDouble)/math.log(2.0)).toInt | def log2: Int = math.ceil(math.log(i.toDouble)/math.log(2.0)).toInt | ||||
/**
  * Returns the slot number this value maps to when it is used as an address
  * in a table with `slots` entries.
  *
  * Discards the two lowest bits and returns the next `slots.log2` bits.
  *
  * Implemented as shift-and-mask. A `<< (30 - n)` followed by `>>> (32 - n)`
  * breaks for `slots == 1`: the shift distance 32 is reduced modulo 32, so
  * nothing is shifted out and `i << 30` is returned instead of 0.
  *
  * @param slots number of slots in the table. `log2` rounds up, so for a
  *              slot count that is not a power of two the result can be
  *              greater than or equal to `slots`.
  * @return bits `[slots.log2 + 1 : 2]` of this value; 0 when `slots == 1`
  */
def getTag(slots: Int): Int = {
  val indexBits = slots.log2
  val indexMask = (1 << indexBits) - 1
  (i >>> 2) & indexMask
}
| } | } | ||||
| implicit class StringOps(s: String) { | implicit class StringOps(s: String) { | ||||
| @@ -235,7 +247,6 @@ object Data { | |||||
| ops : List[SourceInfo[Op]], | ops : List[SourceInfo[Op]], | ||||
| settings : List[TestSetting], | settings : List[TestSetting], | ||||
| labelMap : Map[Label, Addr], | labelMap : Map[Label, Addr], | ||||
| maxSteps : Int = 5000 | |||||
| ){ | ){ | ||||
| def imem: Map[Addr, Op] = | def imem: Map[Addr, Op] = | ||||
| @@ -271,7 +282,7 @@ object Data { | |||||
| /** | /** | ||||
| * Returns the binary code and the execution trace or an error for convenient error checking. | * Returns the binary code and the execution trace or an error for convenient error checking. | ||||
| */ | */ | ||||
| def validate: Either[String, (Map[Addr, Int], ExecutionTrace[VM])] = machineCode.flatMap{ binary => | |||||
| def validate(maxSteps: Int): Either[String, (Map[Addr, Int], ExecutionTrace[VM])] = machineCode.flatMap{ binary => | |||||
| val uk = "UNKNOWN" | val uk = "UNKNOWN" | ||||
| val (finish, trace) = VM.run(maxSteps, vm) | val (finish, trace) = VM.run(maxSteps, vm) | ||||
| finish match { | finish match { | ||||
| @@ -24,7 +24,6 @@ object Ops { | |||||
| sealed trait JImmediate extends ImmType | sealed trait JImmediate extends ImmType | ||||
| sealed trait ShiftImmediate extends ImmType | sealed trait ShiftImmediate extends ImmType | ||||
| sealed trait Comparison { | sealed trait Comparison { | ||||
| def run(rs1Val: Int, rs2Val: Int): Boolean | def run(rs1Val: Int, rs2Val: Int): Boolean | ||||
| } | } | ||||
| @@ -51,7 +50,10 @@ object Ops { | |||||
| def beqz(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, EQ) | def beqz(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, EQ) | ||||
| def bnez(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, NE) | def bnez(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, NE) | ||||
| def blez(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, LT) | |||||
| def blez(rs1: Int, dst: Label) = Branch(Reg(0), Reg(rs1), dst, GE) | |||||
| def bgez(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, GE) | |||||
| def bltz(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, LT) | |||||
| def bgtz(rs1: Int, dst: Label) = Branch(Reg(0), Reg(rs1), dst, LT) | |||||
| } | } | ||||
| sealed trait someDecorator | sealed trait someDecorator | ||||
| @@ -105,22 +107,22 @@ object Ops { | |||||
| def sra( rd: Int, rs1: Int, imm: Int) = ArithImmShift(Reg(rd), Reg(rs1), Imm(imm), SRA) | def sra( rd: Int, rs1: Int, imm: Int) = ArithImmShift(Reg(rd), Reg(rs1), Imm(imm), SRA) | ||||
| } | } | ||||
| case class LUI(rd: Reg, imm: Imm) extends Op with UType | |||||
| case class AUIPC(rd: Reg, imm: Imm) extends Op with UType | |||||
| case class SW(rs2: Reg, rs1: Reg, offset: Imm) extends Op with SType | |||||
| case class LW(rd: Reg, rs1: Reg, offset: Imm) extends Op with IType | |||||
| case class LUI(rd: Reg, imm: Imm) extends Op with UType | |||||
| case class AUIPC(rd: Reg, imm: Imm) extends Op with UType | |||||
| case class JALR(rd: Reg, rs1: Reg, dst: String) extends Op with IType | case class JALR(rd: Reg, rs1: Reg, dst: String) extends Op with IType | ||||
| case class JAL(rd: Reg, dst: String) extends Op with UType | case class JAL(rd: Reg, dst: String) extends Op with UType | ||||
| case class SW(rs2: Reg, rs1: Reg, offset: Imm) extends Op with SType | |||||
| case class LW(rd: Reg, rs1: Reg, offset: Imm) extends Op with IType | |||||
| object LUI { def apply(rd: Int, imm: Int): LUI = LUI(Reg(rd), Imm(imm)) } | object LUI { def apply(rd: Int, imm: Int): LUI = LUI(Reg(rd), Imm(imm)) } | ||||
| object AUIPC { def apply(rd: Int, imm: Int): AUIPC = AUIPC(Reg(rd), Imm(imm)) } | object AUIPC { def apply(rd: Int, imm: Int): AUIPC = AUIPC(Reg(rd), Imm(imm)) } | ||||
| object SW { def apply(rs2: Int, rs1: Int, offset: Int): SW = SW(Reg(rs2), Reg(rs1), Imm(offset)) } | |||||
| object LW { def apply(rd: Int, rs1: Int, offset: Int): LW = LW(Reg(rd), Reg(rs1), Imm(offset)) } | |||||
| object JAL{ def apply(rd: Int, dst: String): JAL = JAL(Reg(rd), dst) } | object JAL{ def apply(rd: Int, dst: String): JAL = JAL(Reg(rd), dst) } | ||||
| object JALR{ def apply(rd: Int, rs1: Int, dst: String): JALR = JALR(Reg(rd), Reg(rs1), dst) } | object JALR{ def apply(rd: Int, rs1: Int, dst: String): JALR = JALR(Reg(rd), Reg(rs1), dst) } | ||||
| object SW { def apply(rs2: Int, rs1: Int, offset: Int): SW = SW(Reg(rs2), Reg(rs1), Imm(offset)) } | |||||
| object LW { def apply(rd: Int, rs1: Int, offset: Int): LW = LW(Reg(rd), Reg(rs1), Imm(offset)) } | |||||
| // This op should not be assembled, but will for the sake of simplicity be rendered as a NOP | // This op should not be assembled, but will for the sake of simplicity be rendered as a NOP | ||||
| case object DONE extends Op with IType { val rd = Reg(0); val rs1 = Reg(0) } | case object DONE extends Op with IType { val rd = Reg(0); val rs1 = Reg(0) } | ||||
| @@ -66,6 +66,7 @@ object Parser { | |||||
| stringWs("sra") ~> arith.mapN{Arith.sra}, | stringWs("sra") ~> arith.mapN{Arith.sra}, | ||||
| stringWs("slt") ~> arith.mapN{Arith.slt}, | stringWs("slt") ~> arith.mapN{Arith.slt}, | ||||
| stringWs("sgt") ~> arith.mapN{ case(x,y,z) => Arith.slt(x,z,y)}, | |||||
| stringWs("sltu") ~> arith.mapN{Arith.sltu}, | stringWs("sltu") ~> arith.mapN{Arith.sltu}, | ||||
| // pseudos | // pseudos | ||||
| @@ -99,10 +100,7 @@ object Parser { | |||||
| stringWs("seqz") ~> (reg <~ sep, reg, ok(1)).mapN{ArithImm.sltu}, | stringWs("seqz") ~> (reg <~ sep, reg, ok(1)).mapN{ArithImm.sltu}, | ||||
| stringWs("li") ~> (reg ~ sep ~ (hex | int)).collect{ | stringWs("li") ~> (reg ~ sep ~ (hex | int)).collect{ | ||||
| case((a, b), c) if (c.nBitsS <= 12) => { | |||||
| say(s"for c: $c, nBitsS was ${c.nBitsS}") | |||||
| ArithImm.add(a, 0, c) | |||||
| } | |||||
| case((a, b), c) if (c.nBitsS <= 12) => { ArithImm.add(a, 0, c) } | |||||
| }, | }, | ||||
| @@ -38,21 +38,19 @@ case class VM( | |||||
| } | } | ||||
| private def executeBranch(op: Branch) = { | private def executeBranch(op: Branch) = { | ||||
| getAddr(op.dst).map{ addr => | getAddr(op.dst).map{ addr => | ||||
| val takeBranch = regs.compare(op.rs1, op.rs2, op.comp.run) | val takeBranch = regs.compare(op.rs1, op.rs2, op.comp.run) | ||||
| if(takeBranch){ | if(takeBranch){ | ||||
| val nextVM = copy(pc = addr) | val nextVM = copy(pc = addr) | ||||
| jump(nextVM, PcUpdateB(nextVM.pc)) | |||||
| jump(nextVM, PcUpdateBranch(pc, nextVM.pc)) | |||||
| } | } | ||||
| else { | else { | ||||
| step(this) | |||||
| step(this, PcUpdateNoBranch(this.pc + Addr(4))) | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| /** | /** | ||||
| * The weird :_* syntax is simply a way to pass a list to a varArgs function. | * The weird :_* syntax is simply a way to pass a list to a varArgs function. | ||||
| * | * | ||||
| @@ -40,9 +40,10 @@ object PrintUtils { | |||||
| case MemRead(addr, word) => fansi.Color.Red(f"M[${addr.show}] -> 0x${word.hs}") | case MemRead(addr, word) => fansi.Color.Red(f"M[${addr.show}] -> 0x${word.hs}") | ||||
| // addr is the target address | // addr is the target address | ||||
| case PcUpdateJALR(addr) => fansi.Color.Green(s"PC updated to ${addr.show} via JALR") | |||||
| case PcUpdateJAL(addr) => fansi.Color.Magenta(s"PC updated to ${addr.show} via JAL") | |||||
| case PcUpdateB(addr) => fansi.Color.Yellow(s"PC updated to ${addr.show} via Branch") | |||||
| case PcUpdateJALR(addr) => fansi.Color.Green(s"PC updated to ${addr.show} via JALR") | |||||
| case PcUpdateJAL(addr) => fansi.Color.Magenta(s"PC updated to ${addr.show} via JAL") | |||||
| case PcUpdateBranch(from, to) => fansi.Color.Yellow(s"PC updated to ${to.show} via Branch") | |||||
| case PcUpdateNoBranch(addr) => fansi.Color.Yellow(s"PC updated to ${addr.show}, skipping a Branch") | |||||
| } | } | ||||
| } | } | ||||
| @@ -100,6 +101,7 @@ object PrintUtils { | |||||
| def binary: String = String.format("%" + 32 + "s", i.toBinaryString) | def binary: String = String.format("%" + 32 + "s", i.toBinaryString) | ||||
| .replace(' ', '0').grouped(4) | .replace(' ', '0').grouped(4) | ||||
| .map(x => x + " ").mkString | .map(x => x + " ").mkString | ||||
| def binary(n: Int): String = String.format("%" + n + "s", i.toBinaryString).replace(' ', '0') | |||||
| } | } | ||||
| @@ -25,7 +25,8 @@ case class TestOptions( | |||||
| printMergedTrace : Boolean, | printMergedTrace : Boolean, | ||||
| nopPadded : Boolean, | nopPadded : Boolean, | ||||
| breakPoints : List[Int], // Not implemented | breakPoints : List[Int], // Not implemented | ||||
| testName : String | |||||
| testName : String, | |||||
| maxSteps : Int | |||||
| ) | ) | ||||
| case class TestResult( | case class TestResult( | ||||
| @@ -44,12 +45,12 @@ object TestRunner { | |||||
| val testResults = for { | val testResults = for { | ||||
| lines <- fileUtils.readTest(testOptions) | lines <- fileUtils.readTest(testOptions) | ||||
| program <- FiveStage.Parser.parseProgram(lines, testOptions) | program <- FiveStage.Parser.parseProgram(lines, testOptions) | ||||
| (binary, (trace, finalVM)) <- program.validate.map(x => (x._1, x._2.run)) | |||||
| (binary, (trace, finalVM)) <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run)) | |||||
| (termitationCause, chiselTrace) <- ChiselTestRunner( | (termitationCause, chiselTrace) <- ChiselTestRunner( | ||||
| binary.toList.sortBy(_._1.value).map(_._2), | |||||
| program.settings, | |||||
| finalVM.pc, | |||||
| 15000) | |||||
| binary.toList.sortBy(_._1.value).map(_._2), | |||||
| program.settings, | |||||
| finalVM.pc, | |||||
| testOptions.maxSteps) | |||||
| } yield { | } yield { | ||||
| val traces = mergeTraces(trace, chiselTrace).map(x => printMergedTraces((x), program)) | val traces = mergeTraces(trace, chiselTrace).map(x => printMergedTraces((x), program)) | ||||
| @@ -100,4 +101,98 @@ object TestRunner { | |||||
| successful | successful | ||||
| }.toOption.getOrElse(false) | }.toOption.getOrElse(false) | ||||
| } | } | ||||
/**
  * Runs the test program in the VM, extracts every branch event from the
  * execution trace and prints the number of mispredicts a one bit branch
  * predictor with unlimited slots would have made.
  *
  * @param testOptions selects the test program (`testName`) and the VM step budget (`maxSteps`)
  * @return true if the program could be read, parsed and executed; false otherwise
  */
def profileBranching(testOptions: TestOptions): Boolean = {

  val testResults = for {
    lines                       <- fileUtils.readTest(testOptions)
    program                     <- FiveStage.Parser.parseProgram(lines, testOptions)
    (binary, (trace, finalVM))  <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run))
  } yield {

    sealed trait BranchEvent
    case class Taken(from: Int, to: Int) extends BranchEvent { override def toString = s"Taken ${from.hs}\t${to.hs}" }
    case class NotTaken(addr: Int) extends BranchEvent { override def toString = s"Not Taken ${addr.hs}" }

    val events: List[BranchEvent] = trace.flatMap(_.event).collect{
      case PcUpdateBranch(from, to) => Taken(from.value, to.value)
      // The VM logs a skipped branch with the fall-through address (branch pc + 4), whereas a taken
      // branch is logged with the address of the branch itself. Subtract the 4 again so that both
      // outcomes of the same branch end up in the same prediction table entry.
      case PcUpdateNoBranch(next)   => NotTaken(next.value - 4)
    }


    /**
      * This is a sample profiler for a rather unrealistic branch predictor which has an unlimited amount
      * of slots
      */
    def OneBitInfiniteSlots(events: List[BranchEvent]): Int = {

      // Uncomment to take a look at the event log
      // say(events.mkString("\n","\n","\n"))

      // Helper inspects the next element of the event list. If the event is a mispredict the prediction table is updated
      // to reflect this.
      // As long as there are remaining events the helper calls itself recursively on the remainder
      def helper(events: List[BranchEvent], predictionTable: Map[Int, Boolean]): Int = {
        events match {

          // Scala syntax for matching a list with a head element of some type and a tail
          // `case h :: t =>`
          // means we want to match a list with at least a head and a tail (tail can be Nil, so we
          // essentially want to match a list with at least one element)
          // h is the first element of the list, t is the remainder (which can be Nil, aka empty)

          // `case Constructor(arg1, arg2) :: t => `
          // means we want to match a list whose first element is of type Constructor, giving us access to its internal
          // values.

          // `case Constructor(arg1, arg2) :: t if(p(arg1, arg2)) =>`
          // means we want to match a list whose first element is of type Constructor while satisfying some predicate p,
          // called an if guard.
          case Taken(from, to) :: t if( predictionTable(from)) => helper(t, predictionTable)
          case Taken(from, to) :: t if(!predictionTable(from)) => 1 + helper(t, predictionTable.updated(from, true))
          case NotTaken(addr)  :: t if( predictionTable(addr)) => 1 + helper(t, predictionTable.updated(addr, false))
          case NotTaken(addr)  :: t if(!predictionTable(addr)) => helper(t, predictionTable)
          case Nil => 0
        }
      }

      // Initially every possible branch is set to false since the initial state of the predictor is to assume branch not taken
      def initState = events.map{
        case Taken(from, addr) => (from, false)
        case NotTaken(addr)    => (addr, false)
      }.toMap

      helper(events, initState)
    }

    say(OneBitInfiniteSlots(events))

    true
  }

  // A failure while reading, parsing or running the program must not be reported as a success.
  testResults.toOption.getOrElse(false)
}
/**
  * Runs the test program in the VM and collects every memory read and write from the
  * execution trace, as the input for a cache model (to be filled in below).
  *
  * @param testOptions selects the test program (`testName`) and the VM step budget (`maxSteps`)
  * @return true if the program could be read, parsed and executed; false otherwise
  */
def profileCache(testOptions: TestOptions): Boolean = {

  val testResults = for {
    lines                       <- fileUtils.readTest(testOptions)
    program                     <- FiveStage.Parser.parseProgram(lines, testOptions)
    (binary, (trace, finalVM))  <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run))
  } yield {

    sealed trait MemoryEvent
    case class Write(addr: Int) extends MemoryEvent
    case class Read(addr: Int)  extends MemoryEvent

    // Memory accesses in execution order, reduced to the accessed address
    val events: List[MemoryEvent] = trace.flatMap(_.event).collect{
      case MemWrite(x,_) => Write(x.value)
      case MemRead(x,_)  => Read(x.value)
    }

    // Your cache here

    true
  }

  // A failure while reading, parsing or running the program must not be reported as a success.
  testResults.toOption.getOrElse(false)
}
| } | } | ||||
| @@ -0,0 +1,277 @@ | |||||
| * Question 1 - Hazards | |||||
| For the following programs describe each hazard with type (data or control), line number and a | |||||
| small (max one sentence) description | |||||
| ** program 1 | |||||
| #+begin_src asm | |||||
| addi t0, zero, 10 | |||||
| addi t1, zero, 20 | |||||
| .L2: | |||||
| sub t1, t1, t0 | |||||
| beq t1, zero, .L2 | |||||
| jr ra | |||||
| #+end_src | |||||
| ** program 2 | |||||
| #+begin_src asm | |||||
| addi t0, zero, 10 | |||||
| lw t0, 10(t0) | |||||
| beq t0, zero, .L3 | |||||
| jr ra | |||||
| #+end_src | |||||
| ** program 3 | |||||
| #+begin_src asm | |||||
| lw t0, 0(t0) | |||||
| lw t1, 4(t0) | |||||
| sw t0, 8(t1) | |||||
| lw t1, 12(t0) | |||||
| beq t0, t1, .L3 | |||||
| jr ra | |||||
| #+end_src | |||||
| * Question 2 - Handling hazards | |||||
| For this question, keep in mind that the forwarder does not care if the values it forwards are being used or not! | |||||
| Even for a JAL instruction, which has neither an rs1 nor an rs2 field, the forwarder must still forward its values. | |||||
| ** Data hazards 1 | |||||
| At some cycle the following instructions can be found in a 5 stage design: | |||||
| #+begin_src text | |||||
| EX: || MEM: || WB: | |||||
| ---------------------||-------------------------||-------------------------- | |||||
| rs1: 4 || rs1: 4 || rs1: 1 | |||||
| rs2: 5 || rs2: 6 || rs2: 2 | |||||
| rd: 6 || rd: 4 || rd: 5 | |||||
| memToReg = false || memToReg = false || memToReg = false | |||||
| regWrite = true || regWrite = false || regWrite = true | |||||
| memWrite = false || memWrite = false || memWrite = false | |||||
| branch = false || branch = true || branch = false | |||||
| jump = false || jump = false || jump = false | |||||
| #+end_src | |||||
| For the operation currently in EX, from where (ID, MEM or WB) should the forwarder get the data for rs1 and rs2? | |||||
| ** Data hazards 2 | |||||
| At some cycle the following instructions can be found in a 5 stage design: | |||||
| #+begin_src text | |||||
| EX: || MEM: || WB: | |||||
| ---------------------||-------------------------||-------------------------- | |||||
| rs1: 1 || rs1: 4 || rs1: 1 | |||||
| rs2: 5 || rs2: 6 || rs2: 0 | |||||
| rd: 0 || rd: 1 || rd: 0 | |||||
| memToReg = false || memToReg = false || memToReg = false | |||||
| regWrite = true || regWrite = true || regWrite = true | |||||
| memWrite = false || memWrite = false || memWrite = false | |||||
| branch = false || branch = true || branch = false | |||||
| jump = true || jump = true || jump = false | |||||
| #+end_src | |||||
| For the operation currently in EX, from where (ID, MEM or WB) should the forwarder get the data for rs1 and rs2? | |||||
| ** Data hazards 3 | |||||
| At some cycle the following instructions can be found in a 5 stage design: | |||||
| #+begin_src text | |||||
| EX: || MEM: || WB: | |||||
| ---------------------||-------------------------||-------------------------- | |||||
| rs1: 2 || rs1: 4 || rs1: 3 | |||||
| rs2: 5 || rs2: 6 || rs2: 4 | |||||
| rd: 1 || rd: 1 || rd: 5 | |||||
| memToReg = false || memToReg = true || memToReg = false | |||||
| regWrite = false || regWrite = true || regWrite = true | |||||
| memWrite = true || memWrite = false || memWrite = false | |||||
| branch = false || branch = false || branch = false | |||||
| jump = false || jump = false || jump = false | |||||
| #+end_src | |||||
| Should the forwarding unit issue a load hazard signal? | |||||
| (Hint: what are the semantics of the instruction currently in EX stage?) | |||||
| * Question 3 - Branch prediction | |||||
| Consider a 2 bit branch predictor with only 4 slots for a 32 bit architecture (without BTB), where the prediction of | |||||
| whether a branch is taken is made according to the following table: | |||||
| #+begin_src text | |||||
| state || predict taken || next state if taken || next state if not taken || | |||||
| =======||=================||=======================||==========================|| | |||||
| 00 || NO || 01 || 00 || | |||||
| 01 || NO || 10 || 00 || | |||||
| 10 || YES || 11 || 01 || | |||||
| 11 || YES || 11 || 10 || | |||||
| #+end_src | |||||
| (This is known as a saturating 2bit counter, it is *not* the same scheme as in the lecture slides) | |||||
| At some point during execution the program counter is ~0xc~ and the branch predictor table looks like this: | |||||
| #+begin_src text | |||||
| slot || value | |||||
| ======||======== | |||||
| 00 || 01 | |||||
| 01 || 00 | |||||
| 10 || 11 | |||||
| 11 || 01 | |||||
| #+end_src | |||||
| For the following program: | |||||
| #+begin_src asm | |||||
| 0xc addi x1, x3, 10 | |||||
| 0x10 add x2, x1, x1 | |||||
| 0x14 beq x1, x2, .L1 | |||||
| 0x18 j .L2 | |||||
| #+end_src | |||||
| Will the predictor predict taken or not taken for the beq instruction? | |||||
| * Question 4 - Benchmarking | |||||
| In order to gauge the performance increase from adding branch predictors it is necessary to do some testing. | |||||
| Rather than writing a test from scratch it is better to use the tester already in use in the test harness. | |||||
| When running a program the VM outputs a log of all events, including which branches have been taken and which | |||||
| haven't, which as it turns out is the only information we actually need to gauge the effectiveness of a branch | |||||
| predictor! | |||||
| For this exercise you will write a program that parses a log of branch events. | |||||
| #+BEGIN_SRC scala | |||||
| sealed trait BranchEvent | |||||
| case class Taken(from: Int, to: Int) extends BranchEvent | |||||
| case class NotTaken(at: Int) extends BranchEvent | |||||
| def profile(events: List[BranchEvent]): Int = ??? | |||||
| #+END_SRC | |||||
| To help you get started, I have provided you with much of the necessary code. | |||||
| In order to get an idea for how you should profile branch misses, consider the following profiler which calculates | |||||
| misses for a processor with a 1 bit branch predictor with an unlimited number of slots: | |||||
| #+BEGIN_SRC scala | |||||
| def OneBitInfiniteSlots(events: List[BranchEvent]): Int = { | |||||
| // Helper inspects the next element of the event list. If the event is a mispredict the prediction table is updated | |||||
| // to reflect this. | |||||
| // As long as there are remaining events the helper calls itself recursively on the remainder | |||||
| def helper(events: List[BranchEvent], predictionTable: Map[Int, Boolean]): Int = { | |||||
| events match { | |||||
| // Scala syntax for matching a list with a head element of some type and a tail | |||||
| // `case h :: t =>` | |||||
| // means we want to match a list with at least a head and a tail (tail can be Nil, so we | |||||
| // essentially want to match a list with at least one element) | |||||
| // h is the first element of the list, t is the remainder (which can be Nil, aka empty) | |||||
| // `case Constructor(arg1, arg2) :: t => ` | |||||
| // means we want to match a list whose first element is of type Constructor, giving us access to its internal | |||||
| // values. | |||||
| // `case Constructor(arg1, arg2) :: t if(p(arg1, arg2)) =>` | |||||
| // means we want to match a list whose first element is of type Constructor while satisfying some predicate p, | |||||
| // called an if guard. | |||||
| case Taken(from, to) :: t if( predictionTable(from)) => helper(t, predictionTable) | |||||
| case Taken(from, to) :: t if(!predictionTable(from)) => 1 + helper(t, predictionTable.updated(from, true)) | |||||
| case NotTaken(addr) :: t if( predictionTable(addr)) => 1 + helper(t, predictionTable.updated(addr, false)) | |||||
| case NotTaken(addr) :: t if(!predictionTable(addr)) => helper(t, predictionTable) | |||||
| case _ => 0 | |||||
| } | |||||
| } | |||||
| // Initially every possible branch is set to false since the initial state of the predictor is to assume branch not taken | |||||
| def initState = events.map{ | |||||
| case Taken(addr) => (addr, false) | |||||
| case NotTaken(addr) => (addr, false) | |||||
| }.toMap | |||||
| helper(events, initState) | |||||
| } | |||||
| #+END_SRC | |||||
| ** Your task | |||||
| Your job is to implement a test that checks how many misses occur for a 2 bit branch predictor with 8 slots. | |||||
| The rule table is the same as in question 3. | |||||
| The predictor does not use a branch target buffer (BTB), which means that the address will always be decoded in | |||||
| the ID stage. | |||||
| For you this means you do not need to keep track of branch targets, simplifying your simulation quite a bit. | |||||
| (If not you would need to add logic for when BTB value does not match actual value) | |||||
| For simplicity's sake, assume that every value in the table is initialized to 00. | |||||
| For this task it is necessary to use something more sophisticated than ~Map[Int, Boolean]~ to represent | |||||
| your branch predictor model. | |||||
| The skeleton code is located in ~testRunner.scala~ and can be run using testOnly FiveStage.ProfileTest. | |||||
| With a 2 bit 8 slot scheme, how many mispredicts will happen? | |||||
| Answer with a number. | |||||
| Hint: Use the getTag method defined on Int (in DataTypes.scala) to get the tag for an address. | |||||
| #+BEGIN_SRC scala | |||||
| val slots = 8 | |||||
| say(0x1C40.getTag(slots)) // prints 0 | |||||
| say(0x1C44.getTag(slots)) // prints 1 | |||||
| say(0x1C48.getTag(slots)) // prints 2 | |||||
| say(0x1C4C.getTag(slots)) // prints 3 | |||||
| say(0x1C50.getTag(slots)) // prints 4 | |||||
| say(0x1C54.getTag(slots)) // prints 5 | |||||
| say(0x1C58.getTag(slots)) // prints 6 | |||||
| say(0x1C5C.getTag(slots)) // prints 7 | |||||
| say(0x1C60.getTag(slots)) // prints 0 (thus conflicts with 0x1C40) | |||||
| #+END_SRC | |||||
| * Question 5 - Cache profiling | |||||
| Unlike our design which has a very limited memory pool, real designs have access to vast amounts of memory, offset | |||||
| by a steep cost in access latency. | |||||
| To remedy this, a modern processor features several caches where even the smallest, fastest cache has more memory than | |||||
| your entire design. | |||||
| In order to investigate how caches can alter performance it is therefore necessary to make some rather | |||||
| unrealistic assumptions to see how different cache schemes impact performance. | |||||
| We will therefore assume the following: | |||||
| + Reads from main memory take 5 cycles | |||||
| + cache has a total storage of 8 words (256 bits) | |||||
| + cache reads work as they do now (i.e no additional latency) | |||||
| For this exercise you will write a program that parses a log of memory events, similar to the previous task: | |||||
| #+BEGIN_SRC scala | |||||
| sealed trait MemoryEvent | |||||
| case class Write(addr: Int) extends MemoryEvent | |||||
| case class Read(addr: Int) extends MemoryEvent | |||||
| def profile(events: List[MemoryEvent]): Int = ??? | |||||
| #+END_SRC | |||||
| ** Your task | |||||
| Your job is to implement a model that tests how many delay cycles will occur for a cache which: | |||||
| + Follows a 2-way associative scheme | |||||
| + set size is 4 words (128 bits) (total cache size: a whopping 256 bits) | |||||
| + Block size is 1 word (32 bits) meaning that we *do not need a block offset*. | |||||
| + Is write-through write no-allocate (this means that you can ignore stores, only loads will affect the cache) | |||||
| + Eviction policy is LRU (least recently used) | |||||
| In the typical cache each block has more than 32 bits, requiring an offset, however the | |||||
| simulated cache does not. | |||||
| This means that the simulated cache has two sets of 4 words, greatly reducing the complexity | |||||
| of your implementation. | |||||
| Additionally, assume that writes do not change the LRU counter. | |||||
| This means that your cache will only consider which value was most recently loaded, | |||||
| not written. | |||||
| It's not realistic, but it allows you to completely disregard write events (you can | |||||
| just filter them out if you want.) | |||||
| Your answer should be the number of cache miss latency cycles when using this cache. | |||||
| *** Further study | |||||
| If you have the time I strongly encourage you to experiment with a larger cache with bigger | |||||
| block sizes, forcing you to implement the additional complexity of block offsets. | |||||
| Likewise, by trying a different scheme than write-through no-allocate you will get a much | |||||
| better grasp on how exactly the cache works. | |||||
| This is *not* a deliverable, just something I encourage you to tinker with to get a better | |||||
| understanding. | |||||