| @@ -0,0 +1,53 @@ | |||
| // C rmsbolt starter file | |||
| // Local Variables: | |||
| // rmsbolt-command: "/opt/riscv/bin/riscv32-unknown-elf-gcc -O0" | |||
| // rmsbolt-disassemble: nil | |||
| // End: | |||
| // Computes the flat index y * dim + x by adding dim to an accumulator y times | |||
| // (no multiply operator is used). For y <= 0 the loop body never runs and the | |||
| // result is simply x. | |||
| // NOTE(review): presumably written without '*' because the target core has no | |||
| // multiplier -- confirm against the toolchain flags. | |||
| int lookup(int x, int y, int dim){ | |||
| int t = 0; | |||
| int ii; | |||
| for(ii = 0; ii < y; ii++){ | |||
| t += dim; | |||
| } | |||
| return t + x; | |||
| } | |||
| // Computes one output value from the 3x3 neighbourhood of (x, y) in image. | |||
| // image is indexed with a row stride of 32; the nine reads go row by row from | |||
| // (x-1, y-1) to (x+1, y+1) and each value is left-shifted by the matching entry | |||
| // kernel[0]..kernel[8] (the kernel holds shift amounts, not multiplicative weights). | |||
| // The sum is stored to output[lookup(x, y, 30)], i.e. output uses a row stride of 30. | |||
| // NOTE(review): x and y are not re-based before the store; with x, y in 1..30 (see | |||
| // run) the largest output index is 930, which is past the end if output is meant to | |||
| // be a 30x30 buffer -- confirm this is intended. | |||
| void convolutePixel(int x, int y, int* image, int* output, int* kernel){ | |||
| int acc = 0; | |||
| acc += image[lookup( x - 1 , y - 1 , 32)] << kernel[0]; | |||
| acc += image[lookup( x , y - 1 , 32)] << kernel[1]; | |||
| acc += image[lookup( x + 1 , y - 1 , 32)] << kernel[2]; | |||
| acc += image[lookup( x - 1 , y , 32)] << kernel[3]; | |||
| acc += image[lookup( x , y , 32)] << kernel[4]; | |||
| acc += image[lookup( x + 1 , y , 32)] << kernel[5]; | |||
| acc += image[lookup( x - 1 , y + 1 , 32)] << kernel[6]; | |||
| acc += image[lookup( x , y + 1 , 32)] << kernel[7]; | |||
| acc += image[lookup( x + 1 , y + 1 , 32)] << kernel[8]; | |||
| output[lookup(x, y, 30)] = acc; | |||
| } | |||
| // Calls convolutePixel once for every (ii, kk) with both indices in 1..30, so the | |||
| // 3x3 window never reads outside rows/columns 0..31 of the stride-32 image. | |||
| // The three buffers are hard-coded raw addresses rather than allocated arrays. | |||
| // NOTE(review): presumably these are locations in the simulated data memory; | |||
| // verify that the image, output and kernel regions are laid out as intended. | |||
| // Always returns 0. | |||
| int run() { | |||
| int* image = (int*)0; | |||
| int* output = (int*)(1024); | |||
| int* kernel = (int*)(1924); | |||
| int ii; | |||
| int kk; | |||
| for(ii = 1; ii < 31; ii++){ | |||
| for(kk = 1; kk < 31; kk++){ | |||
| convolutePixel(ii, kk, image, output, kernel); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| // Entry point: runs the whole convolution; the value returned by run() is discarded. | |||
| int main(){ | |||
| run(); | |||
| } | |||
| @@ -0,0 +1,200 @@ | |||
| main: | |||
| addi sp,sp,-16 | |||
| sw ra,12(sp) | |||
| call run | |||
| lw ra,12(sp) | |||
| addi sp,sp,16 | |||
| jr ra | |||
| rem: | |||
| bge a0,a1,.L7 | |||
| ret | |||
| .L7: | |||
| addi sp,sp,-16 | |||
| sw ra,12(sp) | |||
| sub a0,a0,a1 | |||
| call rem | |||
| lw ra,12(sp) | |||
| addi sp,sp,16 | |||
| jr ra | |||
| f1: | |||
| addi sp,sp,-16 | |||
| sw ra,12(sp) | |||
| sw s0,8(sp) | |||
| sw s1,4(sp) | |||
| sw s2,0(sp) | |||
| li s1,0 | |||
| li s2,241 | |||
| j .L9 | |||
| .L11: | |||
| mv a0,s0 | |||
| .L9: | |||
| addi s0,a0,-1 | |||
| blez a0,.L8 | |||
| beq s0,s2,.L8 | |||
| li a1,10 | |||
| mv a0,s0 | |||
| call rem | |||
| bnez a0,.L11 | |||
| add s1,s1,s0 | |||
| j .L11 | |||
| .L8: | |||
| mv a0,s1 | |||
| lw ra,12(sp) | |||
| lw s0,8(sp) | |||
| lw s1,4(sp) | |||
| lw s2,0(sp) | |||
| addi sp,sp,16 | |||
| jr ra | |||
| f2: | |||
| addi sp,sp,-32 | |||
| sw ra,28(sp) | |||
| sw s0,24(sp) | |||
| sw s1,20(sp) | |||
| sw s2,16(sp) | |||
| sw s3,12(sp) | |||
| sw s4,8(sp) | |||
| mv s3,a0 | |||
| li s2,0 | |||
| li s0,0 | |||
| li s4,3 | |||
| .L15: | |||
| sub a0,s3,s0 | |||
| call f1 | |||
| mv s1,a0 | |||
| add a0,s0,s3 | |||
| call f1 | |||
| add a0,s1,a0 | |||
| add s2,s2,a0 | |||
| addi s0,s0,1 | |||
| bne s0,s4,.L15 | |||
| mv a0,s2 | |||
| lw ra,28(sp) | |||
| lw s0,24(sp) | |||
| lw s1,20(sp) | |||
| lw s2,16(sp) | |||
| lw s3,12(sp) | |||
| lw s4,8(sp) | |||
| addi sp,sp,32 | |||
| jr ra | |||
| f3: | |||
| addi sp,sp,-16 | |||
| sw ra,12(sp) | |||
| sw s0,8(sp) | |||
| sw s1,4(sp) | |||
| mv s0,a0 | |||
| li a1,10 | |||
| call rem | |||
| beqz a0,.L23 | |||
| li a1,20 | |||
| mv a0,s0 | |||
| call rem | |||
| beqz a0,.L24 | |||
| mv a0,s0 | |||
| call f1 | |||
| mv s1,a0 | |||
| mv a0,s0 | |||
| call f2 | |||
| add a0,s1,a0 | |||
| .L18: | |||
| lw ra,12(sp) | |||
| lw s0,8(sp) | |||
| lw s1,4(sp) | |||
| addi sp,sp,16 | |||
| jr ra | |||
| .L23: | |||
| mv a0,s0 | |||
| call f2 | |||
| j .L18 | |||
| .L24: | |||
| mv a0,s0 | |||
| call f1 | |||
| j .L18 | |||
| getCall: | |||
| addi sp,sp,-16 | |||
| sw ra,12(sp) | |||
| beqz a0,.L30 | |||
| li a5,1 | |||
| beq a0,a5,.L31 | |||
| mv a0,a1 | |||
| call f3 | |||
| .L25: | |||
| lw ra,12(sp) | |||
| addi sp,sp,16 | |||
| jr ra | |||
| .L30: | |||
| mv a0,a1 | |||
| call f1 | |||
| j .L25 | |||
| .L31: | |||
| mv a0,a1 | |||
| call f2 | |||
| j .L25 | |||
| run: | |||
| addi sp,sp,-48 | |||
| sw ra,44(sp) | |||
| sw s0,40(sp) | |||
| sw s1,36(sp) | |||
| sw s2,32(sp) | |||
| sw s3,28(sp) | |||
| sw s4,24(sp) | |||
| sw s5,20(sp) | |||
| sw s6,16(sp) | |||
| sw s7,12(sp) | |||
| sw s8,8(sp) | |||
| li s1,0 | |||
| li s0,0 | |||
| li s3,0 | |||
| li s7,56 | |||
| li s6,2 | |||
| li s5,3 | |||
| li s4,24 | |||
| .L35: | |||
| sub a5,s7,s1 | |||
| lw s8,0(a5) | |||
| sgt a5,s0,s6 | |||
| xori a5,a5,1 | |||
| add s0,s0,a5 | |||
| sub a5,s0,s5 | |||
| snez a5,a5 | |||
| sub a5,zero,a5 | |||
| and s0,s0,a5 | |||
| lw a1,0(s1) | |||
| mv a0,s0 | |||
| call getCall | |||
| mv s2,a0 | |||
| mv a1,s8 | |||
| mv a0,s0 | |||
| call getCall | |||
| sub a0,s2,a0 | |||
| add s3,s3,a0 | |||
| addi s1,s1,4 | |||
| bne s1,s4,.L35 | |||
| mv a0,s3 | |||
| lw ra,44(sp) | |||
| lw s0,40(sp) | |||
| lw s1,36(sp) | |||
| lw s2,32(sp) | |||
| lw s3,28(sp) | |||
| lw s4,24(sp) | |||
| lw s5,20(sp) | |||
| lw s6,16(sp) | |||
| lw s7,12(sp) | |||
| lw s8,8(sp) | |||
| addi sp,sp,48 | |||
| jr ra | |||
| #memset 0x0, 0x4 | |||
| #memset 0x4, 0x7 | |||
| #memset 0x8, 0x3 | |||
| #memset 0xc, 0x8 | |||
| #memset 0x10, 0x4 | |||
| #memset 0x14, 0x22 | |||
| #memset 0x18, 0x19 | |||
| #memset 0x1c, 0x8 | |||
| #memset 0x20, 0x11 | |||
| #memset 0x24, 0x10 | |||
| #memset 0x28, 0x9 | |||
| #memset 0x2c, 0x8 | |||
| #memset 0x30, 0x7 | |||
| #memset 0x34, 0x6 | |||
| #memset 0x38, 0x5 | |||
| #memset 0x3c, 0x10 | |||
| @@ -21,7 +21,7 @@ object Manifest { | |||
| // TODO: Change back after add test succeeds | |||
| val singleTest = "addi.s" //"forward2.s" | |||
| val nopPadded = true | |||
| val nopPadded = false | |||
| val singleTestOptions = TestOptions( | |||
| printIfSuccessful = true, | |||
| @@ -32,7 +32,8 @@ object Manifest { | |||
| printMergedTrace = true, | |||
| nopPadded = nopPadded, | |||
| breakPoints = Nil, // not implemented | |||
| testName = singleTest) | |||
| testName = singleTest, | |||
| maxSteps = 15000) | |||
| val allTestOptions: String => TestOptions = name => TestOptions( | |||
| @@ -44,11 +45,32 @@ object Manifest { | |||
| printMergedTrace = false, | |||
| nopPadded = nopPadded, | |||
| breakPoints = Nil, // not implemented | |||
| testName = name) | |||
| testName = name, | |||
| maxSteps = 15000) | |||
| } | |||
| /** | |||
| * Runs the branch profiler (TestRunner.profileBranching) on branchProfiling.s, reusing | |||
| * Manifest.singleTestOptions with only the test name and the step limit (50000) replaced. | |||
| */ | |||
| class ProfileBranching extends FlatSpec with Matchers { | |||
| it should "profile some branches" in { | |||
| TestRunner.profileBranching( | |||
| Manifest.singleTestOptions.copy(testName = "branchProfiling.s", maxSteps = 50000) | |||
| ) should be(true) | |||
| } | |||
| } | |||
| /** | |||
| * Runs the cache profiler (TestRunner.profileCache) on convolution.s, reusing | |||
| * Manifest.singleTestOptions with only the test name and the step limit (150000) replaced. | |||
| * Prints a warning about the long run time before starting. | |||
| */ | |||
| class ProfileCache extends FlatSpec with Matchers { | |||
| it should "profile a cache" in { | |||
| say("Warning, this test takes forever to run! 2 minutes on my machine at least.") | |||
| say("This happens due to the less than optimal way of storing the update log. Sorry I guess") | |||
| say("You probably want to debug this with a smaller program") | |||
| TestRunner.profileCache( | |||
| Manifest.singleTestOptions.copy(testName = "convolution.s", maxSteps = 150000) | |||
| ) should be(true) | |||
| } | |||
| } | |||
| class SingleTest extends FlatSpec with Matchers { | |||
| it should "just werk" in { | |||
| TestRunner.run(Manifest.singleTestOptions) should be(true) | |||
| @@ -58,7 +80,7 @@ class SingleTest extends FlatSpec with Matchers { | |||
| class AllTests extends FlatSpec with Matchers { | |||
| it should "just werk" in { | |||
| val werks = getAllTestNames.map{testname => | |||
| val werks = getAllTestNames.filterNot(_ == "convolution.s").map{testname => | |||
| say(s"testing $testname") | |||
| val opts = Manifest.allTestOptions(testname) | |||
| (testname, TestRunner.run(opts)) | |||
| @@ -37,10 +37,11 @@ object Data { | |||
| case class MemRead(addr: Addr, word: Int) extends ExecutionEvent | |||
| // addr is the target address | |||
| case class PcUpdateJALR(addr: Addr) extends ExecutionEvent | |||
| case class PcUpdateJAL(addr: Addr) extends ExecutionEvent | |||
| case class PcUpdateB(addr: Addr) extends ExecutionEvent | |||
| case class PcUpdate(addr: Addr) extends ExecutionEvent | |||
| case class PcUpdateJALR(addr: Addr) extends ExecutionEvent | |||
| case class PcUpdateJAL(addr: Addr) extends ExecutionEvent | |||
| case class PcUpdateBranch(addr: Addr, target: Addr) extends ExecutionEvent | |||
| case class PcUpdateNoBranch(addr: Addr) extends ExecutionEvent | |||
| case class PcUpdate(addr: Addr) extends ExecutionEvent | |||
| case class ExecutionTraceEvent(pc: Addr, event: ExecutionEvent*){ override def toString(): String = s"$pc: " + event.toList.mkString(", ") } | |||
| type ExecutionTrace[A] = Writer[List[ExecutionTraceEvent], A] | |||
| @@ -168,6 +169,17 @@ object Data { | |||
| } | |||
| def log2: Int = math.ceil(math.log(i.toDouble)/math.log(2.0)).toInt | |||
| // Maps an address to a slot index in a table with `slots` entries: the two lowest | |||
| // bits are discarded and the next slots.log2 bits are returned, | |||
| // e.g. 0x1C44.getTag(8) == 1 and 0x1C60.getTag(8) == 0. | |||
| // A mask is used instead of a shift-left / shift-right pair because JVM Int shifts | |||
| // only use the low five bits of the shift distance: with slots == 1 the former | |||
| // `>>> 32` was a no-op and the method returned `i << 30` instead of 0. | |||
| def getTag(slots: Int): Int = { | |||
| val indexBits = slots.log2 | |||
| val wordIndex = i >>> 2 | |||
| // With 30 or more index bits every remaining bit belongs to the index. | |||
| if (indexBits >= 30) wordIndex | |||
| else wordIndex & ((1 << indexBits) - 1) | |||
| } | |||
| } | |||
| implicit class StringOps(s: String) { | |||
| @@ -235,7 +247,6 @@ object Data { | |||
| ops : List[SourceInfo[Op]], | |||
| settings : List[TestSetting], | |||
| labelMap : Map[Label, Addr], | |||
| maxSteps : Int = 5000 | |||
| ){ | |||
| def imem: Map[Addr, Op] = | |||
| @@ -271,7 +282,7 @@ object Data { | |||
| /** | |||
| * Returns the binary code and the execution trace or an error for convenient error checking. | |||
| */ | |||
| def validate: Either[String, (Map[Addr, Int], ExecutionTrace[VM])] = machineCode.flatMap{ binary => | |||
| def validate(maxSteps: Int): Either[String, (Map[Addr, Int], ExecutionTrace[VM])] = machineCode.flatMap{ binary => | |||
| val uk = "UNKNOWN" | |||
| val (finish, trace) = VM.run(maxSteps, vm) | |||
| finish match { | |||
| @@ -24,7 +24,6 @@ object Ops { | |||
| sealed trait JImmediate extends ImmType | |||
| sealed trait ShiftImmediate extends ImmType | |||
| sealed trait Comparison { | |||
| def run(rs1Val: Int, rs2Val: Int): Boolean | |||
| } | |||
| @@ -51,7 +50,10 @@ object Ops { | |||
| def beqz(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, EQ) | |||
| def bnez(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, NE) | |||
| def blez(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, LT) | |||
| def blez(rs1: Int, dst: Label) = Branch(Reg(0), Reg(rs1), dst, GE) | |||
| def bgez(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, GE) | |||
| def bltz(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, LT) | |||
| def bgtz(rs1: Int, dst: Label) = Branch(Reg(0), Reg(rs1), dst, LT) | |||
| } | |||
| sealed trait someDecorator | |||
| @@ -105,22 +107,22 @@ object Ops { | |||
| def sra( rd: Int, rs1: Int, imm: Int) = ArithImmShift(Reg(rd), Reg(rs1), Imm(imm), SRA) | |||
| } | |||
| case class LUI(rd: Reg, imm: Imm) extends Op with UType | |||
| case class AUIPC(rd: Reg, imm: Imm) extends Op with UType | |||
| case class SW(rs2: Reg, rs1: Reg, offset: Imm) extends Op with SType | |||
| case class LW(rd: Reg, rs1: Reg, offset: Imm) extends Op with IType | |||
| case class LUI(rd: Reg, imm: Imm) extends Op with UType | |||
| case class AUIPC(rd: Reg, imm: Imm) extends Op with UType | |||
| case class JALR(rd: Reg, rs1: Reg, dst: String) extends Op with IType | |||
| case class JAL(rd: Reg, dst: String) extends Op with UType | |||
| case class SW(rs2: Reg, rs1: Reg, offset: Imm) extends Op with SType | |||
| case class LW(rd: Reg, rs1: Reg, offset: Imm) extends Op with IType | |||
| object LUI { def apply(rd: Int, imm: Int): LUI = LUI(Reg(rd), Imm(imm)) } | |||
| object AUIPC { def apply(rd: Int, imm: Int): AUIPC = AUIPC(Reg(rd), Imm(imm)) } | |||
| object SW { def apply(rs2: Int, rs1: Int, offset: Int): SW = SW(Reg(rs2), Reg(rs1), Imm(offset)) } | |||
| object LW { def apply(rd: Int, rs1: Int, offset: Int): LW = LW(Reg(rd), Reg(rs1), Imm(offset)) } | |||
| object JAL{ def apply(rd: Int, dst: String): JAL = JAL(Reg(rd), dst) } | |||
| object JALR{ def apply(rd: Int, rs1: Int, dst: String): JALR = JALR(Reg(rd), Reg(rs1), dst) } | |||
| object SW { def apply(rs2: Int, rs1: Int, offset: Int): SW = SW(Reg(rs2), Reg(rs1), Imm(offset)) } | |||
| object LW { def apply(rd: Int, rs1: Int, offset: Int): LW = LW(Reg(rd), Reg(rs1), Imm(offset)) } | |||
| // This op should not be assembled, but will for the sake of simplicity be rendered as a NOP | |||
| case object DONE extends Op with IType { val rd = Reg(0); val rs1 = Reg(0) } | |||
| @@ -66,6 +66,7 @@ object Parser { | |||
| stringWs("sra") ~> arith.mapN{Arith.sra}, | |||
| stringWs("slt") ~> arith.mapN{Arith.slt}, | |||
| stringWs("sgt") ~> arith.mapN{ case(x,y,z) => Arith.slt(x,z,y)}, | |||
| stringWs("sltu") ~> arith.mapN{Arith.sltu}, | |||
| // pseudos | |||
| @@ -99,10 +100,7 @@ object Parser { | |||
| stringWs("seqz") ~> (reg <~ sep, reg, ok(1)).mapN{ArithImm.sltu}, | |||
| stringWs("li") ~> (reg ~ sep ~ (hex | int)).collect{ | |||
| case((a, b), c) if (c.nBitsS <= 12) => { | |||
| say(s"for c: $c, nBitsS was ${c.nBitsS}") | |||
| ArithImm.add(a, 0, c) | |||
| } | |||
| case((a, b), c) if (c.nBitsS <= 12) => { ArithImm.add(a, 0, c) } | |||
| }, | |||
| @@ -38,21 +38,19 @@ case class VM( | |||
| } | |||
| private def executeBranch(op: Branch) = { | |||
| getAddr(op.dst).map{ addr => | |||
| val takeBranch = regs.compare(op.rs1, op.rs2, op.comp.run) | |||
| if(takeBranch){ | |||
| val nextVM = copy(pc = addr) | |||
| jump(nextVM, PcUpdateB(nextVM.pc)) | |||
| jump(nextVM, PcUpdateBranch(pc, nextVM.pc)) | |||
| } | |||
| else { | |||
| step(this) | |||
| step(this, PcUpdateNoBranch(this.pc + Addr(4))) | |||
| } | |||
| } | |||
| } | |||
| /** | |||
| * The weird :_* syntax is simply a way to pass a list to a varArgs function. | |||
| * | |||
| @@ -40,9 +40,10 @@ object PrintUtils { | |||
| case MemRead(addr, word) => fansi.Color.Red(f"M[${addr.show}] -> 0x${word.hs}") | |||
| // addr is the target address | |||
| case PcUpdateJALR(addr) => fansi.Color.Green(s"PC updated to ${addr.show} via JALR") | |||
| case PcUpdateJAL(addr) => fansi.Color.Magenta(s"PC updated to ${addr.show} via JAL") | |||
| case PcUpdateB(addr) => fansi.Color.Yellow(s"PC updated to ${addr.show} via Branch") | |||
| case PcUpdateJALR(addr) => fansi.Color.Green(s"PC updated to ${addr.show} via JALR") | |||
| case PcUpdateJAL(addr) => fansi.Color.Magenta(s"PC updated to ${addr.show} via JAL") | |||
| case PcUpdateBranch(from, to) => fansi.Color.Yellow(s"PC updated to ${to.show} via Branch") | |||
| case PcUpdateNoBranch(addr) => fansi.Color.Yellow(s"PC updated to ${addr.show}, skipping a Branch") | |||
| } | |||
| } | |||
| @@ -100,6 +101,7 @@ object PrintUtils { | |||
| def binary: String = String.format("%" + 32 + "s", i.toBinaryString) | |||
| .replace(' ', '0').grouped(4) | |||
| .map(x => x + " ").mkString | |||
| def binary(n: Int): String = String.format("%" + n + "s", i.toBinaryString).replace(' ', '0') | |||
| } | |||
| @@ -25,7 +25,8 @@ case class TestOptions( | |||
| printMergedTrace : Boolean, | |||
| nopPadded : Boolean, | |||
| breakPoints : List[Int], // Not implemented | |||
| testName : String | |||
| testName : String, | |||
| maxSteps : Int | |||
| ) | |||
| case class TestResult( | |||
| @@ -44,12 +45,12 @@ object TestRunner { | |||
| val testResults = for { | |||
| lines <- fileUtils.readTest(testOptions) | |||
| program <- FiveStage.Parser.parseProgram(lines, testOptions) | |||
| (binary, (trace, finalVM)) <- program.validate.map(x => (x._1, x._2.run)) | |||
| (binary, (trace, finalVM)) <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run)) | |||
| (termitationCause, chiselTrace) <- ChiselTestRunner( | |||
| binary.toList.sortBy(_._1.value).map(_._2), | |||
| program.settings, | |||
| finalVM.pc, | |||
| 15000) | |||
| binary.toList.sortBy(_._1.value).map(_._2), | |||
| program.settings, | |||
| finalVM.pc, | |||
| testOptions.maxSteps) | |||
| } yield { | |||
| val traces = mergeTraces(trace, chiselTrace).map(x => printMergedTraces((x), program)) | |||
| @@ -100,4 +101,98 @@ object TestRunner { | |||
| successful | |||
| }.toOption.getOrElse(false) | |||
| } | |||
| /** | |||
| * Runs the given test program on the VM, extracts the branch events from its execution | |||
| * trace and prints the number of mispredictions of a sample branch-predictor model. | |||
| * | |||
| * @param testOptions selects the program (testName) and the VM step limit (maxSteps) | |||
| * @return true if the program was read, parsed and validated so that the profiler ran, | |||
| * false otherwise | |||
| */ | |||
| def profileBranching(testOptions: TestOptions): Boolean = { | |||
| val testResults = for { | |||
| lines <- fileUtils.readTest(testOptions) | |||
| program <- FiveStage.Parser.parseProgram(lines, testOptions) | |||
| (binary, (trace, finalVM)) <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run)) | |||
| } yield { | |||
| sealed trait BranchEvent | |||
| case class Taken(from: Int, to: Int) extends BranchEvent { override def toString = s"Taken ${from.hs}\t${to.hs}" } | |||
| case class NotTaken(addr: Int) extends BranchEvent { override def toString = s"Not Taken ${addr.hs}" } | |||
| // Keep only the branch-related events of the trace, in execution order | |||
| val events: List[BranchEvent] = trace.flatMap(_.event).collect{ | |||
| case PcUpdateBranch(from, to) => Taken(from.value, to.value) | |||
| case PcUpdateNoBranch(at) => NotTaken(at.value) | |||
| } | |||
| /** | |||
| * This is a sample profiler for a rather unrealistic branch predictor which has an unlimited amount | |||
| * of slots | |||
| */ | |||
| def OneBitInfiniteSlots(events: List[BranchEvent]): Int = { | |||
| // Uncomment to take a look at the event log | |||
| // say(events.mkString("\n","\n","\n")) | |||
| // Helper inspects the next element of the event list. If the event is a mispredict the prediction table is updated | |||
| // to reflect this. | |||
| // As long as there are remaining events the helper calls itself recursively on the remainder | |||
| def helper(events: List[BranchEvent], predictionTable: Map[Int, Boolean]): Int = { | |||
| events match { | |||
| // Scala syntax for matching a list with a head element of some type and a tail | |||
| // `case h :: t =>` | |||
| // means we want to match a list with at least a head and a tail (tail can be Nil, so we | |||
| // essentially want to match a list with at least one element) | |||
| // h is the first element of the list, t is the remainder (which can be Nil, aka empty) | |||
| // `case Constructor(arg1, arg2) :: t => ` | |||
| // means we want to match a list whose first element is of type Constructor, giving us access to its internal | |||
| // values. | |||
| // `case Constructor(arg1, arg2) :: t => if(p(arg1, arg2))` | |||
| // means we want to match a list whose first element is of type Constructor while satisfying some predicate p, | |||
| // called an if guard. | |||
| case Taken(from, to) :: t if( predictionTable(from)) => helper(t, predictionTable) | |||
| case Taken(from, to) :: t if(!predictionTable(from)) => 1 + helper(t, predictionTable.updated(from, true)) | |||
| case NotTaken(addr) :: t if( predictionTable(addr)) => 1 + helper(t, predictionTable.updated(addr, false)) | |||
| case NotTaken(addr) :: t if(!predictionTable(addr)) => helper(t, predictionTable) | |||
| case Nil => 0 | |||
| } | |||
| } | |||
| // Initially every possible branch is set to false since the initial state of the predictor is to assume branch not taken | |||
| def initState = events.map{ | |||
| case Taken(from, addr) => (from, false) | |||
| case NotTaken(addr) => (addr, false) | |||
| }.toMap | |||
| helper(events, initState) | |||
| } | |||
| say(OneBitInfiniteSlots(events)) | |||
| } | |||
| // Report failure when the program could not be read, parsed or validated, | |||
| // instead of unconditionally claiming success. | |||
| testResults.isRight | |||
| } | |||
| /** | |||
| * Runs the given test program on the VM and extracts the memory read/write events from | |||
| * its execution trace; the cache model itself is left as an exercise. | |||
| * | |||
| * @param testOptions selects the program (testName) and the VM step limit (maxSteps) | |||
| * @return true if the program was read, parsed and validated, false otherwise | |||
| */ | |||
| def profileCache(testOptions: TestOptions): Boolean = { | |||
| val testResults = for { | |||
| lines <- fileUtils.readTest(testOptions) | |||
| program <- FiveStage.Parser.parseProgram(lines, testOptions) | |||
| (binary, (trace, finalVM)) <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run)) | |||
| } yield { | |||
| sealed trait MemoryEvent | |||
| case class Write(addr: Int) extends MemoryEvent | |||
| case class Read(addr: Int) extends MemoryEvent | |||
| // Keep only the memory accesses of the trace, in execution order | |||
| val events: List[MemoryEvent] = trace.flatMap(_.event).collect{ | |||
| case MemWrite(x,_) => Write(x.value) | |||
| case MemRead(x,_) => Read(x.value) | |||
| } | |||
| // Your cache here | |||
| } | |||
| // Report failure when the program could not be read, parsed or validated, | |||
| // instead of unconditionally claiming success. | |||
| testResults.isRight | |||
| } | |||
| } | |||
| @@ -0,0 +1,277 @@ | |||
| * Question 1 - Hazards | |||
| For the following programs describe each hazard with type (data or control), line number and a | |||
| small (max one sentence) description | |||
| ** program 1 | |||
| #+begin_src asm | |||
| addi t0, zero, 10 | |||
| addi t1, zero, 20 | |||
| L2: | |||
| sub t1, t1, t0 | |||
| beq t1, zero, .L2 | |||
| jr ra | |||
| #+end_src | |||
| ** program 2 | |||
| #+begin_src asm | |||
| addi t0, zero, 10 | |||
| lw t0, 10(t0) | |||
| beq t0, zero, .L3 | |||
| jr ra | |||
| #+end_src | |||
| ** program 3 | |||
| #+begin_src asm | |||
| lw t0, 0(t0) | |||
| lw t1, 4(t0) | |||
| sw t0, 8(t1) | |||
| lw t1, 12(t0) | |||
| beq t0, t1, .L3 | |||
| jr ra | |||
| #+end_src | |||
| * Question 2 - Handling hazards | |||
| For this question, keep in mind that the forwarder does not care if the values it forwards are being used or not! | |||
| Even for a JAL instruction, which has neither an rs1 nor an rs2 field, the forwarder must still forward its values. | |||
| ** Data hazards 1 | |||
| At some cycle the following instructions can be found in a 5 stage design: | |||
| #+begin_src text | |||
| EX: || MEM: || WB: | |||
| ---------------------||-------------------------||-------------------------- | |||
| rs1: 4 || rs1: 4 || rs1: 1 | |||
| rs2: 5 || rs2: 6 || rs2: 2 | |||
| rd: 6 || rd: 4 || rd: 5 | |||
| memToReg = false || memToReg = false || memToReg = false | |||
| regWrite = true || regWrite = false || regWrite = true | |||
| memWrite = false || memWrite = false || memWrite = false | |||
| branch = false || branch = true || branch = false | |||
| jump = false || jump = false || jump = false | |||
| #+end_src | |||
| For the operation currently in EX, from where (ID, MEM or WB) should the forwarder get data from for rs1 and rs2? | |||
| ** Data hazards 2 | |||
| At some cycle the following instructions can be found in a 5 stage design: | |||
| #+begin_src text | |||
| EX: || MEM: || WB: | |||
| ---------------------||-------------------------||-------------------------- | |||
| rs1: 1 || rs1: 4 || rs1: 1 | |||
| rs2: 5 || rs2: 6 || rs2: 0 | |||
| rd: 0 || rd: 1 || rd: 0 | |||
| memToReg = false || memToReg = false || memToReg = false | |||
| regWrite = true || regWrite = true || regWrite = true | |||
| memWrite = false || memWrite = false || memWrite = false | |||
| branch = false || branch = true || branch = false | |||
| jump = true || jump = true || jump = false | |||
| #+end_src | |||
| For the operation currently in EX, from where (ID, MEM or WB) should the forwarder get data from for rs1 and rs2? | |||
| ** Data hazards 3 | |||
| At some cycle the following instructions can be found in a 5 stage design: | |||
| #+begin_src text | |||
| EX: || MEM: || WB: | |||
| ---------------------||-------------------------||-------------------------- | |||
| rs1: 2 || rs1: 4 || rs1: 3 | |||
| rs2: 5 || rs2: 6 || rs2: 4 | |||
| rd: 1 || rd: 1 || rd: 5 | |||
| memToReg = false || memToReg = true || memToReg = false | |||
| regWrite = false || regWrite = true || regWrite = true | |||
| memWrite = true || memWrite = false || memWrite = false | |||
| branch = false || branch = false || branch = false | |||
| jump = false || jump = false || jump = false | |||
| #+end_src | |||
| Should the forwarding unit issue a load hazard signal? | |||
| (Hint: what are the semantics of the instruction currently in EX stage?) | |||
| * Question 3 - Branch prediction | |||
| Consider a 2 bit branch predictor with only 4 slots for a 32 bit architecture (without BTB), where the decision to | |||
| take a branch or not is decided in accordance to the following table: | |||
| #+begin_src text | |||
| state || predict taken || next state if taken || next state if not taken || | |||
| =======||=================||=======================||==========================|| | |||
| 00 || NO || 01 || 00 || | |||
| 01 || NO || 10 || 00 || | |||
| 10 || YES || 11 || 01 || | |||
| 11 || YES || 11 || 10 || | |||
| #+end_src | |||
| (This is known as a saturating 2bit counter, it is *not* the same scheme as in the lecture slides) | |||
| At some point during execution the program counter is ~0xc~ and the branch predictor table looks like this: | |||
| #+begin_src text | |||
| slot || value | |||
| ======||======== | |||
| 00 || 01 | |||
| 01 || 00 | |||
| 10 || 11 | |||
| 11 || 01 | |||
| #+end_src | |||
| For the following program: | |||
| #+begin_src asm | |||
| 0xc addi x1, x3, 10 | |||
| 0x10 add x2, x1, x1 | |||
| 0x14 beq x1, x2, .L1 | |||
| 0x18 j .L2 | |||
| #+end_src | |||
| Will the predictor predict taken or not taken for the beq instruction? | |||
| * Question 4 - Benchmarking | |||
| In order to gauge the performance increase from adding branch predictors it is necessary to do some testing. | |||
| Rather than writing a test from scratch it is better to use the tester already in use in the test harness. | |||
| When running a program the VM outputs a log of all events, including which branches have been taken and which | |||
| haven't, which as it turns out is the only information we actually need to gauge the effectiveness of a branch | |||
| predictor! | |||
| For this exercise you will write a program that parses a log of branch events. | |||
| #+BEGIN_SRC scala | |||
| sealed trait BranchEvent | |||
| case class Taken(from: Int, to: Int) extends BranchEvent | |||
| case class NotTaken(at: Int) extends BranchEvent | |||
| def profile(events: List[BranchEvent]): Int = ??? | |||
| #+END_SRC | |||
| To help you get started, I have provided you with much of the necessary code. | |||
| In order to get an idea of how you should profile branch misses, consider the following profiler, which calculates | |||
| misses for a processor with a 1-bit branch predictor that has an unlimited number of slots: | |||
| #+BEGIN_SRC scala | |||
| // Counts the mispredictions of a 1 bit predictor with one slot per branch address. | |||
| def OneBitInfiniteSlots(events: List[BranchEvent]): Int = { | |||
| // Helper inspects the next element of the event list. If the event is a mispredict the prediction table is updated | |||
| // to reflect this. | |||
| // As long as there are remaining events the helper calls itself recursively on the remainder | |||
| def helper(events: List[BranchEvent], predictionTable: Map[Int, Boolean]): Int = { | |||
| events match { | |||
| // Scala syntax for matching a list with a head element of some type and a tail | |||
| // `case h :: t =>` | |||
| // means we want to match a list with at least a head and a tail (tail can be Nil, so we | |||
| // essentially want to match a list with at least one element) | |||
| // h is the first element of the list, t is the remainder (which can be Nil, aka empty) | |||
| // `case Constructor(arg1, arg2) :: t => ` | |||
| // means we want to match a list whose first element is of type Constructor, giving us access to its internal | |||
| // values. | |||
| // `case Constructor(arg1, arg2) :: t => if(p(arg1, arg2))` | |||
| // means we want to match a list whose first element is of type Constructor while satisfying some predicate p, | |||
| // called an if guard. | |||
| case Taken(from, to) :: t if( predictionTable(from)) => helper(t, predictionTable) | |||
| case Taken(from, to) :: t if(!predictionTable(from)) => 1 + helper(t, predictionTable.updated(from, true)) | |||
| case NotTaken(addr) :: t if( predictionTable(addr)) => 1 + helper(t, predictionTable.updated(addr, false)) | |||
| case NotTaken(addr) :: t if(!predictionTable(addr)) => helper(t, predictionTable) | |||
| case _ => 0 | |||
| } | |||
| } | |||
| // Initially every possible branch is set to false since the initial state of the predictor is to assume branch not taken | |||
| // Taken carries two fields (from, to); the table is keyed on the address of the branch itself | |||
| def initState = events.map{ | |||
| case Taken(from, _) => (from, false) | |||
| case NotTaken(addr) => (addr, false) | |||
| }.toMap | |||
| helper(events, initState) | |||
| } | |||
| #+END_SRC | |||
| ** Your task | |||
| Your job is to implement a test that checks how many misses occur for a 2 bit branch predictor with 8 slots. | |||
| The rule table is the same as in question 3. | |||
| The predictor does not use a branch target buffer (BTB), which means that the address will always be decoded in | |||
| the ID stage. | |||
| For you this means you do not need to keep track of branch targets, simplifying your simulation quite a bit. | |||
| (If not you would need to add logic for when BTB value does not match actual value) | |||
| For simplicity's sake, assume that every value in the table is initialized to 00. | |||
| For this task it is necessary to use something more sophisticated than ~Map[Int, Boolean]~ to represent | |||
| your branch predictor model. | |||
| The skeleton code is located in ~testRunner.scala~ and can be run using ~testOnly FiveStage.ProfileBranching~. | |||
| With a 2 bit 8 slot scheme, how many mispredicts will happen? | |||
| Answer with a number. | |||
| Hint: Use the getTag method defined on int (in DataTypes.scala) to get the tag for an address. | |||
| #+BEGIN_SRC scala | |||
| val slots = 8 | |||
| say(0x1C40.getTag(slots)) // prints 0 | |||
| say(0x1C44.getTag(slots)) // prints 1 | |||
| say(0x1C48.getTag(slots)) // prints 2 | |||
| say(0x1C4C.getTag(slots)) // prints 3 | |||
| say(0x1C50.getTag(slots)) // prints 4 | |||
| say(0x1C54.getTag(slots)) // prints 5 | |||
| say(0x1C58.getTag(slots)) // prints 6 | |||
| say(0x1C5C.getTag(slots)) // prints 7 | |||
| say(0x1C60.getTag(slots)) // prints 0 (thus conflicts with 0x1C40) | |||
| #+END_SRC | |||
| * Question 5 - Cache profiling | |||
| Unlike our design which has a very limited memory pool, real designs have access to vast amounts of memory, offset | |||
| by a steep cost in access latency. | |||
| To mitigate this, a modern processor features several caches, where even the smallest and fastest cache has more memory than | |||
| your entire design. | |||
| In order to investigate how caches can alter performance it is therefore necessary to make some rather | |||
| unrealistic assumptions to see how different cache schemes impact performance. | |||
| We will therefore assume the following: | |||
| + Reads from main memory take 5 cycles | |||
| + cache has a total storage of 8 words (256 bits) | |||
| + cache reads work as they do now (i.e. no additional latency) | |||
| For this exercise you will write a program that parses a log of memory events, similar to the previous task. | |||
| #+BEGIN_SRC scala | |||
| sealed trait MemoryEvent | |||
| case class Write(addr: Int) extends MemoryEvent | |||
| case class Read(addr: Int) extends MemoryEvent | |||
| def profile(events: List[MemoryEvent]): Int = ??? | |||
| #+END_SRC | |||
| ** Your task | |||
| Your job is to implement a model that tests how many delay cycles will occur for a cache which: | |||
| + Follows a 2-way associative scheme | |||
| + set size is 4 words (128 bits) (total cache size: a whopping 256 bits) | |||
| + Block size is 1 word (32 bits) meaning that we *do not need a block offset*. | |||
| + Is write-through write no-allocate (this means that you can ignore stores, only loads will affect the cache) | |||
| + Eviction policy is LRU (least recently used) | |||
| In the typical cache each block has more than 32 bits, requiring an offset, however the | |||
| simulated cache does not. | |||
| This means that the simulated cache has two sets of 4 words, greatly reducing the complexity | |||
| of your implementation. | |||
| Additionally, assume that writes do not change the LRU counter. | |||
| This means that your cache will only consider which value was most recently loaded, | |||
| not written. | |||
| It's not realistic, but it allows you to completely disregard write events (you can | |||
| just filter them out if you want.) | |||
| Your answer should be the number of cache miss latency cycles when using this cache. | |||
| *** Further study | |||
| If you have the time I strongly encourage you to experiment with a larger cache with bigger | |||
| block sizes, forcing you to implement the additional complexity of block offsets. | |||
| Likewise, by trying a different scheme than write-through no-allocate you will get a much | |||
| better grasp on how exactly the cache works. | |||
| This is *not* a deliverable, just something I encourage you to tinker with to get a better | |||
| understanding. | |||