vor 6 Jahren · c43a497178
--- a/src/test/resources/tests/programs/source/convolution.c
+++ b/src/test/resources/tests/programs/source/convolution.c
@@ -0,0 +1,53 @@
 // C rmsbolt starter file

 // Local Variables:
 // rmsbolt-command: "/opt/riscv/bin/riscv32-unknown-elf-gcc -O0"
 // rmsbolt-disassemble: nil
 // End:


 // Flattens a 2D coordinate into a linear index: x + y * dim.
 // The product is built by adding dim once per row, so no multiply is needed.
 // For y <= 0 the loop body never runs and the result is simply x.
 int lookup(int x, int y, int dim){
  int offset = 0;
  int row = 0;
  while(row < y){
    offset += dim;
    row++;
  }
  return offset + x;
 }

 // Computes one output pixel from the 3x3 neighbourhood around (x, y).
 //
 // x, y    coordinate of the centre pixel; neighbours at x-1..x+1, y-1..y+1 are read,
 //         so the caller must keep (x, y) at least one pixel away from the image border.
 // image   input pixels, indexed with a row stride of 32.
 // output  result pixels, indexed with a row stride of 30.
 // kernel  9 entries in row-major order (top-left first). Each entry is used as a
 //         left-shift amount for the matching neighbour, not as a multiplier.
 void convolutePixel(int x, int y, int* image, int* output, int* kernel){
  int acc = 0;
  // Row above the centre pixel.
  acc += image[lookup( x - 1 , y - 1 , 32)] << kernel[0];
  acc += image[lookup( x     , y - 1 , 32)] << kernel[1];
  acc += image[lookup( x + 1 , y - 1 , 32)] << kernel[2];

  // Row containing the centre pixel.
  acc += image[lookup( x - 1 , y     , 32)] << kernel[3];
  acc += image[lookup( x     , y     , 32)] << kernel[4];
  acc += image[lookup( x + 1 , y     , 32)] << kernel[5];

  // Row below the centre pixel.
  acc += image[lookup( x - 1 , y + 1 , 32)] << kernel[6];
  acc += image[lookup( x     , y + 1 , 32)] << kernel[7];
  acc += image[lookup( x + 1 , y + 1 , 32)] << kernel[8];

  // The output uses a narrower stride (30) than the input (32).
  output[lookup(x, y, 30)] = acc;
 }

 // Applies convolutePixel to every coordinate with both components in 1..30.
 // All three buffers live at fixed absolute addresses instead of being allocated.
 // Always returns 0.
 //
 // NOTE(review): the pointers are int*, so 1024 and 1924 are byte addresses. Assuming
 // 4-byte ints (riscv32 target named in the header), image indices go up to
 // 31 * 32 + 31 = 1023, i.e. bytes 0..4095, which overlaps both output (from byte 1024)
 // and kernel (from byte 1924). 1924 = 1024 + 900 suggests the offsets were meant as
 // word counts - confirm whether the overlap is intended for this fixture.
 int run() {

  // Fixed buffer locations; address 0 is dereferenced on purpose.
  int* image = (int*)0;
  int* output = (int*)(1024);
  int* kernel = (int*)(1924);

  int ii;
  int kk;
  // ii is passed as x and kk as y; starting at 1 and stopping at 30 keeps the
  // 3x3 neighbourhood inside the 0..31 index range used by convolutePixel.
  for(ii = 1; ii < 31; ii++){
    for(kk = 1; kk < 31; kk++){
      convolutePixel(ii, kk, image, output, kernel);
    }
  }
  return 0;
 }

 // Program entry point: executes the convolution once and exits with status 0.
 int main(){
  run();
  return 0;
 }
--- a/src/test/resources/tests/theory/branchProfiling.s
+++ b/src/test/resources/tests/theory/branchProfiling.s
@@ -0,0 +1,200 @@
 main:
 	addi	sp,sp,-16
 	sw	ra,12(sp)
 	call	run
 	lw	ra,12(sp)
 	addi	sp,sp,16
 	jr	ra
 rem:
 	bge	a0,a1,.L7
 	ret
 .L7:
 	addi	sp,sp,-16
 	sw	ra,12(sp)
 	sub	a0,a0,a1
 	call	rem
 	lw	ra,12(sp)
 	addi	sp,sp,16
 	jr	ra
 f1:
 	addi	sp,sp,-16
 	sw	ra,12(sp)
 	sw	s0,8(sp)
 	sw	s1,4(sp)
 	sw	s2,0(sp)
 	li	s1,0
 	li	s2,241
 	j	.L9
 .L11:
 	mv	a0,s0
 .L9:
 	addi	s0,a0,-1
 	blez	a0,.L8
 	beq	s0,s2,.L8
 	li	a1,10
 	mv	a0,s0
 	call	rem
 	bnez	a0,.L11
 	add	s1,s1,s0
 	j	.L11
 .L8:
 	mv	a0,s1
 	lw	ra,12(sp)
 	lw	s0,8(sp)
 	lw	s1,4(sp)
 	lw	s2,0(sp)
 	addi	sp,sp,16
 	jr	ra
 f2:
 	addi	sp,sp,-32
 	sw	ra,28(sp)
 	sw	s0,24(sp)
 	sw	s1,20(sp)
 	sw	s2,16(sp)
 	sw	s3,12(sp)
 	sw	s4,8(sp)
 	mv	s3,a0
 	li	s2,0
 	li	s0,0
 	li	s4,3
 .L15:
 	sub	a0,s3,s0
 	call	f1
 	mv	s1,a0
 	add	a0,s0,s3
 	call	f1
 	add	a0,s1,a0
 	add	s2,s2,a0
 	addi	s0,s0,1
 	bne	s0,s4,.L15
 	mv	a0,s2
 	lw	ra,28(sp)
 	lw	s0,24(sp)
 	lw	s1,20(sp)
 	lw	s2,16(sp)
 	lw	s3,12(sp)
 	lw	s4,8(sp)
 	addi	sp,sp,32
 	jr	ra
 f3:
 	addi	sp,sp,-16
 	sw	ra,12(sp)
 	sw	s0,8(sp)
 	sw	s1,4(sp)
 	mv	s0,a0
 	li	a1,10
 	call	rem
 	beqz	a0,.L23
 	li	a1,20
 	mv	a0,s0
 	call	rem
 	beqz	a0,.L24
 	mv	a0,s0
 	call	f1
 	mv	s1,a0
 	mv	a0,s0
 	call	f2
 	add	a0,s1,a0
 .L18:
 	lw	ra,12(sp)
 	lw	s0,8(sp)
 	lw	s1,4(sp)
 	addi	sp,sp,16
 	jr	ra
 .L23:
 	mv	a0,s0
 	call	f2
 	j	.L18
 .L24:
 	mv	a0,s0
 	call	f1
 	j	.L18
 getCall:
 	addi	sp,sp,-16
 	sw	ra,12(sp)
 	beqz	a0,.L30
 	li	a5,1
 	beq	a0,a5,.L31
 	mv	a0,a1
 	call	f3
 .L25:
 	lw	ra,12(sp)
 	addi	sp,sp,16
 	jr	ra
 .L30:
 	mv	a0,a1
 	call	f1
 	j	.L25
 .L31:
 	mv	a0,a1
 	call	f2
 	j	.L25
 run:
 	addi	sp,sp,-48
 	sw	ra,44(sp)
 	sw	s0,40(sp)
 	sw	s1,36(sp)
 	sw	s2,32(sp)
 	sw	s3,28(sp)
 	sw	s4,24(sp)
 	sw	s5,20(sp)
 	sw	s6,16(sp)
 	sw	s7,12(sp)
 	sw	s8,8(sp)
 	li	s1,0
 	li	s0,0
 	li	s3,0
 	li	s7,56
 	li	s6,2
 	li	s5,3
 	li	s4,24
 .L35:
 	sub	a5,s7,s1
 	lw	s8,0(a5)
 	sgt	a5,s0,s6
 	xori	a5,a5,1
 	add	s0,s0,a5
 	sub	a5,s0,s5
 	snez	a5,a5
 	sub	a5,zero,a5
 	and	s0,s0,a5
 	lw	a1,0(s1)
 	mv	a0,s0
 	call	getCall
 	mv	s2,a0
 	mv	a1,s8
 	mv	a0,s0
 	call	getCall
 	sub	a0,s2,a0
 	add	s3,s3,a0
 	addi	s1,s1,4
 	bne	s1,s4,.L35
 	mv	a0,s3
 	lw	ra,44(sp)
 	lw	s0,40(sp)
 	lw	s1,36(sp)
 	lw	s2,32(sp)
 	lw	s3,28(sp)
 	lw	s4,24(sp)
 	lw	s5,20(sp)
 	lw	s6,16(sp)
 	lw	s7,12(sp)
 	lw	s8,8(sp)
 	addi	sp,sp,48
 	jr	ra
 #memset 0x0,  0x4
 #memset 0x4,  0x7
 #memset 0x8,  0x3
 #memset 0xc,  0x8
 #memset 0x10, 0x4
 #memset 0x14, 0x22
 #memset 0x18, 0x19
 #memset 0x1c, 0x8
 #memset 0x20, 0x11
 #memset 0x24, 0x10
 #memset 0x28, 0x9
 #memset 0x2c, 0x8
 #memset 0x30, 0x7
 #memset 0x34, 0x6
 #memset 0x38, 0x5
 #memset 0x3c, 0x10
--- a/src/test/resources/tests/theory/convolution.s
+++ b/src/test/resources/tests/theory/convolution.s
--- a/src/test/scala/Manifest.scala
+++ b/src/test/scala/Manifest.scala
@@ -21,7 +21,7 @@ object Manifest {
  // TODO: Change back after add test succedes
  val singleTest = "addi.s" //"forward2.s"

  val nopPadded = true
  val nopPadded = false

  val singleTestOptions = TestOptions(
    printIfSuccessful  = true,
@@ -32,7 +32,8 @@ object Manifest {
    printMergedTrace   = true,
    nopPadded          = nopPadded,
    breakPoints        = Nil, // not implemented
    testName           = singleTest)
    testName           = singleTest,
    maxSteps           = 15000)


  val allTestOptions: String => TestOptions = name => TestOptions(
@@ -44,11 +45,32 @@ object Manifest {
    printMergedTrace   = false,
    nopPadded          = nopPadded,
    breakPoints        = Nil, // not implemented
    testName           = name)
    testName           = name,
    maxSteps           = 15000)

 }



 // Runs the branch profiler on branchProfiling.s with a raised step limit.
 class ProfileBranching extends FlatSpec with Matchers {
  it should "profile some branches" in {
    val branchOptions = Manifest.singleTestOptions.copy(
      testName = "branchProfiling.s",
      maxSteps = 50000)

    val profiled = TestRunner.profileBranching(branchOptions)
    profiled should be(true)
  }
 }

 // Runs the cache profiler on convolution.s with a raised step limit.
 class ProfileCache extends FlatSpec with Matchers {
  it should "profile a cache" in {
    // Tell the user up front why this test is slow.
    say("Warning, this test takes forever to run! 2 minutes on my machine at least.")
    say("This happens due to the less than optimal way of storing the update log. Sorry I guess")
    say("You probably want to debug this with a smaller program")

    val cacheOptions = Manifest.singleTestOptions.copy(
      testName = "convolution.s",
      maxSteps = 150000)

    val profiled = TestRunner.profileCache(cacheOptions)
    profiled should be(true)
  }
 }

 class SingleTest extends FlatSpec with Matchers {
  it should "just werk" in {
    TestRunner.run(Manifest.singleTestOptions) should be(true)
@@ -58,7 +80,7 @@ class SingleTest extends FlatSpec with Matchers {

 class AllTests extends FlatSpec with Matchers {
  it should "just werk" in {
    val werks = getAllTestNames.map{testname => 
    val werks = getAllTestNames.filterNot(_ == "convolution.s").map{testname => 
      say(s"testing $testname")
      val opts = Manifest.allTestOptions(testname)
      (testname, TestRunner.run(opts))
--- a/src/test/scala/RISCV/DataTypes.scala
+++ b/src/test/scala/RISCV/DataTypes.scala
@@ -37,10 +37,11 @@ object Data {
  case class MemRead(addr: Addr, word: Int)  extends ExecutionEvent

  // addr is the target address
  case class PcUpdateJALR(addr: Addr)        extends ExecutionEvent
  case class PcUpdateJAL(addr: Addr)         extends ExecutionEvent
  case class PcUpdateB(addr: Addr)           extends ExecutionEvent
  case class PcUpdate(addr: Addr)            extends ExecutionEvent
  case class PcUpdateJALR(addr: Addr)                 extends ExecutionEvent
  case class PcUpdateJAL(addr: Addr)                  extends ExecutionEvent
  case class PcUpdateBranch(addr: Addr, target: Addr) extends ExecutionEvent
  case class PcUpdateNoBranch(addr: Addr)             extends ExecutionEvent
  case class PcUpdate(addr: Addr)                     extends ExecutionEvent

  case class ExecutionTraceEvent(pc: Addr, event: ExecutionEvent*){ override def toString(): String = s"$pc: " + event.toList.mkString(", ") }
  type ExecutionTrace[A] = Writer[List[ExecutionTraceEvent], A]
@@ -168,6 +169,17 @@ object Data {
    }

    def log2: Int = math.ceil(math.log(i.toDouble)/math.log(2.0)).toInt

    // Discards two lowest bits
    /**
      * Maps an address to a slot number for a table with `slots` entries.
      *
      * The two lowest bits of the address are dropped, then the next `slots.log2` bits are
      * returned. (Strictly speaking this is the index part of the address; the exercise
      * text calls it the tag.)
      *
      * Implemented with a shift and a mask rather than a shift-left/shift-right pair: a
      * shift by 32 is a shift by 0 on the JVM, which made the old version return `i << 30`
      * instead of 0 for a single slot table.
      *
      * @param slots number of table entries; `log2` rounds up, so a value that is not a
      *              power of two uses the next larger power of two
      * @return a value in the range 0 until 2^(slots.log2)
      */
    def getTag(slots: Int): Int = {
      val indexBits = slots.log2
      val indexMask = (1 << indexBits) - 1
      (i >>> 2) & indexMask
    }
  }

  implicit class StringOps(s: String) {
@@ -235,7 +247,6 @@ object Data {
    ops      : List[SourceInfo[Op]],
    settings : List[TestSetting],
    labelMap : Map[Label, Addr],
    maxSteps : Int = 5000
  ){

   def imem: Map[Addr, Op] =
@@ -271,7 +282,7 @@ object Data {
    /**
      * Returns the binary code and the execution trace or an error for convenient error checking.
      */
    def validate: Either[String, (Map[Addr, Int], ExecutionTrace[VM])] = machineCode.flatMap{ binary =>
    def validate(maxSteps: Int): Either[String, (Map[Addr, Int], ExecutionTrace[VM])] = machineCode.flatMap{ binary =>
      val uk = "UNKNOWN"
      val (finish, trace) = VM.run(maxSteps, vm)
      finish match {
--- a/src/test/scala/RISCV/Ops.scala
+++ b/src/test/scala/RISCV/Ops.scala
@@ -24,7 +24,6 @@ object Ops {
  sealed trait JImmediate     extends ImmType
  sealed trait ShiftImmediate extends ImmType


  sealed trait Comparison {
    def run(rs1Val: Int, rs2Val: Int): Boolean
  }
@@ -51,7 +50,10 @@ object Ops {

    def beqz(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, EQ)
    def bnez(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, NE)
    def blez(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, LT)
    def blez(rs1: Int, dst: Label) = Branch(Reg(0), Reg(rs1), dst, GE)
    def bgez(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, GE)
    def bltz(rs1: Int, dst: Label) = Branch(Reg(rs1), Reg(0), dst, LT)
    def bgtz(rs1: Int, dst: Label) = Branch(Reg(0), Reg(rs1), dst, LT)
  }

  sealed trait someDecorator
@@ -105,22 +107,22 @@ object Ops {
    def sra( rd: Int, rs1: Int, imm: Int) = ArithImmShift(Reg(rd), Reg(rs1), Imm(imm), SRA)
  }

  case class LUI(rd: Reg, imm: Imm)              extends Op with UType
  case class AUIPC(rd: Reg, imm: Imm)            extends Op with UType
  case class SW(rs2: Reg, rs1: Reg, offset: Imm) extends Op with SType
  case class LW(rd: Reg, rs1: Reg, offset: Imm)  extends Op with IType
  case class LUI(rd: Reg, imm: Imm)   extends Op with UType
  case class AUIPC(rd: Reg, imm: Imm) extends Op with UType

  case class JALR(rd: Reg, rs1: Reg, dst: String) extends Op with IType
  case class JAL(rd: Reg, dst: String) extends Op with UType
  case class SW(rs2: Reg, rs1: Reg, offset: Imm) extends Op with SType
  case class LW(rd: Reg, rs1: Reg, offset: Imm)  extends Op with IType


  object LUI { def apply(rd: Int, imm: Int): LUI = LUI(Reg(rd), Imm(imm)) }
  object AUIPC { def apply(rd: Int, imm: Int): AUIPC = AUIPC(Reg(rd), Imm(imm)) }
  object SW  { def apply(rs2: Int, rs1: Int, offset: Int): SW = SW(Reg(rs2), Reg(rs1), Imm(offset)) }
  object LW  { def apply(rd: Int, rs1: Int, offset: Int): LW = LW(Reg(rd), Reg(rs1), Imm(offset)) }

  object JAL{ def apply(rd: Int, dst: String): JAL = JAL(Reg(rd), dst) }
  object JALR{ def apply(rd: Int, rs1: Int, dst: String): JALR = JALR(Reg(rd), Reg(rs1), dst) }
  object SW  { def apply(rs2: Int, rs1: Int, offset: Int): SW = SW(Reg(rs2), Reg(rs1), Imm(offset)) }
  object LW  { def apply(rd: Int, rs1: Int, offset: Int): LW = LW(Reg(rd), Reg(rs1), Imm(offset)) }

  // This op should not be assembled, but will for the sake of simplicity be rendered as a NOP
  case object DONE extends Op with IType { val rd = Reg(0); val rs1 = Reg(0) }
--- a/src/test/scala/RISCV/Parser.scala
+++ b/src/test/scala/RISCV/Parser.scala
@@ -66,6 +66,7 @@ object Parser {
    stringWs("sra")   ~> arith.mapN{Arith.sra},

    stringWs("slt")   ~> arith.mapN{Arith.slt},
    stringWs("sgt")   ~> arith.mapN{ case(x,y,z) => Arith.slt(x,z,y)},
    stringWs("sltu")  ~> arith.mapN{Arith.sltu},

    // pseudos
@@ -99,10 +100,7 @@ object Parser {
    stringWs("seqz")  ~> (reg <~ sep, reg, ok(1)).mapN{ArithImm.sltu},

    stringWs("li")    ~> (reg ~ sep ~ (hex | int)).collect{
      case((a, b), c) if (c.nBitsS <= 12) => {
        say(s"for c: $c, nBitsS was ${c.nBitsS}")
        ArithImm.add(a, 0, c)
      }
      case((a, b), c) if (c.nBitsS <= 12) => { ArithImm.add(a, 0, c) }
    },


--- a/src/test/scala/RISCV/VM.scala
+++ b/src/test/scala/RISCV/VM.scala
@@ -38,21 +38,19 @@ case class VM(
  }



  private def executeBranch(op: Branch) = {
    getAddr(op.dst).map{ addr =>
      val takeBranch = regs.compare(op.rs1, op.rs2, op.comp.run)
      if(takeBranch){
        val nextVM = copy(pc = addr)
        jump(nextVM, PcUpdateB(nextVM.pc))
        jump(nextVM, PcUpdateBranch(pc, nextVM.pc))
      }
      else {
        step(this)
        step(this, PcUpdateNoBranch(this.pc + Addr(4)))
      }
    }
  }


  /**
    * The weird :_* syntax is simply a way to pass a list to a varArgs function.
    * 
--- a/src/test/scala/RISCV/printUtils.scala
+++ b/src/test/scala/RISCV/printUtils.scala
@@ -40,9 +40,10 @@ object PrintUtils {
      case MemRead(addr, word)  => fansi.Color.Red(f"M[${addr.show}] -> 0x${word.hs}")

      // addr is the target address
      case PcUpdateJALR(addr)   => fansi.Color.Green(s"PC updated to ${addr.show} via JALR")
      case PcUpdateJAL(addr)    => fansi.Color.Magenta(s"PC updated to ${addr.show} via JAL")
      case PcUpdateB(addr)      => fansi.Color.Yellow(s"PC updated to ${addr.show} via Branch")
      case PcUpdateJALR(addr)       => fansi.Color.Green(s"PC updated to ${addr.show} via JALR")
      case PcUpdateJAL(addr)        => fansi.Color.Magenta(s"PC updated to ${addr.show} via JAL")
      case PcUpdateBranch(from, to) => fansi.Color.Yellow(s"PC updated to ${to.show} via Branch")
      case PcUpdateNoBranch(addr)   => fansi.Color.Yellow(s"PC updated to ${addr.show}, skipping a Branch")
    }
  }

@@ -100,6 +101,7 @@ object PrintUtils {
    def binary: String = String.format("%" + 32 + "s", i.toBinaryString)
      .replace(' ', '0').grouped(4)
      .map(x => x + "  ").mkString
    def binary(n: Int): String = String.format("%" + n + "s", i.toBinaryString).replace(' ', '0')
  }


--- a/src/test/scala/RISCV/testRunner.scala
+++ b/src/test/scala/RISCV/testRunner.scala
@@ -25,7 +25,8 @@ case class TestOptions(
  printMergedTrace   : Boolean,
  nopPadded          : Boolean,
  breakPoints        : List[Int], // Not implemented
  testName           : String
  testName           : String,
  maxSteps           : Int
 )

 case class TestResult(
@@ -44,12 +45,12 @@ object TestRunner {
    val testResults = for {
      lines                           <- fileUtils.readTest(testOptions)
      program                         <- FiveStage.Parser.parseProgram(lines, testOptions)
      (binary, (trace, finalVM))      <- program.validate.map(x => (x._1, x._2.run))
      (binary, (trace, finalVM))      <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run))
      (termitationCause, chiselTrace) <- ChiselTestRunner(
                                           binary.toList.sortBy(_._1.value).map(_._2),
                                           program.settings,
                                           finalVM.pc,
                                           15000)
        binary.toList.sortBy(_._1.value).map(_._2),
        program.settings,
        finalVM.pc,
        testOptions.maxSteps)
    } yield {
      val traces = mergeTraces(trace, chiselTrace).map(x => printMergedTraces((x), program))

@@ -100,4 +101,98 @@ object TestRunner {
      successful
    }.toOption.getOrElse(false)
  }

  /**
    * Runs the selected test program on the software VM, extracts its branch events and prints
    * the number of mispredictions a sample branch predictor would make on them.
    *
    * @param testOptions selects the program (testName) and bounds the simulation (maxSteps)
    * @return true if the program could be read, parsed and simulated, false otherwise
    */
  def profileBranching(testOptions: TestOptions): Boolean = {

    val testResults = for {
      lines                           <- fileUtils.readTest(testOptions)
      program                         <- FiveStage.Parser.parseProgram(lines, testOptions)
      (binary, (trace, finalVM))      <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run))
    } yield {

      sealed trait BranchEvent
      case class Taken(from: Int, to: Int) extends BranchEvent { override def toString = s"Taken      ${from.hs}\t${to.hs}" }
      case class NotTaken(addr: Int) extends BranchEvent { override def toString =       s"Not Taken  ${addr.hs}" }

      val events: List[BranchEvent] = trace.flatMap(_.event).collect{
        case PcUpdateBranch(from, to) => Taken(from.value, to.value)
        // For a branch that is not taken the VM logs the fall-through address (branch pc + 4),
        // whereas a taken branch is logged with the address of the branch itself.
        // Subtract 4 so that both outcomes of one branch share the same predictor entry.
        case PcUpdateNoBranch(next)   => NotTaken(next.value - 4)
      }


      /**
        * This is a sample profiler for a rather unrealistic branch predictor which has an unlimited amount
        * of slots
        */
      def OneBitInfiniteSlots(events: List[BranchEvent]): Int = {

        // Uncomment to take a look at the event log
        // say(events.mkString("\n","\n","\n"))

        // Helper inspects the next element of the event list. If the event is a mispredict the prediction table is updated
        // to reflect this and the miss counter is incremented.
        // As long as there are remaining events the helper calls itself recursively on the remainder.
        // The count is carried in `misses` so that every recursive call is a tail call; long traces
        // therefore cannot overflow the stack.
        @scala.annotation.tailrec
        def helper(events: List[BranchEvent], predictionTable: Map[Int, Boolean], misses: Int): Int = {
          events match {

            // Scala syntax for matching a list with a head element of some type and a tail
            // `case h :: t =>`
            // means we want to match a list with at least a head and a tail (tail can be Nil, so we
            // essentially want to match a list with at least one element)
            // h is the first element of the list, t is the remainder (which can be Nil, aka empty)

            // `case Constructor(arg1, arg2) :: t => `
            // means we want to match a list whose first element is of type Constructor, giving us access to its internal
            // values.

            // `case Constructor(arg1, arg2) :: t if(p(arg1, arg2)) =>`
            // means we want to match a list whose first element is of type Constructor while satisfying some predicate p,
            // called an if guard.
            case Taken(from, to) :: t if( predictionTable(from)) => helper(t, predictionTable, misses)
            case Taken(from, to) :: t if(!predictionTable(from)) => helper(t, predictionTable.updated(from, true), misses + 1)
            case NotTaken(addr)  :: t if( predictionTable(addr)) => helper(t, predictionTable.updated(addr, false), misses + 1)
            case NotTaken(addr)  :: t if(!predictionTable(addr)) => helper(t, predictionTable, misses)
            case Nil => misses
          }
        }

        // Initially every possible branch is set to false since the initial state of the predictor is to assume branch not taken
        val initState: Map[Int, Boolean] = events.map{
          case Taken(from, _)    => (from, false)
          case NotTaken(addr)    => (addr, false)
        }.toMap

        helper(events, initState, 0)
      }

      say(OneBitInfiniteSlots(events))
    }

    // Report a failure instead of claiming success when the program could not be read,
    // parsed or simulated.
    testResults match {
      case Right(_)    => true
      case Left(error) =>
        say(s"Branch profiling failed: $error")
        false
    }
  }


  /**
    * Runs the selected test program on the software VM and extracts its memory events.
    * This is a skeleton: the cache model that consumes the events is left to be written.
    *
    * @param testOptions selects the program (testName) and bounds the simulation (maxSteps)
    * @return true if the program could be read, parsed and simulated, false otherwise
    */
  def profileCache(testOptions: TestOptions): Boolean = {

    val testResults = for {
      lines                           <- fileUtils.readTest(testOptions)
      program                         <- FiveStage.Parser.parseProgram(lines, testOptions)
      (binary, (trace, finalVM))      <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run))
    } yield {

      sealed trait MemoryEvent
      case class Write(addr: Int) extends MemoryEvent
      case class Read(addr: Int) extends MemoryEvent

      // Only the address of each access is kept, the transferred word is dropped.
      val events: List[MemoryEvent] = trace.flatMap(_.event).collect{
        case MemWrite(x,_) => Write(x.value)
        case MemRead(x,_) => Read(x.value)
      }

      // Your cache here

    }

    // Report a failure instead of claiming success when the program could not be read,
    // parsed or simulated.
    testResults match {
      case Right(_)    => true
      case Left(error) =>
        say(s"Cache profiling failed: $error")
        false
    }
  }
 }
--- a/theory2.org
+++ b/theory2.org
@@ -0,0 +1,277 @@
 * Question 1 - Hazards
  For the following programs describe each hazard with type (data or control), line number and a
  small (max one sentence) description

 ** program 1
  #+begin_src asm
    addi t0,   zero,  10
    addi t1,   zero,  20
  .L2:
    sub  t1,   t1,    t0
    beq  t1,   zero, .L2
    jr   ra
  #+end_src


 ** program 2
  #+begin_src asm
    addi t0,   zero,  10
    lw   t0,   10(t0)
    beq  t0,   zero,  .L3
    jr   ra
  #+end_src


 ** program 3
  #+begin_src asm
  lw   t0,   0(t0)
  lw   t1,   4(t0)
  sw   t0,   8(t1)
  lw   t1,   12(t0)
  beq  t0,   t1,  .L3
  jr   ra
  #+end_src


 * Question 2 - Handling hazards
  For this question, keep in mind that the forwarder does not care if the values it forwards are being used or not!
  Even for a JAL instruction, which has neither an rs1 nor an rs2 field, the forwarder must still forward its values.

 ** Data hazards 1
   At some cycle the following instructions can be found in a 5 stage design:
   
   #+begin_src text
   EX:                  ||     MEM:                ||      WB:
   ---------------------||-------------------------||--------------------------
   rs1: 4               ||     rs1: 4              ||      rs1: 1
   rs2: 5               ||     rs2: 6              ||      rs2: 2
   rd:  6               ||     rd:  4              ||      rd:  5
   memToReg = false     ||     memToReg = false    ||      memToReg = false
   regWrite = true      ||     regWrite = false    ||      regWrite = true
   memWrite = false     ||     memWrite = false    ||      memWrite = false
   branch   = false     ||     branch   = true     ||      branch   = false
   jump     = false     ||     jump     = false    ||      jump     = false
   #+end_src
   
   For the operation currently in EX, from where (ID, MEM or WB) should the forwarder get data from for rs1 and rs2?
   
 ** Data hazards 2

   At some cycle the following instructions can be found in a 5 stage design:
   
   #+begin_src text
   EX:                  ||     MEM:                ||      WB:
   ---------------------||-------------------------||--------------------------
   rs1: 1               ||     rs1: 4              ||      rs1: 1
   rs2: 5               ||     rs2: 6              ||      rs2: 0
   rd:  0               ||     rd:  1              ||      rd:  0
   memToReg = false     ||     memToReg = false    ||      memToReg = false
   regWrite = true      ||     regWrite = true     ||      regWrite = true
   memWrite = false     ||     memWrite = false    ||      memWrite = false
   branch   = false     ||     branch   = true     ||      branch   = false
   jump     = true      ||     jump     = true     ||      jump     = false
   #+end_src

   For the operation currently in EX, from where (ID, MEM or WB) should the forwarder get data from for rs1 and rs2?

 ** Data hazards 3

   At some cycle the following instructions can be found in a 5 stage design:
   
   #+begin_src text
   EX:                  ||     MEM:                ||      WB:
   ---------------------||-------------------------||--------------------------
   rs1: 2               ||     rs1: 4              ||      rs1: 3
   rs2: 5               ||     rs2: 6              ||      rs2: 4
   rd:  1               ||     rd:  1              ||      rd:  5
   memToReg = false     ||     memToReg = true     ||      memToReg = false
   regWrite = false     ||     regWrite = true     ||      regWrite = true
   memWrite = true      ||     memWrite = false    ||      memWrite = false
   branch   = false     ||     branch   = false    ||      branch   = false
   jump     = false     ||     jump     = false    ||      jump     = false

   #+end_src

   Should the forwarding unit issue a load hazard signal?
   (Hint: what are the semantics of the instruction currently in EX stage?)

 * Question 3 - Branch prediction
  Consider a 2 bit branch predictor with only 4 slots for a 32 bit architecture (without BTB), where the
  prediction (taken or not taken) is made according to the following table:
  #+begin_src text
  state  ||  predict taken  ||  next state if taken  ||  next state if not taken ||
  =======||=================||=======================||==========================||
  00     ||  NO             ||  01                   ||  00                      ||
  01     ||  NO             ||  10                   ||  00                      ||
  10     ||  YES            ||  11                   ||  01                      ||
  11     ||  YES            ||  11                   ||  10                      ||
  #+end_src
  
  (This is known as a saturating 2bit counter, it is *not* the same scheme as in the lecture slides)

  At some point during execution the program counter is ~0xc~ and the branch predictor table looks like this:
  #+begin_src text
  slot  ||  value
  ======||========
  00    ||  01
  01    ||  00
  10    ||  11
  11    ||  01
  #+end_src

  For the following program:
  #+begin_src asm
  0xc  addi x1, x3, 10
  0x10 add  x2, x1, x1
  0x14 beq  x1, x2, .L1 
  0x18 j    .L2
  #+end_src
  
  Will the predictor predict taken or not taken for the beq instruction?

 * Question 4 - Benchmarking
  In order to gauge the performance increase from adding branch predictors it is necessary to do some testing.
  Rather than writing a test from scratch it is better to use the tester already in use in the test harness.
  When running a program the VM outputs a log of all events, including which branches have been taken and which
  haven't, which as it turns out is the only information we actually need to gauge the effectiveness of a branch
  predictor!

  For this exercise you will write a program that parses a log of branch events.

  #+BEGIN_SRC scala
  sealed trait BranchEvent
  case class Taken(from: Int, to: Int) extends BranchEvent
  case class NotTaken(at: Int) extends BranchEvent


  def profile(events: List[BranchEvent]): Int = ???
  #+END_SRC

  To help you get started, I have provided you with much of the necessary code.
  In order to get an idea of how you should profile branch misses, consider the following profiler, which
  counts the misses for a processor with a 1 bit branch predictor that has an unlimited number of slots:

  #+BEGIN_SRC scala
  def OneBitInfiniteSlots(events: List[BranchEvent]): Int = {

    // Helper inspects the next element of the event list. If the event is a mispredict the prediction table is updated
    // to reflect this.
    // As long as there are remaining events the helper calls itself recursively on the remainder
    def helper(events: List[BranchEvent], predictionTable: Map[Int, Boolean]): Int = {
      events match {

 	// Scala syntax for matching a list with a head element of some type and a tail
 	// `case h :: t =>`
 	// means we want to match a list with at least a head and a tail (tail can be Nil, so we
 	// essentially want to match a list with at least one element)
 	// h is the first element of the list, t is the remainder (which can be Nil, aka empty)

 	// `case Constructor(arg1, arg2) :: t => `
 	// means we want to match a list whose first element is of type Constructor, giving us access to its internal
 	// values.

 	// `case Constructor(arg1, arg2) :: t if(p(arg1, arg2)) =>`
 	// means we want to match a list whose first element is of type Constructor while satisfying some predicate p,
 	// called an if guard.
        case Taken(from, to) :: t if( predictionTable(from)) => helper(t, predictionTable)
        case Taken(from, to) :: t if(!predictionTable(from)) => 1 + helper(t, predictionTable.updated(from, true))
        case NotTaken(addr)  :: t if( predictionTable(addr)) => 1 + helper(t, predictionTable.updated(addr, false))
        case NotTaken(addr)  :: t if(!predictionTable(addr)) => helper(t, predictionTable)
        case _ => 0
      }
    }

    // Initially every possible branch is set to false since the initial state of the predictor is to assume branch not taken
    def initState = events.map{
      case Taken(from, _) => (from, false)
      case NotTaken(addr) => (addr, false)
    }.toMap

    helper(events, initState)
  }
  #+END_SRC

 ** Your task
   Your job is to implement a test that checks how many misses occur for a 2 bit branch predictor with 8 slots.
   The rule table is the same as in question 3.
   The predictor does not use a branch target buffer (BTB), which means that the address will always be decoded in
   the ID stage.
   For you this means you do not need to keep track of branch targets, simplifying your simulation quite a bit.
   (Otherwise you would need to add logic for the case where the BTB value does not match the actual target.)

   For simplicity's sake, assume that every value in the table is initialized to 00.

   For this task it is necessary to use something more sophisticated than ~Map[Int, Boolean]~ to represent
   your branch predictor model.

   The skeleton code is located in ~testRunner.scala~ and can be run using ~testOnly FiveStage.ProfileBranching~.

   With a 2 bit 8 slot scheme, how many mispredicts will happen?
   Answer with a number.
   
   Hint: Use the getTag method defined on int (in DataTypes.scala) to get the tag for an address.
   #+BEGIN_SRC scala
   val slots = 8
   say(0x1C40.getTag(slots)) // prints 0
   say(0x1C44.getTag(slots)) // prints 1
   say(0x1C48.getTag(slots)) // prints 2
   say(0x1C4C.getTag(slots)) // prints 3
   say(0x1C50.getTag(slots)) // prints 4
   say(0x1C54.getTag(slots)) // prints 5
   say(0x1C58.getTag(slots)) // prints 6
   say(0x1C5C.getTag(slots)) // prints 7
   say(0x1C60.getTag(slots)) // prints 0 (thus conflicts with 0x1C40)
   #+END_SRC
   

 * Question 5 - Cache profiling
  Unlike our design, which has a very limited memory pool, real designs have access to vast amounts of memory,
  offset by a steep cost in access latency.
  To mitigate this, a modern processor features several caches, where even the smallest and fastest cache has
  more memory than your entire design.
  In order to investigate how caches can alter performance it is therefore necessary to make some rather
  unrealistic assumptions to see how different cache schemes impact performance.

  We will therefore assume the following:
  + Reads from main memory take 5 cycles
  + cache has a total storage of 8 words (256 bits)
  + cache reads work as they do now (i.e no additional latency)

  For this exercise you will write a program that parses a log of memory events, similar to previous task
  #+BEGIN_SRC scala
  sealed trait MemoryEvent
  case class Write(addr: Int) extends MemoryEvent
  case class Read(addr: Int) extends MemoryEvent


  def profile(events: List[MemoryEvent]): Int = ???
  #+END_SRC

 ** Your task
   Your job is to implement a model that tests how many delay cycles will occur for a cache which:
   + Follows a 2-way associative scheme
   + set size is 4 words (128 bits) (total cache size: a whopping 256 bits)
   + Block size is 1 word (32 bits) meaning that we *do not need a block offset*.
   + Is write-through write no-allocate (this means that you can ignore stores, only loads will affect the cache)
   + Eviction policy is LRU (least recently used)
     
   In the typical cache each block has more than 32 bits, requiring an offset, however the
   simulated cache does not.
   This means that the simulated cache has two sets of 4 words, greatly reducing the complexity
   of your implementation.
   
   Additionally, assume that writes do not change the LRU counter.
   This means that your cache will only consider which value was most recently loaded,
   not written.
   It's not realistic, but it allows you to completely disregard write events (you can
   just filter them out if you want).

   Your answer should be the number of cache miss latency cycles when using this cache.

 *** Further study
    If you have the time I strongly encourage you to experiment with a larger cache with bigger
    block sizes, forcing you to implement the additional complexity of block offsets.
    Likewise, by trying a different scheme than write-through no-allocate you will get a much
    better grasp on how exactly the cache works.
    This is *not* a deliverable, just something I encourage you to tinker with to get a better
    understanding.