diff --git a/vta/apps/tsim_example/hardware/chisel/.scalafmt.conf b/vta/apps/tsim_example/hardware/chisel/.scalafmt.conf
deleted file mode 100644
index 9172d5e9854a..000000000000
--- a/vta/apps/tsim_example/hardware/chisel/.scalafmt.conf
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-maxColumn = 100
-rewrite.rules = [SortModifiers, SortImports]
diff --git a/vta/apps/tsim_example/hardware/chisel/Makefile b/vta/apps/tsim_example/hardware/chisel/Makefile
index 0f979450c694..342e6896b000 100644
--- a/vta/apps/tsim_example/hardware/chisel/Makefile
+++ b/vta/apps/tsim_example/hardware/chisel/Makefile
@@ -94,7 +94,8 @@ endif
 default: lint lib
 
 lint:
-	sbt scalafmt
+	cp $(vta_dir)/hardware/chisel/scalastyle-config.xml .
+	sbt scalastyle
 
 lib: $(lib_path)
 $(lib_path): $(verilator_build_dir)/V$(TOP).cpp
diff --git a/vta/apps/tsim_example/hardware/chisel/project/plugins.sbt b/vta/apps/tsim_example/hardware/chisel/project/plugins.sbt
index e14e694d6f67..19ae5c9d49b9 100644
--- a/vta/apps/tsim_example/hardware/chisel/project/plugins.sbt
+++ b/vta/apps/tsim_example/hardware/chisel/project/plugins.sbt
@@ -18,4 +18,4 @@
  */
 
 logLevel := Level.Warn
-addSbtPlugin("com.geirsson" % "sbt-scalafmt" % "1.5.1")
+addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
diff --git a/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Accel.scala b/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Accel.scala
index b90c72943c42..7ba1e633629f 100644
--- a/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Accel.scala
+++ b/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Accel.scala
@@ -23,18 +23,18 @@ import chisel3._
 import vta.dpi._
 
 /** Add-by-one accelerator.
-  *
-  * ___________      ___________
-  * |         |      |         |
-  * | HostDPI | <--> | RegFile | <->|
-  * |_________|      |_________|    |
-  *                                 |
-  * ___________      ___________    |
-  * |         |      |         |    |
-  * | MemDPI  | <--> | Compute | <->|
-  * |_________|      |_________|
-  *
-  */
+ *
+ * ___________      ___________
+ * |         |      |         |
+ * | HostDPI | <--> | RegFile | <->|
+ * |_________|      |_________|    |
+ *                                 |
+ * ___________      ___________    |
+ * |         |      |         |    |
+ * | MemDPI  | <--> | Compute | <->|
+ * |_________|      |_________|
+ *
+ */
 case class AccelConfig() {
   val nCtrl = 1
   val nECnt = 1
diff --git a/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Compute.scala b/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Compute.scala
index 7ad965c67f86..3ef2e7e69bdb 100644
--- a/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Compute.scala
+++ b/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/Compute.scala
@@ -24,17 +24,17 @@ import chisel3.util._
 import vta.dpi._
 
 /** Compute
-  *
-  * Add-by-one procedure:
-  *
-  * 1. Wait for launch to be asserted
-  * 2. Issue a read request for 8-byte value at inp_baddr address
-  * 3. Wait for the value
-  * 4. Issue a write request for 8-byte value at out_baddr address
-  * 5. Increment read-address and write-address for next value
-  * 6. Check if counter (cnt) is equal to length to assert finish,
-  *    otherwise go to step 2.
-  */
+ *
+ * Add-by-one procedure:
+ *
+ * 1. Wait for launch to be asserted
+ * 2. Issue a read request for 8-byte value at inp_baddr address
+ * 3. Wait for the value
+ * 4. Issue a write request for 8-byte value at out_baddr address
+ * 5. Increment read-address and write-address for next value
+ * 6. Check if counter (cnt) is equal to length to assert finish,
+ *    otherwise go to step 2.
+ */
 class Compute(implicit config: AccelConfig) extends Module {
   val io = IO(new Bundle {
     val launch = Input(Bool())
diff --git a/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/RegFile.scala b/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/RegFile.scala
index 1982f186a0e1..2764510a68ee 100644
--- a/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/RegFile.scala
+++ b/vta/apps/tsim_example/hardware/chisel/src/main/scala/accel/RegFile.scala
@@ -24,29 +24,29 @@ import chisel3.util._
 import vta.dpi._
 
 /** Register File.
-  *
-  * Six 32-bit register file.
-  *
-  * -------------------------------
-  *  Register description    | addr
-  * -------------------------|-----
-  *  Control status register | 0x00
-  *  Cycle counter           | 0x04
-  *  Constant value          | 0x08
-  *  Vector length           | 0x0c
-  *  Input pointer lsb       | 0x10
-  *  Input pointer msb       | 0x14
-  *  Output pointer lsb      | 0x18
-  *  Output pointer msb      | 0x1c
-  * -------------------------------
-
-  * ------------------------------
-  *  Control status register | bit
-  * ------------------------------
-  *  Launch                  | 0
-  *  Finish                  | 1
-  * ------------------------------
-  */
+ *
+ * Register file with eight 32-bit registers.
+ *
+ * -------------------------------
+ *  Register description    | addr
+ * -------------------------|-----
+ *  Control status register | 0x00
+ *  Cycle counter           | 0x04
+ *  Constant value          | 0x08
+ *  Vector length           | 0x0c
+ *  Input pointer lsb       | 0x10
+ *  Input pointer msb       | 0x14
+ *  Output pointer lsb      | 0x18
+ *  Output pointer msb      | 0x1c
+ * -------------------------------
+ *
+ * ------------------------------
+ *  Control status register | bit
+ * ------------------------------
+ *  Launch                  | 0
+ *  Finish                  | 1
+ * ------------------------------
+ */
 class RegFile(implicit config: AccelConfig) extends Module {
   val io = IO(new Bundle {
     val launch = Output(Bool())
@@ -98,9 +98,8 @@ class RegFile(implicit config: AccelConfig) extends Module {
   }
 
   for (i <- 0 until (config.nVals + (2 * config.nPtrs))) {
-    when(
-      state === sIdle && io.host.req.valid &&
-        io.host.req.opcode && addr(vo + i).U === io.host.req.addr) {
+    when(state === sIdle && io.host.req.valid &&
+      io.host.req.opcode && addr(vo + i).U === io.host.req.addr) {
       reg(vo + i) := io.host.req.value
     }
   }
diff --git a/vta/hardware/chisel/.scalafmt.conf b/vta/hardware/chisel/.scalafmt.conf
deleted file mode 100644
index 9172d5e9854a..000000000000
--- a/vta/hardware/chisel/.scalafmt.conf
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-maxColumn = 100
-rewrite.rules = [SortModifiers, SortImports]
diff --git a/vta/hardware/chisel/Makefile b/vta/hardware/chisel/Makefile
index a98c52c77078..9804230074ac 100644
--- a/vta/hardware/chisel/Makefile
+++ b/vta/hardware/chisel/Makefile
@@ -112,7 +112,7 @@ endif
 default: lint lib
 
 lint:
-	sbt scalafmt --test
+	sbt scalastyle
 
 lib: $(lib_path)
 
diff --git a/vta/hardware/chisel/project/plugins.sbt b/vta/hardware/chisel/project/plugins.sbt
index e14e694d6f67..19ae5c9d49b9 100644
--- a/vta/hardware/chisel/project/plugins.sbt
+++ b/vta/hardware/chisel/project/plugins.sbt
@@ -18,4 +18,4 @@
  */
 
 logLevel := Level.Warn
-addSbtPlugin("com.geirsson" % "sbt-scalafmt" % "1.5.1")
+addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
diff --git a/vta/hardware/chisel/scalastyle-config.xml b/vta/hardware/chisel/scalastyle-config.xml
new file mode 100644
index 000000000000..ae7c8e6b588a
--- /dev/null
+++ b/vta/hardware/chisel/scalastyle-config.xml
@@ -0,0 +1,128 @@
+<scalastyle>
+ <name>Scalastyle standard configuration</name>
+ <check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="true">
+  <parameters>
+   <parameter name="maxFileLength"><![CDATA[800]]></parameter>
+  </parameters>
+ </check>
+ <check level="error" class="org.scalastyle.file.HeaderMatchesChecker" enabled="true">
+  <parameters>
+   <parameter name="header"><![CDATA[/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+]]></parameter>
+  </parameters>
+ </check>
+ <check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.file.FileLineLengthChecker" enabled="true">
+  <parameters>
+   <parameter name="maxLineLength"><![CDATA[120]]></parameter>
+   <parameter name="tabSize"><![CDATA[2]]></parameter>
+  </parameters>
+ </check>
+ <check level="error" class="org.scalastyle.scalariform.ClassNamesChecker" enabled="true">
+  <parameters>
+   <parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter>
+  </parameters>
+ </check>
+ <check level="error" class="org.scalastyle.scalariform.ObjectNamesChecker" enabled="true">
+  <parameters>
+   <parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter>
+  </parameters>
+ </check>
+ <check level="error" class="org.scalastyle.scalariform.PackageObjectNamesChecker" enabled="true">
+  <parameters>
+   <parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter>
+  </parameters>
+ </check>
+ <check level="error" class="org.scalastyle.scalariform.EqualsHashCodeChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.scalariform.IllegalImportsChecker" enabled="true">
+  <parameters>
+   <parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter>
+  </parameters>
+ </check>
+ <check level="error" class="org.scalastyle.scalariform.ParameterNumberChecker" enabled="true">
+  <parameters>
+   <parameter name="maxParameters"><![CDATA[8]]></parameter>
+  </parameters>
+ </check>
+ <check level="error" class="org.scalastyle.scalariform.MagicNumberChecker" enabled="false">
+  <parameters>
+   <parameter name="ignore"><![CDATA[-1,0,1,2,3,4,8,16,32,64,128]]></parameter>
+  </parameters>
+ </check>
+ <check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
+  <parameters>
+   <parameter name="regex"><![CDATA[println]]></parameter>
+  </parameters>
+ </check>
+ <check level="error" class="org.scalastyle.scalariform.NumberOfTypesChecker" enabled="true">
+  <parameters>
+   <parameter name="maxTypes"><![CDATA[30]]></parameter>
+  </parameters>
+ </check>
+ <check level="error" class="org.scalastyle.scalariform.CyclomaticComplexityChecker" enabled="true">
+  <parameters>
+   <parameter name="maximum"><![CDATA[10]]></parameter>
+  </parameters>
+ </check>
+ <check level="error" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.scalariform.IfBraceChecker" enabled="false">
+  <parameters>
+   <parameter name="singleLineAllowed"><![CDATA[true]]></parameter>
+   <parameter name="doubleLineAllowed"><![CDATA[false]]></parameter>
+  </parameters>
+ </check>
+ <check level="error" class="org.scalastyle.scalariform.MethodLengthChecker" enabled="true">
+  <parameters>
+   <parameter name="maxLength"><![CDATA[50]]></parameter>
+  </parameters>
+ </check>
+ <check level="error" class="org.scalastyle.scalariform.MethodNamesChecker" enabled="false">
+  <parameters>
+   <parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*$]]></parameter>
+  </parameters>
+ </check>
+ <check level="error" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="true">
+  <parameters>
+   <parameter name="maxMethods"><![CDATA[30]]></parameter>
+  </parameters>
+ </check>
+ <check level="error" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="false"></check>
+ <check level="error" class="org.scalastyle.file.NewLineAtEofChecker" enabled="true"></check>
+ <check level="error" class="org.scalastyle.file.NoNewLineAtEofChecker" enabled="false"></check>
+ <check level="error" class="org.scalastyle.file.IndentationChecker" enabled="true">
+   <parameters>
+     <parameter name="tabSize">2</parameter>
+     <parameter name="methodParamIndentSize">2</parameter>
+     <parameter name="classParamIndentSize">4</parameter>
+   </parameters>
+ </check>
+</scalastyle>
diff --git a/vta/hardware/chisel/src/main/scala/core/Compute.scala b/vta/hardware/chisel/src/main/scala/core/Compute.scala
index 7751bf7ddf48..c605a1a1a824 100644
--- a/vta/hardware/chisel/src/main/scala/core/Compute.scala
+++ b/vta/hardware/chisel/src/main/scala/core/Compute.scala
@@ -25,13 +25,13 @@ import vta.util.config._
 import vta.shell._
 
 /** Compute.
-  *
-  * The compute unit is in charge of the following:
-  * - Loading micro-ops from memory (loadUop module)
-  * - Loading biases (acc) from memory (tensorAcc module)
-  * - Compute ALU instructions (tensorAlu module)
-  * - Compute GEMM instructions (tensorGemm module)
-  */
+ *
+ * The compute unit is in charge of the following:
+ * - Loading micro-ops from memory (loadUop module)
+ * - Loading biases (acc) from memory (tensorAcc module)
+ * - Compute ALU instructions (tensorAlu module)
+ * - Compute GEMM instructions (tensorGemm module)
+ */
 class Compute(debug: Boolean = false)(implicit p: Parameters) extends Module {
   val mp = p(ShellKey).memParams
   val io = IO(new Bundle {
@@ -65,10 +65,10 @@ class Compute(debug: Boolean = false)(implicit p: Parameters) extends Module {
 
   val inst_type =
     Cat(dec.io.isFinish,
-        dec.io.isAlu,
-        dec.io.isGemm,
-        dec.io.isLoadAcc,
-        dec.io.isLoadUop).asUInt
+      dec.io.isAlu,
+      dec.io.isGemm,
+      dec.io.isLoadAcc,
+      dec.io.isLoadUop).asUInt
 
   val sprev = inst_q.io.deq.valid & Mux(dec.io.pop_prev, s(0).io.sready, true.B)
   val snext = inst_q.io.deq.valid & Mux(dec.io.pop_next, s(1).io.sready, true.B)
@@ -116,20 +116,14 @@ class Compute(debug: Boolean = false)(implicit p: Parameters) extends Module {
   loadUop.io.inst := inst_q.io.deq.bits
   loadUop.io.baddr := io.uop_baddr
   io.vme_rd(0) <> loadUop.io.vme_rd
-  loadUop.io.uop.idx <> Mux(dec.io.isGemm,
-                            tensorGemm.io.uop.idx,
-                            tensorAlu.io.uop.idx)
+  loadUop.io.uop.idx <> Mux(dec.io.isGemm, tensorGemm.io.uop.idx, tensorAlu.io.uop.idx)
 
   // acc
   tensorAcc.io.start := state === sIdle & start & dec.io.isLoadAcc
   tensorAcc.io.inst := inst_q.io.deq.bits
   tensorAcc.io.baddr := io.acc_baddr
-  tensorAcc.io.tensor.rd.idx <> Mux(dec.io.isGemm,
-                                    tensorGemm.io.acc.rd.idx,
-                                    tensorAlu.io.acc.rd.idx)
-  tensorAcc.io.tensor.wr <> Mux(dec.io.isGemm,
-                                tensorGemm.io.acc.wr,
-                                tensorAlu.io.acc.wr)
+  tensorAcc.io.tensor.rd.idx <> Mux(dec.io.isGemm, tensorGemm.io.acc.rd.idx, tensorAlu.io.acc.rd.idx)
+  tensorAcc.io.tensor.wr <> Mux(dec.io.isGemm, tensorGemm.io.acc.wr, tensorAlu.io.acc.wr)
   io.vme_rd(1) <> tensorAcc.io.vme_rd
 
   // gemm
@@ -156,8 +150,8 @@ class Compute(debug: Boolean = false)(implicit p: Parameters) extends Module {
 
   // out
   io.out.rd.idx <> Mux(dec.io.isGemm,
-                       tensorGemm.io.out.rd.idx,
-                       tensorAlu.io.out.rd.idx)
+    tensorGemm.io.out.rd.idx,
+    tensorAlu.io.out.rd.idx)
   io.out.wr <> Mux(dec.io.isGemm, tensorGemm.io.out.wr, tensorAlu.io.out.wr)
 
   // semaphore
@@ -178,20 +172,16 @@ class Compute(debug: Boolean = false)(implicit p: Parameters) extends Module {
       when(dec.io.isSync) {
         printf("[Compute] start sync\n")
       }.elsewhen(dec.io.isLoadUop) {
-          printf("[Compute] start load uop\n")
-        }
-        .elsewhen(dec.io.isLoadAcc) {
-          printf("[Compute] start load acc\n")
-        }
-        .elsewhen(dec.io.isGemm) {
-          printf("[Compute] start gemm\n")
-        }
-        .elsewhen(dec.io.isAlu) {
-          printf("[Compute] start alu\n")
-        }
-        .elsewhen(dec.io.isFinish) {
-          printf("[Compute] start finish\n")
-        }
+        printf("[Compute] start load uop\n")
+      }.elsewhen(dec.io.isLoadAcc) {
+        printf("[Compute] start load acc\n")
+      }.elsewhen(dec.io.isGemm) {
+        printf("[Compute] start gemm\n")
+      }.elsewhen(dec.io.isAlu) {
+        printf("[Compute] start alu\n")
+      }.elsewhen(dec.io.isFinish) {
+        printf("[Compute] start finish\n")
+      }
     }
     // done
     when(state === sSync) {
@@ -202,17 +192,14 @@ class Compute(debug: Boolean = false)(implicit p: Parameters) extends Module {
         when(dec.io.isLoadUop) {
           printf("[Compute] done load uop\n")
         }.elsewhen(dec.io.isLoadAcc) {
-            printf("[Compute] done load acc\n")
-          }
-          .elsewhen(dec.io.isGemm) {
-            printf("[Compute] done gemm\n")
-          }
-          .elsewhen(dec.io.isAlu) {
-            printf("[Compute] done alu\n")
-          }
-          .elsewhen(dec.io.isFinish) {
-            printf("[Compute] done finish\n")
-          }
+          printf("[Compute] done load acc\n")
+        }.elsewhen(dec.io.isGemm) {
+          printf("[Compute] done gemm\n")
+        }.elsewhen(dec.io.isAlu) {
+          printf("[Compute] done alu\n")
+        }.elsewhen(dec.io.isFinish) {
+          printf("[Compute] done finish\n")
+        }
       }
     }
   }
diff --git a/vta/hardware/chisel/src/main/scala/core/Configs.scala b/vta/hardware/chisel/src/main/scala/core/Configs.scala
index de7012b5cef3..4ab7d8503a0a 100644
--- a/vta/hardware/chisel/src/main/scala/core/Configs.scala
+++ b/vta/hardware/chisel/src/main/scala/core/Configs.scala
@@ -22,28 +22,27 @@ package vta.core
 import vta.util.config._
 
 /** CoreConfig.
-  *
-  * This is one supported configuration for VTA. This file will
-  * be eventually filled out with class configurations that can be
-  * mixed/matched with Shell configurations for different backends.
-  */
-class CoreConfig
-    extends Config((site, here, up) => {
-      case CoreKey =>
-        CoreParams(
-          batch = 1,
-          blockOut = 16,
-          blockIn = 16,
-          inpBits = 8,
-          wgtBits = 8,
-          uopBits = 32,
-          accBits = 32,
-          outBits = 8,
-          uopMemDepth = 2048,
-          inpMemDepth = 2048,
-          wgtMemDepth = 1024,
-          accMemDepth = 2048,
-          outMemDepth = 2048,
-          instQueueEntries = 512
-        )
-    })
+ *
+ * This is one supported configuration for VTA. This file will
+ * eventually be filled out with class configurations that can be
+ * mixed/matched with Shell configurations for different backends.
+ */
+class CoreConfig extends Config((site, here, up) => {
+  case CoreKey =>
+    CoreParams(
+      batch = 1,
+      blockOut = 16,
+      blockIn = 16,
+      inpBits = 8,
+      wgtBits = 8,
+      uopBits = 32,
+      accBits = 32,
+      outBits = 8,
+      uopMemDepth = 2048,
+      inpMemDepth = 2048,
+      wgtMemDepth = 1024,
+      accMemDepth = 2048,
+      outMemDepth = 2048,
+      instQueueEntries = 512
+    )
+})
diff --git a/vta/hardware/chisel/src/main/scala/core/Core.scala b/vta/hardware/chisel/src/main/scala/core/Core.scala
index a7228ee898fa..6bfffdc212b7 100644
--- a/vta/hardware/chisel/src/main/scala/core/Core.scala
+++ b/vta/hardware/chisel/src/main/scala/core/Core.scala
@@ -41,23 +41,23 @@ case class CoreParams(
     instQueueEntries: Int = 32
 ) {
   require(uopBits % 8 == 0,
-          s"\n\n[VTA] [CoreParams] uopBits must be byte aligned\n\n")
+    s"\n\n[VTA] [CoreParams] uopBits must be byte aligned\n\n")
 }
 
 case object CoreKey extends Field[CoreParams]
 
 /** Core.
-  *
-  * The core defines the current VTA architecture by connecting memory and
-  * compute modules together such as load/store and compute. Most of the
-  * connections in the core are bulk (<>), and we should try to keep it this
-  * way, because it is easier to understand what is going on.
-  *
-  * Also, the core must be instantiated by a shell using the
-  * VTA Control Register (VCR) and the VTA Memory Engine (VME) interfaces.
-  * More info about these interfaces and modules can be found in the shell
-  * directory.
-  */
+ *
+ * The core defines the current VTA architecture by connecting memory and
+ * compute modules together such as load/store and compute. Most of the
+ * connections in the core are bulk (<>), and we should try to keep it this
+ * way, because it is easier to understand what is going on.
+ *
+ * Also, the core must be instantiated by a shell using the
+ * VTA Control Register (VCR) and the VTA Memory Engine (VME) interfaces.
+ * More info about these interfaces and modules can be found in the shell
+ * directory.
+ */
 class Core(implicit p: Parameters) extends Module {
   val io = IO(new Bundle {
     val vcr = new VCRClient
diff --git a/vta/hardware/chisel/src/main/scala/core/Decode.scala b/vta/hardware/chisel/src/main/scala/core/Decode.scala
index a49ddce5367a..37f6ab40584c 100644
--- a/vta/hardware/chisel/src/main/scala/core/Decode.scala
+++ b/vta/hardware/chisel/src/main/scala/core/Decode.scala
@@ -25,16 +25,16 @@ import chisel3.util._
 import ISA._
 
 /** MemDecode.
-  *
-  * Decode memory instructions with a Bundle. This is similar to an union,
-  * therefore order matters when declaring fields. These are the instructions
-  * decoded with this bundle:
-  *   - LUOP
-  *   - LWGT
-  *   - LINP
-  *   - LACC
-  *   - SOUT
-  */
+ *
+ * Decode memory instructions with a Bundle. This is similar to a union,
+ * therefore order matters when declaring fields. These are the instructions
+ * decoded with this bundle:
+ *   - LUOP
+ *   - LWGT
+ *   - LINP
+ *   - LACC
+ *   - SOUT
+ */
 class MemDecode extends Bundle {
   val xpad_1 = UInt(M_PAD_BITS.W)
   val xpad_0 = UInt(M_PAD_BITS.W)
@@ -55,10 +55,10 @@ class MemDecode extends Bundle {
 }
 
 /** GemmDecode.
-  *
-  * Decode GEMM instruction with a Bundle. This is similar to an union,
-  * therefore order matters when declaring fields.
-  */
+ *
+ * Decode GEMM instruction with a Bundle. This is similar to a union,
+ * therefore order matters when declaring fields.
+ */
 class GemmDecode extends Bundle {
   val wgt_1 = UInt(C_WIDX_BITS.W)
   val wgt_0 = UInt(C_WIDX_BITS.W)
@@ -80,15 +80,15 @@ class GemmDecode extends Bundle {
 }
 
 /** AluDecode.
-  *
-  * Decode ALU instructions with a Bundle. This is similar to an union,
-  * therefore order matters when declaring fields. These are the instructions
-  * decoded with this bundle:
-  *   - VMIN
-  *   - VMAX
-  *   - VADD
-  *   - VSHX
-  */
+ *
+ * Decode ALU instructions with a Bundle. This is similar to a union,
+ * therefore order matters when declaring fields. These are the instructions
+ * decoded with this bundle:
+ *   - VMIN
+ *   - VMAX
+ *   - VADD
+ *   - VSHX
+ */
 class AluDecode extends Bundle {
   val empty_1 = Bool()
   val alu_imm = UInt(C_ALU_IMM_BITS.W)
@@ -112,9 +112,9 @@ class AluDecode extends Bundle {
 }
 
 /** UopDecode.
-  *
-  * Decode micro-ops (uops).
-  */
+ *
+ * Decode micro-ops (uops).
+ */
 class UopDecode extends Bundle {
   val u2 = UInt(10.W)
   val u1 = UInt(11.W)
@@ -122,9 +122,9 @@ class UopDecode extends Bundle {
 }
 
 /** FetchDecode.
-  *
-  * Partial decoding for dispatching instructions to Load, Compute, and Store.
-  */
+ *
+ * Partial decoding for dispatching instructions to Load, Compute, and Store.
+ */
 class FetchDecode extends Module {
   val io = IO(new Bundle {
     val inst = Input(UInt(INST_BITS.W))
@@ -159,9 +159,9 @@ class FetchDecode extends Module {
 }
 
 /** LoadDecode.
-  *
-  * Decode dependencies, type and sync for Load module.
-  */
+ *
+ * Decode dependencies, type and sync for Load module.
+ */
 class LoadDecode extends Module {
   val io = IO(new Bundle {
     val inst = Input(UInt(INST_BITS.W))
@@ -180,9 +180,9 @@ class LoadDecode extends Module {
 }
 
 /** ComputeDecode.
-  *
-  * Decode dependencies, type and sync for Compute module.
-  */
+ *
+ * Decode dependencies, type and sync for Compute module.
+ */
 class ComputeDecode extends Module {
   val io = IO(new Bundle {
     val inst = Input(UInt(INST_BITS.W))
@@ -211,9 +211,9 @@ class ComputeDecode extends Module {
 }
 
 /** StoreDecode.
-  *
-  * Decode dependencies, type and sync for Store module.
-  */
+ *
+ * Decode dependencies, type and sync for Store module.
+ */
 class StoreDecode extends Module {
   val io = IO(new Bundle {
     val inst = Input(UInt(INST_BITS.W))
diff --git a/vta/hardware/chisel/src/main/scala/core/EventCounters.scala b/vta/hardware/chisel/src/main/scala/core/EventCounters.scala
index 8990aefd6054..f9fd7f0be105 100644
--- a/vta/hardware/chisel/src/main/scala/core/EventCounters.scala
+++ b/vta/hardware/chisel/src/main/scala/core/EventCounters.scala
@@ -25,21 +25,20 @@ import vta.util.config._
 import vta.shell._
 
 /** EventCounters.
-  *
-  * This unit contains all the event counting logic. One common event tracked in
-  * hardware is the number of clock cycles taken to achieve certain task. We
-  * can count the total number of clock cycles spent in a VTA run by checking
-  * launch and finish signals.
-  *
-  * The event counter value is passed to the VCR module via the ecnt port, so
-  * they can be accessed by the host. The number of event counters (nECnt) is
-  * defined in the Shell VCR module as a parameter, see VCRParams.
-  *
-  * If one would like to add an event counter, then the value of nECnt must be
-  * changed in VCRParams together with the corresponding counting logic here.
-  */
-class EventCounters(debug: Boolean = false)(implicit p: Parameters)
-    extends Module {
+ *
+ * This unit contains all the event counting logic. One common event tracked in
+ * hardware is the number of clock cycles taken to achieve a certain task. We
+ * can count the total number of clock cycles spent in a VTA run by checking
+ * launch and finish signals.
+ *
+ * The event counter value is passed to the VCR module via the ecnt port, so
+ * they can be accessed by the host. The number of event counters (nECnt) is
+ * defined in the Shell VCR module as a parameter, see VCRParams.
+ *
+ * If one would like to add an event counter, then the value of nECnt must be
+ * changed in VCRParams together with the corresponding counting logic here.
+ */
+class EventCounters(debug: Boolean = false)(implicit p: Parameters) extends Module {
   val vp = p(ShellKey).vcrParams
   val io = IO(new Bundle {
     val launch = Input(Bool())
diff --git a/vta/hardware/chisel/src/main/scala/core/Fetch.scala b/vta/hardware/chisel/src/main/scala/core/Fetch.scala
index 9baf1cc33984..0ea35a3e653a 100644
--- a/vta/hardware/chisel/src/main/scala/core/Fetch.scala
+++ b/vta/hardware/chisel/src/main/scala/core/Fetch.scala
@@ -25,20 +25,20 @@ import vta.util.config._
 import vta.shell._
 
 /** Fetch.
-  *
-  * The fetch unit reads instructions (tasks) from memory (i.e. DRAM), using the
-  * VTA Memory Engine (VME), and push them into an instruction queue called
-  * inst_q. Once the instruction queue is full, instructions are dispatched to
-  * the Load, Compute and Store module queues based on the instruction opcode.
-  * After draining the queue, the fetch unit checks if there are more instructions
-  * via the ins_count register which is written by the host.
-  *
-  * Additionally, instructions are read into two chunks (see sReadLSB and sReadMSB)
-  * because we are using a DRAM payload of 8-bytes or half of a VTA instruction.
-  * This should be configurable for larger payloads, i.e. 64-bytes, which can load
-  * more than one instruction at the time. Finally, the instruction queue is
-  * sized (entries_q), depending on the maximum burst allowed in the memory.
-  */
+ *
+ * The fetch unit reads instructions (tasks) from memory (i.e. DRAM), using the
+ * VTA Memory Engine (VME), and pushes them into an instruction queue called
+ * inst_q. Once the instruction queue is full, instructions are dispatched to
+ * the Load, Compute and Store module queues based on the instruction opcode.
+ * After draining the queue, the fetch unit checks if there are more instructions
+ * via the ins_count register which is written by the host.
+ *
+ * Additionally, instructions are read in two chunks (see sReadLSB and sReadMSB)
+ * because we are using a DRAM payload of 8 bytes or half of a VTA instruction.
+ * This should be configurable for larger payloads, e.g. 64 bytes, which can load
+ * more than one instruction at a time. Finally, the instruction queue is
+ * sized (entries_q), depending on the maximum burst allowed in the memory.
+ */
 class Fetch(debug: Boolean = false)(implicit p: Parameters) extends Module {
   val vp = p(ShellKey).vcrParams
   val mp = p(ShellKey).memParams
@@ -112,17 +112,16 @@ class Fetch(debug: Boolean = false)(implicit p: Parameters) extends Module {
         when(xrem === 0.U) {
           state := sIdle
         }.elsewhen(xrem < xmax) {
-            state := sReadCmd
-            rlen := xrem
-            ilen := xrem >> 1.U
-            xrem := 0.U
-          }
-          .otherwise {
-            state := sReadCmd
-            rlen := xmax - 1.U
-            ilen := (xmax >> 1.U) - 1.U
-            xrem := xrem - xmax
-          }
+          state := sReadCmd
+          rlen := xrem
+          ilen := xrem >> 1.U
+          xrem := 0.U
+        }.otherwise {
+          state := sReadCmd
+          rlen := xmax - 1.U
+          ilen := (xmax >> 1.U) - 1.U
+          xrem := xrem - xmax
+        }
       }
     }
   }
@@ -165,12 +164,12 @@ class Fetch(debug: Boolean = false)(implicit p: Parameters) extends Module {
   val deq_sel = Cat(dec.io.isCompute, dec.io.isStore, dec.io.isLoad).asUInt
   val deq_ready =
     MuxLookup(deq_sel,
-              false.B, // default
-              Array(
-                "h_01".U -> io.inst.ld.ready,
-                "h_02".U -> io.inst.st.ready,
-                "h_04".U -> io.inst.co.ready
-              ))
+      false.B, // default
+      Array(
+        "h_01".U -> io.inst.ld.ready,
+        "h_02".U -> io.inst.st.ready,
+        "h_04".U -> io.inst.co.ready
+      ))
 
   // dequeue instruction
   inst_q.io.deq.ready := deq_ready & inst_q.io.deq.valid & state === sDrain
diff --git a/vta/hardware/chisel/src/main/scala/core/ISA.scala b/vta/hardware/chisel/src/main/scala/core/ISA.scala
index edc182311adb..bfe89ebb41f5 100644
--- a/vta/hardware/chisel/src/main/scala/core/ISA.scala
+++ b/vta/hardware/chisel/src/main/scala/core/ISA.scala
@@ -24,9 +24,9 @@ import chisel3.util._
 import scala.collection.mutable.HashMap
 
 /** ISAConstants.
-  *
-  * These constants are used for decoding (parsing) fields on instructions.
-  */
+ *
+ * These constants are used for decoding (parsing) fields on instructions.
+ */
 trait ISAConstants {
   val INST_BITS = 128
 
@@ -70,13 +70,13 @@ trait ISAConstants {
 }
 
 /** ISA.
-  *
-  * This is the VTA task ISA
-  *
-  * TODO: Add VXOR to clear accumulator
-  * TODO: Use ISA object for decoding as well
-  * TODO: Eventually deprecate ISAConstants
-  */
+ *
+ * This is the VTA task ISA
+ *
+ * TODO: Add VXOR to clear accumulator
+ * TODO: Use ISA object for decoding as well
+ * TODO: Eventually deprecate ISAConstants
+ */
 object ISA {
   private val xLen = 128
   private val depBits = 4
@@ -86,19 +86,19 @@ object ISA {
 
   private val taskId: HashMap[String, String] =
     HashMap(("load", "000"),
-            ("store", "001"),
-            ("gemm", "010"),
-            ("finish", "011"),
-            ("alu", "100"))
+      ("store", "001"),
+      ("gemm", "010"),
+      ("finish", "011"),
+      ("alu", "100"))
 
   private val memId: HashMap[String, String] =
     HashMap(("uop", "00"), ("wgt", "01"), ("inp", "10"), ("acc", "11"))
 
   private val aluId: HashMap[String, String] =
     HashMap(("minpool", "00"),
-            ("maxpool", "01"),
-            ("add", "10"),
-            ("shift", "11"))
+      ("maxpool", "01"),
+      ("add", "10"),
+      ("shift", "11"))
 
   private def dontCare(bits: Int): String = "?" * bits
 
diff --git a/vta/hardware/chisel/src/main/scala/core/Load.scala b/vta/hardware/chisel/src/main/scala/core/Load.scala
index 7c79498bd8f0..50c26bb8e8ed 100644
--- a/vta/hardware/chisel/src/main/scala/core/Load.scala
+++ b/vta/hardware/chisel/src/main/scala/core/Load.scala
@@ -25,12 +25,12 @@ import vta.util.config._
 import vta.shell._
 
 /** Load.
-  *
-  * Load inputs and weights from memory (DRAM) into scratchpads (SRAMs).
-  * This module instantiate the TensorLoad unit which is in charge of
-  * loading 1D and 2D tensors to scratchpads, so it can be used by
-  * other modules such as Compute.
-  */
+ *
+ * Load inputs and weights from memory (DRAM) into scratchpads (SRAMs).
+ * This module instantiates the TensorLoad unit which is in charge of
+ * loading 1D and 2D tensors to scratchpads, so it can be used by
+ * other modules such as Compute.
+ */
 class Load(debug: Boolean = false)(implicit p: Parameters) extends Module {
   val mp = p(ShellKey).memParams
   val io = IO(new Bundle {
@@ -110,11 +110,10 @@ class Load(debug: Boolean = false)(implicit p: Parameters) extends Module {
       when(dec.io.isSync) {
         printf("[Load] start sync\n")
       }.elsewhen(dec.io.isInput) {
-          printf("[Load] start input\n")
-        }
-        .elsewhen(dec.io.isWeight) {
-          printf("[Load] start weight\n")
-        }
+        printf("[Load] start input\n")
+      }.elsewhen(dec.io.isWeight) {
+        printf("[Load] start weight\n")
+      }
     }
     // done
     when(state === sSync) {
diff --git a/vta/hardware/chisel/src/main/scala/core/LoadUop.scala b/vta/hardware/chisel/src/main/scala/core/LoadUop.scala
index c7d0f4439858..274581f475b3 100644
--- a/vta/hardware/chisel/src/main/scala/core/LoadUop.scala
+++ b/vta/hardware/chisel/src/main/scala/core/LoadUop.scala
@@ -25,11 +25,11 @@ import vta.util.config._
 import vta.shell._
 
 /** UopMaster.
-  *
-  * Uop interface used by a master module, i.e. TensorAlu or TensorGemm,
-  * to request a micro-op (uop) from the uop-scratchpad. The index (idx) is
-  * used as an address to find the uop in the uop-scratchpad.
-  */
+ *
+ * Uop interface used by a master module, i.e. TensorAlu or TensorGemm,
+ * to request a micro-op (uop) from the uop-scratchpad. The index (idx) is
+ * used as an address to find the uop in the uop-scratchpad.
+ */
 class UopMaster(implicit p: Parameters) extends Bundle {
   val addrBits = log2Ceil(p(CoreKey).uopMemDepth)
   val idx = ValidIO(UInt(addrBits.W))
@@ -38,11 +38,11 @@ class UopMaster(implicit p: Parameters) extends Bundle {
 }
 
 /** UopClient.
-  *
-  * Uop interface used by a client module, i.e. LoadUop, to receive
-  * a request from a master module, i.e. TensorAlu or TensorGemm.
-  * The index (idx) is used as an address to find the uop in the uop-scratchpad.
-  */
+ *
+ * Uop interface used by a client module, i.e. LoadUop, to receive
+ * a request from a master module, i.e. TensorAlu or TensorGemm.
+ * The index (idx) is used as an address to find the uop in the uop-scratchpad.
+ */
 class UopClient(implicit p: Parameters) extends Bundle {
   val addrBits = log2Ceil(p(CoreKey).uopMemDepth)
   val idx = Flipped(ValidIO(UInt(addrBits.W)))
@@ -51,12 +51,12 @@ class UopClient(implicit p: Parameters) extends Bundle {
 }
 
 /** LoadUop.
-  *
-  * Load micro-ops (uops) from memory, i.e. DRAM, and store them in the
-  * uop-scratchpad. Currently, micro-ops are 32-bit wide and loaded in
-  * group of 2 given the fact that the DRAM payload is 8-bytes. This module
-  * should be modified later on to support different DRAM sizes efficiently.
-  */
+ *
+ * Load micro-ops (uops) from memory, i.e. DRAM, and store them in the
+ * uop-scratchpad. Currently, micro-ops are 32-bit wide and loaded in
+ * group of 2 given the fact that the DRAM payload is 8-bytes. This module
+ * should be modified later on to support different DRAM sizes efficiently.
+ */
 class LoadUop(debug: Boolean = false)(implicit p: Parameters) extends Module {
   val mp = p(ShellKey).memParams
   val io = IO(new Bundle {
@@ -113,15 +113,14 @@ class LoadUop(debug: Boolean = false)(implicit p: Parameters) extends Module {
           when(xrem === 0.U) {
             state := sIdle
           }.elsewhen(xrem < xmax) {
-              state := sReadCmd
-              xlen := xrem
-              xrem := 0.U
-            }
-            .otherwise {
-              state := sReadCmd
-              xlen := xmax - 1.U
-              xrem := xrem - xmax
-            }
+            state := sReadCmd
+            xlen := xrem
+            xrem := 0.U
+          }.otherwise {
+            state := sReadCmd
+            xlen := xmax - 1.U
+            xrem := xrem - xmax
+          }
         }
       }
     }
@@ -166,19 +165,18 @@ class LoadUop(debug: Boolean = false)(implicit p: Parameters) extends Module {
     when(sizeIsEven) {
       wmask := "b_11".U.asTypeOf(wmask)
     }.elsewhen(io.vme_rd.cmd.fire()) {
-        when(dec.xsize === 1.U) {
-          wmask := "b_01".U.asTypeOf(wmask)
-        }.otherwise {
-          wmask := "b_11".U.asTypeOf(wmask)
-        }
+      when(dec.xsize === 1.U) {
+        wmask := "b_01".U.asTypeOf(wmask)
+      }.otherwise {
+        wmask := "b_11".U.asTypeOf(wmask)
       }
-      .elsewhen(io.vme_rd.data.fire()) {
-        when((xcnt === xlen - 1.U) && (xrem === 0.U)) {
-          wmask := "b_01".U.asTypeOf(wmask)
-        }.otherwise {
-          wmask := "b_11".U.asTypeOf(wmask)
-        }
+    }.elsewhen(io.vme_rd.data.fire()) {
+      when((xcnt === xlen - 1.U) && (xrem === 0.U)) {
+        wmask := "b_01".U.asTypeOf(wmask)
+      }.otherwise {
+        wmask := "b_11".U.asTypeOf(wmask)
       }
+    }
   }.otherwise {
     when(io.vme_rd.cmd.fire()) {
       wmask := "b_10".U.asTypeOf(wmask)
diff --git a/vta/hardware/chisel/src/main/scala/core/Semaphore.scala b/vta/hardware/chisel/src/main/scala/core/Semaphore.scala
index f268e79b228b..efc895bc673e 100644
--- a/vta/hardware/chisel/src/main/scala/core/Semaphore.scala
+++ b/vta/hardware/chisel/src/main/scala/core/Semaphore.scala
@@ -23,14 +23,13 @@ import chisel3._
 import chisel3.util._
 
 /** Semaphore.
-  *
-  * This semaphore is used instead of push/pop fifo, used in the initial
-  * version of VTA. This semaphore is incremented (spost) or decremented (swait)
-  * depending on the push and pop fields on instructions to prevent RAW and WAR
-  * hazards.
-  */
-class Semaphore(counterBits: Int = 1, counterInitValue: Int = 1)
-    extends Module {
+ *
+ * This semaphore is used instead of push/pop fifo, used in the initial
+ * version of VTA. This semaphore is incremented (spost) or decremented (swait)
+ * depending on the push and pop fields on instructions to prevent RAW and WAR
+ * hazards.
+ */
+class Semaphore(counterBits: Int = 1, counterInitValue: Int = 1) extends Module {
   val io = IO(new Bundle {
     val spost = Input(Bool())
     val swait = Input(Bool())
diff --git a/vta/hardware/chisel/src/main/scala/core/Store.scala b/vta/hardware/chisel/src/main/scala/core/Store.scala
index 04bc7f5be60b..025a0a24696b 100644
--- a/vta/hardware/chisel/src/main/scala/core/Store.scala
+++ b/vta/hardware/chisel/src/main/scala/core/Store.scala
@@ -25,11 +25,11 @@ import vta.util.config._
 import vta.shell._
 
 /** Store.
-  *
-  * Store results back to memory (DRAM) from scratchpads (SRAMs).
-  * This module instantiate the TensorStore unit which is in charge
-  * of storing 1D and 2D tensors to main memory.
-  */
+ *
+ * Store results back to memory (DRAM) from scratchpads (SRAMs).
+ * This module instantiates the TensorStore unit which is in charge
+ * of storing 1D and 2D tensors to main memory.
+ */
 class Store(debug: Boolean = false)(implicit p: Parameters) extends Module {
   val mp = p(ShellKey).memParams
   val io = IO(new Bundle {
diff --git a/vta/hardware/chisel/src/main/scala/core/TensorAlu.scala b/vta/hardware/chisel/src/main/scala/core/TensorAlu.scala
index 21e57f2dab94..a6feffaa18dc 100644
--- a/vta/hardware/chisel/src/main/scala/core/TensorAlu.scala
+++ b/vta/hardware/chisel/src/main/scala/core/TensorAlu.scala
@@ -39,11 +39,8 @@ class Alu(implicit p: Parameters) extends Module {
   val m = ~ub(width - 1, 0) + 1.U
 
   val n = ub(width - 1, 0)
-  val fop = Seq(Mux(io.a < io.b, io.a, io.b),
-                Mux(io.a < io.b, io.b, io.a),
-                io.a + io.b,
-                io.a >> n,
-                io.a << m)
+  val fop = Seq(Mux(io.a < io.b, io.a, io.b), Mux(io.a < io.b, io.b, io.a),
+    io.a + io.b, io.a >> n, io.a << m)
 
   val opmux = Seq.tabulate(ALU_OP_NUM)(i => ALU_OP(i) -> fop(i))
   io.y := MuxLookup(io.opcode, io.a, opmux)
@@ -101,12 +98,12 @@ class AluVector(implicit p: Parameters) extends Module {
 }
 
 /** TensorAlu.
-  *
-  * This unit instantiate the ALU vector unit (AluVector) and go over the
-  * micro-ops (uops) which are used to read the source operands (vectors)
-  * from the acc-scratchpad and then they are written back the same
-  * acc-scratchpad.
-  */
+ *
+ * This unit instantiates the ALU vector unit (AluVector) and goes over the
+ * micro-ops (uops) which are used to read the source operands (vectors)
+ * from the acc-scratchpad and then they are written back to the same
+ * acc-scratchpad.
+ */
 class TensorAlu(debug: Boolean = false)(implicit p: Parameters) extends Module {
   val aluBits = p(CoreKey).accBits
   val io = IO(new Bundle {
@@ -200,18 +197,14 @@ class TensorAlu(debug: Boolean = false)(implicit p: Parameters) extends Module {
     dst_i := 0.U
     src_i := 0.U
   }.elsewhen(state === sReadUop && cnt_i === dec.lp_1) {
-      cnt_i := 0.U
-      dst_i := dst_o
-      src_i := src_o
-    }
-    .elsewhen(
-      state === sExe &&
-        alu.io.out.data.valid &&
-        uop_idx === uop_end - 1.U) {
-      cnt_i := cnt_i + 1.U
-      dst_i := dst_i + dec.dst_1
-      src_i := src_i + dec.src_1
-    }
+    cnt_i := 0.U
+    dst_i := dst_o
+    src_i := src_o
+  }.elsewhen(state === sExe && alu.io.out.data.valid && uop_idx === uop_end - 1.U) {
+    cnt_i := cnt_i + 1.U
+    dst_i := dst_i + dec.dst_1
+    src_i := src_i + dec.src_1
+  }
 
   when(state === sComputeIdx && io.uop.data.valid) {
     uop_dst := io.uop.data.bits.u0 + dst_i
@@ -232,7 +225,7 @@ class TensorAlu(debug: Boolean = false)(implicit p: Parameters) extends Module {
   tensorImm.data.bits.foreach { b =>
     b.foreach { c =>
       c := Mux(dec.alu_imm(C_ALU_IMM_BITS - 1),
-               Cat(-1.S((aluBits - C_ALU_IMM_BITS).W), dec.alu_imm), dec.alu_imm)
+        Cat(-1.S((aluBits - C_ALU_IMM_BITS).W), dec.alu_imm), dec.alu_imm)
     }
   }
 
@@ -244,11 +237,11 @@ class TensorAlu(debug: Boolean = false)(implicit p: Parameters) extends Module {
   alu.io.acc_a.data.valid := io.acc.rd.data.valid & state === sReadTensorB
   alu.io.acc_a.data.bits <> io.acc.rd.data.bits
   alu.io.acc_b.data.valid := Mux(dec.alu_use_imm,
-                                 tensorImm.data.valid,
-                                 io.acc.rd.data.valid & state === sExe)
+    tensorImm.data.valid,
+    io.acc.rd.data.valid & state === sExe)
   alu.io.acc_b.data.bits <> Mux(dec.alu_use_imm,
-                                tensorImm.data.bits,
-                                io.acc.rd.data.bits)
+    tensorImm.data.bits,
+    io.acc.rd.data.bits)
 
   // acc_o
   io.acc.wr.valid := alu.io.acc_y.data.valid
diff --git a/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala b/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala
index f982176c1e81..f2d295f66220 100644
--- a/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala
+++ b/vta/hardware/chisel/src/main/scala/core/TensorGemm.scala
@@ -47,9 +47,9 @@ class MAC(aBits: Int = 8, bBits: Int = 8, cBits: Int = 16) extends Module {
 }
 
 /** PipeAdder
-  *
-  * This unit loads input bits into register and performs addition in the next cycle
-  */
+ *
+ * This unit loads input bits into register and performs addition in the next cycle
+ */
 class PipeAdder(aBits: Int = 8, bBits: Int = 8) extends Module {
   val outBits = Math.max(aBits, bBits) + 1
   val io = IO(new Bundle {
@@ -65,10 +65,10 @@ class PipeAdder(aBits: Int = 8, bBits: Int = 8) extends Module {
 }
 
 /** Adder
-  *
-  * This unit wires input bits to an adder directly.
-  * The output comes out of combinational logic without waiting for another cycle.
-  */
+ *
+ * This unit wires input bits to an adder directly.
+ * The output comes out of combinational logic without waiting for another cycle.
+ */
 class Adder(aBits: Int = 8, bBits: Int = 8) extends Module {
   val outBits = Math.max(aBits, bBits) + 1
   val io = IO(new Bundle {
@@ -86,8 +86,7 @@ class Adder(aBits: Int = 8, bBits: Int = 8) extends Module {
 }
 
 /** Pipelined DotProduct based on MAC and PipeAdder */
-class DotProduct(aBits: Int = 8, bBits: Int = 8, size: Int = 16)
-    extends Module {
+class DotProduct(aBits: Int = 8, bBits: Int = 8, size: Int = 16) extends Module {
   val errorMsg =
     s"\n\n[VTA] [DotProduct] size must be greater than 4 and a power of 2\n\n"
   require(size >= 2 && isPow2(size), errorMsg)
@@ -175,16 +174,15 @@ class MatrixVectorMultiplication(implicit p: Parameters) extends Module {
 }
 
 /** TensorGemm.
-  *
-  * This unit instantiate the MatrixVectorMultiplication and go over the
-  * micro-ops (uops) which are used to read inputs, weights and biases,
-  * and writes results back to the acc and out scratchpads.
-  *
-  * Also, the TensorGemm uses the reset field in the Gemm instruction to
-  * clear or zero-out the acc-scratchpad locations based on the micro-ops.
-  */
-class TensorGemm(debug: Boolean = false)(implicit p: Parameters)
-    extends Module {
+ *
+ * This unit instantiates the MatrixVectorMultiplication and goes over the
+ * micro-ops (uops) which are used to read inputs, weights and biases,
+ * and writes results back to the acc and out scratchpads.
+ *
+ * Also, the TensorGemm uses the reset field in the Gemm instruction to
+ * clear or zero-out the acc-scratchpad locations based on the micro-ops.
+ */
+class TensorGemm(debug: Boolean = false)(implicit p: Parameters) extends Module {
   val io = IO(new Bundle {
     val start = Input(Bool())
     val done = Output(Bool())
@@ -268,11 +266,10 @@ class TensorGemm(debug: Boolean = false)(implicit p: Parameters)
     when((state === sReadTensor) && mvc.io.acc_o.data.valid) { // issue & commit
       inflight := inflight
     }.elsewhen(state === sReadTensor) { // issue a tensor
-        inflight := inflight + 1.U
-      }
-      .elsewhen(mvc.io.acc_o.data.valid) { // commit a tensor
-        inflight := inflight - 1.U
-      }
+      inflight := inflight + 1.U
+    }.elsewhen(mvc.io.acc_o.data.valid) { // commit a tensor
+      inflight := inflight - 1.U
+    }
   }
 
   when(
@@ -305,17 +302,16 @@ class TensorGemm(debug: Boolean = false)(implicit p: Parameters)
     inp_i := 0.U
     wgt_i := 0.U
   }.elsewhen(state === sReadUop && cnt_i === dec.lp_1) {
-      cnt_i := 0.U
-      acc_i := acc_o
-      inp_i := inp_o
-      wgt_i := wgt_o
-    }
-    .elsewhen(state === sExe && uop_idx === uop_end - 1.U) {
-      cnt_i := cnt_i + 1.U
-      acc_i := acc_i + dec.acc_1
-      inp_i := inp_i + dec.inp_1
-      wgt_i := wgt_i + dec.wgt_1
-    }
+    cnt_i := 0.U
+    acc_i := acc_o
+    inp_i := inp_o
+    wgt_i := wgt_o
+  }.elsewhen(state === sExe && uop_idx === uop_end - 1.U) {
+    cnt_i := cnt_i + 1.U
+    acc_i := acc_i + dec.acc_1
+    inp_i := inp_i + dec.inp_1
+    wgt_i := wgt_i + dec.wgt_1
+  }
 
   when(state === sComputeIdx && io.uop.data.valid) {
     uop_acc := io.uop.data.bits.u0 + acc_i
@@ -351,9 +347,8 @@ class TensorGemm(debug: Boolean = false)(implicit p: Parameters)
   mvc.io.acc_i.data <> io.acc.rd.data
 
   // acc_o
-  io.acc.wr.valid := mvc.io.acc_o.data.valid & Mux(dec.reset,
-                                                   true.B,
-                                                   wrpipe.io.deq.valid)
+  io.acc.wr.valid := mvc.io.acc_o.data.valid &
+    Mux(dec.reset, true.B, wrpipe.io.deq.valid)
   io.acc.wr.bits.idx := Mux(dec.reset, uop_acc, wrpipe.io.deq.bits)
   io.acc.wr.bits.data <> mvc.io.acc_o.data.bits
 
@@ -371,10 +366,7 @@ class TensorGemm(debug: Boolean = false)(implicit p: Parameters)
     }
 
     when(state === sReadTensor && ~dec.reset) {
-      printf("[TensorGemm] [uop] acc:%x inp:%x wgt:%x\n",
-             uop_acc,
-             uop_inp,
-             uop_wgt)
+      printf("[TensorGemm] [uop] acc:%x inp:%x wgt:%x\n", uop_acc, uop_inp, uop_wgt)
     }
 
     io.inp.rd.data.bits.zipWithIndex.foreach {
diff --git a/vta/hardware/chisel/src/main/scala/core/TensorLoad.scala b/vta/hardware/chisel/src/main/scala/core/TensorLoad.scala
index d184cd2c286a..f5cc849547a6 100644
--- a/vta/hardware/chisel/src/main/scala/core/TensorLoad.scala
+++ b/vta/hardware/chisel/src/main/scala/core/TensorLoad.scala
@@ -25,13 +25,13 @@ import vta.util.config._
 import vta.shell._
 
 /** TensorLoad.
-  *
-  * Load 1D and 2D tensors from main memory (DRAM) to input/weight
-  * scratchpads (SRAM). Also, there is support for zero padding, while
-  * doing the load. Zero-padding works on the y and x axis, and it is
-  * managed by TensorPadCtrl. The TensorDataCtrl is in charge of
-  * handling the way tensors are stored on the scratchpads.
-  */
+ *
+ * Load 1D and 2D tensors from main memory (DRAM) to input/weight
+ * scratchpads (SRAM). Also, there is support for zero padding, while
+ * doing the load. Zero-padding works on the y and x axis, and it is
+ * managed by TensorPadCtrl. The TensorDataCtrl is in charge of
+ * handling the way tensors are stored on the scratchpads.
+ */
 class TensorLoad(tensorType: String = "none", debug: Boolean = false)(
     implicit p: Parameters)
     extends Module {
@@ -71,11 +71,10 @@ class TensorLoad(tensorType: String = "none", debug: Boolean = false)(
         when(dec.ypad_0 =/= 0.U) {
           state := sYPad0
         }.elsewhen(dec.xpad_0 =/= 0.U) {
-            state := sXPad0
-          }
-          .otherwise {
-            state := sReadCmd
-          }
+          state := sXPad0
+        }.otherwise {
+          state := sReadCmd
+        }
       }
     }
     is(sYPad0) {
@@ -213,13 +212,12 @@ class TensorLoad(tensorType: String = "none", debug: Boolean = false)(
     waddr_cur := dec.sram_offset
     waddr_nxt := dec.sram_offset
   }.elsewhen((io.vme_rd.data
-      .fire() || isZeroPad) && set === (tp.tensorLength - 1).U && tag === (tp.numMemBlock - 1).U) {
-      waddr_cur := waddr_cur + 1.U
-    }
-    .elsewhen(dataCtrl.io.stride) {
-      waddr_cur := waddr_nxt + dec.xsize
-      waddr_nxt := waddr_nxt + dec.xsize
-    }
+    .fire() || isZeroPad) && set === (tp.tensorLength - 1).U && tag === (tp.numMemBlock - 1).U) {
+    waddr_cur := waddr_cur + 1.U
+  }.elsewhen(dataCtrl.io.stride) {
+    waddr_cur := waddr_nxt + dec.xsize
+    waddr_nxt := waddr_nxt + dec.xsize
+  }
 
   val tensorFile = Seq.fill(tp.tensorLength) {
     SyncReadMem(tp.memDepth, Vec(tp.numMemBlock, UInt(tp.memBlockBits.W)))
@@ -241,8 +239,8 @@ class TensorLoad(tensorType: String = "none", debug: Boolean = false)(
     val tdata = io.tensor.wr.bits.data(i).asUInt.asTypeOf(wdata(i))
     val muxWen =
       Mux(state === sIdle,
-          io.tensor.wr.valid,
-          (io.vme_rd.data.fire() | isZeroPad) & set === i.U)
+        io.tensor.wr.valid,
+        (io.vme_rd.data.fire() | isZeroPad) & set === i.U)
     val muxWaddr = Mux(state === sIdle, io.tensor.wr.bits.idx, waddr_cur)
     val muxWdata = Mux(state === sIdle, tdata, wdata(i))
     val muxWmask = Mux(state === sIdle, no_mask, wmask(i))
@@ -274,8 +272,8 @@ class TensorLoad(tensorType: String = "none", debug: Boolean = false)(
     if (tensorType == "inp") {
       when(io.vme_rd.cmd.fire()) {
         printf("[TensorLoad] [inp] cmd addr:%x len:%x\n",
-               dataCtrl.io.addr,
-               dataCtrl.io.len)
+          dataCtrl.io.addr,
+          dataCtrl.io.len)
       }
       when(state === sYPad0) {
         printf("[TensorLoad] [inp] sYPad0\n")
@@ -292,14 +290,14 @@ class TensorLoad(tensorType: String = "none", debug: Boolean = false)(
     } else if (tensorType == "wgt") {
       when(io.vme_rd.cmd.fire()) {
         printf("[TensorLoad] [wgt] cmd addr:%x len:%x\n",
-               dataCtrl.io.addr,
-               dataCtrl.io.len)
+          dataCtrl.io.addr,
+          dataCtrl.io.len)
       }
     } else if (tensorType == "acc") {
       when(io.vme_rd.cmd.fire()) {
         printf("[TensorLoad] [acc] cmd addr:%x len:%x\n",
-               dataCtrl.io.addr,
-               dataCtrl.io.len)
+          dataCtrl.io.addr,
+          dataCtrl.io.len)
       }
     }
   }
diff --git a/vta/hardware/chisel/src/main/scala/core/TensorStore.scala b/vta/hardware/chisel/src/main/scala/core/TensorStore.scala
index 083a70c5b119..439023be0934 100644
--- a/vta/hardware/chisel/src/main/scala/core/TensorStore.scala
+++ b/vta/hardware/chisel/src/main/scala/core/TensorStore.scala
@@ -25,9 +25,9 @@ import vta.util.config._
 import vta.shell._
 
 /** TensorStore.
-  *
-  * Store 1D and 2D tensors from out-scratchpad (SRAM) to main memory (DRAM).
-  */
+ *
+ * Store 1D and 2D tensors from out-scratchpad (SRAM) to main memory (DRAM).
+ */
 class TensorStore(tensorType: String = "none", debug: Boolean = false)(
     implicit p: Parameters)
     extends Module {
@@ -112,15 +112,14 @@ class TensorStore(tensorType: String = "none", debug: Boolean = false)(
             }
           }
         }.elsewhen(xrem < xmax) {
-            state := sWriteCmd
-            xlen := xrem
-            xrem := 0.U
-          }
-          .otherwise {
-            state := sWriteCmd
-            xlen := xmax - 1.U
-            xrem := xrem - xmax
-          }
+          state := sWriteCmd
+          xlen := xrem
+          xrem := 0.U
+        }.otherwise {
+          state := sWriteCmd
+          xlen := xmax - 1.U
+          xrem := xrem - xmax
+        }
       }
     }
   }
@@ -176,13 +175,12 @@ class TensorStore(tensorType: String = "none", debug: Boolean = false)(
     raddr_cur := dec.sram_offset
     raddr_nxt := dec.sram_offset
   }.elsewhen(io.vme_wr.data
-      .fire() && set === (tensorLength - 1).U && tag === (numMemBlock - 1).U) {
-      raddr_cur := raddr_cur + 1.U
-    }
-    .elsewhen(stride) {
-      raddr_cur := raddr_nxt + dec.xsize
-      raddr_nxt := raddr_nxt + dec.xsize
-    }
+    .fire() && set === (tensorLength - 1).U && tag === (numMemBlock - 1).U) {
+    raddr_cur := raddr_cur + 1.U
+  }.elsewhen(stride) {
+    raddr_cur := raddr_nxt + dec.xsize
+    raddr_nxt := raddr_nxt + dec.xsize
+  }
 
   val tread = Seq.tabulate(tensorLength) { i =>
     i.U ->
@@ -199,14 +197,11 @@ class TensorStore(tensorType: String = "none", debug: Boolean = false)(
     waddr_nxt := io.baddr | (maskOffset & (dec.dram_offset << log2Ceil(
       elemBytes)))
   }.elsewhen(state === sWriteAck && io.vme_wr.ack && xrem =/= 0.U) {
-      waddr_cur := waddr_cur + xmax_bytes
-    }
-    .elsewhen(stride) {
-      waddr_cur := waddr_nxt + (dec.xstride << log2Ceil(
-        tensorLength * tensorWidth))
-      waddr_nxt := waddr_nxt + (dec.xstride << log2Ceil(
-        tensorLength * tensorWidth))
-    }
+    waddr_cur := waddr_cur + xmax_bytes
+  }.elsewhen(stride) {
+    waddr_cur := waddr_nxt + (dec.xstride << log2Ceil(tensorLength * tensorWidth))
+    waddr_nxt := waddr_nxt + (dec.xstride << log2Ceil(tensorLength * tensorWidth))
+  }
 
   io.vme_wr.cmd.valid := state === sWriteCmd
   io.vme_wr.cmd.bits.addr := waddr_cur
@@ -231,12 +226,7 @@ class TensorStore(tensorType: String = "none", debug: Boolean = false)(
   if (debug) {
     when(io.vme_wr.cmd.fire()) {
       printf("[TensorStore] ysize:%x ycnt:%x raddr:%x waddr:%x len:%x rem:%x\n",
-             ysize,
-             ycnt,
-             raddr_cur,
-             waddr_cur,
-             xlen,
-             xrem)
+        ysize, ycnt, raddr_cur, waddr_cur, xlen, xrem)
     }
     when(io.vme_wr.data.fire()) {
       printf("[TensorStore] data:%x\n", io.vme_wr.data.bits)
diff --git a/vta/hardware/chisel/src/main/scala/core/TensorUtil.scala b/vta/hardware/chisel/src/main/scala/core/TensorUtil.scala
index 99e9012b71b8..6e6f7e776c0e 100644
--- a/vta/hardware/chisel/src/main/scala/core/TensorUtil.scala
+++ b/vta/hardware/chisel/src/main/scala/core/TensorUtil.scala
@@ -25,19 +25,18 @@ import vta.util.config._
 import vta.shell._
 
 /** TensorParams.
-  *
-  * This Bundle derives parameters for each tensorType, including inputs (inp),
-  * weights (wgt), biases (acc), and outputs (out). This is used to avoid
-  * doing the same boring calculations over and over again.
-  */
-class TensorParams(tensorType: String = "none")(implicit p: Parameters)
-    extends Bundle {
+ *
+ * This Bundle derives parameters for each tensorType, including inputs (inp),
+ * weights (wgt), biases (acc), and outputs (out). This is used to avoid
+ * doing the same boring calculations over and over again.
+ */
+class TensorParams(tensorType: String = "none")(implicit p: Parameters) extends Bundle {
   val errorMsg =
     s"\n\n[VTA] [TensorParams] only inp, wgt, acc, and out supported\n\n"
 
   require(tensorType == "inp" || tensorType == "wgt"
-            || tensorType == "acc" || tensorType == "out",
-          errorMsg)
+    || tensorType == "acc" || tensorType == "out",
+    errorMsg)
 
   val (tensorLength, tensorWidth, tensorElemBits) =
     if (tensorType == "inp")
@@ -66,14 +65,14 @@ class TensorParams(tensorType: String = "none")(implicit p: Parameters)
 }
 
 /** TensorMaster.
-  *
-  * This interface issue read and write tensor-requests to scratchpads. For example,
-  * The TensorGemm unit uses this interface for managing the inputs (inp), weights (wgt),
-  * biases (acc), and outputs (out).
-  *
-  */
-class TensorMaster(tensorType: String = "none")(implicit p: Parameters)
-    extends TensorParams(tensorType) {
+ *
+ * This interface issues read and write tensor-requests to scratchpads. For example,
+ * the TensorGemm unit uses this interface for managing the inputs (inp), weights (wgt),
+ * biases (acc), and outputs (out).
+ *
+ */
+class TensorMaster(tensorType: String = "none")
+  (implicit p: Parameters) extends TensorParams(tensorType) {
   val rd = new Bundle {
     val idx = ValidIO(UInt(memAddrBits.W))
     val data = Flipped(
@@ -101,13 +100,13 @@ class TensorMaster(tensorType: String = "none")(implicit p: Parameters)
 }
 
 /** TensorClient.
-  *
-  * This interface receives read and write tensor-requests to scratchpads. For example,
-  * The TensorLoad unit uses this interface for receiving read and write requests from
-  * the TensorGemm unit.
-  */
-class TensorClient(tensorType: String = "none")(implicit p: Parameters)
-    extends TensorParams(tensorType) {
+ *
+ * This interface receives read and write tensor-requests to scratchpads. For example,
+ * the TensorLoad unit uses this interface for receiving read and write requests from
+ * the TensorGemm unit.
+ */
+class TensorClient(tensorType: String = "none")
+  (implicit p: Parameters) extends TensorParams(tensorType) {
   val rd = new Bundle {
     val idx = Flipped(ValidIO(UInt(memAddrBits.W)))
     val data = ValidIO(
@@ -130,13 +129,13 @@ class TensorClient(tensorType: String = "none")(implicit p: Parameters)
 }
 
 /** TensorMasterData.
-  *
-  * This interface is only used for datapath only purposes and the direction convention
-  * is based on the TensorMaster interface, which means this is an input. This interface
-  * is used on datapath only module such MatrixVectorCore or AluVector.
-  */
-class TensorMasterData(tensorType: String = "none")(implicit p: Parameters)
-    extends TensorParams(tensorType) {
+ *
+ * This interface is used for datapath-only purposes and the direction convention
+ * is based on the TensorMaster interface, which means this is an input. This interface
+ * is used on datapath-only modules such as MatrixVectorCore or AluVector.
+ */
+class TensorMasterData(tensorType: String = "none")
+  (implicit p: Parameters) extends TensorParams(tensorType) {
   val data = Flipped(
     ValidIO(Vec(tensorLength, Vec(tensorWidth, UInt(tensorElemBits.W)))))
   override def cloneType =
@@ -144,13 +143,13 @@ class TensorMasterData(tensorType: String = "none")(implicit p: Parameters)
 }
 
 /** TensorClientData.
-  *
-  * This interface is only used for datapath only purposes and the direction convention
-  * is based on the TensorClient interface, which means this is an output. This interface
-  * is used on datapath only module such MatrixVectorCore or AluVector.
-  */
-class TensorClientData(tensorType: String = "none")(implicit p: Parameters)
-    extends TensorParams(tensorType) {
+ *
+ * This interface is used for datapath-only purposes and the direction convention
+ * is based on the TensorClient interface, which means this is an output. This interface
+ * is used on datapath-only modules such as MatrixVectorCore or AluVector.
+ */
+class TensorClientData(tensorType: String = "none")
+  (implicit p: Parameters) extends TensorParams(tensorType) {
   val data = ValidIO(
     Vec(tensorLength, Vec(tensorWidth, UInt(tensorElemBits.W))))
   override def cloneType =
@@ -158,13 +157,12 @@ class TensorClientData(tensorType: String = "none")(implicit p: Parameters)
 }
 
 /** TensorPadCtrl. Zero-padding controller for TensorLoad. */
-class TensorPadCtrl(padType: String = "none", sizeFactor: Int = 1)
-    extends Module {
+class TensorPadCtrl(padType: String = "none", sizeFactor: Int = 1) extends Module {
   val errorMsg =
     s"\n\n\n[VTA-ERROR] only YPad0, YPad1, XPad0, or XPad1 supported\n\n\n"
   require(padType == "YPad0" || padType == "YPad1"
-            || padType == "XPad0" || padType == "XPad1",
-          errorMsg)
+    || padType == "XPad0" || padType == "XPad1",
+    errorMsg)
 
   val io = IO(new Bundle {
     val start = Input(Bool())
@@ -233,9 +231,7 @@ class TensorPadCtrl(padType: String = "none", sizeFactor: Int = 1)
 
 /** TensorDataCtrl. Data controller for TensorLoad. */
 class TensorDataCtrl(tensorType: String = "none",
-                     sizeFactor: Int = 1,
-                     strideFactor: Int = 1)(implicit p: Parameters)
-    extends Module {
+    sizeFactor: Int = 1, strideFactor: Int = 1)(implicit p: Parameters) extends Module {
   val mp = p(ShellKey).memParams
   val io = IO(new Bundle {
     val start = Input(Bool())
diff --git a/vta/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala b/vta/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala
index 3318251fa454..a42891661851 100644
--- a/vta/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala
+++ b/vta/hardware/chisel/src/main/scala/dpi/VTAHostDPI.scala
@@ -32,9 +32,9 @@ trait VTAHostDPIParams {
 }
 
 /** Host master interface.
-  *
-  * This interface is tipically used by the Host
-  */
+ *
+ * This interface is typically used by the Host
+ */
 class VTAHostDPIMaster extends Bundle with VTAHostDPIParams {
   val req = new Bundle {
     val valid = Output(Bool())
@@ -47,9 +47,9 @@ class VTAHostDPIMaster extends Bundle with VTAHostDPIParams {
 }
 
 /** Host client interface.
-  *
-  * This interface is tipically used by the Accelerator
-  */
+ *
+ * This interface is typically used by the Accelerator
+ */
 class VTAHostDPIClient extends Bundle with VTAHostDPIParams {
   val req = new Bundle {
     val valid = Input(Bool())
@@ -62,9 +62,9 @@ class VTAHostDPIClient extends Bundle with VTAHostDPIParams {
 }
 
 /** Host DPI module.
-  *
-  * Wrapper for Host Verilog DPI module.
-  */
+ *
+ * Wrapper for Host Verilog DPI module.
+ */
 class VTAHostDPI extends BlackBox with HasBlackBoxResource {
   val io = IO(new Bundle {
     val clock = Input(Clock())
@@ -75,11 +75,10 @@ class VTAHostDPI extends BlackBox with HasBlackBoxResource {
 }
 
 /** Host DPI to AXI Converter.
-  *
-  * Convert Host DPI to AXI for VTAShell
-  */
-class VTAHostDPIToAXI(debug: Boolean = false)(implicit p: Parameters)
-    extends Module {
+ *
+ * Convert Host DPI to AXI for VTAShell
+ */
+class VTAHostDPIToAXI(debug: Boolean = false)(implicit p: Parameters) extends Module {
   val io = IO(new Bundle {
     val dpi = new VTAHostDPIClient
     val axi = new AXILiteMaster(p(ShellKey).hostParams)
diff --git a/vta/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala b/vta/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala
index f46b778966fc..bffbc1c651cf 100644
--- a/vta/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala
+++ b/vta/hardware/chisel/src/main/scala/dpi/VTAMemDPI.scala
@@ -33,9 +33,9 @@ trait VTAMemDPIParams {
 }
 
 /** Memory master interface.
-  *
-  * This interface is tipically used by the Accelerator
-  */
+ *
+ * This interface is typically used by the Accelerator
+ */
 class VTAMemDPIMaster extends Bundle with VTAMemDPIParams {
   val req = new Bundle {
     val valid = Output(Bool())
@@ -48,9 +48,9 @@ class VTAMemDPIMaster extends Bundle with VTAMemDPIParams {
 }
 
 /** Memory client interface.
-  *
-  * This interface is tipically used by the Host
-  */
+ *
+ * This interface is typically used by the Host
+ */
 class VTAMemDPIClient extends Bundle with VTAMemDPIParams {
   val req = new Bundle {
     val valid = Input(Bool())
@@ -63,9 +63,9 @@ class VTAMemDPIClient extends Bundle with VTAMemDPIParams {
 }
 
 /** Memory DPI module.
-  *
-  * Wrapper for Memory Verilog DPI module.
-  */
+ *
+ * Wrapper for Memory Verilog DPI module.
+ */
 class VTAMemDPI extends BlackBox with HasBlackBoxResource {
   val io = IO(new Bundle {
     val clock = Input(Clock())
@@ -75,8 +75,7 @@ class VTAMemDPI extends BlackBox with HasBlackBoxResource {
   setResource("/verilog/VTAMemDPI.v")
 }
 
-class VTAMemDPIToAXI(debug: Boolean = false)(implicit p: Parameters)
-    extends Module {
+class VTAMemDPIToAXI(debug: Boolean = false)(implicit p: Parameters) extends Module {
   val io = IO(new Bundle {
     val dpi = new VTAMemDPIMaster
     val axi = new AXIClient(p(ShellKey).memParams)
@@ -173,13 +172,13 @@ class VTAMemDPIToAXI(debug: Boolean = false)(implicit p: Parameters)
     }
     when(io.axi.r.fire()) {
       printf("[VTAMemDPIToAXI] [R] last:%x data:%x\n",
-             io.axi.r.bits.last,
-             io.axi.r.bits.data)
+        io.axi.r.bits.last,
+        io.axi.r.bits.data)
     }
     when(io.axi.w.fire()) {
       printf("[VTAMemDPIToAXI] [W] last:%x data:%x\n",
-             io.axi.w.bits.last,
-             io.axi.w.bits.data)
+        io.axi.w.bits.last,
+        io.axi.w.bits.data)
     }
   }
 }
diff --git a/vta/hardware/chisel/src/main/scala/dpi/VTASimDPI.scala b/vta/hardware/chisel/src/main/scala/dpi/VTASimDPI.scala
index 33b1101e55f8..2f2532804eec 100644
--- a/vta/hardware/chisel/src/main/scala/dpi/VTASimDPI.scala
+++ b/vta/hardware/chisel/src/main/scala/dpi/VTASimDPI.scala
@@ -26,9 +26,9 @@ import vta.interface.axi._
 import vta.shell._
 
 /** Sim DPI module.
-  *
-  * Wrapper for Sim Verilog DPI module.
-  */
+ *
+ * Wrapper for Sim Verilog DPI module.
+ */
 class VTASimDPI extends BlackBox with HasBlackBoxResource {
   val io = IO(new Bundle {
     val clock = Input(Clock())
diff --git a/vta/hardware/chisel/src/main/scala/interface/axi/AXI.scala b/vta/hardware/chisel/src/main/scala/interface/axi/AXI.scala
index 8fd0fa896b61..515159075602 100644
--- a/vta/hardware/chisel/src/main/scala/interface/axi/AXI.scala
+++ b/vta/hardware/chisel/src/main/scala/interface/axi/AXI.scala
@@ -55,7 +55,7 @@ case class AXIParams(
 }
 
 abstract class AXIBase(params: AXIParams)
-    extends GenericParameterizedBundle(params)
+  extends GenericParameterizedBundle(params)
 
 // AXILite
 
diff --git a/vta/hardware/chisel/src/main/scala/shell/Configs.scala b/vta/hardware/chisel/src/main/scala/shell/Configs.scala
index 3c271f56fa17..b0c54029b1a4 100644
--- a/vta/hardware/chisel/src/main/scala/shell/Configs.scala
+++ b/vta/hardware/chisel/src/main/scala/shell/Configs.scala
@@ -25,59 +25,56 @@ import vta.util.config._
 import vta.interface.axi._
 
 /** PynqConfig. Shell configuration for Pynq */
-class PynqConfig
-    extends Config((site, here, up) => {
-      case ShellKey =>
-        ShellParams(
-          hostParams = AXIParams(coherent = false,
-                                 addrBits = 16,
-                                 dataBits = 32,
-                                 lenBits = 8,
-                                 userBits = 1),
-          memParams = AXIParams(coherent = true,
-                                addrBits = 32,
-                                dataBits = 64,
-                                lenBits = 8,
-                                userBits = 1),
-          vcrParams = VCRParams(),
-          vmeParams = VMEParams()
-        )
-    })
+class PynqConfig extends Config((site, here, up) => {
+  case ShellKey =>
+    ShellParams(
+      hostParams = AXIParams(coherent = false,
+        addrBits = 16,
+        dataBits = 32,
+        lenBits = 8,
+        userBits = 1),
+      memParams = AXIParams(coherent = true,
+        addrBits = 32,
+        dataBits = 64,
+        lenBits = 8,
+        userBits = 1),
+      vcrParams = VCRParams(),
+      vmeParams = VMEParams()
+    )
+})
 
 /** F1Config. Shell configuration for F1 */
-class F1Config
-    extends Config((site, here, up) => {
-      case ShellKey =>
-        ShellParams(
-          hostParams = AXIParams(coherent = false,
-                                 addrBits = 16,
-                                 dataBits = 32,
-                                 lenBits = 8,
-                                 userBits = 1),
-          memParams = AXIParams(coherent = false,
-                                addrBits = 64,
-                                dataBits = 64,
-                                lenBits = 8,
-                                userBits = 1),
-          vcrParams = VCRParams(),
-          vmeParams = VMEParams()
-        )
-    })
+class F1Config extends Config((site, here, up) => {
+  case ShellKey =>
+    ShellParams(
+      hostParams = AXIParams(coherent = false,
+        addrBits = 16,
+        dataBits = 32,
+        lenBits = 8,
+        userBits = 1),
+      memParams = AXIParams(coherent = false,
+        addrBits = 64,
+        dataBits = 64,
+        lenBits = 8,
+        userBits = 1),
+      vcrParams = VCRParams(),
+      vmeParams = VMEParams()
+    )
+})
 
 /** De10Config. Shell configuration for De10 */
-class De10Config
-    extends Config((site, here, up) => {
-      case ShellKey =>
-        ShellParams(
-          hostParams =
-            AXIParams(addrBits = 16, dataBits = 32, idBits = 13, lenBits = 4),
-          memParams = AXIParams(
-            addrBits = 32,
-            dataBits = 64,
-            userBits = 5,
-            lenBits = 4, // limit to 16 beats, instead of 256 beats in AXI4
-            coherent = true),
-          vcrParams = VCRParams(),
-          vmeParams = VMEParams()
-        )
-    })
+class De10Config extends Config((site, here, up) => {
+  case ShellKey =>
+    ShellParams(
+      hostParams =
+        AXIParams(addrBits = 16, dataBits = 32, idBits = 13, lenBits = 4),
+      memParams = AXIParams(
+        addrBits = 32,
+        dataBits = 64,
+        userBits = 5,
+        lenBits = 4,  // limit to 16 beats, instead of 256 beats in AXI4
+        coherent = true),
+      vcrParams = VCRParams(),
+      vmeParams = VMEParams()
+    )
+})
diff --git a/vta/hardware/chisel/src/main/scala/shell/IntelShell.scala b/vta/hardware/chisel/src/main/scala/shell/IntelShell.scala
index 6eb22224e8d7..e1b6995decd0 100644
--- a/vta/hardware/chisel/src/main/scala/shell/IntelShell.scala
+++ b/vta/hardware/chisel/src/main/scala/shell/IntelShell.scala
@@ -25,10 +25,10 @@ import vta.interface.axi._
 import vta.core._
 
 /** IntelShell.
-  *
-  * The IntelShell is based on a VME, VCR and core. This creates a complete VTA
-  * system that can be used for simulation or real hardware.
-  */
+ *
+ * The IntelShell is based on a VME, VCR and core. This creates a complete VTA
+ * system that can be used for simulation or real hardware.
+ */
 class IntelShell(implicit p: Parameters) extends Module {
   val io = IO(new Bundle {
     val host = new AXIClient(p(ShellKey).hostParams)
diff --git a/vta/hardware/chisel/src/main/scala/shell/SimShell.scala b/vta/hardware/chisel/src/main/scala/shell/SimShell.scala
index 30b84d63dbc4..0909d1bfe47e 100644
--- a/vta/hardware/chisel/src/main/scala/shell/SimShell.scala
+++ b/vta/hardware/chisel/src/main/scala/shell/SimShell.scala
@@ -27,11 +27,11 @@ import vta.shell._
 import vta.dpi._
 
 /** VTAHost.
-  *
-  * This module translate the DPI protocol into AXI. This is a simulation only
-  * module and used to test host-to-VTA communication. This module should be updated
-  * for testing hosts using a different bus protocol, other than AXI.
-  */
+ *
+ * This module translates the DPI protocol into AXI. This is a simulation-only
+ * module used to test host-to-VTA communication. This module should be updated
+ * for testing hosts using a bus protocol other than AXI.
+ */
 class VTAHost(implicit p: Parameters) extends Module {
   val io = IO(new Bundle {
     val axi = new AXILiteMaster(p(ShellKey).hostParams)
@@ -45,11 +45,11 @@ class VTAHost(implicit p: Parameters) extends Module {
 }
 
 /** VTAMem.
-  *
-  * This module translate the DPI protocol into AXI. This is a simulation only
-  * module and used to test VTA-to-memory communication. This module should be updated
-  * for testing memories using a different bus protocol, other than AXI.
-  */
+ *
+ * This module translates the DPI protocol into AXI. This is a simulation-only
+ * module used to test VTA-to-memory communication. This module should be updated
+ * for testing memories using a bus protocol other than AXI.
+ */
 class VTAMem(implicit p: Parameters) extends Module {
   val io = IO(new Bundle {
     val axi = new AXIClient(p(ShellKey).memParams)
@@ -63,12 +63,12 @@ class VTAMem(implicit p: Parameters) extends Module {
 }
 
 /** VTASim.
-  *
-  * This module is used to handle hardware simulation thread, such as halting
-  * or terminating the simulation thread. The sim_wait port is used to halt
-  * the simulation thread when it is asserted and resume it when it is
-  * de-asserted.
-  */
+ *
+ * This module is used to handle the hardware simulation thread, such as halting
+ * or terminating the simulation thread. The sim_wait port is used to halt
+ * the simulation thread when it is asserted and resume it when it is
+ * de-asserted.
+ */
 class VTASim(implicit p: Parameters) extends MultiIOModule {
   val sim_wait = IO(Output(Bool()))
   val sim = Module(new VTASimDPI)
@@ -78,11 +78,11 @@ class VTASim(implicit p: Parameters) extends MultiIOModule {
 }
 
 /** SimShell.
-  *
-  * The simulation shell instantiate the sim, host and memory DPI modules that
-  * are connected to the VTAShell. An extra clock, sim_clock, is used to eval
-  * the VTASim DPI function when the main simulation clock is on halt state.
-  */
+ *
+ * The simulation shell instantiates the sim, host and memory DPI modules that
+ * are connected to the VTAShell. An extra clock, sim_clock, is used to evaluate
+ * the VTASim DPI function when the main simulation clock is in the halt state.
+ */
 class SimShell(implicit p: Parameters) extends MultiIOModule {
   val mem = IO(new AXIClient(p(ShellKey).memParams))
   val host = IO(new AXILiteMaster(p(ShellKey).hostParams))
diff --git a/vta/hardware/chisel/src/main/scala/shell/VCR.scala b/vta/hardware/chisel/src/main/scala/shell/VCR.scala
index 517f58179b5c..3e74a256d537 100644
--- a/vta/hardware/chisel/src/main/scala/shell/VCR.scala
+++ b/vta/hardware/chisel/src/main/scala/shell/VCR.scala
@@ -26,9 +26,9 @@ import vta.util.genericbundle._
 import vta.interface.axi._
 
 /** VCR parameters.
-  *
-  * These parameters are used on VCR interfaces and modules.
-  */
+ *
+ * These parameters are used on VCR interfaces and modules.
+ */
 case class VCRParams() {
   val nCtrl = 1
   val nECnt = 1
@@ -38,14 +38,13 @@ case class VCRParams() {
 }
 
 /** VCRBase. Parametrize base class. */
-abstract class VCRBase(implicit p: Parameters)
-    extends GenericParameterizedBundle(p)
+abstract class VCRBase(implicit p: Parameters) extends GenericParameterizedBundle(p)
 
 /** VCRMaster.
-  *
-  * This is the master interface used by VCR in the VTAShell to control
-  * the Core unit.
-  */
+ *
+ * This is the master interface used by VCR in the VTAShell to control
+ * the Core unit.
+ */
 class VCRMaster(implicit p: Parameters) extends VCRBase {
   val vp = p(ShellKey).vcrParams
   val mp = p(ShellKey).memParams
@@ -57,10 +56,10 @@ class VCRMaster(implicit p: Parameters) extends VCRBase {
 }
 
 /** VCRClient.
-  *
-  * This is the client interface used by the Core module to communicate
-  * to the VCR in the VTAShell.
-  */
+ *
+ * This is the client interface used by the Core module to communicate
+ * to the VCR in the VTAShell.
+ */
 class VCRClient(implicit p: Parameters) extends VCRBase {
   val vp = p(ShellKey).vcrParams
   val mp = p(ShellKey).memParams
@@ -72,12 +71,12 @@ class VCRClient(implicit p: Parameters) extends VCRBase {
 }
 
 /** VTA Control Registers (VCR).
-  *
-  * This unit provides control registers (32 and 64 bits) to be used by a control'
-  * unit, typically a host processor. These registers are read-only by the core
-  * at the moment but this will likely change once we add support to general purpose
-  * registers that could be used as event counters by the Core unit.
-  */
+ *
+ * This unit provides control registers (32 and 64 bits) to be used by a control
+ * unit, typically a host processor. These registers are read-only by the core
+ * at the moment but this will likely change once we add support for general-purpose
+ * registers that could be used as event counters by the Core unit.
+ */
 class VCR(implicit p: Parameters) extends Module {
   val io = IO(new Bundle {
     val host = new AXILiteClient(p(ShellKey).hostParams)
diff --git a/vta/hardware/chisel/src/main/scala/shell/VME.scala b/vta/hardware/chisel/src/main/scala/shell/VME.scala
index 949929a179fc..41b24d1ba7aa 100644
--- a/vta/hardware/chisel/src/main/scala/shell/VME.scala
+++ b/vta/hardware/chisel/src/main/scala/shell/VME.scala
@@ -26,27 +26,26 @@ import vta.util.genericbundle._
 import vta.interface.axi._
 
 /** VME parameters.
-  *
-  * These parameters are used on VME interfaces and modules.
-  */
+ *
+ * These parameters are used on VME interfaces and modules.
+ */
 case class VMEParams() {
   val nReadClients: Int = 5
   val nWriteClients: Int = 1
   require(nReadClients > 0,
-          s"\n\n[VTA] [VMEParams] nReadClients must be larger than 0\n\n")
+    s"\n\n[VTA] [VMEParams] nReadClients must be larger than 0\n\n")
   require(
     nWriteClients == 1,
     s"\n\n[VTA] [VMEParams] nWriteClients must be 1, only one-write-client support atm\n\n")
 }
 
 /** VMEBase. Parametrize base class. */
-abstract class VMEBase(implicit p: Parameters)
-    extends GenericParameterizedBundle(p)
+abstract class VMEBase(implicit p: Parameters) extends GenericParameterizedBundle(p)
 
 /** VMECmd.
-  *
-  * This interface is used for creating write and read requests to memory.
-  */
+ *
+ * This interface is used for creating write and read requests to memory.
+ */
 class VMECmd(implicit p: Parameters) extends VMEBase {
   val addrBits = p(ShellKey).memParams.addrBits
   val lenBits = p(ShellKey).memParams.lenBits
@@ -55,10 +54,10 @@ class VMECmd(implicit p: Parameters) extends VMEBase {
 }
 
 /** VMEReadMaster.
-  *
-  * This interface is used by modules inside the core to generate read requests
-  * and receive responses from VME.
-  */
+ *
+ * This interface is used by modules inside the core to generate read requests
+ * and receive responses from VME.
+ */
 class VMEReadMaster(implicit p: Parameters) extends Bundle {
   val dataBits = p(ShellKey).memParams.dataBits
   val cmd = Decoupled(new VMECmd)
@@ -68,10 +67,10 @@ class VMEReadMaster(implicit p: Parameters) extends Bundle {
 }
 
 /** VMEReadClient.
-  *
-  * This interface is used by the VME to receive read requests and generate
-  * responses to modules inside the core.
-  */
+ *
+ * This interface is used by the VME to receive read requests and generate
+ * responses to modules inside the core.
+ */
 class VMEReadClient(implicit p: Parameters) extends Bundle {
   val dataBits = p(ShellKey).memParams.dataBits
   val cmd = Flipped(Decoupled(new VMECmd))
@@ -81,10 +80,10 @@ class VMEReadClient(implicit p: Parameters) extends Bundle {
 }
 
 /** VMEWriteMaster.
-  *
-  * This interface is used by modules inside the core to generate write requests
-  * to the VME.
-  */
+ *
+ * This interface is used by modules inside the core to generate write requests
+ * to the VME.
+ */
 class VMEWriteMaster(implicit p: Parameters) extends Bundle {
   val dataBits = p(ShellKey).memParams.dataBits
   val cmd = Decoupled(new VMECmd)
@@ -95,10 +94,10 @@ class VMEWriteMaster(implicit p: Parameters) extends Bundle {
 }
 
 /** VMEWriteClient.
-  *
-  * This interface is used by the VME to handle write requests from modules inside
-  * the core.
-  */
+ *
+ * This interface is used by the VME to handle write requests from modules inside
+ * the core.
+ */
 class VMEWriteClient(implicit p: Parameters) extends Bundle {
   val dataBits = p(ShellKey).memParams.dataBits
   val cmd = Flipped(Decoupled(new VMECmd))
@@ -109,10 +108,10 @@ class VMEWriteClient(implicit p: Parameters) extends Bundle {
 }
 
 /** VMEMaster.
-  *
-  * Pack nRd number of VMEReadMaster interfaces and nWr number of VMEWriteMaster
-  * interfaces.
-  */
+ *
+ * Pack nRd number of VMEReadMaster interfaces and nWr number of VMEWriteMaster
+ * interfaces.
+ */
 class VMEMaster(implicit p: Parameters) extends Bundle {
   val nRd = p(ShellKey).vmeParams.nReadClients
   val nWr = p(ShellKey).vmeParams.nWriteClients
@@ -121,10 +120,10 @@ class VMEMaster(implicit p: Parameters) extends Bundle {
 }
 
 /** VMEClient.
-  *
-  * Pack nRd number of VMEReadClient interfaces and nWr number of VMEWriteClient
-  * interfaces.
-  */
+ *
+ * Pack nRd number of VMEReadClient interfaces and nWr number of VMEWriteClient
+ * interfaces.
+ */
 class VMEClient(implicit p: Parameters) extends Bundle {
   val nRd = p(ShellKey).vmeParams.nReadClients
   val nWr = p(ShellKey).vmeParams.nWriteClients
@@ -133,10 +132,10 @@ class VMEClient(implicit p: Parameters) extends Bundle {
 }
 
 /** VTA Memory Engine (VME).
-  *
-  * This unit multiplexes the memory controller interface for the Core. Currently,
-  * it supports single-writer and multiple-reader mode and it is also based on AXI.
-  */
+ *
+ * This unit multiplexes the memory controller interface for the Core. Currently,
+ * it supports single-writer and multiple-reader mode and it is also based on AXI.
+ */
 class VME(implicit p: Parameters) extends Module {
   val io = IO(new Bundle {
     val mem = new AXIMaster(p(ShellKey).memParams)
diff --git a/vta/hardware/chisel/src/main/scala/shell/VTAShell.scala b/vta/hardware/chisel/src/main/scala/shell/VTAShell.scala
index 782aeae351e1..650a1c56c352 100644
--- a/vta/hardware/chisel/src/main/scala/shell/VTAShell.scala
+++ b/vta/hardware/chisel/src/main/scala/shell/VTAShell.scala
@@ -35,10 +35,10 @@ case class ShellParams(
 case object ShellKey extends Field[ShellParams]
 
 /** VTAShell.
-  *
-  * The VTAShell is based on a VME, VCR and core. This creates a complete VTA
-  * system that can be used for simulation or real hardware.
-  */
+ *
+ * The VTAShell is based on a VME, VCR and core. This creates a complete VTA
+ * system that can be used for simulation or real hardware.
+ */
 class VTAShell(implicit p: Parameters) extends Module {
   val io = IO(new Bundle {
     val host = new AXILiteClient(p(ShellKey).hostParams)
diff --git a/vta/hardware/chisel/src/main/scala/shell/XilinxShell.scala b/vta/hardware/chisel/src/main/scala/shell/XilinxShell.scala
index ec7bffb89657..28f95ea36bc4 100644
--- a/vta/hardware/chisel/src/main/scala/shell/XilinxShell.scala
+++ b/vta/hardware/chisel/src/main/scala/shell/XilinxShell.scala
@@ -25,10 +25,10 @@ import vta.util.config._
 import vta.interface.axi._
 
 /** XilinxShell.
-  *
-  * This is a wrapper shell mostly used to match Xilinx convention naming,
-  * therefore we can pack VTA as an IP for IPI based flows.
-  */
+ *
+ * This is a wrapper shell mostly used to match Xilinx convention naming,
+ * therefore we can pack VTA as an IP for IPI based flows.
+ */
 class XilinxShell(implicit p: Parameters) extends RawModule {
 
   val hp = p(ShellKey).hostParams
diff --git a/vta/hardware/chisel/src/main/scala/util/Config.scala b/vta/hardware/chisel/src/main/scala/util/Config.scala
index 41104c44810a..d63d95665571 100644
--- a/vta/hardware/chisel/src/main/scala/util/Config.scala
+++ b/vta/hardware/chisel/src/main/scala/util/Config.scala
@@ -46,7 +46,7 @@ abstract class Parameters extends View {
     new ChainParameters(this, x)
 
   final def alter(
-      f: (View, View, View) => PartialFunction[Any, Any]): Parameters =
+    f: (View, View, View) => PartialFunction[Any, Any]): Parameters =
     Parameters(f) ++ this
 
   final def alterPartial(f: PartialFunction[Any, Any]): Parameters =
@@ -56,8 +56,8 @@ abstract class Parameters extends View {
     new MapParameters(m) ++ this
 
   protected[config] def chain[T](site: View,
-                                 tail: View,
-                                 pname: Field[T]): Option[T]
+    tail: View,
+    pname: Field[T]): Option[T]
   protected[config] def find[T](pname: Field[T], site: View) =
     chain(site, new TerminalView, pname)
 }
diff --git a/vta/hardware/chisel/src/main/scala/util/GenericParameterizedBundle.scala b/vta/hardware/chisel/src/main/scala/util/GenericParameterizedBundle.scala
index db8f5d241222..063e76673396 100644
--- a/vta/hardware/chisel/src/main/scala/util/GenericParameterizedBundle.scala
+++ b/vta/hardware/chisel/src/main/scala/util/GenericParameterizedBundle.scala
@@ -23,8 +23,8 @@ package vta.util.genericbundle
 
 import chisel3._
 
-abstract class GenericParameterizedBundle[+T <: Object](val params: T)
-    extends Bundle {
+abstract class GenericParameterizedBundle[+T <: Object]
+  (val params: T) extends Bundle {
   override def cloneType = {
     try {
       this.getClass.getConstructors.head
diff --git a/vta/hardware/chisel/src/main/scala/vta/Configs.scala b/vta/hardware/chisel/src/main/scala/vta/Configs.scala
index f137ab6c5330..350379b2ec22 100644
--- a/vta/hardware/chisel/src/main/scala/vta/Configs.scala
+++ b/vta/hardware/chisel/src/main/scala/vta/Configs.scala
@@ -26,11 +26,11 @@ import vta.core._
 import vta.test._
 
 /** VTA.
-  *
-  * This file contains all the configurations supported by VTA.
-  * These configurations are built in a mix/match form based on core
-  * and shell configurations.
-  */
+ *
+ * This file contains all the configurations supported by VTA.
+ * These configurations are built in a mix/match form based on core
+ * and shell configurations.
+ */
 class DefaultPynqConfig extends Config(new CoreConfig ++ new PynqConfig)
 class DefaultF1Config extends Config(new CoreConfig ++ new F1Config)
 class DefaultDe10Config extends Config(new CoreConfig ++ new De10Config)