diff --git a/build/gguf_loader.py b/build/gguf_loader.py
index 464f4536a..033cec212 100644
--- a/build/gguf_loader.py
+++ b/build/gguf_loader.py
@@ -5,14 +5,12 @@
 # LICENSE file in the root directory of this source tree.
 
-import argparse
-
 import copy
 import logging
 import sys
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Dict, Mapping
+from typing import Any, Dict
 
 import gguf
 
@@ -31,8 +29,6 @@
 wd = Path(__file__).parent.resolve()
 sys.path.append(str(wd))
 
-from typing import Set
-
 from model import ModelArgs, Transformer
 
 logger: logging.Logger = logging.getLogger(__name__)
diff --git a/build/gguf_util.py b/build/gguf_util.py
index 9f8a07661..fe631781e 100644
--- a/build/gguf_util.py
+++ b/build/gguf_util.py
@@ -63,7 +63,7 @@ def test_by_to_float(source_file: str, target_file: str) -> None:
         )
         print("First 5 elements of converted source: ", source.reshape(-1)[0:5])
         print("First 5 elements of target: ", target.reshape(-1)[0:5])
-        assert False, "found mismatch"
+        raise AssertionError("found mismatch")
 
     print("All tensors match.")
diff --git a/build/model.py b/build/model.py
index 655405d9f..4786434f8 100644
--- a/build/model.py
+++ b/build/model.py
@@ -95,48 +95,58 @@ def from_name(cls, name: str):
 
 
 transformer_configs = {
-    "CodeLlama-7b-Python-hf": dict(
-        block_size=16384, vocab_size=32000, n_layers=32, dim=4096, rope_base=1000000
-    ),
-    "7B": dict(n_layers=32, n_heads=32, dim=4096),
-    "13B": dict(n_layers=40, n_heads=40, dim=5120),
-    "30B": dict(n_layers=60, n_heads=52, dim=6656),
-    "34B": dict(
-        n_layers=48,
-        n_heads=64,
-        dim=8192,
-        vocab_size=32000,
-        n_local_heads=8,
-        hidden_dim=22016,
-        rope_base=1000000,
-    ),  # CodeLlama-34B-Python-hf
-    "70B": dict(n_layers=80, n_heads=64, dim=8192, n_local_heads=8, hidden_dim=28672),
-    "Mistral-7B": dict(
-        n_layers=32,
-        n_heads=32,
-        n_local_heads=8,
-        dim=4096,
-        hidden_dim=14336,
-        vocab_size=32000,
-    ),
-    "Mistral-7B-Instruct-v0.1": dict(
-        n_layers=32,
-        n_heads=32,
-        n_local_heads=8,
-        dim=4096,
-        hidden_dim=14336,
-        vocab_size=32000,
-    ),
-    "Mistral-7B-Instruct-v0.2": dict(
-        n_layers=32,
-        n_heads=32,
-        n_local_heads=8,
-        dim=4096,
-        hidden_dim=14336,
-        vocab_size=32000,
-    ),
-    "stories15M": dict(n_layers=6, n_heads=6, dim=288),
-    "stories110M": dict(n_layers=12, n_heads=12, dim=768),
+    "CodeLlama-7b-Python-hf": {
+        "block_size": 16384,
+        "vocab_size": 32000,
+        "n_layers": 32,
+        "dim": 4096,
+        "rope_base": 1000000,
+    },
+    "7B": {"n_layers": 32, "n_heads": 32, "dim": 4096},
+    "13B": {"n_layers": 40, "n_heads": 40, "dim": 5120},
+    "30B": {"n_layers": 60, "n_heads": 52, "dim": 6656},
+    "34B": {
+        "n_layers": 48,
+        "n_heads": 64,
+        "dim": 8192,
+        "vocab_size": 32000,
+        "n_local_heads": 8,
+        "hidden_dim": 22016,
+        "rope_base": 1000000,
+    },  # CodeLlama-34B-Python-hf
+    "70B": {
+        "n_layers": 80,
+        "n_heads": 64,
+        "dim": 8192,
+        "n_local_heads": 8,
+        "hidden_dim": 28672,
+    },
+    "Mistral-7B": {
+        "n_layers": 32,
+        "n_heads": 32,
+        "n_local_heads": 8,
+        "dim": 4096,
+        "hidden_dim": 14336,
+        "vocab_size": 32000,
+    },
+    "Mistral-7B-Instruct-v0.1": {
+        "n_layers": 32,
+        "n_heads": 32,
+        "n_local_heads": 8,
+        "dim": 4096,
+        "hidden_dim": 14336,
+        "vocab_size": 32000,
+    },
+    "Mistral-7B-Instruct-v0.2": {
+        "n_layers": 32,
+        "n_heads": 32,
+        "n_local_heads": 8,
+        "dim": 4096,
+        "hidden_dim": 14336,
+        "vocab_size": 32000,
+    },
+    "stories15M": {"n_layers": 6, "n_heads": 6, "dim": 288},
+    "stories110M": {"n_layers": 12, "n_heads": 12, "dim": 768},
 }
@@ -216,7 +226,7 @@ def forward(self, idx: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
         freqs_cis = self.freqs_cis[input_pos]
         x = self.tok_embeddings(idx)
 
-        for i, layer in enumerate(self.layers):
+        for _, layer in enumerate(self.layers):
             x = layer(x, input_pos, freqs_cis, mask)
         x = self.norm(x)
         logits = self.output(x)
@@ -344,7 +354,7 @@ def forward(
         q = apply_rotary_emb(q, freqs_cis)
         k = apply_rotary_emb(k, freqs_cis)
 
-        q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
+        q, k, v = (x.transpose(1, 2) for x in (q, k, v))
 
         if self.kv_cache is not None:
             k, v = self.kv_cache.update(input_pos, k, v)
diff --git a/build/model_aoti.py b/build/model_aoti.py
index 50fa2b939..b7c1bda56 100644
--- a/build/model_aoti.py
+++ b/build/model_aoti.py
@@ -1,12 +1,6 @@
-from ctypes import c_void_p
-
 import torch
 import torch.nn as nn
-from torch import empty
-from torch._dynamo.testing import rand_strided
 from torch._inductor.codecache import AsyncCompile
-from torch._inductor.utils import print_performance
-from torch._inductor.wrapper_benchmark import compiled_module_main
 
 # with open("./dso_model.h", "rb") as f:
 #     dso_src = f.read().decode("utf-8")
diff --git a/build/model_et.py b/build/model_et.py
index f7bd02194..747a7ca31 100644
--- a/build/model_et.py
+++ b/build/model_et.py
@@ -1,9 +1,6 @@
-from ctypes import c_void_p
-
 import torch
 import torch.nn as nn
 from executorch.extension.pybindings import portable_lib as exec_lib
-from torch import empty
 
 
 class PTEModel(nn.Module):
diff --git a/eval.py b/eval.py
index 6d719d460..f681d8418 100644
--- a/eval.py
+++ b/eval.py
@@ -165,7 +165,7 @@ def _model_generate(self, context, max_length, eos_token_id):
 def eval(
     model: Transformer,
     tokenizer,
-    tasks: list = ["hellaswag"],
+    tasks: Optional[list] = None,
     limit: Optional[int] = None,
     max_seq_length: Optional[int] = None,
 ) -> dict:
@@ -182,6 +182,9 @@ def eval(
     Returns:
         eval_results (dict): A dictionary of evaluation results for the specified task(s).
     """
+    if tasks is None:
+        tasks = ["hellaswag"]
+
     model_eval_wrapper = GPTFastEvalWrapper(
         model,
         tokenizer,
@@ -195,7 +198,7 @@ def eval(
 
     if "hendrycks_test" in tasks:
         tasks.remove("hendrycks_test")
-        tasks += [x for x in lm_eval.tasks.hendrycks_test.create_all_tasks().keys()]
+        tasks += list(lm_eval.tasks.hendrycks_test.create_all_tasks().keys())
     task_dict = get_task_dict(tasks)
 
     eval_results = evaluate(
diff --git a/export_aoti.py b/export_aoti.py
index b9a59c3bb..2fa58369d 100644
--- a/export_aoti.py
+++ b/export_aoti.py
@@ -4,20 +4,11 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-import itertools
-import sys
-import time
-from pathlib import Path
-from typing import Optional, Tuple
 
 import torch
 import torch.nn as nn
-from build.model import Transformer
-
-from generate import decode_one_token
-from quantize import quantize_model
-from torch.export import Dim, export
+from torch.export import Dim
 
 default_device = "cpu"  # 'cuda' if torch.cuda.is_available() else 'cpu'
diff --git a/export_et.py b/export_et.py
index eecba7ec7..16b3ad617 100644
--- a/export_et.py
+++ b/export_et.py
@@ -4,11 +4,8 @@
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
 
-import time
-from pathlib import Path
 
 import torch
-import torch.nn as nn
 
 from build.model import Transformer
 from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
@@ -28,10 +25,8 @@
 # )
 from executorch_portable_utils import export_to_edge
 
-from generate import decode_one_token
-from quantize import get_precision, name_to_dtype, quantize_model, set_precision
+from quantize import get_precision
 from torch._export import capture_pre_autograd_graph
-from torch.export import Dim, export
 
 default_device = "cpu"  # 'cuda' if torch.cuda.is_available() else 'cpu'
diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py
index 1d4300f6e..b5d6d7ba2 100644
--- a/scripts/convert_hf_checkpoint.py
+++ b/scripts/convert_hf_checkpoint.py
@@ -22,11 +22,11 @@
 
 @torch.inference_mode()
 def convert_hf_checkpoint(
     *,
-    checkpoint_dir: Path = Path(
-        "checkpoints/meta-Transformer/Transformer-2-7b-chat-hf"
-    ),
+    checkpoint_dir: Optional[Path] = None,
     model_name: Optional[str] = None,
 ) -> None:
+    if checkpoint_dir is None:
+        checkpoint_dir = Path("checkpoints/meta-Transformer/Transformer-2-7b-chat-hf")
     if model_name is None:
         model_name = checkpoint_dir.name