From 98a7cff0c5b9d8b43eafefda9b659ae85158ea75 Mon Sep 17 00:00:00 2001
From: Rui Ueyama <ruiu@cs.stanford.edu>
Date: Sat, 27 Jul 2024 09:08:36 +0900
Subject: [PATCH] Handle R_LARCH_RELAX

---
 elf/arch-loongarch.cc  | 46 ++++++++++++++++++++++++++++++++++++++++--
 elf/input-sections.cc  |  2 +-
 elf/main.cc            |  2 +-
 elf/mold.h             |  8 +++++++-
 elf/shrink-sections.cc |  3 ++-
 elf/thunks.cc          |  1 +
 6 files changed, 56 insertions(+), 6 deletions(-)
diff --git a/elf/arch-loongarch.cc b/elf/arch-loongarch.cc
index bdf66ef5b9..79e5c4ea8e 100644
--- a/elf/arch-loongarch.cc
+++ b/elf/arch-loongarch.cc
@@ -238,6 +238,10 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
     dynrel = (ElfRel<E> *)(ctx.buf + ctx.reldyn->shdr.sh_offset +
                            file.reldyn_offset + this->reldyn_offset);
 
+  auto get_r_delta = [&](i64 idx) {
+    return extra.r_deltas.empty() ? 0 : extra.r_deltas[idx];
+  };
+
   for (i64 i = 0; i < rels.size(); i++) {
     const ElfRel<E> &rel = rels[i];
 
@@ -247,7 +251,9 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
       continue;
 
     Symbol<E> &sym = *file.symbols[rel.r_sym];
-    u8 *loc = base + rel.r_offset;
+    i64 r_offset = rel.r_offset - get_r_delta(i);
+    [[maybe_unused]] i64 removed_bytes = get_r_delta(i + 1) - get_r_delta(i);
+    u8 *loc = base + r_offset;
 
     auto check = [&](i64 val, i64 lo, i64 hi) {
       if (val < lo || hi <= val)
@@ -280,7 +286,7 @@ void InputSection<E>::apply_reloc_alloc(Context<E> &ctx, u8 *base) {
 
     u64 S = sym.get_addr(ctx);
     u64 A = rel.r_addend;
-    u64 P = get_addr() + rel.r_offset;
+    u64 P = get_addr() + r_offset;
     u64 G = get_got_idx() * sizeof(Word<E>);
     u64 GOT = ctx.got->shdr.sh_addr;
 
@@ -663,6 +669,42 @@ void InputSection<E>::scan_relocations(Context<E> &ctx) {
   }
 }
 
+template <>  // Specialization in arch-loongarch.cc: fills isec.extra.r_deltas and shrinks isec.sh_size.
+void shrink_section(Context<E> &ctx, InputSection<E> &isec, bool use_rvc) {  // use_rvc is not referenced in this body
+  std::span<const ElfRel<E>> rels = isec.get_rels(ctx);
+  isec.extra.r_deltas.resize(rels.size() + 1);  // one slot per relocation plus a final slot for the total
+  i64 delta = 0;  // running byte count; subtracted from later offsets and finally from sh_size
+
+  for (i64 i = 0; i < rels.size(); i++) {
+    const ElfRel<E> &r = rels[i];
+    isec.extra.r_deltas[i] = delta;  // delta accumulated before relocation i is processed
+
+    if (r.r_type == R_LARCH_ALIGN) {  // the only relocation type this loop acts on
+      i64 nop_size;  // byte length attributed to the nop run at this relocation
+      if (r.r_sym) {  // nonzero r_sym: the addend's low 8 bits are used as log2 of the alignment
+        if (r.r_addend & ~0xff)  // any addend bit above the low 8 is rejected as unsupported
+          Fatal(ctx) << isec << ": ternary R_LARCH_ALIGN is not supported: " << i;
+        nop_size = (1 << (r.r_addend & 0xff)) - 4;  // i.e. alignment minus 4
+      } else {  // zero r_sym: the addend is taken directly as nop_size
+        nop_size = r.r_addend;
+      }
+
+      u64 loc = isec.get_addr() + r.r_offset - delta;  // relocation address after earlier removals
+      u64 next_loc = loc + nop_size;  // address just past the nop run
+      u64 alignment = nop_size + 4;  // must come out as a power of two (checked below)
+
+      if (!has_single_bit(alignment))  // reject nop runs that do not yield a power-of-two alignment
+        Fatal(ctx) << isec << ": R_LARCH_ALIGN: invalid nop sequence: " << i;
+
+      delta += next_loc - align_to(loc, alignment);  // distance from the aligned address to the end of the run
+      continue;  // nothing else follows in the loop body
+    }
+  }
+
+  isec.extra.r_deltas[rels.size()] = delta;  // final slot holds the section-wide total
+  isec.sh_size -= delta;  // section size is reduced by the total
+}
+
 template <>
 void Thunk<E>::copy_buf(Context<E> &ctx) {
   constexpr ul32 insn[] = {
diff --git a/elf/input-sections.cc b/elf/input-sections.cc
index 51c04e69c9..b8359f81ba 100644
--- a/elf/input-sections.cc
+++ b/elf/input-sections.cc
@@ -457,7 +457,7 @@ void InputSection<E>::write_to(Context<E> &ctx, u8 *buf) {
   // an atomic unit of copying because of relaxation. That is, some
   // relocations are allowed to remove bytes from the middle of a
   // section and shrink the overall size of it.
-  if constexpr (is_riscv<E>) {
+  if constexpr (is_riscv<E> || is_loongarch<E>) {
     if (extra.r_deltas.empty()) {
       // If a section is not relaxed, we can copy it as a one big chunk.
       copy_contents(ctx, buf);
diff --git a/elf/main.cc b/elf/main.cc
index e8f247b9c5..012e68db91 100644
--- a/elf/main.cc
+++ b/elf/main.cc
@@ -618,7 +618,7 @@ int elf_main(int argc, char **argv) {
   // that they can jump to anywhere in ±2 GiB by default. They may
   // be replaced with shorter instruction sequences if destinations
   // are close enough. Do this optimization.
-  if constexpr (is_riscv<E>)
+  if constexpr (is_riscv<E> || is_loongarch<E>)
     filesize = shrink_sections(ctx);
 
   // At this point, memory layout is fixed.
diff --git a/elf/mold.h b/elf/mold.h
index 28f14144aa..4dc374fd2d 100644
--- a/elf/mold.h
+++ b/elf/mold.h
@@ -232,7 +232,7 @@ struct FdeRecord {
 template <typename E>
 struct InputSectionExtras {};
 
-template <needs_thunk E>
+template <typename E> requires (needs_thunk<E> && !is_loongarch<E>)
 struct InputSectionExtras<E> {
   std::vector<ThunkRef> thunk_refs;
 };
@@ -242,6 +242,12 @@ struct InputSectionExtras<E> {
   std::vector<i32> r_deltas;
 };
 
+template <is_loongarch E>  // Partial specialization chosen when E satisfies is_loongarch
+struct InputSectionExtras<E> {  // carries both a thunk_refs and an r_deltas member
+  std::vector<ThunkRef> thunk_refs;  // same member type and name as in the needs_thunk specialization
+  std::vector<i32> r_deltas;  // sized and filled by shrink_section; readers treat an empty vector as all-zero deltas
+};
+
 // InputSection represents a section in an input object file.
 template <typename E>
 class __attribute__((aligned(4))) InputSection {
diff --git a/elf/shrink-sections.cc b/elf/shrink-sections.cc
index 5abd5f2b9e..2f483484ef 100644
--- a/elf/shrink-sections.cc
+++ b/elf/shrink-sections.cc
@@ -1,6 +1,7 @@
 // Shrink sections by interpreting relocations.
 
-#if MOLD_RV64LE || MOLD_RV64BE || MOLD_RV32LE || MOLD_RV32BE
+#if MOLD_RV64LE || MOLD_RV64BE || MOLD_RV32LE || MOLD_RV32BE || \
+    MOLD_LOONGARCH64 || MOLD_LOONGARCH32
 
 #include "mold.h"
 
diff --git a/elf/thunks.cc b/elf/thunks.cc
index 7e0880d59b..0601ba80b8 100644
--- a/elf/thunks.cc
+++ b/elf/thunks.cc
@@ -183,6 +183,7 @@ void OutputSection<E>::create_range_extension_thunks(Context<E> &ctx) {
   // haven't.
   for (InputSection<E> *isec : m)
     isec->offset = -1;
+  thunks.clear();
 
   // We create thunks from the beginning of the section to the end.
   // We manage progress using four offsets which increase monotonically.