Skip to content

Commit

Permalink
btrfs: fix deadlock when defragging transparent huge pages
Browse files Browse the repository at this point in the history
Attempting to defragment a Btrfs file containing a transparent huge page
immediately deadlocks with the following stack trace:

  #0  context_switch (kernel/sched/core.c:4940:2)
  #1  __schedule (kernel/sched/core.c:6287:8)
  #2  schedule (kernel/sched/core.c:6366:3)
  #3  io_schedule (kernel/sched/core.c:8389:2)
  #4  wait_on_page_bit_common (mm/filemap.c:1356:4)
  #5  __lock_page (mm/filemap.c:1648:2)
  #6  lock_page (./include/linux/pagemap.h:625:3)
  #7  pagecache_get_page (mm/filemap.c:1910:4)
  #8  find_or_create_page (./include/linux/pagemap.h:420:9)
  grate-driver#9  defrag_prepare_one_page (fs/btrfs/ioctl.c:1068:9)
  grate-driver#10 defrag_one_range (fs/btrfs/ioctl.c:1326:14)
  grate-driver#11 defrag_one_cluster (fs/btrfs/ioctl.c:1421:9)
  grate-driver#12 btrfs_defrag_file (fs/btrfs/ioctl.c:1523:9)
  grate-driver#13 btrfs_ioctl_defrag (fs/btrfs/ioctl.c:3117:9)
  grate-driver#14 btrfs_ioctl (fs/btrfs/ioctl.c:4872:10)
  grate-driver#15 vfs_ioctl (fs/ioctl.c:51:10)
  grate-driver#16 __do_sys_ioctl (fs/ioctl.c:874:11)
  grate-driver#17 __se_sys_ioctl (fs/ioctl.c:860:1)
  grate-driver#18 __x64_sys_ioctl (fs/ioctl.c:860:1)
  grate-driver#19 do_syscall_x64 (arch/x86/entry/common.c:50:14)
  grate-driver#20 do_syscall_64 (arch/x86/entry/common.c:80:7)
  grate-driver#21 entry_SYSCALL_64+0x7c/0x15b (arch/x86/entry/entry_64.S:113)

A huge page is represented by a compound page, which consists of a
struct page for each PAGE_SIZE page within the huge page. The first
struct page is the "head page", and the remaining are "tail pages".

Defragmentation attempts to lock each page in the range. However,
lock_page() on a tail page actually locks the corresponding head page.
So, if defragmentation tries to lock more than one struct page in a
compound page, it tries to lock the same head page twice and deadlocks
with itself.

Ideally, we should be able to defragment transparent huge pages.
However, THP for filesystems is currently read-only, so a lot of code is
not ready to use huge pages for I/O. For now, let's just return
ETXTBUSY.

This can be reproduced with the following on a kernel with
CONFIG_READ_ONLY_THP_FOR_FS=y:

  $ cat create_thp_file.c
  #include <fcntl.h>
  #include <stdbool.h>
  #include <stdio.h>
  #include <stdint.h>
  #include <stdlib.h>
  #include <unistd.h>
  #include <sys/mman.h>

  static const char zeroes[1024 * 1024];
  static const size_t FILE_SIZE = 2 * 1024 * 1024;

  int main(int argc, char **argv)
  {
          if (argc != 2) {
                  fprintf(stderr, "usage: %s PATH\n", argv[0]);
                  return EXIT_FAILURE;
          }
          int fd = creat(argv[1], 0777);
          if (fd == -1) {
                  perror("creat");
                  return EXIT_FAILURE;
          }
          size_t written = 0;
          while (written < FILE_SIZE) {
                  ssize_t ret = write(fd, zeroes,
                                      sizeof(zeroes) < FILE_SIZE - written ?
                                      sizeof(zeroes) : FILE_SIZE - written);
                  if (ret < 0) {
                          perror("write");
                          return EXIT_FAILURE;
                  }
                  written += ret;
          }
          close(fd);
          fd = open(argv[1], O_RDONLY);
          if (fd == -1) {
                  perror("open");
                  return EXIT_FAILURE;
          }

          /*
           * Reserve some address space so that we can align the file mapping to
           * the huge page size.
           */
          void *placeholder_map = mmap(NULL, FILE_SIZE * 2, PROT_NONE,
                                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
          if (placeholder_map == MAP_FAILED) {
                  perror("mmap (placeholder)");
                  return EXIT_FAILURE;
          }

          void *aligned_address =
                  (void *)(((uintptr_t)placeholder_map + FILE_SIZE - 1) & ~(FILE_SIZE - 1));

          void *map = mmap(aligned_address, FILE_SIZE, PROT_READ | PROT_EXEC,
                           MAP_SHARED | MAP_FIXED, fd, 0);
          if (map == MAP_FAILED) {
                  perror("mmap");
                  return EXIT_FAILURE;
          }
          if (madvise(map, FILE_SIZE, MADV_HUGEPAGE) < 0) {
                  perror("madvise");
                  return EXIT_FAILURE;
          }

          char *line = NULL;
          size_t line_capacity = 0;
          FILE *smaps_file = fopen("/proc/self/smaps", "r");
          if (!smaps_file) {
                  perror("fopen");
                  return EXIT_FAILURE;
          }
          for (;;) {
                  for (size_t off = 0; off < FILE_SIZE; off += 4096)
                          ((volatile char *)map)[off];

                  ssize_t ret;
                  bool this_mapping = false;
                  while ((ret = getline(&line, &line_capacity, smaps_file)) > 0) {
                          unsigned long start, end, huge;
                          if (sscanf(line, "%lx-%lx", &start, &end) == 2) {
                                  this_mapping = (start <= (uintptr_t)map &&
                                                  (uintptr_t)map < end);
                          } else if (this_mapping &&
                                     sscanf(line, "FilePmdMapped: %ld", &huge) == 1 &&
                                     huge > 0) {
                                  return EXIT_SUCCESS;
                          }
                  }

                  sleep(6);
                  rewind(smaps_file);
                  fflush(smaps_file);
          }
  }
  $ ./create_thp_file huge
  $ btrfs fi defrag -czstd ./huge

Reviewed-by: Josef Bacik <[email protected]>
Signed-off-by: Omar Sandoval <[email protected]>
Reviewed-by: David Sterba <[email protected]>
Signed-off-by: David Sterba <[email protected]>
  • Loading branch information
osandov authored and kdave committed Oct 26, 2021
1 parent 020e527 commit 24bcb45
Showing 1 changed file with 14 additions and 0 deletions.
14 changes: 14 additions & 0 deletions fs/btrfs/ioctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -1069,6 +1069,20 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
if (!page)
return ERR_PTR(-ENOMEM);

/*
* Since we can defragment files opened read-only, we can encounter
* transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
* can't do I/O using huge pages yet, so return an error for now.
* Filesystem transparent huge pages are typically only used for
* executables that explicitly enable them, so this isn't very
* restrictive.
*/
if (PageCompound(page)) {
unlock_page(page);
put_page(page);
return ERR_PTR(-ETXTBSY);
}

ret = set_page_extent_mapped(page);
if (ret < 0) {
unlock_page(page);
Expand Down

0 comments on commit 24bcb45

Please sign in to comment.