Fix handling of non-ASCII characters in archive entry file names

When creating a `PathFragment` from a ZIP or TAR entry file name, the raw bytes of the name are now wrapped into a Latin-1 encoded String, which is how Bazel internally represents file paths. Previously, ZIP entries as well as TAR entries with PAX headers would result in ordinary decoded Java strings, resulting in corrupted file names when passed to Bazel's file system operations. Fixes #12986 Fixes bazel-contrib/rules_go#2771 Closes #18448. PiperOrigin-RevId: 571857847 Change-Id: Ie578724e75ddbefbe05255601b0afab706835f89
bazelbuild · Oct 9, 2023 · 10169bb · 10169bb
1 parent 7a6df02
commit 10169bb
Showing 9 changed files with 304 additions and 63 deletions.
diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/BUILD b/src/main/java/com/google/devtools/build/lib/bazel/repository/BUILD
@@ -51,6 +51,7 @@ java_library(
         "//src/main/java/com/google/devtools/common/options",
         "//src/main/java/net/starlark/java/eval",
         "//third_party:apache_commons_compress",
+        "//third_party:auto_service",
         "//third_party:auto_value",
         "//third_party:flogger",
         "//third_party:guava",

diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/CompressedTarFunction.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/CompressedTarFunction.java
@@ -15,8 +15,10 @@
 package com.google.devtools.build.lib.bazel.repository;
 
 import static com.google.devtools.build.lib.bazel.repository.StripPrefixedPath.maybeDeprefixSymlink;
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
 
-import com.google.common.base.Optional;
+import com.google.auto.service.AutoService;
 import com.google.common.io.ByteStreams;
 import com.google.devtools.build.lib.bazel.repository.DecompressorValue.Decompressor;
 import com.google.devtools.build.lib.vfs.FileSystemUtils;
@@ -25,16 +27,33 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.spi.CharsetProvider;
+import java.util.Collections;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.Map;
+import java.util.Optional;
 import java.util.Set;
+import java.util.UUID;
 import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
 import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
 
 /**
  * Common code for unarchiving a compressed TAR file.
+ *
+ * <p>TAR file entries commonly use one of two formats: PAX, which uses UTF-8 encoding for all
+ * strings, and USTAR, which does not specify an encoding. This class interprets USTAR headers as
+ * latin-1, thus preserving the original bytes of the header without enforcing any particular
+ * encoding. Internally, for file system operations, all strings are converted into Bazel's internal
+ * representation of raw bytes stored as latin-1 strings.
  */
 public abstract class CompressedTarFunction implements Decompressor {
   protected abstract InputStream getDecompressorStream(DecompressorDescriptor descriptor)
@@ -54,20 +73,23 @@ public Path decompress(DecompressorDescriptor descriptor)
     Map<Path, PathFragment> symlinks = new HashMap<>();
 
     try (InputStream decompressorStream = getDecompressorStream(descriptor)) {
-      TarArchiveInputStream tarStream = new TarArchiveInputStream(decompressorStream);
+      // USTAR tar headers use an unspecified encoding whereas PAX tar headers always use UTF-8.
+      // We can specify the encoding to use for USTAR headers, but the Charset used for PAX headers
+      // is fixed to UTF-8. We thus specify a custom Charset for the former so that we can
+      // distinguish between the two.
+      TarArchiveInputStream tarStream =
+          new TarArchiveInputStream(decompressorStream, MarkedIso88591Charset.NAME);
       TarArchiveEntry entry;
       while ((entry = tarStream.getNextTarEntry()) != null) {
-        String entryName = entry.getName();
+        String entryName = toRawBytesString(entry.getName());
         entryName = renameFiles.getOrDefault(entryName, entryName);
-        StripPrefixedPath entryPath = StripPrefixedPath.maybeDeprefix(entryName, prefix);
+        StripPrefixedPath entryPath =
+            StripPrefixedPath.maybeDeprefix(entryName.getBytes(ISO_8859_1), prefix);
         foundPrefix = foundPrefix || entryPath.foundPrefix();
 
         if (prefix.isPresent() && !foundPrefix) {
-          Optional<String> suggestion =
-              CouldNotFindPrefixException.maybeMakePrefixSuggestion(entryPath.getPathFragment());
-          if (suggestion.isPresent()) {
-            availablePrefixes.add(suggestion.get());
-          }
+          CouldNotFindPrefixException.maybeMakePrefixSuggestion(entryPath.getPathFragment())
+              .ifPresent(availablePrefixes::add);
         }
 
         if (entryPath.skip()) {
@@ -80,8 +102,11 @@ public Path decompress(DecompressorDescriptor descriptor)
           filePath.createDirectoryAndParents();
         } else {
           if (entry.isSymbolicLink() || entry.isLink()) {
-            PathFragment targetName = PathFragment.create(entry.getLinkName());
-            targetName = maybeDeprefixSymlink(targetName, prefix, descriptor.destinationPath());
+            PathFragment targetName =
+                maybeDeprefixSymlink(
+                    toRawBytesString(entry.getLinkName()).getBytes(ISO_8859_1),
+                    prefix,
+                    descriptor.destinationPath());
             if (entry.isSymbolicLink()) {
               symlinks.put(filePath, targetName);
             } else {
@@ -135,4 +160,100 @@ public Path decompress(DecompressorDescriptor descriptor)
 
     return descriptor.destinationPath();
   }
+
+  /**
+   * Returns a string that contains the raw bytes of the given string encoded in ISO-8859-1,
+   * assuming that the given string was encoded with either UTF-8 or the special {@link
+   * MarkedIso88591Charset}.
+   */
+  private static String toRawBytesString(String name) {
+    // Marked strings are already encoded in ISO-8859-1. Other strings originate from PAX headers
+    // and are thus encoded in UTF-8, which we decode to the raw bytes and then re-encode trivially
+    // in ISO-8859-1.
+    return MarkedIso88591Charset.getRawBytesStringIfMarked(name)
+        .orElseGet(() -> new String(name.getBytes(UTF_8), ISO_8859_1));
+  }
+
+  /** A provider of {@link MarkedIso88591Charset}s. */
+  @AutoService(CharsetProvider.class)
+  public static class MarkedIso88591CharsetProvider extends CharsetProvider {
+    private static final Charset CHARSET = new MarkedIso88591Charset();
+
+    @Override
+    public Iterator<Charset> charsets() {
+      // This charset is only meant for internal use within CompressedTarFunction and thus should
+      // not be discoverable.
+      return Collections.emptyIterator();
+    }
+
+    @Override
+    public Charset charsetForName(String charsetName) {
+      return MarkedIso88591Charset.NAME.equals(charsetName) ? CHARSET : null;
+    }
+  }
+
+  /**
+   * A charset that decodes ISO-8859-1, i.e., produces a String that contains the raw decoded bytes,
+   * and appends a marker to the end of the string to indicate that it was decoded with this
+   * charset.
+   */
+  private static class MarkedIso88591Charset extends Charset {
+    // The name
+    // * must not collide with the name of any other charset.
+    // * must not appear in archive entry names by chance.
+    // * is internal to CompressedTarFunction.
+    // This is best served by a cryptographically random UUID, generated at startup.
+    private static final String NAME = UUID.randomUUID().toString();
+
+    private MarkedIso88591Charset() {
+      super(NAME, new String[0]);
+    }
+
+    public static Optional<String> getRawBytesStringIfMarked(String s) {
+      // Check for the marker in all positions as TarArchiveInputStream manipulates the raw name in
+      // certain cases (for example, appending a '/' to directory names).
+      if (s.contains(NAME)) {
+        return Optional.of(s.replaceAll(NAME, ""));
+      }
+      return Optional.empty();
+    }
+
+    @Override
+    public CharsetDecoder newDecoder() {
+      return new CharsetDecoder(this, 1, 1) {
+        @Override
+        protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
+          // A simple unoptimized ISO-8859-1 decoder.
+          while (in.hasRemaining()) {
+            if (!out.hasRemaining()) {
+              return CoderResult.OVERFLOW;
+            }
+            out.put((char) (in.get() & 0xFF));
+          }
+          return CoderResult.UNDERFLOW;
+        }
+
+        @Override
+        protected CoderResult implFlush(CharBuffer out) {
+          // Append the marker to the end of the buffer to indicate that it was decoded with this
+          // charset.
+          if (out.remaining() < NAME.length()) {
+            return CoderResult.OVERFLOW;
+          }
+          out.put(NAME);
+          return CoderResult.UNDERFLOW;
+        }
+      };
+    }
+
+    @Override
+    public CharsetEncoder newEncoder() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean contains(Charset cs) {
+      return false;
+    }
+  }
 }
diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorDescriptor.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorDescriptor.java
@@ -15,10 +15,10 @@
 package com.google.devtools.build.lib.bazel.repository;
 
 import com.google.auto.value.AutoValue;
-import com.google.common.base.Optional;
 import com.google.common.collect.ImmutableMap;
 import com.google.devtools.build.lib.vfs.Path;
 import java.util.Map;
+import java.util.Optional;
 
 /** Description of an archive to be decompressed. */
 @AutoValue

diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorValue.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorValue.java
@@ -14,14 +14,17 @@
 
 package com.google.devtools.build.lib.bazel.repository;
 
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Optional;
 import com.google.devtools.build.lib.rules.repository.RepositoryFunction.RepositoryFunctionException;
 import com.google.devtools.build.lib.vfs.Path;
 import com.google.devtools.build.lib.vfs.PathFragment;
 import com.google.devtools.build.skyframe.SkyFunctionException.Transience;
 import com.google.devtools.build.skyframe.SkyValue;
 import java.io.IOException;
+import java.util.Optional;
 import java.util.Set;
 import net.starlark.java.eval.Starlark;
 
@@ -59,9 +62,14 @@ private static String prepareErrorMessage(String prefix, Set<String> availablePr
       }
 
       public static Optional<String> maybeMakePrefixSuggestion(PathFragment pathFragment) {
-        return pathFragment.isMultiSegment()
-            ? Optional.of(pathFragment.getSegment(0))
-            : Optional.absent();
+        if (!pathFragment.isMultiSegment()) {
+          return Optional.empty();
+        }
+        String rawFirstSegment = pathFragment.getSegment(0);
+        // Users can only specify prefixes from Starlark, which is planned to use UTF-8 for all
+        // strings, but currently still collects the raw bytes in a latin-1 string. We thus
+        // optimistically decode the raw bytes with UTF-8 here for display purposes.
+        return Optional.of(new String(rawFirstSegment.getBytes(ISO_8859_1), UTF_8));
       }
     }
 

diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/StripPrefixedPath.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/StripPrefixedPath.java
@@ -14,11 +14,13 @@
 
 package com.google.devtools.build.lib.bazel.repository;
 
-import com.google.common.base.Optional;
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
 import com.google.common.base.Preconditions;
 import com.google.devtools.build.lib.concurrent.ThreadSafety;
 import com.google.devtools.build.lib.vfs.Path;
 import com.google.devtools.build.lib.vfs.PathFragment;
+import java.util.Optional;
 
 /**
  * Utility class for removing a prefix from an archive's path.
@@ -36,17 +38,19 @@ public final class StripPrefixedPath {
    * could cause collisions, if a zip file had one entry for bin/some-binary and another entry for
    * /bin/some-binary.
    *
-   * Note that the prefix is stripped to move the files up one level, so if you have an entry
+   * <p>Note that the prefix is stripped to move the files up one level, so if you have an entry
    * "foo/../bar" and a prefix of "foo", the result will be "bar" not "../bar".
    */
-  public static StripPrefixedPath maybeDeprefix(String entry, Optional<String> prefix) {
+  public static StripPrefixedPath maybeDeprefix(byte[] entry, Optional<String> prefix) {
     Preconditions.checkNotNull(entry);
     PathFragment entryPath = relativize(entry);
-    if (!prefix.isPresent()) {
+    if (prefix.isEmpty()) {
       return new StripPrefixedPath(entryPath, false, false);
     }
 
-    PathFragment prefixPath = relativize(prefix.get());
+    // Bazel parses Starlark files, which are the ultimate source of prefixes, as Latin-1
+    // (ISO-8859-1).
+    PathFragment prefixPath = relativize(prefix.get().getBytes(ISO_8859_1));
     boolean found = false;
     boolean skip = false;
     if (entryPath.startsWith(prefixPath)) {
@@ -64,8 +68,8 @@ public static StripPrefixedPath maybeDeprefix(String entry, Optional<String> pre
   /**
    * Normalize the path and, if it is absolute, make it relative (e.g., /foo/bar becomes foo/bar).
    */
-  private static PathFragment relativize(String path) {
-    PathFragment entryPath = PathFragment.create(path);
+  private static PathFragment relativize(byte[] path) {
+    PathFragment entryPath = createPathFragment(path);
     if (entryPath.isAbsolute()) {
       entryPath = entryPath.toRelative();
     }
@@ -79,10 +83,10 @@ private StripPrefixedPath(PathFragment pathFragment, boolean found, boolean skip
   }
 
   public static PathFragment maybeDeprefixSymlink(
-      PathFragment linkPathFragment, Optional<String> prefix, Path root) {
-    boolean wasAbsolute = linkPathFragment.isAbsolute();
+      byte[] rawTarget, Optional<String> prefix, Path root) {
+    boolean wasAbsolute = createPathFragment(rawTarget).isAbsolute();
     // Strip the prefix from the link path if set.
-    linkPathFragment = maybeDeprefix(linkPathFragment.getPathString(), prefix).getPathFragment();
+    PathFragment linkPathFragment = maybeDeprefix(rawTarget, prefix).getPathFragment();
     if (wasAbsolute) {
       // Recover the path to an absolute path as maybeDeprefix() relativize the path
       // even if the prefix is not set
@@ -103,4 +107,10 @@ public boolean skip() {
     return skip;
   }
 
+  static PathFragment createPathFragment(byte[] rawBytes) {
+    // Bazel internally represents paths as raw bytes by using the Latin-1 encoding, which has the
+    // property that (new String(bytes, ISO_8859_1)).getBytes(ISO_8859_1)) equals bytes for every
+    // byte array bytes.
+    return PathFragment.create(new String(rawBytes, ISO_8859_1));
+  }
 }
diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/ZipDecompressor.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/ZipDecompressor.java
@@ -15,9 +15,9 @@
 package com.google.devtools.build.lib.bazel.repository;
 
 import static com.google.devtools.build.lib.bazel.repository.StripPrefixedPath.maybeDeprefixSymlink;
+import static java.nio.charset.StandardCharsets.UTF_8;
 
 import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Optional;
 import com.google.common.base.Preconditions;
 import com.google.common.io.ByteStreams;
 import com.google.devtools.build.lib.bazel.repository.DecompressorValue.Decompressor;
@@ -29,11 +29,11 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
-import java.nio.charset.Charset;
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
+import java.util.Optional;
 import java.util.Set;
 import javax.annotation.Nullable;
 
@@ -89,7 +89,8 @@ public Path decompress(DecompressorDescriptor descriptor)
       for (ZipFileEntry entry : entries) {
         String entryName = entry.getName();
         entryName = renameFiles.getOrDefault(entryName, entryName);
-        StripPrefixedPath entryPath = StripPrefixedPath.maybeDeprefix(entryName, prefix);
+        StripPrefixedPath entryPath =
+            StripPrefixedPath.maybeDeprefix(entryName.getBytes(UTF_8), prefix);
         foundPrefix = foundPrefix || entryPath.foundPrefix();
         if (entryPath.skip()) {
           continue;
@@ -102,12 +103,9 @@ public Path decompress(DecompressorDescriptor descriptor)
         Set<String> prefixes = new HashSet<>();
         for (ZipFileEntry entry : entries) {
           StripPrefixedPath entryPath =
-              StripPrefixedPath.maybeDeprefix(entry.getName(), Optional.absent());
-          Optional<String> suggestion =
-              CouldNotFindPrefixException.maybeMakePrefixSuggestion(entryPath.getPathFragment());
-          if (suggestion.isPresent()) {
-            prefixes.add(suggestion.get());
-          }
+              StripPrefixedPath.maybeDeprefix(entry.getName().getBytes(UTF_8), Optional.empty());
+          CouldNotFindPrefixException.maybeMakePrefixSuggestion(entryPath.getPathFragment())
+              .ifPresent(prefixes::add);
         }
         throw new CouldNotFindPrefixException(prefix.get(), prefixes);
       }
@@ -146,17 +144,22 @@ private static void extractZipEntry(
       // For symlinks, the "compressed data" is actually the target name.
       int read = reader.getInputStream(entry).read(buffer);
       Preconditions.checkState(read == buffer.length);
-      PathFragment target = PathFragment.create(new String(buffer, Charset.defaultCharset()));
+
+      PathFragment target = StripPrefixedPath.createPathFragment(buffer);
       if (target.containsUplevelReferences()) {
         PathFragment pointsTo = strippedRelativePath.getParentDirectory().getRelative(target);
         if (pointsTo.containsUplevelReferences()) {
-          throw new IOException("Zip entries cannot refer to files outside of their directory: "
-              + reader.getFilename() + " has a symlink " + strippedRelativePath + " pointing to "
-              + target);
+          throw new IOException(
+              "Zip entries cannot refer to files outside of their directory: "
+                  + reader.getFilename()
+                  + " has a symlink "
+                  + strippedRelativePath
+                  + " pointing to "
+                  + new String(buffer, UTF_8));
         }
       }
-      target = maybeDeprefixSymlink(target, prefix, destinationDirectory);
-      symlinks.put(outputPath, target);
+
+      symlinks.put(outputPath, maybeDeprefixSymlink(buffer, prefix, destinationDirectory));
     } else {
       try (InputStream input = reader.getInputStream(entry);
           OutputStream output = outputPath.getOutputStream()) {

diff --git a/src/test/java/com/google/devtools/build/lib/bazel/repository/StripPrefixedPathTest.java b/src/test/java/com/google/devtools/build/lib/bazel/repository/StripPrefixedPathTest.java
@@ -15,13 +15,14 @@
 package com.google.devtools.build.lib.bazel.repository;
 
 import static com.google.common.truth.Truth.assertThat;
+import static java.nio.charset.StandardCharsets.UTF_8;
 
-import com.google.common.base.Optional;
 import com.google.devtools.build.lib.clock.BlazeClock;
 import com.google.devtools.build.lib.util.OS;
 import com.google.devtools.build.lib.vfs.DigestHashFunction;
 import com.google.devtools.build.lib.vfs.PathFragment;
 import com.google.devtools.build.lib.vfs.inmemoryfs.InMemoryFileSystem;
+import java.util.Optional;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
@@ -33,31 +34,32 @@
 public class StripPrefixedPathTest {
   @Test
   public void testStrip() {
-    StripPrefixedPath result = StripPrefixedPath.maybeDeprefix("foo/bar", Optional.of("foo"));
+    StripPrefixedPath result =
+        StripPrefixedPath.maybeDeprefix("foo/bar".getBytes(UTF_8), Optional.of("foo"));
     assertThat(PathFragment.create("bar")).isEqualTo(result.getPathFragment());
     assertThat(result.foundPrefix()).isTrue();
     assertThat(result.skip()).isFalse();
 
-    result = StripPrefixedPath.maybeDeprefix("foo", Optional.of("foo"));
+    result = StripPrefixedPath.maybeDeprefix("foo".getBytes(UTF_8), Optional.of("foo"));
     assertThat(result.skip()).isTrue();
 
-    result = StripPrefixedPath.maybeDeprefix("bar/baz", Optional.of("foo"));
+    result = StripPrefixedPath.maybeDeprefix("bar/baz".getBytes(UTF_8), Optional.of("foo"));
     assertThat(result.foundPrefix()).isFalse();
 
-    result = StripPrefixedPath.maybeDeprefix("foof/bar", Optional.of("foo"));
+    result = StripPrefixedPath.maybeDeprefix("foof/bar".getBytes(UTF_8), Optional.of("foo"));
     assertThat(result.foundPrefix()).isFalse();
   }
 
   @Test
   public void testAbsolute() {
-    StripPrefixedPath result = StripPrefixedPath.maybeDeprefix(
-        "/foo/bar", Optional.<String>absent());
+    StripPrefixedPath result =
+        StripPrefixedPath.maybeDeprefix("/foo/bar".getBytes(UTF_8), Optional.empty());
     assertThat(result.getPathFragment()).isEqualTo(PathFragment.create("foo/bar"));
 
-    result = StripPrefixedPath.maybeDeprefix("///foo/bar/baz", Optional.<String>absent());
+    result = StripPrefixedPath.maybeDeprefix("///foo/bar/baz".getBytes(UTF_8), Optional.empty());
     assertThat(result.getPathFragment()).isEqualTo(PathFragment.create("foo/bar/baz"));
 
-    result = StripPrefixedPath.maybeDeprefix("/foo/bar/baz", Optional.of("/foo"));
+    result = StripPrefixedPath.maybeDeprefix("/foo/bar/baz".getBytes(UTF_8), Optional.of("/foo"));
     assertThat(result.getPathFragment()).isEqualTo(PathFragment.create("bar/baz"));
   }
 
@@ -66,21 +68,21 @@ public void testWindowsAbsolute() {
     if (OS.getCurrent() != OS.WINDOWS) {
       return;
     }
-    StripPrefixedPath result = StripPrefixedPath.maybeDeprefix(
-        "c:/foo/bar", Optional.<String>absent());
+    StripPrefixedPath result =
+        StripPrefixedPath.maybeDeprefix("c:/foo/bar".getBytes(UTF_8), Optional.empty());
     assertThat(result.getPathFragment()).isEqualTo(PathFragment.create("foo/bar"));
   }
 
   @Test
   public void testNormalize() {
-    StripPrefixedPath result = StripPrefixedPath.maybeDeprefix(
-        "../bar", Optional.<String>absent());
+    StripPrefixedPath result =
+        StripPrefixedPath.maybeDeprefix("../bar".getBytes(UTF_8), Optional.empty());
     assertThat(result.getPathFragment()).isEqualTo(PathFragment.create("../bar"));
 
-    result = StripPrefixedPath.maybeDeprefix("foo/../baz", Optional.<String>absent());
+    result = StripPrefixedPath.maybeDeprefix("foo/../baz".getBytes(UTF_8), Optional.empty());
     assertThat(result.getPathFragment()).isEqualTo(PathFragment.create("baz"));
 
-    result = StripPrefixedPath.maybeDeprefix("foo/../baz", Optional.of("foo"));
+    result = StripPrefixedPath.maybeDeprefix("foo/../baz".getBytes(UTF_8), Optional.of("foo"));
     assertThat(result.getPathFragment()).isEqualTo(PathFragment.create("baz"));
   }
 
@@ -91,23 +93,23 @@ public void testDeprefixSymlink() {
 
     PathFragment relativeNoPrefix =
         StripPrefixedPath.maybeDeprefixSymlink(
-            PathFragment.create("a/b"), Optional.absent(), fileSystem.getPath("/usr"));
+            "a/b".getBytes(UTF_8), Optional.empty(), fileSystem.getPath("/usr"));
     // there is no attempt to get absolute path for the relative symlinks target path
     assertThat(relativeNoPrefix).isEqualTo(PathFragment.create("a/b"));
 
     PathFragment absoluteNoPrefix =
         StripPrefixedPath.maybeDeprefixSymlink(
-            PathFragment.create("/a/b"), Optional.absent(), fileSystem.getPath("/usr"));
+            "/a/b".getBytes(UTF_8), Optional.empty(), fileSystem.getPath("/usr"));
     assertThat(absoluteNoPrefix).isEqualTo(PathFragment.create("/usr/a/b"));
 
     PathFragment absolutePrefix =
         StripPrefixedPath.maybeDeprefixSymlink(
-            PathFragment.create("/root/a/b"), Optional.of("root"), fileSystem.getPath("/usr"));
+            "/root/a/b".getBytes(UTF_8), Optional.of("root"), fileSystem.getPath("/usr"));
     assertThat(absolutePrefix).isEqualTo(PathFragment.create("/usr/a/b"));
 
     PathFragment relativePrefix =
         StripPrefixedPath.maybeDeprefixSymlink(
-            PathFragment.create("root/a/b"), Optional.of("root"), fileSystem.getPath("/usr"));
+            "root/a/b".getBytes(UTF_8), Optional.of("root"), fileSystem.getPath("/usr"));
     // there is no attempt to get absolute path for the relative symlinks target path
     assertThat(relativePrefix).isEqualTo(PathFragment.create("a/b"));
   }

diff --git a/src/test/shell/bazel/bazel_workspaces_test.sh b/src/test/shell/bazel/bazel_workspaces_test.sh
@@ -451,6 +451,102 @@ function test_extract_rename_files() {
   ensure_output_contains_exactly_once "external/repo/out_dir/renamed-A.txt" "Second file: A"
 }
 
+# Regression test for https://github.com/bazelbuild/bazel/issues/12986
+# Verifies that tar entries with PAX headers, which are always encoded in UTF-8, are extracted
+# correctly.
+function test_extract_pax_tar_non_ascii_utf8_file_names() {
+  local archive_tar="${TEST_TMPDIR}/pax.tar"
+
+  pushd "${TEST_TMPDIR}"
+  mkdir "Ä_pax_∅"
+  echo "bar" > "Ä_pax_∅/Ä_foo_∅.txt"
+  tar --format=pax -cvf pax.tar "Ä_pax_∅"
+  popd
+
+  set_workspace_command "
+  repository_ctx.extract('${archive_tar}', 'out_dir', 'Ä_pax_∅/')"
+
+  build_and_process_log --exclude_rule "repository @local_config_cc"
+
+  ensure_contains_exactly 'location: .*repos.bzl:3:25' 1
+  ensure_contains_atleast 'context: "repository @repo"' 2
+  ensure_contains_exactly 'extract_event' 1
+
+  ensure_output_contains_exactly_once "external/repo/out_dir/Ä_foo_∅.txt" "bar"
+}
+
+# Verifies that tar entries with USTAR headers, for which an encoding isn't specified, are extracted
+# correctly if that encoding happens to be UTF-8.
+function test_extract_ustar_tar_non_ascii_utf8_file_names() {
+  local archive_tar="${TEST_TMPDIR}/ustar.tar"
+
+  pushd "${TEST_TMPDIR}"
+  mkdir "Ä_ustar_∅"
+  echo "bar" > "Ä_ustar_∅/Ä_foo_∅.txt"
+  tar --format=ustar -cvf ustar.tar "Ä_ustar_∅"
+  popd
+
+  set_workspace_command "
+  repository_ctx.extract('${archive_tar}', 'out_dir', 'Ä_ustar_∅/')"
+
+  build_and_process_log --exclude_rule "repository @local_config_cc"
+
+  ensure_contains_exactly 'location: .*repos.bzl:3:25' 1
+  ensure_contains_atleast 'context: "repository @repo"' 2
+  ensure_contains_exactly 'extract_event' 1
+
+  ensure_output_contains_exactly_once "external/repo/out_dir/Ä_foo_∅.txt" "bar"
+}
+
+# Verifies that tar entries with USTAR headers, for which an encoding isn't specified, are extracted
+# correctly if that encoding is not UTF-8.
+function test_extract_ustar_tar_non_ascii_non_utf8_file_names() {
+  if is_darwin; then
+    echo "Skipping test on macOS due to lack of support for non-UTF-8 filenames"
+    return
+  fi
+
+  local archive_tar="${TEST_TMPDIR}/ustar.tar"
+
+  pushd "${TEST_TMPDIR}"
+  mkdir "Ä_ustar_latin1_∅"
+  echo "bar" > "$(echo -e 'Ä_ustar_latin1_∅/\xC4_foo_latin1_\xD6.txt')"
+  tar --format=ustar -cvf ustar.tar "Ä_ustar_latin1_∅"
+  popd
+
+  set_workspace_command "
+  repository_ctx.extract('${archive_tar}', 'out_dir', 'Ä_ustar_latin1_∅/')"
+
+  build_and_process_log --exclude_rule "repository @local_config_cc"
+
+  ensure_contains_exactly 'location: .*repos.bzl:3:25' 1
+  ensure_contains_atleast 'context: "repository @repo"' 2
+  ensure_contains_exactly 'extract_event' 1
+
+  ensure_output_contains_exactly_once "$(echo -e 'external/repo/out_dir/\xC4_foo_latin1_\xD6.txt')" "bar"
+}
+
+function test_extract_default_zip_non_ascii_utf8_file_names() {
+  local archive_tar="${TEST_TMPDIR}/default.zip"
+
+  pushd "${TEST_TMPDIR}"
+  mkdir "Ä_default_∅"
+  echo "bar" > "Ä_default_∅/Ä_foo_∅.txt"
+  zip default.zip -r "Ä_default_∅"
+  popd
+
+  set_workspace_command "
+  repository_ctx.extract('${archive_tar}', 'out_dir', 'Ä_default_∅/')"
+
+  build_and_process_log --exclude_rule "repository @local_config_cc"
+
+  ensure_contains_exactly 'location: .*repos.bzl:3:25' 1
+  ensure_contains_atleast 'context: "repository @repo"' 2
+  ensure_contains_exactly 'extract_event' 1
+
+  ensure_output_contains_exactly_once "external/repo/out_dir/Ä_foo_∅.txt" "bar"
+}
+
 function test_file() {
   set_workspace_command 'repository_ctx.file("filefile.sh", "echo filefile", True)'
 

diff --git a/src/test/shell/integration/minimal_jdk_test.sh b/src/test/shell/integration/minimal_jdk_test.sh
@@ -42,13 +42,13 @@ export BAZEL_SUFFIX="_jdk_minimal"
 source "$(rlocation "io_bazel/src/test/shell/integration_test_setup.sh")" \
   || { echo "integration_test_setup.sh not found!" >&2; exit 1; }
 
-# Bazel's install base is < 366MB with minimal JDK and > 366MB with an all
+# Bazel's install base is < 370MB with minimal JDK and > 370MB with an all
 # modules JDK.
-function test_size_less_than_366MB() {
+function test_size_less_than_370MB() {
   bazel info
   ib=$(bazel info install_base)
   size=$(du -s "$ib" | cut -d\	 -f1)
-  maxsize=$((1024*366))
+  maxsize=$((1024*370))
   if [ $size -gt $maxsize ]; then
     echo "$ib was too big:" 1>&2
     du -a "$ib" 1>&2