Skip to content

Commit

Permalink
Add Linux x86-64bits native method to retrieve the number of allocate…
Browse files Browse the repository at this point in the history
…d bytes on disk for a file (#80437)

This commit introduces a new native method for 64-bit Linux
platforms to retrieve the number of bytes allocated on disk for
a given (possibly sparse) file. It calls the native glibc function __xstat.

Follow up #79698
  • Loading branch information
tlrx authored Dec 6, 2021
1 parent b18f5fd commit 0c9ba49
Show file tree
Hide file tree
Showing 4 changed files with 228 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ private static Provider loadJnaProvider() {
Class.forName("com.sun.jna.Native");
if (Constants.WINDOWS) {
return WindowsFileSystemNatives.getInstance();
} else if (Constants.LINUX && Constants.JRE_IS_64BIT) {
return LinuxFileSystemNatives.getInstance();
}
} catch (ClassNotFoundException e) {
logger.warn("JNA not found. FileSystemNatives methods will be disabled.", e);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.common.filesystem;

import com.sun.jna.LastErrorException;
import com.sun.jna.Native;
import com.sun.jna.Platform;
import com.sun.jna.Structure;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.util.Constants;

import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Instant;
import java.util.OptionalLong;

/**
* {@link FileSystemNatives.Provider} implementation for Linux x86-64bits
*/
final class LinuxFileSystemNatives implements FileSystemNatives.Provider {

    private static final Logger logger = LogManager.getLogger(LinuxFileSystemNatives.class);

    private static final LinuxFileSystemNatives INSTANCE = new LinuxFileSystemNatives();

    /** st_blocks field indicates the number of blocks allocated to the file, 512-byte units **/
    private static final long ST_BLOCKS_UNIT = 512L;

    /**
     * Version of the `struct stat' data structure.
     *
     * To allow the `struct stat' structure bits to vary without changing shared library major version number, the `stat' function is often
     * an inline wrapper around `xstat' which takes a leading version-number argument designating the data structure and bits used.
     *
     * In glibc this version is defined in bits/stat.h (or bits/struct_stat.h in glibc 2.33) as:
     * # define _STAT_VER_LINUX 1
     * # define _STAT_VER _STAT_VER_LINUX
     *
     * NOTE(review): this provider is selected for any 64-bit Linux JRE, while the value above is the one documented for x86;
     * confirm that it is also valid on other 64-bit architectures (e.g. aarch64).
     **/
    private static final int STAT_VER_LINUX = 1;
    private static final int STAT_VER = STAT_VER_LINUX;

    /**
     * Registers the native methods of {@link XStatLibrary} against the platform C library.
     *
     * @throws LinkageError if the C library cannot be linked; the error is logged and rethrown
     */
    private LinuxFileSystemNatives() {
        assert Constants.LINUX : Constants.OS_NAME;
        assert Constants.JRE_IS_64BIT : Constants.OS_ARCH;
        try {
            Native.register(XStatLibrary.class, Platform.C_LIBRARY_NAME);
            logger.debug("C library loaded");
        } catch (LinkageError e) {
            logger.warn("unable to link C library. native methods and handlers will be disabled.", e);
            throw e;
        }
    }

    /**
     * @return the singleton instance of this provider
     */
    static LinuxFileSystemNatives getInstance() {
        return INSTANCE;
    }

    /**
     * Holder of the native method bound to the C library by {@link Native#register(Class, String)}.
     */
    public static class XStatLibrary {
        /**
         * Native binding of {@code __xstat(int vers, const char *name, struct stat *buf)}.
         *
         * @param version the version of the stat structure, see {@link #STAT_VER}
         * @param path    the path of the file to stat
         * @param stats   the structure that receives the result
         * @return the return code of the native call
         * @throws LastErrorException if the native call reports an error
         */
        public static native int __xstat(int version, String path, Stat stats) throws LastErrorException;
    }

    /**
     * Retrieves the actual number of bytes of disk storage used to store a specified file.
     *
     * @param path the path to the file
     * @return an {@link OptionalLong} that contains the number of allocated bytes on disk for the file, or empty if the native call
     *         failed and the size could not be determined
     */
    @Override
    public OptionalLong allocatedSizeInBytes(Path path) {
        assert Files.isRegularFile(path) : path;
        try {
            final Stat stats = new Stat();
            final int rc = XStatLibrary.__xstat(STAT_VER, path.toString(), stats);
            if (logger.isTraceEnabled()) {
                // argument order follows the placeholders: the stat result first, then the return code, then the file
                logger.trace("executing native method __xstat() returned {} with error code [{}] for file [{}]", stats, rc, path);
            }
            if (rc != 0) {
                // defensive: do not read st_blocks from a structure that a failed call may not have populated
                logger.warn("native method __xstat() returned error code [{}] for file [{}]", rc, path);
                return OptionalLong.empty();
            }
            return OptionalLong.of(stats.st_blocks * ST_BLOCKS_UNIT);
        } catch (LastErrorException e) {
            logger.warn(
                () -> new ParameterizedMessage(
                    "error when executing native method __xstat(int vers, const char *name, struct stat *buf) for file [{}]",
                    path
                ),
                e
            );
        }
        return OptionalLong.empty();
    }

    /**
     * JNA mapping of the `struct stat' data structure populated by {@link XStatLibrary#__xstat(int, String, Stat)}.
     *
     * The stat structure varies across architectures in the glibc and kernel source codes. For example some fields might be ordered
     * differently and/or some padding bytes might be present between some fields.
     *
     * The struct implemented here refers to the Linux x86 architecture in the glibc source files:
     * - glibc version 2.23: sysdeps/unix/sysv/linux/x86/bits/stat.h
     * - glibc version 2.33: sysdeps/unix/sysv/linux/x86/bits/struct_stat.h
     *
     * The following command is useful to compile the stat struct on a given system:
     * echo "#include <sys/stat.h>" | gcc -xc - -E -dD | grep -ve '^$' | grep -A23 '^struct stat'
     */
    @Structure.FieldOrder(
        {
            "st_dev",
            "st_ino",
            "st_nlink",
            "st_mode",
            "st_uid",
            "st_gid",
            "__pad0",
            "st_rdev",
            "st_size",
            "st_blksize",
            "st_blocks",
            "st_atim",
            "st_mtim",
            "st_ctim",
            "__glibc_reserved0",
            "__glibc_reserved1",
            "__glibc_reserved2" }
    )
    public static class Stat extends Structure {

        public long st_dev; // __dev_t st_dev; /* Device. */
        public long st_ino; // __ino_t st_ino; /* File serial number. */
        public long st_nlink; // __nlink_t st_nlink; /* Link count. */
        public int st_mode; // __mode_t st_mode; /* File mode. */
        public int st_uid; // __uid_t st_uid; /* User ID of the file's owner. */
        public int st_gid; // __gid_t st_gid; /* Group ID of the file's group. */
        public int __pad0;
        public long st_rdev; // __dev_t st_rdev; /* Device number, if device. */
        public long st_size; // __off_t st_size; /* Size of file, in bytes. */
        public long st_blksize; // __blksize_t st_blksize; /* Optimal block size for I/O. */
        public long st_blocks; // __blkcnt_t st_blocks; /* Number 512-byte blocks allocated. */
        public Time st_atim; // struct timespec st_atim; /* Time of last access. */
        public Time st_mtim; // struct timespec st_mtim; /* Time of last modification. */
        public Time st_ctim; // struct timespec st_ctim; /* Time of last status change. */
        public long __glibc_reserved0; // __syscall_slong_t
        public long __glibc_reserved1; // __syscall_slong_t
        public long __glibc_reserved2; // __syscall_slong_t

        /**
         * @return a textual representation of the fields; padding and reserved fields are omitted and the timestamps are
         *         rendered as {@link Instant}s
         */
        @Override
        public String toString() {
            return "[st_dev="
                + st_dev
                + ", st_ino="
                + st_ino
                + ", st_nlink="
                + st_nlink
                + ", st_mode="
                + st_mode
                + ", st_uid="
                + st_uid
                + ", st_gid="
                + st_gid
                + ", st_rdev="
                + st_rdev
                + ", st_size="
                + st_size
                + ", st_blksize="
                + st_blksize
                + ", st_blocks="
                + st_blocks
                + ", st_atim="
                + Instant.ofEpochSecond(st_atim.tv_sec, st_atim.tv_nsec)
                + ", st_mtim="
                + Instant.ofEpochSecond(st_mtim.tv_sec, st_mtim.tv_nsec)
                + ", st_ctim="
                + Instant.ofEpochSecond(st_ctim.tv_sec, st_ctim.tv_nsec)
                + ']';
        }
    }

    /**
     * JNA mapping of the `struct timespec' data structure used for the timestamps of {@link Stat}.
     */
    @Structure.FieldOrder({ "tv_sec", "tv_nsec" })
    public static class Time extends Structure {
        public long tv_sec; // seconds, used as the epoch-second part when rendered as an Instant
        public long tv_nsec; // nanoseconds adjustment
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import org.apache.logging.log4j.Logger;
import org.apache.lucene.util.LuceneTestCase;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.filesystem.FileSystemNatives;
import org.elasticsearch.common.io.FileSystemUtils;
import org.elasticsearch.common.network.IfConfig;
import org.elasticsearch.common.settings.Settings;
Expand Down Expand Up @@ -81,6 +82,9 @@ public class BootstrapForTesting {
final boolean systemCallFilter = Booleans.parseBoolean(System.getProperty("tests.system_call_filter", "true"));
Bootstrap.initializeNatives(javaTmpDir, memoryLock, systemCallFilter, true);

// init filesystem natives
FileSystemNatives.init();

// initialize probes
Bootstrap.initializeProbes();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,10 @@
import java.util.concurrent.Future;

import static org.elasticsearch.xpack.searchablesnapshots.cache.common.TestUtils.randomPopulateAndReads;
import static org.hamcrest.Matchers.allOf;
import static org.hamcrest.Matchers.containsString;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.greaterThanOrEqualTo;
import static org.hamcrest.Matchers.hasSize;
import static org.hamcrest.Matchers.instanceOf;
Expand Down Expand Up @@ -389,14 +391,22 @@ public void testFSyncFailure() throws Exception {
}
}

/**
 * Test assumption: the current platform is either Windows or Linux running on a 64-bit JRE.
 */
private static void assumeLinux64bitsOrWindows() {
    // named separately to make the precedence of the original `||` / `&&` expression explicit
    final boolean linux64bits = Constants.LINUX && Constants.JRE_IS_64BIT;
    final boolean supportedPlatform = Constants.WINDOWS || linux64bits;
    assumeTrue("This test uses native methods implemented only for Windows & Linux 64bits", supportedPlatform);
}

public void testCacheFileCreatedAsSparseFile() throws Exception {
assumeTrue("This test uses a native method implemented only for Windows", Constants.WINDOWS);
assumeLinux64bitsOrWindows();
final long fourKb = 4096L;
final long oneMb = 1 << 20;

final Path file = createTempDir().resolve(UUIDs.randomBase64UUID(random()));
final CacheFile cacheFile = new CacheFile(
new CacheKey("_snap_uuid", "_snap_name", new ShardId("_name", "_uid", 0), "_filename"),
oneMb,
randomLongBetween(fourKb, oneMb),
file,
NOOP
);
Expand All @@ -420,7 +430,19 @@ public void testCacheFileCreatedAsSparseFile() throws Exception {

sizeOnDisk = FileSystemNatives.allocatedSizeInBytes(file);
assertTrue(sizeOnDisk.isPresent());
assertThat("Cache file should be sparse and not fully allocated on disk", sizeOnDisk.getAsLong(), lessThan(oneMb));
assertThat(
"Cache file should be sparse and not fully allocated on disk",
sizeOnDisk.getAsLong(),
allOf(greaterThan(0L), lessThan(oneMb))
);

final long blockSize;
if (Constants.LINUX) {
// on Linux we can infer the filesystem's block size if only 1 byte was written
blockSize = sizeOnDisk.getAsLong();
} else {
blockSize = 0L;
}

fill(fileChannel, 0, Math.toIntExact(cacheFile.getLength()));
fileChannel.force(false);
Expand All @@ -430,8 +452,17 @@ public void testCacheFileCreatedAsSparseFile() throws Exception {
assertThat(
"Cache file should be fully allocated on disk (maybe more given cluster/block size)",
sizeOnDisk.getAsLong(),
greaterThanOrEqualTo(oneMb)
greaterThanOrEqualTo(cacheFile.getLength())
);

if (Constants.LINUX) {
final long nbBlocks = (cacheFile.getLength() + blockSize - 1) / blockSize; // ceil(cacheFile.getLength() / blockSize)
assertThat(
"Cache file size mismatches (block size: " + blockSize + ", number of blocks: " + nbBlocks + ')',
sizeOnDisk.getAsLong(),
equalTo(nbBlocks * blockSize)
);
}
} finally {
cacheFile.release(listener);
}
Expand Down

0 comments on commit 0c9ba49

Please sign in to comment.