From 8b9f099df69b2d98cf3589ca7b596086fc1e5e30 Mon Sep 17 00:00:00 2001 From: Bryan Beaudreault Date: Tue, 6 Sep 2022 15:04:32 -0400 Subject: [PATCH] HBASE-27224 HFile tool statistic sampling produces misleading results (#4638) Signed-off-by: Duo Zhang Reviewed-by: Clay Baenziger (cherry picked from commit 6f6857b83fbf90e77d58642da38c8bc878ccdf2f) Change-Id: Ibf797084adc46339b1c8d856f2359914381c277e --- .../hbase/io/hfile/HFilePrettyPrinter.java | 240 +++++++++++++----- .../io/hfile/TestHFilePrettyPrinter.java | 56 ++++ 2 files changed, 226 insertions(+), 70 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePrettyPrinter.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePrettyPrinter.java index deccf6da891a..2ed917b53e1f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePrettyPrinter.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/hfile/HFilePrettyPrinter.java @@ -20,21 +20,17 @@ import static com.codahale.metrics.MetricRegistry.name; import com.codahale.metrics.ConsoleReporter; -import com.codahale.metrics.Counter; -import com.codahale.metrics.Gauge; import com.codahale.metrics.Histogram; -import com.codahale.metrics.Meter; -import com.codahale.metrics.MetricFilter; import com.codahale.metrics.MetricRegistry; -import com.codahale.metrics.ScheduledReporter; import com.codahale.metrics.Snapshot; -import com.codahale.metrics.Timer; +import com.codahale.metrics.UniformReservoir; import java.io.ByteArrayOutputStream; import java.io.DataInput; import java.io.IOException; import java.io.PrintStream; import java.text.DateFormat; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashSet; @@ -43,10 +39,8 @@ import java.util.Map; import java.util.Optional; import java.util.Set; -import java.util.SortedMap; import java.util.TimeZone; -import java.util.concurrent.TimeUnit; -import org.apache.commons.lang3.StringUtils; +import java.util.concurrent.atomic.LongAdder; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; @@ -107,6 +101,7 @@ public class HFilePrettyPrinter extends Configured implements Tool { private boolean printBlockIndex; private boolean printBlockHeaders; private boolean printStats; + private boolean printStatRanges; private boolean checkRow; private boolean checkFamily; private boolean isSeekToRow = false; @@ -150,6 +145,8 @@ private void init() { options.addOption("w", "seekToRow", true, "Seek to this row and print all the kvs for this row only"); options.addOption("s", "stats", false, "Print statistics"); + options.addOption("d", "details", false, + "Print detailed statistics, including counts by range"); options.addOption("i", "checkMobIntegrity", false, "Print all cells whose mob files are missing"); @@ -181,7 +178,8 @@ public boolean parseOptions(String args[]) throws ParseException, IOException { shouldPrintMeta = cmd.hasOption("m"); printBlockIndex = cmd.hasOption("b"); printBlockHeaders = cmd.hasOption("h"); - printStats = cmd.hasOption("s"); + printStatRanges = cmd.hasOption("d"); + printStats = cmd.hasOption("s") || printStatRanges; checkRow = cmd.hasOption("k"); checkFamily = cmd.hasOption("a"); checkMobIntegrity = cmd.hasOption("i"); @@ -356,7 +354,7 @@ public int processFile(Path file, boolean checkRootDir) throws IOException { } if (printStats) { - fileStats.finish(); + fileStats.finish(printStatRanges); out.println("Stats:\n" + fileStats); } @@ -382,7 +380,7 @@ private void scanKeysValues(Path file, KeyValueStatsCollector fileStats, HFileSc } // collect stats if (printStats) { - fileStats.collect(cell); + fileStats.collect(cell, printStatRanges); } // dump key value if (printKey) { @@ -581,18 +579,101 @@ private void printMeta(HFile.Reader reader, Map fileInfo) throws } } + // Default reservoir is exponentially decaying, but we're doing a point-in-time analysis + // of a store file. It doesn't make sense to prefer keys later in the store file. + private static final MetricRegistry.MetricSupplier UNIFORM_RESERVOIR = + () -> new Histogram(new UniformReservoir()); + + // Useful ranges for viewing distribution of small to large keys, values, and rows. + // we only print ranges which actually have values, so more here doesn't add much overhead + private static final long[] RANGES = new long[] { 1, 3, 10, 50, 100, 500, 1_000, 5_000, 10_000, + 50_000, 100_000, 500_000, 750_000, 1_000_000, 5_000_000, 10_000_000, 50_000_000, 100_000_000 }; + + /** + * Holds a Histogram and supporting min/max and range buckets for analyzing distribution of key + * bytes, value bytes, row bytes, and row columns. Supports adding values, getting the histogram, + * and getting counts per range. + */ + static class KeyValueStats { + private final Histogram histogram; + private final String name; + private long max = Long.MIN_VALUE; + private long min = Long.MAX_VALUE; + private boolean collectRanges = false; + private final LongAdder[] rangeCounts; + + KeyValueStats(MetricRegistry metricRegistry, String statName) { + this.histogram = + metricRegistry.histogram(name(HFilePrettyPrinter.class, statName), UNIFORM_RESERVOIR); + this.name = statName; + this.rangeCounts = new LongAdder[RANGES.length]; + for (int i = 0; i < rangeCounts.length; i++) { + rangeCounts[i] = new LongAdder(); + } + } + + void update(long value, boolean collectRanges) { + histogram.update(value); + min = Math.min(value, min); + max = Math.max(value, max); + + if (collectRanges) { + this.collectRanges = true; + int result = Arrays.binarySearch(RANGES, value); + int idx = result >= 0 ? result : Math.abs(result) - 1; + rangeCounts[idx].increment(); + } + } + + Histogram getHistogram() { + return histogram; + } + + String getName() { + return name; + } + + long getMax() { + return max; + } + + long getMin() { + return min; + } + + long[] getRanges() { + return RANGES; + } + + long getCountAtOrBelow(long range) { + long count = 0; + for (int i = 0; i < RANGES.length; i++) { + if (RANGES[i] <= range) { + count += rangeCounts[i].sum(); + } else { + break; + } + } + return count; + } + + boolean hasRangeCounts() { + return collectRanges; + } + } + private static class KeyValueStatsCollector { private final MetricRegistry metricsRegistry = new MetricRegistry(); private final ByteArrayOutputStream metricsOutput = new ByteArrayOutputStream(); - private final SimpleReporter simpleReporter = SimpleReporter.forRegistry(metricsRegistry) - .outputTo(new PrintStream(metricsOutput)).filter(MetricFilter.ALL).build(); - Histogram keyLen = metricsRegistry.histogram(name(HFilePrettyPrinter.class, "Key length")); - Histogram valLen = metricsRegistry.histogram(name(HFilePrettyPrinter.class, "Val length")); - Histogram rowSizeBytes = - metricsRegistry.histogram(name(HFilePrettyPrinter.class, "Row size (bytes)")); - Histogram rowSizeCols = - metricsRegistry.histogram(name(HFilePrettyPrinter.class, "Row size (columns)")); + KeyValueStats keyLen = new KeyValueStats(metricsRegistry, "Key length"); + KeyValueStats valLen = new KeyValueStats(metricsRegistry, "Val length"); + KeyValueStats rowSizeBytes = new KeyValueStats(metricsRegistry, "Row size (bytes)"); + KeyValueStats rowSizeCols = new KeyValueStats(metricsRegistry, "Row size (columns)"); + + private final SimpleReporter simpleReporter = + SimpleReporter.newBuilder().outputTo(new PrintStream(metricsOutput)).addStats(keyLen) + .addStats(valLen).addStats(rowSizeBytes).addStats(rowSizeCols).build(); long curRowBytes = 0; long curRowCols = 0; @@ -603,11 +684,11 @@ private static class KeyValueStatsCollector { private long maxRowBytes = 0; private long curRowKeyLength; - public void collect(Cell cell) { - valLen.update(cell.getValueLength()); + public void collect(Cell cell, boolean printStatRanges) { + valLen.update(cell.getValueLength(), printStatRanges); if (prevCell != null && CellComparator.getInstance().compareRows(prevCell, cell) != 0) { // new row - collectRow(); + collectRow(printStatRanges); } curRowBytes += cell.getSerializedSize(); curRowKeyLength = KeyValueUtil.keyLength(cell); @@ -615,10 +696,10 @@ public void collect(Cell cell) { prevCell = cell; } - private void collectRow() { - rowSizeBytes.update(curRowBytes); - rowSizeCols.update(curRowCols); - keyLen.update(curRowKeyLength); + private void collectRow(boolean printStatRanges) { + rowSizeBytes.update(curRowBytes, printStatRanges); + rowSizeCols.update(curRowCols, printStatRanges); + keyLen.update(curRowKeyLength, printStatRanges); if (curRowBytes > maxRowBytes && prevCell != null) { biggestRow = CellUtil.cloneRow(prevCell); @@ -629,9 +710,9 @@ private void collectRow() { curRowCols = 0; } - public void finish() { + public void finish(boolean printStatRanges) { if (curRowCols > 0) { - collectRow(); + collectRow(printStatRanges); } } @@ -640,7 +721,6 @@ public String toString() { if (prevCell == null) return "no data available for statistics"; // Dump the metrics to the output stream - simpleReporter.stop(); simpleReporter.report(); return metricsOutput.toString() + "Key of biggest row: " + Bytes.toStringBinary(biggestRow); @@ -648,41 +728,32 @@ public String toString() { } /** - * Almost identical to ConsoleReporter, but extending ScheduledReporter, as extending - * ConsoleReporter in this version of dropwizard is now too much trouble. + * Simple reporter which collects registered histograms for printing to an output stream in + * {@link #report()}. */ - private static class SimpleReporter extends ScheduledReporter { + private static final class SimpleReporter { /** - * Returns a new {@link Builder} for {@link ConsoleReporter}. - * @param registry the registry to report - * @return a {@link Builder} instance for a {@link ConsoleReporter} + * Returns a new {@link Builder} for {@link SimpleReporter}. + * @return a {@link Builder} instance for a {@link SimpleReporter} */ - public static Builder forRegistry(MetricRegistry registry) { - return new Builder(registry); + public static Builder newBuilder() { + return new Builder(); } /** * A builder for {@link SimpleReporter} instances. Defaults to using the default locale and time - * zone, writing to {@code System.out}, converting rates to events/second, converting durations - * to milliseconds, and not filtering metrics. + * zone, writing to {@code System.out}. */ public static class Builder { - private final MetricRegistry registry; + private final List stats = new ArrayList<>(); private PrintStream output; private Locale locale; private TimeZone timeZone; - private TimeUnit rateUnit; - private TimeUnit durationUnit; - private MetricFilter filter; - private Builder(MetricRegistry registry) { - this.registry = registry; + private Builder() { this.output = System.out; this.locale = Locale.getDefault(); this.timeZone = TimeZone.getDefault(); - this.rateUnit = TimeUnit.SECONDS; - this.durationUnit = TimeUnit.MILLISECONDS; - this.filter = MetricFilter.ALL; } /** @@ -696,12 +767,12 @@ public Builder outputTo(PrintStream output) { } /** - * Only report metrics which match the given filter. - * @param filter a {@link MetricFilter} + * Add the given {@link KeyValueStats} to be reported + * @param stat the stat to be reported * @return {@code this} */ - public Builder filter(MetricFilter filter) { - this.filter = filter; + public Builder addStats(KeyValueStats stat) { + this.stats.add(stat); return this; } @@ -710,35 +781,31 @@ public Builder filter(MetricFilter filter) { * @return a {@link ConsoleReporter} */ public SimpleReporter build() { - return new SimpleReporter(registry, output, locale, timeZone, rateUnit, durationUnit, - filter); + return new SimpleReporter(output, stats, locale, timeZone); } } private final PrintStream output; + private final List stats; private final Locale locale; private final DateFormat dateFormat; - private SimpleReporter(MetricRegistry registry, PrintStream output, Locale locale, - TimeZone timeZone, TimeUnit rateUnit, TimeUnit durationUnit, MetricFilter filter) { - super(registry, "simple-reporter", filter, rateUnit, durationUnit); + private SimpleReporter(PrintStream output, List stats, Locale locale, + TimeZone timeZone) { this.output = output; + this.stats = stats; this.locale = locale; - this.dateFormat = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.MEDIUM, locale); dateFormat.setTimeZone(timeZone); } - @Override - public void report(SortedMap gauges, SortedMap counters, - SortedMap histograms, SortedMap meters, - SortedMap timers) { + public void report() { // we know we only have histograms - if (!histograms.isEmpty()) { - for (Map.Entry entry : histograms.entrySet()) { - output.print(" " + StringUtils.substringAfterLast(entry.getKey(), ".")); + if (!stats.isEmpty()) { + for (KeyValueStats stat : stats) { + output.print(" " + stat.getName()); output.println(':'); - printHistogram(entry.getValue()); + printHistogram(stat); } output.println(); } @@ -747,10 +814,12 @@ public void report(SortedMap gauges, SortedMap c output.flush(); } - private void printHistogram(Histogram histogram) { + private void printHistogram(KeyValueStats stats) { + Histogram histogram = stats.getHistogram(); Snapshot snapshot = histogram.getSnapshot(); - output.printf(locale, " min = %d%n", snapshot.getMin()); - output.printf(locale, " max = %d%n", snapshot.getMax()); + + output.printf(locale, " min = %d%n", stats.getMin()); + output.printf(locale, " max = %d%n", stats.getMax()); output.printf(locale, " mean = %2.2f%n", snapshot.getMean()); output.printf(locale, " stddev = %2.2f%n", snapshot.getStdDev()); output.printf(locale, " median = %2.2f%n", snapshot.getMedian()); @@ -760,6 +829,37 @@ private void printHistogram(Histogram histogram) { output.printf(locale, " 99%% <= %2.2f%n", snapshot.get99thPercentile()); output.printf(locale, " 99.9%% <= %2.2f%n", snapshot.get999thPercentile()); output.printf(locale, " count = %d%n", histogram.getCount()); + + // if printStatRanges was enabled with -d arg, below we'll create an approximate histogram + // of counts based on the configured ranges in RANGES. Each range of sizes (i.e. <= 50, <= + // 100, etc) will have a count printed if any values were seen in that range. If no values + // were seen for a range, that range will be excluded to keep the output small. + if (stats.hasRangeCounts()) { + output.printf(locale, " (range <= count):%n"); + long lastVal = 0; + long lastRange = 0; + for (long range : stats.getRanges()) { + long val = stats.getCountAtOrBelow(range); + if (val - lastVal > 0) { + // print the last zero value before this one, to give context + if (lastVal == 0 && lastRange != 0) { + printRangeCount(lastRange, lastVal); + } + printRangeCount(range, val - lastVal); + } + lastVal = val; + lastRange = range; + } + if (histogram.getCount() - lastVal > 0) { + // print any remaining that might have been outside our buckets + printRangeCount(Long.MAX_VALUE, histogram.getCount() - lastVal); + } + } + } + + private void printRangeCount(long range, long countAtOrBelow) { + String rangeString = range == Long.MAX_VALUE ? "inf" : Long.toString(range); + output.printf(locale, "%17s <= %d%n", rangeString, countAtOrBelow); } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFilePrettyPrinter.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFilePrettyPrinter.java index 7a78981c6652..9ec5dc1c576f 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFilePrettyPrinter.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/io/hfile/TestHFilePrettyPrinter.java @@ -21,6 +21,7 @@ import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; +import com.codahale.metrics.MetricRegistry; import java.io.ByteArrayOutputStream; import java.io.PrintStream; import org.apache.hadoop.conf.Configuration; @@ -126,4 +127,59 @@ public void testHFilePrettyPrinterSeekFirstRow() throws Exception { String expectedResult = "Scanning -> " + fileNotInRootDir + "\n" + "Scanned kv count -> 1\n"; assertEquals(expectedResult, result); } + + @Test + public void testHistograms() throws Exception { + Path fileNotInRootDir = UTIL.getDataTestDir("hfile"); + TestHRegionServerBulkLoad.createHFile(fs, fileNotInRootDir, cf, fam, value, 1000); + assertNotEquals("directory used is not an HBase root dir", UTIL.getDefaultRootDirPath(), + fileNotInRootDir); + + System.setOut(ps); + new HFilePrettyPrinter(conf).run(new String[] { "-s", "-d", String.valueOf(fileNotInRootDir) }); + String result = stream.toString(); + LOG.info(result); + + // split out the output into sections based on the headers + String[] headers = + new String[] { "Key length", "Val length", "Row size (bytes)", "Row size (columns)" }; + // for each section, there is a corresponding expected (count, range) pairs + int[][] expectations = new int[][] { new int[] { 0, 10, 1000, 50 }, new int[] { 0, 1, 1000, 3 }, + new int[] { 0, 10, 1000, 50 }, new int[] { 1000, 1 }, }; + + for (int i = 0; i < headers.length - 1; i++) { + int idx = result.indexOf(headers[i]); + int nextIdx = result.indexOf(headers[i + 1]); + + assertContainsRanges(result.substring(idx, nextIdx), expectations[i]); + } + } + + private void assertContainsRanges(String result, int... rangeCountPairs) { + for (int i = 0; i < rangeCountPairs.length - 1; i += 2) { + String expected = rangeCountPairs[i + 1] + " <= " + rangeCountPairs[i]; + assertTrue("expected:\n" + result + "\nto contain: '" + expected + "'", + result.contains(expected)); + } + } + + @Test + public void testKeyValueStats() { + HFilePrettyPrinter.KeyValueStats stats = + new HFilePrettyPrinter.KeyValueStats(new MetricRegistry(), "test"); + long[] ranges = stats.getRanges(); + for (long range : ranges) { + stats.update(range - 1, true); + } + + assertEquals(ranges[ranges.length - 1] - 1, stats.getMax()); + assertEquals(ranges[0] - 1, stats.getMin()); + + int total = 1; + for (long range : ranges) { + long val = stats.getCountAtOrBelow(range); + assertEquals(total++, val); + } + + } }