Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Use random sampler for field statistics table in Discover and Data visualizer #138953

Closed
wants to merge 22 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,32 @@ export interface DataVisualizerGridEmbeddableInput extends EmbeddableInput {
query?: Query | AggregateQuery;
visibleFieldNames?: string[];
filters?: Filter[];
/**
* Whether to show the mini chart distributions when table is first rendered
*/
showPreviewByDefault?: boolean;
/**
* Whether to show option directly in table rows to edit the fields in the data view
*/
allowEditDataView?: boolean;
/**
* Callback to add a filter to filter bar
*/
onAddFilter?: (field: DataViewField | string, value: string, type: '+' | '-') => void;
/**
* Session ID used for Kibana's search to save and restore search sessions
*/
sessionId?: string;
/**
* List of fields to fetch field statistics for
* since we might not have fetched all fields at once, but rather only those that are available
*/
fieldsToFetch?: string[];
/**
* The preferred mode for sampling data for the field statistics
* defaults to 'autoRandomSampler'
*/
samplingMode?: string;
}
export interface DataVisualizerGridEmbeddableOutput extends EmbeddableOutput {
showDistributions?: boolean;
Expand Down Expand Up @@ -155,6 +174,7 @@ export const FieldStatisticsTable = (props: FieldStatisticsTableProps) => {
onAddFilter,
sessionId: searchSessionId,
fieldsToFetch: availableFields$?.getValue().fields,
samplingMode: 'autoRandomSampler',
});
embeddable.reload();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ export interface FieldVisStats {
min?: number;
topValues?: Array<{ key: number | string; doc_count: number }>;
topValuesSampleSize?: number;
topValuesSamplerShardSize?: number;
examples?: Array<string | GeoPointExample | object>;
timeRangeEarliest?: number;
timeRangeLatest?: number;
Expand Down
10 changes: 5 additions & 5 deletions x-pack/plugins/data_visualizer/common/types/field_stats.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ export interface NumericFieldStats {
isTopValuesSampled: boolean;
topValues: Bucket[];
topValuesSampleSize: number;
topValuesSamplerShardSize: number;
median?: number;
distribution?: Distribution;
}
Expand All @@ -71,7 +70,6 @@ export interface StringFieldStats {
isTopValuesSampled: boolean;
topValues: Bucket[];
topValuesSampleSize: number;
topValuesSamplerShardSize: number;
}

export interface DateFieldStats {
Expand All @@ -95,7 +93,7 @@ export interface DocumentCountStats {
timeRangeEarliest?: number;
timeRangeLatest?: number;
totalCount: number;
probability?: number | null;
probability: number | null;
took?: number;
randomlySampled?: boolean;
}
Expand Down Expand Up @@ -178,14 +176,16 @@ export function isValidFieldStats(arg: unknown): arg is FieldStats {

export interface FieldStatsCommonRequestParams {
index: string;
samplerShardSize: number;
timeFieldName?: string;
earliestMs?: number | undefined;
latestMs?: number | undefined;
runtimeFieldMap?: estypes.MappingRuntimeFields;
intervalMs?: number;
query: estypes.QueryDslQueryContainer;
maxExamples?: number;
samplingProbability: number | null;
browserSessionSeed: number;
totalCount: number;
}

export interface OverallStatsSearchStrategyParams {
Expand All @@ -195,13 +195,13 @@ export interface OverallStatsSearchStrategyParams {
aggInterval: TimeBucketsInterval;
intervalMs?: number;
searchQuery: Query['query'];
samplerShardSize: number;
index: string;
timeFieldName?: string;
runtimeFieldMap?: estypes.MappingRuntimeFields;
aggregatableFields: string[];
nonAggregatableFields: string[];
fieldsToFetch?: string[];
browserSessionSeed: number;
}

export interface FieldStatsSearchStrategyReturnBase {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ export const FieldsStatsGrid: FC<Props> = ({ results }) => {
pageState={dataVisualizerListState}
updatePageState={setDataVisualizerListState}
getItemIdToExpandedRowMap={getItemIdToExpandedRowMap}
totalCount={results.num_messages_analyzed}
/>
</div>
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ interface Props {
}

export const ChoroplethMap: FC<Props> = ({ stats, suggestion }) => {
const { fieldName, isTopValuesSampled, topValues, topValuesSamplerShardSize } = stats!;
const { fieldName, isTopValuesSampled, topValues, topValuesSampleSize } = stats!;

const layerList: VectorLayerDescriptor[] = useMemo(
() => [getChoroplethTopValuesLayer(fieldName || '', topValues || [], suggestion)],
Expand All @@ -113,16 +113,15 @@ export const ChoroplethMap: FC<Props> = ({ stats, suggestion }) => {
<div className={'dvMap__wrapper'}>
<EmbeddedMapComponent layerList={layerList} />
</div>

{isTopValuesSampled === true && (
<div>
<EuiSpacer size={'s'} />
<EuiText size="xs" textAlign={'center'}>
<FormattedMessage
id="xpack.dataVisualizer.dataGrid.fieldExpandedRow.choroplethMapTopValues.calculatedFromSampleDescription"
defaultMessage="Calculated from sample of {topValuesSamplerShardSize} documents per shard"
defaultMessage="Calculated from sample of {topValuesSampleSize} documents"
values={{
topValuesSamplerShardSize,
topValuesSampleSize,
}}
/>
</EuiText>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,19 @@ import { isIndexBasedFieldVisConfig } from '../../types';

interface Props extends FieldDataRowProps {
showIcon?: boolean;
totalCount: number;
}
export const DocumentStat = ({ config, showIcon }: Props) => {
export const DocumentStat = ({ config, showIcon, totalCount }: Props) => {
const { stats } = config;
if (stats === undefined) return null;
const { count, sampleCount } = stats;

// If the field exists in docs but we don't have count stats, then don't show
// Otherwise if field doesn't appear in docs at all, show 0%
const docsCount =
count ?? (isIndexBasedFieldVisConfig(config) && config.existsInDocs === true ? undefined : 0);
const docsPercent =
docsCount !== undefined && sampleCount !== undefined
? roundToDecimalPlace((docsCount / sampleCount) * 100)
? roundToDecimalPlace((docsCount / totalCount) * 100)
: 0;

return docsCount !== undefined ? (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ interface DataVisualizerTableProps<T> {
/** Callback to receive any updates when table or page state is changed **/
onChange?: (update: Partial<DataVisualizerTableState>) => void;
loading?: boolean;
totalCount: number;
}

export const DataVisualizerTable = <T extends DataVisualizerTableItem>({
Expand All @@ -71,6 +72,7 @@ export const DataVisualizerTable = <T extends DataVisualizerTableItem>({
showPreviewByDefault,
onChange,
loading,
totalCount,
}: DataVisualizerTableProps<T>) => {
const { euiTheme } = useEuiTheme();

Expand Down Expand Up @@ -221,7 +223,7 @@ export const DataVisualizerTable = <T extends DataVisualizerTableItem>({
defaultMessage: 'Documents (%)',
}),
render: (value: number | undefined, item: DataVisualizerTableItem) => (
<DocumentStat config={item} showIcon={dimensions.showIcon} />
<DocumentStat config={item} showIcon={dimensions.showIcon} totalCount={totalCount} />
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For a data view without a time field, you get Infinity % for percentages in the document stats and top values:

image

),
sortable: (item: DataVisualizerTableItem) => item?.stats?.count,
align: LEFT_ALIGNMENT as HorizontalAlignment,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,7 @@ function getPercentLabel(docCount: number, topValuesSampleSize: number): string

export const TopValues: FC<Props> = ({ stats, fieldFormat, barColor, compressed, onAddFilter }) => {
if (stats === undefined || !stats.topValues) return null;
const {
topValues,
topValuesSampleSize,
topValuesSamplerShardSize,
count,
isTopValuesSampled,
fieldName,
} = stats;
const { topValues, topValuesSampleSize, count, isTopValuesSampled, fieldName } = stats;

const progressBarMax = isTopValuesSampled === true ? topValuesSampleSize : count;
return (
Expand Down Expand Up @@ -154,9 +147,9 @@ export const TopValues: FC<Props> = ({ stats, fieldFormat, barColor, compressed,
<EuiText size="xs" textAlign={'center'}>
<FormattedMessage
id="xpack.dataVisualizer.dataGrid.field.topValues.calculatedFromSampleDescription"
defaultMessage="Calculated from sample of {topValuesSamplerShardSize} documents per shard"
defaultMessage="Calculated from sample of {topValuesSampleSize} documents"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this message still needed? If random sampler isn't turned on, then there is no 'sampling', is there? And if random sampling is being used, it mirrors the doc count displayed at the top.

image

values={{
topValuesSamplerShardSize,
topValuesSampleSize,
}}
/>
</EuiText>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
*/

import moment from 'moment';
const SIGFIGS_IF_ROUNDING = 3; // Number of sigfigs to use for values < 10
export const SIGFIGS_IF_ROUNDING = 3; // Number of sigfigs to use for values < 10

// Formats a single value according to the specified ML function.
// If a Kibana fieldFormat is not supplied, will fall back to default
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,6 @@ export const getDefaultDataVisualizerListState = (
sortDirection: 'asc',
visibleFieldTypes: [],
visibleFieldNames: [],
samplerShardSize: 5000,
searchString: '',
searchQuery: defaultSearchQuery,
searchQueryLanguage: SEARCH_QUERY_LANGUAGE.KUERY,
Expand All @@ -107,7 +106,7 @@ export const getDefaultDataVisualizerListState = (
showAllFields: false,
showEmptyFields: false,
probability: null,
rndSamplerPref: RANDOM_SAMPLER_OPTION.ON_AUTOMATIC,
randomSamplerPref: RANDOM_SAMPLER_OPTION.ON_AUTOMATIC,
...overrides,
});

Expand All @@ -128,7 +127,7 @@ export const IndexDataVisualizerView: FC<IndexDataVisualizerViewProps> = (dataVi
const restorableDefaults = useMemo(
() =>
getDefaultDataVisualizerListState({
rndSamplerPref: savedRandomSamplerPreference,
randomSamplerPref: savedRandomSamplerPreference,
}),
// We just need to load the saved preference when the page is first loaded
// eslint-disable-next-line react-hooks/exhaustive-deps
Expand Down Expand Up @@ -217,12 +216,6 @@ export const IndexDataVisualizerView: FC<IndexDataVisualizerViewProps> = (dataVi
[currentSavedSearch, dataVisualizerListState, setDataVisualizerListState]
);

const samplerShardSize =
dataVisualizerListState.samplerShardSize ?? restorableDefaults.samplerShardSize;
const setSamplerShardSize = (value: number) => {
setDataVisualizerListState({ ...dataVisualizerListState, samplerShardSize: value });
};

const visibleFieldTypes =
dataVisualizerListState.visibleFieldTypes ?? restorableDefaults.visibleFieldTypes;
const setVisibleFieldTypes = (values: string[]) => {
Expand Down Expand Up @@ -502,8 +495,6 @@ export const IndexDataVisualizerView: FC<IndexDataVisualizerViewProps> = (dataVi
searchQuery={searchQuery}
searchQueryLanguage={searchQueryLanguage}
setSearchParams={setSearchParams}
samplerShardSize={samplerShardSize}
setSamplerShardSize={setSamplerShardSize}
overallStats={overallStats}
indexedFieldTypes={fieldTypes}
setVisibleFieldTypes={setVisibleFieldTypes}
Expand Down Expand Up @@ -552,6 +543,7 @@ export const IndexDataVisualizerView: FC<IndexDataVisualizerViewProps> = (dataVi
loading={progress < 100}
showPreviewByDefault={dataVisualizerListState.showDistributions ?? true}
onChange={setDataVisualizerListState}
totalCount={overallStats.totalCount}
/>
</EuiPanel>
</EuiFlexItem>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ import { Query, Filter } from '@kbn/es-query';
import type { TimeRange } from '@kbn/es-query';
import { DataView, DataViewField } from '@kbn/data-views-plugin/public';
import { isDefined } from '../../../common/util/is_defined';
import { ShardSizeFilter } from './shard_size_select';
import { DataVisualizerFieldNamesFilter } from './field_name_filter';
import { DataVisualizerFieldTypeFilter } from './field_type_filter';
import { SupportedFieldType } from '../../../../../common/types';
Expand All @@ -26,8 +25,6 @@ interface Props {
searchString: Query['query'];
searchQuery: Query['query'];
searchQueryLanguage: SearchQueryLanguage;
samplerShardSize: number;
setSamplerShardSize(s: number): void;
overallStats: OverallStats;
indexedFieldTypes: SupportedFieldType[];
setVisibleFieldTypes(q: string[]): void;
Expand All @@ -53,8 +50,6 @@ export const SearchPanel: FC<Props> = ({
dataView,
searchString,
searchQueryLanguage,
samplerShardSize,
setSamplerShardSize,
overallStats,
indexedFieldTypes,
setVisibleFieldTypes,
Expand Down Expand Up @@ -149,11 +144,6 @@ export const SearchPanel: FC<Props> = ({
</EuiFlexItem>

<EuiFlexItem grow={2} className={'dvSearchPanel__controls'}>
<ShardSizeFilter
samplerShardSize={samplerShardSize}
setSamplerShardSize={setSamplerShardSize}
/>

<DataVisualizerFieldNamesFilter
overallStats={overallStats}
setVisibleFieldNames={setVisibleFieldNames}
Expand Down

This file was deleted.

Loading