Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] Adds sampled % of documents & cardinality for text fields for Data visualizer/Field stats & fix missing bucket in doc count chart #172378

Merged
merged 13 commits into from
Dec 5, 2023
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,6 @@ export const DocumentCountChart: FC<Props> = ({
}
);

const xDomain = {
min: timeRangeEarliest,
max: timeRangeLatest,
};

const adjustedChartPoints = useMemo(() => {
// Display empty chart when no data in range
if (chartPoints.length < 1) return [{ time: timeRangeEarliest, value: 0 }];
Expand Down Expand Up @@ -149,7 +144,6 @@ export const DocumentCountChart: FC<Props> = ({
}}
>
<Settings
xDomain={xDomain}
onBrushEnd={onBrushEnd as BrushEndListener}
onElementClick={onElementClick}
theme={chartTheme}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,67 @@
* 2.0.
*/

import { EuiIcon, EuiText } from '@elastic/eui';
import { EuiIcon, EuiText, EuiToolTip } from '@elastic/eui';

import React from 'react';
import { FormattedMessage } from '@kbn/i18n-react';
import { ES_FIELD_TYPES, KBN_FIELD_TYPES } from '@kbn/field-types';
import { SUPPORTED_FIELD_TYPES } from '../../../../../../../common/constants';
import { useDataVisualizerKibana } from '../../../../../kibana_context';
import { FieldDataRowProps } from '../../types';
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit

Suggested change
import { FieldDataRowProps } from '../../types';
import type { FieldDataRowProps } from '../../types';

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated here 6e88086


interface Props {
cardinality?: number;
interface Props extends FieldDataRowProps {
showIcon?: boolean;
}

export const DistinctValues = ({ cardinality, showIcon }: Props) => {
if (cardinality === undefined) return null;
export const DistinctValues = ({ showIcon, config }: Props) => {
const { stats, type } = config;
const {
services: {
data: { fieldFormats },
},
} = useDataVisualizerKibana();

const cardinality = config?.stats?.cardinality;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
const cardinality = config?.stats?.cardinality;
const cardinality = stats?.cardinality;

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated here 6e88086


if (cardinality === undefined || stats === undefined) return null;

const { sampleCount } = stats;

const tooltipContent =
type === SUPPORTED_FIELD_TYPES.TEXT ? (
<FormattedMessage
id="xpack.dataVisualizer.sampledCardinalityForTextFieldsMsg"
defaultMessage="The cardinality for text fields is sampled and calculated from {sampledDocumentsFormatted} sample {sampledDocuments, plural, one {record} other {records}}."
Copy link
Contributor

@szabosteve szabosteve Dec 5, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As above.

Suggested change
defaultMessage="The cardinality for text fields is sampled and calculated from {sampledDocumentsFormatted} sample {sampledDocuments, plural, one {record} other {records}}."
defaultMessage="The cardinality for text fields is calculated from a sample of {sampledDocumentsFormatted} {sampledDocuments, plural, one {record} other {records}}."

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated here e6facd0

values={{
sampledDocuments: sampleCount,
sampledDocumentsFormatted: (
<strong>
{fieldFormats
.getDefaultInstance(KBN_FIELD_TYPES.NUMBER, [ES_FIELD_TYPES.INTEGER])
.convert(sampleCount)}
</strong>
),
}}
/>
) : null;

const icon = showIcon ? (
type === SUPPORTED_FIELD_TYPES.TEXT ? (
<EuiToolTip content={tooltipContent}>
<EuiIcon type="partial" size={'m'} className={'columnHeader__icon'} />
</EuiToolTip>
) : (
<EuiIcon type="database" size={'m'} className={'columnHeader__icon'} />
)
) : null;

const content = <EuiText size={'xs'}>{cardinality}</EuiText>;

return (
<>
{showIcon ? <EuiIcon type="database" size={'m'} className={'columnHeader__icon'} /> : null}
<EuiText size={'xs'}>{cardinality}</EuiText>
{icon}
<EuiToolTip content={tooltipContent}>{content}</EuiToolTip>
</>
);
};
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
* 2.0.
*/

import { EuiIcon, EuiText } from '@elastic/eui';
import { EuiIcon, EuiText, EuiToolTip } from '@elastic/eui';

import React from 'react';
import { ES_FIELD_TYPES, KBN_FIELD_TYPES } from '@kbn/field-types';
import { roundToDecimalPlace } from '@kbn/ml-number-utils';
import { FormattedMessage } from '@kbn/i18n-react';
import { SUPPORTED_FIELD_TYPES } from '../../../../../../../common/constants';
import { useDataVisualizerKibana } from '../../../../../kibana_context';
import { isIndexBasedFieldVisConfig } from '../../../../../../../common/types/field_vis_config';
import type { FieldDataRowProps } from '../../types/field_data_row';
Expand All @@ -19,7 +21,7 @@ interface Props extends FieldDataRowProps {
totalCount?: number;
}
export const DocumentStat = ({ config, showIcon, totalCount }: Props) => {
const { stats } = config;
const { stats, type } = config;
const {
services: {
data: { fieldFormats },
Expand All @@ -40,15 +42,47 @@ export const DocumentStat = ({ config, showIcon, totalCount }: Props) => {
? `(${roundToDecimalPlace((valueCount / total) * 100)}%)`
: null;

const content = (
<EuiText size={'xs'}>
{fieldFormats
.getDefaultInstance(KBN_FIELD_TYPES.NUMBER, [ES_FIELD_TYPES.INTEGER])
.convert(valueCount)}{' '}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the space char {' '} needed here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this one, yes, since it's separating two different values

{docsPercent}
</EuiText>
);

const tooltipContent =
type === SUPPORTED_FIELD_TYPES.TEXT ? (
<FormattedMessage
id="xpack.dataVisualizer.sampledPercentageForTextFieldsMsg"
defaultMessage="The % of documents for text fields is sampled and calculated from {sampledDocumentsFormatted} sample {sampledDocuments, plural, one {record} other {records}}."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it could be a bit simpler.

Suggested change
defaultMessage="The % of documents for text fields is sampled and calculated from {sampledDocumentsFormatted} sample {sampledDocuments, plural, one {record} other {records}}."
defaultMessage="The % of documents for text fields is calculated from a sample of {sampledDocumentsFormatted} {sampledDocuments, plural, one {record} other {records}}."

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated here e6facd0

values={{
sampledDocuments: sampleCount,
sampledDocumentsFormatted: (
<strong>
{fieldFormats
.getDefaultInstance(KBN_FIELD_TYPES.NUMBER, [ES_FIELD_TYPES.INTEGER])
.convert(sampleCount)}
</strong>
),
}}
/>
) : null;

const icon = showIcon ? (
type === SUPPORTED_FIELD_TYPES.TEXT ? (
<EuiToolTip content={tooltipContent}>
<EuiIcon type="partial" size={'m'} className={'columnHeader__icon'} />
</EuiToolTip>
) : (
<EuiIcon type="document" size={'m'} className={'columnHeader__icon'} />
)
) : null;

return valueCount !== undefined ? (
<>
{showIcon ? <EuiIcon type="document" size={'m'} className={'columnHeader__icon'} /> : null}
<EuiText size={'xs'}>
{fieldFormats
.getDefaultInstance(KBN_FIELD_TYPES.NUMBER, [ES_FIELD_TYPES.INTEGER])
.convert(valueCount)}{' '}
{docsPercent}
</EuiText>
{icon}
<EuiToolTip content={tooltipContent}>{content}</EuiToolTip>
</>
) : null;
};
Original file line number Diff line number Diff line change
Expand Up @@ -275,9 +275,7 @@ export const DataVisualizerTable = <T extends DataVisualizerTableItem>({
);
}

return (
<DistinctValues cardinality={item?.stats?.cardinality} showIcon={dimensions.showIcon} />
);
return <DistinctValues config={item} showIcon={dimensions.showIcon} />;
},
sortable: (item: DataVisualizerTableItem) => item?.stats?.cardinality,
align: LEFT_ALIGNMENT as HorizontalAlignment,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,16 @@ import type {
ISearchOptions,
} from '@kbn/data-plugin/common';
import { extractErrorProperties } from '@kbn/ml-error-utils';
import { getProcessedFields } from '@kbn/ml-data-grid';
import { useDataVisualizerKibana } from '../../kibana_context';
import {
AggregatableFieldOverallStats,
checkAggregatableFieldsExistRequest,
checkNonAggregatableFieldExistsRequest,
getSampleOfDocumentsForNonAggregatableFields,
isAggregatableFieldOverallStats,
isNonAggregatableFieldOverallStats,
isNonAggregatableSampledDocs,
NonAggregatableFieldOverallStats,
processAggregatableFieldsExistResponse,
processNonAggregatableFieldsExistResponse,
Expand Down Expand Up @@ -128,6 +131,26 @@ export function useOverallStats<TParams extends OverallStatsSearchStrategyParams
probability
);

const nonAggregatableFieldsExamplesObs = data.search
.search<IKibanaSearchRequest, IKibanaSearchResponse>(
{
params: getSampleOfDocumentsForNonAggregatableFields(
nonAggregatableFields,
index,
searchQuery,
timeFieldName,
earliest,
latest,
runtimeFieldMap
),
},
searchOptions
)
.pipe(
map((resp) => {
return resp as IKibanaSearchResponse;
})
);
const nonAggregatableFieldsObs = nonAggregatableFields.map((fieldName: string) =>
data.search
.search<IKibanaSearchRequest, IKibanaSearchResponse>(
Expand Down Expand Up @@ -190,14 +213,29 @@ export function useOverallStats<TParams extends OverallStatsSearchStrategyParams

const sub = rateLimitingForkJoin<
AggregatableFieldOverallStats | NonAggregatableFieldOverallStats | undefined
>([...aggregatableOverallStatsObs, ...nonAggregatableFieldsObs], MAX_CONCURRENT_REQUESTS);
>(
[
nonAggregatableFieldsExamplesObs,
...aggregatableOverallStatsObs,
...nonAggregatableFieldsObs,
],
MAX_CONCURRENT_REQUESTS
);

searchSubscription$.current = sub.subscribe({
next: (value) => {
const aggregatableOverallStatsResp: AggregatableFieldOverallStats[] = [];
const nonAggregatableOverallStatsResp: NonAggregatableFieldOverallStats[] = [];

let sampledNonAggregatableFieldsExamples: Array<{ [key: string]: string }> | undefined;
value.forEach((resp, idx) => {
if (idx === 0 && isNonAggregatableSampledDocs(resp)) {
const docs = resp.rawResponse.hits.hits.map((d) =>
getProcessedFields(d.fields ?? {})
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rather than calling getProcessedFields with an empty object, it might be neater to make the check beforehand.
e.g.

d.fields ? getProcessedFields(d.fields) : {}

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated here 6e88086

);

sampledNonAggregatableFieldsExamples = docs;
}
if (isAggregatableFieldOverallStats(resp)) {
aggregatableOverallStatsResp.push(resp);
}
Expand All @@ -214,9 +252,27 @@ export function useOverallStats<TParams extends OverallStatsSearchStrategyParams
aggregatableFields
);

const nonAggregatableFieldsCount: number[] = new Array(nonAggregatableFields.length).fill(
0
);
const nonAggregatableFieldsUniqueCount = nonAggregatableFields.map(
() => new Set<string>()
);
if (sampledNonAggregatableFieldsExamples) {
sampledNonAggregatableFieldsExamples.forEach((doc) => {
nonAggregatableFields.forEach((field, fieldIdx) => {
if (doc.hasOwnProperty(field)) {
nonAggregatableFieldsCount[fieldIdx] += 1;
nonAggregatableFieldsUniqueCount[fieldIdx].add(doc[field]!);
}
});
});
}
const nonAggregatableOverallStats = processNonAggregatableFieldsExistResponse(
nonAggregatableOverallStatsResp,
nonAggregatableFields
nonAggregatableFields,
nonAggregatableFieldsCount,
nonAggregatableFieldsUniqueCount
);

setOverallStats({
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,15 @@ export function isNonAggregatableFieldOverallStats(
return isPopulatedObject(arg, ['rawResponse']);
}

/**
 * Type guard for the search response that carries the sampled documents
 * requested for non-aggregatable fields.
 *
 * Returns true only when `arg` passes the `isPopulatedObject` check for the
 * `rawResponse` key and that `rawResponse` has an own `hits` property.
 *
 * @param arg - Value of unknown shape to test.
 * @returns Whether `arg` can be treated as a Kibana search response wrapping
 * an Elasticsearch search response.
 */
export function isNonAggregatableSampledDocs(
  arg: unknown
): arg is IKibanaSearchResponse<estypes.SearchResponse<unknown>> {
  // Guard clause: anything without a `rawResponse` key is rejected up front.
  if (!isPopulatedObject(arg, ['rawResponse'])) {
    return false;
  }

  const rawResponse = arg.rawResponse as estypes.SearchResponse;
  return rawResponse.hasOwnProperty('hits');
}

export const processAggregatableFieldsExistResponse = (
responses: AggregatableFieldOverallStats[] | undefined,
aggregatableFields: OverallStatsSearchStrategyParams['aggregatableFields'],
Expand Down Expand Up @@ -204,6 +213,10 @@ export const checkNonAggregatableFieldExistsRequest = (
const size = 0;
const filterCriteria = buildBaseFilterCriteria(timeFieldName, earliestMs, latestMs, query);

if (Array.isArray(filterCriteria)) {
filterCriteria.push({ exists: { field } });
}

const searchBody = {
query: {
bool: {
Expand All @@ -212,9 +225,6 @@ export const checkNonAggregatableFieldExistsRequest = (
},
...(isPopulatedObject(runtimeMappings) ? { runtime_mappings: runtimeMappings } : {}),
};
if (Array.isArray(filterCriteria)) {
filterCriteria.push({ exists: { field } });
}

return {
index,
Expand All @@ -227,9 +237,40 @@ export const checkNonAggregatableFieldExistsRequest = (
};
};

// Value sent as `size` in the sample-documents search request for
// non-aggregatable fields, and reported as `sampleCount` in the stats
// produced for each non-aggregatable field.
const DEFAULT_DOCS_SAMPLE_OF_TEXT_FIELDS_SIZE = 1000;

/**
 * Builds the search request used to fetch a sample of documents for the
 * given non-aggregatable fields.
 *
 * The request filters with the criteria returned by `buildBaseFilterCriteria`,
 * asks for the listed fields via the `fields` option, and sets `size` to
 * DEFAULT_DOCS_SAMPLE_OF_TEXT_FIELDS_SIZE.
 *
 * @param nonAggregatableFields - Names of the fields to request for each hit.
 * @param dataViewTitle - Used as the `index` of the request.
 * @param query - Query passed through to `buildBaseFilterCriteria`.
 * @param timeFieldName - Time field passed through to `buildBaseFilterCriteria`.
 * @param earliestMs - Lower time bound passed through to `buildBaseFilterCriteria`.
 * @param latestMs - Upper time bound passed through to `buildBaseFilterCriteria`.
 * @param runtimeMappings - Optional runtime fields; added to the body as
 * `runtime_mappings` only when `isPopulatedObject` accepts them.
 * @returns The search request object.
 */
export const getSampleOfDocumentsForNonAggregatableFields = (
  nonAggregatableFields: string[],
  dataViewTitle: string,
  query: Query['query'],
  timeFieldName: string | undefined,
  earliestMs: number | undefined,
  latestMs: number | undefined,
  runtimeMappings?: estypes.MappingRuntimeFields
): estypes.SearchRequest => {
  const baseFilters = buildBaseFilterCriteria(timeFieldName, earliestMs, latestMs, query);

  return {
    index: dataViewTitle,
    body: {
      // Shallow copy so the request does not share the caller's array.
      fields: nonAggregatableFields.slice(),
      query: {
        bool: {
          filter: baseFilters,
        },
      },
      ...(isPopulatedObject(runtimeMappings) ? { runtime_mappings: runtimeMappings } : {}),
      size: DEFAULT_DOCS_SAMPLE_OF_TEXT_FIELDS_SIZE,
    },
  };
};

export const processNonAggregatableFieldsExistResponse = (
results: IKibanaSearchResponse[] | undefined,
nonAggregatableFields: string[]
nonAggregatableFields: string[],
nonAggregatableFieldsCount: number[],
nonAggregatableFieldsUniqueCount: Array<Set<string>>
) => {
const stats = {
nonAggregatableExistsFields: [] as NonAggregatableField[],
Expand All @@ -238,12 +279,17 @@ export const processNonAggregatableFieldsExistResponse = (

if (!results || nonAggregatableFields.length === 0) return stats;

nonAggregatableFields.forEach((fieldName) => {
nonAggregatableFields.forEach((fieldName, fieldIdx) => {
const foundField = results.find((r) => r.rawResponse.fieldName === fieldName);
const existsInDocs = foundField !== undefined && foundField.rawResponse.hits.total > 0;
const fieldData: NonAggregatableField = {
fieldName,
existsInDocs,
stats: {
count: nonAggregatableFieldsCount[fieldIdx],
cardinality: nonAggregatableFieldsUniqueCount[fieldIdx].size,
sampleCount: DEFAULT_DOCS_SAMPLE_OF_TEXT_FIELDS_SIZE,
},
};
if (existsInDocs === true) {
stats.nonAggregatableExistsFields.push(fieldData);
Expand Down
Loading