From 288b40feb141640c9f355c32fbb7ebc64e3378d8 Mon Sep 17 00:00:00 2001 From: Karl-Aksel Puulmann Date: Wed, 2 Dec 2020 16:29:52 +0200 Subject: [PATCH] Add support for median, p90, p95 and p99 math functions Related issue: #2584 --- ee/clickhouse/queries/trends/util.py | 24 +++--- frontend/src/lib/utils.tsx | 2 +- .../insights/ActionFilter/ActionFilterRow.js | 46 +++++++++++- posthog/models/utils.py | 9 ++- posthog/queries/test/test_trends.py | 73 ++++++++----------- posthog/queries/trends.py | 19 ++++- 6 files changed, 114 insertions(+), 59 deletions(-) diff --git a/ee/clickhouse/queries/trends/util.py b/ee/clickhouse/queries/trends/util.py index de50c507b9b7a..0287d6fd9415b 100644 --- a/ee/clickhouse/queries/trends/util.py +++ b/ee/clickhouse/queries/trends/util.py @@ -5,6 +5,17 @@ from posthog.models.entity import Entity from posthog.models.filters import Filter +MATH_FUNCTIONS = { + "sum": "sum", + "avg": "avg", + "min": "min", + "max": "max", + "median": "quantile(0.50)", + "p90": "quantile(0.90)", + "p95": "quantile(0.95)", + "p99": "quantile(0.99)", +} + def process_math(entity: Entity) -> Tuple[str, str, Dict[str, Optional[str]]]: aggregate_operation = "count(*)" @@ -14,17 +25,8 @@ def process_math(entity: Entity) -> Tuple[str, str, Dict[str, Optional[str]]]: if entity.math == "dau": join_condition = EVENT_JOIN_PERSON_SQL aggregate_operation = "count(DISTINCT person_id)" - elif entity.math == "sum": - aggregate_operation = "sum({})".format(value) - params = {"join_property_key": entity.math_property} - elif entity.math == "avg": - aggregate_operation = "avg({})".format(value) - params = {"join_property_key": entity.math_property} - elif entity.math == "min": - aggregate_operation = "min({})".format(value) - params = {"join_property_key": entity.math_property} - elif entity.math == "max": - aggregate_operation = "max({})".format(value) + elif entity.math in MATH_FUNCTIONS: + aggregate_operation = f"{MATH_FUNCTIONS[entity.math]}({value})" params = {"join_property_key": entity.math_property} return aggregate_operation, join_condition, params diff --git a/frontend/src/lib/utils.tsx b/frontend/src/lib/utils.tsx index 1bdd2ffaa8468..175c01836db9e 100644 --- a/frontend/src/lib/utils.tsx +++ b/frontend/src/lib/utils.tsx @@ -233,7 +233,7 @@ export function formatProperty(property: Record): string { export function formatLabel(label: string, action: Record): string { if (action.math === 'dau') { label += ` (${action.math.toUpperCase()}) ` - } else if (['sum', 'avg', 'min', 'max'].includes(action.math)) { + } else if (['sum', 'avg', 'min', 'max', 'median', 'p90', 'p95', 'p99'].includes(action.math)) { label += ` (${action.math} of ${action.math_property}) ` } else { label += ' (Total) ' diff --git a/frontend/src/scenes/insights/ActionFilter/ActionFilterRow.js b/frontend/src/scenes/insights/ActionFilter/ActionFilterRow.js index 6f21e3b5d0727..32d66b407d5f4 100644 --- a/frontend/src/scenes/insights/ActionFilter/ActionFilterRow.js +++ b/frontend/src/scenes/insights/ActionFilter/ActionFilterRow.js @@ -1,8 +1,8 @@ import React, { useRef, useState } from 'react' import { useActions, useValues } from 'kea' +import { Button, Tooltip, Dropdown, Menu, Col, Row, Select } from 'antd' import { EntityTypes } from '../trendsLogic' import { ActionFilterDropdown } from './ActionFilterDropdown' -import { Button, Tooltip, Dropdown, Menu, Col, Row, Select } from 'antd' import { PropertyFilters } from 'lib/components/PropertyFilters/PropertyFilters' import { userLogic } from 'scenes/userLogic' import { 
DownOutlined } from '@ant-design/icons' @@ -77,6 +77,50 @@ const MATHS = { ), onProperty: true, }, + median: { + name: 'Median', + description: ( + <> + Event property median (50th percentile). +
+ For example, 100 events captured with property amount equal to 101..200 result in 150. + + ), + onProperty: true, + }, + p90: { + name: '90th percentile', + description: ( + <> + Event property 90th percentile. +
+ For example, 100 events captured with property amount equal to 101..200 result in 190. + + ), + onProperty: true, + }, + p95: { + name: '95th percentile', + description: ( + <> + Event property 95th percentile. +
+ For example, 100 events captured with property amount equal to 101..200 result in 195. + + ), + onProperty: true, + }, + p99: { + name: '99th percentile', + description: ( + <> + Event property 99th percentile. +
+ For example, 100 events captured with property amount equal to 101..200 result in 199. + + ), + onProperty: true, + }, } const MATH_ENTRIES = Object.entries(MATHS) diff --git a/posthog/models/utils.py b/posthog/models/utils.py index 79a5e58e43a49..9301e497ad0b1 100644 --- a/posthog/models/utils.py +++ b/posthog/models/utils.py @@ -9,7 +9,7 @@ class UUIDT(uuid.UUID): """UUID (mostly) sortable by generation time. - + This doesn't adhere to any official UUID version spec, but it is superior as a primary key: to incremented integers (as they can reveal sensitive business information about usage volumes and patterns), to UUID v4 (as the complete randomness of v4 makes its indexing performance suboptimal), @@ -83,3 +83,10 @@ def generate_random_token(nbytes: int = 32) -> str: https://docs.python.org/3/library/secrets.html#how-many-bytes-should-tokens-use """ return secrets.token_urlsafe(nbytes) + + +class Percentile(models.Aggregate): + template = "percentile_disc(%(percentile)s) WITHIN GROUP (ORDER BY %(expressions)s)" + + def __init__(self, percentile, expression, **extra): + super().__init__(expression, percentile=percentile, **extra) diff --git a/posthog/queries/test/test_trends.py b/posthog/queries/test/test_trends.py index 69cf9c5740736..e750557fa5af1 100644 --- a/posthog/queries/test/test_trends.py +++ b/posthog/queries/test/test_trends.py @@ -422,65 +422,56 @@ def test_dau_with_breakdown_filtering(self): self.assertTrue(self._compare_entity_response(action_response, event_response)) - def _create_maths_events(self): + def _create_maths_events(self, values): sign_up_action, person = self._create_events() person_factory(team_id=self.team.pk, distinct_ids=["someone_else"]) - event_factory(team=self.team, event="sign up", distinct_id="someone_else", properties={"some_number": 2}) - event_factory(team=self.team, event="sign up", distinct_id="someone_else", properties={"some_number": 3}) - event_factory(team=self.team, event="sign up", distinct_id="someone_else", properties={"some_number": 5.5}) - event_factory(team=self.team, event="sign up", distinct_id="someone_else", properties={"some_number": 7.5}) + for value in values: + event_factory( + team=self.team, event="sign up", distinct_id="someone_else", properties={"some_number": value} + ) + event_factory(team=self.team, event="sign up", distinct_id="someone_else", properties={"some_number": None}) return sign_up_action - def test_sum_filtering(self): - sign_up_action = self._create_maths_events() + def _test_math_property_aggregation(self, math_property, values, expected_value): + sign_up_action = self._create_maths_events(values) action_response = trends().run( - Filter(data={"actions": [{"id": sign_up_action.id, "math": "sum", "math_property": "some_number"}]}), + Filter( + data={"actions": [{"id": sign_up_action.id, "math": math_property, "math_property": "some_number"}]} + ), self.team, ) event_response = trends().run( - Filter(data={"events": [{"id": "sign up", "math": "sum", "math_property": "some_number"}]}), self.team + Filter(data={"events": [{"id": "sign up", "math": math_property, "math_property": "some_number"}]}), + self.team, ) - self.assertEqual(action_response[0]["data"][-1], 18) + self.assertEqual(action_response[0]["data"][-1], expected_value) self.assertTrue(self._compare_entity_response(action_response, event_response)) - def test_avg_filtering(self): - sign_up_action = self._create_maths_events() + def test_sum_filtering(self): + self._test_math_property_aggregation("sum", values=[2, 3, 5.5, 7.5], expected_value=18) -
action_response = trends().run( - Filter(data={"actions": [{"id": sign_up_action.id, "math": "avg", "math_property": "some_number"}]}), - self.team, - ) - event_response = trends().run( - Filter(data={"events": [{"id": "sign up", "math": "avg", "math_property": "some_number"}]}), self.team - ) - self.assertEqual(action_response[0]["data"][-1], 4.5) - self.assertTrue(self._compare_entity_response(action_response, event_response)) + def test_avg_filtering(self): + self._test_math_property_aggregation("avg", values=[2, 3, 5.5, 7.5], expected_value=4.5) def test_min_filtering(self): - sign_up_action = self._create_maths_events() - action_response = trends().run( - Filter(data={"actions": [{"id": sign_up_action.id, "math": "min", "math_property": "some_number"}]}), - self.team, - ) - event_response = trends().run( - Filter(data={"events": [{"id": "sign up", "math": "min", "math_property": "some_number"}]}), self.team - ) - self.assertEqual(action_response[0]["data"][-1], 2) - self.assertTrue(self._compare_entity_response(action_response, event_response)) + self._test_math_property_aggregation("min", values=[2, 3, 5.5, 7.5], expected_value=2) def test_max_filtering(self): - sign_up_action = self._create_maths_events() - action_response = trends().run( - Filter(data={"actions": [{"id": sign_up_action.id, "math": "max", "math_property": "some_number"}]}), - self.team, - ) - event_response = trends().run( - Filter(data={"events": [{"id": "sign up", "math": "max", "math_property": "some_number"}]}), self.team - ) - self.assertEqual(action_response[0]["data"][-1], 7.5) - self.assertTrue(self._compare_entity_response(action_response, event_response)) + self._test_math_property_aggregation("max", values=[2, 3, 5.5, 7.5], expected_value=7.5) + + def test_median_filtering(self): + self._test_math_property_aggregation("median", values=range(101, 201), expected_value=150) + + def test_p90_filtering(self): + self._test_math_property_aggregation("p90", values=range(101, 201), expected_value=190) + + def test_p95_filtering(self): + self._test_math_property_aggregation("p95", values=range(101, 201), expected_value=195) + + def test_p99_filtering(self): + self._test_math_property_aggregation("p99", values=range(101, 201), expected_value=199) def test_avg_filtering_non_number_resiliency(self): sign_up_action, person = self._create_events() diff --git a/posthog/queries/trends.py b/posthog/queries/trends.py index 10a5cf5975cec..048bfc0915ff6 100644 --- a/posthog/queries/trends.py +++ b/posthog/queries/trends.py @@ -1,7 +1,7 @@ import copy import datetime from itertools import accumulate -from typing import Any, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import pandas as pd @@ -37,12 +37,24 @@ Person, Team, ) +from posthog.models.utils import Percentile from posthog.utils import append_data from .base import BaseQuery, filter_events, handle_compare, process_entity_for_events FREQ_MAP = {"minute": "60S", "hour": "H", "day": "D", "week": "W", "month": "M"} +MATH_TO_AGGREGATE_FUNCTION: Dict[str, Callable] = { + "sum": Sum, + "avg": Avg, + "min": Min, + "max": Max, + "median": lambda expr: Percentile(0.5, expr), + "p90": lambda expr: Percentile(0.9, expr), + "p95": lambda expr: Percentile(0.95, expr), + "p99": lambda expr: Percentile(0.99, expr), +} + def build_dataframe(aggregates: QuerySet, interval: str, breakdown: Optional[str] = None) -> pd.DataFrame: if breakdown == "cohorts": @@ -195,14 +207,13 @@ def aggregate_by_interval( def 
process_math(query: QuerySet, entity: Entity) -> QuerySet: - math_to_aggregate_function = {"sum": Sum, "avg": Avg, "min": Min, "max": Max} if entity.math == "dau": # In daily active users mode count only up to 1 event per user per day query = query.annotate(count=Count("person_id", distinct=True)) - elif entity.math in math_to_aggregate_function: + elif entity.math in MATH_TO_AGGREGATE_FUNCTION: # Run relevant aggregate function on specified event property, casting it to a double query = query.annotate( - count=math_to_aggregate_function[entity.math]( + count=MATH_TO_AGGREGATE_FUNCTION[entity.math]( Cast(RawSQL('"posthog_event"."properties"->>%s', (entity.math_property,)), output_field=FloatField(),) ) )
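Editor's note, not part of the patch: the sketch below is a minimal, self-contained illustration of the aggregate expressions the two backends are expected to emit for the new math options, based on the MATH_FUNCTIONS map (ClickHouse) and the Percentile aggregate template (Django/Postgres) introduced above. The column name "amount" and the file/helper names are hypothetical and exist only for illustration. ClickHouse's quantile() is an approximate (reservoir-sampling) aggregate, while Postgres' percentile_disc(...) WITHIN GROUP (ORDER BY ...) returns an actual value from the input set, so the two backends can differ slightly on small or skewed samples.

# percentile_sketch.py -- hypothetical editor's sketch, not part of this patch.
# Mirrors the ClickHouse MATH_FUNCTIONS map and the Postgres Percentile
# aggregate template to show the SQL fragment each math option maps to.

CLICKHOUSE_MATH_FUNCTIONS = {
    "sum": "sum",
    "avg": "avg",
    "min": "min",
    "max": "max",
    "median": "quantile(0.50)",
    "p90": "quantile(0.90)",
    "p95": "quantile(0.95)",
    "p99": "quantile(0.99)",
}

POSTGRES_PERCENTILES = {"median": 0.5, "p90": 0.9, "p95": 0.95, "p99": 0.99}


def clickhouse_aggregate(math: str, value_expr: str) -> str:
    # ClickHouse path: quantile(level)(expr), as assembled in process_math()
    # in ee/clickhouse/queries/trends/util.py.
    return f"{CLICKHOUSE_MATH_FUNCTIONS[math]}({value_expr})"


def postgres_aggregate(math: str, value_expr: str) -> str:
    # Postgres path: the SQL that the Percentile(models.Aggregate) template
    # renders to; sum/avg/min/max pass through as plain aggregate calls.
    if math in POSTGRES_PERCENTILES:
        return f"percentile_disc({POSTGRES_PERCENTILES[math]}) WITHIN GROUP (ORDER BY {value_expr})"
    return f"{math}({value_expr})"


if __name__ == "__main__":
    for math in ("median", "p90", "p95", "p99"):
        print(math, "->", clickhouse_aggregate(math, "amount"))
        print(math, "->", postgres_aggregate(math, "amount"))

For example, the "median" case prints "quantile(0.50)(amount)" for ClickHouse and "percentile_disc(0.5) WITHIN GROUP (ORDER BY amount)" for Postgres. If exact percentiles were ever needed on the ClickHouse side, quantileExact() would be the drop-in alternative to quantile(); that is an option, not something this patch does.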