Skip to content

Commit

Permalink
Add support for median, p90, p95 and p99 math functions (#2630)
Browse files Browse the repository at this point in the history
* Add support for median, p90, p95 and p99 math functions

Related issue: #2584

* Update tests to reflect quantile functions are not exact in ch
  • Loading branch information
macobo authored Dec 2, 2020
1 parent 1dd7f73 commit a451e14
Show file tree
Hide file tree
Showing 6 changed files with 115 additions and 59 deletions.
24 changes: 13 additions & 11 deletions ee/clickhouse/queries/trends/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,17 @@
from posthog.models.entity import Entity
from posthog.models.filters import Filter

MATH_FUNCTIONS = {
"sum": "sum",
"avg": "avg",
"min": "min",
"max": "max",
"median": "quantile(0.50)",
"p90": "quantile(0.90)",
"p95": "quantile(0.95)",
"p99": "quantile(0.99)",
}


def process_math(entity: Entity) -> Tuple[str, str, Dict[str, Optional[str]]]:
aggregate_operation = "count(*)"
Expand All @@ -14,17 +25,8 @@ def process_math(entity: Entity) -> Tuple[str, str, Dict[str, Optional[str]]]:
if entity.math == "dau":
join_condition = EVENT_JOIN_PERSON_SQL
aggregate_operation = "count(DISTINCT person_id)"
elif entity.math == "sum":
aggregate_operation = "sum({})".format(value)
params = {"join_property_key": entity.math_property}
elif entity.math == "avg":
aggregate_operation = "avg({})".format(value)
params = {"join_property_key": entity.math_property}
elif entity.math == "min":
aggregate_operation = "min({})".format(value)
params = {"join_property_key": entity.math_property}
elif entity.math == "max":
aggregate_operation = "max({})".format(value)
elif entity.math in MATH_FUNCTIONS:
aggregate_operation = f"{MATH_FUNCTIONS[entity.math]}({value})"
params = {"join_property_key": entity.math_property}

return aggregate_operation, join_condition, params
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/lib/utils.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ export function formatProperty(property: Record<string, any>): string {
export function formatLabel(label: string, action: Record<string, any>): string {
if (action.math === 'dau') {
label += ` (${action.math.toUpperCase()}) `
} else if (['sum', 'avg', 'min', 'max'].includes(action.math)) {
} else if (['sum', 'avg', 'min', 'max', 'median', 'p90', 'p95', 'p99'].includes(action.math)) {
label += ` (${action.math} of ${action.math_property}) `
} else {
label += ' (Total) '
Expand Down
46 changes: 45 additions & 1 deletion frontend/src/scenes/insights/ActionFilter/ActionFilterRow.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import React, { useRef, useState } from 'react'
import { useActions, useValues } from 'kea'
import { Button, Tooltip, Dropdown, Menu, Col, Row, Select } from 'antd'
import { EntityTypes } from '../trendsLogic'
import { ActionFilterDropdown } from './ActionFilterDropdown'
import { Button, Tooltip, Dropdown, Menu, Col, Row, Select } from 'antd'
import { PropertyFilters } from 'lib/components/PropertyFilters/PropertyFilters'
import { userLogic } from 'scenes/userLogic'
import { DownOutlined } from '@ant-design/icons'
Expand Down Expand Up @@ -77,6 +77,50 @@ const MATHS = {
),
onProperty: true,
},
median: {
name: 'Median',
description: (
<>
Event property median (50th percentile).
<br />
For example 100 events captured with property <code>amount</code> equal to 101..200, result in 150.
</>
),
onProperty: true,
},
p90: {
name: '90th percentile',
description: (
<>
Event property 90th percentile.
<br />
For example 100 events captured with property <code>amount</code> equal to 101..200, result in 190.
</>
),
onProperty: true,
},
p95: {
name: '95th percentile',
description: (
<>
Event property 95th percentile.
<br />
For example 100 events captured with property <code>amount</code> equal to 101..200, result in 195.
</>
),
onProperty: true,
},
p99: {
name: '99th percentile',
description: (
<>
Event property 90th percentile.
<br />
For example 100 events captured with property <code>amount</code> equal to 101..200, result in 199.
</>
),
onProperty: true,
},
}
const MATH_ENTRIES = Object.entries(MATHS)

Expand Down
9 changes: 8 additions & 1 deletion posthog/models/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class UUIDT(uuid.UUID):
"""UUID (mostly) sortable by generation time.
This doesn't adhere to any official UUID version spec, but it is superior as a primary key:
to incremented integers (as they can reveal sensitive business information about usage volumes and patterns),
to UUID v4 (as the complete randomness of v4 makes its indexing performance suboptimal),
Expand Down Expand Up @@ -83,3 +83,10 @@ def generate_random_token(nbytes: int = 32) -> str:
https://docs.python.org/3/library/secrets.html#how-many-bytes-should-tokens-use
"""
return secrets.token_urlsafe(nbytes)


class Percentile(models.Aggregate):
template = "percentile_disc(%(percentile)s) WITHIN GROUP (ORDER BY %(expressions)s)"

def __init__(self, percentile, expression, **extra):
super().__init__(expression, percentile=percentile, **extra)
74 changes: 33 additions & 41 deletions posthog/queries/test/test_trends.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,65 +422,57 @@ def test_dau_with_breakdown_filtering(self):

self.assertTrue(self._compare_entity_response(action_response, event_response))

def _create_maths_events(self):
def _create_maths_events(self, values):
sign_up_action, person = self._create_events()
person_factory(team_id=self.team.pk, distinct_ids=["someone_else"])
event_factory(team=self.team, event="sign up", distinct_id="someone_else", properties={"some_number": 2})
event_factory(team=self.team, event="sign up", distinct_id="someone_else", properties={"some_number": 3})
event_factory(team=self.team, event="sign up", distinct_id="someone_else", properties={"some_number": 5.5})
event_factory(team=self.team, event="sign up", distinct_id="someone_else", properties={"some_number": 7.5})
for value in values:
event_factory(
team=self.team, event="sign up", distinct_id="someone_else", properties={"some_number": value}
)

event_factory(team=self.team, event="sign up", distinct_id="someone_else", properties={"some_number": None})
return sign_up_action

def test_sum_filtering(self):
sign_up_action = self._create_maths_events()
def _test_math_property_aggregation(self, math_property, values, expected_value):
sign_up_action = self._create_maths_events(values)

action_response = trends().run(
Filter(data={"actions": [{"id": sign_up_action.id, "math": "sum", "math_property": "some_number"}]}),
Filter(
data={"actions": [{"id": sign_up_action.id, "math": math_property, "math_property": "some_number"}]}
),
self.team,
)
event_response = trends().run(
Filter(data={"events": [{"id": "sign up", "math": "sum", "math_property": "some_number"}]}), self.team
Filter(data={"events": [{"id": "sign up", "math": math_property, "math_property": "some_number"}]}),
self.team,
)
self.assertEqual(action_response[0]["data"][-1], 18)
# :TRICKY: Work around clickhouse functions not being 100%
self.assertAlmostEqual(action_response[0]["data"][-1], expected_value, delta=0.5)
self.assertTrue(self._compare_entity_response(action_response, event_response))

def test_avg_filtering(self):
sign_up_action = self._create_maths_events()
def test_sum_filtering(self):
self._test_math_property_aggregation("sum", values=[2, 3, 5.5, 7.5], expected_value=18)

action_response = trends().run(
Filter(data={"actions": [{"id": sign_up_action.id, "math": "avg", "math_property": "some_number"}]}),
self.team,
)
event_response = trends().run(
Filter(data={"events": [{"id": "sign up", "math": "avg", "math_property": "some_number"}]}), self.team
)
self.assertEqual(action_response[0]["data"][-1], 4.5)
self.assertTrue(self._compare_entity_response(action_response, event_response))
def test_avg_filtering(self):
self._test_math_property_aggregation("avg", values=[2, 3, 5.5, 7.5], expected_value=4.5)

def test_min_filtering(self):
sign_up_action = self._create_maths_events()
action_response = trends().run(
Filter(data={"actions": [{"id": sign_up_action.id, "math": "min", "math_property": "some_number"}]}),
self.team,
)
event_response = trends().run(
Filter(data={"events": [{"id": "sign up", "math": "min", "math_property": "some_number"}]}), self.team
)
self.assertEqual(action_response[0]["data"][-1], 2)
self.assertTrue(self._compare_entity_response(action_response, event_response))
self._test_math_property_aggregation("min", values=[2, 3, 5.5, 7.5], expected_value=2)

def test_max_filtering(self):
sign_up_action = self._create_maths_events()
action_response = trends().run(
Filter(data={"actions": [{"id": sign_up_action.id, "math": "max", "math_property": "some_number"}]}),
self.team,
)
event_response = trends().run(
Filter(data={"events": [{"id": "sign up", "math": "max", "math_property": "some_number"}]}), self.team
)
self.assertEqual(action_response[0]["data"][-1], 7.5)
self.assertTrue(self._compare_entity_response(action_response, event_response))
self._test_math_property_aggregation("max", values=[2, 3, 5.5, 7.5], expected_value=7.5)

def test_median_filtering(self):
self._test_math_property_aggregation("median", values=range(101, 201), expected_value=150)

def test_p90_filtering(self):
self._test_math_property_aggregation("p90", values=range(101, 201), expected_value=190)

def test_p95_filtering(self):
self._test_math_property_aggregation("p95", values=range(101, 201), expected_value=195)

def test_p99_filtering(self):
self._test_math_property_aggregation("p99", values=range(101, 201), expected_value=199)

def test_avg_filtering_non_number_resiliency(self):
sign_up_action, person = self._create_events()
Expand Down
19 changes: 15 additions & 4 deletions posthog/queries/trends.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import copy
import datetime
from itertools import accumulate
from typing import Any, Dict, List, Optional, Union
from typing import Any, Callable, Dict, List, Optional, Union

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -37,12 +37,24 @@
Person,
Team,
)
from posthog.models.utils import Percentile
from posthog.utils import append_data

from .base import BaseQuery, filter_events, handle_compare, process_entity_for_events

FREQ_MAP = {"minute": "60S", "hour": "H", "day": "D", "week": "W", "month": "M"}

MATH_TO_AGGREGATE_FUNCTION: Dict[str, Callable] = {
"sum": Sum,
"avg": Avg,
"min": Min,
"max": Max,
"median": lambda expr: Percentile(0.5, expr),
"p90": lambda expr: Percentile(0.9, expr),
"p95": lambda expr: Percentile(0.95, expr),
"p99": lambda expr: Percentile(0.99, expr),
}


def build_dataframe(aggregates: QuerySet, interval: str, breakdown: Optional[str] = None) -> pd.DataFrame:
if breakdown == "cohorts":
Expand Down Expand Up @@ -195,14 +207,13 @@ def aggregate_by_interval(


def process_math(query: QuerySet, entity: Entity) -> QuerySet:
math_to_aggregate_function = {"sum": Sum, "avg": Avg, "min": Min, "max": Max}
if entity.math == "dau":
# In daily active users mode count only up to 1 event per user per day
query = query.annotate(count=Count("person_id", distinct=True))
elif entity.math in math_to_aggregate_function:
elif entity.math in MATH_TO_AGGREGATE_FUNCTION:
# Run relevant aggregate function on specified event property, casting it to a double
query = query.annotate(
count=math_to_aggregate_function[entity.math](
count=MATH_TO_AGGREGATE_FUNCTION[entity.math](
Cast(RawSQL('"posthog_event"."properties"->>%s', (entity.math_property,)), output_field=FloatField(),)
)
)
Expand Down

0 comments on commit a451e14

Please sign in to comment.