Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

geom_violin memory consumption #430

Open
x1o opened this issue Aug 19, 2020 · 2 comments
Open

geom_violin memory consumption #430

x1o opened this issue Aug 19, 2020 · 2 comments

Comments

@x1o
Copy link

x1o commented Aug 19, 2020

Compared with geom_boxplot(), geom_violin() seems to consume an unreasonable amount of memory. I get this error trying to plot a 70091 x 2 dataframe:

Unable to allocate 36.6 GiB for an array with shape (70091, 70091) and data type float64
@x1o x1o changed the title from "geom_violin memory consuption" to "geom_violin memory consumption" Aug 19, 2020
@has2k1
Copy link
Owner

has2k1 commented Aug 20, 2020

geom_violin densities are computed using statsmodels.nonparametric.kde.KDEUnivariate. Violins (densities) use more computing resources than boxplots. You have not posted a stack trace, so I cannot tell which part of the pipeline is choking, but a 70091 x 70091 array gives 70091 * 70091 * 64 / (8 * 1024**3) = 36.6 GiB. I do not think there is anything we can do about it in plotnine.

@CarlosGrohmann
Copy link
Contributor

Just to add that I tested this today. While a dataframe with shape (2903785, 3) cannot be plotted as violin_plot in plotnine, it can be in seaborn. Looking here I see that seaborn uses scipy.stats.gaussian_kde().

Here is the output from plotnine:

---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
/usr/lib/python3/dist-packages/IPython/core/formatters.py in __call__(self, obj)
    700                 type_pprinters=self.type_printers,
    701                 deferred_pprinters=self.deferred_printers)
--> 702             printer.pretty(obj)
    703             printer.flush()
    704             return stream.getvalue()

/usr/lib/python3/dist-packages/IPython/lib/pretty.py in pretty(self, obj)
    392                         if cls is not object \
    393                                 and callable(cls.__dict__.get('__repr__')):
--> 394                             return _repr_pprint(obj, self, cycle)
    395 
    396             return _default_pprint(obj, self, cycle)

/usr/lib/python3/dist-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
    682     """A pprint that just redirects to the normal repr function."""
    683     # Find newlines and replace them with p.break_()
--> 684     output = repr(obj)
    685     lines = output.splitlines()
    686     with p.group():

~/.local/lib/python3.8/site-packages/plotnine/ggplot.py in __repr__(self)
     86         # in the jupyter notebook.
     87         if not self.figure:
---> 88             self.draw()
     89         plt.show()
     90         return '<ggplot: (%d)>' % self.__hash__()

~/.local/lib/python3.8/site-packages/plotnine/ggplot.py in draw(self, return_ggplot)
    179         # new frames knowing that they are separate from the original.
    180         with pd.option_context('mode.chained_assignment', None):
--> 181             return self._draw(return_ggplot)
    182 
    183     def _draw(self, return_ggplot=False):

~/.local/lib/python3.8/site-packages/plotnine/ggplot.py in _draw(self, return_ggplot)
    186         # assign a default theme
    187         self = deepcopy(self)
--> 188         self._build()
    189 
    190         # If no theme we use the default

~/.local/lib/python3.8/site-packages/plotnine/ggplot.py in _build(self)
    297 
    298         # Apply and map statistics
--> 299         layers.compute_statistic(layout)
    300         layers.map_statistic(self)
    301 

~/.local/lib/python3.8/site-packages/plotnine/layer.py in compute_statistic(self, layout)
     83     def compute_statistic(self, layout):
     84         for l in self:
---> 85             l.compute_statistic(layout)
     86 
     87     def map_statistic(self, plot):

~/.local/lib/python3.8/site-packages/plotnine/layer.py in compute_statistic(self, layout)
    370         data = self.stat.use_defaults(data)
    371         data = self.stat.setup_data(data)
--> 372         data = self.stat.compute_layer(data, params, layout)
    373         self.data = data
    374 

~/.local/lib/python3.8/site-packages/plotnine/stats/stat.py in compute_layer(cls, data, params, layout)
    272             return cls.compute_panel(pdata, pscales, **params)
    273 
--> 274         return groupby_apply(data, 'PANEL', fn)
    275 
    276     @classmethod

~/.local/lib/python3.8/site-packages/plotnine/utils.py in groupby_apply(df, cols, func, *args, **kwargs)
    631         # function fn should be free to modify dataframe d, therefore
    632         # do not mark d as a slice of df i.e no SettingWithCopyWarning
--> 633         lst.append(func(d, *args, **kwargs))
    634     return pd.concat(lst, axis=axis, ignore_index=True)
    635 

~/.local/lib/python3.8/site-packages/plotnine/stats/stat.py in fn(pdata)
    270                 return pdata
    271             pscales = layout.get_scales(pdata['PANEL'].iat[0])
--> 272             return cls.compute_panel(pdata, pscales, **params)
    273 
    274         return groupby_apply(data, 'PANEL', fn)

~/.local/lib/python3.8/site-packages/plotnine/stats/stat_ydensity.py in compute_panel(cls, data, scales, **params)
    134     @classmethod
    135     def compute_panel(cls, data, scales, **params):
--> 136         data = super(cls, cls).compute_panel(data, scales, **params)
    137 
    138         if not len(data):

~/.local/lib/python3.8/site-packages/plotnine/stats/stat.py in compute_panel(cls, data, scales, **params)
    303         stats = []
    304         for _, old in data.groupby('group'):
--> 305             new = cls.compute_group(old, scales, **params)
    306             unique = uniquecols(old)
    307             missing = unique.columns.difference(new.columns)

~/.local/lib/python3.8/site-packages/plotnine/stats/stat_ydensity.py in compute_group(cls, data, scales, **params)
    166             range_y = scales.y.dimension()
    167 
--> 168         dens = compute_density(data['y'], weight, range_y, **params)
    169         dens['y'] = dens['x']
    170         dens['x'] = np.mean([data['x'].min(), data['x'].max()])

~/.local/lib/python3.8/site-packages/plotnine/stats/stat_density.py in compute_density(x, weight, range, **params)
    172         bw = nrd0(x)
    173     kde = sm.nonparametric.KDEUnivariate(x)
--> 174     kde.fit(
    175         kernel=params['kernel'],
    176         bw=bw,

~/.local/lib/python3.8/site-packages/statsmodels/nonparametric/kde.py in fit(self, kernel, bw, fft, weights, gridsize, adjust, cut, clip)
    176             )
    177         else:
--> 178             density, grid, bw = kdensity(
    179                 endog,
    180                 kernel=kernel,

~/.local/lib/python3.8/site-packages/statsmodels/nonparametric/kde.py in kdensity(x, kernel, bw, weights, gridsize, adjust, clip, cut, retgrid)
    423 
    424     k = (
--> 425         x.T - grid[:, None]
    426     ) / bw  # uses broadcasting to make a gridsize x nobs
    427 

MemoryError: Unable to allocate array with shape (442608, 442608) and data type float64

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

3 participants