Skip to content

Commit

Permalink
ENH: Quoting column names containing spaces with backticks to use the…
Browse files Browse the repository at this point in the history
…m in query and eval. (#24955)
  • Loading branch information
hwalinga authored and jreback committed Mar 20, 2019
1 parent 6e979d8 commit 02ada08
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 18 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ Other Enhancements
- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
- :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`)
- :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`)
- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now support quoting column names with backticks to refer to names with spaces (:issue:`6508`)

.. _whatsnew_0250.api_breaking:

Expand Down
14 changes: 13 additions & 1 deletion pandas/core/computation/common.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import numpy as np

from pandas.compat import reduce
from pandas.compat import reduce, string_types

import pandas as pd

# Sentinel token number used to tag a backtick-quoted name in a token stream.
# The value is chosen as one Python's tokenizer will probably never use, so
# that it is not mistaken for a token type produced by the tokenizer itself.
_BACKTICK_QUOTED_STRING = 100


def _ensure_decoded(s):
""" if we have bytes, decode them to unicode """
Expand All @@ -22,5 +25,14 @@ def _result_type_many(*arrays_and_dtypes):
return reduce(np.result_type, arrays_and_dtypes)


def _remove_spaces_column_name(name):
"""Check if name contains any spaces, if it contains any spaces
the spaces will be removed and an underscore suffix is added."""
if not isinstance(name, string_types) or " " not in name:
return name

return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING"


class NameResolutionError(NameError):
    """A ``NameError`` subclass that adds no behaviour of its own."""
51 changes: 46 additions & 5 deletions pandas/core/computation/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,20 @@

import ast
from functools import partial
import itertools as it
import operator
import tokenize

import numpy as np

from pandas.compat import StringIO, lmap, reduce, string_types, zip
from pandas.compat import StringIO, lmap, map, reduce, string_types, zip

import pandas as pd
from pandas import compat
from pandas.core import common as com
from pandas.core.base import StringMixin
from pandas.core.computation.common import (
_BACKTICK_QUOTED_STRING, _remove_spaces_column_name)
from pandas.core.computation.ops import (
_LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp,
UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms,
Expand All @@ -31,7 +35,17 @@ def tokenize_string(source):
A Python source code string
"""
line_reader = StringIO(source).readline
for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader):
token_generator = tokenize.generate_tokens(line_reader)

# Loop over all tokens till a backtick (`) is found.
# Then, take all tokens till the next backtick to form a backtick quoted
# string.
for toknum, tokval, _, _, _ in token_generator:
if tokval == '`':
tokval = " ".join(it.takewhile(
lambda tokval: tokval != '`',
map(operator.itemgetter(1), token_generator)))
toknum = _BACKTICK_QUOTED_STRING
yield toknum, tokval


Expand Down Expand Up @@ -102,6 +116,31 @@ def _replace_locals(tok):
return toknum, tokval


def _clean_spaces_backtick_quoted_names(tok):
    """Turn a backtick-quoted token into an ordinary name token.

    A token whose number equals ``_BACKTICK_QUOTED_STRING`` is rewritten as
    a ``tokenize.NAME`` token whose value has been passed through
    :func:`_remove_spaces_column_name`. Every other token is returned as
    it came in.

    See also :meth:`NDFrame._get_space_character_free_column_resolver`.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    t : tuple of int, str
        Either the input token or the replacement values
    """
    toknum, tokval = tok
    if toknum != _BACKTICK_QUOTED_STRING:
        # Not a backtick-quoted name: nothing to rewrite.
        return toknum, tokval

    return tokenize.NAME, _remove_spaces_column_name(tokval)


def _compose2(f, g):
"""Compose 2 callables"""
return lambda *args, **kwargs: f(g(*args, **kwargs))
Expand All @@ -114,7 +153,8 @@ def _compose(*funcs):


def _preparse(source, f=_compose(_replace_locals, _replace_booleans,
_rewrite_assign)):
_rewrite_assign,
_clean_spaces_backtick_quoted_names)):
"""Compose a collection of tokenization functions
Parameters
Expand Down Expand Up @@ -711,8 +751,9 @@ def visitor(x, y):
class PandasExprVisitor(BaseExprVisitor):

def __init__(self, env, engine, parser,
preparser=partial(_preparse, f=_compose(_replace_locals,
_replace_booleans))):
preparser=partial(_preparse, f=_compose(
_replace_locals, _replace_booleans,
_clean_spaces_backtick_quoted_names))):
super(PandasExprVisitor, self).__init__(env, engine, parser, preparser)


Expand Down
49 changes: 37 additions & 12 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2967,6 +2967,15 @@ def query(self, expr, inplace=False, **kwargs):
The query string to evaluate. You can refer to variables
in the environment by prefixing them with an '@' character like
``@a + b``.
.. versionadded:: 0.25.0
You can refer to column names that contain spaces by surrounding
them in backticks.
For example, if one of your columns is called ``a a`` and you want
to sum it with ``b``, your query should be ```a a` + b``.
inplace : bool
Whether the query should modify the data in place or return
a modified copy.
Expand Down Expand Up @@ -3025,23 +3034,37 @@ def query(self, expr, inplace=False, **kwargs):
Examples
--------
>>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
>>> df = pd.DataFrame({'A': range(1, 6),
... 'B': range(10, 0, -2),
... 'C C': range(10, 5, -1)})
>>> df
A B
0 1 10
1 2 8
2 3 6
3 4 4
4 5 2
A B C C
0 1 10 10
1 2 8 9
2 3 6 8
3 4 4 7
4 5 2 6
>>> df.query('A > B')
A B
4 5 2
A B C C
4 5 2 6
The previous expression is equivalent to
>>> df[df.A > df.B]
A B
4 5 2
A B C C
4 5 2 6
For columns with spaces in their name, you can use backtick quoting.
>>> df.query('B == `C C`')
A B C C
0 1 10 10
The previous expression is equivalent to
>>> df[df.B == df['C C']]
A B C C
0 1 10 10
"""
inplace = validate_bool_kwarg(inplace, 'inplace')
if not isinstance(expr, compat.string_types):
Expand Down Expand Up @@ -3160,7 +3183,9 @@ def eval(self, expr, inplace=False, **kwargs):
kwargs['level'] = kwargs.pop('level', 0) + 1
if resolvers is None:
index_resolvers = self._get_index_resolvers()
resolvers = dict(self.iteritems()), index_resolvers
column_resolvers = \
self._get_space_character_free_column_resolvers()
resolvers = column_resolvers, index_resolvers
if 'target' not in kwargs:
kwargs['target'] = self
kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)
Expand Down
12 changes: 12 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,18 @@ def _get_index_resolvers(self):
d.update(self._get_axis_resolvers(axis_name))
return d

def _get_space_character_free_column_resolvers(self):
    """Build the column resolvers with space-free keys.

    Each ``(name, value)`` pair from ``self.iteritems()`` is stored under
    the name returned by ``_remove_spaces_column_name``, so that columns
    whose names contain spaces can be referred to by backtick quoting.
    If two names map to the same key, the later pair replaces the earlier.

    Used in :meth:`DataFrame.eval`.
    """
    from pandas.core.computation.common import _remove_spaces_column_name

    resolvers = {}
    for label, values in self.iteritems():
        resolvers[_remove_spaces_column_name(label)] = values
    return resolvers

@property
def _info_axis(self):
return getattr(self, self._info_axis_name)
Expand Down
51 changes: 51 additions & 0 deletions pandas/tests/frame/test_query_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -1031,3 +1031,54 @@ def test_invalid_type_for_operator_raises(self, parser, engine, op):

with pytest.raises(TypeError, match=msg):
df.eval('a {0} b'.format(op), engine=engine, parser=parser)


class TestDataFrameQueryBacktickQuoting(object):
    """Check backtick quoting of column names in ``query`` and ``eval``."""

    @pytest.fixture(scope='class')
    def df(self):
        # Mixes a plain name, names containing spaces, and 'C_C', which is
        # what 'C C' looks like once its space is replaced by an underscore.
        yield DataFrame({'A': [1, 2, 3],
                         'B B': [3, 2, 1],
                         'C C': [4, 5, 6],
                         'C_C': [8, 9, 10],
                         'D_D D': [11, 1, 101]})

    def test_single_backtick_variable_query(self, df):
        res = df.query('1 < `B B`')
        expect = df[1 < df['B B']]
        assert_frame_equal(res, expect)

    def test_two_backtick_variables_query(self, df):
        res = df.query('1 < `B B` and 4 < `C C`')
        expect = df[(1 < df['B B']) & (4 < df['C C'])]
        assert_frame_equal(res, expect)

    def test_single_backtick_variable_expr(self, df):
        res = df.eval('A + `B B`')
        expect = df['A'] + df['B B']
        assert_series_equal(res, expect)

    def test_two_backtick_variables_expr(self, df):
        res = df.eval('`B B` + `C C`')
        expect = df['B B'] + df['C C']
        assert_series_equal(res, expect)

    def test_already_underscore_variable(self, df):
        res = df.eval('`C_C` + A')
        expect = df['C_C'] + df['A']
        assert_series_equal(res, expect)

    def test_same_name_but_underscores(self, df):
        # 'C_C' and 'C C' must resolve to two different columns.
        res = df.eval('C_C + `C C`')
        expect = df['C_C'] + df['C C']
        assert_series_equal(res, expect)

    def test_mixed_underscores_and_spaces(self, df):
        res = df.eval('A + `D_D D`')
        expect = df['A'] + df['D_D D']
        assert_series_equal(res, expect)

    def test_backtick_quote_name_with_no_spaces(self, df):
        # Carries the ``test_`` prefix so that pytest collects it; without
        # the prefix this check was silently never run.
        res = df.eval('A + `C_C`')
        expect = df['A'] + df['C_C']
        assert_series_equal(res, expect)

0 comments on commit 02ada08

Please sign in to comment.