diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 73eb6a15a1b47..700a7c0e72074 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -29,6 +29,7 @@ Other Enhancements - :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`) - :class:`RangeIndex` has gained :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop`, and :attr:`~RangeIndex.step` attributes (:issue:`25710`) - :class:`datetime.timezone` objects are now supported as arguments to timezone methods and constructors (:issue:`25065`) +- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now support quoting column names with backticks to refer to names with spaces (:issue:`6508`) .. _whatsnew_0250.api_breaking: diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index e7eca04e413c5..1e38919affcdd 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -1,9 +1,12 @@ import numpy as np -from pandas.compat import reduce +from pandas.compat import reduce, string_types import pandas as pd +# A token value Python's tokenizer probably will never use. 
+_BACKTICK_QUOTED_STRING = 100 + def _ensure_decoded(s): """ if we have bytes, decode them to unicode """ @@ -22,5 +25,14 @@ def _result_type_many(*arrays_and_dtypes): return reduce(np.result_type, arrays_and_dtypes) +def _remove_spaces_column_name(name): + """Check if name contains any spaces; if it does, each space is + replaced by an underscore and "_BACKTICK_QUOTED_STRING" is appended.""" + if not isinstance(name, string_types) or " " not in name: + return name + + return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING" + + class NameResolutionError(NameError): pass diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index d840bf6ae71a2..4ab34b7349af5 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -3,16 +3,20 @@ import ast from functools import partial +import itertools as it +import operator import tokenize import numpy as np -from pandas.compat import StringIO, lmap, reduce, string_types, zip +from pandas.compat import StringIO, lmap, map, reduce, string_types, zip import pandas as pd from pandas import compat from pandas.core import common as com from pandas.core.base import StringMixin +from pandas.core.computation.common import ( + _BACKTICK_QUOTED_STRING, _remove_spaces_column_name) from pandas.core.computation.ops import ( _LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp, UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms, @@ -31,7 +35,17 @@ def tokenize_string(source): A Python source code string """ line_reader = StringIO(source).readline - for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader): + token_generator = tokenize.generate_tokens(line_reader) + + # Loop over all tokens till a backtick (`) is found. + # Then, take all tokens till the next backtick to form a backtick quoted + # string. 
+ for toknum, tokval, _, _, _ in token_generator: + if tokval == '`': + tokval = " ".join(it.takewhile( + lambda tokval: tokval != '`', + map(operator.itemgetter(1), token_generator))) + toknum = _BACKTICK_QUOTED_STRING yield toknum, tokval @@ -102,6 +116,31 @@ def _replace_locals(tok): return toknum, tokval +def _clean_spaces_backtick_quoted_names(tok): + """Clean up a column name if surrounded by backticks. + + Backtick quoted strings are indicated by a certain toknum value. If a + string is a backtick quoted token it will be processed by + :func:`_remove_spaces_column_name` so that the parser can find this + string when the query is executed. + See also :meth:`NDFrame._get_space_character_free_column_resolvers`. + + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + t : tuple of int, str + Either the input token or the replacement values + """ + toknum, tokval = tok + if toknum == _BACKTICK_QUOTED_STRING: + return tokenize.NAME, _remove_spaces_column_name(tokval) + return toknum, tokval + + def _compose2(f, g): """Compose 2 callables""" return lambda *args, **kwargs: f(g(*args, **kwargs)) @@ -114,7 +153,8 @@ def _compose(*funcs): def _preparse(source, f=_compose(_replace_locals, _replace_booleans, - _rewrite_assign)): + _rewrite_assign, + _clean_spaces_backtick_quoted_names)): """Compose a collection of tokenization functions Parameters @@ -711,8 +751,9 @@ def visitor(x, y): class PandasExprVisitor(BaseExprVisitor): def __init__(self, env, engine, parser, - preparser=partial(_preparse, f=_compose(_replace_locals, - _replace_booleans))): + preparser=partial(_preparse, f=_compose( + _replace_locals, _replace_booleans, + _clean_spaces_backtick_quoted_names))): super(PandasExprVisitor, self).__init__(env, engine, parser, preparser) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b4f15905afc44..2dc885d198f48 100644 --- a/pandas/core/frame.py +++ 
b/pandas/core/frame.py @@ -2967,6 +2967,15 @@ def query(self, expr, inplace=False, **kwargs): The query string to evaluate. You can refer to variables in the environment by prefixing them with an '@' character like ``@a + b``. + + .. versionadded:: 0.25.0 + + You can refer to column names that contain spaces by surrounding + them in backticks. + + For example, if one of your columns is called ``a a`` and you want + to sum it with ``b``, your query should be ```a a` + b``. + inplace : bool Whether the query should modify the data in place or return a modified copy. @@ -3025,23 +3034,37 @@ def query(self, expr, inplace=False, **kwargs): Examples -------- - >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df = pd.DataFrame({'A': range(1, 6), + ... 'B': range(10, 0, -2), + ... 'C C': range(10, 5, -1)}) >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 + A B C C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 >>> df.query('A > B') - A B - 4 5 2 + A B C C + 4 5 2 6 The previous expression is equivalent to >>> df[df.A > df.B] - A B - 4 5 2 + A B C C + 4 5 2 6 + + For columns with spaces in their name, you can use backtick quoting. 
+ + >>> df.query('B == `C C`') + A B C C + 0 1 10 10 + + The previous expression is equivalent to + + >>> df[df.B == df['C C']] + A B C C + 0 1 10 10 """ inplace = validate_bool_kwarg(inplace, 'inplace') if not isinstance(expr, compat.string_types): @@ -3160,7 +3183,9 @@ def eval(self, expr, inplace=False, **kwargs): kwargs['level'] = kwargs.pop('level', 0) + 1 if resolvers is None: index_resolvers = self._get_index_resolvers() - resolvers = dict(self.iteritems()), index_resolvers + column_resolvers = \ + self._get_space_character_free_column_resolvers() + resolvers = column_resolvers, index_resolvers if 'target' not in kwargs: kwargs['target'] = self kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ac2ec40d6305d..f69ba51e59784 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -423,6 +423,18 @@ def _get_index_resolvers(self): d.update(self._get_axis_resolvers(axis_name)) return d + def _get_space_character_free_column_resolvers(self): + """Return the space character free column resolvers of a dataframe. + + Column names with spaces are 'cleaned up' so that they can be referred + to by backtick quoting. + Used in :meth:`DataFrame.eval`. 
+ """ + from pandas.core.computation.common import _remove_spaces_column_name + + return {_remove_spaces_column_name(k): v for k, v + in self.iteritems()} + @property def _info_axis(self): return getattr(self, self._info_axis_name) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index ba02cb54bcea1..a8a9a278a0ebb 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1031,3 +1031,54 @@ def test_invalid_type_for_operator_raises(self, parser, engine, op): with pytest.raises(TypeError, match=msg): df.eval('a {0} b'.format(op), engine=engine, parser=parser) + + +class TestDataFrameQueryBacktickQuoting(object): + + @pytest.fixture(scope='class') + def df(self): + yield DataFrame({'A': [1, 2, 3], + 'B B': [3, 2, 1], + 'C C': [4, 5, 6], + 'C_C': [8, 9, 10], + 'D_D D': [11, 1, 101]}) + + def test_single_backtick_variable_query(self, df): + res = df.query('1 < `B B`') + expect = df[1 < df['B B']] + assert_frame_equal(res, expect) + + def test_two_backtick_variables_query(self, df): + res = df.query('1 < `B B` and 4 < `C C`') + expect = df[(1 < df['B B']) & (4 < df['C C'])] + assert_frame_equal(res, expect) + + def test_single_backtick_variable_expr(self, df): + res = df.eval('A + `B B`') + expect = df['A'] + df['B B'] + assert_series_equal(res, expect) + + def test_two_backtick_variables_expr(self, df): + res = df.eval('`B B` + `C C`') + expect = df['B B'] + df['C C'] + assert_series_equal(res, expect) + + def test_already_underscore_variable(self, df): + res = df.eval('`C_C` + A') + expect = df['C_C'] + df['A'] + assert_series_equal(res, expect) + + def test_same_name_but_underscores(self, df): + res = df.eval('C_C + `C C`') + expect = df['C_C'] + df['C C'] + assert_series_equal(res, expect) + + def test_mixed_underscores_and_spaces(self, df): + res = df.eval('A + `D_D D`') + expect = df['A'] + df['D_D D'] + assert_series_equal(res, expect) + + def 
test_backtick_quote_name_with_no_spaces(self, df): + res = df.eval('A + `C_C`') + expect = df['A'] + df['C_C'] + assert_series_equal(res, expect)