From ff463cac5a17c7518c12e68e24fa4e6ff4d1383b Mon Sep 17 00:00:00 2001 From: Hielke Walinga Date: Sat, 26 Jan 2019 21:59:10 +0100 Subject: [PATCH 01/11] TST: Add tests for backtick quoting (#6508) --- pandas/tests/frame/test_query_eval.py | 31 +++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 0d06d0006a9e2..6b3a1058f65db 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1030,3 +1030,34 @@ def test_invalid_type_for_operator_raises(self, parser, engine, op): with pytest.raises(TypeError, match=msg): df.eval('a {0} b'.format(op), engine=engine, parser=parser) + + +class TestDataFrameQueryBacktickQuoting(object): + + def setup_method(self, method): + self.df = DataFrame({'A': [1, 2, 3], + 'B B': [3, 2, 1], + 'C C': [4, 5, 6]}) + + def teardown_method(self, method): + del self.df + + def test_single_backtick_variable_query(self): + res = self.df.query('1 < `B B`') + expect = self.df[1 < self.df['B B']] + assert_frame_equal(res, expect) + + def test_two_backtick_variables_query(self): + res = self.df.query('1 < `B B` and 4 < `C C`') + expect = self.df[(1 < self.df['B B']) & (4 < self.df['C C'])] + assert_frame_equal(res, expect) + + def test_single_backtick_variable_expr(self): + res = self.df.eval('A + `B B`') + expect = self.df['A'] + self.df['B B'] + assert_series_equal(res, expect) + + def test_two_backtick_variables_expr(self): + res = self.df.eval('`B B` + `C C`') + expect = self.df['B B'] + self.df['C C'] + assert_series_equal(res, expect) From db9c769ba388a3630641ee0ea03979c5bdc2df63 Mon Sep 17 00:00:00 2001 From: Hielke Walinga Date: Sat, 26 Jan 2019 23:01:43 +0100 Subject: [PATCH 02/11] Update docstring query about quoting backtick variables --- pandas/core/frame.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 
cf97c94f6d129..7951e4938f845 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2965,7 +2965,8 @@ def query(self, expr, inplace=False, **kwargs): expr : str The query string to evaluate. You can refer to variables in the environment by prefixing them with an '@' character like - ``@a + b``. + ``@a + b``. You can refer to column names with spaces by quoting + them in backticks like ```a a` + b``. inplace : bool Whether the query should modify the data in place or return a modified copy. From 22686fda76529f9585576e77a1dc48f30f5b0d52 Mon Sep 17 00:00:00 2001 From: Hielke Walinga Date: Fri, 15 Feb 2019 22:44:22 +0100 Subject: [PATCH 03/11] Fixed whatsnew entry --- doc/source/whatsnew/v0.25.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 95362521f3b9f..461d7199815f9 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -20,6 +20,7 @@ Other Enhancements ^^^^^^^^^^^^^^^^^^ - :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`) +- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now support quoting column names with backticks to refer to names with spaces (:issue:`6508`) - - From bfebb9dc690c2e7a60d12c20c2938eae46dc0953 Mon Sep 17 00:00:00 2001 From: Hielke Walinga Date: Sat, 16 Feb 2019 00:52:44 +0100 Subject: [PATCH 04/11] Backtick quotes are now tokenized. 
More tests and pytest fixtures --- pandas/core/computation/common.py | 14 ++++++- pandas/core/computation/expr.py | 44 ++++++++++++++++++-- pandas/core/frame.py | 9 +++- pandas/tests/frame/test_query_eval.py | 60 ++++++++++++++++++--------- 4 files changed, 101 insertions(+), 26 deletions(-) diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index e7eca04e413c5..e2209e424b5f9 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -1,10 +1,14 @@ import numpy as np -from pandas.compat import reduce +from pandas.compat import reduce, string_types import pandas as pd +# A token value Python's tokenizer probably will never use. +_BACKTICK_QUOTED_STRING = 100 + + def _ensure_decoded(s): """ if we have bytes, decode them to unicode """ if isinstance(s, (np.bytes_, bytes)): @@ -22,5 +26,13 @@ def _result_type_many(*arrays_and_dtypes): return reduce(np.result_type, arrays_and_dtypes) +def clean_column_name_with_spaces(name): + """Check if name contains any spaces, if it contains any spaces + the spaces will be removed and an underscore suffix is added.""" + if not isinstance(name, string_types) or " " not in name: + return name + return "_BACKTICK_QUOTED_STRING_" + name.replace(" ", "_") + + class NameResolutionError(NameError): pass diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index d840bf6ae71a2..298166df39f16 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -3,6 +3,8 @@ import ast from functools import partial +import itertools as it +import operator as op import tokenize import numpy as np @@ -13,6 +15,8 @@ from pandas import compat from pandas.core import common as com from pandas.core.base import StringMixin +from pandas.core.computation.common import ( + _BACKTICK_QUOTED_STRING, clean_column_name_with_spaces) from pandas.core.computation.ops import ( _LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp, 
UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms, @@ -31,7 +35,13 @@ def tokenize_string(source): A Python source code string """ line_reader = StringIO(source).readline - for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader): + token_generator = tokenize.generate_tokens(line_reader) + for toknum, tokval, _, _, _ in token_generator: + if tokval == '`': + tokval = " ".join(it.takewhile( + lambda tokval: tokval != '`', + map(op.itemgetter(1), token_generator))) + toknum = _BACKTICK_QUOTED_STRING yield toknum, tokval @@ -102,6 +112,30 @@ def _replace_locals(tok): return toknum, tokval +def _clean_spaces_backtick_quoted_names(tok): + """Clean up a column name if surrounded by backticks. + + Backtick quoted string are indicated by a certain tokval value. If a string + is a backtick quoted token it will processed by + :func:`clean_column_name_with_spaces` so that the parser can find this + string when the query is executed. See also :meth:`DataFrame.eval`. 
+ + Parameters + ---------- + tok : tuple of int, str + ints correspond to the all caps constants in the tokenize module + + Returns + ------- + t : tuple of int, str + Either the input token or the replacement values + """ + toknum, tokval = tok + if toknum == _BACKTICK_QUOTED_STRING: + return tokenize.NAME, clean_column_name_with_spaces(tokval) + return toknum, tokval + + def _compose2(f, g): """Compose 2 callables""" return lambda *args, **kwargs: f(g(*args, **kwargs)) @@ -114,7 +148,8 @@ def _compose(*funcs): def _preparse(source, f=_compose(_replace_locals, _replace_booleans, - _rewrite_assign)): + _rewrite_assign, + _clean_spaces_backtick_quoted_names)): """Compose a collection of tokenization functions Parameters @@ -711,8 +746,9 @@ def visitor(x, y): class PandasExprVisitor(BaseExprVisitor): def __init__(self, env, engine, parser, - preparser=partial(_preparse, f=_compose(_replace_locals, - _replace_booleans))): + preparser=partial(_preparse, f=_compose( + _replace_locals, _replace_booleans, + _clean_spaces_backtick_quoted_names))): super(PandasExprVisitor, self).__init__(env, engine, parser, preparser) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7951e4938f845..c5060ab195ea5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -36,6 +36,7 @@ PY36, raise_with_traceback, string_and_binary_types) from pandas.compat.numpy import function as nv +from pandas.core.computation.common import clean_column_name_with_spaces from pandas.core.dtypes.cast import ( maybe_upcast, cast_scalar_to_array, @@ -3160,7 +3161,13 @@ def eval(self, expr, inplace=False, **kwargs): kwargs['level'] = kwargs.pop('level', 0) + 1 if resolvers is None: index_resolvers = self._get_index_resolvers() - resolvers = dict(self.iteritems()), index_resolvers + # column names with spaces are altered so that they can be referred + # to by backtick quoting. 
+ # Also see _clean_spaces_backtick_quoted_names from + # pandas/core/computation/expr.py + column_resolvers = {clean_column_name_with_spaces(k): v + for k, v in self.iteritems()} + resolvers = column_resolvers, index_resolvers if 'target' not in kwargs: kwargs['target'] = self kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 6b3a1058f65db..6dbefa89c7dd7 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1034,30 +1034,50 @@ def test_invalid_type_for_operator_raises(self, parser, engine, op): class TestDataFrameQueryBacktickQuoting(object): - def setup_method(self, method): - self.df = DataFrame({'A': [1, 2, 3], - 'B B': [3, 2, 1], - 'C C': [4, 5, 6]}) - - def teardown_method(self, method): - del self.df - - def test_single_backtick_variable_query(self): - res = self.df.query('1 < `B B`') - expect = self.df[1 < self.df['B B']] + @pytest.fixture(scope='class') + def df(self): + yield DataFrame({'A': [1, 2, 3], + 'B B': [3, 2, 1], + 'C C': [4, 5, 6], + 'C_C': [8, 9, 10], + 'D_D D': [11, 1, 101]}) + + def test_single_backtick_variable_query(self, df): + res = df.query('1 < `B B`') + expect = df[1 < df['B B']] assert_frame_equal(res, expect) - def test_two_backtick_variables_query(self): - res = self.df.query('1 < `B B` and 4 < `C C`') - expect = self.df[(1 < self.df['B B']) & (4 < self.df['C C'])] + def test_two_backtick_variables_query(self, df): + res = df.query('1 < `B B` and 4 < `C C`') + expect = df[(1 < df['B B']) & (4 < df['C C'])] assert_frame_equal(res, expect) - def test_single_backtick_variable_expr(self): - res = self.df.eval('A + `B B`') - expect = self.df['A'] + self.df['B B'] + def test_single_backtick_variable_expr(self, df): + res = df.eval('A + `B B`') + expect = df['A'] + df['B B'] + assert_series_equal(res, expect) + + def test_two_backtick_variables_expr(self, df): + res = df.eval('`B 
B` + `C C`') + expect = df['B B'] + df['C C'] + assert_series_equal(res, expect) + + def test_already_underscore_variable(self, df): + res = df.eval('`C_C` + A') + expect = df['C_C'] + df['A'] + assert_series_equal(res, expect) + + def test_same_name_but_underscores(self, df): + res = df.eval('C_C + `C C`') + expect = df['C_C'] + df['C C'] + assert_series_equal(res, expect) + + def test_mixed_underscores_and_spaces(self, df): + res = df.eval('A + `D_D D`') + expect = df['A'] + df['D_D D'] assert_series_equal(res, expect) - def test_two_backtick_variables_expr(self): - res = self.df.eval('`B B` + `C C`') - expect = self.df['B B'] + self.df['C C'] + def backtick_quote_name_with_no_spaces(self, df): + res = df.eval('A + `C_C`') + expect = df['A'] + df['C_C'] assert_series_equal(res, expect) From a65f5a52edd18b3c4a5741019f7813042bd19e28 Mon Sep 17 00:00:00 2001 From: Hielke Walinga Date: Sat, 16 Feb 2019 02:04:41 +0100 Subject: [PATCH 05/11] Use compat.map; No import alias (operator) to prevent name shadowing --- pandas/core/computation/expr.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 298166df39f16..4b2990a3d15ee 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -4,12 +4,12 @@ import ast from functools import partial import itertools as it -import operator as op +import operator import tokenize import numpy as np -from pandas.compat import StringIO, lmap, reduce, string_types, zip +from pandas.compat import StringIO, lmap, reduce, string_types, zip, map import pandas as pd from pandas import compat @@ -40,7 +40,7 @@ def tokenize_string(source): if tokval == '`': tokval = " ".join(it.takewhile( lambda tokval: tokval != '`', - map(op.itemgetter(1), token_generator))) + map(operator.itemgetter(1), token_generator))) toknum = _BACKTICK_QUOTED_STRING yield toknum, tokval @@ -174,7 +174,9 @@ def _preparse(source, 
f=_compose(_replace_locals, _replace_booleans, the ``tokenize`` module and ``tokval`` is a string. """ assert callable(f), 'f must be callable' - return tokenize.untokenize(lmap(f, tokenize_string(source))) + source = tokenize.untokenize(lmap(f, tokenize_string(source))) + print(source) + return source def _is_type(t): From da6095504867f3c80b0ef987291262e01b7bfe7d Mon Sep 17 00:00:00 2001 From: Hielke Walinga Date: Sat, 16 Feb 2019 12:18:00 +0100 Subject: [PATCH 06/11] Fix import order; Remove debug print; --- pandas/core/computation/common.py | 1 - pandas/core/computation/expr.py | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index e2209e424b5f9..56bb2f383408b 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -4,7 +4,6 @@ import pandas as pd - # A token value Python's tokenizer probably will never use. _BACKTICK_QUOTED_STRING = 100 diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 4b2990a3d15ee..fd8590e76b9cc 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -9,7 +9,7 @@ import numpy as np -from pandas.compat import StringIO, lmap, reduce, string_types, zip, map +from pandas.compat import StringIO, lmap, map, reduce, string_types, zip import pandas as pd from pandas import compat @@ -174,9 +174,7 @@ def _preparse(source, f=_compose(_replace_locals, _replace_booleans, the ``tokenize`` module and ``tokval`` is a string. """ assert callable(f), 'f must be callable' - source = tokenize.untokenize(lmap(f, tokenize_string(source))) - print(source) - return source + return tokenize.untokenize(lmap(f, tokenize_string(source))) def _is_type(t): From 212506887f955b9a9ddc8a97ba89bcd56f74187b Mon Sep 17 00:00:00 2001 From: Hielke Walinga Date: Sun, 24 Feb 2019 23:59:46 +0100 Subject: [PATCH 07/11] Add 'versionadded' and move column resolvers logic to common.py. 
--- pandas/core/computation/common.py | 14 +++++++++++++- pandas/core/computation/expr.py | 9 +++++---- pandas/core/frame.py | 17 +++++++++-------- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index 56bb2f383408b..7f45251af103d 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -25,7 +25,7 @@ def _result_type_many(*arrays_and_dtypes): return reduce(np.result_type, arrays_and_dtypes) -def clean_column_name_with_spaces(name): +def _clean_column_name_with_spaces(name): """Check if name contains any spaces, if it contains any spaces the spaces will be removed and an underscore suffix is added.""" if not isinstance(name, string_types) or " " not in name: @@ -33,5 +33,17 @@ def clean_column_name_with_spaces(name): return "_BACKTICK_QUOTED_STRING_" + name.replace(" ", "_") +def _get_column_resolvers(dataFrame): + """Return the axis resolvers of a dataframe. + + Column names with spaces are 'cleaned up' so that they can be referred to + by backtick quoting. 
See also :func:`_clean_spaces_backtick_quoted_names` + from :mod:`pandas.core.computation` + """ + + return {_clean_column_name_with_spaces(k): v for k, v + in dataFrame.iteritems()} + + class NameResolutionError(NameError): pass diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index fd8590e76b9cc..2aacfd56ece21 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -16,7 +16,7 @@ from pandas.core import common as com from pandas.core.base import StringMixin from pandas.core.computation.common import ( - _BACKTICK_QUOTED_STRING, clean_column_name_with_spaces) + _BACKTICK_QUOTED_STRING, _clean_column_name_with_spaces) from pandas.core.computation.ops import ( _LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp, UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms, @@ -117,8 +117,9 @@ def _clean_spaces_backtick_quoted_names(tok): Backtick quoted string are indicated by a certain tokval value. If a string is a backtick quoted token it will processed by - :func:`clean_column_name_with_spaces` so that the parser can find this - string when the query is executed. See also :meth:`DataFrame.eval`. + :func:`_clean_column_name_with_spaces` so that the parser can find this + string when the query is executed. See also :func:`_get_column_resolvers` + used in :meth:`DataFrame.eval`. 
Parameters ---------- @@ -132,7 +133,7 @@ def _clean_spaces_backtick_quoted_names(tok): """ toknum, tokval = tok if toknum == _BACKTICK_QUOTED_STRING: - return tokenize.NAME, clean_column_name_with_spaces(tokval) + return tokenize.NAME, _clean_column_name_with_spaces(tokval) return toknum, tokval diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c5060ab195ea5..2089b7a63ba07 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -36,7 +36,6 @@ PY36, raise_with_traceback, string_and_binary_types) from pandas.compat.numpy import function as nv -from pandas.core.computation.common import clean_column_name_with_spaces from pandas.core.dtypes.cast import ( maybe_upcast, cast_scalar_to_array, @@ -2966,8 +2965,13 @@ def query(self, expr, inplace=False, **kwargs): expr : str The query string to evaluate. You can refer to variables in the environment by prefixing them with an '@' character like - ``@a + b``. You can refer to column names with spaces by quoting + ``@a + b``. + + .. versionadded:: 0.25.0 + + You can refer to column names that contain spaces by surrounding them in backticks like ```a a` + b``. + inplace : bool Whether the query should modify the data in place or return a modified copy. @@ -3160,13 +3164,10 @@ def eval(self, expr, inplace=False, **kwargs): resolvers = kwargs.pop('resolvers', None) kwargs['level'] = kwargs.pop('level', 0) + 1 if resolvers is None: + from pandas.core.computation.common import _get_column_resolvers + index_resolvers = self._get_index_resolvers() - # column names with spaces are altered so that they can be referred - # to by backtick quoting. 
- # Also see _clean_spaces_backtick_quoted_names from - # pandas/core/computation/expr.py - column_resolvers = {clean_column_name_with_spaces(k): v - for k, v in self.iteritems()} + column_resolvers = _get_column_resolvers(self) resolvers = column_resolvers, index_resolvers if 'target' not in kwargs: kwargs['target'] = self From 63c25bfe0f0dff747b4975428930dd722f80268c Mon Sep 17 00:00:00 2001 From: Hielke Walinga Date: Sun, 10 Mar 2019 19:13:07 +0100 Subject: [PATCH 08/11] More clarity in comments; Moved column resolver to class; Use uuid --- pandas/core/computation/common.py | 19 ++++++------------- pandas/core/computation/expr.py | 14 +++++++++----- pandas/core/frame.py | 10 ++++++---- pandas/core/generic.py | 12 ++++++++++++ 4 files changed, 33 insertions(+), 22 deletions(-) diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index 7f45251af103d..3adb25db13b8d 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -1,3 +1,5 @@ +import uuid + import numpy as np from pandas.compat import reduce, string_types @@ -25,24 +27,15 @@ def _result_type_many(*arrays_and_dtypes): return reduce(np.result_type, arrays_and_dtypes) -def _clean_column_name_with_spaces(name): +def _remove_spaces_column_name(name): """Check if name contains any spaces, if it contains any spaces the spaces will be removed and an underscore suffix is added.""" if not isinstance(name, string_types) or " " not in name: return name - return "_BACKTICK_QUOTED_STRING_" + name.replace(" ", "_") - - -def _get_column_resolvers(dataFrame): - """Return the axis resolvers of a dataframe. - - Column names with spaces are 'cleaned up' so that they can be referred to - by backtick quoting. See also :func:`_clean_spaces_backtick_quoted_names` - from :mod:`pandas.core.computation` - """ - return {_clean_column_name_with_spaces(k): v for k, v - in dataFrame.iteritems()} + # uuid3 will provide a unique string that can be independently reproduced. 
+ return name.replace(" ", "_") + "_" + \ + str(uuid.uuid3(uuid.NAMESPACE_DNS, name)).replace("-", "") class NameResolutionError(NameError): diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 2aacfd56ece21..4ab34b7349af5 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -16,7 +16,7 @@ from pandas.core import common as com from pandas.core.base import StringMixin from pandas.core.computation.common import ( - _BACKTICK_QUOTED_STRING, _clean_column_name_with_spaces) + _BACKTICK_QUOTED_STRING, _remove_spaces_column_name) from pandas.core.computation.ops import ( _LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp, UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms, @@ -36,6 +36,10 @@ def tokenize_string(source): """ line_reader = StringIO(source).readline token_generator = tokenize.generate_tokens(line_reader) + + # Loop over all tokens till a backtick (`) is found. + # Then, take all tokens till the next backtick to form a backtick quoted + # string. for toknum, tokval, _, _, _ in token_generator: if tokval == '`': tokval = " ".join(it.takewhile( @@ -117,9 +121,9 @@ def _clean_spaces_backtick_quoted_names(tok): Backtick quoted string are indicated by a certain tokval value. If a string is a backtick quoted token it will processed by - :func:`_clean_column_name_with_spaces` so that the parser can find this - string when the query is executed. See also :func:`_get_column_resolvers` - used in :meth:`DataFrame.eval`. + :func:`_remove_spaces_column_name` so that the parser can find this + string when the query is executed. + See also :meth:`NDFrame._get_space_character_free_column_resolver`. 
Parameters ---------- @@ -133,7 +137,7 @@ def _clean_spaces_backtick_quoted_names(tok): """ toknum, tokval = tok if toknum == _BACKTICK_QUOTED_STRING: - return tokenize.NAME, _clean_column_name_with_spaces(tokval) + return tokenize.NAME, _remove_spaces_column_name(tokval) return toknum, tokval diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 266558f949c74..58f6173d8889c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2971,7 +2971,10 @@ def query(self, expr, inplace=False, **kwargs): .. versionadded:: 0.25.0 You can refer to column names that contain spaces by surrounding - them in backticks like ```a a` + b``. + them in backticks. + + For example, if one of your columns is called ``a a`` and you want + to sum it with ``b``, your query should be ```a a` + b``. inplace : bool Whether the query should modify the data in place or return @@ -3165,10 +3168,9 @@ def eval(self, expr, inplace=False, **kwargs): resolvers = kwargs.pop('resolvers', None) kwargs['level'] = kwargs.pop('level', 0) + 1 if resolvers is None: - from pandas.core.computation.common import _get_column_resolvers - index_resolvers = self._get_index_resolvers() - column_resolvers = _get_column_resolvers(self) + column_resolvers = \ + self._get_space_character_free_column_resolvers() resolvers = column_resolvers, index_resolvers if 'target' not in kwargs: kwargs['target'] = self diff --git a/pandas/core/generic.py b/pandas/core/generic.py index eb84a9a5810f4..507a8212b28da 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -38,6 +38,7 @@ import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com +from pandas.core.computation.common import _remove_spaces_column_name from pandas.core.index import ( Index, InvalidIndexError, MultiIndex, RangeIndex, ensure_index) from pandas.core.indexes.datetimes import DatetimeIndex @@ -423,6 +424,17 @@ def _get_index_resolvers(self): 
d.update(self._get_axis_resolvers(axis_name)) return d + def _get_space_character_free_column_resolvers(self): + """Return the space character free column resolvers of a dataframe. + + Column names with spaces are 'cleaned up' so that they can be referred + to by backtick quoting. + Used in :meth:`DataFrame.eval`. + """ + + return {_remove_spaces_column_name(k): v for k, v + in self.iteritems()} + @property def _info_axis(self): return getattr(self, self._info_axis_name) From e496671fe55cd30d6e0b8992b2be127ec561ae9b Mon Sep 17 00:00:00 2001 From: Hielke Walinga Date: Sun, 10 Mar 2019 21:10:23 +0100 Subject: [PATCH 09/11] uuid3 python2/3 compatible function added --- pandas/compat/__init__.py | 6 ++++++ pandas/core/computation/common.py | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 4036af85b7212..183dd42bf922a 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -38,6 +38,7 @@ import inspect from collections import namedtuple import collections +import uuid PY2 = sys.version_info[0] == 2 PY3 = sys.version_info[0] >= 3 @@ -115,6 +116,7 @@ def get_range_parameters(data): reduce = functools.reduce long = int unichr = chr + uuid3 = uuid.uuid3 # This was introduced in Python 3.3, but we don't support # Python 3.x < 3.5, so checking PY3 is safe. 
@@ -162,6 +164,10 @@ def bytes_to_str(b, encoding='ascii'): def signature(f): return inspect.getargspec(f) + # See also: https://bugs.python.org/issue34145 + def uuid3(namespace, name): + return uuid.uuid3(namespace, name.encode('utf-8')) + def get_range_parameters(data): """Gets the start, stop, and step parameters from a range object""" # seems we only have indexing ops to infer diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index 3adb25db13b8d..8cc4e07a91583 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -2,7 +2,7 @@ import numpy as np -from pandas.compat import reduce, string_types +from pandas.compat import reduce, string_types, uuid3 import pandas as pd @@ -34,8 +34,8 @@ def _remove_spaces_column_name(name): return name # uuid3 will provide a unique string that can be independently reproduced. - return name.replace(" ", "_") + "_" + \ - str(uuid.uuid3(uuid.NAMESPACE_DNS, name)).replace("-", "") + return name.replace(" ", "_") + "_" + str(uuid3( + uuid.NAMESPACE_DNS, name)).replace("-", "") class NameResolutionError(NameError): From d3877d16d3aa7964c939356a7912418badd458b2 Mon Sep 17 00:00:00 2001 From: Hielke Walinga Date: Sun, 10 Mar 2019 23:38:41 +0100 Subject: [PATCH 10/11] Reverted uuid3 --- pandas/compat/__init__.py | 6 ------ pandas/core/computation/common.py | 8 ++------ 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 183dd42bf922a..4036af85b7212 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -38,7 +38,6 @@ import inspect from collections import namedtuple import collections -import uuid PY2 = sys.version_info[0] == 2 PY3 = sys.version_info[0] >= 3 @@ -116,7 +115,6 @@ def get_range_parameters(data): reduce = functools.reduce long = int unichr = chr - uuid3 = uuid.uuid3 # This was introduced in Python 3.3, but we don't support # Python 3.x < 3.5, so checking PY3 is safe. 
@@ -164,10 +162,6 @@ def bytes_to_str(b, encoding='ascii'): def signature(f): return inspect.getargspec(f) - # See also: https://bugs.python.org/issue34145 - def uuid3(namespace, name): - return uuid.uuid3(namespace, name.encode('utf-8')) - def get_range_parameters(data): """Gets the start, stop, and step parameters from a range object""" # seems we only have indexing ops to infer diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index 8cc4e07a91583..1e38919affcdd 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -1,8 +1,6 @@ -import uuid - import numpy as np -from pandas.compat import reduce, string_types, uuid3 +from pandas.compat import reduce, string_types import pandas as pd @@ -33,9 +31,7 @@ def _remove_spaces_column_name(name): if not isinstance(name, string_types) or " " not in name: return name - # uuid3 will provide a unique string that can be independently reproduced. - return name.replace(" ", "_") + "_" + str(uuid3( - uuid.NAMESPACE_DNS, name)).replace("-", "") + return name.replace(" ", "_") + "_BACKTICK_QUOTED_STRING" class NameResolutionError(NameError): From bb62d7392962521b34046df5760c4411f0497ca1 Mon Sep 17 00:00:00 2001 From: Hielke Walinga Date: Mon, 11 Mar 2019 00:27:07 +0100 Subject: [PATCH 11/11] Local import for computation.common; Added example in query --- pandas/core/frame.py | 36 +++++++++++++++++++++++++----------- pandas/core/generic.py | 2 +- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c8303daadff7c..c720731ffbaa2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3034,23 +3034,37 @@ def query(self, expr, inplace=False, **kwargs): Examples -------- - >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df = pd.DataFrame({'A': range(1, 6), + ... 'B': range(10, 0, -2), + ... 
'C C': range(10, 5, -1)}) >>> df - A B - 0 1 10 - 1 2 8 - 2 3 6 - 3 4 4 - 4 5 2 + A B C C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 >>> df.query('A > B') - A B - 4 5 2 + A B C C + 4 5 2 6 The previous expression is equivalent to >>> df[df.A > df.B] - A B - 4 5 2 + A B C C + 4 5 2 6 + + For columns with spaces in their name, you can use backtick quoting. + + >>> df.query('B == `C C`') + A B C C + 0 1 10 10 + + The previous expression is equivalent to + + >>> df[df.B == df['C C']] + A B C C + 0 1 10 10 """ inplace = validate_bool_kwarg(inplace, 'inplace') if not isinstance(expr, compat.string_types): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 31f402c0ae012..712ba5a8f0016 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -38,7 +38,6 @@ import pandas.core.algorithms as algos from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com -from pandas.core.computation.common import _remove_spaces_column_name from pandas.core.index import ( Index, InvalidIndexError, MultiIndex, RangeIndex, ensure_index) from pandas.core.indexes.datetimes import DatetimeIndex @@ -431,6 +430,7 @@ def _get_space_character_free_column_resolvers(self): to by backtick quoting. Used in :meth:`DataFrame.eval`. """ + from pandas.core.computation.common import _remove_spaces_column_name return {_remove_spaces_column_name(k): v for k, v in self.iteritems()}