Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Quoting column names containing spaces with backticks to use them in query and eval. #24955

Merged
merged 14 commits into from
Mar 20, 2019
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ Other Enhancements
- :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`)
- :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
- :meth:`DataFrame.rename` now supports the ``errors`` argument to raise errors when attempting to rename nonexistent keys (:issue:`13473`)
- :meth:`DataFrame.query` and :meth:`DataFrame.eval` now support quoting column names with backticks to refer to names with spaces (:issue:`6508`)

.. _whatsnew_0250.api_breaking:

Expand Down
6 changes: 6 additions & 0 deletions pandas/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import inspect
from collections import namedtuple
import collections
import uuid

PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] >= 3
Expand Down Expand Up @@ -115,6 +116,7 @@ def get_range_parameters(data):
reduce = functools.reduce
long = int
unichr = chr
uuid3 = uuid.uuid3

# This was introduced in Python 3.3, but we don't support
# Python 3.x < 3.5, so checking PY3 is safe.
Expand Down Expand Up @@ -162,6 +164,10 @@ def bytes_to_str(b, encoding='ascii'):
def signature(f):
    """Return the argument specification of ``f`` from ``inspect.getargspec``."""
    return inspect.getargspec(f)

# See also: https://bugs.python.org/issue34145
hwalinga marked this conversation as resolved.
Show resolved Hide resolved
def uuid3(namespace, name):
    """Call ``uuid.uuid3`` with ``name`` encoded to UTF-8 bytes.

    Parameters
    ----------
    namespace : uuid.UUID
        Namespace passed through unchanged to ``uuid.uuid3``.
    name : str
        Text name; it is encoded with UTF-8 before being handed over.

    Returns
    -------
    The value returned by ``uuid.uuid3``.

    Notes
    -----
    NOTE(review): presumably this is the Python 2 counterpart of the plain
    ``uuid3 = uuid.uuid3`` alias used elsewhere in this module; see
    https://bugs.python.org/issue34145 -- confirm against the enclosing
    branch.
    """
    return uuid.uuid3(namespace, name.encode('utf-8'))

def get_range_parameters(data):
"""Gets the start, stop, and step parameters from a range object"""
# seems we only have indexing ops to infer
Expand Down
18 changes: 17 additions & 1 deletion pandas/core/computation/common.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import uuid

import numpy as np

from pandas.compat import reduce
from pandas.compat import reduce, string_types, uuid3

import pandas as pd

# A token value Python's tokenizer probably will never use.
hwalinga marked this conversation as resolved.
Show resolved Hide resolved
_BACKTICK_QUOTED_STRING = 100


def _ensure_decoded(s):
""" if we have bytes, decode them to unicode """
Expand All @@ -22,5 +27,16 @@ def _result_type_many(*arrays_and_dtypes):
return reduce(np.result_type, arrays_and_dtypes)


def _remove_spaces_column_name(name):
    """Return a space-free stand-in for a column name that contains spaces.

    Anything that is not a string, and any string without a space, is
    returned untouched. Otherwise every space is replaced by an underscore
    and a suffix is appended, made of ``"_"`` followed by the uuid3 of the
    original name (in ``uuid.NAMESPACE_DNS``) with its dashes removed.

    Parameters
    ----------
    name : object
        The column label to clean up.

    Returns
    -------
    object
        ``name`` itself, or the cleaned-up string described above.
    """
    if isinstance(name, string_types) and " " in name:
        # uuid3 is derived from the name alone, so the same label always
        # yields the same suffix and can be reproduced independently.
        digest = str(uuid3(uuid.NAMESPACE_DNS, name)).replace("-", "")
        underscored = name.replace(" ", "_")
        return underscored + "_" + digest

    return name


class NameResolutionError(NameError):
pass
51 changes: 46 additions & 5 deletions pandas/core/computation/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,20 @@

import ast
from functools import partial
import itertools as it
import operator
import tokenize

import numpy as np

from pandas.compat import StringIO, lmap, reduce, string_types, zip
from pandas.compat import StringIO, lmap, map, reduce, string_types, zip

import pandas as pd
from pandas import compat
from pandas.core import common as com
from pandas.core.base import StringMixin
from pandas.core.computation.common import (
_BACKTICK_QUOTED_STRING, _remove_spaces_column_name)
from pandas.core.computation.ops import (
_LOCAL_TAG, BinOp, Constant, Div, FuncNode, Op, Term, UnaryOp,
UndefinedVariableError, _arith_ops_syms, _bool_ops_syms, _cmp_ops_syms,
Expand All @@ -31,7 +35,17 @@ def tokenize_string(source):
A Python source code string
"""
line_reader = StringIO(source).readline
for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader):
token_generator = tokenize.generate_tokens(line_reader)
hwalinga marked this conversation as resolved.
Show resolved Hide resolved

# Loop over all tokens till a backtick (`) is found.
# Then, take all tokens till the next backtick to form a backtick quoted
# string.
hwalinga marked this conversation as resolved.
Show resolved Hide resolved
for toknum, tokval, _, _, _ in token_generator:
if tokval == '`':
tokval = " ".join(it.takewhile(
lambda tokval: tokval != '`',
map(operator.itemgetter(1), token_generator)))
toknum = _BACKTICK_QUOTED_STRING
yield toknum, tokval


Expand Down Expand Up @@ -102,6 +116,31 @@ def _replace_locals(tok):
return toknum, tokval


def _clean_spaces_backtick_quoted_names(tok):
    """Turn a backtick quoted token into an ordinary name token.

    A backtick quoted string is recognised by its token number being
    ``_BACKTICK_QUOTED_STRING``. Its value is passed through
    :func:`_remove_spaces_column_name` and the token is re-labelled as
    ``tokenize.NAME`` so that the parser can look the column up when the
    query is executed.
    See also :meth:`NDFrame._get_space_character_free_column_resolvers`.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    t : tuple of int, str
        Either the input token or its replacement
    """
    kind, text = tok
    if kind != _BACKTICK_QUOTED_STRING:
        # Every other token passes through unchanged.
        return kind, text
    return tokenize.NAME, _remove_spaces_column_name(text)


def _compose2(f, g):
"""Compose 2 callables"""
return lambda *args, **kwargs: f(g(*args, **kwargs))
Expand All @@ -114,7 +153,8 @@ def _compose(*funcs):


def _preparse(source, f=_compose(_replace_locals, _replace_booleans,
_rewrite_assign)):
_rewrite_assign,
_clean_spaces_backtick_quoted_names)):
"""Compose a collection of tokenization functions

Parameters
Expand Down Expand Up @@ -711,8 +751,9 @@ def visitor(x, y):
class PandasExprVisitor(BaseExprVisitor):

def __init__(self, env, engine, parser,
preparser=partial(_preparse, f=_compose(_replace_locals,
_replace_booleans))):
preparser=partial(_preparse, f=_compose(
_replace_locals, _replace_booleans,
_clean_spaces_backtick_quoted_names))):
super(PandasExprVisitor, self).__init__(env, engine, parser, preparser)


Expand Down
13 changes: 12 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2967,6 +2967,15 @@ def query(self, expr, inplace=False, **kwargs):
The query string to evaluate. You can refer to variables
in the environment by prefixing them with an '@' character like
``@a + b``.

.. versionadded:: 0.25.0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add an example in the Examples section as well

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, but don't know what this means:

1 Warnings found:
No extended summary found
Docstring for "pandas.DataFrame.query" correct. :)


You can refer to column names that contain spaces by surrounding
them in backticks.

For example, if one of your columns is called ``a a`` and you want
to sum it with ``b``, your query should be ```a a` + b``.

inplace : bool
Whether the query should modify the data in place or return
a modified copy.
Expand Down Expand Up @@ -3160,7 +3169,9 @@ def eval(self, expr, inplace=False, **kwargs):
kwargs['level'] = kwargs.pop('level', 0) + 1
if resolvers is None:
index_resolvers = self._get_index_resolvers()
hwalinga marked this conversation as resolved.
Show resolved Hide resolved
resolvers = dict(self.iteritems()), index_resolvers
column_resolvers = \
self._get_space_character_free_column_resolvers()
resolvers = column_resolvers, index_resolvers
if 'target' not in kwargs:
kwargs['target'] = self
kwargs['resolvers'] = kwargs.get('resolvers', ()) + tuple(resolvers)
Expand Down
12 changes: 12 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import pandas.core.algorithms as algos
from pandas.core.base import PandasObject, SelectionMixin
import pandas.core.common as com
from pandas.core.computation.common import _remove_spaces_column_name
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

import this locally in the function (as we have some restricted import about computation)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

from pandas.core.index import (
Index, InvalidIndexError, MultiIndex, RangeIndex, ensure_index)
from pandas.core.indexes.datetimes import DatetimeIndex
Expand Down Expand Up @@ -423,6 +424,17 @@ def _get_index_resolvers(self):
d.update(self._get_axis_resolvers(axis_name))
return d

def _get_space_character_free_column_resolvers(self):
    """Map cleaned-up column names to their columns.

    Each key produced by ``self.iteritems()`` is passed through
    ``_remove_spaces_column_name`` so that columns whose names contain
    spaces can be referred to by backtick quoting.
    Used in :meth:`DataFrame.eval`.
    """
    resolvers = {}
    for label, column in self.iteritems():
        resolvers[_remove_spaces_column_name(label)] = column
    return resolvers

@property
def _info_axis(self):
return getattr(self, self._info_axis_name)
Expand Down
51 changes: 51 additions & 0 deletions pandas/tests/frame/test_query_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -1031,3 +1031,54 @@ def test_invalid_type_for_operator_raises(self, parser, engine, op):

with pytest.raises(TypeError, match=msg):
df.eval('a {0} b'.format(op), engine=engine, parser=parser)


class TestDataFrameQueryBacktickQuoting(object):
    """Tests for backtick quoted column names in ``query`` and ``eval``.

    The shared frame mixes plain names, names with spaces, and a name
    (``C_C``) equal to the underscore form of another column (``C C``).
    """

    @pytest.fixture(scope='class')
    def df(self):
        """Frame with plain, space-containing and underscore column names."""
        yield DataFrame({'A': [1, 2, 3],
                         'B B': [3, 2, 1],
                         'C C': [4, 5, 6],
                         'C_C': [8, 9, 10],
                         'D_D D': [11, 1, 101]})

    def test_single_backtick_variable_query(self, df):
        res = df.query('1 < `B B`')
        expect = df[1 < df['B B']]
        assert_frame_equal(res, expect)

    def test_two_backtick_variables_query(self, df):
        res = df.query('1 < `B B` and 4 < `C C`')
        expect = df[(1 < df['B B']) & (4 < df['C C'])]
        assert_frame_equal(res, expect)

    def test_single_backtick_variable_expr(self, df):
        res = df.eval('A + `B B`')
        expect = df['A'] + df['B B']
        assert_series_equal(res, expect)

    def test_two_backtick_variables_expr(self, df):
        res = df.eval('`B B` + `C C`')
        expect = df['B B'] + df['C C']
        assert_series_equal(res, expect)

    def test_already_underscore_variable(self, df):
        res = df.eval('`C_C` + A')
        expect = df['C_C'] + df['A']
        assert_series_equal(res, expect)

    def test_same_name_but_underscores(self, df):
        # 'C_C' and 'C C' must resolve to two different columns.
        res = df.eval('C_C + `C C`')
        expect = df['C_C'] + df['C C']
        assert_series_equal(res, expect)

    def test_mixed_underscores_and_spaces(self, df):
        res = df.eval('A + `D_D D`')
        expect = df['A'] + df['D_D D']
        assert_series_equal(res, expect)

    # Needs the ``test_`` prefix: without it pytest does not collect the
    # method and the check never runs.
    def test_backtick_quote_name_with_no_spaces(self, df):
        res = df.eval('A + `C_C`')
        expect = df['A'] + df['C_C']
        assert_series_equal(res, expect)