From 59dfd38881467b0a1ad4655abd014afdb168ee6c Mon Sep 17 00:00:00 2001 From: Arwa Date: Mon, 21 Oct 2024 16:01:57 -0500 Subject: [PATCH 1/2] feat: show possible correct key(s) in .__getitem__ KeyError message --- bigframes/core/groupby/__init__.py | 28 ++++++++++++++++++++-- tests/system/small/test_groupby.py | 37 +++++++++++++++++------------- 2 files changed, 47 insertions(+), 18 deletions(-) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 2d351cf82d..1b74427f1d 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -19,6 +19,7 @@ import bigframes_vendored.constants as constants import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby +import jellyfish import pandas as pd from bigframes.core import log_adapter @@ -91,8 +92,31 @@ def __getitem__( bad_keys = [key for key in keys if key not in self._block.column_labels] - if len(bad_keys) > 0: - raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}") + # Raise a KeyError message with the possible correct key(s) + if len(bad_keys) == 1: + possible_key = min( + self._block.column_labels, + key=lambda item: jellyfish.damerau_levenshtein_distance( + bad_keys[0], item + ), + ) + raise KeyError( + f"Columns not found: {str(bad_keys)[1:-1]}. Did you mean '{str(possible_key)}'?" + ) + if len(bad_keys) > 1: + possible_key = [] + for bad_key in bad_keys: + possible_key.append( + min( + self._block.column_labels, + key=lambda item: jellyfish.damerau_levenshtein_distance( + bad_key, item + ), + ) + ) + raise KeyError( + f"Columns not found: {str(bad_keys)[1:-1]}. Did you mean {str(possible_key)[1:-1]}?" 
+ ) columns = [ col_id for col_id, label in self._col_id_labels.items() if label in keys diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 2d5ae21bb4..cbf6e1269d 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -426,24 +426,12 @@ def test_dataframe_groupby_getitem_error( scalars_pandas_df_index, ): col_names = ["float64_col", "int64_col", "bool_col", "string_col"] - with pytest.raises(KeyError, match="\"Columns not found: 'not_in_group'\""): - ( - scalars_df_index[col_names] - .groupby("string_col")["not_in_group"] - .min() - .to_pandas() - ) - - -def test_dataframe_groupby_getitem_multiple_columns_error( - scalars_df_index, - scalars_pandas_df_index, -): - col_names = ["float64_col", "int64_col", "bool_col", "string_col"] - with pytest.raises(KeyError, match="\"Columns not found: 'col1', 'col2'\""): + with pytest.raises( + KeyError, match=r"Columns not found: 'not_in_group'. Did you mean 'string_col'?" + ): ( scalars_df_index[col_names] - .groupby("string_col")["col1", "col2"] + .groupby("bool_col")["not_in_group"] .min() .to_pandas() ) @@ -464,6 +452,23 @@ def test_dataframe_groupby_getitem_list( pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) +def test_dataframe_groupby_getitem_list_error( + scalars_df_index, + scalars_pandas_df_index, +): + col_names = ["float64_col", "int64_col", "bool_col", "string_col"] + with pytest.raises( + KeyError, + match=r"Columns not found: 'col1', 'float'. 
Did you mean 'bool_col', 'float64_col'?", + ): + ( + scalars_df_index[col_names] + .groupby("string_col")["col1", "float"] + .min() + .to_pandas() + ) + + def test_dataframe_groupby_nonnumeric_with_mean(): df = pd.DataFrame( { From 24af2d89b4e5860289c5cafa397f8b15ec937409 Mon Sep 17 00:00:00 2001 From: Arwa Date: Wed, 23 Oct 2024 10:52:36 -0500 Subject: [PATCH 2/2] Keep one if statement --- bigframes/core/groupby/__init__.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 1b74427f1d..dfbe2ddea2 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -93,17 +93,7 @@ def __getitem__( bad_keys = [key for key in keys if key not in self._block.column_labels] # Raise a KeyError message with the possible correct key(s) - if len(bad_keys) == 1: - possible_key = min( - self._block.column_labels, - key=lambda item: jellyfish.damerau_levenshtein_distance( - bad_keys[0], item - ), - ) - raise KeyError( - f"Columns not found: {str(bad_keys)[1:-1]}. Did you mean '{str(possible_key)}'?" - ) - if len(bad_keys) > 1: + if len(bad_keys) > 0: possible_key = [] for bad_key in bad_keys: possible_key.append(