array_agg: support DataFrameGroupBy and add tests/docs

chelsea-lin · chelsea-lin · commit 6e92dd6f07ec · 2024-05-07T23:55:23.000Z
diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py
@@ -22,11 +22,13 @@
 
 import typing
 
+import bigframes.constants as constants
 import bigframes.core.groupby as groupby
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
 
 if typing.TYPE_CHECKING:
+    import bigframes.dataframe as dataframe
     import bigframes.series as series
 
 
@@ -54,6 +56,10 @@ def array_length(series: series.Series) -> series.Series:
         2    2
         dtype: Int64
 
+    Args:
+        series (bigframes.series.Series):
+                A Series with array columns.
+
     Returns:
         bigframes.series.Series: A Series of integer values indicating
             the length of each element in the Series.
@@ -62,5 +68,57 @@ def array_length(series: series.Series) -> series.Series:
     return series._apply_unary_op(ops.len_op)
 
 
-def array_agg(groupby_series: groupby.SeriesGroupBy) -> series.Series:
-    return groupby_series._aggregate(agg_ops.ArrayAggOp())
+def array_agg(
+    obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy,
+) -> series.Series | dataframe.DataFrame:
+    """Group data and create arrays from selected columns, omitting NULLs to avoid
+    BigQuery errors (NULLs not allowed in arrays).
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> import numpy as np
+        >>> bpd.options.display.progress_bar = None
+
+    For a SeriesGroupBy object:
+
+        >>> lst = ['a', 'a', 'b', 'b', 'a']
+        >>> s = bpd.Series([1, 2, 3, 4, np.nan], index=lst)
+        >>> bbq.array_agg(s.groupby(level=0))
+        a    [1. 2.]
+        b    [3. 4.]
+        dtype: list<item: double>[pyarrow]
+
+    For a DataFrameGroupBy object:
+
+        >>> rows = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
+        >>> df = bpd.DataFrame(rows, columns=["a", "b", "c"])
+        >>> bbq.array_agg(df.groupby(by=["b"]))
+                 a      c
+        b
+        1.0    [2]    [3]
+        2.0  [1 1]  [3 2]
+        <BLANKLINE>
+        [2 rows x 2 columns]
+
+    Args:
+        obj (groupby.SeriesGroupBy | groupby.DataFrameGroupBy):
+                The GroupBy object whose groups are aggregated into arrays.
+
+    Returns:
+        bigframes.series.Series | bigframes.dataframe.DataFrame: A Series or
+            DataFrame containing aggregated array columns, and indexed by the
+            original group columns.
+
+    Raises:
+        ValueError: If ``obj`` is neither a SeriesGroupBy nor a DataFrameGroupBy.
+    """
+    if isinstance(obj, groupby.SeriesGroupBy):
+        return obj._aggregate(agg_ops.ArrayAggOp())
+    elif isinstance(obj, groupby.DataFrameGroupBy):
+        return obj._aggregate_all(agg_ops.ArrayAggOp(), numeric_only=False)
+    else:
+        raise ValueError(
+            f"Unsupported type {type(obj)} to apply `array_agg` function. {constants.FEEDBACK_LINK}"
+        )
diff --git a/tests/system/small/bigquery/test_array.py b/tests/system/small/bigquery/test_array.py
@@ -14,6 +14,7 @@
 
 import numpy as np
 import pandas as pd
+import pytest
 
 import bigframes.bigquery as bbq
 import bigframes.pandas as bpd
@@ -23,10 +24,76 @@ def test_array_length():
     series = bpd.Series([["A", "AA", "AAA"], ["BB", "B"], np.nan, [], ["C"]])
     # TODO(b/336880368): Allow for NULL values to be input for ARRAY columns.
     # Once we actually store NULL values, this will be NULL where the input is NULL.
-    expected = pd.Series([3, 2, 0, 0, 1])
+    expected = bpd.Series([3, 2, 0, 0, 1])
     pd.testing.assert_series_equal(
         bbq.array_length(series).to_pandas(),
-        expected,
-        check_dtype=False,
-        check_index_type=False,
+        expected.to_pandas(),
+    )
+
+
+@pytest.mark.parametrize(
+    ("input_data", "output_data"),
+    [
+        pytest.param([1, 2, 3, 4, 5], [[1, 2], [3, 4], [5]], id="ints"),
+        pytest.param(
+            ["e", "d", "c", "b", "a"],
+            [["e", "d"], ["c", "b"], ["a"]],
+            id="reverse_strings",
+        ),
+        pytest.param(
+            [1.0, 2.0, np.nan, np.nan, np.nan], [[1.0, 2.0], [], []], id="nans"
+        ),
+        pytest.param(
+            [{"A": {"x": 1.0}}, {"A": {"z": 4.0}}, {}, {"B": "b"}, np.nan],
+            [[{"A": {"x": 1.0}}, {"A": {"z": 4.0}}], [{}, {"B": "b"}], []],
+            id="structs",
+        ),
+    ],
+)
+def test_array_agg_w_series(input_data, output_data):
+    """Verify `array_agg` over a SeriesGroupBy keyed on the index level."""
+    # Index labels place the five inputs into groups "a", "a", "b", "b", "c".
+    values = bpd.Series(input_data, index=["a", "a", "b", "b", "c"])
+    actual = bbq.array_agg(values.groupby(level=0))
+    wanted = bpd.Series(output_data, index=["a", "b", "c"])
+
+    # Materialize both sides as pandas objects before comparing.
+    actual_pd, wanted_pd = actual.to_pandas(), wanted.to_pandas()
+    pd.testing.assert_series_equal(actual_pd, wanted_pd)
+
+
+def test_array_agg_w_dataframe():
+    """Verify `array_agg` over a DataFrameGroupBy keyed on column "b"."""
+    # One input row has b=None; the expectation below has no group for it.
+    source = bpd.DataFrame(
+        {
+            "a": [1, 1, 2, 1],
+            "b": [2, None, 1, 2],
+            "c": [3, 4, 3, 2],
+        }
+    )
+    actual = bbq.array_agg(source.groupby(by=["b"]))
+
+    # The expected frame is indexed by the group key "b".
+    wanted = bpd.DataFrame(
+        {"b": [1.0, 2.0], "a": [[2], [1, 1]], "c": [[3], [3, 2]]}
+    ).set_index("b")
+
+    # Materialize both sides as pandas objects before comparing.
+    actual_pd, wanted_pd = actual.to_pandas(), wanted.to_pandas()
+    pd.testing.assert_frame_equal(actual_pd, wanted_pd)
+
+
+def test_array_agg_matches_after_explode():  # explode -> array_agg round trip
+    data = {
+        "index": np.arange(10),
+        "a": [np.random.randint(0, 10, 10) for _ in range(10)],
+        "b": [np.random.randint(0, 10, 10) for _ in range(10)],
+    }
+    df = bpd.DataFrame(data).set_index("index")
+    result = bbq.array_agg(df.explode(["a", "b"]).groupby(level=0))
+    result.index.name = "index"
+    pd.testing.assert_frame_equal(
+        result.to_pandas(),
+        df.to_pandas(),
     )