Skip to content

Commit b9623da

Browse files
chelsea-lintswast
and authored
feat: warn the deprecated max_download_size, random_state and sampling_method parameters in (DataFrame|Series).to_pandas() (#1573)
Fixes internal issue 391676515 --------- Co-authored-by: Tim Sweña (Swast) <tswast@gmail.com> Co-authored-by: Tim Sweña (Swast) <swast@google.com>
1 parent 7f0904d commit b9623da

File tree

7 files changed

+153
-43
lines changed

7 files changed

+153
-43
lines changed

bigframes/dataframe.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1669,17 +1669,27 @@ def to_pandas(
16691669
16701670
Args:
16711671
max_download_size (int, default None):
1672-
Download size threshold in MB. If max_download_size is exceeded when downloading data
1673-
(e.g., to_pandas()), the data will be downsampled if
1674-
bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be
1675-
raised. If set to a value other than None, this will supersede the global config.
1672+
.. deprecated:: 2.0.0
1673+
``max_download_size`` parameter is deprecated. Please use ``to_pandas_batches()``
1674+
method instead.
1675+
1676+
Download size threshold in MB. If ``max_download_size`` is exceeded when downloading data,
1677+
the data will be downsampled if ``bigframes.options.sampling.enable_downsampling`` is
1678+
``True``, otherwise, an error will be raised. If set to a value other than ``None``,
1679+
this will supersede the global config.
16761680
sampling_method (str, default None):
1681+
.. deprecated:: 2.0.0
1682+
``sampling_method`` parameter is deprecated. Please use ``sample()`` method instead.
1683+
16771684
Downsampling algorithms to be chosen from, the choices are: "head": This algorithm
16781685
returns a portion of the data from the beginning. It is fast and requires minimal
16791686
computations to perform the downsampling; "uniform": This algorithm returns uniform
16801687
random samples of the data. If set to a value other than None, this will supersede
16811688
the global config.
16821689
random_state (int, default None):
1690+
.. deprecated:: 2.0.0
1691+
``random_state`` parameter is deprecated. Please use ``sample()`` method instead.
1692+
16831693
The seed for the uniform downsampling algorithm. If provided, the uniform method may
16841694
take longer to execute and require more computation. If set to a value other than
16851695
None, this will supersede the global config.
@@ -1699,6 +1709,20 @@ def to_pandas(
16991709
downsampled rows and all columns of this DataFrame. If dry_run is set, a pandas
17001710
Series containing dry run statistics will be returned.
17011711
"""
1712+
if max_download_size is not None:
1713+
msg = bfe.format_message(
1714+
"DEPRECATED: The `max_download_size` parameters for `DataFrame.to_pandas()` "
1715+
"are deprecated and will be removed soon. Please use `DataFrame.to_pandas_batches()`."
1716+
)
1717+
warnings.warn(msg, category=FutureWarning)
1718+
if sampling_method is not None or random_state is not None:
1719+
msg = bfe.format_message(
1720+
"DEPRECATED: The `sampling_method` and `random_state` parameters for "
1721+
"`DataFrame.to_pandas()` are deprecated and will be removed soon. "
1722+
"Please use `DataFrame.sample().to_pandas()` instead for sampling."
1723+
)
1724+
warnings.warn(msg, category=FutureWarning, stacklevel=2)
1725+
17021726
if dry_run:
17031727
dry_run_stats, dry_run_job = self._block._compute_dry_run(
17041728
max_download_size=max_download_size,

bigframes/series.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
Tuple,
3636
Union,
3737
)
38+
import warnings
3839

3940
import bigframes_vendored.constants as constants
4041
import bigframes_vendored.pandas.core.series as vendored_pandas_series
@@ -61,6 +62,7 @@
6162
import bigframes.core.window_spec as windows
6263
import bigframes.dataframe
6364
import bigframes.dtypes
65+
import bigframes.exceptions as bfe
6466
import bigframes.formatting_helpers as formatter
6567
import bigframes.operations as ops
6668
import bigframes.operations.aggregations as agg_ops
@@ -432,17 +434,27 @@ def to_pandas(
432434
433435
Args:
434436
max_download_size (int, default None):
435-
Download size threshold in MB. If max_download_size is exceeded when downloading data
436-
(e.g., to_pandas()), the data will be downsampled if
437-
bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be
438-
raised. If set to a value other than None, this will supersede the global config.
437+
.. deprecated:: 2.0.0
438+
``max_download_size`` parameter is deprecated. Please use ``to_pandas_batches()``
439+
method instead.
440+
441+
Download size threshold in MB. If ``max_download_size`` is exceeded when downloading data,
442+
the data will be downsampled if ``bigframes.options.sampling.enable_downsampling`` is
443+
``True``, otherwise, an error will be raised. If set to a value other than ``None``,
444+
this will supersede the global config.
439445
sampling_method (str, default None):
446+
.. deprecated:: 2.0.0
447+
``sampling_method`` parameter is deprecated. Please use ``sample()`` method instead.
448+
440449
Downsampling algorithms to be chosen from, the choices are: "head": This algorithm
441450
returns a portion of the data from the beginning. It is fast and requires minimal
442451
computations to perform the downsampling; "uniform": This algorithm returns uniform
443452
random samples of the data. If set to a value other than None, this will supersede
444453
the global config.
445454
random_state (int, default None):
455+
.. deprecated:: 2.0.0
456+
``random_state`` parameter is deprecated. Please use ``sample()`` method instead.
457+
446458
The seed for the uniform downsampling algorithm. If provided, the uniform method may
447459
take longer to execute and require more computation. If set to a value other than
448460
None, this will supersede the global config.
@@ -461,6 +473,19 @@ def to_pandas(
461473
is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame. If dry_run
462474
is set to True, a pandas Series containing dry run statistics will be returned.
463475
"""
476+
if max_download_size is not None:
477+
msg = bfe.format_message(
478+
"DEPRECATED: The `max_download_size` parameters for `Series.to_pandas()` "
479+
"are deprecated and will be removed soon. Please use `Series.to_pandas_batches()`."
480+
)
481+
warnings.warn(msg, category=FutureWarning)
482+
if sampling_method is not None or random_state is not None:
483+
msg = bfe.format_message(
484+
"DEPRECATED: The `sampling_method` and `random_state` parameters for "
485+
"`Series.to_pandas()` are deprecated and will be removed soon. "
486+
"Please use `Series.sample().to_pandas()` instead for sampling."
487+
)
488+
warnings.warn(msg, category=FutureWarning)
464489

465490
if dry_run:
466491
dry_run_stats, dry_run_job = self._block._compute_dry_run(

tests/system/small/test_dataframe.py

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5203,9 +5203,7 @@ def test_query_complexity_repeated_subtrees(
52035203
# See: https://github.com/python/cpython/issues/112282
52045204
reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.",
52055205
)
5206-
def test_query_complexity_repeated_analytic(
5207-
scalars_df_index, scalars_pandas_df_index, with_multiquery_execution
5208-
):
5206+
def test_query_complexity_repeated_analytic(scalars_df_index, scalars_pandas_df_index):
52095207
bf_df = scalars_df_index[["int64_col", "int64_too"]]
52105208
pd_df = scalars_pandas_df_index[["int64_col", "int64_too"]]
52115209
# Uses LAG analytic operator, each in a new SELECT
@@ -5217,22 +5215,6 @@ def test_query_complexity_repeated_analytic(
52175215
assert_pandas_df_equal(bf_result, pd_result)
52185216

52195217

5220-
def test_to_pandas_downsampling_option_override(session):
5221-
df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting")
5222-
download_size = 1
5223-
5224-
# limits only apply for allow_large_result=True
5225-
df = df.to_pandas(
5226-
max_download_size=download_size,
5227-
sampling_method="head",
5228-
allow_large_results=True,
5229-
)
5230-
5231-
total_memory_bytes = df.memory_usage(deep=True).sum()
5232-
total_memory_mb = total_memory_bytes / (1024 * 1024)
5233-
assert total_memory_mb == pytest.approx(download_size, rel=0.5)
5234-
5235-
52365218
def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_created):
52375219
dataset_id = dataset_id_not_created
52385220
destination_table = f"{dataset_id}.scalars_df"

tests/system/small/test_dataframe_io.py

Lines changed: 56 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,62 @@ def test_to_pandas_override_global_option(scalars_df_index):
266266
assert scalars_df_index._query_job.destination.table_id == table_id
267267

268268

269+
def test_to_pandas_downsampling_option_override(session):
270+
df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting")
271+
download_size = 1
272+
273+
with pytest.warns(
274+
UserWarning, match="The data size .* exceeds the maximum download limit"
275+
):
276+
# limits only apply for allow_large_result=True
277+
df = df.to_pandas(
278+
max_download_size=download_size,
279+
sampling_method="head",
280+
allow_large_results=True,
281+
)
282+
283+
total_memory_bytes = df.memory_usage(deep=True).sum()
284+
total_memory_mb = total_memory_bytes / (1024 * 1024)
285+
assert total_memory_mb == pytest.approx(download_size, rel=0.5)
286+
287+
288+
@pytest.mark.parametrize(
289+
("kwargs", "message"),
290+
[
291+
pytest.param(
292+
{"sampling_method": "head"},
293+
r"DEPRECATED[\S\s]*sampling_method[\S\s]*DataFrame.sample",
294+
id="sampling_method",
295+
),
296+
pytest.param(
297+
{"random_state": 10},
298+
r"DEPRECATED[\S\s]*random_state[\S\s]*DataFrame.sample",
299+
id="random_state",
300+
),
301+
pytest.param(
302+
{"max_download_size": 10},
303+
r"DEPRECATED[\S\s]*max_download_size[\S\s]*DataFrame.to_pandas_batches",
304+
id="max_download_size",
305+
),
306+
],
307+
)
308+
def test_to_pandas_warns_deprecated_parameters(scalars_df_index, kwargs, message):
309+
with pytest.warns(FutureWarning, match=message):
310+
scalars_df_index.to_pandas(
311+
# limits only apply for allow_large_result=True
312+
allow_large_results=True,
313+
**kwargs,
314+
)
315+
316+
317+
def test_to_pandas_dry_run(session, scalars_pandas_df_multi_index):
318+
bf_df = session.read_pandas(scalars_pandas_df_multi_index)
319+
320+
result = bf_df.to_pandas(dry_run=True)
321+
322+
assert len(result) == 14
323+
324+
269325
def test_to_arrow_override_global_option(scalars_df_index):
270326
# Direct call to_arrow uses global default setting (allow_large_results=True),
271327
with bigframes.option_context("bigquery.allow_large_results", True):
@@ -813,11 +869,3 @@ def test_to_sql_query_named_index_excluded(
813869
utils.assert_pandas_df_equal(
814870
roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True
815871
)
816-
817-
818-
def test_to_pandas_dry_run(session, scalars_pandas_df_multi_index):
819-
bf_df = session.read_pandas(scalars_pandas_df_multi_index)
820-
821-
result = bf_df.to_pandas(dry_run=True)
822-
823-
assert len(result) == 14

tests/system/small/test_index.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -426,11 +426,3 @@ def test_multiindex_repr_includes_all_names(session):
426426
)
427427
index = session.read_pandas(df).set_index(["A", "B"]).index
428428
assert "names=['A', 'B']" in repr(index)
429-
430-
431-
def test_to_pandas_dry_run(scalars_df_index):
432-
index = scalars_df_index.index
433-
434-
result = index.to_pandas(dry_run=True)
435-
436-
assert len(result) == 14

tests/system/small/test_index_io.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,14 @@ def test_to_pandas_override_global_option(scalars_df_index):
3030
assert bf_index._query_job.destination.table_id == table_id
3131

3232

33+
def test_to_pandas_dry_run(scalars_df_index):
34+
index = scalars_df_index.index
35+
36+
result = index.to_pandas(dry_run=True)
37+
38+
assert len(result) == 14
39+
40+
3341
def test_to_numpy_override_global_option(scalars_df_index):
3442
with bigframes.option_context("bigquery.allow_large_results", True):
3543

tests/system/small/test_series_io.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import pytest
1616

1717
import bigframes
18+
import bigframes.series
1819

1920

2021
def test_to_pandas_override_global_option(scalars_df_index):
@@ -37,6 +38,36 @@ def test_to_pandas_override_global_option(scalars_df_index):
3738
assert session._metrics.execution_count - execution_count == 1
3839

3940

41+
@pytest.mark.parametrize(
42+
("kwargs", "message"),
43+
[
44+
pytest.param(
45+
{"sampling_method": "head"},
46+
r"DEPRECATED[\S\s]*sampling_method[\S\s]*Series.sample",
47+
id="sampling_method",
48+
),
49+
pytest.param(
50+
{"random_state": 10},
51+
r"DEPRECATED[\S\s]*random_state[\S\s]*Series.sample",
52+
id="random_state",
53+
),
54+
pytest.param(
55+
{"max_download_size": 10},
56+
r"DEPRECATED[\S\s]*max_download_size[\S\s]*Series.to_pandas_batches",
57+
id="max_download_size",
58+
),
59+
],
60+
)
61+
def test_to_pandas_warns_deprecated_parameters(scalars_df_index, kwargs, message):
62+
s: bigframes.series.Series = scalars_df_index["int64_col"]
63+
with pytest.warns(FutureWarning, match=message):
64+
s.to_pandas(
65+
# limits only apply for allow_large_result=True
66+
allow_large_results=True,
67+
**kwargs,
68+
)
69+
70+
4071
@pytest.mark.parametrize(
4172
("page_size", "max_results", "allow_large_results"),
4273
[

0 commit comments

Comments (0)