Skip to content

Commit b9623da

Browse files
chelsea-lintswast
and authored
feat: warn the deprecated max_download_size, random_state and sampling_method parameters in (DataFrame|Series).to_pandas() (#1573)
Fixes internal issue 391676515 --------- Co-authored-by: Tim Sweña (Swast) <tswast@gmail.com> Co-authored-by: Tim Sweña (Swast) <swast@google.com>
1 parent 7f0904d commit b9623da

File tree

7 files changed

+153
-43
lines changed

7 files changed

+153
-43
lines changed

bigframes/dataframe.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1669,17 +1669,27 @@ def to_pandas(
16691669
16701670
Args:
16711671
max_download_size (int, default None):
1672-
Download size threshold in MB. If max_download_size is exceeded when downloading data
1673-
(e.g., to_pandas()), the data will be downsampled if
1674-
bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be
1675-
raised. If set to a value other than None, this will supersede the global config.
1672+
.. deprecated:: 2.0.0
1673+
``max_download_size`` parameter is deprecated. Please use ``to_pandas_batches()``
1674+
method instead.
1675+
1676+
Download size threshold in MB. If ``max_download_size`` is exceeded when downloading data,
1677+
the data will be downsampled if ``bigframes.options.sampling.enable_downsampling`` is
1678+
``True``, otherwise, an error will be raised. If set to a value other than ``None``,
1679+
this will supersede the global config.
16761680
sampling_method (str, default None):
1681+
.. deprecated:: 2.0.0
1682+
``sampling_method`` parameter is deprecated. Please use ``sample()`` method instead.
1683+
16771684
Downsampling algorithms to be chosen from, the choices are: "head": This algorithm
16781685
returns a portion of the data from the beginning. It is fast and requires minimal
16791686
computations to perform the downsampling; "uniform": This algorithm returns uniform
16801687
random samples of the data. If set to a value other than None, this will supersede
16811688
the global config.
16821689
random_state (int, default None):
1690+
.. deprecated:: 2.0.0
1691+
``random_state`` parameter is deprecated. Please use ``sample()`` method instead.
1692+
16831693
The seed for the uniform downsampling algorithm. If provided, the uniform method may
16841694
take longer to execute and require more computation. If set to a value other than
16851695
None, this will supersede the global config.
@@ -1699,6 +1709,20 @@ def to_pandas(
16991709
downsampled rows and all columns of this DataFrame. If dry_run is set, a pandas
17001710
Series containing dry run statistics will be returned.
17011711
"""
1712+
if max_download_size is not None:
1713+
msg = bfe.format_message(
1714+
"DEPRECATED: The `max_download_size` parameters for `DataFrame.to_pandas()` "
1715+
"are deprecated and will be removed soon. Please use `DataFrame.to_pandas_batches()`."
1716+
)
1717+
warnings.warn(msg, category=FutureWarning)
1718+
if sampling_method is not None or random_state is not None:
1719+
msg = bfe.format_message(
1720+
"DEPRECATED: The `sampling_method` and `random_state` parameters for "
1721+
"`DataFrame.to_pandas()` are deprecated and will be removed soon. "
1722+
"Please use `DataFrame.sample().to_pandas()` instead for sampling."
1723+
)
1724+
warnings.warn(msg, category=FutureWarning, stacklevel=2)
1725+
17021726
if dry_run:
17031727
dry_run_stats, dry_run_job = self._block._compute_dry_run(
17041728
max_download_size=max_download_size,

bigframes/series.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
Tuple,
3636
Union,
3737
)
38+
import warnings
3839

3940
import bigframes_vendored.constants as constants
4041
import bigframes_vendored.pandas.core.series as vendored_pandas_series
@@ -61,6 +62,7 @@
6162
import bigframes.core.window_spec as windows
6263
import bigframes.dataframe
6364
import bigframes.dtypes
65+
import bigframes.exceptions as bfe
6466
import bigframes.formatting_helpers as formatter
6567
import bigframes.operations as ops
6668
import bigframes.operations.aggregations as agg_ops
@@ -432,17 +434,27 @@ def to_pandas(
432434
433435
Args:
434436
max_download_size (int, default None):
435-
Download size threshold in MB. If max_download_size is exceeded when downloading data
436-
(e.g., to_pandas()), the data will be downsampled if
437-
bigframes.options.sampling.enable_downsampling is True, otherwise, an error will be
438-
raised. If set to a value other than None, this will supersede the global config.
437+
.. deprecated:: 2.0.0
438+
``max_download_size`` parameter is deprecated. Please use ``to_pandas_batches()``
439+
method instead.
440+
441+
Download size threshold in MB. If ``max_download_size`` is exceeded when downloading data,
442+
the data will be downsampled if ``bigframes.options.sampling.enable_downsampling`` is
443+
``True``, otherwise, an error will be raised. If set to a value other than ``None``,
444+
this will supersede the global config.
439445
sampling_method (str, default None):
446+
.. deprecated:: 2.0.0
447+
``sampling_method`` parameter is deprecated. Please use ``sample()`` method instead.
448+
440449
Downsampling algorithms to be chosen from, the choices are: "head": This algorithm
441450
returns a portion of the data from the beginning. It is fast and requires minimal
442451
computations to perform the downsampling; "uniform": This algorithm returns uniform
443452
random samples of the data. If set to a value other than None, this will supersede
444453
the global config.
445454
random_state (int, default None):
455+
.. deprecated:: 2.0.0
456+
``random_state`` parameter is deprecated. Please use ``sample()`` method instead.
457+
446458
The seed for the uniform downsampling algorithm. If provided, the uniform method may
447459
take longer to execute and require more computation. If set to a value other than
448460
None, this will supersede the global config.
@@ -461,6 +473,19 @@ def to_pandas(
461473
is not exceeded; otherwise, a pandas Series with downsampled rows of the DataFrame. If dry_run
462474
is set to True, a pandas Series containing dry run statistics will be returned.
463475
"""
476+
if max_download_size is not None:
477+
msg = bfe.format_message(
478+
"DEPRECATED: The `max_download_size` parameters for `Series.to_pandas()` "
479+
"are deprecated and will be removed soon. Please use `Series.to_pandas_batches()`."
480+
)
481+
warnings.warn(msg, category=FutureWarning)
482+
if sampling_method is not None or random_state is not None:
483+
msg = bfe.format_message(
484+
"DEPRECATED: The `sampling_method` and `random_state` parameters for "
485+
"`Series.to_pandas()` are deprecated and will be removed soon. "
486+
"Please use `Series.sample().to_pandas()` instead for sampling."
487+
)
488+
warnings.warn(msg, category=FutureWarning)
464489

465490
if dry_run:
466491
dry_run_stats, dry_run_job = self._block._compute_dry_run(

tests/system/small/test_dataframe.py

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5203,9 +5203,7 @@ def test_query_complexity_repeated_subtrees(
52035203
# See: https://github.com/python/cpython/issues/112282
52045204
reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.",
52055205
)
5206-
def test_query_complexity_repeated_analytic(
5207-
scalars_df_index, scalars_pandas_df_index, with_multiquery_execution
5208-
):
5206+
def test_query_complexity_repeated_analytic(scalars_df_index, scalars_pandas_df_index):
52095207
bf_df = scalars_df_index[["int64_col", "int64_too"]]
52105208
pd_df = scalars_pandas_df_index[["int64_col", "int64_too"]]
52115209
# Uses LAG analytic operator, each in a new SELECT
@@ -5217,22 +5215,6 @@ def test_query_complexity_repeated_analytic(
52175215
assert_pandas_df_equal(bf_result, pd_result)
52185216

52195217

5220-
def test_to_pandas_downsampling_option_override(session):
5221-
df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting")
5222-
download_size = 1
5223-
5224-
# limits only apply for allow_large_result=True
5225-
df = df.to_pandas(
5226-
max_download_size=download_size,
5227-
sampling_method="head",
5228-
allow_large_results=True,
5229-
)
5230-
5231-
total_memory_bytes = df.memory_usage(deep=True).sum()
5232-
total_memory_mb = total_memory_bytes / (1024 * 1024)
5233-
assert total_memory_mb == pytest.approx(download_size, rel=0.5)
5234-
5235-
52365218
def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_created):
52375219
dataset_id = dataset_id_not_created
52385220
destination_table = f"{dataset_id}.scalars_df"

tests/system/small/test_dataframe_io.py

Lines changed: 56 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,62 @@ def test_to_pandas_override_global_option(scalars_df_index):
266266
assert scalars_df_index._query_job.destination.table_id == table_id
267267

268268

269+
def test_to_pandas_downsampling_option_override(session):
270+
df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting")
271+
download_size = 1
272+
273+
with pytest.warns(
274+
UserWarning, match="The data size .* exceeds the maximum download limit"
275+
):
276+
# limits only apply for allow_large_result=True
277+
df = df.to_pandas(
278+
max_download_size=download_size,
279+
sampling_method="head",
280+
allow_large_results=True,
281+
)
282+
283+
total_memory_bytes = df.memory_usage(deep=True).sum()
284+
total_memory_mb = total_memory_bytes / (1024 * 1024)
285+
assert total_memory_mb == pytest.approx(download_size, rel=0.5)
286+
287+
288+
@pytest.mark.parametrize(
289+
("kwargs", "message"),
290+
[
291+
pytest.param(
292+
{"sampling_method": "head"},
293+
r"DEPRECATED[\S\s]*sampling_method[\S\s]*DataFrame.sample",
294+
id="sampling_method",
295+
),
296+
pytest.param(
297+
{"random_state": 10},
298+
r"DEPRECATED[\S\s]*random_state[\S\s]*DataFrame.sample",
299+
id="random_state",
300+
),
301+
pytest.param(
302+
{"max_download_size": 10},
303+
r"DEPRECATED[\S\s]*max_download_size[\S\s]*DataFrame.to_pandas_batches",
304+
id="max_download_size",
305+
),
306+
],
307+
)
308+
def test_to_pandas_warns_deprecated_parameters(scalars_df_index, kwargs, message):
309+
with pytest.warns(FutureWarning, match=message):
310+
scalars_df_index.to_pandas(
311+
# limits only apply for allow_large_result=True
312+
allow_large_results=True,
313+
**kwargs,
314+
)
315+
316+
317+
def test_to_pandas_dry_run(session, scalars_pandas_df_multi_index):
318+
bf_df = session.read_pandas(scalars_pandas_df_multi_index)
319+
320+
result = bf_df.to_pandas(dry_run=True)
321+
322+
assert len(result) == 14
323+
324+
269325
def test_to_arrow_override_global_option(scalars_df_index):
270326
# Direct call to_arrow uses global default setting (allow_large_results=True),
271327
with bigframes.option_context("bigquery.allow_large_results", True):
@@ -813,11 +869,3 @@ def test_to_sql_query_named_index_excluded(
813869
utils.assert_pandas_df_equal(
814870
roundtrip.to_pandas(), pd_df, check_index_type=False, ignore_order=True
815871
)
816-
817-
818-
def test_to_pandas_dry_run(session, scalars_pandas_df_multi_index):
819-
bf_df = session.read_pandas(scalars_pandas_df_multi_index)
820-
821-
result = bf_df.to_pandas(dry_run=True)
822-
823-
assert len(result) == 14

tests/system/small/test_index.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -426,11 +426,3 @@ def test_multiindex_repr_includes_all_names(session):
426426
)
427427
index = session.read_pandas(df).set_index(["A", "B"]).index
428428
assert "names=['A', 'B']" in repr(index)
429-
430-
431-
def test_to_pandas_dry_run(scalars_df_index):
432-
index = scalars_df_index.index
433-
434-
result = index.to_pandas(dry_run=True)
435-
436-
assert len(result) == 14

tests/system/small/test_index_io.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,14 @@ def test_to_pandas_override_global_option(scalars_df_index):
3030
assert bf_index._query_job.destination.table_id == table_id
3131

3232

33+
def test_to_pandas_dry_run(scalars_df_index):
34+
index = scalars_df_index.index
35+
36+
result = index.to_pandas(dry_run=True)
37+
38+
assert len(result) == 14
39+
40+
3341
def test_to_numpy_override_global_option(scalars_df_index):
3442
with bigframes.option_context("bigquery.allow_large_results", True):
3543

tests/system/small/test_series_io.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import pytest
1616

1717
import bigframes
18+
import bigframes.series
1819

1920

2021
def test_to_pandas_override_global_option(scalars_df_index):
@@ -37,6 +38,36 @@ def test_to_pandas_override_global_option(scalars_df_index):
3738
assert session._metrics.execution_count - execution_count == 1
3839

3940

41+
@pytest.mark.parametrize(
42+
("kwargs", "message"),
43+
[
44+
pytest.param(
45+
{"sampling_method": "head"},
46+
r"DEPRECATED[\S\s]*sampling_method[\S\s]*Series.sample",
47+
id="sampling_method",
48+
),
49+
pytest.param(
50+
{"random_state": 10},
51+
r"DEPRECATED[\S\s]*random_state[\S\s]*Series.sample",
52+
id="random_state",
53+
),
54+
pytest.param(
55+
{"max_download_size": 10},
56+
r"DEPRECATED[\S\s]*max_download_size[\S\s]*Series.to_pandas_batches",
57+
id="max_download_size",
58+
),
59+
],
60+
)
61+
def test_to_pandas_warns_deprecated_parameters(scalars_df_index, kwargs, message):
62+
s: bigframes.series.Series = scalars_df_index["int64_col"]
63+
with pytest.warns(FutureWarning, match=message):
64+
s.to_pandas(
65+
# limits only apply for allow_large_result=True
66+
allow_large_results=True,
67+
**kwargs,
68+
)
69+
70+
4071
@pytest.mark.parametrize(
4172
("page_size", "max_results", "allow_large_results"),
4273
[

0 commit comments

Comments (0)