
Commit 1fb7614

Merge branch 'main' into hybrid_concat
2 parents 660ecc8 + e3f5e65 commit 1fb7614

File tree

13 files changed: 247 additions, 51 deletions


β€ŽCHANGELOG.mdβ€Ž

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,25 @@
44

55
[1]: https://pypi.org/project/bigframes/#history
66

7+
## [2.10.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.9.0...v2.10.0) (2025-07-08)
8+
9+
10+
### Features
11+
12+
* `df.to_pandas_batches()` returns one empty DataFrame if `df` is empty ([#1878](https://github.com/googleapis/python-bigquery-dataframes/issues/1878)) ([e43d15d](https://github.com/googleapis/python-bigquery-dataframes/commit/e43d15d535d6d5fd73c33967271f3591c41dffb3))
13+
* Add filter pushdown to hybrid engine ([#1871](https://github.com/googleapis/python-bigquery-dataframes/issues/1871)) ([6454aff](https://github.com/googleapis/python-bigquery-dataframes/commit/6454aff726dee791acbac98f893075ee5ee6d9a1))
14+
* Add simple stats support to hybrid local pushdown ([#1873](https://github.com/googleapis/python-bigquery-dataframes/issues/1873)) ([8715105](https://github.com/googleapis/python-bigquery-dataframes/commit/8715105239216bffe899ddcbb15805f2e3063af4))
15+
16+
17+
### Bug Fixes
18+
19+
* Fix issues where duration type returned as int ([#1875](https://github.com/googleapis/python-bigquery-dataframes/issues/1875)) ([f30f750](https://github.com/googleapis/python-bigquery-dataframes/commit/f30f75053a6966abd1a6a644c23efb86b2ac568d))
20+
21+
22+
### Documentation
23+
24+
* Update gsutil commands to gcloud commands ([#1876](https://github.com/googleapis/python-bigquery-dataframes/issues/1876)) ([c289f70](https://github.com/googleapis/python-bigquery-dataframes/commit/c289f7061320ec6d9de099cab2416cc9f289baac))
25+
726
## [2.9.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.8.0...v2.9.0) (2025-06-30)
827

928

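Note on the first feature above: `to_pandas_batches()` now yields at least one (possibly empty) DataFrame even when the frame has no rows. A minimal sketch of what callers can rely on as of 2.10.0, assuming an authenticated BigQuery DataFrames session (the frame construction here is illustrative):

import bigframes.pandas as bpd

df = bpd.DataFrame({"a": [1, 2, 3]})
empty = df[df["a"] < 0]  # matches no rows

# The iterator now yields exactly one empty DataFrame (with the correct
# dtypes and index) instead of yielding nothing, so downstream code such
# as pd.concat(batches) needs no empty-iterator special case.
batches = list(empty.to_pandas_batches())
assert len(batches) == 1
assert batches[0].empty
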
β€Žbigframes/core/blocks.pyβ€Ž

Lines changed: 55 additions & 42 deletions
@@ -29,7 +29,17 @@
 import random
 import textwrap
 import typing
-from typing import Iterable, List, Literal, Mapping, Optional, Sequence, Tuple, Union
+from typing import (
+    Iterable,
+    Iterator,
+    List,
+    Literal,
+    Mapping,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+)
 import warnings

 import bigframes_vendored.constants as constants

@@ -87,14 +97,22 @@
 LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]]


-class BlockHolder(typing.Protocol):
+@dataclasses.dataclass
+class PandasBatches(Iterator[pd.DataFrame]):
     """Interface for mutable objects with state represented by a block value object."""

-    def _set_block(self, block: Block):
-        """Set the underlying block value of the object"""
+    def __init__(
+        self, pandas_batches: Iterator[pd.DataFrame], total_rows: Optional[int] = 0
+    ):
+        self._dataframes: Iterator[pd.DataFrame] = pandas_batches
+        self._total_rows: Optional[int] = total_rows
+
+    @property
+    def total_rows(self) -> Optional[int]:
+        return self._total_rows

-    def _get_block(self) -> Block:
-        """Get the underlying block value of the object"""
+    def __next__(self) -> pd.DataFrame:
+        return next(self._dataframes)


 @dataclasses.dataclass()
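
The new `PandasBatches` wrapper above keeps plain iteration working while also exposing the row count when the executor knows it. A self-contained sketch of the pattern, using a stand-in class and toy data rather than the real executor:

import pandas as pd
from typing import Iterator, Optional

class PandasBatches(Iterator[pd.DataFrame]):
    """Stand-in mirroring the class added above, for illustration only."""

    def __init__(
        self, pandas_batches: Iterator[pd.DataFrame], total_rows: Optional[int] = 0
    ):
        self._dataframes = pandas_batches
        self._total_rows = total_rows

    @property
    def total_rows(self) -> Optional[int]:
        return self._total_rows

    def __next__(self) -> pd.DataFrame:
        return next(self._dataframes)

batches = PandasBatches(iter([pd.DataFrame({"a": [1, 2]})]), total_rows=2)
assert batches.total_rows == 2             # metadata rides alongside iteration
assert [len(df) for df in batches] == [2]  # still iterates like a plain iterator
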
@@ -599,8 +617,7 @@ def try_peek(
                 self.expr, n, use_explicit_destination=allow_large_results
             )
             df = result.to_pandas()
-            self._copy_index_to_pandas(df)
-            return df
+            return self._copy_index_to_pandas(df)
         else:
             return None

@@ -609,8 +626,7 @@ def to_pandas_batches(
         page_size: Optional[int] = None,
         max_results: Optional[int] = None,
         allow_large_results: Optional[bool] = None,
-        squeeze: Optional[bool] = False,
-    ):
+    ) -> Iterator[pd.DataFrame]:
         """Download results one message at a time.

         page_size and max_results determine the size and number of batches,

@@ -621,43 +637,43 @@ def to_pandas_batches(
             use_explicit_destination=allow_large_results,
         )

-        total_batches = 0
-        for df in execute_result.to_pandas_batches(
-            page_size=page_size, max_results=max_results
-        ):
-            total_batches += 1
-            self._copy_index_to_pandas(df)
-            if squeeze:
-                yield df.squeeze(axis=1)
-            else:
-                yield df
-
         # To reduce the number of edge cases to consider when working with the
         # results of this, always return at least one DataFrame. See:
         # b/428918844.
-        if total_batches == 0:
-            df = pd.DataFrame(
-                {
-                    col: pd.Series([], dtype=self.expr.get_column_type(col))
-                    for col in itertools.chain(self.value_columns, self.index_columns)
-                }
-            )
-            self._copy_index_to_pandas(df)
-            yield df
+        empty_val = pd.DataFrame(
+            {
+                col: pd.Series([], dtype=self.expr.get_column_type(col))
+                for col in itertools.chain(self.value_columns, self.index_columns)
+            }
+        )
+        dfs = map(
+            lambda a: a[0],
+            itertools.zip_longest(
+                execute_result.to_pandas_batches(page_size, max_results),
+                [0],
+                fillvalue=empty_val,
+            ),
+        )
+        dfs = iter(map(self._copy_index_to_pandas, dfs))

-    def _copy_index_to_pandas(self, df: pd.DataFrame):
-        """Set the index on pandas DataFrame to match this block.
+        total_rows = execute_result.total_rows
+        if (total_rows is not None) and (max_results is not None):
+            total_rows = min(total_rows, max_results)

-        Warning: This method modifies ``df`` inplace.
-        """
+        return PandasBatches(dfs, total_rows)
+
+    def _copy_index_to_pandas(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Set the index on pandas DataFrame to match this block."""
         # Note: If BigQuery DataFrame has null index, a default one will be created for the local materialization.
+        new_df = df.copy()
         if len(self.index_columns) > 0:
-            df.set_index(list(self.index_columns), inplace=True)
+            new_df.set_index(list(self.index_columns), inplace=True)
             # Pandas names is annotated as list[str] rather than the more
             # general Sequence[Label] that BigQuery DataFrames has.
             # See: https://github.com/pandas-dev/pandas-stubs/issues/804
-            df.index.names = self.index.names  # type: ignore
-        df.columns = self.column_labels
+            new_df.index.names = self.index.names  # type: ignore
+        new_df.columns = self.column_labels
+        return new_df

     def _materialize_local(
         self, materialize_options: MaterializationOptions = MaterializationOptions()
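
The `itertools.zip_longest` construction above is what guarantees at least one batch: pairing the batch iterator with a single sentinel and an empty-frame `fillvalue` pads an exhausted iterator with exactly one empty DataFrame. The pattern in isolation, with toy frames rather than the real executor:

import itertools
import pandas as pd

def at_least_one_batch(batches, empty: pd.DataFrame):
    # zip_longest pads the shorter side: if `batches` yields nothing, the
    # lone sentinel in [0] still produces one pair, filled with `empty`.
    pairs = itertools.zip_longest(batches, [0], fillvalue=empty)
    return map(lambda pair: pair[0], pairs)

empty = pd.DataFrame({"a": pd.Series([], dtype="Int64")})
assert [len(df) for df in at_least_one_batch(iter([]), empty)] == [0]
assert [len(df) for df in at_least_one_batch(
    iter([pd.DataFrame({"a": [1, 2]})]), empty)] == [2]
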
@@ -724,9 +740,7 @@ def _materialize_local(
             )
         else:
             df = execute_result.to_pandas()
-            self._copy_index_to_pandas(df)
-
-        return df, execute_result.query_job
+            return self._copy_index_to_pandas(df), execute_result.query_job

     def _downsample(
         self, total_rows: int, sampling_method: str, fraction: float, random_state
@@ -1591,8 +1605,7 @@ def retrieve_repr_request_results(
         row_count = self.session._executor.execute(self.expr.row_count()).to_py_scalar()

         head_df = head_result.to_pandas()
-        self._copy_index_to_pandas(head_df)
-        return head_df, row_count, head_result.query_job
+        return self._copy_index_to_pandas(head_df), row_count, head_result.query_job

     def promote_offsets(self, label: Label = None) -> typing.Tuple[Block, str]:
         expr, result_id = self._expr.promote_offsets()

β€Žbigframes/core/compile/sqlglot/expressions/binary_compiler.pyβ€Ž

Lines changed: 5 additions & 0 deletions
@@ -42,3 +42,8 @@ def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
 @BINARY_OP_REGISTRATION.register(ops.ge_op)
 def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
     return sge.GTE(this=left.expr, expression=right.expr)
+
+
+@BINARY_OP_REGISTRATION.register(ops.JSONSet)
+def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression:
+    return sge.func("JSON_SET", left.expr, sge.convert(op.json_path), right.expr)

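The `JSONSet` registration above just emits a sqlglot function call. Rendering the same construction standalone shows the generated SQL; the column name here is illustrative:

import sqlglot.expressions as sge

# Mirrors the registration: function name, input column, the converted
# json_path literal from the op, then the value expression.
expr = sge.func("JSON_SET", sge.column("json_col"), sge.convert("$.a"), sge.convert(100))
print(expr.sql(dialect="bigquery"))  # JSON_SET(json_col, '$.a', 100)
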
β€Žbigframes/core/compile/sqlglot/expressions/unary_compiler.pyβ€Ž

Lines changed: 46 additions & 0 deletions
@@ -70,3 +70,49 @@ def _(op: ops.ArraySliceOp, expr: TypedExpr) -> sge.Expression:
     )

     return sge.array(selected_elements)
+
+
+# JSON Ops
+@UNARY_OP_REGISTRATION.register(ops.JSONExtract)
+def _(op: ops.JSONExtract, expr: TypedExpr) -> sge.Expression:
+    return sge.func("JSON_EXTRACT", expr.expr, sge.convert(op.json_path))
+
+
+@UNARY_OP_REGISTRATION.register(ops.JSONExtractArray)
+def _(op: ops.JSONExtractArray, expr: TypedExpr) -> sge.Expression:
+    return sge.func("JSON_EXTRACT_ARRAY", expr.expr, sge.convert(op.json_path))
+
+
+@UNARY_OP_REGISTRATION.register(ops.JSONExtractStringArray)
+def _(op: ops.JSONExtractStringArray, expr: TypedExpr) -> sge.Expression:
+    return sge.func("JSON_EXTRACT_STRING_ARRAY", expr.expr, sge.convert(op.json_path))
+
+
+@UNARY_OP_REGISTRATION.register(ops.JSONQuery)
+def _(op: ops.JSONQuery, expr: TypedExpr) -> sge.Expression:
+    return sge.func("JSON_QUERY", expr.expr, sge.convert(op.json_path))
+
+
+@UNARY_OP_REGISTRATION.register(ops.JSONQueryArray)
+def _(op: ops.JSONQueryArray, expr: TypedExpr) -> sge.Expression:
+    return sge.func("JSON_QUERY_ARRAY", expr.expr, sge.convert(op.json_path))
+
+
+@UNARY_OP_REGISTRATION.register(ops.JSONValue)
+def _(op: ops.JSONValue, expr: TypedExpr) -> sge.Expression:
+    return sge.func("JSON_VALUE", expr.expr, sge.convert(op.json_path))
+
+
+@UNARY_OP_REGISTRATION.register(ops.JSONValueArray)
+def _(op: ops.JSONValueArray, expr: TypedExpr) -> sge.Expression:
+    return sge.func("JSON_VALUE_ARRAY", expr.expr, sge.convert(op.json_path))
+
+
+@UNARY_OP_REGISTRATION.register(ops.ParseJSON)
+def _(op: ops.ParseJSON, expr: TypedExpr) -> sge.Expression:
+    return sge.func("PARSE_JSON", expr.expr)
+
+
+@UNARY_OP_REGISTRATION.register(ops.ToJSONString)
+def _(op: ops.ToJSONString, expr: TypedExpr) -> sge.Expression:
+    return sge.func("TO_JSON_STRING", expr.expr)

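All of the unary registrations above follow one of two shapes: path-taking ops pass the converted `json_path`, while `PARSE_JSON` and `TO_JSON_STRING` take only the input expression. Rendered standalone with an illustrative column name:

import sqlglot.expressions as sge

print(sge.func("JSON_VALUE", sge.column("json_col"), sge.convert("$.name")).sql(dialect="bigquery"))
# JSON_VALUE(json_col, '$.name')
print(sge.func("TO_JSON_STRING", sge.column("json_col")).sql(dialect="bigquery"))
# TO_JSON_STRING(json_col)
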
β€Žbigframes/series.pyβ€Ž

Lines changed: 2 additions & 3 deletions
@@ -648,13 +648,12 @@ def to_pandas_batches(
             form the original Series. Results stream from bigquery,
             see https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.table.RowIterator#google_cloud_bigquery_table_RowIterator_to_arrow_iterable
         """
-        df = self._block.to_pandas_batches(
+        batches = self._block.to_pandas_batches(
             page_size=page_size,
             max_results=max_results,
             allow_large_results=allow_large_results,
-            squeeze=True,
         )
-        return df
+        return map(lambda df: cast(pandas.Series, df.squeeze(1)), batches)

     def _compute_dry_run(self) -> bigquery.QueryJob:
         _, query_job = self._block._compute_dry_run((self._value_column,))

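With the `squeeze` parameter gone from the Block layer, `Series.to_pandas_batches` now squeezes each one-column batch itself. The squeeze step in isolation, on a toy frame:

import pandas as pd

# Each batch is a one-column DataFrame; squeeze(axis=1) collapses it to a
# pandas Series, which is what Series.to_pandas_batches must yield.
batch = pd.DataFrame({"int64_col": [1, 2, 3]})
s = batch.squeeze(axis=1)
assert isinstance(s, pd.Series)
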
β€Žbigframes/version.pyβ€Ž

Lines changed: 2 additions & 2 deletions
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-__version__ = "2.9.0"
+__version__ = "2.10.0"

 # {x-release-please-start-date}
-__release_date__ = "2025-06-30"
+__release_date__ = "2025-07-08"
 # {x-release-please-end}

β€Žtests/system/small/test_dataframe.pyβ€Ž

Lines changed: 15 additions & 0 deletions
@@ -871,6 +871,21 @@ def test_filter_df(scalars_dfs):
     assert_pandas_df_equal(bf_result, pd_result)


+def test_df_to_pandas_batches(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    capped_unfiltered_batches = scalars_df.to_pandas_batches(page_size=2, max_results=6)
+    bf_bool_series = scalars_df["bool_col"]
+    filtered_batches = scalars_df[bf_bool_series].to_pandas_batches()
+
+    pd_bool_series = scalars_pandas_df["bool_col"]
+    pd_result = scalars_pandas_df[pd_bool_series]
+
+    assert 6 == capped_unfiltered_batches.total_rows
+    assert len(pd_result) == filtered_batches.total_rows
+    assert_pandas_df_equal(pd.concat(filtered_batches), pd_result)
+
+
 def test_assign_new_column(scalars_dfs):
     scalars_df, scalars_pandas_df = scalars_dfs
     kwargs = {"new_col": 2}
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `rowindex` AS `bfcol_0`,
+    `json_col` AS `bfcol_1`
+  FROM `bigframes-dev`.`sqlglot_test`.`json_types`
+), `bfcte_1` AS (
+  SELECT
+    *,
+    JSON_SET(`bfcol_1`, '$.a', 100) AS `bfcol_4`
+  FROM `bfcte_0`
+), `bfcte_2` AS (
+  SELECT
+    *,
+    JSON_SET(`bfcol_4`, '$.b', 'hi') AS `bfcol_7`
+  FROM `bfcte_1`
+)
+SELECT
+  `bfcol_0` AS `rowindex`,
+  `bfcol_7` AS `json_col`
+FROM `bfcte_2`
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `rowindex` AS `bfcol_0`,
+    `json_col` AS `bfcol_1`
+  FROM `bigframes-dev`.`sqlglot_test`.`json_types`
+), `bfcte_1` AS (
+  SELECT
+    *,
+    JSON_EXTRACT(`bfcol_1`, '$') AS `bfcol_4`
+  FROM `bfcte_0`
+)
+SELECT
+  `bfcol_0` AS `rowindex`,
+  `bfcol_4` AS `json_col`
+FROM `bfcte_1`
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+WITH `bfcte_0` AS (
+  SELECT
+    `rowindex` AS `bfcol_0`,
+    `string_col` AS `bfcol_1`
+  FROM `bigframes-dev`.`sqlglot_test`.`scalar_types`
+), `bfcte_1` AS (
+  SELECT
+    *,
+    JSON_VALUE(`bfcol_1`, '$') AS `bfcol_4`
+  FROM `bfcte_0`
+)
+SELECT
+  `bfcol_0` AS `rowindex`,
+  `bfcol_4` AS `string_col`
+FROM `bfcte_1`
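
These three snapshots are the expected compiler output for the new JSON ops. A hedged sketch of user-level calls that would exercise them, assuming the `json_set`, `json_extract`, and `json_value` helpers in `bigframes.bigquery` (treat the exact signatures as illustrative, not confirmed by this commit):

import bigframes.bigquery as bbq
import bigframes.pandas as bpd

df = bpd.read_gbq("bigframes-dev.sqlglot_test.json_types")
# Two JSON_SET applications, matching the chained CTEs of the first snapshot.
df["json_col"] = bbq.json_set(
    df["json_col"], json_path_value_pairs=[("$.a", 100), ("$.b", "hi")]
)
# JSON_EXTRACT with the root path, matching the second snapshot.
extracted = bbq.json_extract(df["json_col"], "$")

scalars = bpd.read_gbq("bigframes-dev.sqlglot_test.scalar_types")
# JSON_VALUE over a string column, matching the third snapshot.
values = bbq.json_value(scalars["string_col"], "$")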
