Skip to content
This repository was archived by the owner on Mar 6, 2026. It is now read-only.

Commit c2355c4

Browse files
committed
feat: add bool, int, float, string dtype to to_dataframe
1 parent a2520ca commit c2355c4

4 files changed

Lines changed: 272 additions & 11 deletions

File tree

google/cloud/bigquery/_pandas_helpers.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import logging
2222
import queue
2323
import warnings
24+
from typing import Any, Union
2425

2526
from packaging import version
2627

@@ -283,7 +284,13 @@ def bq_to_arrow_schema(bq_schema):
283284
return pyarrow.schema(arrow_fields)
284285

285286

286-
def default_types_mapper(date_as_object: bool = False):
287+
def default_types_mapper(
288+
date_as_object: bool = False,
289+
bool_dtype: Union[Any, None] = pandas.BooleanDtype(),
290+
int_dtype: Union[Any, None] = pandas.Int64Dtype(),
291+
float_dtype: Union[Any, None] = None,
292+
string_dtype: Union[Any, None] = None,
293+
):
287294
"""Create a mapping from pyarrow types to pandas types.
288295
289296
This overrides the pandas defaults to use null-safe extension types where
@@ -299,8 +306,17 @@ def default_types_mapper(date_as_object: bool = False):
299306
"""
300307

301308
def types_mapper(arrow_data_type):
302-
if pyarrow.types.is_boolean(arrow_data_type):
303-
return pandas.BooleanDtype()
309+
if bool_dtype is not None and pyarrow.types.is_boolean(arrow_data_type):
310+
return bool_dtype
311+
312+
elif int_dtype is not None and pyarrow.types.is_integer(arrow_data_type):
313+
return int_dtype
314+
315+
elif float_dtype is not None and pyarrow.types.is_floating(arrow_data_type):
316+
return float_dtype
317+
318+
elif string_dtype is not None and pyarrow.types.is_string(arrow_data_type):
319+
return string_dtype
304320

305321
elif (
306322
# If date_as_object is True, we know some DATE columns are
@@ -310,9 +326,6 @@ def types_mapper(arrow_data_type):
310326
):
311327
return db_dtypes.DateDtype()
312328

313-
elif pyarrow.types.is_integer(arrow_data_type):
314-
return pandas.Int64Dtype()
315-
316329
elif pyarrow.types.is_time(arrow_data_type):
317330
return db_dtypes.TimeDtype()
318331

google/cloud/bigquery/job/query.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,11 @@
5353
from google.cloud.bigquery.job.base import _JobConfig
5454
from google.cloud.bigquery.job.base import _JobReference
5555

56+
try:
57+
import pandas # type: ignore
58+
except ImportError: # pragma: NO COVER
59+
pandas = None
60+
5661
if typing.TYPE_CHECKING: # pragma: NO COVER
5762
# Assumption: type checks are only used by library developers and CI environments
5863
# that have all optional dependencies installed, thus no conditional imports.
@@ -1620,6 +1625,10 @@ def to_dataframe(
16201625
create_bqstorage_client: bool = True,
16211626
max_results: Optional[int] = None,
16221627
geography_as_object: bool = False,
1628+
bool_dtype: Union[Any, None] = pandas.BooleanDtype(),
1629+
int_dtype: Union[Any, None] = pandas.Int64Dtype(),
1630+
float_dtype: Union[Any, None] = None,
1631+
string_dtype: Union[Any, None] = None,
16231632
) -> "pandas.DataFrame":
16241633
"""Return a pandas DataFrame from a QueryJob
16251634
@@ -1672,6 +1681,46 @@ def to_dataframe(
16721681
16731682
.. versionadded:: 2.24.0
16741683
1684+
bool_dtype (Optional[pandas.Series.dtype]):
1685+
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
1686+
to convert BigQuery Boolean type, instead of relying on the default
1687+
``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
1688+
then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean
1689+
type can be found at:
1690+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
1691+
1692+
.. versionadded:: 3.7.1
1693+
1694+
int_dtype (Optional[pandas.Series.dtype]):
1695+
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
1696+
to convert BigQuery Integer types, instead of relying on the default
1697+
``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
1698+
then the data type will be ``numpy.dtype("int64")``. A list of BigQuery
1699+
Integer types can be found at:
1700+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
1701+
1702+
.. versionadded:: 3.7.1
1703+
1704+
float_dtype (Optional[pandas.Series.dtype]):
1705+
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
1706+
to convert BigQuery Float type, instead of relying on the default
1707+
``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
1708+
then the data type will be ``numpy.dtype("float64")``. BigQuery Float
1709+
type can be found at:
1710+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
1711+
1712+
.. versionadded:: 3.7.1
1713+
1714+
string_dtype (Optional[pandas.Series.dtype]):
1715+
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
1716+
convert BigQuery String type, instead of relying on the default
1717+
``numpy.dtype("object")``. If you explicitly set the value to ``None``,
1718+
then the data type will be ``numpy.dtype("object")``. BigQuery String
1719+
type can be found at:
1720+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
1721+
1722+
.. versionadded:: 3.7.1
1723+
16751724
Returns:
16761725
pandas.DataFrame:
16771726
A :class:`~pandas.DataFrame` populated with row data
@@ -1694,6 +1743,10 @@ def to_dataframe(
16941743
progress_bar_type=progress_bar_type,
16951744
create_bqstorage_client=create_bqstorage_client,
16961745
geography_as_object=geography_as_object,
1746+
bool_dtype=bool_dtype,
1747+
int_dtype=int_dtype,
1748+
float_dtype=float_dtype,
1749+
string_dtype=string_dtype,
16971750
)
16981751

16991752
# If changing the signature of this method, make sure to apply the same

google/cloud/bigquery/table.py

Lines changed: 87 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,11 @@
3434
except ImportError: # pragma: NO COVER
3535
pyarrow = None
3636

37+
try:
38+
import db_dtypes # type: ignore
39+
except ImportError: # pragma: NO COVER
40+
db_dtypes = None
41+
3742
try:
3843
import geopandas # type: ignore
3944
except ImportError:
@@ -88,6 +93,11 @@
8893

8994
_TABLE_HAS_NO_SCHEMA = 'Table has no schema: call "client.get_table()"'
9095

96+
_NO_SUPPORTED_DTYPE = (
97+
"The dtype cannot to be converted to a pandas ExtensionArray "
98+
"because the necessary `__from_arrow__` attribute is missing."
99+
)
100+
91101

92102
def _reference_getter(table):
93103
"""A :class:`~google.cloud.bigquery.table.TableReference` pointing to
@@ -1920,6 +1930,10 @@ def to_dataframe(
19201930
progress_bar_type: str = None,
19211931
create_bqstorage_client: bool = True,
19221932
geography_as_object: bool = False,
1933+
bool_dtype: Union[Any, None] = pandas.BooleanDtype(),
1934+
int_dtype: Union[Any, None] = pandas.Int64Dtype(),
1935+
float_dtype: Union[Any, None] = None,
1936+
string_dtype: Union[Any, None] = None,
19231937
) -> "pandas.DataFrame":
19241938
"""Create a pandas DataFrame by loading all pages of a query.
19251939
@@ -1958,6 +1972,7 @@ def to_dataframe(
19581972
progress bar as a graphical dialog box.
19591973
19601974
.. versionadded:: 1.11.0
1975+
19611976
create_bqstorage_client (Optional[bool]):
19621977
If ``True`` (default), create a BigQuery Storage API client
19631978
using the default API settings. The BigQuery Storage API
@@ -1975,6 +1990,46 @@ def to_dataframe(
19751990
19761991
.. versionadded:: 2.24.0
19771992
1993+
bool_dtype (Optional[pandas.Series.dtype]):
1994+
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.BooleanDtype()``)
1995+
to convert BigQuery Boolean type, instead of relying on the default
1996+
``pandas.BooleanDtype()``. If you explicitly set the value to ``None``,
1997+
then the data type will be ``numpy.dtype("bool")``. BigQuery Boolean
1998+
type can be found at:
1999+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#boolean_type
2000+
2001+
.. versionadded:: 3.7.1
2002+
2003+
int_dtype (Optional[pandas.Series.dtype]):
2004+
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Int64Dtype()``)
2005+
to convert BigQuery Integer types, instead of relying on the default
2006+
``pandas.Int64Dtype()``. If you explicitly set the value to ``None``,
2007+
then the data type will be ``numpy.dtype("int64")``. A list of BigQuery
2008+
Integer types can be found at:
2009+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#integer_types
2010+
2011+
.. versionadded:: 3.7.1
2012+
2013+
float_dtype (Optional[pandas.Series.dtype]):
2014+
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.Float32Dtype()``)
2015+
to convert BigQuery Float type, instead of relying on the default
2016+
``numpy.dtype("float64")``. If you explicitly set the value to ``None``,
2017+
then the data type will be ``numpy.dtype("float64")``. BigQuery Float
2018+
type can be found at:
2019+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#floating_point_types
2020+
2021+
.. versionadded:: 3.7.1
2022+
2023+
string_dtype (Optional[pandas.Series.dtype]):
2024+
If set, indicate a pandas ExtensionDtype (e.g. ``pandas.StringDtype()``) to
2025+
convert BigQuery String type, instead of relying on the default
2026+
``numpy.dtype("object")``. If you explicitly set the value to ``None``,
2027+
then the data type will be ``numpy.dtype("object")``. BigQuery String
2028+
type can be found at:
2029+
https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#string_type
2030+
2031+
.. versionadded:: 3.7.1
2032+
19782033
Returns:
19792034
pandas.DataFrame:
19802035
A :class:`~pandas.DataFrame` populated with row data and column
@@ -1987,14 +2042,28 @@ def to_dataframe(
19872042
the :mod:`google.cloud.bigquery_storage_v1` module is
19882043
required but cannot be imported. Also if
19892044
`geography_as_object` is `True`, but the
1990-
:mod:`shapely` library cannot be imported.
2045+
:mod:`shapely` library cannot be imported. Also if
2046+
`bool_dtype`, `int_dtype`, or any of the other dtype parameters
2047+
is not a supported dtype.
19912048
19922049
"""
19932050
_pandas_helpers.verify_pandas_imports()
19942051

19952052
if geography_as_object and shapely is None:
19962053
raise ValueError(_NO_SHAPELY_ERROR)
19972054

2055+
if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"):
2056+
raise ValueError("bool_dtype", _NO_SUPPORTED_DTYPE)
2057+
2058+
if int_dtype is not None and not hasattr(int_dtype, "__from_arrow__"):
2059+
raise ValueError("int_dtype", _NO_SUPPORTED_DTYPE)
2060+
2061+
if float_dtype is not None and not hasattr(float_dtype, "__from_arrow__"):
2062+
raise ValueError("float_dtype", _NO_SUPPORTED_DTYPE)
2063+
2064+
if string_dtype is not None and not hasattr(string_dtype, "__from_arrow__"):
2065+
raise ValueError("string_dtype", _NO_SUPPORTED_DTYPE)
2066+
19982067
if dtypes is None:
19992068
dtypes = {}
20002069

@@ -2019,15 +2088,15 @@ def to_dataframe(
20192088
for col in record_batch
20202089
# Type can be date32 or date64 (plus units).
20212090
# See: https://arrow.apache.org/docs/python/api/datatypes.html
2022-
if str(col.type).startswith("date")
2091+
if pyarrow.types.is_date(col.type)
20232092
)
20242093

20252094
timestamp_as_object = not all(
20262095
self.__can_cast_timestamp_ns(col)
20272096
for col in record_batch
2028-
# Type can be timestamp (plus units and time zone).
2097+
# Type can be datetime and timestamp (plus units and time zone).
20292098
# See: https://arrow.apache.org/docs/python/api/datatypes.html
2030-
if str(col.type).startswith("timestamp")
2099+
if pyarrow.types.is_timestamp(col.type)
20312100
)
20322101

20332102
if len(record_batch) > 0:
@@ -2036,7 +2105,11 @@ def to_dataframe(
20362105
timestamp_as_object=timestamp_as_object,
20372106
integer_object_nulls=True,
20382107
types_mapper=_pandas_helpers.default_types_mapper(
2039-
date_as_object=date_as_object
2108+
date_as_object=date_as_object,
2109+
bool_dtype=bool_dtype,
2110+
int_dtype=int_dtype,
2111+
float_dtype=float_dtype,
2112+
string_dtype=string_dtype,
20402113
),
20412114
)
20422115
else:
@@ -2233,6 +2306,10 @@ def to_dataframe(
22332306
progress_bar_type=None,
22342307
create_bqstorage_client=True,
22352308
geography_as_object=False,
2309+
bool_dtype=None,
2310+
int_dtype=None,
2311+
float_dtype=None,
2312+
string_dtype=None,
22362313
) -> "pandas.DataFrame":
22372314
"""Create an empty dataframe.
22382315
@@ -2241,6 +2318,11 @@ def to_dataframe(
22412318
dtypes (Any): Ignored. Added for compatibility with RowIterator.
22422319
progress_bar_type (Any): Ignored. Added for compatibility with RowIterator.
22432320
create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator.
2321+
geography_as_object (bool): Ignored. Added for compatibility with RowIterator.
2322+
bool_dtype (Any): Ignored. Added for compatibility with RowIterator.
2323+
int_dtype (Any): Ignored. Added for compatibility with RowIterator.
2324+
float_dtype (Any): Ignored. Added for compatibility with RowIterator.
2325+
string_dtype (Any): Ignored. Added for compatibility with RowIterator.
22442326
22452327
Returns:
22462328
pandas.DataFrame: An empty :class:`~pandas.DataFrame`.

0 commit comments

Comments
 (0)