Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion ci/environment-3.7.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,5 @@ dependencies:
# Including grpcio-status as a temporary workaround for
# https://github.com/googleapis/python-api-core/issues/301
- grpcio-status
- pandas-gbq<=0.15
- google-cloud-bigquery>=2.11.0
- google-cloud-bigquery-storage
1 change: 0 additions & 1 deletion ci/environment-3.8.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,5 @@ dependencies:
# Including grpcio-status as a temporary workaround for
# https://github.com/googleapis/python-api-core/issues/301
- grpcio-status
- pandas-gbq<=0.15
- google-cloud-bigquery>=2.11.0
- google-cloud-bigquery-storage
1 change: 0 additions & 1 deletion ci/environment-3.9.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,5 @@ dependencies:
# Including grpcio-status as a temporary workaround for
# https://github.com/googleapis/python-api-core/issues/301
- grpcio-status
- pandas-gbq<=0.15
- google-cloud-bigquery>=2.11.0
- google-cloud-bigquery-storage
31 changes: 23 additions & 8 deletions dask_bigquery/tests/test_core.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import random
import uuid
from datetime import datetime, timedelta, timezone

import google.auth
import pandas as pd
Expand All @@ -19,6 +20,7 @@ def df():
{
"name": random.choice(["fred", "wilma", "barney", "betty"]),
"number": random.randint(0, 100),
"timestamp": datetime.now(timezone.utc) - timedelta(days=i % 2),
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before adding timezone.utc I was getting this assertion error:

E           AssertionError: Attributes of DataFrame.iloc[:, 2] (column name="timestamp") are different
E           
E           Attribute "dtype" are different
E           [left]:  datetime64[ns, UTC]
E           [right]: datetime64[ns]

It seems that when reading back from BigQuery, it will automatically convert to UTC if not otherwise specified, causing the error.
@tswast can you confirm this is the case? any comments?

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TIMESTAMP columns are intended to come back as datetime64[ns, UTC], yes.

DATETIME should come back as datetime64[ns].

See my answer here on the difference between the two: https://stackoverflow.com/a/47724366/101923

Also note: both will come back as object dtype if there's a date outside of the pandas representable range, e.g. 0001-01-01 or 9999-12-31.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm actually working on making the pandas-gbq dtypes consistent with google-cloud-bigquery as we speak in googleapis/python-bigquery-pandas#444

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that if I don't provide a schema, BigQuery will infer that the dataframe column named "timestamp" is a TIMESTAMP column; therefore it comes back as datetime64[ns, UTC]. That being said, to keep the test simple I think we can make the local dataframe timezone-aware and test that it comes back as it should.

cc: @jrbourbeau Does this convince you? If so this PR is ready for review.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good 👍

"idx": i,
}
for i in range(10)
Expand All @@ -35,13 +37,26 @@ def dataset(df):
dataset_id = uuid.uuid4().hex
table_id = "table_test"
# push data to gbq
pd.DataFrame.to_gbq(
df,
destination_table=f"{dataset_id}.{table_id}",
project_id=project_id,
chunksize=5,
if_exists="append",

time_partitioning = bigquery.TimePartitioning(
type_=bigquery.TimePartitioningType.DAY,
field="timestamp",
) # field to use for partitioning

job_config = bigquery.LoadJobConfig(
write_disposition="WRITE_TRUNCATE", time_partitioning=time_partitioning
)

with bigquery.Client() as bq_client:
dataset = bigquery.Dataset(f"{project_id}.{dataset_id}")
bq_client.create_dataset(dataset)
job = bq_client.load_table_from_dataframe(
df,
destination=f"{project_id}.{dataset_id}.{table_id}",
job_config=job_config,
) # Make an API request.
job.result()

yield (project_id, dataset_id, table_id)

with bigquery.Client() as bq_client:
Expand All @@ -55,7 +70,7 @@ def test_read_gbq(df, dataset, client):
project_id, dataset_id, table_id = dataset
ddf = read_gbq(project_id=project_id, dataset_id=dataset_id, table_id=table_id)

assert list(ddf.columns) == ["name", "number", "idx"]
assert list(ddf.columns) == ["name", "number", "timestamp", "idx"]
assert ddf.npartitions == 2
assert assert_eq(ddf.set_index("idx"), df.set_index("idx"))

Expand All @@ -69,7 +84,7 @@ def test_read_row_filter(df, dataset, client):
row_filter="idx < 5",
)

assert list(ddf.columns) == ["name", "number", "idx"]
assert list(ddf.columns) == ["name", "number", "timestamp", "idx"]
assert ddf.npartitions == 2
assert assert_eq(ddf.set_index("idx").loc[:4], df.set_index("idx").loc[:4])

Expand Down