Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: Reduce CURRENT_TIMESTAMP queries #1114

Merged
merged 4 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 2 additions & 16 deletions bigframes/session/_io/bigquery/read_gbq_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@
def get_table_metadata(
bqclient: bigquery.Client,
table_ref: google.cloud.bigquery.table.TableReference,
bq_time: datetime.datetime,
*,
api_name: str,
cache: Dict[bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]],
use_cache: bool = True,
) -> Tuple[datetime.datetime, google.cloud.bigquery.table.Table]:
Expand Down Expand Up @@ -76,23 +76,9 @@ def get_table_metadata(
)
return cached_table

# TODO(swast): It's possible that the table metadata is changed between now
# and when we run the CURRENT_TIMESTAMP() query to see when we can time
# travel to. Find a way to fetch the table metadata and BQ's current time
# atomically.
table = bqclient.get_table(table_ref)

# TODO(swast): Use session._start_query instead?
# TODO(swast): Use query_and_wait since we know these are small results.
job_config = bigquery.QueryJobConfig()
bigframes.session._io.bigquery.add_labels(job_config, api_name=api_name)
snapshot_timestamp = list(
bqclient.query(
"SELECT CURRENT_TIMESTAMP() AS `current_timestamp`",
job_config=job_config,
).result()
)[0][0]
cached_table = (snapshot_timestamp, table)
cached_table = (bq_time, table)
cache[table_ref] = cached_table
return cached_table

Expand Down
5 changes: 4 additions & 1 deletion bigframes/session/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
import bigframes.session.metrics
import bigframes.session.planner
import bigframes.session.temp_storage
import bigframes.session.time as session_time
import bigframes.version

# Avoid circular imports.
Expand Down Expand Up @@ -128,6 +129,8 @@ def __init__(
self._metrics = metrics
# Unfortunate circular reference, but need to pass reference when constructing objects
self._session = session
self._clock = session_time.BigQuerySyncedClock(bqclient)
self._clock.sync()

def read_pandas_load_job(
self, pandas_dataframe: pandas.DataFrame, api_name: str
Expand Down Expand Up @@ -246,7 +249,7 @@ def read_gbq_table(
time_travel_timestamp, table = bf_read_gbq_table.get_table_metadata(
self._bqclient,
table_ref=table_ref,
api_name=api_name,
bq_time=self._clock.get_time(),
cache=self._df_snapshot,
use_cache=use_cache,
)
Expand Down
59 changes: 59 additions & 0 deletions bigframes/session/time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import threading
import time
from typing import cast, Optional, Tuple

import google.cloud.bigquery as bigquery

# Minimum number of seconds that must pass (on the local monotonic clock)
# before sync() will query BigQuery for the time again.
MIN_RESYNC_SECONDS = 100


class BigQuerySyncedClock:
    """
    Local clock that attempts to synchronize its time with the bigquery service.

    ``sync`` runs ``SELECT CURRENT_TIMESTAMP()`` and records the returned
    timestamp together with the local monotonic clock reading taken once the
    reply has arrived. ``get_time`` then extrapolates from that pair using
    elapsed monotonic time, so it does not need to query the service on every
    call.
    """

    def __init__(self, bqclient: "bigquery.Client"):
        """Create an unsynchronized clock that will query through ``bqclient``."""
        self._bqclient = bqclient
        # Serializes sync() so that concurrent callers do not each run a query.
        self._sync_lock = threading.Lock()
        # (remote timestamp, local monotonic reading) captured by the last
        # sync, or None before the first one. The pair lives in a single
        # attribute so get_time() can read both halves with one load and can
        # never combine a fresh remote timestamp with a stale monotonic
        # reference while another thread is in the middle of sync().
        self._sync_point: Optional[Tuple[datetime.datetime, float]] = None

    def get_time(self) -> datetime.datetime:
        """Return the estimated current BigQuery time.

        Synchronizes first if no sync has happened yet; otherwise no query is
        issued and the value is the last synced remote timestamp plus the
        monotonic time elapsed since that sync.
        """
        sync_point = self._sync_point
        if sync_point is None:
            self.sync()
            sync_point = self._sync_point
        # sync() either stores a sync point or raises, so this only narrows
        # the Optional for type checkers.
        assert sync_point is not None
        remote_time, monotonic_reference = sync_point
        return remote_time + datetime.timedelta(
            seconds=time.monotonic() - monotonic_reference
        )

    def sync(self) -> None:
        """Refresh the reference point from BigQuery.

        Does nothing when the previous sync happened less than
        ``MIN_RESYNC_SECONDS`` ago. Errors from the query propagate to the
        caller and leave the previous reference point untouched.
        """
        with self._sync_lock:
            sync_point = self._sync_point
            if (sync_point is not None) and (
                time.monotonic() - sync_point[1]
            ) < MIN_RESYNC_SECONDS:
                return
            rows = self._bqclient.query_and_wait(
                "SELECT CURRENT_TIMESTAMP() AS `current_timestamp`",
            )
            current_bq_time = next(iter(rows))[0]
            # The monotonic reading is taken after the reply arrives, so the
            # extrapolated time trails the service by about the reply latency
            # instead of running ahead of it.
            self._sync_point = (
                cast(datetime.datetime, current_bq_time),
                time.monotonic(),
            )
1 change: 1 addition & 0 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
UNIT_TEST_STANDARD_DEPENDENCIES = [
"mock",
"asyncmock",
"freezegun",
PYTEST_VERSION,
"pytest-cov",
"pytest-asyncio",
Expand Down
9 changes: 9 additions & 0 deletions tests/unit/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,16 @@ def query_mock(query, *args, **kwargs):

return query_job

existing_query_and_wait = bqclient.query_and_wait

def query_and_wait_mock(query, *args, **kwargs):
    """Answer clock-sync queries locally and delegate every other query.

    A query starting with ``SELECT CURRENT_TIMESTAMP()`` yields an iterator
    over one single-column row holding the current time. Any other query is
    passed, with its arguments, to the ``query_and_wait`` that was on the
    client before this mock was installed.
    """
    if query.startswith("SELECT CURRENT_TIMESTAMP()"):
        # Return an aware UTC datetime rather than a naive local one, so the
        # mocked value has the same shape as the aware timestamp used by the
        # clock unit test. NOTE(review): BigQuery TIMESTAMP values are
        # understood to come back as aware UTC datetimes - confirm.
        return iter([[datetime.datetime.now(datetime.timezone.utc)]])
    return existing_query_and_wait(query, *args, **kwargs)

bqclient.query = query_mock
bqclient.query_and_wait = query_and_wait_mock

clients_provider = mock.create_autospec(bigframes.session.clients.ClientsProvider)
type(clients_provider).bqclient = mock.PropertyMock(return_value=bqclient)
Expand Down
69 changes: 69 additions & 0 deletions tests/unit/session/test_time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime
import unittest.mock as mock

import freezegun
import google.cloud.bigquery
import pytest

import bigframes.session.time

# Fixed, timezone-aware timestamp that the mocked BigQuery client reports as
# its CURRENT_TIMESTAMP().
INITIAL_BQ_TIME = datetime.datetime(
    2020, 4, 24, 8, 55, 29, tzinfo=datetime.timezone.utc
)


@pytest.fixture()
def bq_client():
    """Provide an autospec'd BigQuery client whose clock query is mocked.

    ``query_and_wait`` is replaced with a function that answers
    ``SELECT CURRENT_TIMESTAMP()`` queries with a single row containing
    ``INITIAL_BQ_TIME`` and rejects every other query.
    """
    bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True)

    def query_and_wait_mock(query, *args, **kwargs):
        if query.startswith("SELECT CURRENT_TIMESTAMP()"):
            return iter([[INITIAL_BQ_TIME]])
        # Raise (not return) so an unexpected query fails the test at the call
        # site instead of handing an exception object back as a query result.
        raise ValueError(f"mock cannot handle query : {query}")

    bqclient.query_and_wait = query_and_wait_mock
    return bqclient


def test_bqsyncedclock_get_time(bq_client):
    """Readings start at the mocked BigQuery time and advance with local ticks."""
    # The absolute local start time is arbitrary; only the elapsed ticks
    # should influence what the clock reports.
    local_start = datetime.datetime(
        year=1, month=7, day=12, hour=15, minute=6, second=3
    )
    short_tick = datetime.timedelta(seconds=3)
    long_tick = datetime.timedelta(seconds=23529385)

    with freezegun.freeze_time(local_start) as frozen_datetime:
        clock = bigframes.session.time.BigQuerySyncedClock(bq_client)

        # First reading triggers the sync and equals the remote time exactly.
        first_reading = clock.get_time()
        assert first_reading == INITIAL_BQ_TIME

        frozen_datetime.tick(short_tick)
        second_reading = clock.get_time()
        assert second_reading == INITIAL_BQ_TIME + short_tick

        frozen_datetime.tick(long_tick)
        third_reading = clock.get_time()
        assert third_reading == INITIAL_BQ_TIME + short_tick + long_tick