googleapis · daniel-sanche · Jun 23, 2023 · Apr 1, 2023 · Apr 1, 2023 · Apr 2, 2023
diff --git a/google/cloud/bigtable/__init__.py b/google/cloud/bigtable/__init__.py
@@ -36,6 +36,8 @@
 
 # Type alias for the output of sample_keys
 RowKeySamples = List[Tuple[bytes, int]]
+# Type alias for the output of query.shard()
+ShardedQuery = List[ReadRowsQuery]
 
 __version__: str = package_version.__version__
 

diff --git a/google/cloud/bigtable/client.py b/google/cloud/bigtable/client.py
@@ -38,6 +38,7 @@
 from google.cloud.bigtable_v2.services.bigtable.transports.pooled_grpc_asyncio import (
     PooledBigtableGrpcAsyncIOTransport,
 )
+from google.cloud.bigtable_v2.types.bigtable import PingAndWarmRequest
 from google.cloud.client import ClientWithProject
 from google.api_core.exceptions import GoogleAPICallError
 from google.api_core import retry_async as retries
@@ -50,10 +51,14 @@
 from google.cloud.bigtable.row import Row
 from google.cloud.bigtable.read_rows_query import ReadRowsQuery
 from google.cloud.bigtable.iterators import ReadRowsIterator
+from google.cloud.bigtable.exceptions import FailedQueryShardError
+from google.cloud.bigtable.exceptions import ShardedReadRowsExceptionGroup
+
 from google.cloud.bigtable.mutations import Mutation, RowMutationEntry
 from google.cloud.bigtable._mutate_rows import _MutateRowsOperation
 from google.cloud.bigtable._helpers import _make_metadata
 from google.cloud.bigtable._helpers import _convert_retry_deadline
+from google.cloud.bigtable._helpers import _attempt_timeout_generator
 
 from google.cloud.bigtable.read_modify_write_rules import ReadModifyWriteRule
 from google.cloud.bigtable.row_filters import RowFilter
@@ -64,6 +69,10 @@
 if TYPE_CHECKING:
     from google.cloud.bigtable.mutations_batcher import MutationsBatcher
     from google.cloud.bigtable import RowKeySamples
+    from google.cloud.bigtable import ShardedQuery
+
+# used by read_rows_sharded to limit how many requests are attempted in parallel
+CONCURRENCY_LIMIT = 10
 
 
 class BigtableDataClient(ClientWithProject):
@@ -190,10 +199,13 @@ async def _ping_and_warm_instances(
             - sequence of results or exceptions from the ping requests
         """
         ping_rpc = channel.unary_unary(
-            "/google.bigtable.v2.Bigtable/PingAndWarmChannel"
+            "/google.bigtable.v2.Bigtable/PingAndWarm",
+            request_serializer=PingAndWarmRequest.serialize,
         )
         tasks = [ping_rpc({"name": n}) for n in self._active_instances]
-        return await asyncio.gather(*tasks, return_exceptions=True)
+        result = await asyncio.gather(*tasks, return_exceptions=True)
+        # return None in place of empty successful responses
+        return [r or None for r in result]
 
     async def _manage_channel(
         self,
@@ -532,22 +544,79 @@ async def read_row(
 
     async def read_rows_sharded(
         self,
-        query_list: list[ReadRowsQuery] | list[dict[str, Any]],
+        sharded_query: ShardedQuery,
         *,
-        limit: int | None,
-        operation_timeout: int | float | None = 60,
+        operation_timeout: int | float | None = None,
         per_request_timeout: int | float | None = None,
-    ) -> ReadRowsIterator:
+    ) -> list[Row]:
         """
-        Runs a sharded query in parallel
+        Runs a sharded query in parallel, then returns the results in a single list.
+        Results will be returned in the order of the input queries.
+
+        This function is intended to be run on the result of a query.shard() call:
 
-        Each query in query list will be run concurrently, with results yielded as they are ready
-        yielded results may be out of order
+        ```
+        table_shard_keys = await table.sample_row_keys()
+        query = ReadRowsQuery(...)
+        shard_queries = query.shard(table_shard_keys)
+        results = await table.read_rows_sharded(shard_queries)
+        ```
 
         Args:
-            - query_list: a list of queries to run in parallel
+            - sharded_query: a sharded query to execute
+        Raises:
+            - ShardedReadRowsExceptionGroup: if any of the queries failed
+            - ValueError: if sharded_query is empty
         """
-        raise NotImplementedError
+        if not sharded_query:
+            raise ValueError("empty sharded_query")
+        # resolve default timeouts; operation_timeout is reduced between batches
+        operation_timeout = operation_timeout or self.default_operation_timeout
+        per_request_timeout = (
+            per_request_timeout or self.default_per_request_timeout or operation_timeout
+        )
+        timeout_generator = _attempt_timeout_generator(
+            operation_timeout, operation_timeout
+        )
+        # submit shards in batches if the number of shards goes over CONCURRENCY_LIMIT
+        batched_queries = [
+            sharded_query[i : i + CONCURRENCY_LIMIT]
+            for i in range(0, len(sharded_query), CONCURRENCY_LIMIT)
+        ]
+        # run batches and collect results
+        results_list = []
+        error_dict = {}
+        shard_idx = 0
+        for batch in batched_queries:
+            batch_operation_timeout = next(timeout_generator)
+            routine_list = [
+                self.read_rows(
+                    query,
+                    operation_timeout=batch_operation_timeout,
+                    per_request_timeout=min(
+                        per_request_timeout, batch_operation_timeout
+                    ),
+                )
+                for query in batch
+            ]
+            batch_result = await asyncio.gather(*routine_list, return_exceptions=True)
+            for result in batch_result:
+                if isinstance(result, Exception):
+                    error_dict[shard_idx] = result
+                else:
+                    results_list.extend(result)
+                shard_idx += 1
+        if error_dict:
+            # if any sub-request failed, raise an exception instead of returning results
+            raise ShardedReadRowsExceptionGroup(
+                [
+                    FailedQueryShardError(idx, sharded_query[idx], e)
+                    for idx, e in error_dict.items()
+                ],
+                results_list,
+                len(sharded_query),
+            )
+        return results_list
 
     async def row_exists(
         self,
@@ -577,32 +646,81 @@ async def row_exists(
         )
         return len(results) > 0
 
-    async def sample_keys(
+    async def sample_row_keys(
         self,
         *,
-        operation_timeout: int | float | None = 60,
-        per_sample_timeout: int | float | None = 10,
-        per_request_timeout: int | float | None = None,
+        operation_timeout: float | None = None,
+        per_request_timeout: float | None = None,
     ) -> RowKeySamples:
         """
         Return a set of RowKeySamples that delimit contiguous sections of the table of
         approximately equal size
 
         RowKeySamples output can be used with ReadRowsQuery.shard() to create a sharded query that
         can be parallelized across multiple backend nodes read_rows and read_rows_stream
-        requests will call sample_keys internally for this purpose when sharding is enabled
+        requests will call sample_row_keys internally for this purpose when sharding is enabled
 
         RowKeySamples is simply a type alias for list[tuple[bytes, int]]; a list of
             row_keys, along with offset positions in the table
 
         Returns:
             - a set of RowKeySamples the delimit contiguous sections of the table
         Raises:
-            - DeadlineExceeded: raised after operation timeout
-                will be chained with a RetryExceptionGroup containing all GoogleAPIError
-                exceptions from any retries that failed
+            - GoogleAPICallError: if the sample_row_keys request fails
         """
-        raise NotImplementedError
+        # prepare timeouts
+        operation_timeout = operation_timeout or self.default_operation_timeout
+        per_request_timeout = per_request_timeout or self.default_per_request_timeout
+
+        if operation_timeout <= 0:
+            raise ValueError("operation_timeout must be greater than 0")
+        if per_request_timeout is not None and per_request_timeout <= 0:
+            raise ValueError("per_request_timeout must be greater than 0")
+        if per_request_timeout is not None and per_request_timeout > operation_timeout:
+            raise ValueError(
+                "per_request_timeout must not be greater than operation_timeout"
+            )
+        attempt_timeout_gen = _attempt_timeout_generator(
+            per_request_timeout, operation_timeout
+        )
+        # prepare retryable
+        predicate = retries.if_exception_type(
+            core_exceptions.DeadlineExceeded,
+            core_exceptions.ServiceUnavailable,
+        )
+        transient_errors = []
+
+        def on_error_fn(exc):
+            # add errors to list if retryable
+            if predicate(exc):
+                transient_errors.append(exc)
+
+        retry = retries.AsyncRetry(
+            predicate=predicate,
+            timeout=operation_timeout,
+            initial=0.01,
+            multiplier=2,
+            maximum=60,
+            on_error=on_error_fn,
+            is_stream=False,
+        )
+
+        # prepare request
+        metadata = _make_metadata(self.table_name, self.app_profile_id)
+
+        async def execute_rpc():
+            results = await self.client._gapic_client.sample_row_keys(
+                table_name=self.table_name,
+                app_profile_id=self.app_profile_id,
+                timeout=next(attempt_timeout_gen),
+                metadata=metadata,
+            )
+            return [(s.row_key, s.offset_bytes) async for s in results]
+
+        wrapped_fn = _convert_retry_deadline(
+            retry(execute_rpc), operation_timeout, transient_errors
+        )
+        return await wrapped_fn()
 
     def mutations_batcher(self, **kwargs) -> MutationsBatcher:
         """
@@ -896,16 +1014,17 @@ async def close(self):
         """
         Called to close the Table instance and release any resources held by it.
         """
+        self._register_instance_task.cancel()
         await self.client._remove_instance_registration(self.instance_id, self)
 
     async def __aenter__(self):
         """
         Implement async context manager protocol
 
-        Register this instance with the client, so that
+        Ensure registration task has time to run, so that
         grpc channels will be warmed for the specified instance
         """
-        await self.client._register_instance(self.instance_id, self)
+        await self._register_instance_task
         return self
 
     async def __aexit__(self, exc_type, exc_val, exc_tb):

diff --git a/google/cloud/bigtable/exceptions.py b/google/cloud/bigtable/exceptions.py
@@ -16,14 +16,16 @@
 
 import sys
 
-from typing import TYPE_CHECKING
+from typing import Any, TYPE_CHECKING
 
 from google.api_core import exceptions as core_exceptions
+from google.cloud.bigtable.row import Row
 
 is_311_plus = sys.version_info >= (3, 11)
 
 if TYPE_CHECKING:
     from google.cloud.bigtable.mutations import RowMutationEntry
+    from google.cloud.bigtable.read_rows_query import ReadRowsQuery
 
 
 class IdleTimeout(core_exceptions.DeadlineExceeded):
@@ -137,3 +139,49 @@ def __init__(self, excs: list[Exception]):
 
     def __new__(cls, excs: list[Exception]):
         return super().__new__(cls, cls._format_message(excs), excs)
+
+
+class ShardedReadRowsExceptionGroup(BigtableExceptionGroup):
+    """
+    Represents one or more exceptions that occur during a sharded read rows operation
+    """
+
+    @staticmethod
+    def _format_message(excs: list[FailedQueryShardError], total_queries: int):
+        query_str = "query" if total_queries == 1 else "queries"
+        plural_str = "" if len(excs) == 1 else "s"
+        return f"{len(excs)} sub-exception{plural_str} (from {total_queries} {query_str} attempted)"
+
+    def __init__(
+        self,
+        excs: list[FailedQueryShardError],
+        succeeded: list[Row],
+        total_queries: int,
+    ):
+        super().__init__(self._format_message(excs, total_queries), excs)
+        self.successful_rows = succeeded
+
+    def __new__(
+        cls, excs: list[FailedQueryShardError], succeeded: list[Row], total_queries: int
+    ):
+        instance = super().__new__(cls, cls._format_message(excs, total_queries), excs)
+        instance.successful_rows = succeeded
+        return instance
+
+
+class FailedQueryShardError(Exception):
+    """
+    Represents an individual failed query in a sharded read rows operation
+    """
+
+    def __init__(
+        self,
+        failed_index: int,
+        failed_query: "ReadRowsQuery" | dict[str, Any],
+        cause: Exception,
+    ):
+        message = f"Failed query at index {failed_index} with cause: {cause!r}"
+        super().__init__(message)
+        self.index = failed_index
+        self.query = failed_query
+        self.__cause__ = cause