# Source code for apache_beam.transforms.enrichment_handlers.bigtable

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from collections.abc import Callable
from typing import Any
from typing import Optional

from google.api_core.exceptions import NotFound
from google.cloud import bigtable
from google.cloud.bigtable import Client
from google.cloud.bigtable.row_filters import CellsColumnLimitFilter
from google.cloud.bigtable.row_filters import RowFilter

import apache_beam as beam
from apache_beam.transforms.enrichment import EnrichmentSourceHandler
from apache_beam.transforms.enrichment_handlers.utils import ExceptionLevel

# Public API of this module.
__all__ = [
    'BigTableEnrichmentHandler',
]

# Type of a user-supplied callable that maps an input ``beam.Row`` to the
# raw bytes used as the Bigtable row key.
RowKeyFn = Callable[[beam.Row], bytes]

# Module-level logger, per the standard ``logging.getLogger(__name__)`` idiom.
_LOGGER = logging.getLogger(__name__)


class BigTableEnrichmentHandler(EnrichmentSourceHandler[beam.Row, beam.Row]):
  """A handler for :class:`apache_beam.transforms.enrichment.Enrichment`
  transform to interact with GCP BigTable.

  Args:
    project_id (str): GCP project-id of the BigTable cluster.
    instance_id (str): GCP instance-id of the BigTable cluster.
    table_id (str): GCP table-id of the BigTable.
    row_key (str): unique row-key field name from the input `beam.Row` object
      to use as `row_key` for BigTable querying.
    row_filter: a ``:class:`google.cloud.bigtable.row_filters.RowFilter``` to
      filter data read with ``read_row()``.
      Defaults to `CellsColumnLimitFilter(1)`.
    app_profile_id (str): App profile ID to use for BigTable.
      See https://cloud.google.com/bigtable/docs/app-profiles for more details.
    encoding (str): encoding type to convert the string to bytes and vice-versa
      from BigTable. Default is `utf-8`.
    row_key_fn: a lambda function that returns a string row key from the
      input row. It is used to build/extract the row key for Bigtable.
    exception_level: a `enum.Enum` value from
      ``apache_beam.transforms.enrichment_handlers.utils.ExceptionLevel``
      to set the level when an empty row is returned from the BigTable query.
      Defaults to ``ExceptionLevel.WARN``.
    include_timestamp (bool): If enabled, the timestamp associated with the
      value is returned as `(value, timestamp)` for each `row_key`.
      Defaults to `False` - only the latest value without the timestamp is
      returned.
  """
  def __init__(
      self,
      project_id: str,
      instance_id: str,
      table_id: str,
      row_key: str = "",
      row_filter: Optional[RowFilter] = CellsColumnLimitFilter(1),
      *,
      app_profile_id: Optional[str] = None,
      encoding: str = 'utf-8',
      row_key_fn: Optional[RowKeyFn] = None,
      exception_level: ExceptionLevel = ExceptionLevel.WARN,
      include_timestamp: bool = False,
  ):
    self._project_id = project_id
    self._instance_id = instance_id
    self._table_id = table_id
    self._row_key = row_key
    self._row_filter = row_filter
    self._app_profile_id = app_profile_id
    self._encoding = encoding
    self._row_key_fn = row_key_fn
    self._exception_level = exception_level
    self._include_timestamp = include_timestamp
    # Exactly one of `row_key` / `row_key_fn` must be set: both empty and
    # both set are equally invalid, which is the same as their truthiness
    # being equal.
    if bool(self._row_key) == bool(self._row_key_fn):
      raise ValueError(
          "Please specify exactly one of `row_key` or a lambda "
          "function with `row_key_fn` to extract the row key "
          "from the input row.")

  def __enter__(self):
    """connect to the Google BigTable cluster."""
    self.client = Client(project=self._project_id)
    self.instance = self.client.instance(self._instance_id)
    self._table = bigtable.table.Table(
        table_id=self._table_id,
        instance=self.instance,
        app_profile_id=self._app_profile_id)

  def __call__(self, request: beam.Row, *args, **kwargs):
    """Reads a row from the GCP BigTable and returns
    a `Tuple` of request and response.

    Args:
      request: the input `beam.Row` to enrich.

    Raises:
      KeyError: when the configured `row_key` field is missing from `request`.
      NotFound: when the configured BigTable cluster/table does not exist.
      ValueError: when no row matches and `exception_level` is `RAISE`.
    """
    response_dict: dict[str, Any] = {}
    row_key_str: str = ""
    try:
      if self._row_key_fn:
        row_key = self._row_key_fn(request)
      else:
        request_dict = request._asdict()
        row_key_str = str(request_dict[self._row_key])
        row_key = row_key_str.encode(self._encoding)
      row = self._table.read_row(row_key, filter_=self._row_filter)
      if row:
        # Flatten cells into {column_family: {column: value(s)}}.
        for cf_id, cf_v in row.cells.items():
          response_dict[cf_id] = {}
          for col_id, col_v in cf_v.items():
            if self._include_timestamp:
              # All cell versions, each as a (value, timestamp) pair.
              response_dict[cf_id][col_id.decode(self._encoding)] = [
                  (v.value.decode(self._encoding), v.timestamp) for v in col_v
              ]
            else:
              # Only the latest cell value, without its timestamp.
              response_dict[cf_id][col_id.decode(
                  self._encoding)] = col_v[0].value.decode(self._encoding)
      elif self._exception_level == ExceptionLevel.WARN:
        # Lazy %-args: the message is only formatted if the record is emitted.
        _LOGGER.warning(
            'no matching row found for row_key: %s with row_filter: %s',
            row_key_str,
            self._row_filter)
      elif self._exception_level == ExceptionLevel.RAISE:
        raise ValueError(
            'no matching row found for row_key: %s '
            'with row_filter=%s' % (row_key_str, self._row_filter))
    except KeyError:
      raise KeyError('row_key %s not found in input PCollection.' % row_key_str)
    except NotFound:
      raise NotFound(
          'GCP BigTable cluster `%s:%s:%s` not found.' %
          (self._project_id, self._instance_id, self._table_id))
    return request, beam.Row(**response_dict)

  def __exit__(self, exc_type, exc_val, exc_tb):
    """Clean the instantiated BigTable client."""
    self.client = None
    self.instance = None
    self._table = None

  def get_cache_key(self, request: beam.Row) -> str:
    """Returns a string formatted with row key since it is unique to a
    request made to `Bigtable`."""
    if self._row_key_fn:
      return "row_key: %s" % str(self._row_key_fn(request))
    return "%s: %s" % (self._row_key, request._asdict()[self._row_key])