#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Pipeline options validator.
For internal use only; no backwards-compatibility guarantees.
"""
# pytype: skip-file
import logging
import re
import string
from urllib.parse import urlparse
from apache_beam.internal import pickler
from apache_beam.options.pipeline_options import DebugOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PortableOptions
from apache_beam.options.pipeline_options import SetupOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import TestOptions
from apache_beam.options.pipeline_options import TypeOptions
from apache_beam.options.pipeline_options import WorkerOptions
_LOGGER = logging.getLogger(__name__)
[docs]
class PipelineOptionsValidator(object):
"""Validates PipelineOptions.
Goes through a list of known PipelineOption subclassess and calls::
validate(validator)
if one is implemented. Aggregates a list of validation errors from all and
returns an aggregated list.
"""
# Validator will call validate on these subclasses of PipelineOptions
OPTIONS = [
DebugOptions,
GoogleCloudOptions,
PortableOptions,
SetupOptions,
StandardOptions,
TestOptions,
TypeOptions,
WorkerOptions
]
# Mutually exclusive options for different types of portable environments.
REQUIRED_ENVIRONMENT_OPTIONS = {
'DOCKER': [],
'PROCESS': ['process_command'],
'EXTERNAL': ['external_service_address'],
'LOOPBACK': []
}
OPTIONAL_ENVIRONMENT_OPTIONS = {
'DOCKER': ['docker_container_image'],
'PROCESS': ['process_variables'],
'EXTERNAL': [],
'LOOPBACK': []
}
# Possible validation errors.
ERR_MISSING_OPTION = 'Missing required option: %s.'
ERR_MISSING_GCS_PATH = 'Missing GCS path option: %s.'
ERR_INVALID_GCS_PATH = 'Invalid GCS path (%s), given for the option: %s.'
ERR_INVALID_GCS_BUCKET = (
'Invalid GCS bucket (%s), given for the option: %s. See '
'https://developers.google.com/storage/docs/bucketnaming '
'for more details.')
ERR_INVALID_GCS_OBJECT = 'Invalid GCS object (%s), given for the option: %s.'
ERR_INVALID_JOB_NAME = (
'Invalid job_name (%s); the name must consist of only the characters '
'[-a-z0-9], starting with a letter and ending with a letter or number')
ERR_INVALID_PROJECT_NUMBER = (
'Invalid Project ID (%s). Please make sure you specified the Project ID, '
'not project number.')
ERR_INVALID_PROJECT_ID = (
'Invalid Project ID (%s). Please make sure you specified the Project ID, '
'not project description.')
ERR_INVALID_ENDPOINT = (
'Invalid url (%s) for dataflow endpoint. Please provide a valid url.')
ERR_INVALID_NOT_POSITIVE = (
'Invalid value (%s) for option: %s. Value needs '
'to be positive.')
ERR_INVALID_TEST_MATCHER_TYPE = (
'Invalid value (%s) for option: %s. Please extend your matcher object '
'from hamcrest.core.base_matcher.BaseMatcher.')
ERR_INVALID_TEST_MATCHER_UNPICKLABLE = (
'Invalid value (%s) for option: %s. Please make sure the test matcher '
'is unpicklable.')
ERR_INVALID_TRANSFORM_NAME_MAPPING = (
'Invalid transform name mapping format. Please make sure the mapping is '
'string key-value pairs. Invalid pair: (%s:%s)')
ERR_INVALID_ENVIRONMENT = (
'Option %s is not compatible with environment type %s.')
ERR_ENVIRONMENT_CONFIG = (
'Option environment_config is incompatible with option(s) %s.')
ERR_MISSING_REQUIRED_ENVIRONMENT_OPTION = (
'Option %s is required for environment type %s.')
ERR_NUM_WORKERS_TOO_HIGH = (
'num_workers (%s) cannot exceed max_num_workers (%s)')
ERR_REPEATABLE_OPTIONS_NOT_SET_AS_LIST = (
'(%s) is a string. Programmatically set PipelineOptions like (%s) '
'options need to be specified as a list.')
# GCS path specific patterns.
GCS_URI = '(?P<SCHEME>[^:]+)://(?P<BUCKET>[^/]+)(/(?P<OBJECT>.*))?'
GCS_BUCKET = '^[a-z0-9][-_a-z0-9.]+[a-z0-9]$'
GCS_SCHEME = 'gs'
# GoogleCloudOptions specific patterns.
JOB_PATTERN = '[a-z]([-a-z0-9]*[a-z0-9])?'
PROJECT_ID_PATTERN = '[a-z][-a-z0-9:.]+[a-z0-9]'
PROJECT_NUMBER_PATTERN = '[0-9]*'
def __init__(self, options, runner):
self.options = options
self.runner = runner
[docs]
def validate(self):
"""Calls validate on subclassess and returns a list of errors.
validate will call validate method on subclasses, accumulate the returned
list of errors, and returns the aggregate list.
Returns:
Aggregate list of errors after all calling all possible validate methods.
"""
errors = []
for cls in self.OPTIONS:
if 'validate' in cls.__dict__ and callable(cls.__dict__['validate']):
errors.extend(self.options.view_as(cls).validate(self))
return errors
[docs]
def is_service_runner(self):
"""True if pipeline will execute on the Google Cloud Dataflow service."""
is_service_runner = (
self.runner is not None and
type(self.runner).__name__ in ['DataflowRunner', 'TestDataflowRunner'])
dataflow_endpoint = (
self.options.view_as(GoogleCloudOptions).dataflow_endpoint)
if dataflow_endpoint is None:
return False
else:
endpoint_parts = urlparse(dataflow_endpoint, allow_fragments=False)
if endpoint_parts.netloc.startswith("localhost"):
return False
return is_service_runner
[docs]
def is_full_string_match(self, pattern, string):
"""Returns True if the pattern matches the whole string."""
pattern = '^%s$' % pattern
return re.search(pattern, string) is not None
def _validate_error(self, err, *args):
return [err % args]
[docs]
def validate_gcs_path(self, view, arg_name):
"""Validates a GCS path against gs://bucket/object URI format."""
arg = getattr(view, arg_name, None)
if arg is None:
return self._validate_error(self.ERR_MISSING_GCS_PATH, arg_name)
match = re.match(self.GCS_URI, arg, re.DOTALL)
if match is None:
return self._validate_error(self.ERR_INVALID_GCS_PATH, arg, arg_name)
scheme = match.group('SCHEME')
bucket = match.group('BUCKET')
gcs_object = match.group('OBJECT')
if ((scheme is None) or (scheme.lower() != self.GCS_SCHEME) or
(bucket is None)):
return self._validate_error(self.ERR_INVALID_GCS_PATH, arg, arg_name)
if not self.is_full_string_match(self.GCS_BUCKET, bucket):
return self._validate_error(self.ERR_INVALID_GCS_BUCKET, arg, arg_name)
if gcs_object is None or '\n' in gcs_object or '\r' in gcs_object:
return self._validate_error(self.ERR_INVALID_GCS_OBJECT, arg, arg_name)
return []
[docs]
def validate_cloud_options(self, view):
"""Validates job_name and project arguments."""
errors = []
if (view.job_name and
not self.is_full_string_match(self.JOB_PATTERN, view.job_name)):
errors.extend(
self._validate_error(self.ERR_INVALID_JOB_NAME, view.job_name))
project = view.project
if project is None:
errors.extend(self._validate_error(self.ERR_MISSING_OPTION, 'project'))
else:
if self.is_full_string_match(self.PROJECT_NUMBER_PATTERN, project):
errors.extend(
self._validate_error(self.ERR_INVALID_PROJECT_NUMBER, project))
elif not self.is_full_string_match(self.PROJECT_ID_PATTERN, project):
errors.extend(
self._validate_error(self.ERR_INVALID_PROJECT_ID, project))
if view.update:
if not view.job_name:
errors.extend(
self._validate_error(
'Existing job name must be provided when updating a pipeline.'))
if view.transform_name_mapping:
if not view.update or not self.options.view_as(StandardOptions).streaming:
errors.append(
'Transform name mapping option is only useful when '
'--update and --streaming is specified')
for _, (key, value) in enumerate(view.transform_name_mapping.items()):
if not isinstance(key, str) or not isinstance(value, str):
errors.extend(
self._validate_error(
self.ERR_INVALID_TRANSFORM_NAME_MAPPING, key, value))
break
if view.region is None and self.is_service_runner():
default_region = self.runner.get_default_gcp_region()
if default_region is None:
errors.extend(self._validate_error(self.ERR_MISSING_OPTION, 'region'))
else:
view.region = default_region
dataflow_endpoint = view.dataflow_endpoint
if dataflow_endpoint is None:
errors.extend(
self._validate_error(self.ERR_MISSING_OPTION, dataflow_endpoint))
else:
valid_endpoint = self.validate_endpoint_url(dataflow_endpoint)
if valid_endpoint is False:
errors.extend(
self._validate_error(self.ERR_INVALID_ENDPOINT, dataflow_endpoint))
return errors
[docs]
def validate_sdk_container_image_options(self, view):
errors = []
if view.sdk_container_image and view.worker_harness_container_image:
# To be fully backwards-compatible, these options will be set to the same
# value. Check that the values are different.
if view.sdk_container_image != view.worker_harness_container_image:
errors.extend(
self._validate_error(
'Cannot use legacy flag --worker_harness_container_image along '
'with view.sdk_container_image'))
elif view.worker_harness_container_image:
# Warn about legacy flag and set new flag to value of old flag.
_LOGGER.warning(
'Setting sdk_container_image to value of legacy flag '
'worker_harness_container_image.')
view.sdk_container_image = view.worker_harness_container_image
elif view.sdk_container_image:
# Set legacy option to value of new option.
view.worker_harness_container_image = view.sdk_container_image
return errors
[docs]
def validate_container_prebuilding_options(self, view):
errors = []
custom_image = self.options.view_as(WorkerOptions).sdk_container_image
if (view.prebuild_sdk_container_base_image is not None and
custom_image != view.prebuild_sdk_container_base_image):
errors.extend(
self._validate_error(
'Don\'t use the deprecated option '
'--prebuild_sdk_container_base_image. Use --sdk_container_image '
'instead.'))
return errors
[docs]
def validate_num_workers(self, view):
"""Validates that Dataflow worker number is valid."""
errors = self.validate_optional_argument_positive(view, 'num_workers')
errors.extend(
self.validate_optional_argument_positive(view, 'max_num_workers'))
num_workers = view.num_workers
max_num_workers = view.max_num_workers
if (num_workers is not None and max_num_workers is not None and
num_workers > max_num_workers):
errors.extend(
self._validate_error(
self.ERR_NUM_WORKERS_TOO_HIGH, num_workers, max_num_workers))
return errors
[docs]
def validate_worker_region_zone(self, view):
"""Validates Dataflow worker region and zone arguments are consistent."""
errors = []
if view.zone and (view.worker_region or view.worker_zone):
errors.extend(
self._validate_error(
'Cannot use deprecated flag --zone along with worker_region or '
'worker_zone.'))
if self.options.view_as(DebugOptions).lookup_experiment('worker_region') \
and (view.worker_region or view.worker_zone):
errors.extend(
self._validate_error(
'Cannot use deprecated experiment worker_region along with '
'worker_region or worker_zone.'))
if view.worker_region and view.worker_zone:
errors.extend(
self._validate_error(
'worker_region and worker_zone are mutually exclusive.'))
if view.zone:
_LOGGER.warning(
'Option --zone is deprecated. Please use --worker_zone instead.')
view.worker_zone = view.zone
view.zone = None
return errors
[docs]
def validate_optional_argument_positive(self, view, arg_name):
"""Validates that an optional argument (if set) has a positive value."""
arg = getattr(view, arg_name, None)
if arg is not None and int(arg) <= 0:
return self._validate_error(self.ERR_INVALID_NOT_POSITIVE, arg, arg_name)
return []
[docs]
def validate_test_matcher(self, view, arg_name):
"""Validates that on_success_matcher argument if set.
Validates that on_success_matcher is unpicklable and is instance
of `hamcrest.core.base_matcher.BaseMatcher`.
"""
# This is a test only method and requires hamcrest
from hamcrest.core.base_matcher import BaseMatcher
pickled_matcher = view.on_success_matcher
errors = []
try:
matcher = pickler.loads(pickled_matcher)
if not isinstance(matcher, BaseMatcher):
errors.extend(
self._validate_error(
self.ERR_INVALID_TEST_MATCHER_TYPE, matcher, arg_name))
except: # pylint: disable=bare-except
errors.extend(
self._validate_error(
self.ERR_INVALID_TEST_MATCHER_UNPICKLABLE,
pickled_matcher,
arg_name))
return errors
[docs]
def validate_environment_options(self, view):
"""Validates portable environment options."""
errors = []
actual_environment_type = (
view.environment_type.upper() if view.environment_type else None)
for environment_type, required in self.REQUIRED_ENVIRONMENT_OPTIONS.items():
found_required_options = [
opt for opt in required
if view.lookup_environment_option(opt) is not None
]
found_optional_options = [
opt for opt in self.OPTIONAL_ENVIRONMENT_OPTIONS[environment_type]
if view.lookup_environment_option(opt) is not None
]
found_options = found_required_options + found_optional_options
if environment_type == actual_environment_type:
if view.environment_config:
if found_options:
errors.extend(
self._validate_error(
self.ERR_ENVIRONMENT_CONFIG, ', '.join(found_options)))
else:
missing_options = set(required).difference(
set(found_required_options))
for opt in missing_options:
errors.extend(
self._validate_error(
self.ERR_MISSING_REQUIRED_ENVIRONMENT_OPTION,
opt,
environment_type))
else:
# Environment options classes are mutually exclusive.
for opt in found_options:
errors.extend(
self._validate_error(
self.ERR_INVALID_ENVIRONMENT, opt, actual_environment_type))
if actual_environment_type == 'LOOPBACK' and view.environment_config:
errors.extend(
self._validate_error(
self.ERR_INVALID_ENVIRONMENT, 'environment_config', 'LOOPBACK'))
return errors
[docs]
def validate_repeatable_argument_passed_as_list(self, view, arg_name):
"""Validates that repeatable PipelineOptions like dataflow_service_options
or experiments are specified as a list when set programmatically. This
way, users do not inadvertently specify it as a string, mirroring the way
they are set via the command lineRepeatable options, which are as passed a
list.
"""
arg = getattr(view, arg_name, None)
if not isinstance(arg, list):
return self._validate_error(
self.ERR_REPEATABLE_OPTIONS_NOT_SET_AS_LIST, arg, arg_name)
return []
# Minimally validates the endpoint url. This is not a strict application
# of http://www.faqs.org/rfcs/rfc1738.html.
# If the url matches localhost, set
[docs]
def validate_endpoint_url(self, endpoint_url):
url_parts = urlparse(endpoint_url, allow_fragments=False)
if not url_parts.scheme or not url_parts.netloc:
return False
if url_parts.scheme not in ['http', 'https']:
return False
if set(
url_parts.netloc) <= set(string.ascii_letters + string.digits + '-.'):
return True
return False