Source code for apache_beam.yaml.generate_yaml_docs

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import glob
import io
import itertools
import os
import re

import docstring_parser
import yaml

from apache_beam.portability.api import schema_pb2
from apache_beam.typehints import schemas
from apache_beam.utils import subprocess_server
from apache_beam.utils.python_callable import PythonCallableWithSource
from apache_beam.version import __version__ as beam_version
from apache_beam.yaml import json_utils
from apache_beam.yaml import yaml_provider
from apache_beam.yaml import yaml_utils
from apache_beam.yaml.yaml_errors import ErrorHandlingConfig


def _singular(name):
  # Simply removing an 's' (or 'es', or 'ies', ...) may result in surprising
  # manglings. Better to play it safe and leave a correctly-spelled plural
  # than a botched singular in our examples configs.
  return {
      'args': 'arg',
      'attributes': 'attribute',
      'elements': 'element',
      'fields': 'field',
  }.get(name, name)


def _fake_value(name, beam_type):
  """Return a placeholder value illustrating a field of type ``beam_type``.

  The result is meant for example configs, not real data:

  * string fields become the quoted field name, booleans ``true|false``,
    and every other atomic or logical type the bare name;
  * arrays and iterables become two sample elements followed by ``'...'``;
  * maps with string keys become a dict with sample keys ``a`` and ``b``
    plus a ``'c': '...'`` entry, other maps a single fake key/value pair;
  * rows become a nested dict of fake field values.

  Args:
    name: Field name used as the basis of the placeholder text.
    beam_type: Field type message whose ``type_info`` oneof selects the
      shape of the placeholder.

  Raises:
    ValueError: If ``type_info`` is not one of the handled cases.
  """
  kind = beam_type.WhichOneof("type_info")

  if kind == "atomic_type":
    if beam_type.atomic_type == schema_pb2.STRING:
      return f'"{name}"'
    if beam_type.atomic_type == schema_pb2.BOOLEAN:
      return "true|false"
    return name

  if kind in ("array_type", "iterable_type"):
    element_type = getattr(beam_type, kind).element_type
    # Each sample comes from its own call, so the two entries are distinct
    # objects rather than one object listed twice.
    samples = [_fake_value(_singular(name), element_type) for _ in range(2)]
    samples.append('...')
    return samples

  if kind == "map_type":
    mapping = beam_type.map_type
    if mapping.key_type.atomic_type == schema_pb2.STRING:
      example = {
          key: _fake_value(name + '_value_' + key, mapping.value_type)
          for key in ('a', 'b')
      }
      example['c'] = '...'
      return example
    fake_key = _fake_value(name + '_key', mapping.key_type)
    fake_entry = _fake_value(name + '_value', mapping.value_type)
    return {fake_key: fake_entry}

  if kind == "row_type":
    return _fake_row(beam_type.row_type.schema)

  if kind == "logical_type":
    return name

  raise ValueError(f"Unrecognized type_info: {kind!r}")


def _fake_row(schema):
  if schema is None:
    return '...'
  return {f.name: _fake_value(f.name, f.type) for f in schema.fields}



[docs]
def pretty_example(provider, t, base_t=None):
  """Return an example YAML snippet showing how to use transform ``t``.

  Args:
    provider: Provider queried for ``requires_inputs`` and ``config_schema``.
    t: Name of the transform as known to the provider.
    base_t: Name to show under ``type`` in the example; falls back to ``t``.

  Returns:
    The YAML text of the example, with all single quotes removed.
  """
  # Providers that cannot answer are treated as not requiring inputs.
  try:
    needs_input = provider.requires_inputs(t, {})
  except Exception:
    needs_input = False

  example = {'type': base_t or t}
  if needs_input:
    example['input'] = '...'

  config_schema = provider.config_schema(t)
  # A schema that is known to have no fields gets no config section at all;
  # an unknown (None) schema still gets a placeholder one.
  known_to_be_empty = config_schema is not None and not config_schema.fields
  if not known_to_be_empty:
    example['config'] = _fake_row(config_schema)

  rendered = yaml.dump(example, sort_keys=False)
  return rendered.replace("'", "")




[docs]
def config_docs(schema):
  """Render a config schema as a Markdown bullet list of its parameters.

  Each field becomes one bullet of the form
  ``**name** `type` (Optional): description``. A field whose type is a row
  is followed by an indented "Row fields:" sub-list produced by recursing
  into the nested schema. A field named ``error_handling`` is documented
  from the parameters listed in the ``ErrorHandlingConfig`` docstring
  rather than from its own declared type.

  Args:
    schema: A schema exposing ``fields`` (each with ``name``, ``type`` and
      ``description``), or None when no schema is available.

  Returns:
    An empty string if ``schema`` is None, the sentence
    'No configuration parameters.' if it has no fields, and the Markdown
    list otherwise.
  """
  if schema is None:
    return ''
  elif not schema.fields:
    return 'No configuration parameters.'

  def pretty_type(beam_type):
    # Human-readable type name; anything unhandled (e.g. logical types) is
    # shown as '?'.
    type_info = beam_type.WhichOneof("type_info")
    if type_info == "atomic_type":
      return schema_pb2.AtomicType.Name(beam_type.atomic_type).lower()
    elif type_info == "array_type":
      return f'Array[{pretty_type(beam_type.array_type.element_type)}]'
    elif type_info == "iterable_type":
      return f'Iterable[{pretty_type(beam_type.iterable_type.element_type)}]'
    elif type_info == "map_type":
      return (
          f'Map[{pretty_type(beam_type.map_type.key_type)}, '
          f'{pretty_type(beam_type.map_type.value_type)}]')
    elif type_info == "row_type":
      return 'Row'
    else:
      return '?'

  def maybe_row_parameters(t):
    # Nested rows get their own indented parameter list.
    if t.WhichOneof("type_info") == "row_type":
      return indent('\n\nRow fields:\n\n' + config_docs(t.row_type.schema), 4)
    else:
      return ''

  def maybe_optional(t):
    return " (Optional)" if t.nullable else ""

  def normalize_error_handling(f):
    # Only the error_handling field is rewritten; every other field is
    # returned untouched without paying for a docstring parse.
    if f.name != "error_handling":
      return f
    doc = docstring_parser.parse(
        ErrorHandlingConfig.__doc__, docstring_parser.DocstringStyle.GOOGLE)
    # Rebuild the field as a nullable row whose sub-fields mirror the
    # parameters documented on ErrorHandlingConfig.
    return schema_pb2.Field(
        name="error_handling",
        type=schema_pb2.FieldType(
            row_type=schema_pb2.RowType(
                schema=schema_pb2.Schema(
                    fields=[
                        schemas.schema_field(
                            param.arg_name,
                            PythonCallableWithSource.load_from_expression(
                                param.type_name),
                            param.description) for param in doc.params
                    ])),
            nullable=True),
        description=f.description or doc.short_description)

  def lines():
    for f in schema.fields:
      f = normalize_error_handling(f)
      yield ''.join([
          f'**{f.name}** `{pretty_type(f.type)}`',
          maybe_optional(f.type),
          indent(': ' + f.description if f.description else '', 2),
          maybe_row_parameters(f.type),
      ])

  return '\n\n'.join('*' + indent(line, 2) for line in lines()).strip()




[docs]
def indent(lines, size):
  """Prefix every line of the string ``lines`` with ``size`` spaces.

  Lines are delimited by ``'\\n'`` only; empty lines are padded as well.
  """
  padding = ' ' * size
  return padding + lines.replace('\n', '\n' + padding)




[docs]
def longest(func, xs):
  """Return the longest result of applying ``func`` to the items of ``xs``.

  Falsy results (such as None) count as the empty string. On ties the
  result for the earliest item wins.
  """
  candidates = (func(x) or '' for x in xs)
  return max(candidates, key=len)




[docs]
def io_grouping_key(transform_name):
  """Sort key that puts reads and writes side by side, after everything else.

  ``ReadFromX`` and ``WriteToX`` both sort under ``X`` (read first); any
  other transform sorts by its own name, ahead of all reads and writes.
  """
  for order, prefix in enumerate(('ReadFrom', 'WriteTo')):
    if transform_name.startswith(prefix):
      return 1, transform_name[len(prefix):], order
  return 0, transform_name



# Names of transforms to leave out of the generated docs. main() consults
# this (via an `in` membership test) only when no --exclude pattern is given.
SKIP = {}



[docs]
def add_transform_links(transform, description, provider_list):
  """
  Convert references of Providers to urls that link to their respective pages.

  For example,
    "Some description talking about MyTransform."
  would be converted to
    "Some description talking about <a href="#mytransform">MyTransform</a>"

  meanwhile::

    type: MyTransform
    config:
      ...

  Would remain unchanged.

  Avoid self-linking within a Transform page.

  Args:
    transform: Name of the transform whose description is being processed;
      mentions of it are never turned into links.
    description: The description text, or None.
    provider_list: Iterable of transform names to link.

  Returns:
    The description with links inserted. A None description becomes ''
    once at least one provider has been processed; with an empty
    ``provider_list`` the description is returned as given.
  """
  # Names are matched literally: escape them so that regex metacharacters in
  # a transform name can neither alter the match nor break the pattern.
  not_self = rf"(?!{re.escape(transform)}\b)"
  for p in provider_list:
    link = f'<a href="#{p.lower()}">{p}</a>'
    # Match all instances of built-in transforms within the description
    # excluding the transform whose description is currently being evaluated.
    # Match the entire word boundary so that partial matches do not count.
    # (i.e. OtherTransform should not match Transform)
    # A callable replacement keeps the link text verbatim instead of having
    # it interpreted as a substitution template.
    description = re.sub(
        rf"(?<!type: )\b{not_self}\b{re.escape(p)}\b",
        lambda _match, link=link: link,
        description or '')
  return description




[docs]
def transform_docs(transform_base, transforms, providers, extra_docs=''):
  """Return the Markdown documentation section for one transform.

  The section has a heading, a description, ``extra_docs``, a
  "Configuration" list and a "Usage" example. For each of the description,
  configuration and usage parts, the longest rendering found across all
  ``transforms`` and all of their providers is the one used.

  Args:
    transform_base: Name shown in the heading and in the usage example.
    transforms: Names of the transform variants to draw documentation from.
    providers: Mapping from transform name to its providers.
    extra_docs: Additional text placed after the description.
  """

  def best(render):
    # Longest rendering over every variant and each of its providers.
    return longest(
        lambda t: longest(lambda p: render(p, t), providers[t]), transforms)

  description = best(
      lambda p, t: add_transform_links(t, p.description(t), providers.keys()))
  configuration = best(lambda p, t: config_docs(p.config_schema(t)))
  usage = best(lambda p, t: pretty_example(p, t, transform_base))

  sections = [
      f'## {transform_base}',
      '',
      # Turn reST-style literal block markers into highlighted yaml blocks.
      description.replace('::\n', '\n\n    :::yaml\n'),
      '',
      extra_docs,
      '',
      '### Configuration',
      '',
      configuration,
      '',
      '### Usage',
      '',
      '    :::yaml',
      '',
      indent(usage, 4),
  ]
  return '\n'.join(sections)




[docs]
def create_index(include, exclude, options):
  """Build the transform index as JSON-schema rules and Markdown text.

  Transforms are grouped by the part of their name before the first '-';
  each group accepted by ``include`` and not rejected by ``exclude`` has its
  base name printed and is then documented.

  Args:
    include: Predicate called with a transform base name.
    exclude: Predicate called with a transform base name.
    options: Object with the attributes ``provider_source_file``,
      ``markdown_file``, ``html_file`` and ``schema_file``.

  Returns:
    A ``(json_config_schemas, markdown)`` tuple. The list is only filled
    when ``options.schema_file`` is set, and the Markdown only when
    ``options.markdown_file`` or ``options.html_file`` is set.
  """

  def config_rule(transform, schema):
    # if/then rule constraining `config` for pipelines of this `type`.
    field_types = {
        f.name: json_utils.beam_type_to_json_type(f.type)
        for f in schema.fields
    }
    return {
        'if': {
            'properties': {
                'type': {
                    'const': transform
                }
            }
        },
        'then': {
            'properties': {
                'config': {
                    'type': 'object',
                    'properties': {
                        '__line__': {
                            'type': 'integer'
                        },
                        '__uuid__': {},
                        **field_types,
                    },
                    'additionalProperties': False,
                }
            }
        }
    }

  with subprocess_server.SubprocessServer.cache_subprocesses():
    rules = []
    markdown_buffer = io.StringIO()

    if options.provider_source_file:
      provider_file = yaml_utils.locate_data_file(options.provider_source_file)
      providers = yaml_provider.merge_providers(
          yaml_provider.load_providers(provider_file))
    else:
      providers = yaml_provider.standard_providers()

    ordered_names = sorted(providers.keys(), key=io_grouping_key)
    grouped = itertools.groupby(ordered_names, key=lambda s: s.split('-')[0])
    for transform_base, group in grouped:
      transforms = list(group)
      if not include(transform_base) or exclude(transform_base):
        continue
      print(transform_base)

      if options.markdown_file or options.html_file:
        # Variants are named '<base>-<language>'; list the languages if so.
        extra_docs = (
            'Supported languages: ' +
            ', '.join(t.split('-')[-1] for t in sorted(transforms)) + '.'
            if '-' in transforms[0] else '')
        markdown_buffer.write(
            transform_docs(transform_base, transforms, providers, extra_docs))
        markdown_buffer.write('\n\n')

      if options.schema_file:
        for transform in transforms:
          # The schema is taken from the first provider of each transform.
          schema = providers[transform][0].config_schema(transform)
          if schema:
            rules.append(config_rule(transform, schema))

    return rules, markdown_buffer.getvalue()




[docs]
def create_examples_markdown():
  """Return Markdown listing every YAML file under the ``examples`` directory.

  The directory is looked up next to this module. Files are visited
  shallowest first, then by path. Files in a subdirectory are introduced by
  a ``# Section`` heading derived from the first path component (with any
  'transforms' text removed from the relative path), and each file gets a
  ``## Title`` heading, its leading comment block as prose, and the rest of
  its content as an indented yaml code block.
  """

  def to_title(path):
    stem = os.path.splitext(path)[0]
    words = stem.replace('_', ' ').replace(os.sep, ' ').title()
    # These acronyms should be upper, not title.
    words = re.sub(r'\bMl\b', 'ML', words)
    words = re.sub(r'\bIo\b', 'IO', words)
    return words

  def clean_yaml(content):
    # Drop the license header and any coding declaration.
    without_license = re.sub(
        '# Licensed to the Apache Software Foundation.*'
        '# limitations under the License.',
        '',
        content,
        flags=re.MULTILINE | re.DOTALL)
    return re.sub('# coding=.*', '', without_license)

  def split_header(text):
    # The header runs up to the first line that is neither blank nor a
    # comment; the leading '#' is stripped from each header line.
    lines = text.split('\n')
    ix = 0  # make lint happy
    for ix, line in enumerate(lines):
      if line.strip() and not line.startswith('#'):
        break
    header_lines = [line[1:].strip() for line in lines[:ix]]
    return '\n'.join(header_lines), '\n'.join(lines[ix:])

  examples_root = os.path.join(os.path.dirname(__file__), 'examples')
  example_paths = sorted(
      glob.glob(os.path.join(examples_root, '**', '*.yaml'), recursive=True),
      key=lambda p: (p.count(os.sep), p))

  out = io.StringIO()
  section = last_section = ''
  for path in example_paths:
    relative = path[len(examples_root):]
    short_path = relative.replace('transforms', '').strip(os.sep)

    if os.sep in short_path:
      section = to_title(short_path.split(os.sep)[0])
      if section != last_section:
        out.write(f'# {section}\n\n')
        last_section = section

    # The title is the full path title with the section prefix cut off.
    title = to_title(short_path)[len(section):]
    out.write(f'## {title}\n\n')

    with open(path) as fin:
      raw = fin.read()
    header, body = split_header(clean_yaml(raw))
    out.write(header)
    out.write('\n\n    :::yaml\n\n')
    out.write('    ' + body.replace('\n', '\n    '))
    out.write('\n')
  return out.getvalue()




[docs]
def markdown_to_html(title, markdown_content, header=''):
  """Render Markdown into a complete, self-styled HTML page.

  The page has a fixed navigation sidebar (the title, the Beam version and
  the table of contents produced by the Markdown ``toc`` extension) and a
  content area holding the title, ``header`` and the converted Markdown.

  Args:
    title: Text used for the ``<title>``, the sidebar link and the ``<h1>``.
      It is inserted into the page as-is.
    markdown_content: The Markdown source to convert.
    header: HTML fragment inserted, as-is, between the ``<h1>`` and the
      converted content.

  Returns:
    The HTML document as a string.
  """
  # Imported here rather than at module level, so these packages are only
  # needed when HTML output is actually produced.
  import markdown
  import markdown.extensions.toc
  import pygments.formatters

  # The table of contents is limited to depth 2; 'codehilite' handles the
  # highlighted code blocks.
  md = markdown.Markdown(
      extensions=[
          markdown.extensions.toc.TocExtension(toc_depth=2),
          'codehilite',
      ])
  # CSS for the highlighted code, scoped to the '.codehilite' selector.
  pygments_style = pygments.formatters.HtmlFormatter().get_style_defs(
      '.codehilite')
  # Page layout and typography, embedded in the page's <style> element.
  extra_style = '''
          * {
            box-sizing: border-box;
          }
          body {
            font-family: 'Roboto', sans-serif;
            font-weight: normal;
            color: #404040;
            background: #edf0f2;
          }
          .body-for-nav {
            background: #fcfcfc;
          }
          .grid-for-nav {
            width: 100%;
          }
          .nav-side {
            position: fixed;
            top: 0;
            left: 0;
            width: 300px;
            height: 100%;
            padding-bottom: 2em;
            color: #9b9b9b;
            background: #343131;
          }
          .nav-header {
            display: block;
            width: 300px;
            padding: 1em;
            background-color: #2980B9;
            text-align: center;
            color: #fcfcfc;
          }
          .nav-header a {
            color: #fcfcfc;
            font-weight: bold;
            display: inline-block;
            padding: 4px 6px;
            margin-bottom: 1em;
            text-decoration:none;
          }
          .nav-header>div.version {
            margin-top: -.5em;
            margin-bottom: 1em;
            font-weight: normal;
            color: rgba(255, 255, 255, 0.3);
          }
          .toc {
            width: 300px;
            text-align: left;
            overflow-y: auto;
            max-height: calc(100% - 4.3em);
            scrollbar-width: thin;
            scrollbar-color: #9b9b9b #343131;
          }
          .toc ul {
            margin: 0;
            padding: 0;
            list-style: none;
          }
          .toc li {
            border-bottom: 1px solid #4e4a4a;
            margin-left: 1em;
          }
          .toc a {
            display: block;
            line-height: 36px;
            font-size: 90%;
            color: #d9d9d9;
            padding: .1em 0.6em;
            text-decoration: none;
            transition: background-color 0.3s ease, color 0.3s ease;
          }
          .toc a:hover {
            background-color: #4e4a4a;
            color: #ffffff;
          }
          .transform-content-wrap {
            margin-left: 300px;
            background: #fcfcfc;
          }
          .transform-content {
            padding: 1.5em 3em;
            margin: 20px;
            padding-bottom: 2em;
          }
          .transform-content li::marker {
            display: inline-block;
            width: 0.5em;
          }
          .transform-content h1 {
            font-size: 40px;
          }
          .transform-content ul {
            margin-left: 0.75em;
            text-align: left;
            list-style-type: disc;
          }
          hr {
            color: gray;
            display: block;
            height: 1px;
            border: 0;
            border-top: 1px solid #e1e4e5;
            margin-bottom: 3em;
            margin-top: 3em;
            padding: 0;
          }
          .codehilite {
            background: #f5f5f5;
            border: 1px solid #ccc;
            border-radius: 4px;
            padding: 0.2em 1em;
            overflow: auto;
            font-family: monospace;
            font-size: 14px;
            line-height: 1.5;
          }
          p code, li code {
              white-space: nowrap;
              max-width: 100%;
              background: #fff;
              border: solid 1px #e1e4e5;
              padding: 0 5px;
              font-family: monospace;
              color: #404040;
              font-weight: bold;
              padding: 2px 5px;
          }
          '''

  # Convert first: the `toc` attribute read in the template below is only
  # read after this call.
  html = md.convert(markdown_content)
  # A horizontal rule is inserted before every <h2> of the converted content.
  return f'''
            <html>
              <head>
                <meta charset="UTF-8">
                <title>{title}</title>
                <style>
                {pygments_style}
                {extra_style}
                </style>
              </head>
              <body class="body-for-nav">
                <div class="grid-for-nav">
                  <nav class="nav-side">
                    <div class="nav-header">
                      <a href=#>{title}</a>
                      <div class="version">
                        {beam_version}
                      </div>
                    </div>
                    {getattr(md, 'toc')}
                  </nav>
                  <section class="transform-content-wrap">
                    <div class="transform-content">
                      <h1>{title}</h1>
                      {header}
                      {html.replace('<h2', '<hr><h2')}
                    </div>
                  </section>
                </div>
              </body>
            </html>
            '''




[docs]
def main():
  """Command-line entry point: generate the requested documentation files.

  The index is always built. Each of ``--schema_file``, ``--markdown_file``,
  ``--html_file`` and ``--examples_file`` that is given names an output file
  to write. ``--include`` and ``--exclude`` are regular expressions matched
  against transform base names; without ``--exclude``, names in ``SKIP`` are
  excluded instead.
  """
  parser = argparse.ArgumentParser()
  for output_flag in ('--examples_file',
                      '--markdown_file',
                      '--html_file',
                      '--schema_file'):
    parser.add_argument(output_flag)
  parser.add_argument('--include', default='.*')
  parser.add_argument('--exclude', default='')
  parser.add_argument(
      "--provider_source_file",
      help="Path to a YAML file containing custom providers. "
      "If not provided, uses standard Beam providers.",
  )
  options = parser.parse_args()

  def in_skip_list(name):
    return name in SKIP

  include = re.compile(options.include).match
  exclude = (
      re.compile(options.exclude).match if options.exclude else in_skip_list)

  json_config_schemas, markdown_content = create_index(
      include, exclude, options)

  if options.schema_file:
    with open(options.schema_file, 'w') as schema_out:
      yaml.dump(json_config_schemas, schema_out, sort_keys=False)

  if options.markdown_file:
    with open(options.markdown_file, 'w') as markdown_out:
      markdown_out.write(markdown_content)

  if options.html_file:
    with open(options.html_file, 'w') as index_out:
      index_page = markdown_to_html(
          'Beam YAML Transform Index', markdown_content)
      index_out.write(index_page)

  if options.examples_file:
    with open(options.examples_file, 'w') as examples_out:
      # Introductory HTML placed above the examples.
      examples_header = (
          '''
                <p>Example pipelines using the
                <a href="https://beam.apache.org/documentation/sdks/yaml/">
                Beam YAML API</a>.
                These examples can also be found on
                <a href="https://github.com/apache/beam/tree/master/sdks/'''
          '''python/apache_beam/yaml/examples">github</a>.
                </p>
                <p>Note: These examples below are automatically tested for
                correctness and may be used as a starting point for your own
                pipelines.</p>
          ''')
      examples_page = markdown_to_html(
          'Beam YAML Examples',
          create_examples_markdown(),
          header=examples_header)
      examples_out.write(examples_page)



# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
  main()