fune/taskcluster/taskgraph/util/schema.py

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from __future__ import absolute_import, print_function, unicode_literals

import re
import pprint
import collections
import voluptuous

import taskgraph

from mozbuild import schedules
from .attributes import keymatch


def validate_schema(schema, obj, msg_prefix):
    """
    Validate that object satisfies schema.  If not, generate a useful exception
    beginning with msg_prefix.
    """
    if taskgraph.fast:
        return
    try:
        schema(obj)
    except voluptuous.MultipleInvalid as exc:
        msg = [msg_prefix]
        for error in exc.errors:
            msg.append(str(error))
        raise Exception('\n'.join(msg) + '\n' + pprint.pformat(obj))


def optionally_keyed_by(*arguments):
    """
    Mark a schema value as optionally keyed by any of a number of fields.  The
    schema is the last argument, and the remaining fields are taken to be the
    field names.  For example:

        'some-value': optionally_keyed_by(
            'test-platform', 'build-platform',
            Any('a', 'b', 'c'))

    The resulting schema will allow nesting of `by-test-platform` and
    `by-build-platform` in either order.
    """
    schema = arguments[-1]
    fields = arguments[:-1]

    # build the nestable schema by generating schema = Any(schema,
    # by-fld1, by-fld2, by-fld3) once for each field.  So we don't allow
    # infinite nesting, but one level of nesting for each field.
    for _ in arguments:
        options = [schema]
        for field in fields:
            options.append({'by-' + field: {basestring: schema}})
        schema = voluptuous.Any(*options)
    return schema


def resolve_keyed_by(item, field, item_name, **extra_values):
    """
    For values which can either accept a literal value, or be keyed by some
    other attribute of the item, perform that lookup and replacement in-place
    (modifying `item` directly).  The field is specified using dotted notation
    to traverse dictionaries.

    For example, given item::

        job:
            test-platform: linux128
            chunks:
                by-test-platform:
                    macosx-10.11/debug: 13
                    win.*: 6
                    default: 12

    a call to `resolve_keyed_by(item, 'job.chunks', item['thing-name'])`
    would mutate item in-place to::

        job:
            chunks: 12

    The `item_name` parameter is used to generate useful error messages.

    If extra_values are supplied, they represent additional values available
    for reference from by-<field>.

    Items can be nested as deeply as the schema will allow::

        chunks:
            by-test-platform:
                win.*:
                    by-project:
                        ash: ..
                        cedar: ..
                linux: 13
                default: 12
    """
    # find the field, returning the item unchanged if anything goes wrong
    container, subfield = item, field
    while '.' in subfield:
        f, subfield = subfield.split('.', 1)
        if f not in container:
            return item
        container = container[f]
        if not isinstance(container, dict):
            return item

    if subfield not in container:
        return item
    value = container[subfield]
    while True:
        if not isinstance(value, dict) or len(value) != 1 or not value.keys()[0].startswith('by-'):
            return item

        keyed_by = value.keys()[0][3:]  # strip off 'by-' prefix
        key = extra_values.get(keyed_by) if keyed_by in extra_values else item[keyed_by]
        alternatives = value.values()[0]

        if len(alternatives) == 1 and 'default' in alternatives:
            # Error out when only 'default' is specified as only alternatives,
            # because we don't need to by-{keyed_by} there.
            raise Exception(
                "Keyed-by '{}' unnecessary with only value 'default' "
                "found, when determining item '{}' in '{}'".format(
                    keyed_by, field, item_name))

        matches = keymatch(alternatives, key)
        if len(matches) > 1:
            raise Exception(
                "Multiple matching values for {} {!r} found while "
                "determining item {} in {}".format(
                    keyed_by, key, field, item_name))
        elif matches:
            value = container[subfield] = matches[0]
            continue

        raise Exception(
            "No {} matching {!r} nor 'default' found while determining item {} in {}".format(
                keyed_by, key, field, item_name))


# Schemas for YAML files should use dashed identifiers by default.  If there are
# components of the schema for which there is a good reason to use another format,
# they can be whitelisted here.
WHITELISTED_SCHEMA_IDENTIFIERS = [
    # upstream-artifacts are handed directly to scriptWorker, which expects interCaps
    lambda path: "[u'upstream-artifacts']" in path,
]


def check_schema(schema):
    identifier_re = re.compile('^[a-z][a-z0-9-]*$')

    def whitelisted(path):
        return any(f(path) for f in WHITELISTED_SCHEMA_IDENTIFIERS)

    def iter(path, sch):
        def check_identifier(path, k):
            if k in (basestring, voluptuous.Extra):
                pass
            elif isinstance(k, basestring):
                if not identifier_re.match(k) and not whitelisted(path):
                    raise RuntimeError(
                        'YAML schemas should use dashed lower-case identifiers, '
                        'not {!r} @ {}'.format(k, path))
            elif isinstance(k, (voluptuous.Optional, voluptuous.Required)):
                check_identifier(path, k.schema)
            elif isinstance(k, voluptuous.Any):
                for v in k.validators:
                    check_identifier(path, v)
            elif not whitelisted(path):
                raise RuntimeError(
                    'Unexpected type in YAML schema: {} @ {}'.format(
                        type(k).__name__, path))

        if isinstance(sch, collections.Mapping):
            for k, v in sch.iteritems():
                child = "{}[{!r}]".format(path, k)
                check_identifier(child, k)
                iter(child, v)
        elif isinstance(sch, (list, tuple)):
            for i, v in enumerate(sch):
                iter("{}[{}]".format(path, i), v)
        elif isinstance(sch, voluptuous.Any):
            for v in sch.validators:
                iter(path, v)
    iter('schema', schema.schema)


def Schema(*args, **kwargs):
    """
    Operates identically to voluptuous.Schema, but applying some taskgraph-specific checks
    in the process.
    """
    schema = voluptuous.Schema(*args, **kwargs)
    check_schema(schema)
    return schema


OptimizationSchema = voluptuous.Any(
    # always run this task (default)
    None,
    # search the index for the given index namespaces, and replace this task if found
    # the search occurs in order, with the first match winning
    {'index-search': [basestring]},
    # consult SETA and skip this task if it is low-value
    {'seta': None},
    # skip this task if none of the given file patterns match
    {'skip-unless-changed': [basestring]},
    # skip this task if unless the change files' SCHEDULES contains any of these components
    {'skip-unless-schedules': list(schedules.ALL_COMPONENTS)},
    # skip if SETA or skip-unless-schedules says to
    {'skip-unless-schedules-or-seta': list(schedules.ALL_COMPONENTS)},
    # only run this task if its dependencies will run (useful for follow-on tasks that
    # are unnecessary if the parent tasks are not run)
    {'only-if-dependencies-run': None}
)