fune/toolkit/components/telemetry/build_scripts/mozparsers/parse_histograms.py

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import atexit
import collections
import itertools
import json
import math
import os
import re
import runpy
import sys
from collections import OrderedDict
from ctypes import c_int

from . import shared_telemetry_utils as utils
from .shared_telemetry_utils import ParserError

atexit.register(ParserError.exit_func)

# Constants.
# Maximum number of characters allowed in a single label value.
MAX_LABEL_LENGTH = 20
# Maximum number of labels a histogram definition may list.
MAX_LABEL_COUNT = 100
# Maximum number of entries allowed in the "keys" list of a keyed histogram.
MAX_KEY_COUNT = 30
# Maximum number of characters allowed in a single "keys" entry.
MAX_KEY_LENGTH = 20
# Lower bound on the number of values reserved for a categorical histogram.
MIN_CATEGORICAL_BUCKET_COUNT = 50
# Pattern used to validate label values and (with strict type checks)
# histogram names; the checks in this file apply it with re.IGNORECASE.
CPP_IDENTIFIER_PATTERN = "^[a-z][a-z0-9_]+[a-z0-9]$"

# Definition keys that are accepted for every histogram kind.
ALWAYS_ALLOWED_KEYS = [
    "kind",
    "description",
    "operating_systems",
    "expires_in_version",
    "alert_emails",
    "keyed",
    "releaseChannelCollection",
    "bug_numbers",
    "keys",
    "record_in_processes",
    "record_into_store",
    "products",
]

# Documentation links that are embedded in error messages.
BASE_DOC_URL = (
    "https://firefox-source-docs.mozilla.org/toolkit/components/" "telemetry/telemetry/"
)
HISTOGRAMS_DOC_URL = BASE_DOC_URL + "collection/histograms.html"
SCALARS_DOC_URL = BASE_DOC_URL + "collection/scalars.html"

# Histogram kinds accepted for products that
# utils.is_geckoview_streaming_product() identifies as GeckoView streaming.
GECKOVIEW_STREAMING_SUPPORTED_KINDS = [
    "linear",
    "exponential",
    "categorical",
]

# parse_histograms.py is used by scripts from a mozilla-central build tree
# and also by outside consumers, such as the telemetry server.  We need
# to ensure that importing things works in both contexts.  Therefore,
# unconditionally importing things that are local to the build tree, such
# as buildconfig, is a no-no.
try:
    import buildconfig

    # Need to update sys.path to be able to find usecounters.
    sys.path.append(os.path.join(buildconfig.topsrcdir, "dom/base/"))
except ImportError:
    # Must be in an out-of-tree usage scenario.  Trust that whoever is
    # running this script knows we need the usecounters module and has
    # ensured it's in our sys.path.
    pass


def linear_buckets(dmin, dmax, n_buckets):
    """Return the lower bounds of the buckets of a linear histogram.

    The first bound is always 0. The bounds at index 1..n_buckets-1 are
    interpolated linearly between dmin and dmax; each one is converted to an
    int after adding 0.5.
    """
    bounds = [0] * n_buckets
    lowest = float(dmin)
    highest = float(dmax)

    def interpolated_bound(index):
        # Weighted mix of both ends; index 1 yields dmin and the last index
        # yields dmax.
        weighted = (lowest * (n_buckets - 1 - index) + highest * (index - 1)) / (
            n_buckets - 2
        )
        return int(weighted + 0.5)

    bounds[1:] = [interpolated_bound(index) for index in range(1, n_buckets)]
    return bounds


def exponential_buckets(dmin, dmax, n_buckets):
    """Return the lower bounds of the buckets of an exponential histogram.

    The first bound is 0 and the second is dmin. Every following bound is
    obtained by dividing the remaining distance to dmax, measured in log
    space, evenly among the buckets that are still to be placed.
    """
    log_max = math.log(dmax)
    bounds = [0] * n_buckets
    lower = dmin
    bounds[1] = lower
    for index in range(2, n_buckets):
        log_lower = math.log(lower)
        log_step = (log_max - log_lower) / (n_buckets - index)
        candidate = int(math.floor(math.exp(log_lower + log_step) + 0.5))
        # Bounds have to grow strictly, so fall back to "previous + 1" when
        # rounding did not move the bound forward.
        lower = candidate if candidate > lower else lower + 1
        bounds[index] = lower
    return bounds


# Allowlists loaded by load_allowlist(): a mapping from allowlist name to a set
# of histogram names. While this is None, the allowlist-based checks in this
# file are skipped.
allowlists = None


def load_allowlist():
    """Load histogram-allowlists.json into the module-level `allowlists`.

    Each list found in the JSON file is converted to a set. A file that cannot
    be opened or that is not valid JSON is reported through
    ParserError(...).handle_now(); when the file cannot be opened,
    `allowlists` is reset to None first.
    """
    global allowlists
    try:
        here = os.path.realpath(os.path.dirname(__file__))
        # The histogram-allowlists file is looked up two directory levels above
        # the directory that contains this parser.
        # NOTE: if the parsers are moved, this logic will need to be updated.
        module_root = os.path.abspath(os.path.join(here, os.pardir, os.pardir))
        allowlist_path = os.path.join(module_root, "histogram-allowlists.json")
        with open(allowlist_path, "r") as f:
            try:
                allowlists = json.load(f)
                for list_name, entries in list(allowlists.items()):
                    allowlists[list_name] = set(entries)
            except ValueError:
                ParserError("Error parsing allowlist: %s" % allowlist_path).handle_now()
    except IOError:
        allowlists = None
        ParserError("Unable to parse allowlist: %s." % allowlist_path).handle_now()


class Histogram:
    """A class for representing a histogram definition."""

    def __init__(self, name, definition, strict_type_checks=False):
        """Create the histogram called `name` from `definition`.

        definition is a dict-like object that must contain at least the keys:

         - 'kind': The kind of histogram.  Must be one of 'boolean', 'flag',
           'count', 'enumerated', 'categorical', 'linear', or 'exponential'.
         - 'description': A textual description of the histogram.

        strict_type_checks is a boolean indicating whether to use the new,
        stricter type checks. The server-side still has to deal with old,
        oddly typed submissions, so they are skipped there by default.

        Note that definition may be modified in place: use counters
        (names starting with "USE_COUNTER2_") receive default values, and the
        verification step may rewrite some fields.
        """
        self._strict_type_checks = strict_type_checks
        self._is_use_counter = name.startswith("USE_COUNTER2_")
        if self._is_use_counter:
            use_counter_defaults = (
                ("record_in_processes", ["main", "content"]),
                ("releaseChannelCollection", "opt-out"),
                ("products", ["firefox", "fennec"]),
            )
            for field, default in use_counter_defaults:
                definition.setdefault(field, default)
        self.verify_attributes(name, definition)

        lookup = definition.get
        self._name = name
        self._description = definition["description"]
        self._kind = definition["kind"]
        self._keys = lookup("keys", [])
        self._keyed = lookup("keyed", False)
        self._expiration = lookup("expires_in_version")
        self._labels = lookup("labels", [])
        self._record_in_processes = lookup("record_in_processes")
        self._record_into_store = lookup("record_into_store", ["main"])
        self._products = lookup("products")
        self._operating_systems = lookup("operating_systems", ["all"])

        # These derive further attributes from the definition.
        self.compute_bucket_parameters(definition)
        self.set_nsITelemetry_kind()
        self.set_dataset(definition)

    def name(self):
        """Return the name of the histogram."""
        return self._name

    def description(self):
        """Return the description of the histogram."""
        return self._description

    def kind(self):
        """Return the kind of the histogram.
        Will be one of 'boolean', 'flag', 'count', 'enumerated', 'categorical', 'linear',
        or 'exponential'."""
        return self._kind

    def expiration(self):
        """Return the expiration version of the histogram."""
        return self._expiration

    def nsITelemetry_kind(self):
        """Return the nsITelemetry constant corresponding to the kind of
        the histogram."""
        return self._nsITelemetry_kind

    def low(self):
        """Return the lower bound of the histogram."""
        return self._low

    def high(self):
        """Return the high bound of the histogram."""
        return self._high

    def n_buckets(self):
        """Return the number of buckets in the histogram."""
        return self._n_buckets

    def keyed(self):
        """Returns True if this a keyed histogram, false otherwise."""
        return self._keyed

    def keys(self):
        """Returns a list of allowed keys for keyed histogram, [] for others."""
        return self._keys

    def dataset(self):
        """Returns the dataset this histogram belongs into."""
        return self._dataset

    def labels(self):
        """Returns a list of labels for a categorical histogram, [] for others."""
        return self._labels

    def record_in_processes(self):
        """Returns a list of processes this histogram is permitted to record in."""
        return self._record_in_processes

    def record_in_processes_enum(self):
        """Get the non-empty list of flags representing the processes to record data in"""
        return [utils.process_name_to_enum(p) for p in self.record_in_processes()]

    def products(self):
        """Get the non-empty list of products to record data on"""
        return self._products

    def products_enum(self):
        """Get the non-empty list of flags representing products to record data on"""
        return [utils.product_name_to_enum(p) for p in self.products()]

    def operating_systems(self):
        """Get the list of operating systems to record data on"""
        return self._operating_systems

    def record_on_os(self, target_os):
        """Check if this probe should be recorded on the passed os."""
        os = self.operating_systems()
        if "all" in os:
            return True

        canonical_os = utils.canonical_os(target_os)

        if "unix" in os and canonical_os in utils.UNIX_LIKE_OS:
            return True

        return canonical_os in os

    def record_into_store(self):
        """Get the non-empty list of stores to record into"""
        return self._record_into_store

    def ranges(self):
        """Compute the list of lower bounds, one per bucket, of this histogram.

        Exponential histograms use exponential_buckets(); every other known
        kind uses linear_buckets().
        """
        kind = self._kind
        bucket_fns = dict.fromkeys(
            ("boolean", "flag", "count", "enumerated", "categorical", "linear"),
            linear_buckets,
        )
        bucket_fns["exponential"] = exponential_buckets

        if kind not in bucket_fns:
            ParserError(
                'Unknown kind "%s" for histogram "%s".' % (self._kind, self._name)
            ).handle_later()

        return bucket_fns[kind](self.low(), self.high(), self.n_buckets())

    def compute_bucket_parameters(self, definition):
        """Derive (low, high, n_buckets) for this histogram's kind from
        `definition` and store them through set_bucket_parameters()."""
        fixed = Histogram.boolean_flag_bucket_parameters
        bucket_fns = {
            # These three kinds share the same fixed parameters.
            "boolean": fixed,
            "flag": fixed,
            "count": fixed,
            "enumerated": Histogram.enumerated_bucket_parameters,
            "categorical": Histogram.categorical_bucket_parameters,
            "linear": Histogram.linear_bucket_parameters,
            "exponential": Histogram.exponential_bucket_parameters,
        }
        kind = self._kind

        if kind not in bucket_fns:
            ParserError(
                'Unknown kind "%s" for histogram "%s".' % (self._kind, self._name)
            ).handle_later()

        low, high, n_buckets = bucket_fns[kind](definition)
        self.set_bucket_parameters(low, high, n_buckets)

    def verify_attributes(self, name, definition):
        """Run every validation check on `definition` for histogram `name`.

        The set of keys accepted in the definition depends on the histogram
        kind. Problems are reported by the individual check_* methods.
        """
        general_keys = ALWAYS_ALLOWED_KEYS + ["low", "high", "n_buckets"]

        table = {kind: ALWAYS_ALLOWED_KEYS for kind in ("boolean", "flag", "count")}
        table["enumerated"] = ALWAYS_ALLOWED_KEYS + ["n_values"]
        table["categorical"] = ALWAYS_ALLOWED_KEYS + ["labels", "n_values"]
        table["linear"] = general_keys
        table["exponential"] = general_keys
        # We removed extended_statistics_ok on the client, but the server-side,
        # where _strict_type_checks==False, has to deal with historical data.
        if not self._strict_type_checks:
            table["exponential"].append("extended_statistics_ok")

        kind = definition["kind"]
        if kind not in table:
            ParserError(
                'Unknown kind "%s" for histogram "%s".' % (kind, name)
            ).handle_later()
        allowed_keys = table[kind]

        self.check_name(name)
        self.check_keys(name, definition, allowed_keys)
        # The remaining checks all take (name, definition); run them in order.
        remaining_checks = (
            self.check_keys_field,
            self.check_field_types,
            self.check_allowlisted_kind,
            self.check_allowlistable_fields,
            self.check_expiration,
            self.check_label_values,
            self.check_record_in_processes,
            self.check_products,
            self.check_operating_systems,
            self.check_record_into_store,
        )
        for check in remaining_checks:
            check(name, definition)

    def check_name(self, name):
        if "#" in name:
            ParserError(
                'Error for histogram name "%s": "#" is not allowed.' % (name)
            ).handle_later()

        # Avoid C++ identifier conflicts between histogram enums and label enum names.
        if name.startswith("LABELS_"):
            ParserError(
                'Error for histogram name "%s":  can not start with "LABELS_".' % (name)
            ).handle_later()

        # To make it easier to generate C++ identifiers from this etc., we restrict
        # the histogram names to a strict pattern.
        # We skip this on the server to avoid failures with old Histogram.json revisions.
        if self._strict_type_checks:
            if not re.match(CPP_IDENTIFIER_PATTERN, name, re.IGNORECASE):
                ParserError(
                    'Error for histogram name "%s": name does not conform to "%s"'
                    % (name, CPP_IDENTIFIER_PATTERN)
                ).handle_later()

    def check_expiration(self, name, definition):
        field = "expires_in_version"
        expiration = definition.get(field)

        if not expiration:
            return

        # We forbid new probes from using "expires_in_version" : "default" field/value pair.
        # Old ones that use this are added to the allowlist.
        if (
            expiration == "default"
            and allowlists is not None
            and name not in allowlists["expiry_default"]
        ):
            ParserError(
                'New histogram "%s" cannot have "default" %s value.' % (name, field)
            ).handle_later()

        # Historical editions of Histograms.json can have the deprecated
        # expiration format 'N.Na1'. Fortunately, those scripts set
        # self._strict_type_checks to false.
        if (
            expiration != "default"
            and not utils.validate_expiration_version(expiration)
            and self._strict_type_checks
        ):
            ParserError(
                (
                    "Error for histogram {} - invalid {}: {}."
                    "\nSee: {}#expires-in-version"
                ).format(name, field, expiration, HISTOGRAMS_DOC_URL)
            ).handle_later()

        expiration = utils.add_expiration_postfix(expiration)

        definition[field] = expiration

    def check_label_values(self, name, definition):
        """Validate the 'labels' list of `definition`, if there is one.

        Reports labels longer than MAX_LABEL_LENGTH, more than MAX_LABEL_COUNT
        labels, and labels that do not match CPP_IDENTIFIER_PATTERN
        (case-insensitively). Nothing is checked when the list is missing or
        empty.
        """
        labels = definition.get("labels")
        if not labels:
            return

        # Offenders are collected in lists (not one-shot filter iterators) so
        # that they can be both tested for emptiness and listed in the message.
        too_long = [label for label in labels if len(label) > MAX_LABEL_LENGTH]
        if too_long:
            ParserError(
                'Label values for "%s" exceed length limit of %d: %s'
                % (name, MAX_LABEL_LENGTH, ", ".join(too_long))
            ).handle_later()

        if len(labels) > MAX_LABEL_COUNT:
            ParserError(
                'Label count for "%s" exceeds limit of %d' % (name, MAX_LABEL_COUNT)
            ).handle_now()

        # To make it easier to generate C++ identifiers from this etc., we restrict
        # the label values to a strict pattern.
        malformed = [
            label
            for label in labels
            if not re.match(CPP_IDENTIFIER_PATTERN, label, re.IGNORECASE)
        ]
        if malformed:
            ParserError(
                'Label values for %s are not matching pattern "%s": %s'
                % (name, CPP_IDENTIFIER_PATTERN, ", ".join(malformed))
            ).handle_later()

    def check_record_in_processes(self, name, definition):
        if not self._strict_type_checks:
            return

        field = "record_in_processes"
        rip = definition.get(field)

        DOC_URL = HISTOGRAMS_DOC_URL + "#record-in-processes"

        if not rip:
            ParserError(
                'Histogram "%s" must have a "%s" field:\n%s' % (name, field, DOC_URL)
            ).handle_later()

        for process in rip:
            if not utils.is_valid_process_name(process):
                ParserError(
                    'Histogram "%s" has unknown process "%s" in %s.\n%s'
                    % (name, process, field, DOC_URL)
                ).handle_later()

    def check_products(self, name, definition):
        if not self._strict_type_checks:
            return

        field = "products"
        products = definition.get(field)

        DOC_URL = HISTOGRAMS_DOC_URL + "#products"

        if not products:
            ParserError(
                'Histogram "%s" must have a "%s" field:\n%s' % (name, field, DOC_URL)
            ).handle_now()

        for product in products:
            if not utils.is_valid_product(product):
                ParserError(
                    'Histogram "%s" has unknown product "%s" in %s.\n%s'
                    % (name, product, field, DOC_URL)
                ).handle_later()
            if utils.is_geckoview_streaming_product(product):
                kind = definition.get("kind")
                if kind not in GECKOVIEW_STREAMING_SUPPORTED_KINDS:
                    ParserError(
                        (
                            'Histogram "%s" is of kind "%s" which is unsupported for '
                            'product "%s".'
                        )
                        % (name, kind, product)
                    ).handle_later()
                keyed = definition.get("keyed")
                if keyed:
                    ParserError(
                        'Keyed histograms like "%s" are unsupported for product "%s"'
                        % (name, product)
                    ).handle_later()

    def check_operating_systems(self, name, definition):
        if not self._strict_type_checks:
            return

        field = "operating_systems"
        operating_systems = definition.get(field)

        DOC_URL = HISTOGRAMS_DOC_URL + "#operating-systems"

        if not operating_systems:
            # operating_systems is optional
            return

        for operating_system in operating_systems:
            if not utils.is_valid_os(operating_system):
                ParserError(
                    'Histogram "%s" has unknown operating system "%s" in %s.\n%s'
                    % (name, operating_system, field, DOC_URL)
                ).handle_later()

    def check_record_into_store(self, name, definition):
        if not self._strict_type_checks:
            return

        field = "record_into_store"
        DOC_URL = HISTOGRAMS_DOC_URL + "#record-into-store"

        if field not in definition:
            # record_into_store is optional
            return

        record_into_store = definition.get(field)
        # record_into_store should not be empty
        if not record_into_store:
            ParserError(
                'Histogram "%s" has empty list of stores, which is not allowed.\n%s'
                % (name, DOC_URL)
            ).handle_later()

    def check_keys_field(self, name, definition):
        keys = definition.get("keys")
        if not self._strict_type_checks or keys is None:
            return

        if not definition.get("keyed", False):
            raise ValueError(
                "'keys' field is not valid for %s; only allowed for keyed histograms."
                % (name)
            )

        if len(keys) == 0:
            raise ValueError("The key list for %s cannot be empty" % (name))

        if len(keys) > MAX_KEY_COUNT:
            raise ValueError(
                "Label count for %s exceeds limit of %d" % (name, MAX_KEY_COUNT)
            )

        invalid = filter(lambda k: len(k) > MAX_KEY_LENGTH, keys)
        if len(list(invalid)) > 0:
            raise ValueError(
                '"keys" values for %s are exceeding length "%d": %s'
                % (name, MAX_KEY_LENGTH, ", ".join(invalid))
            )

    def check_allowlisted_kind(self, name, definition):
        # We don't need to run any of these checks on the server.
        if not self._strict_type_checks or allowlists is None:
            return

        # Disallow "flag" and "count" histograms on desktop, suggest to use
        # scalars instead. Allow using these histograms on Android, as we
        # don't support scalars there yet.
        hist_kind = definition.get("kind")
        android_target = "android" in definition.get("operating_systems", [])

        if (
            not android_target
            and hist_kind in ["flag", "count"]
            and name not in allowlists["kind"]
        ):
            ParserError(
                (
                    'Unsupported kind "%s" for histogram "%s":\n'
                    'New "%s" histograms are not supported on Desktop, you should'
                    " use scalars instead:\n"
                    "%s\n"
                    "Are you trying to add a histogram on Android?"
                    ' Add "operating_systems": ["android"] to your histogram definition.'
                )
                % (hist_kind, name, hist_kind, SCALARS_DOC_URL)
            ).handle_now()

    # Check for the presence of fields that old histograms are allowlisted for.
    def check_allowlistable_fields(self, name, definition):
        # Use counters don't have any mechanism to add the fields checked here,
        # so skip the check for them.
        # We also don't need to run any of these checks on the server.
        if self._is_use_counter or not self._strict_type_checks:
            return

        # In the pipeline we don't have allowlists available.
        if allowlists is None:
            return

        for field in ["alert_emails", "bug_numbers"]:
            if field not in definition and name not in allowlists[field]:
                ParserError(
                    'New histogram "%s" must have a "%s" field.' % (name, field)
                ).handle_later()
            if field in definition and name in allowlists[field]:
                msg = (
                    'Histogram "%s" should be removed from the allowlist for "%s" in '
                    "histogram-allowlists.json."
                )
                ParserError(msg % (name, field)).handle_later()

    def check_field_types(self, name, definition):
        # Define expected types for the histogram properties.
        type_checked_fields = {
            "n_buckets": int,
            "n_values": int,
            "low": int,
            "high": int,
            "keyed": bool,
            "expires_in_version": str,
            "kind": str,
            "description": str,
            "releaseChannelCollection": str,
        }

        # For list fields we check the items types.
        type_checked_list_fields = {
            "bug_numbers": int,
            "alert_emails": str,
            "labels": str,
            "record_in_processes": str,
            "keys": str,
            "products": str,
            "operating_systems": str,
            "record_into_store": str,
        }

        # For the server-side, where _strict_type_checks==False, we want to
        # skip the stricter type checks for these fields for dealing with
        # historical data.
        coerce_fields = ["low", "high", "n_values", "n_buckets"]
        if not self._strict_type_checks:
            # This handles some old non-numeric expressions.
            EXPRESSIONS = {
                "JS::GCReason::NUM_TELEMETRY_REASONS": 101,
                "mozilla::StartupTimeline::MAX_EVENT_ID": 12,
            }

            def try_to_coerce_to_number(v):
                if v in EXPRESSIONS:
                    return EXPRESSIONS[v]
                try:
                    return eval(v, {})
                except Exception:
                    return v

            for key in [k for k in coerce_fields if k in definition]:
                definition[key] = try_to_coerce_to_number(definition[key])
            # This handles old "keyed":"true" definitions (bug 1271986).
            if definition.get("keyed", None) == "true":
                definition["keyed"] = True

        def nice_type_name(t):
            if t is str:
                return "string"
            return t.__name__

        for key, key_type in type_checked_fields.items():
            if key not in definition:
                continue
            if not isinstance(definition[key], key_type):
                ParserError(
                    'Value for key "{0}" in histogram "{1}" should be {2}.'.format(
                        key, name, nice_type_name(key_type)
                    )
                ).handle_later()

        # Make sure the max range is lower than or equal to INT_MAX
        if "high" in definition and not c_int(definition["high"]).value > 0:
            ParserError(
                'Value for high in histogram "{0}" should be lower or equal to INT_MAX.'.format(
                    nice_type_name(c_int)
                )
            ).handle_later()

        for key, key_type in type_checked_list_fields.items():
            if key not in definition:
                continue
            if not all(isinstance(x, key_type) for x in definition[key]):
                ParserError(
                    'All values for list "{0}" in histogram "{1}" should be of type'
                    " {2}.".format(key, name, nice_type_name(key_type))
                ).handle_later()

    def check_keys(self, name, definition, allowed_keys):
        if not self._strict_type_checks:
            return
        for key in iter(definition.keys()):
            if key not in allowed_keys:
                ParserError(
                    'Key "%s" is not allowed for histogram "%s".' % (key, name)
                ).handle_later()

    def set_bucket_parameters(self, low, high, n_buckets):
        """Store the bucket parameters and enforce the bucket count limit.

        When allowlists are loaded, a histogram with more buckets than the
        limit (101 for enumerated and categorical histograms, 100 otherwise)
        is reported unless it is in the "n_buckets" allowlist.
        """
        self._low = low
        self._high = high
        self._n_buckets = n_buckets
        max_n_buckets = 101 if self._kind in ["enumerated", "categorical"] else 100
        # The type is tested before the comparison: without strict type checks
        # n_buckets may still be a non-numeric value, and ordering e.g. a str
        # against an int raises TypeError.
        if (
            allowlists is not None
            and type(self._n_buckets) is int
            and self._n_buckets > max_n_buckets
        ):
            if self._name not in allowlists["n_buckets"]:
                ParserError(
                    'New histogram "%s" is not permitted to have more than 100 buckets.\n'
                    "Histograms with large numbers of buckets use disproportionately high"
                    " amounts of resources. Contact a Telemetry peer (e.g. in #telemetry)"
                    " if you think an exception ought to be made:\n"
                    "https://wiki.mozilla.org/Modules/Toolkit#Telemetry" % self._name
                ).handle_later()

    @staticmethod
    def boolean_flag_bucket_parameters(definition):
        return (1, 2, 3)

    @staticmethod
    def linear_bucket_parameters(definition):
        return (definition.get("low", 1), definition["high"], definition["n_buckets"])

    @staticmethod
    def enumerated_bucket_parameters(definition):
        n_values = definition["n_values"]
        return (1, n_values, n_values + 1)

    @staticmethod
    def categorical_bucket_parameters(definition):
        """Return (1, n_values, n_values + 1) for a categorical histogram.

        n_values is the largest of the number of labels, the optional
        'n_values' field and MIN_CATEGORICAL_BUCKET_COUNT.
        """
        # Categorical histograms default to 50 buckets to make working with them easier.
        # Otherwise when adding labels later we run into problems with the pipeline not
        # supporting bucket changes.
        # This can be overridden using the n_values field.
        label_count = len(definition["labels"])
        requested = definition.get("n_values", 0)
        n_values = max(label_count, requested, MIN_CATEGORICAL_BUCKET_COUNT)
        return (1, n_values, n_values + 1)

    @staticmethod
    def exponential_bucket_parameters(definition):
        return (definition.get("low", 1), definition["high"], definition["n_buckets"])

    def set_nsITelemetry_kind(self):
        # Pick a Telemetry implementation type.
        types = {
            "boolean": "BOOLEAN",
            "flag": "FLAG",
            "count": "COUNT",
            "enumerated": "LINEAR",
            "categorical": "CATEGORICAL",
            "linear": "LINEAR",
            "exponential": "EXPONENTIAL",
        }

        if self._kind not in types:
            ParserError(
                'Unknown kind "%s" for histogram "%s".' % (self._kind, self._name)
            ).handle_later()

        self._nsITelemetry_kind = "nsITelemetry::HISTOGRAM_%s" % types[self._kind]

    def set_dataset(self, definition):
        datasets = {
            "opt-in": "DATASET_PRERELEASE_CHANNELS",
            "opt-out": "DATASET_ALL_CHANNELS",
        }

        value = definition.get("releaseChannelCollection", "opt-in")
        if value not in datasets:
            ParserError(
                "Unknown value for releaseChannelCollection"
                ' policy for histogram "%s".' % self._name
            ).handle_later()

        self._dataset = "nsITelemetry::" + datasets[value]


# This hook function loads the histograms into an OrderedDict.
# With strict type checks, it reports a ParserError (deferred through
# handle_later()) for every duplicate key; the last value for a key wins.
def load_histograms_into_dict(ordered_pairs, strict_type_checks):
    """Build an OrderedDict from `ordered_pairs`, a sequence of (key, value).

    With strict_type_checks, a key that was already seen is reported with
    ParserError(...).handle_later(). In either mode a later value replaces the
    earlier one.
    """
    histograms = collections.OrderedDict()
    for name, definition in ordered_pairs:
        if strict_type_checks and name in histograms:
            ParserError(
                "Found duplicate key in Histograms file: %s" % name
            ).handle_later()
        histograms[name] = definition
    return histograms


# We support generating histograms from multiple different input files, not
# just Histograms.json.  For each file's basename, we have a specific
# routine to parse that file, and return a dictionary mapping histogram
# names to histogram parameters.
def from_json(filename, strict_type_checks):
    """Parse the JSON file `filename` and return its histograms.

    Every JSON object is built by load_histograms_into_dict(), so key order is
    kept. A file that is not valid JSON is reported with
    ParserError(...).handle_now().
    """

    def build_object(pairs):
        return load_histograms_into_dict(pairs, strict_type_checks)

    with open(filename, "r") as f:
        try:
            histograms = json.load(f, object_pairs_hook=build_object)
        except ValueError as e:
            ParserError(
                "error parsing histograms in %s: %s" % (filename, e)
            ).handle_now()
    return histograms


def from_UseCounters_conf(filename, strict_type_checks):
    """Return the histograms that usecounters.generate_histograms() produces
    for `filename`. strict_type_checks is accepted but not used."""
    histograms = usecounters.generate_histograms(filename)
    return histograms


def from_UseCountersWorker_conf(filename, strict_type_checks):
    """Return the histograms that usecounters.generate_histograms() produces
    for `filename` when its second argument is True. strict_type_checks is
    accepted but not used."""
    histograms = usecounters.generate_histograms(filename, True)
    return histograms


def from_nsDeprecatedOperationList(filename, strict_type_checks):
    """Generate use counter histograms from a deprecated operation list.

    For every line of `filename` that starts with DEPRECATED_OPERATION(<op>),
    a "document" and a "page" boolean histogram are added, in that order.
    strict_type_checks is accepted but not used.
    """
    operation_regex = re.compile("^DEPRECATED_OPERATION\\(([^)]+)\\)")
    histograms = collections.OrderedDict()

    with open(filename, "r") as f:
        for line in f:
            match = operation_regex.search(line)
            if match is None:
                continue

            op = match.group(1)
            for context in ("document", "page"):
                name = "USE_COUNTER2_DEPRECATED_%s_%s" % (op, context.upper())
                histograms[name] = {
                    "expires_in_version": "never",
                    "kind": "boolean",
                    "description": "Whether a %s used %s" % (context, op),
                }

    return histograms


def to_camel_case(property_name):
    """Convert a "_"/"-" separated name to CamelCase.

    Surrounding "_" and then surrounding "-" characters are stripped. After
    that, every lowercase letter or digit that starts the name or follows a
    separator is upper-cased, and the separator in front of it is dropped.
    """
    trimmed = property_name.strip("_").strip("-")

    def capitalize(match):
        return match.group(2).upper()

    return re.sub("(^|_|-)([a-z0-9])", capitalize, trimmed)


def add_css_property_counters(histograms, property_name):
    """Add the "document" and the "page" use counter histogram for the CSS
    property `property_name` to `histograms`, in that order."""
    camel_cased = to_camel_case(property_name)
    for context in ("document", "page"):
        name = "USE_COUNTER2_CSS_PROPERTY_%s_%s" % (camel_cased, context.upper())
        histograms[name] = {
            "expires_in_version": "never",
            "kind": "boolean",
            "description": "Whether a %s used the CSS property %s"
            % (context, property_name),
        }


def from_ServoCSSPropList(filename, strict_type_checks):
    """Run the Python file `filename` and generate use counter histograms for
    the `name` of every entry of its `data` global. strict_type_checks is
    accepted but not used."""
    histograms = collections.OrderedDict()
    for prop in runpy.run_path(filename)["data"]:
        add_css_property_counters(histograms, prop.name)
    return histograms


def from_counted_unknown_properties(filename, strict_type_checks):
    """Generate use counters for counted-but-unknown CSS properties.

    `filename` is executed with `runpy.run_path` and its
    `COUNTED_UNKNOWN_PROPERTIES` global is iterated.
    `strict_type_checks` is part of the common parser signature and is not
    used here.

    Returns an OrderedDict of histogram definitions.
    """
    counters = collections.OrderedDict()

    # NOTE(emilio): In contrast to ServoCSSProperties, each entry here is
    # the bare property name rather than an object.
    #
    # The counters are named exactly like those of real CSS properties so
    # there is no discontinuity once a property gets implemented or
    # prototyped.
    for property_name in runpy.run_path(filename)["COUNTED_UNKNOWN_PROPERTIES"]:
        add_css_property_counters(counters, property_name)
    return counters


# This is only used for probe-scraper.
def from_properties_db(filename, strict_type_checks):
    """Generate CSS property use counters from a properties-db.js file.

    Only the `exports.CSS_PROPERTIES = {` ... `};` section is considered.
    Inside it, every line beginning with two spaces and a double quote
    contributes the text between its first pair of double quotes as a
    property name. `strict_type_checks` is part of the common parser
    signature and is not used here.

    Returns an OrderedDict of histogram definitions.
    """
    counters = collections.OrderedDict()
    with open(filename, "r") as db:
        remaining = iter(db)

        # Phase 1: consume everything up to and including the line that
        # opens the CSS_PROPERTIES object.
        for text in remaining:
            if text.startswith("exports.CSS_PROPERTIES = {"):
                break

        # Phase 2: collect property names until the object is closed.
        for text in remaining:
            if text.startswith("};"):
                break
            if text.startswith('  "'):
                add_css_property_counters(counters, text.split('"')[1])
    return counters


# Each entry takes the basename of an input file and returns the parser
# function responsible for it, or None when the file is not one it handles.
# Entries are consulted in order and the first non-None answer wins.
FILENAME_PARSERS = [
    lambda basename: from_json if basename.endswith(".json") else None,
    lambda basename: (
        from_nsDeprecatedOperationList
        if basename == "nsDeprecatedOperationList.h"
        else None
    ),
    lambda basename: (
        from_ServoCSSPropList if basename == "ServoCSSPropList.py" else None
    ),
    lambda basename: (
        from_counted_unknown_properties
        if basename == "counted_unknown_properties.py"
        else None
    ),
    lambda basename: from_properties_db if basename == "properties-db.js" else None,
]

# As with buildconfig earlier in this file, the usecounters module may be
# missing; the use counter parsers are only registered when it imports.
try:
    import usecounters
except ImportError:
    pass
else:
    FILENAME_PARSERS.extend(
        [
            lambda basename: (
                from_UseCounters_conf if basename == "UseCounters.conf" else None
            ),
            lambda basename: (
                from_UseCountersWorker_conf
                if basename == "UseCountersWorker.conf"
                else None
            ),
        ]
    )


def from_files(filenames, strict_type_checks=True):
    """Yield a Histogram for every definition found in `filenames`.

    This is a generator: nothing is parsed or validated until iteration
    starts. Each file is handed to the first matching entry of
    FILENAME_PARSERS, the results are merged in file order, and the merged
    set is checked before any Histogram is produced:

    * duplicate histogram names are reported;
    * USE_COUNTER2_*_WORKER histograms must form one contiguous block, and
      so must the remaining USE_COUNTER2_* histograms;
    * allowlist entries without a matching histogram are reported.

    All problems are reported through ParserError. When
    `strict_type_checks` is true, the allowlist is loaded first, and the
    flag is also forwarded to every parser and every Histogram.
    """
    if strict_type_checks:
        load_allowlist()

    merged = OrderedDict()
    for filename in filenames:
        basename = os.path.basename(filename)
        candidates = (check(basename) for check in FILENAME_PARSERS)
        parser = next((found for found in candidates if found is not None), None)

        if parser is None:
            ParserError("Don't know how to parse %s." % filename).handle_now()

        parsed = parser(filename, strict_type_checks)

        # Parsers must hand back an OrderedDict: a stable iteration order
        # here gives a stable insertion order into the merged mapping, hence
        # stable ordering in generated files and more deterministic builds.
        if not isinstance(parsed, OrderedDict):
            ParserError("Histogram parser did not provide an OrderedDict.").handle_now()

        for name, definition in parsed.items():
            if name in merged:
                ParserError('Duplicate histogram name "%s".' % name).handle_later()
            merged[name] = definition

    def require_contiguous(predicate, label):
        # Positions (in merged order) of the histograms selected by
        # `predicate`; they are contiguous exactly when the distance between
        # the first and the last one equals their count.
        positions = [
            index for index, name in enumerate(merged) if predicate(name)
        ]
        if not positions:
            return
        span = positions[-1] - positions[0] + 1
        if span != len(positions):
            ParserError(
                "Histograms %s must be defined in a contiguous block." % label
            ).handle_later()

    # Every USE_COUNTER2_*_WORKER histogram has to live in one contiguous
    # block.
    require_contiguous(
        lambda name: name.startswith("USE_COUNTER2_") and name.endswith("_WORKER"),
        "use counter worker",
    )
    # The same holds for all the other USE_COUNTER2_* histograms.
    require_contiguous(
        lambda name: name.startswith("USE_COUNTER2_")
        and not name.endswith("_WORKER"),
        "use counter",
    )

    # Histograms dropped from Histograms.json etc. must be dropped from the
    # allowlists as well.
    if allowlists is not None:
        listed = set(itertools.chain.from_iterable(allowlists.values()))
        orphaned = listed - set(merged)
        if orphaned:
            msg = (
                "The following entries are orphaned and should be removed from "
                "histogram-allowlists.json:\n%s"
            )
            ParserError(msg % ", ".join(sorted(orphaned))).handle_later()

    for name, definition in merged.items():
        yield Histogram(name, definition, strict_type_checks=strict_type_checks)