fune/third_party/python/taskcluster_taskgraph/taskgraph/util/vcs.py

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


import logging
import os
import re
import subprocess
from abc import ABC, abstractmethod, abstractproperty
from shutil import which

import requests
from redo import retry

from taskgraph.util.path import ancestors

# URL template for a ``json-pushes`` query. The two ``{}`` placeholders are
# filled, in order, with the ``repository`` and ``revision`` arguments of
# ``find_hg_revision_push_info``.
PUSHLOG_TMPL = "{}/json-pushes?version=2&changeset={}&tipsonly=1&full=1"

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class Repository(ABC):
    """Abstract interface over a local version control checkout.

    A concrete subclass names its executable through ``tool`` and implements
    the abstract members below. ``run`` is the shared helper that invokes
    that executable inside ``path``.
    """

    # Both mercurial and git use sha1 as revision identifiers. Luckily, both
    # define the same value as the null revision.
    #
    # https://github.com/git/git/blob/dc04167d378fb29d30e1647ff6ff51dd182bc9a3/t/oid-info/hash-info#L7
    # https://www.mercurial-scm.org/repo/hg-stable/file/82efc31bd152/mercurial/node.py#l30
    NULL_REVISION = "0000000000000000000000000000000000000000"

    def __init__(self, path):
        """Bind the object to the checkout at ``path``.

        Raises ``OSError`` when the executable named by ``tool`` cannot be
        located by ``shutil.which``.
        """
        self.path = path
        self.binary = which(self.tool)
        if self.binary is None:
            raise OSError(f"{self.tool} not found!")

        # Lower-case letters accepted in a ``diff_filter`` argument.
        self._valid_diff_filter = ("m", "a", "d")

        # Private copy of the environment so subclasses can add variables
        # without touching ``os.environ``.
        self._env = os.environ.copy()

    def run(self, *args: str, **kwargs):
        """Run the VCS binary with ``args`` in ``self.path``; return its stdout.

        ``return_codes`` (keyword argument) is a collection of exit codes to
        tolerate: if the command fails with one of them, an empty string is
        returned instead of raising. Any other failure re-raises the
        ``subprocess.CalledProcessError``. Remaining keyword arguments are
        forwarded to ``subprocess.check_output``.
        """
        return_codes = kwargs.pop("return_codes", [])
        cmd = (self.binary,) + args

        try:
            return subprocess.check_output(
                cmd, cwd=self.path, env=self._env, encoding="utf-8", **kwargs
            )
        except subprocess.CalledProcessError as e:
            if e.returncode in return_codes:
                return ""
            raise

    # NOTE: ``abc.abstractproperty`` is deprecated since Python 3.3; stacking
    # ``property`` on top of ``abstractmethod`` is the supported equivalent.

    @property
    @abstractmethod
    def tool(self) -> str:
        """Version control system being used, either 'hg' or 'git'."""

    @property
    @abstractmethod
    def head_rev(self) -> str:
        """Hash of HEAD revision."""

    @property
    @abstractmethod
    def base_rev(self):
        """Hash of revision the current topic branch is based on."""

    @property
    @abstractmethod
    def branch(self):
        """Current branch or bookmark the checkout has active."""

    @property
    @abstractmethod
    def all_remote_names(self):
        """Name of all configured remote repositories."""

    @property
    @abstractmethod
    def default_remote_name(self):
        """Name the VCS defines for the remote repository when cloning
        it for the first time. This name may not exist anymore if users
        changed the default configuration, for instance."""

    @property
    @abstractmethod
    def remote_name(self):
        """Name of the remote repository."""

    def _get_most_suitable_remote(self, remote_instructions):
        """Pick one remote name out of ``all_remote_names``.

        Preference order: the only remote if there is exactly one, then
        ``default_remote_name`` if it is configured, otherwise the first
        remote listed (a warning is logged in that last case).

        ``remote_instructions`` is text appended to that warning to tell the
        user how to configure the default remote.
        """
        remotes = self.all_remote_names
        if len(remotes) == 1:
            return remotes[0]

        if self.default_remote_name in remotes:
            return self.default_remote_name

        first_remote = remotes[0]
        logger.warning(
            f"Unable to determine which remote repository to use between: {remotes}. "
            f'Arbitrarily using the first one "{first_remote}". Please set an '
            f"`{self.default_remote_name}` remote if the arbitrarily selected one "
            f"is not right. To do so: {remote_instructions}"
        )

        return first_remote

    @property
    @abstractmethod
    def default_branch(self):
        """Name of the default branch."""

    @abstractmethod
    def get_url(self, remote=None):
        """Get URL of the upstream repository."""

    @abstractmethod
    def get_commit_message(self, revision=None):
        """Commit message of specified revision or current commit."""

    @abstractmethod
    def get_changed_files(self, diff_filter, mode="unstaged", rev=None, base_rev=None):
        """Return a list of files that are changed in:
         * either this repository's working copy,
         * or at a given revision (``rev``)
         * or between 2 revisions (``base_rev`` and ``rev``)

        ``diff_filter`` controls which kinds of modifications are returned.
        It is a string which may only contain the following characters:

            A - Include files that were added
            D - Include files that were deleted
            M - Include files that were modified

        By default, all three will be included.

        ``mode`` can be one of 'unstaged', 'staged' or 'all'. Only has an
        effect on git. Defaults to 'unstaged'.

        ``rev`` is a specifier for which changesets to consider for
        changes. The exact meaning depends on the vcs system being used.

        ``base_rev`` specifies the range of changesets. This parameter cannot
        be used without ``rev``. The range includes ``rev`` but excludes
        ``base_rev``.
        """

    @abstractmethod
    def get_outgoing_files(self, diff_filter, upstream):
        """Return a list of changed files compared to upstream.

        ``diff_filter`` works the same as `get_changed_files`.
        ``upstream`` is a remote ref to compare against. If unspecified,
        this will be determined automatically.

        NOTE(review): earlier documentation said a MissingUpstreamRepo
        exception is raised when there is no remote ref; no code visible in
        this module defines or raises it -- confirm before relying on it.
        """

    @abstractmethod
    def working_directory_clean(self, untracked=False, ignored=False):
        """Determine if the working directory is free of modifications.

        Returns True if the working directory does not have any file
        modifications. False otherwise.

        By default, untracked and ignored files are not considered. If
        ``untracked`` or ``ignored`` are set, they influence the clean check
        to factor these file classes into consideration.
        """

    @abstractmethod
    def update(self, ref):
        """Update the working directory to the specified reference."""

    @abstractmethod
    def find_latest_common_revision(self, base_ref_or_rev, head_rev):
        """Find the latest revision that is common to both the given
        ``head_rev`` and ``base_ref_or_rev``"""

    @abstractmethod
    def does_revision_exist_locally(self, revision):
        """Check whether this revision exists in the local repository.

        If this function returns an unexpected value, then make sure
        the revision was fetched from the remote repository."""


class HgRepository(Repository):
    """``Repository`` implementation backed by the ``hg`` command line."""

    tool = "hg"
    default_remote_name = "default"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # HGPLAIN makes Mercurial ignore user configuration that would alter
        # its default output, so the parsing below stays reliable.
        self._env["HGPLAIN"] = "1"

    @property
    def head_rev(self):
        """Full hash of the working directory's parent revision (``.``)."""
        return self.run("log", "-r", ".", "-T", "{node}").strip()

    @property
    def base_rev(self):
        """Full hash of the most recent public ancestor of ``.``."""
        return self.run(
            "log", "-r", "last(ancestors(.) and public())", "-T", "{node}"
        ).strip()

    @property
    def branch(self):
        """Content of ``.hg/bookmarks.current``, or ``None``.

        ``None`` is returned when the file is missing or empty.
        """
        bookmarks_fn = os.path.join(self.path, ".hg", "bookmarks.current")
        if os.path.exists(bookmarks_fn):
            with open(bookmarks_fn) as f:
                bookmark = f.read()
                return bookmark or None

        return None

    @property
    def all_remote_names(self):
        """Names printed by ``hg paths --quiet``.

        Raises ``RuntimeError`` when no path is configured.
        """
        remotes = self.run("paths", "--quiet").splitlines()
        if not remotes:
            raise RuntimeError("No remotes defined")
        return remotes

    @property
    def remote_name(self):
        """Remote chosen by ``_get_most_suitable_remote``."""
        return self._get_most_suitable_remote(
            "Edit .hg/hgrc and add:\n\n[paths]\ndefault = $URL",
        )

    @property
    def default_branch(self):
        # Mercurial recommends keeping "default"
        # https://www.mercurial-scm.org/wiki/StandardBranching#Don.27t_use_a_name_other_than_default_for_your_main_development_branch
        return "default"

    def get_url(self, remote="default"):
        """URL configured for the path named ``remote``."""
        return self.run("path", "-T", "{url}", remote).strip()

    def get_commit_message(self, revision=None):
        """Description of ``revision``, or of ``.`` when it is not given."""
        # The revision used to be resolved and then ignored in favour of a
        # hard-coded "."; honour the caller's argument.
        return self.run("log", "-r", revision or ".", "-T", "{desc}")

    def _format_diff_filter(self, diff_filter, for_status=False):
        """Lower-case ``diff_filter`` and validate its letters.

        With ``for_status`` the result is meant to be used as ``hg status``
        short flags, where deletions need two letters (see below).
        """
        df = diff_filter.lower()
        assert all(f in self._valid_diff_filter for f in df)

        # When looking at the changes in the working directory, the hg status
        # command uses 'd' for files that have been deleted with a non-hg
        # command, and 'r' for files that have been `hg rm`ed. Use both.
        return df.replace("d", "dr") if for_status else df

    def _files_template(self, diff_filter):
        """Build an ``hg log`` template printing one matching file per line."""
        template = ""
        df = self._format_diff_filter(diff_filter)
        if "a" in df:
            template += "{file_adds % '{file}\\n'}"
        if "d" in df:
            template += "{file_dels % '{file}\\n'}"
        if "m" in df:
            template += "{file_mods % '{file}\\n'}"
        return template

    def get_changed_files(
        self, diff_filter="ADM", mode="unstaged", rev=None, base_rev=None
    ):
        """List changed files in the working copy or in the given revisions.

        Without ``rev`` the working directory is inspected through
        ``hg status``; ``mode`` is not used by this implementation. With
        ``rev`` the files come from ``hg log``, for ``rev`` alone or for the
        range built from ``base_rev`` and ``rev``.

        Raises ``ValueError`` if ``base_rev`` is given without ``rev``.
        """
        if rev is None:
            if base_rev is not None:
                raise ValueError("Cannot specify `base_rev` without `rev`")
            # Use --no-status to print just the filename.
            df = self._format_diff_filter(diff_filter, for_status=True)
            return self.run("status", "--no-status", f"-{df}").splitlines()
        else:
            template = self._files_template(diff_filter)
            revision_argument = rev if base_rev is None else f"{base_rev}~-1::{rev}"
            return self.run("log", "-r", revision_argument, "-T", template).splitlines()

    def get_outgoing_files(self, diff_filter="ADM", upstream=None):
        """List files touched by changesets that are not upstream yet.

        Without ``upstream`` the draft ancestors of ``.`` are used; otherwise
        ``hg outgoing`` is run against ``upstream``. Exit code 1 from that
        command is tolerated and yields an empty list.
        """
        template = self._files_template(diff_filter)

        if not upstream:
            output = self.run(
                "log", "-r", "draft() and ancestors(.)", "--template", template
            )
        else:
            output = self.run(
                "outgoing",
                "-r",
                ".",
                "--quiet",
                "--template",
                template,
                upstream,
                return_codes=(1,),
            )

        # The template emits one path per line. Split on line boundaries only
        # so that paths containing spaces are not broken apart.
        return [f for f in output.splitlines() if f]

    def working_directory_clean(self, untracked=False, ignored=False):
        """Return True when ``hg status`` reports no entry of interest."""
        args = ["status", "--modified", "--added", "--removed", "--deleted"]
        if untracked:
            args.append("--unknown")
        if ignored:
            args.append("--ignored")

        # If output is empty, there are no entries of requested status, which
        # means we are clean.
        return not self.run(*args).strip()

    def update(self, ref):
        """Run ``hg update --check ref`` and return the command output."""
        return self.run("update", "--check", ref)

    def find_latest_common_revision(self, base_ref_or_rev, head_rev):
        """Hash of the last revision that is an ancestor of both arguments."""
        return self.run(
            "log",
            "-r",
            f"last(ancestors('{base_ref_or_rev}') and ancestors('{head_rev}'))",
            "--template",
            "{node}",
        ).strip()

    def does_revision_exist_locally(self, revision):
        """Return True when ``hg log -r revision`` prints something."""
        try:
            return self.run("log", "-r", revision).strip() != ""
        except subprocess.CalledProcessError as e:
            # Error code 255 comes with the message:
            # "abort: unknown revision $REVISION"
            if e.returncode == 255:
                return False
            raise


class GitRepository(Repository):
    """``Repository`` implementation backed by the ``git`` command line."""

    tool = "git"
    default_remote_name = "origin"

    # Extracts the branch name from the symref line printed by
    # ``git ls-remote --symref <remote> HEAD``.
    _LS_REMOTE_PATTERN = re.compile(r"ref:\s+refs/heads/(?P<branch_name>\S+)\s+HEAD")

    @property
    def head_rev(self):
        """Full hash of ``HEAD``."""
        return self.run("rev-parse", "--verify", "HEAD").strip()

    @property
    def base_rev(self):
        """Last boundary commit between ``HEAD`` and the remote refs.

        Falls back to ``head_rev`` when ``git rev-list`` prints nothing.
        """
        refs = self.run(
            "rev-list", "HEAD", "--topo-order", "--boundary", "--not", "--remotes"
        ).splitlines()
        if refs:
            return refs[-1][1:]  # boundary starts with a prefix `-`
        return self.head_rev

    @property
    def branch(self):
        """Name of the current branch, or ``None`` when git prints none."""
        return self.run("branch", "--show-current").strip() or None

    @property
    def all_remote_names(self):
        """Names printed by ``git remote``.

        Raises ``RuntimeError`` when no remote is configured.
        """
        remotes = self.run("remote").splitlines()
        if not remotes:
            raise RuntimeError("No remotes defined")
        return remotes

    @property
    def remote_name(self):
        """Remote of the current branch's upstream, with a fallback.

        When git reports that no upstream is configured (exit code 128), the
        choice is delegated to ``_get_most_suitable_remote``.
        """
        try:
            remote_branch_name = self.run(
                "rev-parse", "--verify", "--abbrev-ref", "--symbolic-full-name", "@{u}"
            ).strip()
            return remote_branch_name.split("/")[0]
        except subprocess.CalledProcessError as e:
            # Error code 128 comes with the message:
            # "fatal: no upstream configured for branch $BRANCH"
            if e.returncode != 128:
                raise

        return self._get_most_suitable_remote("`git remote add origin $URL`")

    @property
    def default_branch(self):
        """Default branch, found by trying three strategies in turn."""
        try:
            # this one works if the current repo was cloned from an existing
            # repo elsewhere
            return self._get_default_branch_from_cloned_metadata()
        except (subprocess.CalledProcessError, RuntimeError):
            pass

        try:
            # This call works if you have (network) access to the repo
            return self._get_default_branch_from_remote_query()
        except (subprocess.CalledProcessError, RuntimeError):
            pass

        # this one is the last resort in case the remote is not accessible and
        # the local repo is where `git init` was made
        return self._guess_default_branch()

    def _get_default_branch_from_remote_query(self):
        """Ask the remote which branch its ``HEAD`` points to.

        Returns ``"<remote>/<branch>"``. Raises ``RuntimeError`` when the
        ``ls-remote`` output contains no symref line.
        """
        # This function requires network access to the repo
        remote_name = self.remote_name
        output = self.run("ls-remote", "--symref", remote_name, "HEAD")
        matches = self._LS_REMOTE_PATTERN.search(output)
        if not matches:
            # Both literals need the ``f`` prefix, otherwise "{output}" is
            # printed verbatim instead of the command output.
            raise RuntimeError(
                f'Could not find the default branch of remote repository "{remote_name}". '
                f"Got: {output}"
            )

        branch_name = matches.group("branch_name")
        return f"{remote_name}/{branch_name}"

    def _get_default_branch_from_cloned_metadata(self):
        """Resolve ``<remote>/HEAD`` to an abbreviated ref name."""
        return self.run("rev-parse", "--abbrev-ref", f"{self.remote_name}/HEAD").strip()

    def _guess_default_branch(self):
        """Guess the default branch from the list of known refs.

        Succeeds only if exactly one ref name ends with one of the
        conventional default branch names; raises ``RuntimeError`` otherwise.
        """
        candidate_suffixes = ("main", "master", "branches/default/tip")
        refnames = (
            line.strip()
            for line in self.run(
                "branch", "--all", "--no-color", "--format=%(refname)"
            ).splitlines()
        )
        branches = [ref for ref in refnames if ref.endswith(candidate_suffixes)]

        if len(branches) == 1:
            return branches[0]

        raise RuntimeError(f"Unable to find default branch. Got: {branches}")

    def get_url(self, remote="origin"):
        """URL configured for ``remote``."""
        return self.run("remote", "get-url", remote).strip()

    def get_commit_message(self, revision=None):
        """Raw body (``%B``) of ``revision``, or of ``HEAD`` if not given."""
        # The revision used to be resolved but never passed to ``git log``,
        # so the message of HEAD was always returned.
        return self.run("log", "-n1", "--format=%B", revision or "HEAD")

    def get_changed_files(
        self, diff_filter="ADM", mode="unstaged", rev=None, base_rev=None
    ):
        """List changed files in the working copy or in the given revisions.

        Without ``rev``, ``git diff`` is used: against the index by default,
        ``--cached`` for ``mode="staged"``, against ``HEAD`` for
        ``mode="all"``. With ``rev``, ``git log`` is used over ``rev`` alone
        or over ``base_rev..rev``.

        Raises ``ValueError`` if ``base_rev`` is given without ``rev``.
        """
        assert all(f.lower() in self._valid_diff_filter for f in diff_filter)

        if rev is None:
            if base_rev is not None:
                raise ValueError("Cannot specify `base_rev` without `rev`")
            cmd = ["diff"]
            if mode == "staged":
                cmd.append("--cached")
            elif mode == "all":
                cmd.append("HEAD")
        else:
            revision_argument = (
                f"{rev}~1..{rev}" if base_rev is None else f"{base_rev}..{rev}"
            )
            # An empty format keeps only the file names in the log output.
            cmd = ["log", "--format=format:", revision_argument]

        cmd.append("--name-only")
        cmd.append("--diff-filter=" + diff_filter.upper())

        files = self.run(*cmd).splitlines()
        # Drop the blank separator lines left between commits.
        return [f for f in files if f]

    def get_outgoing_files(self, diff_filter="ADM", upstream=None):
        """List files touched by commits reachable from ``HEAD`` only.

        Commits reachable from ``upstream`` are excluded; without
        ``upstream`` every remote ref (``--remotes``) is excluded instead.
        """
        assert all(f.lower() in self._valid_diff_filter for f in diff_filter)

        not_condition = upstream if upstream else "--remotes"

        files = self.run(
            "log",
            "--name-only",
            f"--diff-filter={diff_filter.upper()}",
            "--oneline",
            "--pretty=format:",
            "HEAD",
            "--not",
            not_condition,
        ).splitlines()
        return [f for f in files if f]

    def working_directory_clean(self, untracked=False, ignored=False):
        """Return True when ``git status --porcelain`` prints nothing."""
        args = ["status", "--porcelain"]

        # Even in --porcelain mode, behavior is affected by the
        # ``status.showUntrackedFiles`` option, which means we need to be
        # explicit about how to treat untracked files.
        if untracked:
            args.append("--untracked-files=all")
        else:
            args.append("--untracked-files=no")

        if ignored:
            args.append("--ignored")

        # If output is empty, there are no entries of requested status, which
        # means we are clean.
        return not self.run(*args).strip()

    def update(self, ref):
        """Check out ``ref`` in the working directory."""
        self.run("checkout", ref)

    def find_latest_common_revision(self, base_ref_or_rev, head_rev):
        """Hash printed by ``git merge-base`` for the two arguments."""
        return self.run("merge-base", base_ref_or_rev, head_rev).strip()

    def does_revision_exist_locally(self, revision):
        """Return True when ``revision`` names a commit object known locally."""
        try:
            return self.run("cat-file", "-t", revision).strip() == "commit"
        except subprocess.CalledProcessError as e:
            # Error code 128 comes with the message:
            # "git cat-file: could not get object info"
            if e.returncode == 128:
                return False
            raise


def get_repository(path):
    """Return a repository object for the checkout that contains ``path``.

    Every directory yielded by ``ancestors(path)`` is inspected in order. The
    first one holding a ``.hg`` directory gives an ``HgRepository``; the first
    one holding a ``.git`` entry gives a ``GitRepository``. A ``RuntimeError``
    is raised when no directory matches.
    """
    for candidate in ancestors(path):
        hg_marker = os.path.join(candidate, ".hg")
        if os.path.isdir(hg_marker):
            return HgRepository(candidate)

        git_marker = os.path.join(candidate, ".git")
        if os.path.exists(git_marker):
            return GitRepository(candidate)

    raise RuntimeError("Current directory is neither a git or hg repository")


def find_hg_revision_push_info(repository, revision):
    """Look up the push that introduced ``revision`` in ``repository``.

    ``repository`` and ``revision`` are substituted into ``PUSHLOG_TMPL`` to
    build the ``json-pushes`` URL. The request is attempted up to 5 times,
    with a 60 second timeout per attempt.

    Returns a dict with the keys ``pushdate``, ``pushid`` and ``user``.
    Raises ``RuntimeError`` unless the response contains exactly one push.
    """
    pushlog_url = PUSHLOG_TMPL.format(repository, revision)

    def query_pushlog(url):
        # Use the argument supplied by ``retry`` rather than reaching for the
        # enclosing variable.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        return response

    response = retry(
        query_pushlog,
        args=(pushlog_url,),
        attempts=5,
        sleeptime=10,
    )
    pushes = response.json()["pushes"]
    if len(pushes) != 1:
        raise RuntimeError(
            f"Unable to find a single pushlog_id for {repository} "
            f"revision {revision}: {pushes}"
        )

    # Exactly one entry is left at this point: key is the push id, value its
    # metadata.
    pushid, push = next(iter(pushes.items()))
    return {
        "pushdate": push["date"],
        "pushid": pushid,
        "user": push["user"],
    }