# Source code for loopy.kernel

"""Kernel object."""

__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"

__license__ = """
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
"""

from sys import intern

from collections import defaultdict

import numpy as np
from pytools import ImmutableRecordWithoutPickling, ImmutableRecord, memoize_method
from pytools.tag import Taggable
import islpy as isl
from islpy import dim_type
import re

from pytools import UniqueNameGenerator, generate_unique_names, natsorted

from loopy.diagnostic import CannotBranchDomainTree, LoopyError
from loopy.tools import update_persistent_hash
from loopy.diagnostic import StaticValueFindingError
from loopy.kernel.data import filter_iname_tags_by_type, Iname
from warnings import warn


# {{{ unique var names

class _UniqueVarNameGenerator(UniqueNameGenerator):
    """A :class:`pytools.UniqueNameGenerator` that additionally avoids
    clashes between a name ``a`` and names of the form ``a_s<NUMBER>``.

    Array dimensions implemented as separate arrays generate names by
    appending ``_s<NUMBER>``, so both of the following count as conflicts:

    * ``a_s0`` is already a name and ``a`` is to be inserted,
    * ``a`` is already a name and ``a_s0`` is to be inserted.
    """

    def __init__(self, existing_names=frozenset(), forced_prefix=""):
        super().__init__(existing_names, forced_prefix)

        pattern = re.compile("(.*)_s[0-9]+$")
        suffix_matches = (pattern.match(name) for name in existing_names)

        # Prefixes 'a' for which some 'a_s<NUMBER>' is already taken.
        self.conflicting_array_prefixes = {
                m.group(1) for m in suffix_matches if m is not None}
        self.array_prefix_pattern = pattern

    def _name_added(self, name):
        """Record the array prefix of *name* if it has the form
        ``<prefix>_s<NUMBER>``.
        """
        suffix_match = self.array_prefix_pattern.match(name)
        if suffix_match is not None:
            self.conflicting_array_prefixes.add(suffix_match.group(1))

    def is_name_conflicting(self, name):
        """Return *True* if *name* is taken or would clash with a
        ``_s<NUMBER>`` name (in either direction).
        """
        # Direct hit, or 'a_s<NUMBER>' exists and we are asked about 'a'.
        if (name in self.existing_names
                or name in self.conflicting_array_prefixes):
            return True

        # 'a' exists and we are asked about 'a_s<NUMBER>'.
        suffix_match = self.array_prefix_pattern.match(name)
        return (
                suffix_match is not None
                and suffix_match.group(1) in self.existing_names)

# }}}


# {{{ loop kernel object

class _deprecated_KernelState_SCHEDULED:  # noqa
    def __init__(self, f):
        self.f = f

    def __get__(self, obj, klass):
        warn(
            "'KernelState.SCHEDULED' is deprecated. "
            "Use 'KernelState.LINEARIZED'.",
            DeprecationWarning, stacklevel=2)
        return self.f()

class KernelState:  # noqa
    """Integer constants naming the states a kernel can be in.

    ``SCHEDULED`` is a deprecated alias of ``LINEARIZED``; reading it emits a
    :class:`DeprecationWarning`.
    """

    INITIAL = 0
    CALLS_RESOLVED = 1
    PREPROCESSED = 2
    LINEARIZED = 3

    # The lambda is evaluated lazily on attribute access, by which time the
    # class name 'KernelState' is bound.
    SCHEDULED = _deprecated_KernelState_SCHEDULED(
            lambda: KernelState.LINEARIZED)

# {{{ kernel_state, KernelState compatibility

class _deperecated_kernel_state_class_method:  # noqa
    def __init__(self, f):
        self.f = f

    def __get__(self, obj, klass):
        warn("'temp_var_scope' is deprecated. Use 'AddressSpace'.",
                DeprecationWarning, stacklevel=2)
        return self.f()


class kernel_state:  # noqa
    """Deprecated. Use :class:`loopy.kernel.KernelState` instead.

    Each attribute forwards to the corresponding :class:`KernelState` value
    and emits a :class:`DeprecationWarning` when read.
    """

    @_deperecated_kernel_state_class_method
    def INITIAL():  # pylint:disable=no-method-argument
        return KernelState.INITIAL

    @_deperecated_kernel_state_class_method
    def PREPROCESSED():  # pylint:disable=no-method-argument
        return KernelState.PREPROCESSED

    @_deperecated_kernel_state_class_method
    def SCHEDULED():  # pylint:disable=no-method-argument
        # Same value as 'KernelState.SCHEDULED', but reading 'LINEARIZED'
        # directly avoids triggering a second deprecation warning that would
        # be attributed to this module rather than to the user's code.
        return KernelState.LINEARIZED

# }}}


def _get_inames_from_domains(domains):
    return frozenset().union(*
            (frozenset(dom.get_var_names(dim_type.set)) for dom in domains))


class _not_provided:  # noqa: N801
    pass


class LoopKernel(ImmutableRecordWithoutPickling, Taggable):
    """These correspond more or less directly to arguments of
    :func:`loopy.make_kernel`.

    .. note::

        This data structure and its attributes should be considered immutable,
        even if it contains mutable data types. See :meth:`copy` for an easy
        way of producing a modified copy.

    .. attribute:: domains

        a list of :class:`islpy.BasicSet` instances representing the
        :ref:`domain-tree`.

    .. attribute:: instructions

        A list of :class:`InstructionBase` instances, e.g.
        :class:`Assignment`. See :ref:`instructions`.

    .. attribute:: args

        A list of :class:`loopy.KernelArgument`

    .. attribute:: schedule

        *None* or a list of :class:`loopy.schedule.ScheduleItem`

    .. attribute:: name
    .. attribute:: preambles
    .. attribute:: preamble_generators
    .. attribute:: assumptions

        A :class:`islpy.BasicSet` parameter domain.

    .. attribute:: local_sizes
    .. attribute:: temporary_variables

        A :class:`dict` mapping variable names to
        :class:`loopy.TemporaryVariable`
        instances.

    .. attribute:: symbol_manglers

    .. attribute:: substitutions

        a mapping from substitution names to
        :class:`SubstitutionRule` objects

    .. attribute:: iname_slab_increments

        a dictionary mapping inames to (lower_incr,
        upper_incr) tuples that will be separated out in the execution to generate
        'bulk' slabs with fewer conditionals.

    .. attribute:: loop_priority

        A frozenset of priority constraints to the kernel. Each such constraint
        is a tuple of inames. Inames occurring in such a tuple will be scheduled
        earlier than any iname following in the tuple. This applies only to inames
        with non-parallel implementation tags.

    .. attribute:: silenced_warnings

    .. attribute:: applied_iname_rewrites

        A list of past substitution dictionaries that
        were applied to the kernel. These are stored so that they may be repeated
        on expressions the user specifies later.

    .. attribute:: cache_manager
    .. attribute:: options

        An instance of :class:`loopy.Options`

    .. attribute:: state

        A value from :class:`KernelState`.

    .. attribute:: target

        A subclass of :class:`loopy.TargetBase`.

    .. attribute:: inames

        An instance of :class:`dict`, a mapping from the names of kernel's
        inames to their corresponding instances of :class:`loopy.kernel.data.Iname`.
        An entry is guaranteed to be present for each iname.

    .. automethod:: __call__
    .. automethod:: copy

    .. automethod:: tagged
    .. automethod:: without_tags
    """

    # {{{ constructor

    def __init__(self, domains, instructions, args=None,
            schedule=None,
            linearization=None,
            name="loopy_kernel",
            preambles=None,
            preamble_generators=None,
            assumptions=None,
            local_sizes=None,
            temporary_variables=None,
            inames=None,
            iname_to_tags=None,
            substitutions=None,
            symbol_manglers=None,

            iname_slab_increments=None,
            loop_priority=frozenset(),
            silenced_warnings=None,

            applied_iname_rewrites=None,
            cache_manager=None,
            index_dtype=None,
            options=None,

            state=KernelState.INITIAL,
            target=None,

            overridden_get_grid_sizes_for_insn_ids=None,
            _cached_written_variables=None,
            tags=frozenset()):
        """Validate the arguments, fill in defaults for those left at *None*
        and store everything on the record.

        :arg schedule: Deprecated alias of *linearization*. Passing it emits
            a :class:`DeprecationWarning`; passing both *schedule* and
            *linearization* raises :class:`ValueError`.
        :arg assumptions: must be an :class:`islpy.BasicSet` that is a
            parameter domain (checked by assertion).
        :arg inames: a :class:`dict` mapping iname names to
            :class:`loopy.kernel.data.Iname` instances. Must be given unless
            *iname_to_tags* is passed (checked by assertion).
        :arg iname_to_tags: Deprecated. A mapping from iname names to tags.
            If given, *inames* is built from it, with one entry for every
            iname found in *domains*; inames missing from the mapping get
            an empty tag set.
        :arg index_dtype: defaults to :class:`numpy.int32`. Must be a signed
            integer type.
        :arg state: one of the values of :class:`KernelState`.
        :arg overridden_get_grid_sizes_for_insn_ids: A callable. When kernels get
            intersected in slab decomposition, their grid sizes shouldn't
            change. This provides a way to forward sub-kernel grid size requests.

        :raises LoopyError: if both *iname_to_tags* and *inames* are given.
        :raises TypeError: if *index_dtype* is not a signed integer type.
        :raises ValueError: if *state* is invalid or if both *schedule* and
            *linearization* are given.
        """

        # {{{ process constructor arguments

        if args is None:
            args = []
        if preambles is None:
            preambles = []
        if preamble_generators is None:
            preamble_generators = []
        if local_sizes is None:
            local_sizes = {}
        if temporary_variables is None:
            temporary_variables = {}
        if substitutions is None:
            substitutions = {}
        if symbol_manglers is None:
            symbol_manglers = []
        if iname_slab_increments is None:
            iname_slab_increments = {}

        if silenced_warnings is None:
            silenced_warnings = []
        if applied_iname_rewrites is None:
            applied_iname_rewrites = []

        if cache_manager is None:
            from loopy.kernel.tools import SetOperationCacheManager
            cache_manager = SetOperationCacheManager()

        if iname_to_tags is not None:
            warn("Providing iname_to_tags is deprecated, pass inames instead. "
                    "Will be unsupported in 2022.",
                    DeprecationWarning, stacklevel=2)

            if inames is not None:
                raise LoopyError("Cannot provide both iname_to_tags and inames to "
                        "LoopKernel.__init__")

            # 'inames' is None at this point, so the tags have to come from
            # 'iname_to_tags'. (This used to call 'inames.get', which always
            # failed with an AttributeError.)
            inames = {
                name: Iname(name, iname_to_tags.get(name, frozenset()))
                for name in _get_inames_from_domains(domains)}

        assert isinstance(inames, dict)

        if index_dtype is None:
            index_dtype = np.int32

        # }}}

        assert isinstance(assumptions, isl.BasicSet)
        assert assumptions.is_params()

        from loopy.types import to_loopy_type
        index_dtype = to_loopy_type(index_dtype, target=target)
        if not index_dtype.is_integral():
            raise TypeError("index_dtype must be an integer")
        if np.iinfo(index_dtype.numpy_dtype).min >= 0:
            raise TypeError("index_dtype must be signed")

        if state not in [
                KernelState.INITIAL,
                KernelState.CALLS_RESOLVED,
                KernelState.PREPROCESSED,
                KernelState.LINEARIZED,
                ]:
            raise ValueError("invalid value for 'state'")

        if linearization is not None:
            if schedule is not None:
                # these should not both be present
                raise ValueError(
                    "received both `schedule` and `linearization` args, "
                    "'LoopKernel.schedule' is deprecated. "
                    "Use 'LoopKernel.linearization'.")
        elif schedule is not None:
            warn(
                "'LoopKernel.schedule' is deprecated. "
                "Use 'LoopKernel.linearization'.",
                DeprecationWarning, stacklevel=2)
            linearization = schedule

        assert assumptions.get_ctx() == isl.DEFAULT_CONTEXT

        super().__init__(
                domains=domains,
                instructions=instructions,
                args=args,
                linearization=linearization,
                name=name,
                preambles=preambles,
                preamble_generators=preamble_generators,
                assumptions=assumptions,
                iname_slab_increments=iname_slab_increments,
                loop_priority=loop_priority,
                silenced_warnings=silenced_warnings,
                temporary_variables=temporary_variables,
                local_sizes=local_sizes,
                inames=inames,
                substitutions=substitutions,
                cache_manager=cache_manager,
                applied_iname_rewrites=applied_iname_rewrites,
                symbol_manglers=symbol_manglers,
                index_dtype=index_dtype,
                options=options,
                state=state,
                target=target,
                overridden_get_grid_sizes_for_insn_ids=(
                    overridden_get_grid_sizes_for_insn_ids),
                _cached_written_variables=_cached_written_variables,
                tags=tags)

        # Per-instance cache; not passed to the record constructor above.
        self._kernel_executor_cache = {}

    # }}}

    # {{{ symbol mangling

    def mangle_symbol(self, ast_builder, identifier):
        """Return the first non-*None* result obtained by calling the symbol
        manglers of *ast_builder*, followed by those of the kernel, as
        ``mangler(kernel, identifier)``. Return *None* if no mangler
        produces a result.
        """
        candidates = ast_builder.symbol_manglers() + self.symbol_manglers

        # Lazy: manglers after the first successful one are not called.
        outcomes = (mangle(self, identifier) for mangle in candidates)
        return next(
                (outcome for outcome in outcomes if outcome is not None),
                None)

    # }}}

    # {{{ name wrangling

    @memoize_method
    def non_iname_variable_names(self):
        """Return a :class:`set` of the names of all arguments and
        temporary variables.
        """
        names = set(self.arg_dict)
        names.update(self.temporary_variables)
        return names

    @memoize_method
    def all_variable_names(self, include_temp_storage=True):
        """Return a :class:`set` of the names of all temporaries,
        substitution rules, arguments and inames.

        :arg include_temp_storage: if true, also include the ``base_storage``
            names of temporaries that have one.
        """
        names = set(self.temporary_variables)

        if include_temp_storage:
            names.update(
                    tv.base_storage
                    for tv in self.temporary_variables.values()
                    if tv.base_storage is not None)

        names.update(self.substitutions)
        names.update(arg.name for arg in self.args)
        names.update(self.all_inames())
        return names

    def get_var_name_generator(self):
        """Return a :class:`_UniqueVarNameGenerator` seeded with
        :meth:`all_variable_names`.
        """
        taken_names = self.all_variable_names()
        return _UniqueVarNameGenerator(taken_names)

    def get_instruction_id_generator(self, based_on="insn"):
        """Return a :class:`pytools.UniqueNameGenerator` seeded with the ids
        of all instructions in the kernel.

        :arg based_on: accepted, but not used by this method.
        """
        taken_ids = set()
        for insn in self.instructions:
            taken_ids.add(insn.id)

        return UniqueNameGenerator(taken_ids)

    def make_unique_instruction_id(self, insns=None, based_on="insn",
            extra_used_ids=frozenset()):
        """Return an interned instruction id derived from *based_on* that is
        used neither by *insns* nor listed in *extra_used_ids*.

        :arg insns: the instructions whose ids are considered taken.
            Defaults to the kernel's instructions.
        """
        considered = self.instructions if insns is None else insns
        taken_ids = {insn.id for insn in considered} | extra_used_ids

        for candidate in generate_unique_names(based_on):
            if candidate in taken_ids:
                continue
            return intern(candidate)

    def all_group_names(self):
        """Return a :class:`frozenset` of every group name appearing in the
        ``groups`` or ``conflicts_with_groups`` of any instruction.
        """
        return frozenset(
                group
                for insn in self.instructions
                for group_collection in (insn.groups, insn.conflicts_with_groups)
                for group in group_collection)

    def get_group_name_generator(self):
        """Return a :class:`_UniqueVarNameGenerator` seeded with
        :meth:`all_group_names`.
        """
        taken_names = set(self.all_group_names())
        return _UniqueVarNameGenerator(taken_names)

    def get_var_descriptor(self, name):
        """Return an object describing the variable *name*.

        Lookup order: kernel arguments, temporary variables, inames (for
        which a scalar :class:`loopy.TemporaryVariable` of the kernel's
        ``index_dtype`` is made up), and finally symbols known to the symbol
        manglers (returned as a :class:`loopy.ValueArg`).

        :raises ValueError: if none of these know about *name*.
        """
        for known_vars in (self.arg_dict, self.temporary_variables):
            try:
                return known_vars[name]
            except KeyError:
                continue

        if name in self.all_inames():
            from loopy import TemporaryVariable
            return TemporaryVariable(name=name, dtype=self.index_dtype, shape=())

        try:
            # mangle_symbol returns None when no mangler matches; unpacking
            # that raises the TypeError handled below.
            dtype, name = self.mangle_symbol(
                    self.target.get_device_ast_builder(), name)
            from loopy import ValueArg
            return ValueArg(name, dtype)
        except TypeError:
            pass

        raise ValueError("nothing known about variable '%s'" % name)

    @property
    @memoize_method
    def id_to_insn(self):
        """A :class:`dict` mapping instruction ids to instructions."""
        insn_by_id = {}
        for insn in self.instructions:
            insn_by_id[insn.id] = insn
        return insn_by_id

    # }}}

    # {{{ domain wrangling

    @memoize_method
    def parents_per_domain(self):
        """Return a list with one entry per domain in :attr:`domains`
        (by index). Each entry is the index of that domain's immediate
        parent in the domain nesting tree, or *None* if the domain has no
        parent.

        The parent is found by walking back through the domains seen so far
        until one is found on whose inames the current domain depends, as
        determined by :func:`loopy.kernel.tools.is_domain_dependent_on_inames`.
        """

        # The stack of iname sets records which inames are active
        # as we step through the linear list of domains. It also
        # determines the granularity of inames to be popped/deactivated
        # if we ascend a level.
        #
        # Each stack entry is cumulative: it holds the inames of its own
        # level together with those of all levels below it on the stack.

        iname_set_stack = []
        result = []

        from loopy.kernel.tools import is_domain_dependent_on_inames

        for dom_idx, dom in enumerate(self.domains):
            inames = set(dom.get_var_names(dim_type.set))

            # This next domain may be nested inside the previous domain.
            # Or it may not, in which case we need to figure out how many
            # levels of parents we need to discard in order to find the
            # true parent.

            discard_level_count = 0
            while discard_level_count < len(iname_set_stack):
                last_inames = (
                        iname_set_stack[-1-discard_level_count])
                if discard_level_count + 1 < len(iname_set_stack):
                    # Subtract the next-lower (cumulative) entry to obtain
                    # only the inames this stack level itself contributed.
                    last_inames = (
                            last_inames - iname_set_stack[-2-discard_level_count])

                if is_domain_dependent_on_inames(self, dom_idx, last_inames):
                    break

                discard_level_count += 1

            if discard_level_count:
                iname_set_stack = iname_set_stack[:-discard_level_count]

            # Start from the previously processed domain (if any) ...
            if result:
                parent = len(result)-1
            else:
                parent = None

            # ... and move up one parent link per discarded stack level.
            for _i in range(discard_level_count):
                assert parent is not None
                parent = result[parent]

            # found this domain's parent
            result.append(parent)

            # Push the cumulative iname set for this domain's level.
            if iname_set_stack:
                parent_inames = iname_set_stack[-1]
            else:
                parent_inames = set()
            iname_set_stack.append(parent_inames | inames)

        return result

    @memoize_method
    def all_parents_per_domain(self):
        """Return a list with one entry per domain in :attr:`domains`
        (by index). Each entry is the list of indices of all domains nested
        around that domain.

        Each such list is ordered from the root of the nesting tree down to
        the immediate parent. A domain without a parent gets an empty list.
        """
        direct_parent = self.parents_per_domain()

        ancestors_per_domain = []
        for ancestor in direct_parent:
            # Follow the parent links upward, then flip the chain so that
            # the outermost domain comes first.
            chain = []
            while ancestor is not None:
                chain.append(ancestor)
                ancestor = direct_parent[ancestor]
            chain.reverse()

            ancestors_per_domain.append(chain)

        return ancestors_per_domain

    @memoize_method
    def _get_home_domain_map(self):
        """Return a :class:`dict` mapping each iname to the index (into
        :attr:`domains`) of the domain listing it as a set dimension. If
        several domains list it, the last one wins.
        """
        home_of = {}
        for i_domain, dom in enumerate(self.domains):
            for iname in dom.get_var_names(dim_type.set):
                home_of[iname] = i_domain
        return home_of

    def get_home_domain_index(self, iname):
        """Return the index into :attr:`domains` of the home domain of
        *iname*. Raises :class:`KeyError` for unknown inames.
        """
        home_domain_map = self._get_home_domain_map()
        return home_domain_map[iname]

    @property
    def isl_context(self):
        """The isl context of the first domain in :attr:`domains`.

        Raises :class:`AssertionError` if the kernel has no domains.
        """
        no_domain = object()
        first_dom = next(iter(self.domains), no_domain)

        if first_dom is no_domain:
            raise AssertionError()

        return first_dom.get_ctx()

    @memoize_method
    def combine_domains(self, domains):
        """Return the intersection of the domains with the given indices,
        after aligning their spaces.

        :arg domains: a :class:`tuple` of domain indices of domains to be
            combined. More 'dominant' domains (those which get most say on
            the actual dim_type of an iname) must be later in the order.
            If empty, a universe set in a space without dimensions is
            returned.
        """
        assert isinstance(domains, tuple)  # for caching

        if not domains:
            empty_space = isl.Space.set_alloc(self.isl_context, 0, 0)
            return isl.BasicSet.universe(empty_space)

        remaining = (self.domains[dom_index] for dom_index in domains)

        # Seed with the first domain, then fold the others in one by one.
        combined = next(remaining)
        for dom in remaining:
            aligned_dom, aligned_combined = isl.align_two(dom, combined)
            combined = aligned_combined & aligned_dom

        return combined

    def get_inames_domain(self, inames):
        """Return the combined domain needed to cover *inames*.

        :arg inames: a :class:`frozenset` of iname names. A single name
            given as a :class:`str` is accepted as well. Any other iterable
            is converted to a :class:`frozenset`, with a warning.
        """
        if not inames:
            return self.combine_domains(())

        if isinstance(inames, str):
            inames = frozenset([inames])
        elif not isinstance(inames, frozenset):
            inames = frozenset(inames)
            warn("get_inames_domain did not get a frozenset", stacklevel=2)

        return self._get_inames_domain_backend(inames)

    @memoize_method
    def get_leaf_domain_indices(self, inames):
        """Find the leaves of the domain tree needed to cover all inames.

        :arg inames: a non-mutable iterable
        :returns: a list of domain indices, one per entry of the internal
            ``root_to_leaf`` mapping.
        :raises CannotBranchDomainTree: if covering *inames* would require
            domains from two different branches below a common ancestor.
        """

        hdm = self._get_home_domain_map()
        ppd = self.all_parents_per_domain()

        # Indices of all domains already covered by a previously seen iname
        # (home domains and their ancestors).
        domain_indices = set()

        # map root -> leaf
        root_to_leaf = {}

        for iname in inames:
            home_domain_index = hdm[iname]
            if home_domain_index in domain_indices:
                # nothin' new
                continue

            # NOTE(review): all_parents_per_domain() builds its lists via
            # insert(0, ...), i.e. ordered root-first. The last element of
            # this list is then the immediate parent (or the home domain
            # itself), not necessarily the root of the tree, even though it
            # is used as 'current_root' below. Confirm whether this is
            # intended.
            domain_path_to_root = [home_domain_index] + ppd[home_domain_index]
            current_root = domain_path_to_root[-1]
            previous_leaf = root_to_leaf.get(current_root)

            if previous_leaf is not None:
                # Check that we don't branch the domain tree.
                #
                # Branching the domain tree is dangerous/ill-formed because
                # it can introduce artificial restrictions on variables
                # further up the tree.

                prev_path_to_root = set([previous_leaf] + ppd[previous_leaf])
                if not prev_path_to_root <= set(domain_path_to_root):
                    raise CannotBranchDomainTree("iname set '%s' requires "
                            "branch in domain tree (when adding '%s')"
                            % (", ".join(inames), iname))
            else:
                # We're adding a new root. That's fine.
                pass

            # The new home domain sits below the previous leaf (or is the
            # first one under this key), so it becomes the recorded leaf.
            root_to_leaf[current_root] = home_domain_index
            domain_indices.update(domain_path_to_root)

        return list(root_to_leaf.values())

    @memoize_method
    def _get_inames_domain_backend(self, inames):
        """Combine the leaf domains covering *inames* with all of their
        ancestors, in ascending order of domain index.
        """
        ancestors_of = self.all_parents_per_domain()

        needed_indices = set()
        for leaf_index in self.get_leaf_domain_indices(inames):
            needed_indices.add(leaf_index)
            needed_indices.update(ancestors_of[leaf_index])

        return self.combine_domains(tuple(sorted(needed_indices)))

    # }}}

    @property
    def schedule(self):
        """Deprecated alias of :attr:`linearization`.

        Emits a :class:`DeprecationWarning` on every access and returns
        ``self.linearization``.
        """
        # The message used to name 'linearization' as the deprecated
        # attribute, which told users to replace an attribute with itself.
        warn(
                "LoopKernel.schedule is deprecated. "
                "Call LoopKernel.linearization instead, "
                "will be unsupported in 2022.",
                DeprecationWarning, stacklevel=2)
        return self.linearization

    # {{{ iname wrangling

    @property
    @memoize_method
    def iname_to_tags(self):
        """Deprecated. A :class:`dict` mapping the names of inames that have
        tags to those tags. Inames with no tags are left out.
        """
        warn(
                "LoopKernel.iname_to_tags is deprecated. "
                "Call LoopKernel.inames instead, "
                "will be unsupported in 2022.",
                DeprecationWarning, stacklevel=2)

        tags_by_name = {}
        for name, iname in self.inames.items():
            if iname.tags:
                tags_by_name[name] = iname.tags
        return tags_by_name

    def iname_tags(self, iname):
        """Return the tags of the iname named *iname*."""
        iname_obj = self.inames[iname]
        return iname_obj.tags

    def iname_tags_of_type(self, iname, tag_type_or_types,
            max_num=None, min_num=None):
        """Return the subset of the tags of *iname* that match
        *tag_type_or_types*, as determined by
        :func:`loopy.kernel.data.filter_iname_tags_by_type`, to which
        *max_num* and *min_num* are passed on.

        :arg iname: the name of an iname of this kernel.
        :arg tag_type_or_types: a subclass of :class:`loopy.kernel.data.InameTag`.
        :arg max_num: the maximum number of tags expected to be found.
        :arg min_num: the minimum number of tags expected to be found.
        """
        tags_of_iname = self.iname_tags(iname)
        return filter_iname_tags_by_type(
                tags_of_iname, tag_type_or_types,
                max_num=max_num, min_num=min_num)

    @memoize_method
    def all_inames(self):
        """Return a :class:`frozenset` holding the name of every iname in
        the kernel, i.e. the keys of :attr:`inames`.
        """
        return frozenset(self.inames)

    @memoize_method
    def all_params(self):
        """Return the names of all parameter dimensions of the kernel's
        domains that are not inames, passed through
        :func:`loopy.tools.intern_frozenset_of_ids`.
        """
        from loopy.tools import intern_frozenset_of_ids

        iname_names = self.all_inames()
        params = {
                name
                for dom in self.domains
                for name in dom.get_var_names(dim_type.param)
                if name not in iname_names}

        return intern_frozenset_of_ids(params)

    def outer_params(self):
        """Return the result of :func:`loopy.kernel.tools.get_outer_params`
        applied to the kernel's domains.
        """
        from loopy.kernel.tools import get_outer_params

        kernel_domains = self.domains
        return get_outer_params(kernel_domains)

    @memoize_method
    def all_insn_inames(self):
        """Return a :class:`dict` mapping each instruction id to the
        ``within_inames`` of that instruction.
        """
        return {insn.id: insn.within_inames for insn in self.instructions}

    @memoize_method
    def all_referenced_inames(self):
        """Return a :class:`set` of all inames that occur in the
        ``within_inames`` of at least one instruction.
        """
        inames_per_insn = self.all_insn_inames().values()
        return set().union(*inames_per_insn)

    def insn_inames(self, insn):
        """Return the ``within_inames`` of *insn*, which may be given as an
        instruction object or as an instruction id (a :class:`str`).
        """
        insn_obj = self.id_to_insn[insn] if isinstance(insn, str) else insn
        return insn_obj.within_inames

    @memoize_method
    def iname_to_insns(self):
        """Return a :class:`dict` mapping every iname of the kernel to the
        :class:`set` of ids of the instructions having it in their
        ``within_inames``. Unused inames map to an empty set.
        """
        insn_ids_of = {}
        for iname in self.all_inames():
            insn_ids_of[iname] = set()

        for insn in self.instructions:
            for iname in insn.within_inames:
                insn_ids_of[iname].add(insn.id)

        return insn_ids_of

    @memoize_method
    def _remove_inames_for_shared_hw_axes(self, cond_inames):
        """Return *cond_inames* (as a :class:`frozenset`) without those
        inames whose hardware-concurrent tag has the same ``key`` as that of
        another iname in *cond_inames*.

        We shouldn't be writing conditionals for such inames because we
        would be implicitly restricting the other inames as well.
        """
        from loopy.kernel.data import HardwareConcurrentTag

        # tag key -> inames in cond_inames carrying a tag with that key
        users_by_key = defaultdict(list)
        for iname in cond_inames:
            hw_tags = self.iname_tags_of_type(
                    iname, HardwareConcurrentTag, max_num=1)
            if hw_tags:
                hw_tag, = hw_tags
                users_by_key[hw_tag.key].append(iname)

        # Every user of a key that has more than one user is to be dropped.
        shared_axis_inames = {
                iname
                for users in users_by_key.values()
                if len(users) > 1
                for iname in users}

        return frozenset(cond_inames - shared_axis_inames)

    # }}}

    # {{{ dependency wrangling

    @memoize_method
    def recursive_insn_dep_map(self):
        """Return a :class:`dict` mapping each instruction id to a
        :class:`frozenset` of the ids of all instructions it directly or
        indirectly depends on.
        """
        dep_map = {}

        def transitive_deps(insn_id):
            # Reuse the result if this instruction was already handled.
            if insn_id in dep_map:
                return dep_map[insn_id]

            direct_deps = self.id_to_insn[insn_id].depends_on
            collected = set(direct_deps)
            for dep_id in list(direct_deps):
                collected.update(transitive_deps(dep_id))

            dep_map[insn_id] = frozenset(collected)
            return collected

        for insn in self.instructions:
            transitive_deps(insn.id)

        return dep_map

    # }}}

    # {{{ read and written variables

    @memoize_method
    def reader_map(self):
        """
        :return: a dict that maps the names of arguments and temporary
          variables to the set of ids of the insns that read them. Variables
          that are never read have no entry.
        """
        readable_vars = {arg.name for arg in self.args}
        readable_vars.update(self.temporary_variables)

        readers = {}
        for insn in self.instructions:
            for var_name in insn.read_dependency_names() & readable_vars:
                if var_name not in readers:
                    readers[var_name] = set()
                readers[var_name].add(insn.id)

        return readers

    @memoize_method
    def writer_map(self):
        """
        :return: a dict that maps variable names to the set of ids of the
            insns that have that variable among their assignees. Variables
            that are never written have no entry.
        """
        writers = {}
        for insn in self.instructions:
            for var_name in insn.assignee_var_names():
                if var_name not in writers:
                    writers[var_name] = set()
                writers[var_name].add(insn.id)

        return writers

    @memoize_method
    def get_read_variables(self):
        """Return a :class:`set` of the read dependency names of all
        instructions together with the parameter names of all domains.
        """
        read_vars = {
                name
                for insn in self.instructions
                for name in insn.read_dependency_names()}

        read_vars.update(
                name
                for domain in self.domains
                for name in domain.get_var_names(dim_type.param))

        return read_vars

    @memoize_method
    def get_written_variables(self):
        """Return the names of all variables assigned to by any instruction.

        If ``_cached_written_variables`` was supplied to the constructor,
        it is returned as-is; otherwise a :class:`frozenset` is computed
        from the instructions.
        """
        precomputed = self._cached_written_variables
        if precomputed is not None:
            return precomputed

        written = set()
        for insn in self.instructions:
            written.update(insn.assignee_var_names())
        return frozenset(written)

    @memoize_method
    def get_temporary_to_base_storage_map(self):
        """Return a :class:`dict` mapping the name of each temporary with a
        (truthy) ``base_storage`` to that ``base_storage``.
        """
        return {
                tv.name: tv.base_storage
                for tv in self.temporary_variables.values()
                if tv.base_storage}

    @memoize_method
    def get_unwritten_value_args(self):
        """Return a :class:`set` of the names of all
        :class:`loopy.kernel.data.ValueArg` arguments that no instruction
        writes to.
        """
        from loopy.kernel.data import ValueArg

        written = self.get_written_variables()

        unwritten = set()
        for arg in self.args:
            if isinstance(arg, ValueArg) and arg.name not in written:
                unwritten.add(arg.name)
        return unwritten

    # }}}

    # {{{ argument wrangling

    @property
    @memoize_method
    def arg_dict(self):
        """A :class:`dict` mapping argument names to arguments."""
        arg_by_name = {}
        for arg in self.args:
            arg_by_name[arg.name] = arg
        return arg_by_name

    @property
    @memoize_method
    def scalar_loop_args(self):
        """A list of the names of those
        :class:`loopy.kernel.data.ValueArg` arguments that occur as a
        parameter of at least one domain, in argument order.
        """
        from loopy.kernel.data import ValueArg

        if self.args is None:
            return []

        domain_param_names = {
                name
                for dom in self.domains
                for name in dom.get_var_names(dim_type.param)}

        return [
                arg.name for arg in self.args
                if isinstance(arg, ValueArg) and arg.name in domain_param_names]

    @memoize_method
    def global_var_names(self):
        """Return a :class:`set` of the names of all array arguments and
        temporary variables whose address space is
        ``AddressSpace.GLOBAL``.
        """
        from loopy.kernel.data import AddressSpace, ArrayArg

        names = set()

        for arg in self.args:
            if (isinstance(arg, ArrayArg)
                    and arg.address_space == AddressSpace.GLOBAL):
                names.add(arg.name)

        for tv in self.temporary_variables.values():
            if tv.address_space == AddressSpace.GLOBAL:
                names.add(tv.name)

        return names

    # }}}

    # {{{ bounds finding

    @memoize_method
    def get_iname_bounds(self, iname, constants_only=False):
        """Return a record with the bounds of *iname*, computed from its
        domain intersected with the kernel's assumptions.

        :arg constants_only: if true, all dimensions other than *iname* are
            projected out before the bounds are computed.
        :returns: a record with the attributes ``lower_bound_pw_aff``,
            ``upper_bound_pw_aff`` and ``size``, where ``size`` is
            ``upper - lower + 1``, simplified (via ``gist``) with respect to
            the assumptions.
        """
        domain = self.get_inames_domain(frozenset([iname]))

        # Only keep assumptions on parameters that occur in the domain.
        assumptions = self.assumptions.project_out_except(
                set(domain.get_var_dict(dim_type.param)), [dim_type.param])

        aligned_assumptions, domain = isl.align_two(assumptions, domain)

        dom_intersect_assumptions = aligned_assumptions & domain

        if constants_only:
            # Kill all variable dependencies
            dom_intersect_assumptions = dom_intersect_assumptions.project_out_except(
                    [iname], [dim_type.param, dim_type.set])

        # get_var_dict() values are indexed with [1] to get the dimension
        # index of the iname.
        iname_idx = dom_intersect_assumptions.get_var_dict()[iname][1]

        lower_bound_pw_aff = (
                self.cache_manager.dim_min(
                    dom_intersect_assumptions, iname_idx)
                .coalesce())
        upper_bound_pw_aff = (
                self.cache_manager.dim_max(
                    dom_intersect_assumptions, iname_idx)
                .coalesce())

        # A new record class is created on each (non-memoized) call.
        class BoundsRecord(ImmutableRecord):
            pass

        size = (upper_bound_pw_aff - lower_bound_pw_aff + 1)
        # Note: this uses the projected (not the aligned) assumptions.
        size = size.gist(assumptions)

        return BoundsRecord(
                lower_bound_pw_aff=lower_bound_pw_aff,
                upper_bound_pw_aff=upper_bound_pw_aff,
                size=size)

    @memoize_method
    def get_constant_iname_length(self, iname):
        """Return the static maximum of the size of *iname* as an :class:`int`.

        The size is taken from :meth:`get_iname_bounds` with
        ``constants_only=True``.
        """
        from loopy.isl_helpers import static_max_of_pw_aff
        from loopy.symbolic import aff_to_expr

        size_pw_aff = self.get_iname_bounds(iname, constants_only=True).size
        max_size_aff = static_max_of_pw_aff(size_pw_aff, constants_only=True)

        return int(aff_to_expr(max_size_aff))

    @memoize_method
    def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids,
            callables_table, ignore_auto=False):
        """
        Returns a tuple of (global_sizes, local_sizes), where global_sizes,
        local_sizes are the grid sizes accommodating all of *insn_ids*. The grid
        sizes are a dict from the axis index to the corresponding grid size.

        :arg insn_ids: the IDs of the instructions whose inames determine the
            grid sizes.
        :arg callables_table: used to look up the callables of resolved call
            instructions, which may impose hardware axis sizes of their own.
        :arg ignore_auto: if false, a :class:`RuntimeError` is raised when an
            iname carrying an automatic local tag is encountered.

        Entries of ``self.local_sizes`` override the computed local sizes for
        axes that are in use. Entries for axes beyond the ones in use are
        ignored, and a warning is issued for each of them.

        :raises RuntimeError: if an iname used by *insn_ids* is not part of
            the kernel's domains, or if an automatic local tag is found and
            *ignore_auto* is false.
        """
        all_inames_by_insns = set()
        for insn_id in insn_ids:
            all_inames_by_insns |= self.insn_inames(insn_id)

        if not all_inames_by_insns <= self.all_inames():
            raise RuntimeError("some inames collected from instructions (%s) "
                    "are not present in domain (%s)"
                    % (", ".join(sorted(all_inames_by_insns)),
                        ", ".join(sorted(self.all_inames()))))

        # {{{ include grid constraints due to callees

        global_sizes = {}
        local_sizes = {}

        from loopy.kernel.instruction import CallInstruction
        from loopy.symbolic import ResolvedFunction

        for insn in self.instructions:
            # TODO: This might be unsafe as call-sites must be resolved to get
            # any hardware axes size constraints they might impose. However,
            # transforms like 'precompute' use this method and callables might
            # not be resolved by then.
            if (isinstance(insn, CallInstruction)
                    and isinstance(insn.expression.function, ResolvedFunction)):

                clbl = callables_table[insn.expression.function.name]
                gsize, lsize = clbl.get_hw_axes_sizes(insn.arg_id_to_arg(),
                                                      self.assumptions.space,
                                                      callables_table)

                for tgt_dict, tgt_size in [(global_sizes, gsize),
                                            (local_sizes, lsize)]:

                    for iaxis, size in tgt_size.items():
                        if iaxis in tgt_dict:
                            tgt_dict[iaxis] = tgt_dict[iaxis].max(size)
                        else:
                            tgt_dict[iaxis] = size

        # }}}

        from loopy.kernel.data import (
                GroupInameTag, LocalInameTag,
                AutoLocalInameTagBase)
        from loopy.isl_helpers import static_max_of_pw_aff

        for iname in all_inames_by_insns:
            tags = self.iname_tags_of_type(
                    iname,
                    (AutoLocalInameTagBase, GroupInameTag, LocalInameTag), max_num=1)

            if not tags:
                continue

            tag, = tags

            if isinstance(tag, AutoLocalInameTagBase) and not ignore_auto:
                raise RuntimeError("cannot find grid sizes if automatic "
                        "local index tags are present")
            elif isinstance(tag, GroupInameTag):
                tgt_dict = global_sizes
            elif isinstance(tag, LocalInameTag):
                tgt_dict = local_sizes
            else:
                continue

            size = self.get_iname_bounds(iname).size

            if tag.axis in tgt_dict:
                size = tgt_dict[tag.axis].max(size)

            try:
                # insist block size is constant
                size_as_aff = static_max_of_pw_aff(size,
                        constants_only=isinstance(tag, LocalInameTag),
                        context=self.assumptions)
                size = isl.PwAff.from_aff(size_as_aff)
            except StaticValueFindingError:
                # No static bound available: keep the piecewise-affine size.
                pass

            tgt_dict[tag.axis] = size

        # {{{ override local_sizes with self.local_sizes

        # The loop below only overwrites existing keys, so the highest axis in
        # use is loop-invariant. Use -1 if no local axis is in use at all, so
        # that every forced size is reported as unused instead of max()
        # failing on an empty sequence.
        max_local_axis = max(local_sizes, default=-1)
        n_local_axes = max_local_axis + 1

        for i_lsize, lsize in self.local_sizes.items():
            if i_lsize <= max_local_axis:
                local_sizes[i_lsize] = lsize
            else:
                warn(f"Forced local sizes '{i_lsize}: {lsize}' is unused"
                     f" because kernel '{self.name}' uses {n_local_axes}"
                     " local hardware axes.")

        # }}}

        return global_sizes, local_sizes

    @memoize_method
    def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table,
            ignore_auto=False, return_dict=False):
        """Return a tuple (global_size, local_size) containing a grid that
        could accommodate execution of all instructions whose IDs are given
        in *insn_ids*.

        :arg insn_ids: a :class:`frozenset` of instruction IDs
        :arg return_dict: if true, *global_size* and *local_size* are
            returned as dicts mapping the axis index to the size instead of
            as tuples.

        *global_size* and *local_size* are :class:`islpy.PwAff` objects.

        If ``self.overridden_get_grid_sizes_for_insn_ids`` is set, it is
        called instead of computing the sizes from the kernel.

        :raises LoopyError: if, when building the tuples, an axis index is
            skipped (e.g. axis 1 is in use, but axis 0 is not).
        """

        if self.overridden_get_grid_sizes_for_insn_ids:
            gsize, lsize = self.overridden_get_grid_sizes_for_insn_ids(
                insn_ids,
                callables_table=callables_table,
                ignore_auto=ignore_auto)
            if return_dict:
                return dict(enumerate(gsize)), dict(enumerate(lsize))
            else:
                return gsize, lsize

        global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts(
                insn_ids, callables_table, ignore_auto=ignore_auto)

        if return_dict:
            return global_sizes, local_sizes

        def to_dim_tuple(size_dict, which):
            size_list = []

            for cur_axis in sorted(size_dict):
                # Axes are visited in increasing order, so the next expected
                # axis index is len(size_list). A larger index means that an
                # axis in between is unused.
                if cur_axis > len(size_list):
                    raise LoopyError("%s axis %d unused for %s" % (
                        which, len(size_list), self.name))

                size_list.append(size_dict[cur_axis])

            return tuple(size_list)

        return (to_dim_tuple(global_sizes, "global"),
                to_dim_tuple(local_sizes, "local"))

    @memoize_method
    def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids,
            callables_table, ignore_auto=False, return_dict=False):
        """Return a tuple (global_size, local_size) containing a grid that
        could accommodate execution of all instructions whose IDs are given
        in *insn_ids*.

        :arg insn_ids: a :class:`frozenset` of instruction IDs
        :arg return_dict: if true, both sizes are dicts keyed by axis index
            rather than tuples.

        *global_size* and *local_size* are :mod:`pymbolic` expressions
        """
        from loopy.symbolic import pw_aff_to_expr

        def to_expr(pw_aff):
            return pw_aff_to_expr(pw_aff, int_ok=True)

        global_size, local_size = self.get_grid_sizes_for_insn_ids(
                insn_ids, callables_table, ignore_auto, return_dict)

        if not return_dict:
            return (tuple(to_expr(size) for size in global_size),
                    tuple(to_expr(size) for size in local_size))

        return ({axis: to_expr(size) for axis, size in global_size.items()},
                {axis: to_expr(size) for axis, size in local_size.items()})

    def get_grid_size_upper_bounds(self, callables_table, ignore_auto=False,
            return_dict=False):
        """Return a tuple (global_size, local_size) containing a grid that
        could accommodate execution of *all* instructions in the kernel.

        This forwards to :meth:`get_grid_sizes_for_insn_ids` with the IDs of
        all instructions of the kernel.

        *global_size* and *local_size* are :class:`islpy.PwAff` objects.
        """
        all_insn_ids = frozenset(insn.id for insn in self.instructions)

        return self.get_grid_sizes_for_insn_ids(
                all_insn_ids, callables_table,
                ignore_auto=ignore_auto, return_dict=return_dict)

    def get_grid_size_upper_bounds_as_exprs(self, callables_table,
            ignore_auto=False, return_dict=False):
        """Return a tuple (global_size, local_size) containing a grid that
        could accommodate execution of *all* instructions in the kernel.

        This forwards to :meth:`get_grid_sizes_for_insn_ids_as_exprs` with the
        IDs of all instructions of the kernel.

        *global_size* and *local_size* are :mod:`pymbolic` expressions
        """
        all_insn_ids = frozenset(insn.id for insn in self.instructions)

        return self.get_grid_sizes_for_insn_ids_as_exprs(
                all_insn_ids, callables_table,
                ignore_auto=ignore_auto, return_dict=return_dict)

    # }}}

    # {{{ local memory

    @memoize_method
    def local_var_names(self):
        """Return the set of names of all temporary variables whose
        ``address_space`` is ``AddressSpace.LOCAL``.
        """
        from loopy.kernel.data import AddressSpace

        names = set()
        for temp_var in self.temporary_variables.values():
            if temp_var.address_space == AddressSpace.LOCAL:
                names.add(temp_var.name)

        return names

    def local_mem_use(self):
        """Return the sum of ``nbytes`` over all temporary variables whose
        ``address_space`` is ``AddressSpace.LOCAL``.
        """
        from loopy.kernel.data import AddressSpace

        local_temporaries = (
                temp_var
                for temp_var in self.temporary_variables.values()
                if temp_var.address_space == AddressSpace.LOCAL)

        return sum(temp_var.nbytes for temp_var in local_temporaries)

    # }}}

    # {{{ nosync sets

    @memoize_method
    def get_nosync_set(self, insn_id, scope):
        """Return a :class:`frozenset` of the IDs listed in the
        ``no_sync_with`` attribute of the instruction *insn_id* whose scope
        is either *scope* or ``"any"``.

        :arg scope: ``"local"`` or ``"global"``
        """
        assert scope in ("local", "global")

        accepted_scopes = (scope, "any")
        insn = self.id_to_insn[insn_id]

        return frozenset(
            other_id
            for other_id, other_scope in insn.no_sync_with
            if other_scope in accepted_scopes)

    # }}}

    # {{{ pretty-printing

    @memoize_method
    def _get_iname_order_for_printing(self):
        """Return the embedding computed by
        :func:`loopy.kernel.tools.get_visual_iname_order_embedding`.

        If that function raises :class:`ValueError`, a warning is issued and
        a mapping of every iname to itself is returned instead.
        """
        try:
            from loopy.kernel.tools import get_visual_iname_order_embedding
            return get_visual_iname_order_embedding(self)
        except ValueError:
            from loopy.diagnostic import warn_with_kernel
            warn_with_kernel(self,
                "iname-order",
                "get_visual_iname_order_embedding() could not determine a "
                "consistent iname nesting order. This is a possible indication "
                "that the kernel may not schedule successfully, but for now "
                "it only impacts printing of the kernel.")

        # Fallback: every iname is its own sort key.
        fallback = {}
        for iname in self.all_inames():
            fallback[iname] = iname

        return fallback

    def stringify(self, what=None, with_dependencies=False, use_separators=True,
            show_labels=True):
        """Return a human-readable, multi-line description of the kernel.

        :arg what: selects the sections to include. May be

            * *None*: all sections, where ``"Dependencies"`` is only included
              if *with_dependencies* is true,
            * an iterable of section names,
            * a string of comma-separated section names, or
            * a string without commas in which each character is the first
              letter of a section name.

            The section names are ``"name"``, ``"arguments"``, ``"domains"``,
            ``"tags"``, ``"variables"``, ``"rules"``, ``"instructions"``,
            ``"Dependencies"`` and ``"linearization"``.
        :arg with_dependencies: only consulted if *what* is *None*.
        :arg use_separators: whether to emit a line of dashes before each
            section and at the very end.
        :arg show_labels: whether to emit a heading line for each section.
            The ``KERNEL: ...`` line of the ``"name"`` section is emitted
            regardless.

        :raises LoopyError: if *what* contains an unknown section name.
        """
        all_what = {
            "name",
            "arguments",
            "domains",
            "tags",
            "variables",
            "rules",
            "instructions",
            "Dependencies",
            "linearization",
            }

        first_letter_to_what = {
                w[0]: w for w in all_what}
        # The single-letter shorthand only works if first letters are unique.
        assert len(first_letter_to_what) == len(all_what)

        if what is None:
            what = all_what.copy()
            if not with_dependencies:
                what.remove("Dependencies")
        elif isinstance(what, str):
            if "," in what:
                what = {s.strip() for s in what.split(",")}
            else:
                what = {
                        first_letter_to_what[w]
                        for w in what}
        else:
            # Accept any iterable of section names (list, tuple, frozenset...)
            what = set(what)

        if not (what <= all_what):
            raise LoopyError("invalid 'what' passed: %s"
                    % ", ".join(sorted(what-all_what)))

        lines = []

        kernel = self

        if use_separators:
            sep = [75*"-"]
        else:
            sep = []

        if "name" in what:
            lines.extend(sep)
            lines.append("KERNEL: " + kernel.name)

        if "arguments" in what:
            lines.extend(sep)
            if show_labels:
                lines.append("ARGUMENTS:")
            for arg_name in natsorted(kernel.arg_dict):
                lines.append(str(kernel.arg_dict[arg_name]))

        if "domains" in what:
            lines.extend(sep)
            if show_labels:
                lines.append("DOMAINS:")
            # Indent each domain by its nesting depth in the domain tree.
            for dom, parents in zip(kernel.domains, kernel.all_parents_per_domain()):
                lines.append(len(parents)*"  " + str(dom))

        if "tags" in what:
            lines.extend(sep)
            if show_labels:
                lines.append("INAME IMPLEMENTATION TAGS:")
            for iname in natsorted(kernel.all_inames()):
                tags = kernel.iname_tags(iname)

                if not tags:
                    tags_str = "None"
                else:
                    tags_str = ", ".join(str(tag) for tag in tags)

                lines.append(f"{iname}: {tags_str}")

        if "variables" in what and kernel.temporary_variables:
            lines.extend(sep)
            if show_labels:
                lines.append("TEMPORARIES:")
            for tv in natsorted(kernel.temporary_variables.values(),
                    key=lambda tv: tv.name):
                lines.append(str(tv))

        if "rules" in what and kernel.substitutions:
            lines.extend(sep)
            if show_labels:
                lines.append("SUBSTITUTION RULES:")
            for rule_name in natsorted(kernel.substitutions.keys()):
                lines.append(str(kernel.substitutions[rule_name]))

        if "instructions" in what:
            lines.extend(sep)
            if show_labels:
                lines.append("INSTRUCTIONS:")

            from loopy.kernel.tools import stringify_instruction_list
            lines.extend(stringify_instruction_list(kernel))

        if "Dependencies" in what:
            # Only build the dependency listing if it was actually requested.
            dep_lines = [
                    "{} : {}".format(insn.id, ",".join(insn.depends_on))
                    for insn in kernel.instructions
                    if insn.depends_on]

            if dep_lines:
                lines.extend(sep)
                if show_labels:
                    lines.append("DEPENDENCIES: "
                            "(use loopy.show_dependency_graph to visualize)")
                lines.extend(dep_lines)

        if "linearization" in what and kernel.linearization is not None:
            lines.extend(sep)
            if show_labels:
                lines.append("LINEARIZATION:")
            from loopy.schedule import dump_schedule
            lines.append(dump_schedule(kernel, kernel.linearization))

        lines.extend(sep)

        return "\n".join(lines)

    def __str__(self):
        """Return :meth:`stringify` called with its default arguments."""
        description = self.stringify()
        return description

    def __unicode__(self):
        """Return :meth:`stringify` called with its default arguments."""
        description = self.stringify()
        return description

    # }}}

    # {{{ implementation arguments

    @property
    @memoize_method
    def impl_arg_to_arg(self):
        """A dict mapping argument names to the kernel argument they belong
        to.

        For an :class:`loopy.kernel.array.ArrayBase` argument with known
        ``shape`` and ``dim_tags`` for which ``subscripts_and_names()`` does
        not return *None*, each of the names returned by that method is
        mapped to the argument. Every other argument is mapped under its own
        name.
        """
        from loopy.kernel.array import ArrayBase

        mapping = {}

        for arg in self.args:
            sub_args = None
            if (isinstance(arg, ArrayBase)
                    and arg.shape is not None
                    and arg.dim_tags is not None):
                sub_args = arg.subscripts_and_names()

            if sub_args is None:
                mapping[arg.name] = arg
            else:
                for _subscript, impl_name in sub_args:
                    mapping[impl_name] = arg

        return mapping

    # }}}

    # {{{ direct execution

    def __call__(self, *args, **kwargs):
        """
        Execute the :class:`LoopKernel`.

        Deprecated: issues a :class:`DeprecationWarning`, wraps the kernel
        using :func:`loopy.translation_unit.make_program` and calls the
        result with *args* and *kwargs*.
        """
        warn("Calling a LoopKernel is deprecated, call a TranslationUnit "
                "instead.", DeprecationWarning, stacklevel=2)

        from loopy.translation_unit import make_program
        return make_program(self)(*args, **kwargs)

    # }}}

    # {{{ pickling

    def __getstate__(self):
        """Return the pickling state as a tuple ``(attributes, hash_digest)``.

        *attributes* maps field names to values (without ``cache_manager``),
        and *hash_digest* is the kernel's cached persistent hash digest.
        """
        state = {}
        for field_name in self.__class__.fields:
            if hasattr(self, field_name):
                state[field_name] = getattr(self, field_name)

        # The cache manager is not pickled; __setstate__ creates a new one.
        state.pop("cache_manager", None)

        # Make the instructions lazily unpickling, to support faster
        # cache retrieval for execution.
        from loopy.kernel.instruction import _get_insn_eq_key, _get_insn_hash_key
        from loopy.tools import (
                LazilyUnpicklingListWithEqAndPersistentHashing as LazyList,
                LoopyKeyBuilder)

        state["instructions"] = LazyList(
                self.instructions,
                eq_key_getter=_get_insn_eq_key,
                persistent_hash_key_getter=_get_insn_hash_key)

        # Cache written variables to avoid having to unpickle instructions in
        # order to compute the written variables. This is needed on the
        # cache-to-execution path.
        state["_cached_written_variables"] = self.get_written_variables()

        # make sure that kernels are pickled with a cached hash key in place
        # (this must happen before the digest attribute is read below)
        LoopyKeyBuilder()(self)
        hash_digest = self._pytools_persistent_hash_digest

        return (state, hash_digest)

    def __setstate__(self, state):
        """Restore the kernel from *state*, a tuple
        ``(attributes, hash_digest)`` as returned by :meth:`__getstate__`.

        All attributes are set and registered as fields, the persistent hash
        digest is restored, and a fresh cache manager and an empty executor
        cache are created.
        """
        field_values, persistent_hash_digest = state

        for field_name, value in field_values.items():
            setattr(self, field_name, value)

        self.register_fields(set(field_values))

        if 0:
            # {{{ check that 'reconstituted' object has same hash

            from loopy.tools import LoopyKeyBuilder
            assert persistent_hash_digest == LoopyKeyBuilder()(self)

            # }}}

        self._pytools_persistent_hash_digest = persistent_hash_digest

        # These are not part of the pickled state and are created anew.
        from loopy.kernel.tools import SetOperationCacheManager
        self.cache_manager = SetOperationCacheManager()
        self._kernel_executor_cache = {}

    # }}}

    # {{{ persistent hash key generation / comparison

    # Names of the attributes listed for persistent hashing. They are also
    # the first fields compared by __eq__ (via comparison_fields below).
    hash_fields = (
            "domains",
            "instructions",
            "args",
            "linearization",
            "name",
            "preambles",
            "assumptions",
            "local_sizes",
            "temporary_variables",
            "inames",
            "substitutions",
            "iname_slab_increments",
            "loop_priority",
            "silenced_warnings",
            "options",
            "state",
            "target",
            )

    # Names of the attributes compared by __eq__: everything in hash_fields
    # plus a few fields that are deliberately left out of the hash.
    comparison_fields = hash_fields + (
            # Contains pymbolic expressions, hence a (small) headache to hash.
            # Likely not needed for hash uniqueness => headache avoided.
            "applied_iname_rewrites",

            # These are lists of functions. It's not clear how to
            # hash these correctly, so let's not attempt it. We'll
            # just assume that the rest of the hash is specific enough
            # that we won't have to rely on differences in these to
            # resolve hash conflicts.

            "preamble_generators",
            "symbol_manglers",
            )

    # Expose the module-level helper imported from loopy.tools as a method
    # of this class; __hash__ calls it as self.update_persistent_hash(...).
    update_persistent_hash = update_persistent_hash

    @memoize_method
    def __hash__(self):
        """Return the hash of the SHA-256 digest obtained by feeding the
        kernel through :meth:`update_persistent_hash`.
        """
        import hashlib
        from loopy.tools import LoopyKeyBuilder

        hasher = hashlib.sha256()
        self.update_persistent_hash(hasher, LoopyKeyBuilder())
        digest = hasher.digest()

        return hash(digest)

    def __eq__(self, other):
        """Return whether *other* is a :class:`LoopKernel` that agrees with
        *self* on every field in :attr:`comparison_fields`.

        ``domains`` and ``assumptions`` are compared using the isl equality
        predicates; all other fields are compared with ``!=``.
        """
        if self is other:
            return True

        if not isinstance(other, LoopKernel):
            return False

        def isl_objs_equal(obj_a, obj_b):
            # Try plain_is_equal first and fall back to is_equal.
            return obj_a.plain_is_equal(obj_b) or obj_a.is_equal(obj_b)

        for field_name in self.comparison_fields:
            if field_name == "domains":
                if len(self.domains) != len(other.domains):
                    return False

                domains_match = all(
                        isl_objs_equal(dom_a, dom_b)
                        for dom_a, dom_b in zip(self.domains, other.domains))
                if not domains_match:
                    return False

            elif field_name == "assumptions":
                if not isl_objs_equal(self.assumptions, other.assumptions):
                    return False

            elif getattr(self, field_name) != getattr(other, field_name):
                return False

        return True

    def __ne__(self, other):
        """Return the negation of :meth:`__eq__`."""
        is_equal = self.__eq__(other)
        return not is_equal

    # }}}

    def get_copy_kwargs(self, **kwargs):
        """Preprocess *kwargs* before handing them to the superclass'
        ``get_copy_kwargs``.

        * A deprecated ``iname_to_tags`` argument is converted to ``inames``.
        * If ``domains`` is passed, ``inames`` is rebuilt to contain exactly
          the inames found in the new domains.
        * If ``instructions`` is passed, the cached written variables are
          reset.

        :raises LoopyError: if both ``inames`` and ``iname_to_tags`` are
            passed.
        """
        if "iname_to_tags" in kwargs:
            if "inames" in kwargs:
                raise LoopyError("Cannot pass both `inames` and `iname_to_tags` to "
                        "LoopKernel.get_copy_kwargs")

            warn("Providing iname_to_tags is deprecated, pass inames instead. "
                    "Will be unsupported in 2022.",
                    DeprecationWarning, stacklevel=2)

            tags_by_iname = kwargs["iname_to_tags"]
            tagged_inames = {}
            for name in _get_inames_from_domains(
                    kwargs.get("domains", self.domains)):
                tagged_inames[name] = Iname(
                        name, tags_by_iname.get(name, frozenset()))

            kwargs["inames"] = tagged_inames
            del kwargs["iname_to_tags"]

        if "domains" in kwargs:
            known_inames = kwargs.get("inames", self.inames)
            new_domains = kwargs["domains"]

            # Keep known inames, create untagged ones for new names, and
            # drop everything that is no longer part of any domain.
            domain_inames = {}
            for name in _get_inames_from_domains(new_domains):
                domain_inames[name] = known_inames.get(
                        name, Iname(name, frozenset()))
            kwargs["inames"] = domain_inames

            assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT
                       for dom in new_domains)

        if "instructions" in kwargs:
            # Avoid carrying over an invalid cache when instructions are
            # modified.
            kwargs["_cached_written_variables"] = None

        return super().get_copy_kwargs(**kwargs)

    def copy(self, **kwargs):
        """Return a copy of the kernel with the fields in *kwargs* replaced.

        If ``schedule`` is passed, ``linearization`` is set to *None*. If
        ``tags`` is passed, it is normalized and checked for uniqueness.

        :raises LoopyError: if both ``inames`` and ``iname_to_tags``, or both
            ``schedule`` and ``linearization``, are passed.
        """
        from pytools.tag import normalize_tags, check_tag_uniqueness

        if "iname_to_tags" in kwargs and "inames" in kwargs:
            raise LoopyError("Cannot pass both `inames` and `iname_to_tags` to "
                    "LoopKernel.copy")

        if "schedule" in kwargs:
            if "linearization" in kwargs:
                raise LoopyError("Cannot pass both `schedule` and "
                                 "`linearization` to LoopKernel.copy")

            kwargs["linearization"] = None

        new_tags = kwargs.pop("tags", _not_provided)
        if new_tags is not _not_provided:
            normalized_tags = normalize_tags(new_tags)
            kwargs["tags"] = check_tag_uniqueness(normalized_tags)

        return super().copy(**kwargs)

# }}}

# vim: foldmethod=marker