I was looking to produce a hash for the source con...
# plugins
w
I was looking to produce a hash for the source contents of targets. Is there a smarter way to do this? What I don't like about this right now is that a change in our requirements.txt will produce a different hash for a target even if the requirement isn't used by the target (because all targets depend on it, but only use subsets)
Copy code
from enum import Enum
import hashlib

from pants.engine.addresses import Addresses
from pants.engine.addresses import BuildFileAddress
from pants.engine.console import Console
from pants.engine.fs import Digest
from pants.engine.fs import DigestContents
from pants.engine.fs import PathGlobs

from pants.build_graph.address import BuildFileAddressRequest

from pants.engine.goal import Goal
from pants.engine.goal import GoalSubsystem
from pants.engine.goal import LineOriented
from pants.engine.rules import Get
from pants.engine.rules import MultiGet
from pants.engine.rules import goal_rule

from pants.build_graph.address import BuildFileAddressRequest

from pants.engine.rules import collect_rules

from pants.engine.target import HydratedSources
from pants.engine.target import HydrateSourcesRequest
from pants.engine.target import SourcesField
from pants.engine.target import Target
from pants.engine.target import TransitiveTargets
from pants.engine.target import TransitiveTargetsRequest

from pants.option.option_types import EnumOption
from pants.util.strutil import softwrap

# Enum of the hashing algorithms hashlib reports for this interpreter: member
# names are the upper-cased algorithm names, member values are the names as
# hashlib lists them. hashlib.algorithms_available is a set, so the names are
# sorted to give the enum a deterministic member order from run to run.
# NOTE(review): variable-length digests (shake_*) need a length argument for
# hexdigest(); confirm whether they should be offered as choices.
HashAlgorithms = Enum(
    'HashAlgorithms',
    {alg.upper(): alg for alg in sorted(hashlib.algorithms_available)},
)


class HashTargetsSubsystem(LineOriented, GoalSubsystem):
    """Subsystem for the `hash` goal: its name, help text and `--algorithm` option."""

    name = "hash"
    help = "Computes hash from contents of all files a target depends on."

    # Hashing algorithm the goal uses; choices are the HashAlgorithms members
    # and SHA256 is the default.
    algorithm = EnumOption(
        flag_name="--algorithm",
        default=HashAlgorithms.SHA256,
        help=softwrap(
            """
            Specifies which hashing algorithm to use.
            """
        ),
    )


class HashTargets(Goal):
    """Goal type returned by the `hash_targets` rule; configured by HashTargetsSubsystem."""

    subsystem_cls = HashTargetsSubsystem


@goal_rule
async def hash_targets(
    console: Console,
    hash_targets_subsystem: HashTargetsSubsystem,
    addresses: Addresses,
) -> HashTargets:
    """Print one hash per requested address, covering everything it depends on.

    For each address the transitive closure of targets is resolved. The paths
    to hash are the `rel_path` of every target's BuildFileAddress plus every
    file in every target's hydrated sources snapshot. Those paths are read in
    sorted order and each file's path and content are fed to the hasher
    selected by `--algorithm`. The hex digest is written through the console.

    Returns:
        HashTargets with exit code 0.
    """
    for root_address in addresses:
        transitive_targets = await Get(
            TransitiveTargets,
            TransitiveTargetsRequest([root_address], include_special_cased_deps=True),
        )
        closure = transitive_targets.closure

        # presumably rel_path is the path of the BUILD file declaring the target,
        # so edits to target definitions change the hash too — TODO confirm
        resolved_build_files = await MultiGet(
            Get(
                BuildFileAddress,
                BuildFileAddressRequest(tgt.address, description_of_origin="CLI arguments"),
            )
            for tgt in closure
        )
        paths_to_hash = {resolved.rel_path for resolved in resolved_build_files}

        hydrated_per_target = await MultiGet(
            Get(HydratedSources, HydrateSourcesRequest(tgt.get(SourcesField))) for tgt in closure
        )
        for hydrated in hydrated_per_target:
            paths_to_hash.update(hydrated.snapshot.files)

        # Sorting the de-duplicated paths keeps the glob request independent of
        # set iteration order.
        digest = await Get(Digest, PathGlobs(sorted(paths_to_hash)))
        digest_contents = await Get(DigestContents, Digest, digest)

        hasher = hashlib.new(hash_targets_subsystem.algorithm.value)
        for entry in digest_contents:
            hasher.update(entry.path.encode())
            hasher.update(entry.content)

        with hash_targets_subsystem.line_oriented(console) as print_stdout:
            print_stdout(hasher.hexdigest())

    return HashTargets(exit_code=0)

def rules():
    """Return whatever `collect_rules()` gathers when called from this module."""
    collected = collect_rules()
    return collected
h
could you use the hash from the
Digest
? Use something like
Get(HydratedSources, HydrateSourcesRequest(tgt[SourcesField]))
, then
hydrated_sources.snapshot.digest.fingerprint
. It's a sha256
if you want per-file hashes, then request
Get(DigestEntries, Digest, my_digest)
, and it will give back a bunch of
FileEntry
objects, which you can then do
my_file_entry.file_digest.fingerprint
👍 1
w
Awesome, thanks
The digest of a python requirement target seems to be stable regardless of package version. Is this a sane addition?
Copy code
def hash_python_requirement_versions(targets: Iterable[Target]) -> set[str]:
    """Return one fingerprint per python requirement target found in *targets*.

    Targets that are not `PythonRequirementTarget` instances are skipped. Each
    fingerprint is the SHA-256 hex digest of the target's requirement strings,
    joined in their declared order. The built-in `hash()` is deliberately not
    used: string hashing is salted per interpreter process, so its result
    differs between runs.

    Args:
        targets: Targets to inspect.

    Returns:
        The set of hex-digest fingerprints, empty when no requirement targets
        are present.
    """
    fingerprints = set()
    for target in targets:
        if not isinstance(target, PythonRequirementTarget):
            continue
        # assumes str() of each requirement renders its full requirement string,
        # version specifier included — TODO confirm
        requirements = target.get(PythonRequirementsField).value or ()
        canonical = "\n".join(str(requirement) for requirement in requirements)
        fingerprints.add(hashlib.sha256(canonical.encode()).hexdigest())
    return fingerprints
Essentially, I added these to the set of fingerprints and hash the sum total:
Copy code
from __future__ import annotations

import hashlib
from collections.abc import Iterable
from enum import Enum

from pants.backend.python.target_types import PythonRequirementsField
from pants.backend.python.target_types import PythonRequirementTarget
from pants.engine.addresses import Addresses
from pants.engine.console import Console
from pants.engine.goal import Goal
from pants.engine.goal import GoalSubsystem
from pants.engine.goal import LineOriented
from pants.engine.rules import collect_rules
from pants.engine.rules import Get
from pants.engine.rules import goal_rule
from pants.engine.rules import MultiGet
from pants.engine.target import HydratedSources
from pants.engine.target import HydrateSourcesRequest
from pants.engine.target import SourcesField
from pants.engine.target import Target
from pants.engine.target import TransitiveTargets
from pants.engine.target import TransitiveTargetsRequest
from pants.option.option_types import EnumOption
from pants.util.strutil import softwrap

# Enum of the hashing algorithms hashlib reports for this interpreter: member
# names are the upper-cased algorithm names, member values are the names as
# hashlib lists them. hashlib.algorithms_available is a set, so the names are
# sorted to give the enum a deterministic member order from run to run.
# NOTE(review): variable-length digests (shake_*) need a length argument for
# hexdigest(); confirm whether they should be offered as choices.
HashAlgorithms = Enum(
    'HashAlgorithms',
    {alg.upper(): alg for alg in sorted(hashlib.algorithms_available)},
)


class HashTargetsSubsystem(LineOriented, GoalSubsystem):
    """Subsystem for the `hash` goal: its name, help text and `--algorithm` option."""

    name = "hash"
    help = "Computes hash from contents of all files a target depends on."

    # Hashing algorithm the goal uses; choices are the HashAlgorithms members
    # and SHA256 is the default.
    algorithm = EnumOption(
        flag_name="--algorithm",
        default=HashAlgorithms.SHA256,
        help=softwrap(
            """
            Specifies which hashing algorithm to use.
            """
        ),
    )


class HashTargets(Goal):
    """Goal type returned by the `hash_targets` rule; configured by HashTargetsSubsystem."""

    subsystem_cls = HashTargetsSubsystem


def hash_python_requirement_versions(targets: Iterable[Target]) -> set[str]:
    """Return one fingerprint per python requirement target found in *targets*.

    Targets that are not `PythonRequirementTarget` instances are skipped. Each
    fingerprint is the SHA-256 hex digest of the target's requirement strings,
    joined in their declared order. The built-in `hash()` is deliberately not
    used: string hashing is salted per interpreter process, so its result
    differs between runs.

    Args:
        targets: Targets to inspect.

    Returns:
        The set of hex-digest fingerprints, empty when no requirement targets
        are present.
    """
    fingerprints = set()
    for target in targets:
        if not isinstance(target, PythonRequirementTarget):
            continue
        # assumes str() of each requirement renders its full requirement string,
        # version specifier included — TODO confirm
        requirements = target.get(PythonRequirementsField).value or ()
        canonical = "\n".join(str(requirement) for requirement in requirements)
        fingerprints.add(hashlib.sha256(canonical.encode()).hexdigest())
    return fingerprints


@goal_rule
async def hash_targets(
    console: Console, hash_targets_subsystem: HashTargetsSubsystem, addresses: Addresses
) -> HashTargets:
    """Print one hash per requested address, built from its dependencies' fingerprints.

    For each address the transitive closure of targets is resolved and every
    target's sources are hydrated. The fingerprint of each sources snapshot
    digest, plus the fingerprints returned by
    `hash_python_requirement_versions`, are collected into a set. They are then
    fed in sorted order to the hasher selected by `--algorithm`, and the hex
    digest is written through the console.

    Returns:
        HashTargets with exit code 0.
    """
    for address in addresses:
        transitive_targets = await Get(
            TransitiveTargets, TransitiveTargetsRequest([address], include_special_cased_deps=True)
        )
        targets = transitive_targets.closure

        all_hydrated_sources = await MultiGet(
            Get(HydratedSources, HydrateSourcesRequest(target.get(SourcesField)))
            for target in targets
        )
        # Take each fingerprint as a whole string. Iterating over it would
        # yield its individual characters, collapsing the set to a handful of
        # hex digits.
        fingerprints = {
            hydrated_sources.snapshot.digest.fingerprint
            for hydrated_sources in all_hydrated_sources
        }
        fingerprints.update(hash_python_requirement_versions(targets))

        hash_content = hashlib.new(hash_targets_subsystem.algorithm.value)
        # Sorted so the result does not depend on set iteration order.
        for fingerprint in sorted(fingerprints):
            hash_content.update(fingerprint.encode())

        with hash_targets_subsystem.line_oriented(console) as print_stdout:
            print_stdout(hash_content.hexdigest())

    return HashTargets(exit_code=0)


def rules():
    """Return whatever `collect_rules()` gathers when called from this module."""
    collected = collect_rules()
    return collected
Some trivial tests seem to indicate it does what I want. (hash all source files except requirements.txt, hash it per
PythonRequirementTarget
, including the version string)
Nvm, the digests don't seem to be stable. I got "tricked" by the Pants cache.
h
PythonRequirementTarget
doesn't have a sources field, indeed, so you should analyze the
PythonRequirementsField
. But use
hashlib.sha256
rather than
hash()
. In Python,
hash()
is not stable across runs
👍 1