git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-113257: Automatically generate pip SBOM metadata from wheel (#113295)
author: Seth Michael Larson <sethmichaellarson@gmail.com>
Wed, 20 Dec 2023 17:28:20 +0000 (11:28 -0600)
committer: GitHub <noreply@github.com>
Wed, 20 Dec 2023 17:28:20 +0000 (17:28 +0000)
Co-authored-by: Hugo van Kemenade <hugovk@users.noreply.github.com>
Misc/sbom.spdx.json
Tools/build/generate_sbom.py

index 81f8486ea350c1b71481caaca4dd96d756a9d492..5b3cd04ffa7f74bae0d0f850453c6977c40e4cf6 100644 (file)
       "checksums": [
         {
           "algorithm": "SHA256",
-          "checksumValue": "7ccf472345f20d35bdc9d1841ff5f313260c2c33fe417f48c30ac46cccabf5be"
+          "checksumValue": "5052d7889c1f9d05224cd41741acb7c5d6fa735ab34e339624a614eaaa7e7d76"
         }
       ],
       "downloadLocation": "https://files.pythonhosted.org/packages/15/aa/3f4c7bcee2057a76562a5b33ecbd199be08cdb4443a02e26bd2c3cf6fc39/pip-23.3.2-py3-none-any.whl",
index c02eb88b46532f3ead53e76ddf5e622d17dacf0e..93d0d8a3762df3f09935b059edacae6362c42a40 100644 (file)
@@ -1,12 +1,16 @@
 """Tool for generating Software Bill of Materials (SBOM) for Python's dependencies"""
-
+import os
 import re
 import hashlib
 import json
 import glob
 import pathlib
 import subprocess
+import sys
 import typing
+from urllib.request import urlopen
+
+CPYTHON_ROOT_DIR = pathlib.Path(__file__).parent.parent.parent
 
 # Before adding a new entry to this list, double check that
 # the license expression is a valid SPDX license expression:
@@ -43,15 +47,14 @@ class PackageFiles(typing.NamedTuple):
 # values to 'exclude' if we create new files within tracked
 # directories that aren't sourced from third-party packages.
 PACKAGE_TO_FILES = {
+    # NOTE: pip's entry in this structure is automatically generated in
+    # the 'discover_pip_sbom_package()' function below.
     "mpdecimal": PackageFiles(
         include=["Modules/_decimal/libmpdec/**"]
     ),
     "expat": PackageFiles(
         include=["Modules/expat/**"]
     ),
-    "pip": PackageFiles(
-        include=["Lib/ensurepip/_bundled/pip-23.3.2-py3-none-any.whl"]
-    ),
     "macholib": PackageFiles(
         include=["Lib/ctypes/macholib/**"],
         exclude=[
@@ -106,13 +109,106 @@ def filter_gitignored_paths(paths: list[str]) -> list[str]:
     return sorted([line.split()[-1] for line in git_check_ignore_lines if line.startswith("::")])
 
 
+def discover_pip_sbom_package(sbom_data: dict[str, typing.Any]) -> None:
+    """Regenerate pip's SBOM package entry from the bundled pip wheel.
+
+    Registers the single 'pip-*' wheel found in 'Lib/ensurepip/_bundled' in
+    PACKAGE_TO_FILES, verifies its SHA-256 against PyPI's release metadata and
+    replaces the "pip" package in 'sbom_data' (modified in place). Exits with
+    status 1 if there isn't exactly one wheel or the PyPI lookup/check fails.
+    """
+    ensurepip_bundled_dir = CPYTHON_ROOT_DIR / "Lib/ensurepip/_bundled"
+    pip_wheels = []
+
+    # Find the hopefully one pip wheel in the bundled directory.
+    for wheel_filename in os.listdir(ensurepip_bundled_dir):
+        if wheel_filename.startswith("pip-"):
+            pip_wheels.append(wheel_filename)
+    if len(pip_wheels) != 1:
+        print("Zero or multiple pip wheels detected in 'Lib/ensurepip/_bundled'")
+        sys.exit(1)
+    pip_wheel_filename = pip_wheels[0]
+
+    # Add the wheel filename to the list of files so the SBOM file
+    # and relationship generator can work its magic on the wheel too.
+    PACKAGE_TO_FILES["pip"] = PackageFiles(
+        include=[f"Lib/ensurepip/_bundled/{pip_wheel_filename}"]
+    )
+
+    # Wheel filename format puts the version right after the project name.
+    pip_version = pip_wheel_filename.split("-")[1]
+    pip_checksum_sha256 = hashlib.sha256(
+        (ensurepip_bundled_dir / pip_wheel_filename).read_bytes()
+    ).hexdigest()
+
+    # Get pip's download location from PyPI. Check that the checksum is correct too.
+    try:
+        with urlopen(f"https://pypi.org/pypi/pip/{pip_version}/json") as response:
+            pip_release_metadata = json.loads(response.read())
+        url: dict[str, typing.Any]
+
+        # Look for a matching artifact filename and then check
+        # its remote checksum to the local one.
+        for url in pip_release_metadata["urls"]:
+            if url["filename"] == pip_wheel_filename:
+                break
+        else:
+            raise ValueError(f"No matching filename on PyPI for '{pip_wheel_filename}'")
+        if url["digests"]["sha256"] != pip_checksum_sha256:
+            raise ValueError("Local pip checksum doesn't match artifact on PyPI")
+
+        # Successfully found the download URL for the matching artifact.
+        pip_download_url = url["url"]
+    except (OSError, ValueError) as e:
+        print(f"Couldn't fetch pip's metadata from PyPI: {e}")
+        sys.exit(1)
+
+    # Remove pip from the existing SBOM packages if it's there
+    # and then overwrite its entry with our own generated one.
+    sbom_data["packages"] = [
+        sbom_package
+        for sbom_package in sbom_data["packages"]
+        if sbom_package["name"] != "pip"
+    ]
+    sbom_data["packages"].append(
+        {
+            "SPDXID": spdx_id("SPDXRef-PACKAGE-pip"),
+            "name": "pip",
+            "versionInfo": pip_version,
+            "originator": "Organization: Python Packaging Authority",
+            "licenseConcluded": "MIT",
+            "downloadLocation": pip_download_url,
+            "checksums": [
+                {"algorithm": "SHA256", "checksumValue": pip_checksum_sha256}
+            ],
+            "externalRefs": [
+                {
+                    "referenceCategory": "SECURITY",
+                    "referenceLocator": f"cpe:2.3:a:pypa:pip:{pip_version}:*:*:*:*:*:*:*",
+                    "referenceType": "cpe23Type",
+                },
+                {
+                    "referenceCategory": "PACKAGE_MANAGER",
+                    "referenceLocator": f"pkg:pypi/pip@{pip_version}",
+                    "referenceType": "purl",
+                },
+            ],
+            "primaryPackagePurpose": "SOURCE",
+        }
+    )
+
+
 def main() -> None:
-    root_dir = pathlib.Path(__file__).parent.parent.parent
-    sbom_path = root_dir / "Misc/sbom.spdx.json"
+    sbom_path = CPYTHON_ROOT_DIR / "Misc/sbom.spdx.json"
     sbom_data = json.loads(sbom_path.read_bytes())
 
-    # Make a bunch of assertions about the SBOM data to ensure it's consistent.
+    # Insert pip's SBOM metadata from the wheel.
+    discover_pip_sbom_package(sbom_data)
+
+    # Ensure all packages in this tool are represented also in the SBOM file.
     assert {package["name"] for package in sbom_data["packages"]} == set(PACKAGE_TO_FILES)
+
+    # Make a bunch of assertions about the SBOM data to ensure it's consistent.
     for package in sbom_data["packages"]:
 
         # Properties and ID must be properly formed.
@@ -138,17 +234,17 @@ def main() -> None:
         for include in sorted(files.include):
 
             # Find all the paths and then filter them through .gitignore.
-            paths = glob.glob(include, root_dir=root_dir, recursive=True)
+            paths = glob.glob(include, root_dir=CPYTHON_ROOT_DIR, recursive=True)
             paths = filter_gitignored_paths(paths)
             assert paths, include  # Make sure that every value returns something!
 
             for path in paths:
                 # Skip directories and excluded files
-                if not (root_dir / path).is_file() or path in exclude:
+                if not (CPYTHON_ROOT_DIR / path).is_file() or path in exclude:
                     continue
 
                 # SPDX requires SHA1 to be used for files, but we provide SHA256 too.
-                data = (root_dir / path).read_bytes()
+                data = (CPYTHON_ROOT_DIR / path).read_bytes()
                 checksum_sha1 = hashlib.sha1(data).hexdigest()
                 checksum_sha256 = hashlib.sha256(data).hexdigest()