bpo-45150: Add hashlib.file_digest() for efficient file hashing (GH-31930)

author Christian Heimes <christian@python.org>

Tue, 22 Mar 2022 09:37:00 +0000 (11:37 +0200)

committer GitHub <noreply@github.com>

Tue, 22 Mar 2022 09:37:00 +0000 (02:37 -0700)
author Christian Heimes <christian@python.org>
Tue, 22 Mar 2022 09:37:00 +0000 (11:37 +0200)
committer GitHub <noreply@github.com>
Tue, 22 Mar 2022 09:37:00 +0000 (02:37 -0700)
diff --git a/Doc/library/hashlib.rst b/Doc/library/hashlib.rst

index aa24131f8bf4442ebe5314765fc762019c4da859..da97b0e9a74d159afce296d7a6c8e79fac80752e 100644 (file)
--- a/Doc/library/hashlib.rst
+++ b/Doc/library/hashlib.rst
@@ -228,6 +228,49 @@ by the SHAKE algorithm.
     exchange the value safely in email or other non-binary environments.
  
  
+File hashing
+------------
+
+The hashlib module provides a helper function for efficient hashing of
+a file or file-like object.
+
+.. function:: file_digest(fileobj, digest, /)
+
+   Return a digest object that has been updated with the contents of *fileobj*.
+
+   *fileobj* must be a file-like object opened for reading in binary mode.
+   It accepts file objects from builtin :func:`open`, :class:`~io.BytesIO`
+   instances, SocketIO objects from :meth:`socket.socket.makefile`, and
+   similar. The function may bypass Python's I/O and use the file descriptor
+   from :meth:`~io.IOBase.fileno` directly. *fileobj* must be assumed to be
+   in an unknown state after this function returns or raises. It is up to
+   the caller to close *fileobj*.
+
+   *digest* must either be a hash algorithm name as a *str*, a hash
+   constructor, or a callable that returns a hash object.
+
+   Example:
+
+      >>> import io, hashlib, hmac
+      >>> with open(hashlib.__file__, "rb") as f:
+      ...     digest = hashlib.file_digest(f, "sha256")
+      ...
+      >>> digest.hexdigest()  # doctest: +ELLIPSIS
+      '...'
+
+      >>> buf = io.BytesIO(b"somedata")
+      >>> mac1 = hmac.HMAC(b"key", digestmod=hashlib.sha512)
+      >>> digest = hashlib.file_digest(buf, lambda: mac1)
+
+      >>> digest is mac1
+      True
+      >>> mac2 = hmac.HMAC(b"key", b"somedata", digestmod=hashlib.sha512)
+      >>> mac1.digest() == mac2.digest()
+      True
+
+   .. versionadded:: 3.11
+
+
  Key derivation
  --------------
  
diff --git a/Lib/hashlib.py b/Lib/hashlib.py

index 562501860a72b388fecc9d1d327794e761f2d92a..b546a3fd795311a1cd2772cb85edcc820f0cd5c3 100644 (file)
--- a/Lib/hashlib.py
+++ b/Lib/hashlib.py
@@ -65,7 +65,7 @@ algorithms_guaranteed = set(__always_supported)
  algorithms_available = set(__always_supported)
  
  __all__ = __always_supported + ('new', 'algorithms_guaranteed',
-                                'algorithms_available', 'pbkdf2_hmac')
+                                'algorithms_available', 'pbkdf2_hmac', 'file_digest')
  
  
  __builtin_constructor_cache = {}
@@ -254,6 +254,52 @@ except ImportError:
      pass
  
  
+def file_digest(fileobj, digest, /, *, _bufsize=2**18):
+    """Hash the contents of a file-like object. Returns a digest object.
+
+    *fileobj* must be a file-like object opened for reading in binary mode.
+    It accepts file objects from open(), io.BytesIO(), and SocketIO objects.
+    The function may bypass Python's I/O and use the file descriptor from
+    fileno() directly.
+
+    *digest* must either be a hash algorithm name as a *str*, a hash
+    constructor, or a callable that returns a hash object.
+    """
+    # On Linux we could use AF_ALG sockets and sendfile() to achieve zero-copy
+    # hashing with hardware acceleration.
+    if isinstance(digest, str):
+        digestobj = new(digest)
+    else:
+        digestobj = digest()
+
+    if hasattr(fileobj, "getbuffer"):
+        # io.BytesIO object, use zero-copy buffer
+        digestobj.update(fileobj.getbuffer())
+        return digestobj
+
+    # Only binary files implement readinto().
+    if not (
+        hasattr(fileobj, "readinto")
+        and hasattr(fileobj, "readable")
+        and fileobj.readable()
+    ):
+        raise ValueError(
+            f"'{fileobj!r}' is not a file-like object in binary reading mode."
+        )
+
+    # binary file, socket.SocketIO object
+    # Note: socket I/O uses different syscalls than file I/O.
+    buf = bytearray(_bufsize)  # Reusable buffer to reduce allocations.
+    view = memoryview(buf)
+    while True:
+        size = fileobj.readinto(buf)
+        if size == 0:
+            break  # EOF
+        digestobj.update(view[:size])
+
+    return digestobj
+
+
  for __func_name in __always_supported:
      # try them all, some may not work due to the OpenSSL
      # version not supporting that algorithm.
diff --git a/Lib/test/test_hashlib.py b/Lib/test/test_hashlib.py

index ea31f8be2cb82b529c2a0e5124d80d6f34935c37..daf6e3862a24f72a19aa7d77192b07573e05c50e 100644 (file)
--- a/Lib/test/test_hashlib.py
+++ b/Lib/test/test_hashlib.py
@@ -10,6 +10,7 @@ import array
  from binascii import unhexlify
  import hashlib
  import importlib
+import io
  import itertools
  import os
  import sys
@@ -20,6 +21,7 @@ import warnings
  from test import support
  from test.support import _4G, bigmemtest
  from test.support.import_helper import import_fresh_module
+from test.support import os_helper
  from test.support import threading_helper
  from test.support import warnings_helper
  from http.client import HTTPException
@@ -371,6 +373,31 @@ class HashLibTestCase(unittest.TestCase):
              if not shake:
                  self.assertEqual(len(digest), m.digest_size)
  
+        if not shake and kwargs.get("key") is None:
+            # skip shake and blake2 extended parameter tests
+            self.check_file_digest(name, data, hexdigest)
+
+    def check_file_digest(self, name, data, hexdigest):
+        hexdigest = hexdigest.lower()
+        digests = [name]
+        digests.extend(self.constructors_to_test[name])
+
+        with open(os_helper.TESTFN, "wb") as f:
+            f.write(data)
+
+        try:
+            for digest in digests:
+                buf = io.BytesIO(data)
+                buf.seek(0)
+                self.assertEqual(
+                    hashlib.file_digest(buf, digest).hexdigest(), hexdigest
+                )
+                with open(os_helper.TESTFN, "rb") as f:
+                    digestobj = hashlib.file_digest(f, digest)
+                self.assertEqual(digestobj.hexdigest(), hexdigest)
+        finally:
+            os.unlink(os_helper.TESTFN)
+
      def check_no_unicode(self, algorithm_name):
          # Unicode objects are not allowed as input.
          constructors = self.constructors_to_test[algorithm_name]
@@ -1117,6 +1144,33 @@ class KDFTests(unittest.TestCase):
          self.assertNotIn("blake2b512", hashlib.algorithms_available)
          self.assertNotIn("sha3-512", hashlib.algorithms_available)
  
+    def test_file_digest(self):
+        data = b'a' * 65536
+        d1 = hashlib.sha256()
+        self.addCleanup(os.unlink, os_helper.TESTFN)
+        with open(os_helper.TESTFN, "wb") as f:
+            for _ in range(10):
+                d1.update(data)
+                f.write(data)
+
+        with open(os_helper.TESTFN, "rb") as f:
+            d2 = hashlib.file_digest(f, hashlib.sha256)
+
+        self.assertEqual(d1.hexdigest(), d2.hexdigest())
+        self.assertEqual(d1.name, d2.name)
+        self.assertIs(type(d1), type(d2))
+
+        with self.assertRaises(ValueError):
+            hashlib.file_digest(None, "sha256")
+
+        with self.assertRaises(ValueError):
+            with open(os_helper.TESTFN, "r") as f:
+                hashlib.file_digest(f, "sha256")
+
+        with self.assertRaises(ValueError):
+            with open(os_helper.TESTFN, "wb") as f:
+                hashlib.file_digest(f, "sha256")
+
  
  if __name__ == "__main__":
      unittest.main()
diff --git a/Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst b/Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst

new file mode 100644 (file)

index 0000000..1c6ea5a
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst
@@ -0,0 +1 @@
+Add :func:`hashlib.file_digest` helper for efficient hashing of a file object.
author	Christian Heimes <christian@python.org>
	Tue, 22 Mar 2022 09:37:00 +0000 (11:37 +0200)
committer	GitHub <noreply@github.com>
	Tue, 22 Mar 2022 09:37:00 +0000 (02:37 -0700)
Doc/library/hashlib.rst		patch \| blob \| blame \| history
Lib/hashlib.py		patch \| blob \| blame \| history
Lib/test/test_hashlib.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2022-03-16-11-52-52.bpo-45150.kYbIME.rst	[new file with mode: 0644]	patch \| blob