]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-101282: Apply BOLT optimizations to libpython for shared builds (#104709)
authorGregory Szorc <gregory.szorc@gmail.com>
Mon, 22 May 2023 11:45:20 +0000 (04:45 -0700)
committerGitHub <noreply@github.com>
Mon, 22 May 2023 11:45:20 +0000 (13:45 +0200)
Apply BOLT optimizations to libpython for shared builds. Most of the C
code is in libpython so it is critical to apply BOLT there to fully realize
BOLT benefits.

This change also reworks how BOLT instrumentation is applied. It
effectively removes the readelf based logic added in gh-101525 and
replaces it with a mechanism that saves a copy of the pre-bolt binary
and restores that copy when necessary. This allows us to perform BOLT
optimizations without having to manually delete the output binary to
force a new bolt run.

Also:
- add a clean-bolt target for purging BOLT files and hook that up to the
  clean target
- .gitignore BOLT related files

Before and after this refactor, `make` will no-op after a previous run.
Both versions should also share common make DAG deficiencies where
targets fail to trigger as often as they need to or can trigger
prematurely in certain scenarios. e.g. after this change you may need to
`rm profile-bolt-stamp` to force a BOLT run because there aren't
appropriate non-phony targets for BOLT's make target to depend on.

To make it easier to iterate on custom BOLT settings, the flags to pass
to instrumentation and application are now defined in configure and can
be overridden by passing BOLT_INSTRUMENT_FLAGS and BOLT_APPLY_FLAGS.

.gitignore
Doc/using/configure.rst
Makefile.pre.in
Misc/NEWS.d/next/Build/2023-05-20-16-09-59.gh-issue-101282.FvRARb.rst [new file with mode: 0644]
configure
configure.ac

index d9c4a7972f076d544e59aaba786300cf9e237f39..ef7642b09bc5d2aa65bb37ce3c889ccd7c818c03 100644 (file)
 *.gc??
 *.profclang?
 *.profraw
+# Copies of binaries before BOLT optimizations.
+*.prebolt
+# BOLT profile data.
+*.fdata
 *.dyn
 .gdb_history
 .purify
@@ -124,6 +128,7 @@ Tools/unicode/data/
 /platform
 /profile-clean-stamp
 /profile-run-stamp
+/profile-bolt-stamp
 /Python/deepfreeze/*.c
 /pybuilddir.txt
 /pyconfig.h
index ce858ab2c8d79efc8dcb077f00a1a2dad2465921..fbe280d641317053c4b76be12bddd3fcd51f7852 100644 (file)
@@ -314,6 +314,13 @@ also be used to improve performance.
    is dependent on a combination of the build environment + the other
    optimization configure args + the CPU architecture, and not all combinations
    are supported.
+   LLVM versions before 16 are known to crash BOLT under some scenarios.
+   Use of LLVM 16 or newer for BOLT optimization is strongly encouraged.
+
+   The :envvar:`!BOLT_INSTRUMENT_FLAGS` and :envvar:`!BOLT_APPLY_FLAGS`
+   :program:`configure` variables can be defined to override the default set of
+   arguments for :program:`llvm-bolt` to instrument and apply BOLT data to
+   binaries, respectively.
 
    .. versionadded:: 3.12
 
index da3a8f6c13f90b3b54bfa500808a4eb189c3ec43..eb79c9c4ca18016c7d6bb6e6a9e4fa7622473409 100644 (file)
@@ -672,21 +672,55 @@ profile-opt: profile-run-stamp
        -rm -f profile-clean-stamp
        $(MAKE) @DEF_MAKE_RULE@ CFLAGS_NODIST="$(CFLAGS_NODIST) $(PGO_PROF_USE_FLAG)" LDFLAGS_NODIST="$(LDFLAGS_NODIST)"
 
-.PHONY: bolt-opt
-bolt-opt: @PREBOLT_RULE@
+# List of binaries that BOLT runs on.
+# The profile-bolt-stamp recipe iterates over this list in each of its loops.
+BOLT_BINARIES := @BOLT_BINARIES@
+
+# Extra llvm-bolt arguments filled in by configure. The first is appended to
+# the instrumentation command and the second to the command that produces the
+# optimized binary in the profile-bolt-stamp recipe.
+BOLT_INSTRUMENT_FLAGS := @BOLT_INSTRUMENT_FLAGS@
+BOLT_APPLY_FLAGS := @BOLT_APPLY_FLAGS@
+
+.PHONY: clean-bolt
+clean-bolt:
+       # Profile data.
        rm -f *.fdata
-       @if $(READELF) -p .note.bolt_info $(BUILDPYTHON) | grep BOLT > /dev/null; then\
-               echo "skip: $(BUILDPYTHON) is already BOLTed."; \
-       else \
-               @LLVM_BOLT@ ./$(BUILDPYTHON) -instrument -instrumentation-file-append-pid -instrumentation-file=$(abspath $(BUILDPYTHON).bolt) -o $(BUILDPYTHON).bolt_inst; \
-               ./$(BUILDPYTHON).bolt_inst $(PROFILE_TASK) || true; \
-               @MERGE_FDATA@ $(BUILDPYTHON).*.fdata > $(BUILDPYTHON).fdata; \
-               @LLVM_BOLT@ ./$(BUILDPYTHON) -o $(BUILDPYTHON).bolt -data=$(BUILDPYTHON).fdata -update-debug-sections -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions -icf=1 -inline-all -split-eh -reorder-functions-use-hot-size -peepholes=none -jump-tables=aggressive -inline-ap -indirect-call-promotion=all -dyno-stats -use-gnu-stack -frame-opt=hot; \
-               rm -f *.fdata; \
-               rm -f $(BUILDPYTHON).bolt_inst; \
-               mv $(BUILDPYTHON).bolt $(BUILDPYTHON); \
-       fi
+       # Pristine binaries before BOLT optimization.
+       rm -f *.prebolt
+       # BOLT instrumented binaries.
+       rm -f *.bolt_inst
+
+# Instrument every binary in BOLT_BINARIES, run the profile task to collect
+# data, merge the data and write BOLT-optimized binaries in place.
+# Each loop runs in a single shell, so without an explicit "|| exit 1" only
+# the status of the last command of the last iteration would reach make and
+# the stamp could be touched after a failed step.
+profile-bolt-stamp: $(BUILDPYTHON)
+       # Ensure a pristine, pre-BOLT copy of the binary and no profile data from last run.
+       for bin in $(BOLT_BINARIES); do \
+         prebolt="$${bin}.prebolt"; \
+         if [ -e "$${prebolt}" ]; then \
+           echo "Restoring pre-BOLT binary $${prebolt}"; \
+           mv "$${prebolt}" "$${bin}" || exit 1; \
+         fi; \
+         cp "$${bin}" "$${prebolt}" || exit 1; \
+         rm -f $${bin}.bolt.*.fdata $${bin}.fdata; \
+       done
+       # Instrument each binary.
+       for bin in $(BOLT_BINARIES); do \
+         @LLVM_BOLT@ "$${bin}" -instrument -instrumentation-file-append-pid -instrumentation-file=$(abspath $${bin}.bolt) -o $${bin}.bolt_inst $(BOLT_INSTRUMENT_FLAGS) || exit 1; \
+         mv "$${bin}.bolt_inst" "$${bin}" || exit 1; \
+       done
+       # Run instrumented binaries to collect data.
+       # A failing profile task is tolerated on purpose; partial data is still usable.
+       $(RUNSHARED) ./$(BUILDPYTHON) $(PROFILE_TASK) || true
+       # Merge all the data files together.
+       for bin in $(BOLT_BINARIES); do \
+         @MERGE_FDATA@ $${bin}.*.fdata > "$${bin}.fdata" || exit 1; \
+         rm -f $${bin}.*.fdata; \
+       done
+       # Run bolt against the merged data to produce an optimized binary.
+       for bin in $(BOLT_BINARIES); do \
+         @LLVM_BOLT@ "$${bin}.prebolt" -o "$${bin}.bolt" -data="$${bin}.fdata" $(BOLT_APPLY_FLAGS) || exit 1; \
+         mv "$${bin}.bolt" "$${bin}" || exit 1; \
+       done
+       touch $@
 
+# Entry point for a BOLT build. The recipe runs two sub-makes one after the
+# other: first the pre-BOLT rule chosen by configure, then profile-bolt-stamp.
+.PHONY: bolt-opt
+bolt-opt:
+       $(MAKE) @PREBOLT_RULE@
+       $(MAKE) profile-bolt-stamp
 
 # Compile and run with gcov
 .PHONY: coverage
@@ -2623,10 +2657,11 @@ profile-removal:
        rm -f $(COVERAGE_INFO)
        rm -rf $(COVERAGE_REPORT)
        rm -f profile-run-stamp
+       rm -f profile-bolt-stamp
 
 .PHONY: clean
-clean: clean-retain-profile
-       @if test @DEF_MAKE_ALL_RULE@ = profile-opt; then \
+clean: clean-retain-profile clean-bolt
+       @if test @DEF_MAKE_ALL_RULE@ = profile-opt -o @DEF_MAKE_ALL_RULE@ = bolt-opt; then \
                rm -f profile-gen-stamp profile-clean-stamp; \
                $(MAKE) profile-removal; \
        fi
diff --git a/Misc/NEWS.d/next/Build/2023-05-20-16-09-59.gh-issue-101282.FvRARb.rst b/Misc/NEWS.d/next/Build/2023-05-20-16-09-59.gh-issue-101282.FvRARb.rst
new file mode 100644 (file)
index 0000000..cc70d47
--- /dev/null
@@ -0,0 +1,4 @@
+BOLT optimization is now applied to the libpython shared library if building
+a shared library. BOLT instrumentation and application settings can now be
+influenced via the ``BOLT_INSTRUMENT_FLAGS`` and ``BOLT_APPLY_FLAGS``
+configure variables.
index 7aad4fe89e3cbf20550a302997d3f54dd6a65162..2b863be108be26cb8759ce1a5f57976104d95558 100755 (executable)
--- a/configure
+++ b/configure
@@ -883,10 +883,11 @@ CFLAGS_NODIST
 BASECFLAGS
 CFLAGS_ALIASING
 OPT
+BOLT_APPLY_FLAGS
+BOLT_INSTRUMENT_FLAGS
+BOLT_BINARIES
 MERGE_FDATA
 LLVM_BOLT
-ac_ct_READELF
-READELF
 PREBOLT_RULE
 LLVM_PROF_FOUND
 LLVM_PROFDATA
@@ -1105,6 +1106,8 @@ CPPFLAGS
 CPP
 HOSTRUNNER
 PROFILE_TASK
+BOLT_INSTRUMENT_FLAGS
+BOLT_APPLY_FLAGS
 LIBUUID_CFLAGS
 LIBUUID_LIBS
 LIBFFI_CFLAGS
@@ -1916,6 +1919,10 @@ Some influential environment variables:
   HOSTRUNNER  Program to run CPython for the host platform
   PROFILE_TASK
               Python args for PGO generation task
+  BOLT_INSTRUMENT_FLAGS
+              Arguments to llvm-bolt when instrumenting binaries
+  BOLT_APPLY_FLAGS
+              Arguments to llvm-bolt when creating a BOLT optimized binary
   LIBUUID_CFLAGS
               C compiler flags for LIBUUID, overriding pkg-config
   LIBUUID_LIBS
@@ -8106,112 +8113,6 @@ if test "$Py_BOLT" = 'true' ; then
   DEF_MAKE_ALL_RULE="bolt-opt"
   DEF_MAKE_RULE="build_all"
 
-
-  if test -n "$ac_tool_prefix"; then
-  for ac_prog in readelf
-  do
-    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
-set dummy $ac_tool_prefix$ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_READELF+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$READELF"; then
-  ac_cv_prog_READELF="$READELF" # Let the user override the test.
-else
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_READELF="$ac_tool_prefix$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-  done
-IFS=$as_save_IFS
-
-fi
-fi
-READELF=$ac_cv_prog_READELF
-if test -n "$READELF"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $READELF" >&5
-$as_echo "$READELF" >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-fi
-
-
-    test -n "$READELF" && break
-  done
-fi
-if test -z "$READELF"; then
-  ac_ct_READELF=$READELF
-  for ac_prog in readelf
-do
-  # Extract the first word of "$ac_prog", so it can be a program name with args.
-set dummy $ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
-$as_echo_n "checking for $ac_word... " >&6; }
-if ${ac_cv_prog_ac_ct_READELF+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test -n "$ac_ct_READELF"; then
-  ac_cv_prog_ac_ct_READELF="$ac_ct_READELF" # Let the user override the test.
-else
-as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-for as_dir in $PATH
-do
-  IFS=$as_save_IFS
-  test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
-  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
-    ac_cv_prog_ac_ct_READELF="$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
-    break 2
-  fi
-done
-  done
-IFS=$as_save_IFS
-
-fi
-fi
-ac_ct_READELF=$ac_cv_prog_ac_ct_READELF
-if test -n "$ac_ct_READELF"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_READELF" >&5
-$as_echo "$ac_ct_READELF" >&6; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-fi
-
-
-  test -n "$ac_ct_READELF" && break
-done
-
-  if test "x$ac_ct_READELF" = x; then
-    READELF=""notfound""
-  else
-    case $cross_compiling:$ac_tool_warned in
-yes:)
-{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
-$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
-ac_tool_warned=yes ;;
-esac
-    READELF=$ac_ct_READELF
-  fi
-fi
-
-  if test "$READELF" == "notfound"
-  then
-    as_fn_error $? "readelf is required for a --enable-bolt build but could not be found." "$LINENO" 5
-  fi
-
   # -fno-reorder-blocks-and-partition is required for bolt to work.
   # Possibly GCC only.
   { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts -fno-reorder-blocks-and-partition" >&5
@@ -8474,6 +8375,36 @@ $as_echo "\"Found merge-fdata\"" >&6; }
   fi
 fi
 
+
+BOLT_BINARIES='$(BUILDPYTHON)'
+if test "x$enable_shared" = xyes; then :
+
+  BOLT_BINARIES="${BOLT_BINARIES} \$(INSTSONAME)"
+
+fi
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking BOLT_INSTRUMENT_FLAGS" >&5
+$as_echo_n "checking BOLT_INSTRUMENT_FLAGS... " >&6; }
+if test -z "${BOLT_INSTRUMENT_FLAGS}"
+then
+  BOLT_INSTRUMENT_FLAGS=
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $BOLT_INSTRUMENT_FLAGS" >&5
+$as_echo "$BOLT_INSTRUMENT_FLAGS" >&6; }
+
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking BOLT_APPLY_FLAGS" >&5
+$as_echo_n "checking BOLT_APPLY_FLAGS... " >&6; }
+if test -z "${BOLT_APPLY_FLAGS}"
+then
+  BOLT_APPLY_FLAGS=-update-debug-sections -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions -icf=1 -inline-all -split-eh -reorder-functions-use-hot-size -peepholes=none -jump-tables=aggressive -inline-ap -indirect-call-promotion=all -dyno-stats -use-gnu-stack -frame-opt=hot
+
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $BOLT_APPLY_FLAGS" >&5
+$as_echo "$BOLT_APPLY_FLAGS" >&6; }
+
 # XXX Shouldn't the code above that fiddles with BASECFLAGS and OPT be
 # merged with this chunk of code?
 
index 115998e0753b264c3bc3777a5381d59d31bf5eec..786d3414eb0e06a183856ce9d72160b78ab26eb5 100644 (file)
@@ -2028,13 +2028,6 @@ if test "$Py_BOLT" = 'true' ; then
   DEF_MAKE_ALL_RULE="bolt-opt"
   DEF_MAKE_RULE="build_all"
 
-  AC_SUBST(READELF)
-  AC_CHECK_TOOLS(READELF, [readelf], "notfound")
-  if test "$READELF" == "notfound"
-  then
-    AC_MSG_ERROR([readelf is required for a --enable-bolt build but could not be found.])
-  fi
-
   # -fno-reorder-blocks-and-partition is required for bolt to work.
   # Possibly GCC only.
   AX_CHECK_COMPILE_FLAG([-fno-reorder-blocks-and-partition],[
@@ -2067,6 +2060,54 @@ if test "$Py_BOLT" = 'true' ; then
   fi
 fi
 
+dnl Enable BOLT of libpython if built.
+dnl BOLT_BINARIES starts as the make reference $(BUILDPYTHON); when
+dnl enable_shared is "yes", $(INSTSONAME) is appended. The backslash keeps
+dnl the dollar sign literal inside the double-quoted shell string.
+AC_SUBST(BOLT_BINARIES)
+BOLT_BINARIES='$(BUILDPYTHON)'
+AS_VAR_IF([enable_shared], [yes], [
+  BOLT_BINARIES="${BOLT_BINARIES} \$(INSTSONAME)"
+])
+
+dnl Declare BOLT_INSTRUMENT_FLAGS as a user-settable configure variable.
+dnl When it is unset or empty the branch below assigns the empty string,
+dnl so no extra instrumentation arguments are added by default.
+AC_ARG_VAR(
+  [BOLT_INSTRUMENT_FLAGS],
+  [Arguments to llvm-bolt when instrumenting binaries]
+)
+AC_MSG_CHECKING([BOLT_INSTRUMENT_FLAGS])
+if test -z "${BOLT_INSTRUMENT_FLAGS}"
+then
+  BOLT_INSTRUMENT_FLAGS=
+fi
+AC_MSG_RESULT([$BOLT_INSTRUMENT_FLAGS])
+
+dnl Declare BOLT_APPLY_FLAGS as a user-settable configure variable and give
+dnl it a default set of llvm-bolt optimization arguments when it is empty.
+AC_ARG_VAR(
+  [BOLT_APPLY_FLAGS],
+  [Arguments to llvm-bolt when creating a BOLT optimized binary]
+)
+AC_MSG_CHECKING([BOLT_APPLY_FLAGS])
+if test -z "${BOLT_APPLY_FLAGS}"
+then
+  dnl The joined flags contain spaces, so the value is wrapped in double
+  dnl quotes. AS_VAR_SET emits the assignment as written, and an unquoted
+  dnl value would be parsed by the shell as an assignment plus a command.
+  AS_VAR_SET(
+    [BOLT_APPLY_FLAGS],
+    ["m4_join([ ],
+      [-update-debug-sections],
+      [-reorder-blocks=ext-tsp],
+      [-reorder-functions=hfsort+],
+      [-split-functions],
+      [-icf=1],
+      [-inline-all],
+      [-split-eh],
+      [-reorder-functions-use-hot-size],
+      [-peepholes=none],
+      [-jump-tables=aggressive],
+      [-inline-ap],
+      [-indirect-call-promotion=all],
+      [-dyno-stats],
+      [-use-gnu-stack],
+      [-frame-opt=hot]
+    )"]
+  )
+fi
+AC_MSG_RESULT([$BOLT_APPLY_FLAGS])
+
 # XXX Shouldn't the code above that fiddles with BASECFLAGS and OPT be
 # merged with this chunk of code?