]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
GH-113464: Add a JIT backend for tier 2 (GH-113465)
authorBrandt Bucher <brandtbucher@microsoft.com>
Mon, 29 Jan 2024 02:48:48 +0000 (18:48 -0800)
committerGitHub <noreply@github.com>
Mon, 29 Jan 2024 02:48:48 +0000 (18:48 -0800)
Add an option (--enable-experimental-jit for configure-based builds
or --experimental-jit for PCbuild-based ones) to build an
*experimental* just-in-time compiler, based on copy-and-patch (https://fredrikbk.com/publications/copy-and-patch.pdf).

See Tools/jit/README.md for more information on how to install the required build-time tooling.

29 files changed:
.github/workflows/jit.yml [new file with mode: 0644]
.github/workflows/mypy.yml
.gitignore
Include/cpython/optimizer.h
Include/internal/pycore_jit.h [new file with mode: 0644]
Include/internal/pycore_object.h
Makefile.pre.in
Misc/NEWS.d/next/Core and Builtins/2023-12-24-03-25-28.gh-issue-113464.dvjQmA.rst [new file with mode: 0644]
PCbuild/_freeze_module.vcxproj
PCbuild/_freeze_module.vcxproj.filters
PCbuild/build.bat
PCbuild/pythoncore.vcxproj
PCbuild/pythoncore.vcxproj.filters
PCbuild/regen.targets
Python/ceval.c
Python/jit.c [new file with mode: 0644]
Python/optimizer.c
Python/pylifecycle.c
Tools/jit/README.md [new file with mode: 0644]
Tools/jit/_llvm.py [new file with mode: 0644]
Tools/jit/_schema.py [new file with mode: 0644]
Tools/jit/_stencils.py [new file with mode: 0644]
Tools/jit/_targets.py [new file with mode: 0644]
Tools/jit/_writer.py [new file with mode: 0644]
Tools/jit/build.py [new file with mode: 0644]
Tools/jit/mypy.ini [new file with mode: 0644]
Tools/jit/template.c [new file with mode: 0644]
configure
configure.ac

diff --git a/.github/workflows/jit.yml b/.github/workflows/jit.yml
new file mode 100644 (file)
index 0000000..e137fd2
--- /dev/null
@@ -0,0 +1,112 @@
+# Build CPython with the experimental JIT enabled and run the test suite for
+# each target listed in the matrix, in both debug and release configurations.
+name: JIT
+on:
+  # Only run for changes to paths matching '**jit**':
+  pull_request:
+    paths: '**jit**'
+  push:
+    paths: '**jit**'
+  # Allow the workflow to be started manually:
+  workflow_dispatch:
+jobs:
+  jit:
+    name: ${{ matrix.target }} (${{ matrix.debug && 'Debug' || 'Release' }})
+    runs-on: ${{ matrix.runner }}
+    strategy:
+      # Let the remaining matrix jobs keep running when one of them fails:
+      fail-fast: false
+      matrix:
+        # One job is created per (target, debug, llvm) combination. The
+        # "include" entries below attach per-target settings to those jobs.
+        target:
+          - i686-pc-windows-msvc/msvc
+          - x86_64-pc-windows-msvc/msvc
+          - x86_64-apple-darwin/clang
+          - x86_64-unknown-linux-gnu/gcc
+          - x86_64-unknown-linux-gnu/clang
+          - aarch64-unknown-linux-gnu/gcc
+          - aarch64-unknown-linux-gnu/clang
+        debug:
+          - true
+          - false
+        # LLVM version installed by each platform's build step:
+        llvm:
+          - 16
+        # Per-target settings used by the steps below:
+        #   architecture: passed to the build scripts (-p on Windows, and used
+        #                 to pick the native or emulated Linux step)
+        #   runner:       the "runs-on" image
+        #   compiler:     exported as CC
+        #   exclude:      optional, space-separated tests passed to --exclude
+        include:
+          - target: i686-pc-windows-msvc/msvc
+            architecture: Win32
+            runner: windows-latest
+            compiler: msvc
+          - target: x86_64-pc-windows-msvc/msvc
+            architecture: x64
+            runner: windows-latest
+            compiler: msvc
+          - target: x86_64-apple-darwin/clang
+            architecture: x86_64
+            runner: macos-latest
+            compiler: clang
+            exclude: test_embed
+          - target: x86_64-unknown-linux-gnu/gcc
+            architecture: x86_64
+            runner: ubuntu-latest
+            compiler: gcc
+          - target: x86_64-unknown-linux-gnu/clang
+            architecture: x86_64
+            runner: ubuntu-latest
+            compiler: clang
+          - target: aarch64-unknown-linux-gnu/gcc
+            architecture: aarch64
+            runner: ubuntu-latest
+            compiler: gcc
+            # These fail because of emulation, not because of the JIT:
+            exclude: test_unix_events test_init test_process_pool test_shutdown test_multiprocessing_fork test_cmd_line test_faulthandler test_os test_perf_profiler test_posix test_signal test_socket test_subprocess test_threading test_venv
+          - target: aarch64-unknown-linux-gnu/clang
+            architecture: aarch64
+            runner: ubuntu-latest
+            compiler: clang
+            # These fail because of emulation, not because of the JIT:
+            exclude: test_unix_events test_init test_process_pool test_shutdown test_multiprocessing_fork test_cmd_line test_faulthandler test_os test_perf_profiler test_posix test_signal test_socket test_subprocess test_threading test_venv
+    env:
+      CC: ${{ matrix.compiler }}
+    steps:
+      - uses: actions/checkout@v4
+      # Set up the Python 3.11 interpreter requested below before building:
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      # Install LLVM with Chocolatey, build with PCbuild (debug or PGO), then
+      # run the test suite with the freshly built interpreter.
+      - name: Windows
+        if: runner.os == 'Windows'
+        # NOTE: "matrix.debug && '-d'" alone evaluates to false for release
+        # builds, and false is interpolated as the literal text "false". The
+        # trailing "|| ''" makes the release case expand to nothing instead.
+        run: |
+          choco install llvm --allow-downgrade --no-progress --version ${{ matrix.llvm }}
+          ./PCbuild/build.bat --experimental-jit ${{ matrix.debug && '-d' || '--pgo' }} -p ${{ matrix.architecture }}
+          ./PCbuild/rt.bat ${{ matrix.debug && '-d' || '' }} -p ${{ matrix.architecture }} -q --exclude ${{ matrix.exclude }} --multiprocess 0 --timeout 3600 --verbose2 --verbose3
+
+      # Install LLVM with Homebrew, point SDKROOT at the active SDK, then
+      # configure (debug, or optimized with LTO), build, and run the tests.
+      - name: macOS
+        if: runner.os == 'macOS'
+        run: |
+          brew install llvm@${{ matrix.llvm }}
+          export SDKROOT="$(xcrun --show-sdk-path)"
+          ./configure --enable-experimental-jit ${{ matrix.debug && '--with-pydebug' || '--enable-optimizations --with-lto' }}
+          make all --jobs 3
+          ./python.exe -m test --exclude ${{ matrix.exclude }} --multiprocess 0 --timeout 3600 --verbose2 --verbose3
+
+      # x86_64 Linux: install LLVM from apt.llvm.org, put its bin directory
+      # first on PATH, then configure (debug, or optimized with LTO), build,
+      # and run the tests directly on the runner.
+      - name: Native Linux
+        if: runner.os == 'Linux' && matrix.architecture == 'x86_64'
+        run: |
+          sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" ./llvm.sh ${{ matrix.llvm }}
+          export PATH="$(llvm-config-${{ matrix.llvm }} --bindir):$PATH"
+          ./configure --enable-experimental-jit ${{ matrix.debug && '--with-pydebug' || '--enable-optimizations --with-lto' }}
+          make all --jobs 4
+          ./python -m test --exclude ${{ matrix.exclude }} --multiprocess 0 --timeout 3600 --verbose2 --verbose3
+      # Non-x86_64 Linux targets are cross-compiled on the x86_64 runner and
+      # run under QEMU user-mode emulation. The script, in order:
+      #   1. Installs LLVM and puts its bin directory first on PATH.
+      #   2. Builds and installs a native Python into ../build (passed later
+      #      as --with-build-python), then cleans the tree.
+      #   3. Installs the cross GCC for $HOST plus qemu-user.
+      #   4. For release builds with clang only, runs a native
+      #      "--enable-optimizations" configure and "make profile-run-stamp"
+      #      (presumably so profile data is collected natively rather than
+      #      under emulation - TODO confirm). For other jobs these two lines
+      #      expand to nothing.
+      #   5. Exports the cross compiler (CC/CPP), the QEMU runner
+      #      (HOSTRUNNER), and QEMU_LD_PREFIX for the target.
+      #   6. Configures the cross build with the JIT enabled, builds it, and
+      #      runs the test suite through ./python.
+      - name: Emulated Linux
+        if: runner.os == 'Linux' && matrix.architecture != 'x86_64'
+        run: |
+          sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" ./llvm.sh ${{ matrix.llvm }}
+          export PATH="$(llvm-config-${{ matrix.llvm }} --bindir):$PATH"
+          ./configure --prefix="$(pwd)/../build"
+          make install --jobs 4
+          make clean --jobs 4
+          export HOST=${{ matrix.architecture }}-linux-gnu
+          sudo apt install --yes "gcc-$HOST" qemu-user
+          ${{ !matrix.debug && matrix.compiler == 'clang' && './configure --enable-optimizations' || '' }}
+          ${{ !matrix.debug && matrix.compiler == 'clang' && 'make profile-run-stamp --jobs 4' || '' }}
+          export CC="${{ matrix.compiler == 'clang' && 'clang --target=$HOST' || '$HOST-gcc' }}"
+          export CPP="$CC --preprocess"
+          export HOSTRUNNER=qemu-${{ matrix.architecture }}
+          export QEMU_LD_PREFIX="/usr/$HOST"
+          ./configure --enable-experimental-jit ${{ matrix.debug && '--with-pydebug' || '--enable-optimizations --with-lto' }} --build=x86_64-linux-gnu --host="$HOST" --with-build-python=../build/bin/python3 --with-pkg-config=no ac_cv_buggy_getaddrinfo=no ac_cv_file__dev_ptc=no ac_cv_file__dev_ptmx=yes
+          make all --jobs 4
+          ./python -m test --exclude ${{ matrix.exclude }} --multiprocess 0 --timeout 3600 --verbose2 --verbose3
index 11928e72b9b43a187425e309ae113e141fd49cba..b766785de405d2efe015ae7c105cf198f47e30d5 100644 (file)
@@ -12,6 +12,7 @@ on:
       - "Tools/build/generate_sbom.py"
       - "Tools/cases_generator/**"
       - "Tools/clinic/**"
+      - "Tools/jit/**"
       - "Tools/peg_generator/**"
       - "Tools/requirements-dev.txt"
       - "Tools/wasm/**"
@@ -38,6 +39,7 @@ jobs:
           "Tools/build/",
           "Tools/cases_generator",
           "Tools/clinic",
+          "Tools/jit",
           "Tools/peg_generator",
           "Tools/wasm",
         ]
index c424a894c2a6e09f558970cfc954a9c9f713beeb..18eb2a9f0632ce203b37e80eb339ddafa460ad94 100644 (file)
@@ -126,6 +126,7 @@ Tools/unicode/data/
 # hendrikmuhs/ccache-action@v1
 /.ccache
 /cross-build/
+/jit_stencils.h
 /platform
 /profile-clean-stamp
 /profile-run-stamp
index 96e829f8fbe97db2b1c63d1d2720c1a127e0f46f..ecf3cae4cbc3f10aa28ae40cd8728ade99076744 100644 (file)
@@ -39,6 +39,8 @@ typedef struct {
 typedef struct _PyExecutorObject {
     PyObject_VAR_HEAD
     _PyVMData vm_data; /* Used by the VM, but opaque to the optimizer */
+    void *jit_code;
+    size_t jit_size;
     _PyUOpInstruction trace[1];
 } _PyExecutorObject;
 
diff --git a/Include/internal/pycore_jit.h b/Include/internal/pycore_jit.h
new file mode 100644 (file)
index 0000000..0b71eb6
--- /dev/null
@@ -0,0 +1,25 @@
+#ifndef Py_INTERNAL_JIT_H
+#define Py_INTERNAL_JIT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#ifdef _Py_JIT
+
+// Signature of the entry point of a JIT-compiled executor. It receives the
+// current frame, the stack pointer, and the thread state, and returns a
+// pointer to a code unit (the caller in Python/ceval.c continues from it and
+// treats NULL as an error).
+typedef _Py_CODEUNIT *(*jit_func)(_PyInterpreterFrame *frame, PyObject **stack_pointer, PyThreadState *tstate);
+
+// Compile the first length instructions of trace and store the resulting
+// machine code on executor. Returns 0 on success and -1 on failure.
+int _PyJIT_Compile(_PyExecutorObject *executor, _PyUOpInstruction *trace, size_t length);
+// Release the JIT memory attached to executor (a no-op if there is none).
+void _PyJIT_Free(_PyExecutorObject *executor);
+
+#endif  // _Py_JIT
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // !Py_INTERNAL_JIT_H
index 4e52ffc77c5956b9c8aae809c870e075faf5cf8d..e32ea2f528940ae74855dcfd75d522950f85ff9b 100644 (file)
@@ -178,7 +178,7 @@ _Py_DECREF_SPECIALIZED(PyObject *op, const destructor destruct)
     }
     _Py_DECREF_STAT_INC();
 #ifdef Py_REF_DEBUG
-    _Py_DEC_REFTOTAL(_PyInterpreterState_GET());
+    _Py_DEC_REFTOTAL(PyInterpreterState_Get());
 #endif
     if (--op->ob_refcnt != 0) {
         assert(op->ob_refcnt > 0);
@@ -199,7 +199,7 @@ _Py_DECREF_NO_DEALLOC(PyObject *op)
     }
     _Py_DECREF_STAT_INC();
 #ifdef Py_REF_DEBUG
-    _Py_DEC_REFTOTAL(_PyInterpreterState_GET());
+    _Py_DEC_REFTOTAL(PyInterpreterState_Get());
 #endif
     op->ob_refcnt--;
 #ifdef Py_DEBUG
index 37a8b06987c710775f9b8696fc4b386bf0ccd90e..fff3d3c4914e7a778378ed401777ccdbcb408bae 100644 (file)
@@ -433,6 +433,7 @@ PYTHON_OBJS=        \
                Python/initconfig.o \
                Python/instrumentation.o \
                Python/intrinsics.o \
+               Python/jit.o \
                Python/legacy_tracing.o \
                Python/lock.o \
                Python/marshal.o \
@@ -1365,7 +1366,7 @@ regen-unicodedata:
 regen-all: regen-cases regen-typeslots \
        regen-token regen-ast regen-keyword regen-sre regen-frozen \
        regen-pegen-metaparser regen-pegen regen-test-frozenmain \
-       regen-test-levenshtein regen-global-objects regen-sbom
+       regen-test-levenshtein regen-global-objects regen-sbom regen-jit
        @echo
        @echo "Note: make regen-stdlib-module-names, make regen-limited-abi, "
        @echo "make regen-configure and make regen-unicodedata should be run manually"
@@ -1846,6 +1847,7 @@ PYTHON_HEADERS= \
                $(srcdir)/Include/internal/pycore_initconfig.h \
                $(srcdir)/Include/internal/pycore_interp.h \
                $(srcdir)/Include/internal/pycore_intrinsics.h \
+               $(srcdir)/Include/internal/pycore_jit.h \
                $(srcdir)/Include/internal/pycore_list.h \
                $(srcdir)/Include/internal/pycore_llist.h \
                $(srcdir)/Include/internal/pycore_lock.h \
@@ -2641,6 +2643,12 @@ config.status:   $(srcdir)/configure
 Python/asm_trampoline.o: $(srcdir)/Python/asm_trampoline.S
        $(CC) -c $(PY_CORE_CFLAGS) -o $@ $<
 
+Python/jit.o: regen-jit
+
+.PHONY: regen-jit
+regen-jit:
+       @REGEN_JIT_COMMAND@
+
 # Some make's put the object file in the current directory
 .c.o:
        $(CC) -c $(PY_CORE_CFLAGS) -o $@ $<
@@ -2733,6 +2741,7 @@ clean-retain-profile: pycremoval
        -rm -f Python/deepfreeze/*.[co]
        -rm -f Python/frozen_modules/*.h
        -rm -f Python/frozen_modules/MANIFEST
+       -rm -f jit_stencils.h
        -find build -type f -a ! -name '*.gc??' -exec rm -f {} ';'
        -rm -f Include/pydtrace_probes.h
        -rm -f profile-gen-stamp
diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-12-24-03-25-28.gh-issue-113464.dvjQmA.rst b/Misc/NEWS.d/next/Core and Builtins/2023-12-24-03-25-28.gh-issue-113464.dvjQmA.rst
new file mode 100644 (file)
index 0000000..bdee4d6
--- /dev/null
@@ -0,0 +1,4 @@
+Add an option (``--enable-experimental-jit`` for ``configure``-based builds
+or ``--experimental-jit`` for ``PCbuild``-based ones) to build an
+*experimental* just-in-time compiler, based on `copy-and-patch
+<https://fredrikbk.com/publications/copy-and-patch.pdf>`_.
index dde801fc0fd525e4714f475c5b390a3fbdb6bad7..35788ec4503e8f62a08a59d84a402bab613277df 100644 (file)
     <ClCompile Include="..\Python\initconfig.c" />
     <ClCompile Include="..\Python\intrinsics.c" />
     <ClCompile Include="..\Python\instrumentation.c" />
+    <ClCompile Include="..\Python\jit.c" />
     <ClCompile Include="..\Python\legacy_tracing.c" />
     <ClCompile Include="..\Python\lock.c" />
     <ClCompile Include="..\Python\marshal.c" />
index 90ccb954b424bc132f0dbf2dd3c51f3b47fcf4d1..7a44179e3561059cd2c88a8f3bde8e29facc3852 100644 (file)
     <ClCompile Include="..\Objects\iterobject.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\Python\jit.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\Objects\listobject.c">
       <Filter>Source Files</Filter>
     </ClCompile>
index e61267b5852a8f274804afeb90c8cc395ae8a91f..83b50db44670337d0e340cf2bd60247880d3dc35 100644 (file)
@@ -36,6 +36,7 @@ echo.                 overrides -c and -d
 echo.  --disable-gil  Enable experimental support for running without the GIL.
 echo.  --test-marker  Enable the test marker within the build.
 echo.  --regen        Regenerate all opcodes, grammar and tokens.
+echo.  --experimental-jit  Enable the experimental just-in-time compiler.
 echo.
 echo.Available flags to avoid building certain modules.
 echo.These flags have no effect if '-e' is not given:
@@ -85,6 +86,7 @@ if "%~1"=="--disable-gil" (set UseDisableGil=true) & shift & goto CheckOpts
 if "%~1"=="--test-marker" (set UseTestMarker=true) & shift & goto CheckOpts
 if "%~1"=="-V" shift & goto Version
 if "%~1"=="--regen" (set Regen=true) & shift & goto CheckOpts
+if "%~1"=="--experimental-jit" (set UseJIT=true) & shift & goto CheckOpts
 rem These use the actual property names used by MSBuild.  We could just let
 rem them in through the environment, but we specify them on the command line
 rem anyway for visibility so set defaults after this
@@ -176,6 +178,7 @@ echo on
  /p:IncludeSSL=%IncludeSSL% /p:IncludeTkinter=%IncludeTkinter%^
  /p:DisableGil=%UseDisableGil%^
  /p:UseTestMarker=%UseTestMarker% %GITProperty%^
+ /p:UseJIT=%UseJIT%^
  %1 %2 %3 %4 %5 %6 %7 %8 %9
 
 @echo off
index e0b9fc137457a0805d1f2f14fcaf683a8963822e..e1ff97659659eea3ae0c11fd484740978e6314cd 100644 (file)
       <AdditionalIncludeDirectories Condition="$(IncludeExternals)">$(zlibDir);%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <PreprocessorDefinitions>_USRDLL;Py_BUILD_CORE;Py_BUILD_CORE_BUILTIN;Py_ENABLE_SHARED;MS_DLL_ID="$(SysWinVer)";%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <PreprocessorDefinitions Condition="$(IncludeExternals)">_Py_HAVE_ZLIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions Condition="'$(UseJIT)' == 'true'">_Py_JIT;%(PreprocessorDefinitions)</PreprocessorDefinitions>
     </ClCompile>
     <Link>
       <AdditionalDependencies>version.lib;ws2_32.lib;pathcch.lib;bcrypt.lib;%(AdditionalDependencies)</AdditionalDependencies>
     <ClInclude Include="..\Include\internal\pycore_initconfig.h" />
     <ClInclude Include="..\Include\internal\pycore_interp.h" />
     <ClInclude Include="..\Include\internal\pycore_intrinsics.h" />
+    <ClInclude Include="..\Include\internal\pycore_jit.h" />
     <ClInclude Include="..\Include\internal\pycore_list.h" />
     <ClInclude Include="..\Include\internal\pycore_llist.h" />
     <ClInclude Include="..\Include\internal\pycore_lock.h" />
     <ClCompile Include="..\Python\initconfig.c" />
     <ClCompile Include="..\Python\intrinsics.c" />
     <ClCompile Include="..\Python\instrumentation.c" />
+    <ClCompile Include="..\Python\jit.c" />
     <ClCompile Include="..\Python\legacy_tracing.c" />
     <ClCompile Include="..\Python\lock.c" />
     <ClCompile Include="..\Python\marshal.c" />
index fd79436f5add97023649ae1456f76a3108af02a3..4c55f23006b2f0c5de98fb10b0187e6ef385f89a 100644 (file)
     <ClInclude Include="..\Include\internal\pycore_intrinsics.h">
       <Filter>Include\cpython</Filter>
     </ClInclude>
+    <ClInclude Include="..\Include\internal\pycore_jit.h">
+      <Filter>Include\internal</Filter>
+    </ClInclude>
     <ClInclude Include="..\Include\internal\pycore_list.h">
       <Filter>Include\internal</Filter>
     </ClInclude>
     <ClCompile Include="..\Python\instrumentation.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\Python\jit.c">
+      <Filter>Python</Filter>
+    </ClCompile>
     <ClCompile Include="..\Python\legacy_tracing.c">
       <Filter>Source Files</Filter>
     </ClCompile>
index cc9469c7ddd726b35614f01298d0ae460f1b26ec..a90620d6ca8b7d16c48f3fc0f8e4c4cc8c8137d1 100644 (file)
@@ -28,6 +28,9 @@
     </_TokenOutputs>
     <_KeywordSources Include="$(PySourcePath)Grammar\python.gram;$(PySourcePath)Grammar\Tokens" />
     <_KeywordOutputs Include="$(PySourcePath)Lib\keyword.py" />
+    <!-- Taken from _Target._compute_digest in Tools\jit\_targets.py: -->
+    <_JITSources Include="$(PySourcePath)Python\executor_cases.c.h;$(GeneratedPyConfigDir)pyconfig.h;$(PySourcePath)Tools\jit\**"/>
+    <_JITOutputs Include="$(GeneratedPyConfigDir)jit_stencils.h"/>
   </ItemGroup>
 
   <Target Name="_TouchRegenSources" Condition="$(ForceRegen) == 'true'">
     <Exec Command="$(PythonForBuild) Tools\build\generate_global_objects.py"
           WorkingDirectory="$(PySourcePath)" />
   </Target>
+  
+  <Target Name="_RegenJIT"
+          Condition="'$(UseJIT)' == 'true'"
+          DependsOnTargets="_UpdatePyconfig;FindPythonForBuild"
+          Inputs="@(_JITSources)"
+          Outputs="@(_JITOutputs)">
+    <PropertyGroup>
+      <JITArgs Condition="$(Platform) == 'ARM64'">aarch64-pc-windows-msvc</JITArgs>
+      <JITArgs Condition="$(Platform) == 'Win32'">i686-pc-windows-msvc</JITArgs>
+      <JITArgs Condition="$(Platform) == 'x64'">x86_64-pc-windows-msvc</JITArgs>
+      <JITArgs Condition="$(Configuration) == 'Debug'">$(JITArgs) --debug</JITArgs>
+    </PropertyGroup>
+    <Exec Command='$(PythonForBuild) "$(PySourcePath)Tools\jit\build.py" $(JITArgs)'
+          WorkingDirectory="$(GeneratedPyConfigDir)"/>
+  </Target>
 
-  <Target Name="Regen"
+  <Target Name="_RegenNoPGUpdate"
           Condition="$(Configuration) != 'PGUpdate'"
           DependsOnTargets="_TouchRegenSources;_RegenPegen;_RegenAST_H;_RegenTokens;_RegenKeywords;_RegenGlobalObjects">
+  </Target>
+
+  <Target Name="Regen" DependsOnTargets="_RegenNoPGUpdate;_RegenJIT">
     <Message Text="Generated sources are up to date" Importance="high" />
   </Target>
 
index 49388cd20377c0cfd148022a00f3b7b08297a0fa..4f208009086191481c3d615e97c5ce3f5c4827e7 100644 (file)
@@ -11,6 +11,7 @@
 #include "pycore_function.h"
 #include "pycore_instruments.h"
 #include "pycore_intrinsics.h"
+#include "pycore_jit.h"
 #include "pycore_long.h"          // _PyLong_GetZero()
 #include "pycore_moduleobject.h"  // PyModuleObject
 #include "pycore_object.h"        // _PyObject_GC_TRACK()
@@ -955,9 +956,24 @@ resume_with_error:
 
 
 
-// The Tier 2 interpreter is also here!
+// Tier 2 is also here!
 enter_tier_two:
 
+#ifdef _Py_JIT
+
+    ;  // ;)
+    jit_func jitted = current_executor->jit_code;
+    next_instr = jitted(frame, stack_pointer, tstate);
+    frame = tstate->current_frame;
+    Py_DECREF(current_executor);
+    if (next_instr == NULL) {
+        goto resume_with_error;
+    }
+    stack_pointer = _PyFrame_GetStackPointer(frame);
+    DISPATCH();
+
+#else
+
 #undef LOAD_IP
 #define LOAD_IP(UNUSED) (void)0
 
@@ -1073,6 +1089,8 @@ deoptimize:
     Py_DECREF(current_executor);
     DISPATCH();
 
+#endif  // _Py_JIT
+
 }
 #if defined(__GNUC__)
 #  pragma GCC diagnostic pop
diff --git a/Python/jit.c b/Python/jit.c
new file mode 100644 (file)
index 0000000..22949c0
--- /dev/null
@@ -0,0 +1,369 @@
+#ifdef _Py_JIT
+
+#include "Python.h"
+
+#include "pycore_abstract.h"
+#include "pycore_call.h"
+#include "pycore_ceval.h"
+#include "pycore_dict.h"
+#include "pycore_intrinsics.h"
+#include "pycore_long.h"
+#include "pycore_opcode_metadata.h"
+#include "pycore_opcode_utils.h"
+#include "pycore_optimizer.h"
+#include "pycore_pyerrors.h"
+#include "pycore_setobject.h"
+#include "pycore_sliceobject.h"
+#include "pycore_jit.h"
+
+#include "jit_stencils.h"
+
+// Memory management stuff: ////////////////////////////////////////////////////
+
+#ifndef MS_WINDOWS
+    #include <sys/mman.h>
+#endif
+
+// Return the size of a memory page, in bytes, as reported by the operating
+// system (GetSystemInfo on Windows, sysconf(_SC_PAGESIZE) everywhere else).
+static size_t
+get_page_size(void)
+{
+#ifdef MS_WINDOWS
+    SYSTEM_INFO si;
+    GetSystemInfo(&si);
+    return si.dwPageSize;
+#else
+    return sysconf(_SC_PAGESIZE);
+#endif
+}
+
+// Set a RuntimeWarning exception of the form "JIT <message> (<hint>)", where
+// the hint is the platform's last error code (GetLastError() on Windows, errno
+// elsewhere). The code is read first, before any other call is made.
+static void
+jit_error(const char *message)
+{
+#ifdef MS_WINDOWS
+    int hint = GetLastError();
+#else
+    int hint = errno;
+#endif
+    PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint);
+}
+
+// Allocate size bytes of readable and writable memory straight from the OS
+// (VirtualAlloc on Windows, an anonymous private mmap elsewhere).
+//
+// size must be a nonzero multiple of the page size (checked with asserts).
+// Returns the new memory, or NULL with an exception set (via jit_error) on
+// failure.
+static char *
+jit_alloc(size_t size)
+{
+    assert(size);
+    assert(size % get_page_size() == 0);
+#ifdef MS_WINDOWS
+    int flags = MEM_COMMIT | MEM_RESERVE;
+    char *memory = VirtualAlloc(NULL, size, flags, PAGE_READWRITE);
+    int failed = memory == NULL;
+#else
+    int flags = MAP_ANONYMOUS | MAP_PRIVATE;
+    char *memory = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
+    // mmap reports failure with MAP_FAILED, not NULL:
+    int failed = memory == MAP_FAILED;
+#endif
+    if (failed) {
+        jit_error("unable to allocate memory");
+        return NULL;
+    }
+    return memory;
+}
+
+// Return memory obtained from jit_alloc to the OS.
+//
+// size must be a nonzero multiple of the page size (checked with asserts).
+// Returns 0 on success, or -1 with an exception set (via jit_error) on
+// failure.
+static int
+jit_free(char *memory, size_t size)
+{
+    assert(size);
+    assert(size % get_page_size() == 0);
+#ifdef MS_WINDOWS
+    // The Windows call is passed a size of 0 with MEM_RELEASE; size itself is
+    // only used by the asserts above on this platform:
+    int failed = !VirtualFree(memory, 0, MEM_RELEASE);
+#else
+    int failed = munmap(memory, size);
+#endif
+    if (failed) {
+        jit_error("unable to free memory");
+        return -1;
+    }
+    return 0;
+}
+
+// Flush the instruction cache for memory[0 : size] and change its protection
+// to read + execute.
+//
+// A size of zero is a no-op; otherwise size must be a multiple of the page
+// size (checked with an assert). Returns 0 on success, or -1 with an exception
+// set (via jit_error) on failure.
+static int
+mark_executable(char *memory, size_t size)
+{
+    if (size == 0) {
+        return 0;
+    }
+    assert(size % get_page_size() == 0);
+    // Do NOT ever leave the memory writable! Also, don't forget to flush the
+    // i-cache (I cannot begin to tell you how horrible that is to debug):
+#ifdef MS_WINDOWS
+    if (!FlushInstructionCache(GetCurrentProcess(), memory, size)) {
+        jit_error("unable to flush instruction cache");
+        return -1;
+    }
+    // VirtualProtect stores the previous protection through a PDWORD, so this
+    // has to be a DWORD (matching mark_readable), not an int:
+    DWORD old;
+    int failed = !VirtualProtect(memory, size, PAGE_EXECUTE_READ, &old);
+#else
+    __builtin___clear_cache((char *)memory, (char *)memory + size);
+    int failed = mprotect(memory, size, PROT_EXEC | PROT_READ);
+#endif
+    if (failed) {
+        jit_error("unable to protect executable memory");
+        return -1;
+    }
+    return 0;
+}
+
+// Change the protection of memory[0 : size] to read-only.
+//
+// A size of zero is a no-op; otherwise size must be a multiple of the page
+// size (checked with an assert). Returns 0 on success, or -1 with an exception
+// set (via jit_error) on failure.
+static int
+mark_readable(char *memory, size_t size)
+{
+    if (size == 0) {
+        return 0;
+    }
+    assert(size % get_page_size() == 0);
+#ifdef MS_WINDOWS
+    // Receives the previous protection; the value is not used afterwards:
+    DWORD old;
+    int failed = !VirtualProtect(memory, size, PAGE_READONLY, &old);
+#else
+    int failed = mprotect(memory, size, PROT_READ);
+#endif
+    if (failed) {
+        jit_error("unable to protect readable memory");
+        return -1;
+    }
+    return 0;
+}
+
+// JIT compiler stuff: /////////////////////////////////////////////////////////
+
+// Warning! AArch64 requires you to get your hands dirty. These are your gloves:
+
+// Extract a bit field from value:
+// value[value_start : value_start + width]
+// At most 32 bits may be requested, so the result always fits the return type.
+static uint32_t
+get_bits(uint64_t value, uint8_t value_start, uint8_t width)
+{
+    assert(width <= 32);
+    // Mask with the low "width" bits set:
+    const uint64_t mask = (1ULL << width) - 1;
+    // Move the requested field down to bit zero, then drop everything above:
+    const uint64_t shifted = value >> value_start;
+    return (uint32_t)(shifted & mask);
+}
+
+// *loc[loc_start : loc_start + width] = value[value_start : value_start + width]
+//
+// Overwrite a bit field of the 32-bit word at loc with a bit field taken from
+// value, leaving all other bits of *loc untouched. The destination field must
+// fit inside the 32-bit word (checked with an assert).
+static void
+set_bits(uint32_t *loc, uint8_t loc_start, uint64_t value, uint8_t value_start,
+         uint8_t width)
+{
+    assert(loc_start + width <= 32);
+    // Clear the bits we're about to patch:
+    *loc &= ~(((1ULL << width) - 1) << loc_start);
+    assert(get_bits(*loc, loc_start, width) == 0);
+    // Patch the bits:
+    *loc |= get_bits(value, value_start, width) << loc_start;
+    // Read the field back to confirm it holds exactly the requested bits:
+    assert(get_bits(*loc, loc_start, width) == get_bits(value, value_start, width));
+}
+
+// See https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions
+// for instruction encodings:
+#define IS_AARCH64_ADD_OR_SUB(I) (((I) & 0x11C00000) == 0x11000000)
+#define IS_AARCH64_ADRP(I)       (((I) & 0x9F000000) == 0x90000000)
+#define IS_AARCH64_BRANCH(I)     (((I) & 0x7C000000) == 0x14000000)
+#define IS_AARCH64_LDR_OR_STR(I) (((I) & 0x3B000000) == 0x39000000)
+#define IS_AARCH64_MOV(I)        (((I) & 0x9F800000) == 0x92800000)
+
+// Fill all of stencil's holes in the memory pointed to by base, using the
+// values in patches.
+//
+// base:    start of the (already copied) stencil body, patched in place.
+// stencil: supplies the holes; each has an offset into base, a relocation
+//          kind, an index into patches, a symbol, and an addend.
+// patches: table indexed by hole->value.
+//
+// The number written for each hole is
+//     patches[hole->value] + hole->symbol + hole->addend
+// and the relocation kind decides how it is encoded at the hole. Every handled
+// kind ends with "continue"; a kind with no case falls out of the switch and
+// reaches Py_UNREACHABLE().
+static void
+patch(char *base, const Stencil *stencil, uint64_t *patches)
+{
+    for (uint64_t i = 0; i < stencil->holes_size; i++) {
+        const Hole *hole = &stencil->holes[i];
+        void *location = base + hole->offset;
+        uint64_t value = patches[hole->value] + (uint64_t)hole->symbol + hole->addend;
+        // Two views of the same location, for 32-bit and 64-bit writes:
+        uint32_t *loc32 = (uint32_t *)location;
+        uint64_t *loc64 = (uint64_t *)location;
+        // LLD is a great reference for performing relocations... just keep in
+        // mind that Tools/jit/build.py does filtering and preprocessing for us!
+        // Here's a good place to start for each platform:
+        // - aarch64-apple-darwin:
+        //   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.cpp
+        //   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/ARM64Common.h
+        // - aarch64-unknown-linux-gnu:
+        //   - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/AArch64.cpp
+        // - i686-pc-windows-msvc:
+        //   - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp
+        // - x86_64-apple-darwin:
+        //   - https://github.com/llvm/llvm-project/blob/main/lld/MachO/Arch/X86_64.cpp
+        // - x86_64-pc-windows-msvc:
+        //   - https://github.com/llvm/llvm-project/blob/main/lld/COFF/Chunks.cpp
+        // - x86_64-unknown-linux-gnu:
+        //   - https://github.com/llvm/llvm-project/blob/main/lld/ELF/Arch/X86_64.cpp
+        switch (hole->kind) {
+            case HoleKind_IMAGE_REL_I386_DIR32:
+                // 32-bit absolute address.
+                // Check that we're not out of range of 32 unsigned bits:
+                assert(value < (1ULL << 32));
+                *loc32 = (uint32_t)value;
+                continue;
+            case HoleKind_ARM64_RELOC_UNSIGNED:
+            case HoleKind_IMAGE_REL_AMD64_ADDR64:
+            case HoleKind_R_AARCH64_ABS64:
+            case HoleKind_X86_64_RELOC_UNSIGNED:
+            case HoleKind_R_X86_64_64:
+                // 64-bit absolute address.
+                *loc64 = value;
+                continue;
+            case HoleKind_R_AARCH64_CALL26:
+            case HoleKind_R_AARCH64_JUMP26:
+                // 28-bit relative branch.
+                assert(IS_AARCH64_BRANCH(*loc32));
+                // Make the value relative to the instruction being patched:
+                value -= (uint64_t)location;
+                // Check that we're not out of range of 28 signed bits:
+                assert((int64_t)value >= -(1 << 27));
+                assert((int64_t)value < (1 << 27));
+                // Since instructions are 4-byte aligned, only use 26 bits:
+                assert(get_bits(value, 0, 2) == 0);
+                set_bits(loc32, 0, value, 2, 26);
+                continue;
+            case HoleKind_R_AARCH64_MOVW_UABS_G0_NC:
+                // 16-bit low part of an absolute address.
+                assert(IS_AARCH64_MOV(*loc32));
+                // Check the implicit shift (this is "part 0 of 3"):
+                assert(get_bits(*loc32, 21, 2) == 0);
+                set_bits(loc32, 5, value, 0, 16);
+                continue;
+            case HoleKind_R_AARCH64_MOVW_UABS_G1_NC:
+                // 16-bit middle-low part of an absolute address.
+                assert(IS_AARCH64_MOV(*loc32));
+                // Check the implicit shift (this is "part 1 of 3"):
+                assert(get_bits(*loc32, 21, 2) == 1);
+                set_bits(loc32, 5, value, 16, 16);
+                continue;
+            case HoleKind_R_AARCH64_MOVW_UABS_G2_NC:
+                // 16-bit middle-high part of an absolute address.
+                assert(IS_AARCH64_MOV(*loc32));
+                // Check the implicit shift (this is "part 2 of 3"):
+                assert(get_bits(*loc32, 21, 2) == 2);
+                set_bits(loc32, 5, value, 32, 16);
+                continue;
+            case HoleKind_R_AARCH64_MOVW_UABS_G3:
+                // 16-bit high part of an absolute address.
+                assert(IS_AARCH64_MOV(*loc32));
+                // Check the implicit shift (this is "part 3 of 3"):
+                assert(get_bits(*loc32, 21, 2) == 3);
+                set_bits(loc32, 5, value, 48, 16);
+                continue;
+            case HoleKind_ARM64_RELOC_GOT_LOAD_PAGE21:
+                // 21-bit count of pages between this page and an absolute address's
+                // page... I know, I know, it's weird. Pairs nicely with
+                // ARM64_RELOC_GOT_LOAD_PAGEOFF12 (below).
+                assert(IS_AARCH64_ADRP(*loc32));
+                // Number of pages between this page and the value's page:
+                value = (value >> 12) - ((uint64_t)location >> 12);
+                // Check that we're not out of range of 21 signed bits:
+                assert((int64_t)value >= -(1 << 20));
+                assert((int64_t)value < (1 << 20));
+                // value[0:2] goes in loc[29:31]:
+                set_bits(loc32, 29, value, 0, 2);
+                // value[2:21] goes in loc[5:24]:
+                set_bits(loc32, 5, value, 2, 19);
+                continue;
+            case HoleKind_ARM64_RELOC_GOT_LOAD_PAGEOFF12:
+                // 12-bit low part of an absolute address. Pairs nicely with
+                // ARM64_RELOC_GOT_LOAD_PAGE21 (above).
+                assert(IS_AARCH64_LDR_OR_STR(*loc32) || IS_AARCH64_ADD_OR_SUB(*loc32));
+                // There might be an implicit shift encoded in the instruction:
+                uint8_t shift = 0;
+                if (IS_AARCH64_LDR_OR_STR(*loc32)) {
+                    shift = (uint8_t)get_bits(*loc32, 30, 2);
+                    // If both of these are set, the shift is supposed to be 4.
+                    // That's pretty weird, and it's never actually been observed...
+                    assert(get_bits(*loc32, 23, 1) == 0 || get_bits(*loc32, 26, 1) == 0);
+                }
+                // Keep only the offset within the page:
+                value = get_bits(value, 0, 12);
+                // The bits dropped by the shift must already be zero:
+                assert(get_bits(value, 0, shift) == 0);
+                set_bits(loc32, 10, value, shift, 12);
+                continue;
+        }
+        Py_UNREACHABLE();
+    }
+}
+
+// Copy stencil's body to base, then fill in the copy's holes using patches.
+static void
+copy_and_patch(char *base, const Stencil *stencil, uint64_t *patches)
+{
+    // Copy first: patch() rewrites the bytes at base in place.
+    const size_t body_size = stencil->body_size;
+    memcpy(base, stencil->body, body_size);
+    patch(base, stencil, patches);
+}
+
+// Emit both halves of a stencil group: its code goes to the address stored in
+// patches[HoleValue_CODE] and its data to patches[HoleValue_DATA].
+static void
+emit(const StencilGroup *group, uint64_t patches[])
+{
+    char *code_destination = (char *)patches[HoleValue_CODE];
+    char *data_destination = (char *)patches[HoleValue_DATA];
+    copy_and_patch(code_destination, &group->code, patches);
+    copy_and_patch(data_destination, &group->data, patches);
+}
+
+// Compiles executor in-place. Don't forget to call _PyJIT_Free later!
+//
+// executor: receives the compiled memory in jit_code and its total size in
+//           jit_size (both are only written on success).
+// trace:    the instructions to compile.
+// length:   number of instructions in trace.
+//
+// Returns 0 on success. Returns -1 if the memory cannot be allocated or
+// protected; jit_error has set an exception in that case.
+int
+_PyJIT_Compile(_PyExecutorObject *executor, _PyUOpInstruction *trace, size_t length)
+{
+    // Loop once to find the total compiled size:
+    size_t code_size = 0;
+    size_t data_size = 0;
+    for (size_t i = 0; i < length; i++) {
+        _PyUOpInstruction *instruction = &trace[i];
+        const StencilGroup *group = &stencil_groups[instruction->opcode];
+        code_size += group->code.body_size;
+        data_size += group->data.body_size;
+    }
+    // Round up to the nearest page (code and data need separate pages):
+    size_t page_size = get_page_size();
+    // The masking below relies on the page size being a power of two:
+    assert((page_size & (page_size - 1)) == 0);
+    // NOTE: this always adds between 1 and page_size bytes, so a size that is
+    // already page-aligned (including zero) grows by one whole page. Neither
+    // region is ever empty as a result.
+    code_size += page_size - (code_size & (page_size - 1));
+    data_size += page_size - (data_size & (page_size - 1));
+    // One allocation holds the code pages followed by the data pages:
+    char *memory = jit_alloc(code_size + data_size);
+    if (memory == NULL) {
+        return -1;
+    }
+    // Loop again to emit the code:
+    char *code = memory;
+    char *data = memory + code_size;
+    for (size_t i = 0; i < length; i++) {
+        _PyUOpInstruction *instruction = &trace[i];
+        const StencilGroup *group = &stencil_groups[instruction->opcode];
+        // Think of patches as a dictionary mapping HoleValue to uint64_t:
+        uint64_t patches[] = GET_PATCHES();
+        patches[HoleValue_CODE] = (uint64_t)code;
+        // The next instruction's code is placed directly after this one's:
+        patches[HoleValue_CONTINUE] = (uint64_t)code + group->code.body_size;
+        patches[HoleValue_DATA] = (uint64_t)data;
+        patches[HoleValue_EXECUTOR] = (uint64_t)executor;
+        patches[HoleValue_OPARG] = instruction->oparg;
+        patches[HoleValue_OPERAND] = instruction->operand;
+        patches[HoleValue_TARGET] = instruction->target;
+        // Start of the whole compiled trace:
+        patches[HoleValue_TOP] = (uint64_t)memory;
+        patches[HoleValue_ZERO] = 0;
+        emit(group, patches);
+        code += group->code.body_size;
+        data += group->data.body_size;
+    }
+    // Everything is written: make the code executable and the data read-only.
+    // If either step fails, release the memory (the result of jit_free is not
+    // checked here) and report the failure:
+    if (mark_executable(memory, code_size) ||
+        mark_readable(memory + code_size, data_size))
+    {
+        jit_free(memory, code_size + data_size);
+        return -1;
+    }
+    executor->jit_code = memory;
+    executor->jit_size = code_size + data_size;
+    return 0;
+}
+
+// Release the JIT memory attached to executor, if there is any. The executor's
+// jit_code and jit_size are reset before the memory is freed, and a failure to
+// free is reported as an unraisable exception instead of being returned.
+void
+_PyJIT_Free(_PyExecutorObject *executor)
+{
+    char *jit_memory = (char *)executor->jit_code;
+    size_t jit_bytes = executor->jit_size;
+    if (jit_memory == NULL) {
+        // Nothing was compiled (or it has already been freed).
+        return;
+    }
+    executor->jit_code = NULL;
+    executor->jit_size = 0;
+    if (jit_free(jit_memory, jit_bytes) != 0) {
+        PyErr_WriteUnraisable(NULL);
+    }
+}
+
+#endif  // _Py_JIT
index db615068ff517f6ca8e56f569237973fa778c075..0d04b09fef1e8463f3127a71b8b3b36826c76288 100644 (file)
@@ -7,6 +7,7 @@
 #include "pycore_optimizer.h"     // _Py_uop_analyze_and_optimize()
 #include "pycore_pystate.h"       // _PyInterpreterState_GET()
 #include "pycore_uop_ids.h"
+#include "pycore_jit.h"
 #include "cpython/optimizer.h"
 #include <stdbool.h>
 #include <stdint.h>
@@ -227,6 +228,9 @@ static PyMethodDef executor_methods[] = {
 static void
 uop_dealloc(_PyExecutorObject *self) {
     _Py_ExecutorClear(self);
+#ifdef _Py_JIT
+    _PyJIT_Free(self);
+#endif
     PyObject_Free(self);
 }
 
@@ -789,6 +793,14 @@ make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
                    executor->trace[i].operand);
         }
     }
+#endif
+#ifdef _Py_JIT
+    executor->jit_code = NULL;
+    executor->jit_size = 0;
+    if (_PyJIT_Compile(executor, executor->trace, Py_SIZE(executor))) {
+        Py_DECREF(executor);
+        return NULL;
+    }
 #endif
     return executor;
 }
index fff64dd63d6b219e0ee988a0533c5fae423ad004..372f60602375b658127da281f6300de2f4ca795d 100644 (file)
@@ -1240,12 +1240,19 @@ init_interp_main(PyThreadState *tstate)
 
     // Turn on experimental tier 2 (uops-based) optimizer
     if (is_main_interp) {
+#ifndef _Py_JIT
+        // No JIT, maybe use the tier two interpreter:
         char *envvar = Py_GETENV("PYTHON_UOPS");
         int enabled = envvar != NULL && *envvar > '0';
         if (_Py_get_xoption(&config->xoptions, L"uops") != NULL) {
             enabled = 1;
         }
         if (enabled) {
+#else
+        // Always enable tier two for JIT builds (ignoring the environment
+        // variable and command-line option above):
+        if (true) {
+#endif
             PyObject *opt = PyUnstable_Optimizer_NewUOpOptimizer();
             if (opt == NULL) {
                 return _PyStatus_ERR("can't initialize optimizer");
diff --git a/Tools/jit/README.md b/Tools/jit/README.md
new file mode 100644 (file)
index 0000000..04a6c07
--- /dev/null
@@ -0,0 +1,46 @@
+The JIT Compiler
+================
+
+This version of CPython can be built with an experimental just-in-time compiler. While almost everything you already know about building and using CPython is unchanged, you will probably need to install a compatible version of LLVM first.
+
+## Installing LLVM
+
+The JIT compiler does not require end users to install any third-party dependencies, but part of it must be *built* using LLVM[^why-llvm]. You are *not* required to build the rest of CPython using LLVM, or even the same version of LLVM (in fact, this is uncommon).
+
+LLVM version 16 is required. Both `clang` and `llvm-readobj` need to be installed and discoverable (version suffixes, like `clang-16`, are okay). It's highly recommended that you also have `llvm-objdump` available, since this allows the build script to dump human-readable assembly for the generated code.
+
+It's easy to install all of the required tools:
+
+### Linux
+
+Install LLVM 16 on Ubuntu/Debian:
+
+```sh
+wget https://apt.llvm.org/llvm.sh
+chmod +x llvm.sh
+sudo ./llvm.sh 16
+```
+
+### macOS
+
+Install LLVM 16 with [Homebrew](https://brew.sh):
+
+```sh
+brew install llvm@16
+```
+
+Homebrew won't add any of the tools to your `$PATH`. That's okay; the build script knows how to find them.
+
+### Windows
+
+Install LLVM 16 [by searching for it on LLVM's GitHub releases page](https://github.com/llvm/llvm-project/releases?q=16), clicking on "Assets", downloading the appropriate Windows installer for your platform (likely the file ending with `-win64.exe`), and running it. **When installing, be sure to select the option labeled "Add LLVM to the system PATH".**
+
+## Building
+
+For `PCbuild`-based builds, pass the new `--experimental-jit` option to `build.bat`.
+
+For all other builds, pass the new `--enable-experimental-jit` option to `configure`.
+
+Otherwise, just configure and build as you normally would. Cross-compiling "just works", since the JIT is built for the host platform.
+
+[^why-llvm]: Clang is specifically needed because it's the only C compiler with support for guaranteed tail calls (`musttail`), which are required by CPython's continuation-passing-style approach to JIT compilation. Since LLVM also includes other functionalities we need (namely, object file parsing and disassembly), it's convenient to only support one toolchain at this time.
diff --git a/Tools/jit/_llvm.py b/Tools/jit/_llvm.py
new file mode 100644 (file)
index 0000000..603bbef
--- /dev/null
@@ -0,0 +1,99 @@
+"""Utilities for invoking LLVM tools."""
+import asyncio
+import functools
+import os
+import re
+import shlex
+import subprocess
+import typing
+
+_LLVM_VERSION = 16
+_LLVM_VERSION_PATTERN = re.compile(rf"version\s+{_LLVM_VERSION}\.\d+\.\d+\s+")
+
+_P = typing.ParamSpec("_P")
+_R = typing.TypeVar("_R")
+_C = typing.Callable[_P, typing.Awaitable[_R]]
+
+
+def _async_cache(f: _C[_P, _R]) -> _C[_P, _R]:
+    """
+    Memoize the results of an async function.
+
+    Results are keyed on the positional arguments only: keyword arguments are
+    forwarded to the call that populates the cache, but they don't take part in the
+    cache lookup.
+    """
+    cache = {}
+    lock = asyncio.Lock()
+
+    @functools.wraps(f)
+    async def wrapper(
+        *args: _P.args, **kwargs: _P.kwargs  # pylint: disable = no-member
+    ) -> _R:
+        # The lock is held across the await, so concurrent callers wait for the
+        # in-flight call to finish instead of starting a duplicate one:
+        async with lock:
+            if args not in cache:
+                cache[args] = await f(*args, **kwargs)
+            return cache[args]
+
+    return wrapper
+
+
+_CORES = asyncio.BoundedSemaphore(os.cpu_count() or 1)
+
+
+async def _run(tool: str, args: typing.Iterable[str], echo: bool = False) -> str | None:
+    command = [tool, *args]
+    async with _CORES:
+        if echo:
+            print(shlex.join(command))
+        try:
+            process = await asyncio.create_subprocess_exec(
+                *command, stdout=subprocess.PIPE
+            )
+        except FileNotFoundError:
+            return None
+        out, _ = await process.communicate()
+    if process.returncode:
+        raise RuntimeError(f"{tool} exited with return code {process.returncode}")
+    return out.decode()
+
+
+@_async_cache
+async def _check_tool_version(name: str, *, echo: bool = False) -> bool:
+    output = await _run(name, ["--version"], echo=echo)
+    return bool(output and _LLVM_VERSION_PATTERN.search(output))
+
+
+@_async_cache
+async def _get_brew_llvm_prefix(*, echo: bool = False) -> str | None:
+    output = await _run("brew", ["--prefix", f"llvm@{_LLVM_VERSION}"], echo=echo)
+    return output and output.removesuffix("\n")
+
+
+@_async_cache
+async def _find_tool(tool: str, *, echo: bool = False) -> str | None:
+    # Unversioned executables:
+    path = tool
+    if await _check_tool_version(path, echo=echo):
+        return path
+    # Versioned executables:
+    path = f"{tool}-{_LLVM_VERSION}"
+    if await _check_tool_version(path, echo=echo):
+        return path
+    # Homebrew-installed executables:
+    prefix = await _get_brew_llvm_prefix(echo=echo)
+    if prefix is not None:
+        path = os.path.join(prefix, "bin", tool)
+        if await _check_tool_version(path, echo=echo):
+            return path
+    # Nothing found:
+    return None
+
+
+async def maybe_run(
+    tool: str, args: typing.Iterable[str], echo: bool = False
+) -> str | None:
+    """Run an LLVM tool if it can be found. Otherwise, return None."""
+    path = await _find_tool(tool, echo=echo)
+    return path and await _run(path, args, echo=echo)
+
+
+async def run(tool: str, args: typing.Iterable[str], echo: bool = False) -> str:
+    """Run an LLVM tool if it can be found. Otherwise, raise RuntimeError."""
+    output = await maybe_run(tool, args, echo=echo)
+    if output is None:
+        raise RuntimeError(f"Can't find {tool}-{_LLVM_VERSION}!")
+    return output
diff --git a/Tools/jit/_schema.py b/Tools/jit/_schema.py
new file mode 100644 (file)
index 0000000..8eeb78e
--- /dev/null
@@ -0,0 +1,99 @@
+"""Schema for the JSON produced by llvm-readobj --elf-output-style=JSON."""
+import typing
+
+HoleKind: typing.TypeAlias = typing.Literal[
+    "ARM64_RELOC_GOT_LOAD_PAGE21",
+    "ARM64_RELOC_GOT_LOAD_PAGEOFF12",
+    "ARM64_RELOC_UNSIGNED",
+    "IMAGE_REL_AMD64_ADDR64",
+    "IMAGE_REL_I386_DIR32",
+    "R_AARCH64_ABS64",
+    "R_AARCH64_CALL26",
+    "R_AARCH64_JUMP26",
+    "R_AARCH64_MOVW_UABS_G0_NC",
+    "R_AARCH64_MOVW_UABS_G1_NC",
+    "R_AARCH64_MOVW_UABS_G2_NC",
+    "R_AARCH64_MOVW_UABS_G3",
+    "R_X86_64_64",
+    "X86_64_RELOC_UNSIGNED",
+]
+
+
+class COFFRelocation(typing.TypedDict):
+    """A COFF object file relocation record."""
+
+    Type: dict[typing.Literal["Value"], HoleKind]
+    # Unlike the ELF and Mach-O records, the symbol name isn't wrapped in a dict:
+    Symbol: str
+    Offset: int
+
+
+class ELFRelocation(typing.TypedDict):
+    """An ELF object file relocation record."""
+
+    # Of the three relocation schemas, only this one has an explicit addend:
+    Addend: int
+    Offset: int
+    Symbol: dict[typing.Literal["Value"], str]
+    Type: dict[typing.Literal["Value"], HoleKind]
+
+
+class MachORelocation(typing.TypedDict):
+    """A Mach-O object file relocation record."""
+
+    Offset: int
+    # Both keys are optional in the schema; the Mach-O parser in _targets.py
+    # accepts records that have either one:
+    Section: typing.NotRequired[dict[typing.Literal["Value"], str]]
+    Symbol: typing.NotRequired[dict[typing.Literal["Value"], str]]
+    Type: dict[typing.Literal["Value"], HoleKind]
+
+
+class _COFFSymbol(typing.TypedDict):
+    """A symbol listed under a COFF section (the name is a plain string)."""
+
+    Name: str
+    Value: int
+
+
+class _ELFSymbol(typing.TypedDict):
+    """A symbol listed under an ELF section (the name is wrapped in a dict)."""
+
+    Name: dict[typing.Literal["Value"], str]
+    Value: int
+
+
+class _MachOSymbol(typing.TypedDict):
+    """A symbol listed under a Mach-O section (the name is wrapped in a dict)."""
+
+    Name: dict[typing.Literal["Value"], str]
+    Value: int
+
+
+class COFFSection(typing.TypedDict):
+    """A COFF object file section."""
+
+    Characteristics: dict[
+        typing.Literal["Flags"], list[dict[typing.Literal["Name"], str]]
+    ]
+    Number: int
+    RawDataSize: int
+    Relocations: list[dict[typing.Literal["Relocation"], COFFRelocation]]
+    # May be missing; the COFF parser in _targets.py then substitutes RawDataSize
+    # zero bytes:
+    SectionData: typing.NotRequired[dict[typing.Literal["Bytes"], list[int]]]
+    Symbols: list[dict[typing.Literal["Symbol"], _COFFSymbol]]
+
+
+class ELFSection(typing.TypedDict):
+    """An ELF object file section."""
+
+    Flags: dict[typing.Literal["Flags"], list[dict[typing.Literal["Name"], str]]]
+    Index: int
+    # For SHT_RELA sections, the ELF parser in _targets.py uses this as the index of
+    # the section that the relocations apply to:
+    Info: int
+    Relocations: list[dict[typing.Literal["Relocation"], ELFRelocation]]
+    SectionData: dict[typing.Literal["Bytes"], list[int]]
+    Symbols: list[dict[typing.Literal["Symbol"], _ELFSymbol]]
+    Type: dict[typing.Literal["Value"], str]
+
+
+class MachOSection(typing.TypedDict):
+    """A Mach-O object file section."""
+
+    Address: int
+    Attributes: dict[typing.Literal["Flags"], list[dict[typing.Literal["Name"], str]]]
+    Index: int
+    Name: dict[typing.Literal["Value"], str]
+    # The remaining keys are optional in the schema, but the Mach-O parser in
+    # _targets.py asserts that all three are present:
+    Relocations: typing.NotRequired[
+        list[dict[typing.Literal["Relocation"], MachORelocation]]
+    ]
+    SectionData: typing.NotRequired[dict[typing.Literal["Bytes"], list[int]]]
+    Symbols: typing.NotRequired[list[dict[typing.Literal["Symbol"], _MachOSymbol]]]
diff --git a/Tools/jit/_stencils.py b/Tools/jit/_stencils.py
new file mode 100644 (file)
index 0000000..71c678e
--- /dev/null
@@ -0,0 +1,220 @@
+"""Core data structures for compiled code templates."""
+import dataclasses
+import enum
+import sys
+
+import _schema
+
+
+@enum.unique
+class HoleValue(enum.Enum):
+    """
+    Different "base" values that can be patched into holes (usually combined with the
+    address of a symbol and/or an addend).
+
+    Member names matter: each one is written out as a HoleValue_<NAME> constant when
+    holes are dumped as C, and symbol_to_value() maps "_JIT_<NAME>" symbols back to
+    the member with the same name.
+    """
+
+    # The base address of the machine code for the current uop (exposed as _JIT_ENTRY):
+    CODE = enum.auto()
+    # The base address of the machine code for the next uop (exposed as _JIT_CONTINUE):
+    CONTINUE = enum.auto()
+    # The base address of the read-only data for this uop:
+    DATA = enum.auto()
+    # The address of the current executor (exposed as _JIT_EXECUTOR):
+    EXECUTOR = enum.auto()
+    # The base address of the "global" offset table located in the read-only data.
+    # Shouldn't be present in the final stencils, since these are all replaced with
+    # equivalent DATA values:
+    GOT = enum.auto()
+    # The current uop's oparg (exposed as _JIT_OPARG):
+    OPARG = enum.auto()
+    # The current uop's operand (exposed as _JIT_OPERAND):
+    OPERAND = enum.auto()
+    # The current uop's target (exposed as _JIT_TARGET):
+    TARGET = enum.auto()
+    # The base address of the machine code for the first uop (exposed as _JIT_TOP):
+    TOP = enum.auto()
+    # A hardcoded value of zero (used for symbol lookups):
+    ZERO = enum.auto()
+
+
+@dataclasses.dataclass
+class Hole:
+    """
+    A "hole" in the stencil to be patched with a computed runtime value.
+
+    Analogous to relocation records in an object file.
+    """
+
+    offset: int
+    kind: _schema.HoleKind
+    # Patch with this base value:
+    value: HoleValue
+    # ...plus the address of this symbol:
+    symbol: str | None
+    # ...plus this addend:
+    addend: int
+    # Convenience method:
+    replace = dataclasses.replace
+
+    def as_c(self) -> str:
+        """Dump this hole as an initialization of a C Hole struct."""
+        parts = [
+            f"{self.offset:#x}",
+            f"HoleKind_{self.kind}",
+            f"HoleValue_{self.value.name}",
+            f"&{self.symbol}" if self.symbol else "NULL",
+            _format_addend(self.addend),
+        ]
+        return f"{{{', '.join(parts)}}}"
+
+
+@dataclasses.dataclass
+class Stencil:
+    """
+    A contiguous block of machine code or data to be copied-and-patched.
+
+    Analogous to a section or segment in an object file.
+    """
+
+    body: bytearray = dataclasses.field(default_factory=bytearray, init=False)
+    holes: list[Hole] = dataclasses.field(default_factory=list, init=False)
+    disassembly: list[str] = dataclasses.field(default_factory=list, init=False)
+
+    def pad(self, alignment: int) -> None:
+        """Pad the stencil to the given alignment."""
+        offset = len(self.body)
+        padding = -offset % alignment
+        self.disassembly.append(f"{offset:x}: {' '.join(['00'] * padding)}")
+        self.body.extend([0] * padding)
+
+    def emit_aarch64_trampoline(self, hole: Hole) -> None:
+        """Even with the large code model, AArch64 Linux insists on 28-bit jumps."""
+        base = len(self.body)
+        where = slice(hole.offset, hole.offset + 4)
+        instruction = int.from_bytes(self.body[where], sys.byteorder)
+        instruction &= 0xFC000000
+        instruction |= ((base - hole.offset) >> 2) & 0x03FFFFFF
+        self.body[where] = instruction.to_bytes(4, sys.byteorder)
+        self.disassembly += [
+            f"{base + 4 * 0: x}: d2800008      mov     x8, #0x0",
+            f"{base + 4 * 0:016x}:  R_AARCH64_MOVW_UABS_G0_NC    {hole.symbol}",
+            f"{base + 4 * 1:x}: f2a00008      movk    x8, #0x0, lsl #16",
+            f"{base + 4 * 1:016x}:  R_AARCH64_MOVW_UABS_G1_NC    {hole.symbol}",
+            f"{base + 4 * 2:x}: f2c00008      movk    x8, #0x0, lsl #32",
+            f"{base + 4 * 2:016x}:  R_AARCH64_MOVW_UABS_G2_NC    {hole.symbol}",
+            f"{base + 4 * 3:x}: f2e00008      movk    x8, #0x0, lsl #48",
+            f"{base + 4 * 3:016x}:  R_AARCH64_MOVW_UABS_G3       {hole.symbol}",
+            f"{base + 4 * 4:x}: d61f0100      br      x8",
+        ]
+        for code in [
+            0xD2800008.to_bytes(4, sys.byteorder),
+            0xF2A00008.to_bytes(4, sys.byteorder),
+            0xF2C00008.to_bytes(4, sys.byteorder),
+            0xF2E00008.to_bytes(4, sys.byteorder),
+            0xD61F0100.to_bytes(4, sys.byteorder),
+        ]:
+            self.body.extend(code)
+        for i, kind in enumerate(
+            [
+                "R_AARCH64_MOVW_UABS_G0_NC",
+                "R_AARCH64_MOVW_UABS_G1_NC",
+                "R_AARCH64_MOVW_UABS_G2_NC",
+                "R_AARCH64_MOVW_UABS_G3",
+            ]
+        ):
+            self.holes.append(hole.replace(offset=base + 4 * i, kind=kind))
+
+
+@dataclasses.dataclass
+class StencilGroup:
+    """
+    Code and data corresponding to a given micro-opcode.
+
+    Analogous to an entire object file.
+    """
+
+    code: Stencil = dataclasses.field(default_factory=Stencil, init=False)
+    data: Stencil = dataclasses.field(default_factory=Stencil, init=False)
+    # Maps a symbol name (or section index/number) to the stencil it lives in (as a
+    # HoleValue) and its offset within that stencil:
+    symbols: dict[int | str, tuple[HoleValue, int]] = dataclasses.field(
+        default_factory=dict, init=False
+    )
+    # Maps each symbol needing a global offset table entry to the offset of its
+    # 8-byte slot within the table:
+    _got: dict[str, int] = dataclasses.field(default_factory=dict, init=False)
+
+    def process_relocations(self, *, alignment: int = 1) -> None:
+        """Fix up all GOT and internal relocations for this stencil group."""
+        self.code.pad(alignment)
+        self.data.pad(8)
+        for stencil in [self.code, self.data]:
+            holes = []
+            # NOTE: emit_aarch64_trampoline() appends new holes to self.code.holes.
+            # When stencil is self.code, that is the list being iterated here, so
+            # the new holes are visited later in this same loop (and kept):
+            for hole in stencil.holes:
+                if hole.value is HoleValue.GOT:
+                    # Rewrite GOT references as offsets into this group's data,
+                    # where the table is emitted below:
+                    assert hole.symbol is not None
+                    hole.value = HoleValue.DATA
+                    hole.addend += self._global_offset_table_lookup(hole.symbol)
+                    hole.symbol = None
+                elif hole.symbol in self.symbols:
+                    # The symbol is defined inside this group, so refer to it by
+                    # its stencil and offset instead of by name:
+                    hole.value, addend = self.symbols[hole.symbol]
+                    hole.addend += addend
+                    hole.symbol = None
+                elif (
+                    hole.kind in {"R_AARCH64_CALL26", "R_AARCH64_JUMP26"}
+                    and hole.value is HoleValue.ZERO
+                ):
+                    # The original hole is dropped (note the continue); the
+                    # trampoline's holes replace it:
+                    self.code.emit_aarch64_trampoline(hole)
+                    continue
+                holes.append(hole)
+            stencil.holes[:] = holes
+        # Trampolines may have been appended to the code, so pad it again:
+        self.code.pad(alignment)
+        self._emit_global_offset_table()
+        self.code.holes.sort(key=lambda hole: hole.offset)
+        self.data.holes.sort(key=lambda hole: hole.offset)
+
+    def _global_offset_table_lookup(self, symbol: str) -> int:
+        # The table is emitted at the current end of the data, and each new symbol
+        # gets the next free 8-byte slot:
+        return len(self.data.body) + self._got.setdefault(symbol, 8 * len(self._got))
+
+    def _emit_global_offset_table(self) -> None:
+        # Append one zeroed 8-byte slot to the data for every symbol in the table,
+        # along with a hole that fills it in and a line of "disassembly":
+        got = len(self.data.body)
+        for s, offset in self._got.items():
+            if s in self.symbols:
+                value, addend = self.symbols[s]
+                symbol = None
+            else:
+                value, symbol = symbol_to_value(s)
+                addend = 0
+            # NOTE(review): the hole kind is hardcoded to "R_X86_64_64" for every
+            # target; presumably it's handled as a plain 64-bit absolute patch at
+            # runtime - confirm against the patching code in jit.c:
+            self.data.holes.append(
+                Hole(got + offset, "R_X86_64_64", value, symbol, addend)
+            )
+            value_part = value.name if value is not HoleValue.ZERO else ""
+            if value_part and not symbol and not addend:
+                addend_part = ""
+            else:
+                addend_part = f"&{symbol}" if symbol else ""
+                addend_part += _format_addend(addend, signed=symbol is not None)
+                if value_part:
+                    value_part += "+"
+            self.data.disassembly.append(
+                f"{len(self.data.body):x}: {value_part}{addend_part}"
+            )
+            self.data.body.extend([0] * 8)
+
+
+def symbol_to_value(symbol: str) -> tuple[HoleValue, str | None]:
+    """
+    Convert a symbol name to a HoleValue and a symbol name.
+
+    Some symbols (starting with "_JIT_") are special and are converted to their
+    own HoleValues.
+    """
+    if symbol.startswith("_JIT_"):
+        try:
+            return HoleValue[symbol.removeprefix("_JIT_")], None
+        except KeyError:
+            pass
+    return HoleValue.ZERO, symbol
+
+
+def _format_addend(addend: int, signed: bool = False) -> str:
+    addend %= 1 << 64
+    if addend & (1 << 63):
+        addend -= 1 << 64
+    return f"{addend:{'+#x' if signed else '#x'}}"
diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py
new file mode 100644 (file)
index 0000000..51b091e
--- /dev/null
@@ -0,0 +1,394 @@
+"""Target-specific code generation, parsing, and processing."""
+import asyncio
+import dataclasses
+import hashlib
+import json
+import os
+import pathlib
+import re
+import sys
+import tempfile
+import typing
+
+import _llvm
+import _schema
+import _stencils
+import _writer
+
+if sys.version_info < (3, 11):
+    raise RuntimeError("Building the JIT compiler requires Python 3.11 or newer!")
+
+TOOLS_JIT_BUILD = pathlib.Path(__file__).resolve()
+TOOLS_JIT = TOOLS_JIT_BUILD.parent
+TOOLS = TOOLS_JIT.parent
+CPYTHON = TOOLS.parent
+PYTHON_EXECUTOR_CASES_C_H = CPYTHON / "Python" / "executor_cases.c.h"
+TOOLS_JIT_TEMPLATE_C = TOOLS_JIT / "template.c"
+
+
+_S = typing.TypeVar("_S", _schema.COFFSection, _schema.ELFSection, _schema.MachOSection)
+_R = typing.TypeVar(
+    "_R", _schema.COFFRelocation, _schema.ELFRelocation, _schema.MachORelocation
+)
+
+
+@dataclasses.dataclass
+class _Target(typing.Generic[_S, _R]):
+    triple: str
+    _: dataclasses.KW_ONLY
+    alignment: int = 1
+    prefix: str = ""
+    debug: bool = False
+    force: bool = False
+    verbose: bool = False
+
+    def _compute_digest(self, out: pathlib.Path) -> str:
+        hasher = hashlib.sha256()
+        hasher.update(self.triple.encode())
+        hasher.update(self.alignment.to_bytes())
+        hasher.update(self.prefix.encode())
+        # These dependencies are also reflected in _JITSources in regen.targets:
+        hasher.update(PYTHON_EXECUTOR_CASES_C_H.read_bytes())
+        hasher.update((out / "pyconfig.h").read_bytes())
+        for dirpath, _, filenames in sorted(os.walk(TOOLS_JIT)):
+            for filename in filenames:
+                hasher.update(pathlib.Path(dirpath, filename).read_bytes())
+        return hasher.hexdigest()
+
+    async def _parse(self, path: pathlib.Path) -> _stencils.StencilGroup:
+        group = _stencils.StencilGroup()
+        args = ["--disassemble", "--reloc", f"{path}"]
+        output = await _llvm.maybe_run("llvm-objdump", args, echo=self.verbose)
+        if output is not None:
+            group.code.disassembly.extend(
+                line.expandtabs().strip()
+                for line in output.splitlines()
+                if not line.isspace()
+            )
+        args = [
+            "--elf-output-style=JSON",
+            "--expand-relocs",
+            # "--pretty-print",
+            "--section-data",
+            "--section-relocations",
+            "--section-symbols",
+            "--sections",
+            f"{path}",
+        ]
+        output = await _llvm.run("llvm-readobj", args, echo=self.verbose)
+        # --elf-output-style=JSON is only *slightly* broken on Mach-O...
+        output = output.replace("PrivateExtern\n", "\n")
+        output = output.replace("Extern\n", "\n")
+        # ...and also COFF:
+        output = output[output.index("[", 1, None) :]
+        output = output[: output.rindex("]", None, -1) + 1]
+        sections: list[dict[typing.Literal["Section"], _S]] = json.loads(output)
+        for wrapped_section in sections:
+            self._handle_section(wrapped_section["Section"], group)
+        assert group.symbols["_JIT_ENTRY"] == (_stencils.HoleValue.CODE, 0)
+        if group.data.body:
+            line = f"0: {str(bytes(group.data.body)).removeprefix('b')}"
+            group.data.disassembly.append(line)
+        group.process_relocations()
+        return group
+
+    def _handle_section(self, section: _S, group: _stencils.StencilGroup) -> None:
+        raise NotImplementedError(type(self))
+
+    def _handle_relocation(
+        self, base: int, relocation: _R, raw: bytes
+    ) -> _stencils.Hole:
+        raise NotImplementedError(type(self))
+
+    async def _compile(
+        self, opname: str, c: pathlib.Path, tempdir: pathlib.Path
+    ) -> _stencils.StencilGroup:
+        o = tempdir / f"{opname}.o"
+        args = [
+            f"--target={self.triple}",
+            "-DPy_BUILD_CORE",
+            "-D_DEBUG" if self.debug else "-DNDEBUG",
+            f"-D_JIT_OPCODE={opname}",
+            "-D_PyJIT_ACTIVE",
+            "-D_Py_JIT",
+            "-I.",
+            f"-I{CPYTHON / 'Include'}",
+            f"-I{CPYTHON / 'Include' / 'internal'}",
+            f"-I{CPYTHON / 'Include' / 'internal' / 'mimalloc'}",
+            f"-I{CPYTHON / 'Python'}",
+            "-O3",
+            "-c",
+            "-fno-asynchronous-unwind-tables",
+            # SET_FUNCTION_ATTRIBUTE on 32-bit Windows debug builds:
+            "-fno-jump-tables",
+            # Position-independent code adds indirection to every load and jump:
+            "-fno-pic",
+            # Don't make calls to weird stack-smashing canaries:
+            "-fno-stack-protector",
+            # We have three options for code model:
+            # - "small": the default, assumes that code and data reside in the
+            #   lowest 2GB of memory (128MB on aarch64)
+            # - "medium": assumes that code resides in the lowest 2GB of memory,
+            #   and makes no assumptions about data (not available on aarch64)
+            # - "large": makes no assumptions about either code or data
+            "-mcmodel=large",
+            "-o",
+            f"{o}",
+            "-std=c11",
+            f"{c}",
+        ]
+        await _llvm.run("clang", args, echo=self.verbose)
+        return await self._parse(o)
+
+    async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]:
+        generated_cases = PYTHON_EXECUTOR_CASES_C_H.read_text()
+        opnames = sorted(re.findall(r"\n {8}case (\w+): \{\n", generated_cases))
+        tasks = []
+        with tempfile.TemporaryDirectory() as tempdir:
+            work = pathlib.Path(tempdir).resolve()
+            async with asyncio.TaskGroup() as group:
+                for opname in opnames:
+                    coro = self._compile(opname, TOOLS_JIT_TEMPLATE_C, work)
+                    tasks.append(group.create_task(coro, name=opname))
+        return {task.get_name(): task.result() for task in tasks}
+
+    def build(self, out: pathlib.Path, *, comment: str = "") -> None:
+        """Build jit_stencils.h in the given directory."""
+        digest = f"// {self._compute_digest(out)}\n"
+        jit_stencils = out / "jit_stencils.h"
+        if (
+            not self.force
+            and jit_stencils.exists()
+            and jit_stencils.read_text().startswith(digest)
+        ):
+            return
+        stencil_groups = asyncio.run(self._build_stencils())
+        with jit_stencils.open("w") as file:
+            file.write(digest)
+            if comment:
+                file.write(f"// {comment}\n")
+            file.write("")
+            for line in _writer.dump(stencil_groups):
+                file.write(f"{line}\n")
+
+
+class _COFF(
+    _Target[_schema.COFFSection, _schema.COFFRelocation]
+):  # pylint: disable = too-few-public-methods
+    """Target whose object files are parsed as COFF."""
+
+    def _handle_section(
+        self, section: _schema.COFFSection, group: _stencils.StencilGroup
+    ) -> None:
+        """Append a section's bytes, symbols, and relocations to the group."""
+        flags = {flag["Name"] for flag in section["Characteristics"]["Flags"]}
+        if "SectionData" in section:
+            section_data_bytes = section["SectionData"]["Bytes"]
+        else:
+            # Zeroed BSS data, seen with printf debugging calls:
+            section_data_bytes = [0] * section["RawDataSize"]
+        # Executable sections are code, other readable sections are data, and
+        # everything else is ignored:
+        if "IMAGE_SCN_MEM_EXECUTE" in flags:
+            value = _stencils.HoleValue.CODE
+            stencil = group.code
+        elif "IMAGE_SCN_MEM_READ" in flags:
+            value = _stencils.HoleValue.DATA
+            stencil = group.data
+        else:
+            return
+        # This section starts wherever the stencil currently ends:
+        base = len(stencil.body)
+        group.symbols[section["Number"]] = value, base
+        stencil.body.extend(section_data_bytes)
+        for wrapped_symbol in section["Symbols"]:
+            symbol = wrapped_symbol["Symbol"]
+            offset = base + symbol["Value"]
+            name = symbol["Name"]
+            name = name.removeprefix(self.prefix)
+            group.symbols[name] = value, offset
+        for wrapped_relocation in section["Relocations"]:
+            relocation = wrapped_relocation["Relocation"]
+            hole = self._handle_relocation(base, relocation, stencil.body)
+            stencil.holes.append(hole)
+
+    def _handle_relocation(
+        self, base: int, relocation: _schema.COFFRelocation, raw: bytes
+    ) -> _stencils.Hole:
+        """
+        Convert a relocation record to a Hole.
+
+        The record has no addend field, so the addend is read (little-endian) from
+        the bytes at the relocated location: 8 bytes for IMAGE_REL_AMD64_ADDR64 and
+        4 bytes for IMAGE_REL_I386_DIR32. Other kinds raise NotImplementedError.
+        """
+        match relocation:
+            case {
+                "Offset": offset,
+                "Symbol": s,
+                "Type": {"Value": "IMAGE_REL_AMD64_ADDR64" as kind},
+            }:
+                offset += base
+                s = s.removeprefix(self.prefix)
+                value, symbol = _stencils.symbol_to_value(s)
+                addend = int.from_bytes(raw[offset : offset + 8], "little")
+            case {
+                "Offset": offset,
+                "Symbol": s,
+                "Type": {"Value": "IMAGE_REL_I386_DIR32" as kind},
+            }:
+                offset += base
+                s = s.removeprefix(self.prefix)
+                value, symbol = _stencils.symbol_to_value(s)
+                addend = int.from_bytes(raw[offset : offset + 4], "little")
+            case _:
+                raise NotImplementedError(relocation)
+        return _stencils.Hole(offset, kind, value, symbol, addend)
+
+
+class _ELF(
+    _Target[_schema.ELFSection, _schema.ELFRelocation]
+):  # pylint: disable = too-few-public-methods
+    """Target whose object files are parsed as ELF."""
+
+    def _handle_section(
+        self, section: _schema.ELFSection, group: _stencils.StencilGroup
+    ) -> None:
+        """Append a section's bytes and symbols, or its relocations, to the group."""
+        section_type = section["Type"]["Value"]
+        flags = {flag["Name"] for flag in section["Flags"]["Flags"]}
+        if section_type == "SHT_RELA":
+            assert "SHF_INFO_LINK" in flags, flags
+            assert not section["Symbols"]
+            # Info is the index of the section being relocated, which must already
+            # have been recorded in group.symbols by the SHT_PROGBITS branch below:
+            value, base = group.symbols[section["Info"]]
+            if value is _stencils.HoleValue.CODE:
+                stencil = group.code
+            else:
+                assert value is _stencils.HoleValue.DATA
+                stencil = group.data
+            for wrapped_relocation in section["Relocations"]:
+                relocation = wrapped_relocation["Relocation"]
+                hole = self._handle_relocation(base, relocation, stencil.body)
+                stencil.holes.append(hole)
+        elif section_type == "SHT_PROGBITS":
+            # Sections without SHF_ALLOC are ignored:
+            if "SHF_ALLOC" not in flags:
+                return
+            if "SHF_EXECINSTR" in flags:
+                value = _stencils.HoleValue.CODE
+                stencil = group.code
+            else:
+                value = _stencils.HoleValue.DATA
+                stencil = group.data
+            # This section starts wherever the stencil currently ends (the body is
+            # only extended after the symbols have been recorded):
+            group.symbols[section["Index"]] = value, len(stencil.body)
+            for wrapped_symbol in section["Symbols"]:
+                symbol = wrapped_symbol["Symbol"]
+                offset = len(stencil.body) + symbol["Value"]
+                name = symbol["Name"]["Value"]
+                name = name.removeprefix(self.prefix)
+                group.symbols[name] = value, offset
+            stencil.body.extend(section["SectionData"]["Bytes"])
+            assert not section["Relocations"]
+        else:
+            # Any other section type must be one of these known, ignorable ones:
+            assert section_type in {
+                "SHT_GROUP",
+                "SHT_LLVM_ADDRSIG",
+                "SHT_NULL",
+                "SHT_STRTAB",
+                "SHT_SYMTAB",
+            }, section_type
+
+    def _handle_relocation(
+        self, base: int, relocation: _schema.ELFRelocation, raw: bytes
+    ) -> _stencils.Hole:
+        """
+        Convert a relocation record to a Hole.
+
+        The addend comes straight from the record, so raw is unused here.
+        """
+        match relocation:
+            case {
+                "Addend": addend,
+                "Offset": offset,
+                "Symbol": {"Value": s},
+                "Type": {"Value": kind},
+            }:
+                offset += base
+                s = s.removeprefix(self.prefix)
+                value, symbol = _stencils.symbol_to_value(s)
+            case _:
+                raise NotImplementedError(relocation)
+        return _stencils.Hole(offset, kind, value, symbol, addend)
+
+
+class _MachO(
+    _Target[_schema.MachOSection, _schema.MachORelocation]
+):  # pylint: disable = too-few-public-methods
+    """Target whose object files are parsed as Mach-O."""
+
+    def _handle_section(
+        self, section: _schema.MachOSection, group: _stencils.StencilGroup
+    ) -> None:
+        """Append a section's bytes, symbols, and relocations to the group."""
+        assert section["Address"] >= len(group.code.body)
+        assert "SectionData" in section
+        flags = {flag["Name"] for flag in section["Attributes"]["Flags"]}
+        name = section["Name"]["Value"]
+        name = name.removeprefix(self.prefix)
+        if "SomeInstructions" in flags:
+            # Code offsets are just the addresses reported for the section:
+            value = _stencils.HoleValue.CODE
+            stencil = group.code
+            start_address = 0
+            group.symbols[name] = value, section["Address"] - start_address
+        else:
+            # Data offsets are addresses rebased by the current size of the code.
+            # NOTE(review): this presumably relies on every code section being
+            # handled before the first data section - confirm:
+            value = _stencils.HoleValue.DATA
+            stencil = group.data
+            start_address = len(group.code.body)
+            group.symbols[name] = value, len(group.code.body)
+        base = section["Address"] - start_address
+        group.symbols[section["Index"]] = value, base
+        # Zero-fill any gap between the bytes collected so far and this section's
+        # address:
+        stencil.body.extend(
+            [0] * (section["Address"] - len(group.code.body) - len(group.data.body))
+        )
+        stencil.body.extend(section["SectionData"]["Bytes"])
+        assert "Symbols" in section
+        for wrapped_symbol in section["Symbols"]:
+            symbol = wrapped_symbol["Symbol"]
+            offset = symbol["Value"] - start_address
+            name = symbol["Name"]["Value"]
+            name = name.removeprefix(self.prefix)
+            group.symbols[name] = value, offset
+        assert "Relocations" in section
+        for wrapped_relocation in section["Relocations"]:
+            relocation = wrapped_relocation["Relocation"]
+            hole = self._handle_relocation(base, relocation, stencil.body)
+            stencil.holes.append(hole)
+
+    def _handle_relocation(
+        self, base: int, relocation: _schema.MachORelocation, raw: bytes
+    ) -> _stencils.Hole:
+        """
+        Convert a relocation record to a Hole.
+
+        GOT-load relocations become HoleValue.GOT holes that keep their symbol
+        name. Every other record (naming either a Section or a Symbol) goes through
+        symbol_to_value(). The addend is always zero, and raw is unused.
+        """
+        symbol: str | None
+        match relocation:
+            case {
+                "Offset": offset,
+                "Symbol": {"Value": s},
+                "Type": {
+                    "Value": "ARM64_RELOC_GOT_LOAD_PAGE21"
+                    | "ARM64_RELOC_GOT_LOAD_PAGEOFF12" as kind
+                },
+            }:
+                offset += base
+                s = s.removeprefix(self.prefix)
+                value, symbol = _stencils.HoleValue.GOT, s
+                addend = 0
+            case {
+                "Offset": offset,
+                "Section": {"Value": s},
+                "Type": {"Value": kind},
+            } | {
+                "Offset": offset,
+                "Symbol": {"Value": s},
+                "Type": {"Value": kind},
+            }:
+                offset += base
+                s = s.removeprefix(self.prefix)
+                value, symbol = _stencils.symbol_to_value(s)
+                addend = 0
+            case _:
+                raise NotImplementedError(relocation)
+        # Turn Clang's weird __bzero calls into normal bzero calls:
+        if symbol == "__bzero":
+            symbol = "bzero"
+        return _stencils.Hole(offset, kind, value, symbol, addend)
+
+
+def get_target(host: str) -> _COFF | _ELF | _MachO:
+    """Build a _Target for the given host "triple" and options."""
+    if re.fullmatch(r"aarch64-apple-darwin.*", host):
+        return _MachO(host, alignment=8, prefix="_")
+    if re.fullmatch(r"aarch64-.*-linux-gnu", host):
+        return _ELF(host, alignment=8)
+    if re.fullmatch(r"i686-pc-windows-msvc", host):
+        return _COFF(host, prefix="_")
+    if re.fullmatch(r"x86_64-apple-darwin.*", host):
+        return _MachO(host, prefix="_")
+    if re.fullmatch(r"x86_64-pc-windows-msvc", host):
+        return _COFF(host)
+    if re.fullmatch(r"x86_64-.*-linux-gnu", host):
+        return _ELF(host)
+    raise ValueError(host)
diff --git a/Tools/jit/_writer.py b/Tools/jit/_writer.py
new file mode 100644 (file)
index 0000000..8a2a42e
--- /dev/null
@@ -0,0 +1,95 @@
+"""Utilities for writing StencilGroups out to a C header file."""
+import typing
+
+import _schema
+import _stencils
+
+
+def _dump_header() -> typing.Iterator[str]:
+    yield "typedef enum {"
+    for kind in typing.get_args(_schema.HoleKind):
+        yield f"    HoleKind_{kind},"
+    yield "} HoleKind;"
+    yield ""
+    yield "typedef enum {"
+    for value in _stencils.HoleValue:
+        yield f"    HoleValue_{value.name},"
+    yield "} HoleValue;"
+    yield ""
+    yield "typedef struct {"
+    yield "    const uint64_t offset;"
+    yield "    const HoleKind kind;"
+    yield "    const HoleValue value;"
+    yield "    const void *symbol;"
+    yield "    const uint64_t addend;"
+    yield "} Hole;"
+    yield ""
+    yield "typedef struct {"
+    yield "    const size_t body_size;"
+    yield "    const unsigned char * const body;"
+    yield "    const size_t holes_size;"
+    yield "    const Hole * const holes;"
+    yield "} Stencil;"
+    yield ""
+    yield "typedef struct {"
+    yield "    const Stencil code;"
+    yield "    const Stencil data;"
+    yield "} StencilGroup;"
+    yield ""
+
+
+def _dump_footer(opnames: typing.Iterable[str]) -> typing.Iterator[str]:
+    yield "#define INIT_STENCIL(STENCIL) {                         \\"
+    yield "    .body_size = Py_ARRAY_LENGTH(STENCIL##_body) - 1,   \\"
+    yield "    .body = STENCIL##_body,                             \\"
+    yield "    .holes_size = Py_ARRAY_LENGTH(STENCIL##_holes) - 1, \\"
+    yield "    .holes = STENCIL##_holes,                           \\"
+    yield "}"
+    yield ""
+    yield "#define INIT_STENCIL_GROUP(OP) {     \\"
+    yield "    .code = INIT_STENCIL(OP##_code), \\"
+    yield "    .data = INIT_STENCIL(OP##_data), \\"
+    yield "}"
+    yield ""
+    yield "static const StencilGroup stencil_groups[512] = {"
+    for opname in opnames:
+        yield f"    [{opname}] = INIT_STENCIL_GROUP({opname}),"
+    yield "};"
+    yield ""
+    yield "#define GET_PATCHES() { \\"
+    for value in _stencils.HoleValue:
+        yield f"    [HoleValue_{value.name}] = (uint64_t)0xBADBADBADBADBADB, \\"
+    yield "}"
+
+
+def _dump_stencil(opname: str, group: _stencils.StencilGroup) -> typing.Iterator[str]:
+    yield f"// {opname}"
+    for part, stencil in [("code", group.code), ("data", group.data)]:
+        for line in stencil.disassembly:
+            yield f"// {line}"
+        if stencil.body:
+            size = len(stencil.body) + 1
+            yield f"static const unsigned char {opname}_{part}_body[{size}] = {{"
+            for i in range(0, len(stencil.body), 8):
+                row = " ".join(f"{byte:#04x}," for byte in stencil.body[i : i + 8])
+                yield f"    {row}"
+            yield "};"
+        else:
+            yield f"static const unsigned char {opname}_{part}_body[1];"
+        if stencil.holes:
+            size = len(stencil.holes) + 1
+            yield f"static const Hole {opname}_{part}_holes[{size}] = {{"
+            for hole in stencil.holes:
+                yield f"    {hole.as_c()},"
+            yield "};"
+        else:
+            yield f"static const Hole {opname}_{part}_holes[1];"
+    yield ""
+
+
+def dump(groups: dict[str, _stencils.StencilGroup]) -> typing.Iterator[str]:
+    """Yield a JIT compiler line-by-line as a C header file."""
+    yield from _dump_header()
+    for opname, group in groups.items():
+        yield from _dump_stencil(opname, group)
+    yield from _dump_footer(groups)
diff --git a/Tools/jit/build.py b/Tools/jit/build.py
new file mode 100644 (file)
index 0000000..4d4ace1
--- /dev/null
@@ -0,0 +1,28 @@
+"""Build an experimental just-in-time compiler for CPython."""
+import argparse
+import pathlib
+import shlex
+import sys
+
+import _targets
+
+if __name__ == "__main__":
+    comment = f"$ {shlex.join([sys.executable] + sys.argv)}"
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "target", type=_targets.get_target, help="a PEP 11 target triple to compile for"
+    )
+    parser.add_argument(
+        "-d", "--debug", action="store_true", help="compile for a debug build of Python"
+    )
+    parser.add_argument(
+        "-f", "--force", action="store_true", help="force the entire JIT to be rebuilt"
+    )
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="echo commands as they are run"
+    )
+    args = parser.parse_args()
+    args.target.debug = args.debug
+    args.target.force = args.force
+    args.target.verbose = args.verbose
+    args.target.build(pathlib.Path.cwd(), comment=comment)
diff --git a/Tools/jit/mypy.ini b/Tools/jit/mypy.ini
new file mode 100644 (file)
index 0000000..768d002
--- /dev/null
@@ -0,0 +1,5 @@
+[mypy]
+files = Tools/jit
+pretty = True
+python_version = 3.11
+strict = True
diff --git a/Tools/jit/template.c b/Tools/jit/template.c
new file mode 100644 (file)
index 0000000..12303a5
--- /dev/null
@@ -0,0 +1,98 @@
+#include "Python.h"
+
+#include "pycore_call.h"
+#include "pycore_ceval.h"
+#include "pycore_dict.h"
+#include "pycore_emscripten_signal.h"
+#include "pycore_intrinsics.h"
+#include "pycore_jit.h"
+#include "pycore_long.h"
+#include "pycore_opcode_metadata.h"
+#include "pycore_opcode_utils.h"
+#include "pycore_range.h"
+#include "pycore_setobject.h"
+#include "pycore_sliceobject.h"
+
+#include "ceval_macros.h"
+
+#undef CURRENT_OPARG
+#define CURRENT_OPARG() (_oparg)
+
+#undef CURRENT_OPERAND
+#define CURRENT_OPERAND() (_operand)
+
+#undef DEOPT_IF
+#define DEOPT_IF(COND, INSTNAME) \
+    do {                         \
+        if ((COND)) {            \
+            goto deoptimize;     \
+        }                        \
+    } while (0)
+
+#undef ENABLE_SPECIALIZATION
+#define ENABLE_SPECIALIZATION (0)
+
+#undef GOTO_ERROR
+#define GOTO_ERROR(LABEL)        \
+    do {                         \
+        goto LABEL ## _tier_two; \
+    } while (0)
+
+#undef LOAD_IP
+#define LOAD_IP(UNUSED) \
+    do {                \
+    } while (0)
+
+#define PATCH_VALUE(TYPE, NAME, ALIAS)  \
+    extern void ALIAS;                  \
+    TYPE NAME = (TYPE)(uint64_t)&ALIAS;
+
+#define PATCH_JUMP(ALIAS)                                    \
+    extern void ALIAS;                                       \
+    __attribute__((musttail))                                \
+    return ((jit_func)&ALIAS)(frame, stack_pointer, tstate);
+
+_Py_CODEUNIT *
+_JIT_ENTRY(_PyInterpreterFrame *frame, PyObject **stack_pointer, PyThreadState *tstate)
+{
+    // Locals that the instruction implementations expect to exist:
+    PATCH_VALUE(_PyExecutorObject *, current_executor, _JIT_EXECUTOR)
+    int oparg;
+    int opcode = _JIT_OPCODE;
+    _PyUOpInstruction *next_uop;
+    // Other stuff we need handy:
+    PATCH_VALUE(uint16_t, _oparg, _JIT_OPARG)
+    PATCH_VALUE(uint64_t, _operand, _JIT_OPERAND)
+    PATCH_VALUE(uint32_t, _target, _JIT_TARGET)
+    // The actual instruction definitions (only one will be used):
+    if (opcode == _JUMP_TO_TOP) {
+        CHECK_EVAL_BREAKER();
+        PATCH_JUMP(_JIT_TOP);
+    }
+    switch (opcode) {
+#include "executor_cases.c.h"
+        default:
+            Py_UNREACHABLE();
+    }
+    PATCH_JUMP(_JIT_CONTINUE);
+    // Labels that the instruction implementations expect to exist:
+unbound_local_error_tier_two:
+    _PyEval_FormatExcCheckArg(
+        tstate, PyExc_UnboundLocalError, UNBOUNDLOCAL_ERROR_MSG,
+        PyTuple_GetItem(_PyFrame_GetCode(frame)->co_localsplusnames, oparg));
+    goto error_tier_two;
+pop_4_error_tier_two:
+    STACK_SHRINK(1);
+pop_3_error_tier_two:
+    STACK_SHRINK(1);
+pop_2_error_tier_two:
+    STACK_SHRINK(1);
+pop_1_error_tier_two:
+    STACK_SHRINK(1);
+error_tier_two:
+    _PyFrame_SetStackPointer(frame, stack_pointer);
+    return NULL;
+deoptimize:
+    _PyFrame_SetStackPointer(frame, stack_pointer);
+    return _PyCode_CODE(_PyFrame_GetCode(frame)) + _target;
+}
index b1153df4d7ec523ab909f466298eab3e14eafca1..c563c3f5d3c7e6dd4af8ac8400db093d6548d7c2 100755 (executable)
--- a/configure
+++ b/configure
@@ -920,6 +920,7 @@ LLVM_AR
 PROFILE_TASK
 DEF_MAKE_RULE
 DEF_MAKE_ALL_RULE
+REGEN_JIT_COMMAND
 ABIFLAGS
 LN
 MKDIR_P
@@ -1074,6 +1075,7 @@ with_pydebug
 with_trace_refs
 enable_pystats
 with_assertions
+enable_experimental_jit
 enable_optimizations
 with_lto
 enable_bolt
@@ -1801,6 +1803,9 @@ Optional Features:
   --disable-gil           enable experimental support for running without the
                           GIL (default is no)
   --enable-pystats        enable internal statistics gathering (default is no)
+  --enable-experimental-jit
+                          build the experimental just-in-time compiler
+                          (default is no)
   --enable-optimizations  enable expensive, stable optimizations (PGO, etc.)
                           (default is no)
   --enable-bolt           enable usage of the llvm-bolt post-link optimizer
@@ -7997,6 +8002,32 @@ else
 printf "%s\n" "no" >&6; }
 fi
 
+# Check for --enable-experimental-jit:
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for --enable-experimental-jit" >&5
+printf %s "checking for --enable-experimental-jit... " >&6; }
+# Check whether --enable-experimental-jit was given.
+if test ${enable_experimental_jit+y}
+then :
+  enableval=$enable_experimental_jit;
+else $as_nop
+  enable_experimental_jit=no
+fi
+
+if test "x$enable_experimental_jit" = xno
+then :
+
+else $as_nop
+  as_fn_append CFLAGS_NODIST " -D_Py_JIT"
+           REGEN_JIT_COMMAND="\$(PYTHON_FOR_REGEN) \$(srcdir)/Tools/jit/build.py $host"
+           if test "x$Py_DEBUG" = xtrue
+then :
+  as_fn_append REGEN_JIT_COMMAND " --debug"
+fi
+fi
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $enable_experimental_jit" >&5
+printf "%s\n" "$enable_experimental_jit" >&6; }
+
 # Enable optimization flags
 
 
index 9587e6d63499aace3db1ded81c5446774165b02a..13c46b3e80151dcb08046c42ba5315f84d7fa61a 100644 (file)
@@ -1579,6 +1579,26 @@ else
   AC_MSG_RESULT([no])
 fi
 
+# Check for --enable-experimental-jit:
+AC_MSG_CHECKING([for --enable-experimental-jit])
+AC_ARG_ENABLE([experimental-jit],
+              [AS_HELP_STRING([--enable-experimental-jit],
+                              [build the experimental just-in-time compiler (default is no)])],
+              [],
+              [enable_experimental_jit=no])
+AS_VAR_IF([enable_experimental_jit],
+          [no],
+          [],
+          [AS_VAR_APPEND([CFLAGS_NODIST], [" -D_Py_JIT"])
+           AS_VAR_SET([REGEN_JIT_COMMAND],
+                      ["\$(PYTHON_FOR_REGEN) \$(srcdir)/Tools/jit/build.py $host"])
+           AS_VAR_IF([Py_DEBUG],
+                     [true],
+                     [AS_VAR_APPEND([REGEN_JIT_COMMAND], [" --debug"])],
+                     [])])
+AC_SUBST([REGEN_JIT_COMMAND])
+AC_MSG_RESULT([$enable_experimental_jit])
+
 # Enable optimization flags
 AC_SUBST([DEF_MAKE_ALL_RULE])
 AC_SUBST([DEF_MAKE_RULE])