libgfortran/m4/matmul.m4
`/* Implementation of the MATMUL intrinsic
   Copyright (C) 2002-2019 Free Software Foundation, Inc.
   Contributed by Paul Brook <paul@nowt.org>

This file is part of the GNU Fortran runtime library (libgfortran).

Libgfortran is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.

Libgfortran is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */

#include "libgfortran.h"
#include <string.h>
#include <assert.h>'

include(iparm.m4)dnl

`#if defined (HAVE_'rtype_name`)

/* Prototype for the BLAS ?gemm subroutine, a pointer to which can be
   passed to us by the front-end, in which case we call it for large
   matrices.  */

typedef void (*blas_call)(const char *, const char *, const int *, const int *,
                          const int *, const 'rtype_name` *, const 'rtype_name` *,
                          const int *, const 'rtype_name` *, const int *,
                          const 'rtype_name` *, 'rtype_name` *, const int *,
                          int, int);
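
/* For illustration only (not part of the original interface documentation):
   for contiguous, non-transposed operands, a call through this pointer for
   C = A x B follows the standard ?gemm convention, roughly

     gemm ("N", "N", &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc,
           1, 1);

   with illustrative local names, where the two trailing ints are the hidden
   lengths of the character arguments that the Fortran ABI passes by value.  */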

/* The order of loops is different in the case of plain matrix
   multiplication C=MATMUL(A,B), and in the frequent special case where
   the argument A is the temporary result of a TRANSPOSE intrinsic:
   C=MATMUL(TRANSPOSE(A),B).  Transposed temporaries are detected by
   looking at their strides.

   The equivalent Fortran pseudo-code is:

   DIMENSION A(M,COUNT), B(COUNT,N), C(M,N)
   IF (.NOT.IS_TRANSPOSED(A)) THEN
     C = 0
     DO J=1,N
       DO K=1,COUNT
         DO I=1,M
           C(I,J) = C(I,J)+A(I,K)*B(K,J)
   ELSE
     DO J=1,N
       DO I=1,M
         S = 0
         DO K=1,COUNT
           S = S+A(I,K)*B(K,J)
         C(I,J) = S
   ENDIF
*/

/* If try_blas is set to a nonzero value, then the matmul function will
   see if there is a way to perform the matrix multiplication by a call
   to the BLAS gemm function.  */
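
/* Note, summarizing the shared template in matmul_internal.m4 for reference:
   the gemm pointer is only called when try_blas is nonzero, the operand
   strides permit a direct BLAS call, and the amount of work (roughly
   m * n * count) exceeds blas_limit cubed; otherwise the open-coded loops
   in the template are used.  */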

extern void matmul_'rtype_code` ('rtype` * const restrict retarray,
        'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
        int blas_limit, blas_call gemm);
export_proto(matmul_'rtype_code`);

/* Put an exhaustive list of possible architectures here, ORed together.  */

#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_AVX512F)
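
/* Each variant below is the same matmul_internal.m4 body, compiled as a
   static function with a different __target__ attribute, so that a single
   object file can carry AVX, AVX2 and AVX512F code paths and pick one at
   run time.  */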

#ifdef HAVE_AVX
'define(`matmul_name',`matmul_'rtype_code`_avx')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
        'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
        int blas_limit, blas_call gemm) __attribute__((__target__("avx")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX */

#ifdef HAVE_AVX2
'define(`matmul_name',`matmul_'rtype_code`_avx2')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
        'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
        int blas_limit, blas_call gemm) __attribute__((__target__("avx2,fma")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX2 */

#ifdef HAVE_AVX512F
'define(`matmul_name',`matmul_'rtype_code`_avx512f')dnl
`static void
'matmul_name` ('rtype` * const restrict retarray,
        'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
        int blas_limit, blas_call gemm) __attribute__((__target__("avx512f")));
static' include(matmul_internal.m4)dnl
`#endif /* HAVE_AVX512F */

/* AMD-specific functions with AVX128 and FMA3/FMA4.  */

#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma3')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
        'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
        int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma")));
internal_proto('matmul_name`);
#endif

#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
'define(`matmul_name',`matmul_'rtype_code`_avx128_fma4')dnl
`void
'matmul_name` ('rtype` * const restrict retarray,
        'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
        int blas_limit, blas_call gemm) __attribute__((__target__("avx,fma4")));
internal_proto('matmul_name`);
#endif

/* Function to fall back to if there is no special processor-specific version.  */
'define(`matmul_name',`matmul_'rtype_code`_vanilla')dnl
`static' include(matmul_internal.m4)dnl

`/* Compiling main function, with selection code for the processor.  */

/* Currently, this is i386 only.  Adjust for other architectures.  */

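/* The dispatcher below resolves the best variant once, on the first call,
   using the __cpu_model data provided by libgcc, and caches the result in
   the static pointer matmul_p.  The relaxed atomics only guarantee an
   atomic pointer update; at worst, concurrent first calls redo the
   (idempotent) CPU detection.  */
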
#include <config/i386/cpuinfo.h>
void matmul_'rtype_code` ('rtype` * const restrict retarray,
        'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
        int blas_limit, blas_call gemm)
{
  static void (*matmul_p) ('rtype` * const restrict retarray,
        'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
        int blas_limit, blas_call gemm);

  void (*matmul_fn) ('rtype` * const restrict retarray,
        'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
        int blas_limit, blas_call gemm);

  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
  if (matmul_fn == NULL)
    {
      matmul_fn = matmul_'rtype_code`_vanilla;
      if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
        {
          /* Run down the available processors in order of preference.  */
#ifdef HAVE_AVX512F
          if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
            {
              matmul_fn = matmul_'rtype_code`_avx512f;
              goto store;
            }

#endif  /* HAVE_AVX512F */

#ifdef HAVE_AVX2
          if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
              && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
            {
              matmul_fn = matmul_'rtype_code`_avx2;
              goto store;
            }

#endif

#ifdef HAVE_AVX
          if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
            {
              matmul_fn = matmul_'rtype_code`_avx;
              goto store;
            }
#endif  /* HAVE_AVX */
        }
      else if (__cpu_model.__cpu_vendor == VENDOR_AMD)
        {
#if defined(HAVE_AVX) && defined(HAVE_FMA3) && defined(HAVE_AVX128)
          if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
              && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
            {
              matmul_fn = matmul_'rtype_code`_avx128_fma3;
              goto store;
            }
#endif
#if defined(HAVE_AVX) && defined(HAVE_FMA4) && defined(HAVE_AVX128)
          if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
              && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA4)))
            {
              matmul_fn = matmul_'rtype_code`_avx128_fma4;
              goto store;
            }
#endif

        }
    store:
      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
    }

  (*matmul_fn) (retarray, a, b, try_blas, blas_limit, gemm);
}

#else  /* Just the vanilla function.  */

'define(`matmul_name',`matmul_'rtype_code)dnl
define(`target_attribute',`')dnl
include(matmul_internal.m4)dnl
`#endif
#endif
'