]>
Commit | Line | Data |
---|---|---|
a5544970 | 1 | /* Copyright (C) 2008-2019 Free Software Foundation, Inc. |
9bf85028 TS |
2 | |
3 | This file is free software; you can redistribute it and/or modify it under | |
4 | the terms of the GNU General Public License as published by the Free | |
748086b7 | 5 | Software Foundation; either version 3 of the License, or (at your option) |
9bf85028 TS |
6 | any later version. |
7 | ||
8 | This file is distributed in the hope that it will be useful, but WITHOUT | |
9 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
10 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
11 | for more details. | |
12 | ||
748086b7 JJ |
13 | Under Section 7 of GPL version 3, you are granted additional |
14 | permissions described in the GCC Runtime Library Exception, version | |
15 | 3.1, as published by the Free Software Foundation. | |
9bf85028 | 16 | |
748086b7 JJ |
17 | You should have received a copy of the GNU General Public License and |
18 | a copy of the GCC Runtime Library Exception along with this program; | |
19 | see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
20 | <http://www.gnu.org/licenses/>. */ | |
9bf85028 TS |
21 | |
22 | #include <spu_intrinsics.h> | |
23 | ||
24 | typedef int TItype __attribute__ ((mode (TI))); | |
25 | ||
526ed6c2 UW |
26 | union qword_TItype |
27 | { | |
28 | qword q; | |
29 | TItype t; | |
30 | }; | |
31 | ||
32 | inline static qword | |
33 | si_from_TItype (TItype t) | |
34 | { | |
35 | union qword_TItype u; | |
36 | u.t = t; | |
37 | return u.q; | |
38 | } | |
39 | ||
40 | inline static TItype | |
41 | si_to_TItype (qword q) | |
42 | { | |
43 | union qword_TItype u; | |
44 | u.q = q; | |
45 | return u.t; | |
46 | } | |
47 | ||
9bf85028 TS |
48 | /* A straight forward vectorization and unrolling of |
49 | * short l[8], r[8]; | |
50 | * TItype total = 0; | |
51 | * for (i = 0; i < 8; i++) | |
52 | * for (j = 0; j < 8; j++) | |
53 | * total += (TItype)((l[7-i] * r[7-j]) << (16 * (i + j))); | |
54 | */ | |
55 | TItype | |
56 | __multi3 (TItype l, TItype r) | |
57 | { | |
526ed6c2 UW |
58 | qword u = si_from_TItype (l); |
59 | qword v = si_from_TItype (r); | |
9bf85028 TS |
60 | qword splat0 = si_shufb (v, v, si_ilh (0x0001)); |
61 | qword splat1 = si_shufb (v, v, si_ilh (0x0203)); | |
62 | qword splat2 = si_shufb (v, v, si_ilh (0x0405)); | |
63 | qword splat3 = si_shufb (v, v, si_ilh (0x0607)); | |
64 | qword splat4 = si_shufb (v, v, si_ilh (0x0809)); | |
65 | qword splat5 = si_shufb (v, v, si_ilh (0x0a0b)); | |
66 | qword splat6 = si_shufb (v, v, si_ilh (0x0c0d)); | |
67 | qword splat7 = si_shufb (v, v, si_ilh (0x0e0f)); | |
68 | ||
69 | qword part0l = si_shlqbyi (si_mpyu (u, splat0), 14); | |
70 | qword part1h = si_shlqbyi (si_mpyhhu (u, splat1), 14); | |
71 | qword part1l = si_shlqbyi (si_mpyu (u, splat1), 12); | |
72 | qword part2h = si_shlqbyi (si_mpyhhu (u, splat2), 12); | |
73 | qword part2l = si_shlqbyi (si_mpyu (u, splat2), 10); | |
74 | qword part3h = si_shlqbyi (si_mpyhhu (u, splat3), 10); | |
75 | qword part3l = si_shlqbyi (si_mpyu (u, splat3), 8); | |
76 | qword part4h = si_shlqbyi (si_mpyhhu (u, splat4), 8); | |
77 | qword part4l = si_shlqbyi (si_mpyu (u, splat4), 6); | |
78 | qword part5h = si_shlqbyi (si_mpyhhu (u, splat5), 6); | |
79 | qword part5l = si_shlqbyi (si_mpyu (u, splat5), 4); | |
80 | qword part6h = si_shlqbyi (si_mpyhhu (u, splat6), 4); | |
81 | qword part6l = si_shlqbyi (si_mpyu (u, splat6), 2); | |
82 | qword part7h = si_shlqbyi (si_mpyhhu (u, splat7), 2); | |
83 | qword part7l = si_mpyu (u, splat7); | |
84 | ||
85 | qword carry, total0, total1, total2, total3, total4; | |
86 | qword total5, total6, total7, total8, total9, total10; | |
87 | qword total; | |
88 | ||
89 | total0 = si_a (si_a (si_a (part0l, part1h), si_a (part1l, part2h)), part7l); | |
90 | total1 = si_a (part2l, part3h); | |
91 | total2 = si_a (part3l, part4h); | |
92 | total3 = si_a (part4l, part5h); | |
93 | total4 = si_a (part5l, part6h); | |
94 | total5 = si_a (part6l, part7h); | |
95 | total6 = si_a (total0, total1); | |
96 | total7 = si_a (total2, total3); | |
97 | total8 = si_a (total4, total5); | |
98 | total9 = si_a (total6, total7); | |
99 | total10 = si_a (total8, total9); | |
100 | ||
101 | carry = si_cg (part2l, part3h); | |
102 | carry = si_a (carry, si_cg (part3l, part4h)); | |
103 | carry = si_a (carry, si_cg (part4l, part5h)); | |
104 | carry = si_a (carry, si_cg (part5l, part6h)); | |
105 | carry = si_a (carry, si_cg (part6l, part7h)); | |
106 | carry = si_a (carry, si_cg (total0, total1)); | |
107 | carry = si_a (carry, si_cg (total2, total3)); | |
108 | carry = si_a (carry, si_cg (total4, total5)); | |
109 | carry = si_a (carry, si_cg (total6, total7)); | |
110 | carry = si_a (carry, si_cg (total8, total9)); | |
111 | carry = si_shlqbyi (carry, 4); | |
112 | ||
113 | total = si_cg (total10, carry); | |
114 | total = si_shlqbyi (total, 4); | |
115 | total = si_cgx (total10, carry, total); | |
116 | total = si_shlqbyi (total, 4); | |
117 | total = si_addx (total10, carry, total); | |
526ed6c2 | 118 | return si_to_TItype (total); |
9bf85028 | 119 | } |