SeqAn3 3.3.0-rc.1
The Modern C++ library for sequence analysis.
simd_algorithm_avx512.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <array>
16
21
22//-----------------------------------------------------------------------------
23// forward declare avx512 simd algorithms that use avx512 intrinsics
24//-----------------------------------------------------------------------------
25
26namespace seqan3::detail
27{
31template <simd::simd_concept simd_t>
32constexpr simd_t load_avx512(void const * mem_addr);
33
37template <simd::simd_concept simd_t>
38constexpr void store_avx512(void * mem_addr, simd_t const & simd_vec);
39
43template <simd::simd_concept simd_t>
45
49template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
50constexpr target_simd_t upcast_signed_avx512(source_simd_t const & src);
51
55template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
56constexpr target_simd_t upcast_unsigned_avx512(source_simd_t const & src);
57
61template <uint8_t index, simd::simd_concept simd_t>
62constexpr simd_t extract_half_avx512(simd_t const & src);
63
67template <uint8_t index, simd::simd_concept simd_t>
68constexpr simd_t extract_quarter_avx512(simd_t const & src);
69
73template <uint8_t index, simd::simd_concept simd_t>
74constexpr simd_t extract_eighth_avx512(simd_t const & src);
75
76} // namespace seqan3::detail
77
78//-----------------------------------------------------------------------------
79// implementation
80//-----------------------------------------------------------------------------
81
82#ifdef __AVX512F__
83
84namespace seqan3::detail
85{
86
87template <simd::simd_concept simd_t>
88constexpr simd_t load_avx512(void const * mem_addr)
89{
90 return reinterpret_cast<simd_t>(_mm512_loadu_si512(mem_addr));
91}
92
93template <simd::simd_concept simd_t>
94constexpr void store_avx512(void * mem_addr, simd_t const & simd_vec)
95{
96 _mm512_storeu_si512(mem_addr, reinterpret_cast<__m512i const &>(simd_vec));
97}
98
99# if defined(__AVX512BW__)
100template <simd::simd_concept simd_t>
101inline void transpose_matrix_avx512(std::array<simd_t, simd_traits<simd_t>::length> & matrix)
102{
103 // Transposing a 64x64 byte matrix in 6x64 instructions using AVX512 intrinsics.
104 // Step 1: Unpack 8-bit operands.
105
106 // Note: _mm512_unpack* operates on 128-bit lanes, i.e. the following pattern applies:
107 // * for lower half ('lo')
108 // d[127:0] = interleave(a[63:0], b[63:0])
109 // d[255:128] = interleave(a[191:128], b[191:128])
110 // d[383:256] = interleave(a[319:256], b[319:256])
111 // d[511:384] = interleave(a[447:384], b[447:384])
112 // * for higher half ('hi')
113 // d[127:0] = interleave(a[127:64], b[127:64])
114 // d[255:128] = interleave(a[255:192], b[255:192])
115 // d[383:256] = interleave(a[319:320], b[383:320])
116 // d[511:384] = interleave(a[511:448], b[511:448])
117 __m512i tmp1[64];
118 for (int i = 0; i < 32; ++i)
119 {
120 tmp1[i] = _mm512_unpacklo_epi8(reinterpret_cast<__m512i const &>(matrix[2 * i]),
121 reinterpret_cast<__m512i const &>(matrix[2 * i + 1]));
122 tmp1[i + 32] = _mm512_unpackhi_epi8(reinterpret_cast<__m512i const &>(matrix[2 * i]),
123 reinterpret_cast<__m512i const &>(matrix[2 * i + 1]));
124 }
125
126 // Step 2: Unpack 16-bit operands.
127 __m512i tmp2[64];
128 for (int i = 0; i < 32; ++i)
129 {
130 tmp2[i] = _mm512_unpacklo_epi16(tmp1[2 * i], tmp1[2 * i + 1]);
131 tmp2[i + 32] = _mm512_unpackhi_epi16(tmp1[2 * i], tmp1[2 * i + 1]);
132 }
133 // Step 3: Unpack 32-bit operands.
134 for (int i = 0; i < 32; ++i)
135 {
136 tmp1[i] = _mm512_unpacklo_epi32(tmp2[2 * i], tmp2[2 * i + 1]);
137 tmp1[i + 32] = _mm512_unpackhi_epi32(tmp2[2 * i], tmp2[2 * i + 1]);
138 }
139 // Step 4: Unpack 64-bit operands.
140 for (int i = 0; i < 32; ++i)
141 {
142 tmp2[i] = _mm512_unpacklo_epi64(tmp1[2 * i], tmp1[2 * i + 1]);
143 tmp2[i + 32] = _mm512_unpackhi_epi64(tmp1[2 * i], tmp1[2 * i + 1]);
144 }
145
146 // Step 5: Unpack 128-bit operands.
147 // Helper function to emulate unpack of 128-bit across lanes using _mm512_permutex2var_epi64.
148 auto _mm512_unpacklo_epi128 = [](__m512i const & a, __m512i const & b)
149 {
150 constexpr std::array<uint64_t, 8> lo_mask{0, 1, 8, 9, 2, 3, 10, 11};
151 return _mm512_permutex2var_epi64(a, reinterpret_cast<__m512i const &>(*lo_mask.data()), b);
152 };
153
154 auto _mm521_unpackhi_epi128 = [](__m512i const & a, __m512i const & b)
155 {
156 constexpr std::array<uint64_t, 8> hi_mask{4, 5, 12, 13, 6, 7, 14, 15};
157 return _mm512_permutex2var_epi64(a, reinterpret_cast<__m512i const &>(*hi_mask.data()), b);
158 };
159
160 for (int i = 0; i < 32; ++i)
161 {
162 tmp1[i] = _mm512_unpacklo_epi128(tmp2[2 * i], tmp2[2 * i + 1]);
163 tmp1[i + 32] = _mm521_unpackhi_epi128(tmp2[2 * i], tmp2[2 * i + 1]);
164 }
165 // Step 6: Unpack 128-bit operands.
166 // Helper function to emulate unpack of 256-bit across lanes using _mm512_shuffle_i64x2.
167 auto _mm512_unpacklo_epi256 = [](__m512i const & a, __m512i const & b)
168 {
169 return _mm512_shuffle_i64x2(a, b, 0b0100'0100);
170 };
171
172 auto _mm521_unpackhi_epi256 = [](__m512i const & a, __m512i const & b)
173 {
174 return _mm512_shuffle_i64x2(a, b, 0b1110'1110);
175 };
176
177 // A look-up table to place the final transposed vector to the correct position in the
178 // original matrix.
179 // clang-format off
180 constexpr std::array<uint32_t, 64> reverse_idx_mask{
181 // 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
182 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
183 // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
184 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
185 // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
186 32, 48, 40, 56, 36, 52, 44, 60, 34, 50, 42, 58, 38, 54, 46, 62,
187 // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
188 33, 49, 41, 57, 37, 53, 45, 61, 35, 51, 43, 59, 39, 55, 47, 63};
189 // clang-format on
190
191 for (int i = 0; i < 32; ++i)
192 {
193 int const idx = i * 2;
194 matrix[reverse_idx_mask[idx]] = reinterpret_cast<simd_t>(_mm512_unpacklo_epi256(tmp1[idx], tmp1[idx + 1]));
195 matrix[reverse_idx_mask[idx + 1]] = reinterpret_cast<simd_t>(_mm521_unpackhi_epi256(tmp1[idx], tmp1[idx + 1]));
196 }
197}
198# endif // defined(__AVX512BW__)
199
200template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
201constexpr target_simd_t upcast_signed_avx512(source_simd_t const & src)
202{
203 __m512i const & tmp = reinterpret_cast<__m512i const &>(src);
204 if constexpr (simd_traits<source_simd_t>::length == 64) // cast from epi8 ...
205 {
206 if constexpr (simd_traits<target_simd_t>::length == 32) // to epi16
207 return reinterpret_cast<target_simd_t>(_mm512_cvtepi8_epi16(_mm512_castsi512_si256(tmp)));
208 if constexpr (simd_traits<target_simd_t>::length == 16) // to epi32
209 return reinterpret_cast<target_simd_t>(_mm512_cvtepi8_epi32(_mm512_castsi512_si128(tmp)));
210 if constexpr (simd_traits<target_simd_t>::length == 8) // to epi64
211 return reinterpret_cast<target_simd_t>(_mm512_cvtepi8_epi64(_mm512_castsi512_si128(tmp)));
212 }
213 else if constexpr (simd_traits<source_simd_t>::length == 32) // cast from epi16 ...
214 {
215 if constexpr (simd_traits<target_simd_t>::length == 16) // to epi32
216 return reinterpret_cast<target_simd_t>(_mm512_cvtepi16_epi32(_mm512_castsi512_si256(tmp)));
217 if constexpr (simd_traits<target_simd_t>::length == 8) // to epi64
218 return reinterpret_cast<target_simd_t>(_mm512_cvtepi16_epi64(_mm512_castsi512_si128(tmp)));
219 }
220 else // cast from epi32 to epi64
221 {
222 static_assert(simd_traits<source_simd_t>::length == 16, "Expected 32 bit scalar type.");
223 return reinterpret_cast<target_simd_t>(_mm512_cvtepi32_epi64(_mm512_castsi512_si256(tmp)));
224 }
225}
226
227template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
228constexpr target_simd_t upcast_unsigned_avx512(source_simd_t const & src)
229{
230 __m512i const & tmp = reinterpret_cast<__m512i const &>(src);
231 if constexpr (simd_traits<source_simd_t>::length == 64) // cast from epi8 ...
232 {
233 if constexpr (simd_traits<target_simd_t>::length == 32) // to epi16
234 return reinterpret_cast<target_simd_t>(_mm512_cvtepu8_epi16(_mm512_castsi512_si256(tmp)));
235 if constexpr (simd_traits<target_simd_t>::length == 16) // to epi32
236 return reinterpret_cast<target_simd_t>(_mm512_cvtepu8_epi32(_mm512_castsi512_si128(tmp)));
237 if constexpr (simd_traits<target_simd_t>::length == 8) // to epi64
238 return reinterpret_cast<target_simd_t>(_mm512_cvtepu8_epi64(_mm512_castsi512_si128(tmp)));
239 }
240 else if constexpr (simd_traits<source_simd_t>::length == 32) // cast from epi16 ...
241 {
242 if constexpr (simd_traits<target_simd_t>::length == 16) // to epi32
243 return reinterpret_cast<target_simd_t>(_mm512_cvtepu16_epi32(_mm512_castsi512_si256(tmp)));
244 if constexpr (simd_traits<target_simd_t>::length == 8) // to epi64
245 return reinterpret_cast<target_simd_t>(_mm512_cvtepu16_epi64(_mm512_castsi512_si128(tmp)));
246 }
247 else // cast from epi32 to epi64
248 {
249 static_assert(simd_traits<source_simd_t>::length == 16, "Expected 32 bit scalar type.");
250 return reinterpret_cast<target_simd_t>(_mm512_cvtepu32_epi64(_mm512_castsi512_si256(tmp)));
251 }
252}
253
254template <uint8_t index, simd::simd_concept simd_t>
255constexpr simd_t extract_half_avx512(simd_t const & src)
256{
257 return reinterpret_cast<simd_t>(
258 _mm512_castsi256_si512(_mm512_extracti64x4_epi64(reinterpret_cast<__m512i const &>(src), index)));
259}
260
261# if defined(__AVX512DQ__)
262template <uint8_t index, simd::simd_concept simd_t>
263constexpr simd_t extract_quarter_avx512(simd_t const & src)
264{
265 return reinterpret_cast<simd_t>(
266 _mm512_castsi128_si512(_mm512_extracti64x2_epi64(reinterpret_cast<__m512i const &>(src), index)));
267}
268
269template <uint8_t index, simd::simd_concept simd_t>
270constexpr simd_t extract_eighth_avx512(simd_t const & src)
271{
272 __m512i tmp = reinterpret_cast<__m512i const &>(src);
273
274 // for uneven index exchange higher 64 bits with lower 64 bits for each 128 bit lane.
275 if constexpr (index % 2 == 1)
276 tmp = _mm512_shuffle_epi32(tmp, 0b0100'1110); // := [1, 0, 3, 2].
277
278 return reinterpret_cast<simd_t>(_mm512_castsi128_si512(_mm512_extracti64x2_epi64(tmp, index / 2)));
279}
280# endif // defined(__AVX512DQ__)
281
282} // namespace seqan3::detail
283
284#endif // __AVX512F__
Provides seqan3::detail::builtin_simd, seqan3::detail::is_builtin_simd and seqan3::simd::simd_traits<...
Provides intrinsics include for builtin simd.
Defines the requirements of a matrix (e.g. score matrices, trace matrices).
The internal SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
void transpose_matrix_avx512(std::array< simd_t, simd_traits< simd_t >::length > &matrix)
Transposes the given simd vector matrix.
constexpr simd_t extract_half_avx512(simd_t const &src)
Extracts one half of the given simd vector and stores the result in the lower half of the target vect...
constexpr simd_t load_avx512(void const *mem_addr)
Load simd_t size bits of integral data from memory.
constexpr target_simd_t upcast_signed_avx512(source_simd_t const &src)
Upcasts the given vector into the target vector using signed extension of packed values.
constexpr void store_avx512(void *mem_addr, simd_t const &simd_vec)
Store simd_t size bits of integral data into memory.
constexpr simd_t extract_quarter_avx512(simd_t const &src)
Extracts one quarter of the given simd vector and stores it in the lower quarter of the target vector...
constexpr target_simd_t upcast_unsigned_avx512(source_simd_t const &src)
Upcasts the given vector into the target vector using unsigned extension of packed values.
constexpr simd_t extract_eighth_avx512(simd_t const &src)
Extracts one eighth of the given simd vector and stores it in the lower eighth of the target vector.
Provides seqan3::simd::simd_traits.
seqan3::simd::simd_traits is the trait class that provides uniform interface to the properties of sim...
Definition: simd_traits.hpp:41
Provides seqan3::simd::simd_concept.