amd_buffer_addressing.hpp Source File

amd_buffer_addressing.hpp Source File#

Composable Kernel: amd_buffer_addressing.hpp Source File
utility/amd_buffer_addressing.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3
4#pragma once
5#include "data_type.hpp"
6
7namespace ck {
8
9template <typename T>
11{
12 __device__ constexpr BufferResource() : content{} {}
13
14 // 128 bit SGPRs to supply buffer resource in buffer instructions
15 // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
20};
21
22template <typename T>
23__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t element_space_size)
24{
25 BufferResource<T> wave_buffer_resource;
26
27 // wavewise base address (64 bit)
28 wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
29 // wavewise range (32 bit)
30 wave_buffer_resource.range(Number<2>{}) = element_space_size * sizeof(T);
31 // wavewise setting (32 bit)
32 wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;
33
34 return wave_buffer_resource.content;
35}
36
37template <typename T>
39{
40 BufferResource<T> wave_buffer_resource;
41
42 // wavewise base address (64 bit)
43 wave_buffer_resource.address(Number<0>{}) = const_cast<remove_cv_t<T>*>(p_wave);
44 // wavewise range (32 bit)
45 wave_buffer_resource.range(Number<2>{}) = 0xffffffff; // max possible range
46 // wavewise setting (32 bit)
47 wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD;
48
49 return wave_buffer_resource.content;
50}
51
52// buffer load i8
53__device__ int8_t
55 index_t voffset,
56 index_t soffset,
57 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8");
58
59__device__ int8x2_t
61 index_t voffset,
62 index_t soffset,
63 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8");
64
65__device__ int8x4_t
67 index_t voffset,
68 index_t soffset,
69 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8");
70
71// buffer load i16
72__device__ bhalf_t
74 index_t voffset,
75 index_t soffset,
76 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16");
77
78__device__ bhalf2_t
80 index_t voffset,
81 index_t soffset,
82 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16");
83
84__device__ bhalf4_t
86 index_t voffset,
87 index_t soffset,
88 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16");
89
90// buffer load i32
91__device__ int32_t
93 index_t voffset,
94 index_t soffset,
95 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32");
96
97__device__ int32x2_t
99 index_t voffset,
100 index_t soffset,
101 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32");
102
103__device__ int32x4_t
105 index_t voffset,
106 index_t soffset,
107 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32");
108
109// buffer load fp16
110__device__ half_t
112 index_t voffset,
113 index_t soffset,
114 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16");
115
116__device__ half2_t
118 index_t voffset,
119 index_t soffset,
120 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16");
121
122__device__ half4_t
124 index_t voffset,
125 index_t soffset,
126 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16");
127
128// buffer load fp32
129__device__ float
131 index_t voffset,
132 index_t soffset,
133 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32");
134
135__device__ float2_t
137 index_t voffset,
138 index_t soffset,
139 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32");
140
141__device__ float4_t
143 index_t voffset,
144 index_t soffset,
145 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32");
146
147// buffer store i8
148__device__ void
150 int32x4_t rsrc,
151 index_t voffset,
152 index_t soffset,
153 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8");
154
155__device__ void
157 int32x4_t rsrc,
158 index_t voffset,
159 index_t soffset,
160 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8");
161
162__device__ void
164 int32x4_t rsrc,
165 index_t voffset,
166 index_t soffset,
167 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8");
168
169// buffer store i16
170__device__ void
172 int32x4_t rsrc,
173 index_t voffset,
174 index_t soffset,
175 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16");
176
177__device__ void
179 int32x4_t rsrc,
180 index_t voffset,
181 index_t soffset,
182 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16");
183
184__device__ void
186 int32x4_t rsrc,
187 index_t voffset,
188 index_t soffset,
189 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16");
190
191// buffer store i32
192__device__ void
194 int32x4_t rsrc,
195 index_t voffset,
196 index_t soffset,
197 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32");
198
199__device__ void
201 int32x4_t rsrc,
202 index_t voffset,
203 index_t soffset,
204 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32");
205
206__device__ void
208 int32x4_t rsrc,
209 index_t voffset,
210 index_t soffset,
211 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32");
212
213// buffer store fp16
214__device__ void
216 int32x4_t rsrc,
217 index_t voffset,
218 index_t soffset,
219 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16");
220
221__device__ void
223 int32x4_t rsrc,
224 index_t voffset,
225 index_t soffset,
226 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16");
227
228__device__ void
230 int32x4_t rsrc,
231 index_t voffset,
232 index_t soffset,
233 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16");
234
235// buffer store fp32
236__device__ void
238 int32x4_t rsrc,
239 index_t voffset,
240 index_t soffset,
241 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32");
242
243__device__ void
245 int32x4_t rsrc,
246 index_t voffset,
247 index_t soffset,
248 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32");
249
250__device__ void
252 int32x4_t rsrc,
253 index_t voffset,
254 index_t soffset,
255 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32");
256
257// buffer atomic-add fp16
259 half2_t vdata,
260 int32x4_t rsrc,
261 index_t voffset,
262 index_t soffset,
263 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16");
264
265// buffer atomic-add i32
267 int32_t vdata,
268 int32x4_t rsrc,
269 index_t voffset,
270 index_t soffset,
271 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32");
272
273// buffer atomic-add fp32
275 float vdata,
276 int32x4_t rsrc,
277 index_t voffset,
278 index_t soffset,
279 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32");
280
281// buffer atomic-add fp32
282__device__ double
284 int32x4_t rsrc, // dst_wave_buffer_resource
285 int voffset, // dst_thread_addr_offset
286 int soffset, // dst_wave_addr_offset
287 int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");
288
289// memory coherency bit for buffer store/load instruction
290// check ISA manual for each GFX target
291// e.g. for
292// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
293// page 67~68
295{
296 DefaultCoherence = 0, // default value
297 GLC = 1,
298 SLC = 2,
300 // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
301 // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
302 // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
311};
312
313template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
314__device__ typename vector_type<int8_t, N>::type
315amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource,
316 index_t src_thread_addr_offset,
317 index_t src_wave_addr_offset)
318{
319 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
320 "wrong! not implemented");
321
322 if constexpr(N == 1)
323 {
324 return llvm_amdgcn_raw_buffer_load_i8(src_wave_buffer_resource,
325 src_thread_addr_offset,
326 src_wave_addr_offset,
327 static_cast<index_t>(coherence));
328 }
329 else if constexpr(N == 2)
330 {
331
332 int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource,
333 src_thread_addr_offset,
334 src_wave_addr_offset,
335 static_cast<index_t>(coherence));
336
337 return bit_cast<int8x2_t>(tmp);
338 }
339 else if constexpr(N == 4)
340 {
341 int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(src_wave_buffer_resource,
342 src_thread_addr_offset,
343 src_wave_addr_offset,
344 static_cast<index_t>(coherence));
345
346 return bit_cast<int8x4_t>(tmp);
347 }
348 else if constexpr(N == 8)
349 {
350 int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(src_wave_buffer_resource,
351 src_thread_addr_offset,
352 src_wave_addr_offset,
353 static_cast<index_t>(coherence));
354
355 return bit_cast<int8x8_t>(tmp);
356 }
357 else if constexpr(N == 16)
358 {
359 int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
360 src_thread_addr_offset,
361 src_wave_addr_offset,
362 static_cast<index_t>(coherence));
363 return bit_cast<int8x16_t>(tmp);
364 }
365 else if constexpr(N == 32)
366 {
367 int32x4_t tmp0 = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
368 src_thread_addr_offset,
369 src_wave_addr_offset,
370 static_cast<index_t>(coherence));
371 int32x4_t tmp1 =
372 llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
373 src_thread_addr_offset,
374 src_wave_addr_offset + 4 * sizeof(int32_t),
375 static_cast<index_t>(coherence));
377
378 tmp.AsType<int32x4_t>()(Number<0>{}) = tmp0;
379 tmp.AsType<int32x4_t>()(Number<1>{}) = tmp1;
380
381 return bit_cast<int8x32_t>(tmp);
382 }
383 else if constexpr(N == 64)
384 {
385 int32x4_t tmp0 = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
386 src_thread_addr_offset,
387 src_wave_addr_offset,
388 static_cast<index_t>(coherence));
389 int32x4_t tmp1 =
390 llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
391 src_thread_addr_offset,
392 src_wave_addr_offset + 4 * sizeof(int32_t),
393 static_cast<index_t>(coherence));
394 int32x4_t tmp2 =
395 llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
396 src_thread_addr_offset,
397 src_wave_addr_offset + 8 * sizeof(int32_t),
398 static_cast<index_t>(coherence));
399 int32x4_t tmp3 =
400 llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
401 src_thread_addr_offset,
402 src_wave_addr_offset + 12 * sizeof(int32_t),
403 static_cast<index_t>(coherence));
404
406
407 tmp.AsType<int32x4_t>()(Number<0>{}) = tmp0;
408 tmp.AsType<int32x4_t>()(Number<1>{}) = tmp1;
409 tmp.AsType<int32x4_t>()(Number<2>{}) = tmp2;
410 tmp.AsType<int32x4_t>()(Number<3>{}) = tmp3;
411
412 return bit_cast<int8x64_t>(tmp);
413 }
414}
415
416template <typename T,
417 index_t N,
419__device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource,
420 index_t src_thread_addr_offset,
421 index_t src_wave_addr_offset)
422{
423 static_assert(
424 (is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
425 (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
426 (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
427 (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
428 (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
429 (is_same<T, f8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
430 (is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
431 (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
432 (is_same<T, uint8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
433 (is_same<T, pk_i4_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
435 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
436 "wrong! not implemented");
437
438 using r_t = typename vector_type<T, N>::type;
440 src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);
441 return bit_cast<r_t>(raw_data);
442}
443
444template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
445__device__ void
447 int32x4_t dst_wave_buffer_resource,
448 index_t dst_thread_addr_offset,
449 index_t dst_wave_addr_offset)
450{
451 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
452 "wrong! not implemented");
453
454 if constexpr(N == 1)
455 {
456 llvm_amdgcn_raw_buffer_store_i8(src_thread_data,
457 dst_wave_buffer_resource,
458 dst_thread_addr_offset,
459 dst_wave_addr_offset,
460 static_cast<index_t>(coherence));
461 }
462 else if constexpr(N == 2)
463 {
464
466 dst_wave_buffer_resource,
467 dst_thread_addr_offset,
468 dst_wave_addr_offset,
469 static_cast<index_t>(coherence));
470 }
471 else if constexpr(N == 4)
472 {
474 dst_wave_buffer_resource,
475 dst_thread_addr_offset,
476 dst_wave_addr_offset,
477 static_cast<index_t>(coherence));
478 }
479 else if constexpr(N == 8)
480 {
482 dst_wave_buffer_resource,
483 dst_thread_addr_offset,
484 dst_wave_addr_offset,
485 static_cast<index_t>(coherence));
486 }
487 else if constexpr(N == 16)
488 {
490 dst_wave_buffer_resource,
491 dst_thread_addr_offset,
492 dst_wave_addr_offset,
493 static_cast<index_t>(coherence));
494 }
495 else if constexpr(N == 32)
496 {
497 vector_type<int32_t, 8> tmp{bit_cast<int32x8_t>(src_thread_data)};
498
499 llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<0>{}],
500 dst_wave_buffer_resource,
501 dst_thread_addr_offset,
502 dst_wave_addr_offset,
503 static_cast<index_t>(coherence));
504
505 llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<1>{}],
506 dst_wave_buffer_resource,
507 dst_thread_addr_offset,
508 dst_wave_addr_offset + sizeof(int32_t) * 4,
509 static_cast<index_t>(coherence));
510 }
511 else if constexpr(N == 64)
512 {
514
515 llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<0>{}],
516 dst_wave_buffer_resource,
517 dst_thread_addr_offset,
518 dst_wave_addr_offset,
519 static_cast<index_t>(coherence));
520
521 llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<1>{}],
522 dst_wave_buffer_resource,
523 dst_thread_addr_offset,
524 dst_wave_addr_offset + sizeof(int32_t) * 4,
525 static_cast<index_t>(coherence));
526
527 llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<2>{}],
528 dst_wave_buffer_resource,
529 dst_thread_addr_offset,
530 dst_wave_addr_offset + sizeof(int32_t) * 8,
531 static_cast<index_t>(coherence));
532
533 llvm_amdgcn_raw_buffer_store_i32x4(tmp.template AsType<int32x4_t>()[Number<3>{}],
534 dst_wave_buffer_resource,
535 dst_thread_addr_offset,
536 dst_wave_addr_offset + sizeof(int32_t) * 12,
537 static_cast<index_t>(coherence));
538 }
539}
540
541template <typename T,
542 index_t N,
544__device__ void amd_buffer_store_impl(const typename vector_type<T, N>::type src_thread_data,
545 int32x4_t dst_wave_buffer_resource,
546 index_t dst_thread_addr_offset,
547 index_t dst_wave_addr_offset)
548{
549 static_assert(
550 (is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
551 (is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
552 (is_same<T, half_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
553 (is_same<T, bhalf_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
554 (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
555 (is_same<T, f8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
556 (is_same<T, bf8_fnuz_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
558 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
559 (is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
560 "wrong! not implemented");
561
562 using r_t = typename vector_type<int8_t, sizeof(T) * N>::type;
563
565 dst_wave_buffer_resource,
566 dst_thread_addr_offset,
567 dst_wave_addr_offset);
568}
569
570template <typename T, index_t N>
571__device__ void amd_global_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
572 T* addr)
573{
574 static_assert((is_same<T, bhalf_t>::value && (N == 2 || N == 4 || N == 8)) ||
575 (is_same<T, half_t>::value && (N == 2 || N == 4 || N == 8)),
576 "wrong! not implemented");
577
578 if constexpr(is_same<T, half_t>::value)
579 {
580 vector_type<half_t, N> tmp{src_thread_data};
581 static_for<0, N / 2, 1>{}([&](auto i) {
582 __builtin_amdgcn_global_atomic_fadd_v2f16(bit_cast<half2_t*>(addr) + i,
583 tmp.template AsType<half2_t>()[i]);
584 });
585 }
586#if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx12__)
587 else if constexpr(is_same<T, bhalf_t>::value)
588 {
589 vector_type<bhalf_t, N> tmp{src_thread_data};
590 static_for<0, N / 2, 1>{}([&](auto i) {
591 __builtin_amdgcn_global_atomic_fadd_v2bf16(bit_cast<bhalf2_t*>(addr) + i,
592 tmp.template AsType<bhalf2_t>()[i]);
593 });
594 }
595#endif
596}
597
598template <typename T, index_t N>
599__device__ void amd_buffer_atomic_add_impl(const typename vector_type<T, N>::type src_thread_data,
600 int32x4_t dst_wave_buffer_resource,
601 index_t dst_thread_addr_offset,
602 index_t dst_wave_addr_offset)
603{
604 static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
605 (is_same<T, half_t>::value && (N == 2 || N == 4 || N == 8)) ||
606 (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)),
607 "wrong! not implemented");
608
609 if constexpr(is_same<T, float>::value)
610 {
611 if constexpr(N == 1)
612 {
614 dst_wave_buffer_resource,
615 dst_thread_addr_offset,
616 dst_wave_addr_offset,
617 0);
618 }
619 else if constexpr(N == 2)
620 {
621 vector_type<float, 2> tmp{src_thread_data};
622
624 dst_wave_buffer_resource,
625 dst_thread_addr_offset,
626 dst_wave_addr_offset,
627 0);
628
630 dst_wave_buffer_resource,
631 dst_thread_addr_offset,
632 dst_wave_addr_offset + sizeof(float),
633 0);
634 }
635 else if constexpr(N == 4)
636 {
637 vector_type<float, 4> tmp{src_thread_data};
638
640 dst_wave_buffer_resource,
641 dst_thread_addr_offset,
642 dst_wave_addr_offset,
643 0);
644
646 dst_wave_buffer_resource,
647 dst_thread_addr_offset,
648 dst_wave_addr_offset + sizeof(float),
649 0);
650
652 dst_wave_buffer_resource,
653 dst_thread_addr_offset,
654 dst_wave_addr_offset + 2 * sizeof(float),
655 0);
656
658 dst_wave_buffer_resource,
659 dst_thread_addr_offset,
660 dst_wave_addr_offset + 3 * sizeof(float),
661 0);
662 }
663 }
664 else if constexpr(is_same<T, half_t>::value)
665 {
666 if constexpr(N == 2)
667 {
669 dst_wave_buffer_resource,
670 dst_thread_addr_offset,
671 dst_wave_addr_offset,
672 0);
673 }
674 else if constexpr(N == 4)
675 {
676 vector_type<half_t, 4> tmp{src_thread_data};
677
678 static_for<0, 2, 1>{}([&](auto i) {
680 dst_wave_buffer_resource,
681 dst_thread_addr_offset,
682 dst_wave_addr_offset + i * sizeof(half2_t),
683 0);
684 });
685 }
686 else if constexpr(N == 8)
687 {
688 vector_type<half_t, 8> tmp{src_thread_data};
689
690 static_for<0, 4, 1>{}([&](auto i) {
692 dst_wave_buffer_resource,
693 dst_thread_addr_offset,
694 dst_wave_addr_offset + i * sizeof(half2_t),
695 0);
696 });
697 }
698 }
699 else if constexpr(is_same<T, int32_t>::value)
700 {
701 if constexpr(N == 1)
702 {
704 dst_wave_buffer_resource,
705 dst_thread_addr_offset,
706 dst_wave_addr_offset,
707 0);
708 }
709 else if constexpr(N == 2)
710 {
711 vector_type<int32_t, 2> tmp{src_thread_data};
712
714 dst_wave_buffer_resource,
715 dst_thread_addr_offset,
716 dst_wave_addr_offset,
717 0);
718
720 dst_wave_buffer_resource,
721 dst_thread_addr_offset,
722 dst_wave_addr_offset + sizeof(int32_t),
723 0);
724 }
725 else if constexpr(N == 4)
726 {
727 vector_type<int32_t, 4> tmp{src_thread_data};
728
730 dst_wave_buffer_resource,
731 dst_thread_addr_offset,
732 dst_wave_addr_offset,
733 0);
734
736 dst_wave_buffer_resource,
737 dst_thread_addr_offset,
738 dst_wave_addr_offset + sizeof(int32_t),
739 0);
740
742 dst_wave_buffer_resource,
743 dst_thread_addr_offset,
744 dst_wave_addr_offset + 2 * sizeof(int32_t),
745 0);
746
748 dst_wave_buffer_resource,
749 dst_thread_addr_offset,
750 dst_wave_addr_offset + 3 * sizeof(int32_t),
751 0);
752 }
753 }
754}
755
756template <typename T, index_t N>
757__device__ void amd_buffer_atomic_max_impl(const typename vector_type<T, N>::type src_thread_data,
758 int32x4_t dst_wave_buffer_resource,
759 index_t dst_thread_addr_offset,
760 index_t dst_wave_addr_offset)
761{
762 static_assert((is_same<T, double>::value && (N == 1 || N == 2 || N == 4)),
763 "wrong! not implemented");
764 if constexpr(is_same<T, double>::value)
765 {
766 if constexpr(N == 1)
767 {
769 dst_wave_buffer_resource,
770 dst_thread_addr_offset,
771 dst_wave_addr_offset,
772 0);
773 }
774 else if constexpr(N == 2)
775 {
776 vector_type<double, 2> tmp{src_thread_data};
777
779 dst_wave_buffer_resource,
780 dst_thread_addr_offset,
781 dst_wave_addr_offset,
782 0);
783
785 dst_wave_buffer_resource,
786 dst_thread_addr_offset,
787 dst_wave_addr_offset + sizeof(double),
788 0);
789 }
790 else if constexpr(N == 4)
791 {
792 vector_type<double, 4> tmp{src_thread_data};
793
795 dst_wave_buffer_resource,
796 dst_thread_addr_offset,
797 dst_wave_addr_offset,
798 0);
799
801 dst_wave_buffer_resource,
802 dst_thread_addr_offset,
803 dst_wave_addr_offset + sizeof(double),
804 0);
805
807 dst_wave_buffer_resource,
808 dst_thread_addr_offset,
809 dst_wave_addr_offset + 2 * sizeof(double),
810 0);
811
813 dst_wave_buffer_resource,
814 dst_thread_addr_offset,
815 dst_wave_addr_offset + 3 * sizeof(double),
816 0);
817 }
818 }
819}
820
821// buffer_load requires:
822// 1) p_src_wave must point to global memory space
823// 2) p_src_wave must be a wavewise pointer.
824// It is user's responsibility to make sure that is true.
825template <typename T,
826 index_t N,
828__device__ typename vector_type_maker<T, N>::type::type
830 index_t src_thread_element_offset,
831 bool src_thread_element_valid,
832 index_t src_element_space_size)
833{
834 const int32x4_t src_wave_buffer_resource =
835 make_wave_buffer_resource(p_src_wave, src_element_space_size);
836
837 index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
838
839 using vector_t = typename vector_type_maker<T, N>::type::type;
840 using scalar_t = typename scalar_type<vector_t>::type;
841
842 constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
843
844#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
845 uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000;
847 src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
848
849#else
850
852 src_wave_buffer_resource, src_thread_addr_offset, 0)};
853 return src_thread_element_valid ? tmp : vector_t(0);
854#endif
855}
856
857// buffer_load requires:
858// 1) p_src_wave must point to global memory space
859// 2) p_src_wave must be a wavewise pointer.
860// It is user's responsibility to make sure that is true.
861template <typename T,
862 index_t N,
864__device__ typename vector_type_maker<T, N>::type::type
866 index_t src_thread_element_offset,
867 bool src_thread_element_valid,
868 index_t src_element_space_size,
869 T customized_value)
870{
871 const int32x4_t src_wave_buffer_resource =
872 make_wave_buffer_resource(p_src_wave, src_element_space_size);
873
874 index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
875
876 using vector_t = typename vector_type_maker<T, N>::type::type;
877 using scalar_t = typename scalar_type<vector_t>::type;
878
879 constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
880
882 src_wave_buffer_resource, src_thread_addr_offset, 0)};
883
884 return src_thread_element_valid ? tmp : vector_t(customized_value);
885}
886
887// buffer_store requires:
888// 1) p_dst_wave must point to global memory
889// 2) p_dst_wave must be a wavewise pointer.
890// It is user's responsibility to make sure that is true.
891template <typename T,
892 index_t N,
894__device__ void amd_buffer_store(const typename vector_type_maker<T, N>::type::type src_thread_data,
895 T* p_dst_wave,
896 const index_t dst_thread_element_offset,
897 const bool dst_thread_element_valid,
898 const index_t dst_element_space_size)
899{
900 const int32x4_t dst_wave_buffer_resource =
901 make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
902
903 index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
904
905 using vector_t = typename vector_type_maker<T, N>::type::type;
906 using scalar_t = typename scalar_type<vector_t>::type;
907 constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
908
909#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
910 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
912 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
913#else
914 if(dst_thread_element_valid)
915 {
917 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
918 }
919#endif
920}
921
922// buffer_atomic_add requires:
923// 1) p_dst_wave must point to global memory
924// 2) p_dst_wave must be a wavewise pointer.
925// It is user's responsibility to make sure that is true.
926template <typename T, index_t N>
927__device__ void
929 T* p_dst_wave,
930 const index_t dst_thread_element_offset,
931 const bool dst_thread_element_valid,
932 const index_t dst_element_space_size)
933{
934 const int32x4_t dst_wave_buffer_resource =
935 make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
936
937 index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
938
939 using vector_t = typename vector_type_maker<T, N>::type::type;
940 using scalar_t = typename scalar_type<vector_t>::type;
941 constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
942
943 if constexpr(is_same<T, bhalf_t>::value)
944 {
945 if(dst_thread_element_valid)
946 {
948 src_thread_data, p_dst_wave + dst_thread_element_offset);
949 }
950 }
951 else
952 {
953#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
954 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
955
957 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
958#else
959 if(dst_thread_element_valid)
960 {
962 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
963 }
964#endif
965 }
966}
967
968// buffer_atomic_max requires:
969// 1) p_dst_wave must point to global memory
970// 2) p_dst_wave must be a wavewise pointer.
971// It is user's responsibility to make sure that is true.
972template <typename T, index_t N>
973__device__ void
975 T* p_dst_wave,
976 const index_t dst_thread_element_offset,
977 const bool dst_thread_element_valid,
978 const index_t dst_element_space_size)
979{
980 const int32x4_t dst_wave_buffer_resource =
981 make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
982
983 index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
984
985 using vector_t = typename vector_type_maker<T, N>::type::type;
986 using scalar_t = typename scalar_type<vector_t>::type;
987 constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
988
989#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
990 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
991
993 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
994#else
995 if(dst_thread_element_valid)
996 {
998 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
999 }
1000#endif
1001}
1002
1003// Direct loads from global to LDS.
1004__device__ void
1006 __attribute__((address_space(3))) uint32_t* lds_ptr,
1007 index_t size,
1008 index_t voffset,
1009 index_t soffset,
1010 index_t offset,
1011 index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");
1012
1013#ifndef __HIPCC_RTC__
1014template <typename T, index_t NumElemsPerThread>
1015__device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
1016 const index_t global_offset,
1017 T* lds_base_ptr,
1018 const index_t lds_offset,
1019 const bool is_valid,
1020 const index_t src_element_space_size)
1021{
1022 // Direct loads require that each thread reads and writes exactly a single DWORD.
1023 constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
1024#if defined(__gfx950__)
1025 constexpr auto dword_bytes = 4;
1026 static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
1027 bytes_per_thread == dword_bytes * 4);
1028#elif defined(__gfx942__)
1029 constexpr auto dword_bytes = 4;
1030 static_assert(bytes_per_thread == dword_bytes);
1031#endif
1032
1033 const int32x4_t src_resource =
1034 make_wave_buffer_resource(global_base_ptr, src_element_space_size);
1035 const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;
1036
1037#if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
1038 T* lds_ptr = lds_base_ptr + lds_offset;
1039#ifndef CK_CODE_GEN_RTC
1040 auto const lds_ptr_sgpr =
1041 __builtin_amdgcn_readfirstlane((reinterpret_cast<uintptr_t>(lds_ptr)));
1042#else
1043 auto const lds_ptr_sgpr = __builtin_amdgcn_readfirstlane((reinterpret_cast<size_t>(lds_ptr)));
1044#endif
1045 asm volatile("s_mov_b32 m0, %0; \n\t"
1046 "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
1047 "v"(global_offset_bytes),
1048 "s"(src_resource)
1049 : "memory");
1050#else
1051 // LDS pointer must be attributed with the LDS address space.
1052 __attribute__((address_space(3))) uint32_t* lds_ptr =
1053#ifndef CK_CODE_GEN_RTC
1054 reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
1055 reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
1056#else
1057 reinterpret_cast<__attribute__((address_space(3))) uint32_t*>(
1058 reinterpret_cast<size_t>(lds_base_ptr + lds_offset));
1059#endif
1060
1062 src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
1063#endif
1064}
1065#endif
1066
1067} // namespace ck
#define CK_BUFFER_RESOURCE_3RD_DWORD
Definition ck.hpp:80
Definition ck.hpp:268
__device__ void llvm_amdgcn_raw_buffer_store_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32")
__device__ int32x4_t make_wave_buffer_resource_with_default_range(T *p_wave)
Definition utility/amd_buffer_addressing.hpp:38
ushort bhalf_t
Definition data_type.hpp:30
__device__ void amd_buffer_store(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition utility/amd_buffer_addressing.hpp:894
__device__ void amd_direct_load_global_to_lds(const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size)
Definition utility/amd_buffer_addressing.hpp:1015
__device__ void amd_buffer_atomic_max(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition utility/amd_buffer_addressing.hpp:974
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
__device__ int32x4_t llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32")
__device__ void llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32")
__device__ void llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8")
__device__ void amd_buffer_store_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition utility/amd_buffer_addressing.hpp:544
__device__ float llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32")
__device__ int32x2_t llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32")
AmdBufferCoherenceEnum
Definition utility/amd_buffer_addressing.hpp:295
@ GLC
Definition utility/amd_buffer_addressing.hpp:297
@ SYSTEM_NT1
Definition utility/amd_buffer_addressing.hpp:310
@ WAVE_NT0
Definition utility/amd_buffer_addressing.hpp:303
@ GLC_SLC
Definition utility/amd_buffer_addressing.hpp:299
@ SLC
Definition utility/amd_buffer_addressing.hpp:298
@ DefaultCoherence
Definition utility/amd_buffer_addressing.hpp:296
@ DEVICE_NT1
Definition utility/amd_buffer_addressing.hpp:308
@ SYSTEM_NT0
Definition utility/amd_buffer_addressing.hpp:309
@ GROUP_NT1
Definition utility/amd_buffer_addressing.hpp:306
@ DEVICE_NT0
Definition utility/amd_buffer_addressing.hpp:307
@ GROUP_NT0
Definition utility/amd_buffer_addressing.hpp:305
@ WAVE_NT1
Definition utility/amd_buffer_addressing.hpp:304
__device__ int32x4_t make_wave_buffer_resource(T *p_wave, index_t element_space_size)
Definition utility/amd_buffer_addressing.hpp:23
__device__ void llvm_amdgcn_raw_buffer_store_fp16(half_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16")
typename vector_type< int8_t, 4 >::type int8x4_t
Definition dtype_vector.hpp:2177
__device__ half_t llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16")
_Float16 half_t
Definition data_type.hpp:31
__device__ bhalf_t llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16")
__device__ void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, uint32_t *lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
integral_constant< index_t, N > Number
Definition number.hpp:12
__device__ void llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8")
__device__ int8x4_t llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8")
__device__ void amd_buffer_atomic_add_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition utility/amd_buffer_addressing.hpp:599
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition utility/amd_buffer_addressing.hpp:865
__device__ float4_t llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32")
__device__ bhalf2_t llvm_amdgcn_raw_buffer_load_i16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16")
__device__ void llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32")
__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
__device__ void amd_global_atomic_add_impl(const typename vector_type< T, N >::type src_thread_data, T *addr)
Definition utility/amd_buffer_addressing.hpp:571
typename vector_type< int32_t, 4 >::type int32x4_t
Definition dtype_vector.hpp:2168
__device__ void llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32")
__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
typename vector_type< float, 4 >::type float4_t
Definition dtype_vector.hpp:2146
typename vector_type< float, 2 >::type float2_t
Definition dtype_vector.hpp:2145
typename vector_type< int32_t, 2 >::type int32x2_t
Definition dtype_vector.hpp:2167
typename vector_type< bhalf_t, 4 >::type bhalf4_t
Definition dtype_vector.hpp:2161
__device__ void llvm_amdgcn_raw_buffer_store_i16(bhalf_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
__device__ vector_type< int8_t, N >::type amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition utility/amd_buffer_addressing.hpp:315
__device__ void llvm_amdgcn_raw_buffer_store_i16x2(bhalf2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
__device__ half2_t llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16")
__device__ void amd_buffer_atomic_add(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition utility/amd_buffer_addressing.hpp:928
__device__ void llvm_amdgcn_raw_buffer_store_fp32x2(float2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32")
__device__ double llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
__device__ bhalf4_t llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16")
typename vector_type< half_t, 2 >::type half2_t
Definition dtype_vector.hpp:2153
__device__ int8_t llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8")
typename vector_type< bhalf_t, 2 >::type bhalf2_t
Definition dtype_vector.hpp:2160
__device__ float2_t llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32")
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_zero(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
Definition utility/amd_buffer_addressing.hpp:829
__device__ void amd_buffer_atomic_max_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition utility/amd_buffer_addressing.hpp:757
__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
__device__ vector_type< T, N >::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition utility/amd_buffer_addressing.hpp:419
__device__ void llvm_amdgcn_raw_buffer_store_fp16x2(half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16")
__device__ void llvm_amdgcn_raw_buffer_store_i8(int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8")
__device__ int32_t llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32")
__host__ __device__ constexpr Y bit_cast(const X &x)
Definition type.hpp:306
__device__ void amd_buffer_store_impl_raw(const typename vector_type< int8_t, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition utility/amd_buffer_addressing.hpp:446
__device__ void llvm_amdgcn_raw_buffer_store_i16x4(bhalf4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
__device__ int8x2_t llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8")
typename remove_cv< T >::type remove_cv_t
Definition type.hpp:295
typename vector_type< half_t, 4 >::type half4_t
Definition dtype_vector.hpp:2154
__device__ void llvm_amdgcn_raw_buffer_store_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32")
__device__ half4_t llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16")
__device__ void llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16")
typename vector_type< int8_t, 2 >::type int8x2_t
Definition dtype_vector.hpp:2176
signed short int16_t
Definition stdint.h:122
_W64 unsigned int uintptr_t
Definition stdint.h:164
unsigned int uint32_t
Definition stdint.h:126
signed int int32_t
Definition stdint.h:123
signed char int8_t
Definition stdint.h:121
static constexpr value_type value
Definition utility/integral_constant.hpp:13
Definition data_type.hpp:39
Definition functional2.hpp:33
Definition dtype_vector.hpp:30
Definition dtype_vector.hpp:10
Definition utility/amd_buffer_addressing.hpp:11
int32x4_t content
Definition utility/amd_buffer_addressing.hpp:16
StaticallyIndexedArray< int32_t, 4 > config
Definition utility/amd_buffer_addressing.hpp:19
StaticallyIndexedArray< int32_t, 4 > range
Definition utility/amd_buffer_addressing.hpp:18
StaticallyIndexedArray< T *, 2 > address
Definition utility/amd_buffer_addressing.hpp:17
__device__ constexpr BufferResource()
Definition utility/amd_buffer_addressing.hpp:12