According to the test case below, the assembly generated for AMD MI250 (gfx90a) is the same with or without simd. Though, if you look at the CPU code, you shall see a significant change with the simd clause which in this case, allows for a similar optimization to the ones observed with an explicit usage of the restrict keyword.
TLDR: Currently, the simd clause is irrelevant and only leads to this warning, even for extremely trivial cases:
loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning].
#include <cstdint>
#define RESTRICT __restrict
using Float = double;
void test0_0(Float* a, const Float* b) {
a[0] = b[0] * b[0];
// Forced store/reload (b[0] could be a[0]).
a[1] = b[0];
}
void test0_1(Float* RESTRICT a, const Float* RESTRICT b) {
a[0] = b[0] * b[0];
// No forced store/reload.
a[1] = b[0];
}
void test1_0(Float* a, Float* b, std::size_t length) {
#pragma omp parallel for
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0] * b[i + 0];
// Forced store/reload
a[i + 1] = b[i + 0];
}
}
void test1_1(Float* a, Float* b, std::size_t length) {
#pragma omp parallel for simd
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0] * b[i + 0];
// simd -> no loop carried dependencies:
// No forced store/reload -> easier vectorization, less generated code.
a[i + 1] = b[i + 0];
}
}
void test2_0(Float* a, Float* b, std::size_t length) {
#pragma omp target teams distribute parallel for
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0] * b[i + 0];
// ASM shows forced store/reload, as expected.
a[i + 1] = b[i + 0];
}
}
void test2_1(Float* RESTRICT a, Float* RESTRICT b, std::size_t length) {
#pragma omp target teams distribute parallel for
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0] * b[i + 0];
// ASM shows forced store/reload, but a/b are restricted BAD!
a[i + 1] = b[i + 0];
}
}
void test3_0(Float* a, const Float* b, std::size_t length) {
#pragma omp target teams distribute parallel for simd
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0] * b[i + 0];
// ASM shows forced store/reload, but a/b are restricted BAD!
a[i + 1] = b[i + 0];
}
}
void test3_1(Float* RESTRICT a, const Float* RESTRICT b, std::size_t length) {
#pragma omp target teams distribute parallel for simd
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0] * b[i + 0];
// ASM shows forced store/reload, but a/b are restricted BAD!
a[i + 1] = b[i + 0];
}
}
test2_1(Float* RESTRICT a, Float* RESTRICT b, std::size_t length) {
#pragma omp target teams distribute parallel for
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0];
// ASM shows forced store/reload, but a/b are restricted BAD!
a[i + 1] = b[i + 0];
}
}
void test3_0(Float* a, const Float* b, std::size_t length) {
#pragma omp target teams distribute parallel for simd
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0];
// ASM shows forced store/reload, but a/b are restricted BAD!
a[i + 1] = b[i + 0];
}
}
void test3_1(Float* RESTRICT a, const Float* RESTRICT b, std::size_t length) {
#pragma omp target teams distribute parallel for simd
for (std::size_t i = 0; i < length; i += 2) {
a[i + 0] = b[i + 0];
// ASM shows forced store/reload, but a/b are restricted BAD!
a[i + 1] = b[i + 0];
}
}
Code available at: https://godbolt.org/z/sMY48s8jz