SIMD Programming Matrixes

I'm a beginner in C language.
When I wrote a program that calculated 8x8 matrices using SIMD instructions, it became strange that segmentation fault happened and didn't happen every time I ran it.
I would appreciate it if you could tell me where the mistake is and how it should be improved.

"Also, I think the switch statement should be calculated using a pointer and i, but for some reason, a ""segmentation error"" occurs when using a variable."I would like to know the reason for this as well.

Here's the code:

#include<math.h>
# include <stdio.h>
# include <stdlib.h>
# include <string.h>
# include <x86 intrin.h>

void print_vec (_m256 m)
{
   printf("{");
   for(inti=0;i<8;i++) printf("%.2f", m[i]);
   printf("}\n");
}

void prod_avx(float*a, float*b, float*c)
{
  float cd [64];
  __m256b0n, b1n, b2n, b3n, b4n, b5n, b6n, b7n;
  
  b0n=_mm256_load_ps(b);
  b1n =_mm256_load_ps(b+8);
  b2n =_mm256_load_ps(b+16);
  b3n =_mm256_load_ps(b+24);
  b4n =_mm256_load_ps(b+32);
  b5n =_mm256_load_ps(b+40);
  b6n =_mm256_load_ps(b+48);
  b7n =_mm256_load_ps(b+56);

  for (inti=0; i<8;i++)
  {
    __m256ai0, ai1, ai2, ai3, ai4, ai5, ai6, ai7;
    ai0=_mm256_set1_ps(*(a+8*i)));
    ai1 =_mm256_set1_ps(*(a+8*i+1));
    ai2=_mm256_set1_ps(*(a+8*i+2));
    ai3=_mm256_set1_ps(*(a+8*i+3));
    ai4=_mm256_set1_ps(*(a+8*i+4));
    ai5=_mm256_set1_ps(*(a+8*i+5));
    ai6=_mm256_set1_ps(*(a+8*i+6));
    ai7=_mm256_set1_ps(*(a+8*i+7));
    
    ai0=_mm256_mul_ps(ai0,b0n); 
    ai1 =_mm256_mul_ps(ai1,b1n); 
    ai2=_mm256_mul_ps(ai2,b2n); 
    ai3=_mm256_mul_ps(ai3,b3n); 
    ai4=_mm256_mul_ps(ai4,b4n); 
    ai5=_mm256_mul_ps(ai5,b5n); 
    ai6=_mm256_mul_ps(ai6,b6n); 
    ai7=_mm256_mul_ps(ai7,b7n); 
  
    ai0=_mm256_add_ps(ai0,ai1);
    ai2=_mm256_add_ps(ai2,ai3);
    ai4=_mm256_add_ps(ai4,ai5);
    ai6=_mm256_add_ps(ai6,ai7);
    
    ai0=_mm256_add_ps(ai0,ai2);
    ai4=_mm256_add_ps(ai4,ai6);
    
    ai0=_mm256_add_ps(ai0,ai4);

    // print_vec(ai0);
    // printf("i*8=%d\n", i*8);
    // I can only say that the reason is really unknown, but if you don't use i for pointer operations, you'll get -11 errors.

   switch(i)
    {
    case0: 
        _mm256_store_ps(cd+0,ai0);
        break;
    case1: 
        _mm256_store_ps(cd+8,ai0);
        break;
    case2:  
        _mm256_store_ps(cd+16,ai0);
        break;
    case3:
        _mm256_store_ps(cd+24,ai0);
        break;
    case4: 
        _mm256_store_ps(cd+32,ai0);
        break;
    case5: 
        _mm256_store_ps(cd+40,ai0);
        break;
    case6: 
        _mm256_store_ps(cd+48,ai0);
        break;
    case7: 
        _mm256_store_ps(cd+56,ai0);
        break;
        }
         
 }
  memcpy(c,cd,64*sizeof(float));
}
void print_gyoretu(float*a)
{ 
 for (inti=0; i<8;i++)
    {
    printf("{");
     for (int j = 0; j<8;j++)
     {
       printf("%.2f", a[i*8+j]);  
     }
     printf("}\n");
    }
}

int main() {
   // float*a=(float*)malloc(sizeof(float)*64);
   // float*b=(float*)malloc(sizeof(float)*64);
    floata [64];
    float b [64];
    float cd [64];
    int j = 0;

    for (inti=0;i<64;i++)
      {
          if(i%8<4&i<32)
          {
            a[i] = j;
            j++;
          }
          else
          {
            a[i] = 0;
          }
      }
    for (inti=0;i<64;i++)
      {
          if(i%8<4&i<32)
          {
            b[i] = j;
            j++;
          }
          else
          {
            b[i] = 0;
          }
      }
             
    print_gyoretu(a);
    printf("===========================\n");
    print_gyoretu(b);

    printf("\n====================\n";

    prod_avx(a,b,cd);
    print_gyoretu(cd);
    
}

2022-09-30 16:23

3 Answers

gcc has the option -fsanitize=alignment.

-fsanitize=alignment

This option enables checking of alignment of pointers when they are referred, or when a reference is bound to efficiently aligned target, or when a method or constructor is invoked on efficiently aligned object.

If you run it with this option, type '__m256', which requirements 32 byte alignment will appear.

$lscpu | grep-E'^ (Architecture | Model name)'
Architecture: x86_64
Model name: Intel® Core™ i5-8500T CPU @ 2.10 GHz
$ lsb_release-ir
Distributor ID: Ubuntu
Release : 20.10
$ gcc -- version
gcc(Ubuntu 10.2.0-13ubuntu1) 10.2.0

# using AVX2
$ gcc-fsanitize=alignment-mavx2-Wall-Wextra-g matrix_product.c-o matrix_product&./matrix_product
                          :

/usr/lib/gcc/x86_64-linux-gnu/10/include/avxintrin.h:874:10:
Runtime error:
load of misaligned address 0x7ffee9609fd0 for type '__m256', which requirements 32 byte alignment
0x7ffee9609fd0:note:pointer points here
 00 00 00 00  00 00 80 41 00 00 88 41  00 00 90 41 00 00 98 41  00 00 00 00 00 00 00 00  00 00 00 00
              ^ 
Segmentation fault (core dump)

Regarding alignment, the maximum length is defined in _BIGGEST_ALIGNMENT__.

$gcc-dM-E-</dev/null | grep_BIGEST_ALIGNMENT__
#define_BIGGEST_ALIGNMENT__16

$ gcc-mavx2-dM-E-</dev/null | grep_BIGEST_ALIGNMENT__
#define_BIGGEST_ALIGNMENT_32

AVX2 has 32 bytes (256 bits), so you can specify the aligned attribute with the _attribute__ keyword in gcc.

 floata[64]__attribute__(aligned(32));
               :

2022-09-30 16:23

There may be other problems because Xcode clang on macOS cannot reproduce the event, but since instructions such as _mm256_load_ps require the pointer of the argument to be aligned (aligned) to a 32-byte boundary, CPU exception ("segmentation fault") is likely to occur.

Try rewriting the declaration of an array that may not be aligned well as follows:

Declaration of cd in prod_avx:

 float*cd=(float*) aligned_alloc(32,64*sizeof(float));

Declaration of a, b, cd in main:

 float*a=(float*) aligned_alloc(32,64*sizeof(float));
    float*b=(float*)aligned_alloc(32,64*sizeof(float));
    float*cd=(float*)aligned_alloc(32,64*sizeof(float));

(Each requires a corresponding free but has been omitted for simplicity.Please make up for it yourself.)

g I thought gcc could use the extended attribute __attribute__((aligned(32)) but I chose aligned_alloc because some environments may not support alignment to large boundaries like 32 bytes.

2022-09-30 16:23

This is not an answer, but it was interesting, so this is a reference.

Visual C++ generates code based on the meaning of the built-in function, which is a loop of prod_avx, but the order of instructions has changed.Also, I managed to use registers less than the source code.In addition, when I explicitly permitted the use of AVX2 instructions with the compilation option /arch:AVX2, I even used FMA instructions without permission to use the following code:

ai6=_mm256_broadcast_ss(a+8*i+6);
ai7=_mm256_broadcast_ss(a+8*i+7);
ai6=_mm256_mul_ps(ai6,b6n);
ai4=_mm256_broadcast_ss(a+8*i+4);
ai6=_mm256_fmadd_ps(ai7,b7n,ai6);
ai5=_mm256_broadcast_ss(a+8*i+5);
ai4=_mm256_mul_ps(ai4,b4n);
ai2=_mm256_broadcast_ss(a+8*i+2);
ai4=_mm256_fmadd_ps(ai5,b5n,ai4);
ai3=_mm256_broadcast_ss(a+8*i+3);
ai4=_mm256_add_ps(ai4,ai6);
ai2=_mm256_mul_ps(ai2,b2n);
ai0=_mm256_broadcast_ss(a+8*i);
ai2=_mm256_fmadd_ps(ai3,b3n,ai2);
ai1 =_mm256_broadcast_ss(a+8*i+1);
ai0=_mm256_mul_ps(ai0,b0n);
ai0=_mm256_fmadd_ps(ai1,b1n,ai0);
ai0=_mm256_add_ps(ai0,ai2);
ai0=_mm256_add_ps(ai0,ai4);

The source code of the question cannot be all FMA instructions due to the order of operation.

So, if you use the FMA instruction explicitly, it looks like this.

_m256ai0,ai1;
ai0=_mm256_broadcast_ss(a+8*i);
ai0=_mm256_mul_ps(ai0,b0n);
ai1 =_mm256_broadcast_ss(a+8*i+1);
ai0=_mm256_fmadd_ps(ai1,b1n,ai0);
ai1 =_mm256_broadcast_ss(a+8*i+2);
ai0=_mm256_fmadd_ps(ai1,b2n,ai0);
ai1 =_mm256_broadcast_ss(a+8*i+3);
ai0=_mm256_fmadd_ps(ai1,b3n,ai0);
ai1 =_mm256_broadcast_ss(a+8*i+4);
ai0=_mm256_fmadd_ps(ai1,b4n,ai0);
ai1 =_mm256_broadcast_ss(a+8*i+5);
ai0=_mm256_fmadd_ps(ai1,b5n,ai0);
ai1 =_mm256_broadcast_ss(a+8*i+6);
ai0=_mm256_fmadd_ps(ai1,b6n,ai0);
ai1 =_mm256_broadcast_ss(a+8*i+7);
ai0=_mm256_fmadd_ps(ai1,b7n,ai0);

Or is it like this when you load it once and broadcast it on the register?

_m256ai0, ai1, ai2, ai3;
ai3=_mm256_load_ps(a+8*i); // ai3=a7, a6, a5, a4, a3, a2, a1, a0
ai2=_mm256_permute_ps(ai3,0x00); // ai2=a4, a4, a4, a0, a0, a0, a0, a0
ai1=_mm256_permute2f128_ps(ai2, ai2,0x00); // ai1=a0, a0, a0, a0, a0, a0, a0
ai0=_mm256_mul_ps(ai1,b0n); // ai0=ai1*b0n
ai2=_mm256_permute2f128_ps(ai2, ai2, 0x11); // ai2=a4, a4, a4, a4, a4, a4, a4, a4, a4
ai0=_mm256_fmadd_ps(ai2, b4n, ai0); // ai0=ai2*b4n+ai0
ai2=_mm256_permute_ps(ai3,0x55); // ai2=a5, a5, a5, a1, a1, a1, a1, a1
ai1=_mm256_permute2f128_ps(ai2,ai2,0x00); // ai1=a1,a1,a1,a1,a1,a1
ai0=_mm256_fmadd_ps(ai1,b1n,ai0); // ai0=ai1*b1n+ai0
ai2=_mm256_permute2f128_ps(ai2, ai2, 0x11); // ai2=a5, a5, a5, a5, a5, a5, a5
ai0=_mm256_fmadd_ps(ai2, b5n, ai0); // ai0=ai2*b5n+ai0
ai2 =_mm256_permute_ps(ai3,0xAA); // ai2 = a6, a6, a6, a2, a2, a2, a2, a2
ai1=_mm256_permute2f128_ps(ai2, ai2, 0x00); // ai1=a2, a2, a2, a2, a2, a2, a2, a2
ai0=_mm256_fmadd_ps(ai1,b2n,ai0); // ai0=ai1*b2n+ai0
ai2=_mm256_permute2f128_ps(ai2, ai2, 0x11); // ai2=a6, a6, a6, a6, a6, a6
ai0=_mm256_fmadd_ps(ai2, b6n, ai0); // ai0=ai2*b6n+ai0
ai2=_mm256_permute_ps(ai3,0xFF); // ai2=a7, a7, a7, a3, a3, a3, a3
ai1=_mm256_permute2f128_ps(ai2, ai2, 0x00); // ai1=a3, a3, a3, a3, a3, a3, a3
ai0=_mm256_fmadd_ps(ai1,b3n,ai0);//ai0=ai1*b3n+ai0
ai2=_mm256_permute2f128_ps(ai2, ai2, 0x11); // ai2=a7, a7, a7, a7, a7, a7, a7
ai0=_mm256_fmadd_ps(ai2,b7n,ai0); // ai0=ai2*b7n+ai0

2022-09-30 16:23

If you have any answers or tips

Popular Tags

python x 4647

android x 1593

java x 1494

javascript x 1427

c x 927

c++ x 878

ruby-on-rails x 696

php x 692

python3 x 685

html x 656