NEON を使って、4次元ベクトルの外積計算を行うプログラムの説明 。
ch16_02/vec.h |
#pragma once // Simple vector structure typedef struct { float X; // X component float Y; // Y component float Z; // Z component } Vector; // Vector structure of arrays typedef struct { float* X; // X components float* Y; // Y components float* Z; // Z components } VectorSoA; |
ch16_02/main.cpp |
#include <iostream> #include <iomanip> #include <memory> #include <random> #include "AlignedMem.h" #include "vec.h" #define C_ALIGN 16 #define SEP " " #define N_VEC 18 #define EPS 1.0e-9 #define NEQ(x, y) (fabs((x) - (y)) > EPS)) using namespace std; extern void CrossProdAOS_(Vector* c, const Vector* a, const Vector* b, int n); extern void CrossProdSOA_(VectorSoA& c, const VectorSoA& a, const VectorSoA& b, int n); void CrossProdAOS(Vector* c, const Vector* a, const Vector* b, int n) { for (size_t i = 0; i < n; i++) { c[i].X = a[i].Y * b[i].Z - a[i].Z * b[i].Y; c[i].Y = a[i].Z * b[i].X - a[i].X * b[i].Z; c[i].Z = a[i].X * b[i].Y - a[i].Y * b[i].X; } } void CrossProdSOA(VectorSoA& c, const VectorSoA& a, const VectorSoA& b, int n) { for (size_t i = 0; i < n; i++) { c.X[i] = a.Y[i] * b.Z[i] - a.Z[i] * b.Y[i]; c.Y[i] = a.Z[i] * b.X[i] - a.X[i] * b.Z[i]; c.Z[i] = a.X[i] * b.Y[i] - a.Y[i] * b.X[i]; } } void InitVec(Vector* p, VectorSoA& q, int n, unsigned short int seed) { std::mt19937 mt {seed}; std::uniform_int_distribution<> rand100 {1, 100}; for (int i=0; i < n; i++) { p[i].X = q.X[i] = (float) rand100(mt); p[i].Y = q.Y[i] = (float) rand100(mt); p[i].Z = q.Z[i] = (float) rand100(mt); } } bool CompareCP(Vector* p, Vector* q, int n) { for (int i = 0; i < n; i++) { if (fabs(p[i].X - q[i].X) > EPS || fabs(p[i].Y - q[i].Y) > EPS || fabs(p[i].Z - q[i].Z) > EPS) return false; } return true; } bool CompareCP(VectorSoA& p, VectorSoA& q, int n) { for (int i = 0; i < n; i++) { if (fabs(p.X[i] - q.X[i]) > EPS || fabs(p.Y[i] - q.Y[i]) > EPS || fabs(p.Z[i] - q.Z[i]) > EPS) return false; } return true; } bool CompareCP(Vector* p, VectorSoA& q, int n) { for (int i = 0; i < n; i++) { if (fabs(p[i].X - q.X[i]) > EPS || fabs(p[i].Y - q.Y[i]) > EPS || fabs(p[i].Z - q.Z[i]) > EPS) return false; } return true; } void CrossProd(void) { unique_ptr<Vector> a_aos_up { new Vector[N_VEC] }; unique_ptr<Vector> b_aos_up { new Vector[N_VEC] }; unique_ptr<Vector> c1_aos_up { new Vector[N_VEC] }; unique_ptr<Vector> c2_aos_up { new Vector[N_VEC] }; Vector* a_aos = a_aos_up.get(); Vector* b_aos = b_aos_up.get(); Vector* c1_aos = c1_aos_up.get(); Vector* c2_aos = c2_aos_up.get(); AlignedArray<float> a_soa_x_aa(N_VEC, C_ALIGN); AlignedArray<float> a_soa_y_aa(N_VEC, C_ALIGN); AlignedArray<float> a_soa_z_aa(N_VEC, C_ALIGN); AlignedArray<float> b_soa_x_aa(N_VEC, C_ALIGN); AlignedArray<float> b_soa_y_aa(N_VEC, C_ALIGN); AlignedArray<float> b_soa_z_aa(N_VEC, C_ALIGN); AlignedArray<float> c1_soa_x_aa(N_VEC, C_ALIGN); AlignedArray<float> c1_soa_y_aa(N_VEC, C_ALIGN); AlignedArray<float> c1_soa_z_aa(N_VEC, C_ALIGN); AlignedArray<float> c2_soa_x_aa(N_VEC, C_ALIGN); AlignedArray<float> c2_soa_y_aa(N_VEC, C_ALIGN); AlignedArray<float> c2_soa_z_aa(N_VEC, C_ALIGN); VectorSoA a_soa, b_soa, c1_soa, c2_soa; a_soa.X = a_soa_x_aa.Data(); a_soa.Y = a_soa_y_aa.Data(); a_soa.Z = a_soa_z_aa.Data(); b_soa.X = b_soa_x_aa.Data(); b_soa.Y = b_soa_y_aa.Data(); b_soa.Z = b_soa_z_aa.Data(); c1_soa.X = c1_soa_x_aa.Data(); c1_soa.Y = c1_soa_y_aa.Data(); c1_soa.Z = c1_soa_z_aa.Data(); c2_soa.X = c2_soa_x_aa.Data(); c2_soa.Y = c2_soa_y_aa.Data(); c2_soa.Z = c2_soa_z_aa.Data(); InitVec(a_aos, a_soa, N_VEC, 2021); InitVec(b_aos, b_soa, N_VEC, 2022); CrossProdAOS(c1_aos, a_aos, b_aos, N_VEC); CrossProdAOS_(c2_aos, a_aos, b_aos, N_VEC); CrossProdSOA(c1_soa, a_soa, b_soa, N_VEC); CrossProdSOA_(c2_soa, a_soa, b_soa, N_VEC); bool compare_cp = CompareCP(c1_aos, c2_aos, N_VEC) && CompareCP(c1_soa, c2_soa, N_VEC) && CompareCP(c1_aos, c1_soa, N_VEC); cout << "Results for CrossProd\n"; cout << fixed << setprecision(1); for (size_t i = 0; i < N_VEC; i++) { const unsigned int w = 7; cout << "Vector cross product #" << i << endl; cout << " a: "; cout << setw(w) << a_aos[i].X << SEP; cout << setw(w) << a_aos[i].Y << SEP; cout << setw(w) << a_aos[i].Z << SEP; cout << " b: "; cout << setw(w) << b_aos[i].X << SEP; cout << setw(w) << b_aos[i].Y << SEP; cout << setw(w) << b_aos[i].Z << endl; if (compare_cp) cout << " c: "; else cout << " c1_aos: "; cout << setw(w) << c1_aos[i].X << SEP; cout << setw(w) << c1_aos[i].Y << SEP; cout << setw(w) << c1_aos[i].Z << endl; if (!compare_cp) { cout << " c2_aos: "; cout << setw(w) << c2_aos[i].X << SEP; cout << setw(w) << c2_aos[i].Y << SEP; cout << setw(w) << c2_aos[i].Z << endl; cout << " c1_soa: "; cout << setw(w) << c1_soa.X[i] << SEP; cout << setw(w) << c1_soa.Y[i] << SEP; cout << setw(w) << c1_soa.Z[i] << endl; cout << " c2_soa: "; cout << setw(w) << c2_soa.X[i] << SEP; cout << setw(w) << c2_soa.Y[i] << SEP; cout << setw(w) << c2_soa.Z[i] << endl; } } } int main() { CrossProd(); //CrossProd_BM(); return 0; } |
ch16_02/neon.cpp |
#include "vec.h" #define ASM_MACROS "\n\ // c = cross(a,b) \n\ // a: s0, s1, s2 \n\ // b: s3, s4, s5 \n\ // c: s19, s20, s21 \n\ .macro VecCp \n\ fmul s19, s1, s5 // s19 = ay * bz \n\ fmsub s19, s2, s4, s19 // s19 = s19 - az * by \n\ fmul s20, s2, s3 // s20 = az * bx \n\ fmsub s20, s0, s5, s20 // s20 = s20 - az * by \n\ fmul s21, s0, s4 // s21 = az * bx \n\ fmsub s21, s1, s3, s21 // s21 = s21 - az * by \n\ .endm \n\ \n\ // c[0-3] = SIMD:cross(a[0-3],b[0-3]) \n\ // a: v0, v1, v2 \n\ // b: v3, v4, v5 \n\ // c: v19, v20, v21 \n\ .macro VecCp4 \n\ fmul v19.4s, v1.4s, v5.4s // v19 = ay * bz \n\ fmls v19.4s, v2.4s, v4.4s // v19 -= - az * by \n\ fmul v20.4s, v2.4s, v3.4s // v20 = az * bx \n\ fmls v20.4s, v0.4s, v5.4s // v20 -= az * by \n\ fmul v21.4s, v0.4s, v4.4s // v21 = az * bx \n\ fmls v21.4s, v1.4s, v3.4s // v21 -= az * by \n\ .endm \n\ \n\ // c[0-3] = SIMD:cross(a[0-3],b[0-3]) \n\ // a: ( [x1],[x1+4],[x1+8] ) * 4 \n\ // b: ( [x2],[x2+4],[x2+8] ) * 4 \n\ // c: ( [x0],[x0+4],[x0+8] ) * 4 \n\ .macro VecCp4AOS \n\ ld3 {v0.4s, v1.4s, v2.4s}, [x1], 48 // load (ax,ay,az) into (v0,v1,v2); x1 += 48 \n\ ld3 {v3.4s, v4.4s, v5.4s}, [x2], 48 // load (bx,vy,vz) into (v3,v4,v5); x2 += 48 \n\ VecCp4 // c = cross(a,b) \n\ st3 {v19.4s, v20.4s, v21.4s}, [x0], 48 // store (cx,cy,cz) from (v19,v20,v21); x0 += 48 \n\ .endm \n\ \n\ // c = SIMD:cross(a,b) \n\ // ax:[x7] * 4 \n\ // ay:[x8] * 4 \n\ // az:[x9] * 4 \n\ // bx:[x10] * 4 \n\ // by:[x11] * 4 \n\ // bz:[x12] * 4 \n\ // cx:[x13] * 4 \n\ // cy:[x14] * 4 \n\ // cz:[x15] * 4 \n\ .macro VecCp4SOA \n\ ld1 {v0.4s}, [x7], 16 // v0: ax \n\ ld1 {v1.4s}, [x8], 16 // v1: ay \n\ ld1 {v2.4s}, [x9], 16 // v2: az \n\ ld1 {v3.4s}, [x10], 16 // v3: bx \n\ ld1 {v4.4s}, [x11], 16 // v4: by \n\ ld1 {v5.4s}, [x12], 16 // v5: bz \n\ VecCp4 // c = cross(a,b) \n\ st1 {v19.4s}, [x13], 16 // [x13]: cx \n\ st1 {v20.4s}, [x14], 16 // [x14]: cx \n\ st1 {v21.4s}, [x15], 16 // [x15]: cy \n\ .endm \n\ \n\ " void CrossProdAOS_(Vector* c, const Vector* a, const Vector* b, int n) { __asm volatile (ASM_MACROS "\n\ \n\ cmp x3, 16 // if n < 0 \n\ b.lo LSkipLoop1A // goto SkipLoop1A \n\ \n\ LLoop1A: \n\ VecCp4AOS \n\ VecCp4AOS \n\ VecCp4AOS \n\ VecCp4AOS \n\ sub x3, x3, 16 // n -= 16 \n\ cmp x3, 16 // if n >= 16 \n\ b.hs LLoop1A // goto Loop1A \n\ LSkipLoop1A: \n\ cbz x3, LDoneA \n\ LLoop2A: \n\ ldp s0, s1, [x1], 8 // (s0, s1) = (ax, ay); x1 += 8 \n\ ldr s2, [x1], 4 // s2 = az; x1 += 4 b \n\ ldp s3, s4, [x2], 8 // (s3, s4) = (bx, by); x1 += 8 \n\ ldr s5, [x2], 4 // s5 = bz; x1 += 4 \n\ VecCp \n\ stp s19, s20, [x0], 8 // [x0] = (cx, cy); x0 += 8 \n\ str s21, [x0], 4 // [x0] = cz; x0 += 4 \n\ subs x3, x3, 1 // if --x3 != 0 \n\ b.ne LLoop2A // goto Loop2A \n\ LDoneA: \n\ " : : : "v0", "v1", "v2", "v3", "v4", "v5", "v19", "v20", "v21", "x0", "x1", "x2", "x3" ); } void CrossProdSOA_(VectorSoA& c, const VectorSoA& a, const VectorSoA& b, int n) { __asm volatile ("\n\ \n\ ldp x7, x8, [x1], 16 // (x7, x8) = address of (ax, ay) \n\ ldr x9, [x1] // x9 = address of az \n\ ldp x10, x11, [x2], 16 // (x10, x11) = address of (bx, by) \n\ ldr x12, [x2] // x12 = address of bz \n\ ldp x13, x14, [x0], 16 // (x13, x14) = address of (cx, cy) \n\ ldr x15, [x0] // x15 = address of cz \n\ \n\ cmp x3, 16 // if n < 0 \n\ b.lo LSkipLoop1B // goto SkipLoop1A \n\ \n\ LLoop1B: \n\ VecCp4SOA \n\ VecCp4SOA \n\ VecCp4SOA \n\ VecCp4SOA \n\ sub x3, x3, 16 // n -= 16 \n\ cmp x3, 16 // if n >= 16 \n\ b.hs LLoop1B // goto Loop1B \n\ LSkipLoop1B: \n\ cbz x3, LDoneB \n\ LLoop2B: \n\ ldr s0, [x7], 4 // s0 = ax; x7 += 4 \n\ ldr s1, [x8], 4 // s1 = ay; x8 += 4 \n\ ldr s2, [x9], 4 // s2 = az; x9 += 4 \n\ ldr s3, [x10], 4 // s3 = bx; x10 += 4 \n\ ldr s4, [x11], 4 // s4 = by; x11 += 4 \n\ ldr s5, [x12], 4 // s5 = bz; x12 += 4 \n\ VecCp \n\ str s19, [x13], 4 // [x13] = cx; x13 += 4 \n\ str s20, [x14], 4 // [x14] = cy; x14 += 4 \n\ str s21, [x15], 4 // [x15] = cz; x15 += 4 \n\ subs x3, x3, 1 // if --n != 0 \n\ b.ne LLoop2B // goto Loop2B \n\ LDoneB: \n\ " : : : "v0", "v1", "v2", "v3", "v4", "v5", "v19", "v20", "v21", "x0", "x1", "x2", "x3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15" ); } |
ch16_02/main.cpp の実行例 |
arm64@manet ch16_02 % g++ -I.. -I. -std=c++11 -O -S neon.cpp arm64@manet ch16_02 % g++ -I.. -I. -std=c++11 -O main.cpp neon.cpp -o a.out arm64@manet ch16_02 % ./a.out Results for CrossProd Vector cross product #0 a: 86.0 58.0 1.0 b: 93.0 46.0 50.0 c: 2854.0 -4207.0 -1438.0 Vector cross product #1 a: 95.0 87.0 45.0 b: 56.0 89.0 19.0 c: -2352.0 715.0 3583.0 Vector cross product #2 a: 63.0 92.0 30.0 b: 25.0 17.0 54.0 c: 4458.0 -2652.0 -1229.0 Vector cross product #3 a: 22.0 94.0 25.0 b: 42.0 34.0 28.0 c: 1782.0 434.0 -3200.0 Vector cross product #4 a: 13.0 71.0 71.0 b: 12.0 20.0 95.0 c: 5325.0 -383.0 -592.0 Vector cross product #5 a: 34.0 8.0 2.0 b: 76.0 49.0 20.0 c: 62.0 -528.0 1058.0 Vector cross product #6 a: 98.0 27.0 67.0 b: 39.0 73.0 15.0 c: -4486.0 1143.0 6101.0 Vector cross product #7 a: 49.0 100.0 64.0 b: 94.0 17.0 91.0 c: 8012.0 1557.0 -8567.0 Vector cross product #8 a: 50.0 17.0 51.0 b: 12.0 95.0 98.0 c: -3179.0 -4288.0 4546.0 Vector cross product #9 a: 55.0 53.0 94.0 b: 3.0 81.0 16.0 c: -6766.0 -598.0 4296.0 Vector cross product #10 a: 6.0 50.0 39.0 b: 24.0 82.0 88.0 c: 1202.0 408.0 -708.0 Vector cross product #11 a: 15.0 72.0 86.0 b: 86.0 38.0 57.0 c: 836.0 6541.0 -5622.0 Vector cross product #12 a: 71.0 42.0 22.0 b: 84.0 13.0 46.0 c: 1646.0 -1418.0 -2605.0 Vector cross product #13 a: 26.0 11.0 37.0 b: 91.0 77.0 15.0 c: -2684.0 2977.0 1001.0 Vector cross product #14 a: 20.0 58.0 83.0 b: 97.0 14.0 78.0 c: 3362.0 6491.0 -5346.0 Vector cross product #15 a: 91.0 16.0 41.0 b: 53.0 43.0 54.0 c: -899.0 -2741.0 3065.0 Vector cross product #16 a: 77.0 54.0 12.0 b: 65.0 46.0 94.0 c: 4524.0 -6458.0 32.0 Vector cross product #17 a: 20.0 34.0 79.0 b: 64.0 24.0 38.0 c: -604.0 4296.0 -1696.0 arm64@manet ch16_02 % |