NEON を使って 4x4 行列の転置行列を求めるプログラムの説明。
ch15_07/main.cpp |
#include <iostream> #include <iomanip> #include "MatrixF32.h" using namespace std; extern void Mat4x4TransposeF32_(float* m_des, const float* m_src1); void Mat4x4TestF32(MatrixF32& m_src1) { const char nl = '\n'; const size_t nr = m_src1.GetNumCols(); const size_t nc = m_src1.GetNumRows(); MatrixF32 m_des1(nr, nc); MatrixF32 m_des2(nr, nc); MatrixF32::Transpose(m_des1, m_src1); Mat4x4TransposeF32_(m_des2.Data(), m_src1.Data()); cout << fixed << setprecision(1); m_src1.SetOstream(12, " "); m_des1.SetOstream(12, " "); m_des2.SetOstream(12, " "); cout << "\nResults for Mat4x4TestF32\n"; cout << "Matrix m_src1\n" << m_src1 << nl; cout << "Matrix m_des1 (transpose of m_src1)\n" << m_des1 << nl; cout << "Matrix m_des2 (transpose of m_src1)\n" << m_des2 << nl; if (m_des1 != m_des2) cout << "\nMatrix transpose compare failed\n"; } void Mat4x4TestF32(void) { const size_t nr = 4; const size_t nc = 4; MatrixF32 m_src1(nr ,nc); const float src1_row0[] = { 10, 11, 12, 13 }; const float src1_row1[] = { 20, 21, 22, 23 }; const float src1_row2[] = { 30, 31, 32, 33 }; const float src1_row3[] = { 40, 41, 42, 43 }; m_src1.SetRow(0, src1_row0); m_src1.SetRow(1, src1_row1); m_src1.SetRow(2, src1_row2); m_src1.SetRow(3, src1_row3); Mat4x4TestF32(m_src1); } int main() { Mat4x4TestF32(); return 0; } |
ch15_07/neon.cpp |
#include "Vec128.h"

/// @brief Transposes a 4x4 matrix of single-precision floats using AArch64
///        NEON (Advanced SIMD) instructions.
///
/// All 16 source elements are loaded into v0-v3 before anything is stored,
/// so the instruction sequence itself tolerates m_des == m_src1.
///
/// @param m_des  Destination buffer; 16 consecutive floats are written.
/// @param m_src1 Source buffer; 16 consecutive floats are read as four rows
///               (a, b, c, d) of four elements each.
void Mat4x4TransposeF32_(float* m_des, const float* m_src1)
{
    // The pointers are passed as explicit asm operands rather than assumed to
    // live in x0/x1: that assumption only holds while the function is not
    // inlined and the compiler has not moved the arguments. The "memory"
    // clobber tells the compiler the asm reads and writes memory it cannot see.
    __asm volatile (
        "ld1  {v0.4s-v3.4s}, [%[src]]   \n\t"   // v0..v3 = rows a, b, c, d
        "trn1 v4.4s, v0.4s, v1.4s       \n\t"   // a0 b0 a2 b2
        "trn2 v5.4s, v0.4s, v1.4s       \n\t"   // a1 b1 a3 b3
        "trn1 v6.4s, v2.4s, v3.4s       \n\t"   // c0 d0 c2 d2
        "trn2 v7.4s, v2.4s, v3.4s       \n\t"   // c1 d1 c3 d3
        "trn1 v0.2d, v4.2d, v6.2d       \n\t"   // a0 b0 c0 d0
        "trn1 v1.2d, v5.2d, v7.2d       \n\t"   // a1 b1 c1 d1
        "trn2 v2.2d, v4.2d, v6.2d       \n\t"   // a2 b2 c2 d2
        "trn2 v3.2d, v5.2d, v7.2d       \n\t"   // a3 b3 c3 d3
        "st1  {v0.4s-v3.4s}, [%[dst]]   \n\t"   // write the four transposed rows
        : /* no register outputs; the result is written through dst */
        : [dst] "r" (m_des), [src] "r" (m_src1)
        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory"
    );
}
ch15_07/main.cpp の実行例 |
arm64@manet Ch15_07 % g++ -I.. -std=c++11 -O -S neon.cpp arm64@manet Ch15_07 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out arm64@manet Ch15_07 % ./a.out Results for Mat4x4TestF32 Matrix m_src1 10.0 11.0 12.0 13.0 20.0 21.0 22.0 23.0 30.0 31.0 32.0 33.0 40.0 41.0 42.0 43.0 Matrix m_des1 (transpose of m_src1) 10.0 20.0 30.0 40.0 11.0 21.0 31.0 41.0 12.0 22.0 32.0 42.0 13.0 23.0 33.0 43.0 Matrix m_des2 (transpose of m_src1) 10.0 20.0 30.0 40.0 11.0 21.0 31.0 41.0 12.0 22.0 32.0 42.0 13.0 23.0 33.0 43.0 arm64@manet Ch15_07 % |