mirror of
https://github.com/google/liblc3.git
synced 2026-05-30 08:27:01 +00:00
mdct: Add neon implementation of FFT
This commit is contained in:
+11
-4
@@ -18,6 +18,8 @@
|
||||
|
||||
#include "tables.h"
|
||||
|
||||
#include "mdct_neon.h"
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* FFT processing
|
||||
@@ -26,8 +28,9 @@
|
||||
/**
|
||||
* FFT 5 Points
|
||||
* x, y Input and output coefficients, of size 5xn
|
||||
* n Number of interleaved transform to perform
|
||||
* n Number of interleaved transform to perform (n % 2 = 0)
|
||||
*/
|
||||
#ifndef fft_5
|
||||
LC3_HOT static inline void fft_5(
|
||||
const struct lc3_complex *x, struct lc3_complex *y, int n)
|
||||
{
|
||||
@@ -50,6 +53,7 @@ LC3_HOT static inline void fft_5(
|
||||
{ x[2*n].re - x[3*n].re, x[2*n].im - x[3*n].im };
|
||||
|
||||
y[0].re = x[0].re + s14.re + s23.re;
|
||||
|
||||
y[0].im = x[0].im + s14.im + s23.im;
|
||||
|
||||
y[1].re = x[0].re + s14.re * cos1 - d14.im * sin1
|
||||
@@ -77,6 +81,7 @@ LC3_HOT static inline void fft_5(
|
||||
+ s23.im * cos2 - d23.re * sin2;
|
||||
}
|
||||
}
|
||||
#endif /* fft_5 */
|
||||
|
||||
/**
|
||||
* FFT Butterfly 3 Points
|
||||
@@ -84,6 +89,7 @@ LC3_HOT static inline void fft_5(
|
||||
* twiddles Twiddles factors, determine size of transform
|
||||
* n Number of interleaved transforms
|
||||
*/
|
||||
#ifndef fft_bf3
|
||||
LC3_HOT static inline void fft_bf3(
|
||||
const struct lc3_fft_bf3_twiddles *twiddles,
|
||||
const struct lc3_complex *x, struct lc3_complex *y, int n)
|
||||
@@ -95,8 +101,7 @@ LC3_HOT static inline void fft_bf3(
|
||||
const struct lc3_complex *x0 = x, *x1 = x0 + n*n3, *x2 = x1 + n*n3;
|
||||
struct lc3_complex *y0 = y, *y1 = y0 + n3, *y2 = y1 + n3;
|
||||
|
||||
for (int i = 0; i < n; i++, y0 += 3*n3, y1 += 3*n3, y2 += 3*n3) {
|
||||
|
||||
for (int i = 0; i < n; i++, y0 += 3*n3, y1 += 3*n3, y2 += 3*n3)
|
||||
for (int j = 0; j < n3; j++, x0++, x1++, x2++) {
|
||||
|
||||
y0[j].re = x0->re + x1->re * w0[j][0].re - x1->im * w0[j][0].im
|
||||
@@ -117,8 +122,8 @@ LC3_HOT static inline void fft_bf3(
|
||||
y2[j].im = x0->im + x1->im * w2[j][0].re + x1->re * w2[j][0].im
|
||||
+ x2->im * w2[j][1].re + x2->re * w2[j][1].im;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* fft_bf3 */
|
||||
|
||||
/**
|
||||
* FFT Butterfly 2 Points
|
||||
@@ -126,6 +131,7 @@ LC3_HOT static inline void fft_bf3(
|
||||
* x, y Input and output coefficients
|
||||
* n Number of interleaved transforms
|
||||
*/
|
||||
#ifndef fft_bf2
|
||||
LC3_HOT static inline void fft_bf2(
|
||||
const struct lc3_fft_bf2_twiddles *twiddles,
|
||||
const struct lc3_complex *x, struct lc3_complex *y, int n)
|
||||
@@ -148,6 +154,7 @@ LC3_HOT static inline void fft_bf2(
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* fft_bf2 */
|
||||
|
||||
/**
|
||||
* Perform FFT
|
||||
|
||||
+281
@@ -0,0 +1,281 @@
|
||||
/******************************************************************************
|
||||
*
|
||||
* Copyright 2022 Google LLC
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at:
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
#if __ARM_NEON
|
||||
|
||||
/**
|
||||
* Configuration
|
||||
*/
|
||||
|
||||
#ifndef TEST_NEON
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#define fft_5 neon_fft_5
|
||||
#define fft_bf3 neon_fft_bf3
|
||||
#define fft_bf2 neon_fft_bf2
|
||||
|
||||
#endif /* TEST_NEON */
|
||||
|
||||
|
||||
/**
|
||||
* FFT 5 Points
|
||||
* The number of interleaved transform `n` assumed to be even
|
||||
*/
|
||||
LC3_HOT static inline void neon_fft_5(
|
||||
const struct lc3_complex *x, struct lc3_complex *y, int n)
|
||||
{
|
||||
static const union { float f[2]; uint64_t u64; }
|
||||
__cos1 = { { 0.3090169944, 0.3090169944 } },
|
||||
__cos2 = { { -0.8090169944, -0.8090169944 } },
|
||||
__sin1 = { { 0.9510565163, -0.9510565163 } },
|
||||
__sin2 = { { 0.5877852523, -0.5877852523 } };
|
||||
|
||||
float32x2_t sin1 = vcreate_f32(__sin1.u64);
|
||||
float32x2_t sin2 = vcreate_f32(__sin2.u64);
|
||||
float32x2_t cos1 = vcreate_f32(__cos1.u64);
|
||||
float32x2_t cos2 = vcreate_f32(__cos2.u64);
|
||||
|
||||
float32x4_t sin1q = vcombine_f32(sin1, sin1);
|
||||
float32x4_t sin2q = vcombine_f32(sin2, sin2);
|
||||
float32x4_t cos1q = vcombine_f32(cos1, cos1);
|
||||
float32x4_t cos2q = vcombine_f32(cos2, cos2);
|
||||
|
||||
for (int i = 0; i < n; i += 2, x += 2, y += 10) {
|
||||
|
||||
float32x4_t y0, y1, y2, y3, y4;
|
||||
|
||||
float32x4_t x0 = vld1q_f32( (float *)(x + 0*n) );
|
||||
float32x4_t x1 = vld1q_f32( (float *)(x + 1*n) );
|
||||
float32x4_t x2 = vld1q_f32( (float *)(x + 2*n) );
|
||||
float32x4_t x3 = vld1q_f32( (float *)(x + 3*n) );
|
||||
float32x4_t x4 = vld1q_f32( (float *)(x + 4*n) );
|
||||
|
||||
float32x4_t s14 = vaddq_f32(x1, x4);
|
||||
float32x4_t s23 = vaddq_f32(x2, x3);
|
||||
|
||||
float32x4_t d14 = vrev64q_f32( vsubq_f32(x1, x4) );
|
||||
float32x4_t d23 = vrev64q_f32( vsubq_f32(x2, x3) );
|
||||
|
||||
y0 = vaddq_f32( x0, vaddq_f32(s14, s23) );
|
||||
|
||||
y4 = vfmaq_f32( x0, s14, cos1q );
|
||||
y4 = vfmaq_f32( y4, s23, cos2q );
|
||||
|
||||
y1 = vfmaq_f32( y4, d14, sin1q );
|
||||
y1 = vfmaq_f32( y1, d23, sin2q );
|
||||
|
||||
y4 = vfmsq_f32( y4, d14, sin1q );
|
||||
y4 = vfmsq_f32( y4, d23, sin2q );
|
||||
|
||||
y3 = vfmaq_f32( x0, s14, cos2q );
|
||||
y3 = vfmaq_f32( y3, s23, cos1q );
|
||||
|
||||
y2 = vfmaq_f32( y3, d14, sin2q );
|
||||
y2 = vfmsq_f32( y2, d23, sin1q );
|
||||
|
||||
y3 = vfmsq_f32( y3, d14, sin2q );
|
||||
y3 = vfmaq_f32( y3, d23, sin1q );
|
||||
|
||||
vst1_f32( (float *)(y + 0), vget_low_f32(y0) );
|
||||
vst1_f32( (float *)(y + 1), vget_low_f32(y1) );
|
||||
vst1_f32( (float *)(y + 2), vget_low_f32(y2) );
|
||||
vst1_f32( (float *)(y + 3), vget_low_f32(y3) );
|
||||
vst1_f32( (float *)(y + 4), vget_low_f32(y4) );
|
||||
|
||||
vst1_f32( (float *)(y + 5), vget_high_f32(y0) );
|
||||
vst1_f32( (float *)(y + 6), vget_high_f32(y1) );
|
||||
vst1_f32( (float *)(y + 7), vget_high_f32(y2) );
|
||||
vst1_f32( (float *)(y + 8), vget_high_f32(y3) );
|
||||
vst1_f32( (float *)(y + 9), vget_high_f32(y4) );
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* FFT Butterfly 3 Points
|
||||
*/
|
||||
LC3_HOT static inline void neon_fft_bf3(
|
||||
const struct lc3_fft_bf3_twiddles *twiddles,
|
||||
const struct lc3_complex *x, struct lc3_complex *y, int n)
|
||||
{
|
||||
int n3 = twiddles->n3;
|
||||
const struct lc3_complex (*w0_ptr)[2] = twiddles->t;
|
||||
const struct lc3_complex (*w1_ptr)[2] = w0_ptr + n3;
|
||||
const struct lc3_complex (*w2_ptr)[2] = w1_ptr + n3;
|
||||
|
||||
const struct lc3_complex *x0_ptr = x;
|
||||
const struct lc3_complex *x1_ptr = x0_ptr + n*n3;
|
||||
const struct lc3_complex *x2_ptr = x1_ptr + n*n3;
|
||||
|
||||
struct lc3_complex *y0_ptr = y;
|
||||
struct lc3_complex *y1_ptr = y0_ptr + n3;
|
||||
struct lc3_complex *y2_ptr = y1_ptr + n3;
|
||||
|
||||
for (int j, i = 0; i < n; i++,
|
||||
y0_ptr += 3*n3, y1_ptr += 3*n3, y2_ptr += 3*n3) {
|
||||
|
||||
/* --- Process by pair --- */
|
||||
|
||||
for (j = 0; j < (n3 >> 1); j++,
|
||||
x0_ptr += 2, x1_ptr += 2, x2_ptr += 2) {
|
||||
|
||||
float32x4_t x0 = vld1q_f32( (float *)x0_ptr );
|
||||
float32x4_t x1 = vld1q_f32( (float *)x1_ptr );
|
||||
float32x4_t x2 = vld1q_f32( (float *)x2_ptr );
|
||||
|
||||
float32x4_t x1r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x1)), x1 );
|
||||
float32x4_t x2r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x2)), x2 );
|
||||
|
||||
float32x4x2_t wn;
|
||||
float32x4_t yn;
|
||||
|
||||
wn = vld2q_f32( (float *)(w0_ptr + 2*j) );
|
||||
|
||||
yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
|
||||
yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
|
||||
yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
|
||||
yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
|
||||
vst1q_f32( (float *)(y0_ptr + 2*j), yn );
|
||||
|
||||
wn = vld2q_f32( (float *)(w1_ptr + 2*j) );
|
||||
|
||||
yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
|
||||
yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
|
||||
yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
|
||||
yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
|
||||
vst1q_f32( (float *)(y1_ptr + 2*j), yn );
|
||||
|
||||
wn = vld2q_f32( (float *)(w2_ptr + 2*j) );
|
||||
|
||||
yn = vfmaq_f32( x0, x1 , vtrn1q_f32(wn.val[0], wn.val[0]) );
|
||||
yn = vfmaq_f32( yn, x1r, vtrn1q_f32(wn.val[1], wn.val[1]) );
|
||||
yn = vfmaq_f32( yn, x2 , vtrn2q_f32(wn.val[0], wn.val[0]) );
|
||||
yn = vfmaq_f32( yn, x2r, vtrn2q_f32(wn.val[1], wn.val[1]) );
|
||||
vst1q_f32( (float *)(y2_ptr + 2*j), yn );
|
||||
|
||||
}
|
||||
|
||||
/* --- Last iteration --- */
|
||||
|
||||
if (n3 & 1) {
|
||||
|
||||
float32x2x2_t wn;
|
||||
float32x2_t yn;
|
||||
|
||||
float32x2_t x0 = vld1_f32( (float *)(x0_ptr++) );
|
||||
float32x2_t x1 = vld1_f32( (float *)(x1_ptr++) );
|
||||
float32x2_t x2 = vld1_f32( (float *)(x2_ptr++) );
|
||||
|
||||
float32x2_t x1r = vtrn1_f32( vrev64_f32(vneg_f32(x1)), x1 );
|
||||
float32x2_t x2r = vtrn1_f32( vrev64_f32(vneg_f32(x2)), x2 );
|
||||
|
||||
wn = vld2_f32( (float *)(w0_ptr + 2*j) );
|
||||
|
||||
yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
|
||||
yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
|
||||
yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
|
||||
yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
|
||||
vst1_f32( (float *)(y0_ptr + 2*j), yn );
|
||||
|
||||
wn = vld2_f32( (float *)(w1_ptr + 2*j) );
|
||||
|
||||
yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
|
||||
yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
|
||||
yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
|
||||
yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
|
||||
vst1_f32( (float *)(y1_ptr + 2*j), yn );
|
||||
|
||||
wn = vld2_f32( (float *)(w2_ptr + 2*j) );
|
||||
|
||||
yn = vfma_f32( x0, x1 , vtrn1_f32(wn.val[0], wn.val[0]) );
|
||||
yn = vfma_f32( yn, x1r, vtrn1_f32(wn.val[1], wn.val[1]) );
|
||||
yn = vfma_f32( yn, x2 , vtrn2_f32(wn.val[0], wn.val[0]) );
|
||||
yn = vfma_f32( yn, x2r, vtrn2_f32(wn.val[1], wn.val[1]) );
|
||||
vst1_f32( (float *)(y2_ptr + 2*j), yn );
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* FFT Butterfly 2 Points
|
||||
*/
|
||||
LC3_HOT static inline void neon_fft_bf2(
|
||||
const struct lc3_fft_bf2_twiddles *twiddles,
|
||||
const struct lc3_complex *x, struct lc3_complex *y, int n)
|
||||
{
|
||||
int n2 = twiddles->n2;
|
||||
const struct lc3_complex *w_ptr = twiddles->t;
|
||||
|
||||
const struct lc3_complex *x0_ptr = x;
|
||||
const struct lc3_complex *x1_ptr = x0_ptr + n*n2;
|
||||
|
||||
struct lc3_complex *y0_ptr = y;
|
||||
struct lc3_complex *y1_ptr = y0_ptr + n2;
|
||||
|
||||
for (int j, i = 0; i < n; i++, y0_ptr += 2*n2, y1_ptr += 2*n2) {
|
||||
|
||||
/* --- Process by pair --- */
|
||||
|
||||
for (j = 0; j < (n2 >> 1); j++, x0_ptr += 2, x1_ptr += 2) {
|
||||
|
||||
float32x4_t x0 = vld1q_f32( (float *)x0_ptr );
|
||||
float32x4_t x1 = vld1q_f32( (float *)x1_ptr );
|
||||
float32x4_t y0, y1;
|
||||
|
||||
float32x4_t x1r = vtrn1q_f32( vrev64q_f32(vnegq_f32(x1)), x1 );
|
||||
|
||||
float32x4_t w = vld1q_f32( (float *)(w_ptr + 2*j) );
|
||||
float32x4_t w_re = vtrn1q_f32(w, w);
|
||||
float32x4_t w_im = vtrn2q_f32(w, w);
|
||||
|
||||
y0 = vfmaq_f32( x0, x1 , w_re );
|
||||
y0 = vfmaq_f32( y0, x1r, w_im );
|
||||
vst1q_f32( (float *)(y0_ptr + 2*j), y0 );
|
||||
|
||||
y1 = vfmsq_f32( x0, x1 , w_re );
|
||||
y1 = vfmsq_f32( y1, x1r, w_im );
|
||||
vst1q_f32( (float *)(y1_ptr + 2*j), y1 );
|
||||
}
|
||||
|
||||
/* --- Last iteration --- */
|
||||
|
||||
if (n2 & 1) {
|
||||
|
||||
float32x2_t x0 = vld1_f32( (float *)(x0_ptr++) );
|
||||
float32x2_t x1 = vld1_f32( (float *)(x1_ptr++) );
|
||||
float32x2_t y0, y1;
|
||||
|
||||
float32x2_t x1r = vtrn1_f32( vrev64_f32(vneg_f32(x1)), x1 );
|
||||
|
||||
float32x2_t w = vld1_f32( (float *)(w_ptr + 2*j) );
|
||||
float32x2_t w_re = vtrn1_f32(w, w);
|
||||
float32x2_t w_im = vtrn2_f32(w, w);
|
||||
|
||||
y0 = vfma_f32( x0, x1 , w_re );
|
||||
y0 = vfma_f32( y0, x1r, w_im );
|
||||
vst1_f32( (float *)(y0_ptr + 2*j), y0 );
|
||||
|
||||
y1 = vfms_f32( x0, x1 , w_re );
|
||||
y1 = vfms_f32( y1, x1r, w_im );
|
||||
vst1_f32( (float *)(y1_ptr + 2*j), y1 );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* __ARM_NEON */
|
||||
@@ -17,6 +17,7 @@
|
||||
test_neon_src += \
|
||||
$(TEST_DIR)/neon/test_neon.c \
|
||||
$(TEST_DIR)/neon/ltpf_neon.c \
|
||||
$(TEST_DIR)/neon/mdct_neon.c \
|
||||
$(SRC_DIR)/tables.c
|
||||
|
||||
test_neon_include += $(SRC_DIR)
|
||||
|
||||
@@ -0,0 +1,74 @@
|
||||
/******************************************************************************
|
||||
*
|
||||
* Copyright 2022 Google LLC
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at:
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
#include "neon.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
#define TEST_NEON
|
||||
#include <mdct.c>
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
static int check_fft(void)
|
||||
{
|
||||
struct lc3_complex x[240];
|
||||
struct lc3_complex y[240], y_neon[240];
|
||||
|
||||
for (int i = 0; i < 240; i++) {
|
||||
x[i].re = (double)rand() / RAND_MAX;
|
||||
x[i].im = (double)rand() / RAND_MAX;
|
||||
}
|
||||
|
||||
fft_5(x, y, 240/5);
|
||||
neon_fft_5(x, y_neon, 240/5);
|
||||
for (int i = 0; i < 240; i++)
|
||||
if (fabsf(y[i].re - y_neon[i].re) > 1e-6f ||
|
||||
fabsf(y[i].im - y_neon[i].im) > 1e-6f )
|
||||
return -1;
|
||||
|
||||
fft_bf3(lc3_fft_twiddles_bf3[0], x, y, 240/15);
|
||||
neon_fft_bf3(lc3_fft_twiddles_bf3[0], x, y_neon, 240/15);
|
||||
for (int i = 0; i < 240; i++)
|
||||
if (fabsf(y[i].re - y_neon[i].re) > 1e-6f ||
|
||||
fabsf(y[i].im - y_neon[i].im) > 1e-6f )
|
||||
return -1;
|
||||
|
||||
fft_bf2(lc3_fft_twiddles_bf2[0][1], x, y, 240/30);
|
||||
neon_fft_bf2(lc3_fft_twiddles_bf2[0][1], x, y_neon, 240/30);
|
||||
for (int i = 0; i < 240; i++)
|
||||
if (fabsf(y[i].re - y_neon[i].re) > 1e-6f ||
|
||||
fabsf(y[i].im - y_neon[i].im) > 1e-6f )
|
||||
return -1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int check_mdct(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if ((ret = check_fft()) < 0)
|
||||
return ret;
|
||||
|
||||
return 0;
|
||||
}
|
||||
+252
-63
@@ -25,6 +25,11 @@
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* Integer
|
||||
* -------------------------------------------------------------------------- */
|
||||
|
||||
typedef struct { int16_t e[4]; } int16x4_t;
|
||||
|
||||
typedef struct { int16_t e[8]; } int16x8_t;
|
||||
@@ -32,82 +37,34 @@ typedef struct { int32_t e[4]; } int32x4_t;
|
||||
typedef struct { int64_t e[2]; } int64x2_t;
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* Load / Store
|
||||
* -------------------------------------------------------------------------- */
|
||||
/**
|
||||
* Load / Store
|
||||
*/
|
||||
|
||||
__attribute__((unused))
|
||||
static int16x4_t vld1_s16(const int16_t *p)
|
||||
{
|
||||
int16x4_t r;
|
||||
|
||||
for (int i = 0; i < 4; i++)
|
||||
r.e[i] = *(p++);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static int64x2_t vmovq_n_s64(int64_t v)
|
||||
{
|
||||
int64x2_t r;
|
||||
|
||||
r.e[0] = v;
|
||||
r.e[1] = v;
|
||||
|
||||
return r;
|
||||
return (int16x4_t){ { p[0], p[1], p[2], p[3] } };
|
||||
}
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* Move
|
||||
* -------------------------------------------------------------------------- */
|
||||
|
||||
__attribute__((unused))
|
||||
static int32x4_t vmovq_n_s32(uint32_t v)
|
||||
{
|
||||
int32x4_t r;
|
||||
|
||||
for (int i = 0; i < 4; i++)
|
||||
r.e[i] = v;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static int16x4_t vext_s16(int16x4_t a, int16x4_t b, const int n)
|
||||
{
|
||||
int16x4_t r;
|
||||
int i = 0;
|
||||
|
||||
for (; i < n; i++) r.e[3-i] = b.e[(n-1)-i];
|
||||
for (; i < 4; i++) r.e[3-i] = a.e[3-(i-n)];
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* Arithmetic
|
||||
* -------------------------------------------------------------------------- */
|
||||
/**
|
||||
* Arithmetic
|
||||
*/
|
||||
|
||||
__attribute__((unused))
|
||||
static int32x4_t vmull_s16(int16x4_t a, int16x4_t b)
|
||||
{
|
||||
int32x4_t r;
|
||||
|
||||
for (int i = 0; i < 4; i++)
|
||||
r.e[i] = (int32_t)a.e[i] * b.e[i];
|
||||
|
||||
return r;
|
||||
return (int32x4_t){ { a.e[0] * b.e[0], a.e[1] * b.e[1],
|
||||
a.e[2] * b.e[2], a.e[3] * b.e[3] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static int32x4_t vmlal_s16(int32x4_t r, int16x4_t a, int16x4_t b)
|
||||
{
|
||||
for (int i = 0; i < 4; i++)
|
||||
r.e[i] += (int32_t)a.e[i] * b.e[i];
|
||||
|
||||
return r;
|
||||
return (int32x4_t){ {
|
||||
r.e[0] + a.e[0] * b.e[0], r.e[1] + a.e[1] * b.e[1],
|
||||
r.e[2] + a.e[2] * b.e[2], r.e[3] + a.e[3] * b.e[3] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
@@ -122,9 +79,9 @@ static int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
|
||||
}
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* Reduce
|
||||
* -------------------------------------------------------------------------- */
|
||||
/**
|
||||
* Reduce
|
||||
*/
|
||||
|
||||
__attribute__((unused))
|
||||
static int32_t vaddvq_s32(int32x4_t v)
|
||||
@@ -138,4 +95,236 @@ static int64_t vaddvq_s64(int64x2_t v)
|
||||
return v.e[0] + v.e[1];
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Manipulation
|
||||
*/
|
||||
|
||||
__attribute__((unused))
|
||||
static int16x4_t vext_s16(int16x4_t a, int16x4_t b, const int n)
|
||||
{
|
||||
int16_t x[] = { a.e[0], a.e[1], a.e[2], a.e[3],
|
||||
b.e[0], b.e[1], b.e[2], b.e[3] };
|
||||
|
||||
return (int16x4_t){ { x[n], x[n+1], x[n+2], x[n+3] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static int32x4_t vmovq_n_s32(uint32_t v)
|
||||
{
|
||||
return (int32x4_t){ { v, v, v, v } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static int64x2_t vmovq_n_s64(int64_t v)
|
||||
{
|
||||
return (int64x2_t){ { v, v, } };
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------------------
|
||||
* Floating Point
|
||||
* -------------------------------------------------------------------------- */
|
||||
|
||||
typedef struct { float e[2]; } float32x2_t;
|
||||
typedef struct { float e[4]; } float32x4_t;
|
||||
|
||||
typedef struct { float32x2_t val[2]; } float32x2x2_t;
|
||||
typedef struct { float32x4_t val[2]; } float32x4x2_t;
|
||||
|
||||
|
||||
/**
|
||||
* Load / Store
|
||||
*/
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x2_t vld1_f32(const float *p)
|
||||
{
|
||||
return (float32x2_t){ { p[0], p[1] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x4_t vld1q_f32(const float *p)
|
||||
{
|
||||
return (float32x4_t){ { p[0], p[1], p[2], p[3] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x4_t vld1q_dup_f32(const float *p)
|
||||
{
|
||||
return (float32x4_t){ { p[0], p[0], p[0], p[0] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x2x2_t vld2_f32(const float *p)
|
||||
{
|
||||
return (float32x2x2_t){ .val[0] = { { p[0], p[2] } },
|
||||
.val[1] = { { p[1], p[3] } } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x4x2_t vld2q_f32(const float *p)
|
||||
{
|
||||
return (float32x4x2_t){ .val[0] = { { p[0], p[2], p[4], p[6] } },
|
||||
.val[1] = { { p[1], p[3], p[5], p[7] } } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static void vst1_f32(float *p, float32x2_t v)
|
||||
{
|
||||
p[0] = v.e[0], p[1] = v.e[1];
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static void vst1q_f32(float *p, float32x4_t v)
|
||||
{
|
||||
p[0] = v.e[0], p[1] = v.e[1], p[2] = v.e[2], p[3] = v.e[3];
|
||||
}
|
||||
|
||||
/**
|
||||
* Arithmetic
|
||||
*/
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x2_t vneg_f32(float32x2_t a)
|
||||
{
|
||||
return (float32x2_t){ { -a.e[0], -a.e[1] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x4_t vnegq_f32(float32x4_t a)
|
||||
{
|
||||
return (float32x4_t){ { -a.e[0], -a.e[1], -a.e[2], -a.e[3] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x4_t vaddq_f32(float32x4_t a, float32x4_t b)
|
||||
{
|
||||
return (float32x4_t){ { a.e[0] + b.e[0], a.e[1] + b.e[1],
|
||||
a.e[2] + b.e[2], a.e[3] + b.e[3] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x4_t vsubq_f32(float32x4_t a, float32x4_t b)
|
||||
{
|
||||
return (float32x4_t){ { a.e[0] - b.e[0], a.e[1] - b.e[1],
|
||||
a.e[2] - b.e[2], a.e[3] - b.e[3] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x2_t vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c)
|
||||
{
|
||||
return (float32x2_t){ {
|
||||
a.e[0] + b.e[0] * c.e[0], a.e[1] + b.e[1] * c.e[1] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x4_t vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
|
||||
{
|
||||
return (float32x4_t){ {
|
||||
a.e[0] + b.e[0] * c.e[0], a.e[1] + b.e[1] * c.e[1],
|
||||
a.e[2] + b.e[2] * c.e[2], a.e[3] + b.e[3] * c.e[3] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x2_t vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c)
|
||||
{
|
||||
return (float32x2_t){ {
|
||||
a.e[0] - b.e[0] * c.e[0], a.e[1] - b.e[1] * c.e[1] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x4_t vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
|
||||
{
|
||||
return (float32x4_t){ {
|
||||
a.e[0] - b.e[0] * c.e[0], a.e[1] - b.e[1] * c.e[1],
|
||||
a.e[2] - b.e[2] * c.e[2], a.e[3] - b.e[3] * c.e[3] } };
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Manipulation
|
||||
*/
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x2_t vcreate_f32(uint64_t u)
|
||||
{
|
||||
float *f = (float *)&u;
|
||||
return (float32x2_t){ { f[0] , f[1] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x4_t vcombine_f32(float32x2_t a, float32x2_t b)
|
||||
{
|
||||
return (float32x4_t){ { a.e[0], a.e[1], b.e[0], b.e[1] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x2_t vget_low_f32(float32x4_t a)
|
||||
{
|
||||
return (float32x2_t){ { a.e[0], a.e[1] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x2_t vget_high_f32(float32x4_t a)
|
||||
{
|
||||
return (float32x2_t){ { a.e[2], a.e[3] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x4_t vmovq_n_f32(float v)
|
||||
{
|
||||
return (float32x4_t){ { v, v, v, v } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x2_t vrev64_f32(float32x2_t v)
|
||||
{
|
||||
return (float32x2_t){ { v.e[1], v.e[0] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x4_t vrev64q_f32(float32x4_t v)
|
||||
{
|
||||
return (float32x4_t){ { v.e[1], v.e[0], v.e[3], v.e[2] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x2_t vtrn1_f32(float32x2_t a, float32x2_t b)
|
||||
{
|
||||
return (float32x2_t){ { a.e[0], b.e[0] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x2_t vtrn2_f32(float32x2_t a, float32x2_t b)
|
||||
{
|
||||
return (float32x2_t){ { a.e[1], b.e[1] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x4_t vtrn1q_f32(float32x4_t a, float32x4_t b)
|
||||
{
|
||||
return (float32x4_t){ { a.e[0], b.e[0], a.e[2], b.e[2] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x4_t vtrn2q_f32(float32x4_t a, float32x4_t b)
|
||||
{
|
||||
return (float32x4_t){ { a.e[1], b.e[1], a.e[3], b.e[3] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x4_t vzip1q_f32(float32x4_t a, float32x4_t b)
|
||||
{
|
||||
return (float32x4_t){ { a.e[0], b.e[0], a.e[1], b.e[1] } };
|
||||
}
|
||||
|
||||
__attribute__((unused))
|
||||
static float32x4_t vzip2q_f32(float32x4_t a, float32x4_t b)
|
||||
{
|
||||
return (float32x4_t){ { a.e[2], b.e[2], a.e[3], b.e[3] } };
|
||||
}
|
||||
|
||||
|
||||
#endif /* __ARM_NEON */
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include <stdio.h>
|
||||
|
||||
int check_ltpf(void);
|
||||
int check_mdct(void);
|
||||
|
||||
int main()
|
||||
{
|
||||
@@ -28,5 +29,9 @@ int main()
|
||||
printf("%s\n", (r = check_ltpf()) == 0 ? "OK" : "Failed");
|
||||
ret = ret || r;
|
||||
|
||||
printf("Checking MDCT Neon... "); fflush(stdout);
|
||||
printf("%s\n", (r = check_mdct()) == 0 ? "OK" : "Failed");
|
||||
ret = ret || r;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user