mdct: Add neon implementation of FFT

2026-05-19 01:48:02 +00:00 · 2022-05-10 13:56:39 +02:00
parent c4f48f2618
commit f182fe7e0d
6 changed files with 624 additions and 67 deletions
--- a/test/neon/makefile.mk
+++ b/test/neon/makefile.mk
@@ -17,6 +17,7 @@
 test_neon_src += \
    $(TEST_DIR)/neon/test_neon.c \
    $(TEST_DIR)/neon/ltpf_neon.c \
+    $(TEST_DIR)/neon/mdct_neon.c \
    $(SRC_DIR)/tables.c

 test_neon_include += $(SRC_DIR)
--- a/test/neon/mdct_neon.c
+++ b/test/neon/mdct_neon.c
@@ -0,0 +1,74 @@
+/******************************************************************************
+ *
+ *  Copyright 2022 Google LLC
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at:
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ ******************************************************************************/
+
+#include "neon.h"
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+/* -------------------------------------------------------------------------- */
+
+#define TEST_NEON
+#include <mdct.c>
+
+/* -------------------------------------------------------------------------- */
+
+static int check_fft(void)
+{
+    struct lc3_complex x[240];
+    struct lc3_complex y[240], y_neon[240];
+
+    for (int i = 0; i < 240; i++) {
+          x[i].re = (double)rand() / RAND_MAX;
+          x[i].im = (double)rand() / RAND_MAX;
+    }
+
+    fft_5(x, y, 240/5);
+    neon_fft_5(x, y_neon, 240/5);
+    for (int i = 0; i < 240; i++)
+        if (fabsf(y[i].re - y_neon[i].re) > 1e-6f ||
+            fabsf(y[i].im - y_neon[i].im) > 1e-6f   )
+            return -1;
+
+    fft_bf3(lc3_fft_twiddles_bf3[0], x, y, 240/15);
+    neon_fft_bf3(lc3_fft_twiddles_bf3[0], x, y_neon, 240/15);
+    for (int i = 0; i < 240; i++)
+        if (fabsf(y[i].re - y_neon[i].re) > 1e-6f ||
+            fabsf(y[i].im - y_neon[i].im) > 1e-6f   )
+            return -1;
+
+    fft_bf2(lc3_fft_twiddles_bf2[0][1], x, y, 240/30);
+    neon_fft_bf2(lc3_fft_twiddles_bf2[0][1], x, y_neon, 240/30);
+    for (int i = 0; i < 240; i++)
+        if (fabsf(y[i].re - y_neon[i].re) > 1e-6f ||
+            fabsf(y[i].im - y_neon[i].im) > 1e-6f   )
+            return -1;
+
+    return 0;
+}
+
+int check_mdct(void)
+{
+    int ret;
+
+    if ((ret = check_fft()) < 0)
+        return ret;
+
+    return 0;
+}
--- a/test/neon/neon.h
+++ b/test/neon/neon.h
@@ -25,6 +25,11 @@

 #include <stdint.h>

+
+/* ----------------------------------------------------------------------------
+ *  Integer
+ * -------------------------------------------------------------------------- */
+
 typedef struct { int16_t e[4]; } int16x4_t;

 typedef struct { int16_t e[8]; } int16x8_t;
@@ -32,82 +37,34 @@ typedef struct { int32_t e[4]; } int32x4_t;
 typedef struct { int64_t e[2]; } int64x2_t;


-/* ----------------------------------------------------------------------------
- *  Load / Store
- * -------------------------------------------------------------------------- */
+/**
+ * Load / Store
+ */

 __attribute__((unused))
 static int16x4_t vld1_s16(const int16_t *p)
 {
-    int16x4_t r;
-
-    for (int i = 0; i < 4; i++)
-        r.e[i] = *(p++);
-
-    return r;
-}
-
-__attribute__((unused))
-static int64x2_t vmovq_n_s64(int64_t v)
-{
-    int64x2_t r;
-
-    r.e[0] = v;
-    r.e[1] = v;
-
-    return r;
+    return (int16x4_t){ { p[0], p[1], p[2], p[3] } };
 }


-/* ----------------------------------------------------------------------------
- *  Move
- * -------------------------------------------------------------------------- */
-
-__attribute__((unused))
-static int32x4_t vmovq_n_s32(uint32_t v)
-{
-    int32x4_t r;
-
-    for (int i = 0; i < 4; i++)
-        r.e[i] = v;
-
-    return r;
-}
-
-__attribute__((unused))
-static int16x4_t vext_s16(int16x4_t a, int16x4_t b, const int n)
-{
-    int16x4_t r;
-    int i = 0;
-
-    for (; i < n; i++) r.e[3-i] = b.e[(n-1)-i];
-    for (; i < 4; i++) r.e[3-i] = a.e[3-(i-n)];
-
-    return r;
-}
-
-/* ----------------------------------------------------------------------------
- *  Arithmetic
- * -------------------------------------------------------------------------- */
+/**
+ * Arithmetic
+ */

 __attribute__((unused))
 static int32x4_t vmull_s16(int16x4_t a, int16x4_t b)
 {
-    int32x4_t r;
-
-    for (int i = 0; i < 4; i++)
-        r.e[i] = (int32_t)a.e[i] * b.e[i];
-
-    return r;
+    return (int32x4_t){ { a.e[0] * b.e[0], a.e[1] * b.e[1],
+                          a.e[2] * b.e[2], a.e[3] * b.e[3]  } };
 }

 __attribute__((unused))
 static int32x4_t vmlal_s16(int32x4_t r, int16x4_t a, int16x4_t b)
 {
-    for (int i = 0; i < 4; i++)
-        r.e[i] += (int32_t)a.e[i] * b.e[i];
-
-    return r;
+    return (int32x4_t){ {
+        r.e[0] + a.e[0] * b.e[0], r.e[1] + a.e[1] * b.e[1],
+        r.e[2] + a.e[2] * b.e[2], r.e[3] + a.e[3] * b.e[3] } };
 }

 __attribute__((unused))
@@ -122,9 +79,9 @@ static int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
 }


-/* ----------------------------------------------------------------------------
- *  Reduce
- * -------------------------------------------------------------------------- */
+/**
+ * Reduce
+ */

 __attribute__((unused))
 static int32_t vaddvq_s32(int32x4_t v)
@@ -138,4 +95,236 @@ static int64_t vaddvq_s64(int64x2_t v)
    return v.e[0] + v.e[1];
 }

+
+/**
+ * Manipulation
+ */
+
+__attribute__((unused))
+static int16x4_t vext_s16(int16x4_t a, int16x4_t b, const int n)
+{
+    int16_t x[] = { a.e[0], a.e[1], a.e[2], a.e[3],
+                    b.e[0], b.e[1], b.e[2], b.e[3] };
+
+    return (int16x4_t){ { x[n], x[n+1], x[n+2], x[n+3] } };
+}
+
+__attribute__((unused))
+static int32x4_t vmovq_n_s32(uint32_t v)
+{
+    return (int32x4_t){ { v, v, v, v } };
+}
+
+__attribute__((unused))
+static int64x2_t vmovq_n_s64(int64_t v)
+{
+    return (int64x2_t){ { v, v, } };
+}
+
+
+
+/* ----------------------------------------------------------------------------
+ *  Floating Point
+ * -------------------------------------------------------------------------- */
+
+typedef struct { float e[2]; } float32x2_t;
+typedef struct { float e[4]; } float32x4_t;
+
+typedef struct { float32x2_t val[2]; } float32x2x2_t;
+typedef struct { float32x4_t val[2]; } float32x4x2_t;
+
+
+/**
+ * Load / Store
+ */
+
+__attribute__((unused))
+static float32x2_t vld1_f32(const float *p)
+{
+    return (float32x2_t){ { p[0], p[1] } };
+}
+
+__attribute__((unused))
+static float32x4_t vld1q_f32(const float *p)
+{
+    return (float32x4_t){ { p[0], p[1], p[2], p[3] } };
+}
+
+__attribute__((unused))
+static float32x4_t vld1q_dup_f32(const float *p)
+{
+    return (float32x4_t){ { p[0], p[0], p[0], p[0] } };
+}
+
+__attribute__((unused))
+static float32x2x2_t vld2_f32(const float *p)
+{
+    return (float32x2x2_t){ .val[0] = { { p[0], p[2] } },
+                            .val[1] = { { p[1], p[3] } } };
+}
+
+__attribute__((unused))
+static float32x4x2_t vld2q_f32(const float *p)
+{
+    return (float32x4x2_t){ .val[0] = { { p[0], p[2], p[4], p[6] } },
+                            .val[1] = { { p[1], p[3], p[5], p[7] } } };
+}
+
+__attribute__((unused))
+static void vst1_f32(float *p, float32x2_t v)
+{
+    p[0] = v.e[0], p[1] = v.e[1];
+}
+
+__attribute__((unused))
+static void vst1q_f32(float *p, float32x4_t v)
+{
+    p[0] = v.e[0], p[1] = v.e[1], p[2] = v.e[2], p[3] = v.e[3];
+}
+
+/**
+ * Arithmetic
+ */
+
+__attribute__((unused))
+static float32x2_t vneg_f32(float32x2_t a)
+{
+    return (float32x2_t){ { -a.e[0], -a.e[1] } };
+}
+
+__attribute__((unused))
+static float32x4_t vnegq_f32(float32x4_t a)
+{
+    return (float32x4_t){ { -a.e[0], -a.e[1], -a.e[2], -a.e[3] } };
+}
+
+__attribute__((unused))
+static float32x4_t vaddq_f32(float32x4_t a, float32x4_t b)
+{
+    return (float32x4_t){ { a.e[0] + b.e[0], a.e[1] + b.e[1],
+                            a.e[2] + b.e[2], a.e[3] + b.e[3] } };
+}
+
+__attribute__((unused))
+static float32x4_t vsubq_f32(float32x4_t a, float32x4_t b)
+{
+    return (float32x4_t){ { a.e[0] - b.e[0], a.e[1] - b.e[1],
+                            a.e[2] - b.e[2], a.e[3] - b.e[3] } };
+}
+
+__attribute__((unused))
+static float32x2_t vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c)
+{
+    return (float32x2_t){ {
+        a.e[0] + b.e[0] * c.e[0], a.e[1] + b.e[1] * c.e[1] } };
+}
+
+__attribute__((unused))
+static float32x4_t vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
+{
+    return (float32x4_t){ {
+        a.e[0] + b.e[0] * c.e[0], a.e[1] + b.e[1] * c.e[1],
+        a.e[2] + b.e[2] * c.e[2], a.e[3] + b.e[3] * c.e[3] } };
+}
+
+__attribute__((unused))
+static float32x2_t vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c)
+{
+    return (float32x2_t){ {
+        a.e[0] - b.e[0] * c.e[0], a.e[1] - b.e[1] * c.e[1] } };
+}
+
+__attribute__((unused))
+static float32x4_t vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c)
+{
+    return (float32x4_t){ {
+        a.e[0] - b.e[0] * c.e[0], a.e[1] - b.e[1] * c.e[1],
+        a.e[2] - b.e[2] * c.e[2], a.e[3] - b.e[3] * c.e[3] } };
+}
+
+
+/**
+ * Manipulation
+ */
+
+__attribute__((unused))
+static float32x2_t vcreate_f32(uint64_t u)
+{
+    float *f = (float *)&u;
+    return (float32x2_t){ { f[0] , f[1] } };
+}
+
+__attribute__((unused))
+static float32x4_t vcombine_f32(float32x2_t a, float32x2_t b)
+{
+    return (float32x4_t){ { a.e[0], a.e[1], b.e[0], b.e[1] } };
+}
+
+__attribute__((unused))
+static float32x2_t vget_low_f32(float32x4_t a)
+{
+    return (float32x2_t){ { a.e[0], a.e[1] } };
+}
+
+__attribute__((unused))
+static float32x2_t vget_high_f32(float32x4_t a)
+{
+    return (float32x2_t){ { a.e[2], a.e[3] } };
+}
+
+__attribute__((unused))
+static float32x4_t vmovq_n_f32(float v)
+{
+    return (float32x4_t){ { v, v, v, v } };
+}
+
+__attribute__((unused))
+static float32x2_t vrev64_f32(float32x2_t v)
+{
+    return (float32x2_t){ { v.e[1], v.e[0] } };
+}
+
+__attribute__((unused))
+static float32x4_t vrev64q_f32(float32x4_t v)
+{
+    return (float32x4_t){ { v.e[1], v.e[0], v.e[3], v.e[2] } };
+}
+
+__attribute__((unused))
+static float32x2_t vtrn1_f32(float32x2_t a, float32x2_t b)
+{
+    return (float32x2_t){ { a.e[0], b.e[0] } };
+}
+
+__attribute__((unused))
+static float32x2_t vtrn2_f32(float32x2_t a, float32x2_t b)
+{
+    return (float32x2_t){ { a.e[1], b.e[1] } };
+}
+
+__attribute__((unused))
+static float32x4_t vtrn1q_f32(float32x4_t a, float32x4_t b)
+{
+    return (float32x4_t){ { a.e[0], b.e[0], a.e[2], b.e[2] } };
+}
+
+__attribute__((unused))
+static float32x4_t vtrn2q_f32(float32x4_t a, float32x4_t b)
+{
+    return (float32x4_t){ { a.e[1], b.e[1], a.e[3], b.e[3] } };
+}
+
+__attribute__((unused))
+static float32x4_t vzip1q_f32(float32x4_t a, float32x4_t b)
+{
+    return (float32x4_t){ { a.e[0], b.e[0], a.e[1], b.e[1] } };
+}
+
+__attribute__((unused))
+static float32x4_t vzip2q_f32(float32x4_t a, float32x4_t b)
+{
+    return (float32x4_t){ { a.e[2], b.e[2], a.e[3], b.e[3] } };
+}
+
+
 #endif /* __ARM_NEON */
--- a/test/neon/test_neon.c
+++ b/test/neon/test_neon.c
@@ -19,6 +19,7 @@
 #include <stdio.h>

 int check_ltpf(void);
+int check_mdct(void);

 int main()
 {
@@ -28,5 +29,9 @@ int main()
    printf("%s\n", (r = check_ltpf()) == 0 ? "OK" : "Failed");
    ret = ret || r;

+    printf("Checking MDCT Neon... "); fflush(stdout);
+    printf("%s\n", (r = check_mdct()) == 0 ? "OK" : "Failed");
+    ret = ret || r;
+
    return ret;
 }