diff --git a/include/lc3_private.h b/include/lc3_private.h index 8740d84..970ef8e 100644 --- a/include/lc3_private.h +++ b/include/lc3_private.h @@ -26,17 +26,21 @@ /** * Return number of samples, delayed samples and * encoded spectrum coefficients within a frame - * For decoding, keep 18 ms of history, aligned on frames, and a frame + * - For encoding, keep 1.25 ms of temporal window + * - For decoding, keep 18 ms of history, aligned on frames, and a frame */ #define __LC3_NS(dt_us, sr_hz) \ - ((dt_us * sr_hz) / 1000 / 1000) + ( (dt_us * sr_hz) / 1000 / 1000 ) #define __LC3_ND(dt_us, sr_hz) \ ( (dt_us) == 7500 ? 23 * __LC3_NS(dt_us, sr_hz) / 30 \ : 5 * __LC3_NS(dt_us, sr_hz) / 8 ) -#define __LC3_NR(dt_us, sr_hz) \ +#define __LC3_NT(sr_hz) \ + ( (5 * sr_hz) / 4000 ) + +#define __LC3_NH(dt_us, sr_hz) \ ( ((3 - ((dt_us) >= 10000)) + 1) * __LC3_NS(dt_us, sr_hz) ) @@ -76,7 +80,7 @@ typedef struct lc3_attdet_analysis { } lc3_attdet_analysis_t; struct lc3_ltpf_hp50_state { - float s1, s2; + int64_t s1, s2; }; typedef struct lc3_ltpf_analysis { @@ -85,8 +89,8 @@ typedef struct lc3_ltpf_analysis { float nc[2]; struct lc3_ltpf_hp50_state hp50; - float x_12k8[384]; - float x_6k4[178]; + int16_t x_12k8[384]; + int16_t x_6k4[178]; int tc; } lc3_ltpf_analysis_t; @@ -103,11 +107,13 @@ struct lc3_encoder { lc3_ltpf_analysis_t ltpf; lc3_spec_analysis_t spec; + int16_t *xt; float *xs, *xf, s[0]; }; #define LC3_ENCODER_BUFFER_COUNT(dt_us, sr_hz) \ - ( 2*__LC3_NS(dt_us, sr_hz) + __LC3_ND(dt_us, sr_hz) ) + ( ( __LC3_NS(dt_us, sr_hz) + __LC3_NT(sr_hz) ) / 2 + \ + 2*__LC3_NS(dt_us, sr_hz) + __LC3_ND(dt_us, sr_hz) ) #define LC3_ENCODER_MEM_T(dt_us, sr_hz) \ struct { \ @@ -139,11 +145,11 @@ struct lc3_decoder { lc3_ltpf_synthesis_t ltpf; lc3_plc_state_t plc; - float *xr, *xs, *xd, *xg, s[0]; + float *xh, *xs, *xd, *xg, s[0]; }; #define LC3_DECODER_BUFFER_COUNT(dt_us, sr_hz) \ - ( __LC3_NR(dt_us, sr_hz) + __LC3_ND(dt_us, sr_hz) + \ + ( __LC3_NH(dt_us, sr_hz) + __LC3_ND(dt_us, sr_hz) + \ 
__LC3_NS(dt_us, sr_hz) ) #define LC3_DECODER_MEM_T(dt_us, sr_hz) \ diff --git a/src/common.h b/src/common.h index 4275c4e..c9160ca 100644 --- a/src/common.h +++ b/src/common.h @@ -26,6 +26,7 @@ #include #include "fastmath.h" +#include #include #include @@ -72,7 +73,8 @@ /** * Return number of samples, delayed samples and * encoded spectrum coefficients within a frame - * For decoding, keep 18 ms of history, aligned on frames, and a frame + * - For encoding, keep 1.25 ms for temporal window + * - For decoding, keep 18 ms of history, aligned on frames, and a frame */ #define LC3_NS(dt, sr) \ @@ -87,7 +89,10 @@ #define LC3_MAX_NE \ LC3_NE(LC3_DT_10M, LC3_SRATE_48K) -#define LC3_NR(dt, sr) \ +#define LC3_NT(sr) \ + ( (5 * LC3_SRATE_KHZ(sr)) / 4 ) + +#define LC3_NH(dt, sr) \ ( ((3 - dt) + 1) * LC3_NS(dt, sr) ) diff --git a/src/lc3.c b/src/lc3.c index 394a78f..6560b6e 100644 --- a/src/lc3.c +++ b/src/lc3.c @@ -156,11 +156,15 @@ static void load_s16( enum lc3_dt dt = encoder->dt; enum lc3_srate sr = encoder->sr_pcm; + + int16_t *xt = encoder->xt; float *xs = encoder->xs; int ns = LC3_NS(dt, sr); - for (int i = 0; i < ns; i++) - xs[i] = pcm[i*stride]; + for (int i = 0; i < ns; i++) { + int16_t in = pcm[i*stride]; + xt[i] = in, xs[i] = in; + } } /** @@ -175,11 +179,17 @@ static void load_s24( enum lc3_dt dt = encoder->dt; enum lc3_srate sr = encoder->sr_pcm; + + int16_t *xt = encoder->xt; float *xs = encoder->xs; int ns = LC3_NS(dt, sr); - for (int i = 0; i < ns; i++) - xs[i] = ldexpf(pcm[i*stride], -8); + for (int i = 0; i < ns; i++) { + int32_t in = pcm[i*stride]; + + xt[i] = in >> 8; + xs[i] = ldexpf(in, -8); + } } /** @@ -196,7 +206,9 @@ static void analyze(struct lc3_encoder *encoder, enum lc3_srate sr_pcm = encoder->sr_pcm; int ns = LC3_NS(dt, sr_pcm); int nd = LC3_ND(dt, sr_pcm); + int nt = LC3_NT(sr_pcm); + int16_t *xt = encoder->xt; float *xs = encoder->xs; float *xf = encoder->xf; @@ -205,14 +217,16 @@ static void analyze(struct lc3_encoder *encoder, bool att = 
lc3_attdet_run(dt, sr_pcm, nbytes, &encoder->attdet, xs); side->pitch_present = - lc3_ltpf_analyse(dt, sr_pcm, &encoder->ltpf, xs, &side->ltpf); + lc3_ltpf_analyse(dt, sr_pcm, &encoder->ltpf, xt, &side->ltpf); + + memmove(xt - nt, xt + (ns-nt), nt * sizeof(*xt)); /* --- Spectral --- */ float e[LC3_NUM_BANDS]; lc3_mdct_forward(dt, sr_pcm, sr, xs, xf); - memmove(xs - nd, xs + ns-nd, nd * sizeof(float)); + memmove(xs - nd, xs + (ns-nd), nd * sizeof(*xs)); bool nn_flag = lc3_energy_compute(dt, sr, xf, e); if (nn_flag) @@ -299,12 +313,15 @@ struct lc3_encoder *lc3_setup_encoder( struct lc3_encoder *encoder = mem; int ns = LC3_NS(dt, sr_pcm); int nd = LC3_ND(dt, sr_pcm); + int nt = LC3_NT(sr_pcm); *encoder = (struct lc3_encoder){ .dt = dt, .sr = sr, .sr_pcm = sr_pcm, - .xs = encoder->s + nd, - .xf = encoder->s + nd+ns, + + .xt = (int16_t *)encoder->s + nt, + .xs = encoder->s + (nt+ns)/2 + nd, + .xf = encoder->s + (nt+ns)/2 + nd+ns, }; memset(encoder->s, 0, @@ -482,7 +499,7 @@ static void synthesize(struct lc3_decoder *decoder, } lc3_ltpf_synthesize(dt, sr_pcm, nbytes, &decoder->ltpf, - side && side->pitch_present ? &side->ltpf : NULL, decoder->xr, xs); + side && side->pitch_present ? &side->ltpf : NULL, decoder->xh, xs); } /** @@ -493,11 +510,11 @@ static void complete(struct lc3_decoder *decoder) { enum lc3_dt dt = decoder->dt; enum lc3_srate sr_pcm = decoder->sr_pcm; - int nr = LC3_NR(dt, sr_pcm); + int nh = LC3_NH(dt, sr_pcm); int ns = LC3_NS(dt, sr_pcm); - decoder->xs = decoder->xs - decoder->xr < nr - ns ? - decoder->xs + ns : decoder->xr; + decoder->xs = decoder->xs - decoder->xh < nh - ns ? 
+ decoder->xs + ns : decoder->xh; } /** @@ -530,7 +547,7 @@ struct lc3_decoder *lc3_setup_decoder( return NULL; struct lc3_decoder *decoder = mem; - int nr = LC3_NR(dt, sr_pcm); + int nh = LC3_NH(dt, sr_pcm); int ns = LC3_NS(dt, sr_pcm); int nd = LC3_ND(dt, sr_pcm); @@ -538,11 +555,10 @@ struct lc3_decoder *lc3_setup_decoder( .dt = dt, .sr = sr, .sr_pcm = sr_pcm, - .xr = decoder->s, - .xs = decoder->s + nr-ns, - .xd = decoder->s + nr, - .xg = decoder->s + nr+nd, - + .xh = decoder->s, + .xs = decoder->s + nh-ns, + .xd = decoder->s + nh, + .xg = decoder->s + nh+nd, }; lc3_plc_reset(&decoder->plc); diff --git a/src/ltpf.c b/src/ltpf.c index b7aeb96..7c35890 100644 --- a/src/ltpf.c +++ b/src/ltpf.c @@ -19,158 +19,347 @@ #include "ltpf.h" #include "tables.h" +#include "ltpf_arm.h" +#include "ltpf_neon.h" + /* ---------------------------------------------------------------------------- * Resampling * -------------------------------------------------------------------------- */ +/** + * Resampling coefficients + * The coefficients, in fixed Q15, are reordered by phase for each source + * samplerate (coefficient matrix transposed) + */ + +#ifndef resample_8k_12k8 +static const int16_t h_8k_12k8_q15[8*10] = { + 214, 417, -1052, -4529, 26233, -4529, -1052, 417, 214, 0, + 180, 0, -1522, -2427, 24506, -5289, 0, 763, 156, -28, + 92, -323, -1361, 0, 19741, -3885, 1317, 861, 0, -61, + 0, -457, -752, 1873, 13068, 0, 2389, 598, -213, -79, + -61, -398, 0, 2686, 5997, 5997, 2686, 0, -398, -61, + -79, -213, 598, 2389, 0, 13068, 1873, -752, -457, 0, + -61, 0, 861, 1317, -3885, 19741, 0, -1361, -323, 92, + -28, 156, 763, 0, -5289, 24506, -2427, -1522, 0, 180, +}; +#endif /* resample_8k_12k8 */ + +#ifndef resample_16k_12k8 +static const int16_t h_16k_12k8_q15[4*20] = { + -61, 214, -398, 417, 0, -1052, 2686, -4529, 5997, 26233, + 5997, -4529, 2686, -1052, 0, 417, -398, 214, -61, 0, + + -79, 180, -213, 0, 598, -1522, 2389, -2427, 0, 24506, + 13068, -5289, 1873, 0, -752, 763, -457, 156, 0, 
-28, + + -61, 92, 0, -323, 861, -1361, 1317, 0, -3885, 19741, + 19741, -3885, 0, 1317, -1361, 861, -323, 0, 92, -61, + + -28, 0, 156, -457, 763, -752, 0, 1873, -5289, 13068, + 24506, 0, -2427, 2389, -1522, 598, 0, -213, 180, -79, +}; +#endif /* resample_16k_12k8 */ + +#ifndef resample_32k_12k8 +static const int16_t h_32k_12k8_q15[2*40] = { + -30, -31, 46, 107, 0, -199, -162, 209, 430, 0, + -681, -526, 658, 1343, 0, -2264, -1943, 2999, 9871, 13116, + 9871, 2999, -1943, -2264, 0, 1343, 658, -526, -681, 0, + 430, 209, -162, -199, 0, 107, 46, -31, -30, 0, + + -14, -39, 0, 90, 78, -106, -229, 0, 382, 299, + -376, -761, 0, 1194, 937, -1214, -2644, 0, 6534, 12253, + 12253, 6534, 0, -2644, -1214, 937, 1194, 0, -761, -376, + 299, 382, 0, -229, -106, 78, 90, 0, -39, -14, +}; +#endif /* resample_32k_12k8 */ + +#ifndef resample_24k_12k8 +static const int16_t h_24k_12k8_q15[8*30] = { + -50, 19, 143, -93, -290, 278, 485, -658, -701, 1396, + 901, -3019, -1042, 10276, 17488, 10276, -1042, -3019, 901, 1396, + -701, -658, 485, 278, -290, -93, 143, 19, -50, 0, + + -46, 0, 141, -45, -305, 185, 543, -501, -854, 1153, + 1249, -2619, -1908, 8712, 17358, 11772, 0, -3319, 480, 1593, + -504, -796, 399, 367, -261, -142, 138, 40, -52, -5, + + -41, -17, 133, 0, -304, 91, 574, -334, -959, 878, + 1516, -2143, -2590, 7118, 16971, 13161, 1202, -3495, 0, 1731, + -267, -908, 287, 445, -215, -188, 125, 62, -52, -12, + + -34, -30, 120, 41, -291, 0, 577, -164, -1015, 585, + 1697, -1618, -3084, 5534, 16337, 14406, 2544, -3526, -523, 1800, + 0, -985, 152, 509, -156, -230, 104, 83, -48, -19, + + -26, -41, 103, 76, -265, -83, 554, 0, -1023, 288, + 1791, -1070, -3393, 3998, 15474, 15474, 3998, -3393, -1070, 1791, + 288, -1023, 0, 554, -83, -265, 76, 103, -41, -26, + + -19, -48, 83, 104, -230, -156, 509, 152, -985, 0, + 1800, -523, -3526, 2544, 14406, 16337, 5534, -3084, -1618, 1697, + 585, -1015, -164, 577, 0, -291, 41, 120, -30, -34, + + -12, -52, 62, 125, -188, -215, 445, 287, -908, -267, + 1731, 0, 
-3495, 1202, 13161, 16971, 7118, -2590, -2143, 1516, + 878, -959, -334, 574, 91, -304, 0, 133, -17, -41, + + -5, -52, 40, 138, -142, -261, 367, 399, -796, -504, + 1593, 480, -3319, 0, 11772, 17358, 8712, -1908, -2619, 1249, + 1153, -854, -501, 543, 185, -305, -45, 141, 0, -46, +}; +#endif /* resample_24k_12k8 */ + +#ifndef resample_48k_12k8 +static const int16_t h_48k_12k8_q15[4*60] = { + -13, -25, -20, 10, 51, 71, 38, -47, -133, -145, + -42, 139, 277, 242, 0, -329, -511, -351, 144, 698, + 895, 450, -535, -1510, -1697, -521, 1999, 5138, 7737, 8744, + 7737, 5138, 1999, -521, -1697, -1510, -535, 450, 895, 698, + 144, -351, -511, -329, 0, 242, 277, 139, -42, -145, + -133, -47, 38, 71, 51, 10, -20, -25, -13, 0, + + -9, -23, -24, 0, 41, 71, 52, -23, -115, -152, + -78, 92, 254, 272, 76, -251, -493, -427, 0, 576, + 900, 624, -262, -1309, -1763, -954, 1272, 4356, 7203, 8679, + 8169, 5886, 2767, 0, -1542, -1660, -809, 240, 848, 796, + 292, -252, -507, -398, -82, 199, 288, 183, 0, -130, + -145, -71, 20, 69, 60, 20, -15, -26, -17, -3, + + -6, -20, -26, -8, 31, 67, 62, 0, -94, -152, + -108, 45, 223, 287, 143, -167, -454, -480, -134, 439, + 866, 758, 0, -1071, -1748, -1295, 601, 3559, 6580, 8485, + 8485, 6580, 3559, 601, -1295, -1748, -1071, 0, 758, 866, + 439, -134, -480, -454, -167, 143, 287, 223, 45, -108, + -152, -94, 0, 62, 67, 31, -8, -26, -20, -6, + + -3, -17, -26, -15, 20, 60, 69, 20, -71, -145, + -130, 0, 183, 288, 199, -82, -398, -507, -252, 292, + 796, 848, 240, -809, -1660, -1542, 0, 2767, 5886, 8169, + 8679, 7203, 4356, 1272, -954, -1763, -1309, -262, 624, 900, + 576, 0, -427, -493, -251, 76, 272, 254, 92, -78, + -152, -115, -23, 52, 71, 41, 0, -24, -23, -9, +}; +#endif /* resample_48k_12k8 */ + + +/** + * High-pass 50Hz filtering, at 12.8 KHz samplerate + * hp50 Biquad filter state + * xn Input sample, in fixed Q30 + * return Filtered sample, in fixed Q30 + */ +static inline int32_t filter_hp50( + struct lc3_ltpf_hp50_state *hp50, int32_t xn) +{ + int32_t yn; + + 
const int32_t a1 = -2110217691, a2 = 1037111617; + const int32_t b1 = -2110535566, b2 = 1055267782; + + yn = (hp50->s1 + (int64_t)xn * b2) >> 30; + hp50->s1 = (hp50->s2 + (int64_t)xn * b1 - (int64_t)yn * a1); + hp50->s2 = ( (int64_t)xn * b2 - (int64_t)yn * a2); + + return yn; +} + /** * Resample from 8 / 16 / 32 KHz to 12.8 KHz Template - * p Resampling factor with 64 KHz (8, 4 or 2) - * x [-d..-1] Previous, [0..ns-1] Current samples - * y, n [0..n-1] Output `n` processed samples + * p Resampling factor compared to 64 KHz (8, 4 or 2) + * h Coefficients table, arranged by phase + * hp50 High-Pass biquad filter state + * x [-d..-1] Previous, [0..ns-1] Current samples, Q15 + * y, n [0..n-1] Output `n` processed samples, Q14 * + * The `x` vector is aligned on 32 bits * The number of previous samples `d` accessed on `x` is : * d: { 10, 20, 40 } - 1 for resampling factors 8, 4 and 2. */ -static inline void resample_base_64k_12k8(const int p, - struct lc3_ltpf_hp50_state *hp50, const float *x, float *y, int n) +static inline void resample_x64k_12k8(const int p, const int16_t *h, + struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n) { - /* --- Parameters --- - * bn, an: High-Pass Biquad coefficients, - * with `bn` support of rescaling resampling factor. - * Note that it's an High-Pass filter, so we have `b0 = b2`, - * in the following steps we use `b0` as `b2`. 
*/ + const int w = 2*(40 / p); - const int w = 40 / p; + x -= w - 1; - const float *h = lc3_ltpf_h12k8 + 119; - const float a1 = -1.965293373f, b1 = -1.965589417f * 3*LC3_MIN(p, 4); - const float a2 = 0.965885461f, b2 = 0.982794708f * 3*LC3_MIN(p, 4); + for (int i = 0; i < 5*n; i += 5) { + const int16_t *hn = h + (i % p) * w; + const int16_t *xn = x + (i / p); + int32_t un = 0; - /* --- Resampling & filtering --- */ - - for (int i = 0; i < n; i += 8, x += w) - for (int j = 0; j < 40; j += 5) { - - const float *hn = h - 3*(p*w + (j % p)); - const float *xn = x - (2*w - (j / p)); - float yn, un = 0; - - for (int k = 0; k < 2*w; k += 10) { - un += *(++xn) * *(hn += (3*p)); - un += *(++xn) * *(hn += (3*p)); - un += *(++xn) * *(hn += (3*p)); - un += *(++xn) * *(hn += (3*p)); - un += *(++xn) * *(hn += (3*p)); - un += *(++xn) * *(hn += (3*p)); - un += *(++xn) * *(hn += (3*p)); - un += *(++xn) * *(hn += (3*p)); - un += *(++xn) * *(hn += (3*p)); - un += *(++xn) * *(hn += (3*p)); - } - - yn = b2 * un + hp50->s1; - hp50->s1 = b1 * un - a1 * yn + hp50->s2; - hp50->s2 = b2 * un - a2 * yn; - *(y++) = yn; + for (int k = 0; k < w; k += 10) { + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); } + + int32_t yn = filter_hp50(hp50, un); + *(y++) = (yn + (1 << 15)) >> 16; + } } /** * Resample from 24 / 48 KHz to 12.8 KHz Template - * p Resampling factor with 192 KHz (8 or 4) - * x [-d..-1] Previous, [0..ns-1] Current samples - * y, n [0..n-1] Output `n` processed samples + * p Resampling factor with compared to 192 KHz (8 or 4) + * h Arrange by phase coefficients table + * hp50 High-Pass biquad filter state + * x [-d..-1] Previous, [0..ns-1] Current samples, Q15 + * y, n [0..n-1] Output `n` processed samples, Q14 * + * The `x` vector is aligned on 32 bits * The number of 
previous samples `d` accessed on `x` is : * d: { 30, 60 } - 1 for resampling factors 8 and 4. */ -static inline void resample_base_192k_12k8(const int p, - struct lc3_ltpf_hp50_state *hp50, const float *x, float *y, int n) +static inline void resample_x192k_12k8(const int p, const int16_t *h, + struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n) { - /* --- Parameters --- - * bn, an: High-Pass Biquad coefficients, - * with `bn` support of rescaling resampling factor. - * Note that it's an High-Pass filter, so we have `b0 = b2`, - * in the following steps we use `b0` as `b2`. */ + const int w = 2*(120 / p); - const int w = 120 / p; + x -= w - 1; - const float *h = lc3_ltpf_h12k8 + 119; - const float a1 = -1.965293373f, b1 = -1.965589417f * p; - const float a2 = 0.965885461f, b2 = 0.982794708f * p; + for (int i = 0; i < 15*n; i += 15) { + const int16_t *hn = h + (i % p) * w; + const int16_t *xn = x + (i / p); + int32_t un = 0; - /* --- Resampling & filtering --- */ - - for (int i = 0; i < n; i += 8, x += w) - for (int j = 0; j < 120; j += 15) { - - const float *hn = h - (p*w + (j % p)); - const float *xn = x - (2*w - (j / p)); - float yn, un = 0; - - for (int k = 0; k < 2*w; k += 15) { - un += *(++xn) * *(hn += p); - un += *(++xn) * *(hn += p); - un += *(++xn) * *(hn += p); - un += *(++xn) * *(hn += p); - un += *(++xn) * *(hn += p); - un += *(++xn) * *(hn += p); - un += *(++xn) * *(hn += p); - un += *(++xn) * *(hn += p); - un += *(++xn) * *(hn += p); - un += *(++xn) * *(hn += p); - un += *(++xn) * *(hn += p); - un += *(++xn) * *(hn += p); - un += *(++xn) * *(hn += p); - un += *(++xn) * *(hn += p); - un += *(++xn) * *(hn += p); - } - - yn = b2 * un + hp50->s1; - hp50->s1 = b1 * un - a1 * yn + hp50->s2; - hp50->s2 = b2 * un - a2 * yn; - *(y++) = yn; + for (int k = 0; k < w; k += 15) { + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += 
*(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); + un += *(xn++) * *(hn++); } + + int32_t yn = filter_hp50(hp50, un); + *(y++) = (yn + (1 << 15)) >> 16; + } } +/** + * Resample from 8 Khz to 12.8 KHz + * hp50 High-Pass biquad filter state + * x [-10..-1] Previous, [0..ns-1] Current samples, Q15 + * y, n [0..n-1] Output `n` processed samples, Q14 + * + * The `x` vector is aligned on 32 bits + */ +#ifndef resample_8k_12k8 +static void resample_8k_12k8( + struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n) +{ + resample_x64k_12k8(8, h_8k_12k8_q15, hp50, x, y, n); +} +#endif /* resample_8k_12k8 */ + +/** + * Resample from 16 Khz to 12.8 KHz + * hp50 High-Pass biquad filter state + * x [-20..-1] Previous, [0..ns-1] Current samples, in fixed Q15 + * y, n [0..n-1] Output `n` processed samples, in fixed Q14 + * + * The `x` vector is aligned on 32 bits + */ +#ifndef resample_16k_12k8 +static void resample_16k_12k8( + struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n) +{ + resample_x64k_12k8(4, h_16k_12k8_q15, hp50, x, y, n); +} +#endif /* resample_16k_12k8 */ + +/** + * Resample from 32 Khz to 12.8 KHz + * hp50 High-Pass biquad filter state + * x [-40..-1] Previous, [0..ns-1] Current samples, in fixed Q15 + * y, n [0..n-1] Output `n` processed samples, in fixed Q14 + * + * The `x` vector is aligned on 32 bits + */ +#ifndef resample_32k_12k8 +static void resample_32k_12k8( + struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n) +{ + resample_x64k_12k8(2, h_32k_12k8_q15, hp50, x, y, n); +} +#endif /* resample_32k_12k8 */ + +/** + * Resample from 24 Khz to 12.8 KHz + * hp50 High-Pass biquad filter state + * x [-30..-1] Previous, [0..ns-1] Current samples, in fixed Q15 + * y, n [0..n-1] Output `n` processed samples, in fixed Q14 + * + * The `x` vector is aligned on 
32 bits + */ +#ifndef resample_24k_12k8 +static void resample_24k_12k8( + struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n) +{ + resample_x192k_12k8(8, h_24k_12k8_q15, hp50, x, y, n); +} +#endif /* resample_24k_12k8 */ + +/** + * Resample from 48 Khz to 12.8 KHz + * hp50 High-Pass biquad filter state + * x [-60..-1] Previous, [0..ns-1] Current samples, in fixed Q15 + * y, n [0..n-1] Output `n` processed samples, in fixed Q14 + * +* The `x` vector is aligned on 32 bits +*/ +#ifndef resample_48k_12k8 +static void resample_48k_12k8( +struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n) +{ + resample_x192k_12k8(4, h_48k_12k8_q15, hp50, x, y, n); +} +#endif /* resample_48k_12k8 */ + +/** +* Resample to 6.4 KHz +* x [-3..-1] Previous, [0..n-1] Current samples +* y, n [0..n-1] Output `n` processed samples +* +* The `x` vector is aligned on 32 bits + */ +#ifndef resample_6k4 +static void resample_6k4(const int16_t *x, int16_t *y, int n) +{ + static const int16_t h[] = { 18477, 15424, 8105 }; + const int16_t *ye = y + n; + + for (x--; y < ye; x += 2) + *(y++) = (x[0] * h[0] + (x[-1] + x[1]) * h[1] + + (x[-2] + x[2]) * h[2]) >> 16; +} +#endif /* resample_6k4 */ + /** * LTPF Resample to 12.8 KHz implementations for each samplerates */ -static void resample_8k_12k8( - struct lc3_ltpf_hp50_state *hp50, const float *x, float *y, int n) -{ - resample_base_64k_12k8(8, hp50, x, y, n); -} - -static void resample_16k_12k8( - struct lc3_ltpf_hp50_state *hp50, const float *x, float *y, int n) -{ - resample_base_64k_12k8(4, hp50, x, y, n); -} - -static void resample_24k_12k8( - struct lc3_ltpf_hp50_state *hp50, const float *x, float *y, int n) -{ - resample_base_192k_12k8(8, hp50, x, y, n); -} - -static void resample_32k_12k8( - struct lc3_ltpf_hp50_state *hp50, const float *x, float *y, int n) -{ - resample_base_64k_12k8(2, hp50, x, y, n); -} - -static void resample_48k_12k8( - struct lc3_ltpf_hp50_state *hp50, const float *x, float *y, int n) -{ 
- resample_base_192k_12k8(4, hp50, x, y, n); -} - static void (* const resample_12k8[]) - (struct lc3_ltpf_hp50_state *, const float *, float *, int ) = + (struct lc3_ltpf_hp50_state *, const int16_t *, int16_t *, int ) = { [LC3_SRATE_8K ] = resample_8k_12k8, [LC3_SRATE_16K] = resample_16k_12k8, @@ -179,23 +368,6 @@ static void (* const resample_12k8[]) [LC3_SRATE_48K] = resample_48k_12k8, }; -/** - * Resample to 6.4 KHz (cf. 3.3.9.3-4) - * x [-3..-1] Previous, [0..n-1] Current samples - * y, n [0..n-1] Output `n` processed samples - */ -static void resample_6k4(const float *x, float *y, int n) -{ - static const float h[] = { 0.2819382921, 0.2353512128, 0.1236796411 }; - float xn2 = x[-3], xn1 = x[-2], x0 = x[-1], x1, x2; - - for (const float *ye = y + n; y < ye; xn2 = x0, xn1 = x1, x0 = x2) { - x1 = *(x++); x2 = *(x++); - - *(y++) = x0 * h[0] + (xn1 + x1) * h[1] + (xn2 + x2) * h[2]; - } -} - /* ---------------------------------------------------------------------------- * Analysis @@ -203,33 +375,41 @@ static void resample_6k4(const float *x, float *y, int n) /** * Return dot product of 2 vectors - * a, b, n The 2 vectors of size `n` (multiple of 16) + * a, b, n The 2 vectors of size `n` (> 0 and <= 128) * return sum( a[i] * b[i] ), i = [0..n-1] - */ -static inline float dot(const float *a, const float *b, int n) + * + * The size `n` of vectors must be multiple of 16, and less or equal to 128 +*/ +#ifndef dot +static inline float dot(const int16_t *a, const int16_t *b, int n) { - float v = 0; + int64_t v = 0; for (int i = 0; i < (n >> 4); i++) for (int j = 0; j < 16; j++) v += *(a++) * *(b++); - return v; + int32_t v32 = (v + (1 << 5)) >> 6; + return (float)v32; } +#endif /* dot */ /** * Return vector of correlations - * a, b, n The 2 vector of size `n` to correlate + * a, b, n The 2 vector of size `n` (> 0 and <= 128) * y, nc Output the correlation vector of size `nc` * - * The size `n` of input vectors must be multiple of 16 + * The first vector `a` is aligned 
of 32 bits + * The size `n` of vectors is multiple of 16, and less or equal to 128 */ +#ifndef correlate static void correlate( - const float *a, const float *b, int n, float *y, int nc) + const int16_t *a, const int16_t *b, int n, float *y, int nc) { for (const float *ye = y + nc; y < ye; ) *(y++) = dot(a, b--, n); } +#endif /* correlate */ /** * Search the maximum value and returns its argument @@ -279,24 +459,30 @@ static int argmax_weighted( * * The size `n` of vectors must be multiple of 4 */ -static void interpolate(const float *x, int n, int d, float *y) +static void interpolate(const int16_t *x, int n, int d, int16_t *y) { - static const float h4[][8] = { - { 2.09880463e-01, 5.83527575e-01, 2.09880463e-01 }, - { 1.06999186e-01, 5.50075002e-01, 3.35690625e-01, 6.69885837e-03 }, - { 3.96711478e-02, 4.59220930e-01, 4.59220930e-01, 3.96711478e-02 }, - { 6.69885837e-03, 3.35690625e-01, 5.50075002e-01, 1.06999186e-01 }, - }; + static const int16_t h4_q15[][4] = { + { 6877, 19121, 6877, 0 }, { 3506, 18025, 11000, 220 }, + { 1300, 15048, 15048, 1300 }, { 220, 11000, 18025, 3506 } }; - const float *h = h4[d]; - float x3 = x[-2], x2 = x[-1], x1, x0; + const int16_t *h = h4_q15[d]; + int16_t x3 = x[-2], x2 = x[-1], x1, x0; x1 = (*x++); - for (const float *ye = y + n; y < ye; ) { - *(y++) = (x0 = *(x++)) * h[0] + x1 * h[1] + x2 * h[2] + x3 * h[3]; - *(y++) = (x3 = *(x++)) * h[0] + x0 * h[1] + x1 * h[2] + x2 * h[3]; - *(y++) = (x2 = *(x++)) * h[0] + x3 * h[1] + x0 * h[2] + x1 * h[3]; - *(y++) = (x1 = *(x++)) * h[0] + x2 * h[1] + x3 * h[2] + x0 * h[3]; + for (const int16_t *ye = y + n; y < ye; ) { + int32_t yn; + + yn = (x0 = *(x++)) * h[0] + x1 * h[1] + x2 * h[2] + x3 * h[3]; + *(y++) = yn >> 15; + + yn = (x3 = *(x++)) * h[0] + x0 * h[1] + x1 * h[2] + x2 * h[3]; + *(y++) = yn >> 15; + + yn = (x2 = *(x++)) * h[0] + x3 * h[1] + x0 * h[2] + x1 * h[3]; + *(y++) = yn >> 15; + + yn = (x1 = *(x++)) * h[0] + x2 * h[1] + x3 * h[2] + x0 * h[3]; + *(y++) = yn >> 15; } } @@ -306,7 
+492,7 @@ static void interpolate(const float *x, int n, int d, float *y) * d The phase of interpolation (-3 to 3) * return The interpolated value */ -static float interpolate_4(const float *x, int d) +static float interpolate_corr(const float *x, int d) { static const float h4[][8] = { { 1.53572770e-02, -4.72963246e-02, 8.35788573e-02, 8.98638285e-01, @@ -336,9 +522,11 @@ static float interpolate_4(const float *x, int d) * x, n [-114..-17] Previous, [0..n-1] Current 6.4KHz samples * tc Return the pitch-lag estimation * return True when pitch present + * + * The `x` vector is aligned on 32 bits */ static bool detect_pitch( - struct lc3_ltpf_analysis *ltpf, const float *x, int n, int *tc) + struct lc3_ltpf_analysis *ltpf, const int16_t *x, int n, int *tc) { float rm1, rm2; float r[98]; @@ -352,8 +540,8 @@ static bool detect_pitch( int t1 = argmax_weighted(r, nr, -.5f/(nr-1), &rm1); int t2 = k0 + argmax(r + k0, nk, &rm2); - const float *x1 = x - (r0 + t1); - const float *x2 = x - (r0 + t2); + const int16_t *x1 = x - (r0 + t1); + const int16_t *x2 = x - (r0 + t2); float nc1 = rm1 <= 0 ? 
0 : rm1 / sqrtf(dot(x, x, n) * dot(x1, x1, n)); @@ -370,12 +558,14 @@ static bool detect_pitch( /** * Pitch-lag parameter (3.3.9.7) - * x, n [-232..-28] Previous, [0..n-1] Current 12.8KHz samples + * x, n [-232..-28] Previous, [0..n-1] Current 12.8KHz samples, Q14 * tc Pitch-lag estimation * pitch The pitch value, in fixed .4 * return The bitstream pitch index value + * + * The `x` vector is aligned on 32 bits */ -static int refine_pitch(const float *x, int n, int tc, int *pitch) +static int refine_pitch(const int16_t *x, int n, int tc, int *pitch) { float r[17], rm; int e, f; @@ -388,17 +578,17 @@ static int refine_pitch(const float *x, int n, int tc, int *pitch) e = r0 + argmax(r + 4, nr, &rm); const float *re = r + (e - (r0 - 4)); - float dm = interpolate_4(re, f = 0); + float dm = interpolate_corr(re, f = 0); for (int i = 1; i <= 3; i++) { float d; if (e >= 127 && ((i & 1) | (e >= 157))) continue; - if ((d = interpolate_4(re, i)) > dm) + if ((d = interpolate_corr(re, i)) > dm) dm = d, f = i; - if (e > 32 && (d = interpolate_4(re, -i)) > dm) + if (e > 32 && (d = interpolate_corr(re, -i)) > dm) dm = d, f = -i; } @@ -413,31 +603,34 @@ static int refine_pitch(const float *x, int n, int tc, int *pitch) /** * LTPF Analysis */ -bool lc3_ltpf_analyse(enum lc3_dt dt, enum lc3_srate sr, - struct lc3_ltpf_analysis *ltpf, const float *x, struct lc3_ltpf_data *data) +bool lc3_ltpf_analyse( + enum lc3_dt dt, enum lc3_srate sr, struct lc3_ltpf_analysis *ltpf, + const int16_t *x, struct lc3_ltpf_data *data) { /* --- Resampling to 12.8 KHz --- */ - int z_12k8 = sizeof(ltpf->x_12k8) / sizeof(float); + int z_12k8 = sizeof(ltpf->x_12k8) / sizeof(*ltpf->x_12k8); int n_12k8 = dt == LC3_DT_7M5 ? 
96 : 128; memmove(ltpf->x_12k8, ltpf->x_12k8 + n_12k8, - (z_12k8 - n_12k8) * sizeof(float)); + (z_12k8 - n_12k8) * sizeof(*ltpf->x_12k8)); + + int16_t *x_12k8 = ltpf->x_12k8 + (z_12k8 - n_12k8); - float *x_12k8 = ltpf->x_12k8 + (z_12k8 - n_12k8); resample_12k8[sr](&ltpf->hp50, x, x_12k8, n_12k8); x_12k8 -= (dt == LC3_DT_7M5 ? 44 : 24); /* --- Resampling to 6.4 KHz --- */ - int z_6k4 = sizeof(ltpf->x_6k4) / sizeof(float); + int z_6k4 = sizeof(ltpf->x_6k4) / sizeof(*ltpf->x_6k4); int n_6k4 = n_12k8 >> 1; memmove(ltpf->x_6k4, ltpf->x_6k4 + n_6k4, - (z_6k4 - n_6k4) * sizeof(float)); + (z_6k4 - n_6k4) * sizeof(*ltpf->x_6k4)); + + int16_t *x_6k4 = ltpf->x_6k4 + (z_6k4 - n_6k4); - float *x_6k4 = ltpf->x_6k4 + (z_6k4 - n_6k4); resample_6k4(x_12k8, x_6k4, n_6k4); /* --- Pitch detection --- */ @@ -448,7 +641,7 @@ bool lc3_ltpf_analyse(enum lc3_dt dt, enum lc3_srate sr, bool pitch_present = detect_pitch(ltpf, x_6k4, n_6k4, &tc); if (pitch_present) { - float u[n_12k8], v[n_12k8]; + int16_t u[n_12k8], v[n_12k8]; data->pitch_index = refine_pitch(x_12k8, n_12k8, tc, &pitch); @@ -489,14 +682,14 @@ bool lc3_ltpf_analyse(enum lc3_dt dt, enum lc3_srate sr, /** * Synthesis filter template - * xr, nr Ring buffer of filtered samples + * xh, nh History ring buffer of filtered samples * lag Lag parameter in the ring buffer * x0 w-1 previous input samples * x, n Current samples as input, filtered as output * c, w Coefficients `den` then `num`, and width of filter * fade Fading mode of filter -1: Out 1: In 0: None */ -static inline void synthesize_template(const float *xr, int nr, int lag, +static inline void synthesize_template(const float *xh, int nh, int lag, const float *x0, float *x, int n, const float *c, const int w, int fade) { float g = (float)(fade <= 0); @@ -507,15 +700,15 @@ static inline void synthesize_template(const float *xr, int nr, int lag, lag += (w >> 1); - const float *y = x - xr < lag ? 
x + (nr - lag) : x - lag; - const float *y_end = xr + nr - 1; + const float *y = x - xh < lag ? x + (nh - lag) : x - lag; + const float *y_end = xh + nh - 1; for (int j = 0; j < w-1; j++) { u[j] = 0; float yi = *y, xi = *(x0++); - y = y < y_end ? y + 1 : xr; + y = y < y_end ? y + 1 : xh; for (int k = 0; k <= j; k++) u[j-k] -= yi * c[k]; @@ -532,7 +725,7 @@ static inline void synthesize_template(const float *xr, int nr, int lag, for (int j = 0; j < w; j++, g += g_incr) { float yi = *y, xi = *x; - y = y < y_end ? y + 1 : xr; + y = y < y_end ? y + 1 : xh; for (int k = 0; k < w; k++) u[(j+(w-1)-k)%w] -= yi * c[k]; @@ -589,9 +782,9 @@ static void (* const synthesize[])(const float *, int, int, */ void lc3_ltpf_synthesize(enum lc3_dt dt, enum lc3_srate sr, int nbytes, lc3_ltpf_synthesis_t *ltpf, const lc3_ltpf_data_t *data, - const float *xr, float *x) + const float *xh, float *x) { - int nr = LC3_NR(dt, sr); + int nh = LC3_NH(dt, sr); int dt_us = LC3_DT_US(dt); /* --- Filter parameters --- */ @@ -627,15 +820,15 @@ void lc3_ltpf_synthesize(enum lc3_dt dt, enum lc3_srate sr, int nbytes, memcpy(x0, x + nt-(w-1), (w-1) * sizeof(float)); if (!ltpf->active && active) - synthesize[sr](xr, nr, pitch/4, ltpf->x, x, nt, c, 1); + synthesize[sr](xh, nh, pitch/4, ltpf->x, x, nt, c, 1); else if (ltpf->active && !active) - synthesize[sr](xr, nr, ltpf->pitch/4, ltpf->x, x, nt, ltpf->c, -1); + synthesize[sr](xh, nh, ltpf->pitch/4, ltpf->x, x, nt, ltpf->c, -1); else if (ltpf->active && active && ltpf->pitch == pitch) - synthesize[sr](xr, nr, pitch/4, ltpf->x, x, nt, c, 0); + synthesize[sr](xh, nh, pitch/4, ltpf->x, x, nt, c, 0); else if (ltpf->active && active) { - synthesize[sr](xr, nr, ltpf->pitch/4, ltpf->x, x, nt, ltpf->c, -1); - synthesize[sr](xr, nr, pitch/4, - (x <= xr ? x + nr : x) - (w-1), x, nt, c, 1); + synthesize[sr](xh, nh, ltpf->pitch/4, ltpf->x, x, nt, ltpf->c, -1); + synthesize[sr](xh, nh, pitch/4, + (x <= xh ? 
x + nh : x) - (w-1), x, nt, c, 1); } /* --- Remainder --- */ @@ -643,7 +836,7 @@ void lc3_ltpf_synthesize(enum lc3_dt dt, enum lc3_srate sr, int nbytes, memcpy(ltpf->x, x + ns - (w-1), (w-1) * sizeof(float)); if (active) - synthesize[sr](xr, nr, pitch/4, x0, x + nt, ns-nt, c, 0); + synthesize[sr](xh, nh, pitch/4, x0, x + nt, ns-nt, c, 0); /* --- Update state --- */ diff --git a/src/ltpf.h b/src/ltpf.h index a0f725c..0d5bb3c 100644 --- a/src/ltpf.h +++ b/src/ltpf.h @@ -53,11 +53,12 @@ typedef struct lc3_ltpf_data { * data Return bitstream data * return True when pitch present, False otherwise * + * The `x` vector is aligned on 32 bits * The number of previous samples `d` accessed on `x` is : * d: { 10, 20, 30, 40, 60 } - 1 for samplerates from 8KHz to 48KHz */ bool lc3_ltpf_analyse(enum lc3_dt dt, enum lc3_srate sr, - lc3_ltpf_analysis_t *ltpf, const float *x, lc3_ltpf_data_t *data); + lc3_ltpf_analysis_t *ltpf, const int16_t *x, lc3_ltpf_data_t *data); /** * LTPF disable diff --git a/src/ltpf_arm.h b/src/ltpf_arm.h new file mode 100644 index 0000000..914c964 --- /dev/null +++ b/src/ltpf_arm.h @@ -0,0 +1,465 @@ +/****************************************************************************** + * + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +#ifdef __ARM_FEATURE_SIMD32 + +/** + * Configuration + */ + +#ifndef TEST_ARM + +#include + +#define resample_8k_12k8 arm_resample_8k_12k8 +#define resample_16k_12k8 arm_resample_16k_12k8 +#define resample_24k_12k8 arm_resample_24k_12k8 +#define resample_32k_12k8 arm_resample_32k_12k8 +#define resample_48k_12k8 arm_resample_48k_12k8 + +#define correlate arm_correlate + +static inline int16x2_t __pkhbt(int16x2_t a, int16x2_t b) +{ + int16x2_t r; + __asm("pkhbt %0, %1, %2" : "=r" (r) : "r" (a), "r" (b)); + return r; +} + +#endif /* TEST_ARM */ + + +/** + * Import + */ + +static inline int32_t filter_hp50(struct lc3_ltpf_hp50_state *, int32_t); +static inline float dot(const int16_t *, const int16_t *, int); + + +/** + * Resample from 8 / 16 / 32 KHz to 12.8 KHz Template + */ +static inline void arm_resample_x64k_12k8(const int p, const int16x2_t *h, + struct lc3_ltpf_hp50_state *hp50, const int16x2_t *x, int16_t *y, int n) +{ + const int w = 40 / p; + + x -= w; + + for (int i = 0; i < 5*n; i += 5) { + const int16x2_t *hn = h + (i % (2*p)) * (48 / p); + const int16x2_t *xn = x + (i / (2*p)); + + int32_t un = __smlad(*(xn++), *(hn++), 0); + + for (int k = 0; k < w; k += 5) { + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + } + + int32_t yn = filter_hp50(hp50, un); + *(y++) = (yn + (1 << 15)) >> 16; + } +} + +/** + * Resample from 24 / 48 KHz to 12.8 KHz Template + */ +static inline void arm_resample_x192k_12k8(const int p, const int16x2_t *h, + struct lc3_ltpf_hp50_state *hp50, const int16x2_t *x, int16_t *y, int n) +{ + const int w = 120 / p; + + x -= w; + + for (int i = 0; i < 15*n; i += 15) { + const int16x2_t *hn = h + (i % (2*p)) * (128 / p); + const int16x2_t *xn = x + (i / (2*p)); + + int32_t un = __smlad(*(xn++), *(hn++), 0); + + for 
(int k = 0; k < w; k += 15) { + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + un = __smlad(*(xn++), *(hn++), un); + } + + int32_t yn = filter_hp50(hp50, un); + *(y++) = (yn + (1 << 15)) >> 16; + } +} + +/** + * Resample from 8 Khz to 12.8 KHz + */ +static void arm_resample_8k_12k8( + struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n) +{ + static const int16_t alignas(int32_t) h[2*8*12] = { + 0, 214, 417, -1052, -4529, 26233, -4529, -1052, 417, 214, 0, 0, + 0, 180, 0, -1522, -2427, 24506, -5289, 0, 763, 156, -28, 0, + 0, 92, -323, -1361, 0, 19741, -3885, 1317, 861, 0, -61, 0, + 0, 0, -457, -752, 1873, 13068, 0, 2389, 598, -213, -79, 0, + 0, -61, -398, 0, 2686, 5997, 5997, 2686, 0, -398, -61, 0, + 0, -79, -213, 598, 2389, 0, 13068, 1873, -752, -457, 0, 0, + 0, -61, 0, 861, 1317, -3885, 19741, 0, -1361, -323, 92, 0, + 0, -28, 156, 763, 0, -5289, 24506, -2427, -1522, 0, 180, 0, + 0, 0, 214, 417, -1052, -4529, 26233, -4529, -1052, 417, 214, 0, + 0, 0, 180, 0, -1522, -2427, 24506, -5289, 0, 763, 156, -28, + 0, 0, 92, -323, -1361, 0, 19741, -3885, 1317, 861, 0, -61, + 0, 0, 0, -457, -752, 1873, 13068, 0, 2389, 598, -213, -79, + 0, 0, -61, -398, 0, 2686, 5997, 5997, 2686, 0, -398, -61, + 0, 0, -79, -213, 598, 2389, 0, 13068, 1873, -752, -457, 0, + 0, 0, -61, 0, 861, 1317, -3885, 19741, 0, -1361, -323, 92, + 0, 0, -28, 156, 763, 0, -5289, 24506, -2427, -1522, 0, 180, + }; + + arm_resample_x64k_12k8( + 8, (const int16x2_t *)h, hp50, (int16x2_t *)x, y, n); +} + +/** + * Resample 
from 16 Khz to 12.8 KHz + */ +static void arm_resample_16k_12k8( + struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n) +{ + static const int16_t alignas(int32_t) h[2*4*24] = { + + 0, -61, 214, -398, 417, 0, -1052, 2686, + -4529, 5997, 26233, 5997, -4529, 2686, -1052, 0, + 417, -398, 214, -61, 0, 0, 0, 0, + + + 0, -79, 180, -213, 0, 598, -1522, 2389, + -2427, 0, 24506, 13068, -5289, 1873, 0, -752, + 763, -457, 156, 0, -28, 0, 0, 0, + + + 0, -61, 92, 0, -323, 861, -1361, 1317, + 0, -3885, 19741, 19741, -3885, 0, 1317, -1361, + 861, -323, 0, 92, -61, 0, 0, 0, + + 0, -28, 0, 156, -457, 763, -752, 0, + 1873, -5289, 13068, 24506, 0, -2427, 2389, -1522, + 598, 0, -213, 180, -79, 0, 0, 0, + + + 0, 0, -61, 214, -398, 417, 0, -1052, + 2686, -4529, 5997, 26233, 5997, -4529, 2686, -1052, + 0, 417, -398, 214, -61, 0, 0, 0, + + + 0, 0, -79, 180, -213, 0, 598, -1522, + 2389, -2427, 0, 24506, 13068, -5289, 1873, 0, + -752, 763, -457, 156, 0, -28, 0, 0, + + + 0, 0, -61, 92, 0, -323, 861, -1361, + 1317, 0, -3885, 19741, 19741, -3885, 0, 1317, + -1361, 861, -323, 0, 92, -61, 0, 0, + + 0, 0, -28, 0, 156, -457, 763, -752, + 0, 1873, -5289, 13068, 24506, 0, -2427, 2389, + -1522, 598, 0, -213, 180, -79, 0, 0, + }; + + arm_resample_x64k_12k8( + 4, (const int16x2_t *)h, hp50, (int16x2_t *)x, y, n); +} + +/** + * Resample from 32 Khz to 12.8 KHz + */ +static void arm_resample_32k_12k8( + struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n) +{ + static const int16_t alignas(int32_t) h[2*2*48] = { + + 0, -30, -31, 46, 107, 0, -199, -162, + 209, 430, 0, -681, -526, 658, 1343, 0, + -2264, -1943, 2999, 9871, 13116, 9871, 2999, -1943, + -2264, 0, 1343, 658, -526, -681, 0, 430, + 209, -162, -199, 0, 107, 46, -31, -30, + 0, 0, 0, 0, 0, 0, 0, 0, + + 0, -14, -39, 0, 90, 78, -106, -229, + 0, 382, 299, -376, -761, 0, 1194, 937, + -1214, -2644, 0, 6534, 12253, 12253, 6534, 0, + -2644, -1214, 937, 1194, 0, -761, -376, 299, + 382, 0, -229, -106, 78, 90, 0, -39, + -14, 
0, 0, 0, 0, 0, 0, 0, + + 0, 0, -30, -31, 46, 107, 0, -199, + -162, 209, 430, 0, -681, -526, 658, 1343, + 0, -2264, -1943, 2999, 9871, 13116, 9871, 2999, + -1943, -2264, 0, 1343, 658, -526, -681, 0, + 430, 209, -162, -199, 0, 107, 46, -31, + -30, 0, 0, 0, 0, 0, 0, 0, + + 0, 0, -14, -39, 0, 90, 78, -106, + -229, 0, 382, 299, -376, -761, 0, 1194, + 937, -1214, -2644, 0, 6534, 12253, 12253, 6534, + 0, -2644, -1214, 937, 1194, 0, -761, -376, + 299, 382, 0, -229, -106, 78, 90, 0, + -39, -14, 0, 0, 0, 0, 0, 0, + }; + + arm_resample_x64k_12k8( + 2, (const int16x2_t *)h, hp50, (int16x2_t *)x, y, n); +} + +/** + * Resample from 24 Khz to 12.8 KHz + */ +static void arm_resample_24k_12k8( + struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n) +{ + static const int16_t alignas(int32_t) h[2*8*32] = { + + 0, -50, 19, 143, -93, -290, 278, 485, + -658, -701, 1396, 901, -3019, -1042, 10276, 17488, + 10276, -1042, -3019, 901, 1396, -701, -658, 485, + 278, -290, -93, 143, 19, -50, 0, 0, + + 0, -46, 0, 141, -45, -305, 185, 543, + -501, -854, 1153, 1249, -2619, -1908, 8712, 17358, + 11772, 0, -3319, 480, 1593, -504, -796, 399, + 367, -261, -142, 138, 40, -52, -5, 0, + + 0, -41, -17, 133, 0, -304, 91, 574, + -334, -959, 878, 1516, -2143, -2590, 7118, 16971, + 13161, 1202, -3495, 0, 1731, -267, -908, 287, + 445, -215, -188, 125, 62, -52, -12, 0, + + 0, -34, -30, 120, 41, -291, 0, 577, + -164, -1015, 585, 1697, -1618, -3084, 5534, 16337, + 14406, 2544, -3526, -523, 1800, 0, -985, 152, + 509, -156, -230, 104, 83, -48, -19, 0, + + 0, -26, -41, 103, 76, -265, -83, 554, + 0, -1023, 288, 1791, -1070, -3393, 3998, 15474, + 15474, 3998, -3393, -1070, 1791, 288, -1023, 0, + 554, -83, -265, 76, 103, -41, -26, 0, + + 0, -19, -48, 83, 104, -230, -156, 509, + 152, -985, 0, 1800, -523, -3526, 2544, 14406, + 16337, 5534, -3084, -1618, 1697, 585, -1015, -164, + 577, 0, -291, 41, 120, -30, -34, 0, + + 0, -12, -52, 62, 125, -188, -215, 445, + 287, -908, -267, 1731, 0, -3495, 1202, 13161, 
+ 16971, 7118, -2590, -2143, 1516, 878, -959, -334, + 574, 91, -304, 0, 133, -17, -41, 0, + + 0, -5, -52, 40, 138, -142, -261, 367, + 399, -796, -504, 1593, 480, -3319, 0, 11772, + 17358, 8712, -1908, -2619, 1249, 1153, -854, -501, + 543, 185, -305, -45, 141, 0, -46, 0, + + 0, 0, -50, 19, 143, -93, -290, 278, + 485, -658, -701, 1396, 901, -3019, -1042, 10276, + 17488, 10276, -1042, -3019, 901, 1396, -701, -658, + 485, 278, -290, -93, 143, 19, -50, 0, + + 0, 0, -46, 0, 141, -45, -305, 185, + 543, -501, -854, 1153, 1249, -2619, -1908, 8712, + 17358, 11772, 0, -3319, 480, 1593, -504, -796, + 399, 367, -261, -142, 138, 40, -52, -5, + + 0, 0, -41, -17, 133, 0, -304, 91, + 574, -334, -959, 878, 1516, -2143, -2590, 7118, + 16971, 13161, 1202, -3495, 0, 1731, -267, -908, + 287, 445, -215, -188, 125, 62, -52, -12, + + 0, 0, -34, -30, 120, 41, -291, 0, + 577, -164, -1015, 585, 1697, -1618, -3084, 5534, + 16337, 14406, 2544, -3526, -523, 1800, 0, -985, + 152, 509, -156, -230, 104, 83, -48, -19, + + 0, 0, -26, -41, 103, 76, -265, -83, + 554, 0, -1023, 288, 1791, -1070, -3393, 3998, + 15474, 15474, 3998, -3393, -1070, 1791, 288, -1023, + 0, 554, -83, -265, 76, 103, -41, -26, + + 0, 0, -19, -48, 83, 104, -230, -156, + 509, 152, -985, 0, 1800, -523, -3526, 2544, + 14406, 16337, 5534, -3084, -1618, 1697, 585, -1015, + -164, 577, 0, -291, 41, 120, -30, -34, + + 0, 0, -12, -52, 62, 125, -188, -215, + 445, 287, -908, -267, 1731, 0, -3495, 1202, + 13161, 16971, 7118, -2590, -2143, 1516, 878, -959, + -334, 574, 91, -304, 0, 133, -17, -41, + + 0, 0, -5, -52, 40, 138, -142, -261, + 367, 399, -796, -504, 1593, 480, -3319, 0, + 11772, 17358, 8712, -1908, -2619, 1249, 1153, -854, + -501, 543, 185, -305, -45, 141, 0, -46, + }; + + arm_resample_x192k_12k8( + 8, (const int16x2_t *)h, hp50, (int16x2_t *)x, y, n); +} + +/** + * Resample from 48 Khz to 12.8 KHz + */ +static void arm_resample_48k_12k8( + struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n) +{ + static const 
int16_t alignas(int32_t) h[2*4*64] = { + + 0, -13, -25, -20, 10, 51, 71, 38, + -47, -133, -145, -42, 139, 277, 242, 0, + -329, -511, -351, 144, 698, 895, 450, -535, + -1510, -1697, -521, 1999, 5138, 7737, 8744, 7737, + 5138, 1999, -521, -1697, -1510, -535, 450, 895, + 698, 144, -351, -511, -329, 0, 242, 277, + 139, -42, -145, -133, -47, 38, 71, 51, + 10, -20, -25, -13, 0, 0, 0, 0, + + 0, -9, -23, -24, 0, 41, 71, 52, + -23, -115, -152, -78, 92, 254, 272, 76, + -251, -493, -427, 0, 576, 900, 624, -262, + -1309, -1763, -954, 1272, 4356, 7203, 8679, 8169, + 5886, 2767, 0, -1542, -1660, -809, 240, 848, + 796, 292, -252, -507, -398, -82, 199, 288, + 183, 0, -130, -145, -71, 20, 69, 60, + 20, -15, -26, -17, -3, 0, 0, 0, + + 0, -6, -20, -26, -8, 31, 67, 62, + 0, -94, -152, -108, 45, 223, 287, 143, + -167, -454, -480, -134, 439, 866, 758, 0, + -1071, -1748, -1295, 601, 3559, 6580, 8485, 8485, + 6580, 3559, 601, -1295, -1748, -1071, 0, 758, + 866, 439, -134, -480, -454, -167, 143, 287, + 223, 45, -108, -152, -94, 0, 62, 67, + 31, -8, -26, -20, -6, 0, 0, 0, + + 0, -3, -17, -26, -15, 20, 60, 69, + 20, -71, -145, -130, 0, 183, 288, 199, + -82, -398, -507, -252, 292, 796, 848, 240, + -809, -1660, -1542, 0, 2767, 5886, 8169, 8679, + 7203, 4356, 1272, -954, -1763, -1309, -262, 624, + 900, 576, 0, -427, -493, -251, 76, 272, + 254, 92, -78, -152, -115, -23, 52, 71, + 41, 0, -24, -23, -9, 0, 0, 0, + + 0, 0, -13, -25, -20, 10, 51, 71, + 38, -47, -133, -145, -42, 139, 277, 242, + 0, -329, -511, -351, 144, 698, 895, 450, + -535, -1510, -1697, -521, 1999, 5138, 7737, 8744, + 7737, 5138, 1999, -521, -1697, -1510, -535, 450, + 895, 698, 144, -351, -511, -329, 0, 242, + 277, 139, -42, -145, -133, -47, 38, 71, + 51, 10, -20, -25, -13, 0, 0, 0, + + 0, 0, -9, -23, -24, 0, 41, 71, + 52, -23, -115, -152, -78, 92, 254, 272, + 76, -251, -493, -427, 0, 576, 900, 624, + -262, -1309, -1763, -954, 1272, 4356, 7203, 8679, + 8169, 5886, 2767, 0, -1542, -1660, -809, 240, + 848, 796, 292, -252, -507, 
-398, -82, 199, + 288, 183, 0, -130, -145, -71, 20, 69, + 60, 20, -15, -26, -17, -3, 0, 0, + + 0, 0, -6, -20, -26, -8, 31, 67, + 62, 0, -94, -152, -108, 45, 223, 287, + 143, -167, -454, -480, -134, 439, 866, 758, + 0, -1071, -1748, -1295, 601, 3559, 6580, 8485, + 8485, 6580, 3559, 601, -1295, -1748, -1071, 0, + 758, 866, 439, -134, -480, -454, -167, 143, + 287, 223, 45, -108, -152, -94, 0, 62, + 67, 31, -8, -26, -20, -6, 0, 0, + + 0, 0, -3, -17, -26, -15, 20, 60, + 69, 20, -71, -145, -130, 0, 183, 288, + 199, -82, -398, -507, -252, 292, 796, 848, + 240, -809, -1660, -1542, 0, 2767, 5886, 8169, + 8679, 7203, 4356, 1272, -954, -1763, -1309, -262, + 624, 900, 576, 0, -427, -493, -251, 76, + 272, 254, 92, -78, -152, -115, -23, 52, + 71, 41, 0, -24, -23, -9, 0, 0, + }; + + arm_resample_x192k_12k8( + 4, (const int16x2_t *)h, hp50, (int16x2_t *)x, y, n); +} + +/** + * Return vector of correlations + */ +static void arm_correlate( + const int16_t *a, const int16_t *b, int n, float *y, int nc) +{ + /* --- Check alignment of `b` --- */ + + if ((uintptr_t)b & 3) + *(y++) = dot(a, b--, n), nc--; + + /* --- Processing by pair --- */ + + for ( ; nc >= 2; nc -= 2) { + const int16x2_t *an = (const int16x2_t *)(a ); + const int16x2_t *bn = (const int16x2_t *)(b--); + + int16x2_t ax, b0, b1; + int64_t v0 = 0, v1 = 0; + + b1 = (int16x2_t)*(b--) << 16; + + for (int i = 0; i < (n >> 4); i++ ) + for (int j = 0; j < 4; j++) { + + ax = *(an++), b0 = *(bn++); + v0 = __smlald (ax, b0, v0); + v1 = __smlaldx(ax, __pkhbt(b0, b1), v1); + + ax = *(an++), b1 = *(bn++); + v0 = __smlald (ax, b1, v0); + v1 = __smlaldx(ax, __pkhbt(b1, b0), v1); + } + + *(y++) = (float)((int32_t)((v0 + (1 << 5)) >> 6)); + *(y++) = (float)((int32_t)((v1 + (1 << 5)) >> 6)); + } + + /* --- Odd element count --- */ + + if (nc > 0) + *(y++) = dot(a, b, n); +} + +#endif /* __ARM_FEATURE_SIMD32 */ diff --git a/src/ltpf_neon.h b/src/ltpf_neon.h new file mode 100644 index 0000000..728ed1d --- /dev/null +++ b/src/ltpf_neon.h @@ 
-0,0 +1,256 @@ +/****************************************************************************** + * + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +#if __ARM_NEON + +/** + * Configuration + */ + +#ifndef TEST_NEON + +#include + +#define resample_16k_12k8 neon_resample_16k_12k8 +#define resample_32k_12k8 neon_resample_32k_12k8 +#define resample_48k_12k8 neon_resample_48k_12k8 + +#define correlate neon_correlate +#define dot neon_dot + +#endif /* TEST_NEON */ + + +/** + * Import + */ + +static inline int32_t filter_hp50(struct lc3_ltpf_hp50_state *, int32_t); + + +/** + * Resample from 16 Khz to 12.8 KHz + */ +static void neon_resample_16k_12k8( + struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n) +{ + static const int16_t h[4][20] = { + + { -61, 214, -398, 417, 0, -1052, 2686, -4529, 5997, 26233, + 5997, -4529, 2686, -1052, 0, 417, -398, 214, -61, 0 }, + + { -79, 180, -213, 0, 598, -1522, 2389, -2427, 0, 24506, + 13068, -5289, 1873, 0, -752, 763, -457, 156, 0, -28 }, + + { -61, 92, 0, -323, 861, -1361, 1317, 0, -3885, 19741, + 19741, -3885, 0, 1317, -1361, 861, -323, 0, 92, -61 }, + + { -28, 0, 156, -457, 763, -752, 0, 1873, -5289, 13068, + 24506, 0, -2427, 2389, -1522, 598, 0, -213, 180, -79 }, + + }; + + x -= 20 - 1; + + for (int i = 0; i < 5*n; i += 5) { + const int16_t *hn = h[i & 3]; + const int16_t *xn = x + (i >> 
2); + int32x4_t un; + + un = vmull_s16( vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4; + un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4; + un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4; + un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4; + un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4; + + int32_t yn = filter_hp50(hp50, vaddvq_s32(un)); + *(y++) = (yn + (1 << 15)) >> 16; + } +} + +/** + * Resample from 32 Khz to 12.8 KHz + */ +static void neon_resample_32k_12k8( + struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n) +{ + x -= 40 - 1; + + static const int16_t h[2][40] = { + + { -30, -31, 46, 107, 0, -199, -162, 209, 430, 0, + -681, -526, 658, 1343, 0, -2264, -1943, 2999, 9871, 13116, + 9871, 2999, -1943, -2264, 0, 1343, 658, -526, -681, 0, + 430, 209, -162, -199, 0, 107, 46, -31, -30, 0 }, + + { -14, -39, 0, 90, 78, -106, -229, 0, 382, 299, + -376, -761, 0, 1194, 937, -1214, -2644, 0, 6534, 12253, + 12253, 6534, 0, -2644, -1214, 937, 1194, 0, -761, -376, + 299, 382, 0, -229, -106, 78, 90, 0, -39, -14 }, + + }; + + for (int i = 0; i < 5*n; i += 5) { + const int16_t *hn = h[i & 1]; + const int16_t *xn = x + (i >> 1); + + int32x4_t un = vmull_s16(vld1_s16(xn), vld1_s16(hn)); + xn += 4, hn += 4; + + for (int i = 1; i < 10; i++) + un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4; + + int32_t yn = filter_hp50(hp50, vaddvq_s32(un)); + *(y++) = (yn + (1 << 15)) >> 16; + } +} + + + +/** + * Resample from 48 Khz to 12.8 KHz + */ +static void neon_resample_48k_12k8( + struct lc3_ltpf_hp50_state *hp50, const int16_t *x, int16_t *y, int n) +{ + static const int16_t alignas(16) h[4][64] = { + + { -13, -25, -20, 10, 51, 71, 38, -47, -133, -145, + -42, 139, 277, 242, 0, -329, -511, -351, 144, 698, + 895, 450, -535, -1510, -1697, -521, 1999, 5138, 7737, 8744, + 7737, 5138, 1999, -521, -1697, -1510, -535, 450, 895, 698, + 144, -351, -511, -329, 0, 242, 277, 139, -42, -145, + 
-133, -47, 38, 71, 51, 10, -20, -25, -13, 0 }, + + { -9, -23, -24, 0, 41, 71, 52, -23, -115, -152, + -78, 92, 254, 272, 76, -251, -493, -427, 0, 576, + 900, 624, -262, -1309, -1763, -954, 1272, 4356, 7203, 8679, + 8169, 5886, 2767, 0, -1542, -1660, -809, 240, 848, 796, + 292, -252, -507, -398, -82, 199, 288, 183, 0, -130, + -145, -71, 20, 69, 60, 20, -15, -26, -17, -3 }, + + { -6, -20, -26, -8, 31, 67, 62, 0, -94, -152, + -108, 45, 223, 287, 143, -167, -454, -480, -134, 439, + 866, 758, 0, -1071, -1748, -1295, 601, 3559, 6580, 8485, + 8485, 6580, 3559, 601, -1295, -1748, -1071, 0, 758, 866, + 439, -134, -480, -454, -167, 143, 287, 223, 45, -108, + -152, -94, 0, 62, 67, 31, -8, -26, -20, -6 }, + + { -3, -17, -26, -15, 20, 60, 69, 20, -71, -145, + -130, 0, 183, 288, 199, -82, -398, -507, -252, 292, + 796, 848, 240, -809, -1660, -1542, 0, 2767, 5886, 8169, + 8679, 7203, 4356, 1272, -954, -1763, -1309, -262, 624, 900, + 576, 0, -427, -493, -251, 76, 272, 254, 92, -78, + -152, -115, -23, 52, 71, 41, 0, -24, -23, -9 }, + + }; + + x -= 60 - 1; + + for (int i = 0; i < 15*n; i += 15) { + const int16_t *hn = h[i & 3]; + const int16_t *xn = x + (i >> 2); + + int32x4_t un = vmull_s16(vld1_s16(xn), vld1_s16(hn)); + xn += 4, hn += 4; + + for (int i = 1; i < 15; i++) + un = vmlal_s16(un, vld1_s16(xn), vld1_s16(hn)), xn += 4, hn += 4; + + int32_t yn = filter_hp50(hp50, vaddvq_s32(un)); + *(y++) = (yn + (1 << 15)) >> 16; + } +} + +/** + * Return dot product of 2 vectors + */ +static inline float neon_dot(const int16_t *a, const int16_t *b, int n) +{ + int64x2_t v = vmovq_n_s64(0); + + for (int i = 0; i < (n >> 4); i++) { + int32x4_t u; + + u = vmull_s16( vld1_s16(a), vld1_s16(b)), a += 4, b += 4; + u = vmlal_s16(u, vld1_s16(a), vld1_s16(b)), a += 4, b += 4; + v = vpadalq_s32(v, u); + + u = vmull_s16( vld1_s16(a), vld1_s16(b)), a += 4, b += 4; + u = vmlal_s16(u, vld1_s16(a), vld1_s16(b)), a += 4, b += 4; + v = vpadalq_s32(v, u); + } + + int32_t v32 = (vaddvq_s64(v) + (1 << 5)) >> 6; 
+ return (float)v32; +} + +/** + * Return vector of correlations + */ +static void neon_correlate( + const int16_t *a, const int16_t *b, int n, float *y, int nc) +{ + for ( ; nc >= 4; nc -= 4, b -= 4) { + const int16_t *an = (const int16_t *)a; + const int16_t *bn = (const int16_t *)b; + + int64x2_t v0 = vmovq_n_s64(0), v1 = v0, v2 = v0, v3 = v0; + int16x4_t ax, b0, b1; + + b0 = vld1_s16(bn-4); + + for (int i=0; i < (n >> 4); i++ ) + for (int j = 0; j < 2; j++) { + int32x4_t u0, u1, u2, u3; + + b1 = b0; + b0 = vld1_s16(bn), bn += 4; + ax = vld1_s16(an), an += 4; + + u0 = vmull_s16(ax, b0); + u1 = vmull_s16(ax, vext_s16(b1, b0, 3)); + u2 = vmull_s16(ax, vext_s16(b1, b0, 2)); + u3 = vmull_s16(ax, vext_s16(b1, b0, 1)); + + b1 = b0; + b0 = vld1_s16(bn), bn += 4; + ax = vld1_s16(an), an += 4; + + u0 = vmlal_s16(u0, ax, b0); + u1 = vmlal_s16(u1, ax, vext_s16(b1, b0, 3)); + u2 = vmlal_s16(u2, ax, vext_s16(b1, b0, 2)); + u3 = vmlal_s16(u3, ax, vext_s16(b1, b0, 1)); + + v0 = vpadalq_s32(v0, u0); + v1 = vpadalq_s32(v1, u1); + v2 = vpadalq_s32(v2, u2); + v3 = vpadalq_s32(v3, u3); + } + + *(y++) = (float)((int32_t)((vaddvq_s64(v0) + (1 << 5)) >> 6)); + *(y++) = (float)((int32_t)((vaddvq_s64(v1) + (1 << 5)) >> 6)); + *(y++) = (float)((int32_t)((vaddvq_s64(v2) + (1 << 5)) >> 6)); + *(y++) = (float)((int32_t)((vaddvq_s64(v3) + (1 << 5)) >> 6)); + } + + for ( ; nc > 0; nc--) + *(y++) = neon_dot(a, b--, n); +} + +#endif /* __ARM_NEON */ diff --git a/src/tables.c b/src/tables.c index 68139cc..aef6fc1 100644 --- a/src/tables.c +++ b/src/tables.c @@ -2441,75 +2441,6 @@ const uint16_t lc3_tns_coeffs_bits[][17] = { }; -/** - * Long Term Postfilter Analysis (cf. 
3.7.6) - * with the addition of `h[239] = 0` - */ - -const float lc3_ltpf_h12k8[240] = { - -2.04305583e-05, -4.46345894e-05, -7.16366399e-05, -1.00101113e-04, - -1.28372848e-04, -1.54543830e-04, -1.76544567e-04, -1.92256960e-04, - -1.99643819e-04, -1.96888686e-04, -1.82538332e-04, -1.55639427e-04, - -1.15860365e-04, -6.35893034e-05, 2.81006480e-19, 7.29218021e-05, - 1.52397076e-04, 2.34920777e-04, 3.16378650e-04, 3.92211738e-04, - 4.57623849e-04, 5.07824294e-04, 5.38295523e-04, 5.45072918e-04, - 5.25022155e-04, 4.76098424e-04, 3.97571380e-04, 2.90200217e-04, - 1.56344667e-04, -5.81880142e-19, -1.73252713e-04, -3.56385965e-04, - -5.41155231e-04, -7.18414023e-04, -8.78505232e-04, -1.01171451e-03, - -1.10876706e-03, -1.16134522e-03, -1.16260169e-03, -1.10764097e-03, - -9.93941563e-04, -8.21692190e-04, -5.94017766e-04, -3.17074654e-04, - 9.74695082e-19, 3.45293760e-04, 7.04480871e-04, 1.06133447e-03, - 1.39837473e-03, 1.69763080e-03, 1.94148675e-03, 2.11357591e-03, - 2.19968245e-03, 2.18860625e-03, 2.07294546e-03, 1.84975249e-03, - 1.52102188e-03, 1.09397426e-03, 5.81108062e-04, -1.42248266e-18, - -6.27153730e-04, -1.27425140e-03, -1.91223839e-03, -2.51026925e-03, - -3.03703830e-03, -3.46222687e-03, -3.75800672e-03, -3.90053247e-03, - -3.87135231e-03, -3.65866558e-03, -3.25835851e-03, -2.67475555e-03, - -1.92103305e-03, -1.01925433e-03, 1.86962369e-18, 1.09841545e-03, - 2.23113197e-03, 3.34830927e-03, 4.39702277e-03, 5.32342672e-03, - 6.07510531e-03, 6.60352025e-03, 6.86645399e-03, 6.83034270e-03, - 6.47239234e-03, 5.78237521e-03, 4.76401273e-03, 3.43586351e-03, - 1.83165284e-03, -2.25189837e-18, -1.99647619e-03, -4.08266886e-03, - -6.17308037e-03, -8.17444895e-03, -9.98882386e-03, -1.15169871e-02, - -1.26621006e-02, -1.33334458e-02, -1.34501120e-02, -1.29444881e-02, - -1.17654154e-02, -9.88086732e-03, -7.28003640e-03, -3.97473021e-03, - 2.50961778e-18, 4.58604422e-03, 9.70324900e-03, 1.52512477e-02, - 2.11120585e-02, 2.71533724e-02, 3.32324245e-02, 3.92003203e-02, - 
4.49066644e-02, 5.02043309e-02, 5.49542017e-02, 5.90297032e-02, - 6.23209727e-02, 6.47385023e-02, 6.62161245e-02, 6.67132287e-02, - 6.62161245e-02, 6.47385023e-02, 6.23209727e-02, 5.90297032e-02, - 5.49542017e-02, 5.02043309e-02, 4.49066644e-02, 3.92003203e-02, - 3.32324245e-02, 2.71533724e-02, 2.11120585e-02, 1.52512477e-02, - 9.70324900e-03, 4.58604422e-03, 2.50961778e-18, -3.97473021e-03, - -7.28003640e-03, -9.88086732e-03, -1.17654154e-02, -1.29444881e-02, - -1.34501120e-02, -1.33334458e-02, -1.26621006e-02, -1.15169871e-02, - -9.98882386e-03, -8.17444895e-03, -6.17308037e-03, -4.08266886e-03, - -1.99647619e-03, -2.25189837e-18, 1.83165284e-03, 3.43586351e-03, - 4.76401273e-03, 5.78237521e-03, 6.47239234e-03, 6.83034270e-03, - 6.86645399e-03, 6.60352025e-03, 6.07510531e-03, 5.32342672e-03, - 4.39702277e-03, 3.34830927e-03, 2.23113197e-03, 1.09841545e-03, - 1.86962369e-18, -1.01925433e-03, -1.92103305e-03, -2.67475555e-03, - -3.25835851e-03, -3.65866558e-03, -3.87135231e-03, -3.90053247e-03, - -3.75800672e-03, -3.46222687e-03, -3.03703830e-03, -2.51026925e-03, - -1.91223839e-03, -1.27425140e-03, -6.27153730e-04, -1.42248266e-18, - 5.81108062e-04, 1.09397426e-03, 1.52102188e-03, 1.84975249e-03, - 2.07294546e-03, 2.18860625e-03, 2.19968245e-03, 2.11357591e-03, - 1.94148675e-03, 1.69763080e-03, 1.39837473e-03, 1.06133447e-03, - 7.04480871e-04, 3.45293760e-04, 9.74695082e-19, -3.17074654e-04, - -5.94017766e-04, -8.21692190e-04, -9.93941563e-04, -1.10764097e-03, - -1.16260169e-03, -1.16134522e-03, -1.10876706e-03, -1.01171451e-03, - -8.78505232e-04, -7.18414023e-04, -5.41155231e-04, -3.56385965e-04, - -1.73252713e-04, -5.81880142e-19, 1.56344667e-04, 2.90200217e-04, - 3.97571380e-04, 4.76098424e-04, 5.25022155e-04, 5.45072918e-04, - 5.38295523e-04, 5.07824294e-04, 4.57623849e-04, 3.92211738e-04, - 3.16378650e-04, 2.34920777e-04, 1.52397076e-04, 7.29218021e-05, - 2.81006480e-19, -6.35893034e-05, -1.15860365e-04, -1.55639427e-04, - -1.82538332e-04, -1.96888686e-04, 
-1.99643819e-04, -1.92256960e-04, - -1.76544567e-04, -1.54543830e-04, -1.28372848e-04, -1.00101113e-04, - -7.16366399e-05, -4.46345894e-05, -2.04305583e-05, 0.0 , -}; - - /** * Long Term Postfilter Synthesis (cf. 3.7.6) * with - addition of a 0 for num coefficients diff --git a/src/tables.h b/src/tables.h index b327d0e..26bd48e 100644 --- a/src/tables.h +++ b/src/tables.h @@ -78,8 +78,6 @@ extern const uint16_t lc3_tns_coeffs_bits[][17]; * Long Term Postfilter */ -extern const float lc3_ltpf_h12k8[240]; - extern const float *lc3_ltpf_cnum[LC3_NUM_SRATE][4]; extern const float *lc3_ltpf_cden[LC3_NUM_SRATE][4]; diff --git a/tables/mktables.py b/tables/mktables.py index fec56bd..67d4312 100755 --- a/tables/mktables.py +++ b/tables/mktables.py @@ -17,6 +17,75 @@ import numpy as np +LTPF_H12K8 = np.array([ + -2.04305583e-05, -4.46345894e-05, -7.16366399e-05, -1.00101113e-04, + -1.28372848e-04, -1.54543830e-04, -1.76544567e-04, -1.92256960e-04, + -1.99643819e-04, -1.96888686e-04, -1.82538332e-04, -1.55639427e-04, + -1.15860365e-04, -6.35893034e-05, 2.81006480e-19, 7.29218021e-05, + 1.52397076e-04, 2.34920777e-04, 3.16378650e-04, 3.92211738e-04, + 4.57623849e-04, 5.07824294e-04, 5.38295523e-04, 5.45072918e-04, + 5.25022155e-04, 4.76098424e-04, 3.97571380e-04, 2.90200217e-04, + 1.56344667e-04, -5.81880142e-19, -1.73252713e-04, -3.56385965e-04, + -5.41155231e-04, -7.18414023e-04, -8.78505232e-04, -1.01171451e-03, + -1.10876706e-03, -1.16134522e-03, -1.16260169e-03, -1.10764097e-03, + -9.93941563e-04, -8.21692190e-04, -5.94017766e-04, -3.17074654e-04, + 9.74695082e-19, 3.45293760e-04, 7.04480871e-04, 1.06133447e-03, + 1.39837473e-03, 1.69763080e-03, 1.94148675e-03, 2.11357591e-03, + 2.19968245e-03, 2.18860625e-03, 2.07294546e-03, 1.84975249e-03, + 1.52102188e-03, 1.09397426e-03, 5.81108062e-04, -1.42248266e-18, + -6.27153730e-04, -1.27425140e-03, -1.91223839e-03, -2.51026925e-03, + -3.03703830e-03, -3.46222687e-03, -3.75800672e-03, -3.90053247e-03, + -3.87135231e-03, 
-3.65866558e-03, -3.25835851e-03, -2.67475555e-03, + -1.92103305e-03, -1.01925433e-03, 1.86962369e-18, 1.09841545e-03, + 2.23113197e-03, 3.34830927e-03, 4.39702277e-03, 5.32342672e-03, + 6.07510531e-03, 6.60352025e-03, 6.86645399e-03, 6.83034270e-03, + 6.47239234e-03, 5.78237521e-03, 4.76401273e-03, 3.43586351e-03, + 1.83165284e-03, -2.25189837e-18, -1.99647619e-03, -4.08266886e-03, + -6.17308037e-03, -8.17444895e-03, -9.98882386e-03, -1.15169871e-02, + -1.26621006e-02, -1.33334458e-02, -1.34501120e-02, -1.29444881e-02, + -1.17654154e-02, -9.88086732e-03, -7.28003640e-03, -3.97473021e-03, + 2.50961778e-18, 4.58604422e-03, 9.70324900e-03, 1.52512477e-02, + 2.11120585e-02, 2.71533724e-02, 3.32324245e-02, 3.92003203e-02, + 4.49066644e-02, 5.02043309e-02, 5.49542017e-02, 5.90297032e-02, + 6.23209727e-02, 6.47385023e-02, 6.62161245e-02, 6.67132287e-02, + 6.62161245e-02, 6.47385023e-02, 6.23209727e-02, 5.90297032e-02, + 5.49542017e-02, 5.02043309e-02, 4.49066644e-02, 3.92003203e-02, + 3.32324245e-02, 2.71533724e-02, 2.11120585e-02, 1.52512477e-02, + 9.70324900e-03, 4.58604422e-03, 2.50961778e-18, -3.97473021e-03, + -7.28003640e-03, -9.88086732e-03, -1.17654154e-02, -1.29444881e-02, + -1.34501120e-02, -1.33334458e-02, -1.26621006e-02, -1.15169871e-02, + -9.98882386e-03, -8.17444895e-03, -6.17308037e-03, -4.08266886e-03, + -1.99647619e-03, -2.25189837e-18, 1.83165284e-03, 3.43586351e-03, + 4.76401273e-03, 5.78237521e-03, 6.47239234e-03, 6.83034270e-03, + 6.86645399e-03, 6.60352025e-03, 6.07510531e-03, 5.32342672e-03, + 4.39702277e-03, 3.34830927e-03, 2.23113197e-03, 1.09841545e-03, + 1.86962369e-18, -1.01925433e-03, -1.92103305e-03, -2.67475555e-03, + -3.25835851e-03, -3.65866558e-03, -3.87135231e-03, -3.90053247e-03, + -3.75800672e-03, -3.46222687e-03, -3.03703830e-03, -2.51026925e-03, + -1.91223839e-03, -1.27425140e-03, -6.27153730e-04, -1.42248266e-18, + 5.81108062e-04, 1.09397426e-03, 1.52102188e-03, 1.84975249e-03, + 2.07294546e-03, 2.18860625e-03, 2.19968245e-03, 
2.11357591e-03, + 1.94148675e-03, 1.69763080e-03, 1.39837473e-03, 1.06133447e-03, + 7.04480871e-04, 3.45293760e-04, 9.74695082e-19, -3.17074654e-04, + -5.94017766e-04, -8.21692190e-04, -9.93941563e-04, -1.10764097e-03, + -1.16260169e-03, -1.16134522e-03, -1.10876706e-03, -1.01171451e-03, + -8.78505232e-04, -7.18414023e-04, -5.41155231e-04, -3.56385965e-04, + -1.73252713e-04, -5.81880142e-19, 1.56344667e-04, 2.90200217e-04, + 3.97571380e-04, 4.76098424e-04, 5.25022155e-04, 5.45072918e-04, + 5.38295523e-04, 5.07824294e-04, 4.57623849e-04, 3.92211738e-04, + 3.16378650e-04, 2.34920777e-04, 1.52397076e-04, 7.29218021e-05, + 2.81006480e-19, -6.35893034e-05, -1.15860365e-04, -1.55639427e-04, + -1.82538332e-04, -1.96888686e-04, -1.99643819e-04, -1.92256960e-04, + -1.76544567e-04, -1.54543830e-04, -1.28372848e-04, -1.00101113e-04, + -7.16366399e-05, -4.46345894e-05, -2.04305583e-05 +]) + +LTPF_HI = np.array([ + 6.69885837e-03, 3.96711478e-02, 1.06999186e-01, 2.09880463e-01, + 3.35690625e-01, 4.59220930e-01, 5.50075002e-01, 5.83527575e-01, + 5.50075002e-01, 4.59220930e-01, 3.35690625e-01, 2.09880463e-01, + 1.06999186e-01, 3.96711478e-02, 6.69885837e-03 +]) def print_table(t, m=4): @@ -102,6 +171,45 @@ def inv_table(): print('\n--- inv table ---') print_table(np.append(np.zeros(1), 1 / np.arange(1, 28))) +def ltpf_resampler_table(): + + for sr in [ 8, 16, 32, 24, 48 ]: + + r = 192 // sr + k = 64 if r & (r-1) else 192 + + p = (192 // k) * (k // sr) + q = p * (0.5 if sr == 8 else 1) + + print('\n--- LTPF resampler {:d} KHz to 12.8 KHz ---'.format(sr)) + + h = np.rint(np.append(LTPF_H12K8, 0.) * q * 2**15).astype(int) + h = h.reshape((len(h) // p, p)).T + h = np.flip(h, axis=0) + print('... Gain:', np.max(np.sum(np.abs(h), axis=1)) / 32768.) 
+ + for i in range(0, len(h), 192 // k): + for j in range(0, len(h[i]), 10): + print('{:5d}, {:5d}, {:5d}, {:5d}, {:5d}, ' + '{:5d}, {:5d}, {:5d}, {:5d}, {:5d},'.format( + h[i][j+0], h[i][j+1], h[i][j+2], h[i][j+3], h[i][j+4], + h[i][j+5], h[i][j+6], h[i][j+7], h[i][j+8], h[i][j+9])) + + +def ltpf_interpolate_table(): + + print('\n--- LTPF interpolation ---') + + h = np.rint(np.append(LTPF_HI, 0.) * 2**15).astype(int) + + h = h.reshape(len(h) // 4, 4).T + h = np.flip(h, axis=0) + print('... Gain:', np.max(np.sum(np.abs(h), axis=1)) / 32768.) + + for i in range(len(h)): + print('{:5d}, {:5d}, {:5d}, {:5d}'.format( + h[i][0], h[i][1], h[i][2], h[i][3])) + if __name__ == '__main__': @@ -115,4 +223,7 @@ if __name__ == '__main__': tns_quantization_table() quant_iq_table() + ltpf_resampler_table() + ltpf_interpolate_table() + print('') diff --git a/test/arm/ltpf_arm.c b/test/arm/ltpf_arm.c new file mode 100644 index 0000000..e7b8bfc --- /dev/null +++ b/test/arm/ltpf_arm.c @@ -0,0 +1,114 @@ +/****************************************************************************** + * + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +#include +#include +#include + +#include "simd32.h" + +/* -------------------------------------------------------------------------- */ + +#define TEST_ARM +#include + +void lc3_put_bits_generic(lc3_bits_t *a, unsigned b, int c) +{ (void)a, (void)b, (void)c; } + +unsigned lc3_get_bits_generic(struct lc3_bits *a, int b) +{ return (void)a, (void)b, 0; } + +/* -------------------------------------------------------------------------- */ + +static int check_resampler() +{ + int16_t __x[60+480], *x = __x + 60; + for (int i = -60; i < 480; i++) + x[i] = rand() & 0xffff; + + struct lc3_ltpf_hp50_state hp50 = { 0 }, hp50_arm = { 0 }; + int16_t y[128], y_arm[128]; + + resample_8k_12k8(&hp50, x, y, 128); + arm_resample_8k_12k8(&hp50_arm, x, y_arm, 128); + if (memcmp(y, y_arm, 128 * sizeof(*y)) != 0) + return -1; + + resample_16k_12k8(&hp50, x, y, 128); + arm_resample_16k_12k8(&hp50_arm, x, y_arm, 128); + if (memcmp(y, y_arm, 128 * sizeof(*y)) != 0) + return -1; + + resample_24k_12k8(&hp50, x, y, 128); + arm_resample_24k_12k8(&hp50_arm, x, y_arm, 128); + if (memcmp(y, y_arm, 128 * sizeof(*y)) != 0) + return -1; + + resample_32k_12k8(&hp50, x, y, 128); + arm_resample_32k_12k8(&hp50_arm, x, y_arm, 128); + if (memcmp(y, y_arm, 128 * sizeof(*y)) != 0) + return -1; + + resample_48k_12k8(&hp50, x, y, 128); + arm_resample_48k_12k8(&hp50_arm, x, y_arm, 128); + if (memcmp(y, y_arm, 128 * sizeof(*y)) != 0) + return -1; + + return 0; +} + +static int check_correlate() +{ + int16_t alignas(4) a[500], b[500]; + float y[100], y_arm[100]; + + for (int i = 0; i < 500; i++) { + a[i] = rand() & 0xffff; + b[i] = rand() & 0xffff; + } + + correlate(a, b+200, 128, y, 100); + arm_correlate(a, b+200, 128, y_arm, 100); + if (memcmp(y, y_arm, 100 * sizeof(*y)) != 0) + return -1; + + correlate(a, b+199, 128, y, 99); + arm_correlate(a, b+199, 128, y_arm, 99); + if (memcmp(y, y_arm, 99 * sizeof(*y)) != 0) + return 
-1; + + correlate(a, b+199, 128, y, 100); + arm_correlate(a, b+199, 128, y_arm, 100); + if (memcmp(y, y_arm, 100 * sizeof(*y)) != 0) + return -1; + + return 0; +} + +int check_ltpf(void) +{ + int ret; + + if ((ret = check_resampler()) < 0) + return ret; + + if ((ret = check_correlate()) < 0) + return ret; + + return 0; +} diff --git a/test/arm/makefile.mk b/test/arm/makefile.mk new file mode 100644 index 0000000..91d11d2 --- /dev/null +++ b/test/arm/makefile.mk @@ -0,0 +1,31 @@ +# +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +test_arm_src += \ + $(TEST_DIR)/arm/test_arm.c \ + $(TEST_DIR)/arm/ltpf_arm.c \ + $(SRC_DIR)/tables.c + +test_arm_include += $(SRC_DIR) +test_arm_ldlibs += m + +$(eval $(call add-bin,test_arm)) + +test_arm: $(test_arm_bin) + @echo " RUN $(notdir $<)" + $(V)$< + +test: test_arm diff --git a/test/arm/simd32.h b/test/arm/simd32.h new file mode 100644 index 0000000..fd17f71 --- /dev/null +++ b/test/arm/simd32.h @@ -0,0 +1,64 @@ +/****************************************************************************** + * + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +#if __ARM_FEATURE_SIMD32 + +#include + +#else +#define __ARM_FEATURE_SIMD32 1 + +#include + +typedef int32_t int16x2_t; + +__attribute__((unused)) +static int16x2_t __pkhbt(int16x2_t a, int16x2_t b) +{ + uint32_t a_bot = (uint32_t)a & 0x0000ffffu; + uint32_t b_top = (uint32_t)b & 0xffff0000u; + + return (int16x2_t)(a_bot | b_top); +} + +__attribute__((unused)) +static int32_t __smlad(int16x2_t a, int16x2_t b, int32_t u) +{ + int16_t a_hi = a >> 16, a_lo = a & 0xffff; + int16_t b_hi = b >> 16, b_lo = b & 0xffff; + + return u + (a_hi * b_hi) + (a_lo * b_lo); +} + +__attribute__((unused)) +static int64_t __smlald(int16x2_t a, int16x2_t b, int64_t u) +{ + int16_t a_hi = a >> 16, a_lo = a & 0xffff; + int16_t b_hi = b >> 16, b_lo = b & 0xffff; + return u + (a_hi * b_hi) + (a_lo * b_lo); +} + +__attribute__((unused)) +static int64_t __smlaldx(int16x2_t a, int16x2_t b, int64_t u) +{ + int16_t a_hi = a >> 16, a_lo = a & 0xffff; + int16_t b_hi = b >> 16, b_lo = b & 0xffff; + return u + (a_hi * b_lo) + (a_lo * b_hi); +} + +#endif /* __ARM_FEATURE_SIMD32 */ diff --git a/test/arm/test_arm.c b/test/arm/test_arm.c new file mode 100644 index 0000000..1e5c15b --- /dev/null +++ b/test/arm/test_arm.c @@ -0,0 +1,32 @@ +/****************************************************************************** + * + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +#include + +int check_ltpf(void); + +int main() +{ + int r, ret = 0; + + printf("Checking LTPF ARM... "); fflush(stdout); + printf("%s\n", (r = check_ltpf()) == 0 ? "OK" : "Failed"); + ret = ret || r; + + return ret; +} diff --git a/test/ctypes.h b/test/ctypes.h index 7eb17b5..5627494 100644 --- a/test/ctypes.h +++ b/test/ctypes.h @@ -241,10 +241,10 @@ static PyObject *to_ltpf_hp50_state( CTYPES_CHECK("hp50", obj && PyDict_Check(obj)); CTYPES_CHECK("hp50.s1", to_scalar( - PyDict_GetItemString(obj, "s1"), NPY_FLOAT, &hp50->s1)); + PyDict_GetItemString(obj, "s1"), NPY_INT64, &hp50->s1)); CTYPES_CHECK("hp50.s2", to_scalar( - PyDict_GetItemString(obj, "s2"), NPY_FLOAT, &hp50->s2)); + PyDict_GetItemString(obj, "s2"), NPY_INT64, &hp50->s2)); return obj; } @@ -254,10 +254,10 @@ static PyObject *from_ltpf_hp50_state( PyObject *obj, const struct lc3_ltpf_hp50_state *hp50) { PyDict_SetItemString(obj, "s1", - new_scalar(NPY_FLOAT, &hp50->s1)); + new_scalar(NPY_INT64, &hp50->s1)); PyDict_SetItemString(obj, "s2", - new_scalar(NPY_FLOAT, &hp50->s2)); + new_scalar(NPY_INT64, &hp50->s2)); return obj; } @@ -267,8 +267,8 @@ static PyObject *to_ltpf_analysis( PyObject *obj, struct lc3_ltpf_analysis *ltpf) { PyObject *nc_obj, *x_12k8_obj, *x_6k4_obj; - const int n_12k8 = sizeof(ltpf->x_12k8) / sizeof(float); - const int n_6k4 = sizeof(ltpf->x_6k4) / sizeof(float); + const int n_12k8 = sizeof(ltpf->x_12k8) / sizeof(*ltpf->x_12k8); + const int n_6k4 = sizeof(ltpf->x_6k4) / 
sizeof(*ltpf->x_6k4); CTYPES_CHECK("ltpf", obj && PyDict_Check(obj)); @@ -286,11 +286,11 @@ static PyObject *to_ltpf_analysis( PyDict_GetItemString(obj, "hp50"), <pf->hp50)); CTYPES_CHECK("ltpf.x_12k8", x_12k8_obj = to_1d_copy( - PyDict_GetItemString(obj, "x_12k8"), NPY_FLOAT, ltpf->x_12k8, n_12k8)); + PyDict_GetItemString(obj, "x_12k8"), NPY_INT16, ltpf->x_12k8, n_12k8)); PyDict_SetItemString(obj, "x_12k8", x_12k8_obj); CTYPES_CHECK("ltpf.x_6k4", x_6k4_obj = to_1d_copy( - PyDict_GetItemString(obj, "x_6k4"), NPY_FLOAT, ltpf->x_6k4, n_6k4)); + PyDict_GetItemString(obj, "x_6k4"), NPY_INT16, ltpf->x_6k4, n_6k4)); PyDict_SetItemString(obj, "x_6k4", x_6k4_obj); CTYPES_CHECK("ltpf.tc", to_scalar( @@ -303,8 +303,8 @@ __attribute__((unused)) static PyObject *from_ltpf_analysis( PyObject *obj, const struct lc3_ltpf_analysis *ltpf) { - const int n_12k8 = sizeof(ltpf->x_12k8) / sizeof(float); - const int n_6k4 = sizeof(ltpf->x_6k4) / sizeof(float); + const int n_12k8 = sizeof(ltpf->x_12k8) / sizeof(*ltpf->x_12k8); + const int n_6k4 = sizeof(ltpf->x_6k4) / sizeof(*ltpf->x_6k4); if (!obj) obj = PyDict_New(); @@ -321,10 +321,10 @@ static PyObject *from_ltpf_analysis( from_ltpf_hp50_state(PyDict_New(), <pf->hp50)); PyDict_SetItemString(obj, "x_12k8", - new_1d_copy(NPY_FLOAT, n_12k8, <pf->x_12k8)); + new_1d_copy(NPY_INT16, n_12k8, <pf->x_12k8)); PyDict_SetItemString(obj, "x_6k4", - new_1d_copy(NPY_FLOAT, n_6k4, <pf->x_6k4)); + new_1d_copy(NPY_INT16, n_6k4, <pf->x_6k4)); PyDict_SetItemString(obj, "tc", new_scalar(NPY_INT, <pf->tc)); @@ -703,6 +703,7 @@ static PyObject *from_encoder(PyObject *obj, const struct lc3_encoder *enc) unsigned sr_pcm = enc->sr_pcm; int ns = LC3_NS(dt, sr); int nd = LC3_ND(dt, sr); + int nt = LC3_NT(sr); if (!obj) obj = PyDict_New(); @@ -724,6 +725,9 @@ static PyObject *from_encoder(PyObject *obj, const struct lc3_encoder *enc) PyDict_SetItemString(obj, "quant", from_spec_analysis(NULL, &enc->spec)); + PyDict_SetItemString(obj, "xt", + 
new_1d_copy(NPY_INT16, nt+ns, enc->xt-nt)); + PyDict_SetItemString(obj, "xs", new_1d_copy(NPY_FLOAT, ns+nd, enc->xs-nd)); @@ -737,7 +741,7 @@ __attribute__((unused)) static PyObject *to_encoder(PyObject *obj, struct lc3_encoder *enc) { unsigned dt, sr, sr_pcm; - PyObject *xs_obj, *xf_obj; + PyObject *xt_obj, *xs_obj, *xf_obj; CTYPES_CHECK("encoder", obj && PyDict_Check(obj)); @@ -756,6 +760,7 @@ static PyObject *to_encoder(PyObject *obj, struct lc3_encoder *enc) int ns = LC3_NS(dt, sr); int nd = LC3_ND(dt, sr); + int nt = LC3_NT(sr); CTYPES_CHECK(NULL, to_attdet_analysis( PyDict_GetItemString(obj, "attdet"), &enc->attdet)); @@ -766,6 +771,10 @@ static PyObject *to_encoder(PyObject *obj, struct lc3_encoder *enc) CTYPES_CHECK(NULL, to_spec_analysis( PyDict_GetItemString(obj, "quant"), &enc->spec)); + CTYPES_CHECK("encoder.xt", xt_obj = to_1d_copy( + PyDict_GetItemString(obj, "xt"), NPY_INT16, enc->xt-nt, ns+nt)); + PyDict_SetItemString(obj, "xt", xt_obj); + CTYPES_CHECK("encoder.xs", xs_obj = to_1d_copy( PyDict_GetItemString(obj, "xs"), NPY_FLOAT, enc->xs-nd, ns+nd)); PyDict_SetItemString(obj, "xs", xs_obj); @@ -782,8 +791,8 @@ static PyObject *from_decoder(PyObject *obj, const struct lc3_decoder *dec) { unsigned dt = dec->dt, sr = dec->sr; unsigned sr_pcm = dec->sr_pcm; - unsigned xs_pos = dec->xs - dec->xr; - int nr = LC3_NR(dt, sr); + unsigned xs_pos = dec->xs - dec->xh; + int nh = LC3_NH(dt, sr); int ns = LC3_NS(dt, sr); int nd = LC3_ND(dt, sr); @@ -804,8 +813,8 @@ static PyObject *from_decoder(PyObject *obj, const struct lc3_decoder *dec) PyDict_SetItemString(obj, "plc", new_plc_state(&dec->plc)); - PyDict_SetItemString(obj, "xr", - new_1d_copy(NPY_FLOAT, nr, dec->xr)); + PyDict_SetItemString(obj, "xh", + new_1d_copy(NPY_FLOAT, nh, dec->xh)); PyDict_SetItemString(obj, "xs_pos", new_scalar(NPY_INT, &xs_pos)); @@ -823,7 +832,7 @@ __attribute__((unused)) static PyObject *to_decoder(PyObject *obj, struct lc3_decoder *dec) { unsigned dt, sr, sr_pcm, xs_pos; - 
PyObject *xr_obj, *xd_obj, *xg_obj; + PyObject *xh_obj, *xd_obj, *xg_obj; CTYPES_CHECK("decoder", obj && PyDict_Check(obj)); @@ -840,7 +849,7 @@ static PyObject *to_decoder(PyObject *obj, struct lc3_decoder *dec) CTYPES_CHECK("decoder.sr_pcm", (unsigned)(dec->sr_pcm = sr_pcm) < LC3_NUM_SRATE); - int nr = LC3_NR(dt, sr); + int nh = LC3_NH(dt, sr); int ns = LC3_NS(dt, sr); int nd = LC3_ND(dt, sr); @@ -850,13 +859,13 @@ static PyObject *to_decoder(PyObject *obj, struct lc3_decoder *dec) CTYPES_CHECK(NULL, to_plc_state( PyDict_GetItemString(obj, "plc"), &dec->plc)); - CTYPES_CHECK("decoder.xr", xr_obj = to_1d_copy( - PyDict_GetItemString(obj, "xr"), NPY_FLOAT, dec->xr, nr)); - PyDict_SetItemString(obj, "xr", xr_obj); + CTYPES_CHECK("decoder.xh", xh_obj = to_1d_copy( + PyDict_GetItemString(obj, "xh"), NPY_FLOAT, dec->xh, nh)); + PyDict_SetItemString(obj, "xh", xh_obj); CTYPES_CHECK("decoder.xs", to_scalar( PyDict_GetItemString(obj, "xs_pos"), NPY_INT, &xs_pos)); - dec->xs = dec->xr + xs_pos; + dec->xs = dec->xh + xs_pos; CTYPES_CHECK("decoder.xd", xd_obj = to_1d_copy( PyDict_GetItemString(obj, "xd"), NPY_FLOAT, dec->xd, nd)); diff --git a/test/ltpf.py b/test/ltpf.py index a159da7..1a852c8 100644 --- a/test/ltpf.py +++ b/test/ltpf.py @@ -120,7 +120,7 @@ class Resampler_6k4: def initial_hp50_state(): - return { 's1': 0.0, 's2': 0.0 } + return { 's1': 0, 's2': 0 } ### ------------------------------------------------------------------------ ### @@ -442,25 +442,25 @@ def initial_sstate(): def check_resampler(rng, dt, sr): ns = T.NS[dt][sr] - nd = T.ND[dt][sr] + nt = (5 * T.SRATE_KHZ[sr]) // 4 ok = True r = Resampler_12k8(dt, sr) hp50_c = initial_hp50_state() - x_c = np.zeros(nd) + x_c = np.zeros(nt) y_c = np.zeros(384) for run in range(10): - x = (2 * rng.random(ns)) - 1 + x = ((2 * rng.random(ns)) - 1) * (2 ** 15 - 1) y = r.resample(x) - x_c = np.append(x_c[-nd:], x) + x_c = np.append(x_c[-nt:], x.astype(np.int16)) y_c[:-r.n] = y_c[r.n:] y_c = lc3.ltpf_resample(dt, sr, 
hp50_c, x_c, y_c) - ok = ok and np.amax(np.abs(y_c[-r.d-r.n:] - y[:r.d+r.n])) < 1e-4 + ok = ok and np.amax(np.abs(y_c[-r.d-r.n:] - y[:r.d+r.n]/2)) < 4 return ok @@ -469,54 +469,54 @@ def check_resampler_appendix_c(dt): sr = T.SRATE_16K ok = True - nd = T.ND[dt][sr] + nt = (5 * T.SRATE_KHZ[sr]) // 4 n = [ 96, 128 ][dt] k = [ 44, 24 ][dt] + n state = initial_hp50_state() - x = np.append(np.zeros(nd), C.X_PCM[dt][0]) + x = np.append(np.zeros(nt), C.X_PCM[dt][0]) y = np.zeros(384) y = lc3.ltpf_resample(dt, sr, state, x, y) u = y[-k:len(C.X_TILDE_12K8D[dt][0])-k] - ok = np.amax(np.abs(u - C.X_TILDE_12K8D[dt][0])) < 1e0 + ok = ok and np.amax(np.abs(u - C.X_TILDE_12K8D[dt][0]/2)) < 2 - x = np.append(x[-nd:], C.X_PCM[dt][1]) + x = np.append(x[-nt:], C.X_PCM[dt][1]) y[:-n] = y[n:] y = lc3.ltpf_resample(dt, sr, state, x, y) u = y[-k:len(C.X_TILDE_12K8D[dt][1])-k] - ok = ok and np.amax(np.abs(u - C.X_TILDE_12K8D[dt][1])) < 1e0 + ok = ok and np.amax(np.abs(u - C.X_TILDE_12K8D[dt][1]/2)) < 2 return ok def check_analysis(rng, dt, sr): ns = T.NS[dt][sr] - nd = T.ND[dt][sr] + nt = (5 * T.SRATE_KHZ[sr]) // 4 ok = True state_c = initial_state() - x_c = np.zeros(ns+nd) + x_c = np.zeros(ns+nt) ltpf = LtpfAnalysis(dt, sr) t = np.arange(100 * ns) / (T.SRATE_KHZ[sr] * 1000) - s = signal.chirp(t, f0=50, f1=3e3, t1=t[-1], method='logarithmic') + s = signal.chirp(t, f0=10, f1=3e3, t1=t[-1], method='logarithmic') for i in range(20): - x = s[i*ns:(i+1)*ns] + x = s[i*ns:(i+1)*ns] * (2 ** 15 - 1) pitch_present = ltpf.run(x) data = ltpf.get_data() - x_c = np.append(x_c[-nd:], x) + x_c = np.append(x_c[-nt:], x.astype(np.int16)) (pitch_present_c, data_c) = lc3.ltpf_analyse(dt, sr, state_c, x_c) - ok = ok and state_c['tc'] == ltpf.tc - ok = ok and np.amax(np.abs(state_c['nc'][0] - ltpf.nc[0])) < 1e-4 + ok = ok and (not pitch_present or state_c['tc'] == ltpf.tc) + ok = ok and np.amax(np.abs(state_c['nc'][0] - ltpf.nc[0])) < 1e-2 ok = ok and pitch_present_c == pitch_present ok = ok and 
data_c['active'] == data['active'] ok = ok and data_c['pitch_index'] == data['pitch_index'] @@ -564,12 +564,12 @@ def check_synthesis(rng, dt, sr): def check_analysis_appendix_c(dt): sr = T.SRATE_16K - nd = T.ND[dt][sr] + nt = (5 * T.SRATE_KHZ[sr]) // 4 ok = True state = initial_state() - x = np.append(np.zeros(nd), C.X_PCM[dt][0]) + x = np.append(np.zeros(nt), C.X_PCM[dt][0]) (pitch_present, data) = lc3.ltpf_analyse(dt, sr, state, x) ok = ok and C.T_CURR[dt][0] - state['tc'] == 17 @@ -578,7 +578,7 @@ def check_analysis_appendix_c(dt): ok = ok and data['pitch_index'] == C.PITCH_INDEX[dt][0] ok = ok and data['active'] == C.LTPF_ACTIVE[dt][0] - x = np.append(x[-nd:], C.X_PCM[dt][1]) + x = np.append(x[-nt:], C.X_PCM[dt][1]) (pitch_present, data) = lc3.ltpf_analyse(dt, sr, state, x) ok = ok and C.T_CURR[dt][1] - state['tc'] == 17 diff --git a/test/ltpf_py.c b/test/ltpf_py.c index 427dbb9..c51eadd 100644 --- a/test/ltpf_py.c +++ b/test/ltpf_py.c @@ -27,7 +27,7 @@ static PyObject *resample_py(PyObject *m, PyObject *args) unsigned dt, sr; PyObject *hp50_obj, *x_obj, *y_obj; struct lc3_ltpf_hp50_state hp50; - float *x, *y; + int16_t *x, *y; if (!PyArg_ParseTuple(args, "IIOOO", &dt, &sr, &hp50_obj, &x_obj, &y_obj)) return NULL; @@ -36,14 +36,14 @@ static PyObject *resample_py(PyObject *m, PyObject *args) CTYPES_CHECK("sr", (unsigned)sr < LC3_NUM_SRATE); CTYPES_CHECK(NULL, hp50_obj = to_ltpf_hp50_state(hp50_obj, &hp50)); - int ns = LC3_NS(dt, sr), nd = LC3_ND(dt, sr); - int ny = sizeof((struct lc3_ltpf_analysis){ }.x_12k8) / sizeof(float); + int ns = LC3_NS(dt, sr), nt = LC3_NT(dt); + int ny = sizeof((struct lc3_ltpf_analysis){ }.x_12k8) / sizeof(int16_t); int n = dt == LC3_DT_7M5 ? 
96 : 128; - CTYPES_CHECK("x", x_obj = to_1d_ptr(x_obj, NPY_FLOAT, ns+nd, &x)); - CTYPES_CHECK("y", y_obj = to_1d_ptr(y_obj, NPY_FLOAT, ny, &y)); + CTYPES_CHECK("x", x_obj = to_1d_ptr(x_obj, NPY_INT16, ns+nt, &x)); + CTYPES_CHECK("y", y_obj = to_1d_ptr(y_obj, NPY_INT16, ny, &y)); - resample_12k8[sr](&hp50, x + nd, y + (ny - n), n); + resample_12k8[sr](&hp50, x + nt, y + (ny - n), n); from_ltpf_hp50_state(hp50_obj, &hp50); return Py_BuildValue("O", y_obj); @@ -55,7 +55,7 @@ static PyObject *analyse_py(PyObject *m, PyObject *args) unsigned dt, sr; struct lc3_ltpf_analysis ltpf; struct lc3_ltpf_data data = { 0 }; - float *x; + int16_t *x; if (!PyArg_ParseTuple(args, "IIOO", &dt, &sr, <pf_obj, &x_obj)) return NULL; @@ -64,12 +64,12 @@ static PyObject *analyse_py(PyObject *m, PyObject *args) CTYPES_CHECK("sr", sr < LC3_NUM_SRATE); CTYPES_CHECK(NULL, ltpf_obj = to_ltpf_analysis(ltpf_obj, <pf)); - int ns = LC3_NS(dt, sr), nd = LC3_ND(dt, sr); + int ns = LC3_NS(dt, sr), nt = LC3_NT(sr); - CTYPES_CHECK("x", x_obj = to_1d_ptr(x_obj, NPY_FLOAT, ns+nd, &x)); + CTYPES_CHECK("x", x_obj = to_1d_ptr(x_obj, NPY_INT16, ns+nt, &x)); int pitch_present = - lc3_ltpf_analyse(dt, sr, <pf, x + nd, &data); + lc3_ltpf_analyse(dt, sr, <pf, x + nt, &data); from_ltpf_analysis(ltpf_obj, <pf); return Py_BuildValue("iN", pitch_present, new_ltpf_data(&data)); diff --git a/test/makefile.mk b/test/makefile.mk index c2ae83b..cfced65 100644 --- a/test/makefile.mk +++ b/test/makefile.mk @@ -16,12 +16,17 @@ TEST_DIR := test +test_py: + $(V)cd $(TEST_DIR) && python3 setup.py && python3 run.py + .PHONY: test test-clean -test: - $(V)cd $(TEST_DIR) && python3 setup.py && python3 run.py +test: test_py test-clean: $(V)cd $(TEST_DIR) && python3 setup.py clean > /tmp/zero +-include $(TEST_DIR)/arm/makefile.mk +-include $(TEST_DIR)/neon/makefile.mk + clean-all: test-clean diff --git a/test/neon/ltpf_neon.c b/test/neon/ltpf_neon.c new file mode 100644 index 0000000..0577bd1 --- /dev/null +++ b/test/neon/ltpf_neon.c 
@@ -0,0 +1,116 @@ +/****************************************************************************** + * + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +#include "neon.h" + +#include +#include +#include + +/* -------------------------------------------------------------------------- */ + +#define TEST_NEON +#include + +void lc3_put_bits_generic(lc3_bits_t *a, unsigned b, int c) +{ (void)a, (void)b, (void)c; } + +unsigned lc3_get_bits_generic(struct lc3_bits *a, int b) +{ return (void)a, (void)b, 0; } + +/* -------------------------------------------------------------------------- */ + +static int check_resampler() +{ + int16_t __x[60+480], *x = __x + 60; + for (int i = -60; i < 480; i++) + x[i] = rand() & 0xffff; + + struct lc3_ltpf_hp50_state hp50 = { 0 }, hp50_neon = { 0 }; + int16_t y[128], y_neon[128]; + + resample_16k_12k8(&hp50, x, y, 128); + neon_resample_16k_12k8(&hp50_neon, x, y_neon, 128); + if (memcmp(y, y_neon, 128 * sizeof(*y)) != 0) + return printf("Error\n"), -1; + + resample_32k_12k8(&hp50, x, y, 128); + neon_resample_32k_12k8(&hp50_neon, x, y_neon, 128); + if (memcmp(y, y_neon, 128 * sizeof(*y)) != 0) + return printf("Error\n"), -1; + + resample_48k_12k8(&hp50, x, y, 128); + neon_resample_48k_12k8(&hp50_neon, x, y_neon, 128); + if (memcmp(y, y_neon, 128 * sizeof(*y)) != 0) + return -1; + + return 0; +} + +static int 
check_dot() +{ + int16_t x[200]; + for (int i = 0; i < 200; i++) + x[i] = rand() & 0xffff; + + float y = dot(x, x+3, 128); + float y_neon = neon_dot(x, x+3, 128); + if (y != y_neon) + return -1; + + return 0; +} + +static int check_correlate() +{ + int16_t alignas(4) a[500], b[500]; + float y[100], y_neon[100]; + + for (int i = 0; i < 500; i++) { + a[i] = rand() & 0xffff; + b[i] = rand() & 0xffff; + } + + correlate(a, b+200, 128, y, 100); + neon_correlate(a, b+200, 128, y_neon, 100); + if (memcmp(y, y_neon, 100 * sizeof(*y)) != 0) + return -1; + + correlate(a, b+199, 128, y, 99); + neon_correlate(a, b+199, 128, y_neon, 99); + if (memcmp(y, y_neon, 99 * sizeof(*y)) != 0) + return -1; + + return 0; +} + +int check_ltpf(void) +{ + int ret; + + if ((ret = check_resampler()) < 0) + return ret; + + if ((ret = check_dot()) < 0) + return ret; + + if ((ret = check_correlate()) < 0) + return ret; + + return 0; +} diff --git a/test/neon/makefile.mk b/test/neon/makefile.mk new file mode 100644 index 0000000..c01e70f --- /dev/null +++ b/test/neon/makefile.mk @@ -0,0 +1,31 @@ +# +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +test_neon_src += \ + $(TEST_DIR)/neon/test_neon.c \ + $(TEST_DIR)/neon/ltpf_neon.c \ + $(SRC_DIR)/tables.c + +test_neon_include += $(SRC_DIR) +test_neon_ldlibs += m + +$(eval $(call add-bin,test_neon)) + +test_neon: $(test_neon_bin) + @echo " RUN $(notdir $<)" + $(V)$< + +test: test_neon diff --git a/test/neon/neon.h b/test/neon/neon.h new file mode 100644 index 0000000..4015ca5 --- /dev/null +++ b/test/neon/neon.h @@ -0,0 +1,141 @@ +/****************************************************************************** + * + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +#if __ARM_NEON + +#include + +#else +#define __ARM_NEON 1 + +#include + +typedef struct { int16_t e[4]; } int16x4_t; + +typedef struct { int16_t e[8]; } int16x8_t; +typedef struct { int32_t e[4]; } int32x4_t; +typedef struct { int64_t e[2]; } int64x2_t; + + +/* ---------------------------------------------------------------------------- + * Load / Store + * -------------------------------------------------------------------------- */ + +__attribute__((unused)) +static int16x4_t vld1_s16(const int16_t *p) +{ + int16x4_t r; + + for (int i = 0; i < 4; i++) + r.e[i] = *(p++); + + return r; +} + +__attribute__((unused)) +static int64x2_t vmovq_n_s64(int64_t v) +{ + int64x2_t r; + + r.e[0] = v; + r.e[1] = v; + + return r; +} + + +/* ---------------------------------------------------------------------------- + * Move + * -------------------------------------------------------------------------- */ + +__attribute__((unused)) +static int32x4_t vmovq_n_s32(uint32_t v) +{ + int32x4_t r; + + for (int i = 0; i < 4; i++) + r.e[i] = v; + + return r; +} + +__attribute__((unused)) +static int16x4_t vext_s16(int16x4_t a, int16x4_t b, const int n) +{ + int16x4_t r; + int i = 0; + + for (; i < n; i++) r.e[3-i] = b.e[(n-1)-i]; + for (; i < 4; i++) r.e[3-i] = a.e[3-(i-n)]; + + return r; +} + +/* ---------------------------------------------------------------------------- + * Arithmetic + * -------------------------------------------------------------------------- */ + +__attribute__((unused)) +static int32x4_t vmull_s16(int16x4_t a, int16x4_t b) +{ + int32x4_t r; + + for (int i = 0; i < 4; i++) + r.e[i] = (int32_t)a.e[i] * b.e[i]; + + return r; +} + +__attribute__((unused)) +static int32x4_t vmlal_s16(int32x4_t r, int16x4_t a, int16x4_t b) +{ + for (int i = 0; i < 4; i++) + r.e[i] += (int32_t)a.e[i] * b.e[i]; + + return r; +} + +__attribute__((unused)) +static int64x2_t vpadalq_s32(int64x2_t a, 
int32x4_t b) +{ + int64x2_t r; + + r.e[0] = a.e[0] + ((int64_t)b.e[0] + b.e[1]); + r.e[1] = a.e[1] + ((int64_t)b.e[2] + b.e[3]); + + return r; +} + + +/* ---------------------------------------------------------------------------- + * Reduce + * -------------------------------------------------------------------------- */ + +__attribute__((unused)) +static int32_t vaddvq_s32(int32x4_t v) +{ + return v.e[0] + v.e[1] + v.e[2] + v.e[3]; +} + +__attribute__((unused)) +static int64_t vaddvq_s64(int64x2_t v) +{ + return v.e[0] + v.e[1]; +} + +#endif /* __ARM_NEON */ diff --git a/test/neon/test_neon.c b/test/neon/test_neon.c new file mode 100644 index 0000000..af9bd98 --- /dev/null +++ b/test/neon/test_neon.c @@ -0,0 +1,32 @@ +/****************************************************************************** + * + * Copyright 2022 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +#include + +int check_ltpf(void); + +int main() +{ + int r, ret = 0; + + printf("Checking LTPF Neon... "); fflush(stdout); + printf("%s\n", (r = check_ltpf()) == 0 ? "OK" : "Failed"); + ret = ret || r; + + return ret; +}