Skip to content

Commit 35b5feb

Browse files
VSZSagosdahu
authored andcommitted
Arm: Speed up FLOAT2INT16 conversion with Neon
Using Neon for float to int conversion, and introducing platform- specific function for converting an array of float values to int16. Also adding appropriate unit test.
1 parent c79a9bd commit 35b5feb

9 files changed

+277
-4
lines changed

celt/arm/arm_celt_map.c

+16-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/* Copyright (c) 2010 Xiph.Org Foundation
2-
* Copyright (c) 2013 Parrot */
2+
* Copyright (c) 2013 Parrot
3+
* Copyright (c) 2024 Arm Limited */
34
/*
45
Redistribution and use in source and binary forms, with or without
56
modification, are permitted provided that the following conditions
@@ -29,12 +30,25 @@
2930
#include "config.h"
3031
#endif
3132

32-
#include "pitch.h"
3333
#include "kiss_fft.h"
34+
#include "mathops.h"
3435
#include "mdct.h"
36+
#include "pitch.h"
3537

3638
#if defined(OPUS_HAVE_RTCD)
3739

40+
# if !defined(DISABLE_FLOAT_API)
41+
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
42+
void (*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt) = {
43+
celt_float2int16_c, /* ARMv4 */
44+
celt_float2int16_c, /* EDSP */
45+
celt_float2int16_c, /* Media */
46+
celt_float2int16_neon,/* NEON */
47+
celt_float2int16_neon /* DOTPROD */
48+
};
49+
# endif
50+
# endif
51+
3852
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
3953
opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
4054
celt_inner_prod_c, /* ARMv4 */

celt/arm/celt_neon_intr.c

+51
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
/* Copyright (c) 2014-2015 Xiph.Org Foundation
2+
Copyright (c) 2024 Arm Limited
23
Written by Viswanath Puttagunta */
34
/**
45
@file celt_neon_intr.c
@@ -35,7 +36,57 @@
3536
#endif
3637

3738
#include <arm_neon.h>
39+
#include "../float_cast.h"
40+
#include "../mathops.h"
3841
#include "../pitch.h"
42+
#if defined(OPUS_CHECK_ASM)
43+
#include <stdlib.h>
44+
#endif
45+
46+
#if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
47+
48+
void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
49+
{
50+
int i = 0;
51+
52+
#if defined(__ARM_NEON)
53+
const int BLOCK_SIZE = 16;
54+
const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE;
55+
56+
for (; i < blockedSize; i += BLOCK_SIZE)
57+
{
58+
float32x4_t orig_a = vld1q_f32(&in[i + 0]);
59+
float32x4_t orig_b = vld1q_f32(&in[i + 4]);
60+
float32x4_t orig_c = vld1q_f32(&in[i + 8]);
61+
float32x4_t orig_d = vld1q_f32(&in[i + 12]);
62+
63+
int16x4_t asShort_a = vqmovn_s32(vroundf(vmulq_n_f32(orig_a, CELT_SIG_SCALE)));
64+
int16x4_t asShort_b = vqmovn_s32(vroundf(vmulq_n_f32(orig_b, CELT_SIG_SCALE)));
65+
int16x4_t asShort_c = vqmovn_s32(vroundf(vmulq_n_f32(orig_c, CELT_SIG_SCALE)));
66+
int16x4_t asShort_d = vqmovn_s32(vroundf(vmulq_n_f32(orig_d, CELT_SIG_SCALE)));
67+
68+
vst1_s16(&out[i + 0], asShort_a);
69+
vst1_s16(&out[i + 4], asShort_b);
70+
vst1_s16(&out[i + 8], asShort_c);
71+
vst1_s16(&out[i + 12], asShort_d);
72+
# if defined(OPUS_CHECK_ASM)
73+
short out_c[BLOCK_SIZE];
74+
int j;
75+
for(j = 0; j < BLOCK_SIZE; j++)
76+
{
77+
out_c[j] = FLOAT2INT16(in[i + j]);
78+
celt_assert(abs((out_c[j] - out[i + j])) <= 1);
79+
}
80+
# endif
81+
}
82+
#endif
83+
84+
for (; i < cnt; i++)
85+
{
86+
out[i] = FLOAT2INT16(in[i]);
87+
}
88+
}
89+
#endif
3990

4091
#if defined(FIXED_POINT)
4192
#include <string.h>

celt/arm/mathops_arm.h

+65
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/* Copyright (c) 2024 Arm Limited */
2+
/*
3+
Redistribution and use in source and binary forms, with or without
4+
modification, are permitted provided that the following conditions
5+
are met:
6+
7+
- Redistributions of source code must retain the above copyright
8+
notice, this list of conditions and the following disclaimer.
9+
10+
- Redistributions in binary form must reproduce the above copyright
11+
notice, this list of conditions and the following disclaimer in the
12+
documentation and/or other materials provided with the distribution.
13+
14+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15+
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
18+
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21+
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
22+
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23+
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24+
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
*/
26+
27+
#if !defined(MATHOPS_ARM_H)
28+
# define MATHOPS_ARM_H
29+
30+
#include "armcpu.h"
31+
#include "cpu_support.h"
32+
#include "opus_defines.h"
33+
34+
# if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
35+
36+
#include <arm_neon.h>
37+
38+
static inline int32x4_t vroundf(float32x4_t x)
39+
{
40+
# if defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
41+
return vcvtaq_s32_f32(x);
42+
# else
43+
uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), vdupq_n_u32(0x80000000));
44+
uint32x4_t bias = vdupq_n_u32(0x3F000000);
45+
return vcvtq_s32_f32(vaddq_f32(x, vreinterpretq_f32_u32(vorrq_u32(bias, sign))));
46+
# endif
47+
}
48+
49+
void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
50+
# if defined(OPUS_HAVE_RTCD) && \
51+
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
52+
extern void
53+
(*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
54+
55+
# define OVERRIDE_FLOAT2INT16 (1)
56+
# define celt_float2int16(in, out, cnt, arch) \
57+
((*CELT_FLOAT2INT16_IMPL[(arch)&OPUS_ARCHMASK])(in, out, cnt))
58+
59+
# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
60+
# define OVERRIDE_FLOAT2INT16 (1)
61+
# define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_neon(in, out, cnt))
62+
# endif
63+
# endif
64+
65+
#endif /* MATHOPS_ARM_H */

celt/float_cast.h

+7
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,13 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s
9898

9999
return intgr ;
100100
}
101+
#elif defined(__aarch64__)
102+
103+
#include <arm_neon.h>
104+
static OPUS_INLINE opus_int32 float2int(float flt)
105+
{
106+
return vcvtns_s32_f32(flt);
107+
}
101108

102109
#elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
103110

celt/mathops.c

+15
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/* Copyright (c) 2002-2008 Jean-Marc Valin
22
Copyright (c) 2007-2008 CSIRO
33
Copyright (c) 2007-2009 Xiph.Org Foundation
4+
Copyright (c) 2024 Arm Limited
45
Written by Jean-Marc Valin */
56
/**
67
@file mathops.h
@@ -35,6 +36,7 @@
3536
#include "config.h"
3637
#endif
3738

39+
#include "float_cast.h"
3840
#include "mathops.h"
3941

4042
/*Compute floor(sqrt(_val)) with exact arithmetic.
@@ -215,3 +217,16 @@ opus_val32 celt_rcp(opus_val32 x)
215217
}
216218

217219
#endif
220+
221+
#ifndef DISABLE_FLOAT_API
222+
223+
void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
224+
{
225+
int i;
226+
for (i = 0; i < cnt; i++)
227+
{
228+
out[i] = FLOAT2INT16(in[i]);
229+
}
230+
}
231+
232+
#endif /* DISABLE_FLOAT_API */

celt/mathops.h

+16
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/* Copyright (c) 2002-2008 Jean-Marc Valin
22
Copyright (c) 2007-2008 CSIRO
33
Copyright (c) 2007-2009 Xiph.Org Foundation
4+
Copyright (c) 2024 Arm Limited
45
Written by Jean-Marc Valin, and Yunho Huh */
56
/**
67
@file mathops.h
@@ -38,6 +39,10 @@
3839
#include "entcode.h"
3940
#include "os_support.h"
4041

42+
#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
43+
#include "arm/mathops_arm.h"
44+
#endif
45+
4146
#define PI 3.141592653f
4247

4348
/* Multiplies two 16-bit fractional values. Bit-exactness of this macro is important */
@@ -476,4 +481,15 @@ static OPUS_INLINE opus_val16 celt_atan2p(opus_val16 y, opus_val16 x)
476481
}
477482

478483
#endif /* FIXED_POINT */
484+
485+
#ifndef DISABLE_FLOAT_API
486+
487+
void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
488+
489+
#ifndef OVERRIDE_FLOAT2INT16
490+
#define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_c(in, out, cnt))
491+
#endif
492+
493+
#endif /* DISABLE_FLOAT_API */
494+
479495
#endif /* MATHOPS_H */

celt/tests/test_unit_mathops.c

+99-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/* Copyright (c) 2008-2011 Xiph.Org Foundation, Mozilla Corporation,
22
Gregory Maxwell
3+
Copyright (c) 2024 Arm Limited
34
Written by Jean-Marc Valin, Gregory Maxwell, Timothy B. Terriberry,
45
and Yunho Huh */
56
/*
@@ -37,8 +38,10 @@
3738

3839
#include <stdio.h>
3940
#include <math.h>
40-
#include "mathops.h"
4141
#include "bands.h"
42+
#include "cpu_support.h"
43+
#include "float_cast.h"
44+
#include "mathops.h"
4245

4346
#ifdef FIXED_POINT
4447
#define WORD "%d"
@@ -351,8 +354,94 @@ void testilog2(void)
351354
}
352355
#endif
353356

357+
358+
#ifndef DISABLE_FLOAT_API
359+
360+
void testcelt_float2int16(int use_ref_impl, int buffer_size)
361+
{
362+
363+
#define MAX_BUFFER_SIZE 2080
364+
int i, cnt;
365+
float floatsToConvert[MAX_BUFFER_SIZE];
366+
short results[MAX_BUFFER_SIZE] = { 0 };
367+
float scaleInt16RangeTo01;
368+
369+
celt_assert(buffer_size <= MAX_BUFFER_SIZE);
370+
371+
scaleInt16RangeTo01 = 1.f / 32768.f;
372+
cnt = 0;
373+
374+
while (cnt + 15 < buffer_size && cnt < buffer_size / 2)
375+
{
376+
floatsToConvert[cnt++] = 77777.0f * scaleInt16RangeTo01;
377+
floatsToConvert[cnt++] = 33000.0f * scaleInt16RangeTo01;
378+
floatsToConvert[cnt++] = 32768.0f * scaleInt16RangeTo01;
379+
floatsToConvert[cnt++] = 32767.4f * scaleInt16RangeTo01;
380+
floatsToConvert[cnt++] = 32766.6f * scaleInt16RangeTo01;
381+
floatsToConvert[cnt++] = .501 * scaleInt16RangeTo01;
382+
floatsToConvert[cnt++] = .499f * scaleInt16RangeTo01;
383+
floatsToConvert[cnt++] = .0f;
384+
floatsToConvert[cnt++] = -.499f * scaleInt16RangeTo01;
385+
floatsToConvert[cnt++] = -.501f * scaleInt16RangeTo01;
386+
floatsToConvert[cnt++] = -32767.6f * scaleInt16RangeTo01;
387+
floatsToConvert[cnt++] = -32768.4f * scaleInt16RangeTo01;
388+
floatsToConvert[cnt++] = -32769.0f * scaleInt16RangeTo01;
389+
floatsToConvert[cnt++] = -33000.0f * scaleInt16RangeTo01;
390+
floatsToConvert[cnt++] = -77777.0f * scaleInt16RangeTo01;
391+
392+
celt_assert(cnt < buffer_size);
393+
}
394+
395+
while (cnt < buffer_size)
396+
{
397+
float inInt16Range = cnt * 7 + .5;
398+
inInt16Range += (cnt & 0x01) ? .1 : -.1;
399+
inInt16Range *= (cnt & 0x02) ? 1 : -1;
400+
floatsToConvert[cnt++] = inInt16Range * scaleInt16RangeTo01;
401+
}
402+
403+
for (i = 0; i < MAX_BUFFER_SIZE; ++i)
404+
{
405+
results[i] = 42;
406+
}
407+
408+
if (use_ref_impl)
409+
{
410+
celt_float2int16_c(floatsToConvert, results, cnt);
411+
} else {
412+
celt_float2int16(floatsToConvert, results, cnt, opus_select_arch());
413+
}
414+
415+
for (i = 0; i < cnt; ++i)
416+
{
417+
const float expected = FLOAT2INT16(floatsToConvert[i]);
418+
if (results[i] != expected)
419+
{
420+
fprintf (stderr, "testcelt_float2int16 failed: celt_float2int16 converted %f (index: %d) to %d (x*32768=%f, expected: %d, cnt: %d, ref: %d)\n",
421+
floatsToConvert[i], i, (int)results[i], floatsToConvert[i] * 32768.0f, (int)expected, buffer_size, use_ref_impl);
422+
ret = 1;
423+
}
424+
}
425+
426+
for (i = cnt; i < MAX_BUFFER_SIZE; ++i)
427+
{
428+
if (results[i] != 42)
429+
{
430+
fprintf (stderr, "testcelt_float2int16 failed: buffer overflow (cnt: %d, ref: %d)\n", buffer_size, use_ref_impl);
431+
ret = 1;
432+
break;
433+
}
434+
}
435+
#undef MAX_BUFFER_SIZE
436+
}
437+
438+
#endif
439+
354440
int main(void)
355441
{
442+
int i;
443+
int use_ref_impl[2] = { 0, 1 };
444+
356445
testbitexactcos();
357446
testbitexactlog2tan();
358447
testdiv();
@@ -364,6 +453,15 @@ int main(void)
364453
testilog2();
365454
testlog2_db();
366455
testexp2_db();
456+
#endif
457+
#ifndef DISABLE_FLOAT_API
458+
for (i = 0; i <= 1; ++i)
459+
{
460+
testcelt_float2int16(use_ref_impl[i], 1);
461+
testcelt_float2int16(use_ref_impl[i], 32);
462+
testcelt_float2int16(use_ref_impl[i], 127);
463+
testcelt_float2int16(use_ref_impl[i], 1031);
464+
}
367465
#endif
368466
return ret;
369467
}

celt_headers.mk

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ celt/arm/fixed_armv5e.h \
3939
celt/arm/fixed_arm64.h \
4040
celt/arm/kiss_fft_armv4.h \
4141
celt/arm/kiss_fft_armv5e.h \
42+
celt/arm/mathops_arm.h \
4243
celt/arm/pitch_arm.h \
4344
celt/arm/fft_arm.h \
4445
celt/arm/mdct_arm.h \

0 commit comments

Comments
 (0)