Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Arm: Speed up FLOAT2INT16 conversion with Neon #379

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions celt/arm/arm_celt_map.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/* Copyright (c) 2010 Xiph.Org Foundation
* Copyright (c) 2013 Parrot */
* Copyright (c) 2013 Parrot
* Copyright (c) 2024 Arm Limited */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -29,12 +30,25 @@
#include "config.h"
#endif

#include "pitch.h"
#include "kiss_fft.h"
#include "mathops.h"
#include "mdct.h"
#include "pitch.h"

#if defined(OPUS_HAVE_RTCD)

# if !defined(DISABLE_FLOAT_API)
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
void (*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt) = {
celt_float2int16_c, /* ARMv4 */
celt_float2int16_c, /* EDSP */
celt_float2int16_c, /* Media */
celt_float2int16_neon,/* NEON */
celt_float2int16_neon /* DOTPROD */
};
# endif
# endif

# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
celt_inner_prod_c, /* ARMv4 */
Expand Down
51 changes: 51 additions & 0 deletions celt/arm/celt_neon_intr.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/* Copyright (c) 2014-2015 Xiph.Org Foundation
Copyright (c) 2024 Arm Limited
Written by Viswanath Puttagunta */
/**
@file celt_neon_intr.c
Expand Down Expand Up @@ -35,7 +36,57 @@
#endif

#include <arm_neon.h>
#include "../float_cast.h"
#include "../mathops.h"
#include "../pitch.h"
#if defined(OPUS_CHECK_ASM)
#include <stdlib.h>
#endif

#if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)

void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
{
int i = 0;

#if defined(__ARM_NEON)
const int BLOCK_SIZE = 16;
const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE;

for (; i < blockedSize; i += BLOCK_SIZE)
{
float32x4_t orig_a = vld1q_f32(&in[i + 0]);
float32x4_t orig_b = vld1q_f32(&in[i + 4]);
float32x4_t orig_c = vld1q_f32(&in[i + 8]);
float32x4_t orig_d = vld1q_f32(&in[i + 12]);

int16x4_t asShort_a = vqmovn_s32(vroundf(vmulq_n_f32(orig_a, CELT_SIG_SCALE)));
int16x4_t asShort_b = vqmovn_s32(vroundf(vmulq_n_f32(orig_b, CELT_SIG_SCALE)));
int16x4_t asShort_c = vqmovn_s32(vroundf(vmulq_n_f32(orig_c, CELT_SIG_SCALE)));
int16x4_t asShort_d = vqmovn_s32(vroundf(vmulq_n_f32(orig_d, CELT_SIG_SCALE)));

vst1_s16(&out[i + 0], asShort_a);
vst1_s16(&out[i + 4], asShort_b);
vst1_s16(&out[i + 8], asShort_c);
vst1_s16(&out[i + 12], asShort_d);
# if defined(OPUS_CHECK_ASM)
short out_c[BLOCK_SIZE];
int j;
for(j = 0; j < BLOCK_SIZE; j++)
{
out_c[j] = FLOAT2INT16(in[i + j]);
celt_assert(abs((out_c[j] - out[i + j])) <= 1);
}
# endif
}
#endif

for (; i < cnt; i++)
{
out[i] = FLOAT2INT16(in[i]);
}
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be good to add an OPUS_CHECK_ASM block to verify that the results match the C code. You can grep for OPUS_CHECK_ASM to see how it's done in other parts of the code

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would be happy to add this check, however I'm fairly certain that in the corner case where we convert numbers exactly between two integers it will round differently, as it does in other cases already.

As I've seen:

  • Intel SIMD variants rounds towards zero (via truncation)
  • MSVC x86 assembly variant depends on FPU rounding mode
  • Fallback manual rounding method rounds towards +∞
  • Most other variants of float2int uses round to nearest, ties to even.

Using vcvtaq_s32_f32 intrinsic on AArch64 systems will round to nearest, ties away from zero

One solution could be to further extend unit tests already added to check the correctness of the conversions.
Alternatively, we could aim to achieve the more ubiquitous behaviour with a bit of performance penalty.

As far as I understand, however, in the case of digital signal processing, the benefit of the performance uplift of this solution outweighs the occasional mismatch by one on the output and could be acceptable.

Please advise me how to proceed / what would be an acceptable solution for you.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I see the problem. Indeed we don't really care how ties get rounded. Maybe a simple way to do the check would just be to verify that the integer value differs from the input float by less than 0.501 or so?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I pushed a modification containing a check to see if the intrinsic implementation is off by maximum 1

#endif

#if defined(FIXED_POINT)
#include <string.h>
Expand Down
65 changes: 65 additions & 0 deletions celt/arm/mathops_arm.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/* Copyright (c) 2024 Arm Limited */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#if !defined(MATHOPS_ARM_H)
# define MATHOPS_ARM_H

#include "armcpu.h"
#include "cpu_support.h"
#include "opus_defines.h"

# if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)

#include <arm_neon.h>

static inline int32x4_t vroundf(float32x4_t x)
{
# if defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
return vcvtaq_s32_f32(x);
# else
uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), vdupq_n_u32(0x80000000));
uint32x4_t bias = vdupq_n_u32(0x3F000000);
return vcvtq_s32_f32(vaddq_f32(x, vreinterpretq_f32_u32(vorrq_u32(bias, sign))));
# endif
}

void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
# if defined(OPUS_HAVE_RTCD) && \
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
extern void
(*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);

# define OVERRIDE_FLOAT2INT16 (1)
# define celt_float2int16(in, out, cnt, arch) \
((*CELT_FLOAT2INT16_IMPL[(arch)&OPUS_ARCHMASK])(in, out, cnt))

# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
# define OVERRIDE_FLOAT2INT16 (1)
# define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_neon(in, out, cnt))
# endif
# endif

#endif /* MATHOPS_ARM_H */
7 changes: 7 additions & 0 deletions celt/float_cast.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,13 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s

return intgr ;
}
#elif defined(__aarch64__)

#include <arm_neon.h>
static OPUS_INLINE opus_int32 float2int(float flt)
{
return vcvtns_s32_f32(flt);
}

#elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L

Expand Down
15 changes: 15 additions & 0 deletions celt/mathops.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/* Copyright (c) 2002-2008 Jean-Marc Valin
Copyright (c) 2007-2008 CSIRO
Copyright (c) 2007-2009 Xiph.Org Foundation
Copyright (c) 2024 Arm Limited
Written by Jean-Marc Valin */
/**
@file mathops.h
Expand Down Expand Up @@ -35,6 +36,7 @@
#include "config.h"
#endif

#include "float_cast.h"
#include "mathops.h"

/*Compute floor(sqrt(_val)) with exact arithmetic.
Expand Down Expand Up @@ -215,3 +217,16 @@ opus_val32 celt_rcp(opus_val32 x)
}

#endif

#ifndef DISABLE_FLOAT_API

void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
{
int i;
for (i = 0; i < cnt; i++)
{
out[i] = FLOAT2INT16(in[i]);
}
}

#endif /* DISABLE_FLOAT_API */
16 changes: 16 additions & 0 deletions celt/mathops.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
/* Copyright (c) 2002-2008 Jean-Marc Valin
Copyright (c) 2007-2008 CSIRO
Copyright (c) 2007-2009 Xiph.Org Foundation
Copyright (c) 2024 Arm Limited
Written by Jean-Marc Valin, and Yunho Huh */
/**
@file mathops.h
Expand Down Expand Up @@ -38,6 +39,10 @@
#include "entcode.h"
#include "os_support.h"

#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
#include "arm/mathops_arm.h"
#endif

#define PI 3.141592653f

/* Multiplies two 16-bit fractional values. Bit-exactness of this macro is important */
Expand Down Expand Up @@ -476,4 +481,15 @@ static OPUS_INLINE opus_val16 celt_atan2p(opus_val16 y, opus_val16 x)
}

#endif /* FIXED_POINT */

#ifndef DISABLE_FLOAT_API

void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);

#ifndef OVERRIDE_FLOAT2INT16
#define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_c(in, out, cnt))
#endif

#endif /* DISABLE_FLOAT_API */

#endif /* MATHOPS_H */
100 changes: 99 additions & 1 deletion celt/tests/test_unit_mathops.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/* Copyright (c) 2008-2011 Xiph.Org Foundation, Mozilla Corporation,
Gregory Maxwell
Copyright (c) 2024 Arm Limited
Written by Jean-Marc Valin, Gregory Maxwell, Timothy B. Terriberry,
and Yunho Huh */
/*
Expand Down Expand Up @@ -37,8 +38,10 @@

#include <stdio.h>
#include <math.h>
#include "mathops.h"
#include "bands.h"
#include "cpu_support.h"
#include "float_cast.h"
#include "mathops.h"

#ifdef FIXED_POINT
#define WORD "%d"
Expand Down Expand Up @@ -351,8 +354,94 @@ void testilog2(void)
}
#endif


#ifndef DISABLE_FLOAT_API

void testcelt_float2int16(int use_ref_impl, int buffer_size)
{

#define MAX_BUFFER_SIZE 2080
int i, cnt;
float floatsToConvert[MAX_BUFFER_SIZE];
short results[MAX_BUFFER_SIZE] = { 0 };
float scaleInt16RangeTo01;

celt_assert(buffer_size <= MAX_BUFFER_SIZE);

scaleInt16RangeTo01 = 1.f / 32768.f;
cnt = 0;

while (cnt + 15 < buffer_size && cnt < buffer_size / 2)
{
floatsToConvert[cnt++] = 77777.0f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = 33000.0f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = 32768.0f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = 32767.4f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = 32766.6f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = .501 * scaleInt16RangeTo01;
floatsToConvert[cnt++] = .499f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = .0f;
floatsToConvert[cnt++] = -.499f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -.501f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -32767.6f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -32768.4f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -32769.0f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -33000.0f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -77777.0f * scaleInt16RangeTo01;

celt_assert(cnt < buffer_size);
}

while (cnt < buffer_size)
{
float inInt16Range = cnt * 7 + .5;
inInt16Range += (cnt & 0x01) ? .1 : -.1;
inInt16Range *= (cnt & 0x02) ? 1 : -1;
floatsToConvert[cnt++] = inInt16Range * scaleInt16RangeTo01;
}

for (i = 0; i < MAX_BUFFER_SIZE; ++i)
{
results[i] = 42;
}

if (use_ref_impl)
{
celt_float2int16_c(floatsToConvert, results, cnt);
} else {
celt_float2int16(floatsToConvert, results, cnt, opus_select_arch());
}

for (i = 0; i < cnt; ++i)
{
const float expected = FLOAT2INT16(floatsToConvert[i]);
if (results[i] != expected)
{
fprintf (stderr, "testcelt_float2int16 failed: celt_float2int16 converted %f (index: %d) to %d (x*32768=%f, expected: %d, cnt: %d, ref: %d)\n",
floatsToConvert[i], i, (int)results[i], floatsToConvert[i] * 32768.0f, (int)expected, buffer_size, use_ref_impl);
ret = 1;
}
}

for (i = cnt; i < MAX_BUFFER_SIZE; ++i)
{
if (results[i] != 42)
{
fprintf (stderr, "testcelt_float2int16 failed: buffer overflow (cnt: %d, ref: %d)\n", buffer_size, use_ref_impl);
ret = 1;
break;
}
}
#undef MAX_BUFFER_SIZE
}

#endif

int main(void)
{
int i;
int use_ref_impl[2] = { 0, 1 };

testbitexactcos();
testbitexactlog2tan();
testdiv();
Expand All @@ -364,6 +453,15 @@ int main(void)
testilog2();
testlog2_db();
testexp2_db();
#endif
#ifndef DISABLE_FLOAT_API
for (i = 0; i <= 1; ++i)
{
testcelt_float2int16(use_ref_impl[i], 1);
testcelt_float2int16(use_ref_impl[i], 32);
testcelt_float2int16(use_ref_impl[i], 127);
testcelt_float2int16(use_ref_impl[i], 1031);
}
#endif
return ret;
}
1 change: 1 addition & 0 deletions celt_headers.mk
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ celt/arm/fixed_armv5e.h \
celt/arm/fixed_arm64.h \
celt/arm/kiss_fft_armv4.h \
celt/arm/kiss_fft_armv5e.h \
celt/arm/mathops_arm.h \
celt/arm/pitch_arm.h \
celt/arm/fft_arm.h \
celt/arm/mdct_arm.h \
Expand Down
Loading