Staging: add echo cancelation module
This is used by mISDN and Zaptel drivers. From: Steve Underwood <steveu@coppice.org> From: David Rowe <david@rowetel.com> Cc: Tzafrir Cohen <tzafrir.cohen@xorcom.com> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
This commit is contained in:
parent
00b3ed1685
commit
10602db812
|
@ -39,4 +39,6 @@ source "drivers/staging/winbond/Kconfig"
|
|||
|
||||
source "drivers/staging/wlan-ng/Kconfig"
|
||||
|
||||
source "drivers/staging/echo/Kconfig"
|
||||
|
||||
endif # STAGING
|
||||
|
|
|
@ -8,3 +8,4 @@ obj-$(CONFIG_VIDEO_GO7007) += go7007/
|
|||
obj-$(CONFIG_USB_IP_COMMON) += usbip/
|
||||
obj-$(CONFIG_W35UND) += winbond/
|
||||
obj-$(CONFIG_PRISM2_USB) += wlan-ng/
|
||||
obj-$(CONFIG_ECHO) += echo/
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
config ECHO
|
||||
tristate "Line Echo Canceller support"
|
||||
default n
|
||||
---help---
|
||||
This driver provides line echo cancelling support for mISDN and
|
||||
Zaptel drivers.
|
||||
|
||||
To compile this driver as a module, choose M here. The module
|
||||
will be called echo.
|
|
@ -0,0 +1 @@
|
|||
obj-$(CONFIG_ECHO) += echo.o
|
|
@ -0,0 +1,10 @@
|
|||
TODO:
|
||||
- checkpatch.pl cleanups
|
||||
- Lindent
|
||||
- typedef removals
|
||||
- handle bit_operations.h (merge in or make part of common code?)
|
||||
- remove proc interface, only use echo.h interface (proc interface is
|
||||
racy and not correct.)
|
||||
|
||||
Please send patches to Greg Kroah-Hartman <greg@kroah.com> and Cc: Steve
|
||||
Underwood <steveu@coppice.org> and David Rowe <david@rowetel.com>
|
|
@ -0,0 +1,253 @@
|
|||
/*
|
||||
* SpanDSP - a series of DSP components for telephony
|
||||
*
|
||||
* bit_operations.h - Various bit level operations, such as bit reversal
|
||||
*
|
||||
* Written by Steve Underwood <steveu@coppice.org>
|
||||
*
|
||||
* Copyright (C) 2006 Steve Underwood
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*
|
||||
* $Id: bit_operations.h,v 1.11 2006/11/28 15:37:03 steveu Exp $
|
||||
*/
|
||||
|
||||
/*! \file */
|
||||
|
||||
#if !defined(_BIT_OPERATIONS_H_)
|
||||
#define _BIT_OPERATIONS_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
/*! \brief Find the bit position of the highest set bit in a word
|
||||
\param bits The word to be searched
|
||||
\return The bit number of the highest set bit, or -1 if the word is zero. */
|
||||
static __inline__ int top_bit(unsigned int bits)
|
||||
{
|
||||
int res;
|
||||
|
||||
__asm__ (" xorl %[res],%[res];\n"
|
||||
" decl %[res];\n"
|
||||
" bsrl %[bits],%[res]\n"
|
||||
: [res] "=&r" (res)
|
||||
: [bits] "rm" (bits));
|
||||
return res;
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
/*! \brief Find the bit position of the lowest set bit in a word
|
||||
\param bits The word to be searched
|
||||
\return The bit number of the lowest set bit, or -1 if the word is zero. */
|
||||
static __inline__ int bottom_bit(unsigned int bits)
|
||||
{
|
||||
int res;
|
||||
|
||||
__asm__ (" xorl %[res],%[res];\n"
|
||||
" decl %[res];\n"
|
||||
" bsfl %[bits],%[res]\n"
|
||||
: [res] "=&r" (res)
|
||||
: [bits] "rm" (bits));
|
||||
return res;
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
#else
|
||||
static __inline__ int top_bit(unsigned int bits)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (bits == 0)
|
||||
return -1;
|
||||
i = 0;
|
||||
if (bits & 0xFFFF0000)
|
||||
{
|
||||
bits &= 0xFFFF0000;
|
||||
i += 16;
|
||||
}
|
||||
if (bits & 0xFF00FF00)
|
||||
{
|
||||
bits &= 0xFF00FF00;
|
||||
i += 8;
|
||||
}
|
||||
if (bits & 0xF0F0F0F0)
|
||||
{
|
||||
bits &= 0xF0F0F0F0;
|
||||
i += 4;
|
||||
}
|
||||
if (bits & 0xCCCCCCCC)
|
||||
{
|
||||
bits &= 0xCCCCCCCC;
|
||||
i += 2;
|
||||
}
|
||||
if (bits & 0xAAAAAAAA)
|
||||
{
|
||||
bits &= 0xAAAAAAAA;
|
||||
i += 1;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
static __inline__ int bottom_bit(unsigned int bits)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (bits == 0)
|
||||
return -1;
|
||||
i = 32;
|
||||
if (bits & 0x0000FFFF)
|
||||
{
|
||||
bits &= 0x0000FFFF;
|
||||
i -= 16;
|
||||
}
|
||||
if (bits & 0x00FF00FF)
|
||||
{
|
||||
bits &= 0x00FF00FF;
|
||||
i -= 8;
|
||||
}
|
||||
if (bits & 0x0F0F0F0F)
|
||||
{
|
||||
bits &= 0x0F0F0F0F;
|
||||
i -= 4;
|
||||
}
|
||||
if (bits & 0x33333333)
|
||||
{
|
||||
bits &= 0x33333333;
|
||||
i -= 2;
|
||||
}
|
||||
if (bits & 0x55555555)
|
||||
{
|
||||
bits &= 0x55555555;
|
||||
i -= 1;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
#endif
|
||||
|
||||
/*! \brief Bit reverse a byte.
|
||||
\param data The byte to be reversed.
|
||||
\return The bit reversed version of data. */
|
||||
static __inline__ uint8_t bit_reverse8(uint8_t x)
|
||||
{
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
/* If multiply is fast */
|
||||
return ((x*0x0802U & 0x22110U) | (x*0x8020U & 0x88440U))*0x10101U >> 16;
|
||||
#else
|
||||
/* If multiply is slow, but we have a barrel shifter */
|
||||
x = (x >> 4) | (x << 4);
|
||||
x = ((x & 0xCC) >> 2) | ((x & 0x33) << 2);
|
||||
return ((x & 0xAA) >> 1) | ((x & 0x55) << 1);
|
||||
#endif
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
/*! \brief Bit reverse a 16 bit word.
|
||||
\param data The word to be reversed.
|
||||
\return The bit reversed version of data. */
|
||||
uint16_t bit_reverse16(uint16_t data);
|
||||
|
||||
/*! \brief Bit reverse a 32 bit word.
|
||||
\param data The word to be reversed.
|
||||
\return The bit reversed version of data. */
|
||||
uint32_t bit_reverse32(uint32_t data);
|
||||
|
||||
/*! \brief Bit reverse each of the four bytes in a 32 bit word.
|
||||
\param data The word to be reversed.
|
||||
\return The bit reversed version of data. */
|
||||
uint32_t bit_reverse_4bytes(uint32_t data);
|
||||
|
||||
/*! \brief Find the number of set bits in a 32 bit word.
|
||||
\param x The word to be searched.
|
||||
\return The number of set bits. */
|
||||
int one_bits32(uint32_t x);
|
||||
|
||||
/*! \brief Create a mask as wide as the number in a 32 bit word.
|
||||
\param x The word to be searched.
|
||||
\return The mask. */
|
||||
uint32_t make_mask32(uint32_t x);
|
||||
|
||||
/*! \brief Create a mask as wide as the number in a 16 bit word.
|
||||
\param x The word to be searched.
|
||||
\return The mask. */
|
||||
uint16_t make_mask16(uint16_t x);
|
||||
|
||||
/*! \brief Find the least significant one in a word, and return a word
|
||||
with just that bit set.
|
||||
\param x The word to be searched.
|
||||
\return The word with the single set bit. */
|
||||
static __inline__ uint32_t least_significant_one32(uint32_t x)
|
||||
{
|
||||
return (x & (-(int32_t) x));
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
/*! \brief Find the most significant one in a word, and return a word
|
||||
with just that bit set.
|
||||
\param x The word to be searched.
|
||||
\return The word with the single set bit. */
|
||||
static __inline__ uint32_t most_significant_one32(uint32_t x)
|
||||
{
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
return 1 << top_bit(x);
|
||||
#else
|
||||
x = make_mask32(x);
|
||||
return (x ^ (x >> 1));
|
||||
#endif
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
/*! \brief Find the parity of a byte.
|
||||
\param x The byte to be checked.
|
||||
\return 1 for odd, or 0 for even. */
|
||||
static __inline__ int parity8(uint8_t x)
|
||||
{
|
||||
x = (x ^ (x >> 4)) & 0x0F;
|
||||
return (0x6996 >> x) & 1;
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
/*! \brief Find the parity of a 16 bit word.
|
||||
\param x The word to be checked.
|
||||
\return 1 for odd, or 0 for even. */
|
||||
static __inline__ int parity16(uint16_t x)
|
||||
{
|
||||
x ^= (x >> 8);
|
||||
x = (x ^ (x >> 4)) & 0x0F;
|
||||
return (0x6996 >> x) & 1;
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
/*! \brief Find the parity of a 32 bit word.
|
||||
\param x The word to be checked.
|
||||
\return 1 for odd, or 0 for even. */
|
||||
static __inline__ int parity32(uint32_t x)
|
||||
{
|
||||
x ^= (x >> 16);
|
||||
x ^= (x >> 8);
|
||||
x = (x ^ (x >> 4)) & 0x0F;
|
||||
return (0x6996 >> x) & 1;
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
/*- End of file ------------------------------------------------------------*/
|
|
@ -0,0 +1,632 @@
|
|||
/*
|
||||
* SpanDSP - a series of DSP components for telephony
|
||||
*
|
||||
* echo.c - A line echo canceller. This code is being developed
|
||||
* against and partially complies with G168.
|
||||
*
|
||||
* Written by Steve Underwood <steveu@coppice.org>
|
||||
* and David Rowe <david_at_rowetel_dot_com>
|
||||
*
|
||||
* Copyright (C) 2001, 2003 Steve Underwood, 2007 David Rowe
|
||||
*
|
||||
* Based on a bit from here, a bit from there, eye of toad, ear of
|
||||
* bat, 15 years of failed attempts by David and a few fried brain
|
||||
* cells.
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*
|
||||
* $Id: echo.c,v 1.20 2006/12/01 18:00:48 steveu Exp $
|
||||
*/
|
||||
|
||||
/*! \file */
|
||||
|
||||
/* Implementation Notes
|
||||
David Rowe
|
||||
April 2007
|
||||
|
||||
This code started life as Steve's NLMS algorithm with a tap
|
||||
rotation algorithm to handle divergence during double talk. I
|
||||
added a Geigel Double Talk Detector (DTD) [2] and performed some
|
||||
G168 tests. However I had trouble meeting the G168 requirements,
|
||||
especially for double talk - there were always cases where my DTD
|
||||
failed, for example where near end speech was under the 6dB
|
||||
threshold required for declaring double talk.
|
||||
|
||||
So I tried a two path algorithm [1], which has so far given better
|
||||
results. The original tap rotation/Geigel algorithm is available
|
||||
in SVN http://svn.rowetel.com/software/oslec/tags/before_16bit.
|
||||
It's probably possible to make it work if some one wants to put some
|
||||
serious work into it.
|
||||
|
||||
At present no special treatment is provided for tones, which
|
||||
generally cause NLMS algorithms to diverge. Initial runs of a
|
||||
subset of the G168 tests for tones (e.g ./echo_test 6) show the
|
||||
current algorithm is passing OK, which is kind of surprising. The
|
||||
full set of tests needs to be performed to confirm this result.
|
||||
|
||||
One other interesting change is that I have managed to get the NLMS
|
||||
code to work with 16 bit coefficients, rather than the original 32
|
||||
bit coefficents. This reduces the MIPs and storage required.
|
||||
I evaulated the 16 bit port using g168_tests.sh and listening tests
|
||||
on 4 real-world samples.
|
||||
|
||||
I also attempted the implementation of a block based NLMS update
|
||||
[2] but although this passes g168_tests.sh it didn't converge well
|
||||
on the real-world samples. I have no idea why, perhaps a scaling
|
||||
problem. The block based code is also available in SVN
|
||||
http://svn.rowetel.com/software/oslec/tags/before_16bit. If this
|
||||
code can be debugged, it will lead to further reduction in MIPS, as
|
||||
the block update code maps nicely onto DSP instruction sets (it's a
|
||||
dot product) compared to the current sample-by-sample update.
|
||||
|
||||
Steve also has some nice notes on echo cancellers in echo.h
|
||||
|
||||
|
||||
References:
|
||||
|
||||
[1] Ochiai, Areseki, and Ogihara, "Echo Canceller with Two Echo
|
||||
Path Models", IEEE Transactions on communications, COM-25,
|
||||
No. 6, June
|
||||
1977.
|
||||
http://www.rowetel.com/images/echo/dual_path_paper.pdf
|
||||
|
||||
[2] The classic, very useful paper that tells you how to
|
||||
actually build a real world echo canceller:
|
||||
Messerschmitt, Hedberg, Cole, Haoui, Winship, "Digital Voice
|
||||
Echo Canceller with a TMS320020,
|
||||
http://www.rowetel.com/images/echo/spra129.pdf
|
||||
|
||||
[3] I have written a series of blog posts on this work, here is
|
||||
Part 1: http://www.rowetel.com/blog/?p=18
|
||||
|
||||
[4] The source code http://svn.rowetel.com/software/oslec/
|
||||
|
||||
[5] A nice reference on LMS filters:
|
||||
http://en.wikipedia.org/wiki/Least_mean_squares_filter
|
||||
|
||||
Credits:
|
||||
|
||||
Thanks to Steve Underwood, Jean-Marc Valin, and Ramakrishnan
|
||||
Muthukrishnan for their suggestions and email discussions. Thanks
|
||||
also to those people who collected echo samples for me such as
|
||||
Mark, Pawel, and Pavel.
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h> /* We're doing kernel work */
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/slab.h>
|
||||
#define malloc(a) kmalloc((a), GFP_KERNEL)
|
||||
#define free(a) kfree(a)
|
||||
|
||||
#include "bit_operations.h"
|
||||
#include "echo.h"
|
||||
|
||||
#define MIN_TX_POWER_FOR_ADAPTION 64
|
||||
#define MIN_RX_POWER_FOR_ADAPTION 64
|
||||
#define DTD_HANGOVER 600 /* 600 samples, or 75ms */
|
||||
#define DC_LOG2BETA 3 /* log2() of DC filter Beta */
|
||||
|
||||
/*-----------------------------------------------------------------------*\
|
||||
FUNCTIONS
|
||||
\*-----------------------------------------------------------------------*/
|
||||
|
||||
/* adapting coeffs using the traditional stochastic descent (N)LMS algorithm */
|
||||
|
||||
|
||||
#ifdef __BLACKFIN_ASM__
|
||||
static void __inline__ lms_adapt_bg(echo_can_state_t *ec, int clean, int shift)
|
||||
{
|
||||
int i, j;
|
||||
int offset1;
|
||||
int offset2;
|
||||
int factor;
|
||||
int exp;
|
||||
int16_t *phist;
|
||||
int n;
|
||||
|
||||
if (shift > 0)
|
||||
factor = clean << shift;
|
||||
else
|
||||
factor = clean >> -shift;
|
||||
|
||||
/* Update the FIR taps */
|
||||
|
||||
offset2 = ec->curr_pos;
|
||||
offset1 = ec->taps - offset2;
|
||||
phist = &ec->fir_state_bg.history[offset2];
|
||||
|
||||
/* st: and en: help us locate the assembler in echo.s */
|
||||
|
||||
//asm("st:");
|
||||
n = ec->taps;
|
||||
for (i = 0, j = offset2; i < n; i++, j++)
|
||||
{
|
||||
exp = *phist++ * factor;
|
||||
ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15);
|
||||
}
|
||||
//asm("en:");
|
||||
|
||||
/* Note the asm for the inner loop above generated by Blackfin gcc
|
||||
4.1.1 is pretty good (note even parallel instructions used):
|
||||
|
||||
R0 = W [P0++] (X);
|
||||
R0 *= R2;
|
||||
R0 = R0 + R3 (NS) ||
|
||||
R1 = W [P1] (X) ||
|
||||
nop;
|
||||
R0 >>>= 15;
|
||||
R0 = R0 + R1;
|
||||
W [P1++] = R0;
|
||||
|
||||
A block based update algorithm would be much faster but the
|
||||
above can't be improved on much. Every instruction saved in
|
||||
the loop above is 2 MIPs/ch! The for loop above is where the
|
||||
Blackfin spends most of it's time - about 17 MIPs/ch measured
|
||||
with speedtest.c with 256 taps (32ms). Write-back and
|
||||
Write-through cache gave about the same performance.
|
||||
*/
|
||||
}
|
||||
|
||||
/*
|
||||
IDEAS for further optimisation of lms_adapt_bg():
|
||||
|
||||
1/ The rounding is quite costly. Could we keep as 32 bit coeffs
|
||||
then make filter pluck the MS 16-bits of the coeffs when filtering?
|
||||
However this would lower potential optimisation of filter, as I
|
||||
think the dual-MAC architecture requires packed 16 bit coeffs.
|
||||
|
||||
2/ Block based update would be more efficient, as per comments above,
|
||||
could use dual MAC architecture.
|
||||
|
||||
3/ Look for same sample Blackfin LMS code, see if we can get dual-MAC
|
||||
packing.
|
||||
|
||||
4/ Execute the whole e/c in a block of say 20ms rather than sample
|
||||
by sample. Processing a few samples every ms is inefficient.
|
||||
*/
|
||||
|
||||
#else
|
||||
static __inline__ void lms_adapt_bg(echo_can_state_t *ec, int clean, int shift)
|
||||
{
|
||||
int i;
|
||||
|
||||
int offset1;
|
||||
int offset2;
|
||||
int factor;
|
||||
int exp;
|
||||
|
||||
if (shift > 0)
|
||||
factor = clean << shift;
|
||||
else
|
||||
factor = clean >> -shift;
|
||||
|
||||
/* Update the FIR taps */
|
||||
|
||||
offset2 = ec->curr_pos;
|
||||
offset1 = ec->taps - offset2;
|
||||
|
||||
for (i = ec->taps - 1; i >= offset1; i--)
|
||||
{
|
||||
exp = (ec->fir_state_bg.history[i - offset1]*factor);
|
||||
ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15);
|
||||
}
|
||||
for ( ; i >= 0; i--)
|
||||
{
|
||||
exp = (ec->fir_state_bg.history[i + offset2]*factor);
|
||||
ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
echo_can_state_t *echo_can_create(int len, int adaption_mode)
|
||||
{
|
||||
echo_can_state_t *ec;
|
||||
int i;
|
||||
int j;
|
||||
|
||||
ec = kmalloc(sizeof(*ec), GFP_KERNEL);
|
||||
if (ec == NULL)
|
||||
return NULL;
|
||||
memset(ec, 0, sizeof(*ec));
|
||||
|
||||
ec->taps = len;
|
||||
ec->log2taps = top_bit(len);
|
||||
ec->curr_pos = ec->taps - 1;
|
||||
|
||||
for (i = 0; i < 2; i++)
|
||||
{
|
||||
if ((ec->fir_taps16[i] = (int16_t *) malloc((ec->taps)*sizeof(int16_t))) == NULL)
|
||||
{
|
||||
for (j = 0; j < i; j++)
|
||||
kfree(ec->fir_taps16[j]);
|
||||
kfree(ec);
|
||||
return NULL;
|
||||
}
|
||||
memset(ec->fir_taps16[i], 0, (ec->taps)*sizeof(int16_t));
|
||||
}
|
||||
|
||||
fir16_create(&ec->fir_state,
|
||||
ec->fir_taps16[0],
|
||||
ec->taps);
|
||||
fir16_create(&ec->fir_state_bg,
|
||||
ec->fir_taps16[1],
|
||||
ec->taps);
|
||||
|
||||
for(i=0; i<5; i++) {
|
||||
ec->xvtx[i] = ec->yvtx[i] = ec->xvrx[i] = ec->yvrx[i] = 0;
|
||||
}
|
||||
|
||||
ec->cng_level = 1000;
|
||||
echo_can_adaption_mode(ec, adaption_mode);
|
||||
|
||||
ec->snapshot = (int16_t*)malloc(ec->taps*sizeof(int16_t));
|
||||
memset(ec->snapshot, 0, sizeof(int16_t)*ec->taps);
|
||||
|
||||
ec->cond_met = 0;
|
||||
ec->Pstates = 0;
|
||||
ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
|
||||
ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
|
||||
ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
|
||||
ec->Lbgn = ec->Lbgn_acc = 0;
|
||||
ec->Lbgn_upper = 200;
|
||||
ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
|
||||
|
||||
return ec;
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
void echo_can_free(echo_can_state_t *ec)
|
||||
{
|
||||
int i;
|
||||
|
||||
fir16_free(&ec->fir_state);
|
||||
fir16_free(&ec->fir_state_bg);
|
||||
for (i = 0; i < 2; i++)
|
||||
kfree(ec->fir_taps16[i]);
|
||||
kfree(ec->snapshot);
|
||||
kfree(ec);
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
void echo_can_adaption_mode(echo_can_state_t *ec, int adaption_mode)
|
||||
{
|
||||
ec->adaption_mode = adaption_mode;
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
void echo_can_flush(echo_can_state_t *ec)
|
||||
{
|
||||
int i;
|
||||
|
||||
ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
|
||||
ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
|
||||
ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
|
||||
|
||||
ec->Lbgn = ec->Lbgn_acc = 0;
|
||||
ec->Lbgn_upper = 200;
|
||||
ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
|
||||
|
||||
ec->nonupdate_dwell = 0;
|
||||
|
||||
fir16_flush(&ec->fir_state);
|
||||
fir16_flush(&ec->fir_state_bg);
|
||||
ec->fir_state.curr_pos = ec->taps - 1;
|
||||
ec->fir_state_bg.curr_pos = ec->taps - 1;
|
||||
for (i = 0; i < 2; i++)
|
||||
memset(ec->fir_taps16[i], 0, ec->taps*sizeof(int16_t));
|
||||
|
||||
ec->curr_pos = ec->taps - 1;
|
||||
ec->Pstates = 0;
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
void echo_can_snapshot(echo_can_state_t *ec) {
|
||||
memcpy(ec->snapshot, ec->fir_taps16[0], ec->taps*sizeof(int16_t));
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
/* Dual Path Echo Canceller ------------------------------------------------*/
|
||||
|
||||
int16_t echo_can_update(echo_can_state_t *ec, int16_t tx, int16_t rx)
|
||||
{
|
||||
int32_t echo_value;
|
||||
int clean_bg;
|
||||
int tmp, tmp1;
|
||||
|
||||
/* Input scaling was found be required to prevent problems when tx
|
||||
starts clipping. Another possible way to handle this would be the
|
||||
filter coefficent scaling. */
|
||||
|
||||
ec->tx = tx; ec->rx = rx;
|
||||
tx >>=1;
|
||||
rx >>=1;
|
||||
|
||||
/*
|
||||
Filter DC, 3dB point is 160Hz (I think), note 32 bit precision required
|
||||
otherwise values do not track down to 0. Zero at DC, Pole at (1-Beta)
|
||||
only real axis. Some chip sets (like Si labs) don't need
|
||||
this, but something like a $10 X100P card does. Any DC really slows
|
||||
down convergence.
|
||||
|
||||
Note: removes some low frequency from the signal, this reduces
|
||||
the speech quality when listening to samples through headphones
|
||||
but may not be obvious through a telephone handset.
|
||||
|
||||
Note that the 3dB frequency in radians is approx Beta, e.g. for
|
||||
Beta = 2^(-3) = 0.125, 3dB freq is 0.125 rads = 159Hz.
|
||||
*/
|
||||
|
||||
if (ec->adaption_mode & ECHO_CAN_USE_RX_HPF) {
|
||||
tmp = rx << 15;
|
||||
#if 1
|
||||
/* Make sure the gain of the HPF is 1.0. This can still saturate a little under
|
||||
impulse conditions, and it might roll to 32768 and need clipping on sustained peak
|
||||
level signals. However, the scale of such clipping is small, and the error due to
|
||||
any saturation should not markedly affect the downstream processing. */
|
||||
tmp -= (tmp >> 4);
|
||||
#endif
|
||||
ec->rx_1 += -(ec->rx_1>>DC_LOG2BETA) + tmp - ec->rx_2;
|
||||
|
||||
/* hard limit filter to prevent clipping. Note that at this stage
|
||||
rx should be limited to +/- 16383 due to right shift above */
|
||||
tmp1 = ec->rx_1 >> 15;
|
||||
if (tmp1 > 16383) tmp1 = 16383;
|
||||
if (tmp1 < -16383) tmp1 = -16383;
|
||||
rx = tmp1;
|
||||
ec->rx_2 = tmp;
|
||||
}
|
||||
|
||||
/* Block average of power in the filter states. Used for
|
||||
adaption power calculation. */
|
||||
|
||||
{
|
||||
int new, old;
|
||||
|
||||
/* efficient "out with the old and in with the new" algorithm so
|
||||
we don't have to recalculate over the whole block of
|
||||
samples. */
|
||||
new = (int)tx * (int)tx;
|
||||
old = (int)ec->fir_state.history[ec->fir_state.curr_pos] *
|
||||
(int)ec->fir_state.history[ec->fir_state.curr_pos];
|
||||
ec->Pstates += ((new - old) + (1<<ec->log2taps)) >> ec->log2taps;
|
||||
if (ec->Pstates < 0) ec->Pstates = 0;
|
||||
}
|
||||
|
||||
/* Calculate short term average levels using simple single pole IIRs */
|
||||
|
||||
ec->Ltxacc += abs(tx) - ec->Ltx;
|
||||
ec->Ltx = (ec->Ltxacc + (1<<4)) >> 5;
|
||||
ec->Lrxacc += abs(rx) - ec->Lrx;
|
||||
ec->Lrx = (ec->Lrxacc + (1<<4)) >> 5;
|
||||
|
||||
/* Foreground filter ---------------------------------------------------*/
|
||||
|
||||
ec->fir_state.coeffs = ec->fir_taps16[0];
|
||||
echo_value = fir16(&ec->fir_state, tx);
|
||||
ec->clean = rx - echo_value;
|
||||
ec->Lcleanacc += abs(ec->clean) - ec->Lclean;
|
||||
ec->Lclean = (ec->Lcleanacc + (1<<4)) >> 5;
|
||||
|
||||
/* Background filter ---------------------------------------------------*/
|
||||
|
||||
echo_value = fir16(&ec->fir_state_bg, tx);
|
||||
clean_bg = rx - echo_value;
|
||||
ec->Lclean_bgacc += abs(clean_bg) - ec->Lclean_bg;
|
||||
ec->Lclean_bg = (ec->Lclean_bgacc + (1<<4)) >> 5;
|
||||
|
||||
/* Background Filter adaption -----------------------------------------*/
|
||||
|
||||
/* Almost always adap bg filter, just simple DT and energy
|
||||
detection to minimise adaption in cases of strong double talk.
|
||||
However this is not critical for the dual path algorithm.
|
||||
*/
|
||||
ec->factor = 0;
|
||||
ec->shift = 0;
|
||||
if ((ec->nonupdate_dwell == 0)) {
|
||||
int P, logP, shift;
|
||||
|
||||
/* Determine:
|
||||
|
||||
f = Beta * clean_bg_rx/P ------ (1)
|
||||
|
||||
where P is the total power in the filter states.
|
||||
|
||||
The Boffins have shown that if we obey (1) we converge
|
||||
quickly and avoid instability.
|
||||
|
||||
The correct factor f must be in Q30, as this is the fixed
|
||||
point format required by the lms_adapt_bg() function,
|
||||
therefore the scaled version of (1) is:
|
||||
|
||||
(2^30) * f = (2^30) * Beta * clean_bg_rx/P
|
||||
factor = (2^30) * Beta * clean_bg_rx/P ----- (2)
|
||||
|
||||
We have chosen Beta = 0.25 by experiment, so:
|
||||
|
||||
factor = (2^30) * (2^-2) * clean_bg_rx/P
|
||||
|
||||
(30 - 2 - log2(P))
|
||||
factor = clean_bg_rx 2 ----- (3)
|
||||
|
||||
To avoid a divide we approximate log2(P) as top_bit(P),
|
||||
which returns the position of the highest non-zero bit in
|
||||
P. This approximation introduces an error as large as a
|
||||
factor of 2, but the algorithm seems to handle it OK.
|
||||
|
||||
Come to think of it a divide may not be a big deal on a
|
||||
modern DSP, so its probably worth checking out the cycles
|
||||
for a divide versus a top_bit() implementation.
|
||||
*/
|
||||
|
||||
P = MIN_TX_POWER_FOR_ADAPTION + ec->Pstates;
|
||||
logP = top_bit(P) + ec->log2taps;
|
||||
shift = 30 - 2 - logP;
|
||||
ec->shift = shift;
|
||||
|
||||
lms_adapt_bg(ec, clean_bg, shift);
|
||||
}
|
||||
|
||||
/* very simple DTD to make sure we dont try and adapt with strong
|
||||
near end speech */
|
||||
|
||||
ec->adapt = 0;
|
||||
if ((ec->Lrx > MIN_RX_POWER_FOR_ADAPTION) && (ec->Lrx > ec->Ltx))
|
||||
ec->nonupdate_dwell = DTD_HANGOVER;
|
||||
if (ec->nonupdate_dwell)
|
||||
ec->nonupdate_dwell--;
|
||||
|
||||
/* Transfer logic ------------------------------------------------------*/
|
||||
|
||||
/* These conditions are from the dual path paper [1], I messed with
|
||||
them a bit to improve performance. */
|
||||
|
||||
if ((ec->adaption_mode & ECHO_CAN_USE_ADAPTION) &&
|
||||
(ec->nonupdate_dwell == 0) &&
|
||||
(8*ec->Lclean_bg < 7*ec->Lclean) /* (ec->Lclean_bg < 0.875*ec->Lclean) */ &&
|
||||
(8*ec->Lclean_bg < ec->Ltx) /* (ec->Lclean_bg < 0.125*ec->Ltx) */ )
|
||||
{
|
||||
if (ec->cond_met == 6) {
|
||||
/* BG filter has had better results for 6 consecutive samples */
|
||||
ec->adapt = 1;
|
||||
memcpy(ec->fir_taps16[0], ec->fir_taps16[1], ec->taps*sizeof(int16_t));
|
||||
}
|
||||
else
|
||||
ec->cond_met++;
|
||||
}
|
||||
else
|
||||
ec->cond_met = 0;
|
||||
|
||||
/* Non-Linear Processing ---------------------------------------------------*/
|
||||
|
||||
ec->clean_nlp = ec->clean;
|
||||
if (ec->adaption_mode & ECHO_CAN_USE_NLP)
|
||||
{
|
||||
/* Non-linear processor - a fancy way to say "zap small signals, to avoid
|
||||
residual echo due to (uLaw/ALaw) non-linearity in the channel.". */
|
||||
|
||||
if ((16*ec->Lclean < ec->Ltx))
|
||||
{
|
||||
/* Our e/c has improved echo by at least 24 dB (each factor of 2 is 6dB,
|
||||
so 2*2*2*2=16 is the same as 6+6+6+6=24dB) */
|
||||
if (ec->adaption_mode & ECHO_CAN_USE_CNG)
|
||||
{
|
||||
ec->cng_level = ec->Lbgn;
|
||||
|
||||
/* Very elementary comfort noise generation. Just random
|
||||
numbers rolled off very vaguely Hoth-like. DR: This
|
||||
noise doesn't sound quite right to me - I suspect there
|
||||
are some overlfow issues in the filtering as it's too
|
||||
"crackly". TODO: debug this, maybe just play noise at
|
||||
high level or look at spectrum.
|
||||
*/
|
||||
|
||||
ec->cng_rndnum = 1664525U*ec->cng_rndnum + 1013904223U;
|
||||
ec->cng_filter = ((ec->cng_rndnum & 0xFFFF) - 32768 + 5*ec->cng_filter) >> 3;
|
||||
ec->clean_nlp = (ec->cng_filter*ec->cng_level*8) >> 14;
|
||||
|
||||
}
|
||||
else if (ec->adaption_mode & ECHO_CAN_USE_CLIP)
|
||||
{
|
||||
/* This sounds much better than CNG */
|
||||
if (ec->clean_nlp > ec->Lbgn)
|
||||
ec->clean_nlp = ec->Lbgn;
|
||||
if (ec->clean_nlp < -ec->Lbgn)
|
||||
ec->clean_nlp = -ec->Lbgn;
|
||||
}
|
||||
else
|
||||
{
|
||||
/* just mute the residual, doesn't sound very good, used mainly
|
||||
in G168 tests */
|
||||
ec->clean_nlp = 0;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* Background noise estimator. I tried a few algorithms
|
||||
here without much luck. This very simple one seems to
|
||||
work best, we just average the level using a slow (1 sec
|
||||
time const) filter if the current level is less than a
|
||||
(experimentally derived) constant. This means we dont
|
||||
include high level signals like near end speech. When
|
||||
combined with CNG or especially CLIP seems to work OK.
|
||||
*/
|
||||
if (ec->Lclean < 40) {
|
||||
ec->Lbgn_acc += abs(ec->clean) - ec->Lbgn;
|
||||
ec->Lbgn = (ec->Lbgn_acc + (1<<11)) >> 12;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Roll around the taps buffer */
|
||||
if (ec->curr_pos <= 0)
|
||||
ec->curr_pos = ec->taps;
|
||||
ec->curr_pos--;
|
||||
|
||||
if (ec->adaption_mode & ECHO_CAN_DISABLE)
|
||||
ec->clean_nlp = rx;
|
||||
|
||||
/* Output scaled back up again to match input scaling */
|
||||
|
||||
return (int16_t) ec->clean_nlp << 1;
|
||||
}
|
||||
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
/* This function is seperated from the echo canceller is it is usually called
|
||||
as part of the tx process. See rx HP (DC blocking) filter above, it's
|
||||
the same design.
|
||||
|
||||
Some soft phones send speech signals with a lot of low frequency
|
||||
energy, e.g. down to 20Hz. This can make the hybrid non-linear
|
||||
which causes the echo canceller to fall over. This filter can help
|
||||
by removing any low frequency before it gets to the tx port of the
|
||||
hybrid.
|
||||
|
||||
It can also help by removing and DC in the tx signal. DC is bad
|
||||
for LMS algorithms.
|
||||
|
||||
This is one of the classic DC removal filters, adjusted to provide sufficient
|
||||
bass rolloff to meet the above requirement to protect hybrids from things that
|
||||
upset them. The difference between successive samples produces a lousy HPF, and
|
||||
then a suitably placed pole flattens things out. The final result is a nicely
|
||||
rolled off bass end. The filtering is implemented with extended fractional
|
||||
precision, which noise shapes things, giving very clean DC removal.
|
||||
*/
|
||||
|
||||
int16_t echo_can_hpf_tx(echo_can_state_t *ec, int16_t tx) {
|
||||
int tmp, tmp1;
|
||||
|
||||
if (ec->adaption_mode & ECHO_CAN_USE_TX_HPF) {
|
||||
tmp = tx << 15;
|
||||
#if 1
|
||||
/* Make sure the gain of the HPF is 1.0. The first can still saturate a little under
|
||||
impulse conditions, and it might roll to 32768 and need clipping on sustained peak
|
||||
level signals. However, the scale of such clipping is small, and the error due to
|
||||
any saturation should not markedly affect the downstream processing. */
|
||||
tmp -= (tmp >> 4);
|
||||
#endif
|
||||
ec->tx_1 += -(ec->tx_1>>DC_LOG2BETA) + tmp - ec->tx_2;
|
||||
tmp1 = ec->tx_1 >> 15;
|
||||
if (tmp1 > 32767) tmp1 = 32767;
|
||||
if (tmp1 < -32767) tmp1 = -32767;
|
||||
tx = tmp1;
|
||||
ec->tx_2 = tmp;
|
||||
}
|
||||
|
||||
return tx;
|
||||
}
|
|
@ -0,0 +1,220 @@
|
|||
/*
|
||||
* SpanDSP - a series of DSP components for telephony
|
||||
*
|
||||
* echo.c - A line echo canceller. This code is being developed
|
||||
* against and partially complies with G168.
|
||||
*
|
||||
* Written by Steve Underwood <steveu@coppice.org>
|
||||
* and David Rowe <david_at_rowetel_dot_com>
|
||||
*
|
||||
* Copyright (C) 2001 Steve Underwood and 2007 David Rowe
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*
|
||||
* $Id: echo.h,v 1.9 2006/10/24 13:45:28 steveu Exp $
|
||||
*/
|
||||
|
||||
#ifndef __ECHO_H
|
||||
#define __ECHO_H
|
||||
|
||||
/*! \page echo_can_page Line echo cancellation for voice
|
||||
|
||||
\section echo_can_page_sec_1 What does it do?
|
||||
This module aims to provide G.168-2002 compliant echo cancellation, to remove
|
||||
electrical echoes (e.g. from 2-4 wire hybrids) from voice calls.
|
||||
|
||||
\section echo_can_page_sec_2 How does it work?
|
||||
The heart of the echo cancellor is FIR filter. This is adapted to match the
|
||||
echo impulse response of the telephone line. It must be long enough to
|
||||
adequately cover the duration of that impulse response. The signal transmitted
|
||||
to the telephone line is passed through the FIR filter. Once the FIR is
|
||||
properly adapted, the resulting output is an estimate of the echo signal
|
||||
received from the line. This is subtracted from the received signal. The result
|
||||
is an estimate of the signal which originated at the far end of the line, free
|
||||
from echos of our own transmitted signal.
|
||||
|
||||
The least mean squares (LMS) algorithm is attributed to Widrow and Hoff, and
|
||||
was introduced in 1960. It is the commonest form of filter adaption used in
|
||||
things like modem line equalisers and line echo cancellers. There it works very
|
||||
well. However, it only works well for signals of constant amplitude. It works
|
||||
very poorly for things like speech echo cancellation, where the signal level
|
||||
varies widely. This is quite easy to fix. If the signal level is normalised -
|
||||
similar to applying AGC - LMS can work as well for a signal of varying
|
||||
amplitude as it does for a modem signal. This normalised least mean squares
|
||||
(NLMS) algorithm is the commonest one used for speech echo cancellation. Many
|
||||
other algorithms exist - e.g. RLS (essentially the same as Kalman filtering),
|
||||
FAP, etc. Some perform significantly better than NLMS. However, factors such
|
||||
as computational complexity and patents favour the use of NLMS.
|
||||
|
||||
A simple refinement to NLMS can improve its performance with speech. NLMS tends
|
||||
to adapt best to the strongest parts of a signal. If the signal is white noise,
|
||||
the NLMS algorithm works very well. However, speech has more low frequency than
|
||||
high frequency content. Pre-whitening (i.e. filtering the signal to flatten its
|
||||
spectrum) the echo signal improves the adapt rate for speech, and ensures the
|
||||
final residual signal is not heavily biased towards high frequencies. A very
|
||||
low complexity filter is adequate for this, so pre-whitening adds little to the
|
||||
compute requirements of the echo canceller.
|
||||
|
||||
An FIR filter adapted using pre-whitened NLMS performs well, provided certain
|
||||
conditions are met:
|
||||
|
||||
- The transmitted signal has poor self-correlation.
|
||||
- There is no signal being generated within the environment being
|
||||
cancelled.
|
||||
|
||||
The difficulty is that neither of these can be guaranteed.
|
||||
|
||||
If the adaption is performed while transmitting noise (or something fairly
|
||||
noise like, such as voice) the adaption works very well. If the adaption is
|
||||
performed while transmitting something highly correlative (typically narrow
|
||||
band energy such as signalling tones or DTMF), the adaption can go seriously
|
||||
wrong. The reason is there is only one solution for the adaption on a near
|
||||
random signal - the impulse response of the line. For a repetitive signal,
|
||||
there are any number of solutions which converge the adaption, and nothing
|
||||
guides the adaption to choose the generalised one. Allowing an untrained
|
||||
canceller to converge on this kind of narrowband energy probably a good thing,
|
||||
since at least it cancels the tones. Allowing a well converged canceller to
|
||||
continue converging on such energy is just a way to ruin its generalised
|
||||
adaption. A narrowband detector is needed, so adapation can be suspended at
|
||||
appropriate times.
|
||||
|
||||
The adaption process is based on trying to eliminate the received signal. When
|
||||
there is any signal from within the environment being cancelled it may upset
|
||||
the adaption process. Similarly, if the signal we are transmitting is small,
|
||||
noise may dominate and disturb the adaption process. If we can ensure that the
|
||||
adaption is only performed when we are transmitting a significant signal level,
|
||||
and the environment is not, things will be OK. Clearly, it is easy to tell when
|
||||
we are sending a significant signal. Telling, if the environment is generating
|
||||
a significant signal, and doing it with sufficient speed that the adaption will
|
||||
not have diverged too much more we stop it, is a little harder.
|
||||
|
||||
The key problem in detecting when the environment is sourcing significant
|
||||
energy is that we must do this very quickly. Given a reasonably long sample of
|
||||
the received signal, there are a number of strategies which may be used to
|
||||
assess whether that signal contains a strong far end component. However, by the
|
||||
time that assessment is complete the far end signal will have already caused
|
||||
major mis-convergence in the adaption process. An assessment algorithm is
|
||||
needed which produces a fairly accurate result from a very short burst of far
|
||||
end energy.
|
||||
|
||||
\section echo_can_page_sec_3 How do I use it?
|
||||
The echo cancellor processes both the transmit and receive streams sample by
|
||||
sample. The processing function is not declared inline. Unfortunately,
|
||||
cancellation requires many operations per sample, so the call overhead is only
|
||||
a minor burden.
|
||||
*/
|
||||
|
||||
#include "fir.h"
|
||||
|
||||
/* Mask bits for the adaption mode */
|
||||
#define ECHO_CAN_USE_ADAPTION 0x01
|
||||
#define ECHO_CAN_USE_NLP 0x02
|
||||
#define ECHO_CAN_USE_CNG 0x04
|
||||
#define ECHO_CAN_USE_CLIP 0x08
|
||||
#define ECHO_CAN_USE_TX_HPF 0x10
|
||||
#define ECHO_CAN_USE_RX_HPF 0x20
|
||||
#define ECHO_CAN_DISABLE 0x40
|
||||
|
||||
/*!
|
||||
G.168 echo canceller descriptor. This defines the working state for a line
|
||||
echo canceller.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
int16_t tx,rx;
|
||||
int16_t clean;
|
||||
int16_t clean_nlp;
|
||||
|
||||
int nonupdate_dwell;
|
||||
int curr_pos;
|
||||
int taps;
|
||||
int log2taps;
|
||||
int adaption_mode;
|
||||
|
||||
int cond_met;
|
||||
int32_t Pstates;
|
||||
int16_t adapt;
|
||||
int32_t factor;
|
||||
int16_t shift;
|
||||
|
||||
/* Average levels and averaging filter states */
|
||||
int Ltxacc, Lrxacc, Lcleanacc, Lclean_bgacc;
|
||||
int Ltx, Lrx;
|
||||
int Lclean;
|
||||
int Lclean_bg;
|
||||
int Lbgn, Lbgn_acc, Lbgn_upper, Lbgn_upper_acc;
|
||||
|
||||
/* foreground and background filter states */
|
||||
fir16_state_t fir_state;
|
||||
fir16_state_t fir_state_bg;
|
||||
int16_t *fir_taps16[2];
|
||||
|
||||
/* DC blocking filter states */
|
||||
int tx_1, tx_2, rx_1, rx_2;
|
||||
|
||||
/* optional High Pass Filter states */
|
||||
int32_t xvtx[5], yvtx[5];
|
||||
int32_t xvrx[5], yvrx[5];
|
||||
|
||||
/* Parameters for the optional Hoth noise generator */
|
||||
int cng_level;
|
||||
int cng_rndnum;
|
||||
int cng_filter;
|
||||
|
||||
/* snapshot sample of coeffs used for development */
|
||||
int16_t *snapshot;
|
||||
} echo_can_state_t;
|
||||
|
||||
/*! Create a voice echo canceller context.
|
||||
\param len The length of the canceller, in samples.
|
||||
\return The new canceller context, or NULL if the canceller could not be created.
|
||||
*/
|
||||
echo_can_state_t *echo_can_create(int len, int adaption_mode);
|
||||
|
||||
/*! Free a voice echo canceller context.
|
||||
\param ec The echo canceller context.
|
||||
*/
|
||||
void echo_can_free(echo_can_state_t *ec);
|
||||
|
||||
/*! Flush (reinitialise) a voice echo canceller context.
|
||||
\param ec The echo canceller context.
|
||||
*/
|
||||
void echo_can_flush(echo_can_state_t *ec);
|
||||
|
||||
/*! Set the adaption mode of a voice echo canceller context.
|
||||
\param ec The echo canceller context.
|
||||
\param adapt The mode.
|
||||
*/
|
||||
void echo_can_adaption_mode(echo_can_state_t *ec, int adaption_mode);
|
||||
|
||||
void echo_can_snapshot(echo_can_state_t *ec);
|
||||
|
||||
/*! Process a sample through a voice echo canceller.
|
||||
\param ec The echo canceller context.
|
||||
\param tx The transmitted audio sample.
|
||||
\param rx The received audio sample.
|
||||
\return The clean (echo cancelled) received sample.
|
||||
*/
|
||||
int16_t echo_can_update(echo_can_state_t *ec, int16_t tx, int16_t rx);
|
||||
|
||||
/*! Process to high pass filter the tx signal.
|
||||
\param ec The echo canceller context.
|
||||
\param tx The transmitted auio sample.
|
||||
\return The HP filtered transmit sample, send this to your D/A.
|
||||
*/
|
||||
int16_t echo_can_hpf_tx(echo_can_state_t *ec, int16_t tx);
|
||||
|
||||
#endif /* __ECHO_H */
|
|
@ -0,0 +1,369 @@
|
|||
/*
|
||||
* SpanDSP - a series of DSP components for telephony
|
||||
*
|
||||
* fir.h - General telephony FIR routines
|
||||
*
|
||||
* Written by Steve Underwood <steveu@coppice.org>
|
||||
*
|
||||
* Copyright (C) 2002 Steve Underwood
|
||||
*
|
||||
* All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License version 2, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*
|
||||
* $Id: fir.h,v 1.8 2006/10/24 13:45:28 steveu Exp $
|
||||
*/
|
||||
|
||||
/*! \page fir_page FIR filtering
|
||||
\section fir_page_sec_1 What does it do?
|
||||
???.
|
||||
|
||||
\section fir_page_sec_2 How does it work?
|
||||
???.
|
||||
*/
|
||||
|
||||
#if !defined(_FIR_H_)
|
||||
#define _FIR_H_
|
||||
|
||||
/*
|
||||
Blackfin NOTES & IDEAS:
|
||||
|
||||
A simple dot product function is used to implement the filter. This performs
|
||||
just one MAC/cycle which is inefficient but was easy to implement as a first
|
||||
pass. The current Blackfin code also uses an unrolled form of the filter
|
||||
history to avoid 0 length hardware loop issues. This is wasteful of
|
||||
memory.
|
||||
|
||||
Ideas for improvement:
|
||||
|
||||
1/ Rewrite filter for dual MAC inner loop. The issue here is handling
|
||||
history sample offsets that are 16 bit aligned - the dual MAC needs
|
||||
32 bit aligmnent. There are some good examples in libbfdsp.
|
||||
|
||||
2/ Use the hardware circular buffer facility tohalve memory usage.
|
||||
|
||||
3/ Consider using internal memory.
|
||||
|
||||
Using less memory might also improve speed as cache misses will be
|
||||
reduced. A drop in MIPs and memory approaching 50% should be
|
||||
possible.
|
||||
|
||||
The foreground and background filters currenlty use a total of
|
||||
about 10 MIPs/ch as measured with speedtest.c on a 256 TAP echo
|
||||
can.
|
||||
*/
|
||||
|
||||
#if defined(USE_MMX) || defined(USE_SSE2)
|
||||
#include "mmx.h"
|
||||
#endif
|
||||
|
||||
/*!
|
||||
16 bit integer FIR descriptor. This defines the working state for a single
|
||||
instance of an FIR filter using 16 bit integer coefficients.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
int taps;
|
||||
int curr_pos;
|
||||
const int16_t *coeffs;
|
||||
int16_t *history;
|
||||
} fir16_state_t;
|
||||
|
||||
/*!
|
||||
32 bit integer FIR descriptor. This defines the working state for a single
|
||||
instance of an FIR filter using 32 bit integer coefficients, and filtering
|
||||
16 bit integer data.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
int taps;
|
||||
int curr_pos;
|
||||
const int32_t *coeffs;
|
||||
int16_t *history;
|
||||
} fir32_state_t;
|
||||
|
||||
/*!
|
||||
Floating point FIR descriptor. This defines the working state for a single
|
||||
instance of an FIR filter using floating point coefficients and data.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
int taps;
|
||||
int curr_pos;
|
||||
const float *coeffs;
|
||||
float *history;
|
||||
} fir_float_state_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
static __inline__ const int16_t *fir16_create(fir16_state_t *fir,
|
||||
const int16_t *coeffs,
|
||||
int taps)
|
||||
{
|
||||
fir->taps = taps;
|
||||
fir->curr_pos = taps - 1;
|
||||
fir->coeffs = coeffs;
|
||||
#if defined(USE_MMX) || defined(USE_SSE2) || defined(__BLACKFIN_ASM__)
|
||||
if ((fir->history = malloc(2*taps*sizeof(int16_t))))
|
||||
memset(fir->history, 0, 2*taps*sizeof(int16_t));
|
||||
#else
|
||||
if ((fir->history = (int16_t *) malloc(taps*sizeof(int16_t))))
|
||||
memset(fir->history, 0, taps*sizeof(int16_t));
|
||||
#endif
|
||||
return fir->history;
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
static __inline__ void fir16_flush(fir16_state_t *fir)
|
||||
{
|
||||
#if defined(USE_MMX) || defined(USE_SSE2) || defined(__BLACKFIN_ASM__)
|
||||
memset(fir->history, 0, 2*fir->taps*sizeof(int16_t));
|
||||
#else
|
||||
memset(fir->history, 0, fir->taps*sizeof(int16_t));
|
||||
#endif
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
static __inline__ void fir16_free(fir16_state_t *fir)
|
||||
{
|
||||
free(fir->history);
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
#ifdef __BLACKFIN_ASM__
|
||||
static inline int32_t dot_asm(short *x, short *y, int len)
|
||||
{
|
||||
int dot;
|
||||
|
||||
len--;
|
||||
|
||||
__asm__
|
||||
(
|
||||
"I0 = %1;\n\t"
|
||||
"I1 = %2;\n\t"
|
||||
"A0 = 0;\n\t"
|
||||
"R0.L = W[I0++] || R1.L = W[I1++];\n\t"
|
||||
"LOOP dot%= LC0 = %3;\n\t"
|
||||
"LOOP_BEGIN dot%=;\n\t"
|
||||
"A0 += R0.L * R1.L (IS) || R0.L = W[I0++] || R1.L = W[I1++];\n\t"
|
||||
"LOOP_END dot%=;\n\t"
|
||||
"A0 += R0.L*R1.L (IS);\n\t"
|
||||
"R0 = A0;\n\t"
|
||||
"%0 = R0;\n\t"
|
||||
: "=&d" (dot)
|
||||
: "a" (x), "a" (y), "a" (len)
|
||||
: "I0", "I1", "A1", "A0", "R0", "R1"
|
||||
);
|
||||
|
||||
return dot;
|
||||
}
|
||||
#endif
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
static __inline__ int16_t fir16(fir16_state_t *fir, int16_t sample)
|
||||
{
|
||||
int32_t y;
|
||||
#if defined(USE_MMX)
|
||||
int i;
|
||||
mmx_t *mmx_coeffs;
|
||||
mmx_t *mmx_hist;
|
||||
|
||||
fir->history[fir->curr_pos] = sample;
|
||||
fir->history[fir->curr_pos + fir->taps] = sample;
|
||||
|
||||
mmx_coeffs = (mmx_t *) fir->coeffs;
|
||||
mmx_hist = (mmx_t *) &fir->history[fir->curr_pos];
|
||||
i = fir->taps;
|
||||
pxor_r2r(mm4, mm4);
|
||||
/* 8 samples per iteration, so the filter must be a multiple of 8 long. */
|
||||
while (i > 0)
|
||||
{
|
||||
movq_m2r(mmx_coeffs[0], mm0);
|
||||
movq_m2r(mmx_coeffs[1], mm2);
|
||||
movq_m2r(mmx_hist[0], mm1);
|
||||
movq_m2r(mmx_hist[1], mm3);
|
||||
mmx_coeffs += 2;
|
||||
mmx_hist += 2;
|
||||
pmaddwd_r2r(mm1, mm0);
|
||||
pmaddwd_r2r(mm3, mm2);
|
||||
paddd_r2r(mm0, mm4);
|
||||
paddd_r2r(mm2, mm4);
|
||||
i -= 8;
|
||||
}
|
||||
movq_r2r(mm4, mm0);
|
||||
psrlq_i2r(32, mm0);
|
||||
paddd_r2r(mm0, mm4);
|
||||
movd_r2m(mm4, y);
|
||||
emms();
|
||||
#elif defined(USE_SSE2)
|
||||
int i;
|
||||
xmm_t *xmm_coeffs;
|
||||
xmm_t *xmm_hist;
|
||||
|
||||
fir->history[fir->curr_pos] = sample;
|
||||
fir->history[fir->curr_pos + fir->taps] = sample;
|
||||
|
||||
xmm_coeffs = (xmm_t *) fir->coeffs;
|
||||
xmm_hist = (xmm_t *) &fir->history[fir->curr_pos];
|
||||
i = fir->taps;
|
||||
pxor_r2r(xmm4, xmm4);
|
||||
/* 16 samples per iteration, so the filter must be a multiple of 16 long. */
|
||||
while (i > 0)
|
||||
{
|
||||
movdqu_m2r(xmm_coeffs[0], xmm0);
|
||||
movdqu_m2r(xmm_coeffs[1], xmm2);
|
||||
movdqu_m2r(xmm_hist[0], xmm1);
|
||||
movdqu_m2r(xmm_hist[1], xmm3);
|
||||
xmm_coeffs += 2;
|
||||
xmm_hist += 2;
|
||||
pmaddwd_r2r(xmm1, xmm0);
|
||||
pmaddwd_r2r(xmm3, xmm2);
|
||||
paddd_r2r(xmm0, xmm4);
|
||||
paddd_r2r(xmm2, xmm4);
|
||||
i -= 16;
|
||||
}
|
||||
movdqa_r2r(xmm4, xmm0);
|
||||
psrldq_i2r(8, xmm0);
|
||||
paddd_r2r(xmm0, xmm4);
|
||||
movdqa_r2r(xmm4, xmm0);
|
||||
psrldq_i2r(4, xmm0);
|
||||
paddd_r2r(xmm0, xmm4);
|
||||
movd_r2m(xmm4, y);
|
||||
#elif defined(__BLACKFIN_ASM__)
|
||||
fir->history[fir->curr_pos] = sample;
|
||||
fir->history[fir->curr_pos + fir->taps] = sample;
|
||||
y = dot_asm((int16_t*)fir->coeffs, &fir->history[fir->curr_pos], fir->taps);
|
||||
#else
|
||||
int i;
|
||||
int offset1;
|
||||
int offset2;
|
||||
|
||||
fir->history[fir->curr_pos] = sample;
|
||||
|
||||
offset2 = fir->curr_pos;
|
||||
offset1 = fir->taps - offset2;
|
||||
y = 0;
|
||||
for (i = fir->taps - 1; i >= offset1; i--)
|
||||
y += fir->coeffs[i]*fir->history[i - offset1];
|
||||
for ( ; i >= 0; i--)
|
||||
y += fir->coeffs[i]*fir->history[i + offset2];
|
||||
#endif
|
||||
if (fir->curr_pos <= 0)
|
||||
fir->curr_pos = fir->taps;
|
||||
fir->curr_pos--;
|
||||
return (int16_t) (y >> 15);
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
static __inline__ const int16_t *fir32_create(fir32_state_t *fir,
|
||||
const int32_t *coeffs,
|
||||
int taps)
|
||||
{
|
||||
fir->taps = taps;
|
||||
fir->curr_pos = taps - 1;
|
||||
fir->coeffs = coeffs;
|
||||
fir->history = (int16_t *) malloc(taps*sizeof(int16_t));
|
||||
if (fir->history)
|
||||
memset(fir->history, '\0', taps*sizeof(int16_t));
|
||||
return fir->history;
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
static __inline__ void fir32_flush(fir32_state_t *fir)
|
||||
{
|
||||
memset(fir->history, 0, fir->taps*sizeof(int16_t));
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
static __inline__ void fir32_free(fir32_state_t *fir)
|
||||
{
|
||||
free(fir->history);
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
static __inline__ int16_t fir32(fir32_state_t *fir, int16_t sample)
|
||||
{
|
||||
int i;
|
||||
int32_t y;
|
||||
int offset1;
|
||||
int offset2;
|
||||
|
||||
fir->history[fir->curr_pos] = sample;
|
||||
offset2 = fir->curr_pos;
|
||||
offset1 = fir->taps - offset2;
|
||||
y = 0;
|
||||
for (i = fir->taps - 1; i >= offset1; i--)
|
||||
y += fir->coeffs[i]*fir->history[i - offset1];
|
||||
for ( ; i >= 0; i--)
|
||||
y += fir->coeffs[i]*fir->history[i + offset2];
|
||||
if (fir->curr_pos <= 0)
|
||||
fir->curr_pos = fir->taps;
|
||||
fir->curr_pos--;
|
||||
return (int16_t) (y >> 15);
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
#ifndef __KERNEL__
|
||||
static __inline__ const float *fir_float_create(fir_float_state_t *fir,
|
||||
const float *coeffs,
|
||||
int taps)
|
||||
{
|
||||
fir->taps = taps;
|
||||
fir->curr_pos = taps - 1;
|
||||
fir->coeffs = coeffs;
|
||||
fir->history = (float *) malloc(taps*sizeof(float));
|
||||
if (fir->history)
|
||||
memset(fir->history, '\0', taps*sizeof(float));
|
||||
return fir->history;
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
static __inline__ void fir_float_free(fir_float_state_t *fir)
|
||||
{
|
||||
free(fir->history);
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
|
||||
static __inline__ int16_t fir_float(fir_float_state_t *fir, int16_t sample)
|
||||
{
|
||||
int i;
|
||||
float y;
|
||||
int offset1;
|
||||
int offset2;
|
||||
|
||||
fir->history[fir->curr_pos] = sample;
|
||||
|
||||
offset2 = fir->curr_pos;
|
||||
offset1 = fir->taps - offset2;
|
||||
y = 0;
|
||||
for (i = fir->taps - 1; i >= offset1; i--)
|
||||
y += fir->coeffs[i]*fir->history[i - offset1];
|
||||
for ( ; i >= 0; i--)
|
||||
y += fir->coeffs[i]*fir->history[i + offset2];
|
||||
if (fir->curr_pos <= 0)
|
||||
fir->curr_pos = fir->taps;
|
||||
fir->curr_pos--;
|
||||
return (int16_t) y;
|
||||
}
|
||||
/*- End of function --------------------------------------------------------*/
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
/*- End of file ------------------------------------------------------------*/
|
|
@ -0,0 +1,288 @@
|
|||
/*
|
||||
* mmx.h
|
||||
* Copyright (C) 1997-2001 H. Dietz and R. Fisher
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
#ifndef AVCODEC_I386MMX_H
|
||||
#define AVCODEC_I386MMX_H
|
||||
|
||||
/*
|
||||
* The type of an value that fits in an MMX register (note that long
|
||||
* long constant values MUST be suffixed by LL and unsigned long long
|
||||
* values by ULL, lest they be truncated by the compiler)
|
||||
*/
|
||||
|
||||
typedef union {
|
||||
long long q; /* Quadword (64-bit) value */
|
||||
unsigned long long uq; /* Unsigned Quadword */
|
||||
int d[2]; /* 2 Doubleword (32-bit) values */
|
||||
unsigned int ud[2]; /* 2 Unsigned Doubleword */
|
||||
short w[4]; /* 4 Word (16-bit) values */
|
||||
unsigned short uw[4]; /* 4 Unsigned Word */
|
||||
char b[8]; /* 8 Byte (8-bit) values */
|
||||
unsigned char ub[8]; /* 8 Unsigned Byte */
|
||||
float s[2]; /* Single-precision (32-bit) value */
|
||||
} mmx_t; /* On an 8-byte (64-bit) boundary */
|
||||
|
||||
/* SSE registers */
|
||||
typedef union {
|
||||
char b[16];
|
||||
} xmm_t;
|
||||
|
||||
|
||||
#define mmx_i2r(op,imm,reg) \
|
||||
__asm__ __volatile__ (#op " %0, %%" #reg \
|
||||
: /* nothing */ \
|
||||
: "i" (imm) )
|
||||
|
||||
#define mmx_m2r(op,mem,reg) \
|
||||
__asm__ __volatile__ (#op " %0, %%" #reg \
|
||||
: /* nothing */ \
|
||||
: "m" (mem))
|
||||
|
||||
#define mmx_r2m(op,reg,mem) \
|
||||
__asm__ __volatile__ (#op " %%" #reg ", %0" \
|
||||
: "=m" (mem) \
|
||||
: /* nothing */ )
|
||||
|
||||
#define mmx_r2r(op,regs,regd) \
|
||||
__asm__ __volatile__ (#op " %" #regs ", %" #regd)
|
||||
|
||||
|
||||
#define emms() __asm__ __volatile__ ("emms")
|
||||
|
||||
#define movd_m2r(var,reg) mmx_m2r (movd, var, reg)
|
||||
#define movd_r2m(reg,var) mmx_r2m (movd, reg, var)
|
||||
#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd)
|
||||
|
||||
#define movq_m2r(var,reg) mmx_m2r (movq, var, reg)
|
||||
#define movq_r2m(reg,var) mmx_r2m (movq, reg, var)
|
||||
#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd)
|
||||
|
||||
#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg)
|
||||
#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)
|
||||
#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg)
|
||||
#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)
|
||||
|
||||
#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg)
|
||||
#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)
|
||||
|
||||
#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg)
|
||||
#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd)
|
||||
#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg)
|
||||
#define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd)
|
||||
#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg)
|
||||
#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd)
|
||||
|
||||
#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg)
|
||||
#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd)
|
||||
#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg)
|
||||
#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd)
|
||||
|
||||
#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg)
|
||||
#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd)
|
||||
#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg)
|
||||
#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd)
|
||||
|
||||
#define pand_m2r(var,reg) mmx_m2r (pand, var, reg)
|
||||
#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd)
|
||||
|
||||
#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg)
|
||||
#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd)
|
||||
|
||||
#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg)
|
||||
#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd)
|
||||
#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg)
|
||||
#define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd)
|
||||
#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg)
|
||||
#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd)
|
||||
|
||||
#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg)
|
||||
#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd)
|
||||
#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg)
|
||||
#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd)
|
||||
#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg)
|
||||
#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd)
|
||||
|
||||
#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg)
|
||||
#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd)
|
||||
|
||||
#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg)
|
||||
#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd)
|
||||
|
||||
#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg)
|
||||
#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd)
|
||||
|
||||
#define por_m2r(var,reg) mmx_m2r (por, var, reg)
|
||||
#define por_r2r(regs,regd) mmx_r2r (por, regs, regd)
|
||||
|
||||
#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg)
|
||||
#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg)
|
||||
#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd)
|
||||
#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg)
|
||||
#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg)
|
||||
#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd)
|
||||
#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg)
|
||||
#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg)
|
||||
#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd)
|
||||
|
||||
#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg)
|
||||
#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg)
|
||||
#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd)
|
||||
#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg)
|
||||
#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg)
|
||||
#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd)
|
||||
|
||||
#define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg)
|
||||
#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg)
|
||||
#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd)
|
||||
#define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg)
|
||||
#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg)
|
||||
#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd)
|
||||
#define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg)
|
||||
#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg)
|
||||
#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd)
|
||||
|
||||
#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg)
|
||||
#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd)
|
||||
#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg)
|
||||
#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd)
|
||||
#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg)
|
||||
#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd)
|
||||
|
||||
#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg)
|
||||
#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd)
|
||||
#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg)
|
||||
#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd)
|
||||
|
||||
#define psubusb_m2r(var,reg) mmx_m2r (psubusb, var, reg)
|
||||
#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd)
|
||||
#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg)
|
||||
#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd)
|
||||
|
||||
#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg)
|
||||
#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd)
|
||||
#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg)
|
||||
#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd)
|
||||
#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg)
|
||||
#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd)
|
||||
|
||||
#define punpcklbw_m2r(var,reg) mmx_m2r (punpcklbw, var, reg)
|
||||
#define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd)
|
||||
#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg)
|
||||
#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd)
|
||||
#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg)
|
||||
#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd)
|
||||
|
||||
#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg)
|
||||
#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd)
|
||||
|
||||
|
||||
/* 3DNOW extensions */
|
||||
|
||||
#define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg)
|
||||
#define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd)
|
||||
|
||||
|
||||
/* AMD MMX extensions - also available in intel SSE */
|
||||
|
||||
|
||||
#define mmx_m2ri(op,mem,reg,imm) \
|
||||
__asm__ __volatile__ (#op " %1, %0, %%" #reg \
|
||||
: /* nothing */ \
|
||||
: "m" (mem), "i" (imm))
|
||||
#define mmx_r2ri(op,regs,regd,imm) \
|
||||
__asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
|
||||
: /* nothing */ \
|
||||
: "i" (imm) )
|
||||
|
||||
#define mmx_fetch(mem,hint) \
|
||||
__asm__ __volatile__ ("prefetch" #hint " %0" \
|
||||
: /* nothing */ \
|
||||
: "m" (mem))
|
||||
|
||||
|
||||
#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg)
|
||||
|
||||
#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var)
|
||||
|
||||
#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg)
|
||||
#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd)
|
||||
#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg)
|
||||
#define pavgw_r2r(regs,regd) mmx_r2r (pavgw, regs, regd)
|
||||
|
||||
#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm)
|
||||
|
||||
#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm)
|
||||
|
||||
#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg)
|
||||
#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd)
|
||||
|
||||
#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg)
|
||||
#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd)
|
||||
|
||||
#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg)
|
||||
#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd)
|
||||
|
||||
#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg)
|
||||
#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd)
|
||||
|
||||
#define pmovmskb(mmreg,reg) \
|
||||
__asm__ __volatile__ ("movmskps %" #mmreg ", %" #reg)
|
||||
|
||||
#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg)
|
||||
#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd)
|
||||
|
||||
#define prefetcht0(mem) mmx_fetch (mem, t0)
|
||||
#define prefetcht1(mem) mmx_fetch (mem, t1)
|
||||
#define prefetcht2(mem) mmx_fetch (mem, t2)
|
||||
#define prefetchnta(mem) mmx_fetch (mem, nta)
|
||||
|
||||
#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg)
|
||||
#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd)
|
||||
|
||||
#define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm)
|
||||
#define pshufw_r2r(regs,regd,imm) mmx_r2ri(pshufw, regs, regd, imm)
|
||||
|
||||
#define sfence() __asm__ __volatile__ ("sfence\n\t")
|
||||
|
||||
/* SSE2 */
|
||||
#define pshufhw_m2r(var,reg,imm) mmx_m2ri(pshufhw, var, reg, imm)
|
||||
#define pshufhw_r2r(regs,regd,imm) mmx_r2ri(pshufhw, regs, regd, imm)
|
||||
#define pshuflw_m2r(var,reg,imm) mmx_m2ri(pshuflw, var, reg, imm)
|
||||
#define pshuflw_r2r(regs,regd,imm) mmx_r2ri(pshuflw, regs, regd, imm)
|
||||
|
||||
#define pshufd_r2r(regs,regd,imm) mmx_r2ri(pshufd, regs, regd, imm)
|
||||
|
||||
#define movdqa_m2r(var,reg) mmx_m2r (movdqa, var, reg)
|
||||
#define movdqa_r2m(reg,var) mmx_r2m (movdqa, reg, var)
|
||||
#define movdqa_r2r(regs,regd) mmx_r2r (movdqa, regs, regd)
|
||||
#define movdqu_m2r(var,reg) mmx_m2r (movdqu, var, reg)
|
||||
#define movdqu_r2m(reg,var) mmx_r2m (movdqu, reg, var)
|
||||
#define movdqu_r2r(regs,regd) mmx_r2r (movdqu, regs, regd)
|
||||
|
||||
#define pmullw_r2m(reg,var) mmx_r2m (pmullw, reg, var)
|
||||
|
||||
#define pslldq_i2r(imm,reg) mmx_i2r (pslldq, imm, reg)
|
||||
#define psrldq_i2r(imm,reg) mmx_i2r (psrldq, imm, reg)
|
||||
|
||||
#define punpcklqdq_r2r(regs,regd) mmx_r2r (punpcklqdq, regs, regd)
|
||||
#define punpckhqdq_r2r(regs,regd) mmx_r2r (punpckhqdq, regs, regd)
|
||||
|
||||
|
||||
#endif /* AVCODEC_I386MMX_H */
|
Loading…
Reference in New Issue