]> git.ipfire.org Git - ipfire-2.x.git/blame - src/patches/suse-2.6.27.39/patches.drivers/0016-Staging-add-echo-cancelation-module.patch
Imported linux-2.6.27.39 suse/xen patches.
[ipfire-2.x.git] / src / patches / suse-2.6.27.39 / patches.drivers / 0016-Staging-add-echo-cancelation-module.patch
CommitLineData
2cb7cef9
BS
1From 10602db812fa270fc923f5e48fb47202288828f3 Mon Sep 17 00:00:00 2001
2From: David Rowe <david@rowetel.com>
3Date: Mon, 6 Oct 2008 21:41:46 -0700
4Subject: [PATCH 16/23] Staging: add echo cancelation module
5Patch-mainline: 2.6.28
6
7This is used by mISDN and Zaptel drivers.
8
9From: Steve Underwood <steveu@coppice.org>
10From: David Rowe <david@rowetel.com>
11Cc: Tzafrir Cohen <tzafrir.cohen@xorcom.com>
12Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
13---
14 drivers/staging/Kconfig | 2 +
15 drivers/staging/Makefile | 1 +
16 drivers/staging/echo/Kconfig | 9 +
17 drivers/staging/echo/Makefile | 1 +
18 drivers/staging/echo/TODO | 10 +
19 drivers/staging/echo/bit_operations.h | 253 +++++++++++++
20 drivers/staging/echo/echo.c | 632 +++++++++++++++++++++++++++++++++
21 drivers/staging/echo/echo.h | 220 ++++++++++++
22 drivers/staging/echo/fir.h | 369 +++++++++++++++++++
23 drivers/staging/echo/mmx.h | 288 +++++++++++++++
24 10 files changed, 1785 insertions(+), 0 deletions(-)
25 create mode 100644 drivers/staging/echo/Kconfig
26 create mode 100644 drivers/staging/echo/Makefile
27 create mode 100644 drivers/staging/echo/TODO
28 create mode 100644 drivers/staging/echo/bit_operations.h
29 create mode 100644 drivers/staging/echo/echo.c
30 create mode 100644 drivers/staging/echo/echo.h
31 create mode 100644 drivers/staging/echo/fir.h
32 create mode 100644 drivers/staging/echo/mmx.h
33
34diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
35index 762b471..25338b7 100644
36--- a/drivers/staging/Kconfig
37+++ b/drivers/staging/Kconfig
38@@ -39,4 +39,6 @@ source "drivers/staging/winbond/Kconfig"
39
40 source "drivers/staging/wlan-ng/Kconfig"
41
42+source "drivers/staging/echo/Kconfig"
43+
44 endif # STAGING
45diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
46index 5741984..93decb8 100644
47--- a/drivers/staging/Makefile
48+++ b/drivers/staging/Makefile
49@@ -8,3 +8,4 @@ obj-$(CONFIG_VIDEO_GO7007) += go7007/
50 obj-$(CONFIG_USB_IP_COMMON) += usbip/
51 obj-$(CONFIG_W35UND) += winbond/
52 obj-$(CONFIG_PRISM2_USB) += wlan-ng/
53+obj-$(CONFIG_ECHO) += echo/
54diff --git a/drivers/staging/echo/Kconfig b/drivers/staging/echo/Kconfig
55new file mode 100644
56index 0000000..f1d41ea
57--- /dev/null
58+++ b/drivers/staging/echo/Kconfig
59@@ -0,0 +1,9 @@
60+config ECHO
61+ tristate "Line Echo Canceller support"
62+ default n
63+ ---help---
64+ This driver provides line echo cancelling support for mISDN and
65+ Zaptel drivers.
66+
67+ To compile this driver as a module, choose M here. The module
68+ will be called echo.
69diff --git a/drivers/staging/echo/Makefile b/drivers/staging/echo/Makefile
70new file mode 100644
71index 0000000..7d4caac
72--- /dev/null
73+++ b/drivers/staging/echo/Makefile
74@@ -0,0 +1 @@
75+obj-$(CONFIG_ECHO) += echo.o
76diff --git a/drivers/staging/echo/TODO b/drivers/staging/echo/TODO
77new file mode 100644
78index 0000000..1ca09af
79--- /dev/null
80+++ b/drivers/staging/echo/TODO
81@@ -0,0 +1,10 @@
82+TODO:
83+ - checkpatch.pl cleanups
84+ - Lindent
85+ - typedef removals
86+ - handle bit_operations.h (merge in or make part of common code?)
87+ - remove proc interface, only use echo.h interface (proc interface is
88+ racy and not correct.)
89+
90+Please send patches to Greg Kroah-Hartman <greg@kroah.com> and Cc: Steve
91+Underwood <steveu@coppice.org> and David Rowe <david@rowetel.com>
92diff --git a/drivers/staging/echo/bit_operations.h b/drivers/staging/echo/bit_operations.h
93new file mode 100644
94index 0000000..b32f4bf
95--- /dev/null
96+++ b/drivers/staging/echo/bit_operations.h
97@@ -0,0 +1,253 @@
98+/*
99+ * SpanDSP - a series of DSP components for telephony
100+ *
101+ * bit_operations.h - Various bit level operations, such as bit reversal
102+ *
103+ * Written by Steve Underwood <steveu@coppice.org>
104+ *
105+ * Copyright (C) 2006 Steve Underwood
106+ *
107+ * All rights reserved.
108+ *
109+ * This program is free software; you can redistribute it and/or modify
110+ * it under the terms of the GNU General Public License version 2, as
111+ * published by the Free Software Foundation.
112+ *
113+ * This program is distributed in the hope that it will be useful,
114+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
115+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
116+ * GNU General Public License for more details.
117+ *
118+ * You should have received a copy of the GNU General Public License
119+ * along with this program; if not, write to the Free Software
120+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
121+ *
122+ * $Id: bit_operations.h,v 1.11 2006/11/28 15:37:03 steveu Exp $
123+ */
124+
125+/*! \file */
126+
127+#if !defined(_BIT_OPERATIONS_H_)
128+#define _BIT_OPERATIONS_H_
129+
130+#ifdef __cplusplus
131+extern "C" {
132+#endif
133+
134+#if defined(__i386__) || defined(__x86_64__)
135+/*! \brief Find the bit position of the highest set bit in a word
136+ \param bits The word to be searched
137+ \return The bit number of the highest set bit, or -1 if the word is zero. */
138+static __inline__ int top_bit(unsigned int bits)
139+{
140+ int res;
141+
142+ __asm__ (" xorl %[res],%[res];\n"
143+ " decl %[res];\n"
144+ " bsrl %[bits],%[res]\n"
145+ : [res] "=&r" (res)
146+ : [bits] "rm" (bits));
147+ return res;
148+}
149+/*- End of function --------------------------------------------------------*/
150+
151+/*! \brief Find the bit position of the lowest set bit in a word
152+ \param bits The word to be searched
153+ \return The bit number of the lowest set bit, or -1 if the word is zero. */
154+static __inline__ int bottom_bit(unsigned int bits)
155+{
156+ int res;
157+
158+ __asm__ (" xorl %[res],%[res];\n"
159+ " decl %[res];\n"
160+ " bsfl %[bits],%[res]\n"
161+ : [res] "=&r" (res)
162+ : [bits] "rm" (bits));
163+ return res;
164+}
165+/*- End of function --------------------------------------------------------*/
166+#else
167+static __inline__ int top_bit(unsigned int bits)
168+{
169+ int i;
170+
171+ if (bits == 0)
172+ return -1;
173+ i = 0;
174+ if (bits & 0xFFFF0000)
175+ {
176+ bits &= 0xFFFF0000;
177+ i += 16;
178+ }
179+ if (bits & 0xFF00FF00)
180+ {
181+ bits &= 0xFF00FF00;
182+ i += 8;
183+ }
184+ if (bits & 0xF0F0F0F0)
185+ {
186+ bits &= 0xF0F0F0F0;
187+ i += 4;
188+ }
189+ if (bits & 0xCCCCCCCC)
190+ {
191+ bits &= 0xCCCCCCCC;
192+ i += 2;
193+ }
194+ if (bits & 0xAAAAAAAA)
195+ {
196+ bits &= 0xAAAAAAAA;
197+ i += 1;
198+ }
199+ return i;
200+}
201+/*- End of function --------------------------------------------------------*/
202+
203+static __inline__ int bottom_bit(unsigned int bits)
204+{
205+ int i;
206+
207+ if (bits == 0)
208+ return -1;
209+ i = 32;
210+ if (bits & 0x0000FFFF)
211+ {
212+ bits &= 0x0000FFFF;
213+ i -= 16;
214+ }
215+ if (bits & 0x00FF00FF)
216+ {
217+ bits &= 0x00FF00FF;
218+ i -= 8;
219+ }
220+ if (bits & 0x0F0F0F0F)
221+ {
222+ bits &= 0x0F0F0F0F;
223+ i -= 4;
224+ }
225+ if (bits & 0x33333333)
226+ {
227+ bits &= 0x33333333;
228+ i -= 2;
229+ }
230+ if (bits & 0x55555555)
231+ {
232+ bits &= 0x55555555;
233+ i -= 1;
234+ }
235+ return i;
236+}
237+/*- End of function --------------------------------------------------------*/
238+#endif
239+
240+/*! \brief Bit reverse a byte.
241+ \param data The byte to be reversed.
242+ \return The bit reversed version of data. */
243+static __inline__ uint8_t bit_reverse8(uint8_t x)
244+{
245+#if defined(__i386__) || defined(__x86_64__)
246+ /* If multiply is fast */
247+ return ((x*0x0802U & 0x22110U) | (x*0x8020U & 0x88440U))*0x10101U >> 16;
248+#else
249+ /* If multiply is slow, but we have a barrel shifter */
250+ x = (x >> 4) | (x << 4);
251+ x = ((x & 0xCC) >> 2) | ((x & 0x33) << 2);
252+ return ((x & 0xAA) >> 1) | ((x & 0x55) << 1);
253+#endif
254+}
255+/*- End of function --------------------------------------------------------*/
256+
257+/*! \brief Bit reverse a 16 bit word.
258+ \param data The word to be reversed.
259+ \return The bit reversed version of data. */
260+uint16_t bit_reverse16(uint16_t data);
261+
262+/*! \brief Bit reverse a 32 bit word.
263+ \param data The word to be reversed.
264+ \return The bit reversed version of data. */
265+uint32_t bit_reverse32(uint32_t data);
266+
267+/*! \brief Bit reverse each of the four bytes in a 32 bit word.
268+ \param data The word to be reversed.
269+ \return The bit reversed version of data. */
270+uint32_t bit_reverse_4bytes(uint32_t data);
271+
272+/*! \brief Find the number of set bits in a 32 bit word.
273+ \param x The word to be searched.
274+ \return The number of set bits. */
275+int one_bits32(uint32_t x);
276+
277+/*! \brief Create a mask as wide as the number in a 32 bit word.
278+ \param x The word to be searched.
279+ \return The mask. */
280+uint32_t make_mask32(uint32_t x);
281+
282+/*! \brief Create a mask as wide as the number in a 16 bit word.
283+ \param x The word to be searched.
284+ \return The mask. */
285+uint16_t make_mask16(uint16_t x);
286+
287+/*! \brief Find the least significant one in a word, and return a word
288+ with just that bit set.
289+ \param x The word to be searched.
290+ \return The word with the single set bit. */
291+static __inline__ uint32_t least_significant_one32(uint32_t x)
292+{
293+ return (x & (-(int32_t) x));
294+}
295+/*- End of function --------------------------------------------------------*/
296+
297+/*! \brief Find the most significant one in a word, and return a word
298+ with just that bit set.
299+ \param x The word to be searched.
300+ \return The word with the single set bit. */
301+static __inline__ uint32_t most_significant_one32(uint32_t x)
302+{
303+#if defined(__i386__) || defined(__x86_64__)
304+ return 1 << top_bit(x);
305+#else
306+ x = make_mask32(x);
307+ return (x ^ (x >> 1));
308+#endif
309+}
310+/*- End of function --------------------------------------------------------*/
311+
312+/*! \brief Find the parity of a byte.
313+ \param x The byte to be checked.
314+ \return 1 for odd, or 0 for even. */
315+static __inline__ int parity8(uint8_t x)
316+{
317+ x = (x ^ (x >> 4)) & 0x0F;
318+ return (0x6996 >> x) & 1;
319+}
320+/*- End of function --------------------------------------------------------*/
321+
322+/*! \brief Find the parity of a 16 bit word.
323+ \param x The word to be checked.
324+ \return 1 for odd, or 0 for even. */
325+static __inline__ int parity16(uint16_t x)
326+{
327+ x ^= (x >> 8);
328+ x = (x ^ (x >> 4)) & 0x0F;
329+ return (0x6996 >> x) & 1;
330+}
331+/*- End of function --------------------------------------------------------*/
332+
333+/*! \brief Find the parity of a 32 bit word.
334+ \param x The word to be checked.
335+ \return 1 for odd, or 0 for even. */
336+static __inline__ int parity32(uint32_t x)
337+{
338+ x ^= (x >> 16);
339+ x ^= (x >> 8);
340+ x = (x ^ (x >> 4)) & 0x0F;
341+ return (0x6996 >> x) & 1;
342+}
343+/*- End of function --------------------------------------------------------*/
344+
345+#ifdef __cplusplus
346+}
347+#endif
348+
349+#endif
350+/*- End of file ------------------------------------------------------------*/
351diff --git a/drivers/staging/echo/echo.c b/drivers/staging/echo/echo.c
352new file mode 100644
353index 0000000..4a281b1
354--- /dev/null
355+++ b/drivers/staging/echo/echo.c
356@@ -0,0 +1,632 @@
357+/*
358+ * SpanDSP - a series of DSP components for telephony
359+ *
360+ * echo.c - A line echo canceller. This code is being developed
361+ * against and partially complies with G168.
362+ *
363+ * Written by Steve Underwood <steveu@coppice.org>
364+ * and David Rowe <david_at_rowetel_dot_com>
365+ *
366+ * Copyright (C) 2001, 2003 Steve Underwood, 2007 David Rowe
367+ *
368+ * Based on a bit from here, a bit from there, eye of toad, ear of
369+ * bat, 15 years of failed attempts by David and a few fried brain
370+ * cells.
371+ *
372+ * All rights reserved.
373+ *
374+ * This program is free software; you can redistribute it and/or modify
375+ * it under the terms of the GNU General Public License version 2, as
376+ * published by the Free Software Foundation.
377+ *
378+ * This program is distributed in the hope that it will be useful,
379+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
380+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
381+ * GNU General Public License for more details.
382+ *
383+ * You should have received a copy of the GNU General Public License
384+ * along with this program; if not, write to the Free Software
385+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
386+ *
387+ * $Id: echo.c,v 1.20 2006/12/01 18:00:48 steveu Exp $
388+ */
389+
390+/*! \file */
391+
392+/* Implementation Notes
393+ David Rowe
394+ April 2007
395+
396+ This code started life as Steve's NLMS algorithm with a tap
397+ rotation algorithm to handle divergence during double talk. I
398+ added a Geigel Double Talk Detector (DTD) [2] and performed some
399+ G168 tests. However I had trouble meeting the G168 requirements,
400+ especially for double talk - there were always cases where my DTD
401+ failed, for example where near end speech was under the 6dB
402+ threshold required for declaring double talk.
403+
404+ So I tried a two path algorithm [1], which has so far given better
405+ results. The original tap rotation/Geigel algorithm is available
406+ in SVN http://svn.rowetel.com/software/oslec/tags/before_16bit.
407+ It's probably possible to make it work if some one wants to put some
408+ serious work into it.
409+
410+ At present no special treatment is provided for tones, which
411+ generally cause NLMS algorithms to diverge. Initial runs of a
412+ subset of the G168 tests for tones (e.g ./echo_test 6) show the
413+ current algorithm is passing OK, which is kind of surprising. The
414+ full set of tests needs to be performed to confirm this result.
415+
416+ One other interesting change is that I have managed to get the NLMS
417+ code to work with 16 bit coefficients, rather than the original 32
418+ bit coefficients. This reduces the MIPs and storage required.
419+ I evaluated the 16 bit port using g168_tests.sh and listening tests
420+ on 4 real-world samples.
421+
422+ I also attempted the implementation of a block based NLMS update
423+ [2] but although this passes g168_tests.sh it didn't converge well
424+ on the real-world samples. I have no idea why, perhaps a scaling
425+ problem. The block based code is also available in SVN
426+ http://svn.rowetel.com/software/oslec/tags/before_16bit. If this
427+ code can be debugged, it will lead to further reduction in MIPS, as
428+ the block update code maps nicely onto DSP instruction sets (it's a
429+ dot product) compared to the current sample-by-sample update.
430+
431+ Steve also has some nice notes on echo cancellers in echo.h
432+
433+
434+ References:
435+
436+ [1] Ochiai, Areseki, and Ogihara, "Echo Canceller with Two Echo
437+ Path Models", IEEE Transactions on communications, COM-25,
438+ No. 6, June
439+ 1977.
440+ http://www.rowetel.com/images/echo/dual_path_paper.pdf
441+
442+ [2] The classic, very useful paper that tells you how to
443+ actually build a real world echo canceller:
444+ Messerschmitt, Hedberg, Cole, Haoui, Winship, "Digital Voice
445+ Echo Canceller with a TMS32020",
446+ http://www.rowetel.com/images/echo/spra129.pdf
447+
448+ [3] I have written a series of blog posts on this work, here is
449+ Part 1: http://www.rowetel.com/blog/?p=18
450+
451+ [4] The source code http://svn.rowetel.com/software/oslec/
452+
453+ [5] A nice reference on LMS filters:
454+ http://en.wikipedia.org/wiki/Least_mean_squares_filter
455+
456+ Credits:
457+
458+ Thanks to Steve Underwood, Jean-Marc Valin, and Ramakrishnan
459+ Muthukrishnan for their suggestions and email discussions. Thanks
460+ also to those people who collected echo samples for me such as
461+ Mark, Pawel, and Pavel.
462+*/
463+
464+#include <linux/kernel.h> /* We're doing kernel work */
465+#include <linux/module.h>
466+#include <linux/kernel.h>
467+#include <linux/slab.h>
468+#define malloc(a) kmalloc((a), GFP_KERNEL)
469+#define free(a) kfree(a)
470+
471+#include "bit_operations.h"
472+#include "echo.h"
473+
474+#define MIN_TX_POWER_FOR_ADAPTION 64
475+#define MIN_RX_POWER_FOR_ADAPTION 64
476+#define DTD_HANGOVER 600 /* 600 samples, or 75ms */
477+#define DC_LOG2BETA 3 /* log2() of DC filter Beta */
478+
479+/*-----------------------------------------------------------------------*\
480+ FUNCTIONS
481+\*-----------------------------------------------------------------------*/
482+
483+/* adapting coeffs using the traditional stochastic descent (N)LMS algorithm */
484+
485+
486+#ifdef __BLACKFIN_ASM__
487+static void __inline__ lms_adapt_bg(echo_can_state_t *ec, int clean, int shift)
488+{
489+ int i, j;
490+ int offset1;
491+ int offset2;
492+ int factor;
493+ int exp;
494+ int16_t *phist;
495+ int n;
496+
497+ if (shift > 0)
498+ factor = clean << shift;
499+ else
500+ factor = clean >> -shift;
501+
502+ /* Update the FIR taps */
503+
504+ offset2 = ec->curr_pos;
505+ offset1 = ec->taps - offset2;
506+ phist = &ec->fir_state_bg.history[offset2];
507+
508+ /* st: and en: help us locate the assembler in echo.s */
509+
510+ //asm("st:");
511+ n = ec->taps;
512+ for (i = 0, j = offset2; i < n; i++, j++)
513+ {
514+ exp = *phist++ * factor;
515+ ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15);
516+ }
517+ //asm("en:");
518+
519+ /* Note the asm for the inner loop above generated by Blackfin gcc
520+ 4.1.1 is pretty good (note even parallel instructions used):
521+
522+ R0 = W [P0++] (X);
523+ R0 *= R2;
524+ R0 = R0 + R3 (NS) ||
525+ R1 = W [P1] (X) ||
526+ nop;
527+ R0 >>>= 15;
528+ R0 = R0 + R1;
529+ W [P1++] = R0;
530+
531+ A block based update algorithm would be much faster but the
532+ above can't be improved on much. Every instruction saved in
533+ the loop above is 2 MIPs/ch! The for loop above is where the
534+ Blackfin spends most of its time - about 17 MIPs/ch measured
535+ with speedtest.c with 256 taps (32ms). Write-back and
536+ Write-through cache gave about the same performance.
537+ */
538+}
539+
540+/*
541+ IDEAS for further optimisation of lms_adapt_bg():
542+
543+ 1/ The rounding is quite costly. Could we keep as 32 bit coeffs
544+ then make filter pluck the MS 16-bits of the coeffs when filtering?
545+ However this would lower potential optimisation of filter, as I
546+ think the dual-MAC architecture requires packed 16 bit coeffs.
547+
548+ 2/ Block based update would be more efficient, as per comments above,
549+ could use dual MAC architecture.
550+
551+ 3/ Look for same sample Blackfin LMS code, see if we can get dual-MAC
552+ packing.
553+
554+ 4/ Execute the whole e/c in a block of say 20ms rather than sample
555+ by sample. Processing a few samples every ms is inefficient.
556+*/
557+
558+#else
559+static __inline__ void lms_adapt_bg(echo_can_state_t *ec, int clean, int shift)
560+{
561+ int i;
562+
563+ int offset1;
564+ int offset2;
565+ int factor;
566+ int exp;
567+
568+ if (shift > 0)
569+ factor = clean << shift;
570+ else
571+ factor = clean >> -shift;
572+
573+ /* Update the FIR taps */
574+
575+ offset2 = ec->curr_pos;
576+ offset1 = ec->taps - offset2;
577+
578+ for (i = ec->taps - 1; i >= offset1; i--)
579+ {
580+ exp = (ec->fir_state_bg.history[i - offset1]*factor);
581+ ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15);
582+ }
583+ for ( ; i >= 0; i--)
584+ {
585+ exp = (ec->fir_state_bg.history[i + offset2]*factor);
586+ ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15);
587+ }
588+}
589+#endif
590+
591+/*- End of function --------------------------------------------------------*/
592+
593+echo_can_state_t *echo_can_create(int len, int adaption_mode)
594+{
595+ echo_can_state_t *ec;
596+ int i;
597+ int j;
598+
599+ ec = kmalloc(sizeof(*ec), GFP_KERNEL);
600+ if (ec == NULL)
601+ return NULL;
602+ memset(ec, 0, sizeof(*ec));
603+
604+ ec->taps = len;
605+ ec->log2taps = top_bit(len);
606+ ec->curr_pos = ec->taps - 1;
607+
608+ for (i = 0; i < 2; i++)
609+ {
610+ if ((ec->fir_taps16[i] = (int16_t *) malloc((ec->taps)*sizeof(int16_t))) == NULL)
611+ {
612+ for (j = 0; j < i; j++)
613+ kfree(ec->fir_taps16[j]);
614+ kfree(ec);
615+ return NULL;
616+ }
617+ memset(ec->fir_taps16[i], 0, (ec->taps)*sizeof(int16_t));
618+ }
619+
620+ fir16_create(&ec->fir_state,
621+ ec->fir_taps16[0],
622+ ec->taps);
623+ fir16_create(&ec->fir_state_bg,
624+ ec->fir_taps16[1],
625+ ec->taps);
626+
627+ for(i=0; i<5; i++) {
628+ ec->xvtx[i] = ec->yvtx[i] = ec->xvrx[i] = ec->yvrx[i] = 0;
629+ }
630+
631+ ec->cng_level = 1000;
632+ echo_can_adaption_mode(ec, adaption_mode);
633+
634+ ec->snapshot = (int16_t*)malloc(ec->taps*sizeof(int16_t));
635+ memset(ec->snapshot, 0, sizeof(int16_t)*ec->taps);
636+
637+ ec->cond_met = 0;
638+ ec->Pstates = 0;
639+ ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
640+ ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
641+ ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
642+ ec->Lbgn = ec->Lbgn_acc = 0;
643+ ec->Lbgn_upper = 200;
644+ ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
645+
646+ return ec;
647+}
648+/*- End of function --------------------------------------------------------*/
649+
650+void echo_can_free(echo_can_state_t *ec)
651+{
652+ int i;
653+
654+ fir16_free(&ec->fir_state);
655+ fir16_free(&ec->fir_state_bg);
656+ for (i = 0; i < 2; i++)
657+ kfree(ec->fir_taps16[i]);
658+ kfree(ec->snapshot);
659+ kfree(ec);
660+}
661+/*- End of function --------------------------------------------------------*/
662+
663+void echo_can_adaption_mode(echo_can_state_t *ec, int adaption_mode)
664+{
665+ ec->adaption_mode = adaption_mode;
666+}
667+/*- End of function --------------------------------------------------------*/
668+
669+void echo_can_flush(echo_can_state_t *ec)
670+{
671+ int i;
672+
673+ ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
674+ ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
675+ ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
676+
677+ ec->Lbgn = ec->Lbgn_acc = 0;
678+ ec->Lbgn_upper = 200;
679+ ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
680+
681+ ec->nonupdate_dwell = 0;
682+
683+ fir16_flush(&ec->fir_state);
684+ fir16_flush(&ec->fir_state_bg);
685+ ec->fir_state.curr_pos = ec->taps - 1;
686+ ec->fir_state_bg.curr_pos = ec->taps - 1;
687+ for (i = 0; i < 2; i++)
688+ memset(ec->fir_taps16[i], 0, ec->taps*sizeof(int16_t));
689+
690+ ec->curr_pos = ec->taps - 1;
691+ ec->Pstates = 0;
692+}
693+/*- End of function --------------------------------------------------------*/
694+
695+void echo_can_snapshot(echo_can_state_t *ec) {
696+ memcpy(ec->snapshot, ec->fir_taps16[0], ec->taps*sizeof(int16_t));
697+}
698+/*- End of function --------------------------------------------------------*/
699+
700+/* Dual Path Echo Canceller ------------------------------------------------*/
701+
702+int16_t echo_can_update(echo_can_state_t *ec, int16_t tx, int16_t rx)
703+{
704+ int32_t echo_value;
705+ int clean_bg;
706+ int tmp, tmp1;
707+
708+ /* Input scaling was found to be required to prevent problems when tx
709+ starts clipping. Another possible way to handle this would be the
710+ filter coefficient scaling. */
711+
712+ ec->tx = tx; ec->rx = rx;
713+ tx >>=1;
714+ rx >>=1;
715+
716+ /*
717+ Filter DC, 3dB point is 160Hz (I think), note 32 bit precision required
718+ otherwise values do not track down to 0. Zero at DC, Pole at (1-Beta)
719+ on the real axis. Some chip sets (like Si labs) don't need
720+ this, but something like a $10 X100P card does. Any DC really slows
721+ down convergence.
722+
723+ Note: removes some low frequency from the signal, this reduces
724+ the speech quality when listening to samples through headphones
725+ but may not be obvious through a telephone handset.
726+
727+ Note that the 3dB frequency in radians is approx Beta, e.g. for
728+ Beta = 2^(-3) = 0.125, 3dB freq is 0.125 rads = 159Hz.
729+ */
730+
731+ if (ec->adaption_mode & ECHO_CAN_USE_RX_HPF) {
732+ tmp = rx << 15;
733+#if 1
734+ /* Make sure the gain of the HPF is 1.0. This can still saturate a little under
735+ impulse conditions, and it might roll to 32768 and need clipping on sustained peak
736+ level signals. However, the scale of such clipping is small, and the error due to
737+ any saturation should not markedly affect the downstream processing. */
738+ tmp -= (tmp >> 4);
739+#endif
740+ ec->rx_1 += -(ec->rx_1>>DC_LOG2BETA) + tmp - ec->rx_2;
741+
742+ /* hard limit filter to prevent clipping. Note that at this stage
743+ rx should be limited to +/- 16383 due to right shift above */
744+ tmp1 = ec->rx_1 >> 15;
745+ if (tmp1 > 16383) tmp1 = 16383;
746+ if (tmp1 < -16383) tmp1 = -16383;
747+ rx = tmp1;
748+ ec->rx_2 = tmp;
749+ }
750+
751+ /* Block average of power in the filter states. Used for
752+ adaption power calculation. */
753+
754+ {
755+ int new, old;
756+
757+ /* efficient "out with the old and in with the new" algorithm so
758+ we don't have to recalculate over the whole block of
759+ samples. */
760+ new = (int)tx * (int)tx;
761+ old = (int)ec->fir_state.history[ec->fir_state.curr_pos] *
762+ (int)ec->fir_state.history[ec->fir_state.curr_pos];
763+ ec->Pstates += ((new - old) + (1<<ec->log2taps)) >> ec->log2taps;
764+ if (ec->Pstates < 0) ec->Pstates = 0;
765+ }
766+
767+ /* Calculate short term average levels using simple single pole IIRs */
768+
769+ ec->Ltxacc += abs(tx) - ec->Ltx;
770+ ec->Ltx = (ec->Ltxacc + (1<<4)) >> 5;
771+ ec->Lrxacc += abs(rx) - ec->Lrx;
772+ ec->Lrx = (ec->Lrxacc + (1<<4)) >> 5;
773+
774+ /* Foreground filter ---------------------------------------------------*/
775+
776+ ec->fir_state.coeffs = ec->fir_taps16[0];
777+ echo_value = fir16(&ec->fir_state, tx);
778+ ec->clean = rx - echo_value;
779+ ec->Lcleanacc += abs(ec->clean) - ec->Lclean;
780+ ec->Lclean = (ec->Lcleanacc + (1<<4)) >> 5;
781+
782+ /* Background filter ---------------------------------------------------*/
783+
784+ echo_value = fir16(&ec->fir_state_bg, tx);
785+ clean_bg = rx - echo_value;
786+ ec->Lclean_bgacc += abs(clean_bg) - ec->Lclean_bg;
787+ ec->Lclean_bg = (ec->Lclean_bgacc + (1<<4)) >> 5;
788+
789+ /* Background Filter adaption -----------------------------------------*/
790+
791+ /* Almost always adapt bg filter, just simple DT and energy
792+ detection to minimise adaption in cases of strong double talk.
793+ However this is not critical for the dual path algorithm.
794+ */
795+ ec->factor = 0;
796+ ec->shift = 0;
797+ if ((ec->nonupdate_dwell == 0)) {
798+ int P, logP, shift;
799+
800+ /* Determine:
801+
802+ f = Beta * clean_bg_rx/P ------ (1)
803+
804+ where P is the total power in the filter states.
805+
806+ The Boffins have shown that if we obey (1) we converge
807+ quickly and avoid instability.
808+
809+ The correct factor f must be in Q30, as this is the fixed
810+ point format required by the lms_adapt_bg() function,
811+ therefore the scaled version of (1) is:
812+
813+ (2^30) * f = (2^30) * Beta * clean_bg_rx/P
814+ factor = (2^30) * Beta * clean_bg_rx/P ----- (2)
815+
816+ We have chosen Beta = 0.25 by experiment, so:
817+
818+ factor = (2^30) * (2^-2) * clean_bg_rx/P
819+
820+ (30 - 2 - log2(P))
821+ factor = clean_bg_rx 2 ----- (3)
822+
823+ To avoid a divide we approximate log2(P) as top_bit(P),
824+ which returns the position of the highest non-zero bit in
825+ P. This approximation introduces an error as large as a
826+ factor of 2, but the algorithm seems to handle it OK.
827+
828+ Come to think of it a divide may not be a big deal on a
829+ modern DSP, so it's probably worth checking out the cycles
830+ for a divide versus a top_bit() implementation.
831+ */
832+
833+ P = MIN_TX_POWER_FOR_ADAPTION + ec->Pstates;
834+ logP = top_bit(P) + ec->log2taps;
835+ shift = 30 - 2 - logP;
836+ ec->shift = shift;
837+
838+ lms_adapt_bg(ec, clean_bg, shift);
839+ }
840+
841+ /* very simple DTD to make sure we don't try and adapt with strong
842+ near end speech */
843+
844+ ec->adapt = 0;
845+ if ((ec->Lrx > MIN_RX_POWER_FOR_ADAPTION) && (ec->Lrx > ec->Ltx))
846+ ec->nonupdate_dwell = DTD_HANGOVER;
847+ if (ec->nonupdate_dwell)
848+ ec->nonupdate_dwell--;
849+
850+ /* Transfer logic ------------------------------------------------------*/
851+
852+ /* These conditions are from the dual path paper [1], I messed with
853+ them a bit to improve performance. */
854+
855+ if ((ec->adaption_mode & ECHO_CAN_USE_ADAPTION) &&
856+ (ec->nonupdate_dwell == 0) &&
857+ (8*ec->Lclean_bg < 7*ec->Lclean) /* (ec->Lclean_bg < 0.875*ec->Lclean) */ &&
858+ (8*ec->Lclean_bg < ec->Ltx) /* (ec->Lclean_bg < 0.125*ec->Ltx) */ )
859+ {
860+ if (ec->cond_met == 6) {
861+ /* BG filter has had better results for 6 consecutive samples */
862+ ec->adapt = 1;
863+ memcpy(ec->fir_taps16[0], ec->fir_taps16[1], ec->taps*sizeof(int16_t));
864+ }
865+ else
866+ ec->cond_met++;
867+ }
868+ else
869+ ec->cond_met = 0;
870+
871+ /* Non-Linear Processing ---------------------------------------------------*/
872+
873+ ec->clean_nlp = ec->clean;
874+ if (ec->adaption_mode & ECHO_CAN_USE_NLP)
875+ {
876+ /* Non-linear processor - a fancy way to say "zap small signals, to avoid
877+ residual echo due to (uLaw/ALaw) non-linearity in the channel.". */
878+
879+ if ((16*ec->Lclean < ec->Ltx))
880+ {
881+ /* Our e/c has improved echo by at least 24 dB (each factor of 2 is 6dB,
882+ so 2*2*2*2=16 is the same as 6+6+6+6=24dB) */
883+ if (ec->adaption_mode & ECHO_CAN_USE_CNG)
884+ {
885+ ec->cng_level = ec->Lbgn;
886+
887+ /* Very elementary comfort noise generation. Just random
888+ numbers rolled off very vaguely Hoth-like. DR: This
889+ noise doesn't sound quite right to me - I suspect there
890+ are some overflow issues in the filtering as it's too
891+ "crackly". TODO: debug this, maybe just play noise at
892+ high level or look at spectrum.
893+ */
894+
895+ ec->cng_rndnum = 1664525U*ec->cng_rndnum + 1013904223U;
896+ ec->cng_filter = ((ec->cng_rndnum & 0xFFFF) - 32768 + 5*ec->cng_filter) >> 3;
897+ ec->clean_nlp = (ec->cng_filter*ec->cng_level*8) >> 14;
898+
899+ }
900+ else if (ec->adaption_mode & ECHO_CAN_USE_CLIP)
901+ {
902+ /* This sounds much better than CNG */
903+ if (ec->clean_nlp > ec->Lbgn)
904+ ec->clean_nlp = ec->Lbgn;
905+ if (ec->clean_nlp < -ec->Lbgn)
906+ ec->clean_nlp = -ec->Lbgn;
907+ }
908+ else
909+ {
910+ /* just mute the residual, doesn't sound very good, used mainly
911+ in G168 tests */
912+ ec->clean_nlp = 0;
913+ }
914+ }
915+ else {
916+ /* Background noise estimator. I tried a few algorithms
917+ here without much luck. This very simple one seems to
918+ work best, we just average the level using a slow (1 sec
919+ time const) filter if the current level is less than a
920+ (experimentally derived) constant. This means we don't
921+ include high level signals like near end speech. When
922+ combined with CNG or especially CLIP seems to work OK.
923+ */
924+ if (ec->Lclean < 40) {
925+ ec->Lbgn_acc += abs(ec->clean) - ec->Lbgn;
926+ ec->Lbgn = (ec->Lbgn_acc + (1<<11)) >> 12;
927+ }
928+ }
929+ }
930+
931+ /* Roll around the taps buffer */
932+ if (ec->curr_pos <= 0)
933+ ec->curr_pos = ec->taps;
934+ ec->curr_pos--;
935+
936+ if (ec->adaption_mode & ECHO_CAN_DISABLE)
937+ ec->clean_nlp = rx;
938+
939+ /* Output scaled back up again to match input scaling */
940+
941+ return (int16_t) ec->clean_nlp << 1;
942+}
943+
944+/*- End of function --------------------------------------------------------*/
945+
946+/* This function is separated from the echo canceller as it is usually called
947+ as part of the tx process. See rx HP (DC blocking) filter above, it's
948+ the same design.
949+
950+ Some soft phones send speech signals with a lot of low frequency
951+ energy, e.g. down to 20Hz. This can make the hybrid non-linear
952+ which causes the echo canceller to fall over. This filter can help
953+ by removing any low frequency before it gets to the tx port of the
954+ hybrid.
955+
956+ It can also help by removing any DC in the tx signal. DC is bad
957+ for LMS algorithms.
958+
959+ This is one of the classic DC removal filters, adjusted to provide sufficient
960+ bass rolloff to meet the above requirement to protect hybrids from things that
961+ upset them. The difference between successive samples produces a lousy HPF, and
962+ then a suitably placed pole flattens things out. The final result is a nicely
963+ rolled off bass end. The filtering is implemented with extended fractional
964+ precision, which noise shapes things, giving very clean DC removal.
965+*/
966+
967+int16_t echo_can_hpf_tx(echo_can_state_t *ec, int16_t tx) {
968+ int tmp, tmp1;
969+
970+ if (ec->adaption_mode & ECHO_CAN_USE_TX_HPF) {
971+ tmp = tx << 15;
972+#if 1
973+ /* Make sure the gain of the HPF is 1.0. The first can still saturate a little under
974+ impulse conditions, and it might roll to 32768 and need clipping on sustained peak
975+ level signals. However, the scale of such clipping is small, and the error due to
976+ any saturation should not markedly affect the downstream processing. */
977+ tmp -= (tmp >> 4);
978+#endif
979+ ec->tx_1 += -(ec->tx_1>>DC_LOG2BETA) + tmp - ec->tx_2;
980+ tmp1 = ec->tx_1 >> 15;
981+ if (tmp1 > 32767) tmp1 = 32767;
982+ if (tmp1 < -32767) tmp1 = -32767;
983+ tx = tmp1;
984+ ec->tx_2 = tmp;
985+ }
986+
987+ return tx;
988+}
989diff --git a/drivers/staging/echo/echo.h b/drivers/staging/echo/echo.h
990new file mode 100644
991index 0000000..7a91b43
992--- /dev/null
993+++ b/drivers/staging/echo/echo.h
994@@ -0,0 +1,220 @@
995+/*
996+ * SpanDSP - a series of DSP components for telephony
997+ *
998+ * echo.h - A line echo canceller. This code is being developed
999+ * against and partially complies with G168.
1000+ *
1001+ * Written by Steve Underwood <steveu@coppice.org>
1002+ * and David Rowe <david_at_rowetel_dot_com>
1003+ *
1004+ * Copyright (C) 2001 Steve Underwood and 2007 David Rowe
1005+ *
1006+ * All rights reserved.
1007+ *
1008+ * This program is free software; you can redistribute it and/or modify
1009+ * it under the terms of the GNU General Public License version 2, as
1010+ * published by the Free Software Foundation.
1011+ *
1012+ * This program is distributed in the hope that it will be useful,
1013+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1014+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1015+ * GNU General Public License for more details.
1016+ *
1017+ * You should have received a copy of the GNU General Public License
1018+ * along with this program; if not, write to the Free Software
1019+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1020+ *
1021+ * $Id: echo.h,v 1.9 2006/10/24 13:45:28 steveu Exp $
1022+ */
1023+
1024+#ifndef __ECHO_H
1025+#define __ECHO_H
1026+
1027+/*! \page echo_can_page Line echo cancellation for voice
1028+
1029+\section echo_can_page_sec_1 What does it do?
1030+This module aims to provide G.168-2002 compliant echo cancellation, to remove
1031+electrical echoes (e.g. from 2-4 wire hybrids) from voice calls.
1032+
1033+\section echo_can_page_sec_2 How does it work?
1034+The heart of the echo canceller is an FIR filter. This is adapted to match the
1035+echo impulse response of the telephone line. It must be long enough to
1036+adequately cover the duration of that impulse response. The signal transmitted
1037+to the telephone line is passed through the FIR filter. Once the FIR is
1038+properly adapted, the resulting output is an estimate of the echo signal
1039+received from the line. This is subtracted from the received signal. The result
1040+is an estimate of the signal which originated at the far end of the line, free
1041+from echoes of our own transmitted signal.
1042+
1043+The least mean squares (LMS) algorithm is attributed to Widrow and Hoff, and
1044+was introduced in 1960. It is the commonest form of filter adaption used in
1045+things like modem line equalisers and line echo cancellers. There it works very
1046+well. However, it only works well for signals of constant amplitude. It works
1047+very poorly for things like speech echo cancellation, where the signal level
1048+varies widely. This is quite easy to fix. If the signal level is normalised -
1049+similar to applying AGC - LMS can work as well for a signal of varying
1050+amplitude as it does for a modem signal. This normalised least mean squares
1051+(NLMS) algorithm is the commonest one used for speech echo cancellation. Many
1052+other algorithms exist - e.g. RLS (essentially the same as Kalman filtering),
1053+FAP, etc. Some perform significantly better than NLMS. However, factors such
1054+as computational complexity and patents favour the use of NLMS.
1055+
1056+A simple refinement to NLMS can improve its performance with speech. NLMS tends
1057+to adapt best to the strongest parts of a signal. If the signal is white noise,
1058+the NLMS algorithm works very well. However, speech has more low frequency than
1059+high frequency content. Pre-whitening (i.e. filtering the signal to flatten its
1060+spectrum) the echo signal improves the adapt rate for speech, and ensures the
1061+final residual signal is not heavily biased towards high frequencies. A very
1062+low complexity filter is adequate for this, so pre-whitening adds little to the
1063+compute requirements of the echo canceller.
1064+
1065+An FIR filter adapted using pre-whitened NLMS performs well, provided certain
1066+conditions are met:
1067+
1068+ - The transmitted signal has poor self-correlation.
1069+ - There is no signal being generated within the environment being
1070+ cancelled.
1071+
1072+The difficulty is that neither of these can be guaranteed.
1073+
1074+If the adaption is performed while transmitting noise (or something fairly
1075+noise like, such as voice) the adaption works very well. If the adaption is
1076+performed while transmitting something highly correlative (typically narrow
1077+band energy such as signalling tones or DTMF), the adaption can go seriously
1078+wrong. The reason is there is only one solution for the adaption on a near
1079+random signal - the impulse response of the line. For a repetitive signal,
1080+there are any number of solutions which converge the adaption, and nothing
1081+guides the adaption to choose the generalised one. Allowing an untrained
1082+canceller to converge on this kind of narrowband energy is probably a good thing,
1083+since at least it cancels the tones. Allowing a well converged canceller to
1084+continue converging on such energy is just a way to ruin its generalised
1085+adaption. A narrowband detector is needed, so adaption can be suspended at
1086+appropriate times.
1087+
1088+The adaption process is based on trying to eliminate the received signal. When
1089+there is any signal from within the environment being cancelled it may upset
1090+the adaption process. Similarly, if the signal we are transmitting is small,
1091+noise may dominate and disturb the adaption process. If we can ensure that the
1092+adaption is only performed when we are transmitting a significant signal level,
1093+and the environment is not, things will be OK. Clearly, it is easy to tell when
1094+we are sending a significant signal. Telling whether the environment is generating
1095+a significant signal, and doing it with sufficient speed that the adaption will
1096+not have diverged too much before we stop it, is a little harder.
1097+
1098+The key problem in detecting when the environment is sourcing significant
1099+energy is that we must do this very quickly. Given a reasonably long sample of
1100+the received signal, there are a number of strategies which may be used to
1101+assess whether that signal contains a strong far end component. However, by the
1102+time that assessment is complete the far end signal will have already caused
1103+major mis-convergence in the adaption process. An assessment algorithm is
1104+needed which produces a fairly accurate result from a very short burst of far
1105+end energy.
1106+
1107+\section echo_can_page_sec_3 How do I use it?
1108+The echo canceller processes both the transmit and receive streams sample by
1109+sample. The processing function is not declared inline. However,
1110+cancellation requires many operations per sample, so the call overhead is only
1111+a minor burden.
1112+*/
1113+
1114+#include "fir.h"
1115+
1116+/* Mask bits for the adaption mode */
1117+#define ECHO_CAN_USE_ADAPTION 0x01
1118+#define ECHO_CAN_USE_NLP 0x02
1119+#define ECHO_CAN_USE_CNG 0x04
1120+#define ECHO_CAN_USE_CLIP 0x08
1121+#define ECHO_CAN_USE_TX_HPF 0x10
1122+#define ECHO_CAN_USE_RX_HPF 0x20
1123+#define ECHO_CAN_DISABLE 0x40
1124+
1125+/*!
1126+ G.168 echo canceller descriptor. This defines the working state for a line
1127+ echo canceller.
1128+*/
1129+typedef struct
1130+{
1131+ int16_t tx,rx;
1132+ int16_t clean;
1133+ int16_t clean_nlp;
1134+
1135+ int nonupdate_dwell;
1136+ int curr_pos;
1137+ int taps;
1138+ int log2taps;
1139+ int adaption_mode;
1140+
1141+ int cond_met;
1142+ int32_t Pstates;
1143+ int16_t adapt;
1144+ int32_t factor;
1145+ int16_t shift;
1146+
1147+ /* Average levels and averaging filter states */
1148+ int Ltxacc, Lrxacc, Lcleanacc, Lclean_bgacc;
1149+ int Ltx, Lrx;
1150+ int Lclean;
1151+ int Lclean_bg;
1152+ int Lbgn, Lbgn_acc, Lbgn_upper, Lbgn_upper_acc;
1153+
1154+ /* foreground and background filter states */
1155+ fir16_state_t fir_state;
1156+ fir16_state_t fir_state_bg;
1157+ int16_t *fir_taps16[2];
1158+
1159+ /* DC blocking filter states */
1160+ int tx_1, tx_2, rx_1, rx_2;
1161+
1162+ /* optional High Pass Filter states */
1163+ int32_t xvtx[5], yvtx[5];
1164+ int32_t xvrx[5], yvrx[5];
1165+
1166+ /* Parameters for the optional Hoth noise generator */
1167+ int cng_level;
1168+ int cng_rndnum;
1169+ int cng_filter;
1170+
1171+ /* snapshot sample of coeffs used for development */
1172+ int16_t *snapshot;
1173+} echo_can_state_t;
1174+
1175+/*! Create a voice echo canceller context.
1176+ \param len The length of the canceller, in samples.
1177+ \return The new canceller context, or NULL if the canceller could not be created.
1178+*/
1179+echo_can_state_t *echo_can_create(int len, int adaption_mode);
1180+
1181+/*! Free a voice echo canceller context.
1182+ \param ec The echo canceller context.
1183+*/
1184+void echo_can_free(echo_can_state_t *ec);
1185+
1186+/*! Flush (reinitialise) a voice echo canceller context.
1187+ \param ec The echo canceller context.
1188+*/
1189+void echo_can_flush(echo_can_state_t *ec);
1190+
1191+/*! Set the adaption mode of a voice echo canceller context.
1192+ \param ec The echo canceller context.
1193+ \param adaption_mode The mode.
1194+*/
1195+void echo_can_adaption_mode(echo_can_state_t *ec, int adaption_mode);
1196+
1197+void echo_can_snapshot(echo_can_state_t *ec);
1198+
1199+/*! Process a sample through a voice echo canceller.
1200+ \param ec The echo canceller context.
1201+ \param tx The transmitted audio sample.
1202+ \param rx The received audio sample.
1203+ \return The clean (echo cancelled) received sample.
1204+*/
1205+int16_t echo_can_update(echo_can_state_t *ec, int16_t tx, int16_t rx);
1206+
1207+/*! Process to high pass filter the tx signal.
1208+ \param ec The echo canceller context.
1209+ \param tx The transmitted audio sample.
1210+ \return The HP filtered transmit sample, send this to your D/A.
1211+*/
1212+int16_t echo_can_hpf_tx(echo_can_state_t *ec, int16_t tx);
1213+
1214+#endif /* __ECHO_H */
1215diff --git a/drivers/staging/echo/fir.h b/drivers/staging/echo/fir.h
1216new file mode 100644
1217index 0000000..e1bfc49
1218--- /dev/null
1219+++ b/drivers/staging/echo/fir.h
1220@@ -0,0 +1,369 @@
1221+/*
1222+ * SpanDSP - a series of DSP components for telephony
1223+ *
1224+ * fir.h - General telephony FIR routines
1225+ *
1226+ * Written by Steve Underwood <steveu@coppice.org>
1227+ *
1228+ * Copyright (C) 2002 Steve Underwood
1229+ *
1230+ * All rights reserved.
1231+ *
1232+ * This program is free software; you can redistribute it and/or modify
1233+ * it under the terms of the GNU General Public License version 2, as
1234+ * published by the Free Software Foundation.
1235+ *
1236+ * This program is distributed in the hope that it will be useful,
1237+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1238+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1239+ * GNU General Public License for more details.
1240+ *
1241+ * You should have received a copy of the GNU General Public License
1242+ * along with this program; if not, write to the Free Software
1243+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1244+ *
1245+ * $Id: fir.h,v 1.8 2006/10/24 13:45:28 steveu Exp $
1246+ */
1247+
1248+/*! \page fir_page FIR filtering
1249+\section fir_page_sec_1 What does it do?
1250+???.
1251+
1252+\section fir_page_sec_2 How does it work?
1253+???.
1254+*/
1255+
1256+#if !defined(_FIR_H_)
1257+#define _FIR_H_
1258+
1259+/*
1260+ Blackfin NOTES & IDEAS:
1261+
1262+ A simple dot product function is used to implement the filter. This performs
1263+ just one MAC/cycle which is inefficient but was easy to implement as a first
1264+ pass. The current Blackfin code also uses an unrolled form of the filter
1265+ history to avoid 0 length hardware loop issues. This is wasteful of
1266+ memory.
1267+
1268+ Ideas for improvement:
1269+
1270+ 1/ Rewrite filter for dual MAC inner loop. The issue here is handling
1271+ history sample offsets that are 16 bit aligned - the dual MAC needs
1272+ 32 bit alignment. There are some good examples in libbfdsp.
1273+
1274+ 2/ Use the hardware circular buffer facility to halve memory usage.
1275+
1276+ 3/ Consider using internal memory.
1277+
1278+ Using less memory might also improve speed as cache misses will be
1279+ reduced. A drop in MIPs and memory approaching 50% should be
1280+ possible.
1281+
1282+ The foreground and background filters currently use a total of
1283+ about 10 MIPs/ch as measured with speedtest.c on a 256 TAP echo
1284+ can.
1285+*/
1286+
1287+#if defined(USE_MMX) || defined(USE_SSE2)
1288+#include "mmx.h"
1289+#endif
1290+
1291+/*!
1292+ 16 bit integer FIR descriptor. This defines the working state for a single
1293+ instance of an FIR filter using 16 bit integer coefficients.
1294+*/
1295+typedef struct
1296+{
1297+ int taps;
1298+ int curr_pos;
1299+ const int16_t *coeffs;
1300+ int16_t *history;
1301+} fir16_state_t;
1302+
1303+/*!
1304+ 32 bit integer FIR descriptor. This defines the working state for a single
1305+ instance of an FIR filter using 32 bit integer coefficients, and filtering
1306+ 16 bit integer data.
1307+*/
1308+typedef struct
1309+{
1310+ int taps;
1311+ int curr_pos;
1312+ const int32_t *coeffs;
1313+ int16_t *history;
1314+} fir32_state_t;
1315+
1316+/*!
1317+ Floating point FIR descriptor. This defines the working state for a single
1318+ instance of an FIR filter using floating point coefficients and data.
1319+*/
1320+typedef struct
1321+{
1322+ int taps;
1323+ int curr_pos;
1324+ const float *coeffs;
1325+ float *history;
1326+} fir_float_state_t;
1327+
1328+#ifdef __cplusplus
1329+extern "C" {
1330+#endif
1331+
1332+static __inline__ const int16_t *fir16_create(fir16_state_t *fir,
1333+ const int16_t *coeffs,
1334+ int taps)
1335+{
1336+ fir->taps = taps;
1337+ fir->curr_pos = taps - 1;
1338+ fir->coeffs = coeffs;
1339+#if defined(USE_MMX) || defined(USE_SSE2) || defined(__BLACKFIN_ASM__)
1340+ if ((fir->history = malloc(2*taps*sizeof(int16_t))))
1341+ memset(fir->history, 0, 2*taps*sizeof(int16_t));
1342+#else
1343+ if ((fir->history = (int16_t *) malloc(taps*sizeof(int16_t))))
1344+ memset(fir->history, 0, taps*sizeof(int16_t));
1345+#endif
1346+ return fir->history;
1347+}
1348+/*- End of function --------------------------------------------------------*/
1349+
1350+static __inline__ void fir16_flush(fir16_state_t *fir)
1351+{
1352+#if defined(USE_MMX) || defined(USE_SSE2) || defined(__BLACKFIN_ASM__)
1353+ memset(fir->history, 0, 2*fir->taps*sizeof(int16_t));
1354+#else
1355+ memset(fir->history, 0, fir->taps*sizeof(int16_t));
1356+#endif
1357+}
1358+/*- End of function --------------------------------------------------------*/
1359+
1360+static __inline__ void fir16_free(fir16_state_t *fir)
1361+{
1362+ free(fir->history);
1363+}
1364+/*- End of function --------------------------------------------------------*/
1365+
1366+#ifdef __BLACKFIN_ASM__
1367+static inline int32_t dot_asm(short *x, short *y, int len)
1368+{
1369+ int dot;
1370+
1371+ len--;
1372+
1373+ __asm__
1374+ (
1375+ "I0 = %1;\n\t"
1376+ "I1 = %2;\n\t"
1377+ "A0 = 0;\n\t"
1378+ "R0.L = W[I0++] || R1.L = W[I1++];\n\t"
1379+ "LOOP dot%= LC0 = %3;\n\t"
1380+ "LOOP_BEGIN dot%=;\n\t"
1381+ "A0 += R0.L * R1.L (IS) || R0.L = W[I0++] || R1.L = W[I1++];\n\t"
1382+ "LOOP_END dot%=;\n\t"
1383+ "A0 += R0.L*R1.L (IS);\n\t"
1384+ "R0 = A0;\n\t"
1385+ "%0 = R0;\n\t"
1386+ : "=&d" (dot)
1387+ : "a" (x), "a" (y), "a" (len)
1388+ : "I0", "I1", "A1", "A0", "R0", "R1"
1389+ );
1390+
1391+ return dot;
1392+}
1393+#endif
1394+/*- End of function --------------------------------------------------------*/
1395+
1396+static __inline__ int16_t fir16(fir16_state_t *fir, int16_t sample)
1397+{
1398+ int32_t y;
1399+#if defined(USE_MMX)
1400+ int i;
1401+ mmx_t *mmx_coeffs;
1402+ mmx_t *mmx_hist;
1403+
1404+ fir->history[fir->curr_pos] = sample;
1405+ fir->history[fir->curr_pos + fir->taps] = sample;
1406+
1407+ mmx_coeffs = (mmx_t *) fir->coeffs;
1408+ mmx_hist = (mmx_t *) &fir->history[fir->curr_pos];
1409+ i = fir->taps;
1410+ pxor_r2r(mm4, mm4);
1411+ /* 8 samples per iteration, so the filter must be a multiple of 8 long. */
1412+ while (i > 0)
1413+ {
1414+ movq_m2r(mmx_coeffs[0], mm0);
1415+ movq_m2r(mmx_coeffs[1], mm2);
1416+ movq_m2r(mmx_hist[0], mm1);
1417+ movq_m2r(mmx_hist[1], mm3);
1418+ mmx_coeffs += 2;
1419+ mmx_hist += 2;
1420+ pmaddwd_r2r(mm1, mm0);
1421+ pmaddwd_r2r(mm3, mm2);
1422+ paddd_r2r(mm0, mm4);
1423+ paddd_r2r(mm2, mm4);
1424+ i -= 8;
1425+ }
1426+ movq_r2r(mm4, mm0);
1427+ psrlq_i2r(32, mm0);
1428+ paddd_r2r(mm0, mm4);
1429+ movd_r2m(mm4, y);
1430+ emms();
1431+#elif defined(USE_SSE2)
1432+ int i;
1433+ xmm_t *xmm_coeffs;
1434+ xmm_t *xmm_hist;
1435+
1436+ fir->history[fir->curr_pos] = sample;
1437+ fir->history[fir->curr_pos + fir->taps] = sample;
1438+
1439+ xmm_coeffs = (xmm_t *) fir->coeffs;
1440+ xmm_hist = (xmm_t *) &fir->history[fir->curr_pos];
1441+ i = fir->taps;
1442+ pxor_r2r(xmm4, xmm4);
1443+ /* 16 samples per iteration, so the filter must be a multiple of 16 long. */
1444+ while (i > 0)
1445+ {
1446+ movdqu_m2r(xmm_coeffs[0], xmm0);
1447+ movdqu_m2r(xmm_coeffs[1], xmm2);
1448+ movdqu_m2r(xmm_hist[0], xmm1);
1449+ movdqu_m2r(xmm_hist[1], xmm3);
1450+ xmm_coeffs += 2;
1451+ xmm_hist += 2;
1452+ pmaddwd_r2r(xmm1, xmm0);
1453+ pmaddwd_r2r(xmm3, xmm2);
1454+ paddd_r2r(xmm0, xmm4);
1455+ paddd_r2r(xmm2, xmm4);
1456+ i -= 16;
1457+ }
1458+ movdqa_r2r(xmm4, xmm0);
1459+ psrldq_i2r(8, xmm0);
1460+ paddd_r2r(xmm0, xmm4);
1461+ movdqa_r2r(xmm4, xmm0);
1462+ psrldq_i2r(4, xmm0);
1463+ paddd_r2r(xmm0, xmm4);
1464+ movd_r2m(xmm4, y);
1465+#elif defined(__BLACKFIN_ASM__)
1466+ fir->history[fir->curr_pos] = sample;
1467+ fir->history[fir->curr_pos + fir->taps] = sample;
1468+ y = dot_asm((int16_t*)fir->coeffs, &fir->history[fir->curr_pos], fir->taps);
1469+#else
1470+ int i;
1471+ int offset1;
1472+ int offset2;
1473+
1474+ fir->history[fir->curr_pos] = sample;
1475+
1476+ offset2 = fir->curr_pos;
1477+ offset1 = fir->taps - offset2;
1478+ y = 0;
1479+ for (i = fir->taps - 1; i >= offset1; i--)
1480+ y += fir->coeffs[i]*fir->history[i - offset1];
1481+ for ( ; i >= 0; i--)
1482+ y += fir->coeffs[i]*fir->history[i + offset2];
1483+#endif
1484+ if (fir->curr_pos <= 0)
1485+ fir->curr_pos = fir->taps;
1486+ fir->curr_pos--;
1487+ return (int16_t) (y >> 15);
1488+}
1489+/*- End of function --------------------------------------------------------*/
1490+
1491+static __inline__ const int16_t *fir32_create(fir32_state_t *fir,
1492+ const int32_t *coeffs,
1493+ int taps)
1494+{
1495+ fir->taps = taps;
1496+ fir->curr_pos = taps - 1;
1497+ fir->coeffs = coeffs;
1498+ fir->history = (int16_t *) malloc(taps*sizeof(int16_t));
1499+ if (fir->history)
1500+ memset(fir->history, '\0', taps*sizeof(int16_t));
1501+ return fir->history;
1502+}
1503+/*- End of function --------------------------------------------------------*/
1504+
1505+static __inline__ void fir32_flush(fir32_state_t *fir)
1506+{
1507+ memset(fir->history, 0, fir->taps*sizeof(int16_t));
1508+}
1509+/*- End of function --------------------------------------------------------*/
1510+
1511+static __inline__ void fir32_free(fir32_state_t *fir)
1512+{
1513+ free(fir->history);
1514+}
1515+/*- End of function --------------------------------------------------------*/
1516+
1517+static __inline__ int16_t fir32(fir32_state_t *fir, int16_t sample)
1518+{
1519+ int i;
1520+ int32_t y;
1521+ int offset1;
1522+ int offset2;
1523+
1524+ fir->history[fir->curr_pos] = sample;
1525+ offset2 = fir->curr_pos;
1526+ offset1 = fir->taps - offset2;
1527+ y = 0;
1528+ for (i = fir->taps - 1; i >= offset1; i--)
1529+ y += fir->coeffs[i]*fir->history[i - offset1];
1530+ for ( ; i >= 0; i--)
1531+ y += fir->coeffs[i]*fir->history[i + offset2];
1532+ if (fir->curr_pos <= 0)
1533+ fir->curr_pos = fir->taps;
1534+ fir->curr_pos--;
1535+ return (int16_t) (y >> 15);
1536+}
1537+/*- End of function --------------------------------------------------------*/
1538+
1539+#ifndef __KERNEL__
1540+static __inline__ const float *fir_float_create(fir_float_state_t *fir,
1541+ const float *coeffs,
1542+ int taps)
1543+{
1544+ fir->taps = taps;
1545+ fir->curr_pos = taps - 1;
1546+ fir->coeffs = coeffs;
1547+ fir->history = (float *) malloc(taps*sizeof(float));
1548+ if (fir->history)
1549+ memset(fir->history, '\0', taps*sizeof(float));
1550+ return fir->history;
1551+}
1552+/*- End of function --------------------------------------------------------*/
1553+
1554+static __inline__ void fir_float_free(fir_float_state_t *fir)
1555+{
1556+ free(fir->history);
1557+}
1558+/*- End of function --------------------------------------------------------*/
1559+
1560+static __inline__ int16_t fir_float(fir_float_state_t *fir, int16_t sample)
1561+{
1562+ int i;
1563+ float y;
1564+ int offset1;
1565+ int offset2;
1566+
1567+ fir->history[fir->curr_pos] = sample;
1568+
1569+ offset2 = fir->curr_pos;
1570+ offset1 = fir->taps - offset2;
1571+ y = 0;
1572+ for (i = fir->taps - 1; i >= offset1; i--)
1573+ y += fir->coeffs[i]*fir->history[i - offset1];
1574+ for ( ; i >= 0; i--)
1575+ y += fir->coeffs[i]*fir->history[i + offset2];
1576+ if (fir->curr_pos <= 0)
1577+ fir->curr_pos = fir->taps;
1578+ fir->curr_pos--;
1579+ return (int16_t) y;
1580+}
1581+/*- End of function --------------------------------------------------------*/
1582+#endif
1583+
1584+#ifdef __cplusplus
1585+}
1586+#endif
1587+
1588+#endif
1589+/*- End of file ------------------------------------------------------------*/
1590diff --git a/drivers/staging/echo/mmx.h b/drivers/staging/echo/mmx.h
1591new file mode 100644
1592index 0000000..b5a3964
1593--- /dev/null
1594+++ b/drivers/staging/echo/mmx.h
1595@@ -0,0 +1,288 @@
1596+/*
1597+ * mmx.h
1598+ * Copyright (C) 1997-2001 H. Dietz and R. Fisher
1599+ *
1600+ * This file is part of FFmpeg.
1601+ *
1602+ * FFmpeg is free software; you can redistribute it and/or
1603+ * modify it under the terms of the GNU Lesser General Public
1604+ * License as published by the Free Software Foundation; either
1605+ * version 2.1 of the License, or (at your option) any later version.
1606+ *
1607+ * FFmpeg is distributed in the hope that it will be useful,
1608+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1609+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1610+ * Lesser General Public License for more details.
1611+ *
1612+ * You should have received a copy of the GNU Lesser General Public
1613+ * License along with FFmpeg; if not, write to the Free Software
1614+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1615+ */
1616+#ifndef AVCODEC_I386MMX_H
1617+#define AVCODEC_I386MMX_H
1618+
1619+/*
1620+ * The type of a value that fits in an MMX register (note that long
1621+ * long constant values MUST be suffixed by LL and unsigned long long
1622+ * values by ULL, lest they be truncated by the compiler)
1623+ */
1624+
1625+typedef union {
1626+ long long q; /* Quadword (64-bit) value */
1627+ unsigned long long uq; /* Unsigned Quadword */
1628+ int d[2]; /* 2 Doubleword (32-bit) values */
1629+ unsigned int ud[2]; /* 2 Unsigned Doubleword */
1630+ short w[4]; /* 4 Word (16-bit) values */
1631+ unsigned short uw[4]; /* 4 Unsigned Word */
1632+ char b[8]; /* 8 Byte (8-bit) values */
1633+ unsigned char ub[8]; /* 8 Unsigned Byte */
1634+ float s[2]; /* Single-precision (32-bit) value */
1635+} mmx_t; /* On an 8-byte (64-bit) boundary */
1636+
1637+/* SSE registers */
1638+typedef union {
1639+ char b[16];
1640+} xmm_t;
1641+
1642+
1643+#define mmx_i2r(op,imm,reg) \
1644+ __asm__ __volatile__ (#op " %0, %%" #reg \
1645+ : /* nothing */ \
1646+ : "i" (imm) )
1647+
1648+#define mmx_m2r(op,mem,reg) \
1649+ __asm__ __volatile__ (#op " %0, %%" #reg \
1650+ : /* nothing */ \
1651+ : "m" (mem))
1652+
1653+#define mmx_r2m(op,reg,mem) \
1654+ __asm__ __volatile__ (#op " %%" #reg ", %0" \
1655+ : "=m" (mem) \
1656+ : /* nothing */ )
1657+
1658+#define mmx_r2r(op,regs,regd) \
1659+ __asm__ __volatile__ (#op " %" #regs ", %" #regd)
1660+
1661+
1662+#define emms() __asm__ __volatile__ ("emms")
1663+
1664+#define movd_m2r(var,reg) mmx_m2r (movd, var, reg)
1665+#define movd_r2m(reg,var) mmx_r2m (movd, reg, var)
1666+#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd)
1667+
1668+#define movq_m2r(var,reg) mmx_m2r (movq, var, reg)
1669+#define movq_r2m(reg,var) mmx_r2m (movq, reg, var)
1670+#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd)
1671+
1672+#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg)
1673+#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)
1674+#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg)
1675+#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)
1676+
1677+#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg)
1678+#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)
1679+
1680+#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg)
1681+#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd)
1682+#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg)
1683+#define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd)
1684+#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg)
1685+#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd)
1686+
1687+#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg)
1688+#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd)
1689+#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg)
1690+#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd)
1691+
1692+#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg)
1693+#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd)
1694+#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg)
1695+#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd)
1696+
1697+#define pand_m2r(var,reg) mmx_m2r (pand, var, reg)
1698+#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd)
1699+
1700+#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg)
1701+#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd)
1702+
1703+#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg)
1704+#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd)
1705+#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg)
1706+#define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd)
1707+#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg)
1708+#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd)
1709+
1710+#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg)
1711+#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd)
1712+#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg)
1713+#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd)
1714+#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg)
1715+#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd)
1716+
1717+#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg)
1718+#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd)
1719+
1720+#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg)
1721+#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd)
1722+
1723+#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg)
1724+#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd)
1725+
1726+#define por_m2r(var,reg) mmx_m2r (por, var, reg)
1727+#define por_r2r(regs,regd) mmx_r2r (por, regs, regd)
1728+
1729+#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg)
1730+#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg)
1731+#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd)
1732+#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg)
1733+#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg)
1734+#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd)
1735+#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg)
1736+#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg)
1737+#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd)
1738+
1739+#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg)
1740+#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg)
1741+#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd)
1742+#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg)
1743+#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg)
1744+#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd)
1745+
1746+#define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg)
1747+#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg)
1748+#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd)
1749+#define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg)
1750+#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg)
1751+#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd)
1752+#define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg)
1753+#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg)
1754+#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd)
1755+
1756+#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg)
1757+#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd)
1758+#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg)
1759+#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd)
1760+#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg)
1761+#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd)
1762+
1763+#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg)
1764+#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd)
1765+#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg)
1766+#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd)
1767+
1768+#define psubusb_m2r(var,reg) mmx_m2r (psubusb, var, reg)
1769+#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd)
1770+#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg)
1771+#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd)
1772+/* punpckh*/punpckl* and pxor aliases: each comes in a var (mmx_m2r) and a register-to-register (mmx_r2r) form. */
1773+#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg)
1774+#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd)
1775+#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg)
1776+#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd)
1777+#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg)
1778+#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd)
1779+
1780+#define punpcklbw_m2r(var,reg) mmx_m2r (punpcklbw, var, reg)
1781+#define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd)
1782+#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg)
1783+#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd)
1784+#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg)
1785+#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd)
1786+
1787+#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg)
1788+#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd)
1789+
1790+
1791+/* 3DNOW extensions */
1792+/* Only pavgusb is wrapped in this section, in var (mmx_m2r) and register-to-register (mmx_r2r) forms. */
1793+#define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg)
1794+#define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd)
1795+
1796+
1797+/* AMD MMX extensions - also available in Intel SSE */
1798+
1799+/* Immediate-operand helpers: mmx_m2ri emits "op imm, mem, %reg", mmx_r2ri emits "op imm, %regs, %regd". Register names are pasted into the template as text; only mem and imm are asm operands, and no outputs are declared. */
1800+#define mmx_m2ri(op,mem,reg,imm) \
1801+ __asm__ __volatile__ (#op " %1, %0, %%" #reg \
1802+ : /* nothing */ \
1803+ : "m" (mem), "i" (imm))
1804+#define mmx_r2ri(op,regs,regd,imm) \
1805+ __asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
1806+ : /* nothing */ \
1807+ : "i" (imm) )
1808+/* Emit "prefetch<hint> mem": hint is appended to the mnemonic as text and mem is passed as an "m" input operand. */
1809+#define mmx_fetch(mem,hint) \
1810+ __asm__ __volatile__ ("prefetch" #hint " %0" \
1811+ : /* nothing */ \
1812+ : "m" (mem))
1813+/* maskmovq takes two registers and no immediate, so it goes through the 3-argument mmx_r2r; mmx_r2ri would need a fourth (imm) argument and fails to expand without it. */
1814+/* NOTE(review): confirm the AT&T operand order (mask vs. data register) against the ISA manual before first use. */
1815+#define maskmovq(regs,maskreg) mmx_r2r (maskmovq, regs, maskreg)
1816+/* Per-instruction aliases: pextrw/pinsrw carry an immediate and go through mmx_r2ri; movntq uses mmx_r2m; the rest use mmx_m2r/mmx_r2r. */
1817+#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var)
1818+
1819+#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg)
1820+#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd)
1821+#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg)
1822+#define pavgw_r2r(regs,regd) mmx_r2r (pavgw, regs, regd)
1823+
1824+#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm)
1825+
1826+#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm)
1827+
1828+#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg)
1829+#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd)
1830+
1831+#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg)
1832+#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd)
1833+
1834+#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg)
1835+#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd)
1836+
1837+#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg)
1838+#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd)
1839+/* Emits "pmovmskb %mmreg, %reg" as basic asm (no operand list), with both register names pasted in as text. The mnemonic must match the macro name: movmskps is the SSE float variant and does not take an MMX source register. */
1840+#define pmovmskb(mmreg,reg) \
1841+ __asm__ __volatile__ ("pmovmskb %" #mmreg ", %" #reg)
1842+/* More aliases: prefetcht0/t1/t2/nta pass their hint suffix to mmx_fetch; pshufw uses the immediate helpers mmx_m2ri/mmx_r2ri; sfence is emitted directly with no operands. */
1843+#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg)
1844+#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd)
1845+
1846+#define prefetcht0(mem) mmx_fetch (mem, t0)
1847+#define prefetcht1(mem) mmx_fetch (mem, t1)
1848+#define prefetcht2(mem) mmx_fetch (mem, t2)
1849+#define prefetchnta(mem) mmx_fetch (mem, nta)
1850+
1851+#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg)
1852+#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd)
1853+
1854+#define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm)
1855+#define pshufw_r2r(regs,regd,imm) mmx_r2ri(pshufw, regs, regd, imm)
1856+
1857+#define sfence() __asm__ __volatile__ ("sfence\n\t")
1858+
1859+/* SSE2: aliases built on the same mmx_* emitter macros; the pshuf* forms take an immediate and use mmx_m2ri/mmx_r2ri. */
1860+#define pshufhw_m2r(var,reg,imm) mmx_m2ri(pshufhw, var, reg, imm)
1861+#define pshufhw_r2r(regs,regd,imm) mmx_r2ri(pshufhw, regs, regd, imm)
1862+#define pshuflw_m2r(var,reg,imm) mmx_m2ri(pshuflw, var, reg, imm)
1863+#define pshuflw_r2r(regs,regd,imm) mmx_r2ri(pshuflw, regs, regd, imm)
1864+
1865+#define pshufd_r2r(regs,regd,imm) mmx_r2ri(pshufd, regs, regd, imm)
1866+
1867+#define movdqa_m2r(var,reg) mmx_m2r (movdqa, var, reg)
1868+#define movdqa_r2m(reg,var) mmx_r2m (movdqa, reg, var)
1869+#define movdqa_r2r(regs,regd) mmx_r2r (movdqa, regs, regd)
1870+#define movdqu_m2r(var,reg) mmx_m2r (movdqu, var, reg)
1871+#define movdqu_r2m(reg,var) mmx_r2m (movdqu, reg, var)
1872+#define movdqu_r2r(regs,regd) mmx_r2r (movdqu, regs, regd)
1873+/* NOTE(review): pmullw with a memory destination (r2m) looks suspect - confirm this form assembles before relying on it. */
1874+#define pmullw_r2m(reg,var) mmx_r2m (pmullw, reg, var)
1875+
1876+#define pslldq_i2r(imm,reg) mmx_i2r (pslldq, imm, reg)
1877+#define psrldq_i2r(imm,reg) mmx_i2r (psrldq, imm, reg)
1878+
1879+#define punpcklqdq_r2r(regs,regd) mmx_r2r (punpcklqdq, regs, regd)
1880+#define punpckhqdq_r2r(regs,regd) mmx_r2r (punpckhqdq, regs, regd)
1881+
1882+
1883+#endif /* AVCODEC_I386MMX_H */
1884--
18851.6.0.2
1886