]> git.ipfire.org Git - ipfire-2.x.git/blob - src/patches/suse-2.6.27.31/patches.drivers/0016-Staging-add-echo-cancelation-module.patch
Add a patch to fix Intel E100 wake-on-lan problems.
[ipfire-2.x.git] / src / patches / suse-2.6.27.31 / patches.drivers / 0016-Staging-add-echo-cancelation-module.patch
1 From 10602db812fa270fc923f5e48fb47202288828f3 Mon Sep 17 00:00:00 2001
2 From: David Rowe <david@rowetel.com>
3 Date: Mon, 6 Oct 2008 21:41:46 -0700
4 Subject: [PATCH 16/23] Staging: add echo cancelation module
5 Patch-mainline: 2.6.28
6
7 This is used by mISDN and Zaptel drivers.
8
9 From: Steve Underwood <steveu@coppice.org>
10 From: David Rowe <david@rowetel.com>
11 Cc: Tzafrir Cohen <tzafrir.cohen@xorcom.com>
12 Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
13 ---
14 drivers/staging/Kconfig | 2 +
15 drivers/staging/Makefile | 1 +
16 drivers/staging/echo/Kconfig | 9 +
17 drivers/staging/echo/Makefile | 1 +
18 drivers/staging/echo/TODO | 10 +
19 drivers/staging/echo/bit_operations.h | 253 +++++++++++++
20 drivers/staging/echo/echo.c | 632 +++++++++++++++++++++++++++++++++
21 drivers/staging/echo/echo.h | 220 ++++++++++++
22 drivers/staging/echo/fir.h | 369 +++++++++++++++++++
23 drivers/staging/echo/mmx.h | 288 +++++++++++++++
24 10 files changed, 1785 insertions(+), 0 deletions(-)
25 create mode 100644 drivers/staging/echo/Kconfig
26 create mode 100644 drivers/staging/echo/Makefile
27 create mode 100644 drivers/staging/echo/TODO
28 create mode 100644 drivers/staging/echo/bit_operations.h
29 create mode 100644 drivers/staging/echo/echo.c
30 create mode 100644 drivers/staging/echo/echo.h
31 create mode 100644 drivers/staging/echo/fir.h
32 create mode 100644 drivers/staging/echo/mmx.h
33
34 diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
35 index 762b471..25338b7 100644
36 --- a/drivers/staging/Kconfig
37 +++ b/drivers/staging/Kconfig
38 @@ -39,4 +39,6 @@ source "drivers/staging/winbond/Kconfig"
39
40 source "drivers/staging/wlan-ng/Kconfig"
41
42 +source "drivers/staging/echo/Kconfig"
43 +
44 endif # STAGING
45 diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
46 index 5741984..93decb8 100644
47 --- a/drivers/staging/Makefile
48 +++ b/drivers/staging/Makefile
49 @@ -8,3 +8,4 @@ obj-$(CONFIG_VIDEO_GO7007) += go7007/
50 obj-$(CONFIG_USB_IP_COMMON) += usbip/
51 obj-$(CONFIG_W35UND) += winbond/
52 obj-$(CONFIG_PRISM2_USB) += wlan-ng/
53 +obj-$(CONFIG_ECHO) += echo/
54 diff --git a/drivers/staging/echo/Kconfig b/drivers/staging/echo/Kconfig
55 new file mode 100644
56 index 0000000..f1d41ea
57 --- /dev/null
58 +++ b/drivers/staging/echo/Kconfig
59 @@ -0,0 +1,9 @@
60 +config ECHO
61 + tristate "Line Echo Canceller support"
62 + default n
63 + ---help---
64 + This driver provides line echo cancelling support for mISDN and
65 + Zaptel drivers.
66 +
67 + To compile this driver as a module, choose M here. The module
68 + will be called echo.
69 diff --git a/drivers/staging/echo/Makefile b/drivers/staging/echo/Makefile
70 new file mode 100644
71 index 0000000..7d4caac
72 --- /dev/null
73 +++ b/drivers/staging/echo/Makefile
74 @@ -0,0 +1 @@
75 +obj-$(CONFIG_ECHO) += echo.o
76 diff --git a/drivers/staging/echo/TODO b/drivers/staging/echo/TODO
77 new file mode 100644
78 index 0000000..1ca09af
79 --- /dev/null
80 +++ b/drivers/staging/echo/TODO
81 @@ -0,0 +1,10 @@
82 +TODO:
83 + - checkpatch.pl cleanups
84 + - Lindent
85 + - typedef removals
86 + - handle bit_operations.h (merge in or make part of common code?)
87 + - remove proc interface, only use echo.h interface (proc interface is
88 + racy and not correct.)
89 +
90 +Please send patches to Greg Kroah-Hartman <greg@kroah.com> and Cc: Steve
91 +Underwood <steveu@coppice.org> and David Rowe <david@rowetel.com>
92 diff --git a/drivers/staging/echo/bit_operations.h b/drivers/staging/echo/bit_operations.h
93 new file mode 100644
94 index 0000000..b32f4bf
95 --- /dev/null
96 +++ b/drivers/staging/echo/bit_operations.h
97 @@ -0,0 +1,253 @@
98 +/*
99 + * SpanDSP - a series of DSP components for telephony
100 + *
101 + * bit_operations.h - Various bit level operations, such as bit reversal
102 + *
103 + * Written by Steve Underwood <steveu@coppice.org>
104 + *
105 + * Copyright (C) 2006 Steve Underwood
106 + *
107 + * All rights reserved.
108 + *
109 + * This program is free software; you can redistribute it and/or modify
110 + * it under the terms of the GNU General Public License version 2, as
111 + * published by the Free Software Foundation.
112 + *
113 + * This program is distributed in the hope that it will be useful,
114 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
115 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
116 + * GNU General Public License for more details.
117 + *
118 + * You should have received a copy of the GNU General Public License
119 + * along with this program; if not, write to the Free Software
120 + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
121 + *
122 + * $Id: bit_operations.h,v 1.11 2006/11/28 15:37:03 steveu Exp $
123 + */
124 +
125 +/*! \file */
126 +
127 +#if !defined(_BIT_OPERATIONS_H_)
128 +#define _BIT_OPERATIONS_H_
129 +
130 +#ifdef __cplusplus
131 +extern "C" {
132 +#endif
133 +
134 +#if defined(__i386__) || defined(__x86_64__)
135 +/*! \brief Find the bit position of the highest set bit in a word
136 + \param bits The word to be searched
137 + \return The bit number of the highest set bit, or -1 if the word is zero. */
138 +static __inline__ int top_bit(unsigned int bits)
139 +{
140 + int res;
141 +
142 + __asm__ (" xorl %[res],%[res];\n"
143 + " decl %[res];\n"
144 + " bsrl %[bits],%[res]\n"
145 + : [res] "=&r" (res)
146 + : [bits] "rm" (bits));
147 + return res;
148 +}
149 +/*- End of function --------------------------------------------------------*/
150 +
151 +/*! \brief Find the bit position of the lowest set bit in a word
152 + \param bits The word to be searched
153 + \return The bit number of the lowest set bit, or -1 if the word is zero. */
154 +static __inline__ int bottom_bit(unsigned int bits)
155 +{
156 + int res;
157 +
158 + __asm__ (" xorl %[res],%[res];\n"
159 + " decl %[res];\n"
160 + " bsfl %[bits],%[res]\n"
161 + : [res] "=&r" (res)
162 + : [bits] "rm" (bits));
163 + return res;
164 +}
165 +/*- End of function --------------------------------------------------------*/
166 +#else
167 +static __inline__ int top_bit(unsigned int bits)
168 +{
169 + int i;
170 +
171 + if (bits == 0)
172 + return -1;
173 + i = 0;
174 + if (bits & 0xFFFF0000)
175 + {
176 + bits &= 0xFFFF0000;
177 + i += 16;
178 + }
179 + if (bits & 0xFF00FF00)
180 + {
181 + bits &= 0xFF00FF00;
182 + i += 8;
183 + }
184 + if (bits & 0xF0F0F0F0)
185 + {
186 + bits &= 0xF0F0F0F0;
187 + i += 4;
188 + }
189 + if (bits & 0xCCCCCCCC)
190 + {
191 + bits &= 0xCCCCCCCC;
192 + i += 2;
193 + }
194 + if (bits & 0xAAAAAAAA)
195 + {
196 + bits &= 0xAAAAAAAA;
197 + i += 1;
198 + }
199 + return i;
200 +}
201 +/*- End of function --------------------------------------------------------*/
202 +
203 +static __inline__ int bottom_bit(unsigned int bits)
204 +{
205 + int i;
206 +
207 + if (bits == 0)
208 + return -1;
209 + i = 32;
210 + if (bits & 0x0000FFFF)
211 + {
212 + bits &= 0x0000FFFF;
213 + i -= 16;
214 + }
215 + if (bits & 0x00FF00FF)
216 + {
217 + bits &= 0x00FF00FF;
218 + i -= 8;
219 + }
220 + if (bits & 0x0F0F0F0F)
221 + {
222 + bits &= 0x0F0F0F0F;
223 + i -= 4;
224 + }
225 + if (bits & 0x33333333)
226 + {
227 + bits &= 0x33333333;
228 + i -= 2;
229 + }
230 + if (bits & 0x55555555)
231 + {
232 + bits &= 0x55555555;
233 + i -= 1;
234 + }
235 + return i;
236 +}
237 +/*- End of function --------------------------------------------------------*/
238 +#endif
239 +
240 +/*! \brief Bit reverse a byte.
241 + \param data The byte to be reversed.
242 + \return The bit reversed version of data. */
243 +static __inline__ uint8_t bit_reverse8(uint8_t x)
244 +{
245 +#if defined(__i386__) || defined(__x86_64__)
246 + /* If multiply is fast */
247 + return ((x*0x0802U & 0x22110U) | (x*0x8020U & 0x88440U))*0x10101U >> 16;
248 +#else
249 + /* If multiply is slow, but we have a barrel shifter */
250 + x = (x >> 4) | (x << 4);
251 + x = ((x & 0xCC) >> 2) | ((x & 0x33) << 2);
252 + return ((x & 0xAA) >> 1) | ((x & 0x55) << 1);
253 +#endif
254 +}
255 +/*- End of function --------------------------------------------------------*/
256 +
257 +/*! \brief Bit reverse a 16 bit word.
258 + \param data The word to be reversed.
259 + \return The bit reversed version of data. */
260 +uint16_t bit_reverse16(uint16_t data);
261 +
262 +/*! \brief Bit reverse a 32 bit word.
263 + \param data The word to be reversed.
264 + \return The bit reversed version of data. */
265 +uint32_t bit_reverse32(uint32_t data);
266 +
267 +/*! \brief Bit reverse each of the four bytes in a 32 bit word.
268 + \param data The word to be reversed.
269 + \return The bit reversed version of data. */
270 +uint32_t bit_reverse_4bytes(uint32_t data);
271 +
272 +/*! \brief Find the number of set bits in a 32 bit word.
273 + \param x The word to be searched.
274 + \return The number of set bits. */
275 +int one_bits32(uint32_t x);
276 +
277 +/*! \brief Create a mask as wide as the number in a 32 bit word.
278 + \param x The word to be searched.
279 + \return The mask. */
280 +uint32_t make_mask32(uint32_t x);
281 +
282 +/*! \brief Create a mask as wide as the number in a 16 bit word.
283 + \param x The word to be searched.
284 + \return The mask. */
285 +uint16_t make_mask16(uint16_t x);
286 +
287 +/*! \brief Find the least significant one in a word, and return a word
288 + with just that bit set.
289 + \param x The word to be searched.
290 + \return The word with the single set bit. */
291 +static __inline__ uint32_t least_significant_one32(uint32_t x)
292 +{
293 + return (x & (-(int32_t) x));
294 +}
295 +/*- End of function --------------------------------------------------------*/
296 +
297 +/*! \brief Find the most significant one in a word, and return a word
298 + with just that bit set.
299 + \param x The word to be searched.
300 + \return The word with the single set bit. */
301 +static __inline__ uint32_t most_significant_one32(uint32_t x)
302 +{
303 +#if defined(__i386__) || defined(__x86_64__)
304 + return 1 << top_bit(x);
305 +#else
306 + x = make_mask32(x);
307 + return (x ^ (x >> 1));
308 +#endif
309 +}
310 +/*- End of function --------------------------------------------------------*/
311 +
312 +/*! \brief Find the parity of a byte.
313 + \param x The byte to be checked.
314 + \return 1 for odd, or 0 for even. */
315 +static __inline__ int parity8(uint8_t x)
316 +{
317 + x = (x ^ (x >> 4)) & 0x0F;
318 + return (0x6996 >> x) & 1;
319 +}
320 +/*- End of function --------------------------------------------------------*/
321 +
322 +/*! \brief Find the parity of a 16 bit word.
323 + \param x The word to be checked.
324 + \return 1 for odd, or 0 for even. */
325 +static __inline__ int parity16(uint16_t x)
326 +{
327 + x ^= (x >> 8);
328 + x = (x ^ (x >> 4)) & 0x0F;
329 + return (0x6996 >> x) & 1;
330 +}
331 +/*- End of function --------------------------------------------------------*/
332 +
333 +/*! \brief Find the parity of a 32 bit word.
334 + \param x The word to be checked.
335 + \return 1 for odd, or 0 for even. */
336 +static __inline__ int parity32(uint32_t x)
337 +{
338 + x ^= (x >> 16);
339 + x ^= (x >> 8);
340 + x = (x ^ (x >> 4)) & 0x0F;
341 + return (0x6996 >> x) & 1;
342 +}
343 +/*- End of function --------------------------------------------------------*/
344 +
345 +#ifdef __cplusplus
346 +}
347 +#endif
348 +
349 +#endif
350 +/*- End of file ------------------------------------------------------------*/
351 diff --git a/drivers/staging/echo/echo.c b/drivers/staging/echo/echo.c
352 new file mode 100644
353 index 0000000..4a281b1
354 --- /dev/null
355 +++ b/drivers/staging/echo/echo.c
356 @@ -0,0 +1,632 @@
357 +/*
358 + * SpanDSP - a series of DSP components for telephony
359 + *
360 + * echo.c - A line echo canceller. This code is being developed
361 + * against and partially complies with G168.
362 + *
363 + * Written by Steve Underwood <steveu@coppice.org>
364 + * and David Rowe <david_at_rowetel_dot_com>
365 + *
366 + * Copyright (C) 2001, 2003 Steve Underwood, 2007 David Rowe
367 + *
368 + * Based on a bit from here, a bit from there, eye of toad, ear of
369 + * bat, 15 years of failed attempts by David and a few fried brain
370 + * cells.
371 + *
372 + * All rights reserved.
373 + *
374 + * This program is free software; you can redistribute it and/or modify
375 + * it under the terms of the GNU General Public License version 2, as
376 + * published by the Free Software Foundation.
377 + *
378 + * This program is distributed in the hope that it will be useful,
379 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
380 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
381 + * GNU General Public License for more details.
382 + *
383 + * You should have received a copy of the GNU General Public License
384 + * along with this program; if not, write to the Free Software
385 + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
386 + *
387 + * $Id: echo.c,v 1.20 2006/12/01 18:00:48 steveu Exp $
388 + */
389 +
390 +/*! \file */
391 +
392 +/* Implementation Notes
393 + David Rowe
394 + April 2007
395 +
396 + This code started life as Steve's NLMS algorithm with a tap
397 + rotation algorithm to handle divergence during double talk. I
398 + added a Geigel Double Talk Detector (DTD) [2] and performed some
399 + G168 tests. However I had trouble meeting the G168 requirements,
400 + especially for double talk - there were always cases where my DTD
401 + failed, for example where near end speech was under the 6dB
402 + threshold required for declaring double talk.
403 +
404 + So I tried a two path algorithm [1], which has so far given better
405 + results. The original tap rotation/Geigel algorithm is available
406 + in SVN http://svn.rowetel.com/software/oslec/tags/before_16bit.
407 + It's probably possible to make it work if some one wants to put some
408 + serious work into it.
409 +
410 + At present no special treatment is provided for tones, which
411 + generally cause NLMS algorithms to diverge. Initial runs of a
412 + subset of the G168 tests for tones (e.g ./echo_test 6) show the
413 + current algorithm is passing OK, which is kind of surprising. The
414 + full set of tests needs to be performed to confirm this result.
415 +
416 + One other interesting change is that I have managed to get the NLMS
417 + code to work with 16 bit coefficients, rather than the original 32
418 + bit coefficients. This reduces the MIPs and storage required.
419 + I evaluated the 16 bit port using g168_tests.sh and listening tests
420 + on 4 real-world samples.
421 +
422 + I also attempted the implementation of a block based NLMS update
423 + [2] but although this passes g168_tests.sh it didn't converge well
424 + on the real-world samples. I have no idea why, perhaps a scaling
425 + problem. The block based code is also available in SVN
426 + http://svn.rowetel.com/software/oslec/tags/before_16bit. If this
427 + code can be debugged, it will lead to further reduction in MIPS, as
428 + the block update code maps nicely onto DSP instruction sets (it's a
429 + dot product) compared to the current sample-by-sample update.
430 +
431 + Steve also has some nice notes on echo cancellers in echo.h
432 +
433 +
434 + References:
435 +
436 + [1] Ochiai, Areseki, and Ogihara, "Echo Canceller with Two Echo
437 + Path Models", IEEE Transactions on communications, COM-25,
438 + No. 6, June
439 + 1977.
440 + http://www.rowetel.com/images/echo/dual_path_paper.pdf
441 +
442 + [2] The classic, very useful paper that tells you how to
443 + actually build a real world echo canceller:
444 + Messerschmitt, Hedberg, Cole, Haoui, Winship, "Digital Voice
445 + Echo Canceller with a TMS32020",
446 + http://www.rowetel.com/images/echo/spra129.pdf
447 +
448 + [3] I have written a series of blog posts on this work, here is
449 + Part 1: http://www.rowetel.com/blog/?p=18
450 +
451 + [4] The source code http://svn.rowetel.com/software/oslec/
452 +
453 + [5] A nice reference on LMS filters:
454 + http://en.wikipedia.org/wiki/Least_mean_squares_filter
455 +
456 + Credits:
457 +
458 + Thanks to Steve Underwood, Jean-Marc Valin, and Ramakrishnan
459 + Muthukrishnan for their suggestions and email discussions. Thanks
460 + also to those people who collected echo samples for me such as
461 + Mark, Pawel, and Pavel.
462 +*/
463 +
464 +#include <linux/kernel.h> /* We're doing kernel work */
465 +#include <linux/module.h>
466 +#include <linux/kernel.h>
467 +#include <linux/slab.h>
468 +#define malloc(a) kmalloc((a), GFP_KERNEL)
469 +#define free(a) kfree(a)
470 +
471 +#include "bit_operations.h"
472 +#include "echo.h"
473 +
474 +#define MIN_TX_POWER_FOR_ADAPTION 64
475 +#define MIN_RX_POWER_FOR_ADAPTION 64
476 +#define DTD_HANGOVER 600 /* 600 samples, or 75ms */
477 +#define DC_LOG2BETA 3 /* log2() of DC filter Beta */
478 +
479 +/*-----------------------------------------------------------------------*\
480 + FUNCTIONS
481 +\*-----------------------------------------------------------------------*/
482 +
483 +/* adapting coeffs using the traditional stochastic descent (N)LMS algorithm */
484 +
485 +
486 +#ifdef __BLACKFIN_ASM__
487 +static void __inline__ lms_adapt_bg(echo_can_state_t *ec, int clean, int shift)
488 +{
489 + int i, j;
490 + int offset1;
491 + int offset2;
492 + int factor;
493 + int exp;
494 + int16_t *phist;
495 + int n;
496 +
497 + if (shift > 0)
498 + factor = clean << shift;
499 + else
500 + factor = clean >> -shift;
501 +
502 + /* Update the FIR taps */
503 +
504 + offset2 = ec->curr_pos;
505 + offset1 = ec->taps - offset2;
506 + phist = &ec->fir_state_bg.history[offset2];
507 +
508 + /* st: and en: help us locate the assembler in echo.s */
509 +
510 + //asm("st:");
511 + n = ec->taps;
512 + for (i = 0, j = offset2; i < n; i++, j++)
513 + {
514 + exp = *phist++ * factor;
515 + ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15);
516 + }
517 + //asm("en:");
518 +
519 + /* Note the asm for the inner loop above generated by Blackfin gcc
520 + 4.1.1 is pretty good (note even parallel instructions used):
521 +
522 + R0 = W [P0++] (X);
523 + R0 *= R2;
524 + R0 = R0 + R3 (NS) ||
525 + R1 = W [P1] (X) ||
526 + nop;
527 + R0 >>>= 15;
528 + R0 = R0 + R1;
529 + W [P1++] = R0;
530 +
531 + A block based update algorithm would be much faster but the
532 + above can't be improved on much. Every instruction saved in
533 + the loop above is 2 MIPs/ch! The for loop above is where the
534 + Blackfin spends most of its time - about 17 MIPs/ch measured
535 + with speedtest.c with 256 taps (32ms). Write-back and
536 + Write-through cache gave about the same performance.
537 + */
538 +}
539 +
540 +/*
541 + IDEAS for further optimisation of lms_adapt_bg():
542 +
543 + 1/ The rounding is quite costly. Could we keep as 32 bit coeffs
544 + then make filter pluck the MS 16-bits of the coeffs when filtering?
545 + However this would lower potential optimisation of filter, as I
546 + think the dual-MAC architecture requires packed 16 bit coeffs.
547 +
548 + 2/ Block based update would be more efficient, as per comments above,
549 + could use dual MAC architecture.
550 +
551 + 3/ Look for same sample Blackfin LMS code, see if we can get dual-MAC
552 + packing.
553 +
554 + 4/ Execute the whole e/c in a block of say 20ms rather than sample
555 + by sample. Processing a few samples every ms is inefficient.
556 +*/
557 +
558 +#else
559 +static __inline__ void lms_adapt_bg(echo_can_state_t *ec, int clean, int shift)
560 +{
561 + int i;
562 +
563 + int offset1;
564 + int offset2;
565 + int factor;
566 + int exp;
567 +
568 + if (shift > 0)
569 + factor = clean << shift;
570 + else
571 + factor = clean >> -shift;
572 +
573 + /* Update the FIR taps */
574 +
575 + offset2 = ec->curr_pos;
576 + offset1 = ec->taps - offset2;
577 +
578 + for (i = ec->taps - 1; i >= offset1; i--)
579 + {
580 + exp = (ec->fir_state_bg.history[i - offset1]*factor);
581 + ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15);
582 + }
583 + for ( ; i >= 0; i--)
584 + {
585 + exp = (ec->fir_state_bg.history[i + offset2]*factor);
586 + ec->fir_taps16[1][i] += (int16_t) ((exp+(1<<14)) >> 15);
587 + }
588 +}
589 +#endif
590 +
591 +/*- End of function --------------------------------------------------------*/
592 +
593 +echo_can_state_t *echo_can_create(int len, int adaption_mode)
594 +{
595 + echo_can_state_t *ec;
596 + int i;
597 + int j;
598 +
599 + ec = kmalloc(sizeof(*ec), GFP_KERNEL);
600 + if (ec == NULL)
601 + return NULL;
602 + memset(ec, 0, sizeof(*ec));
603 +
604 + ec->taps = len;
605 + ec->log2taps = top_bit(len);
606 + ec->curr_pos = ec->taps - 1;
607 +
608 + for (i = 0; i < 2; i++)
609 + {
610 + if ((ec->fir_taps16[i] = (int16_t *) malloc((ec->taps)*sizeof(int16_t))) == NULL)
611 + {
612 + for (j = 0; j < i; j++)
613 + kfree(ec->fir_taps16[j]);
614 + kfree(ec);
615 + return NULL;
616 + }
617 + memset(ec->fir_taps16[i], 0, (ec->taps)*sizeof(int16_t));
618 + }
619 +
620 + fir16_create(&ec->fir_state,
621 + ec->fir_taps16[0],
622 + ec->taps);
623 + fir16_create(&ec->fir_state_bg,
624 + ec->fir_taps16[1],
625 + ec->taps);
626 +
627 + for(i=0; i<5; i++) {
628 + ec->xvtx[i] = ec->yvtx[i] = ec->xvrx[i] = ec->yvrx[i] = 0;
629 + }
630 +
631 + ec->cng_level = 1000;
632 + echo_can_adaption_mode(ec, adaption_mode);
633 +
634 + ec->snapshot = (int16_t*)malloc(ec->taps*sizeof(int16_t));
635 + memset(ec->snapshot, 0, sizeof(int16_t)*ec->taps);
636 +
637 + ec->cond_met = 0;
638 + ec->Pstates = 0;
639 + ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
640 + ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
641 + ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
642 + ec->Lbgn = ec->Lbgn_acc = 0;
643 + ec->Lbgn_upper = 200;
644 + ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
645 +
646 + return ec;
647 +}
648 +/*- End of function --------------------------------------------------------*/
649 +
650 +void echo_can_free(echo_can_state_t *ec)
651 +{
652 + int i;
653 +
654 + fir16_free(&ec->fir_state);
655 + fir16_free(&ec->fir_state_bg);
656 + for (i = 0; i < 2; i++)
657 + kfree(ec->fir_taps16[i]);
658 + kfree(ec->snapshot);
659 + kfree(ec);
660 +}
661 +/*- End of function --------------------------------------------------------*/
662 +
663 +void echo_can_adaption_mode(echo_can_state_t *ec, int adaption_mode)
664 +{
665 + ec->adaption_mode = adaption_mode;
666 +}
667 +/*- End of function --------------------------------------------------------*/
668 +
669 +void echo_can_flush(echo_can_state_t *ec)
670 +{
671 + int i;
672 +
673 + ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
674 + ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
675 + ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
676 +
677 + ec->Lbgn = ec->Lbgn_acc = 0;
678 + ec->Lbgn_upper = 200;
679 + ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
680 +
681 + ec->nonupdate_dwell = 0;
682 +
683 + fir16_flush(&ec->fir_state);
684 + fir16_flush(&ec->fir_state_bg);
685 + ec->fir_state.curr_pos = ec->taps - 1;
686 + ec->fir_state_bg.curr_pos = ec->taps - 1;
687 + for (i = 0; i < 2; i++)
688 + memset(ec->fir_taps16[i], 0, ec->taps*sizeof(int16_t));
689 +
690 + ec->curr_pos = ec->taps - 1;
691 + ec->Pstates = 0;
692 +}
693 +/*- End of function --------------------------------------------------------*/
694 +
695 +void echo_can_snapshot(echo_can_state_t *ec) {
696 + memcpy(ec->snapshot, ec->fir_taps16[0], ec->taps*sizeof(int16_t));
697 +}
698 +/*- End of function --------------------------------------------------------*/
699 +
700 +/* Dual Path Echo Canceller ------------------------------------------------*/
701 +
702 +int16_t echo_can_update(echo_can_state_t *ec, int16_t tx, int16_t rx)
703 +{
704 + int32_t echo_value;
705 + int clean_bg;
706 + int tmp, tmp1;
707 +
708 + /* Input scaling was found to be required to prevent problems when tx
709 + starts clipping. Another possible way to handle this would be the
710 + filter coefficient scaling. */
711 +
712 + ec->tx = tx; ec->rx = rx;
713 + tx >>=1;
714 + rx >>=1;
715 +
716 + /*
717 + Filter DC, 3dB point is 160Hz (I think), note 32 bit precision required
718 + otherwise values do not track down to 0. Zero at DC, Pole at (1-Beta)
719 + only real axis. Some chip sets (like Si labs) don't need
720 + this, but something like a $10 X100P card does. Any DC really slows
721 + down convergence.
722 +
723 + Note: removes some low frequency from the signal, this reduces
724 + the speech quality when listening to samples through headphones
725 + but may not be obvious through a telephone handset.
726 +
727 + Note that the 3dB frequency in radians is approx Beta, e.g. for
728 + Beta = 2^(-3) = 0.125, 3dB freq is 0.125 rads = 159Hz.
729 + */
730 +
731 + if (ec->adaption_mode & ECHO_CAN_USE_RX_HPF) {
732 + tmp = rx << 15;
733 +#if 1
734 + /* Make sure the gain of the HPF is 1.0. This can still saturate a little under
735 + impulse conditions, and it might roll to 32768 and need clipping on sustained peak
736 + level signals. However, the scale of such clipping is small, and the error due to
737 + any saturation should not markedly affect the downstream processing. */
738 + tmp -= (tmp >> 4);
739 +#endif
740 + ec->rx_1 += -(ec->rx_1>>DC_LOG2BETA) + tmp - ec->rx_2;
741 +
742 + /* hard limit filter to prevent clipping. Note that at this stage
743 + rx should be limited to +/- 16383 due to right shift above */
744 + tmp1 = ec->rx_1 >> 15;
745 + if (tmp1 > 16383) tmp1 = 16383;
746 + if (tmp1 < -16383) tmp1 = -16383;
747 + rx = tmp1;
748 + ec->rx_2 = tmp;
749 + }
750 +
751 + /* Block average of power in the filter states. Used for
752 + adaption power calculation. */
753 +
754 + {
755 + int new, old;
756 +
757 + /* efficient "out with the old and in with the new" algorithm so
758 + we don't have to recalculate over the whole block of
759 + samples. */
760 + new = (int)tx * (int)tx;
761 + old = (int)ec->fir_state.history[ec->fir_state.curr_pos] *
762 + (int)ec->fir_state.history[ec->fir_state.curr_pos];
763 + ec->Pstates += ((new - old) + (1<<ec->log2taps)) >> ec->log2taps;
764 + if (ec->Pstates < 0) ec->Pstates = 0;
765 + }
766 +
767 + /* Calculate short term average levels using simple single pole IIRs */
768 +
769 + ec->Ltxacc += abs(tx) - ec->Ltx;
770 + ec->Ltx = (ec->Ltxacc + (1<<4)) >> 5;
771 + ec->Lrxacc += abs(rx) - ec->Lrx;
772 + ec->Lrx = (ec->Lrxacc + (1<<4)) >> 5;
773 +
774 + /* Foreground filter ---------------------------------------------------*/
775 +
776 + ec->fir_state.coeffs = ec->fir_taps16[0];
777 + echo_value = fir16(&ec->fir_state, tx);
778 + ec->clean = rx - echo_value;
779 + ec->Lcleanacc += abs(ec->clean) - ec->Lclean;
780 + ec->Lclean = (ec->Lcleanacc + (1<<4)) >> 5;
781 +
782 + /* Background filter ---------------------------------------------------*/
783 +
784 + echo_value = fir16(&ec->fir_state_bg, tx);
785 + clean_bg = rx - echo_value;
786 + ec->Lclean_bgacc += abs(clean_bg) - ec->Lclean_bg;
787 + ec->Lclean_bg = (ec->Lclean_bgacc + (1<<4)) >> 5;
788 +
789 + /* Background Filter adaption -----------------------------------------*/
790 +
791 + /* Almost always adapt the bg filter, just simple DT and energy
792 + detection to minimise adaption in cases of strong double talk.
793 + However this is not critical for the dual path algorithm.
794 + */
795 + ec->factor = 0;
796 + ec->shift = 0;
797 + if ((ec->nonupdate_dwell == 0)) {
798 + int P, logP, shift;
799 +
800 + /* Determine:
801 +
802 + f = Beta * clean_bg_rx/P ------ (1)
803 +
804 + where P is the total power in the filter states.
805 +
806 + The Boffins have shown that if we obey (1) we converge
807 + quickly and avoid instability.
808 +
809 + The correct factor f must be in Q30, as this is the fixed
810 + point format required by the lms_adapt_bg() function,
811 + therefore the scaled version of (1) is:
812 +
813 + (2^30) * f = (2^30) * Beta * clean_bg_rx/P
814 + factor = (2^30) * Beta * clean_bg_rx/P ----- (2)
815 +
816 + We have chosen Beta = 0.25 by experiment, so:
817 +
818 + factor = (2^30) * (2^-2) * clean_bg_rx/P
819 +
820 + (30 - 2 - log2(P))
821 + factor = clean_bg_rx 2 ----- (3)
822 +
823 + To avoid a divide we approximate log2(P) as top_bit(P),
824 + which returns the position of the highest non-zero bit in
825 + P. This approximation introduces an error as large as a
826 + factor of 2, but the algorithm seems to handle it OK.
827 +
828 + Come to think of it a divide may not be a big deal on a
829 + modern DSP, so it's probably worth checking out the cycles
830 + for a divide versus a top_bit() implementation.
831 + */
832 +
833 + P = MIN_TX_POWER_FOR_ADAPTION + ec->Pstates;
834 + logP = top_bit(P) + ec->log2taps;
835 + shift = 30 - 2 - logP;
836 + ec->shift = shift;
837 +
838 + lms_adapt_bg(ec, clean_bg, shift);
839 + }
840 +
841 + /* very simple DTD to make sure we don't try and adapt with strong
842 + near end speech */
843 +
844 + ec->adapt = 0;
845 + if ((ec->Lrx > MIN_RX_POWER_FOR_ADAPTION) && (ec->Lrx > ec->Ltx))
846 + ec->nonupdate_dwell = DTD_HANGOVER;
847 + if (ec->nonupdate_dwell)
848 + ec->nonupdate_dwell--;
849 +
850 + /* Transfer logic ------------------------------------------------------*/
851 +
852 + /* These conditions are from the dual path paper [1], I messed with
853 + them a bit to improve performance. */
854 +
855 + if ((ec->adaption_mode & ECHO_CAN_USE_ADAPTION) &&
856 + (ec->nonupdate_dwell == 0) &&
857 + (8*ec->Lclean_bg < 7*ec->Lclean) /* (ec->Lclean_bg < 0.875*ec->Lclean) */ &&
858 + (8*ec->Lclean_bg < ec->Ltx) /* (ec->Lclean_bg < 0.125*ec->Ltx) */ )
859 + {
860 + if (ec->cond_met == 6) {
861 + /* BG filter has had better results for 6 consecutive samples */
862 + ec->adapt = 1;
863 + memcpy(ec->fir_taps16[0], ec->fir_taps16[1], ec->taps*sizeof(int16_t));
864 + }
865 + else
866 + ec->cond_met++;
867 + }
868 + else
869 + ec->cond_met = 0;
870 +
871 + /* Non-Linear Processing ---------------------------------------------------*/
872 +
873 + ec->clean_nlp = ec->clean;
874 + if (ec->adaption_mode & ECHO_CAN_USE_NLP)
875 + {
876 + /* Non-linear processor - a fancy way to say "zap small signals, to avoid
877 + residual echo due to (uLaw/ALaw) non-linearity in the channel.". */
878 +
879 + if ((16*ec->Lclean < ec->Ltx))
880 + {
881 + /* Our e/c has improved echo by at least 24 dB (each factor of 2 is 6dB,
882 + so 2*2*2*2=16 is the same as 6+6+6+6=24dB) */
883 + if (ec->adaption_mode & ECHO_CAN_USE_CNG)
884 + {
885 + ec->cng_level = ec->Lbgn;
886 +
887 + /* Very elementary comfort noise generation. Just random
888 + numbers rolled off very vaguely Hoth-like. DR: This
889 + noise doesn't sound quite right to me - I suspect there
890 + are some overflow issues in the filtering as it's too
891 + "crackly". TODO: debug this, maybe just play noise at
892 + high level or look at spectrum.
893 + */
894 +
895 + ec->cng_rndnum = 1664525U*ec->cng_rndnum + 1013904223U;
896 + ec->cng_filter = ((ec->cng_rndnum & 0xFFFF) - 32768 + 5*ec->cng_filter) >> 3;
897 + ec->clean_nlp = (ec->cng_filter*ec->cng_level*8) >> 14;
898 +
899 + }
900 + else if (ec->adaption_mode & ECHO_CAN_USE_CLIP)
901 + {
902 + /* This sounds much better than CNG */
903 + if (ec->clean_nlp > ec->Lbgn)
904 + ec->clean_nlp = ec->Lbgn;
905 + if (ec->clean_nlp < -ec->Lbgn)
906 + ec->clean_nlp = -ec->Lbgn;
907 + }
908 + else
909 + {
910 + /* just mute the residual, doesn't sound very good, used mainly
911 + in G168 tests */
912 + ec->clean_nlp = 0;
913 + }
914 + }
915 + else {
916 + /* Background noise estimator. I tried a few algorithms
917 + here without much luck. This very simple one seems to
918 + work best, we just average the level using a slow (1 sec
919 + time const) filter if the current level is less than a
920 + (experimentally derived) constant. This means we don't
921 + include high level signals like near end speech. When
922 + combined with CNG or especially CLIP seems to work OK.
923 + */
924 + if (ec->Lclean < 40) {
925 + ec->Lbgn_acc += abs(ec->clean) - ec->Lbgn;
926 + ec->Lbgn = (ec->Lbgn_acc + (1<<11)) >> 12;
927 + }
928 + }
929 + }
930 +
931 + /* Roll around the taps buffer */
932 + if (ec->curr_pos <= 0)
933 + ec->curr_pos = ec->taps;
934 + ec->curr_pos--;
935 +
936 + if (ec->adaption_mode & ECHO_CAN_DISABLE)
937 + ec->clean_nlp = rx;
938 +
939 + /* Output scaled back up again to match input scaling */
940 +
941 + return (int16_t) ec->clean_nlp << 1;
942 +}
943 +
944 +/*- End of function --------------------------------------------------------*/
945 +
946 +/* This function is separated from the echo canceller as it is usually called
947 + as part of the tx process. See rx HP (DC blocking) filter above, it's
948 + the same design.
949 +
950 + Some soft phones send speech signals with a lot of low frequency
951 + energy, e.g. down to 20Hz. This can make the hybrid non-linear
952 + which causes the echo canceller to fall over. This filter can help
953 + by removing any low frequency before it gets to the tx port of the
954 + hybrid.
955 +
956 + It can also help by removing any DC in the tx signal. DC is bad
957 + for LMS algorithms.
958 +
959 + This is one of the classic DC removal filters, adjusted to provide sufficient
960 + bass rolloff to meet the above requirement to protect hybrids from things that
961 + upset them. The difference between successive samples produces a lousy HPF, and
962 + then a suitably placed pole flattens things out. The final result is a nicely
963 + rolled off bass end. The filtering is implemented with extended fractional
964 + precision, which noise shapes things, giving very clean DC removal.
965 +*/
966 +
967 +int16_t echo_can_hpf_tx(echo_can_state_t *ec, int16_t tx) {
968 + int tmp, tmp1;
969 +
970 + if (ec->adaption_mode & ECHO_CAN_USE_TX_HPF) {
971 + tmp = tx << 15;
972 +#if 1
973 + /* Make sure the gain of the HPF is 1.0. The first can still saturate a little under
974 + impulse conditions, and it might roll to 32768 and need clipping on sustained peak
975 + level signals. However, the scale of such clipping is small, and the error due to
976 + any saturation should not markedly affect the downstream processing. */
977 + tmp -= (tmp >> 4);
978 +#endif
979 + ec->tx_1 += -(ec->tx_1>>DC_LOG2BETA) + tmp - ec->tx_2;
980 + tmp1 = ec->tx_1 >> 15;
981 + if (tmp1 > 32767) tmp1 = 32767;
982 + if (tmp1 < -32767) tmp1 = -32767;
983 + tx = tmp1;
984 + ec->tx_2 = tmp;
985 + }
986 +
987 + return tx;
988 +}
989 diff --git a/drivers/staging/echo/echo.h b/drivers/staging/echo/echo.h
990 new file mode 100644
991 index 0000000..7a91b43
992 --- /dev/null
993 +++ b/drivers/staging/echo/echo.h
994 @@ -0,0 +1,220 @@
995 +/*
996 + * SpanDSP - a series of DSP components for telephony
997 + *
998 + * echo.h - A line echo canceller. This code is being developed
999 + * against and partially complies with G168.
1000 + *
1001 + * Written by Steve Underwood <steveu@coppice.org>
1002 + * and David Rowe <david_at_rowetel_dot_com>
1003 + *
1004 + * Copyright (C) 2001 Steve Underwood and 2007 David Rowe
1005 + *
1006 + * All rights reserved.
1007 + *
1008 + * This program is free software; you can redistribute it and/or modify
1009 + * it under the terms of the GNU General Public License version 2, as
1010 + * published by the Free Software Foundation.
1011 + *
1012 + * This program is distributed in the hope that it will be useful,
1013 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1014 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1015 + * GNU General Public License for more details.
1016 + *
1017 + * You should have received a copy of the GNU General Public License
1018 + * along with this program; if not, write to the Free Software
1019 + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1020 + *
1021 + * $Id: echo.h,v 1.9 2006/10/24 13:45:28 steveu Exp $
1022 + */
1023 +
1024 +#ifndef __ECHO_H
1025 +#define __ECHO_H
1026 +
1027 +/*! \page echo_can_page Line echo cancellation for voice
1028 +
1029 +\section echo_can_page_sec_1 What does it do?
1030 +This module aims to provide G.168-2002 compliant echo cancellation, to remove
1031 +electrical echoes (e.g. from 2-4 wire hybrids) from voice calls.
1032 +
1033 +\section echo_can_page_sec_2 How does it work?
1034 +The heart of the echo canceller is an FIR filter. This is adapted to match the
1035 +echo impulse response of the telephone line. It must be long enough to
1036 +adequately cover the duration of that impulse response. The signal transmitted
1037 +to the telephone line is passed through the FIR filter. Once the FIR is
1038 +properly adapted, the resulting output is an estimate of the echo signal
1039 +received from the line. This is subtracted from the received signal. The result
1040 +is an estimate of the signal which originated at the far end of the line, free
1041 +from echoes of our own transmitted signal.
1042 +
1043 +The least mean squares (LMS) algorithm is attributed to Widrow and Hoff, and
1044 +was introduced in 1960. It is the commonest form of filter adaption used in
1045 +things like modem line equalisers and line echo cancellers. There it works very
1046 +well. However, it only works well for signals of constant amplitude. It works
1047 +very poorly for things like speech echo cancellation, where the signal level
1048 +varies widely. This is quite easy to fix. If the signal level is normalised -
1049 +similar to applying AGC - LMS can work as well for a signal of varying
1050 +amplitude as it does for a modem signal. This normalised least mean squares
1051 +(NLMS) algorithm is the commonest one used for speech echo cancellation. Many
1052 +other algorithms exist - e.g. RLS (essentially the same as Kalman filtering),
1053 +FAP, etc. Some perform significantly better than NLMS. However, factors such
1054 +as computational complexity and patents favour the use of NLMS.
1055 +
1056 +A simple refinement to NLMS can improve its performance with speech. NLMS tends
1057 +to adapt best to the strongest parts of a signal. If the signal is white noise,
1058 +the NLMS algorithm works very well. However, speech has more low frequency than
1059 +high frequency content. Pre-whitening (i.e. filtering the signal to flatten its
1060 +spectrum) the echo signal improves the adapt rate for speech, and ensures the
1061 +final residual signal is not heavily biased towards high frequencies. A very
1062 +low complexity filter is adequate for this, so pre-whitening adds little to the
1063 +compute requirements of the echo canceller.
1064 +
1065 +An FIR filter adapted using pre-whitened NLMS performs well, provided certain
1066 +conditions are met:
1067 +
1068 + - The transmitted signal has poor self-correlation.
1069 + - There is no signal being generated within the environment being
1070 + cancelled.
1071 +
1072 +The difficulty is that neither of these can be guaranteed.
1073 +
1074 +If the adaption is performed while transmitting noise (or something fairly
1075 +noise like, such as voice) the adaption works very well. If the adaption is
1076 +performed while transmitting something highly correlative (typically narrow
1077 +band energy such as signalling tones or DTMF), the adaption can go seriously
1078 +wrong. The reason is there is only one solution for the adaption on a near
1079 +random signal - the impulse response of the line. For a repetitive signal,
1080 +there are any number of solutions which converge the adaption, and nothing
1081 +guides the adaption to choose the generalised one. Allowing an untrained
1082 +canceller to converge on this kind of narrowband energy is probably a good thing,
1083 +since at least it cancels the tones. Allowing a well converged canceller to
1084 +continue converging on such energy is just a way to ruin its generalised
1085 +adaption. A narrowband detector is needed, so adaption can be suspended at
1086 +appropriate times.
1087 +
1088 +The adaption process is based on trying to eliminate the received signal. When
1089 +there is any signal from within the environment being cancelled it may upset
1090 +the adaption process. Similarly, if the signal we are transmitting is small,
1091 +noise may dominate and disturb the adaption process. If we can ensure that the
1092 +adaption is only performed when we are transmitting a significant signal level,
1093 +and the environment is not, things will be OK. Clearly, it is easy to tell when
1094 +we are sending a significant signal. Telling, if the environment is generating
1095 +a significant signal, and doing it with sufficient speed that the adaption will
1096 +not have diverged too much before we stop it, is a little harder.
1097 +
1098 +The key problem in detecting when the environment is sourcing significant
1099 +energy is that we must do this very quickly. Given a reasonably long sample of
1100 +the received signal, there are a number of strategies which may be used to
1101 +assess whether that signal contains a strong far end component. However, by the
1102 +time that assessment is complete the far end signal will have already caused
1103 +major mis-convergence in the adaption process. An assessment algorithm is
1104 +needed which produces a fairly accurate result from a very short burst of far
1105 +end energy.
1106 +
1107 +\section echo_can_page_sec_3 How do I use it?
1108 +The echo canceller processes both the transmit and receive streams sample by
1109 +sample. The processing function is not declared inline. Unfortunately,
1110 +cancellation requires many operations per sample, so the call overhead is only
1111 +a minor burden.
1112 +*/
1113 +
1114 +#include "fir.h"
1115 +
1116 +/* Mask bits for the adaption mode */
1117 +#define ECHO_CAN_USE_ADAPTION 0x01
1118 +#define ECHO_CAN_USE_NLP 0x02
1119 +#define ECHO_CAN_USE_CNG 0x04
1120 +#define ECHO_CAN_USE_CLIP 0x08
1121 +#define ECHO_CAN_USE_TX_HPF 0x10
1122 +#define ECHO_CAN_USE_RX_HPF 0x20
1123 +#define ECHO_CAN_DISABLE 0x40
1124 +
1125 +/*!
1126 + G.168 echo canceller descriptor. This defines the working state for a line
1127 + echo canceller.
1128 +*/
1129 +typedef struct
1130 +{
1131 + int16_t tx,rx;
1132 + int16_t clean;
1133 + int16_t clean_nlp;
1134 +
1135 + int nonupdate_dwell;
1136 + int curr_pos;
1137 + int taps;
1138 + int log2taps;
1139 + int adaption_mode;
1140 +
1141 + int cond_met;
1142 + int32_t Pstates;
1143 + int16_t adapt;
1144 + int32_t factor;
1145 + int16_t shift;
1146 +
1147 + /* Average levels and averaging filter states */
1148 + int Ltxacc, Lrxacc, Lcleanacc, Lclean_bgacc;
1149 + int Ltx, Lrx;
1150 + int Lclean;
1151 + int Lclean_bg;
1152 + int Lbgn, Lbgn_acc, Lbgn_upper, Lbgn_upper_acc;
1153 +
1154 + /* foreground and background filter states */
1155 + fir16_state_t fir_state;
1156 + fir16_state_t fir_state_bg;
1157 + int16_t *fir_taps16[2];
1158 +
1159 + /* DC blocking filter states */
1160 + int tx_1, tx_2, rx_1, rx_2;
1161 +
1162 + /* optional High Pass Filter states */
1163 + int32_t xvtx[5], yvtx[5];
1164 + int32_t xvrx[5], yvrx[5];
1165 +
1166 + /* Parameters for the optional Hoth noise generator */
1167 + int cng_level;
1168 + int cng_rndnum;
1169 + int cng_filter;
1170 +
1171 + /* snapshot sample of coeffs used for development */
1172 + int16_t *snapshot;
1173 +} echo_can_state_t;
1174 +
1175 +/*! Create a voice echo canceller context.
1176 + \param len The length of the canceller, in samples.
1177 + \return The new canceller context, or NULL if the canceller could not be created.
1178 +*/
1179 +echo_can_state_t *echo_can_create(int len, int adaption_mode);
1180 +
1181 +/*! Free a voice echo canceller context.
1182 + \param ec The echo canceller context.
1183 +*/
1184 +void echo_can_free(echo_can_state_t *ec);
1185 +
1186 +/*! Flush (reinitialise) a voice echo canceller context.
1187 + \param ec The echo canceller context.
1188 +*/
1189 +void echo_can_flush(echo_can_state_t *ec);
1190 +
1191 +/*! Set the adaption mode of a voice echo canceller context.
1192 + \param ec The echo canceller context.
1193 + \param adaption_mode The mode.
1194 +*/
1195 +void echo_can_adaption_mode(echo_can_state_t *ec, int adaption_mode);
1196 +
1197 +void echo_can_snapshot(echo_can_state_t *ec);
1198 +
1199 +/*! Process a sample through a voice echo canceller.
1200 + \param ec The echo canceller context.
1201 + \param tx The transmitted audio sample.
1202 + \param rx The received audio sample.
1203 + \return The clean (echo cancelled) received sample.
1204 +*/
1205 +int16_t echo_can_update(echo_can_state_t *ec, int16_t tx, int16_t rx);
1206 +
1207 +/*! Process to high pass filter the tx signal.
1208 + \param ec The echo canceller context.
1209 + \param tx The transmitted audio sample.
1210 + \return The HP filtered transmit sample, send this to your D/A.
1211 +*/
1212 +int16_t echo_can_hpf_tx(echo_can_state_t *ec, int16_t tx);
1213 +
1214 +#endif /* __ECHO_H */
1215 diff --git a/drivers/staging/echo/fir.h b/drivers/staging/echo/fir.h
1216 new file mode 100644
1217 index 0000000..e1bfc49
1218 --- /dev/null
1219 +++ b/drivers/staging/echo/fir.h
1220 @@ -0,0 +1,369 @@
1221 +/*
1222 + * SpanDSP - a series of DSP components for telephony
1223 + *
1224 + * fir.h - General telephony FIR routines
1225 + *
1226 + * Written by Steve Underwood <steveu@coppice.org>
1227 + *
1228 + * Copyright (C) 2002 Steve Underwood
1229 + *
1230 + * All rights reserved.
1231 + *
1232 + * This program is free software; you can redistribute it and/or modify
1233 + * it under the terms of the GNU General Public License version 2, as
1234 + * published by the Free Software Foundation.
1235 + *
1236 + * This program is distributed in the hope that it will be useful,
1237 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1238 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1239 + * GNU General Public License for more details.
1240 + *
1241 + * You should have received a copy of the GNU General Public License
1242 + * along with this program; if not, write to the Free Software
1243 + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
1244 + *
1245 + * $Id: fir.h,v 1.8 2006/10/24 13:45:28 steveu Exp $
1246 + */
1247 +
1248 +/*! \page fir_page FIR filtering
1249 +\section fir_page_sec_1 What does it do?
1250 +???.
1251 +
1252 +\section fir_page_sec_2 How does it work?
1253 +???.
1254 +*/
1255 +
1256 +#if !defined(_FIR_H_)
1257 +#define _FIR_H_
1258 +
1259 +/*
1260 + Blackfin NOTES & IDEAS:
1261 +
1262 + A simple dot product function is used to implement the filter. This performs
1263 + just one MAC/cycle which is inefficient but was easy to implement as a first
1264 + pass. The current Blackfin code also uses an unrolled form of the filter
1265 + history to avoid 0 length hardware loop issues. This is wasteful of
1266 + memory.
1267 +
1268 + Ideas for improvement:
1269 +
1270 + 1/ Rewrite filter for dual MAC inner loop. The issue here is handling
1271 + history sample offsets that are 16 bit aligned - the dual MAC needs
1272 + 32 bit alignment. There are some good examples in libbfdsp.
1273 +
1274 + 2/ Use the hardware circular buffer facility to halve memory usage.
1275 +
1276 + 3/ Consider using internal memory.
1277 +
1278 + Using less memory might also improve speed as cache misses will be
1279 + reduced. A drop in MIPs and memory approaching 50% should be
1280 + possible.
1281 +
1282 + The foreground and background filters currently use a total of
1283 + about 10 MIPs/ch as measured with speedtest.c on a 256 TAP echo
1284 + can.
1285 +*/
1286 +
1287 +#if defined(USE_MMX) || defined(USE_SSE2)
1288 +#include "mmx.h"
1289 +#endif
1290 +
1291 +/*!
1292 + 16 bit integer FIR descriptor. This defines the working state for a single
1293 + instance of an FIR filter using 16 bit integer coefficients.
1294 +*/
1295 +typedef struct
1296 +{
1297 + int taps;
1298 + int curr_pos;
1299 + const int16_t *coeffs;
1300 + int16_t *history;
1301 +} fir16_state_t;
1302 +
1303 +/*!
1304 + 32 bit integer FIR descriptor. This defines the working state for a single
1305 + instance of an FIR filter using 32 bit integer coefficients, and filtering
1306 + 16 bit integer data.
1307 +*/
1308 +typedef struct
1309 +{
1310 + int taps;
1311 + int curr_pos;
1312 + const int32_t *coeffs;
1313 + int16_t *history;
1314 +} fir32_state_t;
1315 +
1316 +/*!
1317 + Floating point FIR descriptor. This defines the working state for a single
1318 + instance of an FIR filter using floating point coefficients and data.
1319 +*/
1320 +typedef struct
1321 +{
1322 + int taps;
1323 + int curr_pos;
1324 + const float *coeffs;
1325 + float *history;
1326 +} fir_float_state_t;
1327 +
1328 +#ifdef __cplusplus
1329 +extern "C" {
1330 +#endif
1331 +
1332 +static __inline__ const int16_t *fir16_create(fir16_state_t *fir,
1333 + const int16_t *coeffs,
1334 + int taps)
1335 +{
1336 + fir->taps = taps;
1337 + fir->curr_pos = taps - 1;
1338 + fir->coeffs = coeffs;
1339 +#if defined(USE_MMX) || defined(USE_SSE2) || defined(__BLACKFIN_ASM__)
1340 + if ((fir->history = malloc(2*taps*sizeof(int16_t))))
1341 + memset(fir->history, 0, 2*taps*sizeof(int16_t));
1342 +#else
1343 + if ((fir->history = (int16_t *) malloc(taps*sizeof(int16_t))))
1344 + memset(fir->history, 0, taps*sizeof(int16_t));
1345 +#endif
1346 + return fir->history;
1347 +}
1348 +/*- End of function --------------------------------------------------------*/
1349 +
1350 +static __inline__ void fir16_flush(fir16_state_t *fir)
1351 +{
1352 +#if defined(USE_MMX) || defined(USE_SSE2) || defined(__BLACKFIN_ASM__)
1353 + memset(fir->history, 0, 2*fir->taps*sizeof(int16_t));
1354 +#else
1355 + memset(fir->history, 0, fir->taps*sizeof(int16_t));
1356 +#endif
1357 +}
1358 +/*- End of function --------------------------------------------------------*/
1359 +
1360 +static __inline__ void fir16_free(fir16_state_t *fir)
1361 +{
1362 + free(fir->history);
1363 +}
1364 +/*- End of function --------------------------------------------------------*/
1365 +
1366 +#ifdef __BLACKFIN_ASM__
1367 +static inline int32_t dot_asm(short *x, short *y, int len)
1368 +{
1369 + int dot;
1370 +
1371 + len--;
1372 +
1373 + __asm__
1374 + (
1375 + "I0 = %1;\n\t"
1376 + "I1 = %2;\n\t"
1377 + "A0 = 0;\n\t"
1378 + "R0.L = W[I0++] || R1.L = W[I1++];\n\t"
1379 + "LOOP dot%= LC0 = %3;\n\t"
1380 + "LOOP_BEGIN dot%=;\n\t"
1381 + "A0 += R0.L * R1.L (IS) || R0.L = W[I0++] || R1.L = W[I1++];\n\t"
1382 + "LOOP_END dot%=;\n\t"
1383 + "A0 += R0.L*R1.L (IS);\n\t"
1384 + "R0 = A0;\n\t"
1385 + "%0 = R0;\n\t"
1386 + : "=&d" (dot)
1387 + : "a" (x), "a" (y), "a" (len)
1388 + : "I0", "I1", "A1", "A0", "R0", "R1"
1389 + );
1390 +
1391 + return dot;
1392 +}
1393 +#endif
1394 +/*- End of function --------------------------------------------------------*/
1395 +
1396 +static __inline__ int16_t fir16(fir16_state_t *fir, int16_t sample)
1397 +{
1398 + int32_t y;
1399 +#if defined(USE_MMX)
1400 + int i;
1401 + mmx_t *mmx_coeffs;
1402 + mmx_t *mmx_hist;
1403 +
1404 + fir->history[fir->curr_pos] = sample;
1405 + fir->history[fir->curr_pos + fir->taps] = sample;
1406 +
1407 + mmx_coeffs = (mmx_t *) fir->coeffs;
1408 + mmx_hist = (mmx_t *) &fir->history[fir->curr_pos];
1409 + i = fir->taps;
1410 + pxor_r2r(mm4, mm4);
1411 + /* 8 samples per iteration, so the filter must be a multiple of 8 long. */
1412 + while (i > 0)
1413 + {
1414 + movq_m2r(mmx_coeffs[0], mm0);
1415 + movq_m2r(mmx_coeffs[1], mm2);
1416 + movq_m2r(mmx_hist[0], mm1);
1417 + movq_m2r(mmx_hist[1], mm3);
1418 + mmx_coeffs += 2;
1419 + mmx_hist += 2;
1420 + pmaddwd_r2r(mm1, mm0);
1421 + pmaddwd_r2r(mm3, mm2);
1422 + paddd_r2r(mm0, mm4);
1423 + paddd_r2r(mm2, mm4);
1424 + i -= 8;
1425 + }
1426 + movq_r2r(mm4, mm0);
1427 + psrlq_i2r(32, mm0);
1428 + paddd_r2r(mm0, mm4);
1429 + movd_r2m(mm4, y);
1430 + emms();
1431 +#elif defined(USE_SSE2)
1432 + int i;
1433 + xmm_t *xmm_coeffs;
1434 + xmm_t *xmm_hist;
1435 +
1436 + fir->history[fir->curr_pos] = sample;
1437 + fir->history[fir->curr_pos + fir->taps] = sample;
1438 +
1439 + xmm_coeffs = (xmm_t *) fir->coeffs;
1440 + xmm_hist = (xmm_t *) &fir->history[fir->curr_pos];
1441 + i = fir->taps;
1442 + pxor_r2r(xmm4, xmm4);
1443 + /* 16 samples per iteration, so the filter must be a multiple of 16 long. */
1444 + while (i > 0)
1445 + {
1446 + movdqu_m2r(xmm_coeffs[0], xmm0);
1447 + movdqu_m2r(xmm_coeffs[1], xmm2);
1448 + movdqu_m2r(xmm_hist[0], xmm1);
1449 + movdqu_m2r(xmm_hist[1], xmm3);
1450 + xmm_coeffs += 2;
1451 + xmm_hist += 2;
1452 + pmaddwd_r2r(xmm1, xmm0);
1453 + pmaddwd_r2r(xmm3, xmm2);
1454 + paddd_r2r(xmm0, xmm4);
1455 + paddd_r2r(xmm2, xmm4);
1456 + i -= 16;
1457 + }
1458 + movdqa_r2r(xmm4, xmm0);
1459 + psrldq_i2r(8, xmm0);
1460 + paddd_r2r(xmm0, xmm4);
1461 + movdqa_r2r(xmm4, xmm0);
1462 + psrldq_i2r(4, xmm0);
1463 + paddd_r2r(xmm0, xmm4);
1464 + movd_r2m(xmm4, y);
1465 +#elif defined(__BLACKFIN_ASM__)
1466 + fir->history[fir->curr_pos] = sample;
1467 + fir->history[fir->curr_pos + fir->taps] = sample;
1468 + y = dot_asm((int16_t*)fir->coeffs, &fir->history[fir->curr_pos], fir->taps);
1469 +#else
1470 + int i;
1471 + int offset1;
1472 + int offset2;
1473 +
1474 + fir->history[fir->curr_pos] = sample;
1475 +
1476 + offset2 = fir->curr_pos;
1477 + offset1 = fir->taps - offset2;
1478 + y = 0;
1479 + for (i = fir->taps - 1; i >= offset1; i--)
1480 + y += fir->coeffs[i]*fir->history[i - offset1];
1481 + for ( ; i >= 0; i--)
1482 + y += fir->coeffs[i]*fir->history[i + offset2];
1483 +#endif
1484 + if (fir->curr_pos <= 0)
1485 + fir->curr_pos = fir->taps;
1486 + fir->curr_pos--;
1487 + return (int16_t) (y >> 15);
1488 +}
1489 +/*- End of function --------------------------------------------------------*/
1490 +
1491 +static __inline__ const int16_t *fir32_create(fir32_state_t *fir,
1492 + const int32_t *coeffs,
1493 + int taps)
1494 +{
1495 + fir->taps = taps;
1496 + fir->curr_pos = taps - 1;
1497 + fir->coeffs = coeffs;
1498 + fir->history = (int16_t *) malloc(taps*sizeof(int16_t));
1499 + if (fir->history)
1500 + memset(fir->history, '\0', taps*sizeof(int16_t));
1501 + return fir->history;
1502 +}
1503 +/*- End of function --------------------------------------------------------*/
1504 +
1505 +static __inline__ void fir32_flush(fir32_state_t *fir)
1506 +{
1507 + memset(fir->history, 0, fir->taps*sizeof(int16_t));
1508 +}
1509 +/*- End of function --------------------------------------------------------*/
1510 +
1511 +static __inline__ void fir32_free(fir32_state_t *fir)
1512 +{
1513 + free(fir->history);
1514 +}
1515 +/*- End of function --------------------------------------------------------*/
1516 +
1517 +static __inline__ int16_t fir32(fir32_state_t *fir, int16_t sample)
1518 +{
1519 + int i;
1520 + int32_t y;
1521 + int offset1;
1522 + int offset2;
1523 +
1524 + fir->history[fir->curr_pos] = sample;
1525 + offset2 = fir->curr_pos;
1526 + offset1 = fir->taps - offset2;
1527 + y = 0;
1528 + for (i = fir->taps - 1; i >= offset1; i--)
1529 + y += fir->coeffs[i]*fir->history[i - offset1];
1530 + for ( ; i >= 0; i--)
1531 + y += fir->coeffs[i]*fir->history[i + offset2];
1532 + if (fir->curr_pos <= 0)
1533 + fir->curr_pos = fir->taps;
1534 + fir->curr_pos--;
1535 + return (int16_t) (y >> 15);
1536 +}
1537 +/*- End of function --------------------------------------------------------*/
1538 +
1539 +#ifndef __KERNEL__
1540 +static __inline__ const float *fir_float_create(fir_float_state_t *fir,
1541 + const float *coeffs,
1542 + int taps)
1543 +{
1544 + fir->taps = taps;
1545 + fir->curr_pos = taps - 1;
1546 + fir->coeffs = coeffs;
1547 + fir->history = (float *) malloc(taps*sizeof(float));
1548 + if (fir->history)
1549 + memset(fir->history, '\0', taps*sizeof(float));
1550 + return fir->history;
1551 +}
1552 +/*- End of function --------------------------------------------------------*/
1553 +
1554 +static __inline__ void fir_float_free(fir_float_state_t *fir)
1555 +{
1556 + free(fir->history);
1557 +}
1558 +/*- End of function --------------------------------------------------------*/
1559 +
1560 +static __inline__ int16_t fir_float(fir_float_state_t *fir, int16_t sample)
1561 +{
1562 + int i;
1563 + float y;
1564 + int offset1;
1565 + int offset2;
1566 +
1567 + fir->history[fir->curr_pos] = sample;
1568 +
1569 + offset2 = fir->curr_pos;
1570 + offset1 = fir->taps - offset2;
1571 + y = 0;
1572 + for (i = fir->taps - 1; i >= offset1; i--)
1573 + y += fir->coeffs[i]*fir->history[i - offset1];
1574 + for ( ; i >= 0; i--)
1575 + y += fir->coeffs[i]*fir->history[i + offset2];
1576 + if (fir->curr_pos <= 0)
1577 + fir->curr_pos = fir->taps;
1578 + fir->curr_pos--;
1579 + return (int16_t) y;
1580 +}
1581 +/*- End of function --------------------------------------------------------*/
1582 +#endif
1583 +
1584 +#ifdef __cplusplus
1585 +}
1586 +#endif
1587 +
1588 +#endif
1589 +/*- End of file ------------------------------------------------------------*/
1590 diff --git a/drivers/staging/echo/mmx.h b/drivers/staging/echo/mmx.h
1591 new file mode 100644
1592 index 0000000..b5a3964
1593 --- /dev/null
1594 +++ b/drivers/staging/echo/mmx.h
1595 @@ -0,0 +1,288 @@
1596 +/*
1597 + * mmx.h
1598 + * Copyright (C) 1997-2001 H. Dietz and R. Fisher
1599 + *
1600 + * This file is part of FFmpeg.
1601 + *
1602 + * FFmpeg is free software; you can redistribute it and/or
1603 + * modify it under the terms of the GNU Lesser General Public
1604 + * License as published by the Free Software Foundation; either
1605 + * version 2.1 of the License, or (at your option) any later version.
1606 + *
1607 + * FFmpeg is distributed in the hope that it will be useful,
1608 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1609 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1610 + * Lesser General Public License for more details.
1611 + *
1612 + * You should have received a copy of the GNU Lesser General Public
1613 + * License along with FFmpeg; if not, write to the Free Software
1614 + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
1615 + */
1616 +#ifndef AVCODEC_I386MMX_H
1617 +#define AVCODEC_I386MMX_H
1618 +
1619 +/*
1620 + * The type of a value that fits in an MMX register (note that long
1621 + * long constant values MUST be suffixed by LL and unsigned long long
1622 + * values by ULL, lest they be truncated by the compiler)
1623 + */
1624 +
1625 +typedef union {
1626 + long long q; /* Quadword (64-bit) value */
1627 + unsigned long long uq; /* Unsigned Quadword */
1628 + int d[2]; /* 2 Doubleword (32-bit) values */
1629 + unsigned int ud[2]; /* 2 Unsigned Doubleword */
1630 + short w[4]; /* 4 Word (16-bit) values */
1631 + unsigned short uw[4]; /* 4 Unsigned Word */
1632 + char b[8]; /* 8 Byte (8-bit) values */
1633 + unsigned char ub[8]; /* 8 Unsigned Byte */
1634 + float s[2]; /* Single-precision (32-bit) value */
1635 +} mmx_t; /* On an 8-byte (64-bit) boundary */
1636 +
1637 +/* SSE registers */
1638 +typedef union {
1639 + char b[16];
1640 +} xmm_t;
1641 +
1642 +
1643 +#define mmx_i2r(op,imm,reg) \
1644 + __asm__ __volatile__ (#op " %0, %%" #reg \
1645 + : /* nothing */ \
1646 + : "i" (imm) )
1647 +
1648 +#define mmx_m2r(op,mem,reg) \
1649 + __asm__ __volatile__ (#op " %0, %%" #reg \
1650 + : /* nothing */ \
1651 + : "m" (mem))
1652 +
1653 +#define mmx_r2m(op,reg,mem) \
1654 + __asm__ __volatile__ (#op " %%" #reg ", %0" \
1655 + : "=m" (mem) \
1656 + : /* nothing */ )
1657 +
1658 +#define mmx_r2r(op,regs,regd) \
1659 + __asm__ __volatile__ (#op " %" #regs ", %" #regd)
1660 +
1661 +
1662 +#define emms() __asm__ __volatile__ ("emms")
1663 +
1664 +#define movd_m2r(var,reg) mmx_m2r (movd, var, reg)
1665 +#define movd_r2m(reg,var) mmx_r2m (movd, reg, var)
1666 +#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd)
1667 +
1668 +#define movq_m2r(var,reg) mmx_m2r (movq, var, reg)
1669 +#define movq_r2m(reg,var) mmx_r2m (movq, reg, var)
1670 +#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd)
1671 +
1672 +#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg)
1673 +#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)
1674 +#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg)
1675 +#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)
1676 +
1677 +#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg)
1678 +#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)
1679 +
1680 +#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg)
1681 +#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd)
1682 +#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg)
1683 +#define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd)
1684 +#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg)
1685 +#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd)
1686 +
1687 +#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg)
1688 +#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd)
1689 +#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg)
1690 +#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd)
1691 +
1692 +#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg)
1693 +#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd)
1694 +#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg)
1695 +#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd)
1696 +
1697 +#define pand_m2r(var,reg) mmx_m2r (pand, var, reg)
1698 +#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd)
1699 +
1700 +#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg)
1701 +#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd)
1702 +
1703 +#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg)
1704 +#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd)
1705 +#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg)
1706 +#define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd)
1707 +#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg)
1708 +#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd)
1709 +
1710 +#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg)
1711 +#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd)
1712 +#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg)
1713 +#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd)
1714 +#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg)
1715 +#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd)
1716 +
1717 +#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg)
1718 +#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd)
1719 +
1720 +#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg)
1721 +#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd)
1722 +
1723 +#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg)
1724 +#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd)
1725 +
1726 +#define por_m2r(var,reg) mmx_m2r (por, var, reg)
1727 +#define por_r2r(regs,regd) mmx_r2r (por, regs, regd)
1728 +
1729 +#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg)
1730 +#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg)
1731 +#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd)
1732 +#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg)
1733 +#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg)
1734 +#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd)
1735 +#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg)
1736 +#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg)
1737 +#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd)
1738 +
1739 +#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg)
1740 +#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg)
1741 +#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd)
1742 +#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg)
1743 +#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg)
1744 +#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd)
1745 +
1746 +#define psrld_i2r(imm, reg) mmx_i2r(psrld, imm, reg)
1747 +#define psrld_m2r(var, reg) mmx_m2r(psrld, var, reg)
1748 +#define psrld_r2r(regs, regd) mmx_r2r(psrld, regs, regd)
1749 +#define psrlq_i2r(imm, reg) mmx_i2r(psrlq, imm, reg)
1750 +#define psrlq_m2r(var, reg) mmx_m2r(psrlq, var, reg)
1751 +#define psrlq_r2r(regs, regd) mmx_r2r(psrlq, regs, regd)
1752 +#define psrlw_i2r(imm, reg) mmx_i2r(psrlw, imm, reg)
1753 +#define psrlw_m2r(var, reg) mmx_m2r(psrlw, var, reg)
1754 +#define psrlw_r2r(regs, regd) mmx_r2r(psrlw, regs, regd)
1755 +
1756 +#define psubb_m2r(var, reg) mmx_m2r(psubb, var, reg)
1757 +#define psubb_r2r(regs, regd) mmx_r2r(psubb, regs, regd)
1758 +#define psubd_m2r(var, reg) mmx_m2r(psubd, var, reg)
1759 +#define psubd_r2r(regs, regd) mmx_r2r(psubd, regs, regd)
1760 +#define psubw_m2r(var, reg) mmx_m2r(psubw, var, reg)
1761 +#define psubw_r2r(regs, regd) mmx_r2r(psubw, regs, regd)
1762 +
1763 +#define psubsb_m2r(var, reg) mmx_m2r(psubsb, var, reg)
1764 +#define psubsb_r2r(regs, regd) mmx_r2r(psubsb, regs, regd)
1765 +#define psubsw_m2r(var, reg) mmx_m2r(psubsw, var, reg)
1766 +#define psubsw_r2r(regs, regd) mmx_r2r(psubsw, regs, regd)
1767 +
1768 +#define psubusb_m2r(var, reg) mmx_m2r(psubusb, var, reg)
1769 +#define psubusb_r2r(regs, regd) mmx_r2r(psubusb, regs, regd)
1770 +#define psubusw_m2r(var, reg) mmx_m2r(psubusw, var, reg)
1771 +#define psubusw_r2r(regs, regd) mmx_r2r(psubusw, regs, regd)
1772 +
1773 +#define punpckhbw_m2r(var, reg) mmx_m2r(punpckhbw, var, reg)
1774 +#define punpckhbw_r2r(regs, regd) mmx_r2r(punpckhbw, regs, regd)
1775 +#define punpckhdq_m2r(var, reg) mmx_m2r(punpckhdq, var, reg)
1776 +#define punpckhdq_r2r(regs, regd) mmx_r2r(punpckhdq, regs, regd)
1777 +#define punpckhwd_m2r(var, reg) mmx_m2r(punpckhwd, var, reg)
1778 +#define punpckhwd_r2r(regs, regd) mmx_r2r(punpckhwd, regs, regd)
1779 +
1780 +#define punpcklbw_m2r(var, reg) mmx_m2r(punpcklbw, var, reg)
1781 +#define punpcklbw_r2r(regs, regd) mmx_r2r(punpcklbw, regs, regd)
1782 +#define punpckldq_m2r(var, reg) mmx_m2r(punpckldq, var, reg)
1783 +#define punpckldq_r2r(regs, regd) mmx_r2r(punpckldq, regs, regd)
1784 +#define punpcklwd_m2r(var, reg) mmx_m2r(punpcklwd, var, reg)
1785 +#define punpcklwd_r2r(regs, regd) mmx_r2r(punpcklwd, regs, regd)
1786 +
1787 +#define pxor_m2r(var, reg) mmx_m2r(pxor, var, reg)
1788 +#define pxor_r2r(regs, regd) mmx_r2r(pxor, regs, regd)
1789 +
1790 +
1791 +/* 3DNOW extensions */
1792 +
1793 +#define pavgusb_m2r(var, reg) mmx_m2r(pavgusb, var, reg)
1794 +#define pavgusb_r2r(regs, regd) mmx_r2r(pavgusb, regs, regd)
1795 +
1796 +
1797 +/* AMD MMX extensions - also available in Intel SSE */
1798 +
1799 +/* Immediate-operand helpers: mmx_m2ri emits "op imm, mem, %reg"; mmx_r2ri emits "op imm, %regs, %regd" */
1800 +#define mmx_m2ri(op, mem, reg, imm) \
1801 + __asm__ __volatile__(#op " %1, %0, %%" #reg \
1802 + : /* no outputs */ \
1803 + : "m" (mem), "i" (imm))
1804 +#define mmx_r2ri(op, regs, regd, imm) \
1805 + __asm__ __volatile__(#op " %0, %%" #regs ", %%" #regd \
1806 + : /* no outputs */ \
1807 + : "i" (imm))
1808 +/* Emit "prefetch<hint> mem"; hint is pasted onto the mnemonic (t0, t1, t2, nta below) */
1809 +#define mmx_fetch(mem, hint) \
1810 + __asm__ __volatile__("prefetch" #hint " %0" \
1811 + : /* no outputs */ \
1812 + : "m" (mem))
1813 +
1814 +/* maskmovq takes no immediate: use the 3-argument reg-to-reg helper, not mmx_r2ri */
1815 +#define maskmovq(regs,maskreg) mmx_r2r (maskmovq, regs, maskreg)
1816 +
1817 +#define movntq_r2m(mmreg, var) mmx_r2m(movntq, mmreg, var)
1818 +
1819 +#define pavgb_m2r(var, reg) mmx_m2r(pavgb, var, reg)
1820 +#define pavgb_r2r(regs, regd) mmx_r2r(pavgb, regs, regd)
1821 +#define pavgw_m2r(var, reg) mmx_m2r(pavgw, var, reg)
1822 +#define pavgw_r2r(regs, regd) mmx_r2r(pavgw, regs, regd)
1823 +
1824 +#define pextrw_r2r(mmreg, reg, imm) mmx_r2ri(pextrw, mmreg, reg, imm)
1825 +
1826 +#define pinsrw_r2r(reg, mmreg, imm) mmx_r2ri(pinsrw, reg, mmreg, imm)
1827 +
1828 +#define pmaxsw_m2r(var, reg) mmx_m2r(pmaxsw, var, reg)
1829 +#define pmaxsw_r2r(regs, regd) mmx_r2r(pmaxsw, regs, regd)
1830 +
1831 +#define pmaxub_m2r(var, reg) mmx_m2r(pmaxub, var, reg)
1832 +#define pmaxub_r2r(regs, regd) mmx_r2r(pmaxub, regs, regd)
1833 +
1834 +#define pminsw_m2r(var, reg) mmx_m2r(pminsw, var, reg)
1835 +#define pminsw_r2r(regs, regd) mmx_r2r(pminsw, regs, regd)
1836 +
1837 +#define pminub_m2r(var, reg) mmx_m2r(pminub, var, reg)
1838 +#define pminub_r2r(regs, regd) mmx_r2r(pminub, regs, regd)
1839 +/* Emit the pmovmskb mnemonic itself; the old template issued movmskps instead */
1840 +#define pmovmskb(mmreg,reg) \
1841 + __asm__ __volatile__ ("pmovmskb %" #mmreg ", %" #reg)
1842 +
1843 +#define pmulhuw_m2r(var, reg) mmx_m2r(pmulhuw, var, reg)
1844 +#define pmulhuw_r2r(regs, regd) mmx_r2r(pmulhuw, regs, regd)
1845 +
1846 +#define prefetcht0(mem) mmx_fetch(mem, t0)
1847 +#define prefetcht1(mem) mmx_fetch(mem, t1)
1848 +#define prefetcht2(mem) mmx_fetch(mem, t2)
1849 +#define prefetchnta(mem) mmx_fetch(mem, nta)
1850 +
1851 +#define psadbw_m2r(var, reg) mmx_m2r(psadbw, var, reg)
1852 +#define psadbw_r2r(regs, regd) mmx_r2r(psadbw, regs, regd)
1853 +
1854 +#define pshufw_m2r(var, reg, imm) mmx_m2ri(pshufw, var, reg, imm)
1855 +#define pshufw_r2r(regs, regd, imm) mmx_r2ri(pshufw, regs, regd, imm)
1856 +
1857 +#define sfence() __asm__ __volatile__("sfence\n\t")
1858 +
1859 +/* SSE2 */
1860 +#define pshufhw_m2r(var, reg, imm) mmx_m2ri(pshufhw, var, reg, imm)
1861 +#define pshufhw_r2r(regs, regd, imm) mmx_r2ri(pshufhw, regs, regd, imm)
1862 +#define pshuflw_m2r(var, reg, imm) mmx_m2ri(pshuflw, var, reg, imm)
1863 +#define pshuflw_r2r(regs, regd, imm) mmx_r2ri(pshuflw, regs, regd, imm)
1864 +
1865 +#define pshufd_r2r(regs, regd, imm) mmx_r2ri(pshufd, regs, regd, imm)
1866 +
1867 +#define movdqa_m2r(var, reg) mmx_m2r(movdqa, var, reg)
1868 +#define movdqa_r2m(reg, var) mmx_r2m(movdqa, reg, var)
1869 +#define movdqa_r2r(regs, regd) mmx_r2r(movdqa, regs, regd)
1870 +#define movdqu_m2r(var, reg) mmx_m2r(movdqu, var, reg)
1871 +#define movdqu_r2m(reg, var) mmx_r2m(movdqu, reg, var)
1872 +#define movdqu_r2r(regs, regd) mmx_r2r(movdqu, regs, regd)
1873 +
1874 +#define pmullw_r2m(reg, var) mmx_r2m(pmullw, reg, var)
1875 +
1876 +#define pslldq_i2r(imm, reg) mmx_i2r(pslldq, imm, reg)
1877 +#define psrldq_i2r(imm, reg) mmx_i2r(psrldq, imm, reg)
1878 +
1879 +#define punpcklqdq_r2r(regs, regd) mmx_r2r(punpcklqdq, regs, regd)
1880 +#define punpckhqdq_r2r(regs, regd) mmx_r2r(punpckhqdq, regs, regd)
1881 +
1882 +
1883 +#endif /* AVCODEC_I386MMX_H */
1884 --
1885 1.6.0.2
1886