libavcodec/x86/cavsdsp_mmx.c
/*
 * Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
 * Copyright (c) 2006  Stefan Gehrer <stefan.gehrer@gmx.de>
 *
 * MMX-optimized DSP functions, based on H.264 optimizations by
 * Michael Niedermayer and Loren Merritt
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/cavsdsp.h"
#include "dsputil_mmx.h"

/*****************************************************************************
 *
 * inverse transform
 *
 ****************************************************************************/

static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
{
    __asm__ volatile(
        "movq 112(%0), %%mm4  \n\t" /* mm4 = src7 */
        "movq  16(%0), %%mm5  \n\t" /* mm5 = src1 */
        "movq  80(%0), %%mm2  \n\t" /* mm2 = src5 */
        "movq  48(%0), %%mm7  \n\t" /* mm7 = src3 */
        "movq   %%mm4, %%mm0  \n\t"
        "movq   %%mm5, %%mm3  \n\t"
        "movq   %%mm2, %%mm6  \n\t"
        "movq   %%mm7, %%mm1  \n\t"

        "paddw  %%mm4, %%mm4  \n\t" /* mm4 = 2*src7 */
        "paddw  %%mm3, %%mm3  \n\t" /* mm3 = 2*src1 */
        "paddw  %%mm6, %%mm6  \n\t" /* mm6 = 2*src5 */
        "paddw  %%mm1, %%mm1  \n\t" /* mm1 = 2*src3 */
        "paddw  %%mm4, %%mm0  \n\t" /* mm0 = 3*src7 */
        "paddw  %%mm3, %%mm5  \n\t" /* mm5 = 3*src1 */
        "paddw  %%mm6, %%mm2  \n\t" /* mm2 = 3*src5 */
        "paddw  %%mm1, %%mm7  \n\t" /* mm7 = 3*src3 */
        "psubw  %%mm4, %%mm5  \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */
        "paddw  %%mm6, %%mm7  \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */
        "psubw  %%mm2, %%mm1  \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */
        "paddw  %%mm0, %%mm3  \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */

        "movq   %%mm5, %%mm4  \n\t"
        "movq   %%mm7, %%mm6  \n\t"
        "movq   %%mm3, %%mm0  \n\t"
        "movq   %%mm1, %%mm2  \n\t"
        SUMSUB_BA( %%mm7, %%mm5 )   /* mm7 = a0 + a1  mm5 = a0 - a1 */
        "paddw  %%mm3, %%mm7  \n\t" /* mm7 = a0 + a1 + a3 */
        "paddw  %%mm1, %%mm5  \n\t" /* mm5 = a0 - a1 + a2 */
        "paddw  %%mm7, %%mm7  \n\t"
        "paddw  %%mm5, %%mm5  \n\t"
        "paddw  %%mm6, %%mm7  \n\t" /* mm7 = b4 */
        "paddw  %%mm4, %%mm5  \n\t" /* mm5 = b5 */

        SUMSUB_BA( %%mm1, %%mm3 )   /* mm1 = a3 + a2  mm3 = a3 - a2 */
        "psubw  %%mm1, %%mm4  \n\t" /* mm4 = a0 - a2 - a3 */
        "movq   %%mm4, %%mm1  \n\t" /* mm1 = a0 - a2 - a3 */
        "psubw  %%mm6, %%mm3  \n\t" /* mm3 = a3 - a2 - a1 */
        "paddw  %%mm1, %%mm1  \n\t"
        "paddw  %%mm3, %%mm3  \n\t"
        "psubw  %%mm2, %%mm1  \n\t" /* mm1 = b7 */
        "paddw  %%mm0, %%mm3  \n\t" /* mm3 = b6 */

        "movq  32(%0), %%mm2  \n\t" /* mm2 = src2 */
        "movq  96(%0), %%mm6  \n\t" /* mm6 = src6 */
        "movq   %%mm2, %%mm4  \n\t"
        "movq   %%mm6, %%mm0  \n\t"
        "psllw  $2,    %%mm4  \n\t" /* mm4 = 4*src2 */
        "psllw  $2,    %%mm6  \n\t" /* mm6 = 4*src6 */
        "paddw  %%mm4, %%mm2  \n\t" /* mm2 = 5*src2 */
        "paddw  %%mm6, %%mm0  \n\t" /* mm0 = 5*src6 */
        "paddw  %%mm2, %%mm2  \n\t"
        "paddw  %%mm0, %%mm0  \n\t"
        "psubw  %%mm0, %%mm4  \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */
        "paddw  %%mm2, %%mm6  \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */

        "movq    (%0), %%mm2  \n\t" /* mm2 = src0 */
        "movq  64(%0), %%mm0  \n\t" /* mm0 = src4 */
        SUMSUB_BA( %%mm0, %%mm2 )   /* mm0 = src0+src4  mm2 = src0-src4 */
        "psllw  $3,    %%mm0  \n\t"
        "psllw  $3,    %%mm2  \n\t"
        "paddw  %1,    %%mm0  \n\t" /* add rounding bias */
        "paddw  %1,    %%mm2  \n\t" /* add rounding bias */

        SUMSUB_BA( %%mm6, %%mm0 )   /* mm6 = a4 + a6  mm0 = a4 - a6 */
        SUMSUB_BA( %%mm4, %%mm2 )   /* mm4 = a5 + a7  mm2 = a5 - a7 */
        SUMSUB_BA( %%mm7, %%mm6 )   /* mm7 = dst0  mm6 = dst7 */
        SUMSUB_BA( %%mm5, %%mm4 )   /* mm5 = dst1  mm4 = dst6 */
        SUMSUB_BA( %%mm3, %%mm2 )   /* mm3 = dst2  mm2 = dst5 */
        SUMSUB_BA( %%mm1, %%mm0 )   /* mm1 = dst3  mm0 = dst4 */
        :: "r"(block), "m"(bias)
    );
}
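
/* Plain-C reference of the 1-D transform above, processing one column per
 * iteration; an illustrative sketch only (the function name and the
 * write-back are ours: the asm instead leaves dst0..dst7 in mm0..mm7 and
 * lets the caller do the final shift). bias is the scalar rounding
 * constant, 4 for the first pass and 64 for the second; intermediates are
 * kept in int, so the 16-bit wraparound of the MMX code is not modelled. */
static av_unused void cavs_idct8_1d_ref(int16_t *block, int bias)
{
    int i;
    for (i = 0; i < 8; i++) {
        int s0 = block[0*8+i], s1 = block[1*8+i], s2 = block[2*8+i];
        int s3 = block[3*8+i], s4 = block[4*8+i], s5 = block[5*8+i];
        int s6 = block[6*8+i], s7 = block[7*8+i];
        /* odd part, following the a0..b7 names in the asm comments */
        int a0 = 3*s1 - 2*s7, a1 = 3*s3 + 2*s5;
        int a2 = 2*s3 - 3*s5, a3 = 2*s1 + 3*s7;
        int b4 = 2*(a0 + a1 + a3) + a1;
        int b5 = 2*(a0 - a1 + a2) + a0;
        int b6 = 2*(a3 - a2 - a1) + a3;
        int b7 = 2*(a0 - a2 - a3) - a2;
        /* even part, with the rounding bias folded in */
        int a4 = 8*(s0 + s4) + bias, a6 = 10*s2 +  4*s6;
        int a5 = 8*(s0 - s4) + bias, a7 =  4*s2 - 10*s6;
        block[0*8+i] = b4 + (a4 + a6);  block[7*8+i] = (a4 + a6) - b4;
        block[1*8+i] = b5 + (a5 + a7);  block[6*8+i] = (a5 + a7) - b5;
        block[2*8+i] = b6 + (a5 - a7);  block[5*8+i] = (a5 - a7) - b6;
        block[3*8+i] = b7 + (a4 - a6);  block[4*8+i] = (a4 - a6) - b7;
    }
}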

static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
{
    int i;
    DECLARE_ALIGNED(8, int16_t, b2)[64];

    for(i=0; i<2; i++){
        DECLARE_ALIGNED(8, uint64_t, tmp);

        cavs_idct8_1d(block+4*i, ff_pw_4.a);

        __asm__ volatile(
            "psraw     $3, %%mm7  \n\t"
            "psraw     $3, %%mm6  \n\t"
            "psraw     $3, %%mm5  \n\t"
            "psraw     $3, %%mm4  \n\t"
            "psraw     $3, %%mm3  \n\t"
            "psraw     $3, %%mm2  \n\t"
            "psraw     $3, %%mm1  \n\t"
            "psraw     $3, %%mm0  \n\t"
            "movq   %%mm7,    %0   \n\t"
            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
            "movq   %%mm0,  8(%1)  \n\t"
            "movq   %%mm6, 24(%1)  \n\t"
            "movq   %%mm7, 40(%1)  \n\t"
            "movq   %%mm4, 56(%1)  \n\t"
            "movq    %0,    %%mm7  \n\t"
            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
            "movq   %%mm7,   (%1)  \n\t"
            "movq   %%mm1, 16(%1)  \n\t"
            "movq   %%mm0, 32(%1)  \n\t"
            "movq   %%mm3, 48(%1)  \n\t"
            : "=m"(tmp)
            : "r"(b2+32*i)
            : "memory"
        );
    }

    for(i=0; i<2; i++){
        cavs_idct8_1d(b2+4*i, ff_pw_64.a);

        __asm__ volatile(
            "psraw     $7, %%mm7  \n\t"
            "psraw     $7, %%mm6  \n\t"
            "psraw     $7, %%mm5  \n\t"
            "psraw     $7, %%mm4  \n\t"
            "psraw     $7, %%mm3  \n\t"
            "psraw     $7, %%mm2  \n\t"
            "psraw     $7, %%mm1  \n\t"
            "psraw     $7, %%mm0  \n\t"
            "movq   %%mm7,    (%0)  \n\t"
            "movq   %%mm5,  16(%0)  \n\t"
            "movq   %%mm3,  32(%0)  \n\t"
            "movq   %%mm1,  48(%0)  \n\t"
            "movq   %%mm0,  64(%0)  \n\t"
            "movq   %%mm2,  80(%0)  \n\t"
            "movq   %%mm4,  96(%0)  \n\t"
            "movq   %%mm6, 112(%0)  \n\t"
            :: "r"(b2+4*i)
            : "memory"
        );
    }

    ff_add_pixels_clamped_mmx(b2, dst, stride);
}
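
/* What the final call above does, in plain C (a sketch for reference; the
 * real ff_add_pixels_clamped_mmx lives in dsputil): add the 8x8 residual
 * to the destination with unsigned-byte saturation. */
static av_unused void add_pixels_clamped_ref(const int16_t *block,
                                             uint8_t *dst, int stride)
{
    int x, y;
    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = av_clip_uint8(dst[x] + block[x]);
        block += 8;
        dst   += stride;
    }
}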

/*****************************************************************************
 *
 * motion compensation
 *
 ****************************************************************************/

/* vertical filter [-1 -2 96 42 -7  0]  */
#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \
        "movd (%0), "#F"            \n\t"\
        "movq "#C", %%mm6           \n\t"\
        "pmullw %5, %%mm6           \n\t"\
        "movq "#D", %%mm7           \n\t"\
        "pmullw "MANGLE(MUL2)", %%mm7\n\t"\
        "psllw $3, "#E"             \n\t"\
        "psubw "#E", %%mm6          \n\t"\
        "psraw $3, "#E"             \n\t"\
        "paddw %%mm7, %%mm6         \n\t"\
        "paddw "#E", %%mm6          \n\t"\
        "paddw "#B", "#B"           \n\t"\
        "pxor %%mm7, %%mm7          \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%mm7, "#F"      \n\t"\
        "psubw "#B", %%mm6          \n\t"\
        "psraw $1, "#B"             \n\t"\
        "psubw "#A", %%mm6          \n\t"\
        "paddw %4, %%mm6            \n\t"\
        "psraw $7, %%mm6            \n\t"\
        "packuswb %%mm6, %%mm6      \n\t"\
        OP(%%mm6, (%1), A, d)            \
        "add %3, %1                 \n\t"

/* vertical filter [ 0 -1  5  5 -1  0]  */
#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \
        "movd (%0), "#F"            \n\t"\
        "movq "#C", %%mm6           \n\t"\
        "paddw "#D", %%mm6          \n\t"\
        "pmullw %5, %%mm6           \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%mm7, "#F"      \n\t"\
        "psubw "#B", %%mm6          \n\t"\
        "psubw "#E", %%mm6          \n\t"\
        "paddw %4, %%mm6            \n\t"\
        "psraw $3, %%mm6            \n\t"\
        "packuswb %%mm6, %%mm6      \n\t"\
        OP(%%mm6, (%1), A, d)            \
        "add %3, %1                 \n\t"

/* vertical filter [ 0 -7 42 96 -2 -1]  */
#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \
        "movd (%0), "#F"            \n\t"\
        "movq "#C", %%mm6           \n\t"\
        "pmullw "MANGLE(MUL2)", %%mm6\n\t"\
        "movq "#D", %%mm7           \n\t"\
        "pmullw %5, %%mm7           \n\t"\
        "psllw $3, "#B"             \n\t"\
        "psubw "#B", %%mm6          \n\t"\
        "psraw $3, "#B"             \n\t"\
        "paddw %%mm7, %%mm6         \n\t"\
        "paddw "#B", %%mm6          \n\t"\
        "paddw "#E", "#E"           \n\t"\
        "pxor %%mm7, %%mm7          \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%mm7, "#F"      \n\t"\
        "psubw "#E", %%mm6          \n\t"\
        "psraw $1, "#E"             \n\t"\
        "psubw "#F", %%mm6          \n\t"\
        "paddw %4, %%mm6            \n\t"\
        "psraw $7, %%mm6            \n\t"\
        "packuswb %%mm6, %%mm6      \n\t"\
        OP(%%mm6, (%1), A, d)            \
        "add %3, %1                 \n\t"

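/* The three tap sets above, written out in plain C for one output pixel
 * (an illustrative sketch; the helper name and the phase parameter are
 * ours). src points at the first of the six input lines, i.e. two lines
 * above the output position. */
static av_unused uint8_t cavs_vfilter_ref(const uint8_t *src, int stride,
                                          int phase)
{
    const int p0 = src[0*stride], p1 = src[1*stride], p2 = src[2*stride];
    const int p3 = src[3*stride], p4 = src[4*stride], p5 = src[5*stride];
    switch (phase) {
    case 1:  /* [-1 -2 96 42 -7  0], rounding 64, shift 7 */
        return av_clip_uint8((-p0 - 2*p1 + 96*p2 + 42*p3 - 7*p4 + 64) >> 7);
    case 2:  /* [ 0 -1  5  5 -1  0], rounding  4, shift 3 */
        return av_clip_uint8((-p1 + 5*p2 + 5*p3 - p4 + 4) >> 3);
    default: /* [ 0 -7 42 96 -2 -1], rounding 64, shift 7 */
        return av_clip_uint8((-7*p1 + 42*p2 + 96*p3 - 2*p4 - p5 + 64) >> 7);
    }
}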

#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
    int w= 2;\
    src -= 2*srcStride;\
    \
    while(w--){\
      __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movd (%0), %%mm0           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm1           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm2           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm3           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm4           \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpcklbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
        VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
        VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
        VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
        VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
        \
        : "+a"(src), "+c"(dst)\
        : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
        : "memory"\
     );\
     if(h==16){\
        __asm__ volatile(\
            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
            \
           : "+a"(src), "+c"(dst)\
           : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD),  "m"(MUL1)\
           : "memory"\
        );\
     }\
     src += 4-(h+5)*srcStride;\
     dst += 4-h*dstStride;\
   }

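/* Structure of QPEL_CAVSVNUM in plain C ("put" case only; a sketch using
 * the cavs_vfilter_ref() helper above): the asm keeps a six-line window in
 * mm0..mm5 and rotates the register roles on every output line, so each
 * source line is loaded from memory exactly once per 4-column strip. */
static av_unused void cavs_qpel_v_ref(uint8_t *dst, const uint8_t *src,
                                      int dstStride, int srcStride,
                                      int w, int h, int phase)
{
    int x, y;
    src -= 2*srcStride;               /* two lines of context above */
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++)
            dst[x] = cavs_vfilter_ref(src + x, srcStride, phase);
        src += srcStride;
        dst += dstStride;
    }
}
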
#define QPEL_CAVS(OPNAME, OP, MMX)\
static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movq %5, %%mm6             \n\t"\
        "1:                         \n\t"\
        "movq    (%0), %%mm0        \n\t"\
        "movq   1(%0), %%mm2        \n\t"\
        "movq %%mm0, %%mm1          \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpckhbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "paddw %%mm3, %%mm1         \n\t"\
        "pmullw %%mm6, %%mm0        \n\t"\
        "pmullw %%mm6, %%mm1        \n\t"\
        "movq   -1(%0), %%mm2       \n\t"\
        "movq    2(%0), %%mm4       \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "movq %%mm4, %%mm5          \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        "punpckhbw %%mm7, %%mm5     \n\t"\
        "paddw %%mm4, %%mm2         \n\t"\
        "paddw %%mm3, %%mm5         \n\t"\
        "psubw %%mm2, %%mm0         \n\t"\
        "psubw %%mm5, %%mm1         \n\t"\
        "movq %6, %%mm5             \n\t"\
        "paddw %%mm5, %%mm0         \n\t"\
        "paddw %%mm5, %%mm1         \n\t"\
        "psraw $3, %%mm0            \n\t"\
        "psraw $3, %%mm1            \n\t"\
        "packuswb %%mm1, %%mm0      \n\t"\
        OP(%%mm0, (%1),%%mm5, q)         \
        "add %3, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
        : "memory"\
    );\
}\
\
static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42)      \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5)         \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42)      \
}\
\
static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\

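/* Plain-C equivalent of the horizontal filter generated above (a sketch,
 * "put" case only): the [-1 5 5 -1] half-pel taps with rounding 4 and
 * shift 3, applied to eight pixels per row, eight rows. */
static av_unused void cavs_qpel8_h_ref(uint8_t *dst, const uint8_t *src,
                                       int dstStride, int srcStride)
{
    int x, y;
    for (y = 0; y < 8; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = av_clip_uint8((-src[x-1] + 5*src[x] + 5*src[x+1]
                                    - src[x+2] + 4) >> 3);
        src += srcStride;
        dst += dstStride;
    }
}
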
#define CAVS_MC(OPNAME, SIZE, MMX) \
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
}\
\
static void ff_ ## OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
}\

#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "    \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgb " #temp ", " #a "          \n\t"\
"mov" #size " " #a ", " #b "      \n\t"

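/* Meaning of the store ops above, in plain C (a sketch): PUT_OP is a plain
 * store, while both pavgb (MMX2) and pavgusb (3DNow!) compute the same
 * rounded byte-wise average of destination and result. */
static av_unused void avg_op_ref(uint8_t *dst, const uint8_t *src, int n)
{
    int i;
    for (i = 0; i < n; i++)
        dst[i] = (dst[i] + src[i] + 1) >> 1;   /* rounded average */
}
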
QPEL_CAVS(put_,       PUT_OP, 3dnow)
QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
QPEL_CAVS(put_,       PUT_OP, mmx2)
QPEL_CAVS(avg_,  AVG_MMX2_OP, mmx2)

CAVS_MC(put_,  8, 3dnow)
CAVS_MC(put_, 16, 3dnow)
CAVS_MC(avg_,  8, 3dnow)
CAVS_MC(avg_, 16, 3dnow)
CAVS_MC(put_,  8, mmx2)
CAVS_MC(put_, 16, mmx2)
CAVS_MC(avg_,  8, mmx2)
CAVS_MC(avg_, 16, mmx2)

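/* Note on the tables filled in below: the pixels_tab index encodes the
 * quarter-pel position as x + 4*y, so [0] is the full-pel copy (mc00),
 * [2] the horizontal half-pel (mc20), and [4]/[8]/[12] the three vertical
 * phases (mc01/mc02/mc03). */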
static void ff_cavsdsp_init_mmx2(CAVSDSPContext *c, AVCodecContext *avctx) {
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_mmx2; \
    c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_mmx2; \

    dspfunc(put_cavs_qpel, 0, 16);
    dspfunc(put_cavs_qpel, 1, 8);
    dspfunc(avg_cavs_qpel, 0, 16);
    dspfunc(avg_cavs_qpel, 1, 8);
#undef dspfunc
    c->cavs_idct8_add = cavs_idct8_add_mmx;
}

static void ff_cavsdsp_init_3dnow(CAVSDSPContext *c, AVCodecContext *avctx) {
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = ff_ ## PFX ## NUM ## _mc00_mmx2; \
    c->PFX ## _pixels_tab[IDX][ 2] = ff_ ## PFX ## NUM ## _mc20_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 4] = ff_ ## PFX ## NUM ## _mc01_3dnow; \
    c->PFX ## _pixels_tab[IDX][ 8] = ff_ ## PFX ## NUM ## _mc02_3dnow; \
    c->PFX ## _pixels_tab[IDX][12] = ff_ ## PFX ## NUM ## _mc03_3dnow; \

    dspfunc(put_cavs_qpel, 0, 16);
    dspfunc(put_cavs_qpel, 1, 8);
    dspfunc(avg_cavs_qpel, 0, 16);
    dspfunc(avg_cavs_qpel, 1, 8);
#undef dspfunc
    c->cavs_idct8_add = cavs_idct8_add_mmx;
}

void ff_cavsdsp_init_mmx(CAVSDSPContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX2)  ff_cavsdsp_init_mmx2 (c, avctx);
    if (mm_flags & AV_CPU_FLAG_3DNOW) ff_cavsdsp_init_3dnow(c, avctx);
}