• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • Examples
  • File List
  • Globals

libavcodec/x86/mpegvideo_mmx.c

Go to the documentation of this file.
00001 /*
00002  * The simplest mpeg encoder (well, it was the simplest!)
00003  * Copyright (c) 2000,2001 Fabrice Bellard
00004  *
00005  * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
00006  * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
00007  *
00008  * This file is part of FFmpeg.
00009  *
00010  * FFmpeg is free software; you can redistribute it and/or
00011  * modify it under the terms of the GNU Lesser General Public
00012  * License as published by the Free Software Foundation; either
00013  * version 2.1 of the License, or (at your option) any later version.
00014  *
00015  * FFmpeg is distributed in the hope that it will be useful,
00016  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018  * Lesser General Public License for more details.
00019  *
00020  * You should have received a copy of the GNU Lesser General Public
00021  * License along with FFmpeg; if not, write to the Free Software
00022  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00023  */
00024 
00025 #include "libavutil/cpu.h"
00026 #include "libavutil/x86_cpu.h"
00027 #include "libavcodec/avcodec.h"
00028 #include "libavcodec/dsputil.h"
00029 #include "libavcodec/mpegvideo.h"
00030 #include "dsputil_mmx.h"
00031 
/* 16-bit, 64-entry table defined elsewhere. It is not referenced by any code
 * visible in this file; presumably it is consumed by
 * mpegvideo_mmx_template.c, which this file includes -- TODO confirm. */
00032 extern uint16_t inv_zigzag_direct16[64];
00033 
00034 
/**
 * H.263-style dequantization of one intra block, in place, using MMX.
 *
 * For every processed coefficient the asm loop computes
 *   block[i] > 0 :  block[i]*qmul + qadd
 *   block[i] < 0 :  block[i]*qmul - qadd
 *   block[i] == 0:  0
 * with 16-bit wrap-around arithmetic (pmullw/paddw).
 * The loop works on groups of 8 coefficients (16 bytes), starting at block[0]
 * and continuing through the group that contains block[nCoeffs].
 * block[0] is computed separately in C (DC scaling unless h263_aic is set)
 * and written back after the asm, overriding the loop's result for index 0.
 *
 * @param s      codec context; reads h263_aic, ac_pred, y_dc_scale,
 *               c_dc_scale, block_last_index[] and
 *               inter_scantable.raster_end[]
 * @param block  coefficients, modified in place
 * @param n      block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
 * @param qscale quantizer scale; qmul = 2*qscale, qadd = (qscale-1)|1
 *               (qadd = 0 when h263_aic is set)
 *
 * NOTE(review): asm operand %3 is declared input-only ("r") but is modified
 * by "add $16, %3"; GCC's extended-asm rules say input operands must not be
 * changed by the asm.
 */
00035 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
00036                                   DCTELEM *block, int n, int qscale)
00037 {
00038     x86_reg level, qmul, qadd, nCoeffs;
00039 
00040     qmul = qscale << 1;
00041 
00042     assert(s->block_last_index[n]>=0 || s->h263_aic);
00043 
00044     if (!s->h263_aic) {
00045         if (n < 4)
00046             level = block[0] * s->y_dc_scale;
00047         else
00048             level = block[0] * s->c_dc_scale;
00049         qadd = (qscale - 1) | 1;
00050     }else{
00051         qadd = 0;
00052         level= block[0];
00053     }
00054     if(s->ac_pred)
00055         nCoeffs=63;
00056     else
00057         nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
00058 //printf("%d %d  ", qmul, qadd);
00059 __asm__ volatile(
00060                 "movd %1, %%mm6                 \n\t" //qmul
00061                 "packssdw %%mm6, %%mm6          \n\t"
00062                 "packssdw %%mm6, %%mm6          \n\t" // qmul now in all four words
00063                 "movd %2, %%mm5                 \n\t" //qadd
00064                 "pxor %%mm7, %%mm7              \n\t"
00065                 "packssdw %%mm5, %%mm5          \n\t"
00066                 "packssdw %%mm5, %%mm5          \n\t" // qadd now in all four words
00067                 "psubw %%mm5, %%mm7             \n\t" // mm7 = -qadd in every word
00068                 "pxor %%mm4, %%mm4              \n\t" // mm4 = 0
00069                 ".p2align 4                     \n\t"
00070                 "1:                             \n\t"
00071                 "movq (%0, %3), %%mm0           \n\t" // load 8 coefficients
00072                 "movq 8(%0, %3), %%mm1          \n\t"
00073 
00074                 "pmullw %%mm6, %%mm0            \n\t" // block[i]*qmul (low 16 bits)
00075                 "pmullw %%mm6, %%mm1            \n\t"
00076 
00077                 "movq (%0, %3), %%mm2           \n\t"
00078                 "movq 8(%0, %3), %%mm3          \n\t"
00079 
00080                 "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] > 0 ? -1 : 0
00081                 "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] > 0 ? -1 : 0
00082 
00083                 "pxor %%mm2, %%mm0              \n\t" // positive lanes: bitwise NOT of the product
00084                 "pxor %%mm3, %%mm1              \n\t"
00085 
00086                 "paddw %%mm7, %%mm0             \n\t" // add -qadd
00087                 "paddw %%mm7, %%mm1             \n\t"
00088 
00089                 "pxor %%mm0, %%mm2              \n\t" // positive: product+qadd, otherwise: product-qadd
00090                 "pxor %%mm1, %%mm3              \n\t"
00091 
00092                 "pcmpeqw %%mm7, %%mm0           \n\t" // block[i] == 0 ? -1 : 0
00093                 "pcmpeqw %%mm7, %%mm1           \n\t" // block[i] == 0 ? -1 : 0
00094 
00095                 "pandn %%mm2, %%mm0             \n\t" // force lanes whose input was 0 back to 0
00096                 "pandn %%mm3, %%mm1             \n\t"
00097 
00098                 "movq %%mm0, (%0, %3)           \n\t"
00099                 "movq %%mm1, 8(%0, %3)          \n\t"
00100 
00101                 "add $16, %3                    \n\t" // next group of 8 coefficients
00102                 "jng 1b                         \n\t" // loop while the byte offset is <= 0
00103                 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
00104                 : "memory"
00105         );
00106         block[0]= level; // DC value computed above replaces the loop's result
00107 }
00108 
00109 
/**
 * H.263-style dequantization of one inter block, in place, using MMX.
 *
 * For every processed coefficient the asm loop computes
 *   block[i] > 0 :  block[i]*qmul + qadd
 *   block[i] < 0 :  block[i]*qmul - qadd
 *   block[i] == 0:  0
 * with 16-bit wrap-around arithmetic. The loop works on groups of 8
 * coefficients (16 bytes), starting at block[0] and continuing through the
 * group that contains block[nCoeffs]. Unlike the intra variant, block[0]
 * gets no special treatment.
 *
 * @param s      codec context; reads block_last_index[], h263_aic (assert
 *               only) and inter_scantable.raster_end[]
 * @param block  coefficients, modified in place
 * @param n      block index into s->block_last_index[]
 * @param qscale quantizer scale; qmul = 2*qscale, qadd = (qscale-1)|1
 *
 * NOTE(review): asm operand %3 is declared input-only ("r") but is modified
 * by "add $16, %3"; GCC's extended-asm rules say input operands must not be
 * changed by the asm.
 */
00110 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
00111                                   DCTELEM *block, int n, int qscale)
00112 {
00113     x86_reg qmul, qadd, nCoeffs;
00114 
00115     qmul = qscale << 1;
00116     qadd = (qscale - 1) | 1;
00117 
00118     assert(s->block_last_index[n]>=0 || s->h263_aic);
00119 
00120     nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
00121 //printf("%d %d  ", qmul, qadd);
00122 __asm__ volatile(
00123                 "movd %1, %%mm6                 \n\t" //qmul
00124                 "packssdw %%mm6, %%mm6          \n\t"
00125                 "packssdw %%mm6, %%mm6          \n\t" // qmul now in all four words
00126                 "movd %2, %%mm5                 \n\t" //qadd
00127                 "pxor %%mm7, %%mm7              \n\t"
00128                 "packssdw %%mm5, %%mm5          \n\t"
00129                 "packssdw %%mm5, %%mm5          \n\t" // qadd now in all four words
00130                 "psubw %%mm5, %%mm7             \n\t" // mm7 = -qadd in every word
00131                 "pxor %%mm4, %%mm4              \n\t" // mm4 = 0
00132                 ".p2align 4                     \n\t"
00133                 "1:                             \n\t"
00134                 "movq (%0, %3), %%mm0           \n\t" // load 8 coefficients
00135                 "movq 8(%0, %3), %%mm1          \n\t"
00136 
00137                 "pmullw %%mm6, %%mm0            \n\t" // block[i]*qmul (low 16 bits)
00138                 "pmullw %%mm6, %%mm1            \n\t"
00139 
00140                 "movq (%0, %3), %%mm2           \n\t"
00141                 "movq 8(%0, %3), %%mm3          \n\t"
00142 
00143                 "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] > 0 ? -1 : 0
00144                 "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] > 0 ? -1 : 0
00145 
00146                 "pxor %%mm2, %%mm0              \n\t" // positive lanes: bitwise NOT of the product
00147                 "pxor %%mm3, %%mm1              \n\t"
00148 
00149                 "paddw %%mm7, %%mm0             \n\t" // add -qadd
00150                 "paddw %%mm7, %%mm1             \n\t"
00151 
00152                 "pxor %%mm0, %%mm2              \n\t" // positive: product+qadd, otherwise: product-qadd
00153                 "pxor %%mm1, %%mm3              \n\t"
00154 
00155                 "pcmpeqw %%mm7, %%mm0           \n\t" // block[i] == 0 ? -1 : 0
00156                 "pcmpeqw %%mm7, %%mm1           \n\t" // block[i] == 0 ? -1 : 0
00157 
00158                 "pandn %%mm2, %%mm0             \n\t" // force lanes whose input was 0 back to 0
00159                 "pandn %%mm3, %%mm1             \n\t"
00160 
00161                 "movq %%mm0, (%0, %3)           \n\t"
00162                 "movq %%mm1, 8(%0, %3)          \n\t"
00163 
00164                 "add $16, %3                    \n\t" // next group of 8 coefficients
00165                 "jng 1b                         \n\t" // loop while the byte offset is <= 0
00166                 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
00167                 : "memory"
00168         );
00169 }
00170 
00171 
00172 /*
00173   NK:
00174   Note: looking at PARANOID:
00175   "enable all paranoid tests for rounding, overflows, etc..."
00176 
00177 #ifdef PARANOID
00178                 if (level < -2048 || level > 2047)
00179                     fprintf(stderr, "unquant error %d %d\n", i, level);
00180 #endif
00181   We can suppose that the result of the two multiplications can't be greater than 0xFFFF
00182   i.e. is 16-bit, so we use here only the PMULLW instruction and can avoid
00183   a complex multiplication.
00184 =====================================================
00185  Full formula for multiplication of 2 integer numbers
00186  which are represented as high:low words:
00187  input: value1 = high1:low1
00188         value2 = high2:low2
00189  output: value3 = value1*value2
00190  value3=high3:low3 (on overflow: modulus 2^32 wrap-around)
00191  this means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4
00192  but this algorithm will compute only 0x66cb0ce4
00193  this is limited by the 16-bit size of the operands
00194  ---------------------------------
00195  tlow1 = high1*low2
00196  tlow2 = high2*low1
00197  tlow1 = tlow1 + tlow2
00198  high3:low3 = low1*low2
00199  high3 += tlow1
00200 */
/**
 * MPEG-1 intra dequantization of one block, in place, using MMX.
 *
 * For every processed nonzero coefficient the asm loop computes
 *   v = (abs(block[i]) * qscale * quant_matrix[i]) >> 3;
 *   v = (v - 1) | 1;
 * and then restores the original sign; zero coefficients stay zero.
 * All arithmetic is done on 16-bit words (pmullw keeps the low 16 bits).
 * The loop handles 8 coefficients (16 bytes) per iteration, starting at
 * block[0], and runs while the byte offset held in REG_a is negative.
 * block[0] is overwritten afterwards with the DC value computed in C.
 *
 * @param s      codec context; reads block_last_index[],
 *               intra_scantable.raster_end[], y_dc_scale, c_dc_scale and
 *               intra_matrix
 * @param block  coefficients, modified in place
 * @param n      block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
 * @param qscale quantizer scale
 */
00201 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
00202                                      DCTELEM *block, int n, int qscale)
00203 {
00204     x86_reg nCoeffs;
00205     const uint16_t *quant_matrix;
00206     int block0;
00207 
00208     assert(s->block_last_index[n]>=0);
00209 
00210     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
00211 
00212     if (n < 4)
00213         block0 = block[0] * s->y_dc_scale;
00214     else
00215         block0 = block[0] * s->c_dc_scale;
00216     /* XXX: only mpeg1 */
00217     quant_matrix = s->intra_matrix;
00218 __asm__ volatile(
00219                 "pcmpeqw %%mm7, %%mm7           \n\t"
00220                 "psrlw $15, %%mm7               \n\t" // mm7 = 1 in every word
00221                 "movd %2, %%mm6                 \n\t"
00222                 "packssdw %%mm6, %%mm6          \n\t"
00223                 "packssdw %%mm6, %%mm6          \n\t" // qscale now in all four words
00224                 "mov %3, %%"REG_a"              \n\t" // REG_a = negative byte offset, counts up to 0
00225                 ".p2align 4                     \n\t"
00226                 "1:                             \n\t"
00227                 "movq (%0, %%"REG_a"), %%mm0    \n\t"
00228                 "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
00229                 "movq (%1, %%"REG_a"), %%mm4    \n\t"
00230                 "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
00231                 "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
00232                 "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
00233                 "pxor %%mm2, %%mm2              \n\t"
00234                 "pxor %%mm3, %%mm3              \n\t"
00235                 "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
00236                 "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
00237                 "pxor %%mm2, %%mm0              \n\t"
00238                 "pxor %%mm3, %%mm1              \n\t"
00239                 "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
00240                 "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
00241                 "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*q
00242                 "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
00243                 "pxor %%mm4, %%mm4              \n\t"
00244                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
00245                 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
00246                 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
00247                 "psraw $3, %%mm0                \n\t"
00248                 "psraw $3, %%mm1                \n\t"
00249                 "psubw %%mm7, %%mm0             \n\t"
00250                 "psubw %%mm7, %%mm1             \n\t"
00251                 "por %%mm7, %%mm0               \n\t" // (v - 1) | 1
00252                 "por %%mm7, %%mm1               \n\t"
00253                 "pxor %%mm2, %%mm0              \n\t"
00254                 "pxor %%mm3, %%mm1              \n\t"
00255                 "psubw %%mm2, %%mm0             \n\t" // restore the original sign
00256                 "psubw %%mm3, %%mm1             \n\t"
00257                 "pandn %%mm0, %%mm4             \n\t" // keep zero inputs at zero
00258                 "pandn %%mm1, %%mm5             \n\t"
00259                 "movq %%mm4, (%0, %%"REG_a")    \n\t"
00260                 "movq %%mm5, 8(%0, %%"REG_a")   \n\t"
00261 
00262                 "add $16, %%"REG_a"             \n\t"
00263                 "js 1b                          \n\t" // loop while the byte offset is still negative
00264                 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
00265                 : "%"REG_a, "memory"
00266         );
00267     block[0]= block0; // DC value computed above replaces the loop's result
00268 }
00269 
/**
 * MPEG-1 inter dequantization of one block, in place, using MMX.
 *
 * For every processed nonzero coefficient the asm loop computes
 *   v = ((2*abs(block[i]) + 1) * qscale * quant_matrix[i]) >> 4;
 *   v = (v - 1) | 1;
 * and then restores the original sign; zero coefficients stay zero.
 * All arithmetic is done on 16-bit words. The loop handles 8 coefficients
 * (16 bytes) per iteration, starting at block[0], and runs while the byte
 * offset held in REG_a is negative.
 *
 * @param s      codec context; reads block_last_index[],
 *               intra_scantable.raster_end[] and inter_matrix
 * @param block  coefficients, modified in place
 * @param n      block index into s->block_last_index[]
 * @param qscale quantizer scale
 */
00270 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
00271                                      DCTELEM *block, int n, int qscale)
00272 {
00273     x86_reg nCoeffs;
00274     const uint16_t *quant_matrix;
00275 
00276     assert(s->block_last_index[n]>=0);
00277 
00278     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
00279 
00280         quant_matrix = s->inter_matrix;
00281 __asm__ volatile(
00282                 "pcmpeqw %%mm7, %%mm7           \n\t"
00283                 "psrlw $15, %%mm7               \n\t" // mm7 = 1 in every word
00284                 "movd %2, %%mm6                 \n\t"
00285                 "packssdw %%mm6, %%mm6          \n\t"
00286                 "packssdw %%mm6, %%mm6          \n\t" // qscale now in all four words
00287                 "mov %3, %%"REG_a"              \n\t" // REG_a = negative byte offset, counts up to 0
00288                 ".p2align 4                     \n\t"
00289                 "1:                             \n\t"
00290                 "movq (%0, %%"REG_a"), %%mm0    \n\t"
00291                 "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
00292                 "movq (%1, %%"REG_a"), %%mm4    \n\t"
00293                 "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
00294                 "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
00295                 "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
00296                 "pxor %%mm2, %%mm2              \n\t"
00297                 "pxor %%mm3, %%mm3              \n\t"
00298                 "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
00299                 "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
00300                 "pxor %%mm2, %%mm0              \n\t"
00301                 "pxor %%mm3, %%mm1              \n\t"
00302                 "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
00303                 "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
00304                 "paddw %%mm0, %%mm0             \n\t" // abs(block[i])*2
00305                 "paddw %%mm1, %%mm1             \n\t" // abs(block[i])*2
00306                 "paddw %%mm7, %%mm0             \n\t" // abs(block[i])*2 + 1
00307                 "paddw %%mm7, %%mm1             \n\t" // abs(block[i])*2 + 1
00308                 "pmullw %%mm4, %%mm0            \n\t" // (abs(block[i])*2 + 1)*q
00309                 "pmullw %%mm5, %%mm1            \n\t" // (abs(block[i])*2 + 1)*q
00310                 "pxor %%mm4, %%mm4              \n\t"
00311                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
00312                 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
00313                 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
00314                 "psraw $4, %%mm0                \n\t"
00315                 "psraw $4, %%mm1                \n\t"
00316                 "psubw %%mm7, %%mm0             \n\t"
00317                 "psubw %%mm7, %%mm1             \n\t"
00318                 "por %%mm7, %%mm0               \n\t" // (v - 1) | 1
00319                 "por %%mm7, %%mm1               \n\t"
00320                 "pxor %%mm2, %%mm0              \n\t"
00321                 "pxor %%mm3, %%mm1              \n\t"
00322                 "psubw %%mm2, %%mm0             \n\t" // restore the original sign
00323                 "psubw %%mm3, %%mm1             \n\t"
00324                 "pandn %%mm0, %%mm4             \n\t" // keep zero inputs at zero
00325                 "pandn %%mm1, %%mm5             \n\t"
00326                 "movq %%mm4, (%0, %%"REG_a")    \n\t"
00327                 "movq %%mm5, 8(%0, %%"REG_a")   \n\t"
00328 
00329                 "add $16, %%"REG_a"             \n\t"
00330                 "js 1b                          \n\t" // loop while the byte offset is still negative
00331                 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
00332                 : "%"REG_a, "memory"
00333         );
00334 }
00335 
/**
 * MPEG-2 intra dequantization of one block, in place, using MMX.
 *
 * For every processed nonzero coefficient the asm loop computes
 *   v = (abs(block[i]) * qscale * quant_matrix[i]) >> 3;
 * and then restores the original sign; zero coefficients stay zero.
 * All arithmetic is done on 16-bit words. The loop handles 8 coefficients
 * (16 bytes) per iteration, starting at block[0] and continuing through the
 * group that contains block[nCoeffs]. nCoeffs is forced to 63 when
 * s->alternate_scan is set. block[0] is overwritten afterwards with the DC
 * value computed in C.
 *
 * @param s      codec context; reads block_last_index[], alternate_scan,
 *               intra_scantable.raster_end[], y_dc_scale, c_dc_scale and
 *               intra_matrix
 * @param block  coefficients, modified in place
 * @param n      block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
 * @param qscale quantizer scale
 */
00336 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
00337                                      DCTELEM *block, int n, int qscale)
00338 {
00339     x86_reg nCoeffs;
00340     const uint16_t *quant_matrix;
00341     int block0;
00342 
00343     assert(s->block_last_index[n]>=0);
00344 
00345     if(s->alternate_scan) nCoeffs= 63; //FIXME
00346     else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
00347 
00348     if (n < 4)
00349         block0 = block[0] * s->y_dc_scale;
00350     else
00351         block0 = block[0] * s->c_dc_scale;
00352     quant_matrix = s->intra_matrix;
00353 __asm__ volatile(
00354                 "pcmpeqw %%mm7, %%mm7           \n\t"
00355                 "psrlw $15, %%mm7               \n\t" // mm7 = 1 in every word
00356                 "movd %2, %%mm6                 \n\t"
00357                 "packssdw %%mm6, %%mm6          \n\t"
00358                 "packssdw %%mm6, %%mm6          \n\t" // qscale now in all four words
00359                 "mov %3, %%"REG_a"              \n\t" // REG_a = negative byte offset, counts up
00360                 ".p2align 4                     \n\t"
00361                 "1:                             \n\t"
00362                 "movq (%0, %%"REG_a"), %%mm0    \n\t"
00363                 "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
00364                 "movq (%1, %%"REG_a"), %%mm4    \n\t"
00365                 "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
00366                 "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
00367                 "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
00368                 "pxor %%mm2, %%mm2              \n\t"
00369                 "pxor %%mm3, %%mm3              \n\t"
00370                 "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
00371                 "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
00372                 "pxor %%mm2, %%mm0              \n\t"
00373                 "pxor %%mm3, %%mm1              \n\t"
00374                 "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
00375                 "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
00376                 "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*q
00377                 "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
00378                 "pxor %%mm4, %%mm4              \n\t"
00379                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
00380                 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
00381                 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
00382                 "psraw $3, %%mm0                \n\t"
00383                 "psraw $3, %%mm1                \n\t"
00384                 "pxor %%mm2, %%mm0              \n\t"
00385                 "pxor %%mm3, %%mm1              \n\t"
00386                 "psubw %%mm2, %%mm0             \n\t" // restore the original sign
00387                 "psubw %%mm3, %%mm1             \n\t"
00388                 "pandn %%mm0, %%mm4             \n\t" // keep zero inputs at zero
00389                 "pandn %%mm1, %%mm5             \n\t"
00390                 "movq %%mm4, (%0, %%"REG_a")    \n\t"
00391                 "movq %%mm5, 8(%0, %%"REG_a")   \n\t"
00392 
00393                 "add $16, %%"REG_a"             \n\t"
00394                 "jng 1b                         \n\t" // loop while the byte offset is <= 0
00395                 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
00396                 : "%"REG_a, "memory"
00397         );
00398     block[0]= block0; // DC value computed above replaces the loop's result
00399         //Note, we do not do mismatch control for intra as errors cannot accumulate
00400 }
00401 
/**
 * MPEG-2 inter dequantization of one block, in place, using MMX, followed by
 * mismatch control on the last coefficient.
 *
 * For every processed nonzero coefficient the asm loop computes
 *   v = ((2*abs(block[i]) + 1) * qscale * quant_matrix[i]) >> 4;
 * (logical shift on 16-bit words) and then restores the original sign; zero
 * coefficients stay zero. The loop handles 8 coefficients (16 bytes) per
 * iteration, starting at block[0] and continuing through the group that
 * contains block[nCoeffs]. nCoeffs is forced to 63 when s->alternate_scan is
 * set.
 *
 * While looping, every stored output word is XORed into mm7, which starts
 * with 0xFFFF in its lowest word. After the loop the four words of mm7 are
 * XOR-folded into one, and the least significant bit of that result is XORed
 * into the least significant bit of block[63].
 *
 * @param s      codec context; reads block_last_index[], alternate_scan,
 *               intra_scantable.raster_end[] and inter_matrix
 * @param block  coefficients, modified in place
 * @param n      block index into s->block_last_index[]
 * @param qscale quantizer scale
 */
00402 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
00403                                      DCTELEM *block, int n, int qscale)
00404 {
00405     x86_reg nCoeffs;
00406     const uint16_t *quant_matrix;
00407 
00408     assert(s->block_last_index[n]>=0);
00409 
00410     if(s->alternate_scan) nCoeffs= 63; //FIXME
00411     else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
00412 
00413         quant_matrix = s->inter_matrix;
00414 __asm__ volatile(
00415                 "pcmpeqw %%mm7, %%mm7           \n\t"
00416                 "psrlq $48, %%mm7               \n\t" // mm7 = 0xFFFF in the lowest word: XOR accumulator
00417                 "movd %2, %%mm6                 \n\t"
00418                 "packssdw %%mm6, %%mm6          \n\t"
00419                 "packssdw %%mm6, %%mm6          \n\t" // qscale now in all four words
00420                 "mov %3, %%"REG_a"              \n\t" // REG_a = negative byte offset, counts up
00421                 ".p2align 4                     \n\t"
00422                 "1:                             \n\t"
00423                 "movq (%0, %%"REG_a"), %%mm0    \n\t"
00424                 "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
00425                 "movq (%1, %%"REG_a"), %%mm4    \n\t"
00426                 "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
00427                 "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
00428                 "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
00429                 "pxor %%mm2, %%mm2              \n\t"
00430                 "pxor %%mm3, %%mm3              \n\t"
00431                 "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
00432                 "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
00433                 "pxor %%mm2, %%mm0              \n\t"
00434                 "pxor %%mm3, %%mm1              \n\t"
00435                 "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
00436                 "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
00437                 "paddw %%mm0, %%mm0             \n\t" // abs(block[i])*2
00438                 "paddw %%mm1, %%mm1             \n\t" // abs(block[i])*2
00439                 "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*2*q
00440                 "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*2*q
00441                 "paddw %%mm4, %%mm0             \n\t" // (abs(block[i])*2 + 1)*q
00442                 "paddw %%mm5, %%mm1             \n\t" // (abs(block[i])*2 + 1)*q
00443                 "pxor %%mm4, %%mm4              \n\t"
00444                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
00445                 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
00446                 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
00447                 "psrlw $4, %%mm0                \n\t"
00448                 "psrlw $4, %%mm1                \n\t"
00449                 "pxor %%mm2, %%mm0              \n\t"
00450                 "pxor %%mm3, %%mm1              \n\t"
00451                 "psubw %%mm2, %%mm0             \n\t" // restore the original sign
00452                 "psubw %%mm3, %%mm1             \n\t"
00453                 "pandn %%mm0, %%mm4             \n\t" // keep zero inputs at zero
00454                 "pandn %%mm1, %%mm5             \n\t"
00455                 "pxor %%mm4, %%mm7              \n\t" // fold the outputs into the XOR accumulator
00456                 "pxor %%mm5, %%mm7              \n\t"
00457                 "movq %%mm4, (%0, %%"REG_a")    \n\t"
00458                 "movq %%mm5, 8(%0, %%"REG_a")   \n\t"
00459 
00460                 "add $16, %%"REG_a"             \n\t"
00461                 "jng 1b                         \n\t" // loop while the byte offset is <= 0
00462                 "movd 124(%0, %3), %%mm0        \n\t" // %0+%3 == block, so this loads block[62..63]
00463                 "movq %%mm7, %%mm6              \n\t"
00464                 "psrlq $32, %%mm7               \n\t"
00465                 "pxor %%mm6, %%mm7              \n\t" // XOR upper dword into lower dword
00466                 "movq %%mm7, %%mm6              \n\t"
00467                 "psrlq $16, %%mm7               \n\t"
00468                 "pxor %%mm6, %%mm7              \n\t" // lowest word = XOR of all four words
00469                 "pslld $31, %%mm7               \n\t"
00470                 "psrlq $15, %%mm7               \n\t" // bit 0 of that word moved to bit 16 (LSB of block[63])
00471                 "pxor %%mm7, %%mm0              \n\t"
00472                 "movd %%mm0, 124(%0, %3)        \n\t" // write block[62..63] back
00473 
00474                 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
00475                 : "%"REG_a, "memory"
00476         );
00477 }
00478 
/**
 * Noise-reduce one block of 64 16-bit coefficients in place (MMX version).
 *
 * Per coefficient i the asm loop does:
 *   sum[i]  += abs(block[i])                    (32-bit accumulation)
 *   block[i] = sign(block[i]) * max(abs(block[i]) - offset[i], 0)
 * The subtraction uses unsigned saturation (psubusw), so a coefficient whose
 * magnitude does not exceed its offset becomes 0. Eight coefficients are
 * handled per iteration until the block pointer reaches block+64.
 * s->dct_count[intra] is incremented once per call.
 *
 * @param s     codec context; reads mb_intra and uses dct_error_sum[intra],
 *              dct_offset[intra] and dct_count[intra]
 * @param block coefficients, modified in place
 *
 * The asm stores to block[] and sum[] through pointer registers that are not
 * described as memory operands, so a "memory" clobber is declared to keep
 * the compiler from caching or reordering accesses to that memory around it.
 */
00479 static void  denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
00480     const int intra= s->mb_intra;
00481     int *sum= s->dct_error_sum[intra];
00482     uint16_t *offset= s->dct_offset[intra];
00483 
00484     s->dct_count[intra]++;
00485 
00486     __asm__ volatile(
00487         "pxor %%mm7, %%mm7                      \n\t" // mm7 = 0, used for zero-extension
00488         "1:                                     \n\t"
00489         "pxor %%mm0, %%mm0                      \n\t"
00490         "pxor %%mm1, %%mm1                      \n\t"
00491         "movq (%0), %%mm2                       \n\t" // load 8 coefficients
00492         "movq 8(%0), %%mm3                      \n\t"
00493         "pcmpgtw %%mm2, %%mm0                   \n\t" // block[i] < 0 ? -1 : 0
00494         "pcmpgtw %%mm3, %%mm1                   \n\t"
00495         "pxor %%mm0, %%mm2                      \n\t"
00496         "pxor %%mm1, %%mm3                      \n\t"
00497         "psubw %%mm0, %%mm2                     \n\t" // abs(block[i])
00498         "psubw %%mm1, %%mm3                     \n\t"
00499         "movq %%mm2, %%mm4                      \n\t" // keep a copy of abs() for the sums
00500         "movq %%mm3, %%mm5                      \n\t"
00501         "psubusw (%2), %%mm2                    \n\t" // abs - offset, saturating at 0
00502         "psubusw 8(%2), %%mm3                   \n\t"
00503         "pxor %%mm0, %%mm2                      \n\t"
00504         "pxor %%mm1, %%mm3                      \n\t"
00505         "psubw %%mm0, %%mm2                     \n\t" // restore the original sign
00506         "psubw %%mm1, %%mm3                     \n\t"
00507         "movq %%mm2, (%0)                       \n\t"
00508         "movq %%mm3, 8(%0)                      \n\t"
00509         "movq %%mm4, %%mm2                      \n\t"
00510         "movq %%mm5, %%mm3                      \n\t"
00511         "punpcklwd %%mm7, %%mm4                 \n\t" // zero-extend abs() to 32 bits
00512         "punpckhwd %%mm7, %%mm2                 \n\t"
00513         "punpcklwd %%mm7, %%mm5                 \n\t"
00514         "punpckhwd %%mm7, %%mm3                 \n\t"
00515         "paddd (%1), %%mm4                      \n\t" // sum[i] += abs(block[i])
00516         "paddd 8(%1), %%mm2                     \n\t"
00517         "paddd 16(%1), %%mm5                    \n\t"
00518         "paddd 24(%1), %%mm3                    \n\t"
00519         "movq %%mm4, (%1)                       \n\t"
00520         "movq %%mm2, 8(%1)                      \n\t"
00521         "movq %%mm5, 16(%1)                     \n\t"
00522         "movq %%mm3, 24(%1)                     \n\t"
00523         "add $16, %0                            \n\t" // 8 coefficients of 2 bytes
00524         "add $32, %1                            \n\t" // 8 sums of 4 bytes
00525         "add $16, %2                            \n\t" // 8 offsets of 2 bytes
00526         "cmp %3, %0                             \n\t"
00527             " jb 1b                             \n\t" // until block reaches block+64
00528         : "+r" (block), "+r" (sum), "+r" (offset)
00529         : "r"(block+64)
              : "memory"
00530     );
00531 }
00532 
/**
 * Noise-reduce one block of 64 16-bit coefficients in place (SSE2 version).
 *
 * Per coefficient i the asm loop does:
 *   sum[i]  += abs(block[i])                    (32-bit accumulation)
 *   block[i] = sign(block[i]) * max(abs(block[i]) - offset[i], 0)
 * The subtraction uses unsigned saturation (psubusw). Sixteen coefficients
 * are handled per iteration until the block pointer reaches block+64.
 * All loads and stores use movdqa or 128-bit memory operands, so block, sum
 * and offset must be 16-byte aligned.
 * s->dct_count[intra] is incremented once per call.
 *
 * @param s     codec context; reads mb_intra and uses dct_error_sum[intra],
 *              dct_offset[intra] and dct_count[intra]
 * @param block coefficients, modified in place
 *
 * NOTE(review): the asm stores to block[] and sum[] but no "memory" clobber
 * is visible here; whether XMM_CLOBBERS_ONLY leaves room for one depends on
 * its definition, which is outside this file -- confirm.
 */
00533 static void  denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
00534     const int intra= s->mb_intra;
00535     int *sum= s->dct_error_sum[intra];
00536     uint16_t *offset= s->dct_offset[intra];
00537 
00538     s->dct_count[intra]++;
00539 
00540     __asm__ volatile(
00541         "pxor %%xmm7, %%xmm7                    \n\t" // xmm7 = 0, used for zero-extension
00542         "1:                                     \n\t"
00543         "pxor %%xmm0, %%xmm0                    \n\t"
00544         "pxor %%xmm1, %%xmm1                    \n\t"
00545         "movdqa (%0), %%xmm2                    \n\t" // load 16 coefficients
00546         "movdqa 16(%0), %%xmm3                  \n\t"
00547         "pcmpgtw %%xmm2, %%xmm0                 \n\t" // block[i] < 0 ? -1 : 0
00548         "pcmpgtw %%xmm3, %%xmm1                 \n\t"
00549         "pxor %%xmm0, %%xmm2                    \n\t"
00550         "pxor %%xmm1, %%xmm3                    \n\t"
00551         "psubw %%xmm0, %%xmm2                   \n\t" // abs(block[i])
00552         "psubw %%xmm1, %%xmm3                   \n\t"
00553         "movdqa %%xmm2, %%xmm4                  \n\t" // keep a copy of abs() for the sums
00554         "movdqa %%xmm3, %%xmm5                  \n\t"
00555         "psubusw (%2), %%xmm2                   \n\t" // abs - offset, saturating at 0
00556         "psubusw 16(%2), %%xmm3                 \n\t"
00557         "pxor %%xmm0, %%xmm2                    \n\t"
00558         "pxor %%xmm1, %%xmm3                    \n\t"
00559         "psubw %%xmm0, %%xmm2                   \n\t" // restore the original sign
00560         "psubw %%xmm1, %%xmm3                   \n\t"
00561         "movdqa %%xmm2, (%0)                    \n\t"
00562         "movdqa %%xmm3, 16(%0)                  \n\t"
00563         "movdqa %%xmm4, %%xmm6                  \n\t"
00564         "movdqa %%xmm5, %%xmm0                  \n\t"
00565         "punpcklwd %%xmm7, %%xmm4               \n\t" // zero-extend abs() to 32 bits
00566         "punpckhwd %%xmm7, %%xmm6               \n\t"
00567         "punpcklwd %%xmm7, %%xmm5               \n\t"
00568         "punpckhwd %%xmm7, %%xmm0               \n\t"
00569         "paddd (%1), %%xmm4                     \n\t" // sum[i] += abs(block[i])
00570         "paddd 16(%1), %%xmm6                   \n\t"
00571         "paddd 32(%1), %%xmm5                   \n\t"
00572         "paddd 48(%1), %%xmm0                   \n\t"
00573         "movdqa %%xmm4, (%1)                    \n\t"
00574         "movdqa %%xmm6, 16(%1)                  \n\t"
00575         "movdqa %%xmm5, 32(%1)                  \n\t"
00576         "movdqa %%xmm0, 48(%1)                  \n\t"
00577         "add $32, %0                            \n\t" // 16 coefficients of 2 bytes
00578         "add $64, %1                            \n\t" // 16 sums of 4 bytes
00579         "add $32, %2                            \n\t" // 16 offsets of 2 bytes
00580         "cmp %3, %0                             \n\t"
00581             " jb 1b                             \n\t" // until block reaches block+64
00582         : "+r" (block), "+r" (sum), "+r" (offset)
00583         : "r"(block+64)
00584           XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
00585                             "%xmm4", "%xmm5", "%xmm6", "%xmm7")
00586     );
00587 }
00588 
/*
 * Instantiate mpegvideo_mmx_template.c once per instruction-set level.
 * Before each include the HAVE_SSSE3 / HAVE_SSE2 / HAVE_MMX2 macros are
 * forced to 0 or 1, and RENAME()/RENAMEl() are redefined to append a
 * level-specific suffix to the names the template generates.
 * NOTE(review): the template presumably defines dct_quantize through
 * RENAME(), judging by the dct_quantize_MMX/_MMX2/_SSE2/_SSSE3 names used in
 * MPV_common_init_mmx() -- confirm against the template.
 * After this section HAVE_MMX2 and HAVE_SSE2 are left defined as 1, and
 * HAVE_SSSE3 is 1 only if it was nonzero on entry (otherwise 0).
 */
/* Remember whether SSSE3 was enabled, then switch it off for the first passes. */
00589 #if HAVE_SSSE3
00590 #define HAVE_SSSE3_BAK
00591 #endif
00592 #undef HAVE_SSSE3
00593 #define HAVE_SSSE3 0
00594 
/* Pass 1: SSE2 = 0, MMX2 = 0 -> names suffixed _MMX / _mmx. */
00595 #undef HAVE_SSE2
00596 #undef HAVE_MMX2
00597 #define HAVE_SSE2 0
00598 #define HAVE_MMX2 0
00599 #define RENAME(a) a ## _MMX
00600 #define RENAMEl(a) a ## _mmx
00601 #include "mpegvideo_mmx_template.c"
00602 
/* Pass 2: MMX2 = 1 -> names suffixed _MMX2 / _mmx2. */
00603 #undef HAVE_MMX2
00604 #define HAVE_MMX2 1
00605 #undef RENAME
00606 #undef RENAMEl
00607 #define RENAME(a) a ## _MMX2
00608 #define RENAMEl(a) a ## _mmx2
00609 #include "mpegvideo_mmx_template.c"
00610 
/* Pass 3: SSE2 = 1 (MMX2 still 1) -> names suffixed _SSE2 / _sse2. */
00611 #undef HAVE_SSE2
00612 #define HAVE_SSE2 1
00613 #undef RENAME
00614 #undef RENAMEl
00615 #define RENAME(a) a ## _SSE2
00616 #define RENAMEl(a) a ## _sse2
00617 #include "mpegvideo_mmx_template.c"
00618 
/* Pass 4, only if SSSE3 was enabled on entry: SSSE3 = 1 -> RENAME() appends
 * _SSSE3 while RENAMEl() keeps the _sse2 suffix. */
00619 #ifdef HAVE_SSSE3_BAK
00620 #undef HAVE_SSSE3
00621 #define HAVE_SSSE3 1
00622 #undef RENAME
00623 #undef RENAMEl
00624 #define RENAME(a) a ## _SSSE3
00625 #define RENAMEl(a) a ## _sse2
00626 #include "mpegvideo_mmx_template.c"
00627 #endif
00628 
/**
 * Install the x86 SIMD implementations into an MpegEncContext.
 *
 * Nothing is changed unless av_get_cpu_flags() reports AV_CPU_FLAG_MMX.
 * With MMX available:
 *  - the H.263, MPEG-1 and MPEG-2 inter dequantizers are always replaced by
 *    the MMX versions; the MPEG-2 intra dequantizer only when
 *    CODEC_FLAG_BITEXACT is not set in s->flags;
 *  - denoise_dct uses the SSE2 version if AV_CPU_FLAG_SSE2 is set, else MMX;
 *  - dct_quantize is replaced only when s->avctx->dct_algo is FF_DCT_AUTO or
 *    FF_DCT_MMX, preferring SSSE3 (if compiled in), then SSE2, MMX2, MMX.
 *
 * @param s context whose function pointers are updated; reads s->avctx and
 *          s->flags
 */
00629 void MPV_common_init_mmx(MpegEncContext *s)
00630 {
00631     int mm_flags = av_get_cpu_flags();
00632 
00633     if (mm_flags & AV_CPU_FLAG_MMX) {
00634         const int dct_algo = s->avctx->dct_algo;
00635 
00636         s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
00637         s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
00638         s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
00639         s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
00640         if(!(s->flags & CODEC_FLAG_BITEXACT)) /* guards only the next assignment */
00641             s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
00642         s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
00643 
00644         if (mm_flags & AV_CPU_FLAG_SSE2) {
00645             s->denoise_dct= denoise_dct_sse2;
00646         } else {
00647                 s->denoise_dct= denoise_dct_mmx;
00648         }
00649 
00650         if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
00651 #if HAVE_SSSE3
00652             if(mm_flags & AV_CPU_FLAG_SSSE3){
00653                 s->dct_quantize= dct_quantize_SSSE3;
00654             } else /* falls into the SSE2/MMX2/MMX chain below */
00655 #endif
00656             if(mm_flags & AV_CPU_FLAG_SSE2){
00657                 s->dct_quantize= dct_quantize_SSE2;
00658             } else if(mm_flags & AV_CPU_FLAG_MMX2){
00659                 s->dct_quantize= dct_quantize_MMX2;
00660             } else {
00661                 s->dct_quantize= dct_quantize_MMX;
00662             }
00663         }
00664     }
00665 }

Generated on Wed Apr 11 2012 07:31:35 for FFmpeg by  doxygen 1.7.1